Niko 3877d51560 Remove default values for CLI arguments, all params except -h now required
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-05 15:57:25 +08:00

556 lines
24 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.ets.scraper;
import com.microsoft.playwright.*;
import com.microsoft.playwright.Frame;
import com.microsoft.playwright.options.LoadState;
import com.microsoft.playwright.options.WaitUntilState;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Base64;
import static java.nio.file.Files.createDirectories;
/**
* ETS (Construction Waste Management Platform) Web Scraper
* Uses Playwright to automate login and data extraction.
*/
public class EtsScraper {
// Page that main() loads first to establish the session/cookies before the login page.
public static final String FRAME_URL = "https://101.227.180.215/SHCityEnvCW/CWS/frame.html";
// Login form page that main() navigates to after FRAME_URL.
public static final String LOGIN_URL = "https://101.227.180.215/SHCityEnvCW/CWS/userlogin.html";
// Credentials typed into the login form by doLoginWithCaptcha().
// NOTE(review): these are hard-coded in source; consider moving them to configuration.
private static final String USERNAME = "sccw";
private static final String PASSWORD = "slife@123";
// Directory (relative to the working directory) for screenshots and the downloaded captcha image.
private static final Path SCREENSHOT_DIR = Path.of("screenshots");
// Base URL of the Ollama server; callOllama() posts to its /api/chat endpoint.
private static final String OLLAMA_URL = "http://10.0.1.39:11434";
// Model name placed in the Ollama chat request.
private static final String OLLAMA_MODEL = "qwen3-vl:4b";
// NOTE(review): not referenced by any method visible in this file; the proxy address
// is taken from the -s command-line option instead. Confirm before removing.
private static final String PROXY_HOST = "http://127.0.0.1:8081";
/**
 * Command-line entry point.
 *
 * <p>Parses the options {@code -s}, {@code -u}, {@code -p}, {@code -d} (all required, each
 * followed by one value) and {@code -h}, then drives a Chromium browser through login,
 * the bill list query for the requested date, the Excel export, and finally hands the
 * downloaded file to {@link #autoImportBill}.
 *
 * @param args command-line arguments as described by {@link #printHelp()}
 * @throws Exception if the browser automation or the captcha recognition fails
 */
public static void main(String[] args) throws Exception {
    // Parse CLI arguments; every option except -h takes exactly one value.
    String proxyHost = null;
    String proxyUser = null;
    String proxyPass = null;
    String dateStr = null;
    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        switch (option) {
            case "-h":
                printHelp();
                return;
            case "-s":
            case "-u":
            case "-p":
            case "-d":
                // A trailing option without a value would otherwise make args[++i]
                // throw ArrayIndexOutOfBoundsException instead of showing the help text.
                if (i + 1 >= args.length) {
                    System.err.println("[-] Missing value for option: " + option);
                    printHelp();
                    return;
                }
                String value = args[++i];
                if ("-s".equals(option)) {
                    proxyHost = value;
                } else if ("-u".equals(option)) {
                    proxyUser = value;
                } else if ("-p".equals(option)) {
                    proxyPass = value;
                } else {
                    dateStr = value;
                }
                break;
            default:
                System.err.println("[-] Unknown option: " + option);
                printHelp();
                return;
        }
    }
    if (proxyHost == null || proxyUser == null || proxyPass == null || dateStr == null) {
        System.err.println("[-] Missing required arguments");
        printHelp();
        return;
    }

    java.time.LocalDate targetDate;
    try {
        targetDate = java.time.LocalDate.parse(dateStr);
    } catch (java.time.format.DateTimeParseException e) {
        System.err.println("[-] Invalid date format: " + dateStr + ", expected yyyy-MM-dd");
        return;
    }
    String dateStrFormatted = targetDate.format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
    String dateStrFileName = targetDate.format(DateTimeFormatter.ofPattern("yyyyMMdd"));

    // Best effort: a missing screenshot directory only makes later screenshots fail.
    try {
        createDirectories(SCREENSHOT_DIR);
    } catch (Exception e) {
        System.err.println("Failed to create directories: " + e.getMessage());
    }

    try (Playwright playwright = Playwright.create()) {
        Browser browser = playwright.chromium().launch(
                new BrowserType.LaunchOptions().setHeadless(false));
        try {
            BrowserContext context = browser.newContext(
                    new Browser.NewContextOptions().setIgnoreHTTPSErrors(true));
            Page page = context.newPage();

            // Navigate to frame.html first to establish session/cookies
            System.out.println("[*] Establishing session via " + FRAME_URL);
            page.navigate(FRAME_URL, new Page.NavigateOptions()
                    .setTimeout(30000)
                    .setWaitUntil(WaitUntilState.DOMCONTENTLOADED));
            sleep(3000);

            // Navigate directly to the login page
            System.out.println("[*] Navigating to login page: " + LOGIN_URL);
            page.navigate(LOGIN_URL, new Page.NavigateOptions()
                    .setTimeout(30000)
                    .setWaitUntil(WaitUntilState.NETWORKIDLE));
            sleep(2000);

            // Close notification dialog FIRST (before filling credentials)
            closeNotificationDialog(page);
            screenshot(page, "after_close_dialog");

            // Download captcha image (this reloads the page)
            downloadCaptcha(page);

            // Close dialog again after page reload
            closeNotificationDialog(page);

            // Recognize captcha and perform login
            boolean loggedin = doLoginWithCaptcha(page);
            if (loggedin) {
                System.out.println("[+] Login successful!");
                sleep(2000);
                screenshot(page, "after_login");
                System.out.println("[+] Page title: " + page.title());
                System.out.println("[+] Page URL: " + page.url());

                // Open the bill list ("三联单") menu entry
                System.out.println("[*] Clicking 三联单 menu...");
                page.locator("#module_2094F683-C542-4904-B33E-0D227C4DE199").first().click();
                sleep(3000);
                screenshot(page, "after_sanliandan");
                System.out.println("[+] 三联单 page title: " + page.title());

                queryBillsForDate(page, dateStrFormatted);
                screenshot(page, "after_query");

                exportAndImportBills(page, dateStrFileName, proxyHost, proxyUser, proxyPass);
                screenshot(page, "after_export");
                System.out.println("[+] Query and export completed!");

                String content = page.textContent("body");
                if (content != null) {
                    String preview = content.length() > 500
                            ? content.substring(0, 500) + ".."
                            : content;
                    System.out.println("[+] Page content preview:\n" + preview);
                }
            } else {
                System.out.println("[-] Login failed. Check screenshots/ for debugging.");
                screenshot(page, "login_failed");
            }
        } finally {
            browser.close();
        }
    }
}

/** Fills both date filter inputs with {@code date} and submits the bill list query. */
private static void queryBillsForDate(Page page, String date) {
    System.out.println("[*] Setting date filter to: " + date);

    // Check which of the filter controls are present on the page
    boolean startDateExists = page.locator("#Search_ThreeBillList_startWdate").count() > 0;
    boolean endDateExists = page.locator("#Search_ThreeBillList_endWdate").count() > 0;
    boolean queryBtnExists = page.locator("#Search_ThreeBillList_Button").count() > 0;
    System.out.println("[*] Elements found - startDate: " + startDateExists + ", endDate: " + endDateExists + ", queryBtn: " + queryBtnExists);

    // Set the start date value directly (WdatePicker-type input)
    if (startDateExists) {
        System.out.println("[*] Setting start date to: " + date);
        page.locator("#Search_ThreeBillList_startWdate").first().fill(date);
        sleep(500);
    } else {
        System.out.println("[!] Start date element not found");
    }

    // Set the end date
    if (endDateExists) {
        System.out.println("[*] Setting end date to: " + date);
        page.locator("#Search_ThreeBillList_endWdate").first().fill(date);
        sleep(500);
    } else {
        System.out.println("[!] End date element not found");
    }

    // Click the query button and wait for the list to load
    if (queryBtnExists) {
        System.out.println("[*] Clicking query button...");
        page.locator("#Search_ThreeBillList_Button").first().click();
        try {
            // Wait for list rows to appear
            page.waitForSelector("tbody tr", new Page.WaitForSelectorOptions()
                    .setTimeout(30000));
            System.out.println("[+] Query completed, list loaded");
        } catch (Exception e) {
            System.out.println("[!] Wait for list timeout, but query was submitted");
        }
    } else {
        System.out.println("[!] Query button not found");
    }
}

/** Triggers the export download, saves it under {@code downloads/} and imports it via the proxy. */
private static void exportAndImportBills(Page page, String dateStrFileName,
        String proxyHost, String proxyUser, String proxyPass) throws IOException {
    if (page.locator("#Export_ThreeBillList_Button").count() == 0) {
        return;
    }
    System.out.println("[*] Clicking export button...");

    // Prepare the download directory
    Path downloadPath = Path.of("downloads").toAbsolutePath().normalize();
    Files.createDirectories(downloadPath);

    // Click the main export button to open the dialog, then use a JS click to trigger
    // the export button inside the dialog.
    Download dl = page.waitForDownload(
            new Page.WaitForDownloadOptions().setTimeout(300000),
            () -> {
                page.locator("#Export_ThreeBillList_Button").first().click();
                sleep(2000);
                System.out.println("[*] Triggering dialog export via JS...");
                page.evaluate("document.querySelectorAll('button').forEach(b => { if (b.textContent.trim() === '导出') b.click(); })");
            });
    System.out.println("[*] Waiting for download to complete...");
    Path savedFile = downloadPath.resolve("三联单列表_" + dateStrFileName + ".xls");
    dl.saveAs(savedFile);
    System.out.println("[+] Download saved to: " + savedFile);

    long savedSize = Files.size(savedFile);
    if (savedSize == 0) {
        System.out.println("[-] Downloaded file is empty");
    } else {
        System.out.println("[+] Download size: " + savedSize + " bytes");
        // Auto-import to ets-proxy
        autoImportBill(savedFile, proxyHost, proxyUser, proxyPass);
    }
}
/**
 * Prints the command-line usage text (options -s, -u, -p, -d, -h and an example
 * invocation) to standard output.
 */
private static void printHelp() {
System.out.println("""
ETS 三联单爬虫 - 导出并导入三联单 Excel 数据
用法: java -jar ets-playwright.jar [选项]
选项:
-s <url> ets-proxy 服务器地址
-u <user> ets-proxy 用户名
-p <pass> ets-proxy 密码
-d <date> 查询日期,格式 yyyy-MM-dd
-h 显示此帮助信息
示例:
java -jar ets-playwright.jar -s https://api.ets.niko.red -u admin -p 123456 -d 2026-05-04
""");
}
/**
 * Recognizes the previously saved captcha image and submits the login form.
 *
 * <p>The captcha is read from {@code captcha.png} inside the screenshot directory.
 * Once the form has been submitted this method reports {@code true} whether or not
 * the subsequent load-state wait succeeds; only an unrecognized captcha yields
 * {@code false}.
 *
 * @param page page currently showing the login form
 * @return {@code false} if no captcha text could be recognized, otherwise {@code true}
 * @throws Exception if captcha recognition or a page interaction fails
 */
public static boolean doLoginWithCaptcha(Page page) throws Exception {
    // Step 1: obtain the captcha text
    System.out.println("[*] Recognizing captcha with Ollama...");
    String recognized = recognizeCaptcha(SCREENSHOT_DIR.resolve("captcha.png"));
    boolean nothingRecognized = recognized == null || recognized.isEmpty();
    if (nothingRecognized) {
        System.out.println("[-] Failed to recognize captcha");
        return false;
    }
    System.out.println("[+] Captcha recognized: " + recognized);

    // Step 2: fill the form fields by their element ids, pausing briefly between them
    System.out.println("[*] Filling credentials...");
    page.locator("#inputLoginUser").first().fill(USERNAME);
    sleep(300);
    page.locator("#inputLoginPassWord").first().fill(PASSWORD);
    sleep(300);
    page.locator("#txt_ValidatePic").first().fill(recognized);
    sleep(500);

    // Step 3: submit and wait for the page to settle
    System.out.println("[*] Clicking login button...");
    page.locator("#inputLoginButton").first().click();
    Page.WaitForLoadStateOptions waitOptions = new Page.WaitForLoadStateOptions().setTimeout(10000);
    try {
        page.waitForLoadState(LoadState.DOMCONTENTLOADED, waitOptions);
    } catch (Exception e) {
        System.out.println("[!] Navigation timed out, but credentials were submitted");
    }
    // The form was submitted either way, so the attempt is reported as done.
    return true;
}
/**
 * Reloads the page and stores the captcha image response as {@code captcha.png}
 * in the screenshot directory.
 *
 * <p>Any failure is reported on standard output and otherwise ignored.
 *
 * @param page page showing the login form
 */
public static void downloadCaptcha(Page page) {
    final String captchaEndpoint = "https://101.227.180.215/SHCityEnvCW/Services/ValiDateImage.ashx*";
    try {
        Page.ReloadOptions reloadOptions = new Page.ReloadOptions()
                .setWaitUntil(WaitUntilState.NETWORKIDLE)
                .setTimeout(10000);
        // The response wait is registered first; the reload inside the callback
        // is what triggers the captcha request.
        Response captchaResponse = page.waitForResponse(
                captchaEndpoint,
                () -> page.reload(reloadOptions));
        if (captchaResponse == null) {
            return;
        }
        byte[] imageBytes = captchaResponse.body();
        Path target = SCREENSHOT_DIR.resolve("captcha.png");
        Files.write(target, imageBytes);
        System.out.println("[+] Captcha saved to: " + target);
        System.out.println("[+] Captcha size: " + imageBytes.length + " bytes");
    } catch (Exception e) {
        System.out.println("[-] Failed to download captcha: " + e.getMessage());
    }
}
/**
 * Dismisses the notification dialog (element id {@code Div_GG_Box}) if any frame
 * of the page contains it.
 *
 * @param page page whose frames are searched for the dialog
 */
public static void closeNotificationDialog(Page page) {
    // Probe every frame until one reports that it holds the dialog element.
    Frame owner = null;
    for (Frame candidate : page.frames()) {
        String probe;
        try {
            probe = (String) candidate.evaluate(
                    "() => document.getElementById('Div_GG_Box') ? 'FOUND' : 'NOT_HERE'");
        } catch (Exception ignored) {
            // A frame that cannot be evaluated is simply skipped.
            continue;
        }
        if ("FOUND".equals(probe)) {
            owner = candidate;
            break;
        }
    }

    if (owner == null) {
        System.out.println("[*] No notification dialog found");
        return;
    }

    System.out.println("[*] Closing notification dialog in frame: " + owner.url());
    // First click the close (X) button inside the owning frame ...
    owner.locator(".green_popup_close").first().click();
    sleep(500);
    // ... then hide the box via JS as well, since the page's own onclick handler
    // relies on jQuery and may fail.
    owner.evaluate("document.getElementById('Div_GG_Box').style.display = 'none';");
    sleep(500);
    System.out.println("[*] Notification dialog closed");
}
/**
 * Saves a screenshot of the page as {@code <name>_<yyyyMMdd_HHmmss>.png} in the
 * screenshot directory. Failures are reported on standard error and not propagated.
 *
 * @param page page to capture
 * @param name file name prefix describing the step being captured
 */
private static void screenshot(Page page, String name) {
    try {
        DateTimeFormatter stampFormat = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");
        String fileName = name + "_" + LocalDateTime.now().format(stampFormat) + ".png";
        Path target = SCREENSHOT_DIR.resolve(fileName);
        Page.ScreenshotOptions options = new Page.ScreenshotOptions().setPath(target);
        page.screenshot(options);
        System.out.println("[+] Screenshot saved: " + target);
    } catch (Exception failure) {
        System.err.println("[-] Screenshot failed: " + failure.getMessage());
    }
}
/**
 * Pauses the current thread for the given number of milliseconds.
 *
 * <p>If the thread is interrupted the pause ends early and the interrupt flag is
 * set again so that callers can still observe it.
 *
 * @param ms pause length in milliseconds
 */
public static void sleep(long ms) {
    boolean wasInterrupted = false;
    try {
        Thread.sleep(ms);
    } catch (InterruptedException interruption) {
        wasInterrupted = true;
    }
    if (wasInterrupted) {
        // Restore the flag that Thread.sleep cleared when it threw.
        Thread.currentThread().interrupt();
    }
}
/**
 * Reads a captcha image from disk, enlarges it when it is narrow, and asks the
 * Ollama vision model for the text it shows.
 *
 * <p>If the file cannot be decoded by {@link ImageIO}, its raw bytes are sent
 * unchanged. Otherwise the image is scaled by a whole-number factor so that its
 * width is at least 200 pixels and re-encoded as PNG.
 *
 * @param imagePath path of the captcha image file
 * @return the normalized text returned by the model (may be empty)
 * @throws Exception if the file cannot be read or the Ollama request fails
 */
public static String recognizeCaptcha(Path imagePath) throws Exception {
    final int minWidth = 200;
    byte[] imageBytes = Files.readAllBytes(imagePath);

    BufferedImage srcImage = ImageIO.read(new ByteArrayInputStream(imageBytes));
    if (srcImage == null) {
        // No registered reader understands the format: pass the original bytes through.
        return callOllama(Base64.getEncoder().encodeToString(imageBytes));
    }

    // Round the factor UP: with floor division a 120px-wide image would get factor 1
    // and stay below the 200px minimum.
    int srcWidth = srcImage.getWidth();
    int scale = Math.max(1, (minWidth + srcWidth - 1) / srcWidth);
    int newWidth = srcWidth * scale;
    int newHeight = srcImage.getHeight() * scale;

    // Draw the source scaled directly into the target. Unlike Image.getScaledInstance,
    // this completes synchronously, so the PNG never captures a partially rendered image.
    BufferedImage resized = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
    Graphics2D g2d = resized.createGraphics();
    try {
        g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION,
                RenderingHints.VALUE_INTERPOLATION_BICUBIC);
        g2d.drawImage(srcImage, 0, 0, newWidth, newHeight, null);
    } finally {
        g2d.dispose();
    }

    ByteArrayOutputStream pngOut = new ByteArrayOutputStream();
    ImageIO.write(resized, "png", pngOut);
    return callOllama(Base64.getEncoder().encodeToString(pngOut.toByteArray()));
}
/**
 * Sends a base64-encoded image to the Ollama {@code /api/chat} endpoint and returns
 * the recognized captcha text.
 *
 * <p>The response is read line by line, each line being treated as one JSON object
 * of a streamed reply; the {@code "content"} string of every line is concatenated
 * until a line containing {@code "done":true} is seen.
 *
 * @param base64Image image data, base64 encoded
 * @return the concatenated reply after {@link #normalizeCaptcha} has been applied
 * @throws Exception if the HTTP exchange fails
 */
private static String callOllama(String base64Image) throws Exception {
    String json = "{"
            + "\"model\":\"" + OLLAMA_MODEL + "\","
            + "\"messages\":["
            + " {"
            + " \"role\":\"user\","
            + " \"content\":\"识别图中的验证码文字,只返回文字内容\","
            + " \"images\":[\"" + base64Image + "\"]"
            + " }"
            + "]"
            + "}";
    java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;

    HttpURLConnection conn = (HttpURLConnection) new URL(OLLAMA_URL + "/api/chat").openConnection();
    try {
        conn.setRequestMethod("POST");
        conn.setConnectTimeout(15000);
        conn.setReadTimeout(60000);
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json; charset=utf-8");
        try (OutputStream requestBody = conn.getOutputStream()) {
            requestBody.write(json.getBytes(utf8));
        }

        StringBuilder fullContent = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), utf8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Each line is parsed as a separate JSON object (streaming response)
                fullContent.append(extractContentValue(line));
                // Check for done marker
                if (line.contains("\"done\":true")) {
                    break;
                }
            }
        }
        return normalizeCaptcha(fullContent.toString());
    } finally {
        // Runs even when opening the response stream fails, so the connection never leaks.
        conn.disconnect();
    }
}

/**
 * Returns the decoded string value of the first {@code "content":} key found in a
 * JSON line, or an empty string when the key is absent, its value is not a string,
 * or the string is not terminated on this line.
 */
private static String extractContentValue(String line) {
    final String key = "\"content\":";
    int keyIdx = line.indexOf(key);
    if (keyIdx < 0) {
        return "";
    }
    int pos = keyIdx + key.length();
    while (pos < line.length() && Character.isWhitespace(line.charAt(pos))) {
        pos++;
    }
    if (pos >= line.length() || line.charAt(pos) != '"') {
        return "";
    }

    StringBuilder value = new StringBuilder();
    for (pos++; pos < line.length(); pos++) {
        char c = line.charAt(pos);
        if (c == '"') {
            return value.toString();
        }
        if (c != '\\') {
            value.append(c);
            continue;
        }
        // Backslash escape: an escaped quote must not end the value, and the
        // common JSON escapes are decoded to the characters they stand for.
        if (++pos >= line.length()) {
            break;
        }
        char esc = line.charAt(pos);
        switch (esc) {
            case 'n':
                value.append('\n');
                break;
            case 'r':
                value.append('\r');
                break;
            case 't':
                value.append('\t');
                break;
            case 'b':
                value.append('\b');
                break;
            case 'f':
                value.append('\f');
                break;
            case 'u':
                if (pos + 4 < line.length()) {
                    try {
                        value.append((char) Integer.parseInt(line.substring(pos + 1, pos + 5), 16));
                    } catch (NumberFormatException ignored) {
                        // Malformed \\u escape: drop it rather than abort the whole reply.
                    }
                    pos += 4;
                }
                break;
            default:
                // Covers \" \\ and \/ : the escaped character stands for itself.
                value.append(esc);
                break;
        }
    }
    return "";
}
/**
 * Cleans up raw model output so that only the captcha characters remain.
 *
 * <p>All whitespace is removed, then any run of backticks, single quotes or double
 * quotes is stripped from both the start and the end (for example a reply wrapped
 * in a Markdown code fence).
 *
 * @param raw text returned by the model, may be {@code null}
 * @return the cleaned text, or an empty string for {@code null} or blank input
 */
private static String normalizeCaptcha(String raw) {
    if (raw == null || raw.isBlank()) {
        return "";
    }
    String compact = raw.strip().replaceAll("\\s+", "");
    // Both ends use '+' so that a multi-character wrapper such as ``` is removed in
    // full; a single-character class at the start would leave "``AB12" behind.
    return compact.replaceAll("^[`'\"]+|[`'\"]+$", "");
}
/**
 * Logs in to the proxy and, when that yields a token, uploads the exported file.
 *
 * @param filePath  exported file to upload
 * @param proxyHost base URL of the proxy server
 * @param username  proxy user name
 * @param password  proxy password
 */
public static void autoImportBill(Path filePath, String proxyHost, String username, String password) {
    String accessToken = proxyLogin(proxyHost, username, password);
    if (accessToken != null) {
        proxyImport(filePath, proxyHost, accessToken);
        return;
    }
    // Without a token the upload cannot be authorized, so it is skipped.
    System.out.println("[-] Proxy login failed, skipping import");
}
/**
 * Logs in to the proxy server and returns the access token from its reply.
 *
 * <p>Sends a body-less POST to {@code <proxyHost>/api/auth/login} with the
 * credentials as URL-encoded query parameters, then extracts the string value of
 * the {@code "accessToken"} key from the response body.
 *
 * @param proxyHost base URL of the proxy server
 * @param username  proxy user name
 * @param password  proxy password
 * @return the access token, or {@code null} if the request fails or the response
 *         does not contain both {@code "data"} and a string {@code "accessToken"}
 */
public static String proxyLogin(String proxyHost, String username, String password) {
    try {
        // Encode the credentials: a raw space makes URI.create throw, and a raw '&'
        // or '=' would silently change which parameters the server receives.
        java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
        String loginUrl = proxyHost + "/api/auth/login"
                + "?username=" + java.net.URLEncoder.encode(username, utf8)
                + "&password=" + java.net.URLEncoder.encode(password, utf8);
        java.net.URI uri = java.net.URI.create(loginUrl);
        java.net.http.HttpClient client = java.net.http.HttpClient.newBuilder()
                .connectTimeout(java.time.Duration.ofSeconds(10))
                .build();
        java.net.http.HttpRequest request = java.net.http.HttpRequest.newBuilder()
                .uri(uri)
                .POST(java.net.http.HttpRequest.BodyPublishers.noBody())
                .header("Content-Type", "application/json")
                .build();
        java.net.http.HttpResponse<String> response =
                client.send(request, java.net.http.HttpResponse.BodyHandlers.ofString());
        String body = response.body();

        final String tokenKey = "\"accessToken\"";
        int dataIdx = body.indexOf("\"data\"");
        int tokenIdx = body.indexOf(tokenKey);
        if (dataIdx < 0 || tokenIdx < 0) {
            System.out.println("[-] Login response unexpected: " + body.substring(0, Math.min(200, body.length())));
            return null;
        }

        // Extract accessToken value: key, then ':', then the quoted string.
        int colon = body.indexOf(':', tokenIdx + tokenKey.length());
        if (colon < 0) {
            return null;
        }
        int quoteStart = body.indexOf('"', colon + 1);
        if (quoteStart < 0) {
            return null;
        }
        int quoteEnd = body.indexOf('"', quoteStart + 1);
        if (quoteEnd < 0) {
            return null;
        }
        String token = body.substring(quoteStart + 1, quoteEnd);
        System.out.println("[+] Proxy login successful, token: " + token.substring(0, Math.min(20, token.length())) + "...");
        return token;
    } catch (InterruptedException e) {
        // Keep the interrupt visible to callers instead of swallowing it.
        Thread.currentThread().interrupt();
        System.out.println("[-] Proxy login failed: " + e.getMessage());
        return null;
    } catch (Exception e) {
        System.out.println("[-] Proxy login failed: " + e.getMessage());
        return null;
    }
}
/**
 * Uploads a file to {@code <proxyHost>/api/bill/import} as a
 * {@code multipart/form-data} POST with a single part named {@code file}.
 *
 * <p>The token is sent verbatim in the {@code authorization} header. The status
 * code and body of the response are printed; failures are printed and not
 * propagated.
 *
 * @param filePath  file to upload
 * @param proxyHost base URL of the proxy server
 * @param token     access token obtained from {@link #proxyLogin}
 */
public static void proxyImport(Path filePath, String proxyHost, String token) {
    try {
        // The part headers carry the file name, which may contain non-ASCII characters;
        // encode them as UTF-8 explicitly instead of relying on the platform default.
        java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
        String fileName = filePath.toFile().getName();
        String boundary = "----FormBoundary" + System.currentTimeMillis();
        String boundaryLine = "--" + boundary;
        java.net.URI uri = java.net.URI.create(proxyHost + "/api/bill/import");

        java.io.ByteArrayOutputStream entity = new java.io.ByteArrayOutputStream();
        // File part headers
        entity.write((boundaryLine + "\r\n").getBytes(utf8));
        entity.write(("Content-Disposition: form-data; name=\"file\"; filename=\"" + fileName + "\"\r\n").getBytes(utf8));
        entity.write("Content-Type: application/octet-stream\r\n\r\n".getBytes(utf8));
        // File bytes
        entity.write(Files.readAllBytes(filePath));
        // Closing boundary
        entity.write(("\r\n" + boundaryLine + "--\r\n").getBytes(utf8));
        byte[] entityBytes = entity.toByteArray();

        java.net.http.HttpClient client = java.net.http.HttpClient.newBuilder()
                .connectTimeout(java.time.Duration.ofSeconds(30))
                .build();
        java.net.http.HttpRequest request = java.net.http.HttpRequest.newBuilder()
                .uri(uri)
                .header("Content-Type", "multipart/form-data; boundary=" + boundary)
                .header("authorization", token)
                .POST(java.net.http.HttpRequest.BodyPublishers.ofByteArray(entityBytes))
                .build();
        java.net.http.HttpResponse<String> response =
                client.send(request, java.net.http.HttpResponse.BodyHandlers.ofString());
        String body = response.body();
        System.out.println("[+] Import response (" + response.statusCode() + "): " + body);
    } catch (InterruptedException e) {
        // Keep the interrupt visible to callers instead of swallowing it.
        Thread.currentThread().interrupt();
        System.out.println("[-] Import failed: " + e.getMessage());
    } catch (Exception e) {
        System.out.println("[-] Import failed: " + e.getMessage());
        e.printStackTrace();
    }
}
}