// Listing metadata: 556 lines, 24 KiB, Java
package com.ets.scraper;
|
||
|
||
import com.microsoft.playwright.*;
|
||
import com.microsoft.playwright.Frame;
|
||
import com.microsoft.playwright.options.LoadState;
|
||
import com.microsoft.playwright.options.WaitUntilState;
|
||
|
||
import javax.imageio.ImageIO;
|
||
import java.awt.*;
|
||
import java.awt.image.BufferedImage;
|
||
import java.io.*;
|
||
import java.net.HttpURLConnection;
|
||
import java.net.URL;
|
||
import java.nio.file.Files;
|
||
import java.nio.file.Path;
|
||
import java.time.LocalDateTime;
|
||
import java.time.format.DateTimeFormatter;
|
||
import java.util.Base64;
|
||
|
||
import static java.nio.file.Files.createDirectories;
|
||
|
||
/**
 * ETS (Construction Waste Management Platform) web scraper.
 *
 * <p>Uses Playwright to automate login and data extraction.
 */
|
||
public class EtsScraper {
|
||
public static final String FRAME_URL = "https://101.227.180.215/SHCityEnvCW/CWS/frame.html";
|
||
public static final String LOGIN_URL = "https://101.227.180.215/SHCityEnvCW/CWS/userlogin.html";
|
||
private static final String USERNAME = "sccw";
|
||
private static final String PASSWORD = "slife@123";
|
||
private static final Path SCREENSHOT_DIR = Path.of("screenshots");
|
||
private static final String OLLAMA_URL = "http://10.0.1.39:11434";
|
||
private static final String OLLAMA_MODEL = "qwen3-vl:4b";
|
||
private static final String PROXY_HOST = "http://127.0.0.1:8081";
|
||
|
||
public static void main(String[] args) throws Exception {
|
||
// Parse CLI arguments
|
||
String proxyHost = null;
|
||
String proxyUser = null;
|
||
String proxyPass = null;
|
||
String dateStr = null;
|
||
|
||
for (int i = 0; i < args.length; i++) {
|
||
switch (args[i]) {
|
||
case "-h":
|
||
printHelp();
|
||
return;
|
||
case "-s":
|
||
proxyHost = args[++i];
|
||
break;
|
||
case "-u":
|
||
proxyUser = args[++i];
|
||
break;
|
||
case "-p":
|
||
proxyPass = args[++i];
|
||
break;
|
||
case "-d":
|
||
dateStr = args[++i];
|
||
break;
|
||
default:
|
||
System.err.println("[-] Unknown option: " + args[i]);
|
||
printHelp();
|
||
return;
|
||
}
|
||
}
|
||
|
||
if (proxyHost == null || proxyUser == null || proxyPass == null || dateStr == null) {
|
||
System.err.println("[-] Missing required arguments");
|
||
printHelp();
|
||
return;
|
||
}
|
||
|
||
java.time.LocalDate targetDate;
|
||
try {
|
||
targetDate = java.time.LocalDate.parse(dateStr);
|
||
} catch (Exception e) {
|
||
System.err.println("[-] Invalid date format: " + dateStr + ", expected yyyy-MM-dd");
|
||
return;
|
||
}
|
||
String dateStrFormatted = targetDate.format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd"));
|
||
|
||
String dateStrFileName = targetDate.format(DateTimeFormatter.ofPattern("yyyyMMdd"));
|
||
|
||
try {
|
||
createDirectories(SCREENSHOT_DIR);
|
||
} catch (Exception e) {
|
||
System.err.println("Failed to create directories: " + e.getMessage());
|
||
}
|
||
|
||
try (Playwright playwright = Playwright.create()) {
|
||
Browser browser = playwright.chromium().launch(
|
||
new BrowserType.LaunchOptions().setHeadless(false)
|
||
);
|
||
BrowserContext context = browser.newContext(
|
||
new Browser.NewContextOptions().setIgnoreHTTPSErrors(true)
|
||
);
|
||
Page page = context.newPage();
|
||
|
||
try {
|
||
// Navigate to frame.html first to establish session/cookies
|
||
System.out.println("[*] Establishing session via " + FRAME_URL);
|
||
page.navigate(FRAME_URL, new Page.NavigateOptions()
|
||
.setTimeout(30000)
|
||
.setWaitUntil(WaitUntilState.DOMCONTENTLOADED));
|
||
sleep(3000);
|
||
|
||
// Navigate directly to the login page
|
||
System.out.println("[*] Navigating to login page: " + LOGIN_URL);
|
||
page.navigate(LOGIN_URL, new Page.NavigateOptions()
|
||
.setTimeout(30000)
|
||
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||
sleep(2000);
|
||
|
||
// Close notification dialog FIRST (before filling credentials)
|
||
closeNotificationDialog(page);
|
||
|
||
screenshot(page, "after_close_dialog");
|
||
|
||
// Download captcha image
|
||
downloadCaptcha(page);
|
||
|
||
// Close dialog again after page reload
|
||
closeNotificationDialog(page);
|
||
|
||
// Recognize captcha and perform login
|
||
boolean loggedin = doLoginWithCaptcha(page);
|
||
|
||
if (loggedin) {
|
||
System.out.println("[+] Login successful!");
|
||
sleep(2000);
|
||
|
||
screenshot(page, "after_login");
|
||
|
||
System.out.println("[+] Page title: " + page.title());
|
||
System.out.println("[+] Page URL: " + page.url());
|
||
|
||
// 点击三联单菜单
|
||
System.out.println("[*] Clicking 三联单 menu...");
|
||
page.locator("#module_2094F683-C542-4904-B33E-0D227C4DE199").first().click();
|
||
sleep(3000);
|
||
|
||
screenshot(page, "after_sanliandan");
|
||
System.out.println("[+] 三联单 page title: " + page.title());
|
||
|
||
// 设置日期筛选
|
||
System.out.println("[*] Setting date filter to: " + dateStrFormatted);
|
||
|
||
// 检查元素是否存在
|
||
boolean startDateExists = page.locator("#Search_ThreeBillList_startWdate").count() > 0;
|
||
boolean endDateExists = page.locator("#Search_ThreeBillList_endWdate").count() > 0;
|
||
boolean queryBtnExists = page.locator("#Search_ThreeBillList_Button").count() > 0;
|
||
System.out.println("[*] Elements found - startDate: " + startDateExists + ", endDate: " + endDateExists + ", queryBtn: " + queryBtnExists);
|
||
|
||
// 直接设置日期值(WdatePicker 类型输入框)
|
||
if (startDateExists) {
|
||
System.out.println("[*] Setting start date to: " + dateStrFormatted);
|
||
page.locator("#Search_ThreeBillList_startWdate").first().fill(dateStrFormatted);
|
||
sleep(500);
|
||
} else {
|
||
System.out.println("[!] Start date element not found");
|
||
}
|
||
|
||
// 设置结束日期
|
||
if (endDateExists) {
|
||
System.out.println("[*] Setting end date to: " + dateStrFormatted);
|
||
page.locator("#Search_ThreeBillList_endWdate").first().fill(dateStrFormatted);
|
||
sleep(500);
|
||
} else {
|
||
System.out.println("[!] End date element not found");
|
||
}
|
||
|
||
// 点击查询按钮,等待列表加载
|
||
if (queryBtnExists) {
|
||
System.out.println("[*] Clicking query button...");
|
||
page.locator("#Search_ThreeBillList_Button").first().click();
|
||
|
||
// 等待列表内容出现
|
||
try {
|
||
page.waitForSelector("tbody tr", new Page.WaitForSelectorOptions()
|
||
.setTimeout(30000));
|
||
System.out.println("[+] Query completed, list loaded");
|
||
} catch (Exception e) {
|
||
System.out.println("[!] Wait for list timeout, but query was submitted");
|
||
}
|
||
} else {
|
||
System.out.println("[!] Query button not found");
|
||
}
|
||
|
||
screenshot(page, "after_query");
|
||
|
||
// 点击导出按钮
|
||
if (page.locator("#Export_ThreeBillList_Button").count() > 0) {
|
||
System.out.println("[*] Clicking export button...");
|
||
// 设置下载目录
|
||
Path downloadPath = Path.of("downloads").toAbsolutePath().normalize();
|
||
java.nio.file.Files.createDirectories(downloadPath);
|
||
// 点击主导出按钮打开对话框,再用 JS click 触发对话框内导出按钮
|
||
Download dl = page.waitForDownload(
|
||
new Page.WaitForDownloadOptions().setTimeout(300000),
|
||
() -> {
|
||
page.locator("#Export_ThreeBillList_Button").first().click();
|
||
sleep(2000);
|
||
System.out.println("[*] Triggering dialog export via JS...");
|
||
page.evaluate("document.querySelectorAll('button').forEach(b => { if (b.textContent.trim() === '导出') b.click(); })");
|
||
});
|
||
System.out.println("[*] Waiting for download to complete...");
|
||
Path savedFile = downloadPath.resolve("三联单列表_" + dateStrFileName + ".xls");
|
||
dl.saveAs(savedFile);
|
||
System.out.println("[+] Download saved to: " + savedFile);
|
||
if (java.nio.file.Files.size(savedFile) == 0) {
|
||
System.out.println("[-] Downloaded file is empty");
|
||
} else {
|
||
System.out.println("[+] Download size: " + java.nio.file.Files.size(savedFile) + " bytes");
|
||
// Auto-import to ets-proxy
|
||
autoImportBill(savedFile, proxyHost, proxyUser, proxyPass);
|
||
}
|
||
}
|
||
screenshot(page, "after_export");
|
||
System.out.println("[+] Query and export completed!");
|
||
|
||
String content = page.textContent("body");
|
||
if (content != null) {
|
||
String preview = content.length() > 500
|
||
? content.substring(0, 500) + ".."
|
||
: content;
|
||
System.out.println("[+] Page content preview:\n" + preview);
|
||
}
|
||
} else {
|
||
System.out.println("[-] Login failed. Check screenshots/ for debugging.");
|
||
screenshot(page, "login_failed");
|
||
}
|
||
} finally {
|
||
browser.close();
|
||
}
|
||
}
|
||
}
|
||
|
||
private static void printHelp() {
|
||
System.out.println("""
|
||
ETS 三联单爬虫 - 导出并导入三联单 Excel 数据
|
||
|
||
用法: java -jar ets-playwright.jar [选项]
|
||
|
||
选项:
|
||
-s <url> ets-proxy 服务器地址
|
||
-u <user> ets-proxy 用户名
|
||
-p <pass> ets-proxy 密码
|
||
-d <date> 查询日期,格式 yyyy-MM-dd
|
||
-h 显示此帮助信息
|
||
|
||
示例:
|
||
java -jar ets-playwright.jar -s https://api.ets.niko.red -u admin -p 123456 -d 2026-05-04
|
||
""");
|
||
}
|
||
|
||
public static boolean doLoginWithCaptcha(Page page) throws Exception {
|
||
// Recognize captcha first
|
||
Path captchaPath = SCREENSHOT_DIR.resolve("captcha.png");
|
||
System.out.println("[*] Recognizing captcha with Ollama...");
|
||
String captchaText = recognizeCaptcha(captchaPath);
|
||
if (captchaText == null || captchaText.isEmpty()) {
|
||
System.out.println("[-] Failed to recognize captcha");
|
||
return false;
|
||
}
|
||
System.out.println("[+] Captcha recognized: " + captchaText);
|
||
|
||
// Fill using correct ID selectors
|
||
System.out.println("[*] Filling credentials...");
|
||
page.locator("#inputLoginUser").first().fill(USERNAME);
|
||
sleep(300);
|
||
page.locator("#inputLoginPassWord").first().fill(PASSWORD);
|
||
sleep(300);
|
||
page.locator("#txt_ValidatePic").first().fill(captchaText);
|
||
sleep(500);
|
||
|
||
// Click submit button
|
||
System.out.println("[*] Clicking login button...");
|
||
page.locator("#inputLoginButton").first().click();
|
||
|
||
try {
|
||
page.waitForLoadState(LoadState.DOMCONTENTLOADED,
|
||
new Page.WaitForLoadStateOptions().setTimeout(10000));
|
||
return true;
|
||
} catch (Exception e) {
|
||
System.out.println("[!] Navigation timed out, but credentials were submitted");
|
||
return true;
|
||
}
|
||
}
|
||
|
||
public static void downloadCaptcha(Page page) {
|
||
try {
|
||
// Set up listener FIRST, then reload to trigger the request
|
||
Response resp = page.waitForResponse(
|
||
"https://101.227.180.215/SHCityEnvCW/Services/ValiDateImage.ashx*",
|
||
() -> {
|
||
page.reload(new Page.ReloadOptions()
|
||
.setWaitUntil(WaitUntilState.NETWORKIDLE)
|
||
.setTimeout(10000));
|
||
}
|
||
);
|
||
if (resp != null) {
|
||
byte[] body = resp.body();
|
||
Path captchaPath = SCREENSHOT_DIR.resolve("captcha.png");
|
||
java.nio.file.Files.write(captchaPath, body);
|
||
System.out.println("[+] Captcha saved to: " + captchaPath);
|
||
System.out.println("[+] Captcha size: " + body.length + " bytes");
|
||
}
|
||
} catch (Exception e) {
|
||
System.out.println("[-] Failed to download captcha: " + e.getMessage());
|
||
}
|
||
}
|
||
|
||
public static void closeNotificationDialog(Page page) {
|
||
// Find the frame that contains the notification dialog
|
||
Frame dialogFrame = null;
|
||
for (Frame f : page.frames()) {
|
||
try {
|
||
String hasDialog = (String) f.evaluate(
|
||
"() => document.getElementById('Div_GG_Box') ? 'FOUND' : 'NOT_HERE'");
|
||
if ("FOUND".equals(hasDialog)) {
|
||
dialogFrame = f;
|
||
break;
|
||
}
|
||
} catch (Exception ignored) {
|
||
}
|
||
}
|
||
|
||
if (dialogFrame == null) {
|
||
System.out.println("[*] No notification dialog found");
|
||
return;
|
||
}
|
||
|
||
System.out.println("[*] Closing notification dialog in frame: " + dialogFrame.url());
|
||
// Click the X button in the correct frame
|
||
dialogFrame.locator(".green_popup_close").first().click();
|
||
sleep(500);
|
||
|
||
// Force hide via JS in the correct frame (onclick uses jQuery which may fail)
|
||
dialogFrame.evaluate("document.getElementById('Div_GG_Box').style.display = 'none';");
|
||
sleep(500);
|
||
|
||
System.out.println("[*] Notification dialog closed");
|
||
}
|
||
|
||
private static void screenshot(Page page, String name) {
|
||
try {
|
||
String timestamp = LocalDateTime.now()
|
||
.format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
|
||
Path path = SCREENSHOT_DIR.resolve(name + "_" + timestamp + ".png");
|
||
page.screenshot(new Page.ScreenshotOptions().setPath(path));
|
||
System.out.println("[+] Screenshot saved: " + path);
|
||
} catch (Exception e) {
|
||
System.err.println("[-] Screenshot failed: " + e.getMessage());
|
||
}
|
||
}
|
||
|
||
public static void sleep(long ms) {
|
||
try {
|
||
Thread.sleep(ms);
|
||
} catch (InterruptedException e) {
|
||
Thread.currentThread().interrupt();
|
||
}
|
||
}
|
||
|
||
public static String recognizeCaptcha(Path imagePath) throws Exception {
|
||
byte[] imageBytes = Files.readAllBytes(imagePath);
|
||
|
||
// Convert GIF to PNG and resize (Ollama qwen3-vl needs larger PNG images)
|
||
ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes);
|
||
BufferedImage srcImage = ImageIO.read(bais);
|
||
if (srcImage == null) {
|
||
String base64 = Base64.getEncoder().encodeToString(imageBytes);
|
||
return callOllama(base64);
|
||
}
|
||
|
||
// Resize to at least 200px width for better recognition
|
||
int scale = Math.max(1, 200 / srcImage.getWidth());
|
||
if (scale < 1) scale = 1;
|
||
int newWidth = srcImage.getWidth() * scale;
|
||
int newHeight = srcImage.getHeight() * scale;
|
||
|
||
Image scaled = srcImage.getScaledInstance(newWidth, newHeight, Image.SCALE_SMOOTH);
|
||
BufferedImage resized = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
|
||
Graphics2D g2d = resized.createGraphics();
|
||
g2d.drawImage(scaled, 0, 0, null);
|
||
g2d.dispose();
|
||
|
||
ByteArrayOutputStream pngOut = new ByteArrayOutputStream();
|
||
ImageIO.write(resized, "png", pngOut);
|
||
byte[] pngBytes = pngOut.toByteArray();
|
||
String base64 = Base64.getEncoder().encodeToString(pngBytes);
|
||
return callOllama(base64);
|
||
}
|
||
|
||
private static String callOllama(String base64Image) throws Exception {
|
||
String json = "{"
|
||
+ "\"model\":\"" + OLLAMA_MODEL + "\","
|
||
+ "\"messages\":["
|
||
+ " {"
|
||
+ " \"role\":\"user\","
|
||
+ " \"content\":\"识别图中的验证码文字,只返回文字内容\","
|
||
+ " \"images\":[\"" + base64Image + "\"]"
|
||
+ " }"
|
||
+ "]"
|
||
+ "}";
|
||
|
||
URL url = new URL(OLLAMA_URL + "/api/chat");
|
||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||
conn.setRequestMethod("POST");
|
||
conn.setConnectTimeout(15000);
|
||
conn.setReadTimeout(60000);
|
||
conn.setDoOutput(true);
|
||
conn.setRequestProperty("Content-Type", "application/json; charset=utf-8");
|
||
|
||
conn.getOutputStream().write(json.getBytes("utf-8"));
|
||
conn.getOutputStream().flush();
|
||
conn.getOutputStream().close();
|
||
|
||
BufferedReader reader = new BufferedReader(
|
||
new InputStreamReader(conn.getInputStream(), "utf-8"));
|
||
try {
|
||
StringBuilder fullContent = new StringBuilder();
|
||
String line;
|
||
while ((line = reader.readLine()) != null) {
|
||
// Parse each line as a separate JSON object (streaming response)
|
||
int contentIdx = line.indexOf("\"content\":");
|
||
if (contentIdx >= 0) {
|
||
int start = line.indexOf('"', contentIdx + 10) + 1;
|
||
int end = line.indexOf('"', start);
|
||
if (start > 0 && end > start) {
|
||
fullContent.append(line.substring(start, end));
|
||
}
|
||
}
|
||
// Check for done marker
|
||
if (line.contains("\"done\":true")) {
|
||
break;
|
||
}
|
||
}
|
||
return normalizeCaptcha(fullContent.toString());
|
||
} finally {
|
||
if (reader != null) {
|
||
try {
|
||
reader.close();
|
||
} catch (IOException e) {
|
||
// ignore
|
||
}
|
||
}
|
||
conn.disconnect();
|
||
}
|
||
}
|
||
|
||
private static String normalizeCaptcha(String raw) {
|
||
if (raw == null || raw.isBlank()) {
|
||
return "";
|
||
}
|
||
String s = raw.strip().replaceAll("\\s+", "");
|
||
s = s.replaceAll("^[`'\\\"]|[`'\\\"]+$", "");
|
||
return s;
|
||
}
|
||
|
||
public static void autoImportBill(Path filePath, String proxyHost, String username, String password) {
|
||
String token = proxyLogin(proxyHost, username, password);
|
||
if (token == null) {
|
||
System.out.println("[-] Proxy login failed, skipping import");
|
||
return;
|
||
}
|
||
proxyImport(filePath, proxyHost, token);
|
||
}
|
||
|
||
public static String proxyLogin(String proxyHost, String username, String password) {
|
||
try {
|
||
String loginUrl = proxyHost + "/api/auth/login?username=" + username + "&password=" + password;
|
||
java.net.URI uri = java.net.URI.create(loginUrl);
|
||
java.net.http.HttpClient client = java.net.http.HttpClient.newBuilder()
|
||
.connectTimeout(java.time.Duration.ofSeconds(10))
|
||
.build();
|
||
java.net.http.HttpRequest request = java.net.http.HttpRequest.newBuilder()
|
||
.uri(uri)
|
||
.POST(java.net.http.HttpRequest.BodyPublishers.noBody())
|
||
.header("Content-Type", "application/json")
|
||
.build();
|
||
java.net.http.HttpResponse<String> response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofString());
|
||
String body = response.body();
|
||
int dataIdx = body.indexOf("\"data\"");
|
||
int tokenIdx = body.indexOf("\"accessToken\"");
|
||
if (dataIdx < 0 || tokenIdx < 0) {
|
||
System.out.println("[-] Login response unexpected: " + body.substring(0, Math.min(200, body.length())));
|
||
return null;
|
||
}
|
||
// Extract accessToken value
|
||
int colonStart = body.indexOf("\":", tokenIdx);
|
||
if (colonStart < 0) return null;
|
||
int quoteStart = body.indexOf("\"", colonStart + 2);
|
||
int quoteEnd = body.indexOf("\"", quoteStart + 1);
|
||
if (quoteStart < 0 || quoteEnd <= quoteStart) return null;
|
||
String token = body.substring(quoteStart + 1, quoteEnd);
|
||
System.out.println("[+] Proxy login successful, token: " + token.substring(0, Math.min(20, token.length())) + "...");
|
||
return token;
|
||
} catch (Exception e) {
|
||
System.out.println("[-] Proxy login failed: " + e.getMessage());
|
||
return null;
|
||
}
|
||
}
|
||
|
||
public static void proxyImport(Path filePath, String proxyHost, String token) {
|
||
try {
|
||
java.io.File file = filePath.toFile();
|
||
String boundary = "----FormBoundary" + System.currentTimeMillis();
|
||
String boundaryLine = "--" + boundary;
|
||
java.net.URI uri = java.net.URI.create(proxyHost + "/api/bill/import");
|
||
|
||
java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
|
||
java.io.OutputStream os = out;
|
||
|
||
// Write file part
|
||
os.write((boundaryLine + "\r\n").getBytes());
|
||
os.write(("Content-Disposition: form-data; name=\"file\"; filename=\"" + file.getName() + "\"\r\n").getBytes());
|
||
os.write("Content-Type: application/octet-stream\r\n\r\n".getBytes());
|
||
os.flush();
|
||
|
||
// Write file bytes
|
||
try (java.io.FileInputStream fis = new java.io.FileInputStream(file)) {
|
||
byte[] buf = new byte[8192];
|
||
int n;
|
||
while ((n = fis.read(buf)) > 0) {
|
||
os.write(buf, 0, n);
|
||
}
|
||
}
|
||
os.flush();
|
||
|
||
// Write closing boundary
|
||
os.write(("\r\n" + boundaryLine + "--\r\n").getBytes());
|
||
os.flush();
|
||
|
||
byte[] entityBytes = out.toByteArray();
|
||
|
||
java.net.http.HttpClient client = java.net.http.HttpClient.newBuilder()
|
||
.connectTimeout(java.time.Duration.ofSeconds(30))
|
||
.build();
|
||
java.net.http.HttpRequest request = java.net.http.HttpRequest.newBuilder()
|
||
.uri(uri)
|
||
.header("Content-Type", "multipart/form-data; boundary=" + boundary)
|
||
.header("authorization", token)
|
||
.POST(java.net.http.HttpRequest.BodyPublishers.ofByteArray(entityBytes))
|
||
.build();
|
||
|
||
java.net.http.HttpResponse<String> response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofString());
|
||
String body = response.body();
|
||
System.out.println("[+] Import response (" + response.statusCode() + "): " + body);
|
||
} catch (Exception e) {
|
||
System.out.println("[-] Import failed: " + e.getMessage());
|
||
e.printStackTrace();
|
||
}
|
||
}
|
||
}
|