diff --git a/src/main/java/com/ets/scraper/EtsScraper.java b/src/main/java/com/ets/scraper/EtsScraper.java index c605772..babbb80 100644 --- a/src/main/java/com/ets/scraper/EtsScraper.java +++ b/src/main/java/com/ets/scraper/EtsScraper.java @@ -11,6 +11,8 @@ import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.awt.image.BufferedImage; +import java.awt.Image; +import java.awt.Graphics2D; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.file.Files; @@ -38,46 +40,46 @@ public class EtsScraper { public static void main(String[] args) throws Exception { try { createDirectories(SCREENSHOT_DIR); - } catch (Exception e) { + } catch (Exception e) { System.err.println("Failed to create screenshots dir: " + e.getMessage()); - } + } try (Playwright playwright = Playwright.create()) { Browser browser = playwright.chromium().launch( new BrowserType.LaunchOptions().setHeadless(false) - ); + ); BrowserContext context = browser.newContext( new Browser.NewContextOptions().setIgnoreHTTPSErrors(true) - ); + ); Page page = context.newPage(); try { - // Navigate to frame.html first to establish session/cookies + // Navigate to frame.html first to establish session/cookies System.out.println("[*] Establishing session via " + FRAME_URL); page.navigate(FRAME_URL, new Page.NavigateOptions() - .setTimeout(30000) - .setWaitUntil(WaitUntilState.DOMCONTENTLOADED)); + .setTimeout(30000) + .setWaitUntil(WaitUntilState.DOMCONTENTLOADED)); sleep(3000); - // Navigate directly to the login page + // Navigate directly to the login page System.out.println("[*] Navigating to login page: " + LOGIN_URL); page.navigate(LOGIN_URL, new Page.NavigateOptions() - .setTimeout(30000) - .setWaitUntil(WaitUntilState.NETWORKIDLE)); + .setTimeout(30000) + .setWaitUntil(WaitUntilState.NETWORKIDLE)); sleep(2000); - // Close notification dialog FIRST (before filling credentials) + // Close notification dialog FIRST (before filling credentials) closeNotificationDialog(page); screenshot(page, "after_close_dialog"); - // Download captcha image + // Download captcha image downloadCaptcha(page); - // Close dialog again after page reload + // Close dialog again after page reload closeNotificationDialog(page); - // Recognize captcha and perform login + // Recognize captcha and perform login boolean loggedin = doLoginWithCaptcha(page); if (loggedin) { @@ -92,125 +94,124 @@ public class EtsScraper { String content = page.textContent("body"); if (content != null) { String preview = content.length() > 500 - ? content.substring(0, 500) + "..." - : content; + ? content.substring(0, 500) + ".." + : content; System.out.println("[+] Page content preview:\n" + preview); - } - } else { + } + } else { System.out.println("[-] Login failed. Check screenshots/ for debugging."); screenshot(page, "login_failed"); - } - } finally { + } + } finally { browser.close(); - } - } - } + } + } + } private static boolean doLogin(Page page) { - // Find and fill username + // Find and fill username String usernameInput = findInput(page, new String[]{ - "input[placeholder*='用户名']", - "input[placeholder*='username']", - "input[placeholder*='账号']", - "input[name*='user']", - "input[name='username']", - "input[type='text']", - }); + "input[placeholder*='用户名']", + "input[placeholder*='username']", + "input[placeholder*='账号']", + "input[name*='user']", + "input[name='username']", + "input[type='text']", + }); if (usernameInput == null) { System.out.println("[-] Could not find username input"); return false; - } + } - // Find and fill password + // Find and fill password String passwordInput = findInput(page, new String[]{ - "input[placeholder*='密码']", - "input[placeholder*='password']", - "input[name*='pass']", - "input[name='password']", - "input[name='pwd']", - "input[type='password']", - }); + "input[placeholder*='密码']", + "input[placeholder*='password']", + "input[name*='pass']", + "input[name='password']", + "input[name='pwd']", + "input[type='password']", + }); if (passwordInput == null) { System.out.println("[-] Could not find password input"); return false; - } + } System.out.println("[*] Filling credentials..."); page.locator(usernameInput).first().fill(USERNAME); page.locator(passwordInput).first().fill(PASSWORD); sleep(500); - // Find and click submit, or press Enter + // Find and click submit, or press Enter String submitBtn = findSubmit(page); if (submitBtn != null) { System.out.println("[*] Clicking submit button: " + submitBtn); page.locator(submitBtn).first().click(); - } else { + } else { System.out.println("[*] No submit button found, pressing Enter"); page.locator(passwordInput).first().press("Enter"); - } + } try { page.waitForLoadState(LoadState.DOMCONTENTLOADED, new Page.WaitForLoadStateOptions().setTimeout(10000)); return true; - } catch (Exception e) { + } catch (Exception e) { System.out.println("[!] Navigation timed out, but credentials were submitted"); return true; - } - } - + } + } private static boolean doLoginWithCaptcha(Page page) throws Exception { - // Find and fill username + // Find and fill username String usernameInput = findInput(page, new String[]{ - "input[placeholder*='用户名']", - "input[placeholder*='username']", - "input[placeholder*='账号']", - "input[name*='user']", - "input[name='username']", - "input[type='text']", - }); + "input[placeholder*='用户名']", + "input[placeholder*='username']", + "input[placeholder*='账号']", + "input[name*='user']", + "input[name='username']", + "input[type='text']", + }); if (usernameInput == null) { System.out.println("[-] Could not find username input"); return false; - } + } - // Find and fill password + // Find and fill password String passwordInput = findInput(page, new String[]{ - "input[placeholder*='密码']", - "input[placeholder*='password']", - "input[name*='pass']", - "input[name='password']", - "input[name='pwd']", - "input[type='password']", - }); + "input[placeholder*='密码']", + "input[placeholder*='password']", + "input[name*='pass']", + "input[name='password']", + "input[name='pwd']", + "input[type='password']", + }); if (passwordInput == null) { System.out.println("[-] Could not find password input"); return false; - } + } - // Find and fill captcha + // Find and fill captcha String captchaInput = findInput(page, new String[]{ - "input[placeholder*='验证码']", - "input[placeholder*='captcha']", - "input[name*='captcha']", - "input[name='code']", - "input[type='text']", - }); + "input[placeholder*='验证码']", + "input[placeholder*='captcha']", + "input[name*='captcha']", + "input[name='code']", + "input[type='text']", + }); if (captchaInput == null) { System.out.println("[-] Could not find captcha input"); return false; - } + } - // Recognize captcha + // Recognize captcha Path captchaPath = SCREENSHOT_DIR.resolve("captcha.png"); System.out.println("[*] Recognizing captcha with Ollama..."); String captchaText = recognizeCaptcha(captchaPath); if (captchaText == null || captchaText.isEmpty()) { System.out.println("[-] Failed to recognize captcha"); return false; - } + } System.out.println("[+] Captcha recognized: " + captchaText); System.out.println("[*] Filling credentials..."); @@ -219,79 +220,80 @@ public class EtsScraper { page.locator(captchaInput).first().fill(captchaText); sleep(500); - // Click submit or press Enter + // Click submit or press Enter String submitBtn = findSubmit(page); if (submitBtn != null) { System.out.println("[*] Clicking submit button: " + submitBtn); page.locator(submitBtn).first().click(); - } else { + } else { System.out.println("[*] No submit button found, pressing Enter"); page.locator(captchaInput).first().press("Enter"); - } + } try { page.waitForLoadState(LoadState.DOMCONTENTLOADED, new Page.WaitForLoadStateOptions().setTimeout(10000)); return true; - } catch (Exception e) { + } catch (Exception e) { System.out.println("[!] Navigation timed out, but credentials were submitted"); return true; - } - } + } + } + private static void downloadCaptcha(Page page) { try { - // Set up listener FIRST, then reload to trigger the request + // Set up listener FIRST, then reload to trigger the request Response resp = page.waitForResponse( - "https://101.227.180.215/SHCityEnvCW/Services/ValiDateImage.ashx*", - () -> { + "https://101.227.180.215/SHCityEnvCW/Services/ValiDateImage.ashx*", + () -> { page.reload(new Page.ReloadOptions() - .setWaitUntil(WaitUntilState.NETWORKIDLE) - .setTimeout(10000)); - } - ); + .setWaitUntil(WaitUntilState.NETWORKIDLE) + .setTimeout(10000)); + } + ); if (resp != null) { - byte[] body = resp.body(); - Path captchaPath = SCREENSHOT_DIR.resolve("captcha.png"); - java.nio.file.Files.write(captchaPath, body); - System.out.println("[+] Captcha saved to: " + captchaPath); - System.out.println("[+] Captcha size: " + body.length + " bytes"); - } - } catch (Exception e) { - System.out.println("[-] Failed to download captcha: " + e.getMessage()); + byte[] body = resp.body(); + Path captchaPath = SCREENSHOT_DIR.resolve("captcha.png"); + java.nio.file.Files.write(captchaPath, body); + System.out.println("[+] Captcha saved to: " + captchaPath); + System.out.println("[+] Captcha size: " + body.length + " bytes"); } - } + } catch (Exception e) { + System.out.println("[-] Failed to download captcha: " + e.getMessage()); + } + } private static void closeNotificationDialog(Page page) { - // Find the frame that contains the notification dialog - Frame dialogFrame = null; - for (Frame f : page.frames()) { - try { - String hasDialog = (String) f.evaluate( - "() => document.getElementById('Div_GG_Box') ? 'FOUND' : 'NOT_HERE'"); - if ("FOUND".equals(hasDialog)) { - dialogFrame = f; - break; - } - } catch (Exception ignored) { - } - } - - if (dialogFrame == null) { - System.out.println("[*] No notification dialog found"); - return; + // Find the frame that contains the notification dialog + Frame dialogFrame = null; + for (Frame f : page.frames()) { + try { + String hasDialog = (String) f.evaluate( + "() => document.getElementById('Div_GG_Box') ? 'FOUND' : 'NOT_HERE'"); + if ("FOUND".equals(hasDialog)) { + dialogFrame = f; + break; + } + } catch (Exception ignored) { } + } - System.out.println("[*] Closing notification dialog in frame: " + dialogFrame.url()); - // Click the X button in the correct frame - dialogFrame.locator(".green_popup_close").first().click(); - sleep(500); + if (dialogFrame == null) { + System.out.println("[*] No notification dialog found"); + return; + } - // Force hide via JS in the correct frame (onclick uses jQuery which may fail) - dialogFrame.evaluate("document.getElementById('Div_GG_Box').style.display = 'none';"); - sleep(500); + System.out.println("[*] Closing notification dialog in frame: " + dialogFrame.url()); + // Click the X button in the correct frame + dialogFrame.locator(".green_popup_close").first().click(); + sleep(500); - System.out.println("[*] Notification dialog closed"); - } + // Force hide via JS in the correct frame (onclick uses jQuery which may fail) + dialogFrame.evaluate("document.getElementById('Div_GG_Box').style.display = 'none';"); + sleep(500); + + System.out.println("[*] Notification dialog closed"); + } private static String findInput(Page page, String[] selectors) { for (String selector : selectors) { @@ -299,116 +301,147 @@ public class EtsScraper { if (page.locator(selector).first().isVisible( new Locator.IsVisibleOptions().setTimeout(1000))) { return selector; - } - } catch (Exception ignored) { - } - } + } + } catch (Exception ignored) { + } + } return null; - } + } private static String findSubmit(Page page) { String[] selectors = new String[]{ - "button[type='submit']", - "input[type='submit']", - "button:has-text('登录')", - "button:has-text('Login')", - ".login-btn", - "#loginBtn", - }; + "button[type='submit']", + "input[type='submit']", + "button:has-text('登录')", + "button:has-text('Login')", + ".login-btn", + "#loginBtn", + }; for (String selector : selectors) { try { if (page.locator(selector).first().isVisible( new Locator.IsVisibleOptions().setTimeout(1000))) { return selector; - } - } catch (Exception ignored) { - } - } + } + } catch (Exception ignored) { + } + } return null; - } + } private static void screenshot(Page page, String name) { try { String timestamp = LocalDateTime.now() - .format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); + .format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); Path path = SCREENSHOT_DIR.resolve(name + "_" + timestamp + ".png"); page.screenshot(new Page.ScreenshotOptions().setPath(path)); System.out.println("[+] Screenshot saved: " + path); - } catch (Exception e) { + } catch (Exception e) { System.err.println("[-] Screenshot failed: " + e.getMessage()); - } - } + } + } private static void sleep(long ms) { try { Thread.sleep(ms); - } catch (InterruptedException e) { + } catch (InterruptedException e) { Thread.currentThread().interrupt(); - } - } + } + } public static String recognizeCaptcha(Path imagePath) throws Exception { - byte[] imageBytes = Files.readAllBytes(imagePath); + byte[] imageBytes = Files.readAllBytes(imagePath); - // Convert GIF to PNG (Ollama doesn't support GIF) - ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes); - BufferedImage gifImage = ImageIO.read(bais); - if (gifImage == null) { - // Fallback: send raw bytes if conversion fails - String base64 = Base64.getEncoder().encodeToString(imageBytes); - return callOllama(base64); - } - ByteArrayOutputStream pngOut = new ByteArrayOutputStream(); - ImageIO.write(gifImage, "png", pngOut); - byte[] pngBytes = pngOut.toByteArray(); - String base64 = Base64.getEncoder().encodeToString(pngBytes); - return callOllama(base64); - } + // Convert GIF to PNG and resize (Ollama qwen3-vl needs larger PNG images) + ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes); + BufferedImage srcImage = ImageIO.read(bais); + if (srcImage == null) { + String base64 = Base64.getEncoder().encodeToString(imageBytes); + return callOllama(base64); + } + + // Resize to at least 200px width for better recognition + int scale = Math.max(1, 200 / srcImage.getWidth()); + if (scale < 1) scale = 1; + int newWidth = srcImage.getWidth() * scale; + int newHeight = srcImage.getHeight() * scale; + + Image scaled = srcImage.getScaledInstance(newWidth, newHeight, Image.SCALE_SMOOTH); + BufferedImage resized = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB); + Graphics2D g2d = resized.createGraphics(); + g2d.drawImage(scaled, 0, 0, null); + g2d.dispose(); + + ByteArrayOutputStream pngOut = new ByteArrayOutputStream(); + ImageIO.write(resized, "png", pngOut); + byte[] pngBytes = pngOut.toByteArray(); + String base64 = Base64.getEncoder().encodeToString(pngBytes); + return callOllama(base64); + } private static String callOllama(String base64Image) throws Exception { - String json = "{" - + "\"model\":\"" + OLLAMA_MODEL + "\"," - + "\"messages\":[" - + " {" - + " \"role\":\"user\"," - + " \"content\":\"识别图中的验证码文字,只返回文字内容,不要有其他解释\"," - + " \"images\":[\"" + base64Image + "\"]" - + " }" - + "]" - + "}"; + String json = "{" + + "\"model\":\"" + OLLAMA_MODEL + "\"," + + "\"messages\":[" + + " {" + + " \"role\":\"user\"," + + " \"content\":\"识别图中的验证码文字,只返回文字内容\"," + + " \"images\":[\"" + base64Image + "\"]" + + " }" + + "]" + + "}"; - URL url = new URL(OLLAMA_URL + "/api/chat"); - HttpURLConnection conn = (HttpURLConnection) url.openConnection(); - conn.setRequestMethod("POST"); - conn.setConnectTimeout(15000); - conn.setReadTimeout(30000); - conn.setDoOutput(true); - conn.setRequestProperty("Content-Type", "application/json; charset=utf-8"); + URL url = new URL(OLLAMA_URL + "/api/chat"); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("POST"); + conn.setConnectTimeout(15000); + conn.setReadTimeout(60000); + conn.setDoOutput(true); + conn.setRequestProperty("Content-Type", "application/json; charset=utf-8"); - conn.getOutputStream().write(json.getBytes("utf-8")); - conn.getOutputStream().flush(); - conn.getOutputStream().close(); + conn.getOutputStream().write(json.getBytes("utf-8")); + conn.getOutputStream().flush(); + conn.getOutputStream().close(); - try (BufferedReader reader = new BufferedReader( - new InputStreamReader(conn.getInputStream(), "utf-8"))) { - StringBuilder sb = new StringBuilder(); - String line; - while ((line = reader.readLine()) != null) { - sb.append(line); - } - String response = sb.toString(); - // Parse "content":"..." from the JSON response - int contentIdx = response.indexOf("\"content\":"); - if (contentIdx >= 0) { - int start = response.indexOf('"', contentIdx + 10) + 1; - int end = response.indexOf('"', start); - if (start > 0 && end > start) { - return response.substring(start, end).trim(); - } - } - return null; - } finally { - conn.disconnect(); - } - } - } \ No newline at end of file + BufferedReader reader = new BufferedReader( + new InputStreamReader(conn.getInputStream(), "utf-8")); + try { + StringBuilder fullContent = new StringBuilder(); + String line; + while ((line = reader.readLine()) != null) { + // Parse each line as a separate JSON object (streaming response) + int contentIdx = line.indexOf("\"content\":"); + if (contentIdx >= 0) { + int start = line.indexOf('"', contentIdx + 10) + 1; + int end = line.indexOf('"', start); + if (start > 0 && end > start) { + fullContent.append(line.substring(start, end)); + } + } + // Check for done marker + if (line.contains("\"done\":true")) { + break; + } + } + return normalizeCaptcha(fullContent.toString()); + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // ignore + } + } + conn.disconnect(); + } + } + + private static String normalizeCaptcha(String raw) { + if (raw == null || raw.isBlank()) { + return ""; + } + String s = raw.strip().replaceAll("\\s+", ""); + s = s.replaceAll("^[`'\\\"]|[`'\\\"]+$", ""); + return s; + } +} diff --git a/src/test/java/com/ets/scraper/EtsScraperTest.java b/src/test/java/com/ets/scraper/EtsScraperTest.java index 194b414..7336e55 100644 --- a/src/test/java/com/ets/scraper/EtsScraperTest.java +++ b/src/test/java/com/ets/scraper/EtsScraperTest.java @@ -76,4 +76,15 @@ class EtsScraperTest { byte[] decoded = java.util.Base64.getDecoder().decode(base64); assertArrayEquals(imageBytes, decoded, "Base64 roundtrip should match original"); } + + @Test + void testCaptchaRecognition() throws Exception { + Path captchaPath = Path.of("screenshots/captcha.png"); + String captchaText = EtsScraper.recognizeCaptcha(captchaPath); + + System.out.println("[+] Recognized captcha: " + captchaText); + assertNotNull(captchaText, "Captcha recognition should return a result"); + assertFalse(captchaText.isEmpty(), "Captcha text should not be empty"); + System.out.println("[+] Captcha length: " + captchaText.length() + " chars"); + } }