开发手册 欢迎您!
软件开发者资料库

Java 使用Selenium调用浏览器(chrome)下载动态网页源代码

本文主要介绍Java中,使用Selenium调用浏览器(chrome)下载动态网页源代码,并且实现WebDriverPool来进行性能优化,从WebDriverPool池中获取WebDriver对象,以及相关实现示例代码。

1、下载引用Selenium

参考文档https://www.wonhero.com/article/613/

2、WebDriverPool实现代码

package us.codecraft.webmagic.downloader.selenium;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * A bounded pool of {@link WebDriver} instances.
 *
 * <p>Drivers are created lazily, up to {@code capacity}, by {@link #configure()}, which reads
 * the driver type and its settings from a properties file. Callers borrow a driver with
 * {@link #get()} and must hand it back with {@link #returnToPool(WebDriver)}; once
 * {@code capacity} drivers exist, {@link #get()} blocks until one is returned.
 */
class WebDriverPool {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    private final static int DEFAULT_CAPACITY = 5;

    private final int capacity;

    private final static int STAT_RUNNING = 1;

    private final static int STAT_CLOSED = 2;

    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);

    /** The driver most recently created by {@link #configure()}. */
    private WebDriver mDriver = null;

    private boolean mAutoQuitDriver = true;

    private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
    private static final String DRIVER_FIREFOX = "firefox";
    private static final String DRIVER_CHROME = "chrome";
    private static final String DRIVER_PHANTOMJS = "phantomjs";

    protected static Properties sConfig;
    protected static DesiredCapabilities sCaps;

    /**
     * store webDrivers created
     */
    private List<WebDriver> webDriverList = Collections.synchronizedList(new ArrayList<WebDriver>());

    /**
     * store webDrivers available
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();

    /**
     * Creates a pool that will hold at most {@code capacity} drivers.
     *
     * @param capacity maximum number of drivers this pool creates
     */
    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }

    /** Creates a pool with the default capacity of {@value #DEFAULT_CAPACITY} drivers. */
    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Loads the configuration file, prepares the capabilities and starts a new driver, which is
     * stored in {@code mDriver}.
     *
     * <p>The configuration file path is taken from the system property {@code selenuim_config}
     * when set, otherwise the built-in default path is used.
     *
     * @throws IOException if the configuration file cannot be read, if the PhantomJS executable
     *     path is missing while the PhantomJS driver is selected, or if the configured
     *     {@code driver} value is neither a URL nor one of the supported driver names
     */
    public void configure() throws IOException {
        // Read config file
        sConfig = new Properties();
        String configFile = DEFAULT_CONFIG_FILE;
        // The property name is misspelled ("selenuim"); it is kept as-is because existing
        // launch scripts may already pass it.
        if (System.getProperty("selenuim_config") != null) {
            configFile = System.getProperty("selenuim_config");
        }
        // Close the reader once the properties are loaded instead of leaking the file handle.
        try (FileReader reader = new FileReader(configFile)) {
            sConfig.load(reader);
        }

        // Prepare capabilities
        sCaps = new DesiredCapabilities();
        sCaps.setJavascriptEnabled(true);
        sCaps.setCapability("takesScreenshot", false);

        String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);

        // Fetch PhantomJS-specific configuration parameters
        if (driver.equals(DRIVER_PHANTOMJS)) {
            // "phantomjs_exec_path"
            if (sConfig.getProperty("phantomjs_exec_path") != null) {
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_exec_path"));
            } else {
                throw new IOException(
                        String.format(
                                "Property '%s' not set!",
                                PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
            }
            // "phantomjs_driver_path"
            if (sConfig.getProperty("phantomjs_driver_path") != null) {
                logger.info("Test will use an external GhostDriver");
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_driver_path"));
            } else {
                logger.info("Test will use PhantomJS internal GhostDriver");
            }
        }

        // Disable "web-security", enable all possible "ssl-protocols" and
        // "ignore-ssl-errors" for PhantomJSDriver
        List<String> cliArgsCap = new ArrayList<String>();
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap);

        // Control LogLevel for GhostDriver, via CLI arguments
        sCaps.setCapability(
                PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] {
                    "--logLevel=" + sConfig.getProperty("phantomjs_driver_loglevel", "INFO")
                });

        // Start appropriate Driver
        if (isUrl(driver)) {
            sCaps.setBrowserName("phantomjs");
            mDriver = new RemoteWebDriver(new URL(driver), sCaps);
        } else if (driver.equals(DRIVER_FIREFOX)) {
            mDriver = new FirefoxDriver(sCaps);
        } else if (driver.equals(DRIVER_CHROME)) {
            mDriver = new ChromeDriver(sCaps);
        } else if (driver.equals(DRIVER_PHANTOMJS)) {
            mDriver = new PhantomJSDriver(sCaps);
        } else {
            // Without this branch mDriver would silently stay null (or keep the previous
            // driver), and get() would then pool a null or duplicate instance.
            throw new IOException(
                    "Unsupported driver '" + driver + "' configured in " + configFile);
        }
    }

    /**
     * Tells whether the given string parses as a URL.
     *
     * @param urlString candidate string
     * @return {@code true} if {@code new URL(urlString)} succeeds, {@code false} otherwise
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }

    /**
     * Borrows a driver from the pool, creating a new one while fewer than {@code capacity}
     * drivers exist, and otherwise blocking until a driver is returned.
     *
     * @return an available driver
     * @throws InterruptedException if interrupted while waiting for a driver
     * @throws IllegalStateException if the pool is closed, or if a driver could not be created
     *     and the pool holds no driver that could ever be returned
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver poll = innerQueue.poll();
        if (poll != null) {
            return poll;
        }
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                // Re-check under the lock so concurrent callers cannot exceed the capacity.
                if (webDriverList.size() < capacity) {
                    // add new WebDriver instance into pool
                    try {
                        configure();
                        innerQueue.add(mDriver);
                        webDriverList.add(mDriver);
                    } catch (IOException e) {
                        logger.error("Failed to create a new WebDriver", e);
                        if (webDriverList.isEmpty()) {
                            // No driver exists, so take() below could never complete.
                            throw new IllegalStateException(
                                    "Unable to create any WebDriver for the pool", e);
                        }
                    }
                }
            }
        }
        return innerQueue.take();
    }

    /**
     * Hands a borrowed driver back so that other callers of {@link #get()} can use it.
     *
     * @param webDriver the driver to return
     * @throws IllegalStateException if the pool is closed
     */
    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }

    /**
     * Verifies that the pool has not been closed.
     *
     * @throws IllegalStateException if {@link #closeAll()} has already been called
     */
    protected void checkRunning() {
        if (stat.get() != STAT_RUNNING) {
            throw new IllegalStateException("Already closed!");
        }
    }

    /**
     * Marks the pool as closed and quits every driver it created.
     *
     * @throws IllegalStateException if the pool was already closed
     */
    public void closeAll() {
        boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLOSED);
        if (!b) {
            throw new IllegalStateException("Already closed!");
        }
        // Iterating a Collections.synchronizedList requires holding its lock manually.
        synchronized (webDriverList) {
            for (WebDriver webDriver : webDriverList) {
                logger.info("Quit webDriver" + webDriver);
                webDriver.quit();
            }
        }
    }
}

3、调用Selenium(WebDriver)下载网页

从WebDriverPool中获取WebDriver对象,调用浏览器下载动态或静态网页的源代码。

package us.codecraft.webmagic.downloader.selenium;

import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

/**
 * Renders pages by driving a browser through Selenium and returns the resulting HTML.
 *
 * <p>The original note on this class states that only chrome is currently supported and that
 * the matching Selenium driver must be downloaded separately.
 */
public class SeleniumDownloader implements Closeable {

    /** Created lazily on the first download so that {@link #setThread(int)} can size it. */
    private volatile WebDriverPool webDriverPool;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    private int sleepTime = 0;

    private int poolSize = 1;

    /**
     * Creates a downloader and registers the chrome driver executable.
     *
     * @param chromeDriverPath value stored in the {@code webdriver.chrome.driver} system property
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
    }

    /** Creates a downloader without touching any driver-related system property. */
    public SeleniumDownloader() {
        // System.setProperty("phantomjs.binary.path",
        // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
    }

    /**
     * Sets how long to wait after a page is requested before its HTML is read.
     *
     * @param sleepTime wait time in milliseconds (passed to {@link Thread#sleep(long)})
     * @return this downloader, for chaining
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Loads the request URL in a pooled browser and returns the rendered document.
     *
     * @param request the request whose URL is opened
     * @param task the task providing the {@link Site} whose cookies are added to the browser
     * @return the {@code outerHTML} of the root {@code /html} element, or {@code null} if the
     *     thread was interrupted while waiting for a driver
     */
    public String download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            logger.warn("interrupted", e);
            // Preserve the interrupt so callers further up can react to it.
            Thread.currentThread().interrupt();
            return null;
        }
        try {
            logger.info("downloading page " + request.getUrl());
            webDriver.get(request.getUrl());
            try {
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                logger.warn("interrupted while waiting for page to render", e);
                Thread.currentThread().interrupt();
            }
            WebDriver.Options manage = webDriver.manage();
            Site site = task.getSite();
            if (site.getCookies() != null) {
                for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
                    Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
                    manage.addCookie(cookie);
                }
            }
            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            return webElement.getAttribute("outerHTML");
        } finally {
            // Always give the driver back; otherwise the pool drains and later calls block.
            webDriverPool.returnToPool(webDriver);
        }
    }

    /** Creates the driver pool on first use, exactly once even under concurrent callers. */
    private void checkInit() {
        if (webDriverPool == null) {
            synchronized (this) {
                // Second check: another thread may have created the pool while we waited.
                if (webDriverPool == null) {
                    webDriverPool = new WebDriverPool(poolSize);
                }
            }
        }
    }

    /**
     * Sets the size of the driver pool; only effective before the first download creates it.
     *
     * @param thread number of drivers the pool may create
     */
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    /**
     * Quits all pooled drivers. Does nothing if no download ever created the pool.
     *
     * @throws IOException declared by {@link Closeable}; not thrown by this implementation
     */
    @Override
    public void close() throws IOException {
        WebDriverPool pool = webDriverPool;
        if (pool != null) {
            pool.closeAll();
        }
    }
}

config.ini配置文件:

# What WebDriver to use for the tests
driver=phantomjs
#driver=firefox
#driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub

# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
phantomjs_driver_loglevel=DEBUG

相关文档:

Java Selenium(Chrome)截取滚动条网页长图的方法及示例代码

Java Selenium WebDriver中executeAsyncScript和executeScript方法的使用

Java Selenium WebDriver操作调用浏览器后台执行Js(JavaScript)代码

https://github.com/code4craft/webmagic/tree/master/webmagic-selenium