無錫高端網站建設開發在線咨詢 1 網站宣傳
Selenium 可以動態爬取網頁數據,就像真實用戶操作瀏覽器一樣,從終端用戶的角度測試應用程序。WebDriver 通過原生瀏覽器支持或者瀏覽器擴展直接控制瀏覽器。
webdriver下載
因為 Selenium 對瀏覽器的版本存在兼容問題,故需要針對指定瀏覽器下載指定版本的驅動。
1、添加依賴
<!-- Maven dependencies for the code below:
     selenium-java 4.11.0 supplies the org.openqa.selenium classes (WebDriver, By, the browser drivers and options);
     guava 32.1.2-jre supplies com.google.common.collect.Lists, used by the utility class. -->
<dependency><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-java</artifactId><version>4.11.0</version></dependency><dependency><groupId>com.google.guava</groupId><artifactId>guava</artifactId><version>32.1.2-jre</version></dependency>
2、工具類
import cn.hutool.core.collection.CollectionUtil;
import com.google.common.collect.Lists;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.edge.EdgeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.springframework.stereotype.Component;import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * Selenium utility class.
 *
 * @author kou
 */
@Slf4j
@RequiredArgsConstructor
@Component
public class SeleniumUtil {private final ReptileProperties reptileProperties;/*** 獲取chromeDriver** @return chromeDriver*/public WebDriver chromeDriver() {// 加載驅(qū)動(dòng)路徑System.setProperty("webdriver.chrome.driver", "D:/chromedriver.exe");// Chrome默認(rèn)不允許跨機(jī)器調(diào)試,需要給啟動(dòng)命令加上白名單System.setProperty("webdriver.chrome.whitelistedIps", "");ChromeOptions options = new ChromeOptions();// 開(kāi)啟一個(gè)實(shí)驗(yàn)性參數(shù)excludeSwitches,用來(lái)隱藏window.navigator.webdriver返回true,這個(gè)參數(shù)必須是Listoptions.setExperimentalOption("useAutomationExtension", false);// 開(kāi)啟開(kāi)發(fā)者模式options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));// 發(fā)現(xiàn)主要是這句是關(guān)鍵options.addArguments("--disable-blink-features=AutomationControlled");// options.addArguments("--incognito");// options.addArguments("--disable-infobars");//options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36");// 禁用沙箱options.addArguments("--no-sandbox");// 無(wú)頭瀏覽器,這樣不會(huì)打開(kāi)瀏覽器窗口// options.addArguments("--headless");// options.addArguments("--disable-gpu");options.addArguments("--remote-allow-origins=*");// 初始化一個(gè)谷歌瀏覽器實(shí)例,實(shí)例名稱叫driverWebDriver driver = new ChromeDriver(options);return driver;}/*** 獲取edgeDriver** @return edgeDriver*/public WebDriver edgeDriver() {// 加載驅(qū)動(dòng)路徑System.setProperty("webdriver.edge.driver", "D:/msedgedriver.exe");EdgeOptions options = new EdgeOptions();// 開(kāi)啟一個(gè)實(shí)驗(yàn)性參數(shù)excludeSwitches,用來(lái)隱藏window.navigator.webdriver返回true,這個(gè)參數(shù)必須是Listoptions.setExperimentalOption("useAutomationExtension", false);//開(kāi)啟開(kāi)發(fā)者模式options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));// 
發(fā)現(xiàn)主要是這句是關(guān)鍵options.addArguments("--disable-blink-features=AutomationControlled");options.addArguments("--incognito", "--disable-infobars");// options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36");options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");// 禁用沙箱options.addArguments("--no-sandbox");// 無(wú)頭瀏覽器,這樣不會(huì)打開(kāi)瀏覽器窗口// options.addArguments("--headless");options.addArguments("--disable-gpu");options.addArguments("--remote-allow-origins=*");// 初始化一個(gè)谷歌瀏覽器實(shí)例,實(shí)例名稱叫driverWebDriver driver = new EdgeDriver(options);return driver;}/*** 獲取firefoxDriver** @return firefoxDriver*/public WebDriver firefoxDriver() {// 加載驅(qū)動(dòng)路徑System.setProperty("webdriver.gecko.driver", "D:/geckodriver.exe");System.setProperty("webdriver.chrome.whitelistedIps", "");FirefoxOptions options = new FirefoxOptions();options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");// 無(wú)頭瀏覽器,這樣不會(huì)打開(kāi)瀏覽器窗口options.addArguments("--headless");// 初始化一個(gè)谷歌瀏覽器實(shí)例,實(shí)例名稱叫driverWebDriver driver = new FirefoxDriver(options);return driver;}/*** 獲取表頭** @param table 表格* @return 表頭*/public List<String> getTableHead(WebElement table) {log.info("開(kāi)始解析表頭...");// 獲取表頭WebElement head = table.findElement(By.tagName("thead"));if (null == head) {return Collections.emptyList();}List<WebElement> headths = head.findElements(By.tagName("th"));List<String> headList = new ArrayList<>(headths.size());headths.forEach(t -> {headList.add(t.getText());});log.info("表頭解析完成!!!");return headList;}/*** 獲取表數(shù)據(jù)** @param table 表格* @return 表頭*/public List<List<String>> getTableBody(WebElement table) {log.info("開(kāi)始解析表數(shù)據(jù)...");// 獲取表頭WebElement tbody = table.findElement(By.tagName("tbody"));if 
(null == tbody) {return Collections.emptyList();}// 獲取body數(shù)據(jù)行List<WebElement> bodyTrs = tbody.findElements(By.tagName("tr"));if (CollectionUtil.isEmpty(bodyTrs)) {return Collections.emptyList();}List<List<String>> bodyDatas = new ArrayList<>(bodyTrs.size());bodyTrs.stream().forEach(r -> {List<WebElement> tds = r.findElements(By.tagName("td"));List<String> rows = new ArrayList<>(tds.size());tds.forEach(d -> {rows.add(d.getText());});bodyDatas.add(rows);});log.info("表數(shù)據(jù)解析完成!!!");return bodyDatas;}/*** 將參數(shù)轉(zhuǎn)化為路徑參數(shù)** @param params 參數(shù)* @return 路徑參數(shù)*/public String convertPathParams(Map<String, Object> params) {if (CollectionUtil.isEmpty(params)) {return "";}StringBuffer path = new StringBuffer();for (Map.Entry<String, Object> p : params.entrySet()) {path.append(p.getKey()).append("=").append(p.getValue().toString()).append("&");}return path.substring(0, path.length() - 1);}}
3、爬取數據
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Data service implementation class.
 *
 * @author kou
 */
@Slf4j
@RequiredArgsConstructor
@Service
public class DataServiceImpl {private final SeleniumUtil seleniumUtil;/*** 獲取頁(yè)面數(shù)據(jù)** @return 數(shù)據(jù)*/@Overridepublic Map<String, Object> getHtmlData() {try {Map<String, Object> data = new HashMap<>();String url = "url";Map<String, Object> params = new HashMap<>();params.put("pageNum", 1);params.put("pageSize", 1000);String fullUrl = url + seleniumUtil.convertPathParams(params);WebDriver driver = seleniumUtil.firefoxDriver();driver.get(fullUrl);// 打開(kāi)一個(gè)站點(diǎn)log.info("開(kāi)始訪問(wèn):{}", fullUrl);driver.get(fullUrl);String title = driver.getTitle();log.info("網(wǎng)頁(yè):{}", title);// 獲取表格數(shù)據(jù)WebElement table = driver.findElement(By.id("table"));//顯式等待,針對(duì)某個(gè)元素等待,等待超時(shí)時(shí)間100s,2s檢測(cè)一次WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(100), Duration.ofSeconds(2));// wait.until(ExpectedConditions.presenceOfElementLocated(By.id("table")));wait.until(new ExpectedCondition<WebElement>() {@Overridepublic WebElement apply(WebDriver text) {log.info("開(kāi)始檢查tbody數(shù)據(jù)是否已加載");WebElement table = text.findElement(By.id("table")).findElement(By.tagName("tbody"));if (!table.isDisplayed()) {log.info("檢查結(jié)果:tbody數(shù)據(jù)未加載完,等待加載...");return null;}log.info("檢查結(jié)果:tbody數(shù)據(jù)加載完成!!!");return table;}});// 獲取表頭List<String> headList = seleniumUtil.getTableHead(table);List<List<String>> bodyList = seleniumUtil.getTableBody(table);data.put("header", headList);data.put("body", bodyList);driver.close();return data;} catch (Exception e) {throw new RuntimeException(e);}}}