(1). DumpHtmlFromPageExample
package com.github.kklisura.cdt.examples;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.CountDownLatch;
import org.apache.commons.io.FileUtils;
import com.github.kklisura.cdt.launch.ChromeLauncher;
import com.github.kklisura.cdt.protocol.commands.Network;
import com.github.kklisura.cdt.protocol.commands.Page;
import com.github.kklisura.cdt.protocol.commands.Runtime;
import com.github.kklisura.cdt.protocol.types.runtime.Evaluate;
import com.github.kklisura.cdt.services.ChromeDevToolsService;
import com.github.kklisura.cdt.services.ChromeService;
import com.github.kklisura.cdt.services.types.ChromeTab;
import com.github.kklisura.cdt.utils.FilesUtils;
/**
 * Dumps the HTML of https://github.com to a file through the Chrome DevTools protocol.
 *
 * <p>The example launches Chrome, opens an empty tab, logs every outgoing network request,
 * navigates to the page and, once the load event has fired, evaluates {@code
 * document.documentElement.outerHTML}, prints the result and writes it to disk as UTF-8. After
 * the dump attempt has finished, the DevTools connection and the tab are closed so that {@code
 * main} can return instead of blocking forever.
 *
 * @author Kenan Klisura
 */
public class DumpHtmlFromPageExample {
  /** Page that is loaded and dumped. */
  private static final String PAGE_URL = "https://github.com";

  /** Output file used when no path is passed on the command line. */
  private static final String DEFAULT_OUTPUT_PATH =
      "/Users/lixin/GitRepository/chrome-devtools-java-client/cdt-examples/hello.html";

  /**
   * Pause between the load event and the evaluation. Presumably this gives scripts started by the
   * page time to finish updating the DOM before it is read.
   */
  private static final long RENDER_DELAY_MILLIS = 2000L;

  /**
   * Runs the example.
   *
   * @param args optional; {@code args[0]} overrides the output file path, otherwise {@link
   *     #DEFAULT_OUTPUT_PATH} is used
   * @throws Exception if the main thread is interrupted while waiting for the dump, or if any of
   *     the Chrome service calls fails
   */
  public static void main(String[] args) throws Exception {
    final File outputFile =
        new File(args != null && args.length > 0 ? args[0] : DEFAULT_OUTPUT_PATH);

    // Create chrome launcher.
    final ChromeLauncher launcher = new ChromeLauncher();

    // Launch chrome either as headless (true) or regular (false).
    final ChromeService chromeService = launcher.launch(false);

    // Create empty tab ie about:blank.
    final ChromeTab tab = chromeService.createTab();

    // Get DevTools service to this tab.
    final ChromeDevToolsService devToolsService = chromeService.createDevToolsService(tab);

    try {
      // Get individual commands.
      final Page page = devToolsService.getPage();
      final Network network = devToolsService.getNetwork();
      final Runtime runtime = devToolsService.getRuntime();

      // Released by the load-event handler once the dump attempt is over.
      final CountDownLatch dumpFinished = new CountDownLatch(1);

      // Log every request the page issues.
      network.onRequestWillBeSent(
          event ->
              System.out.printf(
                  "request: %s %s%s",
                  event.getRequest().getMethod(),
                  event.getRequest().getUrl(),
                  System.lineSeparator()));

      // Dump the document once the page has loaded.
      page.onLoadEventFired(
          event -> {
            System.out.println("==============onLoadEventFired==================");
            try {
              dumpHtml(runtime, outputFile);
            } finally {
              // Always release main, even when the dump failed, so the program can exit.
              dumpFinished.countDown();
            }
          });

      // Enable network and page events; the handlers above are registered beforehand.
      network.enable();
      page.enable();

      // Navigate to github.com.
      page.navigate(PAGE_URL);

      // Block until the load-event handler has finished.
      dumpFinished.await();
    } finally {
      // Release the DevTools connection and the tab regardless of how the run ended.
      devToolsService.close();
      chromeService.closeTab(tab);
    }
  }

  /**
   * Waits {@link #RENDER_DELAY_MILLIS}, reads the document's outer HTML, prints it to standard
   * output and writes it to {@code outputFile} encoded as UTF-8.
   *
   * <p>Failures are reported on standard error rather than thrown, because this method runs
   * inside an event handler.
   *
   * @param runtime DevTools runtime command used to evaluate JavaScript in the page
   * @param outputFile file that receives the HTML
   */
  private static void dumpHtml(Runtime runtime, File outputFile) {
    try {
      Thread.sleep(RENDER_DELAY_MILLIS);
    } catch (InterruptedException e) {
      // Restore the interrupt flag for the owner of this thread and give up on the dump.
      Thread.currentThread().interrupt();
      System.err.println("Interrupted while waiting for the page to render; skipping HTML dump.");
      return;
    }

    // Evaluate javascript.
    final Evaluate evaluation = runtime.evaluate("document.documentElement.outerHTML");
    final Object html = evaluation.getResult() == null ? null : evaluation.getResult().getValue();
    if (html == null) {
      System.err.println("Evaluation returned no value; nothing to write.");
      return;
    }
    System.out.println(html);

    try {
      // Explicit charset: the platform default may not be able to represent the page content.
      FileUtils.writeByteArrayToFile(outputFile, html.toString().getBytes(StandardCharsets.UTF_8));
    } catch (IOException e) {
      System.err.println("Failed to write HTML dump to " + outputFile);
      e.printStackTrace();
    }
  }
}
(2). 结果
Page.onLoadEventFired 只被调用了一次。
获取到的 HTML 中包含了 AJAX 渲染之后的内容,因此可以在抓取页面后直接解析 HTML 进行分析。