Automate the Web for Fun & Profit with Puppeteer

What Is Puppeteer?

From the Puppeteer repo README:

Puppeteer is a Node library which provides a high-level API to control headless Chrome over the DevTools Protocol. It can also be configured to use full (non-headless) Chrome.

What Can I Do With Puppeteer?

Getting Started

Required

Note: Puppeteer requires at least Node v6.4.0, but the examples below use async/await, which is only supported in Node v7.6.0 or greater.

Installation

yarn add puppeteer
# or "npm i puppeteer"

Recipes

Below are a few simple examples to get you started:

Screenshot: Viewport

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: capture the visible viewport of https://github.com and write it to
 * `screenshot-viewport.png`.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    // Screenshot
    await page.screenshot({ path: 'screenshot-viewport.png' });
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

Screenshot: Full Page

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: capture the full scrollable page of https://github.com and write it
 * to `screenshot-full.png`.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    // Screenshot: `fullPage` requests the whole page rather than the viewport.
    await page.screenshot({ path: 'screenshot-full.png', fullPage: true });
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

Screenshot: Page Coordinates

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: capture a fixed 34x34 rectangle of https://github.com (given by
 * explicit coordinates) and write it to `screenshot-coords.png`.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    // Screenshot: `clip` restricts the capture to the given rectangle.
    await page.screenshot({
      path: 'screenshot-coords.png',
      clip: { x: 147, y: 19, width: 34, height: 34 },
    });
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

Screenshot: DOM Element

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: capture a single DOM element of https://github.com (located by CSS
 * selector) and write it to `screenshot-element.png`.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    /**
     * Screenshots the first element matching `selector`, expanded by
     * `padding` pixels on every side.
     *
     * @param {string} selector - CSS selector passed to `document.querySelector`.
     * @param {number} [padding=0] - Extra pixels added around the element's rect.
     * @returns {Promise<*>} Whatever `page.screenshot` resolves with.
     * @throws {Error} If no element matches `selector`.
     */
    async function screenshotDOMElement(selector, padding = 0) {
      const rect = await page.evaluate(selector => {
        const element = document.querySelector(selector);
        if (element === null) {
          // Signal "not found" to the caller instead of failing with an
          // opaque TypeError inside the page context.
          return null;
        }
        const { x, y, width, height } = element.getBoundingClientRect();
        return { left: x, top: y, width, height, id: element.id };
      }, selector);

      if (rect === null) {
        throw new Error(`No element matches selector: ${selector}`);
      }
      console.log('rect: ', rect);

      return page.screenshot({
        path: 'screenshot-element.png',
        clip: {
          x: rect.left - padding,
          y: rect.top - padding,
          width: rect.width + padding * 2,
          height: rect.height + padding * 2,
        },
      });
    }

    // Screenshot
    await screenshotDOMElement('.header-logo-invertocat', 1);
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

PDF

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: render https://github.com to a letter-format PDF written to
 * `github.pdf`.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    // PDF
    await page.pdf({ path: 'github.pdf', format: 'letter' });
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

Search GitHub & Extract Results

'use strict';

const puppeteer = require('puppeteer');

/**
 * Recipe: search GitHub for "puppeteer" through the site's header search form
 * and print the text of every `h3 a` result link, one per line.
 */
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Large viewport. Awaited so the resize has completed (and any failure
    // surfaces here) before navigation starts.
    await page.setViewport({ width: 1280, height: 800 });

    // NOTE(review): the accepted `waitUntil` values are version-specific;
    // confirm 'networkidle' against the installed Puppeteer release.
    await page.goto('https://github.com', { waitUntil: 'networkidle' });

    // Click search input
    await page.click('.header-search-input');

    // Type query into search input.
    // NOTE(review): this is the single-argument form of `page.type`, which
    // presumably targets the input focused by the click above; confirm the
    // signature against the installed Puppeteer release.
    await page.type('puppeteer');

    // Submit search form (the evaluate call returns nothing useful, so its
    // result is not kept).
    await page.evaluate(() => {
      const searchForm = document.querySelector('form[action="/search"]');
      searchForm.submit();
    });

    // Wait for results
    await page.waitForSelector('h3 a');

    // Extract results from the page
    const links = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('h3 a'));
      return anchors.map(anchor => anchor.textContent);
    });
    console.log(links.join('\n'));
  } finally {
    // Runs even when a step above throws, so a failure never skips the close.
    await browser.close();
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});

Start Automating!

Now that you’ve seen a few examples, you’re ready to start automating the web for fun & profit! You’re limited only by your imagination. Explore Puppeteer’s API Docs to learn more.

Leave a Reply

Your email address will not be published. Required fields are marked *