How to scrape a website with Selenium + Python

Scraping a website with Python and Selenium requires:

    Selenium: http://docs.seleniumhq.org/
    Python
    A web browser (Firefox, Chrome or Chromium, Opera)

NB: For the purposes of this tutorial it is NOT NECESSARY to install the Selenium Server.

In addition, you need some "glue" that allows Python to talk to the browser:

    The Python selenium library
    A "driver" for the browser you want to use

Installing the Python selenium library

Easiest is to use Python's package manager *pip* and install with a command like:
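
    pip install selenium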


You also need a "driver" that connects to a specific browser:
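
    geckodriver (for Firefox): https://github.com/mozilla/geckodriver/releases
    chromedriver (for Chrome/Chromium): https://chromedriver.chromium.org/downloads
    operadriver (for Opera): https://github.com/operasoftware/operachromiumdriver/releases

(Links current at the time of writing.)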


Untar/unzip them and place the binary somewhere in your PATH. For instance, for me that is ~/bin, which is in my PATH.

Links

* Python + Selenium tutorial: http://selenium-python.readthedocs.io/installation.html
* Python Selenium package: https://pypi.python.org/pypi/selenium


Recipes


Getting Started: Writing and Running a Python Script
1. You need a CODE EDITOR of some sort, for example:
   https://www.sublimetext.com/
   https://atom.io/


Installing PIP

PIP is the "app store" of Python: it installs Python packages from the command line.

Try typing pip at the terminal to see if it's already installed.

To install it (if it says "command not found"):

On a mac, this might work:
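
    sudo easy_install pip

(easy_install ships with the system Python on older macs.)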

When you use sudo, it will ask you to enter your mac's password (it will not appear as you type it; that's normal). Press Enter.

On Linux:
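
    sudo apt-get install python-pip

(Debian/Ubuntu; other distributions have their own package names.)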

Once installed, typing pip at the terminal should show all sorts of options specific to pip.


Use PIP to install selenium
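
    pip install selenium

If that gives a permission error, sudo pip install selenium (or pip install --user selenium) can help.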


Write a "Hello Browser" simple script to test python + selenium

Create a new text file in your code editor and paste in:
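
A minimal sketch (Python 2, like the rest of these notes), pointed at the same site the recipes below use:

print "hello world"

from selenium import webdriver
driver = webdriver.Firefox()        # opens a Firefox window; needs geckodriver in your PATH
driver.get("http://carmentis.be")   # loads a page in that window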


Save this as "hello.py".

Run this in the terminal:

    python hello.py

ERROR 1:
If you see this:

    Traceback (most recent call last):
      File "hello.py", line 2, in <module>
        from selenium import webdriver
    ImportError: No module named selenium

Selenium is not installed; try again with pip.

ERROR 2:
If you see this:

    selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable needs to be in PATH.

The browser driver (here geckodriver, for Firefox) is not in your PATH: download it and move it into a directory that is in your PATH (see above).



Opening the browser

Going to a URL

Searching the page with a CSS selector

Extracting text


print "hello world"
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("http://carmentis.be")
from time import sleep

raw_input("press enter to continue")

while True:
        print driver.find_element_by_css_selector("li.inventoryNb .tspValue").text
        sleep(4)
        driver.find_element_by_css_selector("li.next a").click()



Downloading an Image
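
A minimal sketch with urllib2 (Python 2), along the lines of the wget helper in the full script below; the image URL is a placeholder, in practice you get it from an <img> element with get_attribute("src"):

from urllib2 import urlopen

img_src = "http://carmentis.be/someimage.jpg"  # placeholder URL
data = urlopen(img_src).read()
with open("image.jpg", "wb") as fout:
    fout.write(data)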

Outputting data in CSV format
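
A minimal sketch with csv.DictWriter (the same approach the full script below uses), with made-up rows:

from csv import DictWriter
import sys

items = [
    {"inventoryNb": "AAM 1", "objectName": "mask"},    # made-up example rows
    {"inventoryNb": "AAM 2", "objectName": "statue"},
]
writer = DictWriter(sys.stdout, fieldnames=["inventoryNb", "objectName"])
writer.writeheader()
for item in items:
    writer.writerow(item)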

Outputting data in JSON format
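
And in JSON, with the json module and the same made-up rows:

import json

items = [
    {"inventoryNb": "AAM 1", "objectName": "mask"},    # made-up example rows
    {"inventoryNb": "AAM 2", "objectName": "statue"},
]
print json.dumps(items, indent=2)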


Reading data in CSV format
Python's csv DictReader is very handy when the CSV file has a header line giving the column names!

from csv import DictReader
import sys

reader = DictReader(sys.stdin)
for item in reader:
    print item  # each row is a dict keyed by the header's column names

Converting CSV to JSON

from csv import DictReader
import sys, json

items = []
reader = DictReader(sys.stdin)
for item in reader:
    items.append(item)
print json.dumps(items, indent=2)
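
Run it with a CSV file on standard input (csv2json.py and items.csv are placeholder names):

    python csv2json.py < items.csv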




You can find instructions on how to use this script here:
https://gitlab.constantvzw.org/diversions/differentorders


Michael's Python script


from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser


BUFSIZE  = 1024 * 1000
def wget (url, tofile):
    # download url to tofile in chunks; returns the number of bytes written
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if data == "":
                break
            count += len(data)
            fout.write(data)
    return count

def image_path (x):
    # turn a name like "AAM 1" into a safe filename like "aam_1.jpg"
    x = x.replace(" ", "_").lower()
    x = x+".jpg"
    return x

def log (*msg):
    print (*msg, file=sys.stderr)


ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--imagepath", default="images")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="firefox", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="json", help="output format: json (default), csv")
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")

args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass

sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
    if sleeptime == None:
        sleeptime = 0.5

# wait = WebDriverWait(b, 20)
# b.implicitly_wait(10) # seconds
b.get(args.starturl)

log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()

# # switch to detail view...
# b.find_element_by_css_selector(".contentViews .arrowDownButton").click()
# sleep(0.25)
# b.find_element_by_id("viewTypes-detailView").click()
# wait.until(EC.visibility_of_element_located((By.ID, 'collectionDetailItem')))


props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()

if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()

count = 0

while True:
    b.implicitly_wait(0)
    count += 1
    item = {}
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass

    # permalink / bookmark
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))

    if not args.skipimages:
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()


        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1


        if img_src:
            # log("IMAGE: {0}".format(img_src))
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename

    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})

    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next == None:
        log("END OF LIST")
        break
    log("NEXT")
    b.implicitly_wait(10)
    next.click()
    if sleeptime:
        sleep(sleeptime)

log("output {0} items".format(count))
b.close()



LAST VERSION:
============
These edits enable grabbing entries that have no inventory number.

from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
import re

BUFSIZE  = 1024 * 1000
def wget (url, tofile):
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if data == "":
                break
            count += len(data)
            fout.write(data)
    return count

def image_path (x):
    x = x.replace(" ", "_").lower()
    x = x+".jpg"
    return x

def extractobjid (url):
    m = re.search("objectId=([0-9]+)", url)
    if m:
        return m.group(1)

def log (*msg):
    print (*msg, file=sys.stderr)


ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--imagepath", default="images")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="firefox", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="json", help="output format: json (default), csv")
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")

args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass

sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
    if sleeptime == None:
        sleeptime = 0.5

# wait = WebDriverWait(b, 20)
# b.implicitly_wait(10) # seconds
b.get(args.starturl)

log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()

# # switch to detail view...
# b.find_element_by_css_selector(".contentViews .arrowDownButton").click()
# sleep(0.25)
# b.find_element_by_id("viewTypes-detailView").click()
# wait.until(EC.visibility_of_element_located((By.ID, 'collectionDetailItem')))


props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()

if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()

count = 0

while True:
    b.implicitly_wait(0)
    count += 1
    item = {}
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass

    # permalink / bookmark
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))

    if not args.skipimages:
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()


        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1

        # if there is no inventory number, grab the entry anyway:
        # fall back to the objectId from the permalink for the filename

        if img_src:
            log("IMAGE: {0}".format(img_src))
            slug = item.get('inventoryNb')
            if slug == None:
                slug = extractobjid(item['url'])

            ifilename = image_path(slug)
            ipath = os.path.join(args.imagepath, ifilename)
            log("saving", ipath)  # log to stderr so stdout stays clean for the data
            if wget(img_src, ipath):
                item['image'] = ifilename


    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})

    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next == None:
        log("END OF LIST")
        break
    log("NEXT")
    b.implicitly_wait(10)
    next.click()
    if sleeptime:
        sleep(sleeptime)

log("output {0} items".format(count))
b.close()