How to scrape a website with Selenium + Python
Scraping a website with Python and Selenium requires:
Selenium: http://docs.seleniumhq.org/
Python
A web browser (Firefox, Chrome/Chromium, or Opera)
NB: For the purposes of this tutorial it is NOT NECESSARY to install the Selenium Server.
In addition, some "glue" that allows python to talk to the browser:
The python selenium library
A "driver" for the browser you want to use
Installing the python selenium library
The easiest way is to use Python's package manager *pip*, and install with a command like:
- sudo pip install selenium
You also need a "driver" that connects to a specific browser:
* geckodriver (for Firefox): https://github.com/mozilla/geckodriver/releases
* chromedriver (for Chrome/Chromium): https://chromedriver.chromium.org/downloads
Untar/unzip them and place them somewhere in your PATH. For instance, for me that is ~/bin, which is in my PATH.
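For example, for geckodriver on Linux or macOS (the exact filename depends on the version and platform you downloaded, so this is only an example):
- tar xzf geckodriver-v0.26.0-linux64.tar.gz
- mv geckodriver ~/bin/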
Links
* Python + Selenium tutorial http://selenium-python.readthedocs.io/installation.html
* Python Selenium package https://pypi.python.org/pypi/selenium
Recipes
Getting Started: Writing and Running a Python Script
1. You need a CODE EDITOR of some sort, e.g. https://www.sublimetext.com/ or https://atom.io/
Installing PIP
PIP is the "app store" of Python: it installs packages from the Python Package Index.
Try typing pip at the terminal to see if it's already installed.
To install it (if it says "command not found"):
On a mac, this might work:
- sudo easy_install pip
When you use sudo, it will ask you to enter your mac's password (it will not appear as you type it, that's normal). Press Enter...
On Linux:
- sudo apt-get install python-pip
Once installed, typing:
- pip
should show all sorts of options specific to pip.
Use PIP to install selenium
- sudo pip install selenium
Write a simple "Hello Browser" script to test Python + Selenium
Create a new text file in your code editor and paste in:
- from selenium import webdriver
- driver = webdriver.Firefox()
Save this as "hello.py".
Run this in the terminal:
- python hello.py
ERROR 1:
If you see this:
Traceback (most recent call last):
File "hello.py", line 2, in <module>
from selenium import webdriver
ImportError: No module named selenium
Selenium is not installed; try installing it again with pip.
ERROR 2:
os.path.basename(self.path), self.start_error_message)
selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable needs to be in PATH.
The browser driver (here geckodriver, the Firefox driver) is not in your PATH: download it and place it in a directory that is part of your PATH (see above).
The following script demonstrates:
* Opening the browser
* Going to a URL
* Searching the page with a CSS selector
* Extracting text
print "hello world"
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("http://carmentis.be")
from time import sleep
raw_input("press enter to continue")
while True:
print driver.find_element_by_css_selector("li.inventoryNb .tspValue").text
sleep(4)
driver.find_element_by_css_selector("li.next a").click()
Downloading an Image
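This recipe was left empty; here is a minimal sketch in the same Python 2 style, assuming the image URL has already been extracted from the page (the URL and filename below are placeholders):
from urllib2 import urlopen

# placeholder URL, e.g. obtained with img.get_attribute("src")
url = "http://example.com/picture.jpg"
data = urlopen(url).read()          # download the raw bytes
with open("picture.jpg", "wb") as f:
    f.write(data)                   # save them to a local file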
Outputting data in CSV format
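Also left empty; a minimal sketch using the standard library's csv.DictWriter, with field names borrowed from the scraper below as made-up example data:
import sys
from csv import DictWriter

items = [{"inventoryNb": "ABC.123", "objectName": "vase"}]   # made-up example data
writer = DictWriter(sys.stdout, fieldnames=["inventoryNb", "objectName"])
writer.writeheader()           # first line: the column names
for item in items:
    writer.writerow(item)      # one CSV line per dictionary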
Outputting data in JSON format
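A minimal sketch with the standard library's json module (same made-up example data):
import json

item = {"inventoryNb": "ABC.123", "objectName": "vase"}   # made-up example data
print json.dumps(item, indent=2)   # pretty-printed JSON on stdout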
Reading data in CSV format
python's csv DictReader is very handy when the csv file has a header line giving the names of the columns!
from csv import DictReader
import sys

reader = DictReader(sys.stdin)
for item in reader:
    print item    # each row becomes a dictionary keyed by the column names
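Run it with the CSV file on stdin, for example (readcsv.py and items.csv are placeholder names):
- python readcsv.py < items.csv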
Converting CSV to JSON
from csv import DictReader
import sys, json

items = []
reader = DictReader(sys.stdin)
for item in reader:
    items.append(item)    # collect every row
print json.dumps(items, indent=2)
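For example (csv2json.py is a placeholder name):
- python csv2json.py < items.csv > items.json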
You can find instructions on how to use this script here:
https://gitlab.constantvzw.org/diversions/differentorders
Python script (Michael)
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
BUFSIZE = 1024 * 1000
def wget (url, tofile):
    # download url into the file tofile; returns the number of bytes written
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if data == "":
                break
            count += len(data)
            fout.write(data)
    return count
def image_path (x):
    # derive an image filename from a slug (e.g. the inventory number)
    x = x.replace(" ", "_").lower()
    x = x+".jpg"
    return x

def log (*msg):
    # print to stderr, so messages do not mix with the data on stdout
    print (*msg, file=sys.stderr)
ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--imagepath", default="images")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="firefox", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="json", help="output format: json (default), csv")
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")
args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass  # the directory already exists
sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
if sleeptime is None:
    sleeptime = 0.5
# wait = WebDriverWait(b, 20)
# b.implicitly_wait(10) # seconds
b.get(args.starturl)
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()
# # switch to detail view...
# b.find_element_by_css_selector(".contentViews .arrowDownButton").click()
# sleep(0.25)
# b.find_element_by_id("viewTypes-detailView").click()
# wait.until(EC.visibility_of_element_located((By.ID, 'collectionDetailItem')))
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()
if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()
count = 0
while True:
    b.implicitly_wait(0)
    count += 1
    item = {}
    # each prop is also the CSS class of the li element holding that field
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass
    # permalink / bookmark
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))
    if not args.skipimages:
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()
        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        if img_src:
            # log("IMAGE: {0}".format(img_src))
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename
    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})
    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next is None:
        log("END OF LIST")
        break
    log("NEXT")
    b.implicitly_wait(10)
    next.click()
    if sleeptime:
        sleep(sleeptime)
log("output {0} items".format(count))
b.close()
LAST VERSION:
============
These edits enable grabbing entries that have no inventory number.
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
import re
BUFSIZE = 1024 * 1000
def wget (url, tofile):
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if data == "":
                break
            count += len(data)
            fout.write(data)
    return count
def image_path (x):
    x = x.replace(" ", "_").lower()
    x = x+".jpg"
    return x

def extractobjid (url):
    # pull the numeric objectId out of a permalink URL, used as a fallback slug
    m = re.search("objectId=([0-9]+)", url)
    if m:
        return m.group(1)
def log (*msg):
    print (*msg, file=sys.stderr)
ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--imagepath", default="images")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="firefox", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="json", help="output format: json (default), csv")
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")
args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass
sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
if sleeptime is None:
    sleeptime = 0.5
# wait = WebDriverWait(b, 20)
# b.implicitly_wait(10) # seconds
b.get(args.starturl)
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()
# # switch to detail view...
# b.find_element_by_css_selector(".contentViews .arrowDownButton").click()
# sleep(0.25)
# b.find_element_by_id("viewTypes-detailView").click()
# wait.until(EC.visibility_of_element_located((By.ID, 'collectionDetailItem')))
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()
if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()
count = 0
while True:
    b.implicitly_wait(0)
    count += 1
    item = {}
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass
    # permalink / bookmark
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))
    if not args.skipimages:
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()
        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        # if there is no inventory number, grab the image anyway
        if img_src:
            # log the image URL and paths to stderr so they do not
            # pollute the JSON/CSV data on stdout
            log(img_src)
            # fall back on the objectId from the permalink as a slug
            slug = item.get('inventoryNb')
            if slug is None:
                slug = extractobjid(item['url'])
            ifilename = image_path(slug)
            log(ifilename)
            ipath = os.path.join(args.imagepath, ifilename)
            log(ipath)
            if wget(img_src, ipath):
                item['image'] = ifilename
    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})
    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next is None:
        log("END OF LIST")
        break
    log("NEXT")
    b.implicitly_wait(10)
    next.click()
    if sleeptime:
        sleep(sleeptime)
log("output {0} items".format(count))
b.close()
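A hypothetical invocation, assuming the script is saved as scrape.py (the name is a placeholder; all the flags come from the ArgumentParser above, and the data goes to stdout):
- python scrape.py --browser firefox --format csv --imagepath images > items.csv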