Installation
Install Qt4
1 2 3 4 5 6 7 |
# Download the Qt SDK offline installer (32-bit Linux).
# Landing page: http://qt.nokia.com/downloads/sdk-linux-x11-32bit-cpp-offline
#
# -O pins the local file name: without it wget names the download after the
# last component of the URL ("dp?uri=..."), so the chmod and installer steps
# below would not find the file. The URL is quoted so the shell never treats
# "?" as a glob character.
wget -O QtSdk-offline-linux-x86-v1.2.1.run \
    "http://www.developer.nokia.com/dp?uri=http%3A%2F%2Fsw.nokia.com%2Fid%2F8ea74da4-fec1-4277-8b26-c58cc82e204b%2FQt_SDK_Lin32_offline"

# Make the installer executable for the current user, then run it as root.
chmod u+x ./QtSdk-offline-linux-x86-v1.2.1.run
sudo ./QtSdk-offline-linux-x86-v1.2.1.run

# install Qt4 Library
sudo apt-get install -y python-lxml qt4-qmake
Python Libraries
1 2 3 4 5 6 7 8 9 10 11 12 13 |
# install cssselect - fix "ImportError: No module named cssselect"
sudo pip install cssselect

# install webkit-server
git clone https://github.com/niklasb/webkit-server.git webkit-server
# Run the install in a subshell so the working directory is unchanged
# afterwards; "&&" skips the install if the cd fails.
( cd webkit-server && sudo python setup.py install )

# install dryscrape
# (alternative: sudo pip install dryscrape)
git clone https://github.com/niklasb/dryscrape.git dryscrape
# Same subshell pattern: dryscrape is cloned next to webkit-server rather
# than inside it, and the shell stays in the starting directory.
( cd dryscrape && sudo python setup.py install )
dryscrape_test.py – Test File
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# Write the test script to dryscrape_test.py. The delimiter is quoted
# ("_EOF_"), so the shell performs no expansion inside the heredoc and the
# Python source is written verbatim.
cat > dryscrape_test.py <<"_EOF_"
# -*- coding: utf-8 -*-
import dryscrape

search_keyword = 'dryscrape'

# set up a web scraping session
session = dryscrape.Session(base_url='http://google.com')

# we don't need images
session.set_attribute('auto_load_images', False)

# visit homepage and search for a term
session.visit('/')
q = session.at_xpath('//*[@name="q"]')
q.set(search_keyword)
q.form().submit()

# extract all links
for link in session.xpath('//a[@href]'):
    # call form of print works on both Python 2 and Python 3
    print(link['href'])

# save a screenshot of the web page
session.render('google.png')
print("Screenshot written to 'google.png'")
_EOF_