# Extracting text by looking for classes - using lxml
# Using ScraperWiki to extract text from a webpage.
# This uses the lxml module - documentation here: http://lxml.de/lxmlhtml.html#parsing-html
# import a module (library) that helps us do scraping
import scraperwiki
# import another that helps us extract things from the scraped data
import lxml.html

# Fetch the page once and keep its raw contents in the variable html.
html = scraperwiki.scrape("http://www.nhs.uk/Services/Trusts/GPs/DefaultView.aspx?id=5PG")

# Parse the raw HTML into an element tree we can query, stored in gplist.
gplist = lxml.html.fromstring(html)

# Select every <a> tag nested inside an element with class "child-org-name".
# The leading period in the CSS selector means "class". cssselect returns a
# list of all matching elements (not just the first one).
gpname = gplist.cssselect(".child-org-name a")

# Loop through the matched links and save the text of each one.
# NOTE(review): gp.text is None when a link's content starts with a nested
# tag rather than plain text; gp.text_content() would be the alternative if
# that ever happens on this page.
for gp in gpname:
    # One record per link; "gp" is the column name holding the link text.
    record = {"gp": gp.text}
    # "gp" is also passed as the unique key, so re-running the scraper
    # updates existing rows instead of adding duplicates.
    scraperwiki.sqlite.save(["gp"], record)