# Craigslist "ghia" listing scraper: searches a list of Pacific-Northwest
# craigslist cities, filters titles by white/black terms, and reports new posts.
import urllib2
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
# --- Configuration --------------------------------------------------------
HTML_FILE = "searches.html"  # HTML report of new matching listings
STORAGE = "listings_ghia.tsv"  # TSV archive of every listing ever seen
SCHEMA = ("full_url", "price", "title")  # column order of STORAGE rows
BLACK_TERMS = ("wanted", "parts")  # a title containing ANY of these is excluded
WHITE_TERMS = ("ghia",)  # a title must contain ALL of these to match
SEARCH_TERMS = ("ghia",)  # craigslist queries; one search per term per city
# Craigslist subdomains to search, grouped by region.
OREGON_CITIES = ["portland", "bend", "corvallis", "eastoregon", "eugene", "klamath", "medford", "oregoncoast",
                 "roseburg", "salem"]
WASHINGTON_CITIES = ["bellingham", "kpr", "moseslake", "olympic", "pullman", "seattle",
                     "skagit", "spokane", "wenatchee", "yakima"]
N_CALI_CITIES = ["siskiyou", "humboldt", "redding", "susanville",
                 "chico", "mendocino", "yubasutter", "reno", "sacramento", "goldcountry", "stockton", "sfbay",
                 "modesto", "merced"]
CITIES = OREGON_CITIES + WASHINGTON_CITIES + N_CALI_CITIES
# Debug override: uncomment to limit a run to a single city.
#CITIES = ["portland",]
def read_listings(storage, schema):
    """Load previously saved listings from the TSV storage file.

    Args:
        storage: path of the tab-separated storage file.
        schema: ordered field names matching the columns of each row.

    Returns:
        dict mapping each listing's full_url to its field dict. Returns an
        empty dict when the storage file does not exist yet (first run),
        where the original crashed with IOError.
    """
    listings = {}
    try:
        f = open(storage, "r")
    except IOError:
        # No archive yet -- nothing has been scraped before.
        return listings
    # 'with' guarantees the handle is closed; the original leaked it.
    with f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == [""]:
                continue  # skip blank lines instead of storing a junk "" key
            data = dict(zip(schema, fields))
            listings[data['full_url']] = data
    return listings
def write_listings(listings, storage, schema):
    """Persist every listing to the TSV storage file, one row per listing.

    Fields named in `schema` are written tab-separated. A missing field is
    written as an empty string: the original bare `except: pass` silently
    dropped the entire row when any field was absent, losing data.

    Args:
        listings: dict of full_url -> listing field dict.
        storage: path of the TSV file to overwrite.
        schema: ordered field names to emit as columns.
    """
    # .values() behaves the same as the Python-2-only .itervalues() here
    # and keeps the function portable.
    with open(storage, "w") as out:
        for data in listings.values():
            row = [data.get(field, "") or "" for field in schema]
            out.write("\t".join(row) + "\n")
def write_html(listings, html_path):
    """Render listings as a minimal HTML page of "price - title" links.

    Args:
        listings: iterable of listing dicts with 'full_url', 'price', 'title'.
        html_path: output file path (overwritten).
    """
    listing_block = '<a href="{url}">{price} - {title}</a><br>'
    # 'with' closes the file even if rendering raises; the original leaked
    # the handle on error.
    with open(html_path, "w") as out:
        out.write("<html><body>")
        for data in listings:
            try:
                out.write(listing_block.format(url=data['full_url'],
                                               price=data['price'],
                                               title=data['title']))
            except KeyError:
                # Skip malformed listings missing a required field; the
                # original bare except also hid unrelated errors (e.g. I/O).
                pass
        out.write("</body></html>")
def handle_parsings(existing, read, black_terms, white_terms):
    """Merge freshly parsed listings into `existing`, tagging each new one.

    Each listing not already in `existing` gets these keys added:
      - 'blacked': True if ANY black term appears in the title,
      - 'matched': True only if ALL white terms appear in the title,
      - 'open_date': normalized to "YYYYMMDD" (craigslist omits the year,
        so the current year is assumed -- NOTE: late-December posts parsed
        in January will be stamped with the wrong year),
      - 'new': True.
    Term matching is case-insensitive. `existing` is mutated in place; a
    listing whose full_url is already present is left untouched.

    Args:
        existing: dict of full_url -> listing dict, updated in place.
        read: iterable of freshly parsed listing dicts.
        black_terms: exclusion substrings.
        white_terms: required substrings.
    """
    for data in read:
        if data['full_url'] in existing:
            # Already archived: flag the incoming copy and keep the stored one.
            data['new'] = False
            continue
        title = data['title'].lower()  # lowercase once, not once per term
        data['blacked'] = any(term in title for term in black_terms)
        data['matched'] = all(term in title for term in white_terms)
        # Dates arrive like "Mar  5"; re-anchor to the current year.
        open_date = datetime.strptime(data['open_date'], '%b %d')
        open_date = datetime(datetime.now().year, open_date.month, open_date.day)
        data['open_date'] = open_date.strftime("%Y%m%d")
        data['new'] = True
        existing[data['full_url']] = data
def parse(city, term, index=None):
    """Fetch one page of craigslist "for sale" search results and parse it.

    Args:
        city: craigslist subdomain, e.g. "portland".
        term: search query string (spaces are URL-encoded).
        index: optional page number; page N starts at result N*100.

    Returns:
        list of listing dicts with keys 'url', 'category', 'open_date',
        'title', 'price' and 'full_url'. Empty list when the fetch fails.
    """
    # BUG FIX: the host was misspelled "craiglist.org" (missing an "s"),
    # so every search request went to the wrong domain. The post_url below
    # already had the correct spelling.
    url = "http://{city}.craigslist.org/search/sss{index}query={query}"
    if index is not None:
        index = "?s=%i00&" % index  # craigslist paginates in blocks of 100
    else:
        index = "?"
    form_url = url.format(city=city, index=index, query=term.replace(" ", "%20"))
    print(form_url)
    try:
        request = urllib2.urlopen(form_url).read()
    except Exception:
        # Failures for individual cities are expected; log and move on
        # (narrowed from a bare except, which also hid KeyboardInterrupt).
        print("FAILED :::: " + form_url)
        return []
    soup = BeautifulSoup(request)
    listings = soup.find_all('p', class_='row')
    post_url = "http://{city}.craigslist.org{url}"
    parsed_listings = []
    for listing in listings:
        anchors = listing.find_all('a')
        url = anchors[0].get('href')
        category = anchors[-1].text
        open_date = listing.find_all('span', class_='date')[0].text
        title = listing.find_all('span', class_='pl')[0].find_all('a')[0].text
        try:
            price = listing.find_all('span', class_='price')[0].text
        except IndexError:
            price = "-1"  # many posts have no price; these sort first later
        # Relative hrefs need the city host prepended; absolute URLs pass through.
        if "craigslist" not in url:
            full_url = post_url.format(city=city, url=url)
        else:
            full_url = url
        parsed_listings.append({'url': url, 'category': category,
                                'open_date': open_date,
                                'title': title, 'price': price,
                                'full_url': full_url})
    return parsed_listings
if __name__ == '__main__':
    # Load everything seen before so only genuinely new posts are reported.
    listings = read_listings(STORAGE, SCHEMA)
    for city in CITIES:
        for term in SEARCH_TERMS:
            handle_parsings(listings, parse(city, term), BLACK_TERMS, WHITE_TERMS)
    # Keep only new posts that pass the white list and miss the black list.
    # Old archived rows lack 'blacked', so defaulting it to True excludes them.
    new_listings = [data for data in listings.values()
                    if data.get('new', False) and data.get('matched', False)
                    and not data.get('blacked', True)]

    def _price_key(data):
        # Prices arrive like "$1,200"; the original int(x.strip("$")) raised
        # ValueError on the comma and crashed the whole run. Unparseable
        # prices now sort first with the "-1" sentinel.
        try:
            return int(data.get("price", "-1").strip("$").replace(",", ""))
        except ValueError:
            return -1

    new_listings.sort(key=_price_key)
    for data in new_listings:
        print("\t".join(data.get(field, "None")
                        for field in ("title", "price", "full_url")))
    write_listings(listings, STORAGE, SCHEMA)
    write_html(new_listings, HTML_FILE)
# BLACK_TERMS: if any of these strings appears in a post title, the post is ignored.
# WHITE_TERMS: all of these strings must appear in the post title, or the post is ignored.
# SEARCH_TERMS: the queries passed to craigslist -- what you would type in the
#               search bar. Multiple entries result in multiple searches.
# Notes:
# - All posts are saved to the STORAGE file so they can be reviewed later if
#   your white/black listing is too harsh.
# - This only works for US-based cities. Extending it to other countries is
#   left as an exercise for the reader.
# - Search terms and the white/black lists ignore case.