# Scrape the Foursquare API to obtain venue information

import urllib
import urllib2
import json
import datetime
import pandas as pd
from pandas.io.json import json_normalize
import math
import time
from math import cos
from pandas import DataFrame

### Helper function for converting meters to lat/long

def distcust(p, d, lat_m, long_m):
    """Return the point reached by offsetting *p* by multiples of *d* meters.

    Args:
        p: Mapping with ``'lat'`` and ``'long'`` keys, in decimal degrees.
        d: Length of one step, in meters.
        lat_m: Number of steps to move along the latitude axis (north is
            positive); may be fractional or negative.
        long_m: Number of steps to move along the longitude axis (east is
            positive); may be fractional or negative.

    Returns:
        A new ``{'lat': ..., 'long': ...}`` dict in decimal degrees. *p* is
        not modified.
    """
    # One degree of latitude covers ~111.1 km: the quarter meridian
    # (equator to pole, ~10,000 km) spans 90 degrees.
    meters_per_degree = 10000.0 / 90 * 1000

    lat = p['lat']
    lng = p['long']  # not named ``long`` to avoid shadowing the Py2 builtin

    lat1 = lat + lat_m * (d / meters_per_degree)
    # Meridians converge towards the poles, so one degree of longitude is
    # shorter than one degree of latitude by a factor of cos(latitude).
    # cos() works in radians, hence the explicit conversion from degrees.
    long1 = lng + long_m * (d / (meters_per_degree * cos(math.radians(lat))))

    return {'lat': lat1, 'long': long1}

# --- Configuration ----------------------------------------------------------
client_id = "YOUR_CLIENT_ID"
client_secret = "YOUR_CLIENT_SECRET"
#p = {'lat': 37.7833, 'long': -122.4167} # central San Francisco, at Van Ness and Market
#p = {'lat': 40.783011, 'long': -73.965368} # central NYC, at Central Park
p = {'lat': 42.963601, 'long': -85.66878} # grand rapids, mi at division and fulton
distance = 100  # grid step and search radius passed to the API
limit = 50  # NOTE: currently not sent to the API
gridSize = 10
df = DataFrame()
# "stats" is required by the check-in extraction further down.
# NOTE(review): assumes the search response carries a "stats" object per
# venue; when it does not, the column is filled with NaN -- confirm against
# the API version in use.
requested_keys = ["categories", "id", "location", "name", "stats"]
# Columns taken from the flattened venue-details response.
requested_keys2 = ["id", "price.currency", "rating", "likes.count"]
category = "bar"
category_id = "4d4b7105d754a06376d81259"


def _fetch_json(url):
    """Return the decoded JSON body of *url*, always closing the connection."""
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return json.loads(response.read())
    finally:
        response.close()


def _venue_details(venue_id):
    """Return a one-row DataFrame holding ``requested_keys2`` for *venue_id*."""
    url2 = "https://api.foursquare.com/v2/venues/%s?client_id=%s&client_secret=%s&v=%s" % (venue_id, client_id, client_secret, time.strftime("%Y%m%d"))
    nom_data = json_normalize(_fetch_json(url2)['response']['venue'])
    # Not every venue reports every attribute; use a placeholder so the
    # column selection below cannot fail.
    for key in requested_keys2:
        if key not in nom_data.columns:
            nom_data[key] = 'NONE'
    return nom_data[requested_keys2]


def _first_category_name(categories):
    """Return the name of the first category, or None if there is none."""
    if isinstance(categories, (list, tuple)) and categories:
        return dict(categories[0]).get('name')
    return None


def _dict_field(value, key):
    """Return ``value[key]`` when *value* is a dict containing *key*, else None."""
    if isinstance(value, dict):
        return value.get(key)
    return None


# --- Scrape a grid of search cells around ``p`` -----------------------------
grid_offsets = [step / 10.0 for step in range(-3 * gridSize, 3 * gridSize)]
cell_frames = []  # one merged frame per grid cell, concatenated once at the end

for x in grid_offsets:
    for y in grid_offsets:
        center = distcust(p, distance, x, y)
        url = "https://api.foursquare.com/v2/venues/search?ll=%s,%s&intent=browse&radius=%s&categoryId=%s&client_id=%s&client_secret=%s&v=%s" % (center["lat"], center["long"], distance, category_id, client_id, client_secret, time.strftime("%Y%m%d"))
        try:
            venues = _fetch_json(url)["response"]['venues']
            if venues:
                # reindex() keeps going (NaN column) when a requested key is
                # absent from the response instead of raising KeyError.
                data = DataFrame(venues).reindex(columns=requested_keys)

                frames = []
                for d in data["id"]:
                    frames.append(_venue_details(d))
                    time.sleep(1)

                df2 = pd.concat(frames, ignore_index=True)
                mdata = pd.merge(data, df2, how='left', on='id', suffixes=('_x', '_y'))
                cell_frames.append(mdata)

            print(center)
        except Exception as e:
            # Best effort: one failing cell must not abort the whole scrape.
            print(e)
        time.sleep(1) # stay within API limits, also after a failed request

if cell_frames:
    df = pd.concat(cell_frames, ignore_index=True)
    # Neighbouring cells overlap, so the same venue can show up repeatedly.
    df = df.drop_duplicates(subset='id', keep='last')
print(df)

# --- Flatten nested fields and export ----------------------------------------
if df.empty:
    print("No venues were collected; nothing to export.")
else:
    df["categories"] = df["categories"].apply(_first_category_name)
    df["lat"] = df["location"].apply(lambda loc: _dict_field(loc, "lat"))
    df["long"] = df["location"].apply(lambda loc: _dict_field(loc, "lng"))
    df["distance"] = df["location"].apply(lambda loc: _dict_field(loc, "distance"))
    df["checkins"] = df["stats"].apply(lambda stats: _dict_field(stats, "checkinsCount"))

    # Merging on 'id' leaves the key and 'name' without a suffix; rename them
    # so the exported header keeps the column names this script declares.
    ordered_df = df.rename(columns={"id": "id_x", "name": "name_x"})[["id_x","name_x","categories","checkins", "distance","lat","long", "price.currency", "rating", "likes.count"]]
    ordered_df.to_csv("foursquare_%s_grand_rapids.csv" % category,encoding='utf-8', index=False)