获取新浪微博1000w用户及最近50条微博的Python脚本
2012-09-12 09:22:47 来源:我爱运维网 评论: 点击:
摘要:获取新浪微博1000w用户的基本信息,以及每个被爬取用户最近发表的50条微博;脚本使用Python编写,多进程爬取,并将数据存储在MongoDB中。
功能是:获取新浪微博1000w用户的基本信息,以及每个被爬取用户最近发表的50条微博。脚本使用Python编写,采用多进程爬取,并将数据存储在MongoDB中。
[代码] [Python]代码
004 |
import logging
import logging.config
import os
import pickle
import sys
import time
from multiprocessing import Process
from pprint import pprint

from pymongo import Connection
from weibopy.api import API
from weibopy.auth import OAuthHandler
from weibopy.binder import bind_api
from weibopy.error import WeibopError
015 |
# MongoDB connection settings read by Sina_reptile.__init__.
mongo_addr = 'localhost'
# 27017 is the port a stock mongod listens on.
mongo_port = 27017
# NOTE(review): the database name is not visible in the listing this file was
# recovered from; 'weibo' is a placeholder -- confirm before running.
db_name = 'weibo'
019 |
class Sina_reptile():
    """Fetch Sina Weibo profiles and statuses and store them in MongoDB.

    The Weibo API is accessed through ``weibopy``; documents are written to
    the ``userprofile`` and ``statuses`` collections of the database named by
    the module-level ``db_name``. Error paths log through the module-level
    ``log`` object, which the script creates in its ``__main__`` section.
    """

    # Attribute names copied one-to-one from an API user object.
    USER_FIELDS = (
        'screen_name', 'name', 'province', 'city', 'location', 'description',
        'url', 'profile_image_url', 'domain', 'gender', 'followers_count',
        'friends_count', 'statuses_count', 'favourites_count', 'created_at',
        'following', 'allow_all_act_msg', 'geo_enabled', 'verified',
    )

    # Attribute names copied one-to-one from an API status object.
    STATUS_FIELDS = (
        'created_at', 'text', 'source', 'favorited', 'truncated',
        'in_reply_to_status_id', 'in_reply_to_user_id',
        'in_reply_to_screen_name', 'thumbnail_pic', 'bmiddle_pic',
        'original_pic', 'mid',
    )

    def __init__(self, consumer_key, consumer_secret):
        """Store the application credentials and open the MongoDB connection.

        Args:
            consumer_key: application key issued by Sina.
            consumer_secret: application secret issued by Sina.
        """
        self.consumer_key, self.consumer_secret = consumer_key, consumer_secret
        # Object that getAtt() reads from; None until a caller assigns one.
        self.obj = None
        self.connection = Connection(mongo_addr, mongo_port)
        self.db = self.connection[db_name]
        self.collection_userprofile = self.db['userprofile']
        self.collection_statuses = self.db['statuses']

    def getAtt(self, key, default=''):
        """Return attribute *key* of ``self.obj``, or *default* if it is absent."""
        return self.getAttValue(self.obj, key, default)

    def getAttValue(self, obj, key, default=''):
        """Return attribute *key* of *obj*, or *default* if it is absent.

        A missing attribute yields *default* instead of raising, so one
        absent optional field does not abort the extraction of a whole record.
        """
        return getattr(obj, key, default)

    def _collect(self, obj, fields):
        """Return a dict mapping each name in *fields* to that attribute of *obj*."""
        return dict((field, self.getAttValue(obj, field)) for field in fields)

    def _pickled(self, value):
        """Return ``repr`` of *value* pickled with the highest protocol.

        Used for nested values that are stored as an opaque string.
        NOTE(review): only unpickle these strings from a database you trust;
        ``pickle.loads`` on tampered data can execute arbitrary code.
        """
        return repr(pickle.dumps(value, pickle.HIGHEST_PROTOCOL))

    def _pace_delay(self, remaining_hits, reset_seconds):
        """Return the pause, in seconds, that spreads the remaining quota evenly.

        With no hits left the whole time until the quota resets is returned.
        """
        if remaining_hits:
            return round(float(reset_seconds) / remaining_hits, 2)
        return reset_seconds

    def auth(self):
        """Obtain a Sina Weibo access token and secret via the interactive PIN flow.

        Prints the authorization URL, reads the PIN from standard input and
        builds ``self.api`` from the resulting handler. Returns early, after
        printing a hint, when either application credential is empty.

        NOTE(review): the ``def`` line of this method is missing from the
        listing this file was recovered from; the name ``auth`` is assumed.
        The handler is stored as ``self.auth`` (as ``setToken`` also does),
        so on an instance that attribute shadows this method after one call.
        """
        if not self.consumer_key:
            print("Please set consumer_key")
            return

        if not self.consumer_secret:
            print("Please set consumer_secret")
            return

        self.auth = OAuthHandler(self.consumer_key, self.consumer_secret)
        auth_url = self.auth.get_authorization_url()
        print('Please authorize: ' + auth_url)
        verifier = raw_input('PIN: ').strip()
        self.auth.get_access_token(verifier)
        self.api = API(self.auth)

    def setToken(self, token, tokenSecret):
        """Authenticate through OAuth with an already issued token pair.

        Args:
            token: OAuth access token.
            tokenSecret: secret belonging to *token*.
        """
        self.auth = OAuthHandler(self.consumer_key, self.consumer_secret)
        self.auth.setToken(token, tokenSecret)
        self.api = API(self.auth)

    def get_userprofile(self, id):
        """Return the profile of user *id* as a dict, or None on an API error.

        The dict holds ``'id'`` plus one entry per name in ``USER_FIELDS``.
        A ``WeibopError`` is printed, logged and reported as None so that the
        caller can skip the user.
        """
        try:
            user = self.api.get_user(id)
        except WeibopError as e:
            print("error occured when access userprofile use user_id: {0}".format(id))
            log.error("Error occured when access userprofile use user_id:{0}\nError:{1}".format(id, e), exc_info=sys.exc_info())
            return None

        userprofile = {'id': id}
        userprofile.update(self._collect(user, self.USER_FIELDS))
        return userprofile

    def get_specific_weibo(self, id):
        """Return the status with identifier *id* as a dict.

        The SDK call is bound here to ``/statuses/show/{id}.json``. Unlike
        ``get_latest_weibo`` the ``geo`` and ``retweeted_status`` values are
        returned as they are, not pickled. Errors raised while binding or
        calling the API propagate to the caller.
        """
        get_status = bind_api(path='/statuses/show/{id}.json',
                              payload_type='status',
                              allowed_param=['id'])
        status = get_status(self.api, id)

        statusprofile = {'id': id}
        statusprofile.update(self._collect(status, self.STATUS_FIELDS))
        statusprofile['geo'] = self.getAttValue(status, "geo")
        statusprofile['retweeted_status'] = self.getAttValue(status, "retweeted_status")
        return statusprofile

    def get_latest_weibo(self, user_id, count):
        """Return up to *count* recent statuses of *user_id* as a list of dicts.

        Returns None when the timeline request fails; the failure is printed
        and logged. Every status gets its own dict holding ``'usr_id'``,
        ``'id'``, the ``STATUS_FIELDS`` entries and pickled ``'geo'`` and
        ``'retweeted_status'`` values.
        """
        try:
            timeline = self.api.user_timeline(count=count, user_id=user_id)
        except Exception as e:  # errors may originate inside the SDK itself
            print("error occured when access status use user_id: {0}".format(user_id))
            log.error("Error occured when access status use user_id:{0}\nError:{1}".format(user_id, e), exc_info=sys.exc_info())
            return None

        statuses = []
        for status in timeline:
            # A fresh dict per status: reusing one dict would leave the list
            # holding N references to the last status only.
            statusprofile = {
                'usr_id': user_id,
                'id': self.getAttValue(status, "id"),
            }
            statusprofile.update(self._collect(status, self.STATUS_FIELDS))
            statusprofile['geo'] = self._pickled(self.getAttValue(status, "geo"))
            statusprofile['retweeted_status'] = self._pickled(self.getAttValue(status, "retweeted_status"))
            statuses.append(statusprofile)
        return statuses

    def friends_ids(self, id):
        """Return the ids of all users that *id* follows.

        Pages through the API starting at cursor 0 and stops at the first
        page whose ``next_cursor`` is 0 or missing.
        """
        ids = []
        cursor = 0
        while True:
            page = self.api.friends_ids(user_id=id, cursor=cursor)
            ids.extend(self.getAttValue(page, "ids", None) or [])
            cursor = self.getAttValue(page, "next_cursor", 0)
            if not cursor:
                # Also ends the loop when the field is missing, which would
                # otherwise request the same page forever.
                break
        return ids

    def manage_access(self):
        """Sleep long enough to stay inside the API rate limit.

        Queries the current rate-limit status, prints it together with the
        chosen pause and the process id, then sleeps for that pause plus a
        1.5 second safety margin.
        """
        info = self.api.rate_limit_status()
        remaining_hits = self.getAttValue(info, "remaining_hits", 0)
        reset_seconds = self.getAttValue(info, "reset_time_in_seconds", 0)
        sleep_time = self._pace_delay(remaining_hits, reset_seconds)

        print("{0} {1} {2} {3}".format(
            remaining_hits,
            reset_seconds,
            self.getAttValue(info, "hourly_limit"),
            self.getAttValue(info, "reset_time")))
        print("sleep time: {0} pid: {1}".format(sleep_time, os.getpid()))
        time.sleep(sleep_time + 1.5)

    def save_data(self, userprofile, statuses):
        """Insert *statuses* and then *userprofile* into their collections.

        An empty status list is skipped so that a user without statuses still
        gets a profile document.
        """
        if statuses:
            self.collection_statuses.insert(statuses)
        self.collection_userprofile.insert(userprofile)
209 |
def reptile(sina_reptile, userid, max_users=10000000, statuses_per_user=50):
    """Crawl the friends graph breadth-first, starting at *userid*.

    For every user the rate limit is honoured first, then the friend ids, the
    profile and the latest statuses are fetched and saved. Friends that have
    not been seen before form the next level of the crawl. The crawl ends
    when *max_users* users have been saved or no unvisited user is left.

    Args:
        sina_reptile: object providing ``manage_access``, ``friends_ids``,
            ``get_userprofile``, ``get_latest_weibo`` and ``save_data``.
        userid: id of the user the crawl starts from.
        max_users: upper bound on the number of users saved.
        statuses_per_user: number of recent statuses requested per user.

    Returns:
        The number of users whose data was saved.
    """
    saved = 0
    seen = set([userid])  # ids already queued, so no user is crawled twice
    frontier = [userid]

    while frontier and saved < max_users:
        next_frontier = []
        for uid in frontier:
            if saved >= max_users:
                break
            try:
                sina_reptile.manage_access()
                friend_ids = sina_reptile.friends_ids(uid)
                userprofile = sina_reptile.get_userprofile(uid)
                statuses = sina_reptile.get_latest_weibo(count=statuses_per_user, user_id=uid)
                if statuses is None or userprofile is None:
                    # The fetch failed and was already logged; skip this user.
                    continue
                sina_reptile.save_data(userprofile, statuses)
                saved += 1
            except Exception as e:
                # Best effort: one failing user must not stop the whole crawl.
                log.error("Error occured in reptile,id:{0}\nError:{1}".format(uid, e), exc_info=sys.exc_info())
                continue

            for friend_id in friend_ids or []:
                if friend_id not in seen:
                    seen.add(friend_id)
                    next_frontier.append(friend_id)
        frontier = next_frontier

    return saved
233 |
def run_crawler(consumer_key, consumer_secret, key, secret, userid):
    """Run one complete crawl; intended as the target of a worker process.

    Builds a ``Sina_reptile``, authenticates it with the given token pair and
    crawls from *userid*. Any exception is logged together with the process
    id instead of being raised, and the MongoDB connection is closed whether
    or not the crawl succeeded.

    Args:
        consumer_key: application key.
        consumer_secret: application secret.
        key: OAuth access token.
        secret: OAuth access token secret.
        userid: id of the user the crawl starts from.
    """
    sina_reptile = None
    try:
        sina_reptile = Sina_reptile(consumer_key, consumer_secret)
        sina_reptile.setToken(key, secret)
        reptile(sina_reptile, userid)
    except Exception as e:
        # Indices start at 0; with two arguments "{1}"/"{2}" would raise
        # IndexError here and hide the error being reported.
        log.error("Error occured in run_crawler,pid:{0}\nError:{1}".format(os.getpid(), e), exc_info=sys.exc_info())
    finally:
        if sina_reptile is not None:
            sina_reptile.connection.close()
243 |
if __name__ == "__main__":
    # Logging is configured before any worker is created; the crawler
    # functions log through this module-level ``log`` object.
    logging.config.fileConfig("logging.conf")
    log = logging.getLogger('logger_sina_reptile')

    # test.txt holds one crawler per line, whitespace separated:
    #   consumer_key consumer_secret token token_secret start_user_id
    workers = []
    with open('test.txt') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line
            if len(fields) < 5:
                log.error("Skipping malformed line in test.txt: {0!r}".format(line))
                continue
            p = Process(target=run_crawler, args=tuple(fields[:5]))
            p.start()
            workers.append(p)

    # Wait for every crawler so that the script exits only when all are done.
    for p in workers:
        p.join()
相关热词搜索:新浪 微博 python
上一篇:用python程序连接hive
下一篇:最后一页