I originally wanted to take the captcha on head-on, but the POST request never went through, so I settled for random delays between requests to avoid triggering the anti-scraping checks.
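The random-delay idea itself is tiny; a minimal sketch (the function name and bounds here are illustrative, not from the crawler code below):

```python
import random
import time


def polite_sleep(min_s=1, max_s=5):
    # Pause a random 1-5 seconds so the request timing looks less bot-like.
    # random.uniform draws fresh on every call, so each pause differs.
    time.sleep(random.uniform(min_s, max_s))
```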
fangdd.py
```python
import requests
from lxml import etree
import re
from check_code import *    # captcha-handling module
from get_pinyin import *    # Chinese-characters-to-pinyin module
from save_to_mongo import *
import time
import random


class Fangdd():
    def __init__(self):
        user_agent_list = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
        ]
        # Pick a random User-Agent from the list above
        self.headers = {
            "User-Agent": random.choice(user_agent_list)
        }
        # self.c = Code_text()

    def get_html(self, url):
        # Fetch the page source
        response = requests.get(url, headers=self.headers)
        html = response.text
        element = etree.HTML(html)
        # Use the <title> to detect the anti-scraping page
        title = element.xpath('//title/text()')[0]
        if title == 'chech captcha':  # the site's captcha page really is titled this way
            print("There's a mole, abort the deal (the crawler has been spotted; handling the captcha)")
            # self.c.post_code()
            time.sleep(5)
            # Back off, then retry the same URL instead of returning None
            return self.get_html(url)
        else:
            return html

    def get_location(self, html):
        # Collect the place names for the whole country
        addresses = []
        element = etree.HTML(html)
        lis = element.xpath('//div[@class="q3rm0"]/li[position()>1]//a/text()')
        for li in lis:
            # The page gives Chinese characters but the URLs use pinyin, so convert
            li_pinyin = get_pinyin(li)
            addresses.append(li_pinyin)
        return addresses

    def get_all_url(self, addresses):
        # Build the listing URL for each place name
        urls = []
        for address in addresses:
            addr_url = 'https://%s.fangdd.com/xiaoqu' % address
            urls.append(addr_url)
        return urls

    def parse_list_page(self, urls):
        not_found = []  # not every pinyin slug in a real URL comes from the characters, so some can't be fetched uniformly
        for p_url in urls:
            # Sleep a random interval (drawn fresh each pass) to avoid the anti-scraping checks
            time.sleep(random.randint(1, 5))
            html = self.get_html(p_url)
            element = etree.HTML(html)
            # Polyphonic characters break the conversion; use the title to find out
            title = element.xpath('//title/text()')[0]
            if title == '很抱歉!您访问的页面不存在!':
                # Some region names are abbreviated in the real URL, so the URL built from
                # the full pinyin does not match and we get a 404
                print('Pinyin does not match the real URL, dropping it from the list: %s' % p_url)
                not_found.append(p_url)
            else:
                print(title)
                # With 20 communities or fewer there is only one page, so max pages = 1
                max_xiaoqu = int(element.xpath('//p[@class="filter-result"]/b/text()')[0])

                if max_xiaoqu <= 20 and max_xiaoqu != 0:
                    print('This region has only one page')
                    print('max_xiaoqu=%s' % max_xiaoqu)
                    max_page = 1
                    self.get_informations(p_url, max_page)

                elif max_xiaoqu > 20:
                    # Find the largest page number to bound the loop.
                    # (The original snippet is cut off here; the XPath predicate
                    # below is a plausible reconstruction, not the author's exact code.)
                    max_page = element.xpath('//div[@class="pagebox"]/a[position()=last()-1]/text()')
                    max_page = int(max_page[0])
                    self.get_informations(p_url, max_page)
```
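For completeness, here is how the pieces above would be wired together. This driver is only a sketch: the entry URL (the page carrying the city list in `div[@class="q3rm0"]`) is an assumption, since the post doesn't show it:

```python
if __name__ == '__main__':
    spider = Fangdd()
    # Assumed entry page; the city list is parsed out of div[@class="q3rm0"]
    html = spider.get_html('https://www.fangdd.com/')
    addresses = spider.get_location(html)
    urls = spider.get_all_url(addresses)
    spider.parse_list_page(urls)
```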
get_pinyin.py
```python
from pypinyin import pinyin


def get_pinyin(text):
    # style=0 (Style.NORMAL) drops the tone marks; see
    # http://pypinyin.mozillazg.com/zh_CN/v0.9.1/
    p = pinyin(text, style=0)
    # e.g. [['chong'], ['qing']] -- one single-item list per character
    a = []
    for i in p:
        a.append(i[0])
    # print(p)
    b = ''.join(a)
    return b
```
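A quick check of the conversion, matching the `[['chong'], ['qing']]` example in the comment above:

```python
>>> get_pinyin('重庆')
'chongqing'
```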
save_to_mongo.py
```python
import pymongo

client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.fangdd_mark
collection = db.informations


def save_to_mongo(result):
    try:
        # collection.insert() was removed in pymongo 4+; insert_one() is the current API
        if collection.insert_one(result):
            pass
            # print('success save to mongodb')
    except Exception:
        print('error to mongo')
```
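The crawler calls this once per scraped record; a usage sketch (the field names are illustrative, since the post doesn't show the actual document schema):

```python
# Hypothetical record shape for illustration only
save_to_mongo({
    'city': 'chongqing',
    'name': 'some xiaoqu',
    'price': '12000',
})
```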
Because of the delays, plus the fairly large amount of data, the crawl takes a while: I finished a whole match of Honor of Kings and it still hadn't gotten past the cities starting with "c", by which point the database already held 22,000 records.
Run result: