链家网站爬取


import os
import threading
import time
from multiprocessing.dummy import Pool

import pandas as pd
import requests
from lxml import etree

# Wall-clock start of the run; the script subtracts it from the current time
# at the very end to report the total duration.
start = time.time()

# One independent, initially empty list per output column. request() appends
# to these lists; they are module-level so every worker shares them.
title, house, place, rent, area, direction, distinguish = ([] for _ in range(7))

# Column header -> backing list. The dict holds references to the lists above
# (not copies), so rows appended later are visible when the DataFrame is built.
detail = dict(zip(
    ("标题", "房屋户型", "位置", "月租金", "面积", "方向", "区"),
    (title, house, place, rent, area, direction, distinguish),
))


# Guards the shared column lists. request() runs on several pool threads at
# once; without the lock, appends from different threads can interleave
# between the seven lists and put values of different listings on one row.
_RESULT_LOCK = threading.Lock()

# Seconds to wait for the server before giving up on a page, so that a
# stalled connection cannot block a worker thread forever.
_REQUEST_TIMEOUT = 10


def _parse_listing(li):
    """Turn one listing element into a row tuple.

    Returns:
        (title, house, place, rent, area, direction, district), in the same
        order as the module-level column lists are filled.

    Raises:
        IndexError: if any of the expected nodes or text fields is missing.
    """
    # Title: first whitespace-separated token of the first link text.
    name = li.xpath('./div/p/a/text()')[0].strip().split()[0]
    # District and location are the first two link texts of the second <p>.
    links = li.xpath('./div[1]/p[2]/a//text()')
    district, spot = links[0], links[1]
    # The bare text of the second <p>, split on whitespace, carries the
    # area (index 1), direction (index 2) and layout (last token).
    fields = "".join(li.xpath('./div/p[2]/text()')).split()
    size, facing, layout = fields[1], fields[2], fields[-1]
    # Monthly rent.
    price = li.xpath('./div[1]/span/em/text()')[0]
    return name, layout, district + " " + spot, price, size, facing, district


def request(link):
    """Scrape the paginated listings under ``link`` into the shared columns.

    Fetches ``link + "pg1/"`` through ``link + "pg100/"`` and stops early at
    the first page that yields no listing elements or that cannot be fetched.
    Every listing that parses completely is appended, as one row, to the
    module-level lists ``title``, ``house``, ``place``, ``rent``, ``area``,
    ``direction`` and ``distinguish``. Listings that lack an expected field
    are skipped, so the lists always keep equal lengths.

    Args:
        link: Base URL of one district, ending in "/".

    Returns:
        None. Results are delivered through the module-level lists.
    """
    print("开始爬取" + link)
    # Loop-invariant, so built once instead of once per page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Referer": link
    }
    columns = (title, house, place, rent, area, direction, distinguish)

    # A single session for the whole district, closed when the crawl ends.
    with requests.Session() as session:
        for page in range(1, 101):
            url = link + "pg{}/".format(page)
            # Sent with the request exactly as before: "query" names the
            # previous page number.
            payload = {
                "type": "1",
                "query": link + "pg{}/".format(page - 1)
            }
            try:
                html = session.get(
                    url=url,
                    headers=headers,
                    data=payload,
                    timeout=_REQUEST_TIMEOUT,
                ).text
            except requests.RequestException as exc:
                # Give up on this district only; rows already collected and
                # the other districts are unaffected.
                print("请求失败,停止爬取 " + url + ": " + str(exc))
                break

            tree = etree.HTML(html)
            # NOTE(review): etree.HTML appears to return None for an empty
            # document in some lxml versions; treat that like an empty page.
            cards = [] if tree is None else tree.xpath('//*[@id="content"]/div[1]/div[1]/div')
            if not cards:
                break

            # Parse the whole page first so that a malformed card never
            # leaves a partially written row behind.
            rows = []
            for card in cards:
                try:
                    rows.append(_parse_listing(card))
                except IndexError:
                    print("跳过无法解析的房源: " + url)

            # Publish complete rows atomically with respect to other threads.
            with _RESULT_LOCK:
                for row in rows:
                    for column, value in zip(columns, row):
                        column.append(value)

    print(link + "爬取完成")


# District landing pages to crawl; request() appends "pgN/" to each of them.
lis = ['https://bj.lianjia.com/zufang/dongcheng/',
       'https://bj.lianjia.com/zufang/xicheng/',
       'https://bj.lianjia.com/zufang/chaoyang/',
       'https://bj.lianjia.com/zufang/haidian/',
       'https://bj.lianjia.com/zufang/fengtai/',
       'https://bj.lianjia.com/zufang/shijingshan/',
       'https://bj.lianjia.com/zufang/tongzhou/',
       'https://bj.lianjia.com/zufang/changping/',
       'https://bj.lianjia.com/zufang/daxing/',
       'https://bj.lianjia.com/zufang/yizhuangkaifaqu/',
       'https://bj.lianjia.com/zufang/shunyi/',
       'https://bj.lianjia.com/zufang/fangshan/',
       'https://bj.lianjia.com/zufang/mentougou/',
       'https://bj.lianjia.com/zufang/huairou/'
       ]

# Crawl the districts on 12 worker threads. The context manager shuts the
# pool down once map() has returned instead of leaving it open for the rest
# of the process.
with Pool(12) as pool:
    pool.map(request, lis)

data = pd.DataFrame(detail)

# The CSV is opened in append mode so earlier runs are kept. Write the header
# row only when the file is new or empty; otherwise each rerun would insert
# another header line in the middle of the data.
csv_path = "link6.csv"
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
data.to_csv(csv_path, mode='a', encoding='utf_8_sig', header=write_header)
print("全部爬取完毕!!!!")
print("总共用时", time.time() - start)


  上一篇:SyntaxError: unexpected EOF while parsing错误 下一篇:github超时解决  

湘ICP备19016894号 © 2019 小钱