数据采集方法四月小组合作--数据提取策略

9.2和9.3部分

正则

示例：用 requests 请求天猫评价接口，拿到原始响应文本。
import json
import re

import parsel
import requests
from lxml import etree
# Target: the `list_detail_rate.htm` endpoint on rate.tmall.com. The query
# string carries the item/seller ids, paging parameters and a `callback` name.
url = "https://rate.tmall.com/list_detail_rate.htm?itemId=621844857713&sellerId=3251681972&order=3&currentPage=1&pageSize=20&&callback=_DLP_2665_der_3_currentPage_1_pageSize_10_"

# Headers sent with the request. The Referer is the request URL itself.
# The cookie is an empty string here — presumably a real browser cookie has
# to be pasted in before the endpoint returns data (TODO confirm).
header = {"Referer": url, "cookie": ""}

# Only the decoded body text is kept; the Response object itself is discarded.
response = requests.get(url, headers=header).text

# Bare expression: echoes the text in an interactive/notebook session and is
# a no-op when the code runs as a plain script.
response

再用正则表达式从响应文本中提取每条评价的内容（rateContent 字段）。
# Extract the value of every "rateContent" field from the raw response text.
# The capture is non-greedy and stops at the `","fromMall"` that follows it,
# so each match is exactly one field value.
# `response` must already hold the response body as a str (set by the
# request snippet that precedes this one).
pat = re.compile(r'"rateContent":"(.*?)","fromMall"')

# findall() already returns a fresh list of the captured groups, so there is
# no need to create an empty list and extend it.
texts = pat.findall(response)

# Bare expression: displays the list in an interactive/notebook session.
texts

CSS 选择器和 XPath

思路：像剥洋葱一样，从最外层的容器开始，一层层向内定位到目标元素。

css

示例：用 parsel 的 CSS 选择器先选出列表条目，再在每个条目内提取面积文本。
# NOTE(review): `url` and `header` must be defined before this snippet runs.
# The class names below do not look like they belong to the Tmall endpoint
# used earlier in this document — presumably a different listing page is
# intended; confirm the URL.
response = requests.get(url=url, headers=header)
html_data = response.text
selector = parsel.Selector(html_data)

# Select every element that carries all three classes at once.
# Original author's note: this matched 10 <li> items, checked with Ctrl+F.
lis = selector.css('.resblock-list.post_ulog_exposure_scroll.has-results')

for li in lis:
    # Query relative to the current item; .get() yields the first matching
    # text node, or None when there is no match.
    area = li.css('.resblock-area span::text').get()  # floor area
    print(area)

xpath

示例：用 lxml 的 XPath 完成同样的提取。
# NOTE(review): `url` and `header` must be defined before this snippet runs;
# confirm they point at the listing page these class names belong to.
response = requests.get(url=url, headers=header)
html_data = response.text
parse_html = etree.HTML(html_data)

# @class="..." is an exact string comparison, so the attribute value must be
# exactly these three class names in this order.
lis = parse_html.xpath('//li[@class="resblock-list post_ulog_exposure_scroll has-results"]')

for li in lis:
    # The leading "." anchors the query at the current <li>. A bare "//div"
    # is evaluated from the document root, so every iteration would return
    # the first item's area instead of this item's.
    areas = li.xpath('.//div[@class="resblock-area"]/span/text()')
    # Fall back to None when the item has no area text instead of raising
    # IndexError on an empty result list.
    area = areas[0] if areas else None  # floor area
    print(area)

解析 JSON

示例：接口直接返回 JSON 时，用 response.json() 解析即可。
# `team_season` data endpoint on server.cbaleague.com; the query string
# selects matchtypeid=1 and ranktype=PointsAverage.
url = 'https://server.cbaleague.com/data/team_season?matchtypeid=1&ranktype=PointsAverage'

# Headers sent with the request: the Referer is the URL itself, and the
# user-agent is a fixed browser-style string.
header = {
    'Referer': url,
    'user-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
}

# Decode the response body as JSON straight away.
res = requests.get(url, headers=header).json()

# Bare expression: shows the nested ['data']['data'] value in an
# interactive/notebook session.
res['data']['data']

遍历返回的数据列表，打印每支球队的 TeamCNAlias 字段。
# Walk the records under res['data']['data'] (`res` is the decoded JSON from
# the request above) and print each record's 'TeamCNAlias' value —
# presumably the team's Chinese short name; confirm against the payload.
for team in res['data']['data']:
    print(team['TeamCNAlias'])

高德API

示例：调用高德地理编码接口，把地址转换为坐标（location 字段）。
def geocode(address):
    """Return the 'location' of the first geocoding match for *address*.

    Sends *address* to the AMap `v3/geocode/geo` REST endpoint and reads
    `geocodes[0].location` from the JSON reply.

    NOTE(review): the original snippet was a bare function body (a
    module-level `return` and an unbound `address`); the function name used
    here is a reviewer's choice — rename to match the surrounding project.

    Args:
        address: The address text to look up.

    Returns:
        The value stored under 'location' in the first entry of 'geocodes'
        (presumably a "longitude,latitude" string — confirm against the AMap
        documentation).

    Raises:
        KeyError: If the reply has no 'geocodes' key.
        IndexError: If 'geocodes' is an empty list.
    """
    url = 'https://restapi.amap.com/v3/geocode/geo'
    # The key is a blank placeholder; a real AMap Web-service key must be
    # filled in before the request can succeed.
    params = {
        'key': ' ',
        'address': address,
    }
    # The second positional argument of requests.get() is `params`, so the
    # dict is sent as the URL query string.
    res = requests.get(url, params)
    j = json.loads(res.text)
    return j['geocodes'][0]['location']


感谢聆听。