目标:从当当网的图书排行榜中,爬取全部25页的图书名。
技巧:
- 通过翻页查看网页URL变化,推断页数和URL的关系。
- 通过所要爬取内容周围HTML文本的特点,写正则表达式提取特定内容。
使用正则表达式需要引入 re 模块;发送 HTTP 请求则使用 requests 库。
| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 
 | import os
 import re

 import requests
 
 if __name__ == "__main__":
     # Scrape the book titles from every page of the Dangdang bestseller
     # ranking and append them, numbered, to a text file.

     # The page number is the only part of the URL that changes between pages.
     url = "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent30-0-0-1-{:d}"
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
     }

     # Sample of the markup the pattern is written against:
     # <div class="name"><a href="http://product.dangdang.com/29168581.html" target="_blank" title="医路向前巍子给中国人的救护指南">医路向前巍子给中国人的救护指南</a></div>
     # The single capture group is the anchor text, i.e. the book title.
     ex = '<div class="name">.*?target="_blank".*?>(.*?)<'
     # Compile once outside the loop; re.S lets ".*?" span line breaks.
     pattern = re.compile(ex, re.S)

     page_num = 25
     request_timeout = 10  # seconds; requests waits indefinitely without one
     output_path = 'output/top_book_names.txt'

     top_book_names = []

     for i in range(1, page_num + 1):
         url_temp = url.format(i)
         response = requests.get(url=url_temp, headers=headers, timeout=request_timeout)
         # Fail loudly on an HTTP error: an error page would yield no matches,
         # still be reported as a success, and shift the rank of every later book.
         response.raise_for_status()
         # NOTE(review): .text relies on requests' encoding detection; if titles
         # come out garbled, set response.encoding explicitly before reading it.
         page_text = response.text

         book_names_temp = pattern.findall(page_text)
         top_book_names.extend(book_names_temp)
         print("爬取第{:d}页成功".format(i), book_names_temp)

     # open() does not create missing parent directories.
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     # NOTE(review): append mode is kept from the original, so re-running the
     # script adds a second numbered list to the same file - confirm intended.
     with open(output_path, 'at', encoding='utf-8') as file:
         file.writelines(
             '{:d}. {}\n'.format(rank, name)
             for rank, name in enumerate(top_book_names, start=1)
         )
 
 |