本文共 2315 字,大约阅读时间需要 7 分钟。
文章链接:https://www.jianshu.com/p/85f4624485b9
# datetime: 2020/10/6 13:53
# Scrape article titles and links from a Jianshu page (step-by-step version).
import pandas as pd

# Address of the page to scrape.
url = 'https://www.jianshu.com/p/85f4624485b9'

# Selector for one specific link (the <a> in the 4th child <p> of the article);
# handy for exploring what a single match looks like before scraping them all.
first_link_sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article > p:nth-child(4) > a'

# Selector for every <a> that is a direct child of a <p> under the article.
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'


def fetch_page(page_url):
    """Download *page_url* and return the requests_html response object."""
    # Imported locally so the parsing helpers below can be imported (and
    # tested) on machines where requests_html is not installed.
    from requests_html import HTMLSession

    # Open a session to talk to the server; the context manager releases its
    # connections as soon as the page has been downloaded.
    with HTMLSession() as session:
        return session.get(page_url)


def get_text_link_from_sel(sel, html=None):
    """Return a list of ``(text, link)`` tuples for elements matching *sel*.

    Args:
        sel: CSS selector identifying the anchor elements.
        html: Object exposing ``find(selector)`` whose results have ``text``
            and ``absolute_links`` attributes (e.g. ``response.html`` from
            requests_html). When omitted, the page at ``url`` is downloaded.

    Returns:
        A list of ``(text, link)`` tuples; empty when nothing matches.
        Elements that carry no link are skipped instead of aborting the
        whole extraction.
    """
    if html is None:
        html = fetch_page(url).html

    pairs = []
    for element in html.find(sel):
        # absolute_links is a set, e.g. {'https://www.jianshu.com/nb/130182'};
        # we only want the URL string itself.
        links = element.absolute_links
        if not links:
            continue
        pairs.append((element.text, next(iter(links))))
    return pairs


def links_to_frame(pairs):
    """Convert ``(text, link)`` tuples into a DataFrame with named columns."""
    # Passing the header here (instead of assigning df.columns afterwards)
    # keeps an empty result from raising a length-mismatch ValueError.
    return pd.DataFrame(pairs, columns=['text', 'link'])


def main():
    """Fetch the page, extract every article link and save them as CSV."""
    # Get the page content.
    page = fetch_page(url)

    # Optional exploration steps:
    # print(page.html.text)            # whole page text
    # print(page.html.links)           # links as written in the page
    # print(page.html.absolute_links)  # links resolved to absolute URLs
    # results = page.html.find(first_link_sel)
    # print(results)                              # matched elements
    # print(results[0].text)                      # text of the first match
    # print(results[0].absolute_links)            # a set with the link
    # print(list(results[0].absolute_links)[0])   # just the URL string
    # print(get_text_link_from_sel(sel, page.html))

    # Convert the list into a data frame with a header row.
    df = links_to_frame(get_text_link_from_sel(sel, page.html))

    # Show the result.
    print(df)

    # Save to a CSV file. NOTE(review): GBK is presumably chosen so that Excel
    # on a Chinese-locale system opens the file directly; text that GBK cannot
    # represent will raise UnicodeEncodeError - confirm this is acceptable.
    df.to_csv('output.csv', encoding='GBK', index=False)


if __name__ == '__main__':
    main()
# datetime: 2020/10/6 13:53
# Scrape article titles and links from a Jianshu page.
import pandas as pd

# Address of the page to scrape.
url = 'https://www.jianshu.com/p/85f4624485b9'

# Selector for every <a> that is a direct child of a <p> under the article.
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'


def fetch_page(page_url):
    """Download *page_url* and return the requests_html response object."""
    # Imported locally so the parsing helpers below can be imported (and
    # tested) on machines where requests_html is not installed.
    from requests_html import HTMLSession

    # The context manager releases the session's connections once the page
    # has been downloaded.
    with HTMLSession() as session:
        return session.get(page_url)


def get_text_link_from_sel(sel, html=None):
    """Return a list of ``(text, link)`` tuples for elements matching *sel*.

    Args:
        sel: CSS selector identifying the anchor elements.
        html: Object exposing ``find(selector)`` whose results have ``text``
            and ``absolute_links`` attributes (e.g. ``response.html`` from
            requests_html). When omitted, the page at ``url`` is downloaded.

    Returns:
        A list of ``(text, link)`` tuples; empty when nothing matches.
        Elements that carry no link are skipped instead of aborting the
        whole extraction.
    """
    if html is None:
        html = fetch_page(url).html

    pairs = []
    for element in html.find(sel):
        # absolute_links is a set; take its single URL string.
        links = element.absolute_links
        if not links:
            continue
        pairs.append((element.text, next(iter(links))))
    return pairs


def links_to_frame(pairs):
    """Convert ``(text, link)`` tuples into a DataFrame with named columns."""
    # Passing the header here (instead of assigning df.columns afterwards)
    # keeps an empty result from raising a length-mismatch ValueError.
    return pd.DataFrame(pairs, columns=['text', 'link'])


def main():
    """Fetch the page, extract every article link and save them as CSV."""
    page = fetch_page(url)
    df = links_to_frame(get_text_link_from_sel(sel, page.html))

    # Show the result.
    print(df)

    # Save to a CSV file. NOTE(review): GBK is presumably chosen so that Excel
    # on a Chinese-locale system opens the file directly; text that GBK cannot
    # represent will raise UnicodeEncodeError - confirm this is acceptable.
    df.to_csv('output.csv', encoding='GBK', index=False)


if __name__ == '__main__':
    main()
转载地址:http://qxdwi.baihongyu.com/