Python web scraper! 🦸🏻‍♂️


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json

chrome_options = Options()
# chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)

url_conf_cat = 'https://papers.cool'

with open('confs_data.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)

for num, data in enumerate(datas):
    href = data['url']
    conf_name = data['name']

    # Only process entries from index 1 onwards (the first entry is skipped).
    if num >= 1:
        print('num:' + str(num))
        print('conf_name:' + str(conf_name))

        driver.get(href)
        # print(href)

        scroll_pause_time = 2  # pause after each scroll so new items can load
        last_height = driver.execute_script("return document.body.scrollHeight")  # current page height

        while True:
            # Scroll to the bottom of the page via JavaScript.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for newly loaded content.
            time.sleep(scroll_pause_time)
            # Measure the page height again.
            new_height = driver.execute_script("return document.body.scrollHeight")
            # Stop once the height no longer grows, i.e. the bottom has been reached.
            if new_height == last_height:
                break

            last_height = new_height
            # print(new_height)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        conf_items = soup.find_all('a', class_='title-link')

        link_lists = []
        name_lists = []
        for item in conf_items:
            url_p_list = url_conf_cat + item.attrs['href']
            name_lists.append(item.text)
            link_lists.append(url_p_list)
        # print(len(link_lists))
        # print(name_lists)
        time.sleep(4)

        for i, link in enumerate(link_lists):
            name = name_lists[i]
            try:
                driver.get(link)

                time.sleep(2)

                # Find the button that opens the PDF viewer and click it.
                pdf_link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "title-pdf"))
                )
                pdf_link.click()

                # Wait for the page to finish loading.
                time.sleep(1)

                # Grab the page source.
                page_source = driver.page_source

                # Parse the HTML with BeautifulSoup.
                soup = BeautifulSoup(page_source, 'html.parser')

                # Find the iframe that hosts the PDF viewer.
                iframe = soup.find('iframe')

                # Extract the src attribute of the iframe.
                iframe_src = iframe['src']
                # print("iframe src:", iframe_src)

                pdf_url = 'https://papers.cool' + iframe_src

                driver.get(pdf_url)

                page_source1 = driver.page_source

                # Parse the viewer page with BeautifulSoup (kept only for debugging).
                soup = BeautifulSoup(page_source1, 'html.parser')

                # print(soup.prettify())

                time.sleep(2)

                save_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "download"))
                )
                # Give the PDF time to render before triggering the download.
                time.sleep(12)
                save_button.click()

                time.sleep(3)

                print('confs:' + str(conf_name) + '-paper:' + str(name))

            except Exception:
                print('confs:' + str(conf_name) + '-paper:' + str(name) + ' download failed')

            # Pause briefly before moving on to the next paper.
            time.sleep(4)

The code above crawls the paper site papers.cool: for each conference page listed in confs_data.json it scrolls to the bottom so all entries load, collects the paper links, then opens each paper's PDF viewer and clicks the download button to save the PDF.
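For reference, the loop only reads two keys from each entry in confs_data.json: 'name' and 'url'. Below is a minimal sketch of how such a file could be produced; the venue names and URLs are made-up placeholders for illustration, not the author's actual list.

import json

# Hypothetical contents for confs_data.json -- only the "name" and "url" keys
# are read by the crawler loop; the venue URLs here are placeholders.
sample_confs = [
    {"name": "CVPR 2024", "url": "https://papers.cool/venue/CVPR.2024"},
    {"name": "NeurIPS 2023", "url": "https://papers.cool/venue/NeurIPS.2023"},
]

with open('confs_data.json', 'w', encoding='utf-8') as f:
    json.dump(sample_confs, f, ensure_ascii=False, indent=2)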

