import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
driver = webdriver.Chrome(options=chrome_options)
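# Optional, and an assumption rather than part of the original script: Chrome
# can be pointed at a fixed download directory so the download-button click
# further down saves PDFs without a save dialog. The path is a placeholder.
# chrome_options.add_experimental_option("prefs", {
#     "download.default_directory": "/absolute/path/to/pdfs",
#     "download.prompt_for_download": False,
# })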
url_conf_cat = 'https://papers.cool'

with open('confs_data.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)
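# Assumed input shape, inferred from the 'name'/'url' accesses below (the
# example values are illustrative only):
# [{"name": "<conference name>", "url": "<conference page URL>"}, ...]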
for num, data in enumerate(datas):
    href = data['url']
    conf_name = data['name']
    if num >= 1:
        print('num:' + str(num))
        print('conf_name:' + str(conf_name))
    driver.get(href)

    # Scroll to the bottom until the page height stops growing, so that all
    # lazily loaded paper entries are present in the DOM.
    scroll_pause_time = 2
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the fully loaded page and collect every paper's title link.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    conf_items = soup.find_all('a', class_='title-link')

    link_lists = []
    name_lists = []
    for item in conf_items:
        url_p_list = url_conf_cat + item.attrs['href']
        name_lists.append(item.text)
        link_lists.append(url_p_list)
    time.sleep(4)

    # Visit each paper page in turn and download its PDF.
    for i, link in enumerate(link_lists):
        name = name_lists[i]
        try:
            driver.get(link)
            time.sleep(2)
            # Click the "pdf" toggle so the embedded viewer iframe is added
            # to the DOM.
            pdf_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "title-pdf"))
            )
            pdf_link.click()
            time.sleep(1)
            # Extract the viewer iframe's src, build the absolute URL of the
            # PDF viewer page, and open it.
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            iframe = soup.find('iframe')
            iframe_src = iframe['src']
            pdf_url = 'https://papers.cool' + iframe_src
            driver.get(pdf_url)
            time.sleep(2)
            # Wait for the viewer's download button, give the PDF time to
            # finish loading, then trigger the browser download.
            save_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "download"))
            )
            time.sleep(12)
            save_button.click()
            time.sleep(3)
            print('confs:' + str(conf_name) + '-paper:' + str(name))
        except Exception:
            # A failed paper should not stop the crawl; log it and move on.
            print('confs:' + str(conf_name) + '-paper:' + str(name) + ' download failed')
        time.sleep(4)
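The fixed sleeps around the download click are the fragile part of this script: a large PDF can take longer than 12 seconds to load, and the download itself may outlive the final 3-second pause. A sturdier pattern, sketched below under the assumption that Chrome writes into a known download directory (see the commented prefs near the top), is to poll that directory until no partial .crdownload files remain. The wait_for_downloads name and its parameters are illustrative, not part of the original script.

import glob
import os
import time

def wait_for_downloads(download_dir, timeout=120, poll=1.0):
    """Block until Chrome has no in-progress downloads in download_dir.

    Call this shortly after the click; if the .crdownload file has not
    appeared yet, a brief sleep beforehand avoids returning too early.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Chrome writes in-progress downloads as *.crdownload and renames
        # them once the transfer finishes.
        if not glob.glob(os.path.join(download_dir, '*.crdownload')):
            return True
        time.sleep(poll)
    return False

With this helper, save_button.click() could be followed by wait_for_downloads('/absolute/path/to/pdfs') in place of the trailing time.sleep(3).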