Python web scraper! 🦸🏻‍♂️


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json

chrome_options = Options()
# chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)

url_conf_cat = 'https://papers.cool'

with open('confs_data.json', 'r', encoding='utf-8') as f:
    datas = json.load(f)

for num, data in enumerate(datas):
    href = data['url']
    conf_name = data['name']

    # Only process entries from index 1 onwards (the first entry is skipped).
    if num >= 1:
        print('num:' + str(num))
        print('conf_name:' + str(conf_name))

        driver.get(href)
        # print(href)

        scroll_pause_time = 2  # pause after each scroll so new items can load
        last_height = driver.execute_script("return document.body.scrollHeight")  # current page height

        while True:
            # Scroll to the bottom of the page via JavaScript.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for newly loaded content.
            time.sleep(scroll_pause_time)
            # Measure the page height again.
            new_height = driver.execute_script("return document.body.scrollHeight")
            # Stop once the height no longer grows, i.e. the bottom has been reached.
            if new_height == last_height:
                break

            last_height = new_height
            # print(new_height)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        conf_items = soup.find_all('a', class_='title-link')

        link_lists = []
        name_lists = []
        for item in conf_items:
            url_p_list = url_conf_cat + item.attrs['href']
            name_lists.append(item.text)
            link_lists.append(url_p_list)
        # print(len(link_lists))
        # print(name_lists)
        time.sleep(4)

        for i, link in enumerate(link_lists):
            name = name_lists[i]
            try:
                driver.get(link)

                time.sleep(2)

                # Find the button that opens the PDF viewer and click it.
                pdf_link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "title-pdf"))
                )
                pdf_link.click()

                # Wait for the page to finish loading.
                time.sleep(1)

                # Grab the page source.
                page_source = driver.page_source

                # Parse the HTML with BeautifulSoup.
                soup = BeautifulSoup(page_source, 'html.parser')

                # Find the iframe that hosts the PDF viewer.
                iframe = soup.find('iframe')

                # Extract the src attribute of the iframe.
                iframe_src = iframe['src']
                # print("iframe src:", iframe_src)

                pdf_url = 'https://papers.cool' + iframe_src

                driver.get(pdf_url)

                page_source1 = driver.page_source

                # Parse the viewer page with BeautifulSoup (kept only for debugging).
                soup = BeautifulSoup(page_source1, 'html.parser')

                # print(soup.prettify())

                time.sleep(2)

                save_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "download"))
                )
                # Give the PDF time to render before triggering the download.
                time.sleep(12)
                save_button.click()

                time.sleep(3)

                print('confs:' + str(conf_name) + '-paper:' + str(name))

            except Exception:
                print('confs:' + str(conf_name) + '-paper:' + str(name) + ' download failed')

            # Pause briefly before moving on to the next paper.
            time.sleep(4)

The code above crawls the paper site papers.cool: for each conference page listed in confs_data.json it scrolls to the bottom so all entries load, collects the paper links, then opens each paper's PDF viewer and clicks the download button to save the PDF.
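For reference, the loop only reads two keys from each entry in confs_data.json: 'name' and 'url'. Below is a minimal sketch of how such a file could be produced; the venue names and URLs are made-up placeholders for illustration, not the author's actual list.

import json

# Hypothetical contents for confs_data.json -- only the "name" and "url" keys
# are read by the crawler loop; the venue URLs here are placeholders.
sample_confs = [
    {"name": "CVPR 2024", "url": "https://papers.cool/venue/CVPR.2024"},
    {"name": "NeurIPS 2023", "url": "https://papers.cool/venue/NeurIPS.2023"},
]

with open('confs_data.json', 'w', encoding='utf-8') as f:
    json.dump(sample_confs, f, ensure_ascii=False, indent=2)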

