用requests.get(url)采集阿里巴巴商品列表的时候,每页只能采集8条。通过分析发现,更多的产品URL需要拖动鼠标下拉才能加载。网上看了一些资料,说需要找到JS入口。搞了几次,我依然找不到。最后只能采用加载浏览器来实现了。加载浏览器采集,首先要下载对应版本的浏览器驱动程序(chromedriver),然后才能顺利运行。代码如下:

from selenium import webdriver
import time,bs4,os,csv
from selenium.webdriver.chrome.options import Options
def pro_urls(n):
    """Scrape page *n* of the Alibaba "Basketball Jersey" product listing.

    Launches a headless Chrome instance, scrolls to the bottom repeatedly so
    the lazily-loaded items render, then extracts every product link
    (``.organic-gallery-title`` anchors) and appends each URL to a CSV file
    via :func:`writer_pro`.

    Parameters
    ----------
    n : int
        1-based page number to fetch.
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")      # run without a visible browser window
    opt.add_argument("--disable-gpu")

    # Raw string avoids accidental backslash escape sequences in the Windows path.
    driver = webdriver.Chrome(
        executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe',
        options=opt)

    try:
        url = ("https://www.alibaba.com/products/Basketball_Jersey.html"
               "?spm=a2700.galleryofferlist.0.0.3d492c63yoMZNz"
               "&IndexArea=product_en&page=" + str(n))
        driver.get(url)
        time.sleep(3)  # give the initial page load time to settle

        # Scroll to the bottom 5 times; each pass triggers another batch of
        # lazily-loaded products (only ~8 are present before scrolling).
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;")
            time.sleep(1)

        bs = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        for prourl in bs.select('.organic-gallery-title'):
            # hrefs are protocol-relative ("//..."), so prepend the scheme.
            writer_pro(r'D:\Backup\桌面\篮球球衣\球衣url.csv',
                       ['https:' + prourl.get('href')])
    finally:
        # Always release the browser process, even when scraping fails,
        # otherwise 100 pages of errors would leak 100 Chrome instances.
        driver.quit()

def writer_pro(site, file):
    """Append one row to the CSV file at *site*, adding a header when new.

    Parameters
    ----------
    site : str
        Path of the CSV file; created automatically on first use.
    file : list
        The row to append, e.g. ``['https://...']``.
    """
    # Mode 'a' creates the file if it does not exist yet; newline='' stops the
    # csv module from emitting an extra blank line between rows on Windows.
    # The context manager guarantees the handle is closed even if a write fails.
    with open(site, 'a', newline='', encoding='utf-8') as csvf:
        writer = csv.writer(csvf)
        if os.path.getsize(site) == 0:
            # Brand-new (empty) file: write the header row first.
            writer.writerow(['url'])
        writer.writerow(file)


# Crawl product-listing pages 1 through 100, reporting progress per page.
for n in range(1, 101):
    pro_urls(n)
    print('采集完', n, '页')

发表评论