做SEO的时候,我们难免要进行数据采集和数据入库。以我目前的水平,采集内容已经比较简单了,困难的是居然没有办法把内容完美地导入WordPress。准确地说,并不是无法导入,而是没有办法把内容完美导入:我遇到的问题就是导入时没有办法自动添加特色图片。

实现思路是这样的:首先安装一个插件,让外链图片自动本地化(见文尾附件);然后再用Python 3导入拼接好的内容。代码如下:

from selenium import webdriver
import time,bs4,os,csv,json,re,requests
from selenium.webdriver.chrome.options import Options

'''把关键词转化为小短线链接形式'''
def key_ss(key):
    """Return *key* with each run of one or more spaces replaced by one underscore."""
    # Splitting on space runs and re-joining is equivalent to the substitution.
    pieces = re.split(' +', key)
    return '_'.join(pieces)
def key_imgs(key):
    """Return *key* with each run of one or more spaces replaced by one hyphen."""
    # Splitting on space runs and re-joining is equivalent to the substitution.
    pieces = re.split(' +', key)
    return '-'.join(pieces)

def to_url(key):
    """Build the Bing image-search URL for the search term *key*.

    The term is percent-encoded before being placed in the ``pq`` and ``q``
    query parameters, so characters such as ``&``, ``#`` or spaces cannot
    corrupt the query string. Letters, digits, ``_``, ``-``, ``.`` and ``~``
    are left untouched, so underscore-joined keywords produce the same URL
    as before.

    Args:
        key: The search term (a string).

    Returns:
        The full search URL, including the ``imagesize-large`` filter and
        ``ensearch=1`` parameters.
    """
    from urllib.parse import quote

    encoded = quote(key, safe='')
    return (
        'https://cn.bing.com/images/search?sp=-1&pq=' + encoded
        + '&sc=0-18&sk=&cvid=1CAF0B3BD39E45DA87946D04BA398F6F&q=' + encoded
        + '&qft=+filterui:imagesize-large&FORM=BESBTB&ensearch=1'
    )

def connect(url):
    """Load *url* in headless Chrome and return the page source.

    The page is requested up to three times with a 10-second page-load
    timeout. After a successful load the window is scrolled to the bottom
    once, and the source is captured two seconds later. The HTML from the
    first successful attempt is returned; an empty string is returned only
    when every attempt failed.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as a string, or ``''`` if all attempts failed.
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")  # run without a visible browser window
    opt.add_argument("--disable-gpu")
    # Path to the local chromedriver binary.
    driver = webdriver.Chrome(executable_path='/Users/gaotiansong/Downloads/chromedriver',options=opt)
    html = ''
    try:
        driver.set_page_load_timeout(10)
        for attempt in range(1, 4):
            try:
                driver.get(url)
                # Scroll to the bottom of the page, then wait before reading it.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;")
                time.sleep(2)
                html = driver.page_source
                break
            except Exception:
                print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),'链接失败,重试',attempt,'次')
                html = ''
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),'没找到页面内容,跳过此条')
            print('')
        time.sleep(1)
    finally:
        # Always shut the browser down, even if an unexpected error escapes.
        driver.quit()
    return html
    

def img_urls(html):
    """Extract image URLs from a search-result page.

    Every element matching the CSS selector ``.iusc`` is expected to carry
    an ``m`` attribute holding a JSON object; the value under its ``murl``
    key is collected. Elements whose ``m`` attribute is missing, is not
    valid JSON, or lacks ``murl`` are skipped instead of aborting the run.

    Args:
        html: Page source as a string; ``''`` means "no page".

    Returns:
        A list of the collected ``murl`` values, in document order
        (empty when *html* is ``''`` or nothing matched).
    """
    if html == '':
        return []

    soup = bs4.BeautifulSoup(html, 'html.parser')
    imgs = []
    for node in soup.select('.iusc'):
        try:
            imgs.append(json.loads(node.get('m'))['murl'])
        except (TypeError, ValueError, KeyError):
            # TypeError: attribute missing (None) or JSON is not an object;
            # ValueError: malformed JSON; KeyError: no 'murl' entry.
            continue
    return imgs
    
def shitu(imgurl):
    """Ask the CaptionBot web service to describe the image at *imgurl*.

    The request is attempted at most twice with a 3-second timeout. The
    response text is then cleaned up: hedging prefixes are removed, the
    "can't describe line drawings" reply is replaced by a fixed caption,
    and double quotes are stripped.

    Args:
        imgurl: URL of the image; ``''`` short-circuits to ``''``.

    Returns:
        The cleaned caption text, or ``''`` when *imgurl* is empty or both
        requests failed.
    """
    if imgurl == '':
        return ''

    # Online image-captioning tool: https://www.captionbot.ai/
    url = 'https://captionbot.azurewebsites.net/api/messages?language=en-US'
    data = {
        "Type": "CaptionRequest",
        "Content": imgurl,
    }
    des = ''
    for attempt in (1, 2):
        try:
            r = requests.post(url, data=data, timeout=3)
            r.encoding = 'utf-8'
            des = r.text
            break
        except Exception:
            print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),':识图失败,重试中。。。。',attempt,'次')
            time.sleep(1)
            des = ''

    des1 = re.sub('I am not really confident, but I think it \'s a|I think it\'s a ', '', des)
    # The result must be assigned back to des1; the original stored it in an
    # unused name, so this replacement never took effect.
    des1 = re.sub(r'I can’t really describe line drawings :\(', 'A beautiful line painting', des1)
    des1 = re.sub('I am not really confident, but', '', des1)
    des1 = re.sub('"', '', des1)
    return des1

def wt_wordpress(name,des):
    """Publish a new WordPress post through XML-RPC.

    The post gets three randomly chosen tags and two randomly chosen
    categories from fixed candidate lists, plus two custom fields
    (``price`` and ``ok``). After the post is created the function sleeps
    for five seconds.

    Args:
        name: Post title.
        des: Post content (HTML).

    Returns:
        None. Errors raised by the XML-RPC call propagate to the caller.
    """
    import random

    from wordpress_xmlrpc import Client, WordPressPost
    from wordpress_xmlrpc.methods import posts

    # NOTE(review): the site credentials are hard-coded in source; consider
    # loading them from the environment or a config file kept out of VCS.
    wp = Client('http://customfw.com/xmlrpc.php', 'customfw', 'customfw@20200119')

    # random.sample() needs a sequence: passing a set raises TypeError on
    # Python 3.11+. The candidates are therefore kept as tuples of unique
    # values (the duplicate category entry of the old list is dropped,
    # which is what the set conversion used to do).
    tag_pool = (
        'disney fuzzy socks',
        'disney princess 12 days of socks',
        'disney crew socks',
        'disney socks for adults',
        'stitch socks disney',
    )
    category_pool = (
        'disney socks',
        'stance disney socks',
        'disney socks mens',
        'mens disney socks',
    )

    post = WordPressPost()
    post.title = name
    post.content = des
    # Post status: 'publish' publishes immediately; 'draft' and 'private'
    # are the other values mentioned by the original author (draft is the
    # default when unset).
    post.post_status = 'publish'

    post.terms_names = {
        'post_tag': random.sample(tag_pool, 3),       # tags; created if missing
        'category': random.sample(category_pool, 2),  # categories; created if missing
    }

    # Custom fields attached to the post.
    post.custom_fields = [
        {'key': 'price', 'value': 3},
        {'key': 'ok', 'value': 'customfw.com'},
    ]

    post.id = wp.call(posts.NewPost(post))
    time.sleep(5)

'''判断关键词是否在一个文件中'''
def panduan(key,key_path1):
    """Return True if *key* occurs anywhere in the file at *key_path1*.

    The check is a plain substring test against the whole file content.
    The file is created empty first when it does not exist yet, so the
    read below cannot fail on a missing file.

    Args:
        key: Text to look for.
        key_path1: Path of the progress file.

    Returns:
        True when *key* is a substring of the file content, else False.
    """
    # Append mode creates the file if needed without truncating it; the
    # handle is closed immediately instead of being leaked.
    with open(key_path1, 'a'):
        pass
    with open(key_path1) as f:
        return key in f.read()

#采集成功并记录
def okok(key_path1,key):
    """Append *key* followed by a newline to the file at *key_path1*."""
    with open(key_path1, 'a') as progress_file:
        record = key + '\n'
        progress_file.write(record)

'''主程序开始'''
def _now():
    """Return the current local time formatted for log lines."""
    return time.strftime('%Y-%m-%d %H:%M:%S')


# Keyword file locations.
path=r'/Users/gaotiansong/Desktop/bing/'  # directory holding the keyword files
keyf='dsnkey.txt'  # keywords still to be processed
keyf1='dsnkey_socks.txt'  # progress log
key_path1=path+keyf1  # keywords that were already processed
key_path=path+keyf  # keywords waiting to be processed
# Upper bound on how many scraped image URLs are considered per keyword.
post_n=50
# Upper bound on how many images are actually placed in one post.
max_imgs_per_post=10

with open(key_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Strip only the line terminator; slicing off the last character
        # would truncate a final line that has no trailing newline.
        key1=line.rstrip('\n')
        if not key1.strip():
            continue  # ignore blank lines in the keyword file
        key='disney '+key1+' socks'
        if panduan(key,key_path1):  # already processed earlier: skip
            print(key,'已经存在,跳过')
            continue
        print(_now(),':开始采集关键词',key)

        # Join the keyword's words with underscores and build the search URL.
        key_s=key_ss(key)
        url=to_url(key_s)

        # Fetch the result page and pull the image URLs out of it.
        html=connect(url)
        imgs=img_urls(html)
        if not imgs:
            print(_now(),':没有发现图片,跳过:',key)
            continue

        # Consider at most post_n candidate images.
        imgs=imgs[:post_n]

        # Caption each image and assemble the post body.
        print(_now(),':识别图片')
        parts=[]
        n=1
        for img in imgs:
            if n>max_imgs_per_post:
                break
            try:
                d=shitu(img)  # caption for this image
            except Exception:
                # Skip this image; otherwise `d` would be undefined or
                # carry over the previous image's caption.
                print('出错,跳过')
                continue
            print('找到',n,'张图片')
            parts.append('<center><img class="wp-image-10 size-full" src="'+img+'" alt="'+key+'" width="500" height="472"></center>'+'<br><center><h2>'+d+'</h2></center></br>')
            n=n+1
        des=''.join(parts)
        if des=='':
            print(_now(),':没有找到图片,跳过')
            continue

        print(_now(),':把内容写入wordpress')
        name='Custom'+key+' '+'Beautiful '+key+' Photo list'
        try:
            wt_wordpress(name,des)
        except Exception:
            print(_now(),':写入失败,跳过该条目')
            okok(key_path1,key+'失败')  # record the failure in the progress log
        else:
            print(_now(),':成功采集',key)
            print('')
            okok(key_path1,key)  # record the success in the progress log

print(_now(),':采集完毕')

'''主程序结束'''