Python qichacha 上市公司专利爬虫

from bs4 import BeautifulSoup; import requests; import time; import pandas as pd; from selenium import webdriver; import csv; import re; import numpy as np; import os; afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (…

Along1617188

1736人浏览 · 2021-12-02 19:32:22

Along1617188 · 2021-12-02 19:32:22 发布

from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from selenium import webdriver
import csv
import re
import numpy as np
import os

# Browser-like User-Agent header. Nothing in the visible part of this script
# reads it (all page loads go through the Selenium driver below).
afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
# Earlier PhantomJS-based setup, kept by the author for reference:
# driver = webdriver.PhantomJS(executable_path=r'D:\code\patent_info\phantomjs-2.1.1-windows\bin\phantomjs.exe', service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
# Path to the chromedriver binary used to launch Chrome.
chrome_driver = r'D:\code\patent_info\chromedriver.exe'
chrome_options = webdriver.ChromeOptions()
# Start Chrome with a fixed profile directory.
# NOTE(review): presumably so a logged-in qcc.com session persists between runs - confirm.
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# Module-level browser instance shared by every function in this script.
# Creating it launches Chrome as a side effect of importing/running the file.
driver = webdriver.Chrome(executable_path = chrome_driver, options=chrome_options)
# driver.maximize_window()
# # driver.add_argument('--headless')  # enable headless (no-window) mode

def get_company_message(company):
    '''
    Search qcc.com for a company and build the URL that the caller loads
    to look for the patent table.

    Input: company is the company name (typically Chinese text) used as the
           search keyword
    Todo: open the search page, take the href of the first <a class="title">
          result and replace 'firm' with 'cassets' in it
    Return: the rewritten URL (str)
    Raises: IndexError when the search page contains no <a class="title">
            link (no result, or the page did not load as expected)
    '''
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Percent-encode the keyword so characters such as '&', '#' or spaces in
    # a company name cannot truncate or corrupt the query string.
    driver.get('https://www.qcc.com/search?key={}'.format(quote(company, safe='')))
    # Fixed wait for the result list to render before reading the page source.
    time.sleep(5)
    # Log the human-readable (unencoded) form of the search URL.
    print('https://www.qcc.com/search?key={}'.format(company))
    soup = BeautifulSoup(driver.page_source, features="lxml")
    title_links = soup.find_all('a', {'class': 'title'})
    if not title_links:
        # Same exception type the bare [0] lookup used to raise, but with a
        # message that says which company failed.
        raise IndexError('no search result found for company {!r}'.format(company))
    href = title_links[0].get('href')
    print('href is {}'.format(href))
    href2 = href.replace('firm', 'cassets')
    print('href2 is {}'.format(href2))
    return href2

def iselementExist(element):
    '''
    Report whether an element matching an XPath exists on the current page.

    Input: element is an XPath expression, e.g. the patent table heading
           //*[@id="zhuanlilist"]/div[1]/h3
    Todo: ask the shared driver to locate the element
    Return: True when the element is found, False when Selenium reports
            that no such element exists
    '''
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import NoSuchElementException

    try:
        driver.find_element_by_xpath(element)
    except NoSuchElementException:
        # Only "not found" means False. Other failures (dead browser session,
        # Ctrl-C, malformed XPath) propagate instead of being reported as
        # "element missing".
        return False
    return True

def get_patent_infomation(number_page):  # parse a single result page
    '''
    Input: number_page is the HTML source of one patent-list page
           (the callers pass driver.page_source)
    Todo: parse the HTML and collect every <td> found under
          '#zhuanlilist .app-ntable'
    Return: the list of matching elements produced by soup.select
    '''
    parsed_page = BeautifulSoup(number_page, features="lxml")
    return parsed_page.select('#zhuanlilist .app-ntable td')

def save_patient(data_infos, key_company, output_path=None):
    '''
    Append one page of patent table cells to the output CSV.

    Input: 1.data_infos: cells of one page from 'get_patent_infomation';
             each item must expose a .text attribute, ten cells per patent
           2.key_company: company name, written into the 'company' column
           3.output_path: CSV file to append to; defaults to the original
             hard-coded location
    Todo: group the cells into rows of ten and append them to the CSV
          (gb18030 encoded); the header is written only when the file does
          not exist yet
    return: None
    '''
    columns = ['Number', 'Patent_name', 'Patent_type', 'Patent_statu',
               'Application number', 'Apply_data', 'Public_announcement_No',
               'Public_announcement_Data', 'Inventor', 'More']
    if output_path is None:
        # Raw string: the former plain literal relied on invalid escape
        # sequences (\c, \p, \d, \o) that newer Python versions warn about.
        output_path = r'D:\code\patent_info\data\output_data\company_patient1.csv'

    cells = [info.text for info in data_infos]
    width = len(columns)
    usable = len(cells) - len(cells) % width
    if usable != len(cells):
        # A trailing partial row cannot be mapped onto the ten columns.
        print('skip {} incomplete patent cells for {}'.format(len(cells) - usable, key_company))
    rows = [cells[start:start + width] for start in range(0, usable, width)]
    if not rows:
        # Nothing to write; previously an empty page crashed on indexing.
        return

    dataframe = pd.DataFrame(rows, columns=columns)
    dataframe.insert(0, 'company', key_company)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Check for the file before writing so only the first append gets a header.
    write_header = not os.path.exists(output_path)
    dataframe.to_csv(output_path, index=False, sep=',', mode='a',
                     encoding='gb18030', header=write_header)

def whether_turn_page(element1):
    '''
    Report whether the pagination bar exists on the current page.

    Input: element1 is the XPath of the page-number list, e.g.
           //*[@id="zhuanlilist"]/div[4]/nav/ul
    Todo: check whether that element can be located
    Return: True when it exists, otherwise False
    '''
    # Same existence check as iselementExist; delegate instead of keeping a
    # second copy of the try/except logic.
    return iselementExist(element1)

def turn_next_page(key_company):
    '''
    Click through the remaining result pages and save each one.

    Input: key_company: company name read from the input CSV
    Todo: read the last pagination item to decide how many pages there are
          and which item acts as the "next page" button, then click it
          page-count - 1 times, saving the patent table after every click
    return: None
    '''
    pager = '//*[@id="zhuanlilist"]/div[4]/nav/ul'
    list_max = driver.find_element_by_xpath(pager + '/li[last()]').text
    print(list_max)

    if list_max == '>':
        # Last item is the "next" arrow and the item before it holds the
        # highest page number (author's note: patent count in (10, 70]).
        page_count = int(driver.find_element_by_xpath(pager + '/li[last()-1]/a').text)
        next_button = pager + '/li[last()]'
    else:
        # Otherwise the last item is expected to contain the page count and
        # the item before it is clicked (author's note: more than 70 patents).
        digits = re.sub(r"\D", "", list_max)
        # An item without digits used to raise ValueError from int('');
        # treat it like a zero count and report it.
        page_count = int(digits) if digits else 0
        if not page_count:
            print('error company name is {}'.format(key_company))
            return
        next_button = pager + '/li[last()-1]/a'

    # The first page was already saved by the caller.
    for _ in range(page_count - 1):
        driver.find_element_by_xpath(next_button).click()
        time.sleep(1)  # fixed wait for the next page to render
        data_infos = get_patent_infomation(driver.page_source)
        save_patient(data_infos, key_company)

if __name__ == '__main__':
    # Script entry point: read company names from a CSV (one company per
    # row), look each one up and append its patent table to the output CSV.
    csv_file = r"D:\code\patent_info\patient1.csv"
    try:
        # utf-8-sig reads plain UTF-8 as well and strips a leading BOM, which
        # would otherwise become part of the first company name.
        # newline='' is the documented way to open files for the csv module.
        with open(csv_file, encoding='utf-8-sig', newline='') as csvfile:
            reader = csv.reader(csvfile)
            for i, row in enumerate(reader):
                print('i is {}'.format(i))
                print('rows is {}'.format(row))
                key_company = ' '.join(row)
                if not key_company.strip():
                    # Blank row: nothing to search for.
                    continue
                try:
                    patent_url = get_company_message(key_company)  # patent url
                except IndexError:
                    # No search result for this name; report it and keep
                    # going instead of aborting the whole run.
                    print('error company name is {}'.format(key_company))
                    continue
                driver.get(patent_url)
                time.sleep(1)
                # Skip companies whose page shows no patent table heading.
                if not iselementExist('//*[@id="zhuanlilist"]/div[1]/h3'):
                    continue
                number_page = driver.page_source
                data_infos = get_patent_infomation(number_page)
                save_patient(data_infos, key_company)
                if whether_turn_page('//*[@id="zhuanlilist"]/div[4]/nav/ul'):
                    turn_next_page(key_company)
    finally:
        # Always close the browser and release the Chrome profile directory,
        # even when the run stops on an error.
        driver.quit()

天启AI社区

GitCode 天启AI是一款由 GitCode 团队打造的智能助手，基于先进的LLM（大语言模型）与多智能体 Agent 技术构建，致力于为用户提供高效、智能、多模态的创作与开发支持。它不仅支持自然语言对话，还具备处理文件、生成 PPT、撰写分析报告、开发 Web 应用等多项能力，真正做到“一句话，让 AI帮你完成复杂任务”。

更多推荐