python读取pdf文件提取关键信息到Excel中

以前做过的一个项目，需要从pdf格式的检查报告中提取关键信息到Excel中（完整代码见下文）。

今日你饮左未啊

986人浏览 · 2021-05-08 14:42:24

今日你饮左未啊 · 2021-05-08 14:42:24 发布

以前做过的一个项目，需要从pdf格式的检查报告中提取关键信息到Excel中。

import numpy as np
import pandas as pd
import re
import pdfplumber
import os
from tqdm import tqdm

# Show every row and column when DataFrames are printed. The fully qualified
# option names are used because short patterns such as 'max_columns' also match
# 'styler.render.max_columns' on recent pandas versions and raise OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Unused by the statements below; kept because other parts of the script may
# still reference this module-level name.
a = 0

# Directory that is searched (recursively) for PDF reports.
PDF_ROOT = r'C:\Users\XX'


def collect_pdf_files(top):
    """Recursively collect the PDF files stored below *top*.

    Args:
        top: Directory to walk.

    Returns:
        A ``(paths, ids)`` tuple of two parallel lists: the full path of every
        file whose name ends in ``.pdf`` (case-insensitive), and that file's
        name with the extension removed, which serves as the report id.
    """
    suffix = '.pdf'
    paths = []
    ids = []
    for root, _dirs, files in os.walk(top):
        for name in files:
            if not name.lower().endswith(suffix):
                continue
            # Slice the extension off instead of using a regex: the previous
            # pattern '(.+?).[pdf,PDF]' stopped at the first 'p', 'd', 'f' or
            # ',' it could reach, so 'report.pdf' produced the id 'r'.
            ids.append(name[:-len(suffix)])
            paths.append(os.path.join(root, name))
    return paths, ids


# Read every PDF file below the search directory.
xuejian_list, xuejian_id_list = collect_pdf_files(PDF_ROOT)
num = len(xuejian_list)

def _extract_diagnosis(pdf_text):
    """Extract the two diagnosis fields from the text of a report page.

    Args:
        pdf_text: Text of the page as a single string.

    Returns:
        A dict with the keys ``'result1'`` and ``'result2'`` (in that order).

    Raises:
        ValueError: If neither the primary nor the fallback pattern of a
            field matches the text.
    """
    # Diagnosis result 1: the text following "结果" up to the next whitespace,
    # or (fallback) up to the next full stop.
    match = re.search(r'结果[:,：,：,\s]{1,}(.+?)[\。]{0,}[\n,\s]', pdf_text)
    if match is None:
        match = re.search(r'结果[:,：,\s]{0,}(.+?)\。', pdf_text)
    if match is None:
        raise ValueError('diagnosis result 1 not found in page text')
    result1 = match.group(1)

    # Diagnosis result 2: free text starting at "本次" and running up to the
    # "主治医生" label, or (fallback) up to the "检验者" label.
    match = re.search(
        r'(本次[\s\S]+?)[\n,\s]{0,}主[\s]{0,}治[\s]{0,}医[\s]{0,}生', pdf_text)
    if match is not None:
        # str.replace('\s', '') only removes a literal backslash-s, so a
        # regex substitution is needed to actually strip the whitespace.
        result2 = re.sub(r'\s+', '', match.group(1))
    else:
        match = re.search(
            r'(本次[\s\S]+?)[\n,\s]{0,}检[\s]{0,}验[\s]{0,}者', pdf_text)
        if match is None:
            raise ValueError('diagnosis result 2 not found in page text')
        compact = re.sub(r'\s+', '', match.group(1))
        # In the fallback case keep only the text up to the last full stop.
        trimmed = re.search(r'(.+\。)', compact)
        if trimmed is None:
            raise ValueError('diagnosis result 2 contains no full stop')
        result2 = trimmed.group(1)

    return {'result1': result1, 'result2': result2}


def _last_table_frame(page):
    """Return the last table on *page* as a DataFrame, or None if there is none.

    The first row of the table is used as the column header.
    """
    # NOTE(review): only the last table of each page is used, which is what the
    # original loop effectively did; confirm that earlier tables on a page are
    # meant to be ignored.
    tables = page.extract_tables()
    if not tables or not tables[-1]:
        return None
    rows = tables[-1]
    return pd.DataFrame(rows[1:], columns=rows[0])


def pdf_to_excel(file, file_id=None):
    """Extract the key fields of one PDF report as a single-row DataFrame.

    The diagnosis texts are read from the first page only; the result tables
    are collected from every page and stacked.

    Args:
        file: Path of the PDF file.
        file_id: Value stored in the ``'id'`` column. Defaults to the file
            name without its extension.

    Returns:
        A one-row DataFrame with the columns ``'id'``, ``'result1'``,
        ``'result2'`` followed by one column per value of the table column
        ``'英文缩写'``, holding the matching ``'检测结果'`` value.

    Raises:
        ValueError: If the PDF has no pages, a diagnosis field cannot be
            found, or no page contains a table.
        KeyError: If the tables lack the ``'检测结果'`` or ``'英文缩写'`` column.
    """
    if file_id is None:
        # Derive the id from the file itself rather than from a module-level
        # loop index, so the function can be called on its own.
        file_id = os.path.splitext(os.path.basename(file))[0]

    # The context manager closes the PDF even when extraction fails.
    with pdfplumber.open(file) as pdf:
        pages = pdf.pages
        if not pages:
            raise ValueError('PDF has no pages: {}'.format(file))

        dist = {'id': file_id}
        # extract_text() may yield None for a page without a text layer.
        dist.update(_extract_diagnosis(pages[0].extract_text() or ''))

        # Stack the tables page by page. Pages without a table are skipped;
        # previously such a page re-appended the rows collected so far.
        pdf_table = None
        for page in pages:
            frame = _last_table_frame(page)
            if frame is None:
                continue
            if pdf_table is None or pdf_table.empty:
                pdf_table = frame
            else:
                pdf_table = pd.concat([pdf_table, frame], axis=0)

    if pdf_table is None:
        raise ValueError('no table found in PDF: {}'.format(file))

    dist_new = pd.DataFrame.from_dict(dist, orient='index').T

    # Turn the (abbreviation, result) rows into one wide row.
    pdf_table = pdf_table.reset_index(drop=True)
    pdf_table1 = pd.DataFrame(pdf_table['检测结果'].values)
    pdf_table1.index = pd.Series(pdf_table['英文缩写'].values)

    # Merge the diagnosis fields and the table values side by side.
    return pd.concat([dist_new, pdf_table1.T], axis=1)

# Accumulate one row per successfully parsed PDF and remember the failures.
xuejian_path = pd.DataFrame(None)
false_file = []
# The loop index keeps the module-level name `x` because code elsewhere in
# this file may read it while a file is being processed.
for x in tqdm(range(len(xuejian_list))):
    try:
        pdf_data = pdf_to_excel(xuejian_list[x])
        if not xuejian_path.empty:
            pdf_data = pd.concat([xuejian_path, pdf_data], axis=0)
    except Exception:
        # Best effort: a report that cannot be parsed or merged is recorded
        # and skipped. Catching Exception (not a bare except) lets Ctrl-C
        # still interrupt a long run.
        false_file.append(xuejian_list[x])
    else:
        # Only a fully merged frame replaces the accumulator, so a failure
        # never discards the rows collected so far.
        xuejian_path = pdf_data

# Write from the accumulator: it exists (possibly empty) even when no file
# could be parsed, and it is never left holding just the last file's row.
pdf_data1 = xuejian_path.reset_index(drop=True)
pdf_data1.to_csv(r'xx.csv',encoding="gbk",index=False)
false_file1 = pd.DataFrame(false_file,columns=['file_name'])
false_file1.to_csv(r'xx',encoding='gbk',index=False)
print("总pdf文件数：{}".format(len(xuejian_list)))
print("已提取的pdf文件数：{}".format(len(pdf_data1)))

天启AI社区

GitCode 天启AI是一款由 GitCode 团队打造的智能助手，基于先进的LLM（大语言模型）与多智能体 Agent 技术构建，致力于为用户提供高效、智能、多模态的创作与开发支持。它不仅支持自然语言对话，还具备处理文件、生成 PPT、撰写分析报告、开发 Web 应用等多项能力，真正做到“一句话，让 AI 帮你完成复杂任务”。

更多推荐