| # -*- coding: utf-8 -*- | |
| ''' | |
| Created by Shengbo.Zhang on 2021/08/13 | |
| ''' | |
| import sys | |
| import time | |
| ################################################## | |
| ############## 算法:PDF2TXT_V3.py ################ | |
| ############## 测试示例 ################ | |
| ################################################## | |
| from Pdf2Txt.pdf2txt_v1 import find_all_local_file | |
| from Pdf2Txt.pdf2txt_v3 import * | |
# Interactive batch driver for the PDF2TXT_V3 test run.
#
# Repeatedly asks for a directory, converts every PDF that
# find_all_local_file() yields for it via get_txt_from_pdf(), writes each
# non-empty result next to its source PDF as "<name>_P2T.txt" with
# output_txt_string(), and prints per-file and overall timing/statistics.
# Typing "exit" at the prompt terminates the script.
#
# NOTE(review): get_txt_from_pdf and output_txt_string are presumably
# provided by the star import from Pdf2Txt.pdf2txt_v3 -- confirm.
import os  # explicit import; previously only reachable through the star import

while True:
    # Counters are reset for every directory the user enters.
    count_total = 0
    count_success = 0
    count_failed = 0

    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
    if test_file_dir == 'exit':
        sys.exit()

    print('*****************************************************')
    t1 = time.time()
    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
        count_total += 1

        pdf_file_path = path
        pdf_dir_path = os.path.dirname(path)
        # splitext() drops the extension without assuming it is 4 characters.
        pdf_file_name = os.path.splitext(os.path.basename(path))[0]
        # os.path.join() avoids producing a root-anchored "//name" path when
        # the PDF path has no directory component.
        output_txt_file_path = os.path.join(pdf_dir_path, f"{pdf_file_name}_P2T.txt")

        print(f'开始处理: 第 {idx + 1} 个文件...')
        print(f'文件名: {pdf_file_name}.pdf')
        tt1 = time.time()
        try:
            txt_string = get_txt_from_pdf(pdf_file_path)
            if txt_string != '':
                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
                count_success += 1
                print('处理成功.')
            else:
                # An empty conversion result is counted as a failure.
                count_failed += 1
                print('处理失败!')
        except Exception as e:
            # Best-effort batch run: report the error and move on to the
            # next file instead of aborting the whole directory.
            print(e)
            count_failed += 1
            print('处理失败!')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')

        print('*****************************************************')

    t2 = time.time()
    print('\n所有PDF格式的公告文件已处理完毕!')
    print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
    print('执行耗时:', round(t2 - t1, 3), '秒')
    # Guard against ZeroDivisionError when the directory contained no PDFs;
    # an average is only meaningful if at least one file was processed.
    if count_total:
        print('平均耗时:', round((t2 - t1) / count_total, 3), '秒/个')
| # ################################################## | |
| # ############## 算法:PDF2TXT_V2.py ################ | |
| # ############## 测试示例 ################ | |
| # ################################################## | |
| # from Pdf2Txt.pdf2txt_v1 import find_all_local_file | |
| # from Pdf2Txt.pdf2txt_v2 import * | |
| # while True: | |
| # count_total = 0 | |
| # count_success = 0 | |
| # count_failed = 0 | |
| # | |
| # test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ') | |
| # if test_file_dir == 'exit': | |
| # sys.exit() | |
| # | |
| # print('*****************************************************') | |
| # t1 = time.time() | |
| # for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')): | |
| # count_total += 1 | |
| # | |
| # pdf_file_path = path | |
| # pdf_dir_path = os.path.dirname(path) | |
| # pdf_file_name = os.path.basename(path)[:-4] | |
| # output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt" | |
| # | |
| # print(f'开始处理: 第 {idx + 1} 个文件...') | |
| # print(f'文件名: {pdf_file_name}.pdf') | |
| # tt1 = time.time() | |
| # try: | |
| # txt_string = get_txt_from_pdf(pdf_file_path) | |
| # if txt_string != '': | |
| # output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string) | |
| # count_success += 1 | |
| # print('处理成功.') | |
| # else: | |
| # count_failed += 1 | |
| # print('处理失败!') | |
| # except Exception as e: | |
| # print(e) | |
| # count_failed += 1 | |
| # print('处理失败!') | |
| # tt2 = time.time() | |
| # print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒') | |
| # | |
| # print('*****************************************************') | |
| # | |
| # t2 = time.time() | |
| # print('\n所有PDF格式的公告文件已处理完毕!') | |
| # print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}') | |
| # print('执行耗时:', round(t2-t1, 3), '秒') | |
| # print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个') | |
| # ################################################## | |
| # ############## 算法:PDF2TXT_V1.py ################ | |
| # ############## 测试示例 ################ | |
| # ################################################## | |
| # from Pdf2Txt.pdf2txt_v1 import * | |
| # while True: | |
| # count_total = 0 | |
| # count_success = 0 | |
| # count_failed = 0 | |
| # | |
| # test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ') | |
| # if test_file_dir == 'exit': | |
| # sys.exit() | |
| # txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符(默认,按enter键) ') | |
| # if txt_output_mode == '1': | |
| # txt_output_mode = True | |
| # else: | |
| # txt_output_mode = False | |
| # | |
| # print('*****************************************************') | |
| # for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')): | |
| # count_total += 1 | |
| # | |
| # pdf_file_path = path | |
| # pdf_dir_path = os.path.dirname(path) | |
| # pdf_file_name = os.path.basename(pdf_file_path)[:-4] | |
| # output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx" | |
| # output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt" | |
| # output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv" | |
| # | |
| # t1 = time.time() | |
| # is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path) | |
| # t2 = time.time() | |
| # print(f'开始处理: 第 {idx + 1} 个文件...') | |
| # print(f'文件名: {pdf_file_name}.pdf') | |
| # print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验!') | |
| # print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms') | |
| # | |
| # if not is_success: | |
| # | |
| # count_failed += 1 | |
| # print(f'文件: {pdf_file_path}') | |
| # print('错误: 原始pdf与生成的docx文件页数校验失败,拒绝进行下一步转换.') | |
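#             # Verification fails because pdf2docx currently cannot handle a small number of PDF files with special layouts (pending an update from its author);
#             # if verification fails, consider discarding that announcement, or converting it directly with _get_txt_from_pdf().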
| # | |
| # else: | |
| # | |
| # document = Document(output_docx_file_path) | |
| # | |
| # is_success, txt_list = get_txt_from_docx(doc=document) | |
| # t3 = time.time() | |
| # print('步骤-2: 公告docx文件的段落提取与格式化已完成!') | |
| # print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms') | |
| # | |
| # if not is_success: | |
| # count_failed += 1 | |
| # print(f'文件: {pdf_file_path}') | |
| # print('错误: 原始docx转换为txt文本的过程中出错,拒绝进行下一步转换.') | |
| # else: | |
| # txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path, | |
| # is_out_flag=False) | |
| # t4 = time.time() | |
| # print('步骤-3: 公告docx文件的表格提取与格式化已完成!') | |
| # print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms') | |
| # | |
| # txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list) | |
| # t5 = time.time() | |
| # print('步骤-4: 公告txt文件的校对已完成!') | |
| # print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms') | |
| # | |
| # write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode) | |
| # str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode) | |
| # t6 = time.time() | |
| # print('步骤-5: 公告txt文件的输出已完成!') | |
| # print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms') | |
| # | |
| # print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms') | |
| # count_success += 1 | |
| # | |
| # if os.path.exists(output_docx_file_path): | |
| # os.remove(output_docx_file_path) | |
| # if os.path.exists(output_csv_file_path): | |
| # os.remove(output_csv_file_path) | |
| # print('*****************************************************') | |
| # | |
| # print('\n所有PDF格式的公告文件已处理完毕!') | |
| # print(f'【文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}】') |