| | import gradio as gr |
| | import spaces |
| | from paddleocr import PaddleOCR |
| | import fitz |
| | from PIL import Image |
| | import numpy as np |
| | import os |
| |
|
| | |
# Folder that receives the recognized-text files. It is created as soon as the
# module is imported; an already existing folder is left untouched.
OUTPUT_DIR = "output_results"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
| |
|
| | |
def load_gpu_model():
    """Build and return a GPU-backed PaddleOCR engine.

    The engine is constructed with angle classification enabled, the ``'ch'``
    language setting and ``use_gpu=True``. A status line is printed before and
    after construction so the container log shows when loading starts and ends.

    Returns:
        The newly constructed ``PaddleOCR`` instance.
    """
    print("正在Docker容器中加载PaddleOCR GPU模型...")

    engine_options = {"use_angle_cls": True, "lang": "ch", "use_gpu": True}
    engine = PaddleOCR(**engine_options)

    print("GPU模型加载成功。")
    return engine
| |
|
| | |
@spaces.GPU
def process_pdf_max_speed(pdf_file, progress=gr.Progress(track_tqdm=True)):
    """Run GPU OCR over every page of an uploaded PDF and save the text.

    Each page is rendered at 200 DPI, recognized with the engine returned by
    ``load_gpu_model()``, and emitted as a ``--- Page N ---`` section. The
    combined text is written to ``<OUTPUT_DIR>/<pdf stem>_result.txt``.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. Accepted forms
            are a filesystem path (``str`` / ``os.PathLike``) or an object
            exposing the path as ``.name``. ``None`` means nothing was
            uploaded.
        progress: Gradio progress tracker; it is advanced once per group of
            ``batch_size`` pages.

    Returns:
        A ``(text, path)`` tuple: the recognized text and the path of the
        saved result file. When no file was uploaded or processing fails, the
        first item is a user-facing message and the second is ``None``.
    """
    if pdf_file is None:
        return "请先上传一个PDF文件。", None

    try:
        # Depending on the Gradio version/configuration the upload arrives
        # either as a plain path or as a temp-file wrapper with ``.name``.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        ocr = load_gpu_model()

        batch_size = 4  # pages handled per progress-bar step
        full_text_result = []

        # The context manager closes the document even if rendering or OCR
        # raises part-way through.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

            for i in progress.tqdm(range(0, total_pages, batch_size), desc="🚀 批处理中..."):
                for page_num in range(i, min(i + batch_size, total_pages)):
                    pix = doc.load_page(page_num).get_pixmap(dpi=200)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                    # NOTE: PaddleOCR 2.x's ``ocr()`` rejects a *list* of images
                    # while detection is enabled (it logs an error and exits,
                    # which ``except Exception`` would not catch), so every
                    # page is recognized individually.
                    result = ocr.ocr(np.array(img), cls=True)

                    # For a single image the result holds one entry: the list
                    # of detected lines, or None when the page has no text.
                    page_result = result[0] if result else None
                    page_texts = [line[1][0] for line in page_result] if page_result else []

                    full_text_result.append(
                        f"--- Page {page_num + 1} ---\n" + "\n".join(page_texts)
                    )

        final_text = "\n\n".join(full_text_result)
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_filename = os.path.join(OUTPUT_DIR, f"{pdf_stem}_result.txt")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_text)

        print(f"处理完成!结果已保存到 {output_filename}")
        return final_text, output_filename

    except Exception as e:
        # UI boundary: report the failure in the result box instead of letting
        # the request crash.
        error_message = f"处理过程中发生错误: {str(e)}"
        print(error_message)
        return error_message, None
| |
|
| | |
# Build the single-page interface: PDF upload, trigger button, text preview
# and a download slot for the saved result file.
with gr.Blocks(title="极速PDF识别器", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ✅ 极速PDF识别器 (终极稳定版) ✅
        **速度拉满!实时进度显示,处理期间请勿关闭页面。**
        """
    )

    with gr.Row():
        pdf_input = gr.File(label="📄 上传PDF文件", file_types=[".pdf"])

    submit_btn = gr.Button("⚡️ 开始极速处理", variant="primary")

    result_display = gr.Textbox(label="识别结果", lines=20, show_copy_button=True)
    download_link = gr.File(label="📥 点击此处下载结果文件", interactive=False)

    # The handler returns (text, file path); route them to the textbox and the
    # download component. The textbox variable is `result_display` -- the
    # previous reference to an undefined `display` raised NameError on import.
    submit_btn.click(
        fn=process_pdf_max_speed,
        inputs=[pdf_input],
        outputs=[result_display, download_link]
    )

# Enable the request queue before starting the server.
demo.queue().launch()