| import gradio as gr |
| import spaces |
| from paddleocr import PaddleOCR |
| import fitz |
| from PIL import Image |
| import numpy as np |
| import os |
|
|
| |
# Folder that collects the generated OCR text files. It is created at import
# time so that later writes never hit a missing directory.
OUTPUT_DIR = "output_results"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
|
|
| |
def load_gpu_model():
    """Build and return a PaddleOCR engine configured for GPU inference.

    The engine is created with the angle classifier enabled, the ``'ch'``
    language pack and ``use_gpu=True``. A status line is printed before and
    after construction.

    Returns:
        The freshly constructed ``PaddleOCR`` instance.
    """
    print("正在加载经过版本锁定的PaddleOCR GPU模型...")

    engine_options = {"use_angle_cls": True, "lang": 'ch', "use_gpu": True}
    engine = PaddleOCR(**engine_options)

    print("GPU模型加载成功。")
    return engine
|
|
| |
def _render_page_rgb(page, dpi=200):
    """Rasterise one PDF page into an RGB ``numpy`` array for the OCR engine."""
    pix = page.get_pixmap(dpi=dpi)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return np.array(img)


def _extract_line_texts(ocr_output):
    """Return the recognised strings from one single-image ``ocr.ocr`` call.

    The output is expected to be a one-element list holding the page result,
    where each entry looks like ``[box, (text, score)]``. A missing, ``None``
    or empty page result yields an empty list.
    """
    page_result = ocr_output[0] if ocr_output else None
    if not page_result:
        return []
    return [line[1][0] for line in page_result]


@spaces.GPU
def process_pdf_max_speed(pdf_file, progress=gr.Progress(track_tqdm=True)):
    """Run GPU OCR over every page of an uploaded PDF and save the text.

    Each page is rendered at 200 dpi, recognised, and appended to the result
    under a ``--- Page N ---`` header. The combined text is written to
    ``<OUTPUT_DIR>/<pdf name>_result.txt`` (UTF-8).

    Args:
        pdf_file: The upload handed over by the Gradio file component. Either
            an object exposing the path as ``.name`` or a plain path string.
            ``None`` means nothing was uploaded.
        progress: Gradio progress tracker; advanced once per page.

    Returns:
        A ``(text, path)`` tuple. ``text`` is the recognised text and ``path``
        is the saved result file. If no file was uploaded or processing
        failed, ``text`` is a user-facing message and ``path`` is ``None``.
    """
    if pdf_file is None:
        return "请先上传一个PDF文件。", None

    try:
        ocr = load_gpu_model()

        # Accept both upload wrappers (``.name``) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        full_text_result = []

        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            page_numbers = range(len(doc))
            for page_num in progress.tqdm(page_numbers, desc="🚀 批处理中..."):
                image = _render_page_rgb(doc.load_page(page_num))

                # Pages are recognised one at a time: as far as we know,
                # PaddleOCR 2.x rejects a Python list of images while
                # detection is enabled (it logs an error and exits).
                page_texts = _extract_line_texts(ocr.ocr(image, cls=True))

                full_text_result.append(
                    f"--- Page {page_num + 1} ---\n" + "\n".join(page_texts)
                )

        final_text = "\n\n".join(full_text_result)
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_filename = os.path.join(OUTPUT_DIR, f"{base_name}_result.txt")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_text)

        print(f"处理完成!结果已保存到 {output_filename}")
        return final_text, output_filename

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        error_message = f"处理过程中发生错误: {str(e)}"
        print(error_message)
        return error_message, None
|
|
| |
# Gradio front end: upload widget, start button, result text box and a
# download slot, wired to ``process_pdf_max_speed``.
with gr.Blocks(title="极速PDF识别器", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
        # 🔥 极速PDF识别器 (GPU加速稳定版) 🔥
        **速度拉满!实时进度显示,处理期间请勿关闭页面。**
        """
    )

    # Input area.
    with gr.Row():
        pdf_input = gr.File(label="📄 上传PDF文件", file_types=[".pdf"])

    submit_btn = gr.Button("⚡️ 开始极速处理", variant="primary")

    # Output area: recognised text plus the saved result file.
    result_display = gr.Textbox(label="识别结果", lines=20, show_copy_button=True)
    download_link = gr.File(label="📥 点击此处下载结果文件", interactive=False)

    # One click runs the OCR job and fills both outputs.
    submit_btn.click(
        process_pdf_max_speed,
        [pdf_input],
        [result_display, download_link],
    )

# Enable the request queue, then start the server.
demo.queue()
demo.launch()