"""Gradio app that extracts text from a PDF, falling back to Tesseract OCR."""

import os

import fitz  # PyMuPDF
import gradio as gr
import pytesseract
from PIL import Image

# Tesseract language codes used when the language textbox is left empty.
DEFAULT_LANGS = "fas+ara+eng"
# If the embedded text layer has fewer characters than this, run OCR instead.
MIN_TEXT_CHARS = 50
# Zoom factor applied on both axes when rasterizing a page for OCR.
OCR_ZOOM = 2
# Directory where the extracted-text file is written.
OUTPUT_DIR = "/mnt/data"


def _ocr_page(page, langs):
    """Rasterize one PDF page and return its Tesseract OCR text, stripped."""
    # alpha=False keeps the sample buffer 3 bytes per pixel, matching "RGB".
    pix = page.get_pixmap(matrix=fitz.Matrix(OCR_ZOOM, OCR_ZOOM), alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img, lang=langs).strip()


def extract_pdf(pdf_file, lang_codes: str, force_ocr: bool):
    """Extract the text of a PDF, page by page, and save it to a text file.

    The embedded text layer is read first. If ``force_ocr`` is set, or the
    text layer holds fewer than ``MIN_TEXT_CHARS`` non-whitespace-trimmed
    characters in total, every page is rasterized and OCR'd instead.

    Args:
        pdf_file: Either an object with a ``name`` attribute holding the file
            path, or the path itself. ``None`` means nothing was uploaded.
        lang_codes: Tesseract language codes joined by ``+``. Empty or
            ``None`` falls back to ``DEFAULT_LANGS``.
        force_ocr: Run OCR even when the PDF has a usable text layer.

    Returns:
        A ``(text, path)`` tuple: the combined text with a ``--- Page N ---``
        header before each page, and the path of the written ``.txt`` file.
        When ``pdf_file`` is ``None`` the tuple is ``(message, None)``.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF انتخاب کنید.", None

    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    # Guard against a None textbox value before stripping.
    langs = (lang_codes or "").strip() or DEFAULT_LANGS

    # The context manager closes the document even if extraction/OCR raises.
    with fitz.open(path) as doc:
        page_texts = [(page.get_text("text") or "").strip() for page in doc]
        # Count stripped text so whitespace-only text layers (common in
        # scanned PDFs) do not suppress the OCR fallback.
        total_chars = sum(len(text) for text in page_texts)
        if force_ocr or total_chars < MIN_TEXT_CHARS:
            page_texts = [_ocr_page(page, langs) for page in doc]

    combined = "\n\n".join(
        f"--- Page {i+1} ---\n{t}" for i, t in enumerate(page_texts)
    )

    # The output directory is not guaranteed to exist on every host.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    out_path = os.path.join(OUTPUT_DIR, os.path.basename(path) + "_extracted.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined)

    return combined, out_path


demo = gr.Interface(
    fn=extract_pdf,
    inputs=[
        gr.File(label="فایل PDF", file_types=[".pdf"]),
        gr.Textbox(label="کد زبان تسرکت", value="fas+ara+eng"),
        gr.Checkbox(label="اجبار OCR", value=False),
    ],
    outputs=[
        gr.Textbox(label="متن خروجی", lines=20),
        gr.File(label="دانلود فایل خروجی"),
    ],
    title="استخراج متن PDF چندزبانه",
    description="پشتیبانی از فارسی/عربی با Tesseract. مصرف CPU خیلی کمتر از EasyOCR.",
)

if __name__ == "__main__":
    demo.launch()