Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import pytesseract | |
| import os | |
def _writable_output_dir(preferred="/mnt/data"):
    """Return *preferred* if it exists (or can be created) and is writable.

    Falls back to a freshly created temporary directory otherwise, so the
    caller never fails just because the preferred location is unavailable.
    """
    try:
        os.makedirs(preferred, exist_ok=True)
    except OSError:
        # Could not create it (read-only FS, permissions, ...): fall through.
        pass
    if os.path.isdir(preferred) and os.access(preferred, os.W_OK):
        return preferred
    import tempfile  # local import: only needed on the fallback path

    return tempfile.mkdtemp(prefix="pdf_extract_")


def extract_pdf(pdf_file, lang_codes: str, force_ocr: bool):
    """Extract the text of a PDF, falling back to Tesseract OCR when needed.

    The embedded text layer of every page is read first.  If ``force_ocr`` is
    set, or the whole document yields fewer than 50 non-padding characters,
    every page is rendered at 2x zoom and run through Tesseract instead.

    Args:
        pdf_file: Either a path string or an object exposing the path in a
            ``name`` attribute.  ``None`` means nothing was uploaded.
        lang_codes: Tesseract language string such as ``"fas+ara+eng"``.
            Empty, blank or ``None`` selects ``"fas+ara+eng"``.
        force_ocr: When true, always OCR the pages even if a text layer exists.

    Returns:
        A ``(text, path)`` tuple: the combined text with a
        ``--- Page N ---`` header per page, and the path of the UTF-8 ``.txt``
        file it was written to.  When ``pdf_file`` is ``None`` the tuple is
        ``(user-facing message, None)``.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF انتخاب کنید.", None

    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    # Tolerate None as well as blank input from the UI.
    langs = (lang_codes or "").strip() or "fas+ara+eng"

    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(path) as doc:
        page_texts = [(page.get_text("text") or "").strip() for page in doc]
        # Count stripped text only: stray whitespace must not mask a missing
        # text layer and suppress the OCR fallback.
        total_chars = sum(len(text) for text in page_texts)

        if force_ocr or total_chars < 50:
            zoom = fitz.Matrix(2, 2)  # 2x render scale; built once for all pages
            ocr_texts = []
            for page in doc:
                # alpha=False guarantees 3 bytes per pixel, matching "RGB".
                pix = page.get_pixmap(matrix=zoom, alpha=False)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                ocr_texts.append(
                    pytesseract.image_to_string(img, lang=langs).strip()
                )
            page_texts = ocr_texts

    combined = "\n\n".join(
        f"--- Page {number} ---\n{text}"
        for number, text in enumerate(page_texts, start=1)
    )

    out_path = os.path.join(
        _writable_output_dir("/mnt/data"),
        os.path.basename(path) + "_extracted.txt",
    )
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined)
    return combined, out_path
# Input widgets, in the positional order expected by extract_pdf:
# the uploaded PDF, the Tesseract language string, and the force-OCR switch.
_input_components = [
    gr.File(label="فایل PDF", file_types=[".pdf"]),
    gr.Textbox(label="کد زبان تسرکت", value="fas+ara+eng"),
    gr.Checkbox(label="اجبار OCR", value=False),
]

# Output widgets, matching extract_pdf's (text, file path) return pair.
_output_components = [
    gr.Textbox(label="متن خروجی", lines=20),
    gr.File(label="دانلود فایل خروجی"),
]

# The Gradio app object that wires the widgets above to extract_pdf.
demo = gr.Interface(
    fn=extract_pdf,
    inputs=_input_components,
    outputs=_output_components,
    title="استخراج متن PDF چندزبانه",
    description="پشتیبانی از فارسی/عربی با Tesseract. مصرف CPU خیلی کمتر از EasyOCR.",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()