Spaces:

suprimedev
/

pdf2text5

Runtime error

App Files Community

suprimedev committed on Aug 23, 2025

Commit

c8d9a94

verified ·

1 Parent(s): 6b784e9

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -46

app.py CHANGED Viewed

@@ -1,54 +1,85 @@
 import gradio as gr
-import fitz
 import arabic_reshaper
 from PIL import Image
-import easyocr
-# Lightweight Reader to reduce CPU usage and load time
-reader = easyocr.Reader(['fa','ar','en'], gpu=False, detector='craft_mini')
 def extract_text_from_pdf(pdf_file):
-    """Extract the text of every page of a PDF and save it to a TXT file.
-    Pages whose extracted text is empty or has fewer than 10 distinct
-    characters are rendered at 150 DPI and read with the module-level
-    easyocr ``reader``. Returns ``(text, output_path)``; when ``pdf_file``
-    is None, returns a Persian prompt message and None.
-    """
-    if pdf_file is None:
-        return "لطفاً یک فایل PDF آپلود کنید.", None
-    # Accept either an object exposing ``.name`` or a plain path.
-    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
-    all_text = []
-    pdf_document = fitz.open(pdf_path)
-    for page_num, page in enumerate(pdf_document):
-        text = page.get_text("text")
-        # Fall back to OCR only when the text is empty or looks broken
-        if not text.strip() or len(set(text)) < 10:
-            pix = page.get_pixmap(dpi=150)
-            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            text = "\n".join(reader.readtext(img, detail=0))
-        # Reshape only text containing characters in U+0600-U+06FF or U+0750-U+077F
-        if any('\u0600' <= c <= '\u06FF' or '\u0750' <= c <= '\u077F' for c in text):
-            text = arabic_reshaper.reshape(text)
-        all_text.append(f"--- صفحه {page_num + 1} ---\n{text}\n")
-    pdf_document.close()
-    extracted_text = "\n".join(all_text)
-    # Persist the result so it can be returned as a downloadable file
-    output_file = "extracted_text.txt"
-    with open(output_file, "w", encoding="utf-8") as f:
-        f.write(extracted_text)
-    return extracted_text, output_file
-def create_interface():
-    """Build and return the Gradio Blocks UI.
-    The layout holds a PDF file input, an extract button, a text box for the
-    extracted text and a file output for the TXT download.
-    """
-    with gr.Blocks() as interface:
-        gr.Markdown("## استخراج متن PDF سریع و کم‌مصرف")
-        pdf_input = gr.File(label="آپلود PDF", file_types=[".pdf"], type="filepath")
-        extract_btn = gr.Button("استخراج متن")
-        text_output = gr.Textbox(label="متن استخراج شده", lines=20)
-        download_output = gr.File(label="دانلود TXT")
-        # The click handler fills both the text box and the download component
-        extract_btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=[text_output, download_output])
-    return interface
 if __name__ == "__main__":
-    # Build the UI and launch it only when the file is run as a script
-    interface = create_interface()
-    interface.launch()

 import gradio as gr
+import fitz  # PyMuPDF
 import arabic_reshaper
+from bidi.algorithm import get_display
+import pytesseract
 from PIL import Image
+import io
+import numpy as np
 def extract_text_from_pdf(pdf_file):
+    """Extract text from every page of a PDF, using OCR for image-only pages.
+
+    Args:
+        pdf_file: The uploaded PDF. Accepted forms are a filesystem path
+            (``str``), raw ``bytes``, a binary file-like object exposing
+            ``read()``, or an object whose ``name`` attribute is a path.
+            ``None`` means nothing was uploaded.
+
+    Returns:
+        str: The text of all pages, each preceded by a "--- صفحه N ---"
+        header, passed through ``arabic_reshaper.reshape`` and
+        ``get_display``. A warning line is prepended when at least one page
+        went through OCR. Failures are reported as a Persian error message
+        instead of being raised.
+    """
+    # The button can be clicked before any file has been chosen.
+    if pdf_file is None:
+        return "لطفاً یک فایل PDF آپلود کنید."
+    try:
+        # Open the PDF. The upload may arrive as a path or as a file-like
+        # object (assumed to depend on the Gradio version / ``type`` setting
+        # of gr.File -- TODO confirm), so every form is handled.
+        if isinstance(pdf_file, str):
+            doc = fitz.open(pdf_file)
+        elif isinstance(pdf_file, (bytes, bytearray)):
+            doc = fitz.open(stream=bytes(pdf_file), filetype="pdf")
+        elif hasattr(pdf_file, "read"):
+            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        else:
+            doc = fitz.open(pdf_file.name)
+        pages = []
+        has_ocr_processed = False
+        try:
+            # Extract text from all pages
+            for page_num, page in enumerate(doc, start=1):
+                # First try to extract the embedded text directly
+                text = page.get_text()
+                # Very little text: the page is probably a scanned image
+                if len(text.strip()) < 100:
+                    # Render the page at 300 DPI (PDF user space is 72 DPI)
+                    mat = fitz.Matrix(300 / 72, 300 / 72)
+                    pix = page.get_pixmap(matrix=mat)
+                    image = Image.open(io.BytesIO(pix.tobytes("ppm")))
+                    # The OCR result replaces whatever little text was found
+                    text = pytesseract.image_to_string(image, lang='fas+ara+eng')
+                    has_ocr_processed = True
+                pages.append(f"--- صفحه {page_num} ---\n{text}\n\n")
+        finally:
+            # Release the document even when a page fails to process
+            doc.close()
+        full_text = "".join(pages)
+        # Post-process the text for right-to-left languages
+        try:
+            reshaped_text = arabic_reshaper.reshape(full_text)
+            # NOTE(review): get_display reorders text into visual order;
+            # confirm the output widget does not apply bidi again.
+            bidi_text = get_display(reshaped_text)
+        except Exception:
+            # Best effort: fall back to the unprocessed text
+            return full_text
+        if has_ocr_processed:
+            bidi_text = "[⚠️ برخی صفحات با OCR پردازش شدند]\n\n" + bidi_text
+        return bidi_text
+    except Exception as e:
+        return f"خطا در پردازش فایل: {str(e)}"
+# Build the Gradio interface
+with gr.Blocks(title="PDF Text Extractor with OCR") as demo:
+    gr.Markdown("# 📄 استخراج متن از فایل PDF")
+    gr.Markdown("با قابلیت پردازش OCR برای PDFهای تصویری")
+    # Upload widget restricted to .pdf files
+    with gr.Row():
+        pdf_input = gr.File(label="فایل PDF را انتخاب کنید", file_types=[".pdf"])
+    extract_btn = gr.Button("🔄 استخراج متن")
+    # Read-only text box that receives the extracted text
+    with gr.Row():
+        text_output = gr.Textbox(label="متن استخراج شده", lines=20, interactive=False)
+    # Usage notes shown to the user below the output
+    gr.Markdown("""
+    **⚠️ توجه:**
+    - برای PDFهای متنی، متن مستقیماً استخراج می‌شود
+    - برای PDFهای تصویری، از OCR استفاده می‌شود
+    - پشتیبانی از زبان‌های فارسی، عربی و انگلیسی
+    - پردازش ممکن است برای فایل‌های بزرگ کمی زمان‌بر باشد
+    """)
+    # Clicking the button passes the uploaded file to extract_text_from_pdf
+    # and writes its return value into the text box
+    extract_btn.click(
+        fn=extract_text_from_pdf,
+        inputs=pdf_input,
+        outputs=text_output
+    )
 if __name__ == "__main__":
+    # Launch the app only when the file is run as a script
+    demo.launch()