Spaces:

suprimedev
/

pdf2text5

Runtime error

App Files Files Community

suprimedev committed on Aug 23, 2025

Commit

6b784e9

verified ·

1 Parent(s): b5f1a8b

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -43

app.py CHANGED Viewed

@@ -1,50 +1,54 @@
-import os
 import fitz
-import easyocr
-from PIL import Image
 import arabic_reshaper
# Module-level EasyOCR reader for Persian, Arabic and English, with GPU use disabled.
reader = easyocr.Reader(
    ["fa", "ar", "en"],
    gpu=False,
)
 def extract_text_from_pdf(pdf_file):
     """Extract the text of every page of a PDF, using OCR where no usable text exists.

     Args:
         pdf_file: ``None``, a filesystem path string, or an object that exposes
             the path through a ``name`` attribute.

     Returns:
         A ``(text, path)`` tuple. On success ``text`` is the combined page text
         and ``path`` is the name of the UTF-8 file it was written to. When no
         file is given, the input type is unsupported, or processing fails,
         ``text`` is a user-facing message and ``path`` is ``None``.
     """
     if pdf_file is None:
         return "لطفاً یک فایل PDF آپلود کنید.", None

     # Normalise the input into a filesystem path.
     if hasattr(pdf_file, "name"):  # object carrying its path in ``name``
         pdf_path = pdf_file.name
     elif isinstance(pdf_file, str):  # already a path string
         pdf_path = pdf_file
     else:
         return "نوع فایل نامعتبر است.", None

     try:
         # Imported locally so the file's import block needs no change.
         import numpy as np

         all_text = []
         pdf_document = fitz.open(pdf_path)
         try:
             for page_num, page in enumerate(pdf_document):
                 text = page.get_text("text")

                 # Fall back to OCR when the text layer is empty or has fewer
                 # than 10 distinct characters.
                 if not text.strip() or len(set(text)) < 10:
                     pix = page.get_pixmap(dpi=150)
                     img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                     # EasyOCR accepts a path/URL, bytes or a numpy array, not a
                     # generic PIL image, so convert before calling readtext.
                     text_lines = reader.readtext(np.asarray(img), detail=0)
                     text = "\n".join(text_lines)

                 # Reshape only when the page contains Arabic-script characters.
                 if any('\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F' for char in text):
                     text = arabic_reshaper.reshape(text)

                 all_text.append(f"--- صفحه {page_num + 1} ---\n{text}\n")
         finally:
             # Close the document even if a page fails part-way through.
             pdf_document.close()

         extracted_text = "\n".join(all_text)
         output_file = "extracted_text.txt"
         with open(output_file, "w", encoding="utf-8") as f:
             f.write(extracted_text)
         return extracted_text, output_file
     except Exception as e:
         # Report failures as a message instead of raising.
         return f"خطا در پردازش فایل: {e}", None

+import gradio as gr
 import fitz
 import arabic_reshaper
+from PIL import Image
+import easyocr
# Shared CPU-only EasyOCR reader for Persian, Arabic and English.
# NOTE: EasyOCR's `detector` argument is a boolean "load the detection model"
# flag, not a model name; the detection network is chosen with `detect_network`.
# The former detector='craft_mini' was only a truthy value and did not select a
# lighter model, so it is dropped here with no change in behaviour.
reader = easyocr.Reader(['fa', 'ar', 'en'], gpu=False)
 def extract_text_from_pdf(pdf_file):
     """Extract the text of every page of a PDF, using OCR where no usable text exists.

     Args:
         pdf_file: ``None``, a filesystem path, or an object that exposes the
             path through a ``name`` attribute.

     Returns:
         A ``(text, path)`` tuple. When ``pdf_file`` is ``None`` the text is a
         user-facing prompt and ``path`` is ``None``. Otherwise ``text`` is the
         combined page text and ``path`` is the name of the UTF-8 file it was
         written to.

     Raises:
         Any exception raised while opening or processing the PDF propagates to
         the caller; the document is closed first.
     """
     if pdf_file is None:
         return "لطفاً یک فایل PDF آپلود کنید.", None

     # Imported locally so the file's import block needs no change.
     import numpy as np

     pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
     all_text = []
     pdf_document = fitz.open(pdf_path)
     try:
         for page_num, page in enumerate(pdf_document):
             text = page.get_text("text")

             # OCR only when the text layer is empty or has fewer than 10
             # distinct characters.
             if not text.strip() or len(set(text)) < 10:
                 pix = page.get_pixmap(dpi=150)
                 img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                 # EasyOCR accepts a path/URL, bytes or a numpy array, not a
                 # generic PIL image, so convert before calling readtext.
                 text = "\n".join(reader.readtext(np.asarray(img), detail=0))

             # Reshape only when the page contains Arabic-script characters.
             if any('\u0600' <= c <= '\u06FF' or '\u0750' <= c <= '\u077F' for c in text):
                 text = arabic_reshaper.reshape(text)

             all_text.append(f"--- صفحه {page_num + 1} ---\n{text}\n")
     finally:
         # Close the document even if a page fails part-way through.
         pdf_document.close()

     extracted_text = "\n".join(all_text)
     output_file = "extracted_text.txt"
     with open(output_file, "w", encoding="utf-8") as f:
         f.write(extracted_text)
     return extracted_text, output_file
def create_interface():
    """Build and return the Gradio Blocks UI for PDF text extraction.

    The layout holds a heading, a PDF upload field, a trigger button, a text
    box for the extracted text and a file component for the download. Clicking
    the button calls ``extract_text_from_pdf`` with the uploaded file and
    routes its two return values to the text box and the download component.
    """
    with gr.Blocks() as app:
        gr.Markdown("## استخراج متن PDF سریع و کم‌مصرف")
        upload = gr.File(
            label="آپلود PDF",
            file_types=[".pdf"],
            type="filepath",
        )
        run_button = gr.Button("استخراج متن")
        result_box = gr.Textbox(label="متن استخراج شده", lines=20)
        result_file = gr.File(label="دانلود TXT")

        # Wire the button: one input (the upload), two outputs (text, file).
        run_button.click(
            fn=extract_text_from_pdf,
            inputs=upload,
            outputs=[result_box, result_file],
        )
    return app
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_interface().launch()