Spaces:

suprimedev
/

pdf2text5

Runtime error

App Files Files Community

suprimedev committed on Aug 23, 2025

Commit

b5f1a8b

verified ·

1 Parent(s): d67ab18

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -29

app.py CHANGED Viewed

@@ -1,58 +1,50 @@
-import gradio as gr
-import fitz  # PyMuPDF
-import arabic_reshaper
 import easyocr
 from PIL import Image
-import io
 reader = easyocr.Reader(['fa','ar','en'], gpu=False)
 def extract_text_from_pdf(pdf_file):
     """Extract the text of every page of a PDF, using OCR for scanned pages.

     Pages whose embedded text layer is empty or has fewer than 10 distinct
     characters are rendered at 150 dpi and read with the module-level EasyOCR
     ``reader``. Text containing characters in U+0600-U+06FF or U+0750-U+077F
     is passed through ``arabic_reshaper.reshape``. The combined text is also
     written to ``extracted_text.txt`` in the current working directory.

     Args:
         pdf_file: Path of the PDF to read, or ``None`` when nothing was
             uploaded.

     Returns:
         A ``(text, path)`` tuple: the extracted text and the path of the
         written ``.txt`` file, or ``(error_message, None)`` when no file was
         given or processing failed.
     """
     if pdf_file is None:
         return "لطفاً یک فایل PDF آپلود کنید.", None
     try:
         # Local import keeps this block self-contained; numpy is only needed
         # to hand rendered pages to EasyOCR.
         import numpy as np

         all_text = []
         # The context manager closes the document even when a page fails.
         with fitz.open(pdf_file) as pdf_document:
             for page_num, page in enumerate(pdf_document, start=1):
                 text = page.get_text("text")
                 # Empty or near-empty text layer: treat the page as a scan.
                 if not text.strip() or len(set(text)) < 10:
                     pix = page.get_pixmap(dpi=150)
                     img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                     # readtext() takes a path, bytes or a numpy array; a
                     # generic PIL image is rejected, so convert it first.
                     text_lines = reader.readtext(np.array(img), detail=0)
                     text = "\n".join(text_lines)
                 if any('\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F' for char in text):
                     text = arabic_reshaper.reshape(text)
                 all_text.append(f"--- صفحه {page_num} ---\n{text}\n")
         extracted_text = "\n".join(all_text)
         # NOTE(review): fixed file name in the working directory; concurrent
         # calls overwrite each other's output - confirm this is acceptable.
         output_file = "extracted_text.txt"
         with open(output_file, "w", encoding="utf-8") as f:
             f.write(extracted_text)
         return extracted_text, output_file
     except Exception as e:
         # UI boundary: report the failure as text instead of raising.
         return f"خطا در پردازش فایل: {str(e)}", None
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF text extractor.

    The layout is a Markdown header followed by one row with two columns:
    a PDF upload plus an extract button on the left, and the extracted-text
    box plus a download slot on the right. Clicking the button calls
    ``extract_text_from_pdf`` with the uploaded file and fills both outputs.
    """
    # User-facing introduction and usage steps shown at the top of the page.
    intro_markdown = """# 📄 استخراج متن از PDF با OCR بهینه
این برنامه متن PDF را استخراج می‌کند.
- اگر PDF متن واقعی داشته باشد مستقیم می‌خواند.
- اگر اسکن شده باشد فقط صفحات خالی یا غیرقابل‌خواندن OCR می‌شوند.
### نحوه استفاده:
1. فایل PDF را آپلود کنید
2. روی "استخراج متن" کلیک کنید
3. متن استخراج‌شده را مشاهده و دانلود کنید"""

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown(intro_markdown)
        with gr.Row():
            # Left column: input controls.
            with gr.Column(scale=1):
                upload_box = gr.File(
                    label="📂 آپلود PDF",
                    file_types=[".pdf"],
                    type="filepath",
                )
                run_button = gr.Button("🔍 استخراج متن", variant="primary")
            # Right column: results.
            with gr.Column(scale=2):
                result_box = gr.Textbox(
                    label="📝 متن استخراج شده",
                    placeholder="متن PDF اینجا نمایش داده می‌شود...",
                    lines=20,
                    max_lines=30,
                )
                download_slot = gr.File(label="⬇️ دانلود txt")
        # Wire the button while still inside the Blocks context.
        run_button.click(
            fn=extract_text_from_pdf,
            inputs=upload_box,
            outputs=[result_box, download_slot],
        )
    return app
# Build and launch the Gradio app only when this file is run as a script;
# `interface` is bound at module level in that case.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()

+import os
+import fitz
 import easyocr
 from PIL import Image
+import arabic_reshaper
 reader = easyocr.Reader(['fa','ar','en'], gpu=False)
 def extract_text_from_pdf(pdf_file):
     """Extract the text of every page of a PDF, using OCR for scanned pages.

     Pages whose embedded text layer is empty or has fewer than 10 distinct
     characters are rendered at 150 dpi and read with the module-level EasyOCR
     ``reader``. Text containing characters in U+0600-U+06FF or U+0750-U+077F
     is passed through ``arabic_reshaper.reshape``. The combined text is also
     written to ``extracted_text.txt`` in the current working directory.

     Args:
         pdf_file: ``None``, a filesystem path string, or an object exposing
             the path as ``.name`` (e.g. an uploaded temporary file).

     Returns:
         A ``(text, path)`` tuple: the extracted text and the path of the
         written ``.txt`` file, or ``(error_message, None)`` when no file was
         given, the argument type is unsupported, or processing failed.
     """
     if pdf_file is None:
         return "لطفاً یک فایل PDF آپلود کنید.", None
     # Normalise the argument to a usable filesystem path.
     if hasattr(pdf_file, "name"):  # file-like object such as a temporary file
         pdf_path = pdf_file.name
     elif isinstance(pdf_file, str):  # already a path string
         pdf_path = pdf_file
     else:
         return "نوع فایل نامعتبر است.", None
     try:
         # Local import keeps this block self-contained; numpy is only needed
         # to hand rendered pages to EasyOCR.
         import numpy as np

         all_text = []
         # The context manager closes the document even when a page fails.
         with fitz.open(pdf_path) as pdf_document:
             for page_num, page in enumerate(pdf_document, start=1):
                 text = page.get_text("text")
                 # Empty or near-empty text layer: treat the page as a scan.
                 if not text.strip() or len(set(text)) < 10:
                     pix = page.get_pixmap(dpi=150)
                     img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                     # readtext() takes a path, bytes or a numpy array; a
                     # generic PIL image is rejected, so convert it first.
                     text_lines = reader.readtext(np.array(img), detail=0)
                     text = "\n".join(text_lines)
                 if any('\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F' for char in text):
                     text = arabic_reshaper.reshape(text)
                 all_text.append(f"--- صفحه {page_num} ---\n{text}\n")
         extracted_text = "\n".join(all_text)
         # NOTE(review): fixed file name in the working directory; concurrent
         # calls overwrite each other's output - confirm this is acceptable.
         output_file = "extracted_text.txt"
         with open(output_file, "w", encoding="utf-8") as f:
             f.write(extracted_text)
         return extracted_text, output_file
     except Exception as e:
         # UI boundary: report the failure as text instead of raising.
         return f"خطا در پردازش فایل: {str(e)}", None