Spaces:

suprimedev
/

pdf2text5

Runtime error

App Files Files Community

suprimedev committed on Aug 23, 2025

Commit

d67ab18

verified ·

1 Parent(s): 9172e12

Upload 2 files

Browse files

Files changed (2) hide show

app.py +58 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import gradio as gr
+import fitz  # PyMuPDF
+import arabic_reshaper
+import easyocr
+from PIL import Image
+import io
+# Module-level EasyOCR reader for Persian ('fa'), Arabic ('ar') and English ('en'),
+# created once at import with GPU use disabled.
+reader = easyocr.Reader(['fa','ar','en'], gpu=False)
+def extract_text_from_pdf(pdf_file):
+    if pdf_file is None:
+        return "لطفاً یک فایل PDF آپلود کنید.", None
+    try:
+        pdf_document = fitz.open(pdf_file)
+        all_text = []
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
+            text = page.get_text("text")
+            if not text.strip() or len(set(text)) < 10:
+                pix = page.get_pixmap(dpi=150)
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                text_lines = reader.readtext(img, detail=0)
+                text = "\n".join(text_lines)
+            if any('\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F' for char in text):
+                text = arabic_reshaper.reshape(text)
+            all_text.append(f"--- صفحه {page_num + 1} ---\n{text}\n")
+        pdf_document.close()
+        extracted_text = "\n".join(all_text)
+        output_file = "extracted_text.txt"
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(extracted_text)
+        return extracted_text, output_file
+    except Exception as e:
+        return f"خطا در پردازش فایل: {str(e)}", None
+def create_interface():
+    with gr.Blocks(theme=gr.themes.Soft()) as interface:
+        gr.Markdown("""# 📄 استخراج متن از PDF با OCR بهینه
+این برنامه متن PDF را استخراج می‌کند.
+- اگر PDF متن واقعی داشته باشد مستقیم می‌خواند.
+- اگر اسکن شده باشد فقط صفحات خالی یا غیرقابل‌خواندن OCR می‌شوند.
+### نحوه استفاده:
+1. فایل PDF را آپلود کنید
+2. روی "استخراج متن" کلیک کنید
+3. متن استخراج‌شده را مشاهده و دانلود کنید""")
+        with gr.Row():
+            with gr.Column(scale=1):
+                pdf_input = gr.File(label="📂 آپلود PDF", file_types=[".pdf"], type="filepath")
+                extract_btn = gr.Button("🔍 استخراج متن", variant="primary")
+            with gr.Column(scale=2):
+                text_output = gr.Textbox(label="📝 متن استخراج شده", placeholder="متن PDF اینجا نمایش داده می‌شود...", lines=20, max_lines=30)
+                download_output = gr.File(label="⬇️ دانلود txt")
+        extract_btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=[text_output, download_output])
+    return interface
+if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+PyMuPDF
+arabic-reshaper
+python-bidi
+pillow
+easyocr