Spaces:

suprimedev
/

pdf2text4

Sleeping

File size: 3,923 Bytes

bea0406
da8d102
9fe191e
da8d102
 
f3fb6b1
bea0406
 
 
da8d102
bea0406
 
4f46d61
9fe191e
bea0406
f3fb6b1
9fe191e
 
 
 
da8d102
 
 
 
 
 
 
 
 
9fe191e
4f46d61
9fe191e
4f46d61
9fe191e
 
 
 
4f46d61
 
da8d102
4f46d61
 
 
9fe191e
4f46d61
 
bea0406
4f46d61
bea0406
f3fb6b1
9fe191e
 
da8d102
9fe191e
 
 
 
da8d102
9fe191e
da8d102
 
 
9fe191e
 
da8d102
 
 
9fe191e
 
 
 
 
 
da8d102
9fe191e
 
 
 
 
 
 
 
 
 
da8d102
 
9fe191e
f3fb6b1
9fe191e
4f46d61
da8d102
4f46d61
9fe191e
da8d102
9fe191e
 
 
4f46d61
9fe191e
 
 
bea0406
f3fb6b1
bea0406
9fe191e
f3fb6b1

import gradio as gr
import fitz  # PyMuPDF
import arabic_reshaper
import pytesseract
from PIL import Image


def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF, falling back to OCR.

    For each page the embedded text layer is read first. When that text is
    blank or contains fewer than 10 distinct characters, the page is rendered
    to an image at 200 DPI and passed to Tesseract with ``lang="fas+ara+eng"``.
    Page text containing characters in the ranges U+0600-U+06FF or
    U+0750-U+077F is then passed through ``arabic_reshaper.reshape``.

    Args:
        pdf_file: Path of the PDF to read, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the text of all
        pages, each preceded by a page header line, and ``path`` points to a
        UTF-8 file named ``extracted_text.txt`` containing the same text.
        When ``pdf_file`` is ``None`` or processing raises, ``text`` is a
        user-facing message and ``path`` is ``None``.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    import os
    import tempfile

    if pdf_file is None:
        return "لطفاً یک فایل PDF آپلود کنید.", None

    try:
        pdf_document = fitz.open(pdf_file)
        try:
            all_text = []

            for page_number, page in enumerate(pdf_document, start=1):
                # First attempt: read the embedded text layer directly.
                text = page.get_text("text")

                # Blank or near-empty text is treated as unreadable -> OCR.
                if not text.strip() or len(set(text)) < 10:
                    pix = page.get_pixmap(dpi=200)  # render the page to an image
                    img = Image.frombytes(
                        "RGB", [pix.width, pix.height], pix.samples
                    )
                    text = pytesseract.image_to_string(img, lang="fas+ara+eng")

                # Reshape text that contains Persian/Arabic characters.
                if any(
                    '\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F'
                    for char in text
                ):
                    text = arabic_reshaper.reshape(text)

                all_text.append(f"--- صفحه {page_number} ---\n{text}\n")
        finally:
            # Release the document even when extraction or OCR fails midway.
            pdf_document.close()

        extracted_text = "\n".join(all_text)

        # Write the result into a fresh per-call directory so simultaneous
        # requests cannot overwrite each other's download file; the basename
        # stays "extracted_text.txt".
        # NOTE(review): the temporary directory is not removed here.
        output_dir = tempfile.mkdtemp(prefix="pdf2text_")
        output_file = os.path.join(output_dir, "extracted_text.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(extracted_text)

        return extracted_text, output_file

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"خطا در پردازش فایل: {e}", None


def create_interface():
    """Build the Gradio Blocks UI for the PDF text extractor.

    The page consists of an introductory Markdown panel and a single row with
    two columns: a PDF upload field plus an extract button on the left, and a
    text box plus a downloadable file component on the right. Clicking the
    button calls ``extract_text_from_pdf`` with the uploaded file and routes
    its two return values to the text box and the file component.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """
    intro_markdown = """
            # 📄 استخراج متن از PDF با OCR
            
            این برنامه متن را از فایل‌های PDF استخراج می‌کند.  
            - ابتدا سعی می‌کند متن را مستقیماً بخواند.  
            - اگر PDF اسکن‌شده یا رمزگذاری‌شده باشد، از OCR (تشخیص متن از تصویر) استفاده می‌کند.  
            
            ### نحوه استفاده:
            1. فایل PDF خود را آپلود کنید  
            2. روی دکمه "استخراج متن" کلیک کنید  
            3. متن استخراج‌شده را ببینید یا فایل txt را دانلود کنید  
            """

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown(intro_markdown)

        with gr.Row():
            # Left column: input file and the trigger button.
            with gr.Column(scale=1):
                upload_field = gr.File(
                    label="📂 فایل PDF را آپلود کنید",
                    file_types=[".pdf"],
                    type="filepath",
                )
                run_button = gr.Button("🔍 استخراج متن", variant="primary")

            # Right column: extracted text and the downloadable txt file.
            with gr.Column(scale=2):
                result_box = gr.Textbox(
                    label="📝 متن استخراج شده",
                    placeholder="اینجا متن PDF نمایش داده می‌شود...",
                    lines=20,
                    max_lines=30,
                )
                result_file = gr.File(label="⬇️ دانلود خروجی به صورت txt")

        # Wire the button to the extraction function.
        run_button.click(
            fn=extract_text_from_pdf,
            inputs=upload_field,
            outputs=[result_box, result_file],
        )

    return app


if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    create_interface().launch()