File size: 1,833 Bytes
6fed9ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pytesseract
from PIL import Image

def extract_pdf(pdf_file, lang_codes: str, force_ocr: bool):
    """Extract the text of a PDF, falling back to Tesseract OCR when needed.

    The embedded text layer of every page is read first. If ``force_ocr`` is
    set, or the whole document yields fewer than 50 non-whitespace-trimmed
    characters, every page is rendered at 2x zoom and run through Tesseract
    instead.

    Args:
        pdf_file: Either a filesystem path or an object exposing the path as
            its ``name`` attribute. ``None`` means nothing was selected.
        lang_codes: Tesseract language string (e.g. ``"fas+ara+eng"``). Empty,
            blank or ``None`` falls back to ``"fas+ara+eng"``.
        force_ocr: When true, OCR is used even if a text layer exists.

    Returns:
        A ``(text, out_path)`` tuple: the combined per-page text and the path
        of the UTF-8 ``.txt`` file it was written to. When ``pdf_file`` is
        ``None`` the tuple is ``(<prompt message>, None)``.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF انتخاب کنید.", None

    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    # Guard against a None textbox value as well as an empty/blank one.
    langs = (lang_codes or "").strip() or "fas+ara+eng"

    # The context manager guarantees the document handle is released even if
    # rendering or OCR raises part-way through.
    with fitz.open(path) as doc:
        page_texts = [(page.get_text("text") or "").strip() for page in doc]
        # Count stripped text so that pages holding only whitespace do not
        # hide the fact that the PDF has no usable text layer.
        total_chars = sum(len(text) for text in page_texts)

        if force_ocr or total_chars < 50:
            zoom = fitz.Matrix(2, 2)  # 2x render for better OCR accuracy
            ocr_texts = []
            for page in doc:
                # alpha=False keeps the sample buffer at 3 bytes per pixel,
                # which is what the "RGB" decode below requires.
                pix = page.get_pixmap(matrix=zoom, alpha=False)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                text = pytesseract.image_to_string(img, lang=langs)
                ocr_texts.append(text.strip())
            page_texts = ocr_texts

    combined = "\n\n".join(
        f"--- Page {number} ---\n{text}"
        for number, text in enumerate(page_texts, start=1)
    )

    # Keep the original output location when it is usable; otherwise fall
    # back to the system temp directory instead of crashing on open().
    out_dir = "/mnt/data"
    if not (os.path.isdir(out_dir) and os.access(out_dir, os.W_OK)):
        out_dir = tempfile.gettempdir()
    out_path = os.path.join(out_dir, os.path.basename(path) + "_extracted.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined)

    return combined, out_path

# Gradio front end wired to extract_pdf.
demo = gr.Interface(
    title="استخراج متن PDF چندزبانه",
    description="پشتیبانی از فارسی/عربی با Tesseract. مصرف CPU خیلی کمتر از EasyOCR.",
    fn=extract_pdf,
    # Positional order mirrors extract_pdf's parameters:
    # the PDF file, the Tesseract language codes, the force-OCR flag.
    inputs=[
        gr.File(file_types=[".pdf"], label="فایل PDF"),
        gr.Textbox(value="fas+ara+eng", label="کد زبان تسرکت"),
        gr.Checkbox(value=False, label="اجبار OCR"),
    ],
    # Positional order mirrors extract_pdf's return tuple:
    # the combined text, then the path of the written text file.
    outputs=[
        gr.Textbox(lines=20, label="متن خروجی"),
        gr.File(label="دانلود فایل خروجی"),
    ],
)

# Start the UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()