# pdf2text2 / app.py
# Uploaded by suprimedev ("Upload 3 files", commit 6fed9ca, verified).
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
def extract_pdf(pdf_file, lang_codes: str, force_ocr: bool):
    """Extract the text of a PDF, falling back to Tesseract OCR when needed.

    The embedded text layer of every page is read first. If ``force_ocr`` is
    set, or the whole document yields fewer than 50 characters (after trimming
    each page), every page is rendered to an image and run through Tesseract
    instead. The result is also written to a UTF-8 text file.

    Args:
        pdf_file: The uploaded file: either an object whose ``name`` attribute
            holds the path, or the path itself. ``None`` means nothing was
            selected.
        lang_codes: Tesseract language codes joined with ``+``. A blank (or
            ``None``) value falls back to ``"fas+ara+eng"``.
        force_ocr: When true, OCR is used even if a text layer is present.

    Returns:
        A ``(text, path)`` tuple with the combined page texts and the path of
        the written text file, or ``(message, None)`` when no file was given.
    """
    if pdf_file is None:
        return "لطفاً یک فایل PDF انتخاب کنید.", None

    # Local import keeps this function self-contained with respect to the
    # file's top-level import block.
    import tempfile

    path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    # Tolerate a missing value as well as a blank one.
    langs = (lang_codes or "").strip() or "fas+ara+eng"

    doc = fitz.open(path)
    try:
        page_texts = [(page.get_text("text") or "").strip() for page in doc]
        # Count trimmed text so whitespace-only pages do not pass for a real
        # text layer and suppress the OCR fallback.
        total_chars = sum(len(text) for text in page_texts)

        if force_ocr or total_chars < 50:
            ocr_texts = []
            for page in doc:
                # Render at 2x zoom; alpha is disabled so the sample buffer
                # matches the 3-byte "RGB" layout expected below.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
                img = Image.frombytes(
                    "RGB", [pix.width, pix.height], pix.samples
                )
                text = pytesseract.image_to_string(img, lang=langs)
                ocr_texts.append(text.strip())
            page_texts = ocr_texts
    finally:
        # Always release the document handle, even if extraction or OCR fails.
        doc.close()

    combined = "\n\n".join(
        f"--- Page {i + 1} ---\n{t}" for i, t in enumerate(page_texts)
    )

    # Write into a fresh temporary directory: it always exists, is writable,
    # and keeps same-named uploads from overwriting each other's output.
    out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
    out_path = os.path.join(
        out_dir, os.path.basename(path) + "_extracted.txt"
    )
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined)
    return combined, out_path
# Input widgets, listed in the positional order of extract_pdf's parameters
# (pdf_file, lang_codes, force_ocr).
_input_widgets = [
    gr.File(label="فایل PDF", file_types=[".pdf"]),
    gr.Textbox(label="کد زبان تسرکت", value="fas+ara+eng"),
    gr.Checkbox(label="اجبار OCR", value=False),
]

# Output widgets, matching extract_pdf's (text, file path) return pair.
_output_widgets = [
    gr.Textbox(label="متن خروجی", lines=20),
    gr.File(label="دانلود فایل خروجی"),
]

# The Gradio interface wiring the widgets above to extract_pdf.
demo = gr.Interface(
    fn=extract_pdf,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="استخراج متن PDF چندزبانه",
    description="پشتیبانی از فارسی/عربی با Tesseract. مصرف CPU خیلی کمتر از EasyOCR.",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()