Spaces:

F-allahmoradi
/

Persian-Book-Info-Extractor

Sleeping

App Files Files Community

F-allahmoradi committed on Nov 2, 2025

Commit

3f6b428

verified ·

1 Parent(s): b189993

Upload app.py

Browse files

Files changed (1) hide show

app.py +223 -0

app.py ADDED Viewed

	@@ -0,0 +1,223 @@

+# ==============================
+# 📦 وارد کردن کتابخانه‌ها
+# ==============================
+import json
+import re
+import os
+import tempfile
+import torch
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+from groq import Groq
+import gradio as gr
+# ==============================
+# 🌐 تنظیمات جهانی
+# ==============================
+# Zero-width non-joiner, inserted between the "می" prefix and the verb stem.
+ZWNJ = '\u200c'
+# Masked-language model used by mlm_correct for non-Persian text.
+MLM_NAME = "bert-base-multilingual-cased"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# SECURITY: API keys must never be committed to source control. They are read
+# from the GROQ_API_KEYS environment variable as a comma-separated list
+# (e.g. configured as a secret of the hosting environment). Any key that was
+# previously committed to this repository should be treated as leaked and
+# revoked.
+GROQ_API_KEYS = [
+    key.strip()
+    for key in os.environ.get("GROQ_API_KEYS", "").split(",")
+    if key.strip()
+]
+print("⏳ بارگذاری BERT برای اصلاح املایی...")
+# Tokenizer and model are loaded once at import time and shared by all requests.
+tok = AutoTokenizer.from_pretrained(MLM_NAME)
+mlm = AutoModelForMaskedLM.from_pretrained(MLM_NAME).to(DEVICE)
+# ==============================
+# 🧠 توابع پردازش متن (همان کد شما)
+# ==============================
+def is_persian(text: str) -> bool:
+    """Return True when *text* is mostly made of U+0600..U+06FF characters.
+
+    Empty input is treated as Persian. Otherwise the share of characters in
+    that range, measured against the full length of *text* (whitespace and
+    punctuation included), must be strictly greater than one half.
+    """
+    if not text:
+        return True
+    in_range = [ch for ch in text if '\u0600' <= ch <= '\u06FF']
+    share = len(in_range) / len(text)
+    return share > 0.5
+def fix_persian(text: str) -> str:
+    """Normalise OCR output that was classified as Persian.
+
+    Steps, in order: map Arabic letter variants to the letters used here,
+    drop the "(0" artefact, apply a fixed list of known misreadings, collapse
+    runs of spaces and trim, then join "می" to a small set of following verb
+    stems with a zero-width non-joiner.
+    """
+    # Single-character letter normalisation.
+    letter_map = (
+        ('ك', 'ک'),
+        ('ي', 'ی'),
+        ('ئ', 'ی'),
+        ('أ', 'ا'),
+        ('ة', 'ه'),
+    )
+    result = text
+    for source, target in letter_map:
+        result = result.replace(source, target)
+    result = re.sub(r'\(0', '', result)
+    result = result.replace('پسرده', 'پرده')
+    # Known OCR misreadings and their corrections (regex pattern, replacement).
+    known_fixes = (
+        (r'\bاینن\b', 'این'),
+        (r'\bخلسوت\b', 'خلوت'),
+        (r'نضورد', 'نخورد'),
+        (r'سبصد', 'سیصد'),
+        (r'صایون', 'صابون'),
+    )
+    for pattern, replacement in known_fixes:
+        result = re.sub(pattern, replacement, result)
+    result = re.sub(r' +', ' ', result).strip()
+    joined_prefix = r'\1' + ZWNJ + r'\2'
+    return re.sub(r'(می) (باشد|کند|شود|روم)', joined_prefix, result)
+def mlm_correct(word: str) -> str:
+    """Replace the first character of *word* with a masked-LM prediction.
+
+    The query is "[MASK] " followed by the rest of the word (or just "[MASK]"
+    for a one-character word). The top-scoring token at the mask position
+    is accepted only if it is purely alphabetic and at least two characters
+    long; it is then prepended to the rest of the word. In every other case
+    *word* is returned unchanged.
+    """
+    if not word.isalpha():
+        return word
+    tail = word[1:]
+    masked = f"[MASK] {tail}" if tail else "[MASK]"
+    encoded = tok(masked, return_tensors="pt").to(DEVICE)
+    with torch.no_grad():
+        scores = mlm(**encoded).logits
+    mask_positions = torch.where(encoded.input_ids[0] == tok.mask_token_id)[0]
+    if mask_positions.numel() == 0:
+        return word
+    top_id = scores[0, mask_positions].softmax(dim=-1).argmax().item()
+    candidate = tok.decode(top_id).strip()
+    if not (candidate.isalpha() and len(candidate) >= 2):
+        return word
+    return candidate + tail
+def fix_multilingual(text: str) -> str:
+    """Run mlm_correct over every alphabetic token of *text*.
+
+    The text is split into word tokens and single punctuation characters;
+    alphabetic tokens go through mlm_correct, everything else is kept as is.
+    The tokens are re-joined with single spaces and the result is trimmed.
+    """
+    pieces = []
+    for piece in re.findall(r'\b\w+\b|[^\w\s]', text, flags=re.UNICODE):
+        if piece.isalpha():
+            piece = mlm_correct(piece)
+        pieces.append(piece)
+    rejoined = ' '.join(pieces)
+    return re.sub(r'\s+', ' ', rejoined).strip()
+def smart_correct(text: str) -> str:
+    """Dispatch *text* to fix_persian or fix_multilingual based on is_persian."""
+    if is_persian(text):
+        return fix_persian(text)
+    return fix_multilingual(text)
+# ==============================
+# 📄 OCR (7 صفحه، DPI=250)
+# ==============================
+def ocr_with_enhancement(input_path, max_pages=7, dpi=250, lang='fas+eng'):
+    """OCR a PDF or image file and return the corrected text.
+
+    Args:
+        input_path: Path to the input file. A path ending in ".pdf"
+            (case-insensitive) is rasterised page by page; anything else is
+            opened as a single image.
+        max_pages: Last PDF page (1-based) to process.
+        dpi: Rasterisation resolution for PDF pages.
+        lang: Tesseract language specification.
+
+    Returns:
+        The corrected text with surrounding whitespace stripped. For PDFs
+        each page is preceded by a "--- صفحه N ---" separator line.
+    """
+    if input_path.lower().endswith('.pdf'):
+        pages = convert_from_path(input_path, first_page=1, last_page=max_pages, dpi=dpi)
+        sections = []
+        for page_no, page in enumerate(pages, 1):
+            # Grayscale conversion before OCR.
+            raw = pytesseract.image_to_string(page.convert("L"), lang=lang)
+            sections.append(f"\n--- صفحه {page_no} ---\n{smart_correct(raw)}")
+        return "".join(sections).strip()
+    # The context manager closes the underlying file handle; convert()
+    # returns an independent image, so it stays usable afterwards.
+    with Image.open(input_path) as source:
+        gray = source.convert("L")
+    raw = pytesseract.image_to_string(gray, lang=lang)
+    return smart_correct(raw).strip()
+# ==============================
+# 🤖 استخراج اطلاعات کتاب با چرخش کلیدهای Groq
+# ==============================
+def extract_book_info(extracted_text):
+    """Ask the Groq chat API to extract book metadata from OCR text.
+
+    Each key in GROQ_API_KEYS is tried in order. The first response that
+    contains a parseable JSON object wins and is returned as a dict. A
+    failed request, or a response without valid JSON, moves on to the next
+    key.
+
+    Args:
+        extracted_text: OCR output to send as the user message.
+
+    Returns:
+        The parsed JSON object from the model, or a dict with a single
+        "error" entry when no key produced a usable answer.
+    """
+    # The JSON template below must itself be valid JSON (no trailing comma),
+    # otherwise the model is shown an invalid example to imitate.
+    system_prompt = """شما یک متخصص استخراج اطلاعات کتاب هستید. از متن زیر، اطلاعات زیر را استخراج کرده و **فقط یک JSON معتبر** برگردانید:
+{
+  "title": "عنوان کتاب",
+  "author": "نام نویسنده",
+  "translator": "نام مترجم",
+  "publisher": "ناشر",
+  "edition": "نوبت چاپ",
+  "subject": "موضوع کتاب"
+}
+اگر فیلدی یافت نشد، مقدار آن را null قرار دهید.
+هیچ توضیحی جز JSON ندهید."""
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": f"متن استخراج‌شده:\n\n{extracted_text}"},
+    ]
+    for index, key in enumerate(GROQ_API_KEYS, 1):
+        try:
+            client = Groq(api_key=key)
+            resp = client.chat.completions.create(
+                messages=messages,
+                model="llama-3.1-8b-instant",
+                temperature=0.1,
+                max_tokens=1000,
+                timeout=30
+            )
+            raw = resp.choices[0].message.content.strip()
+            # The model may wrap the JSON in extra text; take the outermost braces.
+            json_match = re.search(r'\{.*\}', raw, re.DOTALL)
+            if json_match:
+                return json.loads(json_match.group())
+            print(f"Groq key #{index}: response contained no JSON object")
+        except Exception as exc:
+            # Best effort: report the failure (never the key itself) and try
+            # the next key. KeyboardInterrupt/SystemExit are not intercepted.
+            print(f"Groq key #{index} failed: {type(exc).__name__}")
+            continue
+    return {"error": "همه کلیدهای Groq شکست خوردند."}
+# ==============================
+# 🎛 تابع اصلی برای Gradio
+# ==============================
+def process_file(file):
+    """Gradio callback: OCR the uploaded file and extract book metadata.
+
+    Args:
+        file: The value delivered by the file input; either a path string
+            or an object exposing the path as ``.name``.
+
+    Returns:
+        A ``(summary_text, json_path)`` tuple. ``json_path`` points to a
+        temporary JSON file holding the extracted fields plus the first 1000
+        characters of OCR text; it is None when nothing could be processed.
+    """
+    if not file:
+        return "❌ فایلی انتخاب نشده است.", None
+    # NOTE(review): depending on the Gradio version the callback may receive
+    # a plain path string instead of a temp-file wrapper - accept both.
+    input_path = file if isinstance(file, str) else file.name
+    full_text = ocr_with_enhancement(input_path, max_pages=7)
+    if not full_text:
+        return "❌ متنی استخراج نشد.", None
+    book_info = extract_book_info(full_text)
+    # Drop any ISBN before writing the result; previously this happened only
+    # after the file had been written, so it had no effect on the download.
+    book_info.pop("isbn", None)
+    book_info["extracted_text_sample"] = full_text[:1000]
+    fields = {
+        "title": "📖 عنوان",
+        "author": "✍️ نویسنده",
+        "translator": "🔤 مترجم",
+        "publisher": "🏢 ناشر",
+        "edition": "🔄 نوبت چاپ",
+        "subject": "🏷️ موضوع",
+    }
+    lines = ["📚 **اطلاعات استخراج‌شده:**\n\n"]
+    # Surface an extraction failure instead of silently listing every field
+    # as missing.
+    error = book_info.get("error")
+    if error:
+        lines.append(f"⚠️ {error}\n")
+    for k, label in fields.items():
+        v = book_info.get(k)
+        lines.append(f"{label}: {'✅ ' + str(v) if v else '❌ یافت نشد'}\n")
+    output_text = "".join(lines)
+    # delete=False: the file must outlive this function so it can be downloaded.
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
+        json.dump(book_info, f, ensure_ascii=False, indent=2)
+        json_path = f.name
+    return output_text, json_path
+# ==============================
+# 🖥 رابط Gradio با تم هکری و راست‌چین کامل
+# ==============================
+# Gradio interface: green-on-black terminal theme, fully right-to-left.
+# The first CSS rule previously read "color#32CD32" (missing colon), which is
+# an invalid declaration.
+with gr.Blocks(css="""
+    body {
+        background-color: #000 !important;
+        color: #32CD32 !important;
+        direction: rtl !important;
+        text-align: right !important;
+        font-family: 'Courier New', monospace;
+    }
+    .gr-button {
+        background: #0a0 !important;
+        color: #000 !important;
+        border: 1px solid #32CD32 !important;
+        font-family: 'Courier New', monospace;
+    }
+    .gr-input, .gr-output, .gr-file {
+        background: #111 !important;
+        color:#32CD32 !important;
+        border: 1px solid #0a0 !important;
+        direction: rtl !important;
+        text-align: right !important;
+        font-family: 'Courier New', monospace;
+    }
+    .output-markdown h1 {
+        text-align: center !important;
+        direction: ltr !important;
+        color:#32CD32 !important;
+        font-family: 'Courier New', monospace;
+    }
+    .gr-textbox textarea {
+        text-align: right !important;
+        direction: rtl !important;
+    }
+    label {
+        color:#32CD32 !important;
+        font-family: 'Courier New', monospace;
+    }
+""") as demo:
+    gr.Markdown("# استخراج اطلاعات کتاب ")
+    with gr.Row():
+        # Upload on one side, downloadable JSON result on the other.
+        file_input = gr.File(label="آپلود فایل", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
+        json_output = gr.File(label="دانلود نتیجه (JSON)")
+    text_output = gr.Textbox(
+        label="نتایج",
+        lines=15,
+        interactive=False,
+        elem_classes=["gr-textbox"]
+    )
+    btn = gr.Button("پردازش", variant="primary")
+    # process_file returns (summary text, path of the JSON file).
+    btn.click(fn=process_file, inputs=file_input, outputs=[text_output, json_output])
+# ==============================
+# ▶️ Run
+# ==============================
+demo.launch(debug=True)