Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,847 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
# 🔍 نسخه پیشرفته - رابط Gradio تاریک و ساده - رفع مشکل نمایش نتایج
|
| 3 |
+
|
| 4 |
+
# ============================================================================
|
| 5 |
+
# 🔧 Import کتابخانهها
|
| 6 |
+
# ============================================================================
|
| 7 |
+
|
| 8 |
+
import pytesseract
|
| 9 |
+
from pdf2image import convert_from_path
|
| 10 |
+
from PIL import Image, ImageEnhance, ImageFilter
|
| 11 |
+
import re
|
| 12 |
+
import json
|
| 13 |
+
from typing import Dict, Any, List, Tuple, Optional
|
| 14 |
+
import os
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import numpy as np
|
| 17 |
+
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForTokenClassification
|
| 18 |
+
from sentence_transformers import SentenceTransformer, util
|
| 19 |
+
import torch
|
| 20 |
+
import arabic_reshaper
|
| 21 |
+
from bidi.algorithm import get_display
|
| 22 |
+
import gradio as gr
|
| 23 |
+
|
| 24 |
+
# تنظیم مسیر Tesseract (در Hugging Face این مسیر ممکن است لازم نباشد)
|
| 25 |
+
# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' # فقط در Colab
|
| 26 |
+
|
| 27 |
+
# ============================================================================
|
| 28 |
+
# 🧠 بخش ۱: مدلهای هوش مصنوعی با دقت بالا
|
| 29 |
+
# ============================================================================
|
| 30 |
+
|
| 31 |
+
class HighAccuracyAIModels:
    """Holder for the Persian NER pipeline and the sentence-embedding model.

    After construction ``ner_pipeline`` and ``embedding_model`` are always
    set; each one is ``None`` when its model could not be loaded.
    """

    def __init__(self):
        self.setup_high_accuracy_models()

    def setup_high_accuracy_models(self):
        """Load both models, degrading to ``None`` on any loading failure.

        Each model is loaded in its own ``try`` block so that a failure of
        one does not prevent the other from being attempted.
        """
        print("🧠 در حال بارگذاری مدلها...")

        # --- token-classification (NER) pipeline ---------------------------
        try:
            # transformers convention: device index 0 = first GPU, -1 = CPU.
            ner_device = -1
            if torch.cuda.is_available():
                ner_device = 0
            self.ner_pipeline = pipeline(
                "token-classification",
                model="HooshvareLab/bert-fa-zwnj-base-ner",
                aggregation_strategy="max",
                device=ner_device,
            )
            print("✅ مدل NER بارگذاری شد")
        except Exception as err:
            print(f"⚠️ خطا در بارگذاری مدل NER: {err}")
            self.ner_pipeline = None

        # --- sentence embedding model --------------------------------------
        try:
            embedding_device = 'cpu'
            if torch.cuda.is_available():
                embedding_device = 'cuda'
            self.embedding_model = SentenceTransformer(
                'all-MiniLM-L6-v2',
                device=embedding_device,
            )
            print("✅ مدل Embedding بارگذاری شد")
        except Exception as err:
            print(f"⚠️ خطا در بارگذاری مدل Embedding: {err}")
            self.embedding_model = None
|
| 62 |
+
|
| 63 |
+
# ============================================================================
|
| 64 |
+
# 🔍 بخش ۲: OCR با دقت فوق العاده
|
| 65 |
+
# ============================================================================
|
| 66 |
+
|
| 67 |
+
class UltraAccuracyOCREngine:
    """Tesseract-driven OCR engine for Persian/English PDFs and images.

    PDF input is rasterised (pages 1-3 only) and every page is recognised
    once per entry of ``self.tesseract_configs``; the longest non-blank
    output of a page is kept. Any other input is treated as a single image
    and recognised once with Tesseract's default configuration.
    """

    def __init__(self, ai_models):
        """Keep a reference to the model container and set up the configs.

        Args:
            ai_models: Stored on ``self.ai_models``; no method of this class
                reads it.
        """
        self.ai_models = ai_models
        self.setup_advanced_ocr()

    def setup_advanced_ocr(self):
        """Define the Tesseract option strings tried on every PDF page."""
        # NOTE(review): the whitelist value below contains a literal space
        # before ":.,-()". Verify that the whole value reaches Tesseract as a
        # single ``-c`` argument and is not split on that space.
        self.tesseract_configs = [
            r'--oem 3 --psm 6 -c tessedit_char_whitelist=آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیءةيك۰۱۲۳۴۵۶۷۸۹٠١٢٣٤٥٦٧٨٩ :.,-()',
            r'--oem 3 --psm 4 -c preserve_interword_spaces=1'
        ]

    def extract_text_ultra_accurate(self, input_file, progress_callback=None) -> Tuple[str, Dict[str, Any]]:
        """Run OCR on *input_file* and return the text plus an analysis dict.

        Args:
            input_file: Path string (``.pdf`` is handled as a PDF, any other
                path is opened as an image) or an already-loaded image object.
            progress_callback: Optional ``callable(percent, message)``.

        Returns:
            ``(text, results)``. ``results`` always has ``'methods'`` and
            ``'quality_metrics'``; ``'combined_text'`` is added on success.
            On an unexpected error the text is ``""``.
        """
        if progress_callback:
            progress_callback(5, "شروع استخراج متن...")

        results = {
            'methods': {},
            'quality_metrics': {}
        }

        try:
            if progress_callback:
                progress_callback(20, "پردازش صفحات...")

            tesseract_results = self._advanced_tesseract_extraction(input_file, progress_callback)
            results['methods']['tesseract_advanced'] = tesseract_results

            combined_text = tesseract_results['text']
            results['combined_text'] = combined_text
            results['quality_metrics'] = self._calculate_comprehensive_quality(combined_text)

            if progress_callback:
                progress_callback(80, "استخراج متن کامل شد")

            return combined_text, results

        except Exception as e:
            print(f"❌ خطا در استخراج متن: {e}")
            return "", results

    def _advanced_tesseract_extraction(self, input_file, progress_callback=None) -> Dict[str, Any]:
        """Dispatch to PDF or single-image OCR and time the run.

        Returns:
            Dict with ``'text'``, ``'confidence'`` (the fixed value 0.8, not
            measured), ``'processing_time'`` in seconds and ``'method'``.
            On failure: empty text, zero confidence/time and method
            ``'error'``.
        """
        start_time = datetime.now()

        try:
            if isinstance(input_file, str) and input_file.lower().endswith('.pdf'):
                final_text = self._ocr_pdf(input_file, progress_callback)
            elif isinstance(input_file, str):
                # Close the file handle once OCR is done instead of leaking it.
                with Image.open(input_file) as image:
                    final_text = self._ocr_single_image(image)
            else:
                final_text = self._ocr_single_image(input_file)

            processing_time = (datetime.now() - start_time).total_seconds()

            return {
                'text': final_text,
                'confidence': 0.8,
                'processing_time': processing_time,
                'method': 'tesseract_advanced'
            }

        except Exception as e:
            print(f"⚠️ خطا در Tesseract: {e}")
            return {'text': '', 'confidence': 0, 'processing_time': 0, 'method': 'error'}

    def _ocr_pdf(self, pdf_path, progress_callback=None) -> str:
        """OCR pages 1-3 of a PDF and join the per-page texts with newlines."""
        images = convert_from_path(pdf_path, first_page=1, last_page=3, dpi=300)
        all_texts = []

        for i, image in enumerate(images):
            if progress_callback:
                progress_callback(20 + (i * 15), f"پردازش صفحه {i+1}...")

            best_page_text = self._ocr_page_best_config(self._preprocess_image(image))
            if best_page_text:
                all_texts.append(best_page_text)

        return '\n'.join(all_texts)

    def _ocr_page_best_config(self, processed_image) -> str:
        """Try every config on one page; return the longest non-blank text or ``''``."""
        page_texts = []
        for config in self.tesseract_configs:
            try:
                text = pytesseract.image_to_string(
                    processed_image,
                    lang='fas+eng',
                    config=config
                )
            except Exception as e:
                # One failing config must not abort the page, but the failure
                # is reported instead of being silently discarded.
                print(f"⚠️ خطا در Tesseract: {e}")
                continue
            if text.strip():
                page_texts.append(text)

        return max(page_texts, key=len) if page_texts else ''

    def _ocr_single_image(self, image) -> str:
        """Preprocess one image and OCR it with the default configuration."""
        processed_image = self._preprocess_image(image)
        return pytesseract.image_to_string(processed_image, lang='fas+eng')

    def _preprocess_image(self, image):
        """Convert to grayscale and raise contrast by a factor of 1.3.

        Best-effort: if a step fails, the image as it stood before the
        failing step is returned. Only ``Exception`` is caught, so
        ``KeyboardInterrupt``/``SystemExit`` propagate.
        """
        try:
            if image.mode != 'L':
                image = image.convert('L')
            return ImageEnhance.Contrast(image).enhance(1.3)
        except Exception as e:
            print(f"⚠️ خطا در پیشپردازش تصویر: {e}")
            return image

    def _calculate_comprehensive_quality(self, text: str) -> Dict[str, float]:
        """Compute simple text-quality heuristics.

        Returns:
            ``'overall_score'`` (twice the Persian-character ratio, capped at
            1.0), ``'line_count'`` (non-blank lines), ``'persian_ratio'`` and
            ``'total_chars'``.
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        persian_chars = len(re.findall(r'[آ-ی]', text))
        total_chars = len(text)

        return {
            # max(..., 1) avoids a division by zero on empty text.
            'overall_score': min(persian_chars / max(total_chars, 1) * 2, 1.0),
            'line_count': len(lines),
            'persian_ratio': persian_chars / total_chars if total_chars > 0 else 0,
            'total_chars': total_chars
        }
|
| 190 |
+
|
| 191 |
+
# ============================================================================
|
| 192 |
+
# 🤖 بخش ۳: سیستم RAG سادهسازی شده
|
| 193 |
+
# ============================================================================
|
| 194 |
+
|
| 195 |
+
class SimpleRAGSystem:
    """Minimal retrieval-augmented setup: an OpenRouter LLM plus a vector index.

    The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
    Without a key, or when the llama_index components cannot be created,
    ``llm`` and ``embed_model`` are ``None`` and no knowledge base is built.
    """

    def __init__(self):
        # The key comes from the environment only; nothing is hard-coded.
        self.api_key = os.environ.get("OPENROUTER_API_KEY", "")
        if not self.api_key:
            print("⚠️ کلید API یافت نشد. مدل RAG غیرفعال است.")
        self.setup_llm()
        self.knowledge_base = None

    def setup_llm(self):
        """Create the LLM client and embedding model, or disable both."""
        if not self.api_key:
            print("❌ کلید API تنظیم نشده است.")
            self.llm = self.embed_model = None
            return

        try:
            # Imported lazily so the app still starts without llama_index.
            from llama_index.llms.openrouter import OpenRouter
            from llama_index.embeddings.huggingface import HuggingFaceEmbedding

            llm_options = {
                'model': "meta-llama/llama-3-70b-instruct",
                'temperature': 0.1,
                'max_tokens': 2000,
                'api_key': self.api_key,
            }
            self.llm = OpenRouter(**llm_options)
            self.embed_model = HuggingFaceEmbedding(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
        except Exception as err:
            print(f"❌ خطا در تنظیم مدل RAG: {err}")
            self.llm = self.embed_model = None
        else:
            print("✅ مدل زبانی RAG تنظیم شد")

    def create_knowledge_base(self, text: str):
        """Index the first 2000 characters of *text* as a single document.

        Returns:
            ``True`` when the index was built and stored on
            ``self.knowledge_base``; ``False`` when no LLM is configured or
            building the index failed.
        """
        if not self.llm:
            return False

        try:
            from llama_index.core import Document, VectorStoreIndex

            excerpt = Document(text=text[:2000])
            index = VectorStoreIndex.from_documents(
                [excerpt],
                embed_model=self.embed_model,
            )
            self.knowledge_base = index
        except Exception as err:
            print(f"❌ خطا در ایجاد پایگاه دانش: {err}")
            return False
        return True
|
| 256 |
+
|
| 257 |
+
# ============================================================================
|
| 258 |
+
# 🚀 بخش ۴: استخراجکننده متادیتا
|
| 259 |
+
# ============================================================================
|
| 260 |
+
|
| 261 |
+
class APIEnhancedMetadataExtractor:
    """Extract book metadata through the RAG LLM, with a regex fallback."""

    # Labelled fields searched over the whole text by the regex fallback.
    # ``\d`` matches Latin, Persian and Arabic-Indic digits alike, so years
    # and ISBNs are found regardless of which digit set the OCR produced.
    _FALLBACK_PATTERNS = {
        'author': r'نویسنده\s*[:\-]\s*([^\n]+)',
        'publisher': r'ناشر\s*[:\-]\s*([^\n]+)',
        'publication_year': r'سال\s*انتشار\s*[:\-]\s*(\d{4})',
        'isbn': r'شابک\s*[:\-]\s*([\d\-–]+)',
        'edition': r'چاپ\s*(\d+)',
        'translator': r'مترجم\s*[:\-]\s*([^\n]+)'
    }

    # Words that mark a line as a labelled field rather than a title.
    _NON_TITLE_WORDS = ('نویسنده', 'مؤلف', 'ناشر', 'چاپ', 'شابک')

    def __init__(self, rag_system):
        """Store the RAG system whose ``llm`` and ``knowledge_base`` are used."""
        self.rag_system = rag_system

    def extract_with_api_power(self, text: str, progress_callback=None) -> Dict[str, Any]:
        """Extract metadata from *text*, preferring the LLM when available.

        Args:
            text: OCR text; only the first 2000 characters go into the prompt.
            progress_callback: Optional ``callable(percent, message)``.

        Returns:
            A flat dict of metadata. Without an LLM, or when the API path
            raises, the result comes from the regex fallback.
        """
        if progress_callback:
            progress_callback(85, "استخراج اطلاعات با هوش مصنوعی...")

        if not self.rag_system.llm:
            return self._extract_without_api(text)

        try:
            chunk = text[:2000]
            prompt = f"""
از متن زیر اطلاعات کتاب را استخراج کن:
{chunk}

اطلاعات مورد نیاز:
- عنوان کتاب
- نویسنده/مؤلف
- مترجم (اگر وجود دارد)
- ناشر
- سال انتشار
- شابک (ISBN)
- نوبت چاپ

پاسخ را به صورت JSON برگردان.
"""

            metadata = self._call_api_for_extraction(prompt)
            return metadata

        except Exception as e:
            print(f"❌ خطا در استخراج با API: {e}")
            return self._extract_without_api(text)

    def _extract_without_api(self, text: str) -> Dict[str, Any]:
        """Regex-only extraction used when the LLM is unavailable.

        The title is the first of the first ten non-blank lines that is
        10-150 characters long, has at least two Persian letters and contains
        none of the field-label words. The other fields are the first match
        of their labelled pattern anywhere in *text*.
        """
        metadata = {}

        lines = [line.strip() for line in text.split('\n') if line.strip()]
        for line in lines[:10]:
            if (10 <= len(line) <= 150 and
                    len(re.findall(r'[آ-ی]', line)) >= 2 and
                    not any(word in line for word in self._NON_TITLE_WORDS)):
                metadata['title'] = line
                break

        for field, pattern in self._FALLBACK_PATTERNS.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                metadata[field] = match.group(1).strip()

        return metadata

    def _call_api_for_extraction(self, prompt: str) -> Dict[str, Any]:
        """Query the knowledge base with *prompt* and parse a JSON object.

        Returns:
            The first flat ``{...}`` object found in the response, decoded
            with ``json.loads``; ``{}`` when there is no knowledge base, no
            JSON object in the response, or any error occurs.
        """
        knowledge_base = getattr(self.rag_system, 'knowledge_base', None)
        if knowledge_base is None:
            # Explicit guard instead of relying on a swallowed AttributeError.
            print("⚠️ خطا در فراخوانی API: پایگاه دانش ایجاد نشده است")
            return {}

        try:
            query_engine = knowledge_base.as_query_engine(
                llm=self.rag_system.llm,
                similarity_top_k=2
            )

            response = query_engine.query(prompt)
            response_text = str(response).strip()

            # Only a flat object (no nested braces) is recognised.
            json_match = re.search(r'\{[^{}]*\}', response_text)
            if json_match:
                return json.loads(json_match.group())

            return {}

        except Exception as e:
            print(f"⚠️ خطا در فراخوانی API: {e}")
            return {}
|
| 354 |
+
|
| 355 |
+
# ============================================================================
|
| 356 |
+
# 📝 بخش ۵: استخراجکننده ساده
|
| 357 |
+
# ============================================================================
|
| 358 |
+
|
| 359 |
+
class SimpleBookExtractor:
    """Regex-only extraction of bibliographic fields from OCR text."""

    # Persian-calendar years 1370-1399 and 1400-1404, in Persian digits.
    _YEAR_PATTERN = r'۱۳[۷-۹][۰-۹]|۱۴۰[۰-۴]'

    # Labelled fields probed on every line, in this order. ``\d`` matches
    # Latin as well as Persian/Arabic-Indic digits, so an ISBN printed in
    # Latin digits is recognised too.
    _LINE_FIELD_PATTERNS = (
        ('author', r'نویسنده\s*[:\-]\s*(.+)'),
        ('publisher', r'ناشر\s*[:\-]\s*(.+)'),
        ('translator', r'مترجم\s*[:\-]\s*(.+)'),
        ('isbn', r'شابک\s*[:\-]\s*([\d\-–]+)'),
        ('edition', r'چاپ\s*(\d+)'),
    )

    # Fields searched once over the whole text by extract_additional_info.
    _ADDITIONAL_PATTERNS = {
        'publisher': r'ناشر\s*[:\-]\s*(.+)',
        'isbn': r'شابک\s*[:\-]\s*([\d\-–]+)',
        'translator': r'مترجم\s*[:\-]\s*(.+)',
        'price': r'قیمت\s*[:\-]\s*([۰-۹,]+)',
        'subject': r'موضوع\s*[:\-]\s*(.+)'
    }

    def extract_basic_info(self, text: str) -> Dict[str, Any]:
        """Extract title, year, author, publisher, translator, ISBN and edition.

        The title is the first of the first ten lines that is 10-150
        characters long (after stripping) and has at least two Persian
        letters. For every other field the first matching line wins.

        Returns:
            Dict containing only the fields that were found.
        """
        results = {}
        lines = text.split('\n')

        for line in lines[:10]:
            line = line.strip()
            if (10 <= len(line) <= 150 and
                    len(re.findall(r'[آ-ی]', line)) >= 2):
                results['title'] = line
                break

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Guard on the key that is actually stored, so the first year
            # found is kept (first-match-wins, like every other field).
            if 'publication_year' not in results:
                year_match = re.search(self._YEAR_PATTERN, line)
                if year_match:
                    results['publication_year'] = year_match.group()

            for field, pattern in self._LINE_FIELD_PATTERNS:
                if field in results:
                    continue
                match = re.search(pattern, line, re.IGNORECASE)
                if match:
                    results[field] = match.group(1).strip()

        return results

    def extract_additional_info(self, text: str) -> Dict[str, Any]:
        """Extract publisher, ISBN, translator, price and subject.

        Each field is the first match of its labelled pattern anywhere in
        *text*; fields without a match are omitted.
        """
        results = {}

        for field, pattern in self._ADDITIONAL_PATTERNS.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                results[field] = match.group(1).strip()

        return results
|
| 430 |
+
|
| 431 |
+
# ============================================================================
|
| 432 |
+
# 🔄 بخش ۶: پردازشگر اصلی
|
| 433 |
+
# ============================================================================
|
| 434 |
+
|
| 435 |
+
class UltraAccuracyBookProcessor:
    """Top-level pipeline: OCR, optional RAG extraction, regex backup, result dict."""

    def __init__(self):
        self.ai_models = HighAccuracyAIModels()
        self.ocr_engine = UltraAccuracyOCREngine(self.ai_models)
        self.rag_system = SimpleRAGSystem()
        self.api_extractor = APIEnhancedMetadataExtractor(self.rag_system)
        self.simple_extractor = SimpleBookExtractor()

    def process_book_ultra_accurate(self, input_file, progress_callback=None) -> Dict[str, Any]:
        """Process one book file end to end.

        Args:
            input_file: Passed unchanged to the OCR engine.
            progress_callback: Optional ``callable(percent, message)``.

        Returns:
            The dict built by ``_create_final_results``, or the one built by
            ``_create_error_result`` when fewer than 10 non-whitespace-trimmed
            characters were recognised.
        """
        def notify(percent, message):
            # Single place for the "callback may be absent" check.
            if progress_callback:
                progress_callback(percent, message)

        notify(0, "شروع پردازش...")
        started_at = datetime.now()

        notify(10, "استخراج متن از فایل...")
        raw_text, ocr_results = self.ocr_engine.extract_text_ultra_accurate(input_file, progress_callback)

        has_enough_text = bool(raw_text) and len(raw_text.strip()) >= 10
        if not has_enough_text:
            return self._create_error_result("متن کافی استخراج نشد", ocr_results, started_at)

        notify(70, "راهاندازی هوش مصنوعی...")
        rag_ready = self.rag_system.create_knowledge_base(raw_text)

        if not rag_ready:
            # No knowledge base: rely on the regex extractors alone.
            notify(75, "استخراج اطلاعات...")
            final_metadata = self._extract_backup_metadata(raw_text)
        else:
            api_metadata = self.api_extractor.extract_with_api_power(raw_text, progress_callback)
            backup_metadata = self._extract_backup_metadata(raw_text)
            final_metadata = self._final_fusion(api_metadata, backup_metadata, raw_text)

        notify(95, "ذخیرهسازی نتایج...")
        results = self._create_final_results(
            final_metadata, raw_text, ocr_results, rag_ready, started_at, input_file
        )

        notify(100, "پردازش کامل شد!")
        return results

    def _extract_backup_metadata(self, text: str) -> Dict[str, Any]:
        """Run both regex extractors and fill basic-info gaps from the additional info."""
        basic = self.simple_extractor.extract_basic_info(text)
        additional = self.simple_extractor.extract_additional_info(text)

        # Basic values win; additional values only fill keys that are missing.
        merged = dict(basic)
        merged.update(
            (key, value)
            for key, value in additional.items()
            if value and key not in basic
        )

        return {'basic_info': merged, 'additional_info': additional}

    def _final_fusion(self, api_metadata: Dict, backup_metadata: Dict, text: str) -> Dict[str, Any]:
        """Overlay usable API values on top of the backup basic info.

        API values that are falsy or equal to the "not found" marker are
        ignored. *text* is accepted but not used.
        """
        fused = dict(backup_metadata.get('basic_info', {}))

        for field, value in (api_metadata or {}).items():
            if not value or value in ('یافت نشد', ''):
                continue
            fused[field] = value

        return {
            'basic_info': fused,
            'additional_info': backup_metadata.get('additional_info', {}),
            'api_enhanced': bool(api_metadata) and len(api_metadata) > 0,
        }

    def _create_final_results(self, metadata: Dict, text: str, ocr_results: Dict,
                              rag_ready: bool, start_time: datetime, input_file) -> Dict[str, Any]:
        """Assemble the success result dict, including timing and file info."""
        given_as_path = isinstance(input_file, str)

        return {
            'basic_info': metadata.get('basic_info', {}),
            'additional_info': metadata.get('additional_info', {}),
            'processing_time': (datetime.now() - start_time).total_seconds(),
            'ocr_analysis': ocr_results,
            'rag_available': rag_ready,
            'api_enhanced': metadata.get('api_enhanced', False),
            'total_text_length': len(text),
            'file_info': {
                'file_name': os.path.basename(input_file) if given_as_path else 'uploaded_file',
                'file_type': 'PDF' if given_as_path and input_file.lower().endswith('.pdf') else 'Image',
            },
        }

    def _create_error_result(self, error: str, ocr_results: Dict, start_time: datetime):
        """Assemble the failure result dict (identified by its ``'error'`` key)."""
        elapsed = (datetime.now() - start_time).total_seconds()
        return {
            'error': error,
            'processing_time': elapsed,
            'ocr_analysis': ocr_results,
        }
|
| 544 |
+
|
| 545 |
+
# ============================================================================
|
| 546 |
+
# 🎨 بخش ۷: رابط Gradio تاریک و ساده - نمایش کامل نتایج
|
| 547 |
+
# ============================================================================
|
| 548 |
+
|
| 549 |
+
class GradioInterface:
    """Glue between the Gradio UI callbacks and the book processor."""

    # (result key, label shown in the report) for the main section, in order.
    _MAIN_FIELDS = (
        ('title', 'عنوان کتاب'),
        ('author', 'نویسنده/مؤلف'),
        ('translator', 'مترجم'),
        ('publisher', 'ناشر'),
        ('publication_year', 'سال انتشار'),
        ('isbn', 'شابک (ISBN)'),
        ('edition', 'نوبت چاپ')
    )

    def __init__(self):
        # The processor is created lazily on the first processed file.
        self.processor = None
        self.current_progress = 0
        self.current_status = "آماده"

    def initialize_processor(self):
        """Create the processor on first use; later calls are no-ops."""
        if self.processor is None:
            self.processor = UltraAccuracyBookProcessor()

    def update_progress(self, progress, status):
        """Record the latest progress percentage and status message."""
        self.current_progress = progress
        self.current_status = status

    def process_file(self, file):
        """Process an uploaded file and build the values for the UI outputs.

        Args:
            file: ``None``, a filesystem path string, or an object whose
                ``name`` attribute is the path.

        Returns:
            ``(report, save_message, json_text, progress, status)``. On any
            failure the first item carries the error message, the two text
            items are empty and progress is 0.
        """
        if file is None:
            return "لطفاً یک فایل آپلود کنید", "", "", 0, "آماده"

        try:
            self.initialize_processor()
            # Accept a plain path as well as an upload wrapper with ``.name``.
            file_path = file if isinstance(file, str) else file.name

            results = self.processor.process_book_ultra_accurate(
                file_path,
                progress_callback=self.update_progress
            )

            if 'error' in results:
                return f"خطا: {results['error']}", "", "", 0, "خطا"

            report = self._generate_complete_report(results)
            download_info = self._save_results(results, file_path)
            json_output = self._generate_json_output(results)

            return report, download_info, json_output, 100, "پردازش کامل شد"

        except Exception as e:
            return f"خطا در پردازش: {str(e)}", "", "", 0, "خطا"

    def _generate_complete_report(self, results: Dict[str, Any]) -> str:
        """Render *results* as a plain-text report.

        Sections: main fields (missing ones shown as "not found"), additional
        info (only when present), technical details and a timestamp taken at
        render time.
        """
        basic_info = results.get('basic_info', {})
        additional_info = results.get('additional_info', {})

        parts = [
            "📚 نتایج کامل استخراج اطلاعات کتاب\n",
            "=" * 50 + "\n\n",
            "📖 اطلاعات اصلی:\n",
            "-" * 20 + "\n",
        ]

        for field, title in self._MAIN_FIELDS:
            value = basic_info.get(field, "یافت نشد")
            parts.append(f"• {title}: {value}\n")

        if additional_info:
            parts.append("\n📋 اطلاعات تکمیلی:\n")
            parts.append("-" * 20 + "\n")
            for key, value in additional_info.items():
                if value and value != "یافت نشد":
                    display_key = self._translate_key(key)
                    parts.append(f"• {display_key}: {value}\n")

        ocr_score = results.get('ocr_analysis', {}).get('quality_metrics', {}).get('overall_score', 0)
        parts.append("\n🔧 اطلاعات فنی:\n")
        parts.append("-" * 15 + "\n")
        parts.append(f"• زمان پردازش: {results.get('processing_time', 0):.1f} ثانیه\n")
        parts.append(f"• طول متن استخراج شده: {results.get('total_text_length', 0)} کاراکتر\n")
        parts.append(f"• کیفیت OCR: {ocr_score:.1%}\n")
        parts.append(f"• سیستم هوش مصنوعی: {'فعال' if results.get('api_enhanced') else 'غیرفعال'}\n")

        file_info = results.get('file_info', {})
        parts.append(f"• نام فایل: {file_info.get('file_name', 'نامشخص')}\n")
        parts.append(f"• نوع فایل: {file_info.get('file_type', 'نامشخص')}\n")

        parts.append(f"\n🕒 تاریخ پردازش: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        return "".join(parts)

    def _translate_key(self, key: str) -> str:
        """Return the Persian label for a result key, or the key itself if unknown."""
        translations = {
            'publisher': 'ناشر',
            'isbn': 'شابک',
            'price': 'قیمت',
            'subject': 'موضوع',
            'translator': 'مترجم',
            'author': 'نویسنده',
            'publication_year': 'سال انتشار',
            'edition': 'نوبت چاپ',
            'title': 'عنوان'
        }
        return translations.get(key, key)

    def _save_results(self, results: Dict[str, Any], file_path: str) -> str:
        """Write the results as JSON and as a text report.

        Both files are written relative to the current working directory and
        named after the input file plus a timestamp.

        Returns:
            A message listing the written files, or a warning message when
            writing failed.
        """
        try:
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            json_filename = f"{base_name}_نتایج_{timestamp}.json"
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            txt_filename = f"{base_name}_گزارش_{timestamp}.txt"
            report_content = self._generate_complete_report(results)
            with open(txt_filename, 'w', encoding='utf-8') as f:
                f.write(report_content)

            return f"✅ فایلها با موفقیت ذخیره شدند:\n📄 {json_filename}\n📝 {txt_filename}"

        except Exception as e:
            return f"⚠️ خطا در ذخیرهسازی: {str(e)}"

    def _generate_json_output(self, results: Dict[str, Any]) -> str:
        """Serialise *results* as indented JSON, keeping non-ASCII text readable."""
        return json.dumps(results, ensure_ascii=False, indent=2)
|
| 685 |
+
|
| 686 |
+
# ============================================================================
|
| 687 |
+
# 🚀 راهاندازی رابط Gradio تاریک و ساده
|
| 688 |
+
# ============================================================================
|
| 689 |
+
|
| 690 |
+
def create_dark_simple_interface():
    """Build the dark, minimal Gradio UI for book-information extraction.

    Returns the assembled ``gr.Blocks`` object; launching it is left to
    the caller.
    """
    backend = GradioInterface()

    # CSS for the dark look: the Gradio container plus the custom
    # ``dark-*`` classes attached to components via ``elem_classes``.
    stylesheet = """
    .gradio-container {
        background: #000000 !important;
        color: #ffffff !important;
        font-family: Arial, sans-serif !important;
    }
    .container {
        background: #000000 !important;
    }
    .panel {
        background: #1a1a1a !important;
        border: 1px solid #333 !important;
        border-radius: 5px !important;
        padding: 10px !important;
        margin: 5px 0 !important;
    }
    .progress-text {
        color: #00ff00 !important;
        font-weight: bold;
    }
    .dark-button {
        background: #333 !important;
        color: white !important;
        border: 1px solid #555 !important;
    }
    .dark-button:hover {
        background: #444 !important;
    }
    .dark-input {
        background: #1a1a1a !important;
        color: white !important;
        border: 1px solid #333 !important;
    }
    .dark-slider {
        background: #333 !important;
    }
    .success-text {
        color: #00ff00 !important;
    }
    .error-text {
        color: #ff4444 !important;
    }
    """

    # Static HTML fragments shown above and below the working area.
    page_header = """
        <div style='text-align: center; color: white;'>
            <h1>📚 استخراج اطلاعات کتاب</h1>
            <p>آپلود فایل کتاب (PDF یا تصویر) برای استخراج خودکار اطلاعات</p>
        </div>
        """
    page_footer = """
        <div style='text-align: center; color: #888; margin-top: 20px;'>
            <p>سیستم استخراج خودکار اطلاعات کتاب - نسخه ساده و تاریک</p>
            <p>📖 تمام اطلاعات کتاب به صورت کامل نمایش داده میشود</p>
        </div>
        """

    with gr.Blocks(title="استخراج اطلاعات کتاب", css=stylesheet) as demo:
        gr.Markdown(page_header)

        with gr.Row():
            # Left column: upload, trigger button and progress widgets.
            with gr.Column(scale=1):
                gr.Markdown("### 📁 آپلود فایل")
                upload_box = gr.File(
                    label="",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"],
                    type="filepath",
                    elem_classes="dark-input",
                )

                run_button = gr.Button(
                    "🚀 شروع پردازش",
                    variant="primary",
                    elem_classes="dark-button",
                    size="lg",
                )

                gr.Markdown("### 📊 پیشرفت")
                # A read-only slider is used as the progress indicator.
                progress_slider = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=0,
                    label="",
                    interactive=False,
                    elem_classes="dark-slider",
                )
                status_box = gr.Textbox(
                    label="وضعیت",
                    value="آماده",
                    interactive=False,
                    elem_classes="dark-input",
                )

            # Right column: full report, save status and raw JSON.
            with gr.Column(scale=2):
                gr.Markdown("### 📄 نتایج کامل")
                report_box = gr.Textbox(
                    label="",
                    lines=12,
                    show_copy_button=True,
                    elem_classes="dark-input",
                )

                gr.Markdown("### 💾 ذخیرهسازی")
                save_status_box = gr.Textbox(
                    label="",
                    lines=3,
                    interactive=False,
                    elem_classes="dark-input",
                )

                gr.Markdown("### 🔧 خروجی فنی (JSON)")
                json_box = gr.Textbox(
                    label="",
                    lines=8,
                    show_copy_button=True,
                    elem_classes="dark-input",
                )

        # Event wiring: the handler's outputs are mapped, in this order,
        # to report, save status, JSON, progress value and status text.
        run_button.click(
            fn=backend.process_file,
            inputs=[upload_box],
            outputs=[
                report_box,
                save_status_box,
                json_box,
                progress_slider,
                status_box,
            ],
        )

        gr.Markdown(page_footer)

    return demo
|
| 836 |
+
|
| 837 |
+
# این بخش را تغییر میدهیم تا با Hugging Face سازگار شود
|
| 838 |
+
def main():
    """Build the interface and serve it for a Hugging Face Space.

    Binds to all interfaces; the port is taken from the ``PORT``
    environment variable and falls back to 7860.
    """
    app = create_dark_simple_interface()
    listen_port = int(os.environ.get("PORT", 7860))  # Hugging Face port
    app.launch(
        server_name="0.0.0.0",  # for Hugging Face
        server_port=listen_port,
        share=False,  # share is not needed on Hugging Face
    )


if __name__ == "__main__":
    main()
|