Spaces:

F-allahmoradi
/

OCR1

Sleeping

App Files Files Community

F-allahmoradi commited on Oct 23, 2025

Commit

4e093a1

verified ·

1 Parent(s): 91848f2

Upload app.py

Browse files

Files changed (1) hide show

app.py +919 -0

app.py ADDED Viewed

	@@ -0,0 +1,919 @@

+# app.py
+# ============================================================================
+# 📦 ایمپورت کتابخانه‌ها
+# ============================================================================
+import os
+import requests
+import json
+import re
+import time
+import threading
+from PIL import Image, ImageEnhance
+import pytesseract
+from pdf2image import convert_from_path
+import gradio as gr
+from groq import Groq
+import numpy as np
+import cv2
+from collections import Counter
+import easyocr
+from persian_tools import digits
+from rapidfuzz import fuzz
+from datetime import datetime
+import concurrent.futures
+# Tell pytesseract to use the Tesseract executable at /usr/bin/tesseract.
+pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
+print("✅ کتابخانه‌ها بارگذاری شدند")
+# ============================================================================
+# 🔍 موتور OCR حرفه‌ای با DPI 200
+# ============================================================================
+class ProfessionalOCREngine:
+    """موتور OCR حرفه‌ای با کیفیت بالا"""
+    def __init__(self):
+        self.setup_professional_ocr()
+        self.setup_easyocr()
+    def setup_professional_ocr(self):
+        """تنظیمات حرفه‌ای OCR"""
+        self.tesseract_configs = [
+            '--oem 3 --psm 6 -c tessedit_char_whitelist=آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیءةيك012345678۹۰۱۲۳۴۵۶۷۸۹ :.,-()',
+            '--oem 3 --psm 4 -c preserve_interword_spaces=1',
+            '--oem 3 --psm 8 -c tessedit_char_blacklist=|\\/><[]{}',
+        ]
+    def setup_easyocr(self):
+        """راه‌اندازی EasyOCR حرفه‌ای"""
+        try:
+            self.easy_reader = easyocr.Reader(['fa', 'en'], gpu=False)
+            self.easyocr_available = True
+            print("✅ EasyOCR حرفه‌ای راه‌اندازی شد")
+        except Exception as e:
+            print(f"⚠️ EasyOCR راه‌اندازی نشد: {e}")
+            self.easyocr_available = False
+    def extract_text_professional(self, input_file, num_pages=5):
+        """استخراج متن با روش حرفه‌ای - ۵ صفحه با DPI 200"""
+        try:
+            print(f"🔍 شروع استخراج متن حرفه‌ای از {num_pages} صفحه با DPI 200...")
+            if isinstance(input_file, str) and input_file.lower().endswith('.pdf'):
+                # استفاده از DPI 200 برای تعادل سرعت و کیفیت
+                images = convert_from_path(input_file, first_page=1, last_page=num_pages, dpi=200)
+                all_texts = []
+                for i, image in enumerate(images):
+                    print(f"📄 پردازش صفحه {i+1} از {num_pages} با DPI 200...")
+                    # پردازش حرفه‌ای با کیفیت بالا
+                    tesseract_text = self._extract_with_pro_tesseract(image)
+                    easyocr_text = self._extract_with_pro_easyocr(image) if self.easyocr_available else ""
+                    # ترکیب پیشرفته
+                    combined_text = self._advanced_combination([tesseract_text, easyocr_text])
+                    if combined_text.strip():
+                        page_result = f"""
+{'='*40}
+📄 صفحه {i+1}:
+{'='*40}
+{combined_text}"""
+                        all_texts.append(page_result)
+                        print(f"✅ صفحه {i+1} پردازش شد: {len(combined_text)} کاراکتر")
+                result = '\n'.join(all_texts)
+                print(f"✅ پردازش {len(images)} صفحه با DPI 200 کامل شد: {len(result)} کاراکتر")
+                return result
+            else:
+                # فایل تصویری با کیفیت بالا
+                image = Image.open(input_file)
+                tesseract_text = self._extract_with_pro_tesseract(image)
+                easyocr_text = self._extract_with_pro_easyocr(image) if self.easyocr_available else ""
+                combined_text = self._advanced_combination([tesseract_text, easyocr_text])
+                result = f"""
+{'='*40}
+📄 صفحه 1:
+{'='*40}
+{combined_text}"""
+                print(f"✅ پردازش تصویر با کیفیت بالا کامل شد: {len(combined_text)} کاراکتر")
+                return result
+        except Exception as e:
+            return f"❌ خطا در پردازش صفحات: {str(e)}"
+    def _extract_with_pro_tesseract(self, image):
+        """استخراج حرفه‌ای با Tesseract"""
+        try:
+            # پیش‌پردازش حرفه‌ای برای DPI 200
+            processed_images = [
+                self._preprocess_high_quality(image),
+                self._preprocess_enhanced_contrast(image),
+                self._preprocess_denoise_advanced(image)
+            ]
+            all_texts = []
+            for processed_img in processed_images:
+                for config in self.tesseract_configs:
+                    try:
+                        text = pytesseract.image_to_string(processed_img, lang='fas+eng', config=config)
+                        if text.strip():
+                            all_texts.append(text)
+                    except:
+                        continue
+            return self._select_best_quality_text(all_texts) if all_texts else ""
+        except Exception as e:
+            print(f"⚠️ خطا در Tesseract حرفه‌ای: {e}")
+            return ""
+    def _extract_with_pro_easyocr(self, image):
+        """استخراج حرفه‌ای با EasyOCR"""
+        try:
+            image_np = np.array(image)
+            # تنظیمات پیشرفته برای کیفیت بالا
+            results = self.easy_reader.readtext(image_np, paragraph=True, text_threshold=0.3, batch_size=1)
+            extracted_texts = []
+            for result in results:
+                if len(result) >= 2:
+                    text = result[1]
+                    confidence = result[2] if len(result) > 2 else 0.5
+                    if confidence > 0.2:  # آستانه پایین‌تر برای دریافت متن بیشتر
+                        extracted_texts.append(text)
+            return " ".join(extracted_texts)
+        except Exception as e:
+            print(f"⚠️ خطا در EasyOCR حرفه‌ای: {e}")
+            return ""
+    def _preprocess_high_quality(self, image):
+        """پیش‌پردازش برای کیفیت بالا"""
+        try:
+            if image.mode != 'L':
+                image = image.convert('L')
+            # افزایش کنتراست برای DPI 200
+            enhancer = ImageEnhance.Contrast(image)
+            image = enhancer.enhance(2.5)
+            # افزایش وضوح
+            enhancer = ImageEnhance.Sharpness(image)
+            image = enhancer.enhance(2.0)
+            return image
+        except:
+            return image
+    def _preprocess_enhanced_contrast(self, image):
+        """پیش‌پردازش با کنتراست پیشرفته"""
+        try:
+            if image.mode != 'L':
+                image = image.convert('L')
+            # استفاده از CLAHE برای کنتراست پیشرفته
+            img_np = np.array(image)
+            clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
+            img_contrast = clahe.apply(img_np)
+            return Image.fromarray(img_contrast)
+        except:
+            return image
+    def _preprocess_denoise_advanced(self, image):
+        """پیش‌پردازش حذف نویز پیشرفته"""
+        try:
+            if image.mode != 'L':
+                image = image.convert('L')
+            img_np = np.array(image)
+            # حذف نویز با فیلترهای پیشرفته
+            img_denoised = cv2.medianBlur(img_np, 3)
+            img_denoised = cv2.GaussianBlur(img_denoised, (1, 1), 0)
+            return Image.fromarray(img_denoised)
+        except:
+            return image
+    def _select_best_quality_text(self, text_list):
+        """انتخاب متن با بالاترین کیفیت"""
+        if not text_list:
+            return ""
+        scored_texts = []
+        for text in text_list:
+            score = self._calculate_advanced_quality(text)
+            scored_texts.append((text, score))
+        return max(scored_texts, key=lambda x: x[1])[0]
+    def _calculate_advanced_quality(self, text):
+        """محاسبه کیفیت پیشرفته متن"""
+        if not text.strip():
+            return 0
+        score = 0
+        # امتیاز بر اساس طول متن
+        if 50 <= len(text) <= 5000:
+            score += 3
+        # امتیاز بر اساس کاراکترهای فارسی
+        persian_chars = len(re.findall(r'[آ-ی]', text))
+        persian_ratio = persian_chars / len(text) if len(text) > 0 else 0
+        if persian_ratio > 0.3:
+            score += persian_ratio * 4
+        # امتیاز بر اساس ساختار متن
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        if len(lines) > 2:
+            valid_lines = sum(1 for line in lines if 5 <= len(line) <= 200)
+            score += (valid_lines / len(lines)) * 3
+        # امتیاز بر اساس کلمات کلیدی کتاب
+        book_keywords = ['عنوان', 'نویسنده', 'مؤلف', 'ناشر', 'چاپ', 'شابک', 'قیمت', 'تیراژ', 'کتاب', 'انتشارات', 'مترجم', 'فهرست', 'مقدمه']
+        keyword_count = sum(1 for keyword in book_keywords if keyword in text)
+        score += keyword_count * 0.5
+        return score
+    def _advanced_combination(self, texts):
+        """ترکیب پیشرفته نتایج"""
+        valid_texts = [t for t in texts if t and t.strip()]
+        if not valid_texts:
+            return ""
+        if len(valid_texts) == 1:
+            return valid_texts[0]
+        # ترکیب هوشمند بر اساس کیفیت
+        best_text = max(valid_texts, key=lambda x: self._calculate_advanced_quality(x))
+        return best_text
+# ============================================================================
+# 🧠 استخراج‌کننده متادیتای حرفه‌ای
+# ============================================================================
+class ProfessionalMetadataExtractor:
+    """استخراج‌کننده متادیتای حرفه‌ای"""
+    def __init__(self):
+        self.setup_professional_patterns()
+    def setup_professional_patterns(self):
+        """الگوهای حرفه‌ای برای استخراج اطلاعات"""
+        self.patterns = {
+            'title': [
+                r'عنوان\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'نام\s*کتاب\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'کتاب\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'^(?!.*(نویسنده|مؤلف|ناشر|چاپ|شابک))(.{10,120}?)(?=\n|$)',
+            ],
+            'author': [
+                r'نویسنده\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'مؤلف\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'پدیدآور\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'تألیف\s*[:\-]\s*(.+?)(?=\n|$)',
+            ],
+            'translator': [
+                r'مترجم\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'ترجمه\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'برگردان\s*[:\-]\s*(.+?)(?=\n|$)',
+            ],
+            'publisher': [
+                r'ناشر\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'انتشارات\s*[:\-]\s*(.+?)(?=\n|$)',
+                r'چاپ\s*[:\-]\s*(.+?)(?=\n|$)',
+            ],
+            'publication_year': [
+                r'سال\s*انتشار\s*[:\-]\s*(\d{4})',
+                r'تاریخ\s*چاپ\s*[:\-]\s*(\d{4})',
+                r'چاپ\s*[:\-].*?(\d{4})',
+                r'(\d{4})\s*,\s*تیراژ',
+                r'۱۳[۴-۹]\d',
+            ],
+            'edition': [
+                r'نوبت\s*چاپ\s*[:\-]\s*(\S+)',
+                r'چاپ\s*[:\-]\s*(\S+)',
+                r'چاپ\s*(اول|دوم|سوم|چهارم|پنجم|ششم|هفتم|هشتم|نهم|دهم)',
+            ],
+            'isbn': [
+                r'شابک\s*[:\-]\s*([\d\-]+)',
+                r'ISBN\s*[:\-]\s*([\d\-]+)',
+                r'[\d\-]{10,17}',
+            ]
+        }
+    def extract_metadata_professional(self, extracted_text):
+        """استخراج متادیتا با روش حرفه‌ای"""
+        try:
+            if not extracted_text or len(extracted_text.strip()) < 100:
+                return self._get_empty_result("متن کافی برای تحلیل یافت نشد")
+            # استخراج پیشرفته
+            pattern_results = self._extract_with_advanced_patterns(extracted_text)
+            intelligent_results = self._intelligent_analysis(extracted_text)
+            combined_results = self._professional_combination(pattern_results, intelligent_results)
+            enhanced_results = self._enhance_with_context(combined_results, extracted_text)
+            validated_results = self._professional_validation(enhanced_results)
+            confidence = self._calculate_professional_confidence(validated_results)
+            return {
+                'success': True,
+                'metadata': validated_results,
+                'confidence': confidence,
+                'extraction_method': 'professional'
+            }
+        except Exception as e:
+            return self._get_empty_result(str(e))
+    def _extract_with_advanced_patterns(self, text):
+        """استخراج با الگوهای پیشرفته"""
+        results = {}
+        for field, patterns in self.patterns.items():
+            best_match = None
+            for pattern in patterns:
+                try:
+                    matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
+                    for match in matches:
+                        if isinstance(match, tuple):
+                            match = match[0] if match[0] else (match[1] if len(match) > 1 else "")
+                        if match:
+                            clean_value = self._professional_clean(field, str(match))
+                            if self._is_professionally_valid(field, clean_value):
+                                if not best_match or len(clean_value) > len(best_match):
+                                    best_match = clean_value
+                except:
+                    continue
+            if best_match:
+                results[field] = best_match
+        return results
+    def _intelligent_analysis(self, text):
+        """تحلیل هوشمند"""
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        results = {}
+        # تحلیل عمیق خطوط
+        for i, line in enumerate(lines):
+            if i < 10:  # فقط 10 خط اول برای کارایی
+                if not results.get('title') and self._is_professional_title(line, i):
+                    results['title'] = self._professional_title_clean(line)
+                if not results.get('author') and self._contains_author_indicator(line):
+                    author = self._extract_professional_author(line)
+                    if author:
+                        results['author'] = author
+                if not results.get('publisher') and self._contains_publisher_indicator(line):
+                    publisher = self._extract_professional_publisher(line)
+                    if publisher:
+                        results['publisher'] = publisher
+        return results
+    def _is_professional_title(self, line, line_index):
+        """بررسی حرفه‌ای عنوان"""
+        if len(line) < 8 or len(line) > 150:
+            return False
+        # خطوط اول احتمال بیشتری برای عنوان دارند
+        title_probability = max(0.8 - (line_index * 0.1), 0.3)
+        exclude_patterns = [
+            r'نویسنده', r'مؤلف', r'ناشر', r'چاپ', r'شابک',
+            r'قیمت', r'تیراژ', r'صفحه', r'فهرست', r'مقدمه'
+        ]
+        if any(re.search(pattern, line) for pattern in exclude_patterns):
+            return False
+        persian_ratio = len(re.findall(r'[آ-ی]', line)) / len(line) if len(line) > 0 else 0
+        if persian_ratio < 0.4:
+            return False
+        return True
+    def _professional_combination(self, pattern_results, intelligent_results):
+        """ترکیب حرفه‌ای نتایج"""
+        combined = pattern_results.copy()
+        # اولویت با نتایج الگوها، سپس نتایج هوشمند
+        for field, value in intelligent_results.items():
+            if value and not combined.get(field):
+                combined[field] = value
+        return combined
+    def _professional_clean(self, field, value):
+        """پاکسازی حرفه‌ای"""
+        if not value:
+            return value
+        value = re.sub(r'[ـ\r\x200c\x200d]', '', value)
+        value = re.sub(r'\s+', ' ', value).strip()
+        # پاکسازی ویژه هر فیلد
+        cleaners = {
+            'title': lambda x: re.sub(r'^[:\-\s]*', '', x),
+            'author': lambda x: re.sub(r'^(نویسنده|مؤلف|پدیدآور)[:\-\s]*', '', x),
+            'publisher': lambda x: re.sub(r'^(ناشر|انتشارات)[:\-\s]*', '', x),
+            'publication_year': lambda x: re.sub(r'[^\d]', '', x),
+        }
+        if field in cleaners:
+            value = cleaners[field](value)
+        return value.strip()
+    def _is_professionally_valid(self, field, value):
+        """اعتبارسنجی حرفه‌ای"""
+        if not value:
+            return False
+        validators = {
+            'title': lambda x: 5 <= len(x) <= 200,
+            'author': lambda x: 3 <= len(x) <= 100,
+            'publisher': lambda x: 3 <= len(x) <= 100,
+            'publication_year': lambda x: x.isdigit() and 1300 <= int(x) <= 1500,
+            'edition': lambda x: 1 <= len(x) <= 50,
+            'isbn': lambda x: 10 <= len(x.replace('-', '')) <= 17
+        }
+        return field in validators and validators[field](value)
+    def _calculate_professional_confidence(self, results):
+        """محاسبه اطمینان حرفه‌ای"""
+        if not results:
+            return 0.0
+        weights = {
+            'title': 0.25,
+            'author': 0.20,
+            'publisher': 0.15,
+            'publication_year': 0.15,
+            'edition': 0.10,
+            'isbn': 0.10,
+            'translator': 0.05
+        }
+        total_score = 0.0
+        for field, weight in weights.items():
+            if field in results and results[field]:
+                total_score += weight
+        return total_score
+# ============================================================================
+# 🤖 سیستم پردازش ترکیبی حرفه‌ای
+# ============================================================================
+class ProfessionalPersianBookProcessor:
+    def __init__(self):
+        # خواندن کلیدها از متغیرهای محیطی (Secrets)
+        key1 = os.getenv("GROQ_API_KEY_1")
+        key2 = os.getenv("GROQ_API_KEY_2")
+        key3 = os.getenv("GROQ_API_KEY_3")
+        self.groq_keys = [k for k in [key1, key2, key3] if k]
+        if not self.groq_keys:
+            print("⚠️ هیچ کلید Groq از Secrets یافت نشد. ممکن است عملکرد محدود شود.")
+            self.groq_keys = []
+        self.current_key_index = 0
+        self.ocr_engine = ProfessionalOCREngine()
+        self.metadata_extractor = ProfessionalMetadataExtractor()
+        print(f"✅ سیستم حرفه‌ای با {len(self.groq_keys)} کلید Groq راه‌اندازی شد")
+    def get_next_groq_client(self):
+        """دریافت کلاینت Groq بعدی"""
+        if not self.groq_keys:
+            return None
+        self.current_key_index = (self.current_key_index + 1) % len(self.groq_keys)
+        try:
+            return Groq(api_key=self.groq_keys[self.current_key_index])
+        except:
+            return None
+    def extract_text_professional(self, file_path, num_pages=5):
+        """استخراج متن حرفه‌ای - ۵ صفحه با DPI 200"""
+        return self.ocr_engine.extract_text_professional(file_path, num_pages)
+    def analyze_with_groq_professional(self, full_text):
+        """تحلیل حرفه‌ای با Groq - پرامپت پیشرفته"""
+        for key_index in range(len(self.groq_keys)):
+            client = self.get_next_groq_client()
+            if not client:
+                continue
+            try:
+                print(f"   🤖 تحلیل حرفه‌ای با Groq (کلید {key_index + 1})...")
+                # پرامپت حرفه‌ای و جامع
+                prompt = f"""
+                شما یک متخصص حرفه‌ای در تحلیل و استخراج اطلاعات از کتاب‌های فارسی هستید. لطفاً با دقت بالا اطلاعات زیر را از متن ۵ صفحه اول کتاب استخراج کنید.
+                **متن کامل ۵ صفحه اول کتاب:**
+                {full_text[:4000]}
+                **اطلاعات مورد نیاز برای استخراج:**
+                ۱. **عنوان اصلی کتاب** (title):
+                   - دقیق‌ترین و کامل‌ترین عنوان را پیدا کنید
+                   - عناوین فرعی را نیز در صورت وجود شامل شود
+                ۲. **نام نویسنده/مؤلف** (author):
+                   - نام کامل نویسنده یا مؤلف
+                   - در صورت وجود چند نویسنده، همه را ذکر کنید
+                ۳. **نام مترجم** (translator):
+                   - اگر کتاب ترجمه است، نام کامل مترجم
+                   - در صورت عدم ترجمه، "یافت نشد"
+                ۴. **نام ناشر** (publisher):
+                   - نام کامل انتشارات یا ناشر
+                   - شامل شهر در صورت ذکر شدن
+                ۵. **سال انتشار** (publication_year):
+                   - سال چاپ به صورت عدد (مثال: 1402)
+                   - از تاریخ‌های هجری شمسی استفاده شود
+                ۶. **شماره شابک** (isbn):
+                   - شماره ۱۰ یا ۱۳ رقمی شابک
+                   - با فرمت استاندارد
+                ۷. **نوبت چاپ** (edition):
+                   - شماره یا عنوان نوبت چاپ
+                   - مثال: اول، دوم، سوم...
+                ۸. **موضوع کتاب** (subject):
+                   - حوزه موضوعی اصلی کتاب
+                   - ژانر و زمینه محتوایی
+                ۹. **خلاصه محتوا** (summary):
+                   - خلاصه‌ای جامع از محتوای ۵ صفحه اول
+                   - حدود ۱۰۰-۱۵۰ کلمه
+                   -突出重点 و مفاهیم اصلی
+                **دستورات مهم:**
+                - پاسخ را **فقط و فقط** به صورت JSON برگردانید
+                - از هیچ متن اضافی قبل یا بعد از JSON استفاده نکنید
+                - برای فیلدهایی که اطلاعاتی پیدا نکردید از "یافت نشد" استفاده کنید
+                - از قالب‌بندی استاندارد JSON استفاده کنید
+                - دقت و صحت اطلاعات اولویت دارد
+                **قالب خروجی JSON:**
+                {{
+                    "title": "عنوان کامل کتاب",
+                    "author": "نام کامل نویسنده",
+                    "translator": "نام کامل مترجم",
+                    "publisher": "نام کامل ناشر",
+                    "publication_year": "سال انتشار",
+                    "isbn": "شماره شابک",
+                    "edition": "نوبت چاپ",
+                }}
+                **تأکید: فقط JSON خالص برگردانید، بدون هیچ توضیح اضافی!**
+                """
+                response = client.chat.completions.create(
+                    messages=[{"role": "user", "content": prompt}],
+                    model="llama-3.1-8b-instant",
+                    temperature=0.1,
+                    max_tokens=2000,  # افزایش به 2000 توکن
+                    timeout=30
+                )
+                result_text = response.choices[0].message.content
+                print(f"   ✅ پاسخ Groq دریافت شد ({len(result_text)} کاراکتر)")
+                # استخراج پیشرفته JSON
+                json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', result_text, re.DOTALL)
+                if json_match:
+                    json_str = json_match.group()
+                    try:
+                        result_json = json.loads(json_str)
+                        filled_fields = sum(1 for v in result_json.values() if v and v != "یافت نشد")
+                        print(f"   📊 فیلدهای پر: {filled_fields} از ۹")
+                        if filled_fields > 0:
+                            return {
+                                "success": True,
+                                "source": "groq_professional",
+                                "results": result_json,
+                                "filled_fields": filled_fields,
+                                "response_length": len(result_text)
+                            }
+                    except json.JSONDecodeError as e:
+                        print(f"   ❌ خطای JSON: {e}")
+                else:
+                    print("   ⚠️ JSON در پاسخ یافت نشد")
+            except Exception as e:
+                print(f"   ⚠️ خطا در Groq حرفه‌ای: {e}")
+                continue
+        # استفاده از روش محلی حرفه‌ای
+        print("   🔄 استفاده از روش محلی حرفه‌ای...")
+        return self._analyze_with_local_professional(full_text)
+    def _analyze_with_local_professional(self, full_text):
+        """تحلیل با روش محلی حرفه‌ای"""
+        try:
+            result = self.metadata_extractor.extract_metadata_professional(full_text)
+            if result['success']:
+                metadata = result['metadata']
+                formatted_results = {
+                    "title": metadata.get('title', 'یافت نشد'),
+                    "author": metadata.get('author', 'یافت نشد'),
+                    "translator": metadata.get('translator', 'یافت نشد'),
+                    "publisher": metadata.get('publisher', 'یافت نشد'),
+                    "publication_year": metadata.get('publication_year', 'یافت نشد'),
+                    "isbn": metadata.get('isbn', 'یافت نشد'),
+                    "edition": metadata.get('edition', 'یافت نشد'),
+                    "subject": 'یافت نشد',
+                    "summary": 'یافت نشد'
+                }
+                filled_fields = sum(1 for v in formatted_results.values() if v and v != "یافت نشد")
+                return {
+                    "success": True,
+                    "source": "local_professional",
+                    "results": formatted_results,
+                    "filled_fields": filled_fields,
+                    "confidence": result['confidence']
+                }
+            else:
+                return {
+                    "success": False,
+                    "source": "local_professional",
+                    "error": result.get('error', 'خطای ناشناخته'),
+                    "results": self._get_fallback_results()
+                }
+        except Exception as e:
+            return {
+                "success": False,
+                "source": "local_professional",
+                "error": str(e),
+                "results": self._get_fallback_results()
+            }
+    def _get_fallback_results(self):
+        """نتایج پیش‌فرض"""
+        return {
+            "title": "یافت نشد",
+            "author": "یافت نشد",
+            "translator": "یافت نشد",
+            "publisher": "یافت نشد",
+            "publication_year": "یافت نشد",
+            "isbn": "یافت نشد",
+            "edition": "یافت نشد",
+            "subject": "یافت نشد",
+            "summary": "یافت نشد"
+        }
+# ============================================================================
+# 🎯 رابط کاربری حرفه‌ای
+# ============================================================================
+def process_book_professional(file):
+    """پردازش کتاب با سیستم حرفه‌ای - ۵ صفحه با DPI 200"""
+    if file is None:
+        return create_empty_display(), None, "📊 منتظر پردازش..."
+    try:
+        processor = ProfessionalPersianBookProcessor()
+        print("=" * 60)
+        print("🔄 شروع پردازش حرفه‌ای (۵ صفحه اول با DPI 200)...")
+        start_time = time.time()
+        # استخراج متن حرفه‌ای - ۵ صفحه با DPI 200
+        print("🔍 در حال استخراج متن از ۵ صفحه اول با DPI 200...")
+        extracted_text = processor.extract_text_professional(file.name, num_pages=5)
+        extraction_time = time.time() - start_time
+        if "❌" in extracted_text:
+            return create_error_display(extracted_text), None, f"❌ خطا در استخراج ({extraction_time:.1f}ثانیه)"
+        if not extracted_text.strip():
+            return create_error_display("متن قابل استخراج یافت نشد"), None, f"⚠️ متن خالی ({extraction_time:.1f}ثانیه)"
+        print(f"�� استخراج متن کامل: {len(extracted_text)} کاراکتر")
+        print(f"⏱️ زمان استخراج: {extraction_time:.2f} ثانیه")
+        # تحلیل حرفه‌ای با پرامپت پیشرفته
+        print("🔍 در حال تحلیل حرفه‌ای با AI...")
+        analysis_start_time = time.time()
+        analysis_result = processor.analyze_with_groq_professional(extracted_text)
+        analysis_time = time.time() - analysis_start_time
+        total_time = extraction_time + analysis_time
+        print(f"⏱️ زمان تحلیل: {analysis_time:.2f} ثانیه")
+        print(f"⏱️ زمان کل: {total_time:.2f} ثانیه")
+        # ایجاد گزارش حرفه‌ای
+        if analysis_result["success"]:
+            print(f"✅ تحلیل موفق با {analysis_result['source']}")
+            report = create_professional_report(analysis_result, extracted_text, total_time)
+            stats = f"✅ {analysis_result['source']} - {analysis_result['filled_fields']}/۹ فیلد - {total_time:.1f}ثانیه"
+            if analysis_result.get('response_length'):
+                stats += f" - پاسخ: {analysis_result['response_length']}کاراکتر"
+            if analysis_result["source"] == "local_professional":
+                stats += f" - اطمینان: {analysis_result.get('confidence', 0)*100:.1f}%"
+        else:
+            print(f"❌ خطا در تحلیل")
+            report = create_error_display(f"خطا در تحلیل: {analysis_result.get('error', 'نامشخص')}")
+            stats = f"❌ خطا در تحلیل - {total_time:.1f}ثانیه"
+        # ذخیره فایل
+        output_file = "/tmp/book_analysis_professional.txt"  # در Hugging Face از /tmp استفاده کنید
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(report)
+        print("✅ پردازش حرفه‌ای ۵ صفحه کامل شد")
+        print("=" * 60)
+        return create_professional_display(analysis_result, total_time), output_file, stats
+    except Exception as e:
+        error_msg = f"❌ خطای سیستمی: {str(e)}"
+        print(error_msg)
+        return create_error_display(error_msg), None, "❌ خطا"
+def create_professional_report(analysis_result, full_text, processing_time):
+    """ایجاد گزارش حرفه‌ای"""
+    results = analysis_result["results"]
+    source = analysis_result["source"]
+    report = f"""
+📚 گزارش تحلیل حرفه‌ای کتاب - سیستم پیشرفته
+{'='*50}
+⚙️  اطلاعات پردازش:
+• روش تحلیل: {source.upper()}
+• زمان پردازش: {processing_time:.2f} ثانیه
+• کیفیت تصویر: DPI 200
+• طول متن: {len(full_text)} کاراکتر
+• فیلدهای پر: {analysis_result['filled_fields']} از ۹
+• صفحات پردازش شده: ۵ صفحه اول
+📖 اطلاعات استخراج شده:
+{'‐'*30}
+"""
+    fields = [
+        ('📖 عنوان کتاب', 'title'),
+        ('✍️ نویسنده/مؤلف', 'author'),
+        ('🌐 مترجم', 'translator'),
+        ('🏢 ناشر', 'publisher'),
+        ('📅 سال انتشار', 'publication_year'),
+        ('🔖 شابک (ISBN)', 'isbn'),
+        ('🔄 نوبت چاپ', 'edition'),
+        ('📚 موضوع کتاب', 'subject'),
+        ('📝 خلاصه محتوا', 'summary')
+    ]
+    for persian_name, english_key in fields:
+        value = results.get(english_key, 'یافت نشد')
+        report += f"{persian_name}: {value}\n"
+    # آمار پیشرفته
+    page_count = len([p for p in full_text.split('📄 صفحه') if p.strip()])
+    report += f"""
+📊 آمار حرفه‌ای:
+• صفحات پردازش شده: {page_count} از ۵ صفحه
+• کیفیت استخراج: DPI 200
+• دقت تحلیل: {analysis_result['filled_fields'] * 11.1:.1f}%
+"""
+    return report
+def create_professional_display(analysis_result, processing_time):
+    """ایجاد نمایش حرفه‌ای"""
+    results = analysis_result["results"]
+    source = analysis_result["source"]
+    filled_fields = analysis_result["filled_fields"]
+    basic_html = ""
+    primary_fields = [
+        ('title', '📚 عنوان کتاب', 'عنوانی یافت نشد'),
+        ('author', '✍️ نویسنده/مؤلف', 'نویسنده‌ای یافت نشد'),
+        ('publisher', '🏢 ناشر', 'ناشری یافت نشد'),
+        ('publication_year', '📅 سال انتشار', 'سال انتشار یافت نشد'),
+    ]
+    for field, display, not_found in primary_fields:
+        value = results.get(field, not_found)
+        if value != not_found:
+            basic_html += f"""
+            <div style="background: linear-gradient(135deg, #2a2a2a 0%, #1a3a1a 100%); color: #00ff00; padding: 15px; margin: 8px 0; border-radius: 8px; border-left: 4px solid #00ff00; border-right: 1px solid #00ff00;">
+                <strong style="color: #00ff00; font-size: 16px;">{display}:</strong>
+                <div style="color: #ffffff; font-size: 15px; margin-top: 5px;">{value}</div>
+            </div>
+            """
+        else:
+            basic_html += f"""
+            <div style="background: #1a1a1a; color: #666; padding: 15px; margin: 8px 0; border-radius: 8px; border-left: 4px solid #666;">
+                <strong style="color: #666;">{display}:</strong> {not_found}
+            </div>
+            """
+    # فیلدهای تکمیلی
+    secondary_html = ""
+    secondary_fields = [
+        ('translator', '🌐 مترجم', 'مترجمی یافت نشد'),
+        ('isbn', '🔖 شابک (ISBN)', 'شابکی یافت نشد'),
+        ('edition', '🔄 نوبت چاپ', 'نوبت چاپی یافت نشد'),
+        ('subject', '📚 موضوع کتاب', 'موضوعی یافت نشد'),
+    ]
+    for field, display, not_found in secondary_fields:
+        value = results.get(field, not_found)
+        secondary_html += f"""
+        <div style="background: #2a2a2a; color: #ccc; padding: 12px; margin: 6px 0; border-radius: 6px; border: 1px solid #444;">
+            <strong>{display}:</strong> {value}
+        </div>
+        """
+    # خلاصه
+    summary_html = ""
+    summary = results.get('summary', 'یافت نشد')
+    if summary != 'یافت نشد':
+        summary_html = f"""
+        <div style="background: #1a2a1a; color: #aaffaa; padding: 15px; margin: 10px 0; border-radius: 8px; border: 1px solid #00aa00;">
+            <strong style="color: #00ff00;">📝 خلاصه محتوا:</strong>
+            <div style="color: #e0e0e0; margin-top: 8px; line-height: 1.6;">{summary}</div>
+        </div>
+        """
+    confidence_html = ""
+    if analysis_result.get('confidence'):
+        confidence_color = "#00ff00" if analysis_result['confidence'] > 0.7 else "#ffff00" if analysis_result['confidence'] > 0.4 else "#ff4444"
+        confidence_html = f"""
+        <div style="color: {confidence_color}; font-weight: bold; margin-top: 10px;">
+            🎯 میزان اطمینان تحلیل: {analysis_result['confidence']*100:.1f}%
+        </div>
+        """
+    return f"""
+    <div style="font-family: 'Tahoma', 'Arial', sans-serif; background: #000000; color: #00ff00;">
+        <div style="background: linear-gradient(135deg, #001a00 0%, #004400 100%); color: #00ff00; padding: 20px; border-radius: 10px 10px 0 0; border: 2px solid #00ff00;">
+            <h2 style="margin: 0; text-align: center; color: #00ff00;">🎯 نتایج تحلیل حرفه‌ای (۵ صفحه اول)</h2>
+            <p style="text-align: center; margin: 5px 0 0 0; color: #aaffaa;">کیفیت DPI 200 - پرامپت پیشرفته</p>
+        </div>
+        <div style="padding: 20px; background: #000000; border-radius: 0 0 10px 10px; border: 2px solid #00ff00; border-top: none;">
+            <h3 style="color: #00ff00; border-bottom: 2px solid #00ff00; padding-bottom: 10px;">📖 اطلاعات اصلی کتاب</h3>
+            {basic_html}
+            <div style="margin-top: 20px;">
+                <h4 style="color: #00ff00; margin-bottom: 10px;">📋 اطلاعات تکمیلی</h4>
+                {secondary_html}
+            </div>
+            {summary_html}
+            <div style="background: #1a1a1a; color: #00ff00; padding: 15px; border-radius: 8px; margin-top: 20px; border: 2px solid #00ff00;">
+                <h4 style="margin-top: 0; color: #00ff00;">🔧 اطلاعات فنی پردازش</h4>
+                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
+                    <div>⏱️ زمان پردازش: <strong style="color: #ffffff;">{processing_time:.2f} ثانیه</strong></div>
+                    <div>📊 فیلدهای پر: <strong style="color: #ffffff;">{filled_fields} از ۹</strong></div>
+                    <div>🤖 روش استخراج: <strong style="color: #ffffff;">{source.upper()}</strong></div>
+                    <div>📄 کیفیت تصویر: <strong style="color: #ffffff;">DPI 200</strong></div>
+                    <div>🔍 صفحات پردازش: <strong style="color: #ffffff;">۵ صفحه اول</strong></div>
+                    <div>💾 max_tokens: <strong style="color: #ffffff;">۲۰۰۰</strong></div>
+                </div>
+                {confidence_html}
+            </div>
+        </div>
+    </div>
+    """
+def create_error_display(error_message):
+    """ایجاد نمایش خطا"""
+    return f"""
+    <div style="background: #1a0000; color: #ff0000; padding: 15px; border-radius: 8px; border: 2px solid #ff0000;">
+        <h4 style="margin: 0;">❌ خطا در پردازش</h4>
+        <p style="margin: 10px 0 0 0;">{error_message}</p>
+    </div>
+    """
+def create_empty_display():
+    """ایجاد نمایش خالی"""
+    return """
+    <div style="background: #000000; color: #00ff00; padding: 30px; text-align: center; border-radius: 8px; border: 2px dashed #00ff00;">
+        <h4 style="margin: 0 0 10px 0;">📚 سیستم حرفه‌ای ا��تخراج اطلاعات کتاب</h4>
+        <p style="margin: 0;">لطفاً یک فایل PDF یا تصویر آپلود کنید</p>
+        <p style="margin: 10px 0 0 0; font-size: 12px; color: #00cc00;">📄 ۵ صفحه اول با DPI 200 پردازش می‌شود</p>
+    </div>
+    """
+# ============================================================================
+# 🚀 Launch the professional system
+# ============================================================================
+# Module-level script section: builds the Gradio Blocks UI at import time and,
+# when the file is executed as a script, starts the Gradio server.
+print("🎯 ایجاد رابط کاربری حرفه‌ای...")
+# Custom CSS handed to gr.Blocks: black background with green text/borders for
+# the container, panels and buttons, plus a brighter gradient on button hover.
+professional_css = """
+.gradio-container {
+    background: #000000 !important;
+    color: #00ff00 !important;
+}
+.gradio-container .panel {
+    background: #000000 !important;
+    border: 2px solid #00ff00 !important;
+}
+.gradio-container .button {
+    background: linear-gradient(135deg, #001a00 0%, #004400 100%) !important;
+    color: #00ff00 !important;
+    border: 2px solid #00ff00 !important;
+    font-weight: bold !important;
+}
+.gradio-container .button:hover {
+    background: linear-gradient(135deg, #003300 0%, #006600 100%) !important;
+}
+"""
+# The Blocks app is bound to the module-level name `demo`, which the launch
+# code at the bottom of the file uses.
+with gr.Blocks(title="سیستم حرفه‌ای استخراج کتاب", theme=gr.themes.Default(primary_hue="green"), css=professional_css) as demo:
+    # Static header banner (raw HTML embedded in a Markdown component).
+    gr.Markdown("""
+    <div style="background: linear-gradient(135deg, #001a00 0%, #004400 100%); color: #00ff00; padding: 25px; border-radius: 12px; border: 3px solid #00ff00; font-family: 'Tahoma', sans-serif;">
+        <h1 style="text-align: center; margin: 0; color: #00ff00;">📚 سیستم حرفه‌ای استخراج اطلاعات کتاب</h1>
+        <p style="text-align: center; color: #aaffaa; margin: 10px 0; font-size: 16px;">پردازش ۵ صفحه اول با کیفیت DPI 200 - پرامپت پیشرفته</p>
+        <div style="text-align: center; color: #88ff88; font-size: 14px;">
+            • کیفیت تصویر: DPI 200 • max_tokens: 2000 • پردازش ۵ صفحه •
+        </div>
+    </div>
+    """)
+    # Upload widget restricted to PDF and common image extensions.
+    with gr.Row():
+        file_input = gr.File(
+            label="📁 آپلود فایل کتاب (PDF یا تصویر)",
+            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
+            height=100
+        )
+    # Button that triggers processing (wired up by the click handler below).
+    with gr.Row():
+        process_btn = gr.Button(
+            "🚀 شروع پردازش حرفه‌ای (۵ صفحه با DPI 200)",
+            variant="primary",
+            size="lg",
+            scale=2
+        )
+    # Result panel; starts out showing the placeholder from create_empty_display().
+    with gr.Row():
+        output_display = gr.HTML(
+            label="🎯 نتایج تحلیل حرفه‌ای",
+            value=create_empty_display()
+        )
+    # Read-only file slot for the downloadable report.
+    with gr.Row():
+        download_output = gr.File(
+            label="📥 دانلود گزارش کامل",
+            interactive=False
+        )
+    # Read-only two-line status box.
+    with gr.Row():
+        stats_display = gr.Textbox(
+            label="📊 وضعیت پردازش حرفه‌ای",
+            lines=2,
+            interactive=False
+        )
+    # Clicking the button calls process_book_professional (defined elsewhere in
+    # this file) with the uploaded file; its outputs are routed, in order, to
+    # the HTML panel, the download slot and the status box.
+    process_btn.click(
+        fn=process_book_professional,
+        inputs=[file_input],
+        outputs=[output_display, download_output, stats_display]
+    )
+print("✅ سیستم حرفه‌ای با DPI 200 و max_tokens 2000 آماده است!")
+print("🌐 در حال راه‌اندازی سرور Gradio...")
+# Adjustments for Hugging Face Spaces
+# The server is only launched when the file is run as a script. The port is
+# read from the PORT environment variable, falling back to 7860, and no public
+# share link is requested.
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("PORT", 7860)),
+        share=False
+    )