Spaces:

samyhusy
/

OCR

Sleeping

App Files Files Community

samyhusy committed on Nov 2, 2025

Commit

eaf7e1e

verified ·

1 Parent(s): 1920732

Upload 5 files

Browse files

Files changed (6) hide show

.gitattributes +2 -0
app.py +199 -0
example1.png +3 -0
example2.png +3 -0
main_ocr_T4.py +105 -0
requirements.txt +17 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example1.png filter=lfs diff=lfs merge=lfs -text
+example2.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import gradio as gr
+import spaces
+from main import process_image_ocr
+import time
+# Custom CSS passed to gr.Blocks(css=...): RTL/Persian text helpers plus layout classes. Components opt in via elem_classes (persian-text, rtl-direction, markdown-output); center-content and progress-text are not referenced in the visible layout.
+custom_css = """
+.persian-text {
+    font-family: "Vazirmatn", "Tahoma", "Arial", sans-serif;
+    direction: rtl;
+}
+.rtl-direction {
+    direction: rtl;
+    text-align: right;
+}
+.center-content {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+}
+.progress-text {
+    text-align: center;
+    font-weight: bold;
+    margin: 10px 0;
+}
+.markdown-output {
+    min-height: 400px;
+    border: 1px solid #e0e0e0;
+    padding: 15px;
+    border-radius: 8px;
+}
+"""
+def process_image_with_progress(image, model_size, task_type):
+    """
+    تابع پردازش تصویر با نوار پیشرفت
+    """
+    progress = gr.Progress()
+    # شبیه‌سازی مراحل پیشرفت
+    progress(0, desc="در حال آماده‌سازی مدل...")
+    time.sleep(0.5)
+    progress(0.3, desc="در حال پردازش تصویر...")
+    time.sleep(0.5)
+    progress(0.6, desc="در حال استخراج متن...")
+    time.sleep(0.5)
+    progress(0.8, desc="در حال تولید خروجی...")
+    # پردازش اصلی
+    result_image, markdown_content, text_result = process_image_ocr(
+        image, model_size, task_type, is_eval_mode=False
+    )
+    progress(1.0, desc="پردازش کامل شد!")
+    return markdown_content, text_result
+# Build the improved Gradio interface: header, input panel, tabbed output panel, examples, footer, then event wiring
+with gr.Blocks(
+    title=" OCR استخراج متن از تصویر",
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="teal"),
+    css=custom_css
+) as demo:
+    # Main header
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML(
+                """
+                <div class="persian-text" style="text-align: center;">
+                    <h1>🧠 پردازش هوشمند تصویر-OCR</h1>
+                    <h3>استخراج هوشمند متن از تصاویر</h3>
+                    <p>تصویر خود را آپلود کنید تا متن آن به صورت خودکار استخراج شود</p>
+                </div>
+                """
+            )
+    with gr.Row():
+        # Input panel: image, model size and task type, plus the action buttons
+        with gr.Column(scale=1, min_width=400):
+            with gr.Group():
+                gr.Markdown("### ⚙️ تنظیمات پردازش", elem_classes="persian-text")
+                image_input = gr.Image(
+                    type="pil",
+                    label="📷 تصویر ورودی",
+                    sources=["upload", "clipboard"],
+                    height=300,
+                    elem_classes="rtl-direction"
+                )
+                model_size = gr.Dropdown(
+                    choices=["کوچک", "پایه (توصیه شده)", "بزرگ"],
+                    value="پایه (توصیه شده)",
+                    label="📊 اندازه مدل",
+                    info="مدل بزرگتر دقت بهتر اما سرعت کمتر",
+                    elem_classes="rtl-direction"
+                )
+                task_type = gr.Dropdown(
+                    choices=["OCR", "تبدیل به Markdown"],
+                    value="OCR",
+                    label="🎯 نوع وظیفه",
+                    info="OCR: فقط استخراج متن | Markdown: ساختاردهی پیشرفته",
+                    elem_classes="rtl-direction"
+                )
+                with gr.Row():
+                    clear_btn = gr.Button("🗑️ پاک کردن", size="sm")
+                    submit_btn = gr.Button("🚀 شروع پردازش", variant="primary", size="lg")
+        # Output panel: rendered Markdown preview and raw text, in separate tabs
+        with gr.Column(scale=2, min_width=600):
+            with gr.Tabs() as tabs:
+                # Markdown preview tab
+                with gr.TabItem("📝 پیش‌ نمایش", id=1):
+                    gr.Markdown("**خروجی قالب‌ بندی شده:**", elem_classes="persian-text")
+                    output_markdown = gr.Markdown(
+                        elem_classes=["persian-text", "markdown-output"],
+                        value="خروجی اینجا نمایش داده می‌شود..."
+                    )
+                # Raw text tab
+                with gr.TabItem("📄 متن خام", id=2):
+                    output_text = gr.Textbox(
+                        lines=20,
+                        show_copy_button=True,
+                        label="متن استخراج شده",
+                        elem_classes="rtl-direction",
+                        value="متن استخراج شده در اینجا نمایش داده می‌شود..."
+                    )
+            # Information and usage guide section (collapsed by default)
+            with gr.Accordion("ℹ️ راهنمای استفاده", open=False):
+                gr.Markdown("""
+                **راهنمای سریع:**
+                - **تصویر با کیفیت بالا** آپلود کنید
+                - برای **اسناد متنی** از حالت 'پایه' استفاده کنید
+                - برای **تصاویر پیچیده** از حالت 'بزرگ' استفاده کنید
+                - حالت **Markdown** برای اسناد ساختاریافته مناسب است
+                **نکات:**
+                - فرمت‌های پشتیبانی شده: JPG, PNG, WebP
+                - حداکثر حجم تصویر: 10MB
+                - پردازش ممکن است 10-30 ثانیه زمان ببرد
+                """, elem_classes="persian-text")
+    # Examples section: each row maps onto the three input components; example caching is disabled
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 📁 مثال‌های آماده", elem_classes="persian-text")
+            gr.Examples(
+                examples=[
+                    ["example1.png", "پایه (توصیه شده)", "OCR"],
+                    ["example2.png", "پایه (توصیه شده)", "تبدیل به Markdown"],
+                ],
+                inputs=[image_input, model_size, task_type],
+                outputs=[output_markdown, output_text],
+                fn=process_image_with_progress,
+                cache_examples=False,
+                label="برای تست سریع روی یکی از مثال‌ها کلیک کنید",
+                examples_per_page=3
+            )
+    # System status (static footer credit line)
+    with gr.Row():
+        gr.HTML("""
+        <div class="persian-text" style="text-align: center; color: #666; font-size: 0.9em; margin-top: 20px;">
+            <p>ساخته شده توسط *سامان زیتونیان* | OCR | پردازش تصویر هوشمند</p>
+        </div>
+        """)
+    # Event handling: clear_all resets the image input and restores both output placeholders
+    def clear_all():
+        return None, "خروجی اینجا نمایش داده می‌شود...", "متن استخراج شده در اینجا نمایش داده می‌شود..."
+    # Wire up the buttons: submit runs OCR on the three inputs, clear resets the input and both outputs
+    submit_btn.click(
+        fn=process_image_with_progress,
+        inputs=[image_input, model_size, task_type],
+        outputs=[output_markdown, output_text],
+        show_progress="minimal"
+    )
+    clear_btn.click(
+        fn=clear_all,
+        outputs=[image_input, output_markdown, output_text]
+    )
+# Launch the app when run as a script. NOTE(review): share=True requests a public share link; presumably unnecessary when hosted on Hugging Face Spaces — confirm and drop if so.
+if __name__ == "__main__":
+    demo.launch(
+        share=True,
+        show_error=True
+    )

example1.png ADDED Viewed

Git LFS Details

SHA256: eaa2e32eab2c18c1075a9be5f6250c1cef4763456f23e096d7e29fb14cfcbead
Pointer size: 131 Bytes
Size of remote file: 159 kB

example2.png ADDED Viewed

Git LFS Details

SHA256: 15785cd7389f1ceea34fc5513b44a7a6f1bbd7fcfb4c0639632dbb2de590e5e8
Pointer size: 131 Bytes
Size of remote file: 103 kB

main_ocr_T4.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import torch
+from transformers import AutoModel, AutoTokenizer
+import os
+import tempfile
+from PIL import Image
+import gradio as gr
+# Load model and tokenizer
+model_name = "deepseek-ai/DeepSeek-OCR"
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModel.from_pretrained(
+        model_name,
+        _attn_implementation="sdpa",
+        trust_remote_code=True,
+        device_map="auto",
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    )
+    model = model.eval()
+    print("✅ مدل با موفقیت بارگذاری شد!")
+except Exception as e:
+    print(f"❌ خطا در بارگذاری مدل: {e}")
+    raise
+def process_image_ocr(image, model_size, task_type, is_eval_mode=False, progress=gr.Progress()):
+    """
+    پردازش تصاویر برای وظایف OCR و Markdown با پشتیبانی از پیشرفت
+    """
+    if image is None:
+        return None, "لطفا ابتدا یک تصویر آپلود کنید.", "لطفا ابتدا یک تصویر آپلود کنید."
+    try:
+        # به‌روزرسانی پیشرفت
+        if progress is not None:
+            progress(0.1, desc="در حال آماده‌سازی محیط پردازش...")
+        # ایجاد دایرکتوری موقت برای خروجی
+        with tempfile.TemporaryDirectory() as output_path:
+            # تنظیم prompt بر اساس نوع وظیفه
+            if task_type == "OCR":
+                prompt = "<image>\nFree OCR. "
+            elif task_type == "تبدیل به Markdown":
+                prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+            else:
+                prompt = "<image>\nFree OCR. "
+            # ذخیره تصویر آپلود شده به صورت موقت
+            temp_image_path = os.path.join(output_path, "temp_image.jpg")
+            image.save(temp_image_path, quality=95)
+            if progress is not None:
+                progress(0.3, desc="در حال بارگذاری و تنظیم تصویر...")
+            # پیکربندی پارامترهای اندازه مدل
+            size_configs = {
+                "کوچک": {"base_size": 640, "image_size": 640, "crop_mode": False},
+                "پایه (توصیه شده)": {"base_size": 1024, "image_size": 1024, "crop_mode": True},
+                "بزرگ": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+            }
+            config = size_configs.get(model_size, size_configs["پایه (توصیه شده)"])
+            if progress is not None:
+                progress(0.5, desc="در حال اجرای مدل هوشمند...")
+            # اجرای استنتاج
+            plain_text_result = model.infer(
+                tokenizer,
+                prompt=prompt,
+                image_file=temp_image_path,
+                output_path=output_path,
+                base_size=config["base_size"],
+                image_size=config["image_size"],
+                crop_mode=config["crop_mode"],
+                save_results=True,
+                test_compress=True,
+                eval_mode=is_eval_mode,
+            )
+            if progress is not None:
+                progress(0.8, desc="در حال پردازش نتایج...")
+            # تعریف مسیرهای فایل‌های تولید شده
+            image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
+            markdown_result_path = os.path.join(output_path, "result.mmd")
+            # خواندن محتوای فایل markdown در صورت وجود
+            markdown_content = ""
+            if os.path.exists(markdown_result_path):
+                with open(markdown_result_path, "r", encoding="utf-8") as f:
+                    markdown_content = f.read()
+            else:
+                markdown_content = plain_text_result if plain_text_result else "نتیجه‌ای تولید نشد."
+            if progress is not None:
+                progress(1.0, desc="پردازش کامل شد!")
+            # بازگرداندن نتایج
+            text_result = plain_text_result if plain_text_result else markdown_content
+            return None, markdown_content, text_result
+    except Exception as e:
+        error_msg = f"خطا در پردازش تصویر: {str(e)}"
+        print(f"❌ {error_msg}")
+        return None, error_msg, error_msg

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+torch==2.6.0
+torchvision
+transformers==4.46.3
+tokenizers==0.20.3
+safetensors>=0.4.0
+accelerate>=0.26.0
+protobuf>=3.20.0
+spaces>=0.20.0
+Pillow>=10.0.0
+einops
+addict
+easydict
+gradio
+# pip install flash-attn==2.7.3 --no-build-isolation  (only support 3xxx or higher RTX GPU with Ampere architecture)