Files changed (1) hide show
  1. app.py +276 -168
app.py CHANGED
@@ -1,191 +1,299 @@
1
- import gradio as gr
 
 
 
 
 
 
 
2
  import time
3
- import spaces
4
  from PIL import Image
5
- from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
 
6
  from qwen_vl_utils import process_vision_info
7
- import torch
8
- import uuid
9
- import os
10
- import numpy as np
11
 
12
- # Model configurations
13
- MODEL_CONFIGS = {
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- "KATIB OCR 0.8B 0.1": {
16
- "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
17
- "class": AutoModelForImageTextToText,
18
- "prompt": "Free OCR.",
19
- "use_qwen3": True
20
- },
21
- "Qari OCR 0.2.2.1": {
22
- "name": "oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged",
23
- "class": Qwen2VLForConditionalGeneration,
24
- "prompt": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate.",
25
- "use_qwen3": False
26
- }
27
- }
28
 
29
- # Load models
30
- models = {}
31
- processors = {}
32
-
33
- for model_key, config in MODEL_CONFIGS.items():
34
- print(f"Loading {model_key}...")
35
- models[model_key] = config["class"].from_pretrained(
36
- config["name"],
37
- torch_dtype="auto",
38
- device_map="cuda"
 
 
 
 
 
39
  )
40
- processors[model_key] = AutoProcessor.from_pretrained(config["name"])
 
 
 
41
 
42
- max_tokens = 2000
 
 
 
 
 
 
43
 
44
- def resizeImage(image):
45
- if image.height > 1500:
46
- image = image.resize((int(image.width * 1500 / image.height), 1500), Image.Resampling.LANCZOS)
 
 
 
 
 
 
 
 
 
47
  return image
48
 
49
- @spaces.GPU
50
- def perform_ocr(image, model_choice):
51
- inputArray = np.any(image)
52
- if inputArray == False:
53
- return "Error Processing"
54
-
55
- """Process image and extract text using selected OCR model"""
56
- image = Image.fromarray(image)
57
-
58
- # Get model configuration
59
- config = MODEL_CONFIGS[model_choice]
60
- model = models[model_choice]
61
- processor = processors[model_choice]
62
- prompt = config["prompt"]
63
- use_qwen3 = config["use_qwen3"]
64
-
65
- # Resize image for Qwen3 model
 
 
 
 
66
 
67
- # image = resizeImage(image)
68
- print("Image resized")
69
-
70
- src = str(uuid.uuid4()) + ".png"
71
- image.save(src)
72
- print(src)
73
- # Prepare messages based on model type
74
- if use_qwen3:
75
- messages = [
76
- {
77
- "role": "user",
78
- "content": [
79
- {"type": "image", "image": f"./{src}"},
80
- {"type": "text", "text": prompt},
81
- ],
82
- }
83
- ]
84
- else:
85
- messages = [
86
- {
87
- "role": "user",
88
- "content": [
89
- {"type": "image", "image": f"file://{src}"},
90
- {"type": "text", "text": prompt},
91
- ],
92
- }
93
- ]
94
-
95
- # Process inputs based on model type
96
- if use_qwen3:
97
- inputs = processor.apply_chat_template(
98
- messages,
99
- tokenize=True,
100
- add_generation_prompt=True,
101
- return_dict=True,
102
- return_tensors="pt"
103
- )
104
- inputs = inputs.to(model.device)
105
- else:
106
- text = processor.apply_chat_template(
107
  messages, tokenize=False, add_generation_prompt=True
108
  )
109
- image_inputs, video_inputs = process_vision_info(messages)
 
110
  inputs = processor(
111
- text=[text],
112
  images=image_inputs,
113
- videos=video_inputs,
114
  padding=True,
115
- return_tensors="pt",
116
- )
117
- inputs = inputs.to("cuda")
118
-
119
- # Generate text
120
- generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
121
- generated_ids_trimmed = [
122
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
123
- ]
124
- output_text = processor.batch_decode(
125
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
126
- )[0]
127
-
128
- # Cleanup
129
- os.remove(src)
130
- return output_text
131
-
132
- # Create Gradio interface
133
- with gr.Blocks(title="Arabic OCR Models Demo") as demo:
134
- gr.Markdown("# Arabic OCR Models Demo")
135
- gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")
136
-
137
- with gr.Row():
138
- with gr.Column(scale=1):
139
- # Model selection dropdown
140
- model_dropdown = gr.Dropdown(
141
- choices=list(MODEL_CONFIGS.keys()),
142
- value=list(MODEL_CONFIGS.keys())[0],
143
- label="Select OCR Model",
144
- interactive=True
145
- )
146
-
147
- # Input image
148
- image_input = gr.Image(type="numpy", label="Upload Image")
149
-
150
- # Example gallery
151
- gr.Examples(
152
- examples=[
153
- ["0.4.png"],
154
- ["2.jpg"],
155
- ["3.jpg"]
156
- ],
157
- inputs=image_input,
158
- label="Example Images",
159
- examples_per_page=4
160
  )
161
-
162
- # Submit button
163
- submit_btn = gr.Button("Extract Text")
164
-
165
- with gr.Column(scale=1):
166
- # Output text
167
- output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
168
-
169
- # Model details
170
- with gr.Accordion("Model Information", open=False):
171
- gr.Markdown("""
172
- **Available Models:**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- 1. **KATIB OCR 0.1 0.8B **
175
- - Model: oddadmix/Katib-Qwen3.5-0.8B-0.1
176
- - Based on Qwen3.5
177
- - Size: 0.8B parameters
 
178
 
179
- 2. **Qari OCR 0.2.2.1**
180
- - Model: NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct
181
- - Based on Qwen2-VL architecture
182
- - Size: 2B parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- **Context window:** Supports up to 2000 output tokens
185
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- # Set up processing flow
188
- submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
189
- image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
190
 
191
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ๐Ÿค– Arabic OCR - Hugging Face Spaces Version
4
+ Model: Qwen3.5-0.8B-VL with LoRA
5
+ No Quantization - Full Precision
6
+ """
7
+
8
+ import os
9
  import time
10
+ import torch
11
  from PIL import Image
12
+ import gradio as gr
13
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
14
  from qwen_vl_utils import process_vision_info
 
 
 
 
15
 
# ==================== Device selection ====================
def _pick_device():
    """Return (device, dtype) for the best available backend.

    Prefers CUDA, then Apple Silicon (MPS), then CPU. Half precision is
    used on accelerators; full float32 on CPU.
    """
    if torch.cuda.is_available():
        print(f"โœ… Using GPU: {torch.cuda.get_device_name(0)}")
        return "cuda", torch.float16
    if torch.backends.mps.is_available():
        print("โœ… Using Apple Silicon (MPS)")
        return "mps", torch.float16
    print("โš ๏ธ Using CPU (slower inference)")
    return "cpu", torch.float32

# Module-level device/dtype used by model loading and inference below.
device, dtype = _pick_device()
print(f"[INFO] Device: {device} | Dtype: {dtype}")
 
# ==================== Model loading ====================
def load_model():
    """Load the OCR model and processor once at startup.

    The checkpoint path comes from the MODEL_PATH environment variable,
    falling back to the published default.

    Returns:
        (model, processor) tuple, with the model in eval mode and placed
        on the selected device.
    """
    model_path = os.getenv("MODEL_PATH", "sherif1313/Arabic-Qwen3.5-OCR-v4")

    print(f"[INFO] Loading model from: {model_path}")

    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    model = Qwen3_5ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=dtype,
        device_map="auto" if device == "cuda" else None,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Fix: with device_map=None (MPS/CPU path) the model is left on CPU by
    # from_pretrained, but inference later moves inputs to `device`; move
    # the model explicitly so tensors and weights agree on MPS.
    if device != "cuda":
        model = model.to(device)

    model.eval()
    print("[INFO] Model loaded successfully!")
    return model, processor

# Global load (runs once when the app starts). On failure the globals are
# set to None and extract_text reports the error instead of crashing.
try:
    model, processor = load_model()
except Exception as e:
    print(f"[ERROR] Failed to load model: {e}")
    model = None
    processor = None
 
# ==================== Helper functions ====================
def prepare_image(image: Image.Image, max_size: int = 768) -> Image.Image:
    """Prepare an image for the vision encoder.

    Downscales so the longer side is at most *max_size*, then rounds both
    dimensions up to the nearest multiple of 64 (patch alignment expected
    by the model -- TODO confirm against the processor config).

    Args:
        image: source PIL image.
        max_size: longest allowed side before patch-snapping.

    Returns:
        A new (or unchanged) PIL image; the input is never mutated.
    """
    if max(image.size) > max_size:
        # Fix: Image.thumbnail resizes IN PLACE, which mutated the
        # caller's image; work on a copy instead.
        image = image.copy()
        image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

    w, h = image.size
    # Round each dimension up to the next multiple of 64.
    new_w = ((w + 63) // 64) * 64
    new_h = ((h + 63) // 64) * 64
    if (new_w, new_h) != (w, h):
        image = image.resize((new_w, new_h), Image.Resampling.LANCZOS)

    return image
 
def clean_output(text: str, max_repetitions: int = 2) -> str:
    """Tidy model output by squashing runaway repetition.

    Runs of five or more identical characters are shortened to three,
    blank lines are dropped, and any non-blank line may occur at most
    ``max_repetitions`` times (comparison ignores surrounding whitespace).
    """
    if not text:
        return text

    import re
    squashed = re.sub(r'(.)\1{4,}', r'\1\1\1', text)

    kept = []
    occurrences = {}
    for raw_line in squashed.strip().split('\n'):
        key = raw_line.strip()
        if not key:
            continue
        occurrences[key] = occurrences.get(key, 0) + 1
        if occurrences[key] <= max_repetitions:
            kept.append(raw_line)

    return '\n'.join(kept).strip()
 
# ==================== Inference ====================
def extract_text(image, prompt: str = None) -> tuple[str, str]:
    """Run OCR on *image* and return (extracted_text, elapsed_time_label).

    Args:
        image: a file path (str), a PIL.Image, or a numpy-like array
            accepted by Image.fromarray.
        prompt: optional instruction for the model; a default Arabic
            "read all the text" prompt is used when empty.

    Returns:
        Tuple of (text or error message, human-readable timing string).
        Error paths return "0.00" as the timing value.
    """
    # Model loading happens at import time and may have failed; report
    # instead of raising so the UI stays usable.
    if model is None or processor is None:
        return "โŒ Error: Model not loaded", "0.00"

    if image is None:
        return "โš ๏ธ Please upload an image", "0.00"

    start_time = time.time()

    try:
        # Normalize all accepted input kinds to an RGB PIL image.
        if isinstance(image, str):
            image_pil = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image_pil = image.convert("RGB")
        else:
            image_pil = Image.fromarray(image).convert("RGB")

        # Downscale / patch-align before encoding.
        image_pil = prepare_image(image_pil)

        if prompt is None or not prompt.strip():
            prompt = "ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ ูƒุงู…ู„ุงู‹ ู…ู† ุงู„ุจุฏุงูŠุฉ ุฅู„ู‰ ุงู„ู†ู‡ุงูŠุฉ."

        # Chat-style message with the image passed as a PIL object.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_pil},
                {"type": "text", "text": prompt}
            ]
        }]

        text_input = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Video inputs are unused by this app; only images are extracted.
        image_inputs, _ = process_vision_info(messages)

        inputs = processor(
            text=[text_input],
            images=image_inputs,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # Greedy decoding with repetition guards; temperature is inert
        # when do_sample=False.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                temperature=1.0,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Strip the prompt tokens; decode only the generated suffix.
        input_len = inputs.input_ids.shape[1]
        output_text = processor.batch_decode(
            generated_ids[:, input_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Post-process: squash repeated chars/lines (see clean_output).
        output_text = clean_output(output_text.strip())

        elapsed = time.time() - start_time

        return output_text, f"{elapsed:.2f} seconds"

    except torch.cuda.OutOfMemoryError:
        # Free cached blocks so the next (smaller) request can succeed.
        torch.cuda.empty_cache()
        return "โŒ Out of Memory. Try a smaller image.", "0.00"
    except Exception as e:
        # Top-level UI boundary: log full traceback, surface the message.
        print(f"[ERROR] {e}")
        import traceback
        traceback.print_exc()
        return f"โŒ Error: {str(e)}", "0.00"
+
175
+ # ==================== ๐ŸŽจ ูˆุงุฌู‡ุฉ Gradio ====================
176
+ def create_interface():
177
+ """ุฅู†ุดุงุก ูˆุงุฌู‡ุฉ ุงู„ู…ุณุชุฎุฏู…"""
178
+
179
+ with gr.Blocks(
180
+ title="Arabic OCR - Qwen3.5-0.8B"
181
+ # theme and css removed from here โ€“ moved to launch()
182
+ ) as demo:
183
+
184
+ gr.Markdown("""
185
+ # ๐Ÿ“ Arabic Handwritten & Printed OCR V4
186
+ ### Powered by Qwen3.5-0.8B
187
+
188
+ Upload an image containing Arabic text, and the model will extract it.
189
+
190
+ โœจ **Features:**
191
+ - ๐ŸŒ Arabic support
192
+ - โœ๏ธ Handwritten & printed text
193
+ - ๐Ÿ”ค Preserves diacritics (ุชุดูƒูŠู„)
194
+ - โšก Full precision (no quantization)
195
+ """, elem_classes="header")
196
+
197
+ with gr.Row():
198
+ with gr.Column(scale=1):
199
+ image_input = gr.Image(
200
+ label="๐Ÿ“ท Upload Image",
201
+ type="pil",
202
+ height=300,
203
+ sources=["upload", "clipboard"]
204
+ )
205
+
206
+ prompt_input = gr.Textbox(
207
+ label="๐Ÿ“ Custom Prompt (Optional)",
208
+ placeholder="ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ...",
209
+ value="ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ ูƒุงู…ู„ุงู‹ ู…ู† ุงู„ุจุฏุงูŠุฉ ุฅู„ู‰ ุงู„ู†ู‡ุงูŠุฉ.",
210
+ lines=2
211
+ )
212
 
213
+ submit_btn = gr.Button(
214
+ "๐Ÿ” Extract Text",
215
+ variant="primary",
216
+ size="lg"
217
+ )
218
 
219
+ # Examples โ€“ use local files or remote URLs (remote may fail in some environments)
220
+ # For production, copy images to an 'examples' folder and use local paths.
221
+ gr.Examples(
222
+ label="๐Ÿ“‹ Examples (Optional)",
223
+ examples=[
224
+ # You can replace these with local files like ["examples/sample1.jpg"]
225
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00002.png"],
226
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00106.png"],
227
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00107.png"],
228
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00113.png"],
229
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00126.png"],
230
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00135.png"],
231
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00141.png"],
232
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00197.png"],
233
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00198.png"],
234
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00199.png"],
235
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00216.png"],
236
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00240.png"],
237
+ ],
238
+ inputs=[image_input],
239
+ cache_examples=False
240
+ )
241
 
242
+ with gr.Column(scale=1):
243
+ # Removed show_copy_button parameter (not available in older Gradio)
244
+ output_text = gr.Textbox(
245
+ label="๐Ÿ“„ Extracted Text",
246
+ lines=12,
247
+ elem_classes="output-box"
248
+ )
249
+
250
+ time_output = gr.Textbox(
251
+ label="โฑ๏ธ Inference Time",
252
+ interactive=False,
253
+ value="-"
254
+ )
255
+
256
+ clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear", variant="secondary")
257
+
258
+ # ุฑุจุท ุงู„ุฃุญุฏุงุซ
259
+ submit_btn.click(
260
+ fn=extract_text,
261
+ inputs=[image_input, prompt_input],
262
+ outputs=[output_text, time_output]
263
+ )
264
+
265
+ clear_btn.click(
266
+ fn=lambda: (None, "", "-"),
267
+ inputs=[],
268
+ outputs=[image_input, prompt_input, time_output]
269
+ )
270
+
271
+ gr.Markdown("""
272
+ ### ๐Ÿ’ก Tips for Best Results:
273
+ 1. Use clear, well-lit images
274
+ 2. Crop to the text region if possible
275
+ 3. For handwritten text, ensure good contrast
276
+ 4. Custom prompts can improve accuracy for specific formats
277
+ """)
278
 
279
+ return demo
 
 
# ==================== Entry point ====================
if __name__ == "__main__":
    print("[INFO] Creating Gradio interface...")

    demo = create_interface()

    # NOTE: theme and css are gr.Blocks() constructor arguments, not
    # launch() arguments -- passing them to launch() raises a TypeError
    # on current Gradio releases, so they are not supplied here.
    demo.launch(
        server_name="0.0.0.0",            # listen on all interfaces (Spaces/containers)
        server_port=int(os.getenv("PORT", 7860)),
        share=False,
        debug=os.getenv("DEBUG", "false").lower() == "true",
        show_error=True,
    )