# Hugging Face Space — runs on ZeroGPU hardware.
| import gradio as gr | |
| import time | |
| import spaces | |
| from PIL import Image | |
| from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText | |
| from qwen_vl_utils import process_vision_info | |
| import torch | |
| import uuid | |
| import os | |
| import numpy as np | |
# Registry of the OCR models offered by this demo.
# Each entry maps a human-readable label to:
#   name:      Hugging Face model repository id
#   class:     transformers class used to load the checkpoint
#   prompt:    instruction sent alongside the image
#   use_qwen3: True -> Qwen3-style chat-template path; False -> Qwen2-VL path
MODEL_CONFIGS = {
    "KATIB OCR 0.8B 0.1": {
        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
        "class": AutoModelForImageTextToText,
        "prompt": "Free OCR.",
        "use_qwen3": True,
    },
    "Qari OCR 0.2.2.1": {
        "name": "oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged",
        "class": Qwen2VLForConditionalGeneration,
        "prompt": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate.",
        "use_qwen3": False,
    },
}
# Eagerly load every configured model/processor pair onto the GPU at startup
# so that inference requests never pay the load cost.
models = {}
processors = {}
for model_key, cfg in MODEL_CONFIGS.items():
    print(f"Loading {model_key}...")
    models[model_key] = cfg["class"].from_pretrained(
        cfg["name"],
        torch_dtype="auto",
        device_map="cuda",
    )
    processors[model_key] = AutoProcessor.from_pretrained(cfg["name"])

# Upper bound on the number of tokens generated per OCR request.
max_tokens = 2000
def resizeImage(image, max_height=1500):
    """Downscale *image* so its height does not exceed *max_height* pixels.

    Aspect ratio is preserved and LANCZOS resampling is used for quality.
    Images already within the limit are returned unchanged.

    Args:
        image: a PIL ``Image`` instance.
        max_height: maximum allowed height in pixels (default 1500,
            matching the original hard-coded limit).

    Returns:
        The resized (or original) PIL ``Image``.
    """
    if image.height > max_height:
        # Scale the width by the same factor applied to the height.
        new_width = int(image.width * max_height / image.height)
        image = image.resize((new_width, max_height), Image.Resampling.LANCZOS)
    return image
def perform_ocr(image, model_choice):
    """Extract text from *image* using the selected OCR model.

    Args:
        image: numpy array from the Gradio image component (``None`` or an
            all-zero array when the component is empty/cleared).
        model_choice: key into ``MODEL_CONFIGS`` selecting the model.

    Returns:
        The decoded text, or ``"Error Processing"`` when no usable image
        was supplied.
    """
    # Gradio passes None when the image is cleared; an all-empty array is
    # likewise treated as "no input" (original behavior preserved).
    if image is None or not np.any(image):
        return "Error Processing"

    image = Image.fromarray(image)

    # Look up the selected model, its processor and its prompt.
    config = MODEL_CONFIGS[model_choice]
    model = models[model_choice]
    processor = processors[model_choice]
    prompt = config["prompt"]
    use_qwen3 = config["use_qwen3"]

    # Resize image for Qwen3 model
    # image = resizeImage(image)
    print("Image resized")

    # The chat templates reference the image by path, so persist it to a
    # uniquely named temporary file for the duration of this request.
    src = str(uuid.uuid4()) + ".png"
    image.save(src)
    print(src)

    try:
        # The two model families expect slightly different image URIs.
        image_uri = f"./{src}" if use_qwen3 else f"file://{src}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_uri},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        if use_qwen3:
            # Qwen3-style processors tokenize directly from the chat template.
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)
        else:
            # Qwen2-VL path: render the template to text, then process the
            # vision inputs separately via qwen_vl_utils.
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

        # Generate and decode only the newly produced tokens: strip the
        # prompt tokens from each sequence before decoding.
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
    finally:
        # Always remove the temporary image, even if generation raises
        # (the original leaked the file on any exception).
        if os.path.exists(src):
            os.remove(src)

    return output_text
# Create the Gradio interface: model selector + image input on the left,
# extracted text and model details on the right.
with gr.Blocks(title="Arabic OCR Models Demo") as demo:
    gr.Markdown("# Arabic OCR Models Demo")
    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")

    with gr.Row():
        with gr.Column(scale=1):
            # Model selection dropdown, defaulting to the first configured model.
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_CONFIGS.keys()),
                value=list(MODEL_CONFIGS.keys())[0],
                label="Select OCR Model",
                interactive=True,
            )
            # Input image as a numpy array, which is what perform_ocr expects.
            image_input = gr.Image(type="numpy", label="Upload Image")
            # Example gallery (files expected alongside this script).
            gr.Examples(
                examples=[
                    ["0.4.png"],
                    ["2.jpg"],
                    ["3.jpg"],
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )
            # Submit button
            submit_btn = gr.Button("Extract Text")

        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

            # Model details — kept consistent with MODEL_CONFIGS (the original
            # text listed repo ids that did not match the loaded checkpoints).
            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Available Models:**

                1. **KATIB OCR 0.8B 0.1**
                   - Model: oddadmix/Katib-Qwen3.5-0.8B-0.3
                   - Based on Qwen3.5
                   - Size: 0.8B parameters

                2. **Qari OCR 0.2.2.1**
                   - Model: oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged
                   - Based on Qwen2-VL architecture
                   - Size: 2B parameters

                **Context window:** Supports up to 2000 output tokens
                """)

    # Run OCR on explicit submit and whenever the uploaded image changes.
    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)

demo.launch()