# app.py — Arabic OCR demo (Hugging Face Space by oddadmix, revision d5f9cb3)
import gradio as gr
import time
import spaces
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np
# Model configurations
# Each entry maps a display name (shown in the Gradio dropdown) to:
#   name:      Hugging Face Hub repo id to load
#   class:     transformers model class used for from_pretrained
#   prompt:    instruction text sent alongside the image
#   use_qwen3: True  -> Qwen3-style path (processor tokenizes the chat template directly)
#              False -> Qwen2-VL path (template rendered to text + process_vision_info)
MODEL_CONFIGS = {
    "KATIB OCR 0.8B 0.1": {
        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
        "class": AutoModelForImageTextToText,
        "prompt": "Free OCR.",
        "use_qwen3": True
    },
    "Qari OCR 0.2.2.1": {
        "name": "oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged",
        "class": Qwen2VLForConditionalGeneration,
        "prompt": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate.",
        "use_qwen3": False
    }
}
# Load models
# Eagerly load every configured model and processor at import time so the
# first OCR request does not pay the download/initialization cost.
models = {}
processors = {}
for model_key, config in MODEL_CONFIGS.items():
    print(f"Loading {model_key}...")
    models[model_key] = config["class"].from_pretrained(
        config["name"],
        torch_dtype="auto",  # let the checkpoint decide the dtype
        device_map="cuda"    # place the whole model on the GPU
    )
    processors[model_key] = AutoProcessor.from_pretrained(config["name"])
# Upper bound on generated tokens per OCR request (see perform_ocr).
max_tokens = 2000
def resizeImage(image):
    """Cap an image's height at 1500 px, preserving aspect ratio.

    Images whose height is already 1500 px or less are returned unchanged;
    taller ones are downscaled with Lanczos resampling.
    """
    max_height = 1500
    if image.height > max_height:
        scaled_width = int(image.width * max_height / image.height)
        image = image.resize((scaled_width, max_height), Image.Resampling.LANCZOS)
    return image
@spaces.GPU
def perform_ocr(image, model_choice):
    """Run OCR on an uploaded image with the selected model and return the text.

    Args:
        image: HxWxC numpy array from the Gradio image component, or None
            when the component has been cleared.
        model_choice: key into MODEL_CONFIGS selecting model/processor/prompt.

    Returns:
        The decoded OCR text, or "Error Processing" when no usable image
        was supplied (None input or an all-zero array).
    """
    # Reject empty input up front: a cleared component yields None, and an
    # all-zero array carries no content to OCR.
    if image is None or not np.any(image):
        return "Error Processing"
    image = Image.fromarray(image)

    # Look up the selected model's components.
    config = MODEL_CONFIGS[model_choice]
    model = models[model_choice]
    processor = processors[model_choice]
    prompt = config["prompt"]
    use_qwen3 = config["use_qwen3"]

    # Persist the image to a uniquely named temp file: both processor paths
    # below load the image from a path embedded in the chat message.
    src = str(uuid.uuid4()) + ".png"
    image.save(src)
    try:
        # Single-turn chat message; only the image URI scheme differs
        # between the two model families.
        image_ref = f"./{src}" if use_qwen3 else f"file://{src}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_ref},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        if use_qwen3:
            # Qwen3-style processors tokenize straight from the chat template.
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)
        else:
            # Qwen2-VL path: render the template to text, then feed the text
            # plus extracted vision inputs through the processor.
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

        # Generate, then strip the prompt tokens so only new text is decoded.
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return output_text
    finally:
        # Always remove the temp file, even if processing or generation fails
        # (the original leaked the file on any exception).
        os.remove(src)
# Create Gradio interface
# Layout: left column = model picker + image input + examples + submit button;
# right column = extracted-text output + an info accordion. OCR runs both on
# explicit submit and whenever the image changes.
with gr.Blocks(title="Arabic OCR Models Demo") as demo:
    gr.Markdown("# Arabic OCR Models Demo")
    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")
    with gr.Row():
        with gr.Column(scale=1):
            # Model selection dropdown (keys of MODEL_CONFIGS); defaults to
            # the first configured model.
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_CONFIGS.keys()),
                value=list(MODEL_CONFIGS.keys())[0],
                label="Select OCR Model",
                interactive=True
            )
            # Input image as a numpy array (what perform_ocr expects).
            image_input = gr.Image(type="numpy", label="Upload Image")
            # Example gallery (sample files shipped with the Space).
            gr.Examples(
                examples=[
                    ["0.4.png"],
                    ["2.jpg"],
                    ["3.jpg"]
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4
            )
            # Submit button
            submit_btn = gr.Button("Extract Text")
        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
            # Model details — kept consistent with MODEL_CONFIGS above
            # (the previous text listed repo ids that did not match the
            # models actually loaded).
            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Available Models:**
                1. **KATIB OCR 0.8B 0.1**
                - Model: oddadmix/Katib-Qwen3.5-0.8B-0.3
                - Based on Qwen3.5
                - Size: 0.8B parameters
                2. **Qari OCR 0.2.2.1**
                - Model: oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged
                - Based on Qwen2-VL architecture
                - Size: 2B parameters
                **Context window:** Supports up to 2000 output tokens
                """)
    # Set up processing flow: run OCR on button click and on image change.
    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
demo.launch()