Files changed (1) hide show
  1. app.py +276 -168
app.py CHANGED
@@ -1,191 +1,299 @@
1
- import gradio as gr
 
 
 
 
 
 
 
2
  import time
3
- import spaces
4
  from PIL import Image
5
- from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
 
6
  from qwen_vl_utils import process_vision_info
7
- import torch
8
- import uuid
9
- import os
10
- import numpy as np
11
 
12
- # Model configurations
13
- MODEL_CONFIGS = {
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- "KATIB OCR 0.8B 0.1": {
16
- "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
17
- "class": AutoModelForImageTextToText,
18
- "prompt": "Free OCR.",
19
- "use_qwen3": True
20
- },
21
- "Qari OCR 0.2.2.1": {
22
- "name": "oddadmix/Qari-OCR-0.2.2.1-VL-2B-Instruct-merged",
23
- "class": Qwen2VLForConditionalGeneration,
24
- "prompt": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate.",
25
- "use_qwen3": False
26
- }
27
- }
28
 
29
- # Load models
30
- models = {}
31
- processors = {}
32
-
33
- for model_key, config in MODEL_CONFIGS.items():
34
- print(f"Loading {model_key}...")
35
- models[model_key] = config["class"].from_pretrained(
36
- config["name"],
37
- torch_dtype="auto",
38
- device_map="cuda"
 
 
 
 
 
39
  )
40
- processors[model_key] = AutoProcessor.from_pretrained(config["name"])
 
 
 
41
 
42
- max_tokens = 2000
 
 
 
 
 
 
43
 
44
- def resizeImage(image):
45
- if image.height > 1500:
46
- image = image.resize((int(image.width * 1500 / image.height), 1500), Image.Resampling.LANCZOS)
 
 
 
 
 
 
 
 
 
47
  return image
48
 
49
- @spaces.GPU
50
- def perform_ocr(image, model_choice):
51
- inputArray = np.any(image)
52
- if inputArray == False:
53
- return "Error Processing"
54
-
55
- """Process image and extract text using selected OCR model"""
56
- image = Image.fromarray(image)
57
-
58
- # Get model configuration
59
- config = MODEL_CONFIGS[model_choice]
60
- model = models[model_choice]
61
- processor = processors[model_choice]
62
- prompt = config["prompt"]
63
- use_qwen3 = config["use_qwen3"]
64
-
65
- # Resize image for Qwen3 model
 
 
 
 
66
 
67
- # image = resizeImage(image)
68
- print("Image resized")
69
-
70
- src = str(uuid.uuid4()) + ".png"
71
- image.save(src)
72
- print(src)
73
- # Prepare messages based on model type
74
- if use_qwen3:
75
- messages = [
76
- {
77
- "role": "user",
78
- "content": [
79
- {"type": "image", "image": f"./{src}"},
80
- {"type": "text", "text": prompt},
81
- ],
82
- }
83
- ]
84
- else:
85
- messages = [
86
- {
87
- "role": "user",
88
- "content": [
89
- {"type": "image", "image": f"file://{src}"},
90
- {"type": "text", "text": prompt},
91
- ],
92
- }
93
- ]
94
-
95
- # Process inputs based on model type
96
- if use_qwen3:
97
- inputs = processor.apply_chat_template(
98
- messages,
99
- tokenize=True,
100
- add_generation_prompt=True,
101
- return_dict=True,
102
- return_tensors="pt"
103
- )
104
- inputs = inputs.to(model.device)
105
- else:
106
- text = processor.apply_chat_template(
107
  messages, tokenize=False, add_generation_prompt=True
108
  )
109
- image_inputs, video_inputs = process_vision_info(messages)
 
110
  inputs = processor(
111
- text=[text],
112
  images=image_inputs,
113
- videos=video_inputs,
114
  padding=True,
115
- return_tensors="pt",
116
- )
117
- inputs = inputs.to("cuda")
118
-
119
- # Generate text
120
- generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
121
- generated_ids_trimmed = [
122
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
123
- ]
124
- output_text = processor.batch_decode(
125
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
126
- )[0]
127
-
128
- # Cleanup
129
- os.remove(src)
130
- return output_text
131
-
132
- # Create Gradio interface
133
- with gr.Blocks(title="Arabic OCR Models Demo") as demo:
134
- gr.Markdown("# Arabic OCR Models Demo")
135
- gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")
136
-
137
- with gr.Row():
138
- with gr.Column(scale=1):
139
- # Model selection dropdown
140
- model_dropdown = gr.Dropdown(
141
- choices=list(MODEL_CONFIGS.keys()),
142
- value=list(MODEL_CONFIGS.keys())[0],
143
- label="Select OCR Model",
144
- interactive=True
145
- )
146
-
147
- # Input image
148
- image_input = gr.Image(type="numpy", label="Upload Image")
149
-
150
- # Example gallery
151
- gr.Examples(
152
- examples=[
153
- ["0.4.png"],
154
- ["2.jpg"],
155
- ["3.jpg"]
156
- ],
157
- inputs=image_input,
158
- label="Example Images",
159
- examples_per_page=4
160
  )
161
-
162
- # Submit button
163
- submit_btn = gr.Button("Extract Text")
164
-
165
- with gr.Column(scale=1):
166
- # Output text
167
- output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
168
-
169
- # Model details
170
- with gr.Accordion("Model Information", open=False):
171
- gr.Markdown("""
172
- **Available Models:**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- 1. **KATIB OCR 0.1 0.8B **
175
- - Model: oddadmix/Katib-Qwen3.5-0.8B-0.1
176
- - Based on Qwen3.5
177
- - Size: 0.8B parameters
 
178
 
179
- 2. **Qari OCR 0.2.2.1**
180
- - Model: NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct
181
- - Based on Qwen2-VL architecture
182
- - Size: 2B parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- **Context window:** Supports up to 2000 output tokens
185
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- # Set up processing flow
188
- submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
189
- image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
190
 
191
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ๐Ÿค– Arabic OCR - Hugging Face Spaces Version
4
+ Model: Qwen3.5-0.8B-VL with LoRA
5
+ No Quantization - Full Precision
6
+ """
7
+
8
+ import os
9
  import time
10
+ import torch
11
  from PIL import Image
12
+ import gradio as gr
13
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
14
  from qwen_vl_utils import process_vision_info
 
 
 
 
15
 
# ==================== Device selection ====================
def _pick_device():
    """Return (device, dtype) for the best available backend.

    Prefers CUDA, then Apple Silicon (MPS), then CPU. Half precision is
    used on accelerators; full float32 on CPU.
    """
    if torch.cuda.is_available():
        print(f"โœ… Using GPU: {torch.cuda.get_device_name(0)}")
        return "cuda", torch.float16
    if torch.backends.mps.is_available():
        print("โœ… Using Apple Silicon (MPS)")
        return "mps", torch.float16
    print("โš ๏ธ Using CPU (slower inference)")
    return "cpu", torch.float32

# Module-level device/dtype used by model loading and inference below.
device, dtype = _pick_device()
print(f"[INFO] Device: {device} | Dtype: {dtype}")
 
# ==================== Model loading ====================
def load_model():
    """Load the OCR model and processor once at startup.

    The checkpoint path comes from the MODEL_PATH environment variable,
    falling back to the published default.

    Returns:
        (model, processor) tuple, with the model in eval mode and placed
        on the selected device.
    """
    model_path = os.getenv("MODEL_PATH", "sherif1313/Arabic-Qwen3.5-OCR-v4")

    print(f"[INFO] Loading model from: {model_path}")

    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    model = Qwen3_5ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=dtype,
        device_map="auto" if device == "cuda" else None,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Fix: with device_map=None (MPS/CPU path) the model is left on CPU by
    # from_pretrained, but inference later moves inputs to `device`; move
    # the model explicitly so tensors and weights agree on MPS.
    if device != "cuda":
        model = model.to(device)

    model.eval()
    print("[INFO] Model loaded successfully!")
    return model, processor

# Global load (runs once when the app starts). On failure the globals are
# set to None and extract_text reports the error instead of crashing.
try:
    model, processor = load_model()
except Exception as e:
    print(f"[ERROR] Failed to load model: {e}")
    model = None
    processor = None
 
# ==================== Helper functions ====================
def prepare_image(image: Image.Image, max_size: int = 768) -> Image.Image:
    """Prepare an image for the vision encoder.

    Downscales so the longer side is at most *max_size*, then rounds both
    dimensions up to the nearest multiple of 64 (patch alignment expected
    by the model -- TODO confirm against the processor config).

    Args:
        image: source PIL image.
        max_size: longest allowed side before patch-snapping.

    Returns:
        A new (or unchanged) PIL image; the input is never mutated.
    """
    if max(image.size) > max_size:
        # Fix: Image.thumbnail resizes IN PLACE, which mutated the
        # caller's image; work on a copy instead.
        image = image.copy()
        image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

    w, h = image.size
    # Round each dimension up to the next multiple of 64.
    new_w = ((w + 63) // 64) * 64
    new_h = ((h + 63) // 64) * 64
    if (new_w, new_h) != (w, h):
        image = image.resize((new_w, new_h), Image.Resampling.LANCZOS)

    return image
 
def clean_output(text: str, max_repetitions: int = 2) -> str:
    """Tidy model output by squashing runaway repetition.

    Runs of five or more identical characters are shortened to three,
    blank lines are dropped, and any non-blank line may occur at most
    ``max_repetitions`` times (comparison ignores surrounding whitespace).
    """
    if not text:
        return text

    import re
    squashed = re.sub(r'(.)\1{4,}', r'\1\1\1', text)

    kept = []
    occurrences = {}
    for raw_line in squashed.strip().split('\n'):
        key = raw_line.strip()
        if not key:
            continue
        occurrences[key] = occurrences.get(key, 0) + 1
        if occurrences[key] <= max_repetitions:
            kept.append(raw_line)

    return '\n'.join(kept).strip()
 
# ==================== Inference ====================
def extract_text(image, prompt: str = None) -> tuple[str, str]:
    """Run OCR on *image* and return (extracted_text, elapsed_time_label).

    Args:
        image: a file path (str), a PIL.Image, or a numpy-like array
            accepted by Image.fromarray.
        prompt: optional instruction for the model; a default Arabic
            "read all the text" prompt is used when empty.

    Returns:
        Tuple of (text or error message, human-readable timing string).
        Error paths return "0.00" as the timing value.
    """
    # Model loading happens at import time and may have failed; report
    # instead of raising so the UI stays usable.
    if model is None or processor is None:
        return "โŒ Error: Model not loaded", "0.00"

    if image is None:
        return "โš ๏ธ Please upload an image", "0.00"

    start_time = time.time()

    try:
        # Normalize all accepted input kinds to an RGB PIL image.
        if isinstance(image, str):
            image_pil = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image_pil = image.convert("RGB")
        else:
            image_pil = Image.fromarray(image).convert("RGB")

        # Downscale / patch-align before encoding.
        image_pil = prepare_image(image_pil)

        if prompt is None or not prompt.strip():
            prompt = "ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ ูƒุงู…ู„ุงู‹ ู…ู† ุงู„ุจุฏุงูŠุฉ ุฅู„ู‰ ุงู„ู†ู‡ุงูŠุฉ."

        # Chat-style message with the image passed as a PIL object.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_pil},
                {"type": "text", "text": prompt}
            ]
        }]

        text_input = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Video inputs are unused by this app; only images are extracted.
        image_inputs, _ = process_vision_info(messages)

        inputs = processor(
            text=[text_input],
            images=image_inputs,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # Greedy decoding with repetition guards; temperature is inert
        # when do_sample=False.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                temperature=1.0,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Strip the prompt tokens; decode only the generated suffix.
        input_len = inputs.input_ids.shape[1]
        output_text = processor.batch_decode(
            generated_ids[:, input_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Post-process: squash repeated chars/lines (see clean_output).
        output_text = clean_output(output_text.strip())

        elapsed = time.time() - start_time

        return output_text, f"{elapsed:.2f} seconds"

    except torch.cuda.OutOfMemoryError:
        # Free cached blocks so the next (smaller) request can succeed.
        torch.cuda.empty_cache()
        return "โŒ Out of Memory. Try a smaller image.", "0.00"
    except Exception as e:
        # Top-level UI boundary: log full traceback, surface the message.
        print(f"[ERROR] {e}")
        import traceback
        traceback.print_exc()
        return f"โŒ Error: {str(e)}", "0.00"
+
175
+ # ==================== ๐ŸŽจ ูˆุงุฌู‡ุฉ Gradio ====================
176
+ def create_interface():
177
+ """ุฅู†ุดุงุก ูˆุงุฌู‡ุฉ ุงู„ู…ุณุชุฎุฏู…"""
178
+
179
+ with gr.Blocks(
180
+ title="Arabic OCR - Qwen3.5-0.8B"
181
+ # theme and css removed from here โ€“ moved to launch()
182
+ ) as demo:
183
+
184
+ gr.Markdown("""
185
+ # ๐Ÿ“ Arabic Handwritten & Printed OCR V4
186
+ ### Powered by Qwen3.5-0.8B
187
+
188
+ Upload an image containing Arabic text, and the model will extract it.
189
+
190
+ โœจ **Features:**
191
+ - ๐ŸŒ Arabic support
192
+ - โœ๏ธ Handwritten & printed text
193
+ - ๐Ÿ”ค Preserves diacritics (ุชุดูƒูŠู„)
194
+ - โšก Full precision (no quantization)
195
+ """, elem_classes="header")
196
+
197
+ with gr.Row():
198
+ with gr.Column(scale=1):
199
+ image_input = gr.Image(
200
+ label="๐Ÿ“ท Upload Image",
201
+ type="pil",
202
+ height=300,
203
+ sources=["upload", "clipboard"]
204
+ )
205
+
206
+ prompt_input = gr.Textbox(
207
+ label="๐Ÿ“ Custom Prompt (Optional)",
208
+ placeholder="ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ...",
209
+ value="ุงู‚ุฑุฃ ุงู„ู†ุต ููŠ ู‡ุฐู‡ ุงู„ุตูˆุฑุฉ ูƒุงู…ู„ุงู‹ ู…ู† ุงู„ุจุฏุงูŠุฉ ุฅู„ู‰ ุงู„ู†ู‡ุงูŠุฉ.",
210
+ lines=2
211
+ )
212
 
213
+ submit_btn = gr.Button(
214
+ "๐Ÿ” Extract Text",
215
+ variant="primary",
216
+ size="lg"
217
+ )
218
 
219
+ # Examples โ€“ use local files or remote URLs (remote may fail in some environments)
220
+ # For production, copy images to an 'examples' folder and use local paths.
221
+ gr.Examples(
222
+ label="๐Ÿ“‹ Examples (Optional)",
223
+ examples=[
224
+ # You can replace these with local files like ["examples/sample1.jpg"]
225
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00002.png"],
226
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00106.png"],
227
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00107.png"],
228
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00113.png"],
229
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00126.png"],
230
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00135.png"],
231
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00141.png"],
232
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00197.png"],
233
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00198.png"],
234
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00199.png"],
235
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00216.png"],
236
+ ["https://huggingface.co/sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2/resolve/main/assets/00240.png"],
237
+ ],
238
+ inputs=[image_input],
239
+ cache_examples=False
240
+ )
241
 
242
+ with gr.Column(scale=1):
243
+ # Removed show_copy_button parameter (not available in older Gradio)
244
+ output_text = gr.Textbox(
245
+ label="๐Ÿ“„ Extracted Text",
246
+ lines=12,
247
+ elem_classes="output-box"
248
+ )
249
+
250
+ time_output = gr.Textbox(
251
+ label="โฑ๏ธ Inference Time",
252
+ interactive=False,
253
+ value="-"
254
+ )
255
+
256
+ clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear", variant="secondary")
257
+
258
+ # ุฑุจุท ุงู„ุฃุญุฏุงุซ
259
+ submit_btn.click(
260
+ fn=extract_text,
261
+ inputs=[image_input, prompt_input],
262
+ outputs=[output_text, time_output]
263
+ )
264
+
265
+ clear_btn.click(
266
+ fn=lambda: (None, "", "-"),
267
+ inputs=[],
268
+ outputs=[image_input, prompt_input, time_output]
269
+ )
270
+
271
+ gr.Markdown("""
272
+ ### ๐Ÿ’ก Tips for Best Results:
273
+ 1. Use clear, well-lit images
274
+ 2. Crop to the text region if possible
275
+ 3. For handwritten text, ensure good contrast
276
+ 4. Custom prompts can improve accuracy for specific formats
277
+ """)
278
 
279
+ return demo
 
 
# ==================== Entry point ====================
if __name__ == "__main__":
    print("[INFO] Creating Gradio interface...")

    demo = create_interface()

    # NOTE: theme and css are gr.Blocks() constructor arguments, not
    # launch() arguments -- passing them to launch() raises a TypeError
    # on current Gradio releases, so they are not supplied here.
    demo.launch(
        server_name="0.0.0.0",            # listen on all interfaces (Spaces/containers)
        server_port=int(os.getenv("PORT", 7860)),
        share=False,
        debug=os.getenv("DEBUG", "false").lower() == "true",
        show_error=True,
    )