khodour committed on
Commit
90c31bc
·
verified ·
1 Parent(s): b1d157c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -17
app.py CHANGED
@@ -1,23 +1,112 @@
1
  import gradio as gr
2
- from ollama import chat # make sure you have `ollama` (not ollama-client) in requirements.txt
 
 
 
 
 
 
 
 
3
 
4
- def ocr_fn(image):
5
- if image is None:
6
- return "⚠️ Please upload an image first."
7
- # call your Ollama model
8
- resp = chat(
9
- model="Qwen/Qwen2.5-VL-7B-Instruct",
10
- images=[image],
11
- messages=[{"role":"user","content":"Extract all Arabic text, preserving layout."}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  )
13
- return resp
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- with gr.Blocks() as demo:
16
- gr.Markdown("## Arabic OCR Demo")
 
 
 
17
  with gr.Row():
18
- inp = gr.Image(type="pil", label="📤 Upload an Arabic text image")
19
- out = gr.Textbox(lines=10, label="📋 Extracted Text")
20
- btn = gr.Button("Submit")
21
- btn.click(fn=ocr_fn, inputs=inp, outputs=out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- demo.launch()
 
1
  import gradio as gr
2
+ import time
3
+ import spaces
4
+ from PIL import Image
5
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
6
+ from qwen_vl_utils import process_vision_info
7
+ import torch
8
+ import uuid
9
+ import os
10
+ import numpy as np
11
 
12
# Load model and processor
# The same checkpoint identifier is used for the weights and the processor.
# NOTE(review): the identifier mentions Florence-2, yet it is loaded through the
# Qwen2-VL class and the UI text describes "Qari-OCR-0.1-VL-2B-Instruct" —
# confirm that this is the intended checkpoint.
model_name = "gagan3012/Florence-2-FT-ArabicOCR"

# Loading happens once at import time; device_map="cuda" requests GPU placement.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name, torch_dtype="auto", device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_name)

# Passed as max_new_tokens to model.generate() for every OCR request.
max_tokens = 2000
21
+
22
+
23
@spaces.GPU(duration=120)
def perform_ocr(image):
    """Run the OCR model on one image and return the text it generates.

    Args:
        image: Pixel data as a NumPy array, or ``None`` when no image is
            available (the UI wires this function to a
            ``gr.Image(type="numpy")`` component).

    Returns:
        The decoded model output with the prompt tokens stripped, or the
        string ``"Error Processing"`` when ``image`` is ``None`` or contains
        no non-zero value.
    """
    # Test for None explicitly rather than depending on what np.any(None)
    # evaluates to; an all-zero array is still rejected exactly as before.
    if image is None or not np.any(image):
        return "Error Processing"

    src = f"{uuid.uuid4()}.png"
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."

    try:
        # The image is handed to the vision helper through a file:// reference,
        # so it is first written to a uniquely named PNG.
        Image.fromarray(image).save(src)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{src}"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Process inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        # Generate text
        generated_ids = model.generate(
            **inputs, max_new_tokens=max_tokens, use_cache=True
        )
        # Drop the first len(input_ids) tokens of every sequence so that only
        # the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Cleanup: remove the temporary PNG even when processing or
        # generation raised, so failed requests do not leave files behind.
        if os.path.exists(src):
            os.remove(src)
70
 
71
# Create Gradio interface
with gr.Blocks(title="Qari Arabic OCR") as demo:
    # Page heading and short usage blurb.
    gr.Markdown("# Qari Arabic OCR")
    gr.Markdown("Upload an image to extract Arabic text in real-time. This model is specialized for Arabic document OCR.")

    with gr.Row():
        # Left column: image upload, example images and the trigger button.
        with gr.Column(scale=1):
            image_input = gr.Image(type="numpy", label="Upload Image")

            gr.Examples(
                examples=[["2.jpg"], ["3.jpg"]],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )

            submit_btn = gr.Button("Extract Text")

        # Right column: extracted text plus a collapsed model description.
        with gr.Column(scale=1):
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Model:** Qari-OCR-0.1-VL-2B-Instruct
                **Description:** Arabic OCR model based on Qwen2-VL architecture
                **Size:** 2B parameters
                **Context window:** Supports up to 2000 output tokens
                """)

    # Set up processing flow: OCR runs on a button click and also whenever
    # the image component's value changes.
    for register in (submit_btn.click, image_input.change):
        register(fn=perform_ocr, inputs=image_input, outputs=output)

demo.launch(debug=True)