Spaces:
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ import cv2
|
|
| 16 |
from transformers import (
|
| 17 |
Qwen2VLForConditionalGeneration,
|
| 18 |
Qwen2_5_VLForConditionalGeneration,
|
|
|
|
| 19 |
AutoModelForImageTextToText,
|
| 20 |
AutoProcessor,
|
| 21 |
TextIteratorStreamer,
|
|
@@ -159,7 +160,7 @@ class RadioAnimated(gr.HTML):
|
|
| 159 |
uid = uuid.uuid4().hex[:8]
|
| 160 |
group_name = f"ra-{uid}"
|
| 161 |
|
| 162 |
-
inputs_html = "\n".join(
|
| 163 |
f"""
|
| 164 |
<input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
|
| 165 |
<label class="ra-label" for="{group_name}-{i}">{c}</label>
|
|
@@ -216,47 +217,52 @@ class RadioAnimated(gr.HTML):
|
|
| 216 |
def apply_gpu_duration(val: str):
|
| 217 |
return int(val)
|
| 218 |
|
|
|
|
| 219 |
MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
|
| 220 |
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
| 221 |
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 222 |
MODEL_ID_V,
|
| 223 |
-
attn_implementation="
|
| 224 |
trust_remote_code=True,
|
| 225 |
torch_dtype=torch.float16
|
| 226 |
).to(device).eval()
|
| 227 |
|
|
|
|
| 228 |
MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
| 229 |
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
|
| 230 |
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 231 |
MODEL_ID_X,
|
| 232 |
-
attn_implementation="
|
| 233 |
trust_remote_code=True,
|
| 234 |
torch_dtype=torch.float16
|
| 235 |
).to(device).eval()
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
|
|
|
| 242 |
trust_remote_code=True,
|
| 243 |
torch_dtype=torch.float16
|
| 244 |
).to(device).eval()
|
| 245 |
|
|
|
|
| 246 |
MODEL_ID_W = "allenai/olmOCR-7B-0725"
|
| 247 |
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
| 248 |
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 249 |
MODEL_ID_W,
|
| 250 |
-
attn_implementation="
|
| 251 |
trust_remote_code=True,
|
| 252 |
torch_dtype=torch.float16
|
| 253 |
).to(device).eval()
|
| 254 |
|
|
|
|
| 255 |
MODEL_ID_M = "reducto/RolmOCR"
|
| 256 |
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
|
| 257 |
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 258 |
MODEL_ID_M,
|
| 259 |
-
attn_implementation="
|
| 260 |
trust_remote_code=True,
|
| 261 |
torch_dtype=torch.float16
|
| 262 |
).to(device).eval()
|
|
@@ -288,9 +294,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 288 |
elif model_name == "Nanonets-OCR2-3B":
|
| 289 |
processor = processor_v
|
| 290 |
model = model_v
|
| 291 |
-
elif model_name == "
|
| 292 |
-
processor =
|
| 293 |
-
model =
|
| 294 |
elif model_name == "olmOCR-7B-0725":
|
| 295 |
processor = processor_w
|
| 296 |
model = model_w
|
|
@@ -346,7 +352,7 @@ image_examples = [
|
|
| 346 |
["Convert this page to docling", "examples/3.jpg"],
|
| 347 |
]
|
| 348 |
|
| 349 |
-
with gr.Blocks() as demo:
|
| 350 |
gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
|
| 351 |
with gr.Row():
|
| 352 |
with gr.Column(scale=2):
|
|
@@ -374,7 +380,7 @@ with gr.Blocks() as demo:
|
|
| 374 |
|
| 375 |
model_choice = gr.Radio(
|
| 376 |
choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
|
| 377 |
-
|
| 378 |
label="Select Model",
|
| 379 |
value="Nanonets-OCR2-3B"
|
| 380 |
)
|
|
@@ -405,4 +411,4 @@ with gr.Blocks() as demo:
|
|
| 405 |
)
|
| 406 |
|
| 407 |
if __name__ == "__main__":
|
| 408 |
-
demo.queue(max_size=50).launch(
|
|
|
|
| 16 |
from transformers import (
|
| 17 |
Qwen2VLForConditionalGeneration,
|
| 18 |
Qwen2_5_VLForConditionalGeneration,
|
| 19 |
+
Qwen3VLForConditionalGeneration,
|
| 20 |
AutoModelForImageTextToText,
|
| 21 |
AutoProcessor,
|
| 22 |
TextIteratorStreamer,
|
|
|
|
| 160 |
uid = uuid.uuid4().hex[:8]
|
| 161 |
group_name = f"ra-{uid}"
|
| 162 |
|
| 163 |
+
inputs_html = "\n".join(
|
| 164 |
f"""
|
| 165 |
<input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
|
| 166 |
<label class="ra-label" for="{group_name}-{i}">{c}</label>
|
|
|
|
| 217 |
def apply_gpu_duration(val: str):
|
| 218 |
return int(val)
|
| 219 |
|
| 220 |
+
# Model V: Nanonets-OCR2-3B
|
| 221 |
MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
|
| 222 |
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
| 223 |
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 224 |
MODEL_ID_V,
|
| 225 |
+
attn_implementation="flash_attention_2",
|
| 226 |
trust_remote_code=True,
|
| 227 |
torch_dtype=torch.float16
|
| 228 |
).to(device).eval()
|
| 229 |
|
| 230 |
+
# Model X: Qwen2-VL-OCR-2B
|
| 231 |
MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
| 232 |
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
|
| 233 |
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 234 |
MODEL_ID_X,
|
| 235 |
+
attn_implementation="flash_attention_2",
|
| 236 |
trust_remote_code=True,
|
| 237 |
torch_dtype=torch.float16
|
| 238 |
).to(device).eval()
|
| 239 |
|
| 240 |
+
# Model C: Chhagan-DocVL-Qwen3 (NEW)
|
| 241 |
+
MODEL_ID_C = "Chhagan005/Chhagan-DocVL-Qwen3"
|
| 242 |
+
processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
|
| 243 |
+
model_c = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 244 |
+
MODEL_ID_C,
|
| 245 |
+
attn_implementation="flash_attention_2",
|
| 246 |
trust_remote_code=True,
|
| 247 |
torch_dtype=torch.float16
|
| 248 |
).to(device).eval()
|
| 249 |
|
| 250 |
+
# Model W: olmOCR-7B-0725
|
| 251 |
MODEL_ID_W = "allenai/olmOCR-7B-0725"
|
| 252 |
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
| 253 |
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 254 |
MODEL_ID_W,
|
| 255 |
+
attn_implementation="flash_attention_2",
|
| 256 |
trust_remote_code=True,
|
| 257 |
torch_dtype=torch.float16
|
| 258 |
).to(device).eval()
|
| 259 |
|
| 260 |
+
# Model M: RolmOCR
|
| 261 |
MODEL_ID_M = "reducto/RolmOCR"
|
| 262 |
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
|
| 263 |
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 264 |
MODEL_ID_M,
|
| 265 |
+
attn_implementation="flash_attention_2",
|
| 266 |
trust_remote_code=True,
|
| 267 |
torch_dtype=torch.float16
|
| 268 |
).to(device).eval()
|
|
|
|
| 294 |
elif model_name == "Nanonets-OCR2-3B":
|
| 295 |
processor = processor_v
|
| 296 |
model = model_v
|
| 297 |
+
elif model_name == "Chhagan-DocVL-Qwen3":
|
| 298 |
+
processor = processor_c
|
| 299 |
+
model = model_c
|
| 300 |
elif model_name == "olmOCR-7B-0725":
|
| 301 |
processor = processor_w
|
| 302 |
model = model_w
|
|
|
|
| 352 |
["Convert this page to docling", "examples/3.jpg"],
|
| 353 |
]
|
| 354 |
|
| 355 |
+
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
| 356 |
gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
|
| 357 |
with gr.Row():
|
| 358 |
with gr.Column(scale=2):
|
|
|
|
| 380 |
|
| 381 |
model_choice = gr.Radio(
|
| 382 |
choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
|
| 383 |
+
"Chhagan-DocVL-Qwen3", "Qwen2-VL-OCR-2B"],
|
| 384 |
label="Select Model",
|
| 385 |
value="Nanonets-OCR2-3B"
|
| 386 |
)
|
|
|
|
| 411 |
)
|
| 412 |
|
| 413 |
if __name__ == "__main__":
|
| 414 |
+
demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
|