Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,13 +13,17 @@ import numpy as np
|
|
| 13 |
from PIL import Image
|
| 14 |
import cv2
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from transformers import (
|
| 17 |
Qwen2_5_VLForConditionalGeneration,
|
| 18 |
AutoProcessor,
|
| 19 |
TextIteratorStreamer,
|
| 20 |
)
|
| 21 |
|
| 22 |
-
# Try importing Qwen3VL
|
| 23 |
try:
|
| 24 |
from transformers import Qwen3VLForConditionalGeneration
|
| 25 |
QWEN3_AVAILABLE = True
|
|
@@ -246,25 +250,43 @@ def apply_gpu_duration(val: str):
|
|
| 246 |
return int(val)
|
| 247 |
|
| 248 |
# Model V: Nanonets-OCR2-3B (Kept)
|
|
|
|
| 249 |
MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
|
| 261 |
try:
|
| 262 |
-
processor_c1 = AutoProcessor.from_pretrained(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 264 |
MODEL_ID_C1,
|
| 265 |
attn_implementation="flash_attention_2",
|
| 266 |
trust_remote_code=True,
|
| 267 |
-
torch_dtype=torch.float16
|
|
|
|
|
|
|
|
|
|
| 268 |
).to(device).eval()
|
| 269 |
C1_AVAILABLE = True
|
| 270 |
print("✓ Chhagan_ML-VL-OCR-v1 loaded")
|
|
@@ -274,29 +296,8 @@ except Exception as e:
|
|
| 274 |
processor_c1 = None
|
| 275 |
model_c1 = None
|
| 276 |
|
| 277 |
-
# Model
|
| 278 |
-
|
| 279 |
-
C2_AVAILABLE = False
|
| 280 |
-
if QWEN3_AVAILABLE:
|
| 281 |
-
try:
|
| 282 |
-
processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
|
| 283 |
-
model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 284 |
-
MODEL_ID_C2,
|
| 285 |
-
attn_implementation="flash_attention_2",
|
| 286 |
-
trust_remote_code=True,
|
| 287 |
-
torch_dtype=torch.float16
|
| 288 |
-
).to(device).eval()
|
| 289 |
-
C2_AVAILABLE = True
|
| 290 |
-
print("✓ Chhagan-DocVL-Qwen3 loaded")
|
| 291 |
-
except Exception as e:
|
| 292 |
-
print(f"✗ Chhagan-DocVL-Qwen3 failed: {e}")
|
| 293 |
-
processor_c2 = None
|
| 294 |
-
model_c2 = None
|
| 295 |
-
else:
|
| 296 |
-
processor_c2 = None
|
| 297 |
-
model_c2 = None
|
| 298 |
-
|
| 299 |
-
# Model Q3: Qwen3-VL-2B-Instruct (NEW - Official)
|
| 300 |
MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
|
| 301 |
Q3_AVAILABLE = False
|
| 302 |
if QWEN3_AVAILABLE:
|
|
@@ -317,6 +318,11 @@ if QWEN3_AVAILABLE:
|
|
| 317 |
else:
|
| 318 |
processor_q3 = None
|
| 319 |
model_q3 = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
|
| 322 |
max_new_tokens: int, temperature: float, top_p: float,
|
|
@@ -338,6 +344,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 338 |
"""
|
| 339 |
# Select model and processor
|
| 340 |
if model_name == "Nanonets-OCR2-3B":
|
|
|
|
|
|
|
|
|
|
| 341 |
processor = processor_v
|
| 342 |
model = model_v
|
| 343 |
elif model_name == "Chhagan-ML-VL-OCR-v1":
|
|
@@ -346,12 +355,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 346 |
return
|
| 347 |
processor = processor_c1
|
| 348 |
model = model_c1
|
| 349 |
-
elif model_name == "Chhagan-DocVL-Qwen3":
|
| 350 |
-
if not C2_AVAILABLE:
|
| 351 |
-
yield "Chhagan-DocVL-Qwen3 model is not available. Requires transformers>=4.57", "Chhagan-DocVL-Qwen3 model is not available."
|
| 352 |
-
return
|
| 353 |
-
processor = processor_c2
|
| 354 |
-
model = model_c2
|
| 355 |
elif model_name == "Qwen3-VL-2B-Instruct":
|
| 356 |
if not Q3_AVAILABLE:
|
| 357 |
yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
|
|
@@ -367,7 +370,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 367 |
return
|
| 368 |
|
| 369 |
# Use multilingual prompt if user query is empty or simple
|
| 370 |
-
if not text or text.strip().lower() in ["ocr", "extract", "read"]:
|
| 371 |
text = MULTILINGUAL_OCR_PROMPT
|
| 372 |
|
| 373 |
messages = [{
|
|
@@ -421,15 +424,19 @@ image_examples = [
|
|
| 421 |
]
|
| 422 |
|
| 423 |
# Build model choices dynamically
|
| 424 |
-
model_choices = [
|
|
|
|
|
|
|
| 425 |
if C1_AVAILABLE:
|
| 426 |
model_choices.append("Chhagan-ML-VL-OCR-v1")
|
| 427 |
-
if C2_AVAILABLE:
|
| 428 |
-
model_choices.append("Chhagan-DocVL-Qwen3")
|
| 429 |
if Q3_AVAILABLE:
|
| 430 |
model_choices.append("Qwen3-VL-2B-Instruct")
|
| 431 |
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
|
| 434 |
gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
|
| 435 |
|
|
@@ -464,7 +471,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
|
| 464 |
model_choice = gr.Radio(
|
| 465 |
choices=model_choices,
|
| 466 |
label="Select Model",
|
| 467 |
-
value=model_choices[0]
|
| 468 |
)
|
| 469 |
|
| 470 |
with gr.Row(elem_id="gpu-duration-container"):
|
|
@@ -494,4 +501,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
|
| 494 |
)
|
| 495 |
|
| 496 |
if __name__ == "__main__":
|
| 497 |
-
demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
|
|
|
|
| 13 |
from PIL import Image
|
| 14 |
import cv2
|
| 15 |
|
| 16 |
+
# Clear any local cache conflicts
|
| 17 |
+
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
|
| 18 |
+
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
|
| 19 |
+
|
| 20 |
from transformers import (
|
| 21 |
Qwen2_5_VLForConditionalGeneration,
|
| 22 |
AutoProcessor,
|
| 23 |
TextIteratorStreamer,
|
| 24 |
)
|
| 25 |
|
| 26 |
+
# Try importing Qwen3VL
|
| 27 |
try:
|
| 28 |
from transformers import Qwen3VLForConditionalGeneration
|
| 29 |
QWEN3_AVAILABLE = True
|
|
|
|
| 250 |
return int(val)
|
| 251 |
|
| 252 |
# Model V: Nanonets-OCR2-3B (Kept)
|
| 253 |
+
print("Loading Nanonets-OCR2-3B...")
|
| 254 |
MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
|
| 255 |
+
try:
|
| 256 |
+
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
| 257 |
+
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 258 |
+
MODEL_ID_V,
|
| 259 |
+
attn_implementation="flash_attention_2",
|
| 260 |
+
trust_remote_code=True,
|
| 261 |
+
torch_dtype=torch.float16
|
| 262 |
+
).to(device).eval()
|
| 263 |
+
print("✓ Nanonets-OCR2-3B loaded")
|
| 264 |
+
NANONETS_AVAILABLE = True
|
| 265 |
+
except Exception as e:
|
| 266 |
+
print(f"✗ Nanonets-OCR2-3B failed: {e}")
|
| 267 |
+
NANONETS_AVAILABLE = False
|
| 268 |
+
processor_v = None
|
| 269 |
+
model_v = None
|
| 270 |
+
|
| 271 |
+
# Model C1: Chhagan_ML-VL-OCR-v1 (NEW - with proper cache handling)
|
| 272 |
+
print("Loading Chhagan_ML-VL-OCR-v1...")
|
| 273 |
MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
|
| 274 |
try:
|
| 275 |
+
processor_c1 = AutoProcessor.from_pretrained(
|
| 276 |
+
MODEL_ID_C1,
|
| 277 |
+
trust_remote_code=True,
|
| 278 |
+
cache_dir="/tmp/transformers_cache",
|
| 279 |
+
force_download=False,
|
| 280 |
+
local_files_only=False
|
| 281 |
+
)
|
| 282 |
model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 283 |
MODEL_ID_C1,
|
| 284 |
attn_implementation="flash_attention_2",
|
| 285 |
trust_remote_code=True,
|
| 286 |
+
torch_dtype=torch.float16,
|
| 287 |
+
cache_dir="/tmp/transformers_cache",
|
| 288 |
+
force_download=False,
|
| 289 |
+
local_files_only=False
|
| 290 |
).to(device).eval()
|
| 291 |
C1_AVAILABLE = True
|
| 292 |
print("✓ Chhagan_ML-VL-OCR-v1 loaded")
|
|
|
|
| 296 |
processor_c1 = None
|
| 297 |
model_c1 = None
|
| 298 |
|
| 299 |
+
# Model Q3: Qwen3-VL-2B-Instruct (Official)
|
| 300 |
+
print("Loading Qwen3-VL-2B-Instruct...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
|
| 302 |
Q3_AVAILABLE = False
|
| 303 |
if QWEN3_AVAILABLE:
|
|
|
|
| 318 |
else:
|
| 319 |
processor_q3 = None
|
| 320 |
model_q3 = None
|
| 321 |
+
print("✗ Qwen3VL architecture not available")
|
| 322 |
+
|
| 323 |
+
# Note: Chhagan-DocVL-Qwen3 has tokenizer compatibility issues, skipping
|
| 324 |
+
print("\n⚠️ Note: Chhagan-DocVL-Qwen3 skipped due to tokenizer compatibility issues")
|
| 325 |
+
print("Available alternative: Using official Qwen3-VL-2B-Instruct instead\n")
|
| 326 |
|
| 327 |
def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
|
| 328 |
max_new_tokens: int, temperature: float, top_p: float,
|
|
|
|
| 344 |
"""
|
| 345 |
# Select model and processor
|
| 346 |
if model_name == "Nanonets-OCR2-3B":
|
| 347 |
+
if not NANONETS_AVAILABLE:
|
| 348 |
+
yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
|
| 349 |
+
return
|
| 350 |
processor = processor_v
|
| 351 |
model = model_v
|
| 352 |
elif model_name == "Chhagan-ML-VL-OCR-v1":
|
|
|
|
| 355 |
return
|
| 356 |
processor = processor_c1
|
| 357 |
model = model_c1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
elif model_name == "Qwen3-VL-2B-Instruct":
|
| 359 |
if not Q3_AVAILABLE:
|
| 360 |
yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
|
|
|
|
| 370 |
return
|
| 371 |
|
| 372 |
# Use multilingual prompt if user query is empty or simple
|
| 373 |
+
if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
|
| 374 |
text = MULTILINGUAL_OCR_PROMPT
|
| 375 |
|
| 376 |
messages = [{
|
|
|
|
| 424 |
]
|
| 425 |
|
| 426 |
# Build model choices dynamically
|
| 427 |
+
model_choices = []
|
| 428 |
+
if NANONETS_AVAILABLE:
|
| 429 |
+
model_choices.append("Nanonets-OCR2-3B")
|
| 430 |
if C1_AVAILABLE:
|
| 431 |
model_choices.append("Chhagan-ML-VL-OCR-v1")
|
|
|
|
|
|
|
| 432 |
if Q3_AVAILABLE:
|
| 433 |
model_choices.append("Qwen3-VL-2B-Instruct")
|
| 434 |
|
| 435 |
+
if not model_choices:
|
| 436 |
+
model_choices = ["No models available"]
|
| 437 |
+
|
| 438 |
+
demo = gr.Blocks()
|
| 439 |
+
with demo:
|
| 440 |
gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
|
| 441 |
gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
|
| 442 |
|
|
|
|
| 471 |
model_choice = gr.Radio(
|
| 472 |
choices=model_choices,
|
| 473 |
label="Select Model",
|
| 474 |
+
value=model_choices[0] if model_choices else None
|
| 475 |
)
|
| 476 |
|
| 477 |
with gr.Row(elem_id="gpu-duration-container"):
|
|
|
|
| 501 |
)
|
| 502 |
|
| 503 |
if __name__ == "__main__":
|
| 504 |
+
demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
|