Update app.py
app.py CHANGED
@@ -9,15 +9,11 @@ Features:
12 models: 6 Qwen VL Instruct + 6 Custom CSM/Chhagan VL models across all tabs.
"""

-
import os
-import random
-import uuid
-import json
import time
import warnings
from threading import Thread
-from typing import Optional, Tuple, Dict, Any, List
+from typing import Optional, Tuple, Dict, Any, List

from qwen_vl_utils import process_vision_info

@@ -28,7 +24,6 @@ import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2

-
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
@@ -36,17 +31,14 @@ from transformers import (
    TextIteratorStreamer,
)

-
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Suppress warnings
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
warnings.filterwarnings('ignore', message='.*meta device.*')

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Custom Premium Theme
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -57,7 +49,6 @@ colors.deep_indigo = colors.Color(
    c800="#3730A3", c900="#312E81", c950="#1E1B4B",
)

-
colors.cyber_teal = colors.Color(
    name="cyber_teal",
    c50="#F0FDFA", c100="#CCFBF1", c200="#99F6E4", c300="#5EEAD4",
@@ -65,7 +56,6 @@ colors.cyber_teal = colors.Color(
    c800="#115E59", c900="#134E4A", c950="#042F2E",
)

-
class PremiumTheme(Soft):
    def __init__(self):
        super().__init__(
@@ -96,10 +86,8 @@ class PremiumTheme(Soft):
            block_label_background_fill="*primary_100",
        )

-
premium_theme = PremiumTheme()

-
css = """
#app-title h1 {
    font-size: 2.5em !important;
@@ -133,17 +121,10 @@ css = """
    padding: 12px;
    background: var(--background-fill-secondary);
}
-.face-box {
-
-    border-radius: 8px;
-}
-.sig-box {
-    border: 3px solid #3b82f6;
-    border-radius: 8px;
-}
+.face-box { border: 3px solid #22c55e; border-radius: 8px; }
+.sig-box { border: 3px solid #3b82f6; border-radius: 8px; }
"""

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Device & Constants
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -152,10 +133,8 @@ DEFAULT_MAX_NEW_TOKENS = 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"π Using device: {device}")

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# ALL 12 MODELS
-# 6 Qwen Instruct (original) + 6 Custom CSM/Chhagan (replaced Thinking models)
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
ALL_MODELS = [
    # ββ Qwen Official Instruct Models ββ
@@ -174,65 +153,113 @@ ALL_MODELS = [
    "Chhagan005/Chhagan-DocVL-Qwen3",
]

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-# Lazy Model Loading
+# Lazy Model Loading
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
_model_cache: Dict[str, Tuple[Any, Any]] = {}

-
def get_model_class(model_id: str):
    if "Qwen2.5" in model_id:
        return Qwen2_5_VLForConditionalGeneration
    return Qwen3VLForConditionalGeneration

-
def load_model(model_id: str):
    if model_id in _model_cache:
        return _model_cache[model_id]
-
    print(f"β³ Loading model: {model_id}")
    model_cls = get_model_class(model_id)
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = model_cls.from_pretrained(
-            model_id,
+            model_id, torch_dtype=dtype, device_map="auto", trust_remote_code=True,
        )
    model.eval()
-
    _model_cache[model_id] = (processor, model)
    print(f"β Model {model_id} loaded on {device}")
    return processor, model

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-#
+# Pre-load default model
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
DEFAULT_MODEL = "Qwen/Qwen3-VL-8B-Instruct"
print(f"β³ Pre-loading default model at startup: {DEFAULT_MODEL}")
load_model(DEFAULT_MODEL)
print(f"β Default model ready!")

+# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
+# β CORE FIX: Universal Input Processor
+# Handles both standard Qwen templates AND custom CSM jinja templates
+# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
+def _flatten_messages_for_custom_template(messages):
+    """
+    Custom CSM/Chhagan models have jinja templates that expect
+    plain string content, not multimodal list-of-dicts.
+    This flattens content lists β string, extracts PIL images separately.
+    """
+    flat_messages = []
+    extracted_images = []
+    for msg in messages:
+        content = msg.get("content", "")
+        if isinstance(content, list):
+            parts = []
+            for item in content:
+                if isinstance(item, dict):
+                    if item.get("type") == "image":
+                        img = item.get("image")
+                        if img is not None:
+                            extracted_images.append(img)
+                        # Qwen vision special token placeholder
+                        parts.append("<|vision_start|><|image_pad|><|vision_end|>")
+                    elif item.get("type") == "text":
+                        parts.append(item.get("text", ""))
+            flat_messages.append({"role": msg["role"], "content": "".join(parts)})
+        else:
+            flat_messages.append(msg)
+    return flat_messages, extracted_images


def prepare_inputs(processor, model, messages):
+    """
+    Attempt 1 β Standard multimodal path (works for official Qwen models).
+    Attempt 2 β Flatten fallback (works for custom CSM/Chhagan jinja templates).
+    """
+    # ββ Attempt 1: Standard multimodal ββββββββββββββββββββββ
+    try:
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs if image_inputs else None,
+            videos=video_inputs if video_inputs else None,
+            padding=True,
+            return_tensors="pt",
+        )
+        return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+    except TypeError:
+        # Custom template doesn't support list content β use fallback
+        pass
+
+    # ββ Attempt 2: Flatten for custom jinja templates ββββββββ
+    flat_msgs, extracted_images = _flatten_messages_for_custom_template(messages)
    text = processor.apply_chat_template(
-        messages,
+        flat_msgs,
        tokenize=False,
        add_generation_prompt=True,
    )
-    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
-        images=
-        videos=video_inputs if video_inputs else None,
+        images=extracted_images if extracted_images else None,
        padding=True,
        return_tensors="pt",
    )
    return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Utility Functions
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
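Note: the new prepare_inputs path above falls back to flattened, text-only chat messages for the custom CSM/Chhagan templates. A minimal, self-contained sketch of that flattening behaviour follows; the `flatten` helper here is a condensed restatement written for this note (only Pillow is assumed), not the app's own function.

```python
# Illustrative sketch: a multimodal message list is flattened into a plain
# string plus a separate image list before hitting a custom chat template.
from PIL import Image

def flatten(messages):
    flat, images = [], []
    for msg in messages:
        content = msg.get("content", "")
        if isinstance(content, list):
            parts = []
            for item in content:
                if item.get("type") == "image":
                    images.append(item["image"])
                    parts.append("<|vision_start|><|image_pad|><|vision_end|>")
                elif item.get("type") == "text":
                    parts.append(item.get("text", ""))
            flat.append({"role": msg["role"], "content": "".join(parts)})
        else:
            flat.append(msg)
    return flat, images

msgs = [{"role": "user", "content": [
    {"type": "image", "image": Image.new("RGB", (32, 32))},
    {"type": "text", "text": "Extract all text from this document."},
]}]
flat, imgs = flatten(msgs)
print(flat[0]["content"])  # image placeholder token followed by the prompt text
print(len(imgs))           # 1
```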
@@ -247,14 +274,12 @@ def ensure_rgb(image: Image.Image) -> Optional[Image.Image]:
        return image.convert("RGB")
    return image

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# π Face Detection, Signature Extraction & Annotation Engine
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def detect_faces(image: Image.Image):
    img_array = np.array(image)
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
@@ -262,7 +287,6 @@ def detect_faces(image: Image.Image):
        gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
-
    if len(faces) == 0:
        profile_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_profileface.xml'
@@ -270,12 +294,10 @@ def detect_faces(image: Image.Image):
        faces = profile_cascade.detectMultiScale(
            gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        )
-
    if len(faces) == 0:
        return None, []

    faces_sorted = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
-
    x, y, w, h = faces_sorted[0]
    pad = int(0.2 * max(w, h))
    x1 = max(0, x - pad)
@@ -296,7 +318,7 @@ def detect_faces(image: Image.Image):
        y2 = min(img_array.shape[0], y + h + pad)
        face_gray2 = gray[y1:y2, x1:x2]
        if face_gray2.size > 0 and cv2.Laplacian(face_gray2, cv2.CV_64F).var() < 30:
-            return None,
+            return None, [tuple(f) for f in faces_sorted]
        else:
            return None, [tuple(f) for f in faces_sorted]

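Note: detect_faces is plain OpenCV Haar-cascade matching plus an area sort. Below is a standalone sketch of the same call pattern, assuming only opencv-python and numpy are installed; the synthetic grey image is a stand-in and will simply yield zero detections.

```python
# Standalone sketch of the Haar-cascade call pattern used by detect_faces().
import cv2
import numpy as np

gray = np.full((480, 640), 128, dtype=np.uint8)  # stand-in for a real grayscale photo
cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
faces = cascade.detectMultiScale(
    gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
    flags=cv2.CASCADE_SCALE_IMAGE,
)
# Each hit is (x, y, w, h); sorting by area picks the most prominent face first.
faces = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
print(f"{len(faces)} face(s) found")
```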
@@ -308,20 +330,15 @@ def detect_faces(image: Image.Image):
def detect_signature(image: Image.Image):
    img_array = np.array(image)
    h, w = img_array.shape[:2]
-
    search_top = int(h * 0.5)
    lower_region = img_array[search_top:, :]
    gray = cv2.cvtColor(lower_region, cv2.COLOR_RGB2GRAY)
-
    binary = cv2.adaptiveThreshold(
-        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-        cv2.THRESH_BINARY_INV, 15, 10
+        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 10
    )
-
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    binary = cv2.dilate(binary, kernel, iterations=2)
    binary = cv2.erode(binary, kernel, iterations=1)
-
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None, None
@@ -341,7 +358,6 @@ def detect_signature(image: Image.Image):

    all_points = np.concatenate(sig_contours)
    rx, ry, rw, rh = cv2.boundingRect(all_points)
-
    if rw < 30 or rh < 10:
        return None, None

@@ -358,13 +374,11 @@ def detect_signature(image: Image.Image):
        return None, None

    sig_crop = image.crop((sig_x1, sig_y1, sig_x2, sig_y2))
-
-    return sig_crop, bbox
+    return sig_crop, (sig_x1, sig_y1, sig_x2, sig_y2)


def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
    img_array = np.array(image).copy()
-
    for i, (x, y, w, h) in enumerate(face_bboxes):
        color = (34, 197, 94)
        cv2.rectangle(img_array, (x, y), (x + w, y + h), color, 3)
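Note: detect_signature isolates ink strokes with adaptive thresholding, then a dilate/erode pass merges the strokes before contour bounding boxes are taken. A standalone sketch of that pipeline on a synthetic image follows (opencv-python and numpy assumed; the rendered word is just test content).

```python
# Standalone sketch of the threshold + morphology + contour pipeline that
# detect_signature() applies to the lower half of a document image.
import cv2
import numpy as np

gray = np.full((200, 600), 255, dtype=np.uint8)
cv2.putText(gray, "signature", (50, 120), cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, 2, 0, 3)

binary = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 10
)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
binary = cv2.dilate(binary, kernel, iterations=2)   # merge nearby pen strokes
binary = cv2.erode(binary, kernel, iterations=1)

contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
if contours:
    x, y, w, h = cv2.boundingRect(np.concatenate(contours))
    print(f"candidate signature region: {w}x{h} at ({x}, {y})")
```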
@@ -373,7 +387,6 @@ def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
        cv2.rectangle(img_array, (x, y - th - 10), (x + tw + 6, y), color, -1)
        cv2.putText(img_array, label, (x + 3, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    if sig_bbox:
        x1, y1, x2, y2 = sig_bbox
        color = (59, 130, 246)
@@ -383,25 +396,22 @@ def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
        cv2.rectangle(img_array, (x1, y1 - th - 10), (x1 + tw + 6, y1), color, -1)
        cv2.putText(img_array, label, (x1 + 3, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    return Image.fromarray(img_array)


def run_visual_extraction(image: Optional[Image.Image]):
    if image is None:
        return None, None, None, "_Upload an image to detect visual elements._"
-
    image = ensure_rgb(image)
    detections = []

    face_crop, face_bboxes = detect_faces(image)
    if face_crop is not None:
        detections.append(f"β **Face detected** β {len(face_bboxes)} face(s) found, largest extracted")
+    elif face_bboxes:
+        detections.append(f"β οΈ **Face found but too blurry/small** β {len(face_bboxes)} face(s) detected but quality insufficient")
    else:
-
-        detections.append(f"β οΈ **Face found but too blurry/small** β {len(face_bboxes)} face(s) detected but quality insufficient")
-    else:
-        detections.append("β **No face detected** in this image")
+        detections.append("β **No face detected** in this image")

    sig_crop, sig_bbox = detect_signature(image)
    if sig_crop is not None:
@@ -410,10 +420,11 @@ def run_visual_extraction(image: Optional[Image.Image]):
        detections.append("βΉοΈ **No signature detected** in this image")

    annotated = create_annotated_image(image, face_bboxes, sig_bbox)
-    detections.append(
-
-
-
+    detections.append(
+        f"\nπ― **Annotated image** generated with {len(face_bboxes)} face box(es)"
+        + (" + 1 signature box" if sig_bbox else "")
+    )
+    summary_md = "### π Detection Results\n\n" + "\n\n".join(detections)
    return face_crop, sig_crop, annotated, summary_md

@@ -429,11 +440,9 @@ def generate_document_scan(
    if front_image is None and back_image is None:
        yield "β οΈ Please upload at least one image.", "β οΈ Please upload at least one image."
        return
-
    if not prompt.strip():
        prompt = ("Analyze this document. Extract all text, key details "
                  "(name, dates, numbers, etc.) and provide a structured summary.")
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -443,31 +452,18 @@ def generate_document_scan(
    content = []
    if front_image is not None:
        front_image = ensure_rgb(front_image)
-        content.append({"type": "text",
+        content.append({"type": "text", "text": "**[FRONT SIDE]**"})
        content.append({"type": "image", "image": front_image})
-
    if back_image is not None:
        back_image = ensure_rgb(back_image)
-        content.append({"type": "text",
+        content.append({"type": "text", "text": "**[BACK SIDE]**"})
        content.append({"type": "image", "image": back_image})
-
    content.append({"type": "text", "text": prompt})

    messages = [{"role": "user", "content": content}]
-
-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -477,7 +473,6 @@ def generate_document_scan(
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
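Note: the generation path above uses the standard transformers streaming pattern, a TextIteratorStreamer consumed while model.generate runs in a background thread. A minimal text-only sketch of the same pattern follows; the tiny model name "sshleifer/tiny-gpt2" is an assumption chosen only so the example downloads quickly, and any small causal LM would do.

```python
# Minimal sketch of the streamer + background-thread generation pattern.
# The app does the same thing with its vision-language processor and model.
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Streaming test:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20})
thread.start()

buffer = ""
for chunk in streamer:          # yields decoded text as soon as tokens arrive
    buffer += chunk
thread.join()
print(buffer)
```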
@@ -501,7 +496,6 @@ def generate_image_analysis(
        return
    if not text.strip():
        text = "Describe this image in detail."
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -509,25 +503,13 @@ def generate_image_analysis(
        return

    image = ensure_rgb(image)
-
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
-        {"type": "text",
+        {"type": "text", "text": text},
    ]}]
-
-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -537,7 +519,6 @@ def generate_image_analysis(
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
@@ -560,7 +541,7 @@ def process_batch_images(
    if not prompts_text.strip():
        return "β οΈ Please enter prompts (one per line)."

-    prompts = [p.strip() for p in prompts_text.split('\
+    prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
    if len(prompts) == 1:
        prompts = prompts * len(files)
    elif len(prompts) != len(files):
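Note: the batch tab broadcasts a single prompt to every uploaded file and otherwise requires a one-to-one match. A tiny sketch of that rule with hypothetical file names.

```python
# Sketch of the prompt-broadcast rule used by the batch tab.
files = ["id_front.jpg", "id_back.jpg", "passport.jpg"]
prompts_text = "Extract all text"

prompts = [p.strip() for p in prompts_text.split("\n") if p.strip()]
if len(prompts) == 1:
    prompts = prompts * len(files)          # one prompt reused for every image
elif len(prompts) != len(files):
    raise ValueError("number of prompts must be 1 or match the number of images")

for f, p in zip(files, prompts):
    print(f"{f} -> {p}")
```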
@@ -576,24 +557,13 @@ def process_batch_images(
        try:
            image_path = file.name if hasattr(file, 'name') else file
            image = Image.open(image_path).convert("RGB")
-
            if seed != -1:
                torch.manual_seed(seed + idx - 1)

            messages = [{"role": "user", "content": [
                {"type": "image", "image": image},
-                {"type": "text",
+                {"type": "text", "text": prompt},
            ]}]
-
-            # inputs = processor.apply_chat_template(
-            #     messages,
-            #     tokenize=True,
-            #     add_generation_prompt=True,
-            #     return_dict=True,
-            #     return_tensors="pt"
-            # )
-            # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
            inputs = prepare_inputs(processor, model, messages)

            with torch.no_grad():
@@ -605,7 +575,6 @@ def process_batch_images(
                    top_k=top_k,
                    do_sample=temperature > 0,
                )
-
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
            ]
@@ -617,9 +586,9 @@ def process_batch_images(

            results.append(f"βββ Image {idx}: {os.path.basename(str(image_path))} βββ")
            results.append(f"π Prompt: {prompt}")
-            results.append(f"π Result: {result}\
+            results.append(f"π Result: {result}\n")

-    return "\
+    return "\n".join(results)


# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -650,15 +619,6 @@ def process_chat_message(
    if content:
        messages.append({"role": "user", "content": content})

-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    with torch.no_grad():
@@ -669,7 +629,6 @@ def process_chat_message(
            do_sample=True,
            top_p=0.95,
        )
-
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
@@ -682,9 +641,8 @@ def process_chat_message(


def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str):
-    text
+    text = message.get("text", "")
    files = message.get("files", [])
-
    image = None
    if files and len(files) > 0:
        try:
@@ -704,8 +662,8 @@ def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name:
    except Exception as e:
        response = f"β Error: {str(e)}"

-    user_content = f"{text}\
-    history.append({"role": "user",
+    user_content = f"{text}\nπ [Image attached]" if image is not None else text
+    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})
    return "", history

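Note: gr.MultimodalTextbox submits a dict with "text" and "files" keys, which is what chat_fn unpacks before appending user and assistant turns to the history. A small sketch of that payload shape and bookkeeping (pure data, no model call; the file path and the attachment marker text are illustrative).

```python
# Sketch of the payload gr.MultimodalTextbox hands to chat_fn and of the
# history entries the function appends (model call omitted).
message = {"text": "What document is this?", "files": ["/tmp/id_card.png"]}
history = []

text = message.get("text", "")
files = message.get("files", [])
user_content = f"{text}\n[Image attached]" if files else text  # marker simplified

history.append({"role": "user", "content": user_content})
history.append({"role": "assistant", "content": "(model response would go here)"})
print(history)
```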
@@ -719,10 +677,7 @@ def retry_fn(history, model_name):
        return "", history
    history = history[:-1]
    user_content = last_user_msg.get("content", "")
-
-        text = user_content.replace("\\nπ [Image attached]", "").replace("π [Image attached]", "")
-    else:
-        text = user_content
+    text = user_content.replace("\nπ [Image attached]", "").replace("π [Image attached]", "")
    return chat_fn({"text": text}, history, model_name)

@@ -757,28 +712,28 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:

    with gr.Accordion("βοΈ Advanced Generation Parameters", open=False):
        with gr.Row():
-            max_new_tokens
-            temperature
+            max_new_tokens = gr.Slider(64, MAX_MAX_NEW_TOKENS, DEFAULT_MAX_NEW_TOKENS, step=64, label="Max New Tokens")
+            temperature = gr.Slider(0.1, 2.0, 0.6, step=0.1, label="Temperature")
        with gr.Row():
-            top_p
-            top_k
+            top_p = gr.Slider(0.05, 1.0, 0.9, step=0.05, label="Top-p")
+            top_k = gr.Slider(1, 1000, 50, step=1, label="Top-k")
        with gr.Row():
-            repetition_penalty = gr.Slider(1.0, 2.0, 1.2,
-            seed_number
+            repetition_penalty = gr.Slider(1.0, 2.0, 1.2, step=0.05, label="Repetition Penalty")
+            seed_number = gr.Number(value=-1, label="Seed (-1 = random)", precision=0)

    with gr.Tabs():

        # βββ TAB 1: Document Scanner βββ
        with gr.TabItem("πͺͺ Document Scanner"):
            gr.Markdown(
-                "### Scan Front & Back of Documents\
-                "Upload front and/or back side images. Both analyzed together by the selected model.\
+                "### Scan Front & Back of Documents\n"
+                "Upload front and/or back side images. Both analyzed together by the selected model.\n"
                "Face profiles and signatures are **auto-detected** on front image upload."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    doc_front_image = gr.Image(type="pil", label="π Front Side", height=280)
-                    doc_back_image
+                    doc_back_image = gr.Image(type="pil", label="π Back Side", height=280)
                    doc_prompt = gr.Textbox(
                        label="Custom Prompt (optional)", lines=3,
                        placeholder="e.g., Extract all text, MRZ data, name, DOB, ID number...",
@@ -794,9 +749,9 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### π Visual Element Detection _(auto-detected on front image upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                    doc_face_output
+                    doc_face_output = gr.Image(label="π€ Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                    doc_sig_output
+                    doc_sig_output = gr.Image(label="βοΈ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    doc_annotated_output = gr.Image(label="π― Annotated Image (Highlights)", height=220)
            doc_detection_summary = gr.Markdown("_Upload a front side image to detect visual elements._")
@@ -806,7 +761,6 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                inputs=[doc_front_image],
                outputs=[doc_face_output, doc_sig_output, doc_annotated_output, doc_detection_summary],
            )
-
            doc_submit.click(
                fn=generate_document_scan,
                inputs=[model_choice, doc_front_image, doc_back_image, doc_prompt,
@@ -817,14 +771,14 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
        # βββ TAB 2: Image Analysis βββ
        with gr.TabItem("πΌοΈ Image Analysis"):
            gr.Markdown(
-                "### Smart Image Analysis\
+                "### Smart Image Analysis\n"
                "Upload an image to auto-detect **face profiles**, **signatures**, and see "
                "**highlighted annotations**. Then run model analysis with a custom prompt."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    img_upload = gr.Image(type="pil", label="Upload Image", height=320)
-                    img_query
+                    img_query = gr.Textbox(
                        label="Query / Prompt", lines=2,
                        placeholder="What do you see in this image? / Extract all text / Describe in detail...",
                    )
@@ -838,9 +792,9 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### π Visual Element Detection _(auto-detected on upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                    face_output
+                    face_output = gr.Image(label="π€ Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                    sig_output
+                    sig_output = gr.Image(label="βοΈ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    annotated_output = gr.Image(label="π― Annotated Image (Highlights)", height=220)
            detection_summary = gr.Markdown("_Upload an image to detect visual elements._")
@@ -850,7 +804,6 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                inputs=[img_upload],
                outputs=[face_output, sig_output, annotated_output, detection_summary],
            )
-
            img_submit.click(
                fn=generate_image_analysis,
                inputs=[model_choice, img_query, img_upload, max_new_tokens, temperature,
@@ -863,10 +816,10 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### Process Multiple Images at Once")
            with gr.Row():
                with gr.Column(scale=1):
-                    batch_images
+                    batch_images = gr.File(file_count="multiple", label="Upload Images", file_types=["image"])
                    batch_prompts = gr.Textbox(
                        label="Prompts (one per line)", lines=5,
-                        placeholder="Describe this image in detail\
+                        placeholder="Describe this image in detail\nExtract all text...",
                        info="One prompt for all images OR one prompt per image",
                    )
                    batch_submit = gr.Button("π Process Batch", variant="primary")
@@ -883,23 +836,19 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
        # βββ TAB 4: Chat βββ
        with gr.TabItem("π¬ Chat"):
            gr.Markdown(
-                "### Multi-Turn Chat with Image Attachments\
+                "### Multi-Turn Chat with Image Attachments\n"
                "Converse with the model. Attach images at any point in the conversation."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
-                        "**π‘ Tips:**\
-                        "- Upload an image and ask questions\
-                        "- Detailed descriptions & visual QA\
-                        "- Multi-turn conversation memory\
+                        "**π‘ Tips:**\n"
+                        "- Upload an image and ask questions\n"
+                        "- Detailed descriptions & visual QA\n"
+                        "- Multi-turn conversation memory\n"
                    )
                with gr.Column(scale=3):
-                    chatbot = gr.Chatbot(
-                        label="Chat",
-                        height=450,
-                        value=[],
-                    )
+                    chatbot = gr.Chatbot(label="Chat", height=450, value=[])
            with gr.Row():
                chat_msg = gr.MultimodalTextbox(
                    label="Message",
@@ -908,24 +857,23 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                )
            with gr.Row():
                retry_btn = gr.Button("π Retry", variant="secondary", size="sm")
-                undo_btn
+                undo_btn = gr.Button("β©οΈ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary", size="sm")

-        chat_msg.submit(chat_fn,
-        retry_btn.click(retry_fn, [chatbot, model_choice],
-        undo_btn.click(undo_fn,
-        clear_btn.click(clear_fn, outputs=[chat_msg, chatbot],
+        chat_msg.submit(chat_fn, [chat_msg, chatbot, model_choice], [chat_msg, chatbot], queue=True)
+        retry_btn.click(retry_fn, [chatbot, model_choice], [chat_msg, chatbot], queue=True)
+        undo_btn.click( undo_fn, [chatbot], [chatbot], queue=False)
+        clear_btn.click(clear_fn, outputs=[chat_msg, chatbot], queue=False)

    gr.Markdown(
-        "---\
-        "**π§ Chhagan's Multi-Model Studio** β’ 12 Models Total\
+        "---\n"
+        "**π§ Chhagan's Multi-Model Studio** β’ 12 Models Total\n\n"
        "Qwen3-VL (2B/4B/8B/32B) Instruct + Qwen2.5-VL (3B/7B) Instruct + "
        "CSM-DocExtract-VL β’ CSM-DocExtract-VL-Q4KM β’ CSM-DocExtract-VL-Q4KM-merged-fp16 β’ "
-        "CSM-DocExtract-VL-HF β’ Chhagan_ML-VL-OCR-v1 β’ Chhagan-DocVL-Qwen3\
+        "CSM-DocExtract-VL-HF β’ Chhagan_ML-VL-OCR-v1 β’ Chhagan-DocVL-Qwen3\n\n"
        "_Built with β€οΈ using Gradio_"
    )

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Launch
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ