Chhagan005 committed
Commit
c4a193f
· verified · 1 Parent(s): e8236d9

Update app.py

Files changed (1)
app.py +118 -170
app.py CHANGED
@@ -9,15 +9,11 @@ Features:
12 models: 6 Qwen VL Instruct + 6 Custom CSM/Chhagan VL models across all tabs.
"""

-
import os
- import random
- import uuid
- import json
import time
import warnings
from threading import Thread
- from typing import Optional, Tuple, Dict, Any, List, Iterable
+ from typing import Optional, Tuple, Dict, Any, List

from qwen_vl_utils import process_vision_info

@@ -28,7 +24,6 @@ import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2

-
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
@@ -36,17 +31,14 @@ from transformers import (
    TextIteratorStreamer,
)

-
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

-
# ──────────────────────────────────────────────────────────────
# Suppress warnings
# ──────────────────────────────────────────────────────────────
warnings.filterwarnings('ignore', message='.*meta device.*')

-
# ──────────────────────────────────────────────────────────────
# Custom Premium Theme
# ──────────────────────────────────────────────────────────────
@@ -57,7 +49,6 @@ colors.deep_indigo = colors.Color(
    c800="#3730A3", c900="#312E81", c950="#1E1B4B",
)

-
colors.cyber_teal = colors.Color(
    name="cyber_teal",
    c50="#F0FDFA", c100="#CCFBF1", c200="#99F6E4", c300="#5EEAD4",
@@ -65,7 +56,6 @@ colors.cyber_teal = colors.Color(
    c800="#115E59", c900="#134E4A", c950="#042F2E",
)

-
class PremiumTheme(Soft):
    def __init__(self):
        super().__init__(
@@ -96,10 +86,8 @@ class PremiumTheme(Soft):
            block_label_background_fill="*primary_100",
        )

-
premium_theme = PremiumTheme()

-
css = """
#app-title h1 {
    font-size: 2.5em !important;
@@ -133,17 +121,10 @@ css = """
    padding: 12px;
    background: var(--background-fill-secondary);
}
- .face-box {
-     border: 3px solid #22c55e;
-     border-radius: 8px;
- }
- .sig-box {
-     border: 3px solid #3b82f6;
-     border-radius: 8px;
- }
+ .face-box { border: 3px solid #22c55e; border-radius: 8px; }
+ .sig-box { border: 3px solid #3b82f6; border-radius: 8px; }
"""

-
# ──────────────────────────────────────────────────────────────
# Device & Constants
# ──────────────────────────────────────────────────────────────
@@ -152,10 +133,8 @@ DEFAULT_MAX_NEW_TOKENS = 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

-
# ──────────────────────────────────────────────────────────────
# ALL 12 MODELS
- # 6 Qwen Instruct (original) + 6 Custom CSM/Chhagan (replaced Thinking models)
# ──────────────────────────────────────────────────────────────
ALL_MODELS = [
    # ── Qwen Official Instruct Models ──
@@ -174,65 +153,113 @@ ALL_MODELS = [
    "Chhagan005/Chhagan-DocVL-Qwen3",
]

-
# ──────────────────────────────────────────────────────────────
- # Lazy Model Loading (load on demand, cache in memory)
+ # Lazy Model Loading
# ──────────────────────────────────────────────────────────────
_model_cache: Dict[str, Tuple[Any, Any]] = {}

-
def get_model_class(model_id: str):
    if "Qwen2.5" in model_id:
        return Qwen2_5_VLForConditionalGeneration
    return Qwen3VLForConditionalGeneration

-
def load_model(model_id: str):
    if model_id in _model_cache:
        return _model_cache[model_id]
-
    print(f"⏳ Loading model: {model_id}")
    model_cls = get_model_class(model_id)
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = model_cls.from_pretrained(
-             model_id, dtype=dtype, device_map="auto", trust_remote_code=True,
+             model_id, torch_dtype=dtype, device_map="auto", trust_remote_code=True,
        )
        model.eval()
-
    _model_cache[model_id] = (processor, model)
    print(f"✅ Model {model_id} loaded on {device}")
    return processor, model

-
# ──────────────────────────────────────────────────────────────
- # ✅ FIX 1: Pre-load default model at startup (OUTSIDE GPU context)
+ # Pre-load default model
# ──────────────────────────────────────────────────────────────
DEFAULT_MODEL = "Qwen/Qwen3-VL-8B-Instruct"
print(f"⏳ Pre-loading default model at startup: {DEFAULT_MODEL}")
load_model(DEFAULT_MODEL)
print(f"✅ Default model ready!")

+ # ──────────────────────────────────────────────────────────────
+ # ✅ CORE FIX: Universal Input Processor
+ # Handles both standard Qwen templates AND custom CSM jinja templates
+ # ──────────────────────────────────────────────────────────────
+ def _flatten_messages_for_custom_template(messages):
+     """
+     Custom CSM/Chhagan models have jinja templates that expect
+     plain string content, not multimodal list-of-dicts.
+     This flattens content lists → string, extracts PIL images separately.
+     """
+     flat_messages = []
+     extracted_images = []
+     for msg in messages:
+         content = msg.get("content", "")
+         if isinstance(content, list):
+             parts = []
+             for item in content:
+                 if isinstance(item, dict):
+                     if item.get("type") == "image":
+                         img = item.get("image")
+                         if img is not None:
+                             extracted_images.append(img)
+                             # Qwen vision special token placeholder
+                             parts.append("<|vision_start|><|image_pad|><|vision_end|>")
+                     elif item.get("type") == "text":
+                         parts.append(item.get("text", ""))
+             flat_messages.append({"role": msg["role"], "content": "".join(parts)})
+         else:
+             flat_messages.append(msg)
+     return flat_messages, extracted_images


def prepare_inputs(processor, model, messages):
+     """
+     Attempt 1 — Standard multimodal path (works for official Qwen models).
+     Attempt 2 — Flatten fallback (works for custom CSM/Chhagan jinja templates).
+     """
+     # ── Attempt 1: Standard multimodal ──────────────────────
+     try:
+         text = processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[text],
+             images=image_inputs if image_inputs else None,
+             videos=video_inputs if video_inputs else None,
+             padding=True,
+             return_tensors="pt",
+         )
+         return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+     except TypeError:
+         # Custom template doesn't support list content → use fallback
+         pass
+
+     # ── Attempt 2: Flatten for custom jinja templates ────────
+     flat_msgs, extracted_images = _flatten_messages_for_custom_template(messages)
    text = processor.apply_chat_template(
-         messages,
+         flat_msgs,
        tokenize=False,
        add_generation_prompt=True,
    )
-     image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
-         images=image_inputs if image_inputs else None,
-         videos=video_inputs if video_inputs else None,
+         images=extracted_images if extracted_images else None,
        padding=True,
        return_tensors="pt",
    )
    return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+
# ──────────────────────────────────────────────────────────────
# Utility Functions
# ──────────────────────────────────────────────────────────────
@@ -247,14 +274,12 @@ def ensure_rgb(image: Image.Image) -> Optional[Image.Image]:
        return image.convert("RGB")
    return image

-
# ──────────────────────────────────────────────────────────────
# 🔍 Face Detection, Signature Extraction & Annotation Engine
# ──────────────────────────────────────────────────────────────
def detect_faces(image: Image.Image):
    img_array = np.array(image)
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
@@ -262,7 +287,6 @@
        gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
-
    if len(faces) == 0:
        profile_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_profileface.xml'
@@ -270,12 +294,10 @@
        faces = profile_cascade.detectMultiScale(
            gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        )
-
    if len(faces) == 0:
        return None, []

    faces_sorted = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
-
    x, y, w, h = faces_sorted[0]
    pad = int(0.2 * max(w, h))
    x1 = max(0, x - pad)
@@ -296,7 +318,7 @@
        y2 = min(img_array.shape[0], y + h + pad)
        face_gray2 = gray[y1:y2, x1:x2]
        if face_gray2.size > 0 and cv2.Laplacian(face_gray2, cv2.CV_64F).var() < 30:
-             return None, faces_sorted.tolist() if hasattr(faces_sorted, 'tolist') else [tuple(f) for f in faces_sorted]
+             return None, [tuple(f) for f in faces_sorted]
        else:
            return None, [tuple(f) for f in faces_sorted]

@@ -308,20 +330,15 @@
def detect_signature(image: Image.Image):
    img_array = np.array(image)
    h, w = img_array.shape[:2]
-
    search_top = int(h * 0.5)
    lower_region = img_array[search_top:, :]
    gray = cv2.cvtColor(lower_region, cv2.COLOR_RGB2GRAY)
-
    binary = cv2.adaptiveThreshold(
-         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-         cv2.THRESH_BINARY_INV, 15, 10
+         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 10
    )
-
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    binary = cv2.dilate(binary, kernel, iterations=2)
    binary = cv2.erode(binary, kernel, iterations=1)
-
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None, None
@@ -341,7 +358,6 @@

    all_points = np.concatenate(sig_contours)
    rx, ry, rw, rh = cv2.boundingRect(all_points)
-
    if rw < 30 or rh < 10:
        return None, None

@@ -358,13 +374,11 @@
        return None, None

    sig_crop = image.crop((sig_x1, sig_y1, sig_x2, sig_y2))
-     bbox = (sig_x1, sig_y1, sig_x2, sig_y2)
-     return sig_crop, bbox
+     return sig_crop, (sig_x1, sig_y1, sig_x2, sig_y2)


def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
    img_array = np.array(image).copy()
-
    for i, (x, y, w, h) in enumerate(face_bboxes):
        color = (34, 197, 94)
        cv2.rectangle(img_array, (x, y), (x + w, y + h), color, 3)
@@ -373,7 +387,6 @@ def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Opti
        cv2.rectangle(img_array, (x, y - th - 10), (x + tw + 6, y), color, -1)
        cv2.putText(img_array, label, (x + 3, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    if sig_bbox:
        x1, y1, x2, y2 = sig_bbox
        color = (59, 130, 246)
@@ -383,25 +396,22 @@
        cv2.rectangle(img_array, (x1, y1 - th - 10), (x1 + tw + 6, y1), color, -1)
        cv2.putText(img_array, label, (x1 + 3, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    return Image.fromarray(img_array)


def run_visual_extraction(image: Optional[Image.Image]):
    if image is None:
        return None, None, None, "_Upload an image to detect visual elements._"
-
    image = ensure_rgb(image)
    detections = []

    face_crop, face_bboxes = detect_faces(image)
    if face_crop is not None:
        detections.append(f"✅ **Face detected** — {len(face_bboxes)} face(s) found, largest extracted")
+     elif face_bboxes:
+         detections.append(f"⚠️ **Face found but too blurry/small** — {len(face_bboxes)} face(s) detected but quality insufficient")
    else:
-         if face_bboxes:
-             detections.append(f"⚠️ **Face found but too blurry/small** — {len(face_bboxes)} face(s) detected but quality insufficient")
-         else:
-             detections.append("❌ **No face detected** in this image")
+         detections.append("❌ **No face detected** in this image")

    sig_crop, sig_bbox = detect_signature(image)
    if sig_crop is not None:
@@ -410,10 +420,11 @@ def run_visual_extraction(image: Optional[Image.Image]):
        detections.append("ℹ️ **No signature detected** in this image")

    annotated = create_annotated_image(image, face_bboxes, sig_bbox)
-     detections.append(f"\\n🎯 **Annotated image** generated with {len(face_bboxes)} face box(es)" +
-                       (" + 1 signature box" if sig_bbox else ""))
-
-     summary_md = "### 🔍 Detection Results\\n\\n" + "\\n\\n".join(detections)
+     detections.append(
+         f"\n🎯 **Annotated image** generated with {len(face_bboxes)} face box(es)"
+         + (" + 1 signature box" if sig_bbox else "")
+     )
+     summary_md = "### 🔍 Detection Results\n\n" + "\n\n".join(detections)
    return face_crop, sig_crop, annotated, summary_md

@@ -429,11 +440,9 @@ def generate_document_scan(
    if front_image is None and back_image is None:
        yield "⚠️ Please upload at least one image.", "⚠️ Please upload at least one image."
        return
-
    if not prompt.strip():
        prompt = ("Analyze this document. Extract all text, key details "
                  "(name, dates, numbers, etc.) and provide a structured summary.")
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -443,31 +452,18 @@
    content = []
    if front_image is not None:
        front_image = ensure_rgb(front_image)
-         content.append({"type": "text", "text": "**[FRONT SIDE]**"})
+         content.append({"type": "text", "text": "**[FRONT SIDE]**"})
        content.append({"type": "image", "image": front_image})
-
    if back_image is not None:
        back_image = ensure_rgb(back_image)
-         content.append({"type": "text", "text": "**[BACK SIDE]**"})
+         content.append({"type": "text", "text": "**[BACK SIDE]**"})
        content.append({"type": "image", "image": back_image})
-
    content.append({"type": "text", "text": prompt})

    messages = [{"role": "user", "content": content}]
-
-     # inputs = processor.apply_chat_template(
-     #     messages,
-     #     tokenize=True,
-     #     add_generation_prompt=True,
-     #     return_dict=True,
-     #     return_tensors="pt"
-     # )
-     # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -477,7 +473,6 @@
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
@@ -501,7 +496,6 @@ def generate_image_analysis(
        return
    if not text.strip():
        text = "Describe this image in detail."
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -509,25 +503,13 @@
        return

    image = ensure_rgb(image)
-
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
-         {"type": "text", "text": text}
+         {"type": "text", "text": text},
    ]}]
-
-     # inputs = processor.apply_chat_template(
-     #     messages,
-     #     tokenize=True,
-     #     add_generation_prompt=True,
-     #     return_dict=True,
-     #     return_tensors="pt"
-     # )
-     # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -537,7 +519,6 @@
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
@@ -560,7 +541,7 @@ def process_batch_images(
    if not prompts_text.strip():
        return "⚠️ Please enter prompts (one per line)."

-     prompts = [p.strip() for p in prompts_text.split('\\n') if p.strip()]
+     prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
    if len(prompts) == 1:
        prompts = prompts * len(files)
    elif len(prompts) != len(files):
@@ -576,24 +557,13 @@
        try:
            image_path = file.name if hasattr(file, 'name') else file
            image = Image.open(image_path).convert("RGB")
-
            if seed != -1:
                torch.manual_seed(seed + idx - 1)

            messages = [{"role": "user", "content": [
                {"type": "image", "image": image},
-                 {"type": "text", "text": prompt},
+                 {"type": "text", "text": prompt},
            ]}]
-
-             # inputs = processor.apply_chat_template(
-             #     messages,
-             #     tokenize=True,
-             #     add_generation_prompt=True,
-             #     return_dict=True,
-             #     return_tensors="pt"
-             # )
-             # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
            inputs = prepare_inputs(processor, model, messages)

            with torch.no_grad():
@@ -605,7 +575,6 @@
                top_k=top_k,
                do_sample=temperature > 0,
            )
-
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
            ]
@@ -617,9 +586,9 @@

            results.append(f"═══ Image {idx}: {os.path.basename(str(image_path))} ═══")
            results.append(f"📝 Prompt: {prompt}")
-             results.append(f"📄 Result: {result}\\n")
+             results.append(f"📄 Result: {result}\n")

-     return "\\n".join(results)
+     return "\n".join(results)


# ──────────────────────────────────────────────────────────────
@@ -650,15 +619,6 @@ def process_chat_message(
    if content:
        messages.append({"role": "user", "content": content})

-     # inputs = processor.apply_chat_template(
-     #     messages,
-     #     tokenize=True,
-     #     add_generation_prompt=True,
-     #     return_dict=True,
-     #     return_tensors="pt"
-     # )
-     # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    with torch.no_grad():
@@ -669,7 +629,6 @@
            do_sample=True,
            top_p=0.95,
        )
-
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
@@ -682,9 +641,8 @@


def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str):
-     text = message.get("text", "")
+     text = message.get("text", "")
    files = message.get("files", [])
-
    image = None
    if files and len(files) > 0:
        try:
@@ -704,8 +662,8 @@
    except Exception as e:
        response = f"❌ Error: {str(e)}"

-     user_content = f"{text}\\n📎 [Image attached]" if image is not None else text
-     history.append({"role": "user", "content": user_content})
+     user_content = f"{text}\n📎 [Image attached]" if image is not None else text
+     history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})
    return "", history

@@ -719,10 +677,7 @@ def retry_fn(history, model_name):
        return "", history
    history = history[:-1]
    user_content = last_user_msg.get("content", "")
-     if "📎 [Image attached]" in user_content:
-         text = user_content.replace("\\n📎 [Image attached]", "").replace("📎 [Image attached]", "")
-     else:
-         text = user_content
+     text = user_content.replace("\n📎 [Image attached]", "").replace("📎 [Image attached]", "")
    return chat_fn({"text": text}, history, model_name)

@@ -757,28 +712,28 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:

    with gr.Accordion("⚙️ Advanced Generation Parameters", open=False):
        with gr.Row():
-             max_new_tokens = gr.Slider(64, MAX_MAX_NEW_TOKENS, DEFAULT_MAX_NEW_TOKENS, step=64, label="Max New Tokens")
-             temperature = gr.Slider(0.1, 2.0, 0.6, step=0.1, label="Temperature")
+             max_new_tokens = gr.Slider(64, MAX_MAX_NEW_TOKENS, DEFAULT_MAX_NEW_TOKENS, step=64, label="Max New Tokens")
+             temperature = gr.Slider(0.1, 2.0, 0.6, step=0.1, label="Temperature")
        with gr.Row():
-             top_p = gr.Slider(0.05, 1.0, 0.9, step=0.05, label="Top-p")
-             top_k = gr.Slider(1, 1000, 50, step=1, label="Top-k")
+             top_p = gr.Slider(0.05, 1.0, 0.9, step=0.05, label="Top-p")
+             top_k = gr.Slider(1, 1000, 50, step=1, label="Top-k")
        with gr.Row():
-             repetition_penalty = gr.Slider(1.0, 2.0, 1.2, step=0.05, label="Repetition Penalty")
-             seed_number = gr.Number(value=-1, label="Seed (-1 = random)", precision=0)
+             repetition_penalty = gr.Slider(1.0, 2.0, 1.2, step=0.05, label="Repetition Penalty")
+             seed_number = gr.Number(value=-1, label="Seed (-1 = random)", precision=0)

    with gr.Tabs():

        # ─── TAB 1: Document Scanner ───
        with gr.TabItem("🪪 Document Scanner"):
            gr.Markdown(
-                 "### Scan Front & Back of Documents\\n"
-                 "Upload front and/or back side images. Both analyzed together by the selected model.\\n"
+                 "### Scan Front & Back of Documents\n"
+                 "Upload front and/or back side images. Both analyzed together by the selected model.\n"
                "Face profiles and signatures are **auto-detected** on front image upload."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    doc_front_image = gr.Image(type="pil", label="📄 Front Side", height=280)
-                     doc_back_image = gr.Image(type="pil", label="📄 Back Side", height=280)
+                     doc_back_image = gr.Image(type="pil", label="📄 Back Side", height=280)
                    doc_prompt = gr.Textbox(
                        label="Custom Prompt (optional)", lines=3,
                        placeholder="e.g., Extract all text, MRZ data, name, DOB, ID number...",
@@ -794,9 +749,9 @@
            gr.Markdown("### 🔍 Visual Element Detection _(auto-detected on front image upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                     doc_face_output = gr.Image(label="👤 Detected Face Profile", height=220, elem_classes="face-box")
+                     doc_face_output = gr.Image(label="👤 Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                     doc_sig_output = gr.Image(label="✍️ Detected Signature", height=220, elem_classes="sig-box")
+                     doc_sig_output = gr.Image(label="✍️ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    doc_annotated_output = gr.Image(label="🎯 Annotated Image (Highlights)", height=220)
            doc_detection_summary = gr.Markdown("_Upload a front side image to detect visual elements._")
@@ -806,7 +761,6 @@
                inputs=[doc_front_image],
                outputs=[doc_face_output, doc_sig_output, doc_annotated_output, doc_detection_summary],
            )
-
            doc_submit.click(
                fn=generate_document_scan,
                inputs=[model_choice, doc_front_image, doc_back_image, doc_prompt,
@@ -817,14 +771,14 @@
        # ─── TAB 2: Image Analysis ───
        with gr.TabItem("🖼️ Image Analysis"):
            gr.Markdown(
-                 "### Smart Image Analysis\\n"
+                 "### Smart Image Analysis\n"
                "Upload an image to auto-detect **face profiles**, **signatures**, and see "
                "**highlighted annotations**. Then run model analysis with a custom prompt."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    img_upload = gr.Image(type="pil", label="Upload Image", height=320)
-                     img_query = gr.Textbox(
+                     img_query = gr.Textbox(
                        label="Query / Prompt", lines=2,
                        placeholder="What do you see in this image? / Extract all text / Describe in detail...",
                    )
@@ -838,9 +792,9 @@
            gr.Markdown("### 🔍 Visual Element Detection _(auto-detected on upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                     face_output = gr.Image(label="👤 Detected Face Profile", height=220, elem_classes="face-box")
+                     face_output = gr.Image(label="👤 Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                     sig_output = gr.Image(label="✍️ Detected Signature", height=220, elem_classes="sig-box")
+                     sig_output = gr.Image(label="✍️ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    annotated_output = gr.Image(label="🎯 Annotated Image (Highlights)", height=220)
            detection_summary = gr.Markdown("_Upload an image to detect visual elements._")
@@ -850,7 +804,6 @@
                inputs=[img_upload],
                outputs=[face_output, sig_output, annotated_output, detection_summary],
            )
-
            img_submit.click(
                fn=generate_image_analysis,
                inputs=[model_choice, img_query, img_upload, max_new_tokens, temperature,
@@ -863,10 +816,10 @@
            gr.Markdown("### Process Multiple Images at Once")
            with gr.Row():
                with gr.Column(scale=1):
-                     batch_images = gr.File(file_count="multiple", label="Upload Images", file_types=["image"])
+                     batch_images = gr.File(file_count="multiple", label="Upload Images", file_types=["image"])
                    batch_prompts = gr.Textbox(
                        label="Prompts (one per line)", lines=5,
-                         placeholder="Describe this image in detail\\nExtract all text...",
+                         placeholder="Describe this image in detail\nExtract all text...",
                        info="One prompt for all images OR one prompt per image",
                    )
                    batch_submit = gr.Button("🚀 Process Batch", variant="primary")
@@ -883,23 +836,19 @@
        # ─── TAB 4: Chat ───
        with gr.TabItem("💬 Chat"):
            gr.Markdown(
-                 "### Multi-Turn Chat with Image Attachments\\n"
+                 "### Multi-Turn Chat with Image Attachments\n"
                "Converse with the model. Attach images at any point in the conversation."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
-                         "**💡 Tips:**\\n"
-                         "- Upload an image and ask questions\\n"
-                         "- Detailed descriptions & visual QA\\n"
-                         "- Multi-turn conversation memory\\n"
+                         "**💡 Tips:**\n"
+                         "- Upload an image and ask questions\n"
+                         "- Detailed descriptions & visual QA\n"
+                         "- Multi-turn conversation memory\n"
                    )
                with gr.Column(scale=3):
-                     chatbot = gr.Chatbot(
-                         label="Chat",
-                         height=450,
-                         value=[],
-                     )
+                     chatbot = gr.Chatbot(label="Chat", height=450, value=[])
            with gr.Row():
                chat_msg = gr.MultimodalTextbox(
                    label="Message",
@@ -908,24 +857,23 @@
            )
            with gr.Row():
                retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm")
-                 undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
+                 undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")

-             chat_msg.submit(chat_fn, [chat_msg, chatbot, model_choice], [chat_msg, chatbot], queue=True)
-             retry_btn.click(retry_fn, [chatbot, model_choice], [chat_msg, chatbot], queue=True)
-             undo_btn.click(undo_fn, [chatbot], [chatbot], queue=False)
-             clear_btn.click(clear_fn, outputs=[chat_msg, chatbot], queue=False)
+             chat_msg.submit(chat_fn, [chat_msg, chatbot, model_choice], [chat_msg, chatbot], queue=True)
+             retry_btn.click(retry_fn, [chatbot, model_choice], [chat_msg, chatbot], queue=True)
+             undo_btn.click(undo_fn, [chatbot], [chatbot], queue=False)
+             clear_btn.click(clear_fn, outputs=[chat_msg, chatbot], queue=False)

    gr.Markdown(
-         "---\\n"
-         "**🧠 Chhagan's Multi-Model Studio** • 12 Models Total\\n\\n"
+         "---\n"
+         "**🧠 Chhagan's Multi-Model Studio** • 12 Models Total\n\n"
        "Qwen3-VL (2B/4B/8B/32B) Instruct + Qwen2.5-VL (3B/7B) Instruct + "
        "CSM-DocExtract-VL • CSM-DocExtract-VL-Q4KM • CSM-DocExtract-VL-Q4KM-merged-fp16 • "
-         "CSM-DocExtract-VL-HF • Chhagan_ML-VL-OCR-v1 • Chhagan-DocVL-Qwen3\\n\\n"
+         "CSM-DocExtract-VL-HF • Chhagan_ML-VL-OCR-v1 • Chhagan-DocVL-Qwen3\n\n"
        "_Built with ❤️ using Gradio_"
    )

-
# ──────────────────────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────────────────────
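The core fix in this commit is the two-attempt prepare_inputs: try the standard Qwen multimodal path, and on a TypeError from a custom chat template, flatten the messages. A minimal sketch of what the flattening step produces; flatten() here is a trimmed stand-in for the _flatten_messages_for_custom_template helper added above, and the sample message mirrors the ones built in the tabs:

    from PIL import Image

    def flatten(messages):
        # Trimmed re-implementation: list-of-dicts content -> one string,
        # with PIL images collected separately for the processor's images= arg.
        flat, images = [], []
        for msg in messages:
            content = msg.get("content", "")
            if isinstance(content, list):
                parts = []
                for item in content:
                    if item.get("type") == "image":
                        images.append(item["image"])
                        parts.append("<|vision_start|><|image_pad|><|vision_end|>")
                    elif item.get("type") == "text":
                        parts.append(item.get("text", ""))
                flat.append({"role": msg["role"], "content": "".join(parts)})
            else:
                flat.append(msg)
        return flat, images

    msgs = [{"role": "user", "content": [
        {"type": "image", "image": Image.new("RGB", (64, 64))},
        {"type": "text", "text": "Extract all text."},
    ]}]
    flat, images = flatten(msgs)
    print(flat[0]["content"])  # <|vision_start|><|image_pad|><|vision_end|>Extract all text.
    print(len(images))         # 1

A jinja template that only renders string content can format flat unchanged, while the collected images still reach the processor.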
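Much of the remaining churn in this diff is '\\n' becoming '\n'. In a regular Python string literal, '\\n' is a backslash followed by the letter n, not a newline, so the old code's splits and Markdown strings never behaved as intended. A two-line demonstration with a made-up prompts_text:

    prompts_text = "Describe this image\nExtract all text"
    print(prompts_text.split('\\n'))  # ['Describe this image\nExtract all text']  (no split: the old bug)
    print(prompts_text.split('\n'))   # ['Describe this image', 'Extract all text']  (the fixed behavior)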
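The detect_faces change at old line 299 removes dead code: detectMultiScale returns a NumPy array when it finds faces, but sorted() always returns a plain Python list, so faces_sorted never has a .tolist() method and the hasattr() guard always fell through to the list comprehension. A quick check, with faces as a stand-in for detector output:

    import numpy as np

    faces = np.array([[10, 10, 40, 40], [5, 5, 80, 80]])
    faces_sorted = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
    print(type(faces_sorted))                # <class 'list'>, not ndarray
    print(hasattr(faces_sorted, 'tolist'))   # False, so the tolist() branch never ran
    print([tuple(int(v) for v in f) for f in faces_sorted])  # [(5, 5, 80, 80), (10, 10, 40, 40)]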
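The lazy-loading scheme the diff keeps, one (processor, model) pair per model id in the module-level _model_cache, is the standard load-once memo pattern. The same idea in miniature; expensive_load is a hypothetical stand-in for the AutoProcessor/model loading:

    from typing import Any, Dict, Tuple

    _cache: Dict[str, Tuple[Any, Any]] = {}

    def expensive_load(name: str) -> Tuple[Any, Any]:
        # Hypothetical stand-in for downloading and instantiating a model.
        print(f"loading {name} ...")
        return (f"processor:{name}", f"model:{name}")

    def get(name: str) -> Tuple[Any, Any]:
        if name not in _cache:   # only the first request pays the load cost
            _cache[name] = expensive_load(name)
        return _cache[name]      # later requests return the cached pair

    get("demo-model")  # prints: loading demo-model ...
    get("demo-model")  # cache hit, prints nothing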