Update app.py
app.py CHANGED
@@ -9,15 +9,11 @@ Features:
12 models: 6 Qwen VL Instruct + 6 Custom CSM/Chhagan VL models across all tabs.
"""

-
import os
-import random
-import uuid
-import json
import time
import warnings
from threading import Thread
-from typing import Optional, Tuple, Dict, Any, List
+from typing import Optional, Tuple, Dict, Any, List

from qwen_vl_utils import process_vision_info

@@ -28,7 +24,6 @@ import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2

-
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
@@ -36,17 +31,14 @@ from transformers import (
    TextIteratorStreamer,
)

-
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Suppress warnings
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
warnings.filterwarnings('ignore', message='.*meta device.*')

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Custom Premium Theme
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -57,7 +49,6 @@ colors.deep_indigo = colors.Color(
    c800="#3730A3", c900="#312E81", c950="#1E1B4B",
)

-
colors.cyber_teal = colors.Color(
    name="cyber_teal",
    c50="#F0FDFA", c100="#CCFBF1", c200="#99F6E4", c300="#5EEAD4",
@@ -65,7 +56,6 @@ colors.cyber_teal = colors.Color(
    c800="#115E59", c900="#134E4A", c950="#042F2E",
)

-
class PremiumTheme(Soft):
    def __init__(self):
        super().__init__(
@@ -96,10 +86,8 @@ class PremiumTheme(Soft):
            block_label_background_fill="*primary_100",
        )

-
premium_theme = PremiumTheme()

-
css = """
#app-title h1 {
    font-size: 2.5em !important;
@@ -133,17 +121,10 @@ css = """
    padding: 12px;
    background: var(--background-fill-secondary);
}
-.face-box {
-
-    border-radius: 8px;
-}
-.sig-box {
-    border: 3px solid #3b82f6;
-    border-radius: 8px;
-}
+.face-box { border: 3px solid #22c55e; border-radius: 8px; }
+.sig-box { border: 3px solid #3b82f6; border-radius: 8px; }
"""

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Device & Constants
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -152,10 +133,8 @@ DEFAULT_MAX_NEW_TOKENS = 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"π Using device: {device}")

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# ALL 12 MODELS
-# 6 Qwen Instruct (original) + 6 Custom CSM/Chhagan (replaced Thinking models)
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
ALL_MODELS = [
    # ββ Qwen Official Instruct Models ββ
@@ -174,65 +153,113 @@ ALL_MODELS = [
    "Chhagan005/Chhagan-DocVL-Qwen3",
]

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-# Lazy Model Loading
+# Lazy Model Loading
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
_model_cache: Dict[str, Tuple[Any, Any]] = {}

-
def get_model_class(model_id: str):
    if "Qwen2.5" in model_id:
        return Qwen2_5_VLForConditionalGeneration
    return Qwen3VLForConditionalGeneration

-
def load_model(model_id: str):
    if model_id in _model_cache:
        return _model_cache[model_id]
-
    print(f"β³ Loading model: {model_id}")
    model_cls = get_model_class(model_id)
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = model_cls.from_pretrained(
-            model_id,
+            model_id, torch_dtype=dtype, device_map="auto", trust_remote_code=True,
        )
    model.eval()
-
    _model_cache[model_id] = (processor, model)
    print(f"β Model {model_id} loaded on {device}")
    return processor, model

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-#
+# Pre-load default model
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
DEFAULT_MODEL = "Qwen/Qwen3-VL-8B-Instruct"
print(f"β³ Pre-loading default model at startup: {DEFAULT_MODEL}")
load_model(DEFAULT_MODEL)
print(f"β Default model ready!")

+# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
+# β CORE FIX: Universal Input Processor
+# Handles both standard Qwen templates AND custom CSM jinja templates
+# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
+def _flatten_messages_for_custom_template(messages):
+    """
+    Custom CSM/Chhagan models have jinja templates that expect
+    plain string content, not multimodal list-of-dicts.
+    This flattens content lists β string, extracts PIL images separately.
+    """
+    flat_messages = []
+    extracted_images = []
+    for msg in messages:
+        content = msg.get("content", "")
+        if isinstance(content, list):
+            parts = []
+            for item in content:
+                if isinstance(item, dict):
+                    if item.get("type") == "image":
+                        img = item.get("image")
+                        if img is not None:
+                            extracted_images.append(img)
+                        # Qwen vision special token placeholder
+                        parts.append("<|vision_start|><|image_pad|><|vision_end|>")
+                    elif item.get("type") == "text":
+                        parts.append(item.get("text", ""))
+            flat_messages.append({"role": msg["role"], "content": "".join(parts)})
+        else:
+            flat_messages.append(msg)
+    return flat_messages, extracted_images


def prepare_inputs(processor, model, messages):
+    """
+    Attempt 1 β Standard multimodal path (works for official Qwen models).
+    Attempt 2 β Flatten fallback (works for custom CSM/Chhagan jinja templates).
+    """
+    # ββ Attempt 1: Standard multimodal ββββββββββββββββββββββ
+    try:
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs if image_inputs else None,
+            videos=video_inputs if video_inputs else None,
+            padding=True,
+            return_tensors="pt",
+        )
+        return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+    except TypeError:
+        # Custom template doesn't support list content β use fallback
+        pass
+
+    # ββ Attempt 2: Flatten for custom jinja templates ββββββββ
+    flat_msgs, extracted_images = _flatten_messages_for_custom_template(messages)
    text = processor.apply_chat_template(
-        messages,
+        flat_msgs,
        tokenize=False,
        add_generation_prompt=True,
    )
-    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
-        images=
-        videos=video_inputs if video_inputs else None,
+        images=extracted_images if extracted_images else None,
        padding=True,
        return_tensors="pt",
    )
    return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Utility Functions
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
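Note: the new prepare_inputs path above falls back to flattened, text-only chat messages for the custom CSM/Chhagan templates. A minimal, self-contained sketch of that flattening behaviour follows; the `flatten` helper here is a condensed restatement written for this note (only Pillow is assumed), not the app's own function.

```python
# Illustrative sketch: a multimodal message list is flattened into a plain
# string plus a separate image list before hitting a custom chat template.
from PIL import Image

def flatten(messages):
    flat, images = [], []
    for msg in messages:
        content = msg.get("content", "")
        if isinstance(content, list):
            parts = []
            for item in content:
                if item.get("type") == "image":
                    images.append(item["image"])
                    parts.append("<|vision_start|><|image_pad|><|vision_end|>")
                elif item.get("type") == "text":
                    parts.append(item.get("text", ""))
            flat.append({"role": msg["role"], "content": "".join(parts)})
        else:
            flat.append(msg)
    return flat, images

msgs = [{"role": "user", "content": [
    {"type": "image", "image": Image.new("RGB", (32, 32))},
    {"type": "text", "text": "Extract all text from this document."},
]}]
flat, imgs = flatten(msgs)
print(flat[0]["content"])  # image placeholder token followed by the prompt text
print(len(imgs))           # 1
```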
@@ -247,14 +274,12 @@ def ensure_rgb(image: Image.Image) -> Optional[Image.Image]:
        return image.convert("RGB")
    return image

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# π Face Detection, Signature Extraction & Annotation Engine
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def detect_faces(image: Image.Image):
    img_array = np.array(image)
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
@@ -262,7 +287,6 @@ def detect_faces(image: Image.Image):
        gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
-
    if len(faces) == 0:
        profile_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_profileface.xml'
@@ -270,12 +294,10 @@ def detect_faces(image: Image.Image):
        faces = profile_cascade.detectMultiScale(
            gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
        )
-
    if len(faces) == 0:
        return None, []

    faces_sorted = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
-
    x, y, w, h = faces_sorted[0]
    pad = int(0.2 * max(w, h))
    x1 = max(0, x - pad)
@@ -296,7 +318,7 @@ def detect_faces(image: Image.Image):
        y2 = min(img_array.shape[0], y + h + pad)
        face_gray2 = gray[y1:y2, x1:x2]
        if face_gray2.size > 0 and cv2.Laplacian(face_gray2, cv2.CV_64F).var() < 30:
-            return None,
+            return None, [tuple(f) for f in faces_sorted]
        else:
            return None, [tuple(f) for f in faces_sorted]

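Note: detect_faces is plain OpenCV Haar-cascade matching plus an area sort. Below is a standalone sketch of the same call pattern, assuming only opencv-python and numpy are installed; the synthetic grey image is a stand-in and will simply yield zero detections.

```python
# Standalone sketch of the Haar-cascade call pattern used by detect_faces().
import cv2
import numpy as np

gray = np.full((480, 640), 128, dtype=np.uint8)  # stand-in for a real grayscale photo
cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
faces = cascade.detectMultiScale(
    gray, scaleFactor=1.08, minNeighbors=4, minSize=(40, 40),
    flags=cv2.CASCADE_SCALE_IMAGE,
)
# Each hit is (x, y, w, h); sorting by area picks the most prominent face first.
faces = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)
print(f"{len(faces)} face(s) found")
```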
@@ -308,20 +330,15 @@ def detect_faces(image: Image.Image):
def detect_signature(image: Image.Image):
    img_array = np.array(image)
    h, w = img_array.shape[:2]
-
    search_top = int(h * 0.5)
    lower_region = img_array[search_top:, :]
    gray = cv2.cvtColor(lower_region, cv2.COLOR_RGB2GRAY)
-
    binary = cv2.adaptiveThreshold(
-        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-        cv2.THRESH_BINARY_INV, 15, 10
+        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 10
    )
-
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    binary = cv2.dilate(binary, kernel, iterations=2)
    binary = cv2.erode(binary, kernel, iterations=1)
-
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None, None
@@ -341,7 +358,6 @@ def detect_signature(image: Image.Image):

    all_points = np.concatenate(sig_contours)
    rx, ry, rw, rh = cv2.boundingRect(all_points)
-
    if rw < 30 or rh < 10:
        return None, None

@@ -358,13 +374,11 @@ def detect_signature(image: Image.Image):
        return None, None

    sig_crop = image.crop((sig_x1, sig_y1, sig_x2, sig_y2))
-
-    return sig_crop, bbox
+    return sig_crop, (sig_x1, sig_y1, sig_x2, sig_y2)


def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
    img_array = np.array(image).copy()
-
    for i, (x, y, w, h) in enumerate(face_bboxes):
        color = (34, 197, 94)
        cv2.rectangle(img_array, (x, y), (x + w, y + h), color, 3)
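Note: detect_signature isolates ink strokes with adaptive thresholding, then a dilate/erode pass merges the strokes before contour bounding boxes are taken. A standalone sketch of that pipeline on a synthetic image follows (opencv-python and numpy assumed; the rendered word is just test content).

```python
# Standalone sketch of the threshold + morphology + contour pipeline that
# detect_signature() applies to the lower half of a document image.
import cv2
import numpy as np

gray = np.full((200, 600), 255, dtype=np.uint8)
cv2.putText(gray, "signature", (50, 120), cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, 2, 0, 3)

binary = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 10
)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
binary = cv2.dilate(binary, kernel, iterations=2)   # merge nearby pen strokes
binary = cv2.erode(binary, kernel, iterations=1)

contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
if contours:
    x, y, w, h = cv2.boundingRect(np.concatenate(contours))
    print(f"candidate signature region: {w}x{h} at ({x}, {y})")
```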
@@ -373,7 +387,6 @@ def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
        cv2.rectangle(img_array, (x, y - th - 10), (x + tw + 6, y), color, -1)
        cv2.putText(img_array, label, (x + 3, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    if sig_bbox:
        x1, y1, x2, y2 = sig_bbox
        color = (59, 130, 246)
@@ -383,25 +396,22 @@ def create_annotated_image(image: Image.Image, face_bboxes: list, sig_bbox: Optional[tuple]):
        cv2.rectangle(img_array, (x1, y1 - th - 10), (x1 + tw + 6, y1), color, -1)
        cv2.putText(img_array, label, (x1 + 3, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-
    return Image.fromarray(img_array)


def run_visual_extraction(image: Optional[Image.Image]):
    if image is None:
        return None, None, None, "_Upload an image to detect visual elements._"
-
    image = ensure_rgb(image)
    detections = []

    face_crop, face_bboxes = detect_faces(image)
    if face_crop is not None:
        detections.append(f"β **Face detected** β {len(face_bboxes)} face(s) found, largest extracted")
+    elif face_bboxes:
+        detections.append(f"β οΈ **Face found but too blurry/small** β {len(face_bboxes)} face(s) detected but quality insufficient")
    else:
-
-        detections.append(f"β οΈ **Face found but too blurry/small** β {len(face_bboxes)} face(s) detected but quality insufficient")
-    else:
-        detections.append("β **No face detected** in this image")
+        detections.append("β **No face detected** in this image")

    sig_crop, sig_bbox = detect_signature(image)
    if sig_crop is not None:
@@ -410,10 +420,11 @@ def run_visual_extraction(image: Optional[Image.Image]):
        detections.append("βΉοΈ **No signature detected** in this image")

    annotated = create_annotated_image(image, face_bboxes, sig_bbox)
-    detections.append(
-
-
-
+    detections.append(
+        f"\nπ― **Annotated image** generated with {len(face_bboxes)} face box(es)"
+        + (" + 1 signature box" if sig_bbox else "")
+    )
+    summary_md = "### π Detection Results\n\n" + "\n\n".join(detections)
    return face_crop, sig_crop, annotated, summary_md

@@ -429,11 +440,9 @@ def generate_document_scan(
    if front_image is None and back_image is None:
        yield "β οΈ Please upload at least one image.", "β οΈ Please upload at least one image."
        return
-
    if not prompt.strip():
        prompt = ("Analyze this document. Extract all text, key details "
                  "(name, dates, numbers, etc.) and provide a structured summary.")
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -443,31 +452,18 @@ def generate_document_scan(
    content = []
    if front_image is not None:
        front_image = ensure_rgb(front_image)
-        content.append({"type": "text",
+        content.append({"type": "text", "text": "**[FRONT SIDE]**"})
        content.append({"type": "image", "image": front_image})
-
    if back_image is not None:
        back_image = ensure_rgb(back_image)
-        content.append({"type": "text",
+        content.append({"type": "text", "text": "**[BACK SIDE]**"})
        content.append({"type": "image", "image": back_image})
-
    content.append({"type": "text", "text": prompt})

    messages = [{"role": "user", "content": content}]
-
-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -477,7 +473,6 @@ def generate_document_scan(
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
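Note: the generation path above uses the standard transformers streaming pattern, a TextIteratorStreamer consumed while model.generate runs in a background thread. A minimal text-only sketch of the same pattern follows; the tiny model name "sshleifer/tiny-gpt2" is an assumption chosen only so the example downloads quickly, and any small causal LM would do.

```python
# Minimal sketch of the streamer + background-thread generation pattern.
# The app does the same thing with its vision-language processor and model.
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Streaming test:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20})
thread.start()

buffer = ""
for chunk in streamer:          # yields decoded text as soon as tokens arrive
    buffer += chunk
thread.join()
print(buffer)
```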
@@ -501,7 +496,6 @@ def generate_image_analysis(
        return
    if not text.strip():
        text = "Describe this image in detail."
-
    try:
        processor, model = load_model(model_name)
    except Exception as e:
@@ -509,25 +503,13 @@ def generate_image_analysis(
        return

    image = ensure_rgb(image)
-
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
-        {"type": "text",
+        {"type": "text", "text": text},
    ]}]
-
-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
@@ -537,7 +519,6 @@ def generate_image_analysis(
        "top_p": top_p,
        "top_k": top_k,
    }
-
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    buffer = ""
@@ -560,7 +541,7 @@ def process_batch_images(
    if not prompts_text.strip():
        return "β οΈ Please enter prompts (one per line)."

-    prompts = [p.strip() for p in prompts_text.split('\
+    prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
    if len(prompts) == 1:
        prompts = prompts * len(files)
    elif len(prompts) != len(files):
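Note: the batch tab broadcasts a single prompt to every uploaded file and otherwise requires a one-to-one match. A tiny sketch of that rule with hypothetical file names.

```python
# Sketch of the prompt-broadcast rule used by the batch tab.
files = ["id_front.jpg", "id_back.jpg", "passport.jpg"]
prompts_text = "Extract all text"

prompts = [p.strip() for p in prompts_text.split("\n") if p.strip()]
if len(prompts) == 1:
    prompts = prompts * len(files)          # one prompt reused for every image
elif len(prompts) != len(files):
    raise ValueError("number of prompts must be 1 or match the number of images")

for f, p in zip(files, prompts):
    print(f"{f} -> {p}")
```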
@@ -576,24 +557,13 @@ def process_batch_images(
        try:
            image_path = file.name if hasattr(file, 'name') else file
            image = Image.open(image_path).convert("RGB")
-
            if seed != -1:
                torch.manual_seed(seed + idx - 1)

            messages = [{"role": "user", "content": [
                {"type": "image", "image": image},
-                {"type": "text",
+                {"type": "text", "text": prompt},
            ]}]
-
-            # inputs = processor.apply_chat_template(
-            #     messages,
-            #     tokenize=True,
-            #     add_generation_prompt=True,
-            #     return_dict=True,
-            #     return_tensors="pt"
-            # )
-            # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
            inputs = prepare_inputs(processor, model, messages)

            with torch.no_grad():
@@ -605,7 +575,6 @@ def process_batch_images(
                    top_k=top_k,
                    do_sample=temperature > 0,
                )
-
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
            ]
@@ -617,9 +586,9 @@ def process_batch_images(

            results.append(f"βββ Image {idx}: {os.path.basename(str(image_path))} βββ")
            results.append(f"π Prompt: {prompt}")
-            results.append(f"π Result: {result}\
+            results.append(f"π Result: {result}\n")

-    return "\
+    return "\n".join(results)


# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@@ -650,15 +619,6 @@ def process_chat_message(
    if content:
        messages.append({"role": "user", "content": content})

-    # inputs = processor.apply_chat_template(
-    #     messages,
-    #     tokenize=True,
-    #     add_generation_prompt=True,
-    #     return_dict=True,
-    #     return_tensors="pt"
-    # )
-    # inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-
    inputs = prepare_inputs(processor, model, messages)

    with torch.no_grad():
@@ -669,7 +629,6 @@ def process_chat_message(
            do_sample=True,
            top_p=0.95,
        )
-
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
@@ -682,9 +641,8 @@ def process_chat_message(


def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str):
-    text
+    text = message.get("text", "")
    files = message.get("files", [])
-
    image = None
    if files and len(files) > 0:
        try:
@@ -704,8 +662,8 @@ def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name:
    except Exception as e:
        response = f"β Error: {str(e)}"

-    user_content = f"{text}\
-    history.append({"role": "user",
+    user_content = f"{text}\nπ [Image attached]" if image is not None else text
+    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})
    return "", history

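Note: gr.MultimodalTextbox submits a dict with "text" and "files" keys, which is what chat_fn unpacks before appending user and assistant turns to the history. A small sketch of that payload shape and bookkeeping (pure data, no model call; the file path and the attachment marker text are illustrative).

```python
# Sketch of the payload gr.MultimodalTextbox hands to chat_fn and of the
# history entries the function appends (model call omitted).
message = {"text": "What document is this?", "files": ["/tmp/id_card.png"]}
history = []

text = message.get("text", "")
files = message.get("files", [])
user_content = f"{text}\n[Image attached]" if files else text  # marker simplified

history.append({"role": "user", "content": user_content})
history.append({"role": "assistant", "content": "(model response would go here)"})
print(history)
```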
@@ -719,10 +677,7 @@ def retry_fn(history, model_name):
        return "", history
    history = history[:-1]
    user_content = last_user_msg.get("content", "")
-
-        text = user_content.replace("\\nπ [Image attached]", "").replace("π [Image attached]", "")
-    else:
-        text = user_content
+    text = user_content.replace("\nπ [Image attached]", "").replace("π [Image attached]", "")
    return chat_fn({"text": text}, history, model_name)

@@ -757,28 +712,28 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:

    with gr.Accordion("βοΈ Advanced Generation Parameters", open=False):
        with gr.Row():
-            max_new_tokens
-            temperature
+            max_new_tokens = gr.Slider(64, MAX_MAX_NEW_TOKENS, DEFAULT_MAX_NEW_TOKENS, step=64, label="Max New Tokens")
+            temperature = gr.Slider(0.1, 2.0, 0.6, step=0.1, label="Temperature")
        with gr.Row():
-            top_p
-            top_k
+            top_p = gr.Slider(0.05, 1.0, 0.9, step=0.05, label="Top-p")
+            top_k = gr.Slider(1, 1000, 50, step=1, label="Top-k")
        with gr.Row():
-            repetition_penalty = gr.Slider(1.0, 2.0, 1.2,
-            seed_number
+            repetition_penalty = gr.Slider(1.0, 2.0, 1.2, step=0.05, label="Repetition Penalty")
+            seed_number = gr.Number(value=-1, label="Seed (-1 = random)", precision=0)

    with gr.Tabs():

        # βββ TAB 1: Document Scanner βββ
        with gr.TabItem("πͺͺ Document Scanner"):
            gr.Markdown(
-                "### Scan Front & Back of Documents\
-                "Upload front and/or back side images. Both analyzed together by the selected model.\
+                "### Scan Front & Back of Documents\n"
+                "Upload front and/or back side images. Both analyzed together by the selected model.\n"
                "Face profiles and signatures are **auto-detected** on front image upload."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    doc_front_image = gr.Image(type="pil", label="π Front Side", height=280)
-                    doc_back_image
+                    doc_back_image = gr.Image(type="pil", label="π Back Side", height=280)
                    doc_prompt = gr.Textbox(
                        label="Custom Prompt (optional)", lines=3,
                        placeholder="e.g., Extract all text, MRZ data, name, DOB, ID number...",
@@ -794,9 +749,9 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### π Visual Element Detection _(auto-detected on front image upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                    doc_face_output
+                    doc_face_output = gr.Image(label="π€ Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                    doc_sig_output
+                    doc_sig_output = gr.Image(label="βοΈ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    doc_annotated_output = gr.Image(label="π― Annotated Image (Highlights)", height=220)
            doc_detection_summary = gr.Markdown("_Upload a front side image to detect visual elements._")
@@ -806,7 +761,6 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                inputs=[doc_front_image],
                outputs=[doc_face_output, doc_sig_output, doc_annotated_output, doc_detection_summary],
            )
-
            doc_submit.click(
                fn=generate_document_scan,
                inputs=[model_choice, doc_front_image, doc_back_image, doc_prompt,
@@ -817,14 +771,14 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
        # βββ TAB 2: Image Analysis βββ
        with gr.TabItem("πΌοΈ Image Analysis"):
            gr.Markdown(
-                "### Smart Image Analysis\
+                "### Smart Image Analysis\n"
                "Upload an image to auto-detect **face profiles**, **signatures**, and see "
                "**highlighted annotations**. Then run model analysis with a custom prompt."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    img_upload = gr.Image(type="pil", label="Upload Image", height=320)
-                    img_query
+                    img_query = gr.Textbox(
                        label="Query / Prompt", lines=2,
                        placeholder="What do you see in this image? / Extract all text / Describe in detail...",
                    )
@@ -838,9 +792,9 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### π Visual Element Detection _(auto-detected on upload)_")
            with gr.Row():
                with gr.Column(scale=1):
-                    face_output
+                    face_output = gr.Image(label="π€ Detected Face Profile", height=220, elem_classes="face-box")
                with gr.Column(scale=1):
-                    sig_output
+                    sig_output = gr.Image(label="βοΈ Detected Signature", height=220, elem_classes="sig-box")
                with gr.Column(scale=1):
                    annotated_output = gr.Image(label="π― Annotated Image (Highlights)", height=220)
            detection_summary = gr.Markdown("_Upload an image to detect visual elements._")
@@ -850,7 +804,6 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                inputs=[img_upload],
                outputs=[face_output, sig_output, annotated_output, detection_summary],
            )
-
            img_submit.click(
                fn=generate_image_analysis,
                inputs=[model_choice, img_query, img_upload, max_new_tokens, temperature,
@@ -863,10 +816,10 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
            gr.Markdown("### Process Multiple Images at Once")
            with gr.Row():
                with gr.Column(scale=1):
-                    batch_images
+                    batch_images = gr.File(file_count="multiple", label="Upload Images", file_types=["image"])
                    batch_prompts = gr.Textbox(
                        label="Prompts (one per line)", lines=5,
-                        placeholder="Describe this image in detail\
+                        placeholder="Describe this image in detail\nExtract all text...",
                        info="One prompt for all images OR one prompt per image",
                    )
                    batch_submit = gr.Button("π Process Batch", variant="primary")
@@ -883,23 +836,19 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
        # βββ TAB 4: Chat βββ
        with gr.TabItem("π¬ Chat"):
            gr.Markdown(
-                "### Multi-Turn Chat with Image Attachments\
+                "### Multi-Turn Chat with Image Attachments\n"
                "Converse with the model. Attach images at any point in the conversation."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
-                        "**π‘ Tips:**\
-                        "- Upload an image and ask questions\
-                        "- Detailed descriptions & visual QA\
-                        "- Multi-turn conversation memory\
+                        "**π‘ Tips:**\n"
+                        "- Upload an image and ask questions\n"
+                        "- Detailed descriptions & visual QA\n"
+                        "- Multi-turn conversation memory\n"
                    )
                with gr.Column(scale=3):
-                    chatbot = gr.Chatbot(
-                        label="Chat",
-                        height=450,
-                        value=[],
-                    )
+                    chatbot = gr.Chatbot(label="Chat", height=450, value=[])
            with gr.Row():
                chat_msg = gr.MultimodalTextbox(
                    label="Message",
@@ -908,24 +857,23 @@ with gr.Blocks(title="Chhagan's Multi-Model Studio") as demo:
                )
            with gr.Row():
                retry_btn = gr.Button("π Retry", variant="secondary", size="sm")
-                undo_btn
+                undo_btn = gr.Button("β©οΈ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary", size="sm")

-        chat_msg.submit(chat_fn,
-        retry_btn.click(retry_fn, [chatbot, model_choice],
-        undo_btn.click(undo_fn,
-        clear_btn.click(clear_fn, outputs=[chat_msg, chatbot],
+        chat_msg.submit(chat_fn, [chat_msg, chatbot, model_choice], [chat_msg, chatbot], queue=True)
+        retry_btn.click(retry_fn, [chatbot, model_choice], [chat_msg, chatbot], queue=True)
+        undo_btn.click( undo_fn, [chatbot], [chatbot], queue=False)
+        clear_btn.click(clear_fn, outputs=[chat_msg, chatbot], queue=False)

    gr.Markdown(
-        "---\
-        "**π§ Chhagan's Multi-Model Studio** β’ 12 Models Total\
+        "---\n"
+        "**π§ Chhagan's Multi-Model Studio** β’ 12 Models Total\n\n"
        "Qwen3-VL (2B/4B/8B/32B) Instruct + Qwen2.5-VL (3B/7B) Instruct + "
        "CSM-DocExtract-VL β’ CSM-DocExtract-VL-Q4KM β’ CSM-DocExtract-VL-Q4KM-merged-fp16 β’ "
-        "CSM-DocExtract-VL-HF β’ Chhagan_ML-VL-OCR-v1 β’ Chhagan-DocVL-Qwen3\
+        "CSM-DocExtract-VL-HF β’ Chhagan_ML-VL-OCR-v1 β’ Chhagan-DocVL-Qwen3\n\n"
        "_Built with β€οΈ using Gradio_"
    )

-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Launch
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ