Spaces:

Chhagan005
/

CSM-KIE-Scanner

Sleeping

App Files Files Community

Chhagan005 committed on Mar 9

Commit

660df5d

verified ·

1 Parent(s): 29d1fd9

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +83 -99

app.py CHANGED Viewed

@@ -5,166 +5,170 @@ import torch
 import gc
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
-from qwen_vl_utils import process_vision_info
 import json
 import re
 from typing import Dict, List, Any, Optional
 HF_TOKEN = os.environ.get("HF_TOKEN")
-# ──────────────────────────────────────────────────────────────
-# 1. Smart Memory Cache (From your reference, heavily optimized)
-# ──────────────────────────────────────────────────────────────
 _model_cache = {}
-MAX_CACHED_MODELS = 2  # Limits RAM usage on free HF Space CPU
 def load_model(model_id: str):
-    # 1. Agar cache me hai, wahi se return karo (0 loading time)
     if model_id in _model_cache:
-        print(f"⚡ Fast Load: {model_id} already in cache!")
         return _model_cache[model_id]
-    # 2. RAM check (Agar memory full hai, toh sabse purana model nikal do)
     if len(_model_cache) >= MAX_CACHED_MODELS:
-        oldest_model = list(_model_cache.keys())[0]
-        print(f"🧹 Memory Full! Unloading old model: {oldest_model}")
-        del _model_cache[oldest_model]
         gc.collect()
-    # 3. Pehli baar model load karo
-    print(f"⏳ Loading model into memory: {model_id}")
     try:
         processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
-        # Check for GPU (from reference)
-        device_type = "auto" if torch.cuda.is_available() else "cpu"
         model = AutoModelForImageTextToText.from_pretrained(
-            model_id,
-            device_map=device_type,
-            low_cpu_mem_usage=True,
-            token=HF_TOKEN
         )
         model.eval()
         _model_cache[model_id] = (processor, model)
-        print(f"✅ {model_id} loaded successfully!")
         return processor, model
     except Exception as e:
-        print(f"❌ Error loading {model_id}: {str(e)}")
         return None, None
 def ui_model_change(model_id):
     processor, model = load_model(model_id)
-    if model:
-        return f"✅ Model Active: {model_id} (Cached in Memory)"
     return f"❌ Failed to load {model_id}"
-# ──────────────────────────────────────────────────────────────
-# 2. Enterprise OCR JSON Parsing (Our logic)
-# ──────────────────────────────────────────────────────────────
 def extract_tag(tag, text):
     """Return the stripped text enclosed by ``<tag>...</tag>`` in *text*.

     Matching is case-insensitive and the value may span several lines.
     A well-formed ``<tag>...</tag>`` pair is tried first. Only if none is
     found does the lenient pattern run, which also accepts an anonymous
     ``<>`` opener and a bare or truncated ``</`` closer.

     Args:
         tag: Tag name to look for, e.g. ``"ID"``.
         text: Raw model output to search.

     Returns:
         The enclosed value with surrounding whitespace removed, or
         ``"UNKNOWN"`` when neither pattern matches.
     """
     flags = re.IGNORECASE | re.DOTALL
     # Strict pattern first: the lenient one matches a superset of inputs, so
     # trying it first would make this branch unreachable and let a stray
     # "<>...</" earlier in the text win over the real tag.
     match = re.search(f"<{tag}>(.*?)</{tag}>", text, flags)
     if not match:
         match = re.search(f"<(?:{tag})?>(.*?)</(?:{tag})?", text, flags)
     return match.group(1).strip() if match else "UNKNOWN"
 def build_enterprise_json(raw_text):
-    civ_id = extract_tag("ID", raw_text)
-    name = extract_tag("NAME", raw_text)
-    dob = extract_tag("DOB", raw_text)
-    nat = extract_tag("NAT", raw_text)
     result_json = {
       "DocumentMetadata": {"document_type": "Resident Card", "has_mrz": True},
       "StructuredData": {
-        "civil_number": civ_id, "full_name": name, "date_of_birth": dob, "nationality": nat
       }
     }
     return json.dumps(result_json, indent=2, ensure_ascii=False)
 def run_document_scan(front_img, model_name):
     if front_img is None: return "Error: Please upload document image."
     processor, model = load_model(model_name)
     if not model: return "Error: Model not loaded."
     prompt = "Extract details inside these XML tags ONLY:\n<ID></ID>\n<NAME></NAME>\n<DOB></DOB>\n<NAT></NAT>"
     messages = [{"role": "user", "content": [{"type": "image", "image": front_img}, {"type": "text", "text": prompt}]}]
-    try:
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
-        inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
         with torch.no_grad():
             generated_ids = model.generate(**inputs, max_new_tokens=150, temperature=0.1)
         trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)]
         raw_output = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
         return build_enterprise_json(raw_output)
     except Exception as e:
-         return f"Extraction Failed: {str(e)}"
-# ──────────────────────────────────────────────────────────────
-# 3. Chat Inference (Reference Architecture Logic)
-# ──────────────────────────────────────────────────────────────
-def process_chat(message: str, image: Optional[Image.Image], history: List[Dict[str, Any]], model_name: str) -> str:
     processor, model = load_model(model_name)
     if not model: return "Error: Model not loaded."
     content = []
     if image is not None:
         content.append({"type": "image", "image": image})
-    if message:
-        content.append({"type": "text", "text": message})
-    # Prepare pure history dictionary
-    messages = [{"role": m["role"], "content": m["content"]} for m in history if m.get("role") in ("user", "assistant")]
     if content:
         messages.append({"role": "user", "content": content})
     try:
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
-        inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
         with torch.no_grad():
             generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=0.7, top_p=0.9)
         trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], generated_ids)]
-        return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     except Exception as e:
         return f"❌ Error: {str(e)}"
-# Chat Wrapper handling the UI logic
-def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str):
     text  = message.get("text", "")
     files = message.get("files", [])
     image = None
     if files:
         try: image = Image.open(files[0]).convert("RGB")
-        except Exception as e: print(f"Image load error: {e}")
     response = process_chat(text, image, history, model_name)
-    # Append to history precisely as dictionaries (Fixes all Gradio 5+ type errors)
     display_text = f"{text}\n📎 [Image attached]" if image else text
     history.append({"role": "user", "content": display_text})
     history.append({"role": "assistant", "content": response})
-    # Clears the multimodal textbox on send
     return gr.update(value={"text": "", "files": []}), history
-# ──────────────────────────────────────────────────────────────
-# 4. Gradio Interface (Unified UI)
-# ──────────────────────────────────────────────────────────────
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🪪 CSM Smart Document Engine")
-    gr.Markdown("_Unified architecture with On-Demand Caching & Multi-Turn Chat_")
     with gr.Row(variant="panel"):
         model_dropdown = gr.Dropdown(
@@ -172,50 +176,30 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 "Chhagan005/CSM-KIE-Universal",
                 "Chhagan005/CSM-DocExtract-8N",
                 "Chhagan005/CSM-DocExtract-4N",
-                "Chhagan005/CSM-DocExtract-2N"
             ],
-            label="🤖 Select Model",
-            value="Chhagan005/CSM-KIE-Universal",
-            interactive=True
         )
         status_bar = gr.Textbox(label="Memory Status", value="Select a model to load into memory", interactive=False)
-    # Load model dynamically when dropdown changes
     model_dropdown.change(fn=ui_model_change, inputs=[model_dropdown], outputs=[status_bar])
     with gr.Tabs():
-        # TAB 1: Document Scan
         with gr.TabItem("📄 Document Scanner"):
             with gr.Row():
                 with gr.Column():
-                    doc_img = gr.Image(type="pil", label="Upload ID Card")
                     scan_btn = gr.Button("🔍 Extract JSON", variant="primary")
                 with gr.Column():
                     json_output = gr.Code(language="json", label="Enterprise Result")
             scan_btn.click(fn=run_document_scan, inputs=[doc_img, model_dropdown], outputs=[json_output])
-        # TAB 2: Multimodal Chat
         with gr.TabItem("💬 Intelligent Chat"):
-            gr.Markdown("**Tips:** Upload an image using the + icon inside the chatbox.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    # Pure Gradio Chatbot (No type=tuples needed since we pass strict dicts now)
-                    chatbot = gr.Chatbot(label="Chat History", height=450, value=[])
-                    # Multimodal box exactly like your reference
-                    chat_msg = gr.MultimodalTextbox(
-                        label="Message",
-                        placeholder="Type a message or click 📎 to upload an image...",
-                        file_types=["image"],
-                        submit_btn=True
-                    )
-            # Submitting the Multimodal Box
-            chat_msg.submit(
-                fn=chat_fn,
-                inputs=[chat_msg, chatbot, model_dropdown],
-                outputs=[chat_msg, chatbot]
             )
-# Kickoff initialization
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import gc
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 import json
 import re
 from typing import Dict, List, Any, Optional
 HF_TOKEN = os.environ.get("HF_TOKEN")
+# ── Model Cache ──────────────────────────────────────────────
 _model_cache = {}
+MAX_CACHED_MODELS = 2
+QWEN_VL_IMG_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
 def load_model(model_id: str):
     """Return a cached ``(processor, model)`` pair for *model_id*, loading on a miss.

     At most ``MAX_CACHED_MODELS`` entries are kept. When the cache is full,
     the entry that was inserted first is dropped before the new model loads.

     Args:
         model_id: Hugging Face Hub repository id of the model to load.

     Returns:
         ``(processor, model)`` on success, or ``(None, None)`` if loading
         raised. The failure reason is printed to the log.
     """
     if model_id in _model_cache:
         print(f"⚡ Cache Hit: {model_id}")
         return _model_cache[model_id]

     if len(_model_cache) >= MAX_CACHED_MODELS:
         # Dicts keep insertion order, so the first key is the oldest entry.
         oldest = next(iter(_model_cache))
         print(f"🧹 Unloading: {oldest}")
         del _model_cache[oldest]
         gc.collect()
         if torch.cuda.is_available():
             # Hand cached GPU blocks back so the next model has room to load.
             torch.cuda.empty_cache()

     print(f"⏳ Loading: {model_id}")
     try:
         processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
         device_map = "auto" if torch.cuda.is_available() else "cpu"
         model = AutoModelForImageTextToText.from_pretrained(
             model_id, device_map=device_map, low_cpu_mem_usage=True, token=HF_TOKEN
         )
         model.eval()
     except Exception as e:
         # Callers rely on the (None, None) contract, but the cause must still
         # reach the logs instead of being swallowed silently.
         print(f"❌ Error loading {model_id}: {e}")
         return None, None

     _model_cache[model_id] = (processor, model)
     print(f"✅ Loaded: {model_id}")
     return processor, model
 def ui_model_change(model_id):
     """Load *model_id* through ``load_model`` and return a status line for the UI."""
     _, loaded = load_model(model_id)
     status = f"✅ Model Active: {model_id}" if loaded else f"❌ Failed to load {model_id}"
     return status
+# ── THE FIX: prepare_inputs (from your reference app.py) ──────
+# This function converts mixed content (string + list) into a flat
+# format and hands it to the processor safely.
+def prepare_inputs(processor, model, messages: List[Dict]) -> Dict:
+    pil_images = []
+    flat_messages = []
+    for msg in messages:
+        role    = msg.get("role", "user")
+        content = msg.get("content", "")
+        if isinstance(content, list):
+            parts = []
+            for item in content:
+                if not isinstance(item, dict):
+                    parts.append(str(item))
+                    continue
+                t = item.get("type", "")
+                if t == "text":
+                    parts.append(item.get("text", ""))
+                elif t == "image":
+                    img = item.get("image")
+                    if img is not None and isinstance(img, Image.Image):
+                        pil_images.append(img)
+                    parts.append(QWEN_VL_IMG_TOKEN)
+            flat_messages.append({"role": role, "content": "".join(parts)})
+        else:
+            # History string messages directly add kar do
+            flat_messages.append({"role": role, "content": str(content)})
+    text = processor.apply_chat_template(flat_messages, tokenize=False, add_generation_prompt=True)
+    if pil_images and hasattr(processor, "image_processor"):
+        inputs = processor(text=[text], images=pil_images, padding=True, return_tensors="pt")
+    else:
+        inputs = processor(text=[text], padding=True, return_tensors="pt")
+    return {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+# ── Enterprise OCR ────────────────────────────────────────────
 def extract_tag(tag, text):
     """Return the stripped text enclosed by ``<tag>...</tag>`` in *text*.

     Matching is case-insensitive and the value may span several lines.
     A well-formed ``<tag>...</tag>`` pair is tried first. Only if none is
     found does the lenient pattern run, which also accepts an anonymous
     ``<>`` opener and a bare or truncated ``</`` closer.

     Args:
         tag: Tag name to look for, e.g. ``"ID"``.
         text: Raw model output to search.

     Returns:
         The enclosed value with surrounding whitespace removed, or
         ``"UNKNOWN"`` when neither pattern matches.
     """
     flags = re.IGNORECASE | re.DOTALL
     # Strict pattern first: the lenient one matches a superset of inputs, so
     # trying it first would make this branch unreachable and let a stray
     # "<>...</" earlier in the text win over the real tag.
     match = re.search(f"<{tag}>(.*?)</{tag}>", text, flags)
     if not match:
         match = re.search(f"<(?:{tag})?>(.*?)</(?:{tag})?", text, flags)
     return match.group(1).strip() if match else "UNKNOWN"
 def build_enterprise_json(raw_text):
     """Render the tagged values found in *raw_text* as a pretty-printed JSON string.

     The ``ID``, ``NAME``, ``DOB`` and ``NAT`` tags are read with
     ``extract_tag`` and stored under ``StructuredData``. ``DocumentMetadata``
     is a fixed record.
     """
     field_tags = (
         ("civil_number", "ID"),
         ("full_name", "NAME"),
         ("date_of_birth", "DOB"),
         ("nationality", "NAT"),
     )
     structured = {field: extract_tag(tag, raw_text) for field, tag in field_tags}
     metadata = {"document_type": "Resident Card", "has_mrz": True}
     payload = {"DocumentMetadata": metadata, "StructuredData": structured}
     return json.dumps(payload, indent=2, ensure_ascii=False)
 def run_document_scan(front_img, model_name):
     """Run tag-based extraction on *front_img* with the selected model.

     Args:
         front_img: Uploaded document image, or ``None`` when nothing was uploaded.
         model_name: Model id passed to ``load_model``.

     Returns:
         The JSON string produced by ``build_enterprise_json``, or an error
         message string when the image is missing, the model is unavailable,
         or inference raises.
     """
     if front_img is None:
         return "Error: Please upload document image."

     processor, model = load_model(model_name)
     if not model:
         return "Error: Model not loaded."

     prompt = "\n".join([
         "Extract details inside these XML tags ONLY:",
         "<ID></ID>",
         "<NAME></NAME>",
         "<DOB></DOB>",
         "<NAT></NAT>",
     ])
     user_turn = {
         "role": "user",
         "content": [
             {"type": "image", "image": front_img},
             {"type": "text", "text": prompt},
         ],
     }
     try:
         inputs = prepare_inputs(processor, model, [user_turn])
         with torch.no_grad():
             generated_ids = model.generate(**inputs, max_new_tokens=150, temperature=0.1)
         # Keep only the tokens generated after each prompt.
         prompt_ids = inputs["input_ids"]
         new_tokens = [full[len(given):] for given, full in zip(prompt_ids, generated_ids)]
         decoded = processor.batch_decode(new_tokens, skip_special_tokens=True)
         return build_enterprise_json(decoded[0])
     except Exception as exc:
         return f"Extraction Failed: {exc!s}"
+# ── Chat ──────────────────────────────────────────────────────
+def process_chat(text: str, image: Optional[Image.Image], history: List[Dict], model_name: str) -> str:
     processor, model = load_model(model_name)
     if not model: return "Error: Model not loaded."
+    # Build history messages first
+    messages = [{"role": m["role"], "content": m["content"]}
+                for m in history if m.get("role") in ("user", "assistant")]
+    # Current message with optional image (as list)
     content = []
     if image is not None:
         content.append({"type": "image", "image": image})
+    if text:
+        content.append({"type": "text", "text": text})
     if content:
         messages.append({"role": "user", "content": content})
     try:
+        # prepare_inputs now handles mixed string/list content safely
+        inputs = prepare_inputs(processor, model, messages)
         with torch.no_grad():
             generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=0.7, top_p=0.9)
         trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], generated_ids)]
+        return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
     except Exception as e:
         return f"❌ Error: {str(e)}"
+def chat_fn(message: Dict[str, Any], history: List[Dict], model_name: str):
     text  = message.get("text", "")
     files = message.get("files", [])
     image = None
     if files:
         try: image = Image.open(files[0]).convert("RGB")
+        except Exception as e: print(f"Image error: {e}")
     response = process_chat(text, image, history, model_name)
     display_text = f"{text}\n📎 [Image attached]" if image else text
     history.append({"role": "user", "content": display_text})
     history.append({"role": "assistant", "content": response})
     return gr.update(value={"text": "", "files": []}), history
+# ── Gradio UI ─────────────────────────────────────────────────
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🪪 CSM Smart Document Engine")
+    gr.Markdown("_On-Demand Caching • Document Scanner • Intelligent Multi-Turn Chat_")
     with gr.Row(variant="panel"):
         model_dropdown = gr.Dropdown(
                 "Chhagan005/CSM-KIE-Universal",
                 "Chhagan005/CSM-DocExtract-8N",
                 "Chhagan005/CSM-DocExtract-4N",
             ],
+            label="🤖 Select Model", value="Chhagan005/CSM-KIE-Universal", interactive=True
         )
         status_bar = gr.Textbox(label="Memory Status", value="Select a model to load into memory", interactive=False)
     model_dropdown.change(fn=ui_model_change, inputs=[model_dropdown], outputs=[status_bar])
     with gr.Tabs():
         with gr.TabItem("📄 Document Scanner"):
             with gr.Row():
                 with gr.Column():
+                    doc_img  = gr.Image(type="pil", label="Upload ID Card")
                     scan_btn = gr.Button("🔍 Extract JSON", variant="primary")
                 with gr.Column():
                     json_output = gr.Code(language="json", label="Enterprise Result")
             scan_btn.click(fn=run_document_scan, inputs=[doc_img, model_dropdown], outputs=[json_output])
         with gr.TabItem("💬 Intelligent Chat"):
+            chatbot  = gr.Chatbot(label="Chat History", height=450, value=[])
+            chat_msg = gr.MultimodalTextbox(
+                label="Message", placeholder="Type a message or click 📎 to attach an image...",
+                file_types=["image"], submit_btn=True
             )
+            chat_msg.submit(fn=chat_fn, inputs=[chat_msg, chatbot, model_dropdown], outputs=[chat_msg, chatbot])
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)