Upload app.py with huggingface_hub
app.py CHANGED
Old version (lines removed in this commit are prefixed with -):

@@ -3,48 +3,68 @@ import os
 import gradio as gr
 import torch
 import gc
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import json
 import re
-
-# Global State
-current_model_id = None
-model = None
-processor = None

 HF_TOKEN = os.environ.get("HF_TOKEN")

 try:
-    gc.collect()
-    print(f"Loading {repo_id}...")
-    processor = AutoProcessor.from_pretrained(repo_id, token=HF_TOKEN)
     model = AutoModelForImageTextToText.from_pretrained(
-        device_map=
         low_cpu_mem_usage=True,
         token=HF_TOKEN
     )
     model.eval()
 except Exception as e:

 def extract_tag(tag, text):
     match = re.search(f"<(?:{tag})?>(.*?)</(?:{tag})?", text, re.IGNORECASE)
-    if not match:
-        match = re.search(f"<{tag}>(.*?)</{tag}>", text, re.IGNORECASE)
     return match.group(1).strip() if match else "UNKNOWN"

 def build_enterprise_json(raw_text):
@@ -61,36 +81,91 @@
     }
     return json.dumps(result_json, indent=2, ensure_ascii=False)

-def
-    if
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_text}]}]
     try:
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
         inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
-        inputs = {k: v.to(
         with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=
     except Exception as e:
         return f"Extraction Failed: {str(e)}"

     with gr.Row(variant="panel"):
         model_dropdown = gr.Dropdown(
             choices=[
@@ -99,46 +174,48 @@ with gr.Blocks() as demo:
                 "Chhagan005/CSM-DocExtract-4N",
                 "Chhagan005/CSM-DocExtract-2N"
             ],
-            label="
-            value="Chhagan005/CSM-KIE-Universal"
         )

     with gr.Tabs():
         with gr.Row():
             with gr.Column():
             with gr.Column():
-                json_output = gr.Code(language="json", label="
         with gr.Row():
             with gr.Column(scale=1):
-                #
-            # Appending as a tuple (user_message, ai_response) which matches type="tuples"
-            chat_history.append((user_message, ai_response))
-            return "", chat_history
-    send_btn.click(chat_wrapper, inputs=[chat_img_input, chat_input, chatbot], outputs=[chat_input, chatbot])

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
New version (lines added in this commit are prefixed with +):

 import gradio as gr
 import torch
 import gc
+from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import json
 import re
+from typing import Dict, List, Any, Optional

 HF_TOKEN = os.environ.get("HF_TOKEN")

+# ──────────────────────────────────────────────────────────────
+# 1. Smart Memory Cache (from your reference, heavily optimized)
+# ──────────────────────────────────────────────────────────────
+_model_cache = {}
+MAX_CACHED_MODELS = 2  # Limits RAM usage on the free HF Space CPU
+
+def load_model(model_id: str):
+    # 1. If it is already in the cache, return it from there (zero load time)
+    if model_id in _model_cache:
+        print(f"⚡ Fast Load: {model_id} already in cache!")
+        return _model_cache[model_id]
+
+    # 2. RAM check (if memory is full, evict the oldest model)
+    if len(_model_cache) >= MAX_CACHED_MODELS:
+        oldest_model = list(_model_cache.keys())[0]
+        print(f"🧹 Memory Full! Unloading old model: {oldest_model}")
+        del _model_cache[oldest_model]
+        gc.collect()
+
+    # 3. Load the model for the first time
+    print(f"⏳ Loading model into memory: {model_id}")
     try:
+        processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
+        # Check for GPU (from reference)
+        device_type = "auto" if torch.cuda.is_available() else "cpu"

         model = AutoModelForImageTextToText.from_pretrained(
+            model_id,
+            device_map=device_type,
             low_cpu_mem_usage=True,
             token=HF_TOKEN
         )
         model.eval()
+
+        _model_cache[model_id] = (processor, model)
+        print(f"✅ {model_id} loaded successfully!")
+        return processor, model
     except Exception as e:
+        print(f"❌ Error loading {model_id}: {str(e)}")
+        return None, None
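Review note: the eviction above is FIFO — `list(_model_cache.keys())[0]` drops the oldest *inserted* entry even if it was just used. If least-recently-used behavior is wanted, here is a minimal sketch with `collections.OrderedDict`; the name `load_model_lru` and the separate `_lru_cache` are illustrative, not part of this commit:

from collections import OrderedDict

_lru_cache = OrderedDict()  # model_id -> (processor, model)

def load_model_lru(model_id: str):
    if model_id in _lru_cache:
        _lru_cache.move_to_end(model_id)   # mark as most recently used
        return _lru_cache[model_id]
    if len(_lru_cache) >= MAX_CACHED_MODELS:
        _lru_cache.popitem(last=False)     # evict the least recently used entry
        gc.collect()
    pair = load_model(model_id)            # reuse the loader above
    if pair != (None, None):               # (load_model keeps its own cache; a real refactor would merge the two)
        _lru_cache[model_id] = pair
    return pair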
|
+def ui_model_change(model_id):
+    processor, model = load_model(model_id)
+    if model:
+        return f"✅ Model Active: {model_id} (Cached in Memory)"
+    return f"❌ Failed to load {model_id}"
|
+# ──────────────────────────────────────────────────────────────
+# 2. Enterprise OCR JSON Parsing (our logic)
+# ──────────────────────────────────────────────────────────────
 def extract_tag(tag, text):
     match = re.search(f"<(?:{tag})?>(.*?)</(?:{tag})?", text, re.IGNORECASE)
+    if not match:
+        match = re.search(f"<{tag}>(.*?)</{tag}>", text, re.IGNORECASE)
     return match.group(1).strip() if match else "UNKNOWN"

 def build_enterprise_json(raw_text):
     ... (unchanged lines hidden)
     }
     return json.dumps(result_json, indent=2, ensure_ascii=False)
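A quick illustrative check of the two regex passes (the sample string is made up): the lenient first pattern tolerates a malformed closing tag, and absent tags fall through to "UNKNOWN".

sample = "<NAME>JANE DOE</NAME\n<DOB>01-01-1990</DOB>"
print(extract_tag("NAME", sample))  # JANE DOE  (closing tag missing '>' still matches)
print(extract_tag("NAT", sample))   # UNKNOWN   (tag not present)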
|
+def run_document_scan(front_img, model_name):
+    if front_img is None:
+        return "Error: Please upload a document image."
+
+    processor, model = load_model(model_name)
+    if not model:
+        return "Error: Model not loaded."
+
+    prompt = "Extract details inside these XML tags ONLY:\n<ID></ID>\n<NAME></NAME>\n<DOB></DOB>\n<NAT></NAT>"
+    messages = [{"role": "user", "content": [{"type": "image", "image": front_img}, {"type": "text", "text": prompt}]}]

     try:
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
         inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+        inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}

         with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=150, temperature=0.1, do_sample=True)  # do_sample so temperature takes effect

+        trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)]
+        raw_output = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
+        return build_enterprise_json(raw_output)
     except Exception as e:
         return f"Extraction Failed: {str(e)}"
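run_document_scan relies on the model echoing the four tags back; build_enterprise_json (its body is collapsed in this diff) then wraps the extracted fields into the final JSON. A hypothetical raw model output and the field extraction it feeds:

raw = "<ID>X1234567</ID>\n<NAME>JANE DOE</NAME>\n<DOB>01-01-1990</DOB>\n<NAT>IND</NAT>"
fields = {tag: extract_tag(tag, raw) for tag in ("ID", "NAME", "DOB", "NAT")}
# {'ID': 'X1234567', 'NAME': 'JANE DOE', 'DOB': '01-01-1990', 'NAT': 'IND'}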
|
+# ──────────────────────────────────────────────────────────────
+# 3. Chat Inference (Reference Architecture Logic)
+# ──────────────────────────────────────────────────────────────
+def process_chat(message: str, image: Optional[Image.Image], history: List[Dict[str, Any]], model_name: str) -> str:
+    processor, model = load_model(model_name)
+    if not model:
+        return "Error: Model not loaded."
+
+    content = []
+    if image is not None:
+        content.append({"type": "image", "image": image})
+    if message:
+        content.append({"type": "text", "text": message})

+    # Keep only well-formed user/assistant turns from the history
+    messages = [{"role": m["role"], "content": m["content"]} for m in history if m.get("role") in ("user", "assistant")]
+    if content:
+        messages.append({"role": "user", "content": content})
+
+    try:
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+
+        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+        inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
+
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=0.7, top_p=0.9, do_sample=True)  # do_sample so temperature/top_p take effect
+
+        trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], generated_ids)]
+        return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
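For orientation, the messages list that process_chat hands to apply_chat_template / process_vision_info mixes string-content turns (prior history) with a list-content turn for the current multimodal message. A minimal example of the shape (values illustrative; pil_image is a placeholder):

from PIL import Image

pil_image = Image.new("RGB", (64, 64))  # stands in for an uploaded image
messages = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi! Upload a document to scan."},
    {"role": "user", "content": [
        {"type": "image", "image": pil_image},
        {"type": "text", "text": "What does this ID say?"},
    ]},
]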
|
+# Chat wrapper handling the UI logic
+def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str):
+    text = message.get("text", "")
+    files = message.get("files", [])

+    image = None
+    if files:
+        try:
+            image = Image.open(files[0]).convert("RGB")
+        except Exception as e:
+            print(f"Image load error: {e}")
+
+    response = process_chat(text, image, history, model_name)
+
+    # Append to history as role/content dictionaries (avoids Gradio 5+ chat type errors)
+    display_text = f"{text}\n📎 [Image attached]" if image else text
+    history.append({"role": "user", "content": display_text})
+    history.append({"role": "assistant", "content": response})
+
+    # Clear the multimodal textbox on send
+    return gr.update(value={"text": "", "files": []}), history
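After one round trip through chat_fn, the history that feeds gr.Chatbot is a list of plain role/content dicts (Gradio's "messages" format); e.g. (values illustrative):

history = [
    {"role": "user", "content": "Read this card\n📎 [Image attached]"},
    {"role": "assistant", "content": "The card shows: JANE DOE, DOB 01-01-1990"},
]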
|
+# ──────────────────────────────────────────────────────────────
+# 4. Gradio Interface (Unified UI)
+# ──────────────────────────────────────────────────────────────
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🪪 CSM Smart Document Engine")
+    gr.Markdown("_Unified architecture with On-Demand Caching & Multi-Turn Chat_")
+
     with gr.Row(variant="panel"):
         model_dropdown = gr.Dropdown(
             choices=[
                 ... (unchanged lines hidden)
                 "Chhagan005/CSM-DocExtract-4N",
                 "Chhagan005/CSM-DocExtract-2N"
             ],
+            label="🤖 Select Model",
+            value="Chhagan005/CSM-KIE-Universal",
+            interactive=True
         )
+        status_bar = gr.Textbox(label="Memory Status", value="Select a model to load into memory", interactive=False)
+
+    # Load the model dynamically when the dropdown changes
+    model_dropdown.change(fn=ui_model_change, inputs=[model_dropdown], outputs=[status_bar])

     with gr.Tabs():
+        # TAB 1: Document Scan
+        with gr.TabItem("📄 Document Scanner"):
             with gr.Row():
                 with gr.Column():
+                    doc_img = gr.Image(type="pil", label="Upload ID Card")
+                    scan_btn = gr.Button("🔍 Extract JSON", variant="primary")
                 with gr.Column():
+                    json_output = gr.Code(language="json", label="Enterprise Result")
+            scan_btn.click(fn=run_document_scan, inputs=[doc_img, model_dropdown], outputs=[json_output])
+
+        # TAB 2: Multimodal Chat
+        with gr.TabItem("💬 Intelligent Chat"):
+            gr.Markdown("**Tips:** Upload an image using the + icon inside the chatbox.")
             with gr.Row():
                 with gr.Column(scale=1):
+                    # History entries are role/content dicts, so use the "messages" chatbot type (Gradio 5)
+                    chatbot = gr.Chatbot(label="Chat History", height=450, value=[], type="messages")
+                    # Multimodal box exactly like your reference
+                    chat_msg = gr.MultimodalTextbox(
+                        label="Message",
+                        placeholder="Type a message or click 📎 to upload an image...",
+                        file_types=["image"],
+                        submit_btn=True
+                    )

+                    # Submit handler for the multimodal box
+                    chat_msg.submit(
+                        fn=chat_fn,
+                        inputs=[chat_msg, chatbot, model_dropdown],
+                        outputs=[chat_msg, chatbot]
+                    )
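One possible hardening for the CPU tier, since a single generation can take minutes: enable Gradio's request queue before launching so concurrent requests wait instead of timing out (max_size is an assumption, tune to taste):

demo.queue(max_size=8)  # queue long-running generate() calls instead of timing out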

+# Kickoff initialization
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)