Spaces:

johnbridges
/

TestHolo

Sleeping

App Files Files Community

johnbridges committed on Aug 10, 2025

Commit

dc49ef7

verified ·

1 Parent(s): e15bd89

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -29

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import gradio as gr
 import json
 import os
 from typing import Any, List, Dict
@@ -23,6 +23,7 @@ def locate_text_backbone(model):
     Tries common attribute names used by VLMs to find the LLM/text stack.
     Falls back to the whole model if unknown.
     """
     for name in [
         "language_model",   # e.g., model.language_model
         "text_model",       # e.g., model.text_model
@@ -33,20 +34,31 @@ def locate_text_backbone(model):
         m = getattr(model, name, None)
         if m is not None:
             return m, name
     for name, child in model.named_children():
         if hasattr(child, "lm_head") or hasattr(child, "get_input_embeddings"):
             return child, name
     return model, None
 def pick_device() -> str:
-    return "cpu"  # force CPU
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str:
     tok = getattr(processor, "tokenizer", None)
     if hasattr(processor, "apply_chat_template"):
         return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     if tok is not None and hasattr(tok, "apply_chat_template"):
         return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     texts = []
     for m in messages:
         for c in m.get("content", []):
@@ -63,6 +75,9 @@ def batch_decode_compat(processor, token_id_batches, **kw):
     raise AttributeError("No batch_decode available on processor or tokenizer.")
 def get_image_proc_params(processor) -> Dict[str, int]:
     ip = getattr(processor, "image_processor", None)
     return {
         "patch_size": getattr(ip, "patch_size", 14),
@@ -72,6 +87,9 @@ def get_image_proc_params(processor) -> Dict[str, int]:
     }
 def trim_generated(generated_ids, inputs):
     in_ids = getattr(inputs, "input_ids", None)
     if in_ids is None and isinstance(inputs, dict):
         in_ids = inputs.get("input_ids", None)
@@ -87,34 +105,13 @@ model_loaded = False
 load_error_message = ""
 try:
     model = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.bfloat16,   # CPU-friendly
         trust_remote_code=True
     ).to(pick_device())
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-    # >>> INT8 QUANT START -----------------------------------------------------
-    # Quantize only the text/LLM backbone (nn.Linear layers) to dynamic INT8.
-    text_backbone, attr_name = locate_text_backbone(model)
-    print("[INT8] Quantizing text backbone with dynamic INT8...")
-    quantized_llm = quantize_dynamic(
-        text_backbone,
-        {torch.nn.Linear},
-        dtype=torch.qint8
-    )
-    if attr_name is not None:
-        setattr(model, attr_name, quantized_llm)
-    else:
-        for name, child in list(model.named_children()):
-            if child is text_backbone:
-                setattr(model, name, quantized_llm)
-                break
-    torch.set_num_threads(max(1, os.cpu_count() or 1))
-    model.eval()
-    print("[INT8] Done.")
-    # <<< INT8 QUANT END -------------------------------------------------------
     model_loaded = True
     print("Model and processor loaded successfully.")
 except Exception as e:
@@ -148,11 +145,16 @@ def run_inference_localization(
     messages_for_template: List[dict[str, Any]],
     pil_image_for_processing: Image.Image
 ) -> str:
     try:
         model.to(pick_device())
         text_prompt = apply_chat_template_compat(processor, messages_for_template)
         inputs = processor(
             text=[text_prompt],
             images=[pil_image_for_processing],
@@ -160,19 +162,23 @@ def run_inference_localization(
             return_tensors="pt",
         )
         if isinstance(inputs, dict):
             for k, v in list(inputs.items()):
                 if hasattr(v, "to"):
                     inputs[k] = v.to(model.device)
         generated_ids = model.generate(
             **inputs,
             max_new_tokens=128,
             do_sample=False,
         )
         generated_ids_trimmed = trim_generated(generated_ids, inputs)
         decoded_output = batch_decode_compat(
             processor,
             generated_ids_trimmed,
@@ -195,6 +201,7 @@ def predict_click_location(input_pil_image: Image.Image, instruction: str):
     if not instruction or instruction.strip() == "":
         return "No instruction provided. Please type an instruction.", input_pil_image.copy().convert("RGB")
     try:
         ip = get_image_proc_params(processor)
         resized_height, resized_width = smart_resize(
@@ -213,18 +220,22 @@ def predict_click_location(input_pil_image: Image.Image, instruction: str):
         traceback.print_exc()
         return f"Error resizing image: {e}", input_pil_image.copy().convert("RGB")
     messages = get_localization_prompt(resized_image, instruction)
     try:
         coordinates_str = run_inference_localization(messages, resized_image)
     except Exception as e:
         return f"Error during model inference: {e}", resized_image.copy().convert("RGB")
     output_image_with_click = resized_image.copy().convert("RGB")
     match = re.search(r"Click\((\d+),\s*(\d+)\)", coordinates_str)
     if match:
         try:
-            x = int(match.group(1)); y = int(match.group(2))
             draw = ImageDraw.Draw(output_image_with_click)
             radius = max(5, min(resized_width // 100, resized_height // 100, 15))
             bbox = (x - radius, y - radius, x + radius, y + radius)
@@ -255,7 +266,7 @@ except Exception as e:
         pass
 # --- Gradio UI ---
-title = "Holo1-3B: Action VLM Localization Demo (CPU + INT8 text)"
 article = f"""
 <p style='text-align: center'>
 Model: <a href='https://huggingface.co/{MODEL_ID}' target='_blank'>{MODEL_ID}</a> by HCompany |
@@ -313,5 +324,5 @@ else:
         )
 if __name__ == "__main__":
-    demo.launch(debug=True)

+This is what I have so far: import gradio as gr
 import json
 import os
 from typing import Any, List, Dict
     Tries common attribute names used by VLMs to find the LLM/text stack.
     Falls back to the whole model if unknown.
     """
+    # common in Qwen-like / custom repos
     for name in [
         "language_model",   # e.g., model.language_model
         "text_model",       # e.g., model.text_model
         m = getattr(model, name, None)
         if m is not None:
             return m, name
+    # last resort: look for a child that has an lm_head or tied weights
     for name, child in model.named_children():
         if hasattr(child, "lm_head") or hasattr(child, "get_input_embeddings"):
             return child, name
+    # if still not found, return the model itself
     return model, None
 def pick_device() -> str:
+    # Force CPU per request
+    return "cpu"
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str:
+    """
+    Works whether apply_chat_template lives on the processor or tokenizer,
+    or not at all (falls back to naive text join of 'text' contents).
+    """
     tok = getattr(processor, "tokenizer", None)
     if hasattr(processor, "apply_chat_template"):
         return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     if tok is not None and hasattr(tok, "apply_chat_template"):
         return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Fallback: concatenate visible text segments
     texts = []
     for m in messages:
         for c in m.get("content", []):
     raise AttributeError("No batch_decode available on processor or tokenizer.")
 def get_image_proc_params(processor) -> Dict[str, int]:
+    """
+    Safely access image processor params with defaults that work for Qwen2-VL family.
+    """
     ip = getattr(processor, "image_processor", None)
     return {
         "patch_size": getattr(ip, "patch_size", 14),
     }
 def trim_generated(generated_ids, inputs):
+    """
+    Trim prompt tokens from generated tokens when input_ids exist.
+    """
     in_ids = getattr(inputs, "input_ids", None)
     if in_ids is None and isinstance(inputs, dict):
         in_ids = inputs.get("input_ids", None)
 load_error_message = ""
 try:
+    # CPU-friendly dtype; fp16 on CPU is spotty, so prefer bfloat16
     model = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID,
+        torch_dtype=torch.bfloat16,
         trust_remote_code=True
     ).to(pick_device())
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
     model_loaded = True
     print("Model and processor loaded successfully.")
 except Exception as e:
     messages_for_template: List[dict[str, Any]],
     pil_image_for_processing: Image.Image
 ) -> str:
+    """
+    CPU inference; robust to processor/tokenizer differences and logs full traceback on failure.
+    """
     try:
         model.to(pick_device())
+        # 1) Build prompt text via robust helper
         text_prompt = apply_chat_template_compat(processor, messages_for_template)
+        # 2) Prepare inputs (text + image)
         inputs = processor(
             text=[text_prompt],
             images=[pil_image_for_processing],
             return_tensors="pt",
         )
+        # Move tensor inputs to the same device as model (CPU)
         if isinstance(inputs, dict):
             for k, v in list(inputs.items()):
                 if hasattr(v, "to"):
                     inputs[k] = v.to(model.device)
+        # 3) Generate (deterministic)
         generated_ids = model.generate(
             **inputs,
             max_new_tokens=128,
             do_sample=False,
         )
+        # 4) Trim prompt tokens if possible
         generated_ids_trimmed = trim_generated(generated_ids, inputs)
+        # 5) Decode via robust helper
         decoded_output = batch_decode_compat(
             processor,
             generated_ids_trimmed,
     if not instruction or instruction.strip() == "":
         return "No instruction provided. Please type an instruction.", input_pil_image.copy().convert("RGB")
+    # 1) Resize according to image processor params (safe defaults if missing)
     try:
         ip = get_image_proc_params(processor)
         resized_height, resized_width = smart_resize(
         traceback.print_exc()
         return f"Error resizing image: {e}", input_pil_image.copy().convert("RGB")
+    # 2) Build messages with image + instruction
     messages = get_localization_prompt(resized_image, instruction)
+    # 3) Run inference
     try:
         coordinates_str = run_inference_localization(messages, resized_image)
     except Exception as e:
         return f"Error during model inference: {e}", resized_image.copy().convert("RGB")
+    # 4) Parse coordinates and draw marker
     output_image_with_click = resized_image.copy().convert("RGB")
     match = re.search(r"Click\((\d+),\s*(\d+)\)", coordinates_str)
     if match:
         try:
+            x = int(match.group(1))
+            y = int(match.group(2))
             draw = ImageDraw.Draw(output_image_with_click)
             radius = max(5, min(resized_width // 100, resized_height // 100, 15))
             bbox = (x - radius, y - radius, x + radius, y + radius)
         pass
 # --- Gradio UI ---
+title = "Holo1-3B: Action VLM Localization Demo (CPU)"
 article = f"""
 <p style='text-align: center'>
 Model: <a href='https://huggingface.co/{MODEL_ID}' target='_blank'>{MODEL_ID}</a> by HCompany |
         )
 if __name__ == "__main__":
+    # CPU Spaces can be slow; keep debug True for logs
+    demo.launch(debug=True)  ... I can't see where to put it all