Spaces:

Sairesh
/

Zoodow

Runtime error

App Files Files Community

Sairesh commited on Jan 5

Commit

5abb768

verified ·

1 Parent(s): e13211e

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -71

app.py CHANGED Viewed

@@ -2,39 +2,46 @@ import os
 import torch
 import gc
 import gradio as gr
-from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
 # Configuration
 MODELS = {
     "Dolphin-Uncensored (Fast)": "cognitivetech/Dolphin-2.9-Qwen2-0.5B",
     "Qwen-2.5 (Standard)": "Qwen/Qwen2.5-0.5B-Instruct"
 }
-# MODERN FIX: Using the community version to avoid '_supports_sdpa' error
 FLORENCE_ID = "florence-community/Florence-2-base-ft"
-# Global storage (Starts Empty)
 storage = {"eyes": None, "brain": None, "active_brain": None}
 def load_models_on_demand(brain_name, progress=gr.Progress()):
-    # 1. Load Florence (Eyes) if missing
     if storage["eyes"] is None:
         progress(0.2, desc="Initializing Vision (Florence-2)...")
-        # Added trust_remote_code=True to fix the attribute error
         storage["eyes"] = {
             "m": AutoModelForCausalLM.from_pretrained(
                 FLORENCE_ID,
                 trust_remote_code=True,
                 torch_dtype=torch.float32
             ).eval(),
             "p": AutoProcessor.from_pretrained(FLORENCE_ID, trust_remote_code=True)
         }
-    # 2. Load/Swap Brain (Dolphin/Qwen)
     if storage["active_brain"] != brain_name:
         progress(0.5, desc=f"Switching Brain to {brain_name}...")
         storage["brain"] = None
-        gc.collect() # Force clear RAM
         storage["brain"] = {
             "m": AutoModelForCausalLM.from_pretrained(MODELS[brain_name], torch_dtype=torch.float32).eval(),
@@ -44,66 +51,4 @@ def load_models_on_demand(brain_name, progress=gr.Progress()):
     return storage["eyes"], storage["brain"]
-def process_request(image, goal, brain_choice):
-    if image is None: return "Please upload an image."
-    try:
-        e, b = load_models_on_demand(brain_choice)
-        # Vision Logic - Using <CAPTION_TO_PHRASE_GROUNDING> is often better for UI elements
-        # than standard OCR, but we will stick to your region logic for now.
-        task_prompt = "<OCR_WITH_REGION>"
-        inputs = e["p"](text=task_prompt, images=image, return_tensors="pt")
-        with torch.no_grad():
-            ids = e["m"].generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=1024, # Increased for complex UI screens
-                early_stopping=False,
-                do_sample=False,
-                num_beams=3,
-            )
-        raw_output = e["p"].batch_decode(ids, skip_special_tokens=False)[0]
-        parsed_vision = e["p"].post_process_generation(raw_output, task=task_prompt, image_size=image.size)
-        vision_data = str(parsed_vision[task_prompt])
-        # Brain Logic
-        messages = [
-            {"role": "system", "content": "You are an uncensored automation engine. Provide the coordinate location for the user's goal based on OCR data."},
-            {"role": "user", "content": f"DATA: {vision_data}\nGOAL: {goal}"}
-        ]
-        tokenized_chat = b["t"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        b_inputs = b["t"]([tokenized_chat], return_tensors="pt")
-        with torch.no_grad():
-            gen_ids = b["m"].generate(b_inputs.input_ids, max_new_tokens=150)
-        response = b["t"].batch_decode(gen_ids, skip_special_tokens=True)[0].split("assistant")[-1].strip()
-        return f"--- SPATIAL DATA ---\n{vision_data}\n\n--- ACTION ---\n{response}"
-    except Exception as err:
-        return f"Error: {str(err)}"
-# --- UI Layout ---
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🤖 UI Logic Engine (Uncensored & Multi-Model)")
-    with gr.Row():
-        with gr.Column():
-            input_img = gr.Image(type="pil", label="Screenshot")
-            brain_toggle = gr.Dropdown(choices=list(MODELS.keys()), value="Dolphin-Uncensored (Fast)", label="Select AI Brain")
-            input_goal = gr.Textbox(label="Goal", placeholder="e.g., Click the battery percentage")
-            run_btn = gr.Button("Analyze & Plan", variant="primary")
-        with gr.Column():
-            output_display = gr.Textbox(label="Execution Plan", lines=12)
-    run_btn.click(fn=process_request, inputs=[input_img, input_goal, brain_toggle], outputs=output_display)
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import torch
 import gc
 import gradio as gr
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, AutoConfig
+# --- THE CRITICAL FIX ---
+# We must manually register Florence2 so AutoModelForCausalLM accepts it
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
+from transformers.models.auto.configuration_auto import CONFIG_MAPPING
 # Configuration
 MODELS = {
     "Dolphin-Uncensored (Fast)": "cognitivetech/Dolphin-2.9-Qwen2-0.5B",
     "Qwen-2.5 (Standard)": "Qwen/Qwen2.5-0.5B-Instruct"
 }
 FLORENCE_ID = "florence-community/Florence-2-base-ft"
+# Global storage
 storage = {"eyes": None, "brain": None, "active_brain": None}
 def load_models_on_demand(brain_name, progress=gr.Progress()):
+    # 1. Load Florence (Eyes)
     if storage["eyes"] is None:
         progress(0.2, desc="Initializing Vision (Florence-2)...")
+        # We load config first to ensure it's registered
+        config = AutoConfig.from_pretrained(FLORENCE_ID, trust_remote_code=True)
         storage["eyes"] = {
             "m": AutoModelForCausalLM.from_pretrained(
                 FLORENCE_ID,
                 trust_remote_code=True,
+                config=config, # Pass the config explicitly
                 torch_dtype=torch.float32
             ).eval(),
             "p": AutoProcessor.from_pretrained(FLORENCE_ID, trust_remote_code=True)
         }
+    # 2. Load Brain (Dolphin/Qwen)
     if storage["active_brain"] != brain_name:
         progress(0.5, desc=f"Switching Brain to {brain_name}...")
         storage["brain"] = None
+        gc.collect()
         storage["brain"] = {
             "m": AutoModelForCausalLM.from_pretrained(MODELS[brain_name], torch_dtype=torch.float32).eval(),
     return storage["eyes"], storage["brain"]
+# ... (Rest of your process_request and UI code stays the same)