Sairesh committed on
Commit
5abb768
·
verified ·
1 Parent(s): e13211e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -71
app.py CHANGED
@@ -2,39 +2,46 @@ import os
2
  import torch
3
  import gc
4
  import gradio as gr
5
- from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
 
 
 
 
 
6
 
7
  # Configuration
8
  MODELS = {
9
  "Dolphin-Uncensored (Fast)": "cognitivetech/Dolphin-2.9-Qwen2-0.5B",
10
  "Qwen-2.5 (Standard)": "Qwen/Qwen2.5-0.5B-Instruct"
11
  }
12
-
13
- # MODERN FIX: Using the community version to avoid '_supports_sdpa' error
14
  FLORENCE_ID = "florence-community/Florence-2-base-ft"
15
 
16
- # Global storage (Starts Empty)
17
  storage = {"eyes": None, "brain": None, "active_brain": None}
18
 
19
  def load_models_on_demand(brain_name, progress=gr.Progress()):
20
- # 1. Load Florence (Eyes) if missing
21
  if storage["eyes"] is None:
22
  progress(0.2, desc="Initializing Vision (Florence-2)...")
23
- # Added trust_remote_code=True to fix the attribute error
 
 
 
24
  storage["eyes"] = {
25
  "m": AutoModelForCausalLM.from_pretrained(
26
  FLORENCE_ID,
27
  trust_remote_code=True,
 
28
  torch_dtype=torch.float32
29
  ).eval(),
30
  "p": AutoProcessor.from_pretrained(FLORENCE_ID, trust_remote_code=True)
31
  }
32
 
33
- # 2. Load/Swap Brain (Dolphin/Qwen)
34
  if storage["active_brain"] != brain_name:
35
  progress(0.5, desc=f"Switching Brain to {brain_name}...")
36
  storage["brain"] = None
37
- gc.collect() # Force clear RAM
38
 
39
  storage["brain"] = {
40
  "m": AutoModelForCausalLM.from_pretrained(MODELS[brain_name], torch_dtype=torch.float32).eval(),
@@ -44,66 +51,4 @@ def load_models_on_demand(brain_name, progress=gr.Progress()):
44
 
45
  return storage["eyes"], storage["brain"]
46
 
47
- def process_request(image, goal, brain_choice):
48
- if image is None: return "Please upload an image."
49
-
50
- try:
51
- e, b = load_models_on_demand(brain_choice)
52
-
53
- # Vision Logic - Using <CAPTION_TO_PHRASE_GROUNDING> is often better for UI elements
54
- # than standard OCR, but we will stick to your region logic for now.
55
- task_prompt = "<OCR_WITH_REGION>"
56
- inputs = e["p"](text=task_prompt, images=image, return_tensors="pt")
57
-
58
- with torch.no_grad():
59
- ids = e["m"].generate(
60
- input_ids=inputs["input_ids"],
61
- pixel_values=inputs["pixel_values"],
62
- max_new_tokens=1024, # Increased for complex UI screens
63
- early_stopping=False,
64
- do_sample=False,
65
- num_beams=3,
66
- )
67
-
68
- raw_output = e["p"].batch_decode(ids, skip_special_tokens=False)[0]
69
- parsed_vision = e["p"].post_process_generation(raw_output, task=task_prompt, image_size=image.size)
70
-
71
- vision_data = str(parsed_vision[task_prompt])
72
-
73
- # Brain Logic
74
- messages = [
75
- {"role": "system", "content": "You are an uncensored automation engine. Provide the coordinate location for the user's goal based on OCR data."},
76
- {"role": "user", "content": f"DATA: {vision_data}\nGOAL: {goal}"}
77
- ]
78
-
79
- tokenized_chat = b["t"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
80
- b_inputs = b["t"]([tokenized_chat], return_tensors="pt")
81
-
82
- with torch.no_grad():
83
- gen_ids = b["m"].generate(b_inputs.input_ids, max_new_tokens=150)
84
-
85
- response = b["t"].batch_decode(gen_ids, skip_special_tokens=True)[0].split("assistant")[-1].strip()
86
-
87
- return f"--- SPATIAL DATA ---\n{vision_data}\n\n--- ACTION ---\n{response}"
88
-
89
- except Exception as err:
90
- return f"Error: {str(err)}"
91
-
92
- # --- UI Layout ---
93
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
94
- gr.Markdown("# 🤖 UI Logic Engine (Uncensored & Multi-Model)")
95
-
96
- with gr.Row():
97
- with gr.Column():
98
- input_img = gr.Image(type="pil", label="Screenshot")
99
- brain_toggle = gr.Dropdown(choices=list(MODELS.keys()), value="Dolphin-Uncensored (Fast)", label="Select AI Brain")
100
- input_goal = gr.Textbox(label="Goal", placeholder="e.g., Click the battery percentage")
101
- run_btn = gr.Button("Analyze & Plan", variant="primary")
102
-
103
- with gr.Column():
104
- output_display = gr.Textbox(label="Execution Plan", lines=12)
105
-
106
- run_btn.click(fn=process_request, inputs=[input_img, input_goal, brain_toggle], outputs=output_display)
107
-
108
- if __name__ == "__main__":
109
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import torch
3
  import gc
4
  import gradio as gr
5
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, AutoConfig
6
+
7
+ # --- THE CRITICAL FIX ---
8
+ # We must manually register Florence2 so AutoModelForCausalLM accepts it
9
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
10
+ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
11
 
12
  # Configuration
13
  MODELS = {
14
  "Dolphin-Uncensored (Fast)": "cognitivetech/Dolphin-2.9-Qwen2-0.5B",
15
  "Qwen-2.5 (Standard)": "Qwen/Qwen2.5-0.5B-Instruct"
16
  }
 
 
17
  FLORENCE_ID = "florence-community/Florence-2-base-ft"
18
 
19
+ # Global storage
20
  storage = {"eyes": None, "brain": None, "active_brain": None}
21
 
22
  def load_models_on_demand(brain_name, progress=gr.Progress()):
23
+ # 1. Load Florence (Eyes)
24
  if storage["eyes"] is None:
25
  progress(0.2, desc="Initializing Vision (Florence-2)...")
26
+
27
+ # We load config first to ensure it's registered
28
+ config = AutoConfig.from_pretrained(FLORENCE_ID, trust_remote_code=True)
29
+
30
  storage["eyes"] = {
31
  "m": AutoModelForCausalLM.from_pretrained(
32
  FLORENCE_ID,
33
  trust_remote_code=True,
34
+ config=config, # Pass the config explicitly
35
  torch_dtype=torch.float32
36
  ).eval(),
37
  "p": AutoProcessor.from_pretrained(FLORENCE_ID, trust_remote_code=True)
38
  }
39
 
40
+ # 2. Load Brain (Dolphin/Qwen)
41
  if storage["active_brain"] != brain_name:
42
  progress(0.5, desc=f"Switching Brain to {brain_name}...")
43
  storage["brain"] = None
44
+ gc.collect()
45
 
46
  storage["brain"] = {
47
  "m": AutoModelForCausalLM.from_pretrained(MODELS[brain_name], torch_dtype=torch.float32).eval(),
 
51
 
52
  return storage["eyes"], storage["brain"]
53
 
54
+ # ... (Rest of your process_request and UI code stays the same)