# Hugging Face Spaces app ("Android Automation Brain").
# NOTE(review): the Space listing showed a "Runtime error" status when this file was captured.
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
# --- STRATEGY: MEMORY MANAGEMENT ---
# We force the device to "cpu" because we are on the free tier.
# We trust remote code because Florence-2 uses custom architecture.
print(">>> INITIALIZING THE BRAIN...")

# 1. LOAD FLORENCE-2 (The Eyes)
# This model converts the UI screenshot into text/coordinates.
FLORENCE_ID = "microsoft/Florence-2-base"
print(f"Loading {FLORENCE_ID}...")
flo_model = AutoModelForCausalLM.from_pretrained(FLORENCE_ID, trust_remote_code=True).to("cpu").eval()
flo_processor = AutoProcessor.from_pretrained(FLORENCE_ID, trust_remote_code=True)

# 2. LOAD DOLPHIN-QWEN (The Logic)
# This model decides what to do based on what Florence sees.
DOLPHIN_ID = "cognitivecomputations/dolphin-2.9.4-qwen2-1.5b"
print(f"Loading {DOLPHIN_ID}...")
dolphin_model = AutoModelForCausalLM.from_pretrained(DOLPHIN_ID).to("cpu").eval()
# FIX: Dolphin/Qwen2 is a text-only model whose repo has no processor config;
# AutoProcessor.from_pretrained can raise on such repos (a likely cause of the
# Space's "Runtime error"). AutoTokenizer is the correct loader and exposes the
# same __call__/decode interface used downstream, so the name is kept.
dolphin_processor = AutoTokenizer.from_pretrained(DOLPHIN_ID)
| # --- THE LOGIC LOOP --- | |
def run_brain(image, user_instruction):
    """Run the two-stage pipeline: Florence-2 sees the screenshot, Dolphin decides.

    Parameters
    ----------
    image : PIL.Image.Image | None
        Screenshot of the Android UI (Gradio supplies a PIL image, or None).
    user_instruction : str
        The user's goal, e.g. "Open Game".

    Returns
    -------
    str
        Human-readable report combining the vision analysis and the
        model's decision, or an error string if no image was provided.
    """
    if image is None:
        return "Error: No image provided."
    # Florence-2's processor expects 3-channel input; gradio may hand us RGBA/L.
    image = image.convert("RGB")

    # STEP A: Use Florence to find elements in the image.
    # "<OD>" is Florence-2's object-detection task prompt.
    prompt = "<OD>"
    inputs = flo_processor(text=prompt, images=image, return_tensors="pt").to("cpu")
    with torch.no_grad():
        generated_ids = flo_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
        )
    # Keep special tokens here: post_process_generation needs the raw task output.
    raw_vision = flo_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # FIX: parse the raw <OD> output into {"<OD>": {"bboxes": [...], "labels": [...]}}
    # instead of feeding special-token-laden text straight to the LLM.
    try:
        parsed = flo_processor.post_process_generation(
            raw_vision, task="<OD>", image_size=(image.width, image.height)
        )
        vision_text = str(parsed)
    except Exception:
        # Best-effort fallback: strip special tokens and use the plain text.
        vision_text = flo_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # STEP B: Pass Vision Data to Dolphin (The Planning).
    # We format the prompt so Dolphin knows what is on screen.
    dolphin_prompt = (
        f"User Instruction: {user_instruction}\n"
        f"Screen Analysis: {vision_text}\n"
        f"Task: Decide which element to click. Return the HEX Packet ID."
    )
    # (Simple Dolphin inference for now - we will fine-tune this later)
    dolphin_inputs = dolphin_processor(dolphin_prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output_ids = dolphin_model.generate(**dolphin_inputs, max_new_tokens=50)
    # FIX: generate() returns prompt + completion; decode only the new tokens so
    # the "decision" is not just an echo of the prompt.
    prompt_len = dolphin_inputs["input_ids"].shape[1]
    final_decision = dolphin_processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

    return f"Vision Saw: {vision_text}\n\nBrain Decided: {final_decision}"
# --- USER INTERFACE ---
# Build the two input widgets first, then wire them into the Interface.
screenshot_input = gr.Image(label="Android Screenshot", type="pil")
goal_input = gr.Textbox(label="Goal (e.g., Open Game)")

demo = gr.Interface(
    fn=run_brain,
    inputs=[screenshot_input, goal_input],
    outputs="text",
    title="Android Automation Brain",
    description="Florence-2 for Vision + Dolphin for Logic",
)

if __name__ == "__main__":
    demo.launch()