"""Gradio demo for the L-Operator Android device-control model.

The app loads the ``Tonic/l-operator`` image-text-to-text model, accepts an
Android screenshot plus a goal/step instruction, and displays the action
text generated by the model (pretty-printed when it is valid JSON).
"""

import json
import logging
import os
import signal
import threading
import time
from typing import Any, Dict, List, Optional, Tuple

# Third-party imports are intentionally kept in their original relative order.
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
import spaces

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-operator"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")

if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
    logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")

# Example episodes shipped next to the app (directory layout read by
# load_example_episodes).
_EXAMPLE_EPISODES_ROOT = "extracted_episodes_duckdb"
_EXAMPLE_EPISODE_NUMBERS = ("13", "53", "73")


def _append_exchange(
    history: List[Dict[str, str]], message: str, reply: str
) -> List[Dict[str, str]]:
    """Return ``history`` extended with one user message and one assistant reply."""
    return history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ]


def _parse_goal_and_step(message: str) -> Tuple[str, str]:
    """Extract the ``Goal:`` and ``Step:`` values from a structured message.

    Each value is taken from a line that starts with the corresponding
    prefix; a missing prefix yields an empty string. If a prefix appears on
    several lines, the last one wins.
    """
    goal = ""
    instruction = ""
    for raw_line in message.split("\n"):
        line = raw_line.strip()
        if line.startswith("Goal:"):
            # Drop only the leading prefix; a later "Goal:" inside the text
            # is part of the value and must be preserved.
            goal = line[len("Goal:"):].strip()
        elif line.startswith("Step:"):
            instruction = line[len("Step:"):].strip()
    return goal, instruction


class LOperatorDemo:
    """Holds the model and processor and exposes the inference entry points."""

    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self) -> str:
        """Load the L-Operator model and processor.

        Returns:
            A human-readable status string (success or error description).

        Raises:
            TimeoutError: re-raised unchanged so that a caller that armed a
                timeout (see ``load_model_with_timeout``) can report it.
        """
        try:
            start_time = time.time()
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load model with progress logging
            logger.info("Downloading and loading model weights...")
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                device_map="auto",
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True
            )

            # Load processor
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True
            )

            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            load_time = time.time() - start_time
            logger.info(f"Model loaded successfully in {load_time:.1f} seconds")
            return f"✅ Model loaded successfully in {load_time:.1f} seconds"

        except TimeoutError:
            # Must not be swallowed by the generic handler below, otherwise
            # the timeout wrapper can never see it.
            raise
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)} - This may be a custom model requiring special handling"

    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate an action for a screenshot and a goal/step instruction.

        Args:
            image: Screenshot to condition on; converted to RGB if needed.
            goal: Overall goal text inserted into the prompt.
            instruction: Current step text inserted into the prompt.

        Returns:
            The decoded model output, pretty-printed when it parses as JSON,
            or an error string prefixed with "❌".
        """
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        if image is None:
            return "❌ Please upload an Android screenshot image."

        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
                    ]
                }
            ]

            # Process inputs. tokenize/return_dict are required to get model
            # tensors back; without them the processor only returns the
            # rendered prompt string.
            inputs = self.processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(self.model.device)
            prompt_length = inputs["input_ids"].shape[1]

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )

            # Decode only the newly generated tokens, not the prompt.
            response = self.processor.tokenizer.decode(
                outputs[0][prompt_length:],
                skip_special_tokens=True
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
            except ValueError:
                return response
            return json.dumps(parsed_response, indent=2)

        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"

    @spaces.GPU(duration=90)  # 1.5 minutes for chat responses
    def chat_with_model(
        self,
        message: str,
        history: List[Dict[str, str]],
        image: Optional[Image.Image] = None,
    ) -> List[Dict[str, str]]:
        """Chat interface function for Gradio.

        Args:
            message: User text; either free-form or containing ``Goal:`` and
                ``Step:`` lines.
            history: Previous messages as role/content dictionaries.
            image: Screenshot supplied through the additional input.

        Returns:
            ``history`` extended with the user message and the assistant reply.
        """
        if not self.is_loaded:
            return _append_exchange(history, message, "❌ Model not loaded. Please load the model first.")

        if image is None:
            return _append_exchange(history, message, "❌ Please upload an Android screenshot image.")

        try:
            # Extract goal and instruction from message
            if "Goal:" in message and "Step:" in message:
                # Parse structured input
                goal, instruction = _parse_goal_and_step(message)
                if not goal or not instruction:
                    return _append_exchange(
                        history, message, "❌ Please provide both Goal and Step in your message."
                    )
            else:
                # Treat as general instruction
                goal = "Complete the requested action"
                instruction = message

            # Generate action
            response = self.generate_action(image, goal, instruction)
            return _append_exchange(history, message, response)

        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            return _append_exchange(history, message, f"❌ Error: {str(e)}")


# Initialize demo
demo_instance = LOperatorDemo()


def load_model_with_timeout(timeout_seconds=600):  # 10 minutes timeout
    """Load the model, guarded by a SIGALRM timeout where that is possible.

    ``signal.signal`` may only be called from the main thread and ``SIGALRM``
    does not exist on every platform. When either condition is not met the
    model is loaded without a timeout instead of failing outright.

    Args:
        timeout_seconds: Alarm delay in seconds (used only with SIGALRM).

    Returns:
        The status string from ``LOperatorDemo.load_model`` or an error string.
    """

    def timeout_handler(signum, frame):
        raise TimeoutError("Model loading timed out")

    can_use_alarm = (
        hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )

    old_handler = None
    if can_use_alarm:
        # Set up the signal handler for timeout
        old_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(int(timeout_seconds))
    else:
        logger.warning(
            "SIGALRM timeout is unavailable here (non-main thread or unsupported "
            "platform); loading model without timeout protection."
        )

    try:
        logger.info("Loading L-Operator model with timeout protection...")
        result = demo_instance.load_model()
        logger.info(f"Model loading result: {result}")
        return result
    except TimeoutError:
        logger.error("Model loading timed out - this may be due to network issues or large model size")
        return "❌ Model loading timed out. Please try again or check your internet connection."
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        return f"❌ Error loading model: {str(e)}"
    finally:
        if can_use_alarm:
            # Cancel the pending alarm, then restore the original handler
            signal.alarm(0)
            signal.signal(signal.SIGALRM, old_handler)


# Load example episodes (lazy loading to avoid startup timeout)
def load_example_episodes():
    """Load example episodes from the extracted data - simplified for fast startup.

    Returns:
        A list of ``[image_path, description]`` pairs, one per episode whose
        metadata could be read and whose first screenshot exists. Returns an
        empty list if anything unexpected goes wrong.
    """
    examples = []

    try:
        for episode_num in _EXAMPLE_EPISODE_NUMBERS:
            episode_dir = f"episode_{episode_num}"

            # Load episode metadata quickly without PIL validation
            metadata_path = f"{_EXAMPLE_EPISODES_ROOT}/{episode_dir}/metadata.json"
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
            except Exception as e:
                logger.warning(f"Could not load metadata for {episode_dir}: {str(e)}")
                continue

            # The episode number travels with its metadata, so a failed load
            # above cannot shift later episodes onto the wrong screenshot.
            image_path = f"{_EXAMPLE_EPISODES_ROOT}/{episode_dir}/screenshots/screenshot_1.png"

            # Simple file existence check instead of PIL validation
            if os.path.exists(image_path):
                goal_text = metadata.get('goal', f'Episode {episode_num} example')
                examples.append([
                    image_path,
                    f"Episode {episode_num}: {goal_text[:50]}..."
                ])

    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []

    logger.info(f"Loaded {len(examples)} examples (without validation for faster startup)")
    return examples


# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface and return the ``gr.Blocks`` app."""

    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .chat-container {
            height: 600px;
        }
        """
    ) as demo:

        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on LiquidAI's LFM2-VL-1.6B model,
        optimized for Android device control through visual understanding and action generation.

        ## 🚀 How to Use

        1. **Model Loading**: The L-Operator model loads automatically on startup
        2. **Upload Screenshot**: Upload an Android device screenshot
        3. **Provide Instructions**: Enter your goal and step instructions
        4. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:
        ```json
        {
          "action_type": "tap",
          "x": 540,
          "y": 1200,
          "text": "Settings",
          "app_name": "com.android.settings",
          "confidence": 0.92
        }
        ```

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🤖 Model Status")
                model_status = gr.Textbox(
                    label="L-Operator Model",
                    value="🔄 Loading model on startup...",
                    interactive=False
                )

                gr.Markdown("### 📱 Input")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                    sources=["upload"]
                )

                gr.Markdown("### 📝 Instructions")
                goal_input = gr.Textbox(
                    label="Goal",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=2
                )

                step_input = gr.Textbox(
                    label="Step Instruction",
                    placeholder="e.g., Tap on the Settings app icon on the home screen",
                    lines=2
                )

                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")

            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat Interface")

                # With additional_inputs, each example row must be
                # [message, *additional_input_values]; the loader returns
                # [image_path, description], so reorder here.
                chat_examples = [
                    [description, image_path]
                    for image_path, description in load_example_episodes()
                ]

                chat_interface = gr.ChatInterface(
                    fn=demo_instance.chat_with_model,
                    additional_inputs=[image_input],
                    title="L-Operator Chat",
                    description="Chat with L-Operator using screenshots and text instructions",
                    examples=chat_examples or None,
                    type="messages",
                    cache_examples=False
                )

                gr.Markdown("### 🎯 Action Output")
                action_output = gr.JSON(
                    label="Generated Action",
                    value={},
                    height=200
                )

        # Event handlers
        def on_generate_action(image, goal, step):
            """Run one generation and shape the result for the JSON panel."""
            if image is None:
                return {"error": "Please upload an image"}

            if not goal or not step:
                return {"error": "Please provide both goal and step"}

            response = demo_instance.generate_action(image, goal, step)

            try:
                # Try to parse as JSON
                return json.loads(response)
            except ValueError:
                return {"raw_response": response}

        # Update model status on page load (with timeout-protected model loading)
        def update_model_status():
            """Load the model on first page load and report its status."""
            if demo_instance.is_loaded:
                return "✅ L-Operator model loaded and ready!"

            logger.info("Loading model on Gradio startup with timeout protection...")
            result = load_model_with_timeout(timeout_seconds=900)  # 15 minutes for Spaces
            logger.info(f"Model loading result: {result}")
            return result

        generate_btn.click(
            fn=on_generate_action,
            inputs=[image_input, goal_input, step_input],
            outputs=action_output
        )

        # Load model and update status on page load
        demo.load(
            fn=update_model_status,
            outputs=model_status
        )

        # NOTE: no image_input.change handler is registered. The screenshot
        # already reaches chat_with_model through additional_inputs, and
        # writing a PIL image into the chatbot component is not a valid
        # messages-format value.

        gr.Markdown("""
        ---

        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control)
        """)

    return demo


# Create and launch the demo with optimized settings
if __name__ == "__main__":
    try:
        logger.info("Creating Gradio demo interface...")
        demo = create_demo()

        logger.info("Launching Gradio server...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,  # Disable debug to reduce startup time
            show_error=True,
            ssr_mode=False,
            max_threads=2,  # Limit threads to prevent resource exhaustion
            quiet=True  # Reduce startup logging noise
        )
    except Exception as e:
        logger.error(f"Failed to launch Gradio app: {str(e)}")
        raise