"""Gradio Spaces app: chat with a LoRA-fine-tuned Qwen3-30B "CEO voice" model.

The base model is loaded once at module import (4-bit quantized), the LoRA
adapter is applied on top, and a `gr.ChatInterface` serves the chat loop.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces

# Model configuration
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# Load model and tokenizer at startup (once)
print("=" * 60)
print("🚀 INITIALIZING CEO AI EXECUTIVE")
print("=" * 60)

print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("✓ Tokenizer loaded successfully!")

print("\n[2/4] Configuring 4-bit quantization...")
# Use 4-bit quantization to fit in GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
print("✓ Quantization config ready!")

print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
print("✓ Base model loaded successfully!")

print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("✓ LoRA adapter loaded successfully!")

print("\n" + "=" * 60)
print("🎯 CEO AI EXECUTIVE IS READY!")
print("=" * 60)
print("Model is loaded in memory and ready for fast inference.\n")


@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """
    Generate a CEO-style reply to *message*, conditioned on recent *history*.

    Args:
        message: The user's current message (plain string).
        history: Prior turns from ChatInterface. Depending on the Gradio
            version/config this is either a list of (user_msg, bot_msg)
            tuples or a list of {"role": ..., "content": ...} dicts;
            both shapes are accepted here.

    Returns:
        The assistant's reply as a plain string — ChatInterface takes care
        of appending it to the displayed history.
    """
    # Build conversation context (limit history to last 5 items for speed).
    # Slicing with [-5:] is safe for any length, including empty history.
    conversation = []
    for item in history[-5:]:
        if isinstance(item, dict):
            # Messages-format history: already role/content shaped.
            conversation.append({"role": item["role"], "content": item["content"]})
        else:
            # Tuple-format history: one (user, assistant) exchange per item.
            user_msg, bot_msg = item
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})

    # Add current message
    conversation.append({"role": "user", "content": message})

    # Apply chat template
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize. BUGFIX: the tokenizer keyword is `truncation`, not `truncate`;
    # the original kwarg was silently ignored, so max_length=2048 never applied
    # and a long history could overflow the model's context window.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Some Qwen tokenizers ship without a pad token; fall back to EOS so
    # generate() does not warn/fail on pad_token_id=None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Generate response with optimized parameters for speed
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens (slice off the prompt length).
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True,
    )

    # Return just the response string - ChatInterface handles the history
    return response


# Create Gradio ChatInterface
demo = gr.ChatInterface(
    fn=chat_with_ceo,
    title="🎯 CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.

✅ **Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
    examples=[
        "What's your vision for the company?",
        "How do you approach leadership?",
        "What are your thoughts on innovation?",
        "Can you share your perspective on team building?",
        "What drives your business strategy?",
    ],
    chatbot=gr.Chatbot(height=500),
)

if __name__ == "__main__":
    demo.launch()