# exec_chatbot_v1 / app.py
# Org/Repo: AI-Talent-Force
# Last commit: da0c75c — "Remove unsupported ChatInterface parameters"
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces
# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# One-time startup: everything below runs at import time so the model is
# resident in memory before the first chat request arrives.
_RULE = "=" * 60

print(_RULE)
print("πŸš€ INITIALIZING CEO AI EXECUTIVE")
print(_RULE)

# Step 1/4: tokenizer.
print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("βœ“ Tokenizer loaded successfully!")

# Step 2/4: 4-bit NF4 quantization with double quantization keeps the 30B
# model within GPU memory; compute runs in bfloat16.
print("\n[2/4] Configuring 4-bit quantization...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
print("βœ“ Quantization config ready!")

# Step 3/4: quantized base model, sharded automatically across devices.
print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
print("βœ“ Base model loaded successfully!")

# Step 4/4: attach the CEO-voice LoRA adapter and switch to inference mode.
print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("βœ“ LoRA adapter loaded successfully!")

print("\n" + _RULE)
print("🎯 CEO AI EXECUTIVE IS READY!")
print(_RULE)
print("Model is loaded in memory and ready for fast inference.\n")
@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """
    Generate a reply in the fine-tuned CEO voice.

    Args:
        message: The user's current message (str).
        history: Prior conversation from gr.ChatInterface. Depending on the
            Gradio version this is either a list of (user_msg, bot_msg)
            tuples or a list of {"role": ..., "content": ...} dicts
            (messages format) — both are handled below.

    Returns:
        The assistant's reply as a plain string; ChatInterface appends it
        to the displayed history itself.
    """
    conversation = []

    # Keep only recent context to bound prompt length / latency.
    if history and isinstance(history[0], dict):
        # Messages format: last 10 messages (~5 user/assistant exchanges).
        for msg in history[-10:]:
            conversation.append({"role": msg["role"], "content": msg["content"]})
    else:
        # Legacy tuple format: last 5 (user, assistant) exchanges.
        for user_msg, bot_msg in history[-5:]:
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})

    # Add the current message.
    conversation.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    # BUG FIX: the tokenizer keyword is `truncation`, not `truncate` — with
    # the original spelling, long prompts were never truncated to max_length.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Sampling parameters tuned for short, fast responses (KV cache on).
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            # Fall back to EOS when the tokenizer defines no pad token.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    # Decode only the newly generated tokens (skip the prompt portion).
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True
    )
    # Return just the response string - ChatInterface handles the history.
    return response
# ---------------------------------------------------------------------------
# Gradio UI: a ChatInterface wired to the chat function above.
# ---------------------------------------------------------------------------
_EXAMPLE_PROMPTS = [
    "What's your vision for the company?",
    "How do you approach leadership?",
    "What are your thoughts on innovation?",
    "Can you share your perspective on team building?",
    "What drives your business strategy?",
]

demo = gr.ChatInterface(
    fn=chat_with_ceo,
    chatbot=gr.Chatbot(height=500),
    title="🎯 CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.
βœ… **Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
    examples=_EXAMPLE_PROMPTS,
)

# Launch the web app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()