import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces
# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# Everything below runs once at import time, so every request after startup
# reuses the already-warm model held in memory.
print("=" * 60)
print("π INITIALIZING CEO AI EXECUTIVE")
print("=" * 60)

print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("β Tokenizer loaded successfully!")

print("\n[2/4] Configuring 4-bit quantization...")
# NF4 weights with double quantization shrink the 30B model enough to fit on
# the GPU; matmul compute still happens in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)
print("β Quantization config ready!")

print("\n[3/4] Loading base model (Qwen3-30B)...")
print("β³ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
print("β Base model loaded successfully!")

print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
# Wrap the quantized base model with the fine-tuned LoRA weights, then switch
# to inference mode (disables dropout, etc.).
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("β LoRA adapter loaded successfully!")

print("\n" + "=" * 60)
print("π― CEO AI EXECUTIVE IS READY!")
print("=" * 60)
print("Model is loaded in memory and ready for fast inference.\n")
@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """Generate a CEO-styled reply to *message*, given the chat *history*.

    Args:
        message: The user's current message (str).
        history: Previous turns. Gradio's ChatInterface may supply either
            tuple pairs ``[(user_msg, bot_msg), ...]`` or, in messages mode,
            dicts ``[{"role": ..., "content": ...}, ...]``; both are accepted.

    Returns:
        The assistant's reply as a plain string — ChatInterface appends it
        to the displayed history itself.
    """
    # Keep only the most recent turns so the prompt stays short and fast.
    # Plain slicing already handles histories shorter than 5 entries.
    conversation = []
    for entry in history[-5:]:
        if isinstance(entry, dict):
            # Messages-format history is already role/content shaped.
            conversation.append({"role": entry["role"], "content": entry["content"]})
        else:
            user_msg, bot_msg = entry
            conversation.append({"role": "user", "content": user_msg})
            if bot_msg is not None:  # the bot reply can be pending (None)
                conversation.append({"role": "assistant", "content": bot_msg})

    # Add the current user message and render with the model's chat template.
    conversation.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # BUG FIX: the correct kwarg is ``truncation`` — the original ``truncate``
    # is not a tokenizer argument and was silently ignored, so long prompts
    # were never actually clipped to 2048 tokens.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Sampled decoding tuned for speed; KV cache enabled.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            # Fall back to EOS when the tokenizer defines no pad token.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Strip the prompt tokens and return only the newly generated text.
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
# Wire the chat function into a web UI. ChatInterface manages the message
# history; chat_with_ceo only has to return the reply string.
demo = gr.ChatInterface(
    chat_with_ceo,
    chatbot=gr.Chatbot(height=500),
    title="π― CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.
β
**Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
    examples=[
        "What's your vision for the company?",
        "How do you approach leadership?",
        "What are your thoughts on innovation?",
        "Can you share your perspective on team building?",
        "What drives your business strategy?",
    ],
)

# Launch the Gradio server only when executed as a script.
if __name__ == "__main__":
    demo.launch()