File size: 3,923 Bytes
e1c2a9e
 
 
 
 
 
 
 
 
 
6fdb30f
c8d6960
 
 
 
 
6fdb30f
c8d6960
6fdb30f
c8d6960
6fdb30f
 
 
 
 
 
 
c8d6960
6fdb30f
c8d6960
 
6fdb30f
 
 
 
 
 
c8d6960
6fdb30f
c8d6960
6fdb30f
 
c8d6960
6fdb30f
c8d6960
 
 
 
e1c2a9e
eaa113d
e1c2a9e
 
 
 
 
6a3b5fa
e1c2a9e
eaa113d
e1c2a9e
 
6a3b5fa
 
 
 
 
ecb4524
 
e1c2a9e
 
 
 
 
 
 
 
 
 
eaa113d
e1c2a9e
 
eaa113d
e1c2a9e
 
 
eaa113d
e1c2a9e
 
 
 
 
eaa113d
 
e1c2a9e
 
 
 
9078a1e
ecb4524
 
 
 
 
 
 
da0c75c
ecb4524
da0c75c
ecb4524
 
 
 
 
 
 
da0c75c
ecb4524
e1c2a9e
 
ecb4524
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces

# Model configuration: Hugging Face Hub repo ids for the quantizable base
# model and the fine-tuned LoRA adapter layered on top of it.
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# Load model and tokenizer at startup (once) so every request reuses the
# in-memory model instead of paying the multi-minute load cost per call.
print("=" * 60)
print("πŸš€ INITIALIZING CEO AI EXECUTIVE")
print("=" * 60)

print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("βœ“ Tokenizer loaded successfully!")

print("\n[2/4] Configuring 4-bit quantization...")
# Use 4-bit quantization to fit in GPU memory
# (NF4 weights, double quantization, bfloat16 compute dtype).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
print("βœ“ Quantization config ready!")

print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
# device_map="auto" lets accelerate shard/place the weights across available
# devices; trust_remote_code permits the repo's custom model code to run.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)
print("βœ“ Base model loaded successfully!")

print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
# Wrap the base model with the fine-tuned LoRA weights; eval() disables
# dropout and other train-time behavior for inference.
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("βœ“ LoRA adapter loaded successfully!")

print("\n" + "=" * 60)
print("🎯 CEO AI EXECUTIVE IS READY!")
print("=" * 60)
print("Model is loaded in memory and ready for fast inference.\n")

@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """
    Generate a CEO-style reply to *message*, conditioned on recent history.

    Args:
        message: The user's current message (str).
        history: Previous conversation from gr.ChatInterface. Both Gradio
            history formats are supported: a list of (user_msg, bot_msg)
            tuples, or a list of {"role": ..., "content": ...} dicts
            (the "messages" chatbot type in newer Gradio versions).

    Returns:
        The generated response string; ChatInterface manages the history.
    """
    # Keep the prompt short for speed: only the most recent history entries
    # (slicing never raises, even on an empty history).
    conversation = []
    for item in history[-5:]:
        if isinstance(item, dict):
            # "messages" format: already a chat-template-ready dict.
            conversation.append({"role": item["role"], "content": item["content"]})
        else:
            # "tuples" format: one (user, assistant) exchange per item.
            user_msg, bot_msg = item
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})

    # Add current message
    conversation.append({"role": "user", "content": message})

    # Apply chat template to produce a single prompt string.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize. BUGFIX: the keyword is `truncation`, not `truncate` —
    # fast tokenizers reject unrecognized kwargs, and even where ignored the
    # prompt was never actually truncated to max_length.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Qwen-family tokenizers may not define a pad token; fall back to EOS so
    # generate() does not warn or fail on pad_token_id=None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Generate response with sampling parameters tuned for speed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    # Decode only the newly generated tokens (skip the echoed prompt).
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Return just the response string - ChatInterface handles the history
    return response

# --- Gradio UI -------------------------------------------------------------

# Starter prompts shown beneath the chat box.
_EXAMPLE_PROMPTS = [
    "What's your vision for the company?",
    "How do you approach leadership?",
    "What are your thoughts on innovation?",
    "Can you share your perspective on team building?",
    "What drives your business strategy?",
]

# ChatInterface wires chat_with_ceo to a ready-made chat UI and manages
# the conversation history on our behalf.
demo = gr.ChatInterface(
    chat_with_ceo,
    chatbot=gr.Chatbot(height=500),
    examples=_EXAMPLE_PROMPTS,
    title="🎯 CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.

βœ… **Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
)

if __name__ == "__main__":
    demo.launch()