| | import gradio as gr |
| | import torch |
| | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
| | from peft import PeftModel, PeftConfig |
| | import spaces |
| |
|
| | |
# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# ---------------------------------------------------------------------------
# One-time startup: load tokenizer, 4-bit-quantized base model, and the
# LoRA adapter. Runs at import time so the model stays resident in memory
# and requests are served without reload latency.
# ---------------------------------------------------------------------------
print("=" * 60)
print("🚀 INITIALIZING CEO AI EXECUTIVE")
print("=" * 60)

print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Qwen-family tokenizers may ship without a pad token; fall back to EOS so
# that generate() later receives a valid pad_token_id instead of None.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("✅ Tokenizer loaded successfully!")

print("\n[2/4] Configuring 4-bit quantization...")
# NF4 double quantization with bfloat16 compute: standard QLoRA-style setup
# that fits a 30B model on a single GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
print("✅ Quantization config ready!")

print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place shards across available devices
    trust_remote_code=True,
)
print("✅ Base model loaded successfully!")

print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
# Wrap the quantized base model with the fine-tuned LoRA weights.
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()  # inference only: disable dropout etc.
print("✅ LoRA adapter loaded successfully!")

print("\n" + "=" * 60)
print("🎯 CEO AI EXECUTIVE IS READY!")
print("=" * 60)
print("Model is loaded in memory and ready for fast inference.\n")
|
@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """Generate a CEO-style reply to *message* given the chat *history*.

    Args:
        message: The user's current message (str).
        history: Previous turns as a list of (user_msg, bot_msg) tuples,
            in the tuple format supplied by gr.ChatInterface.

    Returns:
        The model's decoded reply as a plain string.
    """
    # Keep only the last 5 turns to bound prompt length and latency.
    conversation = []
    for user_msg, bot_msg in history[-5:]:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": bot_msg})
    conversation.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template, appending the
    # assistant generation prompt so the model continues as the assistant.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # BUG FIX: the tokenizer kwarg is `truncation`, not `truncate` — the
    # original never actually truncated, so long histories could overflow
    # the context window.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            # Guard against tokenizers that define no pad token.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens (skip the echoed prompt).
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
    return response
| |
|
| | |
# ---------------------------------------------------------------------------
# Gradio UI: a simple chat interface wired to chat_with_ceo.
# ---------------------------------------------------------------------------
demo = gr.ChatInterface(
    fn=chat_with_ceo,
    title="🎯 CEO AI Executive",
    description=(
        "Chat with an AI trained on your CEO's writing style and thoughts.\n\n"
        "✅ **Model Status:** Loaded and ready! "
        "The model is kept in memory for fast responses."
    ),
    examples=[
        "What's your vision for the company?",
        "How do you approach leadership?",
        "What are your thoughts on innovation?",
        "Can you share your perspective on team building?",
        "What drives your business strategy?",
    ],
    chatbot=gr.Chatbot(height=500),
)

if __name__ == "__main__":
    demo.launch()
| |
|