# exec_chatbot_v1 / app.py
# Org/Repo: AI-Talent-Force
# Last commit: da0c75c — "Remove unsupported ChatInterface parameters"
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces
# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# One-time startup: everything below runs at import time so the model is
# resident in memory before the first chat request arrives.
_RULE = "=" * 60

print(_RULE)
print("πŸš€ INITIALIZING CEO AI EXECUTIVE")
print(_RULE)

# Step 1/4: tokenizer.
print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("βœ“ Tokenizer loaded successfully!")

# Step 2/4: 4-bit NF4 quantization with double quantization keeps the 30B
# model within GPU memory; compute runs in bfloat16.
print("\n[2/4] Configuring 4-bit quantization...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
print("βœ“ Quantization config ready!")

# Step 3/4: quantized base model, sharded automatically across devices.
print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
print("βœ“ Base model loaded successfully!")

# Step 4/4: attach the CEO-voice LoRA adapter and switch to inference mode.
print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("βœ“ LoRA adapter loaded successfully!")

print("\n" + _RULE)
print("🎯 CEO AI EXECUTIVE IS READY!")
print(_RULE)
print("Model is loaded in memory and ready for fast inference.\n")
@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """
    Generate a reply in the fine-tuned CEO voice.

    Args:
        message: The user's current message (str).
        history: Prior conversation from gr.ChatInterface. Depending on the
            Gradio version this is either a list of (user_msg, bot_msg)
            tuples or a list of {"role": ..., "content": ...} dicts
            (messages format) — both are handled below.

    Returns:
        The assistant's reply as a plain string; ChatInterface appends it
        to the displayed history itself.
    """
    conversation = []

    # Keep only recent context to bound prompt length / latency.
    if history and isinstance(history[0], dict):
        # Messages format: last 10 messages (~5 user/assistant exchanges).
        for msg in history[-10:]:
            conversation.append({"role": msg["role"], "content": msg["content"]})
    else:
        # Legacy tuple format: last 5 (user, assistant) exchanges.
        for user_msg, bot_msg in history[-5:]:
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})

    # Add the current message.
    conversation.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    # BUG FIX: the tokenizer keyword is `truncation`, not `truncate` — with
    # the original spelling, long prompts were never truncated to max_length.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Sampling parameters tuned for short, fast responses (KV cache on).
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            # Fall back to EOS when the tokenizer defines no pad token.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    # Decode only the newly generated tokens (skip the prompt portion).
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True
    )
    # Return just the response string - ChatInterface handles the history.
    return response
# ---------------------------------------------------------------------------
# Gradio UI: a ChatInterface wired to the chat function above.
# ---------------------------------------------------------------------------
_EXAMPLE_PROMPTS = [
    "What's your vision for the company?",
    "How do you approach leadership?",
    "What are your thoughts on innovation?",
    "Can you share your perspective on team building?",
    "What drives your business strategy?",
]

demo = gr.ChatInterface(
    fn=chat_with_ceo,
    chatbot=gr.Chatbot(height=500),
    title="🎯 CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.
βœ… **Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
    examples=_EXAMPLE_PROMPTS,
)

# Launch the web app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()