import os
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
# ---- Model setup -------------------------------------------------------------
# Loads the 4-bit quantized base model plus the LoRA adapter fine-tuned on
# LTO's chat style. `tokenizer` and `model` stay module-level so `respond`
# can reach them.
print("Loading LTO model...")

base_model_name = "unsloth/Llama-3.2-3B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading with transformers...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)

# Attach the fine-tuned LoRA weights, then switch to inference mode.
model = PeftModel.from_pretrained(model, "./lora_model_lto")
model.eval()
print("Model loaded!")

# Persona prompt prepended to every conversation by format_prompt().
SYSTEM_PROMPT = """You are LTO, a French member of CS City Discord. You do technical analysis on stocks and crypto (fundas are trash). You're aggressive in banter and use phrases like "on my wife", "kys", "die", "bozo", "dubai scammer", "fr", "ngl", "bcs". Keep it real and match the energy."""
def format_prompt(message, history):
    """Assemble the full model prompt from persona, recent history, and message.

    `history` is expected as a list of (user_msg, bot_msg) pairs (tuple-style
    Gradio history); only the last 3 exchanges are included, each prefixed
    with "[earlier]" so the model can tell context from the live turn.
    """
    lines = []
    # Keep at most the three most recent exchanges as context.
    for user_msg, bot_msg in history[-3:]:
        lines.append(f"[earlier] User: {user_msg}")
        lines.append(f"[earlier] LTO: {bot_msg}")
    lines.append(f"User: {message}")
    full_input = "\n".join(lines)
    return f"""<|system|>
{SYSTEM_PROMPT}
<|user|>
{full_input}
<|assistant|>
"""
def respond(message, history):
    """Generate one LTO reply for the Gradio chat callback.

    Builds the templated prompt from `message` and `history`, samples up to
    150 new tokens, then strips the echoed prompt and template markers so
    only the first line of the assistant's reply is returned.
    """
    prompt = format_prompt(message, history)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.75,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.15,
            pad_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decode includes the whole prompt; keep only what follows the
    # final assistant marker.
    marker = "<|assistant|>"
    if marker in decoded:
        decoded = decoded.split(marker)[-1].strip()

    # Scrub any leftover template tags, then keep just the first line.
    for tag in ("<|system|>", "<|user|>"):
        decoded = decoded.replace(tag, "")
    decoded = decoded.strip()
    if "\n" in decoded:
        decoded = decoded.split("\n")[0].strip()
    return decoded
# ---- Gradio UI ---------------------------------------------------------------
# Canned prompts shown under the chat box.
EXAMPLE_PROMPTS = [
    "hey",
    "what do you think of fundas?",
    "cap",
    "you're lying",
    "what crypto should I buy?",
]

demo = gr.ChatInterface(
    fn=respond,
    title="🇫🇷 Chat with LTO",
    description="LTO from CS City Discord. He does TA, hates fundas, and says 'on my wife' a lot. Be ready for aggressive banter!",
    examples=EXAMPLE_PROMPTS,
)

if __name__ == "__main__":
    demo.launch()