""" import gradio as gr from huggingface_hub import InferenceClient def respond( message, history: list[dict[str, str]], system_message, max_tokens, temperature, top_p, hf_token: gr.OAuthToken, ): """ # For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference """ client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b") messages = [{"role": "system", "content": system_message}] messages.extend(history) messages.append({"role": "user", "content": message}) response = "" for message in client.chat_completion( messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p, ): choices = message.choices token = "" if len(choices) and choices[0].delta.content: token = choices[0].delta.content response += token yield response """ #For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface """ chatbot = gr.ChatInterface( respond, additional_inputs=[ gr.Textbox(value="You are a friendly Chatbot.", label="System message"), gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)", ), ], ) with gr.Blocks() as demo: with gr.Sidebar(): gr.LoginButton() chatbot.render() if __name__ == "__main__": demo.launch() """ """ import gradio as gr import torch from threading import Thread from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer import os token = os.getenv("HF_TOKEN") # load model model_id = "meta-llama/Llama-3.2-3B-Instruct" #model_id = "Qwen/Qwen2.5-3B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( model_id, device_map="cpu", token=token, torch_dtype=torch.float32, ) model.eval() session_log = [] # input normaliser def extract_user_text(message): def collect(x): if x is None: return [] if isinstance(x, str): return [x] if isinstance(x, dict): results = [] for key in ("text", "content"): if key in x: results.extend(collect(x[key])) return results if isinstance(x, list): results = [] for item in x: results.extend(collect(item)) return results return [] texts = collect(message) cleaned = " ".join(t.strip() for t in texts if isinstance(t, str) and t.strip()) return cleaned if cleaned else "" # for storing the chat def build_messages(history, message): messages = [] if history is not None: for h in history: # old Gradio format: (user, assistant) if isinstance(h, (list, tuple)) and len(h) == 2: user_text = extract_user_text(h[0]) # <-- FIX (no str()) assistant_text = extract_user_text(h[1]) # <-- FIX (no str()) if user_text: messages.append({"role": "user", "content": user_text}) if assistant_text: messages.append({"role": "assistant", "content": assistant_text}) continue # new Gradio format: {"role":..., "content":...} if isinstance(h, dict): if "role" in h and "content" in h: role = h.get("role") content = extract_user_text(h.get("content")) # <-- FIX (normalize) if role in ("user", "assistant", "system") and content: messages.append({"role": role, "content": content}) continue # fallback for other dict shapes text = extract_user_text(h) if text: messages.append({"role": "user", "content": text}) messages.append({"role": "user", "content": message}) return messages # stream chat to gradio def stream_chat(message, history): # STEP 1: NORMALISE INPUT message = extract_user_text(message) # STEP 2: BUILD MESSAGE HISTORY messages = build_messages(history, message) # STEP 3: TOKENIZE (use text prompt to avoid odd tensor/dict behavior) prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, ) input_ids = tokenizer(prompt, return_tensors="pt").input_ids # STEP 4: STREAMER streamer = TextIteratorStreamer( tokenizer, skip_prompt=True, skip_special_tokens=True, ) # STEP 5: GENERATION THREAD def _gen(): with torch.inference_mode(): model.generate( input_ids=input_ids, streamer=streamer, max_new_tokens=120, # CPU SAFE LIMIT temperature=0.7, do_sample=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, ) thread = Thread(target=_gen) thread.start() output = "" for text in streamer: if text: output += text yield output session_log.append({"user": message, "assistant": output}) # launch gradio ui demo = gr.ChatInterface( fn=stream_chat, title="🦙 LLaMA 3", description="Hi", ) demo.launch() """ import gradio as gr import torch import os import numpy as np from threading import Thread from empath import Empath from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer import torch.nn as nn # ------------------------- # CONFIG # ------------------------- token = os.getenv("HF_TOKEN") model_id = "meta-llama/Llama-3.2-3B-Instruct" device = "cpu" # ------------------------- # LOAD MODEL (FROZEN LLM) # ------------------------- tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( model_id, token=token, torch_dtype=torch.float32, device_map="cpu", ) model.eval() for p in model.parameters(): p.requires_grad = False # IMPORTANT: freeze LLM # ------------------------- # EMPATH STYLE ENCODER # ------------------------- class EmpathVectorizer: def __init__(self): self.lexicon = Empath() # fixed style space self.dimensions = [ "positive_emotion", "negative_emotion", "social", "communication", "technology", "cognitive_process", "work", "family", ] def transform(self, text): raw = self.lexicon.analyze(text, normalize=True) vec = np.array( [raw.get(d, 0.0) for d in self.dimensions], dtype=np.float32 ) vec = vec / (np.linalg.norm(vec) + 1e-8) return vec # ------------------------- # PREFIX TUNING MODULE (PEFT) # ------------------------- class PrefixPEFT(nn.Module): def __init__(self, style_dim, d_model, prefix_len=8): super().__init__() self.prefix_len = prefix_len self.mlp = nn.Sequential( nn.Linear(style_dim, d_model * prefix_len), nn.Tanh() ) def forward(self, style_vec): x = self.mlp(style_vec) return x.view(1, self.prefix_len, -1) # ------------------------- # INIT SYSTEM # ------------------------- vectorizer = EmpathVectorizer() prefix_model = PrefixPEFT( style_dim=len(vectorizer.dimensions), d_model=model.config.hidden_size, prefix_len=8 ) optimizer = torch.optim.Adam(prefix_model.parameters(), lr=1e-3) # ------------------------- # TEXT UTILS # ------------------------- def extract_user_text(x): if isinstance(x, str): return x if isinstance(x, dict): return x.get("text") or x.get("content") or "" return str(x) # ------------------------- # STYLE FROM USER HISTORY (CORRECT DESIGN) # ------------------------- def build_user_style(history): """ Empath is computed ONLY on USER messages. """ texts = [] if history: for h in history: if isinstance(h, (list, tuple)) and len(h) == 2: user = extract_user_text(h[0]) if user: texts.append(user) return " ".join(texts)[-3000:] # cap # ------------------------- # CHAT BUILDING # ------------------------- def build_messages(history, message): messages = [] if history: for h in history: if isinstance(h, (list, tuple)) and len(h) == 2: u = extract_user_text(h[0]) a = extract_user_text(h[1]) if u: messages.append({"role": "user", "content": u}) if a: messages.append({"role": "assistant", "content": a}) messages.append({"role": "user", "content": message}) return messages # ------------------------- # TRAIN STEP (CORE LEARNING LOGIC) # ------------------------- def train_step(user_history, prompt, target): optimizer.zero_grad() # 1. STYLE VECTOR (Empath on USER HISTORY ONLY) style_text = build_user_style(user_history) style_vec = vectorizer.transform(style_text) style_vec = torch.tensor(style_vec, dtype=torch.float32).unsqueeze(0) # 2. PREFIX prefix = prefix_model(style_vec) # 3. TOKENIZE prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids target_ids = tokenizer(target, return_tensors="pt").input_ids # 4. EMBEDDINGS prompt_embeds = model.model.embed_tokens(prompt_ids) # 5. PREFIX + PROMPT inputs_embeds = torch.cat([prefix, prompt_embeds], dim=1) attention_mask = torch.ones(inputs_embeds.shape[:2]) # 6. FORWARD outputs = model( inputs_embeds=inputs_embeds, attention_mask=attention_mask ) logits = outputs.logits[:, prefix.shape[1]:, :] # 7. LOSS loss = torch.nn.functional.cross_entropy( logits.reshape(-1, logits.size(-1)), target_ids.reshape(-1) ) loss.backward() optimizer.step() return loss.item() # ------------------------- # STREAM CHAT (INFERENCE) # ------------------------- def stream_chat(message, history): message = extract_user_text(message) messages = build_messages(history, message) # prompt = last user message context prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer(prompt, return_tensors="pt") input_ids = inputs["input_ids"] # STYLE (USER HISTORY ONLY) style_text = build_user_style(history + [(message, "")]) style_vec = vectorizer.transform(style_text) style_vec = torch.tensor(style_vec, dtype=torch.float32).unsqueeze(0) prefix = prefix_model(style_vec) token_embeds = model.model.embed_tokens(input_ids) inputs_embeds = torch.cat([prefix, token_embeds], dim=1) attention_mask = torch.ones(inputs_embeds.shape[:2]) streamer = TextIteratorStreamer( tokenizer, skip_prompt=True, skip_special_tokens=True, ) def _gen(): with torch.inference_mode(): model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, streamer=streamer, max_new_tokens=120, temperature=0.7, do_sample=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, ) Thread(target=_gen).start() out = "" for t in streamer: out += t yield out # ------------------------- # GRADIO UI # ------------------------- demo = gr.ChatInterface( fn=stream_chat, title="🧠 Empath + Prefix-Tuned LLaMA (User-Conditioned Style)", description="Style is learned ONLY from user history via Empath → prefix tuning." ) demo.launch()