import gradio as gr
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Model configuration
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
DPO_ADAPTER = "jmcinern/qomhra-8B-awq-dpo-beta-0.5-checkpoint-checkpoint-100"

# Strips Qwen-style <think>...</think> blocks (and any trailing whitespace)
THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)


class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True
        self.load_model()

    def load_model(self):
        """Load model and tokenizer sequentially."""
        try:
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_NAME, trust_remote_code=True
            )
            print(self.tokenizer.eos_token_id)  # debug: show the EOS token id
            print("Tokenizer loaded!")

            print("Loading model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
                low_cpu_mem_usage=True,
            )
            self.model = base_model
            # Uncomment to apply the DPO adapter; I suspect it is causing problems:
            # self.model = PeftModel.from_pretrained(base_model, DPO_ADAPTER)
            print("Model loaded!")
        except Exception as e:
            print(f"Error loading model: {e}")
        finally:
            self.loading = False

    def chat(self, message, history):
        if self.loading:
            return history + [(message, "Model is loading, please wait...")]
        if not self.model:
            return history + [(message, "Model failed to load")]

        # Build the message list from the chat history
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})

        # Apply the chat template and strip thinking tags
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)
        print("----------PROMPT--------------")
        print(prompt)

        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Stop on the EOS seen in training (per the Qwen chat template)
        stop_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")

        # Generate the response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.7,
                do_sample=True,
                return_dict_in_generate=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=stop_id,
            )

        # Decode and clean the response, with several debug printouts.
        # Inspect the generated token ids (prompt tokens excluded).
        gen_ids = outputs.sequences[0][len(inputs.input_ids[0]):]
        print("\n--- GENERATED TOKEN IDS ---\n", gen_ids.tolist())

        # Decode, skipping special tokens so they don't leak into the reply
        raw_output = self.tokenizer.decode(gen_ids, skip_special_tokens=True)
        print("\n--- RAW DECODED OUTPUT ---\n", repr(raw_output))

        # Show the first generated token decoded individually
        if len(gen_ids) > 0:
            first_token = self.tokenizer.decode([gen_ids[0].item()])
            print(f"\n--- FIRST TOKEN --- '{first_token}' ---")

        # Clean as usual
        response = THINK_TAG_PATTERN.sub("", raw_output).strip()
        print("\n--- CLEANED RESPONSE ---\n", repr(response))

        return history + [(message, response)]


# Initialize the chatbot
bot = ChatBot()

# Create the interface
with gr.Blocks() as demo:
    # Banner: "Bilingual Irish-English LLM developed by Abair.ie"
    # (the original HTML markup around this text was lost; plain text kept)
    gr.HTML("LLM dátheangach Gaeilge–Béarla forbartha ag Abair.ie")
    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
    msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)

if __name__ == "__main__":
    demo.launch()
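
# A minimal smoke test of the chat flow without launching the web UI.
# Hypothetical REPL session (the Irish greeting is just an example input;
# actual output depends on the loaded weights):
#
#   >>> bot.tokenizer.convert_tokens_to_ids("<|im_end|>")  # the stop id used above
#   >>> history = bot.chat("Dia dhuit!", [])               # one user turn, empty history
#   >>> print(history[-1][1])                              # the cleaned model response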