import gradio as gr
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# Model configuration
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
DPO_ADAPTER = "jmcinern/qomhra-8B-awq-dpo-beta-0.5-checkpoint-checkpoint-100"
THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)
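# Illustrative example of the pattern (assumed Qwen3-style thinking tags):
#   THINK_TAG_PATTERN.sub("", "<think>reasoning</think>Dia dhuit")  ->  "Dia dhuit"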
class ChatBot:
def __init__(self):
self.model = None
self.tokenizer = None
self.loading = True
self.load_model()
def load_model(self):
"""Load model and tokenizer sequentially"""
try:
print("Loading tokenizer...")
self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME, trust_remote_code=True
)
            print("eos_token_id:", self.tokenizer.eos_token_id)
print("Tokenizer loaded!")
print("Loading model...")
base_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
device_map="auto",
torch_dtype="auto",
low_cpu_mem_usage=True,
)
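            # Note (assumption based on the repo name): the checkpoint is AWQ-quantized, so
            # transformers reads the quantization config from the repo; the autoawq package
            # must be installed for the weights to load.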
self.model = base_model
            # Uncomment to use the DPO adapter; I suspect it is causing problems.
'''
self.model = PeftModel.from_pretrained(
base_model,
DPO_ADAPTER
)
'''
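            # Debugging sketch (assumption: DPO_ADAPTER is a PEFT/LoRA checkpoint trained on
            # this base model) -- inspect the adapter config before wiring it back in:
            #   from peft import PeftConfig
            #   cfg = PeftConfig.from_pretrained(DPO_ADAPTER)
            #   print(cfg.base_model_name_or_path, cfg.peft_type)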
print("Model loaded!")
except Exception as e:
print(f"Error loading model: {e}")
finally:
self.loading = False
def chat(self, message, history):
if self.loading:
return history + [(message, "Model is loading, please wait...")]
if not self.model:
return history + [(message, "Model failed to load")]
# Build messages
messages = []
for user_msg, bot_msg in history:
messages.append({"role": "user", "content": user_msg})
messages.append({"role": "assistant", "content": bot_msg})
messages.append({"role": "user", "content": message})
# Apply chat template and strip thinking tags
prompt = self.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)
prompt = THINK_TAG_PATTERN.sub("", prompt)
print("----------PROMPT--------------")
print(prompt)
# Tokenize
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        # Use the end-of-turn token seen in training as the stop token (<|im_end|>, per the Qwen chat template)
stop_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
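        # Defensive fallback (assumption: convert_tokens_to_ids returns None/unk when the token
        # is missing); Qwen tokenizers include <|im_end|>, so this normally never triggers.
        if stop_id is None:
            stop_id = self.tokenizer.eos_token_id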
# Generate response
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=2048,
temperature=0.7,
do_sample=True,
return_dict_in_generate=True,
pad_token_id=self.tokenizer.eos_token_id,
eos_token_id=stop_id
)
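        # With return_dict_in_generate=True, generate() returns an output object whose
        # .sequences still includes the prompt tokens, so the prompt length is sliced off below.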
        # Decode and clean the response, with several debug printouts
# Inspect token IDs
gen_ids = outputs.sequences[0][len(inputs.input_ids[0]):]
print("\n--- GENERATED TOKEN IDS ---\n", gen_ids.tolist())
        # Decode, skipping special tokens so they don't leak into the final response
raw_output = self.tokenizer.decode(gen_ids, skip_special_tokens=True)
print("\n--- RAW DECODED OUTPUT ---\n", repr(raw_output))
# Show first generated token decoded individually
if len(gen_ids) > 0:
first_token = self.tokenizer.decode([gen_ids[0]])
print(f"\n--- FIRST TOKEN --- '{first_token}' ---")
# Clean as usual
response = THINK_TAG_PATTERN.sub("", raw_output).strip()
print("\n--- CLEANED RESPONSE ---\n", repr(response))
return history + [(message, response)]
# Initialize chatbot
bot = ChatBot()
# Create interface
with gr.Blocks() as demo:
    gr.HTML(
        # "Bilingual Irish–English LLM developed by Abair.ie"
        "<p>LLM dátheangach Gaeilge–Béarla forbartha ag Abair.ie</p>"
    )
chatbot = gr.Chatbot(height=400)
msg = gr.Textbox(placeholder="Type your message...", show_label=False)
msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)
if __name__ == "__main__":
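    # Optional (deployment assumption): calling demo.queue() before launch() serializes
    # concurrent requests so the single loaded model is not called in parallel.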
demo.launch()