import gradio as gr
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Model configuration
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
DPO_ADAPTER = "jmcinern/qomhra-8B-awq-dpo-beta-0.5-checkpoint-checkpoint-100"
THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)
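# The pattern strips Qwen3-style <think>...</think> reasoning blocks:
# re.DOTALL lets the non-greedy body match across newlines, and the
# trailing \s* consumes whitespace left behind after the closing tag.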


class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True
        self.load_model()

    def load_model(self):
        """Load model and tokenizer sequentially."""
        try:
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_NAME, trust_remote_code=True
            )
            print(self.tokenizer.eos_token_id)  # debug: confirm the tokenizer's EOS id
            print("Tokenizer loaded!")

            print("Loading model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
                low_cpu_mem_usage=True,
            )
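            # device_map="auto" requires the accelerate package, and
            # torch_dtype="auto" keeps the dtype stored in the checkpoint.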
            self.model = base_model
            # Uncomment to apply the DPO adapter; I suspect it is causing
            # problems, so the plain base model is used for now.
            # self.model = PeftModel.from_pretrained(base_model, DPO_ADAPTER)
| print("Model loaded!") | |
| except Exception as e: | |
| print(f"Error loading model: {e}") | |
| finally: | |
| self.loading = False | |

    def chat(self, message, history):
        if self.loading:
            return history + [(message, "Model is loading, please wait...")]
        if not self.model:
            return history + [(message, "Model failed to load")]

        # Build messages
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})
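        # For a one-turn history the resulting structure looks like:
        # [{"role": "user", "content": "..."},
        #  {"role": "assistant", "content": "..."},
        #  {"role": "user", "content": message}]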

        # Apply chat template and strip thinking tags
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)
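        # Note: with enable_thinking=False the Qwen3 chat template already
        # disables reasoning for the new turn (recent template versions emit
        # an empty <think></think> pair), so this regex pass is a defensive
        # cleanup of any think blocks left in earlier turns.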
| print("----------PROMPT--------------") | |
| print(prompt) | |

        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Set EOS seen in training (per Qwen chat template)
        stop_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
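        # <|im_end|> is the ChatML turn terminator that Qwen chat templates
        # emit at the end of every turn, so generation stops cleanly there.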

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.7,
                do_sample=True,
                return_dict_in_generate=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=stop_id,
            )
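        # Setting pad_token_id explicitly also avoids transformers' warning
        # about falling back to eos_token_id during open-ended generation.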

        # Decode and clean response, with multiple debugs
        # Inspect token IDs
        gen_ids = outputs.sequences[0][len(inputs.input_ids[0]):]
        print("\n--- GENERATED TOKEN IDS ---\n", gen_ids.tolist())

        # Decode the generated ids (special tokens skipped)
        raw_output = self.tokenizer.decode(gen_ids, skip_special_tokens=True)
        print("\n--- RAW DECODED OUTPUT ---\n", repr(raw_output))

        # Show first generated token decoded individually
        if len(gen_ids) > 0:
            first_token = self.tokenizer.decode([gen_ids[0]])
            print(f"\n--- FIRST TOKEN --- '{first_token}' ---")

        # Clean as usual
        response = THINK_TAG_PATTERN.sub("", raw_output).strip()
        print("\n--- CLEANED RESPONSE ---\n", repr(response))
        return history + [(message, response)]


# Initialize chatbot
bot = ChatBot()

# Create interface
with gr.Blocks() as demo:
    # Heading (Irish): "Bilingual Irish-English LLM developed by Abair.ie"
    gr.HTML('<h1 style="margin:0;">LLM dátheangach Gaeilge–Béarla forbartha ag Abair.ie</h1>')
    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
    msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)
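
# Note: on shared hardware, demo.queue() can be chained before launch() so
# concurrent requests are queued rather than hitting the GPU all at once.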

if __name__ == "__main__":
    demo.launch()