Vivek16 commited on
Commit
e4cb5da
·
verified ·
1 Parent(s): b2c590c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -63
app.py CHANGED
@@ -3,27 +3,23 @@ import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  from peft import PeftModel
5
 
6
- # --- Configuration ---
7
  BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
8
  ADAPTER_MODEL_ID = "Vivek16/Root_Math-TinyLlama-CPU"
9
 
10
- # Define the single, strong system instruction
11
- SYSTEM_INSTRUCTION = (
12
- "You are a friendly, helpful, and highly SKILLED assistant named Kutti. "
13
- "Your responses MUST be concise and direct. You can handle any conversation, "
14
- "but when asked a problem (especially math), provide the correct step-by-step solution. "
15
- "DO NOT use excessive conversational filler or repetitive phrases. Stick to the point."
16
- )
17
 
18
 
19
  # --- Model Loading Function ---
20
  def load_model():
21
  """Loads the base model and merges the LoRA adapters."""
22
  print("Loading base model...")
23
- # Load the tokenizer, which includes the necessary chat template
24
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
25
-
26
- # Force loading to CPU as per your setup
27
  model = AutoModelForCausalLM.from_pretrained(
28
  BASE_MODEL_ID,
29
  torch_dtype=torch.bfloat16,
@@ -31,94 +27,84 @@ def load_model():
31
  )
32
 
33
  print("Loading and merging PEFT adapters...")
 
34
  model = PeftModel.from_pretrained(model, ADAPTER_MODEL_ID)
35
  model = model.merge_and_unload()
36
  model.eval()
37
 
 
38
  if tokenizer.pad_token is None:
39
  tokenizer.pad_token = tokenizer.eos_token
40
 
41
  print("Model loaded and merged successfully!")
42
  return tokenizer, model
43
 
 
44
  tokenizer, model = load_model()
45
 
46
 
47
- # --- Prediction Function (Modified for MAX stability and lower temperature) ---
48
  def generate_response(message, history):
49
- """Generates a response using the official chat template and generation constraints."""
50
 
51
- # 1. Prepare messages list for the chat template
52
- messages = []
53
 
54
- # Add the system instruction first
55
- messages.append({"role": "system", "content": SYSTEM_INSTRUCTION})
56
-
57
- # Add historical messages
58
- for message_dict in history:
59
- messages.append({"role": message_dict['role'], "content": message_dict['content']})
60
-
61
- # Add the current user message
62
- messages.append({"role": "user", "content": message})
63
-
64
- # 2. Apply the model's official chat template
65
- # NOTE: The "TinyLlama/TinyLlama-1.1B-Chat-v1.0" model expects a template like:
66
- # <|system|>\nSYSTEM_INSTRUCTION</s>\n<|user|>\nMESSAGE</s>\n<|assistant|>\n
67
- full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
68
 
69
- # 3. Tokenize the input
70
  inputs = tokenizer(full_prompt, return_tensors="pt")
71
-
72
- # 4. Generate the response with anti-repetition constraints and LOWER TEMPERATURE
73
  with torch.no_grad():
74
  output_tokens = model.generate(
75
  **inputs,
76
  max_new_tokens=256,
77
  do_sample=True,
78
- temperature=0.6, # Slightly lower temp for less gibberish
79
  top_k=50,
80
- pad_token_id=tokenizer.eos_token_id,
81
- # Constraints to prevent repetitive filler:
82
- no_repeat_ngram_size=5,
83
- repetition_penalty=1.5
84
  )
85
 
86
- # 5. Decode and clean the output using skip_special_tokens=True for max cleanup
87
- # We still need to find where the *new* response begins.
88
- generated_text_with_prompt = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
89
 
90
- # Extract only the model's new response by finding the last <|assistant|> tag
91
- # The last tag marks the beginning of the new response.
92
- assistant_prefix_tag = "<|assistant|>"
93
- response_start_index = generated_text_with_prompt.rfind(assistant_prefix_tag)
94
-
95
- if response_start_index != -1:
96
- # Get everything after the last <|assistant|> tag
97
- raw_response = generated_text_with_prompt[response_start_index + len(assistant_prefix_tag):].strip()
98
-
99
- # Clean up any trailing end-of-sequence tags (</s>) or user tags (<|user|>)
100
- assistant_response = raw_response.split("</s>")[0].split("<|user|>")[0].strip()
101
  else:
102
- # Fallback to the decoded text if the tag is not found (and hope for the best)
103
- assistant_response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
104
 
105
  return assistant_response
106
 
107
 
108
- # --- Gradio Chat Interface (No Change) ---
109
- title = "Kutti: Your TinyLlama Problem Solver"
110
- description = "Hello! I'm Kutti. How can I help you? Ask me anything from math problems to general questions."
111
 
112
  gr.ChatInterface(
113
  fn=generate_response,
114
- chatbot=gr.Chatbot(
115
- height=500,
116
- type='messages',
117
- value=[{'role': 'assistant', 'content': "Hello! I'm Kutti. How can I help you today?"}]
118
- ),
119
- textbox=gr.Textbox(placeholder="Ask your question or problem here...", scale=7),
120
  title=title,
121
  description=description,
122
- submit_btn="Send",
 
 
123
  theme="soft"
124
  ).queue().launch()
 
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  from peft import PeftModel
5
 
6
+ # --- Configuration (Verified) ---
7
  BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
8
+ # Ensure this is correct for your model repository
9
  ADAPTER_MODEL_ID = "Vivek16/Root_Math-TinyLlama-CPU"
10
 
11
+ # Define the instruction template components
12
+ SYSTEM_INSTRUCTION = "Solve the following math problem:"
13
+ USER_TEMPLATE = "<|user|>\n{}</s>"
14
+ ASSISTANT_TEMPLATE = "<|assistant|>\n{}</s>"
 
 
 
15
 
16
 
17
  # --- Model Loading Function ---
18
  def load_model():
19
  """Loads the base model and merges the LoRA adapters."""
20
  print("Loading base model...")
21
+ # Use bfloat16 for efficiency on CPU
22
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
 
 
23
  model = AutoModelForCausalLM.from_pretrained(
24
  BASE_MODEL_ID,
25
  torch_dtype=torch.bfloat16,
 
27
  )
28
 
29
  print("Loading and merging PEFT adapters...")
30
+ # Load the trained LoRA adapters
31
  model = PeftModel.from_pretrained(model, ADAPTER_MODEL_ID)
32
  model = model.merge_and_unload()
33
  model.eval()
34
 
35
+ # Ensure pad token is set for generation
36
  if tokenizer.pad_token is None:
37
  tokenizer.pad_token = tokenizer.eos_token
38
 
39
  print("Model loaded and merged successfully!")
40
  return tokenizer, model
41
 
42
+ # Load the model outside the prediction function for efficiency
43
  tokenizer, model = load_model()
44
 
45
 
46
+ # --- Prediction Function for gr.ChatInterface ---
47
  def generate_response(message, history):
48
+ """Generates a response using chat history and the fine-tuned model."""
49
 
50
+ # 1. Build the full prompt including System Instruction, History, and current Message
 
51
 
52
+ # Start with the system instruction
53
+ full_prompt = f"<|system|>\n{SYSTEM_INSTRUCTION}</s>\n"
54
+
55
+ # Append the chat history (if any)
56
+ for user_msg, assistant_msg in history:
57
+ full_prompt += USER_TEMPLATE.format(user_msg) + "\n"
58
+ full_prompt += ASSISTANT_TEMPLATE.format(assistant_msg) + "\n"
59
+
60
+ # Append the current user message and the start of the assistant's turn
61
+ full_prompt += USER_TEMPLATE.format(message) + "\n"
62
+ full_prompt += "<|assistant|>\n"
63
+
64
+ print(f"--- Full Prompt ---\n{full_prompt}")
 
65
 
66
+ # 2. Tokenize the input
67
  inputs = tokenizer(full_prompt, return_tensors="pt")
68
+
69
+ # 3. Generate the response (on CPU)
70
  with torch.no_grad():
71
  output_tokens = model.generate(
72
  **inputs,
73
  max_new_tokens=256,
74
  do_sample=True,
75
+ temperature=0.7,
76
  top_k=50,
77
+ pad_token_id=tokenizer.eos_token_id
 
 
 
78
  )
79
 
80
+ # 4. Decode the output
81
+ generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
 
82
 
83
+ # 5. Extract only the model's new response
84
+ # Find the start of the assistant's turn in the output and everything after it
85
+ response_start = generated_text.rfind('<|assistant|>')
86
+ if response_start != -1:
87
+ # Get the text after <|assistant|> and strip the trailing </s>
88
+ raw_response = generated_text[response_start + len('<|assistant|>'):].strip()
89
+ assistant_response = raw_response.split('</s>')[0].strip()
 
 
 
 
90
  else:
91
+ assistant_response = "Error: Could not parse model output."
 
92
 
93
  return assistant_response
94
 
95
 
96
+ # --- Gradio Chat Interface ---
97
+ title = "Root Math TinyLlama 1.1B - Gemini-Like Chat Demo"
98
+ description = "A conversational interface for the CPU-friendly TinyLlama model fine-tuned for math problems. Ask follow-up questions!"
99
 
100
  gr.ChatInterface(
101
  fn=generate_response,
102
+ chatbot=gr.Chatbot(height=500), # Makes the chat history window taller
103
+ textbox=gr.Textbox(placeholder="Enter your math problem or follow-up question...", scale=7),
 
 
 
 
104
  title=title,
105
  description=description,
106
+ submit_btn="Ask Model",
107
+ clear_btn="Start New Chat",
108
+ undo_btn="Undo Last Message",
109
  theme="soft"
110
  ).queue().launch()