Vivek16 committed on
Commit
b2c590c
·
verified ·
1 Parent(s): f94bcb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -36
app.py CHANGED
@@ -16,13 +16,14 @@ SYSTEM_INSTRUCTION = (
16
  )
17
 
18
 
19
- # --- Model Loading Function (No change from last successful load) ---
20
  def load_model():
21
  """Loads the base model and merges the LoRA adapters."""
22
  print("Loading base model...")
23
  # Load the tokenizer, which includes the necessary chat template
24
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
25
 
 
26
  model = AutoModelForCausalLM.from_pretrained(
27
  BASE_MODEL_ID,
28
  torch_dtype=torch.bfloat16,
@@ -43,7 +44,7 @@ def load_model():
43
  tokenizer, model = load_model()
44
 
45
 
46
- # --- Prediction Function (KEY MODIFICATION: Using tokenizer.apply_chat_template) ---
47
  def generate_response(message, history):
48
  """Generates a response using the official chat template and generation constraints."""
49
 
@@ -55,25 +56,26 @@ def generate_response(message, history):
55
 
56
  # Add historical messages
57
  for message_dict in history:
58
- # Gradio history items are dicts with 'role' and 'content' keys
59
  messages.append({"role": message_dict['role'], "content": message_dict['content']})
60
 
61
  # Add the current user message
62
  messages.append({"role": "user", "content": message})
63
 
64
- # 2. Apply the model's official chat template to the entire conversation
 
 
65
  full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
66
 
67
  # 3. Tokenize the input
68
  inputs = tokenizer(full_prompt, return_tensors="pt")
69
 
70
- # 4. Generate the response with anti-repetition constraints
71
  with torch.no_grad():
72
  output_tokens = model.generate(
73
  **inputs,
74
  max_new_tokens=256,
75
  do_sample=True,
76
- temperature=0.7,
77
  top_k=50,
78
  pad_token_id=tokenizer.eos_token_id,
79
  # Constraints to prevent repetitive filler:
@@ -81,37 +83,24 @@ def generate_response(message, history):
81
  repetition_penalty=1.5
82
  )
83
 
84
- # 5. Decode and clean the output
85
- # Decode the entire output sequence
86
- generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
87
 
88
- # The output contains the *entire* prompt + the new response.
89
- # We must strip the prompt and the user's final message to get the clean response.
90
-
91
- # The last user message is the end of the prompt we want to remove
92
- last_user_message = messages[-1]["content"]
93
-
94
- # Find the beginning of the model's answer, which comes after the last user message
95
- # We use the full user message content for a reliable split point.
96
-
97
- # In the full_prompt format, the model is expected to start immediately after the last user turn.
98
- # We use a simple method: find the last user message and take everything after it.
99
-
100
- try:
101
- # Find where the final user message ends in the generated text (plus a little padding for the template)
102
- split_point = generated_text.rfind(last_user_message)
103
- if split_point != -1:
104
- # Everything after the split point is the generated response
105
- assistant_response = generated_text[split_point + len(last_user_message):].strip()
106
- else:
107
- # Fallback extraction (may be less reliable)
108
- assistant_response = generated_text.strip()
109
- except Exception:
110
- # General safety fallback
111
- assistant_response = generated_text.strip()
112
-
113
- # Final cleanup to ensure no special tokens or remnants are left if skip_special_tokens=False
114
- assistant_response = assistant_response.split('</s>')[0].split('<|user|>')[0].strip()
115
 
116
  return assistant_response
117
 
 
16
  )
17
 
18
 
19
+ # --- Model Loading Function ---
20
  def load_model():
21
  """Loads the base model and merges the LoRA adapters."""
22
  print("Loading base model...")
23
  # Load the tokenizer, which includes the necessary chat template
24
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
25
 
26
+ # Force loading to CPU as per your setup
27
  model = AutoModelForCausalLM.from_pretrained(
28
  BASE_MODEL_ID,
29
  torch_dtype=torch.bfloat16,
 
44
  tokenizer, model = load_model()
45
 
46
 
47
+ # --- Prediction Function (Modified for MAX stability and lower temperature) ---
48
  def generate_response(message, history):
49
  """Generates a response using the official chat template and generation constraints."""
50
 
 
56
 
57
  # Add historical messages
58
  for message_dict in history:
 
59
  messages.append({"role": message_dict['role'], "content": message_dict['content']})
60
 
61
  # Add the current user message
62
  messages.append({"role": "user", "content": message})
63
 
64
+ # 2. Apply the model's official chat template
65
+ # NOTE: The "TinyLlama/TinyLlama-1.1B-Chat-v1.0" model expects a template like:
66
+ # <|system|>\nSYSTEM_INSTRUCTION</s>\n<|user|>\nMESSAGE</s>\n<|assistant|>\n
67
  full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
68
 
69
  # 3. Tokenize the input
70
  inputs = tokenizer(full_prompt, return_tensors="pt")
71
 
72
+ # 4. Generate the response with anti-repetition constraints and LOWER TEMPERATURE
73
  with torch.no_grad():
74
  output_tokens = model.generate(
75
  **inputs,
76
  max_new_tokens=256,
77
  do_sample=True,
78
+ temperature=0.6, # Slightly lower temp for less gibberish
79
  top_k=50,
80
  pad_token_id=tokenizer.eos_token_id,
81
  # Constraints to prevent repetitive filler:
 
83
  repetition_penalty=1.5
84
  )
85
 
86
+ # 5. Decode with skip_special_tokens=False so the template tags survive for locating the new response
87
+ # We still need to find where the *new* response begins.
88
+ generated_text_with_prompt = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
89
 
90
+ # Extract only the model's new response by finding the last <|assistant|> tag
91
+ # The last tag marks the beginning of the new response.
92
+ assistant_prefix_tag = "<|assistant|>"
93
+ response_start_index = generated_text_with_prompt.rfind(assistant_prefix_tag)
94
+
95
+ if response_start_index != -1:
96
+ # Get everything after the last <|assistant|> tag
97
+ raw_response = generated_text_with_prompt[response_start_index + len(assistant_prefix_tag):].strip()
98
+
99
+ # Clean up any trailing end-of-sequence tags (</s>) or user tags (<|user|>)
100
+ assistant_response = raw_response.split("</s>")[0].split("<|user|>")[0].strip()
101
+ else:
102
+ # Fallback to the decoded text if the tag is not found (and hope for the best)
103
+ assistant_response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  return assistant_response
106