Yong Liu committed on
Commit
ead8711
·
1 Parent(s): 0b6ae9b

handler.py updated

Browse files
Files changed (1) hide show
  1. handler.py +26 -1
handler.py CHANGED
@@ -237,11 +237,16 @@ class EndpointHandler:
237
  logger.info(f"Input prompt length: {len(input_text)} characters")
238
 
239
  # Generate one token at a time to avoid index errors
240
- max_steps = min(max_new_tokens, 250) # Limit to 250 tokens for reliability
 
241
  current_ids = input_ids.clone()
242
 
243
  logger.info(f"Generating up to {max_steps} tokens")
244
 
 
 
 
 
245
  for i in range(max_steps):
246
  if i % 50 == 0:
247
  logger.info(f"Generated {i} tokens so far")
@@ -284,6 +289,19 @@ class EndpointHandler:
284
  current_ids = torch.cat([current_ids, next_token], dim=-1)
285
  attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  # Check if we've generated an EOS token
288
  if next_token[0, 0].item() == self.tokenizer.eos_token_id:
289
  logger.info(f"EOS token generated after {i+1} tokens")
@@ -298,6 +316,13 @@ class EndpointHandler:
298
  if len(split_text) > 1:
299
  response_text = split_text[1].strip()
300
  logger.info(f"Extracted assistant response: {len(response_text)} characters")
 
 
 
 
 
 
 
301
  else:
302
  # Fallback if the expected format is not found
303
  logger.warning("Could not find assistant tag in generated text")
 
237
  logger.info(f"Input prompt length: {len(input_text)} characters")
238
 
239
  # Generate one token at a time to avoid index errors
240
+ # Increase from 250 to 500 to allow for longer completions
241
+ max_steps = min(max_new_tokens, 500)
242
  current_ids = input_ids.clone()
243
 
244
  logger.info(f"Generating up to {max_steps} tokens")
245
 
246
+ # Keep track of last 5 tokens to detect repetition
247
+ last_tokens = []
248
+ repetition_detected = False
249
+
250
  for i in range(max_steps):
251
  if i % 50 == 0:
252
  logger.info(f"Generated {i} tokens so far")
 
289
  current_ids = torch.cat([current_ids, next_token], dim=-1)
290
  attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
291
 
292
+ # Add to last tokens list for repetition detection
293
+ last_tokens.append(next_token.item())
294
+ if len(last_tokens) > 5:
295
+ last_tokens.pop(0)
296
+
297
+ # Check for repetition (if we have at least 5 tokens)
298
+ if len(last_tokens) >= 5:
299
+ # Check if all last 5 tokens are the same
300
+ if len(set(last_tokens)) == 1:
301
+ logger.warning(f"Repetition detected after {i+1} tokens, stopping generation")
302
+ repetition_detected = True
303
+ break
304
+
305
  # Check if we've generated an EOS token
306
  if next_token[0, 0].item() == self.tokenizer.eos_token_id:
307
  logger.info(f"EOS token generated after {i+1} tokens")
 
316
  if len(split_text) > 1:
317
  response_text = split_text[1].strip()
318
  logger.info(f"Extracted assistant response: {len(response_text)} characters")
319
+
320
+ # Check if the response text ends with a complete sentence
321
+ if not repetition_detected and not response_text.endswith(('.', '!', '?', ':', ';', '"', "'", ')', ']', '}')):
322
+ # Add an ellipsis to indicate truncation
323
+ response_text += "..."
324
+ logger.info("Added ellipsis to incomplete sentence")
325
+
326
  else:
327
  # Fallback if the expected format is not found
328
  logger.warning("Could not find assistant tag in generated text")