yol146 committed on
Commit ·
290cf25
1
Parent(s): 72ed73b
modify the handler
Browse files- handler.py +8 -7
handler.py
CHANGED
|
@@ -106,7 +106,7 @@ class EndpointHandler:
|
|
| 106 |
|
| 107 |
# Tokenize the input safely
|
| 108 |
inputs = self.tokenizer(prompt, return_tensors="pt")
|
| 109 |
-
logger.info(f"Input tokens shape: {inputs
|
| 110 |
|
| 111 |
# Move to device
|
| 112 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
@@ -135,9 +135,10 @@ class EndpointHandler:
|
|
| 135 |
|
| 136 |
logger.info(f"Generating with config: {generation_config}")
|
| 137 |
|
|
|
|
| 138 |
outputs = self.model.generate(
|
| 139 |
-
inputs
|
| 140 |
-
attention_mask=inputs.
|
| 141 |
**generation_config
|
| 142 |
)
|
| 143 |
|
|
@@ -145,7 +146,7 @@ class EndpointHandler:
|
|
| 145 |
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 146 |
|
| 147 |
# Return only the newly generated text (without the prompt)
|
| 148 |
-
input_text = self.tokenizer.decode(inputs
|
| 149 |
|
| 150 |
if generated_text.startswith(input_text):
|
| 151 |
response_text = generated_text[len(input_text):]
|
|
@@ -168,8 +169,8 @@ class EndpointHandler:
|
|
| 168 |
|
| 169 |
# Set up generation in a separate thread
|
| 170 |
generation_kwargs = {
|
| 171 |
-
"input_ids": inputs
|
| 172 |
-
"attention_mask": inputs.
|
| 173 |
"streamer": streamer,
|
| 174 |
"max_new_tokens": max_new_tokens,
|
| 175 |
"temperature": temperature,
|
|
@@ -182,7 +183,7 @@ class EndpointHandler:
|
|
| 182 |
thread.start()
|
| 183 |
|
| 184 |
# Determine input text length to strip it from outputs
|
| 185 |
-
input_text = self.tokenizer.decode(inputs
|
| 186 |
|
| 187 |
# Stream the output
|
| 188 |
def generate_stream():
|
|
|
|
| 106 |
|
| 107 |
# Tokenize the input safely
|
| 108 |
inputs = self.tokenizer(prompt, return_tensors="pt")
|
| 109 |
+
logger.info(f"Input tokens shape: {inputs['input_ids'].shape}")
|
| 110 |
|
| 111 |
# Move to device
|
| 112 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
|
|
| 135 |
|
| 136 |
logger.info(f"Generating with config: {generation_config}")
|
| 137 |
|
| 138 |
+
# Fix: inputs is a dictionary, not an object with attributes
|
| 139 |
outputs = self.model.generate(
|
| 140 |
+
inputs["input_ids"],
|
| 141 |
+
attention_mask=inputs.get("attention_mask", None),
|
| 142 |
**generation_config
|
| 143 |
)
|
| 144 |
|
|
|
|
| 146 |
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 147 |
|
| 148 |
# Return only the newly generated text (without the prompt)
|
| 149 |
+
input_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
| 150 |
|
| 151 |
if generated_text.startswith(input_text):
|
| 152 |
response_text = generated_text[len(input_text):]
|
|
|
|
| 169 |
|
| 170 |
# Set up generation in a separate thread
|
| 171 |
generation_kwargs = {
|
| 172 |
+
"input_ids": inputs["input_ids"],
|
| 173 |
+
"attention_mask": inputs.get("attention_mask", None),
|
| 174 |
"streamer": streamer,
|
| 175 |
"max_new_tokens": max_new_tokens,
|
| 176 |
"temperature": temperature,
|
|
|
|
| 183 |
thread.start()
|
| 184 |
|
| 185 |
# Determine input text length to strip it from outputs
|
| 186 |
+
input_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
| 187 |
|
| 188 |
# Stream the output
|
| 189 |
def generate_stream():
|