Update handler.py
Browse files- handler.py +46 -41
handler.py
CHANGED
|
@@ -25,66 +25,71 @@ class EndpointHandler():
|
|
| 25 |
# Optional: Explicitly set pad token if needed
|
| 26 |
# if self.tokenizer.pad_token is None:
|
| 27 |
# self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 28 |
-
|
| 29 |
-
# Create a text-generation pipeline for easier handling
|
| 30 |
-
self.pipeline = pipeline(
|
| 31 |
-
"text-generation",
|
| 32 |
-
model=self.model,
|
| 33 |
-
tokenizer=self.tokenizer,
|
| 34 |
-
# device_map="auto" # device_map should be handled by model loading
|
| 35 |
-
)
|
| 36 |
print("Handler initialized: Model and tokenizer loaded.")
|
| 37 |
|
| 38 |
|
| 39 |
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 40 |
"""
|
| 41 |
-
Handles the inference request.
|
| 42 |
-
'data' is a dictionary containing the request payload.
|
| 43 |
-
We expect 'inputs' to hold the prompt text.
|
| 44 |
-
Optional 'parameters' can control generation settings.
|
| 45 |
"""
|
| 46 |
try:
|
| 47 |
# Extract inputs and parameters
|
| 48 |
-
|
| 49 |
parameters = data.pop("parameters", {})
|
| 50 |
|
| 51 |
-
if
|
| 52 |
return [{"error": "Missing 'inputs' key in request data."}]
|
| 53 |
|
| 54 |
-
#
|
| 55 |
-
if isinstance(
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
# Sensible defaults if not provided by user
|
| 65 |
-
parameters.setdefault("max_new_tokens", 64)
|
| 66 |
-
parameters.setdefault("temperature", 1.0)
|
| 67 |
-
parameters.setdefault("top_p", 0.95)
|
| 68 |
-
parameters.setdefault("top_k", 64)
|
| 69 |
-
# Ensure pipeline doesn't add EOS if user controls max_new_tokens precisely
|
| 70 |
-
# parameters.setdefault("return_full_text", False) # Often useful
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
print(f"Pipeline results: {results}")
|
| 82 |
|
| 83 |
-
# Return the results
|
| 84 |
-
return
|
| 85 |
|
| 86 |
except Exception as e:
|
| 87 |
-
# More detailed error logging
|
| 88 |
import traceback
|
| 89 |
print(f"Error during inference: {e}")
|
| 90 |
print(traceback.format_exc())
|
|
|
|
| 25 |
# Optional: Explicitly set pad token if needed
|
| 26 |
# if self.tokenizer.pad_token is None:
|
| 27 |
# self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
print("Handler initialized: Model and tokenizer loaded.")
|
| 29 |
|
| 30 |
|
| 31 |
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Handle an inference request using manual generation.

    Args:
        data: Request payload. Expects 'inputs' to hold a single prompt
            string; an optional 'parameters' dict overrides the default
            generation settings.

    Returns:
        A one-element list: [{"generated_text": ...}] on success, or
        [{"error": ...}] on invalid input or an internal failure.
    """
    try:
        # Extract inputs and parameters
        inputs_text = data.pop("inputs", None)
        parameters = data.pop("parameters", {})

        if inputs_text is None:
            return [{"error": "Missing 'inputs' key in request data."}]

        # Basic input validation
        if not isinstance(inputs_text, str):
            return [{"error": "Invalid 'inputs' format. Must be a single string for this handler."}]

        # Sensible generation defaults; user-supplied parameters win.
        params = {
            "max_new_tokens": 64,
            "temperature": 1.0,
            "top_p": 0.95,
            "top_k": 64,
            "do_sample": True,  # Explicitly enable sampling
            "pad_token_id": self.tokenizer.eos_token_id,  # Use EOS for padding
        }
        # Update with user-provided parameters
        params.update(parameters)

        print(f"Received input: '{inputs_text}'")
        print(f"Using parameters: {params}")

        # Apply the chat template explicitly so the model sees the prompt
        # structure it was trained with, then tokenize manually.
        messages = [{"role": "user", "content": inputs_text}]
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        print(f"Formatted prompt: '{prompt}'")

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **params)

        # outputs[0] holds prompt + completion; decode only the newly
        # generated tokens by slicing off the prompt length.
        input_length = inputs.input_ids.shape[1]
        generated_ids = outputs[0][input_length:]
        generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        print(f"Generated IDs length: {len(generated_ids)}")
        print(f"Decoded generated text: '{generated_text}'")

        # Return the results
        return [{"generated_text": generated_text}]

    except Exception as e:
        # Log the full traceback, then surface the failure to the caller.
        # (Previously this branch fell through and implicitly returned
        # None, breaking the declared List[Dict[str, Any]] contract.)
        import traceback
        print(f"Error during inference: {e}")
        print(traceback.format_exc())
        return [{"error": str(e)}]