Yong Liu committed
Commit · 4aa4d08
1 Parent(s): 093ad9c
update handler
Browse files: handler.py (+54 -7)

handler.py CHANGED
@@ -25,14 +25,37 @@ class EndpointHandler:
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
 
+        # Determine the device to use
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Using device: {self.device}")
+
         # Load model directly without pipeline
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             torch_dtype=torch.float16,
             device_map="auto"
         )
+        # Ensure model is on the correct device
+        if torch.cuda.is_available():
+            self.model = self.model.cuda()
+
         print("Model loaded successfully")
 
+        # For Phi3 models, monkey patch the RotaryEmbedding
+        try:
+            from transformers.models.phi3.modeling_phi3 import PhiRotaryEmbedding
+            original_forward = PhiRotaryEmbedding.forward
+
+            def patched_forward(self, position_ids, query, key, value=None):
+                # Ensure position_ids is on the same device as query
+                position_ids = position_ids.to(query.device)
+                return original_forward(self, position_ids, query, key, value)
+
+            PhiRotaryEmbedding.forward = patched_forward
+            print("Successfully patched PhiRotaryEmbedding.forward")
+        except Exception as e:
+            print(f"Could not patch PhiRotaryEmbedding: {str(e)}")
+
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Handle inference request in OpenAI-like format or HuggingFace Inference API format"""
         try:
@@ -142,8 +165,13 @@
             prompt = inputs["prompt"]
             params = inputs["generation_params"]
 
-            #
-
+            # Get the model's device
+            device = next(self.model.parameters()).device
+            print(f"Model is on device: {device}")
+
+            # Tokenize input and ensure it's on the correct device
+            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
+            print(f"Input tensor device: {input_ids.device}")
 
             # Count input tokens
             input_tokens = input_ids.shape[1]
@@ -159,11 +187,30 @@
             }
 
             # Generate output
-
-
-
-
-
+            try:
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        input_ids,
+                        **generation_kwargs
+                    )
+                print(f"Output tensor device: {outputs.device}")
+            except RuntimeError as e:
+                if "Expected all tensors to be on the same device" in str(e):
+                    print("Caught device mismatch error, trying to fix...")
+                    # A more drastic approach: move the model completely to CPU if there's a device issue
+                    if torch.cuda.is_available():
+                        print("Moving everything to CPU as a fallback")
+                        self.model = self.model.cpu()
+                        input_ids = input_ids.cpu()
+                        with torch.no_grad():
+                            outputs = self.model.generate(
+                                input_ids,
+                                **generation_kwargs
+                            )
+                    else:
+                        raise
+                else:
+                    raise
 
             # Decode output
             generated_texts = []
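
For reference, a custom handler like this is invoked by the endpoint runtime as a callable that receives the parsed JSON request body. The minimal local smoke test below is a sketch, not part of the commit: the constructor argument ("/repository" as a local model path) and the request keys ("inputs", "parameters") follow the usual conventions for custom Inference Endpoints handlers but are not shown in this diff, so treat them as assumptions.

# local_smoke_test.py (sketch, assumes it runs next to handler.py)
from handler import EndpointHandler

# Hypothetical local path to the model files; Inference Endpoints normally
# pass the repository directory to the handler's constructor.
handler = EndpointHandler("/repository")

# Assumed payload schema; the keys actually expected by this handler are
# defined in parts of handler.py not shown in the diff above.
payload = {
    "inputs": "Explain rotary position embeddings in one sentence.",
    "parameters": {"max_new_tokens": 64, "temperature": 0.7},
}
print(handler(payload))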