Update handler.py

handler.py (+71 −19)

@@ -45,32 +45,49 @@ class EndpointHandler:
         # Find input length to properly calculate output length
         input_length = inputs.shape[1]

-        #
-
-            inputs,
-            attention_mask
-            max_new_tokens
+        # Basic generation parameters
+        generation_kwargs = {
+            "inputs": inputs,
+            "attention_mask": attention_mask,
+            "max_new_tokens": max_new_tokens,

             # Performance options
-            use_cache
+            "use_cache": True,  # Use KV cache for faster generation

             # Quality vs. speed tradeoff
-            temperature
-            top_p
-            do_sample
-            num_beams
+            "temperature": 0.7 if self.use_sampling else 1.0,
+            "top_p": 0.9 if self.use_sampling else 1.0,
+            "do_sample": self.use_sampling,  # Sampling is slightly slower but better quality
+            "num_beams": 1,  # Beam search is slower but better quality (1 = no beam search)

             # Token handling
-            pad_token_id
-            eos_token_id
+            "pad_token_id": self.tokenizer.pad_token_id,
+            "eos_token_id": self.tokenizer.eos_token_id,

             # Content quality
-            repetition_penalty
-
-
-
-
-
+            "repetition_penalty": 1.1,  # Reduce repetition
+        }
+
+        # Add Flash Attention parameters only if supported by the transformers version
+        # We check the transformers version by testing in a safe way
+        try:
+            import importlib
+            transformers_version = importlib.import_module('transformers').__version__
+            major, minor = map(int, transformers_version.split('.')[:2])
+
+            if major > 4 or (major == 4 and minor >= 32):
+                # Flash Attention support was added in transformers 4.32.0
+                if self.flash_attention_supported:
+                    print("Using Flash Attention in generation")
+                    generation_kwargs["flash_attn"] = True
+                    generation_kwargs["flash_attn_cross_entropy"] = True
+            else:
+                print(f"Flash Attention not added - transformers version {transformers_version} doesn't support it")
+        except Exception as e:
+            print(f"Error checking transformers version, skipping Flash Attention: {e}")
+
+        # Generate with optimized parameters for GPU performance
+        outputs = self.model.generate(**generation_kwargs)

         return outputs, input_length

@@ -190,6 +207,41 @@ class EndpointHandler:
                 device_map="auto",
             )

+            # Check for Flash Attention support with better error handling
+            try:
+                # First check if the transformers version supports it
+                import importlib
+                transformers_version = importlib.import_module('transformers').__version__
+                major, minor = map(int, transformers_version.split('.')[:2])
+
+                if major > 4 or (major == 4 and minor >= 32):
+                    # Flash Attention support was added in transformers 4.32.0
+                    try:
+                        import flash_attn
+                        self.flash_attention_supported = True
+                        print(f"Flash Attention {flash_attn.__version__} detected and will be used if available!")
+                    except ImportError:
+                        print("Flash Attention library not installed. Using standard attention mechanism.")
+                        self.flash_attention_supported = False
+                else:
+                    print(f"Transformers version {transformers_version} doesn't support Flash Attention parameters. Using standard attention.")
+                    self.flash_attention_supported = False
+            except Exception as e:
+                print(f"Error checking Flash Attention support: {e}")
+                print("Falling back to standard attention mechanism.")
+                self.flash_attention_supported = False
+
+            # Enable TF32 precision for higher performance on newer NVIDIA GPUs
+            if self.device == "cuda":
+                # Only available on Ampere+ GPUs (A100, RTX 3090, etc.)
+                try:
+                    if torch.cuda.get_device_capability()[0] >= 8:
+                        print("Enabling TF32 precision for faster matrix operations")
+                        torch.backends.cuda.matmul.allow_tf32 = True
+                        torch.backends.cudnn.allow_tf32 = True
+                except Exception as e:
+                    print(f"Error enabling TF32 precision: {e}")
+
             print(f"Model loaded successfully on {self.device}")
             return True
         except Exception as e:

@@ -879,7 +931,7 @@ Return a JSON array containing ONLY the candidate numbers (starting from 1) that
         return {
             "team_analysis": team_analysis,
             "model_info": {
-                "
+                "x": str(self.device),
                 "model_type": "phi-2-qlora-finetuned"
             }
         }
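
Both of the larger hunks above gate the Flash Attention flags on the installed transformers version by splitting `__version__` by hand. A minimal standalone sketch of the same check, written with `packaging.version` purely as an illustration: the 4.32.0 threshold is taken from the commit's own comments, and the function name is hypothetical.

```python
from packaging import version  # packaging is a dependency of transformers
import transformers


def supports_flash_attention_kwargs(min_version: str = "4.32.0") -> bool:
    """Return True if the installed transformers version is at least min_version.

    Mirrors the major/minor comparison in the diff, but also handles
    versions such as "4.40.0.dev0" without manual string splitting.
    """
    return version.parse(transformers.__version__) >= version.parse(min_version)


if __name__ == "__main__":
    print(supports_flash_attention_kwargs())
```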
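
The TF32 toggle added in the second hunk can also be exercised in isolation. A small sketch, assuming only that PyTorch is installed; the helper name is made up for illustration, and compute capability 8 corresponds to Ampere-class GPUs (A100, RTX 30xx), matching the commit's comment.

```python
import torch


def enable_tf32_if_supported() -> bool:
    """Enable TF32 matmul/cuDNN paths on Ampere-or-newer GPUs; return whether it was enabled."""
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    if major >= 8:  # Ampere and newer report compute capability >= 8
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        return True
    return False


if __name__ == "__main__":
    print(f"TF32 enabled: {enable_tf32_if_supported()}")
```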