chiemekakalu commited on
Commit
48fa2fe
·
verified ·
1 Parent(s): 2e1f144

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +293 -49
handler.py CHANGED
@@ -1,8 +1,17 @@
1
  import os
2
  import json
3
  import torch
4
- from typing import Dict, List, Any
5
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  class EndpointHandler:
@@ -17,39 +26,175 @@ class EndpointHandler:
17
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
18
  self.model_dir = model_dir or os.getenv("MODEL_PATH", "/model")
19
 
 
 
 
 
20
  # Load model immediately
21
  self.load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def load_model(self):
24
  """Load the finetuned model and tokenizer."""
25
  try:
26
  print(f"Loading model from {self.model_dir} to {self.device}...")
27
 
28
- # Load tokenizer
29
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
30
-
31
- # Try to load model with quantization, fall back to standard loading if bitsandbytes is missing
32
  try:
33
- # First try with 4-bit quantization
34
- self.model = AutoModelForCausalLM.from_pretrained(
35
  self.model_dir,
36
- torch_dtype=torch.float16, # Use FP16 for efficiency
37
- device_map="auto", # Auto-assign to available devices
38
- load_in_4bit=True, # Use 4-bit quantization for memory efficiency
39
  )
40
- except ImportError as e:
41
- print(f"Warning: Could not use quantization, falling back to standard loading: {e}")
42
- # Fallback to standard loading without quantization
43
- self.model = AutoModelForCausalLM.from_pretrained(
44
- self.model_dir,
45
- torch_dtype=torch.float16, # Still use FP16 for efficiency
46
- device_map="auto", # Auto-assign to available devices
 
 
 
 
 
 
 
 
47
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  print(f"Model loaded successfully on {self.device}")
50
  return True
51
  except Exception as e:
52
  print(f"Error loading model: {e}")
 
 
53
  return False
54
 
55
  def format_candidates_for_prompt(self, candidates: List[Dict[str, Any]]) -> str:
@@ -131,21 +276,54 @@ Format your response carefully with clear headings and make it comprehensive eno
131
  return_tensors="pt"
132
  ).to(self.device)
133
 
134
- # Generate
135
  with torch.no_grad():
136
- outputs = self.model.generate(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  inputs,
138
- max_length=4096,
139
- temperature=0.7,
140
- top_p=0.9,
141
- do_sample=True,
142
  )
143
 
144
- # Decode
145
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
- # Extract the assistant's response (everything after the user's prompt)
148
- assistant_response = response.split(prompt)[-1].strip()
 
 
 
149
 
150
  return assistant_response
151
 
@@ -219,21 +397,54 @@ Please provide:
219
  return_tensors="pt"
220
  ).to(self.device)
221
 
222
- # Generate
223
  with torch.no_grad():
224
- outputs = self.model.generate(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  inputs,
226
- max_length=3072,
227
- temperature=0.7,
228
- top_p=0.9,
229
- do_sample=True,
230
  )
231
 
232
- # Decode
233
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- # Extract the assistant's response
236
- assistant_response = response.split(prompt)[-1].strip()
 
 
 
237
 
238
  return assistant_response
239
 
@@ -337,21 +548,54 @@ Format your analysis with clear sections and detailed insights to help assess th
337
  return_tensors="pt"
338
  ).to(self.device)
339
 
340
- # Generate
341
  with torch.no_grad():
342
- outputs = self.model.generate(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  inputs,
344
- max_length=3072,
345
- temperature=0.7,
346
- top_p=0.9,
347
- do_sample=True,
348
  )
349
 
350
- # Decode
351
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
352
-
353
- # Extract the assistant's response
354
- assistant_response = response.split(prompt)[-1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
  return assistant_response
357
 
 
1
  import os
2
  import json
3
  import torch
4
+ from typing import Dict, List, Any, Optional, Union
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
6
+
7
# Optional dependency: PEFT is required to load LoRA/adapter weights on top
# of the base model. Record availability instead of failing at import time so
# the handler can still serve the plain base model.
try:
    import peft
    from peft import PeftModel, PeftConfig
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT library not available. Adapter loading may fail.")
else:
    PEFT_AVAILABLE = True
15
 
16
 
17
  class EndpointHandler:
 
26
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
27
  self.model_dir = model_dir or os.getenv("MODEL_PATH", "/model")
28
 
29
+ # GPU performance optimization flags
30
+ self.flash_attention_supported = False # Will be set during model loading
31
+ self.use_sampling = True # Better quality but slightly slower than greedy
32
+
33
  # Load model immediately
34
  self.load_model()
35
+
36
def generate_optimized(self, inputs, attention_mask=None, max_new_tokens=512):
    """Generate a completion for ``inputs`` with GPU-friendly settings.

    Args:
        inputs: Token-id tensor of shape (batch, seq_len), already on the
            target device.
        attention_mask: Optional mask; derived from the pad token when absent.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        Tuple ``(outputs, input_length)`` so callers can slice the prompt
        tokens off ``outputs`` before decoding.
    """
    # Build an attention mask from padding if the caller did not supply one.
    if attention_mask is None:
        attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()

    # Remember the prompt length so callers can isolate the generated part.
    input_length = inputs.shape[1]

    # Common generation arguments; use_cache enables the KV cache for
    # faster autoregressive decoding.
    gen_kwargs = {
        "attention_mask": attention_mask,
        "max_new_tokens": max_new_tokens,
        "use_cache": True,
        "num_beams": 1,  # no beam search: cheaper, usually good enough
        "pad_token_id": self.tokenizer.pad_token_id,
        "eos_token_id": self.tokenizer.eos_token_id,
        "repetition_penalty": 1.1,  # mild penalty to reduce repetition
    }

    # Only pass sampling parameters when sampling is enabled: supplying
    # temperature/top_p alongside do_sample=False is ignored and triggers
    # warnings in recent transformers releases.
    if self.use_sampling:
        gen_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)
    else:
        gen_kwargs["do_sample"] = False

    # BUG FIX: the previous version passed flash_attn /
    # flash_attn_cross_entropy here, but generate() rejects unknown kwargs
    # (ValueError for unused model_kwargs). Flash attention must be chosen
    # at model-load time (attn_implementation), not per generate() call.
    outputs = self.model.generate(inputs, **gen_kwargs)

    return outputs, input_length
76
 
77
def load_model(self):
    """Load the finetuned model and tokenizer onto the configured device.

    Tries, in order: 4-bit quantized loading (applying a PEFT adapter if
    ``adapter_model.safetensors`` exists in ``self.model_dir``), plain FP16
    loading from ``self.model_dir``, and finally the base
    ``microsoft/phi-2`` model.

    Returns:
        True on success, False if loading failed (the error is printed).
    """
    try:
        print(f"Loading model from {self.model_dir} to {self.device}...")

        # --- Tokenizer --------------------------------------------------
        # Left padding is required for batched causal-LM generation.
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_dir,
                padding_side="left",
                trust_remote_code=False
            )
            # A pad token is needed to build attention masks; Phi-2 ships
            # without one, so reuse the EOS token.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                print("Set pad_token to eos_token")
        except Exception as tokenizer_error:
            print(f"Error loading tokenizer from {self.model_dir}: {tokenizer_error}")
            print("Attempting to load base Phi-2 tokenizer...")
            # Fall back to the base Phi-2 tokenizer if the one in the model
            # directory is missing or corrupt.
            self.tokenizer = AutoTokenizer.from_pretrained(
                "microsoft/phi-2",
                padding_side="left",
                trust_remote_code=False
            )
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

        # --- Model ------------------------------------------------------
        try:
            # Probe for bitsandbytes; an ImportError here drops us into the
            # non-quantized fallback path below.
            from bitsandbytes.nn import Linear4bit  # noqa: F401

            print("Using 4-bit quantization with float16 compute type")
            # Use float16 consistently for compute and parameters.
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,  # match model dtype
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )

            if os.path.exists(os.path.join(self.model_dir, "adapter_model.safetensors")):
                print("Found adapter model, loading Phi-2 base with adapter")

                # Load the quantized base model first, then the adapter.
                base_model = AutoModelForCausalLM.from_pretrained(
                    "microsoft/phi-2",
                    quantization_config=quantization_config,
                    torch_dtype=torch.float16,
                    device_map="auto"
                )
                try:
                    # BUG FIX: the previous version assigned the module-level
                    # PEFT_AVAILABLE flag and re-imported PeftModel inside a
                    # conditional branch of this function. Both assignments
                    # made those names function-local everywhere in the
                    # method, raising UnboundLocalError even when PEFT *was*
                    # installed — which silently skipped the adapter. Import
                    # PeftModel unconditionally, right before use; an
                    # ImportError falls through to the base-model fallback.
                    from peft import PeftModel
                    self.model = PeftModel.from_pretrained(
                        base_model,
                        self.model_dir,
                        torch_dtype=torch.float16,
                        device_map="auto"
                    )
                    print("Successfully loaded adapter model")
                except Exception as adapter_error:
                    print(f"Error loading adapter: {adapter_error}")
                    # Fall back to just using the base model.
                    print("Falling back to base model without adapter")
                    self.model = base_model
            else:
                # No adapter present: load the directory as a full model.
                print("Loading model directly from directory")
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    quantization_config=quantization_config
                )

        except ImportError as e:
            print(f"Warning: Could not use bitsandbytes quantization, falling back to standard loading: {e}")
            # Fallback to standard FP16 loading without quantization.
            try:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
            except Exception as model_error:
                print(f"Error loading from model directory: {model_error}")
                print("Attempting to load base Phi-2 model...")
                # Final fallback — load just the base model.
                self.model = AutoModelForCausalLM.from_pretrained(
                    "microsoft/phi-2",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )

        print(f"Model loaded successfully on {self.device}")
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        import traceback
        print(traceback.format_exc())
        return False
199
 
200
  def format_candidates_for_prompt(self, candidates: List[Dict[str, Any]]) -> str:
 
276
  return_tensors="pt"
277
  ).to(self.device)
278
 
279
+ # Generate with proper context limits and attention masks
280
  with torch.no_grad():
281
+ # Find input length to set appropriate output length
282
+ input_length = inputs.shape[1]
283
+ # Phi-2 has a context limit of 2048
284
+ max_context_length = 2048
285
+
286
+ # Calculate max new tokens to avoid exceeding model's context limits
287
+ max_new_tokens = max(100, min(1024, max_context_length - input_length))
288
+
289
+ print(f"Input length: {input_length}, Max new tokens: {max_new_tokens}")
290
+
291
+ # Create attention mask (explicitly handle padding)
292
+ attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
293
+
294
+ # Use the optimized generator instead of direct model.generate call
295
+ outputs, input_length = self.generate_optimized(
296
  inputs,
297
+ attention_mask=attention_mask,
298
+ max_new_tokens=max_new_tokens
 
 
299
  )
300
 
301
+ # Decode more carefully
302
+ try:
303
+ # Get only the generated part (exclude input tokens)
304
+ generated_output = outputs[0][input_length:]
305
+
306
+ # Decode just the new tokens
307
+ generated_text = self.tokenizer.decode(
308
+ generated_output,
309
+ skip_special_tokens=True,
310
+ clean_up_tokenization_spaces=True
311
+ )
312
+
313
+ # Remove any model-specific artifacts
314
+ generated_text = generated_text.replace("<|im_end|>", "").replace("<|im_start|>", "")
315
+ assistant_response = generated_text.strip()
316
+
317
+ # If that failed, try traditional approach
318
+ if not assistant_response:
319
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
320
+ assistant_response = full_response.split(prompt)[-1].strip()
321
 
322
+ except Exception as decode_error:
323
+ print(f"Error decoding response: {decode_error}")
324
+ # Fallback to simpler decoding
325
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
326
+ assistant_response = full_response.split(prompt)[-1].strip()
327
 
328
  return assistant_response
329
 
 
397
  return_tensors="pt"
398
  ).to(self.device)
399
 
400
+ # Generate with proper context limits and attention masks
401
  with torch.no_grad():
402
+ # Find input length to set appropriate output length
403
+ input_length = inputs.shape[1]
404
+ # Phi-2 has a context limit of 2048
405
+ max_context_length = 2048
406
+
407
+ # Calculate max new tokens to avoid exceeding model's context limits
408
+ max_new_tokens = max(100, min(1024, max_context_length - input_length))
409
+
410
+ print(f"Team analysis - Input length: {input_length}, Max new tokens: {max_new_tokens}")
411
+
412
+ # Create attention mask (explicitly handle padding)
413
+ attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
414
+
415
+ # Use the optimized generator instead of direct model.generate call
416
+ outputs, input_length = self.generate_optimized(
417
  inputs,
418
+ attention_mask=attention_mask,
419
+ max_new_tokens=max_new_tokens
 
 
420
  )
421
 
422
+ # Decode more carefully
423
+ try:
424
+ # Get only the generated part (exclude input tokens)
425
+ generated_output = outputs[0][input_length:]
426
+
427
+ # Decode just the new tokens
428
+ generated_text = self.tokenizer.decode(
429
+ generated_output,
430
+ skip_special_tokens=True,
431
+ clean_up_tokenization_spaces=True
432
+ )
433
+
434
+ # Remove any model-specific artifacts
435
+ generated_text = generated_text.replace("<|im_end|>", "").replace("<|im_start|>", "")
436
+ assistant_response = generated_text.strip()
437
+
438
+ # If that failed, try traditional approach
439
+ if not assistant_response:
440
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
441
+ assistant_response = full_response.split(prompt)[-1].strip()
442
 
443
+ except Exception as decode_error:
444
+ print(f"Error decoding team analysis response: {decode_error}")
445
+ # Fallback to simpler decoding
446
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
447
+ assistant_response = full_response.split(prompt)[-1].strip()
448
 
449
  return assistant_response
450
 
 
548
  return_tensors="pt"
549
  ).to(self.device)
550
 
551
+ # Generate with proper context limits and attention masks
552
  with torch.no_grad():
553
+ # Find input length to set appropriate output length
554
+ input_length = inputs.shape[1]
555
+ # Phi-2 has a context limit of 2048
556
+ max_context_length = 2048
557
+
558
+ # Calculate max new tokens to avoid exceeding model's context limits
559
+ max_new_tokens = max(100, min(1024, max_context_length - input_length))
560
+
561
+ print(f"Candidate analysis - Input length: {input_length}, Max new tokens: {max_new_tokens}")
562
+
563
+ # Create attention mask (explicitly handle padding)
564
+ attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
565
+
566
+ # Use the optimized generator instead of direct model.generate call
567
+ outputs, input_length = self.generate_optimized(
568
  inputs,
569
+ attention_mask=attention_mask,
570
+ max_new_tokens=max_new_tokens
 
 
571
  )
572
 
573
+ # Decode more carefully
574
+ try:
575
+ # Get only the generated part (exclude input tokens)
576
+ generated_output = outputs[0][input_length:]
577
+
578
+ # Decode just the new tokens
579
+ generated_text = self.tokenizer.decode(
580
+ generated_output,
581
+ skip_special_tokens=True,
582
+ clean_up_tokenization_spaces=True
583
+ )
584
+
585
+ # Remove any model-specific artifacts
586
+ generated_text = generated_text.replace("<|im_end|>", "").replace("<|im_start|>", "")
587
+ assistant_response = generated_text.strip()
588
+
589
+ # If that failed, try traditional approach
590
+ if not assistant_response:
591
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
592
+ assistant_response = full_response.split(prompt)[-1].strip()
593
+
594
+ except Exception as decode_error:
595
+ print(f"Error decoding candidate analysis response: {decode_error}")
596
+ # Fallback to simpler decoding
597
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
598
+ assistant_response = full_response.split(prompt)[-1].strip()
599
 
600
  return assistant_response
601