chiemekakalu
/

talentfilterdeployment

Safetensors

phi

Model card Files Files and versions

xet

Community

chiemekakalu committed on Mar 18, 2025

Commit

7e483d6

verified ·

1 Parent(s): b5bfda0

Update handler.py

Browse files

Files changed (1) hide show

handler.py +142 -119

handler.py CHANGED Viewed

@@ -32,7 +32,7 @@ class EndpointHandler:
         # Load model immediately
         self.load_model()
     def generate_optimized(self, inputs, attention_mask=None, max_new_tokens=512):
         """
         Optimized generation function that maximizes GPU utilization
@@ -45,32 +45,49 @@ class EndpointHandler:
         # Find input length to properly calculate output length
         input_length = inputs.shape[1]
-        # Generate with optimized parameters for GPU performance
-        outputs = self.model.generate(
-            inputs,
-            attention_mask=attention_mask,
-            max_new_tokens=max_new_tokens,
             # Performance options
-            use_cache=True,                            # Use KV cache for faster generation
             # Quality vs. speed tradeoff
-            temperature=0.7 if self.use_sampling else 1.0,
-            top_p=0.9 if self.use_sampling else 1.0,
-            do_sample=self.use_sampling,               # Sampling is slightly slower but better quality
-            num_beams=1,                               # Beam search is slower but better quality (1 = no beam search)
             # Token handling
-            pad_token_id=self.tokenizer.pad_token_id,
-            eos_token_id=self.tokenizer.eos_token_id,
             # Content quality
-            repetition_penalty=1.1,                    # Reduce repetition
-            # Memory optimization - enabled only if supported
-            flash_attn=self.flash_attention_supported,
-            flash_attn_cross_entropy=self.flash_attention_supported
-        )
         return outputs, input_length
@@ -124,7 +141,8 @@ class EndpointHandler:
                 if os.path.exists(os.path.join(self.model_dir, "adapter_model.safetensors")):
                     print("Found adapter model, loading Phi-2 base with adapter")
-                    # Check if PEFT is available
                     if not PEFT_AVAILABLE:
                         print("PEFT not available, installing...")
                         try:
@@ -189,22 +207,40 @@ class EndpointHandler:
                         device_map="auto",
                     )
-            # Check for Flash Attention support
             try:
-                import flash_attn
-                self.flash_attention_supported = True
-                print("Flash Attention support detected and enabled!")
-            except ImportError:
-                print("Flash Attention not available. Using standard attention mechanism.")
                 self.flash_attention_supported = False
             # Enable TF32 precision for higher performance on newer NVIDIA GPUs
             if self.device == "cuda":
                 # Only available on Ampere+ GPUs (A100, RTX 3090, etc.)
-                if torch.cuda.get_device_capability()[0] >= 8:
-                    print("Enabling TF32 precision for faster matrix operations")
-                    torch.backends.cuda.matmul.allow_tf32 = True
-                    torch.backends.cudnn.allow_tf32 = True
             print(f"Model loaded successfully on {self.device}")
             return True
@@ -248,7 +284,7 @@ class EndpointHandler:
             # Format candidate information
             candidate_summary = self.format_candidates_for_prompt(candidates)
-            # Build a concise team analysis prompt
             prompt = f"""Analyze these candidates and create THREE different optimal startup team compositions of {team_size} people each.
 CANDIDATES:
@@ -258,31 +294,32 @@ TEAM REQUIREMENTS:
 {requirements or "Create a balanced team with complementary skills"}
 For EACH team composition, please provide:
-1. Team Name: A short, memorable name based on the team's strengths
-2. Selected Members: For each team member:
-   - Name and recommended role
-   - ONE specific sentence on why they're valuable
-   - Brief note on how they complement others
-3. Team Analysis:
-   - 3-4 bullet points of key team strengths
-   - 3-4 bullet points of potential challenges
-   - Brief assessment of skill coverage and team dynamics
-   - How this team aligns with requirements
-4. Best For/Worst For:
-   - Type of startup that would be MOST successful with this team
-   - Type of startup that would be LEAST successful with this team
-After presenting all three compositions, provide a 2-3 sentence recommendation on which team would be best and why.
-Use clear headings. Be direct and concise with minimal filler language or unnecessary explanations.
 """
-            # Format as chat with concise system prompt
             messages = [
-                {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition. Provide direct, actionable analysis without fluff or filler language. Focus on concrete insights and avoid AI-sounding generalizations and repetitive structures."},
                 {"role": "user", "content": prompt}
             ]
@@ -307,17 +344,11 @@ Use clear headings. Be direct and concise with minimal filler language or unnece
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
-                # Generate with appropriate parameters
-                outputs = self.model.generate(
                     inputs,
                     attention_mask=attention_mask,
-                    max_new_tokens=max_new_tokens,
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.1
                 )
             # Decode more carefully
@@ -374,43 +405,42 @@ Use clear headings. Be direct and concise with minimal filler language or unnece
             # Format team information
             team_summary = self.format_candidates_for_prompt(team)
-            # Build concise team analysis prompt
-            prompt = f"""Analyze this existing startup team:
 TEAM MEMBERS:
 {team_summary}
 Please provide:
-1. Team Composition Analysis:
-   - Key team strengths and complementary skills (3-4 points)
-   - Major skill gaps (2-3 points)
-   - Brief assessment of team dynamics
-2. Success Factors:
-   - Startup types most likely to succeed with this team
-   - 2-3 specific competitive advantages this team offers
-   - How these advantages translate to business outcomes
-3. Risk Factors:
-   - Startup types poorly suited for this team
-   - 2-3 critical blind spots or weaknesses
-   - Specific talent gaps that should be addressed
-Be direct and concise with minimal filler language. Focus on actionable insights rather than generalizations.
 """
             if include_startup_comparison:
                 prompt += """
-4. Comparison to Successful Startups:
-   - 2-3 specific similarities to successful startup teams
-   - 1-2 notable differences from typical success patterns
-   - Brief mention of similar historical team compositions that succeeded
 """
-            # Format as chat with concise system prompt
             messages = [
-                {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition. Provide direct, actionable analysis without filler language. Focus on specific insights rather than generic observations. Avoid AI-sounding generalizations and clichés."},
                 {"role": "user", "content": prompt}
             ]
@@ -435,17 +465,11 @@ Be direct and concise with minimal filler language. Focus on actionable insights
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
-                # Generate with appropriate parameters
-                outputs = self.model.generate(
                     inputs,
                     attention_mask=attention_mask,
-                    max_new_tokens=max_new_tokens,
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.1
                 )
             # Decode more carefully
@@ -524,8 +548,8 @@ Be direct and concise with minimal filler language. Focus on actionable insights
                 skills = candidate['skills'] if isinstance(candidate['skills'], list) else [candidate['skills']]
                 skills_info = ", ".join(skills)
-            # Build concise prompt
-            prompt = f"""Analyze this candidate for a startup founder or early employee role:
 CANDIDATE PROFILE:
 Name: {name}
@@ -539,30 +563,35 @@ Experience:
 Skills:
 {skills_info}
-Please provide a concise analysis including:
-1. Strengths Analysis:
    - Key professional strengths based on background and skills
-   - Areas of expertise and how they apply to startups
-2. Founder/Early Employee Fit:
    - Assessment of suitability for founder or early employee roles
    - Ideal role recommendations in a startup team
-3. Complementary Team Members:
-   - What types of co-founders would complement this candidate
    - Skills gaps that should be filled by other team members
-4. Risk Assessment:
-   - Potential blind spots or weaknesses
-   - Areas where the candidate might need support
-Be direct and concise with minimal filler language. Focus on actionable insights.
 """
-            # Format as chat with concise system prompt
             messages = [
-                {"role": "system", "content": "You are an elite talent assessor specializing in startup founders and early employees. You provide concise, direct analysis without unnecessary filler language. Focus on concrete insights with minimal AI-sounding platitudes."},
                 {"role": "user", "content": prompt}
             ]
@@ -587,17 +616,11 @@ Be direct and concise with minimal filler language. Focus on actionable insights
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
-                # Generate with appropriate parameters
-                outputs = self.model.generate(
                     inputs,
                     attention_mask=attention_mask,
-                    max_new_tokens=max_new_tokens,
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.1
                 )
             # Decode more carefully
@@ -908,7 +931,7 @@ Return a JSON array containing ONLY the candidate numbers (starting from 1) that
             return {
                 "team_analysis": team_analysis,
                 "model_info": {
-                    "device": str(self.device),
                     "model_type": "phi-2-qlora-finetuned"
                 }
             }

         # Load model immediately
         self.load_model()
     def generate_optimized(self, inputs, attention_mask=None, max_new_tokens=512):
         """
         Optimized generation function that maximizes GPU utilization
         # Find input length to properly calculate output length
         input_length = inputs.shape[1]
+        # Basic generation parameters
+        generation_kwargs = {
+            "inputs": inputs,
+            "attention_mask": attention_mask,
+            "max_new_tokens": max_new_tokens,
             # Performance options
+            "use_cache": True,                         # Use KV cache for faster generation
             # Quality vs. speed tradeoff
+            "temperature": 0.7 if self.use_sampling else 1.0,
+            "top_p": 0.9 if self.use_sampling else 1.0,
+            "do_sample": self.use_sampling,            # Sampling is slightly slower but better quality
+            "num_beams": 1,                            # Beam search is slower but better quality (1 = no beam search)
             # Token handling
+            "pad_token_id": self.tokenizer.pad_token_id,
+            "eos_token_id": self.tokenizer.eos_token_id,
             # Content quality
+            "repetition_penalty": 1.1,                 # Reduce repetition
+        }
+        # Add Flash Attention parameters only if supported by the transformers version
+        # Check the installed transformers version defensively before adding them
+        try:
+            import importlib
+            transformers_version = importlib.import_module('transformers').__version__
+            major, minor = map(int, transformers_version.split('.')[:2])
+            if major > 4 or (major == 4 and minor >= 32):
+                # Flash Attention support was added in transformers 4.32.0
+                if self.flash_attention_supported:
+                    print("Using Flash Attention in generation")
+                    generation_kwargs["flash_attn"] = True
+                    generation_kwargs["flash_attn_cross_entropy"] = True
+            else:
+                print(f"Flash Attention not added - transformers version {transformers_version} doesn't support it")
+        except Exception as e:
+            print(f"Error checking transformers version, skipping Flash Attention: {e}")
+        # Generate with optimized parameters for GPU performance
+        outputs = self.model.generate(**generation_kwargs)
         return outputs, input_length
                 if os.path.exists(os.path.join(self.model_dir, "adapter_model.safetensors")):
                     print("Found adapter model, loading Phi-2 base with adapter")
+                    # Check if PEFT is available - using the global variable
+                    global PEFT_AVAILABLE
                     if not PEFT_AVAILABLE:
                         print("PEFT not available, installing...")
                         try:
                         device_map="auto",
                     )
+            # Check for Flash Attention support with better error handling
             try:
+                # First check if the transformers version supports it
+                import importlib
+                transformers_version = importlib.import_module('transformers').__version__
+                major, minor = map(int, transformers_version.split('.')[:2])
+                if major > 4 or (major == 4 and minor >= 32):
+                    # Flash Attention support was added in transformers 4.32.0
+                    try:
+                        import flash_attn
+                        self.flash_attention_supported = True
+                        print(f"Flash Attention {flash_attn.__version__} detected and will be used if available!")
+                    except ImportError:
+                        print("Flash Attention library not installed. Using standard attention mechanism.")
+                        self.flash_attention_supported = False
+                else:
+                    print(f"Transformers version {transformers_version} doesn't support Flash Attention parameters. Using standard attention.")
+                    self.flash_attention_supported = False
+            except Exception as e:
+                print(f"Error checking Flash Attention support: {e}")
+                print("Falling back to standard attention mechanism.")
                 self.flash_attention_supported = False
             # Enable TF32 precision for higher performance on newer NVIDIA GPUs
             if self.device == "cuda":
                 # Only available on Ampere+ GPUs (A100, RTX 3090, etc.)
+                try:
+                    if torch.cuda.get_device_capability()[0] >= 8:
+                        print("Enabling TF32 precision for faster matrix operations")
+                        torch.backends.cuda.matmul.allow_tf32 = True
+                        torch.backends.cudnn.allow_tf32 = True
+                except Exception as e:
+                    print(f"Error enabling TF32 precision: {e}")
             print(f"Model loaded successfully on {self.device}")
             return True
             # Format candidate information
             candidate_summary = self.format_candidates_for_prompt(candidates)
+            # Build the enhanced prompt for more detailed analysis
             prompt = f"""Analyze these candidates and create THREE different optimal startup team compositions of {team_size} people each.
 CANDIDATES:
 {requirements or "Create a balanced team with complementary skills"}
 For EACH team composition, please provide:
+1. Team Name: Give this team composition a memorable name based on its strengths
+2. Selected Members: List each selected team member with:
+   - Their name
+   - Recommended role in the team
+   - 2-3 sentences on WHY they specifically are valuable to this team composition
+   - How they complement other team members
+3. Team Analysis (minimum 250 words):
+   - Detailed strengths of this specific team combination
+   - Potential weaknesses or challenges this team might face
+   - Assessment of skill coverage and diversity of thinking
+   - Team dynamics and how members would likely work together
+   - How this team aligns with the stated requirements
+4. Alternative Applications:
+   - What type of startup would be MOST successful with this team
+   - What type of startup would be LEAST successful with this team
+After presenting all three team compositions, provide a final recommendation on which team would be best and why.
+Format your response carefully with clear headings and make it comprehensive enough for founders to make informed decisions.
 """
+            # Format as chat with improved system prompt
             messages = [
+                {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition and founder dynamics. You specialize in analyzing candidate profiles and determining optimal team compositions that maximize chances of startup success."},
                 {"role": "user", "content": prompt}
             ]
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
+                # Use the optimized generator instead of direct model.generate call
+                outputs, input_length = self.generate_optimized(
                     inputs,
                     attention_mask=attention_mask,
+                    max_new_tokens=max_new_tokens
                 )
             # Decode more carefully
             # Format team information
             team_summary = self.format_candidates_for_prompt(team)
+            # Build the prompt
+            prompt = f"""Analyze this existing startup team in depth:
 TEAM MEMBERS:
 {team_summary}
 Please provide:
+1. Team Composition Analysis (minimum 150 words):
+   - Overall assessment of the team's strengths and complementary skills
+   - Key skill coverage and potential skill gaps
+   - Team dynamics and how members would likely work together
+   - Potential areas of conflict or collaboration challenges
+2. Success Factors (minimum 100 words):
+   - What types of startups would be MOST successful with this team
+   - Key advantages this team has compared to typical startup teams
+   - How team members' backgrounds create competitive advantages
+3. Risk Factors (minimum 100 words):
+   - What types of startups would be LEAST successful with this team
+   - Potential blind spots or weaknesses in the team composition
+   - Suggested additions or changes to strengthen the team
 """
             if include_startup_comparison:
                 prompt += """
+4. Comparison to Successful Startups (minimum 100 words):
+   - How this team compares to founding teams of successful startups
+   - Historical examples of similar team compositions that succeeded
+   - Key differentiating factors from typical successful startup teams
 """
+            # Format as chat with improved system prompt
             messages = [
+                {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition and founder dynamics. You specialize in analyzing team profiles and providing actionable insights to maximize chances of startup success."},
                 {"role": "user", "content": prompt}
             ]
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
+                # Use the optimized generator instead of direct model.generate call
+                outputs, input_length = self.generate_optimized(
                     inputs,
                     attention_mask=attention_mask,
+                    max_new_tokens=max_new_tokens
                 )
             # Decode more carefully
                 skills = candidate['skills'] if isinstance(candidate['skills'], list) else [candidate['skills']]
                 skills_info = ", ".join(skills)
+            # Build comprehensive prompt
+            prompt = f"""Analyze this candidate in depth for a startup founder or early employee role:
 CANDIDATE PROFILE:
 Name: {name}
 Skills:
 {skills_info}
+Please provide a comprehensive analysis including:
+1. Strengths Analysis (minimum 150 words):
    - Key professional strengths based on background and skills
+   - Notable accomplishments and their significance
+   - Areas of deep expertise and how they apply to startups
+2. Founder/Early Employee Fit (minimum 150 words):
    - Assessment of suitability for founder or early employee roles
+   - Specific founder archetype this candidate represents
+   - Optimal startup stages for this candidate
    - Ideal role recommendations in a startup team
+3. Complementary Team Members (minimum 100 words):
+   - What types of co-founders or team members would complement this candidate
+   - Potential team dynamics when working with different personality types
    - Skills gaps that should be filled by other team members
+4. Risk Assessment (minimum 100 words):
+   - Potential blind spots or weaknesses based on background
+   - Areas where the candidate might need support or development
+   - Situations where this candidate might struggle in a startup environment
+Format your analysis with clear sections and detailed insights to help assess this candidate for startup roles.
 """
+            # Format as chat with system prompt
             messages = [
+                {"role": "system", "content": "You are an elite talent assessor specializing in startup founders and early employees. You provide in-depth analysis of candidates' strengths, founder fit, and team compatibility."},
                 {"role": "user", "content": prompt}
             ]
                 # Create attention mask (explicitly handle padding)
                 attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
+                # Use the optimized generator instead of direct model.generate call
+                outputs, input_length = self.generate_optimized(
                     inputs,
                     attention_mask=attention_mask,
+                    max_new_tokens=max_new_tokens
                 )
             # Decode more carefully
             return {
                 "team_analysis": team_analysis,
                 "model_info": {
+                    "device": str(self.device),
                     "model_type": "phi-2-qlora-finetuned"
                 }
             }