chiemekakalu committed on
Commit
7e483d6
·
verified ·
1 Parent(s): b5bfda0

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +142 -119
handler.py CHANGED
@@ -32,7 +32,7 @@ class EndpointHandler:
32
 
33
  # Load model immediately
34
  self.load_model()
35
-
36
  def generate_optimized(self, inputs, attention_mask=None, max_new_tokens=512):
37
  """
38
  Optimized generation function that maximizes GPU utilization
@@ -45,32 +45,49 @@ class EndpointHandler:
45
  # Find input length to properly calculate output length
46
  input_length = inputs.shape[1]
47
 
48
- # Generate with optimized parameters for GPU performance
49
- outputs = self.model.generate(
50
- inputs,
51
- attention_mask=attention_mask,
52
- max_new_tokens=max_new_tokens,
53
 
54
  # Performance options
55
- use_cache=True, # Use KV cache for faster generation
56
 
57
  # Quality vs. speed tradeoff
58
- temperature=0.7 if self.use_sampling else 1.0,
59
- top_p=0.9 if self.use_sampling else 1.0,
60
- do_sample=self.use_sampling, # Sampling is slightly slower but better quality
61
- num_beams=1, # Beam search is slower but better quality (1 = no beam search)
62
 
63
  # Token handling
64
- pad_token_id=self.tokenizer.pad_token_id,
65
- eos_token_id=self.tokenizer.eos_token_id,
66
 
67
  # Content quality
68
- repetition_penalty=1.1, # Reduce repetition
69
-
70
- # Memory optimization - enabled only if supported
71
- flash_attn=self.flash_attention_supported,
72
- flash_attn_cross_entropy=self.flash_attention_supported
73
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  return outputs, input_length
76
 
@@ -124,7 +141,8 @@ class EndpointHandler:
124
  if os.path.exists(os.path.join(self.model_dir, "adapter_model.safetensors")):
125
  print("Found adapter model, loading Phi-2 base with adapter")
126
 
127
- # Check if PEFT is available
 
128
  if not PEFT_AVAILABLE:
129
  print("PEFT not available, installing...")
130
  try:
@@ -189,22 +207,40 @@ class EndpointHandler:
189
  device_map="auto",
190
  )
191
 
192
- # Check for Flash Attention support
193
  try:
194
- import flash_attn
195
- self.flash_attention_supported = True
196
- print("Flash Attention support detected and enabled!")
197
- except ImportError:
198
- print("Flash Attention not available. Using standard attention mechanism.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  self.flash_attention_supported = False
200
 
201
  # Enable TF32 precision for higher performance on newer NVIDIA GPUs
202
  if self.device == "cuda":
203
  # Only available on Ampere+ GPUs (A100, RTX 3090, etc.)
204
- if torch.cuda.get_device_capability()[0] >= 8:
205
- print("Enabling TF32 precision for faster matrix operations")
206
- torch.backends.cuda.matmul.allow_tf32 = True
207
- torch.backends.cudnn.allow_tf32 = True
 
 
 
208
 
209
  print(f"Model loaded successfully on {self.device}")
210
  return True
@@ -248,7 +284,7 @@ class EndpointHandler:
248
  # Format candidate information
249
  candidate_summary = self.format_candidates_for_prompt(candidates)
250
 
251
- # Build a concise team analysis prompt
252
  prompt = f"""Analyze these candidates and create THREE different optimal startup team compositions of {team_size} people each.
253
 
254
  CANDIDATES:
@@ -258,31 +294,32 @@ TEAM REQUIREMENTS:
258
  {requirements or "Create a balanced team with complementary skills"}
259
 
260
  For EACH team composition, please provide:
261
- 1. Team Name: A short, memorable name based on the team's strengths
262
-
263
- 2. Selected Members: For each team member:
264
- - Name and recommended role
265
- - ONE specific sentence on why they're valuable
266
- - Brief note on how they complement others
267
-
268
- 3. Team Analysis:
269
- - 3-4 bullet points of key team strengths
270
- - 3-4 bullet points of potential challenges
271
- - Brief assessment of skill coverage and team dynamics
272
- - How this team aligns with requirements
273
-
274
- 4. Best For/Worst For:
275
- - Type of startup that would be MOST successful with this team
276
- - Type of startup that would be LEAST successful with this team
277
-
278
- After presenting all three compositions, provide a 2-3 sentence recommendation on which team would be best and why.
279
-
280
- Use clear headings. Be direct and concise with minimal filler language or unnecessary explanations.
 
281
  """
282
 
283
- # Format as chat with concise system prompt
284
  messages = [
285
- {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition. Provide direct, actionable analysis without fluff or filler language. Focus on concrete insights and avoid AI-sounding generalizations and repetitive structures."},
286
  {"role": "user", "content": prompt}
287
  ]
288
 
@@ -307,17 +344,11 @@ Use clear headings. Be direct and concise with minimal filler language or unnece
307
  # Create attention mask (explicitly handle padding)
308
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
309
 
310
- # Generate with appropriate parameters
311
- outputs = self.model.generate(
312
  inputs,
313
  attention_mask=attention_mask,
314
- max_new_tokens=max_new_tokens,
315
- temperature=0.7,
316
- top_p=0.9,
317
- do_sample=True,
318
- pad_token_id=self.tokenizer.pad_token_id,
319
- eos_token_id=self.tokenizer.eos_token_id,
320
- repetition_penalty=1.1
321
  )
322
 
323
  # Decode more carefully
@@ -374,43 +405,42 @@ Use clear headings. Be direct and concise with minimal filler language or unnece
374
  # Format team information
375
  team_summary = self.format_candidates_for_prompt(team)
376
 
377
- # Build concise team analysis prompt
378
- prompt = f"""Analyze this existing startup team:
379
 
380
  TEAM MEMBERS:
381
  {team_summary}
382
 
383
  Please provide:
384
 
385
- 1. Team Composition Analysis:
386
- - Key team strengths and complementary skills (3-4 points)
387
- - Major skill gaps (2-3 points)
388
- - Brief assessment of team dynamics
389
-
390
- 2. Success Factors:
391
- - Startup types most likely to succeed with this team
392
- - 2-3 specific competitive advantages this team offers
393
- - How these advantages translate to business outcomes
394
-
395
- 3. Risk Factors:
396
- - Startup types poorly suited for this team
397
- - 2-3 critical blind spots or weaknesses
398
- - Specific talent gaps that should be addressed
399
-
400
- Be direct and concise with minimal filler language. Focus on actionable insights rather than generalizations.
401
  """
402
 
403
  if include_startup_comparison:
404
  prompt += """
405
- 4. Comparison to Successful Startups:
406
- - 2-3 specific similarities to successful startup teams
407
- - 1-2 notable differences from typical success patterns
408
- - Brief mention of similar historical team compositions that succeeded
409
  """
410
 
411
- # Format as chat with concise system prompt
412
  messages = [
413
- {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition. Provide direct, actionable analysis without filler language. Focus on specific insights rather than generic observations. Avoid AI-sounding generalizations and clichés."},
414
  {"role": "user", "content": prompt}
415
  ]
416
 
@@ -435,17 +465,11 @@ Be direct and concise with minimal filler language. Focus on actionable insights
435
  # Create attention mask (explicitly handle padding)
436
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
437
 
438
- # Generate with appropriate parameters
439
- outputs = self.model.generate(
440
  inputs,
441
  attention_mask=attention_mask,
442
- max_new_tokens=max_new_tokens,
443
- temperature=0.7,
444
- top_p=0.9,
445
- do_sample=True,
446
- pad_token_id=self.tokenizer.pad_token_id,
447
- eos_token_id=self.tokenizer.eos_token_id,
448
- repetition_penalty=1.1
449
  )
450
 
451
  # Decode more carefully
@@ -524,8 +548,8 @@ Be direct and concise with minimal filler language. Focus on actionable insights
524
  skills = candidate['skills'] if isinstance(candidate['skills'], list) else [candidate['skills']]
525
  skills_info = ", ".join(skills)
526
 
527
- # Build concise prompt
528
- prompt = f"""Analyze this candidate for a startup founder or early employee role:
529
 
530
  CANDIDATE PROFILE:
531
  Name: {name}
@@ -539,30 +563,35 @@ Experience:
539
  Skills:
540
  {skills_info}
541
 
542
- Please provide a concise analysis including:
543
 
544
- 1. Strengths Analysis:
545
  - Key professional strengths based on background and skills
546
- - Areas of expertise and how they apply to startups
 
547
 
548
- 2. Founder/Early Employee Fit:
549
  - Assessment of suitability for founder or early employee roles
 
 
550
  - Ideal role recommendations in a startup team
551
 
552
- 3. Complementary Team Members:
553
- - What types of co-founders would complement this candidate
 
554
  - Skills gaps that should be filled by other team members
555
 
556
- 4. Risk Assessment:
557
- - Potential blind spots or weaknesses
558
- - Areas where the candidate might need support
 
559
 
560
- Be direct and concise with minimal filler language. Focus on actionable insights.
561
  """
562
 
563
- # Format as chat with concise system prompt
564
  messages = [
565
- {"role": "system", "content": "You are an elite talent assessor specializing in startup founders and early employees. You provide concise, direct analysis without unnecessary filler language. Focus on concrete insights with minimal AI-sounding platitudes."},
566
  {"role": "user", "content": prompt}
567
  ]
568
 
@@ -587,17 +616,11 @@ Be direct and concise with minimal filler language. Focus on actionable insights
587
  # Create attention mask (explicitly handle padding)
588
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
589
 
590
- # Generate with appropriate parameters
591
- outputs = self.model.generate(
592
  inputs,
593
  attention_mask=attention_mask,
594
- max_new_tokens=max_new_tokens,
595
- temperature=0.7,
596
- top_p=0.9,
597
- do_sample=True,
598
- pad_token_id=self.tokenizer.pad_token_id,
599
- eos_token_id=self.tokenizer.eos_token_id,
600
- repetition_penalty=1.1
601
  )
602
 
603
  # Decode more carefully
@@ -908,7 +931,7 @@ Return a JSON array containing ONLY the candidate numbers (starting from 1) that
908
  return {
909
  "team_analysis": team_analysis,
910
  "model_info": {
911
- "device": str(self.device),
912
  "model_type": "phi-2-qlora-finetuned"
913
  }
914
  }
 
32
 
33
  # Load model immediately
34
  self.load_model()
35
+
36
  def generate_optimized(self, inputs, attention_mask=None, max_new_tokens=512):
37
  """
38
  Optimized generation function that maximizes GPU utilization
 
45
  # Find input length to properly calculate output length
46
  input_length = inputs.shape[1]
47
 
48
+ # Basic generation parameters
49
+ generation_kwargs = {
50
+ "inputs": inputs,
51
+ "attention_mask": attention_mask,
52
+ "max_new_tokens": max_new_tokens,
53
 
54
  # Performance options
55
+ "use_cache": True, # Use KV cache for faster generation
56
 
57
  # Quality vs. speed tradeoff
58
+ "temperature": 0.7 if self.use_sampling else 1.0,
59
+ "top_p": 0.9 if self.use_sampling else 1.0,
60
+ "do_sample": self.use_sampling, # Sampling is slightly slower but better quality
61
+ "num_beams": 1, # Beam search is slower but better quality (1 = no beam search)
62
 
63
  # Token handling
64
+ "pad_token_id": self.tokenizer.pad_token_id,
65
+ "eos_token_id": self.tokenizer.eos_token_id,
66
 
67
  # Content quality
68
+ "repetition_penalty": 1.1, # Reduce repetition
69
+ }
70
+
71
+ # Add Flash Attention parameters only if supported by the transformers version
72
+ # We check the transformer version by testing in a safe way
73
+ try:
74
+ import importlib
75
+ transformers_version = importlib.import_module('transformers').__version__
76
+ major, minor = map(int, transformers_version.split('.')[:2])
77
+
78
+ if major > 4 or (major == 4 and minor >= 32):
79
+ # Flash Attention support was added in transformers 4.32.0
80
+ if self.flash_attention_supported:
81
+ print("Using Flash Attention in generation")
82
+ generation_kwargs["flash_attn"] = True
83
+ generation_kwargs["flash_attn_cross_entropy"] = True
84
+ else:
85
+ print(f"Flash Attention not added - transformers version {transformers_version} doesn't support it")
86
+ except Exception as e:
87
+ print(f"Error checking transformers version, skipping Flash Attention: {e}")
88
+
89
+ # Generate with optimized parameters for GPU performance
90
+ outputs = self.model.generate(**generation_kwargs)
91
 
92
  return outputs, input_length
93
 
 
141
  if os.path.exists(os.path.join(self.model_dir, "adapter_model.safetensors")):
142
  print("Found adapter model, loading Phi-2 base with adapter")
143
 
144
+ # Check if PEFT is available - using the global variable
145
+ global PEFT_AVAILABLE
146
  if not PEFT_AVAILABLE:
147
  print("PEFT not available, installing...")
148
  try:
 
207
  device_map="auto",
208
  )
209
 
210
+ # Check for Flash Attention support with better error handling
211
  try:
212
+ # First check if the transformers version supports it
213
+ import importlib
214
+ transformers_version = importlib.import_module('transformers').__version__
215
+ major, minor = map(int, transformers_version.split('.')[:2])
216
+
217
+ if major > 4 or (major == 4 and minor >= 32):
218
+ # Flash Attention support was added in transformers 4.32.0
219
+ try:
220
+ import flash_attn
221
+ self.flash_attention_supported = True
222
+ print(f"Flash Attention {flash_attn.__version__} detected and will be used if available!")
223
+ except ImportError:
224
+ print("Flash Attention library not installed. Using standard attention mechanism.")
225
+ self.flash_attention_supported = False
226
+ else:
227
+ print(f"Transformers version {transformers_version} doesn't support Flash Attention parameters. Using standard attention.")
228
+ self.flash_attention_supported = False
229
+ except Exception as e:
230
+ print(f"Error checking Flash Attention support: {e}")
231
+ print("Falling back to standard attention mechanism.")
232
  self.flash_attention_supported = False
233
 
234
  # Enable TF32 precision for higher performance on newer NVIDIA GPUs
235
  if self.device == "cuda":
236
  # Only available on Ampere+ GPUs (A100, RTX 3090, etc.)
237
+ try:
238
+ if torch.cuda.get_device_capability()[0] >= 8:
239
+ print("Enabling TF32 precision for faster matrix operations")
240
+ torch.backends.cuda.matmul.allow_tf32 = True
241
+ torch.backends.cudnn.allow_tf32 = True
242
+ except Exception as e:
243
+ print(f"Error enabling TF32 precision: {e}")
244
 
245
  print(f"Model loaded successfully on {self.device}")
246
  return True
 
284
  # Format candidate information
285
  candidate_summary = self.format_candidates_for_prompt(candidates)
286
 
287
+ # Build the enhanced prompt for more detailed analysis
288
  prompt = f"""Analyze these candidates and create THREE different optimal startup team compositions of {team_size} people each.
289
 
290
  CANDIDATES:
 
294
  {requirements or "Create a balanced team with complementary skills"}
295
 
296
  For EACH team composition, please provide:
297
+ 1. Team Name: Give this team composition a memorable name based on its strengths
298
+ 2. Selected Members: List each selected team member with:
299
+ - Their name
300
+ - Recommended role in the team
301
+ - 2-3 sentences on WHY they specifically are valuable to this team composition
302
+ - How they complement other team members
303
+
304
+ 3. Team Analysis (minimum 250 words):
305
+ - Detailed strengths of this specific team combination
306
+ - Potential weaknesses or challenges this team might face
307
+ - Assessment of skill coverage and diversity of thinking
308
+ - Team dynamics and how members would likely work together
309
+ - How this team aligns with the stated requirements
310
+
311
+ 4. Alternative Applications:
312
+ - What type of startup would be MOST successful with this team
313
+ - What type of startup would be LEAST successful with this team
314
+
315
+ After presenting all three team compositions, provide a final recommendation on which team would be best and why.
316
+
317
+ Format your response carefully with clear headings and make it comprehensive enough for founders to make informed decisions.
318
  """
319
 
320
+ # Format as chat with improved system prompt
321
  messages = [
322
+ {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition and founder dynamics. You specialize in analyzing candidate profiles and determining optimal team compositions that maximize chances of startup success."},
323
  {"role": "user", "content": prompt}
324
  ]
325
 
 
344
  # Create attention mask (explicitly handle padding)
345
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
346
 
347
+ # Use the optimized generator instead of direct model.generate call
348
+ outputs, input_length = self.generate_optimized(
349
  inputs,
350
  attention_mask=attention_mask,
351
+ max_new_tokens=max_new_tokens
 
 
 
 
 
 
352
  )
353
 
354
  # Decode more carefully
 
405
  # Format team information
406
  team_summary = self.format_candidates_for_prompt(team)
407
 
408
+ # Build the prompt
409
+ prompt = f"""Analyze this existing startup team in depth:
410
 
411
  TEAM MEMBERS:
412
  {team_summary}
413
 
414
  Please provide:
415
 
416
+ 1. Team Composition Analysis (minimum 150 words):
417
+ - Overall assessment of the team's strengths and complementary skills
418
+ - Key skill coverage and potential skill gaps
419
+ - Team dynamics and how members would likely work together
420
+ - Potential areas of conflict or collaboration challenges
421
+
422
+ 2. Success Factors (minimum 100 words):
423
+ - What types of startups would be MOST successful with this team
424
+ - Key advantages this team has compared to typical startup teams
425
+ - How team members' backgrounds create competitive advantages
426
+
427
+ 3. Risk Factors (minimum 100 words):
428
+ - What types of startups would be LEAST successful with this team
429
+ - Potential blind spots or weaknesses in the team composition
430
+ - Suggested additions or changes to strengthen the team
 
431
  """
432
 
433
  if include_startup_comparison:
434
  prompt += """
435
+ 4. Comparison to Successful Startups (minimum 100 words):
436
+ - How this team compares to founding teams of successful startups
437
+ - Historical examples of similar team compositions that succeeded
438
+ - Key differentiating factors from typical successful startup teams
439
  """
440
 
441
+ # Format as chat with improved system prompt
442
  messages = [
443
+ {"role": "system", "content": "You are an elite startup advisor with deep expertise in team composition and founder dynamics. You specialize in analyzing team profiles and providing actionable insights to maximize chances of startup success."},
444
  {"role": "user", "content": prompt}
445
  ]
446
 
 
465
  # Create attention mask (explicitly handle padding)
466
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
467
 
468
+ # Use the optimized generator instead of direct model.generate call
469
+ outputs, input_length = self.generate_optimized(
470
  inputs,
471
  attention_mask=attention_mask,
472
+ max_new_tokens=max_new_tokens
 
 
 
 
 
 
473
  )
474
 
475
  # Decode more carefully
 
548
  skills = candidate['skills'] if isinstance(candidate['skills'], list) else [candidate['skills']]
549
  skills_info = ", ".join(skills)
550
 
551
+ # Build comprehensive prompt
552
+ prompt = f"""Analyze this candidate in depth for a startup founder or early employee role:
553
 
554
  CANDIDATE PROFILE:
555
  Name: {name}
 
563
  Skills:
564
  {skills_info}
565
 
566
+ Please provide a comprehensive analysis including:
567
 
568
+ 1. Strengths Analysis (minimum 150 words):
569
  - Key professional strengths based on background and skills
570
+ - Notable accomplishments and their significance
571
+ - Areas of deep expertise and how they apply to startups
572
 
573
+ 2. Founder/Early Employee Fit (minimum 150 words):
574
  - Assessment of suitability for founder or early employee roles
575
+ - Specific founder archetype this candidate represents
576
+ - Optimal startup stages for this candidate
577
  - Ideal role recommendations in a startup team
578
 
579
+ 3. Complementary Team Members (minimum 100 words):
580
+ - What types of co-founders or team members would complement this candidate
581
+ - Potential team dynamics when working with different personality types
582
  - Skills gaps that should be filled by other team members
583
 
584
+ 4. Risk Assessment (minimum 100 words):
585
+ - Potential blind spots or weaknesses based on background
586
+ - Areas where the candidate might need support or development
587
+ - Situations where this candidate might struggle in a startup environment
588
 
589
+ Format your analysis with clear sections and detailed insights to help assess this candidate for startup roles.
590
  """
591
 
592
+ # Format as chat with system prompt
593
  messages = [
594
+ {"role": "system", "content": "You are an elite talent assessor specializing in startup founders and early employees. You provide in-depth analysis of candidates' strengths, founder fit, and team compatibility."},
595
  {"role": "user", "content": prompt}
596
  ]
597
 
 
616
  # Create attention mask (explicitly handle padding)
617
  attention_mask = inputs.ne(self.tokenizer.pad_token_id).long()
618
 
619
+ # Use the optimized generator instead of direct model.generate call
620
+ outputs, input_length = self.generate_optimized(
621
  inputs,
622
  attention_mask=attention_mask,
623
+ max_new_tokens=max_new_tokens
 
 
 
 
 
 
624
  )
625
 
626
  # Decode more carefully
 
931
  return {
932
  "team_analysis": team_analysis,
933
  "model_info": {
934
+ "x": str(self.device),
935
  "model_type": "phi-2-qlora-finetuned"
936
  }
937
  }