norhan12 committed on
Commit
93c279b
·
verified ·
1 Parent(s): 293b493

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +65 -25
process_interview.py CHANGED
@@ -43,25 +43,40 @@ AUDIO_DIR = "./Uploads"
43
  OUTPUT_DIR = "./processed_audio"
44
  os.makedirs(OUTPUT_DIR, exist_ok=True)
45
 
46
- # API Keys (replace with actual keys or environment variables)
47
  PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
48
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
49
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
50
 
 
 
 
 
 
 
 
 
 
51
  def download_audio_from_url(url: str) -> str:
52
  """Downloads an audio file from a URL to a temporary local path."""
 
 
 
53
  try:
54
  temp_dir = tempfile.gettempdir()
55
  temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
56
  logger.info(f"Downloading audio from {url} to {temp_path}")
57
- with requests.get(url, stream=True) as r:
58
  r.raise_for_status()
59
  with open(temp_path, 'wb') as f:
60
  for chunk in r.iter_content(chunk_size=8192):
61
  f.write(chunk)
62
  return temp_path
 
 
 
63
  except Exception as e:
64
- logger.error(f"Failed to download audio from URL {url}: {e}")
65
  raise
66
 
67
  def initialize_services():
@@ -264,7 +279,7 @@ def train_role_classifier(utterances: List[Dict]):
264
  sum(1 for token in doc if token.pos_ == 'NOUN')
265
  ])
266
  features.append(feat)
267
- labels.append(0 if i % 2 == 0 else 1) # Simplified for demo; replace with actual labels
268
  scaler = StandardScaler()
269
  X = scaler.fit_transform(features)
270
  clf = RandomForestClassifier(
@@ -443,7 +458,7 @@ def generate_report(analysis_data: Dict) -> str:
443
  else:
444
  acceptance_line += "HR Verdict: Limited fit, significant improvement required."
445
  prompt = f"""
446
- You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy. Focus on candidate suitability, strengths, and actionable recommendations.
447
  {acceptance_line}
448
  **1. Executive Summary**
449
  - Summarize performance, key metrics, and hiring potential.
@@ -466,12 +481,12 @@ def generate_report(analysis_data: Dict) -> str:
466
  - Suggest next steps for hiring managers (advance, train, assess).
467
  """
468
  response = gemini_model.generate_content(prompt)
469
- return response.text
470
  except Exception as e:
471
  logger.error(f"Report generation failed: {str(e)}")
472
  return f"Error generating report: {str(e)}"
473
 
474
- def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
475
  try:
476
  doc = SimpleDocTemplate(output_path, pagesize=letter,
477
  rightMargin=0.75*inch, leftMargin=0.75*inch,
@@ -594,10 +609,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
594
  for line in lines:
595
  line = line.strip()
596
  if not line: continue
597
- if re.match(r'\s*\*\*\s*\d*\.?\s*.*?)\s*\*\*', line):
598
- section_match = re.search(r'\s*\*\*\s*\d*\.?\s*(.*?)\s*\*\*', line)
599
- section_title = section_match.group(1).strip()
600
- if section_title.startswith('Executive Summary'):
 
 
601
  current_section = 'Executive Summary'
602
  current_subsection = None
603
  elif 'Communication' in section_title:
@@ -615,17 +632,18 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
615
  elif line.startswith(('-', '*', '•')) and current_section:
616
  clean_line = line.lstrip('-*• ').strip()
617
  if not clean_line: continue
 
618
  if current_section == 'Competency':
619
- if any(k in clean_line.lower() for k in ['leadership', 'problem-solving', 'communication', 'adaptability', 'strength']):
620
  current_subsection = 'Strengths'
621
- elif any(k in clean_line.lower() for k in ['improve', 'grow', 'depth', 'challenge']):
622
  current_subsection = 'Growth Areas'
623
  if current_subsection:
624
  sections[current_section][current_subsection].append(clean_line)
625
  elif current_section == 'Recommendations':
626
- if any(k in clean_line.lower() for k in ['communication', 'technical', 'depth', 'presence']):
627
  current_subsection = 'Development'
628
- elif any(k in clean_line.lower() for k in ['advance', 'train', 'assess', 'next', 'mentor']):
629
  current_subsection = 'Next Steps'
630
  if current_subsection:
631
  sections[current_section][current_subsection].append(clean_line)
@@ -683,12 +701,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
683
  else:
684
  story.append(Paragraph("No next steps provided.", body_text))
685
  story.append(Spacer(1, 0.2*inch))
686
- story.append(Paragraph("This report provides actionable insights to support hiring and candidate development.", body_text))
687
 
688
  doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
 
689
  return True
690
  except Exception as e:
691
- logger.error(f"PDF generation failed: {str(e)}")
692
  return False
693
 
694
  def convert_to_serializable(obj):
@@ -698,7 +716,7 @@ def convert_to_serializable(obj):
698
  if isinstance(obj, np.ndarray): return obj.tolist()
699
  return obj
700
 
701
- def process_interview(audio_path_or_url: str):
702
  local_audio_path = None
703
  wav_file = None
704
  is_downloaded = False
@@ -709,6 +727,8 @@ def process_interview(audio_path_or_url: str):
709
  is_downloaded = True
710
  else:
711
  local_audio_path = audio_path_or_url
 
 
712
  wav_file = convert_to_wav(local_audio_path)
713
  transcript = transcribe(wav_file)
714
  for utterance in transcript['utterances']:
@@ -737,19 +757,39 @@ def process_interview(audio_path_or_url: str):
737
  base_name = str(uuid.uuid4())
738
  pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
739
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
740
- if create_pdf_report(analysis_data, pdf_path, gemini_report_text):
741
- logger.info(f"PDF report generated at: {pdf_path}")
742
  with open(json_path, 'w') as f:
743
  serializable_data = convert_to_serializable(analysis_data)
744
  json.dump(serializable_data, f, indent=2)
 
 
 
 
 
 
 
745
  logger.info(f"Processing completed for {audio_path_or_url}")
746
  return {'pdf_path': pdf_path, 'json_path': json_path}
747
  except Exception as e:
748
- logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}")
749
- raise
 
 
 
 
 
 
 
 
750
  finally:
751
  if wav_file and os.path.exists(wav_file):
752
- os.remove(wav_file)
 
 
 
753
  if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
754
- os.remove(local_audio_path)
755
- logger.info(f"Cleaned up temporary audio file: {local_audio_path}")
 
 
 
 
43
  OUTPUT_DIR = "./processed_audio"
44
  os.makedirs(OUTPUT_DIR, exist_ok=True)
45
 
46
+ # API Keys
47
  PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
48
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
49
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
50
 
51
def validate_url(url: str) -> bool:
    """Check whether ``url`` answers a HEAD request with HTTP 200.

    Redirects are followed before the status is inspected, so a URL that
    forwards to the real file (CDN, signed-URL redirect) is treated as
    accessible as long as the final response is 200.

    Args:
        url: The URL to probe.

    Returns:
        True if the final response status is 200; False for any other
        status or when the request itself fails (the failure is logged).
    """
    try:
        # requests.head() does not follow redirects unless asked to; without
        # allow_redirects=True a 301/302 would be reported as inaccessible
        # even though the subsequent GET download would succeed.
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException as e:
        logger.error(f"URL validation failed for {url}: {str(e)}")
        return False
    return response.status_code == 200
59
+
60
def download_audio_from_url(url: str) -> str:
    """Download an audio file from a URL to a temporary local path.

    The URL is first checked with ``validate_url``; the body is then
    streamed in 8 KiB chunks into a uniquely named file inside the system
    temporary directory.

    Args:
        url: Location of the audio file.

    Returns:
        Path of the downloaded temporary file. The caller is responsible
        for deleting it.

    Raises:
        ValueError: If ``validate_url`` reports the URL as inaccessible.
        Exception: Any error raised while downloading is logged and
            re-raised unchanged (``requests.HTTPError`` for error statuses).
    """
    if not validate_url(url):
        logger.error(f"Invalid or inaccessible URL: {url}")
        raise ValueError(f"Audio file not found at {url}")

    temp_path = None
    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
        logger.info(f"Downloading audio from {url} to {temp_path}")
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return temp_path
    except Exception as e:
        if isinstance(e, requests.HTTPError):
            logger.error(f"HTTP error downloading audio from {url}: {str(e)}")
        else:
            logger.error(f"Failed to download audio from URL {url}: {str(e)}")
        # A failure mid-stream leaves a truncated file behind; remove it so
        # failed downloads do not accumulate in the temp directory.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove partial download {temp_path}: {cleanup_error}"
                )
        raise
81
 
82
  def initialize_services():
 
279
  sum(1 for token in doc if token.pos_ == 'NOUN')
280
  ])
281
  features.append(feat)
282
+ labels.append(0 if i % 2 == 0 else 1) # Simplified for demo
283
  scaler = StandardScaler()
284
  X = scaler.fit_transform(features)
285
  clf = RandomForestClassifier(
 
458
  else:
459
  acceptance_line += "HR Verdict: Limited fit, significant improvement required."
460
  prompt = f"""
461
+ You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy. Ensure text is clean and free of special characters that could break formatting.
462
  {acceptance_line}
463
  **1. Executive Summary**
464
  - Summarize performance, key metrics, and hiring potential.
 
481
  - Suggest next steps for hiring managers (advance, train, assess).
482
  """
483
  response = gemini_model.generate_content(prompt)
484
+ return re.sub(r'[^\x00-\x7F]+', '', response.text) # Sanitize non-ASCII characters
485
  except Exception as e:
486
  logger.error(f"Report generation failed: {str(e)}")
487
  return f"Error generating report: {str(e)}"
488
 
489
+ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str) -> bool:
490
  try:
491
  doc = SimpleDocTemplate(output_path, pagesize=letter,
492
  rightMargin=0.75*inch, leftMargin=0.75*inch,
 
609
  for line in lines:
610
  line = line.strip()
611
  if not line: continue
612
+ # Detect '**Heading**' lines with plain string checks instead of a regex (the previous pattern had an unbalanced ')')
613
+ if line.startswith('**') and line.endswith('**'):
614
+ section_title = line.strip('**').strip()
615
+ if section_title.startswith(('1.', '2.', '3.', '4.', '5.')):
616
+ section_title = section_title[2:].strip()
617
+ if 'Executive Summary' in section_title:
618
  current_section = 'Executive Summary'
619
  current_subsection = None
620
  elif 'Communication' in section_title:
 
632
  elif line.startswith(('-', '*', '•')) and current_section:
633
  clean_line = line.lstrip('-*• ').strip()
634
  if not clean_line: continue
635
+ clean_line = re.sub(r'[()]', '', clean_line) # Remove parentheses
636
  if current_section == 'Competency':
637
+ if any(k in clean_line.lower() for k in ['leader', 'problem', 'commun', 'adapt', 'strength']):
638
  current_subsection = 'Strengths'
639
+ elif any(k in clean_line.lower() for k in ['improv', 'grow', 'depth']):
640
  current_subsection = 'Growth Areas'
641
  if current_subsection:
642
  sections[current_section][current_subsection].append(clean_line)
643
  elif current_section == 'Recommendations':
644
+ if any(k in clean_line.lower() for k in ['commun', 'tech', 'depth', 'pres']):
645
  current_subsection = 'Development'
646
+ elif any(k in clean_line.lower() for k in ['adv', 'train', 'assess', 'next', 'mentor']):
647
  current_subsection = 'Next Steps'
648
  if current_subsection:
649
  sections[current_section][current_subsection].append(clean_line)
 
701
  else:
702
  story.append(Paragraph("No next steps provided.", body_text))
703
  story.append(Spacer(1, 0.2*inch))
 
704
 
705
  doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
706
+ logger.info(f"PDF report successfully generated at {output_path}")
707
  return True
708
  except Exception as e:
709
+ logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
710
  return False
711
 
712
  def convert_to_serializable(obj):
 
716
  if isinstance(obj, np.ndarray): return obj.tolist()
717
  return obj
718
 
719
+ def process_interview(audio_path_or_url: str) -> Dict:
720
  local_audio_path = None
721
  wav_file = None
722
  is_downloaded = False
 
727
  is_downloaded = True
728
  else:
729
  local_audio_path = audio_path_or_url
730
+ if not os.path.exists(local_audio_path):
731
+ raise FileNotFoundError(f"Local audio file not found: {local_audio_path}")
732
  wav_file = convert_to_wav(local_audio_path)
733
  transcript = transcribe(wav_file)
734
  for utterance in transcript['utterances']:
 
757
  base_name = str(uuid.uuid4())
758
  pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
759
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
760
+ pdf_success = create_pdf_report(analysis_data, pdf_path, gemini_report_text)
 
761
  with open(json_path, 'w') as f:
762
  serializable_data = convert_to_serializable(analysis_data)
763
  json.dump(serializable_data, f, indent=2)
764
+ if not pdf_success:
765
+ logger.warning(f"PDF report failed to generate for {audio_path_or_url}")
766
+ return {
767
+ 'pdf_path': None,
768
+ 'json_path': json_path,
769
+ 'error': 'PDF generation failed'
770
+ }
771
  logger.info(f"Processing completed for {audio_path_or_url}")
772
  return {'pdf_path': pdf_path, 'json_path': json_path}
773
  except Exception as e:
774
+ logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
775
+ base_name = str(uuid.uuid4())
776
+ json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
777
+ with open(json_path, 'w') as f:
778
+ json.dump({'error': str(e)}, f, indent=2)
779
+ return {
780
+ 'pdf_path': None,
781
+ 'json_path': json_path,
782
+ 'error': str(e)
783
+ }
784
  finally:
785
  if wav_file and os.path.exists(wav_file):
786
+ try:
787
+ os.remove(wav_file)
788
+ except Exception as e:
789
+ logger.error(f"Failed to clean up wav file {wav_file}: {str(e)}")
790
  if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
791
+ try:
792
+ os.remove(local_audio_path)
793
+ logger.info(f"Cleaned up temporary file: {local_audio_path}")
794
+ except Exception as e:
795
+ logger.error(f"Failed to clean up local audio file {local_audio_path}: {str(e)}")