Update process_interview.py
Browse files- process_interview.py +65 -25
process_interview.py
CHANGED
|
@@ -43,25 +43,40 @@ AUDIO_DIR = "./Uploads"
|
|
| 43 |
OUTPUT_DIR = "./processed_audio"
|
| 44 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 45 |
|
| 46 |
-
# API Keys
|
| 47 |
PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
|
| 48 |
ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
|
| 49 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def download_audio_from_url(url: str) -> str:
    """Download the audio file at *url* to a uniquely named temporary file.

    The response body is streamed to disk in 8 KiB chunks so the whole file
    is never held in memory.

    Args:
        url: HTTP(S) location of the audio file.

    Returns:
        Path of the downloaded file inside the system temp directory. The
        caller owns the file and is responsible for deleting it.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        Exception: Any other failure (network error, timeout, disk error) is
            logged and re-raised unchanged.
    """
    temp_path = None
    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
        logger.info(f"Downloading audio from {url} to {temp_path}")
        # Without a timeout, requests waits forever on a stalled server.
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return temp_path
    except Exception as e:
        logger.error(f"Failed to download audio from URL {url}: {e}")
        # The path is only handed to the caller on success, so a partial
        # file left behind here could never be cleaned up by anyone else.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError as cleanup_err:
                logger.warning(f"Could not remove partial download {temp_path}: {cleanup_err}")
        raise
|
| 66 |
|
| 67 |
def initialize_services():
|
|
@@ -264,7 +279,7 @@ def train_role_classifier(utterances: List[Dict]):
|
|
| 264 |
sum(1 for token in doc if token.pos_ == 'NOUN')
|
| 265 |
])
|
| 266 |
features.append(feat)
|
| 267 |
-
labels.append(0 if i % 2 == 0 else 1) # Simplified for demo
|
| 268 |
scaler = StandardScaler()
|
| 269 |
X = scaler.fit_transform(features)
|
| 270 |
clf = RandomForestClassifier(
|
|
@@ -443,7 +458,7 @@ def generate_report(analysis_data: Dict) -> str:
|
|
| 443 |
else:
|
| 444 |
acceptance_line += "HR Verdict: Limited fit, significant improvement required."
|
| 445 |
prompt = f"""
|
| 446 |
-
You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy.
|
| 447 |
{acceptance_line}
|
| 448 |
**1. Executive Summary**
|
| 449 |
- Summarize performance, key metrics, and hiring potential.
|
|
@@ -466,12 +481,12 @@ def generate_report(analysis_data: Dict) -> str:
|
|
| 466 |
- Suggest next steps for hiring managers (advance, train, assess).
|
| 467 |
"""
|
| 468 |
response = gemini_model.generate_content(prompt)
|
| 469 |
-
return response.text
|
| 470 |
except Exception as e:
|
| 471 |
logger.error(f"Report generation failed: {str(e)}")
|
| 472 |
return f"Error generating report: {str(e)}"
|
| 473 |
|
| 474 |
-
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
|
| 475 |
try:
|
| 476 |
doc = SimpleDocTemplate(output_path, pagesize=letter,
|
| 477 |
rightMargin=0.75*inch, leftMargin=0.75*inch,
|
|
@@ -594,10 +609,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
|
|
| 594 |
for line in lines:
|
| 595 |
line = line.strip()
|
| 596 |
if not line: continue
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
section_title =
|
| 600 |
-
if section_title.startswith('
|
|
|
|
|
|
|
| 601 |
current_section = 'Executive Summary'
|
| 602 |
current_subsection = None
|
| 603 |
elif 'Communication' in section_title:
|
|
@@ -615,17 +632,18 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
|
|
| 615 |
elif line.startswith(('-', '*', '•')) and current_section:
|
| 616 |
clean_line = line.lstrip('-*• ').strip()
|
| 617 |
if not clean_line: continue
|
|
|
|
| 618 |
if current_section == 'Competency':
|
| 619 |
-
if any(k in clean_line.lower() for k in ['
|
| 620 |
current_subsection = 'Strengths'
|
| 621 |
-
elif any(k in clean_line.lower() for k in ['
|
| 622 |
current_subsection = 'Growth Areas'
|
| 623 |
if current_subsection:
|
| 624 |
sections[current_section][current_subsection].append(clean_line)
|
| 625 |
elif current_section == 'Recommendations':
|
| 626 |
-
if any(k in clean_line.lower() for k in ['
|
| 627 |
current_subsection = 'Development'
|
| 628 |
-
elif any(k in clean_line.lower() for k in ['
|
| 629 |
current_subsection = 'Next Steps'
|
| 630 |
if current_subsection:
|
| 631 |
sections[current_section][current_subsection].append(clean_line)
|
|
@@ -683,12 +701,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
|
|
| 683 |
else:
|
| 684 |
story.append(Paragraph("No next steps provided.", body_text))
|
| 685 |
story.append(Spacer(1, 0.2*inch))
|
| 686 |
-
story.append(Paragraph("This report provides actionable insights to support hiring and candidate development.", body_text))
|
| 687 |
|
| 688 |
doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
|
|
|
|
| 689 |
return True
|
| 690 |
except Exception as e:
|
| 691 |
-
logger.error(f"PDF generation failed: {str(e)}")
|
| 692 |
return False
|
| 693 |
|
| 694 |
def convert_to_serializable(obj):
|
|
@@ -698,7 +716,7 @@ def convert_to_serializable(obj):
|
|
| 698 |
if isinstance(obj, np.ndarray): return obj.tolist()
|
| 699 |
return obj
|
| 700 |
|
| 701 |
-
def process_interview(audio_path_or_url: str):
|
| 702 |
local_audio_path = None
|
| 703 |
wav_file = None
|
| 704 |
is_downloaded = False
|
|
@@ -709,6 +727,8 @@ def process_interview(audio_path_or_url: str):
|
|
| 709 |
is_downloaded = True
|
| 710 |
else:
|
| 711 |
local_audio_path = audio_path_or_url
|
|
|
|
|
|
|
| 712 |
wav_file = convert_to_wav(local_audio_path)
|
| 713 |
transcript = transcribe(wav_file)
|
| 714 |
for utterance in transcript['utterances']:
|
|
@@ -737,19 +757,39 @@ def process_interview(audio_path_or_url: str):
|
|
| 737 |
base_name = str(uuid.uuid4())
|
| 738 |
pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
|
| 739 |
json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
|
| 740 |
-
|
| 741 |
-
logger.info(f"PDF report generated at: {pdf_path}")
|
| 742 |
with open(json_path, 'w') as f:
|
| 743 |
serializable_data = convert_to_serializable(analysis_data)
|
| 744 |
json.dump(serializable_data, f, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 745 |
logger.info(f"Processing completed for {audio_path_or_url}")
|
| 746 |
return {'pdf_path': pdf_path, 'json_path': json_path}
|
| 747 |
except Exception as e:
|
| 748 |
-
logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}")
|
| 749 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
finally:
|
| 751 |
if wav_file and os.path.exists(wav_file):
|
| 752 |
-
|
|
|
|
|
|
|
|
|
|
| 753 |
if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
|
| 754 |
-
|
| 755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
OUTPUT_DIR = "./processed_audio"
|
| 44 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 45 |
|
| 46 |
+
# API Keys
|
| 47 |
PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
|
| 48 |
ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
|
| 49 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
|
| 50 |
|
| 51 |
+
def validate_url(url: str) -> bool:
    """Check whether *url* answers an HTTP HEAD request successfully.

    Redirects are followed before the status is inspected, and any 2xx
    final status counts as accessible.

    Args:
        url: The URL to probe.

    Returns:
        True if the (possibly redirected) HEAD request ends in a 2xx status,
        False on any other status or on a request error (connection failure,
        timeout, malformed URL, ...). Request errors are logged, not raised.
    """
    try:
        # requests.head() does NOT follow redirects by default, so a URL that
        # sits behind a 301/302 would otherwise be reported as inaccessible.
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException as e:
        logger.error(f"URL validation failed for {url}: {str(e)}")
        return False
    # Accept the whole 2xx range rather than only 200.
    return 200 <= response.status_code < 300
|
| 59 |
+
|
| 60 |
def download_audio_from_url(url: str) -> str:
    """Download the audio file at *url* to a uniquely named temporary file.

    The URL is first probed with ``validate_url``; the body is then streamed
    to disk in 8 KiB chunks so the whole file is never held in memory.

    Args:
        url: HTTP(S) location of the audio file.

    Returns:
        Path of the downloaded file inside the system temp directory. The
        caller owns the file and is responsible for deleting it.

    Raises:
        ValueError: If ``validate_url`` reports the URL as inaccessible.
        requests.HTTPError: If the download request answers with 4xx/5xx.
        Exception: Any other failure (network error, timeout, disk error) is
            logged and re-raised unchanged.
    """
    if not validate_url(url):
        logger.error(f"Invalid or inaccessible URL: {url}")
        raise ValueError(f"Audio file not found at {url}")
    temp_path = None
    try:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
        logger.info(f"Downloading audio from {url} to {temp_path}")
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return temp_path
    except Exception as e:
        if isinstance(e, requests.HTTPError):
            logger.error(f"HTTP error downloading audio from {url}: {str(e)}")
        else:
            logger.error(f"Failed to download audio from URL {url}: {str(e)}")
        # The path is only handed to the caller on success, so a partial
        # file left behind here could never be cleaned up by anyone else.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError as cleanup_err:
                logger.warning(f"Could not remove partial download {temp_path}: {cleanup_err}")
        raise
|
| 81 |
|
| 82 |
def initialize_services():
|
|
|
|
| 279 |
sum(1 for token in doc if token.pos_ == 'NOUN')
|
| 280 |
])
|
| 281 |
features.append(feat)
|
| 282 |
+
labels.append(0 if i % 2 == 0 else 1) # Simplified for demo
|
| 283 |
scaler = StandardScaler()
|
| 284 |
X = scaler.fit_transform(features)
|
| 285 |
clf = RandomForestClassifier(
|
|
|
|
| 458 |
else:
|
| 459 |
acceptance_line += "HR Verdict: Limited fit, significant improvement required."
|
| 460 |
prompt = f"""
|
| 461 |
+
You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy. Ensure text is clean and free of special characters that could break formatting.
|
| 462 |
{acceptance_line}
|
| 463 |
**1. Executive Summary**
|
| 464 |
- Summarize performance, key metrics, and hiring potential.
|
|
|
|
| 481 |
- Suggest next steps for hiring managers (advance, train, assess).
|
| 482 |
"""
|
| 483 |
response = gemini_model.generate_content(prompt)
|
| 484 |
+
return re.sub(r'[^\x00-\x7F]+', '', response.text) # Sanitize non-ASCII characters
|
| 485 |
except Exception as e:
|
| 486 |
logger.error(f"Report generation failed: {str(e)}")
|
| 487 |
return f"Error generating report: {str(e)}"
|
| 488 |
|
| 489 |
+
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str) -> bool:
|
| 490 |
try:
|
| 491 |
doc = SimpleDocTemplate(output_path, pagesize=letter,
|
| 492 |
rightMargin=0.75*inch, leftMargin=0.75*inch,
|
|
|
|
| 609 |
for line in lines:
|
| 610 |
line = line.strip()
|
| 611 |
if not line: continue
|
| 612 |
+
# Simplified regex to avoid parenthesis issues
|
| 613 |
+
if line.startswith('**') and line.endswith('**'):
|
| 614 |
+
section_title = line.strip('**').strip()
|
| 615 |
+
if section_title.startswith(('1.', '2.', '3.', '4.', '5.')):
|
| 616 |
+
section_title = section_title[2:].strip()
|
| 617 |
+
if 'Executive Summary' in section_title:
|
| 618 |
current_section = 'Executive Summary'
|
| 619 |
current_subsection = None
|
| 620 |
elif 'Communication' in section_title:
|
|
|
|
| 632 |
elif line.startswith(('-', '*', '•')) and current_section:
|
| 633 |
clean_line = line.lstrip('-*• ').strip()
|
| 634 |
if not clean_line: continue
|
| 635 |
+
clean_line = re.sub(r'[()]', '', clean_line) # Remove parentheses
|
| 636 |
if current_section == 'Competency':
|
| 637 |
+
if any(k in clean_line.lower() for k in ['leader', 'problem', 'commun', 'adapt', 'strength']):
|
| 638 |
current_subsection = 'Strengths'
|
| 639 |
+
elif any(k in clean_line.lower() for k in ['improv', 'grow', 'depth']):
|
| 640 |
current_subsection = 'Growth Areas'
|
| 641 |
if current_subsection:
|
| 642 |
sections[current_section][current_subsection].append(clean_line)
|
| 643 |
elif current_section == 'Recommendations':
|
| 644 |
+
if any(k in clean_line.lower() for k in ['commun', 'tech', 'depth', 'pres']):
|
| 645 |
current_subsection = 'Development'
|
| 646 |
+
elif any(k in clean_line.lower() for k in ['adv', 'train', 'assess', 'next', 'mentor']):
|
| 647 |
current_subsection = 'Next Steps'
|
| 648 |
if current_subsection:
|
| 649 |
sections[current_section][current_subsection].append(clean_line)
|
|
|
|
| 701 |
else:
|
| 702 |
story.append(Paragraph("No next steps provided.", body_text))
|
| 703 |
story.append(Spacer(1, 0.2*inch))
|
|
|
|
| 704 |
|
| 705 |
doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
|
| 706 |
+
logger.info(f"PDF report successfully generated at {output_path}")
|
| 707 |
return True
|
| 708 |
except Exception as e:
|
| 709 |
+
logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
|
| 710 |
return False
|
| 711 |
|
| 712 |
def convert_to_serializable(obj):
|
|
|
|
| 716 |
if isinstance(obj, np.ndarray): return obj.tolist()
|
| 717 |
return obj
|
| 718 |
|
| 719 |
+
def process_interview(audio_path_or_url: str) -> Dict:
|
| 720 |
local_audio_path = None
|
| 721 |
wav_file = None
|
| 722 |
is_downloaded = False
|
|
|
|
| 727 |
is_downloaded = True
|
| 728 |
else:
|
| 729 |
local_audio_path = audio_path_or_url
|
| 730 |
+
if not os.path.exists(local_audio_path):
|
| 731 |
+
raise FileNotFoundError(f"Local audio file not found: {local_audio_path}")
|
| 732 |
wav_file = convert_to_wav(local_audio_path)
|
| 733 |
transcript = transcribe(wav_file)
|
| 734 |
for utterance in transcript['utterances']:
|
|
|
|
| 757 |
base_name = str(uuid.uuid4())
|
| 758 |
pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
|
| 759 |
json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
|
| 760 |
+
pdf_success = create_pdf_report(analysis_data, pdf_path, gemini_report_text)
|
|
|
|
| 761 |
with open(json_path, 'w') as f:
|
| 762 |
serializable_data = convert_to_serializable(analysis_data)
|
| 763 |
json.dump(serializable_data, f, indent=2)
|
| 764 |
+
if not pdf_success:
|
| 765 |
+
logger.warning(f"PDF report failed to generate for {audio_path_or_url}")
|
| 766 |
+
return {
|
| 767 |
+
'pdf_path': None,
|
| 768 |
+
'json_path': json_path,
|
| 769 |
+
'error': 'PDF generation failed'
|
| 770 |
+
}
|
| 771 |
logger.info(f"Processing completed for {audio_path_or_url}")
|
| 772 |
return {'pdf_path': pdf_path, 'json_path': json_path}
|
| 773 |
except Exception as e:
|
| 774 |
+
logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
|
| 775 |
+
base_name = str(uuid.uuid4())
|
| 776 |
+
json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
|
| 777 |
+
with open(json_path, 'w') as f:
|
| 778 |
+
json.dump({'error': str(e)}, f, indent=2)
|
| 779 |
+
return {
|
| 780 |
+
'pdf_path': None,
|
| 781 |
+
'json_path': json_path,
|
| 782 |
+
'error': str(e)
|
| 783 |
+
}
|
| 784 |
finally:
|
| 785 |
if wav_file and os.path.exists(wav_file):
|
| 786 |
+
try:
|
| 787 |
+
os.remove(wav_file)
|
| 788 |
+
except Exception as e:
|
| 789 |
+
logger.error(f"Failed to clean up wav file {wav_file}: {str(e)}")
|
| 790 |
if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
|
| 791 |
+
try:
|
| 792 |
+
os.remove(local_audio_path)
|
| 793 |
+
logger.info(f"Cleaned up temporary file: {local_audio_path}")
|
| 794 |
+
except Exception as e:
|
| 795 |
+
logger.error(f"Failed to clean up local audio file {local_audio_path}: {str(e)}")
|