Spaces:

EvalBot
/

Audio

Sleeping

App Files Files Community

norhan12 committed on Jun 10, 2025

Commit

93c279b

verified ·

1 Parent(s): 293b493

Update process_interview.py

Browse files

Files changed (1) hide show

process_interview.py +65 -25

process_interview.py CHANGED Viewed

@@ -43,25 +43,40 @@ AUDIO_DIR = "./Uploads"
 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
-# API Keys (replace with actual keys or environment variables)
 PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
 def download_audio_from_url(url: str) -> str:
     """Downloads an audio file from a URL to a temporary local path."""
     try:
         temp_dir = tempfile.gettempdir()
         temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
         logger.info(f"Downloading audio from {url} to {temp_path}")
-        with requests.get(url, stream=True) as r:
             r.raise_for_status()
             with open(temp_path, 'wb') as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
         return temp_path
     except Exception as e:
-        logger.error(f"Failed to download audio from URL {url}: {e}")
         raise
 def initialize_services():
@@ -264,7 +279,7 @@ def train_role_classifier(utterances: List[Dict]):
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
             features.append(feat)
-            labels.append(0 if i % 2 == 0 else 1)  # Simplified for demo; replace with actual labels
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
         clf = RandomForestClassifier(
@@ -443,7 +458,7 @@ def generate_report(analysis_data: Dict) -> str:
         else:
             acceptance_line += "HR Verdict: Limited fit, significant improvement required."
         prompt = f"""
-        You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy. Focus on candidate suitability, strengths, and actionable recommendations.
         {acceptance_line}
         **1. Executive Summary**
         - Summarize performance, key metrics, and hiring potential.
@@ -466,12 +481,12 @@ def generate_report(analysis_data: Dict) -> str:
         - Suggest next steps for hiring managers (advance, train, assess).
         """
         response = gemini_model.generate_content(prompt)
-        return response.text
     except Exception as e:
         logger.error(f"Report generation failed: {str(e)}")
         return f"Error generating report: {str(e)}"
-def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
     try:
         doc = SimpleDocTemplate(output_path, pagesize=letter,
                                 rightMargin=0.75*inch, leftMargin=0.75*inch,
@@ -594,10 +609,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
         for line in lines:
             line = line.strip()
             if not line: continue
-            if re.match(r'\s*\*\*\s*\d*\.?\s*.*?)\s*\*\*', line):
-                section_match = re.search(r'\s*\*\*\s*\d*\.?\s*(.*?)\s*\*\*', line)
-                section_title = section_match.group(1).strip()
-                if section_title.startswith('Executive Summary'):
                     current_section = 'Executive Summary'
                     current_subsection = None
                 elif 'Communication' in section_title:
@@ -615,17 +632,18 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
             elif line.startswith(('-', '*', '•')) and current_section:
                 clean_line = line.lstrip('-*• ').strip()
                 if not clean_line: continue
                 if current_section == 'Competency':
-                    if any(k in clean_line.lower() for k in ['leadership', 'problem-solving', 'communication', 'adaptability', 'strength']):
                         current_subsection = 'Strengths'
-                    elif any(k in clean_line.lower() for k in ['improve', 'grow', 'depth', 'challenge']):
                         current_subsection = 'Growth Areas'
                     if current_subsection:
                         sections[current_section][current_subsection].append(clean_line)
                 elif current_section == 'Recommendations':
-                    if any(k in clean_line.lower() for k in ['communication', 'technical', 'depth', 'presence']):
                         current_subsection = 'Development'
-                    elif any(k in clean_line.lower() for k in ['advance', 'train', 'assess', 'next', 'mentor']):
                         current_subsection = 'Next Steps'
                     if current_subsection:
                         sections[current_section][current_subsection].append(clean_line)
@@ -683,12 +701,12 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
         else:
             story.append(Paragraph("No next steps provided.", body_text))
         story.append(Spacer(1, 0.2*inch))
-        story.append(Paragraph("This report provides actionable insights to support hiring and candidate development.", body_text))
         doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
         return True
     except Exception as e:
-        logger.error(f"PDF generation failed: {str(e)}")
         return False
 def convert_to_serializable(obj):
@@ -698,7 +716,7 @@ def convert_to_serializable(obj):
     if isinstance(obj, np.ndarray): return obj.tolist()
     return obj
-def process_interview(audio_path_or_url: str):
     local_audio_path = None
     wav_file = None
     is_downloaded = False
@@ -709,6 +727,8 @@ def process_interview(audio_path_or_url: str):
             is_downloaded = True
         else:
             local_audio_path = audio_path_or_url
         wav_file = convert_to_wav(local_audio_path)
         transcript = transcribe(wav_file)
         for utterance in transcript['utterances']:
@@ -737,19 +757,39 @@ def process_interview(audio_path_or_url: str):
         base_name = str(uuid.uuid4())
         pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
         json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
-        if create_pdf_report(analysis_data, pdf_path, gemini_report_text):
-            logger.info(f"PDF report generated at: {pdf_path}")
         with open(json_path, 'w') as f:
             serializable_data = convert_to_serializable(analysis_data)
             json.dump(serializable_data, f, indent=2)
         logger.info(f"Processing completed for {audio_path_or_url}")
         return {'pdf_path': pdf_path, 'json_path': json_path}
     except Exception as e:
-        logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}")
-        raise
     finally:
         if wav_file and os.path.exists(wav_file):
-            os.remove(wav_file)
         if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
-            os.remove(local_audio_path)
-            logger.info(f"Cleaned up temporary audio file: {local_audio_path}")

 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
+# API Keys
 PINECONE_KEY = os.getenv("PINECONE_KEY", "your-pinecone-key")
 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY", "your-assemblyai-key")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "your-gemini-key")
+def validate_url(url: str) -> bool:
+    """Check if the URL is accessible."""
+    try:
+        response = requests.head(url, timeout=5)
+        return response.status_code == 200
+    except requests.RequestException as e:
+        logger.error(f"URL validation failed for {url}: {str(e)}")
+        return False
 def download_audio_from_url(url: str) -> str:
     """Downloads an audio file from a URL to a temporary local path."""
+    if not validate_url(url):
+        logger.error(f"Invalid or inaccessible URL: {url}")
+        raise ValueError(f"Audio file not found at {url}")
     try:
         temp_dir = tempfile.gettempdir()
         temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
         logger.info(f"Downloading audio from {url} to {temp_path}")
+        with requests.get(url, stream=True, timeout=10) as r:
             r.raise_for_status()
             with open(temp_path, 'wb') as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
         return temp_path
+    except requests.HTTPError as e:
+        logger.error(f"HTTP error downloading audio from {url}: {str(e)}")
+        raise
     except Exception as e:
+        logger.error(f"Failed to download audio from URL {url}: {str(e)}")
         raise
 def initialize_services():
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
             features.append(feat)
+            labels.append(0 if i % 2 == 0 else 1)  # Simplified for demo
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
         clf = RandomForestClassifier(
         else:
             acceptance_line += "HR Verdict: Limited fit, significant improvement required."
         prompt = f"""
+        You are EvalBot, a senior HR consultant delivering a concise, professional interview analysis report. Use clear headings, bullet points ('-'), and avoid redundancy. Ensure text is clean and free of special characters that could break formatting.
         {acceptance_line}
         **1. Executive Summary**
         - Summarize performance, key metrics, and hiring potential.
         - Suggest next steps for hiring managers (advance, train, assess).
         """
         response = gemini_model.generate_content(prompt)
+        return re.sub(r'[^\x00-\x7F]+', '', response.text)  # Sanitize non-ASCII characters
     except Exception as e:
         logger.error(f"Report generation failed: {str(e)}")
         return f"Error generating report: {str(e)}"
+def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str) -> bool:
     try:
         doc = SimpleDocTemplate(output_path, pagesize=letter,
                                 rightMargin=0.75*inch, leftMargin=0.75*inch,
         for line in lines:
             line = line.strip()
             if not line: continue
+            # Simplified regex to avoid parenthesis issues
+            if line.startswith('**') and line.endswith('**'):
+                section_title = line.strip('**').strip()
+                if section_title.startswith(('1.', '2.', '3.', '4.', '5.')):
+                    section_title = section_title[2:].strip()
+                if 'Executive Summary' in section_title:
                     current_section = 'Executive Summary'
                     current_subsection = None
                 elif 'Communication' in section_title:
             elif line.startswith(('-', '*', '•')) and current_section:
                 clean_line = line.lstrip('-*• ').strip()
                 if not clean_line: continue
+                clean_line = re.sub(r'[()]', '', clean_line)  # Remove parentheses
                 if current_section == 'Competency':
+                    if any(k in clean_line.lower() for k in ['leader', 'problem', 'commun', 'adapt', 'strength']):
                         current_subsection = 'Strengths'
+                    elif any(k in clean_line.lower() for k in ['improv', 'grow', 'depth']):
                         current_subsection = 'Growth Areas'
                     if current_subsection:
                         sections[current_section][current_subsection].append(clean_line)
                 elif current_section == 'Recommendations':
+                    if any(k in clean_line.lower() for k in ['commun', 'tech', 'depth', 'pres']):
                         current_subsection = 'Development'
+                    elif any(k in clean_line.lower() for k in ['adv', 'train', 'assess', 'next', 'mentor']):
                         current_subsection = 'Next Steps'
                     if current_subsection:
                         sections[current_section][current_subsection].append(clean_line)
         else:
             story.append(Paragraph("No next steps provided.", body_text))
         story.append(Spacer(1, 0.2*inch))
         doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
+        logger.info(f"PDF report successfully generated at {output_path}")
         return True
     except Exception as e:
+        logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
         return False
 def convert_to_serializable(obj):
     if isinstance(obj, np.ndarray): return obj.tolist()
     return obj
+def process_interview(audio_path_or_url: str) -> Dict:
     local_audio_path = None
     wav_file = None
     is_downloaded = False
             is_downloaded = True
         else:
             local_audio_path = audio_path_or_url
+            if not os.path.exists(local_audio_path):
+                raise FileNotFoundError(f"Local audio file not found: {local_audio_path}")
         wav_file = convert_to_wav(local_audio_path)
         transcript = transcribe(wav_file)
         for utterance in transcript['utterances']:
         base_name = str(uuid.uuid4())
         pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
         json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
+        pdf_success = create_pdf_report(analysis_data, pdf_path, gemini_report_text)
         with open(json_path, 'w') as f:
             serializable_data = convert_to_serializable(analysis_data)
             json.dump(serializable_data, f, indent=2)
+        if not pdf_success:
+            logger.warning(f"PDF report failed to generate for {audio_path_or_url}")
+            return {
+                'pdf_path': None,
+                'json_path': json_path,
+                'error': 'PDF generation failed'
+            }
         logger.info(f"Processing completed for {audio_path_or_url}")
         return {'pdf_path': pdf_path, 'json_path': json_path}
     except Exception as e:
+        logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
+        base_name = str(uuid.uuid4())
+        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
+        with open(json_path, 'w') as f:
+            json.dump({'error': str(e)}, f, indent=2)
+        return {
+            'pdf_path': None,
+            'json_path': json_path,
+            'error': str(e)
+        }
     finally:
         if wav_file and os.path.exists(wav_file):
+            try:
+                os.remove(wav_file)
+            except Exception as e:
+                logger.error(f"Failed to clean up wav file {wav_file}: {str(e)}")
         if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
+            try:
+                os.remove(local_audio_path)
+                logger.info(f"Cleaned up temporary file: {local_audio_path}")
+            except Exception as e:
+                logger.error(f"Failed to clean up local audio file {local_audio_path}: {str(e)}")