norhan12 committed on
Commit
afeb72f
·
verified ·
1 Parent(s): e09ddd4

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +26 -19
process_interview.py CHANGED
@@ -35,12 +35,14 @@ import google.generativeai as genai
35
  import joblib
36
  from concurrent.futures import ThreadPoolExecutor
37
  from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
 
38
  # Setup logging
39
  logging.basicConfig(level=logging.INFO)
40
  logger = logging.getLogger(__name__)
41
  logging.getLogger("nemo_logging").setLevel(logging.ERROR)
42
  logging.getLogger("nemo").setLevel(logging.ERROR)
43
 
 
44
  # Configuration
45
  AUDIO_DIR = "./uploads"
46
  OUTPUT_DIR = "./processed_audio"
@@ -112,17 +114,24 @@ def load_models():
112
  speaker_model, nlp, tokenizer, llm_model = load_models()
113
 
114
 
115
- # Audio processing functions
116
- def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
117
  try:
118
- audio = AudioSegment.from_file(audio_path)
119
- if audio.channels > 1:
120
- audio = audio.set_channels(1)
121
- audio = audio.set_frame_rate(16000)
 
 
 
 
 
 
 
 
122
 
123
- wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
124
- audio.export(wav_file, format="wav")
125
- return wav_file
126
  except Exception as e:
127
  logger.error(f"Audio conversion failed: {str(e)}")
128
  raise
@@ -1032,7 +1041,6 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
1032
  logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
1033
  return False
1034
 
1035
-
1036
  def convert_to_serializable(obj):
1037
  if isinstance(obj, np.generic):
1038
  return obj.item()
@@ -1044,7 +1052,6 @@ def convert_to_serializable(obj):
1044
  return obj.tolist()
1045
  return obj
1046
 
1047
-
1048
  def process_interview(audio_path: str):
1049
  try:
1050
  logger.info(f"Starting processing for {audio_path}")
@@ -1053,6 +1060,12 @@ def process_interview(audio_path: str):
1053
 
1054
  logger.info("Starting transcription")
1055
  transcript = transcribe(wav_file)
 
 
 
 
 
 
1056
 
1057
  logger.info("Extracting prosodic features")
1058
  for utterance in transcript['utterances']:
@@ -1066,9 +1079,6 @@ def process_interview(audio_path: str):
1066
  utterances_with_speakers = identify_speakers(transcript, wav_file)
1067
 
1068
  logger.info("Classifying roles")
1069
- # Ensure role classifier models are loaded/trained only once if possible,
1070
- # or handled carefully in a multi-threaded context.
1071
- # For simplicity, keeping it inside process_interview for now.
1072
  if os.path.exists(os.path.join(OUTPUT_DIR, 'role_classifier.pkl')):
1073
  clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
1074
  vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
@@ -1091,10 +1101,8 @@ def process_interview(audio_path: str):
1091
  }
1092
  }
1093
 
1094
- # --- Calculate Acceptance Probability ---
1095
  acceptance_probability = calculate_acceptance_probability(analysis_data)
1096
  analysis_data['acceptance_probability'] = acceptance_probability
1097
- # --- End Acceptance Probability ---
1098
 
1099
  logger.info("Generating report text using Gemini")
1100
  gemini_report_text = generate_report(analysis_data)
@@ -1108,7 +1116,7 @@ def process_interview(audio_path: str):
1108
  serializable_data = convert_to_serializable(analysis_data)
1109
  json.dump(serializable_data, f, indent=2)
1110
 
1111
- os.remove(wav_file) # Clean up WAV file after processing
1112
 
1113
  logger.info(f"Processing completed for {audio_path}")
1114
  return {
@@ -1117,7 +1125,6 @@ def process_interview(audio_path: str):
1117
  }
1118
  except Exception as e:
1119
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
1120
- # Clean up wav_file in case of error
1121
  if 'wav_file' in locals() and os.path.exists(wav_file):
1122
  os.remove(wav_file)
1123
- raise
 
35
  import joblib
36
  from concurrent.futures import ThreadPoolExecutor
37
  from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
38
+ import subprocess
39
  # Setup logging
40
  logging.basicConfig(level=logging.INFO)
41
  logger = logging.getLogger(__name__)
42
  logging.getLogger("nemo_logging").setLevel(logging.ERROR)
43
  logging.getLogger("nemo").setLevel(logging.ERROR)
44
 
45
+
46
  # Configuration
47
  AUDIO_DIR = "./uploads"
48
  OUTPUT_DIR = "./processed_audio"
 
114
  speaker_model, nlp, tokenizer, llm_model = load_models()
115
 
116
 
117
def convert_to_wav(input_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert a media file to a 16 kHz, mono, 16-bit PCM WAV file via ffmpeg.

    Args:
        input_path: Path of the audio (or video) file to convert.
        output_dir: Directory that receives the WAV file; created if missing.

    Returns:
        Path of the newly written WAV file (named with a random UUID).

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be launched, or the output directory/file
            could not be created or inspected.
    """
    output_path = None
    try:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        command = [
            'ffmpeg', '-y',
            '-i', input_path,
            '-vn',  # ignore video stream completely
            '-acodec', 'pcm_s16le',
            '-ar', '16000',
            '-ac', '1',
            output_path,
        ]
        # stdin is detached so ffmpeg cannot block on (or consume) the parent
        # process's input; stderr is captured so a failure can be diagnosed
        # from the log instead of only reporting the exit status.
        subprocess.run(
            command,
            check=True,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )

        size_in_mb = os.path.getsize(output_path) / (1024 * 1024)
        logger.info(f"WAV file size: {size_in_mb:.2f} MB")
        return output_path
    except Exception as e:
        # CalledProcessError carries ffmpeg's stderr; other exceptions do not.
        ffmpeg_stderr = getattr(e, "stderr", None)
        if ffmpeg_stderr:
            logger.error("ffmpeg stderr: %s", ffmpeg_stderr.strip())
        logger.error(f"Audio conversion failed: {str(e)}")
        # Do not leave a truncated/partial WAV behind after a failed run.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove partial WAV file %s: %s",
                    output_path,
                    cleanup_error,
                )
        raise
 
1041
  logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
1042
  return False
1043
 
 
1044
  def convert_to_serializable(obj):
1045
  if isinstance(obj, np.generic):
1046
  return obj.item()
 
1052
  return obj.tolist()
1053
  return obj
1054
 
 
1055
  def process_interview(audio_path: str):
1056
  try:
1057
  logger.info(f"Starting processing for {audio_path}")
 
1060
 
1061
  logger.info("Starting transcription")
1062
  transcript = transcribe(wav_file)
1063
+ logger.info("Transcript result: %s", transcript)
1064
+
1065
+ # Check transcript validity
1066
+ if not transcript or 'utterances' not in transcript or not transcript['utterances']:
1067
+ logger.error("Transcription failed or returned empty utterances")
1068
+ raise ValueError("Transcription failed or returned empty utterances")
1069
 
1070
  logger.info("Extracting prosodic features")
1071
  for utterance in transcript['utterances']:
 
1079
  utterances_with_speakers = identify_speakers(transcript, wav_file)
1080
 
1081
  logger.info("Classifying roles")
 
 
 
1082
  if os.path.exists(os.path.join(OUTPUT_DIR, 'role_classifier.pkl')):
1083
  clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
1084
  vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
 
1101
  }
1102
  }
1103
 
 
1104
  acceptance_probability = calculate_acceptance_probability(analysis_data)
1105
  analysis_data['acceptance_probability'] = acceptance_probability
 
1106
 
1107
  logger.info("Generating report text using Gemini")
1108
  gemini_report_text = generate_report(analysis_data)
 
1116
  serializable_data = convert_to_serializable(analysis_data)
1117
  json.dump(serializable_data, f, indent=2)
1118
 
1119
+ os.remove(wav_file)
1120
 
1121
  logger.info(f"Processing completed for {audio_path}")
1122
  return {
 
1125
  }
1126
  except Exception as e:
1127
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
 
1128
  if 'wav_file' in locals() and os.path.exists(wav_file):
1129
  os.remove(wav_file)
1130
+ raise