norhan12 commited on
Commit
abafa67
·
verified ·
1 Parent(s): dbde83d

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +16 -9
process_interview.py CHANGED
@@ -69,7 +69,6 @@ def download_audio_from_url(url: str) -> str:
69
 
70
 
71
 
72
- # Initialize services
73
  def initialize_services():
74
  # Pinecone
75
  pc = Pinecone(api_key=PINECONE_KEY)
@@ -83,12 +82,17 @@ def initialize_services():
83
  )
84
  index = pc.Index(index_name)
85
 
 
 
 
 
 
 
86
  # Gemini
87
  genai.configure(api_key=GEMINI_API_KEY)
88
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
89
 
90
  return index, gemini_model
91
-
92
  index, gemini_model = initialize_services()
93
 
94
  # Device setup
@@ -228,13 +232,14 @@ def process_utterance(utterance, full_audio, wav_file):
228
  )
229
 
230
  # Identify speaker
231
- if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
232
  speaker_id = query_result['matches'][0]['id']
233
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
234
  else:
235
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
236
  speaker_name = f"Speaker_{speaker_id[-4:]}"
237
- index.upsert([(speaker_id, embedding.tolist(), {"speaker_name": speaker_name})])
 
238
 
239
  # Cleanup
240
  os.remove(temp_path)
@@ -275,7 +280,7 @@ def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
275
  # Role classification
276
  def train_role_classifier(utterances: List[Dict]):
277
  try:
278
- # Prepare data
279
  texts = [u['text'] for u in utterances]
280
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
281
  X_text = vectorizer.fit_transform(texts)
@@ -303,16 +308,19 @@ def train_role_classifier(utterances: List[Dict]):
303
 
304
  # Linguistic features
305
  doc = nlp(utterance['text'])
 
 
306
  feat.extend([
307
- int(utterance['text'].endswith('?')),
308
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
309
  len(utterance['text'].split()),
310
  sum(1 for token in doc if token.pos_ == 'VERB'),
311
  sum(1 for token in doc if token.pos_ == 'NOUN')
312
  ])
313
 
314
  features.append(feat)
315
- labels.append(0 if i % 2 == 0 else 1) # Temporary labeling
 
316
 
317
  # Train classifier
318
  scaler = StandardScaler()
@@ -336,7 +344,6 @@ def train_role_classifier(utterances: List[Dict]):
336
  logger.error(f"Classifier training failed: {str(e)}")
337
  raise
338
 
339
-
340
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
341
  try:
342
  # Prepare features for classification
 
69
 
70
 
71
 
 
72
  def initialize_services():
73
  # Pinecone
74
  pc = Pinecone(api_key=PINECONE_KEY)
 
82
  )
83
  index = pc.Index(index_name)
84
 
85
+ # Delete any old data (optional)
86
+ try:
87
+ index.delete(delete_all=True)
88
+ except Exception as e:
89
+ logger.warning(f"Could not clear index: {str(e)}")
90
+
91
  # Gemini
92
  genai.configure(api_key=GEMINI_API_KEY)
93
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
94
 
95
  return index, gemini_model
 
96
  index, gemini_model = initialize_services()
97
 
98
  # Device setup
 
232
  )
233
 
234
  # Identify speaker
235
+ if query_result['matches'] and query_result['matches'][0]['score'] > 0.5: # تخفيض العتبة
236
  speaker_id = query_result['matches'][0]['id']
237
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
238
  else:
239
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
240
  speaker_name = f"Speaker_{speaker_id[-4:]}"
241
+ # Add the new speaker to the index
242
+ index.upsert([(speaker_id, embedding.cpu().numpy().tolist(), {"speaker_name": speaker_name})])
243
 
244
  # Cleanup
245
  os.remove(temp_path)
 
280
  # Role classification
281
  def train_role_classifier(utterances: List[Dict]):
282
  try:
283
+ # Analyze the content to distinguish questions (interviewer) from answers (interviewee)
284
  texts = [u['text'] for u in utterances]
285
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
286
  X_text = vectorizer.fit_transform(texts)
 
308
 
309
  # Linguistic features
310
  doc = nlp(utterance['text'])
311
+ is_question = int(utterance['text'].endswith('?'))
312
+ question_words = len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower()))
313
  feat.extend([
314
+ is_question,
315
+ question_words,
316
  len(utterance['text'].split()),
317
  sum(1 for token in doc if token.pos_ == 'VERB'),
318
  sum(1 for token in doc if token.pos_ == 'NOUN')
319
  ])
320
 
321
  features.append(feat)
322
+ # Classify based on whether the text is a question (interviewer) or not (interviewee)
323
+ labels.append(0 if is_question or question_words > 0 else 1)
324
 
325
  # Train classifier
326
  scaler = StandardScaler()
 
344
  logger.error(f"Classifier training failed: {str(e)}")
345
  raise
346
 
 
347
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
348
  try:
349
  # Prepare features for classification