Ranam Hamoud committed on
Commit 0b42831 · 1 Parent(s): 8528e25

Update files and add .gitignore, remove pycache from tracking
.gitignore ADDED
@@ -0,0 +1,3 @@
+ pipeline_scores.png
+ plot_component_accuracy.py
+ __pycache__/
__pycache__/audio_classifier.cpython-313.pyc DELETED
Binary file (19.2 kB)
 
__pycache__/pipeline.cpython-313.pyc DELETED
Binary file (8.3 kB)
 
__pycache__/plagiarism_detection.cpython-313.pyc DELETED
Binary file (15.7 kB)
 
__pycache__/speech_recognizer.cpython-313.pyc DELETED
Binary file (16 kB)
 
__pycache__/text_analyzer.cpython-313.pyc DELETED
Binary file (5.56 kB)
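Note that ignoring `__pycache__/` does not by itself untrack bytecode that was committed earlier, which is why the five `.pyc` files above are deleted from tracking in the same commit. A minimal sketch of that step, assuming `git` is on PATH (the helper name is hypothetical):

```python
# Hypothetical helper mirroring this commit: untrack bytecode that was
# committed before __pycache__/ was added to .gitignore.
import subprocess

def untrack_pycache() -> None:
    # "git rm -r --cached" removes entries from the index but keeps them on disk
    subprocess.run(["git", "rm", "-r", "--cached", "__pycache__"], check=True)
    subprocess.run(["git", "commit", "-m",
                    "remove pycache from tracking"], check=True)

if __name__ == "__main__":
    untrack_pycache()
```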
 
app.py CHANGED
@@ -3,36 +3,430 @@ import os
  from pipeline import AuthenticityDetectionPipeline
  import traceback

  try:
      pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
      pipeline_ready = True
- except Exception:
      pipeline_ready = False


  def analyze_audio_file(audio_file):
      if not pipeline_ready:
-         return (
-             "Error: Pipeline not initialized. Please check the installation.",
-             "", "", "", ""
-         )

      if audio_file is None:
-         return (
-             "Please upload an audio file.",
-             "", "", "", ""
-         )

      try:
          language_code = None
          results = pipeline.analyze_audio(audio_file, language=language_code)

          audio_class = results['audio_classification']
          asr = results['speech_recognition']
          text_auth = results['text_authenticity']
          final = results['final_assessment']

-
          verdict_color = {
              "AUTHENTIC": "#10b981",
              "LIKELY AUTHENTIC": "#3b82f6",
@@ -42,6 +436,7 @@ def analyze_audio_file(audio_file):

          color = verdict_color.get(final['verdict'], '#6b7280')

          overall_status = f"""
          <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
              <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
@@ -66,175 +461,11 @@ def analyze_audio_file(audio_file):
          </div>
          </div>
          """
-         acoustic_output = audio_class['interpretation']
-
-         transcription_output = "### Speech Transcription\n\n"
-         transcription_output += f"| Metric | Value |\n"
-         transcription_output += f"|--------|-------|\n"
-         transcription_output += f"| **Language** | {asr['language'].upper()} |\n"
-         transcription_output += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
-         transcription_output += f"| **Word Count** | {asr['word_count']} words |\n"
-         transcription_output += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
-         if asr['speech_rate'] > 160:
-             transcription_output += "**Fast speech rate** - Above average speaking speed\n\n"
-         elif asr['speech_rate'] < 120:
-             transcription_output += "**Slow speech rate** - Below average speaking speed\n\n"
-         else:
-             transcription_output += "**Normal speech rate** - Average conversational pace\n\n"
-
-         transcription_output += "---\n\n"
-         transcription_output += "#### Full Transcription\n\n"
-         transcription_output += f"> {asr['transcription']}"
-
-         if 'kopparapu_score' in asr:
-             classification = asr['kopparapu_classification'].upper()
-             confidence = asr['kopparapu_score'] if asr['kopparapu_score'] >= 0.5 else (1 - asr['kopparapu_score'])
-
-             speech_patterns = f" ### **Classification: {classification} SPEECH**\n\n"
-             speech_patterns += f"**Score:** {asr['kopparapu_score']:.3f} (0=spontaneous, 1=read)\n"
-             speech_patterns += f"**Confidence:** {confidence*100:.1f}%\n\n"
-
-             speech_patterns += "---\n\n"
-             speech_patterns += "#### Linguistic Metrics\n\n"
-             kf = asr['kopparapu_features']
-
-             speech_patterns += "| Feature | Value | Interpretation |\n"
-             speech_patterns += "|---------|-------|----------------|\n"
-             speech_patterns += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
-             if kf['chars_per_word'] > 5.5:
-                 speech_patterns += "Complex vocabulary |\n"
-             elif kf['chars_per_word'] < 4.5:
-                 speech_patterns += "Simple vocabulary |\n"
-             else:
-                 speech_patterns += "Average complexity |\n"
-
-             speech_patterns += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
-             if kf['words_per_sec'] > 3:
-                 speech_patterns += "Fast pacing |\n"
-             elif kf['words_per_sec'] < 2:
-                 speech_patterns += "Slow pacing |\n"
-             else:
-                 speech_patterns += "Normal pacing |\n"
-
-             speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
-             if kf['filler_rate'] > 0.05:
-                 speech_patterns += "High (spontaneous) |\n"
-             elif kf['filler_rate'] < 0.02:
-                 speech_patterns += "Low (scripted) |\n"
-             else:
-                 speech_patterns += "Moderate |\n"
-
-             speech_patterns += f"| **Repetitions** | {kf['repetition_count']} | "
-             if kf['repetition_count'] > 3:
-                 speech_patterns += "Multiple (thinking aloud) |\n"
-             elif kf['repetition_count'] == 0:
-                 speech_patterns += "None (prepared) |\n"
-             else:
-                 speech_patterns += "Few |\n"
-
-             speech_patterns += "\n---\n\n"
-             speech_patterns += "#### Reading Style Indicators\n\n"
-
-             speech_patterns += "| Feature | Value | Interpretation |\n"
-             speech_patterns += "|---------|-------|----------------|\n"
-
-             # Pause regularity
-             pause_reg = kf.get('pause_regularity', 0.5)
-             speech_patterns += f"| **Pause Regularity** | {pause_reg:.2f} | "
-             if pause_reg > 0.7:
-                 speech_patterns += "Very regular (read) |\n"
-             elif pause_reg > 0.4:
-                 speech_patterns += "Moderate |\n"
-             else:
-                 speech_patterns += "Irregular (spontaneous) |\n"
-
-             # Speech rate variability
-             rate_var = kf.get('speech_rate_variability', 0.0)
-             speech_patterns += f"| **Rate Variability** | {rate_var:.2f} | "
-             if rate_var > 0.6:
-                 speech_patterns += "High (spontaneous) |\n"
-             elif rate_var > 0.3:
-                 speech_patterns += "Moderate |\n"
-             else:
-                 speech_patterns += "Steady pace (read) |\n"
-
-             # Sentence variance
-             sent_var = kf.get('sentence_length_variance', 0.0)
-             speech_patterns += f"| **Sentence Variance** | {sent_var:.2f} | "
-             if sent_var > 0.5:
-                 speech_patterns += "Variable (spontaneous) |\n"
-             elif sent_var > 0.25:
-                 speech_patterns += "Moderate |\n"
-             else:
-                 speech_patterns += "Uniform (read) |\n"
-
-             # Self-corrections
-             corrections = kf.get('self_correction_count', 0)
-             speech_patterns += f"| **Self-Corrections** | {corrections} | "
-             if corrections > 2:
-                 speech_patterns += "Multiple (spontaneous) |\n"
-             elif corrections > 0:
-                 speech_patterns += "Few |\n"
-             else:
-                 speech_patterns += "None (scripted) |\n"
-
-             speech_patterns += "\n"
-
-             speech_patterns += "---\n\n"
-             speech_patterns += "#### Filler Words & Disfluencies\n\n"
-             filler_ratio = asr['filler_words']['ratio']
-             speech_patterns += f"**Count:** {asr['filler_words']['count']} filler words\n"
-             speech_patterns += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
-
-             if asr['filler_words']['details']:
-                 speech_patterns += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
-
-             if filler_ratio > 0.05:
-                 speech_patterns += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
-             elif filler_ratio < 0.02:
-                 speech_patterns += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
-             else:
-                 speech_patterns += "**Moderate filler usage** - Normal conversational pattern\n\n"
-
-             speech_patterns += "---\n\n"
-             speech_patterns += "#### Pause Patterns\n\n"
-             pause_var = asr['pause_patterns']['pause_variability']
-
-             speech_patterns += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
-             speech_patterns += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
-             speech_patterns += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
-             speech_patterns += f"**Variability:** {pause_var:.2f}\n\n"
-
-             if pause_var < 0.3:
-                 speech_patterns += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
-             elif pause_var > 0.6:
-                 speech_patterns += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
-             else:
-                 speech_patterns += "**Moderate variability** - Mixed pattern\n\n"
-
-         is_ai = text_auth['ai_detection']['ai_generated']
-         ai_prob = text_auth['ai_detection']['confidence']
-
-         if is_ai:
-             ai_output = "### **AI-GENERATED LIKELY**\n\n"
-         else:
-             ai_output = "### **HUMAN-WRITTEN LIKELY**\n\n"
-
-         ai_output += "**Confidence:**\n\n"
-         bar_length = 30
-         ai_bars = int(ai_prob * bar_length)
-         human_bars = bar_length - ai_bars
-         ai_output += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
-         ai_output += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
-
-         ai_output += "---\n\n"
-         ai_output += "#### Interpretation\n\n"
-         ai_interpretation = text_auth['ai_detection'].get('interpretation', 'No interpretation available.')
-         if ai_interpretation:
-             ai_output += ai_interpretation
-         else:
-             ai_output += "No interpretation available."

          return (
              overall_status,
@@ -245,11 +476,22 @@ def analyze_audio_file(audio_file):
          )

      except Exception as e:
-         error_msg = f"Error during analysis:\n\n{str(e)}\n\n{traceback.format_exc()}"
-         return (error_msg, "", "", "", "", "")


  def create_interface():
      custom_css = """
      @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
@@ -257,26 +499,6 @@ def create_interface():
      font-family: 'IBM Plex Sans', sans-serif !important;
      background: white !important;
      }
-     .contain {
-         max-width: 100% !important;
-         width: 100% !important;
-         margin: 0 auto !important;
-         background: white !important;
-         padding: 0 !important;
-     }
-     .tab-nav button {
-         font-family: 'IBM Plex Sans', sans-serif;
-         font-size: 14px;
-         font-weight: 500;
-         padding: 10px 16px;
-         border-radius: 8px 8px 0 0;
-         transition: all 0.2s;
-     }
-     .tab-nav button.selected {
-         background: #2563eb;
-         color: white;
-         font-weight: 600;
-     }
      button.primary, .primary {
          background: #2563eb !important;
          color: white !important;
@@ -285,23 +507,12 @@ def create_interface():
          font-weight: 600 !important;
          padding: 12px 24px !important;
          border-radius: 8px !important;
-         transition: all 0.2s !important;
-     }
-     button.primary:hover, .primary:hover {
-         background: #1d4ed8 !important;
-     }
-     .markdown-text {
-         font-family: 'IBM Plex Sans', sans-serif;
-         line-height: 1.7;
-     }
-     h1, h2, h3, h4 {
-         font-family: 'IBM Plex Sans', sans-serif;
-         font-weight: 600;
      }
      """

      with gr.Blocks(title="Authenticity Detection System") as demo:

          gr.HTML(f"""
          <style>
          {custom_css}
@@ -309,17 +520,6 @@ def create_interface():
          <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
          <div style='padding: 16px 0;'>
          <div style='display: flex; align-items: center; gap: 12px;'>
-         <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="32" height="32">
-             <defs>
-                 <linearGradient id="g" x1="0" y1="0" x2="64" y2="0" gradientUnits="userSpaceOnUse">
-                     <stop offset="0" stop-color="#1d4ed8" />
-                     <stop offset="1" stop-color="#0ea5e9" />
-                 </linearGradient>
-             </defs>
-             <rect x="0" y="0" width="64" height="64" rx="12" fill="#ffffff"/>
-             <path d="M4 32 C 10 18, 18 46, 24 32 S 36 18, 40 32 52 46, 60 32"
-                 fill="none" stroke="url(#g)" stroke-width="4" stroke-linecap="round" stroke-linejoin="round"/>
-         </svg>
          <div>
          <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
          LEIDEN UNIVERSITY · LIACS
@@ -337,8 +537,6 @@ def create_interface():
          <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
          Detecting AI-Assisted Responses in Online Settings
          </h2>
-         <p style='font-size: 18px; color: #374151; margin: 0 0 24px 0;'>
-         </p>
          <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
          <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
          Multi-Modal Analysis
@@ -351,15 +549,17 @@ def create_interface():
          </section>
          """)

          with gr.Row():
              with gr.Column(scale=1):
                  gr.HTML("""
-                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
                      <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
                      <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
                  </div>
                  """)

                  audio_input = gr.Audio(
                      sources=["upload", "microphone"],
                      type="filepath",
@@ -367,12 +567,14 @@ def create_interface():
                      show_label=False
                  )

                  analyze_btn = gr.Button(
                      "Analyze Audio",
                      variant="primary",
                      size="lg"
                  )

                  gr.HTML("""
                  <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
                      <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
@@ -381,40 +583,34 @@ def create_interface():
                      <li><strong>Duration:</strong> 30 sec - 5 min</li>
                  </ul>
                  </div>
-
-                 <div style='background: #fef3c7; border: 1px solid #fbbf24; padding: 16px; border-radius: 12px; margin-top: 16px;'>
-                     <div style='font-size: 12px; color: #92400e; line-height: 1.6;'>
-                         <strong>Note:</strong> Provides probabilistic assessments.
-                         Use as one factor in evaluation.
-                     </div>
-                 </div>
                  """)

              with gr.Column(scale=2):
                  gr.HTML("""
-                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
                      <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
                      <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
                  </div>
                  """)

-                 overall_output = gr.Markdown()

                  with gr.Tabs() as tabs:
                      with gr.Tab("Acoustic Features"):
-                         acoustic_output = gr.Markdown()

                      with gr.Tab("Transcription"):
-                         transcription_output = gr.Markdown()

                      with gr.Tab("Speech Patterns"):
-                         speech_output = gr.Markdown()

                      with gr.Tab("AI Detection"):
-                         ai_output = gr.Markdown()
-

-         # Add example audio files with caching
          gr.HTML("""
          <div style='margin-top: 20px; margin-bottom: 10px;'>
              <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>
@@ -424,8 +620,8 @@ def create_interface():
          examples_dir = os.path.join(os.path.dirname(__file__), "examples")
          gr.Examples(
              examples=[
-                 [os.path.join(examples_dir, "read1.ogg")],
-                 [os.path.join(examples_dir, "spontaneous1.ogg")]
              ],
              inputs=[audio_input],
              outputs=[
@@ -438,31 +634,26 @@ def create_interface():
              fn=analyze_audio_file,
              label="",
              examples_per_page=2,
-             cache_examples=True
          )

          def show_loading():
              loading_html = """
              <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border: 2px solid #667eea; padding: 30px; border-radius: 16px; margin: 10px 0; text-align: center;'>
                  <h2 style='color: white; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
                      Analyzing...
                  </h2>
-                 <div style='margin-top: 20px;'>
-                     <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out infinite;'></div>
-                     <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.2s infinite;'></div>
-                     <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.4s infinite;'></div>
-                 </div>
              </div>
-             <style>
-             @keyframes pulse {
-                 0%, 100% { opacity: 0.3; transform: scale(0.8); }
-                 50% { opacity: 1; transform: scale(1.2); }
-             }
-             </style>
              """
-             loading_msg = " **Processing...**"
-             return loading_html, loading_msg, loading_msg, loading_msg, loading_msg

          analyze_btn.click(
              fn=show_loading,
              inputs=None,
@@ -486,13 +677,12 @@ def create_interface():
              ]
          )

          gr.HTML("""
          <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
              <div style='text-align: center;'>
                  <p style='margin: 0; font-size: 14px; color: #6b7280;'>
                  </p>
-                 <p style='margin: 8px 0 0 0; font-size: 13px; color: #9ca3af;'>
-                 </p>
              </div>
          </footer>
          """)
@@ -500,6 +690,7 @@ def create_interface():
      return demo


  if __name__ == "__main__":
      demo = create_interface()
      demo.launch(
@@ -508,4 +699,3 @@ if __name__ == "__main__":
          share=False,
          show_error=True
      )
-
  from pipeline import AuthenticityDetectionPipeline
  import traceback

+ # initialize the pipeline on startup
  try:
      pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
      pipeline_ready = True
+ except Exception as e:
      pipeline_ready = False
+     pipeline_error = str(e)
+     import traceback
+     print(f"Could not start pipeline: {e}")
+     traceback.print_exc()

+ # build the acoustic features display HTML
+ def build_acoustic_features_display(audio_class):
+     classification = audio_class['classification']
+     confidence = audio_class['confidence']
+     cnn_class = audio_class['cnn_classification']
+     cnn_conf = audio_class['cnn_confidence']
+     prosody_class = audio_class['prosody_classification']
+     prosody_conf = audio_class['prosody_confidence']
+     prosody_scores = audio_class.get('prosody_scores', {})
+     acoustic_features = audio_class.get('acoustic_features', {})
+
+     # color scheme based on classification
+     if classification == 'spontaneous':
+         main_color = '#10b981'
+         bg_color = '#ecfdf5'
+         label = 'SPONTANEOUS'
+     else:
+         main_color = '#f59e0b'
+         bg_color = '#fffbeb'
+         label = 'READ'
+
+     cnn_color = '#10b981' if cnn_class == 'spontaneous' else '#f59e0b'
+     prosody_color = '#10b981' if prosody_class == 'spontaneous' else '#f59e0b'
+
+     # build main classification header
+     output = f"""
+     <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+         <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label} SPEECH</h3>
+         <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Combined acoustic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+     </div>
+
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Analysis Components</h4>
+         <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                 <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">CNN Neural Network</div>
+                 <div style="font-size: 20px; font-weight: 700; color: {cnn_color}; margin-bottom: 8px;">{cnn_class.upper()}</div>
+                 <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                     <div style="height: 100%; width: {cnn_conf*100:.0f}%; background: {cnn_color}; border-radius: 6px;"></div>
+                 </div>
+                 <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{cnn_conf*100:.1f}% confidence</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                 <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">Prosody Analysis</div>
+                 <div style="font-size: 20px; font-weight: 700; color: {prosody_color}; margin-bottom: 8px;">{prosody_class.upper()}</div>
+                 <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                     <div style="height: 100%; width: {prosody_conf*100:.0f}%; background: {prosody_color}; border-radius: 6px;"></div>
+                 </div>
+                 <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{prosody_conf*100:.1f}% confidence</div>
+             </div>
+         </div>
+     </div>
+
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Prosody Feature Breakdown</h4>
+     """
+
+     # feature descriptions
+     feature_info = {
+         'spectral_variability': {'name': 'Spectral Variability', 'unit': 'Hz', 'description': 'Variation in frequency content over time'},
+         'zcr_mean': {'name': 'Zero Crossing Rate', 'unit': 'ratio', 'description': 'Rate of signal sign changes'},
+         'energy_level': {'name': 'Energy Level', 'unit': 'RMS', 'description': 'Overall loudness and intensity'},
+         'tempo': {'name': 'Speech Tempo', 'unit': 'BPM', 'description': 'Rhythmic pacing of speech'}
+     }
+
+     # add feature details
+     for key, info in feature_info.items():
+         if key in prosody_scores:
+             score_data = prosody_scores[key]
+             score = score_data['score']
+             value = score_data['value']
+             interp = score_data['interpretation']
+             unit = info['unit']
+
+             bar_color = '#10b981' if score < 0.4 else '#f59e0b' if score > 0.6 else '#6b7280'
+             indicator_position = score * 100
+
+             output += f"""
+             <div style="background: #f9fafb; border-radius: 10px; padding: 14px; margin-bottom: 10px;">
+                 <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 8px;">
+                     <div>
+                         <div style="font-weight: 600; color: #1f2937; font-size: 14px;">{info['name']}</div>
+                         <div style="font-size: 11px; color: #9ca3af;">{info['description']}</div>
+                     </div>
+                     <div style="text-align: right;">
+                         <div style="font-size: 13px; font-weight: 600; color: {bar_color};">{interp}</div>
+                         <div style="font-size: 11px; color: #6b7280;">{value:.3f} <span style="color: #9ca3af;">{unit}</span></div>
+                     </div>
+                 </div>
+                 <div style="position: relative; background: linear-gradient(to right, #10b981, #6b7280, #f59e0b); border-radius: 4px; height: 6px; margin: 10px 0 6px 0;">
+                     <div style="position: absolute; left: {indicator_position}%; top: -4px; transform: translateX(-50%); width: 14px; height: 14px; background: white; border: 2px solid {bar_color}; border-radius: 50%; box-shadow: 0 1px 3px rgba(0,0,0,0.15);"></div>
+                 </div>
+                 <div style="display: flex; justify-content: space-between; font-size: 10px; color: #9ca3af;">
+                     <span>Spontaneous</span>
+                     <span>Read</span>
+                 </div>
+             </div>
+             """
+
+     output += "</div>"
+
+     # add raw acoustic measurements
+     output += """
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Raw Acoustic Measurements</h4>
+         <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+     """
+
+     if acoustic_features:
+         metrics = [
+             ('Tempo', f"{acoustic_features.get('tempo', 0):.1f}", 'BPM'),
+             ('Pitch Mean', f"{acoustic_features.get('pitch_mean', 0):.1f}", 'Hz'),
+             ('Energy Mean', f"{acoustic_features.get('energy_mean', 0):.4f}", ''),
+             ('ZCR Mean', f"{acoustic_features.get('zcr_mean', 0):.4f}", ''),
+         ]
+         for name, value, unit in metrics:
+             output += f"""
+             <div style="background: #f9fafb; border-radius: 8px; padding: 12px; text-align: center;">
+                 <div style="font-size: 16px; font-weight: 600; color: #1f2937;">{value}</div>
+                 <div style="font-size: 10px; color: #6b7280; margin-top: 2px;">{name} {unit}</div>
+             </div>
+             """
+
+     output += """
+         </div>
+     </div>
+     """
+
+     return output
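A note on the color strings used throughout these builders: expressions like `{main_color}33` append a two-digit hex alpha byte to a six-digit CSS color, producing an 8-digit `#RRGGBBAA` value. A standalone illustration:

```python
# "#RRGGBB" + "AA" yields an 8-digit hex color with alpha:
# 0x33 = 51/255 ≈ 20% opacity, 0x10 = 16/255 ≈ 6% opacity.
base = "#10b981"
border = f"{base}33"      # soft border tint, as in the card styles above
background = f"{base}10"  # very light fill, as in the label chips
print(border, background)  # #10b98133 #10b98110
```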


+ # build the transcription display HTML
+ def build_transcription_display(asr):
+     # determine speech rate interpretation
+     if asr['speech_rate'] > 160:
+         rate_color = '#f59e0b'
+         rate_label = 'Fast'
+         rate_desc = 'Above average speaking speed'
+     elif asr['speech_rate'] < 120:
+         rate_color = '#3b82f6'
+         rate_label = 'Slow'
+         rate_desc = 'Below average speaking speed'
+     else:
+         rate_color = '#10b981'
+         rate_label = 'Normal'
+         rate_desc = 'Average conversational pace'
+
+     output = f"""
+     <div style="background: linear-gradient(135deg, #eff6ff 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid #3b82f633;">
+         <h3 style="margin: 0; color: #1e40af; font-size: 22px; font-weight: 700;">Speech Transcription</h3>
+         <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Detected language: <strong>{asr['language'].upper()}</strong></p>
+     </div>
+
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Speech Metrics</h4>
+         <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 16px;">
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                 <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['duration']:.1f}</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Duration (sec)</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                 <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['word_count']}</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                 <div style="font-size: 24px; font-weight: 700; color: {rate_color};">{asr['speech_rate']:.0f}</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words/min</div>
+             </div>
+             <div style="background: {rate_color}15; border-radius: 10px; padding: 16px; text-align: center; border: 1px solid {rate_color}33;">
+                 <div style="font-size: 18px; font-weight: 700; color: {rate_color};">{rate_label}</div>
+                 <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">{rate_desc}</div>
+             </div>
+         </div>
+     </div>
+
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Full Transcription</h4>
+         <div style="background: #f9fafb; border-radius: 10px; padding: 20px; border-left: 4px solid #3b82f6;">
+             <p style="margin: 0; font-size: 15px; line-height: 1.8; color: #374151; font-style: italic;">"{asr['transcription']}"</p>
+         </div>
+     </div>
+     """
+
+     return output
+
+
+ # build the speech patterns display HTML
+ def build_speech_patterns_display(asr):
+     output = ""
+
+     # kopparapu classification section
+     if 'kopparapu_score' in asr:
+         classification = asr['kopparapu_classification'].upper()
+         kop_score = asr['kopparapu_score']
+         confidence = kop_score if kop_score >= 0.5 else (1 - kop_score)
+
+         if classification == 'SPONTANEOUS':
+             class_color = '#10b981'
+             class_bg = '#ecfdf5'
+         else:
+             class_color = '#f59e0b'
+             class_bg = '#fffbeb'
+
+         kf = asr['kopparapu_features']
+
+         output += f"""
+         <div style="background: linear-gradient(135deg, {class_bg} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {class_color}33;">
+             <h3 style="margin: 0; color: {class_color}; font-size: 22px; font-weight: 700;">{classification} SPEECH</h3>
+             <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Linguistic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+             <div style="margin-top: 12px; background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 8px;">
+                 <div style="height: 100%; width: {kop_score*100:.0f}%; background: linear-gradient(to right, #10b981, #f59e0b); border-radius: 6px;"></div>
+             </div>
+         </div>
+
+         <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+             <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Linguistic Metrics</h4>
+             <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+                 <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                     <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['chars_per_word']:.2f}</div>
+                     <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Chars/Word</div>
+                 </div>
+                 <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                     <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['words_per_sec']:.2f}</div>
+                     <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Words/Sec</div>
+                 </div>
+                 <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                     <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['filler_rate']*100:.1f}%</div>
+                     <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Filler Rate</div>
+                 </div>
+                 <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                     <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['repetition_count']}</div>
+                     <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Repetitions</div>
+                 </div>
+             </div>
+         </div>
+         """
+
+     # filler words section
+     filler_ratio = asr['filler_words']['ratio']
+     filler_count = asr['filler_words']['count']
+
+     if filler_ratio > 0.05:
+         filler_color = '#10b981'
+         filler_label = 'High filler usage'
+         filler_desc = 'Strong indicator of spontaneous speech'
+     elif filler_ratio < 0.02:
+         filler_color = '#f59e0b'
+         filler_label = 'Low filler usage'
+         filler_desc = 'May indicate reading or rehearsed speech'
+     else:
+         filler_color = '#6b7280'
+         filler_label = 'Moderate filler usage'
+         filler_desc = 'Normal conversational pattern'
+
+     output += f"""
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Filler Words</h4>
+         <div style="display: grid; grid-template-columns: 1fr 1fr 2fr; gap: 16px; align-items: center;">
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                 <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_count}</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Filler Words</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                 <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_ratio*100:.1f}%</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Of Speech</div>
+             </div>
+             <div style="background: {filler_color}10; border-radius: 10px; padding: 16px; border: 1px solid {filler_color}33;">
+                 <div style="font-weight: 600; color: {filler_color}; font-size: 14px;">{filler_label}</div>
+                 <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{filler_desc}</div>
+             </div>
+         </div>
+     </div>
+     """
+
+     # pause patterns section
+     pause_var = asr['pause_patterns']['pause_variability']
+
+     if pause_var < 0.3:
+         pause_color = '#f59e0b'
+         pause_label = 'Regular pauses'
+         pause_desc = 'Suggests reading at punctuation marks'
+     elif pause_var > 0.6:
+         pause_color = '#10b981'
+         pause_label = 'Irregular pauses'
+         pause_desc = 'Natural thinking breaks indicate spontaneous speech'
+     else:
+         pause_color = '#6b7280'
+         pause_label = 'Moderate variability'
+         pause_desc = 'Mixed pattern'
+
+     output += f"""
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+         <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Pause Patterns</h4>
+         <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-bottom: 16px;">
+             <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                 <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['num_pauses']}</div>
+                 <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Total Pauses</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                 <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['avg_pause']:.2f}</div>
+                 <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Avg Duration</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                 <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['max_pause']:.2f}</div>
+                 <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Longest Pause</div>
+             </div>
+             <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                 <div style="font-size: 20px; font-weight: 700; color: {pause_color};">{pause_var:.2f}</div>
+                 <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Variability</div>
+             </div>
+         </div>
+         <div style="background: {pause_color}10; border-radius: 10px; padding: 14px; border: 1px solid {pause_color}33;">
+             <div style="font-weight: 600; color: {pause_color}; font-size: 14px;">{pause_label}</div>
+             <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{pause_desc}</div>
+         </div>
+     </div>
+     """
+
+     return output
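The confidence fold used above is worth making explicit: the Kopparapu score lives on a single 0-to-1 axis (0 = spontaneous, 1 = read), so the reported confidence is simply that score reflected toward the winning side. A self-contained sketch of the same logic:

```python
# Same fold as in build_speech_patterns_display, shown standalone.
def fold_confidence(score: float) -> tuple[str, float]:
    label = "read" if score >= 0.5 else "spontaneous"
    confidence = score if score >= 0.5 else 1.0 - score
    return label, confidence

assert fold_confidence(0.82) == ("read", 0.82)
assert fold_confidence(0.25) == ("spontaneous", 0.75)
```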


+ # build the AI detection display HTML
+ def build_ai_detection_display(text_auth):
+     is_ai = text_auth['ai_detection']['ai_generated']
+     ai_prob = text_auth['ai_detection']['confidence']
+     human_prob = 1 - ai_prob
+
+     if is_ai:
+         main_color = '#ef4444'
+         bg_color = '#fef2f2'
+         label = 'AI-GENERATED LIKELY'
+         desc = 'The text shows patterns consistent with AI-generated content'
+     else:
+         main_color = '#10b981'
+         bg_color = '#ecfdf5'
+         label = 'HUMAN-WRITTEN LIKELY'
+         desc = 'The text shows patterns consistent with human-written content'
+
+     output = f"""
+     <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+         <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label}</h3>
+         <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">{desc}</p>
+     </div>
+
+     <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+         <h4 style="margin: 0 0 20px 0; color: #374151; font-size: 15px; font-weight: 600;">Confidence Analysis</h4>
+
+         <div style="margin-bottom: 20px;">
+             <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                 <span style="font-weight: 600; color: #ef4444; font-size: 14px;">AI Generated</span>
+                 <span style="font-weight: 700; color: #ef4444; font-size: 18px;">{ai_prob*100:.0f}%</span>
+             </div>
+             <div style="background: #fee2e2; border-radius: 8px; overflow: hidden; height: 12px;">
+                 <div style="height: 100%; width: {ai_prob*100:.0f}%; background: #ef4444; border-radius: 8px;"></div>
+             </div>
+         </div>
+
+         <div>
+             <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                 <span style="font-weight: 600; color: #10b981; font-size: 14px;">Human Written</span>
+                 <span style="font-weight: 700; color: #10b981; font-size: 18px;">{human_prob*100:.0f}%</span>
+             </div>
+             <div style="background: #d1fae5; border-radius: 8px; overflow: hidden; height: 12px;">
+                 <div style="height: 100%; width: {human_prob*100:.0f}%; background: #10b981; border-radius: 8px;"></div>
+             </div>
+         </div>
+     </div>
+
+     <div style="background: #fffbeb; border: 1px solid #fcd34d; border-radius: 10px; padding: 14px;">
+         <div style="font-size: 13px; color: #92400e; line-height: 1.5;">
+             <strong>Note:</strong> AI detection is probabilistic and should be used as one factor among many in your evaluation.
+         </div>
+     </div>
+     """
+
+     return output
+
+
+ # main function to analyze uploaded audio file
  def analyze_audio_file(audio_file):
+     # check if pipeline is ready
      if not pipeline_ready:
+         error_msg = pipeline_error if 'pipeline_error' in dir() else "Something went wrong"
+         error_html = f"""
+         <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+             <h3 style="margin: 0 0 8px 0; color: #dc2626; font-size: 16px;">Pipeline not ready</h3>
+             <p style="margin: 0; color: #7f1d1d; font-size: 14px;">{error_msg}</p>
+         </div>
+         """
+         return (error_html, "", "", "", "")

+     # check if audio file was provided
      if audio_file is None:
+         placeholder_html = """
+         <div style="background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 12px; padding: 40px; text-align: center;">
+             <p style="margin: 0; color: #6b7280; font-size: 15px;">Please upload an audio file to begin analysis.</p>
+         </div>
+         """
+         return (placeholder_html, "", "", "", "")

+     # run analysis
      try:
          language_code = None
          results = pipeline.analyze_audio(audio_file, language=language_code)

+         # extract results from each component
          audio_class = results['audio_classification']
          asr = results['speech_recognition']
          text_auth = results['text_authenticity']
          final = results['final_assessment']

+         # color mapping for verdict
          verdict_color = {
              "AUTHENTIC": "#10b981",
              "LIKELY AUTHENTIC": "#3b82f6",

          color = verdict_color.get(final['verdict'], '#6b7280')

+         # build overall status display
          overall_status = f"""
          <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
              <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>

          </div>
          </div>
          """
+         # build tab outputs
+         acoustic_output = build_acoustic_features_display(audio_class)
+         transcription_output = build_transcription_display(asr)
+         speech_patterns = build_speech_patterns_display(asr)
+         ai_output = build_ai_detection_display(text_auth)

          return (
              overall_status,

          )

      except Exception as e:
+         error_html = f"""
+         <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+             <h3 style="margin: 0 0 12px 0; color: #dc2626; font-size: 16px;">Something went wrong</h3>
+             <p style="margin: 0 0 12px 0; color: #7f1d1d; font-size: 14px;">{str(e)}</p>
+             <details style="margin-top: 12px;">
+                 <summary style="color: #6b7280; cursor: pointer; font-size: 13px;">More info</summary>
+                 <pre style="background: #1f2937; color: #f3f4f6; padding: 12px; border-radius: 8px; margin-top: 8px; font-size: 11px; overflow-x: auto;">{traceback.format_exc()}</pre>
+             </details>
+         </div>
+         """
+         return (error_html, "", "", "", "")
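One caution about the readiness guard near the top of `analyze_audio_file`: inside a function, `dir()` with no arguments lists names in the local scope, so the module-level `pipeline_error` set at import time is never found and the generic message always wins. A sketch of a check that consults module globals instead, with otherwise identical behavior:

```python
# dir() inside a function sees only local names; globals() sees the module
# namespace where pipeline_error is actually defined at startup.
def startup_error_message() -> str:
    return globals().get("pipeline_error", "Something went wrong")
```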


+ # create the gradio interface
  def create_interface():
+     # custom CSS for styling
      custom_css = """
      @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');

      font-family: 'IBM Plex Sans', sans-serif !important;
      background: white !important;
      }
      button.primary, .primary {
          background: #2563eb !important;
          color: white !important;

          font-weight: 600 !important;
          padding: 12px 24px !important;
          border-radius: 8px !important;
      }
      """

      with gr.Blocks(title="Authenticity Detection System") as demo:

+         # header section
          gr.HTML(f"""
          <style>
          {custom_css}

          <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
              <div style='padding: 16px 0;'>
                  <div style='display: flex; align-items: center; gap: 12px;'>
                      <div>
                          <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
                              LEIDEN UNIVERSITY · LIACS

          <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
              Detecting AI-Assisted Responses in Online Settings
          </h2>
          <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
              <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
                  Multi-Modal Analysis

          </section>
          """)

+         # main layout
          with gr.Row():
              with gr.Column(scale=1):
                  gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                      <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
                      <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
                  </div>
                  """)

+                 # audio input component
                  audio_input = gr.Audio(
                      sources=["upload", "microphone"],
                      type="filepath",
                      show_label=False
                  )

+                 # analyze button
                  analyze_btn = gr.Button(
                      "Analyze Audio",
                      variant="primary",
                      size="lg"
                  )

+                 # requirements info
                  gr.HTML("""
                  <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
                      <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
                      <li><strong>Duration:</strong> 30 sec - 5 min</li>
                  </ul>
                  </div>
                  """)

              with gr.Column(scale=2):
                  gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                      <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
                      <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
                  </div>
                  """)

+                 # overall output
+                 overall_output = gr.HTML()

+                 # results tabs
                  with gr.Tabs() as tabs:
                      with gr.Tab("Acoustic Features"):
+                         acoustic_output = gr.HTML()

                      with gr.Tab("Transcription"):
+                         transcription_output = gr.HTML()

                      with gr.Tab("Speech Patterns"):
+                         speech_output = gr.HTML()

                      with gr.Tab("AI Detection"):
+                         ai_output = gr.HTML()

+         # example audio files
          gr.HTML("""
          <div style='margin-top: 20px; margin-bottom: 10px;'>
              <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>

          examples_dir = os.path.join(os.path.dirname(__file__), "examples")
          gr.Examples(
              examples=[
+                 [os.path.join(examples_dir, "read1.wav")],
+                 [os.path.join(examples_dir, "spontaneous1.wav")]
              ],
              inputs=[audio_input],
              outputs=[

              fn=analyze_audio_file,
              label="",
              examples_per_page=2,
+             cache_examples="lazy"
          )

+         # loading animation function
          def show_loading():
              loading_html = """
              <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border: 2px solid #667eea; padding: 30px; border-radius: 16px; margin: 10px 0; text-align: center;'>
                  <h2 style='color: white; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
                      Analyzing...
                  </h2>
              </div>
              """
+             loading_tab = """
+             <div style='padding: 40px; text-align: center; color: #6b7280;'>
+                 <p style='margin-top: 16px; font-size: 14px;'>Processing...</p>
+             </div>
+             """
+             return loading_html, loading_tab, loading_tab, loading_tab, loading_tab

+         # connect button to analysis function
          analyze_btn.click(
              fn=show_loading,
              inputs=None,

              ]
          )
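The hunk above elides the full event wiring, but the two-step pattern it implies (loading placeholders first, then the real analysis) is typically chained in Gradio with `.then()`. A sketch under that assumption; the argument lists here are illustrative, not the file's verbatim code:

```python
# Hypothetical reconstruction of the elided wiring: show loading placeholders,
# then run the analysis into the same five outputs.
outputs = [overall_output, acoustic_output, transcription_output,
           speech_output, ai_output]
analyze_btn.click(fn=show_loading, inputs=None, outputs=outputs).then(
    fn=analyze_audio_file, inputs=[audio_input], outputs=outputs,
)
```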

+         # footer
          gr.HTML("""
          <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
              <div style='text-align: center;'>
                  <p style='margin: 0; font-size: 14px; color: #6b7280;'>
                  </p>
              </div>
          </footer>
          """)

      return demo


+ # run the app when script is executed
  if __name__ == "__main__":
      demo = create_interface()
      demo.launch(
          share=False,
          show_error=True
      )
 
audio_classifier.py CHANGED
@@ -3,72 +3,95 @@ import torch.nn as nn
  import torch.nn.functional as F
  import librosa
  import numpy as np
- from typing import Dict

  class BasicBlock(nn.Module):
      def __init__(self, in_channels, out_channels, stride=1, downsample=None):
          super(BasicBlock, self).__init__()
          self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                                 stride=stride, padding=1, bias=False)
          self.bn1 = nn.BatchNorm2d(out_channels)
          self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                                 stride=1, padding=1, bias=False)
          self.bn2 = nn.BatchNorm2d(out_channels)
          self.downsample = downsample

      def forward(self, x):
          identity = x
          out = F.relu(self.bn1(self.conv1(x)))
          out = self.bn2(self.conv2(out))

          if self.downsample is not None:
              identity = self.downsample(x)

          out += identity
          out = F.relu(out)
          return out


  class SpeechStyleCNN(nn.Module):
      def __init__(self, num_classes=2):
          super(SpeechStyleCNN, self).__init__()
          self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
          self.bn1 = nn.BatchNorm2d(64)
          self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

          self.layer1 = self._make_layer(64, 64, 2, stride=1)
          self.layer2 = self._make_layer(64, 128, 2, stride=2)
          self.layer3 = self._make_layer(128, 256, 2, stride=2)
          self.layer4 = self._make_layer(256, 512, 2, stride=2)

          self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
          self.fc = nn.Linear(512, num_classes)

      def _make_layer(self, in_channels, out_channels, blocks, stride=1):
          downsample = None
          if stride != 1 or in_channels != out_channels:
              downsample = nn.Sequential(
                  nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                  nn.BatchNorm2d(out_channels)
              )

          layers = []
          layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
          for _ in range(1, blocks):
              layers.append(BasicBlock(out_channels, out_channels))

          return nn.Sequential(*layers)

-     def forward(self, x: torch.Tensor) -> torch.Tensor:
          x = F.relu(self.bn1(self.conv1(x)))
          x = self.maxpool(x)

          x = self.layer1(x)
          x = self.layer2(x)
          x = self.layer3(x)
          x = self.layer4(x)

          x = self.avgpool(x)
          x = torch.flatten(x, 1)
          x = self.fc(x)
@@ -76,70 +99,82 @@ class SpeechStyleCNN(nn.Module):
          return x


  class AudioClassifier:
      AVAILABLE_MODELS = {
          '3s_window': 'spectrogram_cnn_3s_window.pth',
-         # '4s_window': 'spectrogram_cnn_4s_window.pth',
-         # '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
      }

      @classmethod
-     def get_model_path(cls, model_name: str = '3s_window') -> str:
          import os
          if model_name not in cls.AVAILABLE_MODELS:
-             raise ValueError(f"Unknown model: {model_name}. Available: {list(cls.AVAILABLE_MODELS.keys())}")
          return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])

-     def __init__(self, model_path: str = None, device: str = None):
          if device is None:
              self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          else:
              self.device = torch.device(device)
-
          self.model = SpeechStyleCNN().to(self.device)

          if model_path is None:
              import os
              model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')

          try:
              print(f"Attempting to load model from: {model_path}")
-             state_dict = torch.load(model_path, map_location=self.device)
              self.model.load_state_dict(state_dict)
              print(f"✓ Successfully loaded trained model from: {model_path}")
          except FileNotFoundError:
-             raise FileNotFoundError(f"Model file not found at {model_path}. Please ensure the model file exists.")
          except Exception as e:
-             raise RuntimeError(f"Error loading model from {model_path}: {e}")

          self.model.eval()

          self.sample_rate = 16000
          self.n_mels = 128
          self.n_fft = 2048
          self.hop_length = 512

-     def extract_mel_spectrogram(self, audio_path: str, window_size: float = 3.0) -> np.ndarray:
          audio, sr = librosa.load(audio_path, sr=self.sample_rate)

-         # If audio is longer than window_size, take multiple windows and average
          window_samples = int(window_size * sr)

-         if len(audio) > window_samples * 1.5:  # If significantly longer
-             # Split into overlapping windows
              hop_samples = window_samples // 2
              windows = []
              for start in range(0, len(audio) - window_samples, hop_samples):
                  window = audio[start:start + window_samples]
                  windows.append(window)

-             # Also add the last window
              if len(audio) > window_samples:
                  windows.append(audio[-window_samples:])

-             # Compute mel spectrogram for each window and average
              mel_specs = []
-             for window in windows[:5]:  # Limit to 5 windows to avoid too much computation
                  mel_spec = librosa.feature.melspectrogram(
                      y=window,
                      sr=sr,
@@ -149,10 +184,10 @@ class AudioClassifier:
                  )
                  mel_specs.append(mel_spec)

-             # Average the spectrograms
              mel_spec = np.mean(mel_specs, axis=0)
          else:
-             # Pad or use as-is for short audio
              if len(audio) < window_samples:
                  audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
              else:
@@ -166,22 +201,28 @@ class AudioClassifier:
                  hop_length=self.hop_length
              )

          mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

          mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
          mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)

          return mel_spec_3ch
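A quick way to sanity-check the extractor (the audio path here is an assumption, and constructing `AudioClassifier` requires the `.pth` weights file next to the module):

```python
# The stacked array mimics a 3-channel image so the ResNet-style CNN can
# consume it: shape (3, n_mels, n_frames) = (3, 128, T).
clf = AudioClassifier()
spec = clf.extract_mel_spectrogram("examples/read1.wav")  # hypothetical path
print(spec.shape)  # e.g. (3, 128, 94) for a 3-second window at 16 kHz, hop 512
```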

-     def extract_acoustic_features(self, audio_path: str) -> Dict[str, float]:
          audio, sr = librosa.load(audio_path, sr=self.sample_rate)

          features = {}

          onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
          tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
          features['tempo'] = float(tempo)

          pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
          pitch_values = []
          for t in range(pitches.shape[1]):
@@ -190,6 +231,7 @@ class AudioClassifier:
              if pitch > 0:
                  pitch_values.append(pitch)

          if pitch_values:
              features['pitch_mean'] = float(np.mean(pitch_values))
              features['pitch_std'] = float(np.std(pitch_values))
@@ -199,34 +241,40 @@ class AudioClassifier:
              features['pitch_std'] = 0.0
              features['pitch_range'] = 0.0

          rms = librosa.feature.rms(y=audio)[0]
          features['energy_mean'] = float(np.mean(rms))
          features['energy_std'] = float(np.std(rms))

          zcr = librosa.feature.zero_crossing_rate(audio)[0]
          features['zcr_mean'] = float(np.mean(zcr))
          features['zcr_std'] = float(np.std(zcr))

          spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
          features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
          features['spectral_centroid_std'] = float(np.std(spectral_centroids))

          return features

-     def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
          individual_scores = {}

          sc_std = features['spectral_centroid_std']
-         if sc_std >= 1100:
-             spectral_score = 0.9  # Strongly indicates read
-         elif sc_std >= 1050:
-             spectral_score = 0.7  # Likely read
          elif sc_std >= 1000:
-             spectral_score = 0.5  # Borderline
-         elif sc_std >= 950:
-             spectral_score = 0.3  # Likely spontaneous
          else:
-             spectral_score = 0.1  # Strongly spontaneous

          individual_scores['spectral_variability'] = {
              'score': spectral_score,
@@ -234,17 +282,18 @@ class AudioClassifier:
              'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
          }

          zcr = features['zcr_mean']
-         if zcr >= 0.13:
-             zcr_score = 0.9  # Strongly indicates read
-         elif zcr >= 0.115:
-             zcr_score = 0.7  # Likely read
-         elif zcr >= 0.105:
-             zcr_score = 0.5  # Borderline
-         elif zcr >= 0.095:
-             zcr_score = 0.3  # Likely spontaneous
          else:
-             zcr_score = 0.1  # Strongly spontaneous

          individual_scores['zcr_mean'] = {
              'score': zcr_score,
@@ -252,18 +301,16 @@ class AudioClassifier:
252
  'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
253
  }
254
 
255
- # 3. Energy mean (separation: 0.69)
256
- # Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
257
- # Threshold: ~0.06, read < threshold
258
  energy = features['energy_mean']
259
  if energy < 0.055:
260
- energy_score = 0.8 # Low energy -> likely read
261
- elif energy < 0.065:
262
- energy_score = 0.5 # Moderate
263
- elif energy < 0.075:
264
- energy_score = 0.3 # Higher energy -> likely spontaneous
265
  else:
266
- energy_score = 0.1 # High energy -> spontaneous
267
 
268
  individual_scores['energy_level'] = {
269
  'score': energy_score,
@@ -271,45 +318,80 @@ class AudioClassifier:
271
  'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
272
  }
273
 
274
- # 4. Tempo (separation: 0.22) - less discriminative but still useful
275
- # Read: 122 avg, Spontaneous: 125 avg
276
- tempo = features['tempo']
277
- if tempo < 115:
278
- tempo_score = 0.7 # Slower -> could be read (more deliberate)
279
- elif tempo < 125:
280
- tempo_score = 0.5 # Moderate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  else:
282
- tempo_score = 0.3 # Faster -> could be spontaneous
283
 
284
- individual_scores['tempo'] = {
285
- 'score': tempo_score,
286
- 'value': tempo,
287
- 'interpretation': 'slow (read)' if tempo_score > 0.6 else 'fast (spontaneous)' if tempo_score < 0.4 else 'moderate'
288
  }
289
 
290
- # Optimized weights based on feature separation scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  weights = {
292
- 'spectral_variability': 0.40,
293
- 'zcr_mean': 0.30,
294
- 'energy_level': 0.20,
295
- 'tempo': 0.10
 
 
296
  }
297
 
 
298
  overall_score = (
299
  spectral_score * weights['spectral_variability'] +
300
  zcr_score * weights['zcr_mean'] +
301
  energy_score * weights['energy_level'] +
302
- tempo_score * weights['tempo']
 
 
303
  )
304
 
305
- if overall_score > 0.60:
 
306
  classification = 'read'
307
- confidence = 0.5 + (overall_score - 0.5) * 0.8
308
- elif overall_score < 0.40:
309
  classification = 'spontaneous'
310
- confidence = 0.5 + (0.5 - overall_score) * 0.8
311
  else:
312
- classification = 'read' if overall_score >= 0.5 else 'spontaneous'
313
  confidence = 0.5 + abs(overall_score - 0.5) * 0.6
314
 
315
  return {
@@ -319,11 +401,15 @@ class AudioClassifier:
319
  'individual_scores': individual_scores
320
  }
321
 
322
- def classify(self, audio_path: str) -> Dict[str, any]:
 
 
323
  mel_spec = self.extract_mel_spectrogram(audio_path)
324
 
 
325
  mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
326
 
 
327
  with torch.no_grad():
328
  logits = self.model(mel_tensor)
329
  probabilities = F.softmax(logits, dim=1)
@@ -334,35 +420,36 @@ class AudioClassifier:
334
  print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
335
  print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
336
 
 
337
  acoustic_features = self.extract_acoustic_features(audio_path)
338
 
 
339
  prosody_scores = self._compute_prosody_scores(acoustic_features)
340
  prosody_classification = prosody_scores['classification']
341
  prosody_confidence = prosody_scores['confidence']
342
 
343
- # Model mapping: Class 0 = read, Class 1 = spontaneous
344
  cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
 
 
345
  print(f"CNN classification: {cnn_class_name}")
346
  print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
347
 
348
-
349
- cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
350
- prosody_score = 1.0 if prosody_classification == 'read' else 0.0
351
-
352
-
353
- weighted_score = (
354
- cnn_score * cnn_confidence * 0.4 +
355
- prosody_score * prosody_confidence * 0.6
356
- ) / (cnn_confidence * 0.4 + prosody_confidence * 0.6)
357
-
358
- if weighted_score > 0.5:
359
- final_classification = 'read'
360
- final_confidence = 0.5 + (weighted_score - 0.5)
361
- else:
362
- final_classification = 'spontaneous'
363
- final_confidence = 0.5 + (0.5 - weighted_score)
364
-
365
- final_confidence = min(0.95, final_confidence)
366
 
367
  return {
368
  'classification': final_classification,
@@ -381,17 +468,18 @@ class AudioClassifier:
381
  )
382
  }
383
 
 
384
  def _interpret_classification(
385
  self,
386
- final_class: str,
387
- final_confidence: float,
388
- cnn_class: str,
389
- cnn_confidence: float,
390
- prosody_class: str,
391
- prosody_confidence: float,
392
- prosody_scores: Dict,
393
- features: Dict
394
- ) -> str:
395
  interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
396
  interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
397
 
@@ -404,10 +492,10 @@ class AudioClassifier:
404
  interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
405
  interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
406
 
407
-
408
  return interpretation
409
 
410
 
 
411
  if __name__ == "__main__":
412
  classifier = AudioClassifier()
413
  print("\nAvailable pre-trained models:")
 
  import torch.nn.functional as F
  import librosa
  import numpy as np

+
+ # Basic building block for the ResNet-style CNN
+ # Uses two convolutional layers with batch normalization
  class BasicBlock(nn.Module):
      def __init__(self, in_channels, out_channels, stride=1, downsample=None):
          super(BasicBlock, self).__init__()
+         # first conv layer with specified stride
          self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                                 stride=stride, padding=1, bias=False)
          self.bn1 = nn.BatchNorm2d(out_channels)
+         # second conv layer always has stride 1
          self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                                 stride=1, padding=1, bias=False)
          self.bn2 = nn.BatchNorm2d(out_channels)
+         # downsample is used when dimensions change
          self.downsample = downsample

      def forward(self, x):
+         # save input for skip connection
          identity = x
+         # pass through first conv + batchnorm + relu
          out = F.relu(self.bn1(self.conv1(x)))
+         # pass through second conv + batchnorm
          out = self.bn2(self.conv2(out))

+         # apply downsample if needed to match dimensions
          if self.downsample is not None:
              identity = self.downsample(x)

+         # add skip connection and apply relu
          out += identity
          out = F.relu(out)
          return out
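
A quick shape check of the block (a minimal sketch of mine, assuming the `BasicBlock` definition above; the input is sized like the feature map after the model's stem conv on a 3 s spectrogram):

    import torch
    import torch.nn as nn

    block = BasicBlock(64, 128, stride=2,
                       downsample=nn.Sequential(
                           nn.Conv2d(64, 128, kernel_size=1, stride=2, bias=False),
                           nn.BatchNorm2d(128)))
    x = torch.randn(1, 64, 64, 47)   # (batch, channels, height, width)
    print(block(x).shape)            # torch.Size([1, 128, 32, 24])

The downsample branch is needed here because both the stride and the channel count change, so the identity path must be projected before the residual add.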

+ # Main CNN model for speech style classification
+ # Architecture based on ResNet with custom layer configuration
  class SpeechStyleCNN(nn.Module):
      def __init__(self, num_classes=2):
          super(SpeechStyleCNN, self).__init__()

+         # initial convolution layer - takes 3 channel input (RGB spectrogram)
          self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
          self.bn1 = nn.BatchNorm2d(64)
          self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

+         # stack of residual blocks with increasing channel sizes
          self.layer1 = self._make_layer(64, 64, 2, stride=1)
          self.layer2 = self._make_layer(64, 128, 2, stride=2)
          self.layer3 = self._make_layer(128, 256, 2, stride=2)
          self.layer4 = self._make_layer(256, 512, 2, stride=2)

+         # global average pooling and final classification layer
          self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
          self.fc = nn.Linear(512, num_classes)

+     # helper function to create a layer of residual blocks
      def _make_layer(self, in_channels, out_channels, blocks, stride=1):
          downsample = None
+         # need downsample when stride changes or channels don't match
          if stride != 1 or in_channels != out_channels:
              downsample = nn.Sequential(
                  nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                  nn.BatchNorm2d(out_channels)
              )

+         # create list of blocks
          layers = []
+         # first block may have different stride
          layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
+         # remaining blocks have stride 1
          for _ in range(1, blocks):
              layers.append(BasicBlock(out_channels, out_channels))

          return nn.Sequential(*layers)

+     def forward(self, x):
+         # initial conv block
          x = F.relu(self.bn1(self.conv1(x)))
          x = self.maxpool(x)

+         # pass through all residual layers
          x = self.layer1(x)
          x = self.layer2(x)
          x = self.layer3(x)
          x = self.layer4(x)

+         # global pooling and classification
          x = self.avgpool(x)
          x = torch.flatten(x, 1)
          x = self.fc(x)
          return x


+ # Main classifier class that combines CNN with acoustic feature analysis
  class AudioClassifier:
+     # dictionary of available pre-trained models
      AVAILABLE_MODELS = {
          '3s_window': 'spectrogram_cnn_3s_window.pth',
      }

      @classmethod
+     def get_model_path(cls, model_name='3s_window'):
+         # returns the full path to a model file
          import os
          if model_name not in cls.AVAILABLE_MODELS:
+             print(f"Model not found: {model_name}")
+             return None
          return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])

+     def __init__(self, model_path=None, device=None):
+         # set up device - use GPU if available
          if device is None:
              self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          else:
              self.device = torch.device(device)
+
+         # initialize the CNN model
          self.model = SpeechStyleCNN().to(self.device)

+         # use default model path if not specified
          if model_path is None:
              import os
              model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')

+         # load pre-trained weights
          try:
              print(f"Attempting to load model from: {model_path}")
+             state_dict = torch.load(model_path, map_location=self.device, weights_only=False)
              self.model.load_state_dict(state_dict)
              print(f"✓ Successfully loaded trained model from: {model_path}")
          except FileNotFoundError:
+             print(f"Could not find model file at {model_path}")
+             print("Make sure the model file exists in the correct location")
          except Exception as e:
+             print(f"Something went wrong loading the model: {e}")

+         # set model to evaluation mode
          self.model.eval()

+         # audio processing parameters
          self.sample_rate = 16000
          self.n_mels = 128
          self.n_fft = 2048
          self.hop_length = 512

+     # extract mel spectrogram from audio file
+     def extract_mel_spectrogram(self, audio_path, window_size=3.0):
+         # load audio at target sample rate
          audio, sr = librosa.load(audio_path, sr=self.sample_rate)

+         # calculate window size in samples
          window_samples = int(window_size * sr)

+         # for longer audio, use multiple overlapping windows
+         if len(audio) > window_samples * 1.5:
              hop_samples = window_samples // 2
              windows = []
+             # extract overlapping windows
              for start in range(0, len(audio) - window_samples, hop_samples):
                  window = audio[start:start + window_samples]
                  windows.append(window)

+             # add the last window
              if len(audio) > window_samples:
                  windows.append(audio[-window_samples:])

+             # compute mel spectrogram for each window
              mel_specs = []
+             for window in windows[:5]:  # limit to 5 windows
                  mel_spec = librosa.feature.melspectrogram(
                      y=window,
                      sr=sr,
                  )
                  mel_specs.append(mel_spec)

+             # average the spectrograms
              mel_spec = np.mean(mel_specs, axis=0)
          else:
+             # for short audio, pad or truncate
              if len(audio) < window_samples:
                  audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
              else:
                  hop_length=self.hop_length
              )

+         # convert to decibels
          mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

+         # normalize to 0-1 range
          mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
+         # stack into 3 channels for CNN input
          mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)

          return mel_spec_3ch
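
For a rough sense of the windowing arithmetic (a sketch of mine, assuming the parameters above): a 10-second clip at 16 kHz yields six candidate 3 s windows (five hops of 1.5 s plus the tail window), of which only the first five are averaged.

    sr = 16000
    window_samples = int(3.0 * sr)      # 48000 samples per window
    hop_samples = window_samples // 2   # 24000 samples (50% overlap)
    audio_len = 10 * sr                 # 160000 samples of audio

    starts = list(range(0, audio_len - window_samples, hop_samples))
    print(len(starts) + 1)              # 6 windows incl. the tail; capped at 5 for averaging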

+     # extract acoustic features from audio
+     def extract_acoustic_features(self, audio_path):
          audio, sr = librosa.load(audio_path, sr=self.sample_rate)

          features = {}

+         # tempo/rhythm estimation
          onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
          tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
          features['tempo'] = float(tempo)

+         # pitch tracking
          pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
          pitch_values = []
          for t in range(pitches.shape[1]):
              if pitch > 0:
                  pitch_values.append(pitch)

+         # calculate pitch statistics
          if pitch_values:
              features['pitch_mean'] = float(np.mean(pitch_values))
              features['pitch_std'] = float(np.std(pitch_values))
              features['pitch_std'] = 0.0
              features['pitch_range'] = 0.0

+         # energy/loudness features
          rms = librosa.feature.rms(y=audio)[0]
          features['energy_mean'] = float(np.mean(rms))
          features['energy_std'] = float(np.std(rms))

+         # zero crossing rate - indicates voice quality
          zcr = librosa.feature.zero_crossing_rate(audio)[0]
          features['zcr_mean'] = float(np.mean(zcr))
          features['zcr_std'] = float(np.std(zcr))

+         # spectral centroid - brightness of sound
          spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
          features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
          features['spectral_centroid_std'] = float(np.std(spectral_centroids))

          return features
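
The result is a flat, JSON-serializable dict consumed by the prosody scorer below; roughly this shape (values are illustrative, from a hypothetical clip, not measured):

    features = {
        'tempo': 123.0,
        'pitch_mean': 210.5, 'pitch_std': 48.2, 'pitch_range': 3820.0,
        'energy_mean': 0.061, 'energy_std': 0.045,
        'zcr_mean': 0.108, 'zcr_std': 0.109,
        'spectral_centroid_mean': 1850.0, 'spectral_centroid_std': 1020.0,
    }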

+     # compute prosody scores from acoustic features
+     # uses thresholds calibrated from training data
+     def _compute_prosody_scores(self, features):
          individual_scores = {}

+         # spectral centroid variability - best discriminating feature
          sc_std = features['spectral_centroid_std']
+         if sc_std >= 1080:
+             spectral_score = 0.9  # strongly indicates read
+         elif sc_std >= 1040:
+             spectral_score = 0.7
          elif sc_std >= 1000:
+             spectral_score = 0.5
+         elif sc_std >= 970:
+             spectral_score = 0.3
          else:
+             spectral_score = 0.1  # strongly spontaneous

          individual_scores['spectral_variability'] = {
              'score': spectral_score,
              'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
          }

+         # zero crossing rate - second best feature
          zcr = features['zcr_mean']
+         if zcr >= 0.125:
+             zcr_score = 0.9
+         elif zcr >= 0.110:
+             zcr_score = 0.7
+         elif zcr >= 0.100:
+             zcr_score = 0.5
+         elif zcr >= 0.092:
+             zcr_score = 0.3
          else:
+             zcr_score = 0.1

          individual_scores['zcr_mean'] = {
              'score': zcr_score,
              'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
          }

+         # energy level - read speech tends to be lower energy
          energy = features['energy_mean']
          if energy < 0.055:
+             energy_score = 0.85
+         elif energy < 0.062:
+             energy_score = 0.65
+         elif energy < 0.070:
+             energy_score = 0.4
          else:
+             energy_score = 0.15

          individual_scores['energy_level'] = {
              'score': energy_score,
              'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
          }

+         # pitch range feature
+         pitch_range = features.get('pitch_range', 3828)
+         if pitch_range < 3815:
+             pitch_range_score = 0.7
+         elif pitch_range < 3828:
+             pitch_range_score = 0.5
+         else:
+             pitch_range_score = 0.3
+
+         individual_scores['pitch_range'] = {
+             'score': pitch_range_score,
+             'value': pitch_range,
+             'interpretation': 'narrow (read)' if pitch_range_score > 0.6 else 'wide (spontaneous)' if pitch_range_score < 0.4 else 'moderate'
+         }
+
+         # energy variability
+         energy_std = features.get('energy_std', 0.047)
+         if energy_std < 0.042:
+             energy_std_score = 0.7
+         elif energy_std < 0.048:
+             energy_std_score = 0.5
          else:
+             energy_std_score = 0.3

+         individual_scores['energy_std'] = {
+             'score': energy_std_score,
+             'value': energy_std,
+             'interpretation': 'steady (read)' if energy_std_score > 0.6 else 'variable (spontaneous)' if energy_std_score < 0.4 else 'moderate'
          }

+         # zcr variability
+         zcr_std = features.get('zcr_std', 0.111)
+         if zcr_std >= 0.115:
+             zcr_std_score = 0.7
+         elif zcr_std >= 0.105:
+             zcr_std_score = 0.5
+         else:
+             zcr_std_score = 0.3
+
+         individual_scores['zcr_std'] = {
+             'score': zcr_std_score,
+             'value': zcr_std,
+             'interpretation': 'variable ZCR (read)' if zcr_std_score > 0.6 else 'steady ZCR (spontaneous)' if zcr_std_score < 0.4 else 'moderate'
+         }
+
+         # weights based on feature importance from analysis
          weights = {
+             'spectral_variability': 0.30,
+             'zcr_mean': 0.25,
+             'energy_level': 0.20,
+             'pitch_range': 0.10,
+             'energy_std': 0.08,
+             'zcr_std': 0.07,
          }

+         # calculate weighted overall score
          overall_score = (
              spectral_score * weights['spectral_variability'] +
              zcr_score * weights['zcr_mean'] +
              energy_score * weights['energy_level'] +
+             pitch_range_score * weights['pitch_range'] +
+             energy_std_score * weights['energy_std'] +
+             zcr_std_score * weights['zcr_std']
          )

+         # determine classification based on thresholds
+         if overall_score > 0.58:
              classification = 'read'
+             confidence = 0.5 + (overall_score - 0.5) * 0.9
+         elif overall_score < 0.42:
              classification = 'spontaneous'
+             confidence = 0.5 + (0.5 - overall_score) * 0.9
          else:
+             classification = 'read' if overall_score >= 0.50 else 'spontaneous'
              confidence = 0.5 + abs(overall_score - 0.5) * 0.6

          return {
              'individual_scores': individual_scores
          }
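
A worked example of the weighted sum (component scores are illustrative, not measured): spectral 0.7, ZCR 0.5, energy 0.65, pitch range 0.5, energy std 0.5, ZCR std 0.5.

    # illustrative component scores, not measured values
    scores  = {'spectral': 0.7, 'zcr': 0.5, 'energy': 0.65,
               'pitch_range': 0.5, 'energy_std': 0.5, 'zcr_std': 0.5}
    weights = [0.30, 0.25, 0.20, 0.10, 0.08, 0.07]
    overall = sum(w * s for w, s in zip(weights, scores.values()))
    print(overall)                        # 0.59 -> 'read' (above the 0.58 threshold)
    print(0.5 + (overall - 0.5) * 0.9)    # ~0.58 confidence

Because 0.59 only just clears the 0.58 cutoff, the confidence stays close to 0.5, which is the intended behavior for borderline clips.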

+     # main classification method - combines CNN and prosody analysis
+     def classify(self, audio_path):
+         # extract mel spectrogram for CNN
          mel_spec = self.extract_mel_spectrogram(audio_path)

+         # convert to tensor and add batch dimension
          mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)

+         # get CNN predictions
          with torch.no_grad():
              logits = self.model(mel_tensor)
              probabilities = F.softmax(logits, dim=1)

          print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
          print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")

+         # extract acoustic features for prosody analysis
          acoustic_features = self.extract_acoustic_features(audio_path)

+         # compute prosody-based scores
          prosody_scores = self._compute_prosody_scores(acoustic_features)
          prosody_classification = prosody_scores['classification']
          prosody_confidence = prosody_scores['confidence']

+         # map CNN class to label
          cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
+         read_prob = probabilities[0, 0].item()
+
          print(f"CNN classification: {cnn_class_name}")
          print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")

+         # combine CNN and prosody - prosody is more reliable
+         final_classification = prosody_classification
+         final_confidence = prosody_confidence
+
+         # boost confidence when both methods agree
+         if cnn_class_name == prosody_classification:
+             final_confidence = min(0.95, prosody_confidence * 1.15)
+         elif read_prob > 0.85 and cnn_class_name == 'read':
+             if prosody_confidence < 0.65:
+                 final_classification = 'read'
+                 final_confidence = 0.55
+         elif read_prob < 0.10 and cnn_class_name == 'spontaneous':
+             if prosody_confidence < 0.65:
+                 final_classification = 'spontaneous'
+                 final_confidence = 0.55

          return {
              'classification': final_classification,
              )
          }
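
The fusion rule defers to prosody and uses the CNN mainly as a confidence booster or tiebreaker. As an illustrative walk-through (numbers are hypothetical): if prosody says spontaneous at 0.70 and the CNN agrees, the result is spontaneous at min(0.95, 0.70 × 1.15) ≈ 0.81. If instead the CNN is very sure of read (read probability 0.90) while prosody makes a weak opposing call (confidence below 0.65), the verdict flips to read at a deliberately conservative 0.55.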

+     # generate human-readable interpretation of classification
      def _interpret_classification(
          self,
+         final_class,
+         final_confidence,
+         cnn_class,
+         cnn_confidence,
+         prosody_class,
+         prosody_confidence,
+         prosody_scores,
+         features
+     ):
          interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
          interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"

          interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
          interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"

          return interpretation


+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      classifier = AudioClassifier()
      print("\nAvailable pre-trained models:")
examples/.DS_Store ADDED
Binary file (6.15 kB)
 
examples/{spontaneous1.ogg → read1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:69b8aeffd1e7a02ed90bcff98d202cd7a97cc57cd1d16a4cdbd4aac2e770b6db
- size 323869
+ oid sha256:9ca1f1a4aadad49ce045b41318eaf3e82b588231af2aee89596687731c0cef4d
+ size 1075710
 
examples/{read1.ogg → spontaneous1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1c8e969d50e75835caf2a52f33c19accdb1cdfa1e069501bad0fc2fe470ea761
- size 157216
+ oid sha256:76f8f4a50cd6d10123058d060287ae1433b59087ad0b65b0fa6255716368d3ba
+ size 873470
pipeline.py CHANGED
@@ -1,49 +1,52 @@
- from typing import Dict, Optional
  import time
  from audio_classifier import AudioClassifier
  from speech_recognizer import SpeechRecognizer
  from text_analyzer import TextAuthenticityAnalyzer


  class AuthenticityDetectionPipeline:
      def __init__(
          self,
-         audio_model_path: Optional[str] = None,
-         whisper_model_size: str = "base",
-         device: Optional[str] = None,
-         ai_detection_threshold: float = 0.78
      ):
          print("\n" + "="*60)
          print("Initializing Multimodal Authenticity Detection Pipeline")
          print("="*60 + "\n")

-         # Initialize components
          print("📊 Loading Audio Classifier (CNN)...")
          self.audio_classifier = AudioClassifier(
              model_path=audio_model_path,
              device=device
          )

          print("\n🎤 Loading Speech Recognizer (Whisper)...")
          self.speech_recognizer = SpeechRecognizer(
              model_size=whisper_model_size,
              device=device
          )

          print("\n📝 Loading Text Authenticity Analyzer...")
          self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)

          print("\n✅ Pipeline initialization complete!")
          print("="*60 + "\n")

-     def analyze_audio(self, audio_path: str, language: Optional[str] = None) -> Dict:
          print("\n" + "="*60)
          print("MULTIMODAL AUTHENTICITY ANALYSIS")
          print("="*60 + "\n")

          start_time = time.time()

-         # Stage 1: Audio Classification (CNN-based read vs spontaneous detection)
          print("Stage 1: CNN Audio Classification...")
          print("-" * 40)
          audio_results = self.audio_classifier.classify(audio_path)
@@ -51,7 +54,7 @@ class AuthenticityDetectionPipeline:
          print(f" ## Classification: {audio_results['classification'].upper()}")
          print(f" Confidence: {audio_results['confidence']*100:.1f}%")

-         # Stage 2: Speech Analysis (Whisper for linguistic analysis)
          print("\nStage 2: Speech Analysis (Whisper)...")
          print("-" * 40)
          asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
@@ -60,7 +63,7 @@ class AuthenticityDetectionPipeline:
          print(f" Word count: {asr_results['word_count']}")
          print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")

-         # Stage 3: Text Authenticity Analysis
          print("\nStage 3: Analyzing text authenticity...")
          print("-" * 40)
          text_results = self.text_analyzer.analyze(asr_results['transcription'])
@@ -68,7 +71,7 @@ class AuthenticityDetectionPipeline:
          print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
          print(f" Risk level: {text_results['risk_level'].upper()}")

-         # Stage 4: Combined Assessment
          print("\nStage 4: Generating final assessment...")
          print("-" * 40)
          final_assessment = self._generate_final_assessment(
@@ -85,46 +88,68 @@ class AuthenticityDetectionPipeline:
          return {
              'audio_classification': audio_results,
              'speech_recognition': asr_results,
              'text_authenticity': text_results,
              'final_assessment': final_assessment,
              'processing_time': elapsed_time
          }

      def _generate_final_assessment(
          self,
-         audio_results: Dict,
-         asr_results: Dict,
-         text_results: Dict
-     ) -> Dict:

-         # CNN score: spontaneous = authentic (high), read = inauthentic (low)
          if audio_results['classification'] == 'spontaneous':
              audio_score = audio_results['confidence']
-         else:  # read
              audio_score = 1.0 - audio_results['confidence']

-         # Kopparapu score: 0=spontaneous, 1=read
-         # Invert so spontaneous (low kopparapu) = high authenticity
          speech_pattern_score = 1.0 - asr_results['kopparapu_score']

-         # Filler words: higher ratio = more spontaneous = more authentic
          filler_ratio = asr_results['filler_words']['ratio']
-         filler_score = min(1.0, filler_ratio / 0.05)  # Normalize: 5%+ = max score

-         # Pause variability: higher = more spontaneous = more authentic
          pause_var = asr_results['pause_patterns']['pause_variability']
-         pause_score = min(1.0, pause_var / 0.5)  # Normalize: 0.5+ = max score

          text_auth_score = text_results['authenticity_score']

          composite_score = (
-             audio_score * 0.15 +           # CNN - weakest component
-             speech_pattern_score * 0.20 +  # Kopparapu linguistic
-             filler_score * 0.10 +          # Filler word ratio
-             pause_score * 0.05 +           # Pause variability
-             text_auth_score * 0.50         # Text authenticity - strongest signal
          )

          if composite_score >= 0.7:
              verdict = "AUTHENTIC"
              risk = "low"
@@ -142,37 +167,43 @@ class AuthenticityDetectionPipeline:
              risk = "critical"
              recommendation = "Response shows strong indicators of inauthenticity. Manual review required."

          concerns = []
          strengths = []

          if audio_results['classification'] == 'read':
              concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
          else:
              strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")

          if asr_results['kopparapu_classification'] == 'read':
              concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
          else:
              strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")

          filler_ratio = asr_results['filler_words']['ratio']
          if filler_ratio < 0.02:
              concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
          else:
              strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")

          if asr_results['pause_patterns']['pause_variability'] < 0.3:
              concerns.append("Regular pause patterns suggest reading at punctuation")
          else:
              strengths.append("Irregular pause patterns indicate spontaneous thinking")

          if text_results['ai_detection']['ai_generated']:
              concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")

          if text_results['authenticity_score'] > 0.7:
              strengths.append("Text shows strong originality indicators")

-
          return {
              'verdict': verdict,
              'risk_level': risk,
@@ -182,8 +213,9 @@ class AuthenticityDetectionPipeline:
              'recommendation': recommendation,
          }

  if __name__ == "__main__":
-     # Example usage
      print("Initializing Authenticity Detection Pipeline...")
      model_path = "spectrogram_cnn_3s_window.pth"
      pipeline = AuthenticityDetectionPipeline(
 
  import time
  from audio_classifier import AudioClassifier
  from speech_recognizer import SpeechRecognizer
  from text_analyzer import TextAuthenticityAnalyzer


+ # Main pipeline class that orchestrates all analysis components
  class AuthenticityDetectionPipeline:
      def __init__(
          self,
+         audio_model_path=None,
+         whisper_model_size="base",
+         device=None,
+         ai_detection_threshold=0.78
      ):
          print("\n" + "="*60)
          print("Initializing Multimodal Authenticity Detection Pipeline")
          print("="*60 + "\n")

+         # load the CNN-based audio classifier
          print("📊 Loading Audio Classifier (CNN)...")
          self.audio_classifier = AudioClassifier(
              model_path=audio_model_path,
              device=device
          )

+         # load whisper model for speech-to-text
          print("\n🎤 Loading Speech Recognizer (Whisper)...")
          self.speech_recognizer = SpeechRecognizer(
              model_size=whisper_model_size,
              device=device
          )

+         # load text analyzer for AI detection
          print("\n📝 Loading Text Authenticity Analyzer...")
          self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)

          print("\n✅ Pipeline initialization complete!")
          print("="*60 + "\n")

+     # main analysis function - runs all stages
+     def analyze_audio(self, audio_path, language=None):
          print("\n" + "="*60)
          print("MULTIMODAL AUTHENTICITY ANALYSIS")
          print("="*60 + "\n")

          start_time = time.time()

+         # stage 1: classify audio using CNN
          print("Stage 1: CNN Audio Classification...")
          print("-" * 40)
          audio_results = self.audio_classifier.classify(audio_path)
          print(f" ## Classification: {audio_results['classification'].upper()}")
          print(f" Confidence: {audio_results['confidence']*100:.1f}%")

+         # stage 2: transcribe and analyze speech patterns
          print("\nStage 2: Speech Analysis (Whisper)...")
          print("-" * 40)
          asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
          print(f" Word count: {asr_results['word_count']}")
          print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")

+         # stage 3: analyze transcribed text for AI patterns
          print("\nStage 3: Analyzing text authenticity...")
          print("-" * 40)
          text_results = self.text_analyzer.analyze(asr_results['transcription'])
          print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
          print(f" Risk level: {text_results['risk_level'].upper()}")

+         # stage 4: combine all results into final assessment
          print("\nStage 4: Generating final assessment...")
          print("-" * 40)
          final_assessment = self._generate_final_assessment(

          return {
              'audio_classification': audio_results,
              'speech_recognition': asr_results,
+             'asr': asr_results,  # alias for backwards compatibility
+             'text_analysis': text_results,
              'text_authenticity': text_results,
              'final_assessment': final_assessment,
              'processing_time': elapsed_time
          }

+     # combine scores from all components into final verdict
      def _generate_final_assessment(
          self,
+         audio_results,
+         asr_results,
+         text_results
+     ):

+         # calculate audio score - spontaneous = authentic
          if audio_results['classification'] == 'spontaneous':
              audio_score = audio_results['confidence']
+         else:
              audio_score = 1.0 - audio_results['confidence']

+         # kopparapu score - invert so spontaneous = high authenticity
          speech_pattern_score = 1.0 - asr_results['kopparapu_score']

+         # filler words indicate spontaneous speech
          filler_ratio = asr_results['filler_words']['ratio']
+         filler_score = min(1.0, filler_ratio / 0.05)

+         # pause variability - higher = more spontaneous
          pause_var = asr_results['pause_patterns']['pause_variability']
+         pause_score = min(1.0, pause_var / 0.5)

+         # text authenticity from AI detector
          text_auth_score = text_results['authenticity_score']
+
+         # get additional linguistic features
+         kf = asr_results['kopparapu_features']
+
+         # speech rate variability
+         rate_var = kf.get('speech_rate_variability', 0.0)
+         rate_var_score = min(1.0, rate_var / 0.15)
+
+         # pause regularity - lower = more spontaneous
+         pause_reg = kf.get('pause_regularity', 0.5)
+         pause_reg_score = 1.0 - pause_reg
+
+         # self-corrections indicate spontaneous speech
+         corrections = kf.get('self_correction_count', 0)
+         correction_score = min(1.0, corrections / 2.0)

+         # calculate weighted composite score
+         # weights: CNN+Prosody=15%, Linguistic=35%, AI Detection=50%
          composite_score = (
+             audio_score * 0.15 +
+             speech_pattern_score * 0.25 +
+             filler_score * 0.05 +
+             pause_score * 0.03 +
+             rate_var_score * 0.02 +
+             text_auth_score * 0.50
          )

+         # determine verdict based on composite score
          if composite_score >= 0.7:
              verdict = "AUTHENTIC"
              risk = "low"
              risk = "critical"
              recommendation = "Response shows strong indicators of inauthenticity. Manual review required."

+         # collect concerns and strengths
          concerns = []
          strengths = []

+         # check CNN classification
          if audio_results['classification'] == 'read':
              concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
          else:
              strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")

+         # check linguistic analysis
          if asr_results['kopparapu_classification'] == 'read':
              concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
          else:
              strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")

+         # check filler words
          filler_ratio = asr_results['filler_words']['ratio']
          if filler_ratio < 0.02:
              concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
          else:
              strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")

+         # check pause patterns
          if asr_results['pause_patterns']['pause_variability'] < 0.3:
              concerns.append("Regular pause patterns suggest reading at punctuation")
          else:
              strengths.append("Irregular pause patterns indicate spontaneous thinking")

+         # check AI detection
          if text_results['ai_detection']['ai_generated']:
              concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")

+         # check text originality
          if text_results['authenticity_score'] > 0.7:
              strengths.append("Text shows strong originality indicators")

          return {
              'verdict': verdict,
              'risk_level': risk,
              'recommendation': recommendation,
          }

+
+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      print("Initializing Authenticity Detection Pipeline...")
      model_path = "spectrogram_cnn_3s_window.pth"
      pipeline = AuthenticityDetectionPipeline(
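
To make the new composite weighting concrete, a worked example with illustrative component scores (not real measurements):

    # illustrative component scores, not real measurements
    composite = (0.60 * 0.15 +   # CNN/prosody audio score
                 0.70 * 0.25 +   # Kopparapu linguistic score
                 0.80 * 0.05 +   # filler-word score
                 0.50 * 0.03 +   # pause-variability score
                 0.40 * 0.02 +   # speech-rate variability score
                 0.75 * 0.50)    # text authenticity score
    print(round(composite, 3))   # 0.703 -> verdict "AUTHENTIC" (>= 0.7)

With text authenticity carrying half the weight, the AI-detection signal dominates the verdict; the audio and linguistic signals mostly shift borderline cases across the 0.7/0.5 thresholds.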
speech_recognizer.py CHANGED
@@ -2,58 +2,56 @@ import whisper
  import torch
  import numpy as np
  import re
- from typing import Dict, Optional, List
  import warnings
  import librosa
  warnings.filterwarnings("ignore")


  class SpeechRecognizer:
-     def __init__(self, model_size: str = "base", device: str = None):
          if device is None:
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
          else:
              self.device = device
-
          print(f"Loading Whisper {model_size} model on {self.device}...")
          self.model = whisper.load_model(model_size, device=self.device)
          print(f"Whisper model loaded successfully.")

          self.model_size = model_size

-     def _validate_audio(self, audio_path: str) -> tuple[bool, str, float]:
-         """Validate audio file before transcription."""
          try:
-             # Load audio to check if it's valid
              audio, sr = librosa.load(audio_path, sr=16000)
              duration = len(audio) / sr

-             # Check if audio is too short
              if duration < 0.1:
-                 return False, "Audio is too short (< 0.1 seconds)", duration

-             # Check if audio is empty or silent
              if np.max(np.abs(audio)) < 0.001:
-                 return False, "Audio appears to be silent or empty", duration

              return True, "Valid", duration

          except Exception as e:
-             return False, f"Failed to load audio: {str(e)}", 0.0
-
-     def transcribe(
-         self,
-         audio_path: str,
-         language: Optional[str] = None,
-         task: str = "transcribe"
-     ) -> Dict[str, any]:
-         # Validate audio first
          is_valid, message, audio_duration = self._validate_audio(audio_path)
          if not is_valid:
-             print(f"Audio validation failed: {message}")
-             # Return minimal valid response for invalid audio
              return self._get_empty_response(message, audio_duration)

          try:
              result = self.model.transcribe(
                  audio_path,
@@ -61,17 +59,17 @@ class SpeechRecognizer:
                  task=task,
                  verbose=False,
                  word_timestamps=True,
-                 fp16=False  # Disable fp16 to avoid KV cache KeyError
              )
          except (KeyError, RuntimeError) as e:
              error_msg = str(e)
-             # Check if it's a tensor shape error (empty audio issue)
              if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
-                 print(f"Audio processing failed: Audio may be too short or corrupted")
                  return self._get_empty_response("Audio too short or corrupted", audio_duration)

-             # Fallback: transcribe without word timestamps for other errors
-             print(f"Warning: Transcription failed ({error_msg[:100]}), retrying without word timestamps...")
              try:
                  result = self.model.transcribe(
                      audio_path,
@@ -82,20 +80,23 @@ class SpeechRecognizer:
                      fp16=False
                  )
              except Exception as e2:
-                 print(f"Transcription completely failed: {e2}")
-                 return self._get_empty_response(f"Transcription failed: {str(e2)[:100]}", audio_duration)

          transcription = result['text'].strip()
          detected_language = result.get('language', 'unknown')
          segments = result.get('segments', [])

-         # Handle empty transcription
          if not transcription or len(transcription.strip()) == 0:
              print("Warning: Transcription is empty")
              return self._get_empty_response("No speech detected in audio", audio_duration)

          analysis = self._analyze_transcription(transcription, segments)

          duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
          kopparapu_features = self._extract_kopparapu_features(
              transcription, duration, segments, analysis['pause_patterns']
@@ -117,8 +118,8 @@ class SpeechRecognizer:
              'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
          }

-     def _get_empty_response(self, reason: str, duration: float = 0.0) -> Dict[str, any]:
-         """Return a valid empty response when transcription fails."""
          return {
              'transcription': f"[Error: {reason}]",
              'language': 'unknown',
@@ -147,20 +148,23 @@ class SpeechRecognizer:
              },
              'kopparapu_score': 0.5,
              'kopparapu_classification': 'unknown',
-             'interpretation': f"⚠️ Audio processing failed: {reason}\n\nPlease ensure:\n- Audio is at least 1 second long\n- Audio contains actual speech\n- Audio file is not corrupted"
          }

-     def _analyze_transcription(self, text: str, segments: List[Dict]) -> Dict:
          words = text.split()
          word_count = len(words)

          duration = 0
          if segments:
              duration = segments[-1]['end'] - segments[0]['start']

          speech_rate = (word_count / duration * 60) if duration > 0 else 0

          filler_words_list = [
              ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
              ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
@@ -170,6 +174,7 @@ class SpeechRecognizer:
              ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
          ]

          text_lower = text.lower()
          filler_count = {}
          total_fillers = 0
@@ -181,8 +186,10 @@ class SpeechRecognizer:
              filler_count[filler_name] = count
              total_fillers += count

          filler_ratio = total_fillers / word_count if word_count > 0 else 0

          pause_patterns = self._analyze_pauses(segments)

          return {
@@ -197,24 +204,28 @@ class SpeechRecognizer:
              'pause_patterns': pause_patterns
          }

-     def _analyze_pauses(self, segments: List[Dict]) -> Dict:
          pauses = []

          if len(segments) >= 2:
              for i in range(len(segments) - 1):
                  pause = segments[i + 1]['start'] - segments[i]['end']
-                 if pause > 0.05:  # Consider pauses > 50ms (lowered threshold)
                      pauses.append(pause)

          for segment in segments:
              if 'words' in segment and len(segment['words']) > 1:
                  words = segment['words']
                  for i in range(len(words) - 1):
                      if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                          pause = words[i + 1]['start'] - words[i]['end']
-                         if pause > 0.15:  # Word-level pauses (>150ms significant)
                              pauses.append(pause)

          if not pauses:
              return {
                  'avg_pause': 0.0,
@@ -230,11 +241,10 @@ class SpeechRecognizer:
                  'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
              }

-     def _extract_kopparapu_features(
-         self, text: str, duration_sec: float,
-         segments: List[Dict] = None, pause_patterns: Dict = None
-     ) -> Dict:
          text = text.strip()
          if len(text) == 0:
              return {
                  'alpha_ratio': 0.0,
@@ -249,24 +259,28 @@ class SpeechRecognizer:
                  'self_correction_count': 0
              }

          total_chars = len(text)
          alpha_chars = sum(c.isalpha() for c in text)
          nonalpha_chars = total_chars - alpha_chars

          alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0

          words = text.split()
          num_words = max(len(words), 1)
          chars_per_word = alpha_chars / num_words

          duration_sec = max(duration_sec, 1e-3)
          words_per_sec = num_words / duration_sec
          nonalpha_per_sec = nonalpha_chars / duration_sec

-         # Character repetitions (e.g., "sooo", "ummmm")
          char_reps = len(re.findall(r'(.)\1{2,}', text))

-         # Word repetitions (e.g., "I I think", "the the")
          words_list = text.lower().split()
          word_reps = 0
          for i in range(len(words_list) - 1):
@@ -275,7 +289,7 @@ class SpeechRecognizer:

          repetition_count = char_reps + word_reps

-         # Filler words detection
          lower = text.lower()
          filler_patterns = [
              r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
@@ -289,23 +303,20 @@ class SpeechRecognizer:
              filler_count += len(re.findall(pattern, lower))
          filler_rate = filler_count / num_words

-         # NEW: Pause regularity - read speech has regular pauses at punctuation
-         # Low variability = regular pauses = likely read
-         pause_regularity = 0.5  # neutral default
          if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
              pause_var = pause_patterns.get('pause_variability', 0.5)
-             # Normalize: low variability (< 0.2) -> high regularity (close to 1)
-             # High variability (> 0.6) -> low regularity (close to 0)
              pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))

-         # NEW: Speech rate variability across segments
-         # Read speech has consistent pacing; spontaneous varies with thinking
          speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0

-         # NEW: Sentence length variance - read text has more uniform structure
          sentence_length_variance = self._compute_sentence_variance(text)

-         # NEW: Self-corrections and false starts (spontaneous speech markers)
          self_correction_patterns = [
              r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
              r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
@@ -328,14 +339,15 @@ class SpeechRecognizer:
              'self_correction_count': int(self_correction_count)
          }

-     def _compute_rate_variability(self, segments: List[Dict]) -> float:
          if not segments or len(segments) < 3:
              return 0.0

          segment_rates = []
          for seg in segments:
              duration = seg.get('end', 0) - seg.get('start', 0)
-             if duration > 0.3:  # Only consider segments > 300ms
                  words_in_seg = len(seg.get('text', '').split())
                  rate = words_in_seg / duration
                  if rate > 0:
@@ -344,42 +356,46 @@ class SpeechRecognizer:
          if len(segment_rates) < 3:
              return 0.0

          mean_rate = np.mean(segment_rates)
          std_rate = np.std(segment_rates)

-         # Coefficient of variation normalized to 0-1
          cv = std_rate / mean_rate if mean_rate > 0 else 0
-         return float(min(1.0, cv / 0.5))  # CV of 0.5+ maps to 1.0

-     def _compute_sentence_variance(self, text: str) -> float:
-         # Split into sentences
          sentences = re.split(r'[.!?]+', text)
          sentences = [s.strip() for s in sentences if s.strip()]

          if len(sentences) < 2:
              return 0.0

          lengths = [len(s.split()) for s in sentences]
          mean_len = np.mean(lengths)
          std_len = np.std(lengths)

-         # Coefficient of variation normalized
          cv = std_len / mean_len if mean_len > 0 else 0
-         return float(min(1.0, cv / 0.6))  # CV of 0.6+ maps to 1.0

-     def _logistic(self, x: float, a: float, b: float) -> float: return 1.0 / (1.0 + np.exp(-(x - a) / b))

-     def _calculate_kopparapu_score(self, features: Dict) -> float:
-         # L1: Vocabulary complexity - higher chars/word = more formal = read
          f1 = features['chars_per_word']
          L1 = self._logistic(f1, a=4.8, b=1.2)

-         # L2: Speaking rate - faster, steadier = read
          f2 = features['words_per_sec']
          L2 = self._logistic(f2, a=2.2, b=0.6)

-         # L3: Disfluency signal (inverted) - less disfluency = more read
-         # Combines filler rate, nonalpha, and repetitions
          disfluency = (
              features['nonalpha_per_sec'] +
              8.0 * features['filler_rate'] +
@@ -387,42 +403,43 @@ class SpeechRecognizer:
          )
          L3 = self._logistic(-disfluency, a=0.0, b=0.8)

-         # L4: Pause regularity - regular pauses = read (already 0-1)
          L4 = features.get('pause_regularity', 0.5)

-         # L5: Rate variability (inverted) - low variability = read
          rate_var = features.get('speech_rate_variability', 0.0)
          L5 = 1.0 - rate_var

-         # L6: Sentence variance (inverted) - uniform sentences = read
          sent_var = features.get('sentence_length_variance', 0.0)
          L6 = 1.0 - sent_var

-         # L7: Self-corrections (inverted) - more corrections = spontaneous
          corrections = features.get('self_correction_count', 0)
          L7 = self._logistic(-corrections, a=0.0, b=1.5)

-         # Weighted combination optimized for read detection
-         # Higher weights on pause regularity and rate consistency (key read markers)
          score = (
-             0.15 * L1 +  # Vocabulary complexity
-             0.15 * L2 +  # Speaking rate
-             0.15 * L3 +  # Disfluency (filler/repetition)
-             0.20 * L4 +  # Pause regularity (strong read signal)
-             0.15 * L5 +  # Rate variability
-             0.10 * L6 +  # Sentence uniformity
-             0.10 * L7    # Self-corrections
          )

          return float(score)

-     def _interpret_speech_patterns(self, analysis: Dict, kopparapu_features: Dict = None, kopparapu_score: float = None) -> str:
          filler_ratio = analysis['filler_words']['ratio']
          pause_patterns = analysis['pause_patterns']
          speech_rate = analysis['speech_rate']

          interpretation = "**Overall Assessment:**\n\n"

          spontaneity_score = 0
          indicators = []

@@ -437,7 +454,8 @@ class SpeechRecognizer:
          if 120 <= speech_rate <= 180:
              spontaneity_score += 1
              indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
-
          if spontaneity_score >= 2:
              interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
              if indicators:
@@ -455,13 +473,14 @@ class SpeechRecognizer:

          return interpretation

-     def get_detailed_segments(self, audio_path: str) -> List[Dict]:
          result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
          return result.get('segments', [])


  if __name__ == "__main__":
      recognizer = SpeechRecognizer(model_size="base")
      print(f"Speech recognizer initialized with {recognizer.model_size} model")
      print(f"Device: {recognizer.device}")
-
 
  import torch
  import numpy as np
  import re
  import warnings
  import librosa
  warnings.filterwarnings("ignore")


+ # Main class for speech recognition and analysis
  class SpeechRecognizer:
+     def __init__(self, model_size="base", device=None):
+         # set device - use GPU if available
          if device is None:
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
          else:
              self.device = device
+
+         # load whisper model
          print(f"Loading Whisper {model_size} model on {self.device}...")
          self.model = whisper.load_model(model_size, device=self.device)
          print(f"Whisper model loaded successfully.")

          self.model_size = model_size

+     # check if audio file is valid before processing
+     def _validate_audio(self, audio_path):
          try:
+             # load and check audio
              audio, sr = librosa.load(audio_path, sr=16000)
              duration = len(audio) / sr

+             # audio must be at least 0.1 seconds
              if duration < 0.1:
+                 return False, "Audio too short", duration

+             # check for silent audio
              if np.max(np.abs(audio)) < 0.001:
+                 return False, "Audio is silent", duration

              return True, "Valid", duration

          except Exception as e:
+             return False, f"Could not load audio file", 0.0
+
+     # main transcription function
+     def transcribe(self, audio_path, language=None, task="transcribe"):
+         # validate audio first
          is_valid, message, audio_duration = self._validate_audio(audio_path)
          if not is_valid:
+             print(f"Audio check failed: {message}")
              return self._get_empty_response(message, audio_duration)

+         # try to transcribe with word timestamps
          try:
              result = self.model.transcribe(
                  audio_path,
                  task=task,
                  verbose=False,
                  word_timestamps=True,
+                 fp16=False  # avoid fp16 issues
              )
          except (KeyError, RuntimeError) as e:
              error_msg = str(e)
+             # handle specific errors
              if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
+                 print(f"Audio might be too short or corrupted")
                  return self._get_empty_response("Audio too short or corrupted", audio_duration)

+             # retry without word timestamps
+             print(f"First try failed, trying again...")
              try:
                  result = self.model.transcribe(
                      audio_path,
                      fp16=False
                  )
              except Exception as e2:
+                 print(f"Could not transcribe audio: {e2}")
+                 return self._get_empty_response("Transcription failed", audio_duration)

+         # extract transcription results
          transcription = result['text'].strip()
          detected_language = result.get('language', 'unknown')
          segments = result.get('segments', [])

+         # handle empty transcription
          if not transcription or len(transcription.strip()) == 0:
              print("Warning: Transcription is empty")
              return self._get_empty_response("No speech detected in audio", audio_duration)

+         # analyze transcription for speech patterns
          analysis = self._analyze_transcription(transcription, segments)

+         # extract kopparapu features for read/spontaneous detection
          duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
          kopparapu_features = self._extract_kopparapu_features(
              transcription, duration, segments, analysis['pause_patterns']
              'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
          }
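
A minimal usage sketch of the recognizer (assuming a local `sample.wav`; the returned dict carries the transcription plus the derived metrics used downstream by the pipeline):

    recognizer = SpeechRecognizer(model_size="base")
    result = recognizer.transcribe("sample.wav")
    print(result['transcription'])
    print(result['word_count'], result['speech_rate'])
    print(result['kopparapu_classification'])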

+     # return empty response when transcription fails
+     def _get_empty_response(self, reason, duration=0.0):
          return {
              'transcription': f"[Error: {reason}]",
              'language': 'unknown',
              },
              'kopparapu_score': 0.5,
              'kopparapu_classification': 'unknown',
+             'interpretation': f"Could not process audio: {reason}\n\nTips:\n- Make sure audio is at least 1 second\n- Check that there is actual speech\n- Try a different audio file"
          }

+     # analyze transcription for various speech metrics
+     def _analyze_transcription(self, text, segments):
          words = text.split()
          word_count = len(words)

+         # calculate duration from segments
          duration = 0
          if segments:
              duration = segments[-1]['end'] - segments[0]['start']

+         # calculate speaking rate (words per minute)
          speech_rate = (word_count / duration * 60) if duration > 0 else 0

+         # list of filler words to detect
          filler_words_list = [
              ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
              ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
              ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
          ]

+         # count filler words
          text_lower = text.lower()
          filler_count = {}
          total_fillers = 0
              filler_count[filler_name] = count
              total_fillers += count

+         # calculate filler ratio
          filler_ratio = total_fillers / word_count if word_count > 0 else 0

+         # analyze pause patterns
          pause_patterns = self._analyze_pauses(segments)

          return {
              'pause_patterns': pause_patterns
          }

+     # extract pause timing information from segments
+     def _analyze_pauses(self, segments):
          pauses = []

+         # find pauses between segments
          if len(segments) >= 2:
              for i in range(len(segments) - 1):
                  pause = segments[i + 1]['start'] - segments[i]['end']
+                 if pause > 0.05:  # pauses > 50ms
                      pauses.append(pause)

+         # find pauses between words within segments
          for segment in segments:
              if 'words' in segment and len(segment['words']) > 1:
                  words = segment['words']
                  for i in range(len(words) - 1):
                      if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                          pause = words[i + 1]['start'] - words[i]['end']
+                         if pause > 0.15:  # word-level pauses > 150ms
                              pauses.append(pause)

+         # return empty stats if no pauses found
          if not pauses:
              return {
                  'avg_pause': 0.0,
                  'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
              }
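
The pause statistics reduce to simple numpy aggregates over the collected gaps; a minimal sketch of mine on hypothetical segment timings:

    import numpy as np

    # hypothetical Whisper segments: (start, end) in seconds
    segments = [(0.0, 2.1), (2.4, 4.0), (4.9, 6.2)]
    pauses = [nxt_start - end
              for (_, end), (nxt_start, _) in zip(segments, segments[1:])
              if nxt_start - end > 0.05]
    print(pauses)                 # ~[0.3, 0.9]
    print(float(np.std(pauses)))  # ~0.3 -> pause_variability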
243
 
244
+ # extract features based on kopparapu's method for read vs spontaneous detection
245
+ def _extract_kopparapu_features(self, text, duration_sec, segments=None, pause_patterns=None):
 
 
246
  text = text.strip()
247
+ # handle empty text
248
  if len(text) == 0:
249
  return {
250
  'alpha_ratio': 0.0,
 
259
  'self_correction_count': 0
260
  }
261
 
262
+ # count character types
263
  total_chars = len(text)
264
  alpha_chars = sum(c.isalpha() for c in text)
265
  nonalpha_chars = total_chars - alpha_chars
266
 
267
+ # ratio of alphabetic characters
268
  alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
269
 
270
+ # average word length
271
  words = text.split()
272
  num_words = max(len(words), 1)
273
  chars_per_word = alpha_chars / num_words
274
 
275
+ # speaking rate features
276
  duration_sec = max(duration_sec, 1e-3)
277
  words_per_sec = num_words / duration_sec
278
  nonalpha_per_sec = nonalpha_chars / duration_sec
279
 
280
+ # detect character repetitions like "sooo" or "ummmm"
281
  char_reps = len(re.findall(r'(.)\1{2,}', text))
282
 
283
+ # detect word repetitions like "I I think"
284
  words_list = text.lower().split()
285
  word_reps = 0
286
  for i in range(len(words_list) - 1):
 
289
 
290
  repetition_count = char_reps + word_reps
291
 
292
+ # count filler words
293
  lower = text.lower()
294
  filler_patterns = [
295
  r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
 
303
  filler_count += len(re.findall(pattern, lower))
304
  filler_rate = filler_count / num_words
305
 
306
+ # pause regularity - read speech has regular pauses at punctuation; defaults to 0.5 when there are too few pauses to judge
307
+ pause_regularity = 0.5
 
308
  if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
309
  pause_var = pause_patterns.get('pause_variability', 0.5)
310
+ # low variability = regular pauses = likely read
 
311
  pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))
312
 
313
+ # speech rate variability across segments
 
314
  speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0
315
 
316
+ # sentence length variance - uniform = likely read
317
  sentence_length_variance = self._compute_sentence_variance(text)
318
 
319
+ # count self-corrections and false starts
320
  self_correction_patterns = [
321
  r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
322
  r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
 
339
  'self_correction_count': int(self_correction_count)
340
  }
341
 
342
+ # compute variability in speaking rate across segments
343
+ def _compute_rate_variability(self, segments):
344
  if not segments or len(segments) < 3:
345
  return 0.0
346
 
347
  segment_rates = []
348
  for seg in segments:
349
  duration = seg.get('end', 0) - seg.get('start', 0)
350
+ if duration > 0.3: # only segments > 300ms
351
  words_in_seg = len(seg.get('text', '').split())
352
  rate = words_in_seg / duration
353
  if rate > 0:
 
356
  if len(segment_rates) < 3:
357
  return 0.0
358
 
359
+ # calculate coefficient of variation
360
  mean_rate = np.mean(segment_rates)
361
  std_rate = np.std(segment_rates)
362
 
 
363
  cv = std_rate / mean_rate if mean_rate > 0 else 0
364
+ return float(min(1.0, cv / 0.5))  # a CV of 0.5 or more counts as maximal variability
365
 
366
+ # compute variance in sentence lengths
367
+ def _compute_sentence_variance(self, text):
368
+ # split into sentences
369
  sentences = re.split(r'[.!?]+', text)
370
  sentences = [s.strip() for s in sentences if s.strip()]
371
 
372
  if len(sentences) < 2:
373
  return 0.0
374
 
375
+ # get word counts per sentence
376
  lengths = [len(s.split()) for s in sentences]
377
  mean_len = np.mean(lengths)
378
  std_len = np.std(lengths)
379
 
380
+ # coefficient of variation normalized
381
  cv = std_len / mean_len if mean_len > 0 else 0
382
+ return float(min(1.0, cv / 0.6))  # a CV of 0.6 or more counts as maximal variance
383
 
384
+ # logistic function for smooth score transitions
385
+ def _logistic(self, x, a, b):
386
+ return 1.0 / (1.0 + np.exp(-(x - a) / b))
387
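# How this logistic squashing behaves with the L1 constants used below (a=4.8, b=1.2):
#   x = 4.8  ->  1 / (1 + e^0)    = 0.50   (score sits at the midpoint when x equals a)
#   x = 6.0  ->  1 / (1 + e^-1.0) ≈ 0.73   (one unit of b above a pushes toward 1)
#   x = 3.6  ->  1 / (1 + e^1.0)  ≈ 0.27   (one unit of b below a pushes toward 0)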
 
388
+ # calculate overall Kopparapu score for read vs spontaneous
389
+ def _calculate_kopparapu_score(self, features):
390
+ # L1: vocabulary complexity - higher = more formal = read
391
  f1 = features['chars_per_word']
392
  L1 = self._logistic(f1, a=4.8, b=1.2)
393
 
394
+ # L2: speaking rate - faster = read
395
  f2 = features['words_per_sec']
396
  L2 = self._logistic(f2, a=2.2, b=0.6)
397
 
398
+ # L3: disfluency - less disfluency = more read
 
399
  disfluency = (
400
  features['nonalpha_per_sec'] +
401
  8.0 * features['filler_rate'] +
 
403
  )
404
  L3 = self._logistic(-disfluency, a=0.0, b=0.8)
405
 
406
+ # L4: pause regularity - regular = read
407
  L4 = features.get('pause_regularity', 0.5)
408
 
409
+ # L5: rate variability - low = read
410
  rate_var = features.get('speech_rate_variability', 0.0)
411
  L5 = 1.0 - rate_var
412
 
413
+ # L6: sentence variance - uniform = read
414
  sent_var = features.get('sentence_length_variance', 0.0)
415
  L6 = 1.0 - sent_var
416
 
417
+ # L7: self-corrections - fewer = read
418
  corrections = features.get('self_correction_count', 0)
419
  L7 = self._logistic(-corrections, a=0.0, b=1.5)
420
 
421
+ # weighted combination
 
422
  score = (
423
+ 0.15 * L1 + # vocabulary complexity
424
+ 0.15 * L2 + # speaking rate
425
+ 0.15 * L3 + # disfluency
426
+ 0.20 * L4 + # pause regularity
427
+ 0.15 * L5 + # rate variability
428
+ 0.10 * L6 + # sentence uniformity
429
+ 0.10 * L7 # self-corrections
430
  )
431
 
432
  return float(score)
433
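# A worked example of the weighted combination above, with assumed component values
# (illustrative only): L1..L7 = (0.7, 0.6, 0.8, 0.9, 0.7, 0.8, 0.9) gives
#   0.15*0.7 + 0.15*0.6 + 0.15*0.8 + 0.20*0.9 + 0.15*0.7 + 0.10*0.8 + 0.10*0.9 = 0.77
# i.e. a score leaning toward "read" speech; a score near 0 would lean "spontaneous".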
 
434
+ # generate human-readable interpretation of speech patterns
435
+ def _interpret_speech_patterns(self, analysis, kopparapu_features=None, kopparapu_score=None):
436
  filler_ratio = analysis['filler_words']['ratio']
437
  pause_patterns = analysis['pause_patterns']
438
  speech_rate = analysis['speech_rate']
439
 
440
  interpretation = "**Overall Assessment:**\n\n"
441
 
442
+ # calculate spontaneity score
443
  spontaneity_score = 0
444
  indicators = []
445
 
 
454
  if 120 <= speech_rate <= 180:
455
  spontaneity_score += 1
456
  indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
457
+
458
+ # generate interpretation based on score
459
  if spontaneity_score >= 2:
460
  interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
461
  if indicators:
 
473
 
474
  return interpretation
475
 
476
+ # get detailed segment information
477
+ def get_detailed_segments(self, audio_path):
478
  result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
479
  return result.get('segments', [])
480
 
481
 
482
+ # test code - runs when script is executed directly
483
  if __name__ == "__main__":
484
  recognizer = SpeechRecognizer(model_size="base")
485
  print(f"Speech recognizer initialized with {recognizer.model_size} model")
486
  print(f"Device: {recognizer.device}")
 
text_analyzer.py CHANGED
@@ -1,18 +1,7 @@
-import re
-import requests
-from typing import Dict, List, Tuple, Optional
-import torch
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    RobertaTokenizer,
-    RobertaForSequenceClassification
-)
-import numpy as np
-from collections import Counter
 import warnings
 warnings.filterwarnings("ignore")
 
+# try to import the Desklib AI detector
 try:
     from plagiarism_detection import ai_plagiarism_detection
     DESKLIB_AVAILABLE = True
@@ -21,12 +10,12 @@ except ImportError:
     print("Warning: plagiarism_detection module not found. Using fallback AI detection.")
 
 
-
-
+# class for detecting AI-generated text
 class AITextDetector:
-    def __init__(self, device: str = None, threshold: float = 0.78):
+    def __init__(self, device=None, threshold=0.78):
         self.threshold = threshold
 
+        # check if the Desklib model is available
         if not DESKLIB_AVAILABLE:
             print("Warning: plagiarism_detection module not found. AI detection will not be available.")
             print("Ensure plagiarism_detection.py is in the same directory.")
@@ -35,10 +24,11 @@ class AITextDetector:
         print(f"Using Desklib AI text detector (threshold: {self.threshold})")
         self.available = True
 
-    def detect_ai_text(self, text: str) -> Dict:
+    # main detection function
+    def detect_ai_text(self, text):
 
+        # return neutral result if detector not available
         if not self.available:
-            # Return neutral result if Desklib not available
             return {
                 'ai_generated': False,
                 'confidence': 0.5,
@@ -47,7 +37,7 @@
                 'model_used': 'N/A (module not found)'
             }
 
-        # Use Desklib AI detector
+        # run detection using the Desklib model
        try:
             probability, ai_detected = ai_plagiarism_detection(
                 text,
@@ -63,17 +53,17 @@
                 'model_used': 'Desklib AI Detector v1.01'
             }
         except Exception as e:
-            print(f"Error in AI detection: {e}")
+            print(f"AI detection failed: {e}")
             return {
                 'ai_generated': False,
                 'confidence': 0.5,
                 'indicators': [],
-                'interpretation': f"AI detection error: {str(e)}",
+                'interpretation': f"Could not run AI detection: {e}",
                 'model_used': 'Error'
             }
 
-
-    def _identify_ai_indicators(self, probability: float) -> List[str]:
+    # identify specific indicators based on probability
+    def _identify_ai_indicators(self, probability):
         indicators = []
 
         if probability > 0.9:
@@ -85,7 +75,8 @@
 
         return indicators
 
-    def _interpret_ai_detection(self, score: float) -> str:
+    # generate interpretation text
+    def _interpret_ai_detection(self, score):
         interpretation = f"**AI-Generated Text Detection:**\n\n"
         interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
         interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"
@@ -93,21 +84,23 @@
         return interpretation
 
 
+# main analyzer class that combines all text analysis
 class TextAuthenticityAnalyzer:
 
-    def __init__(self, device: str = None, ai_threshold: float = 0.78):
-
+    def __init__(self, device=None, ai_threshold=0.78):
+        # initialize the AI detector
         self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)
 
-    def analyze(self, text: str) -> Dict:
-        # Run AI detection
+    # analyze text for authenticity
+    def analyze(self, text):
+        # run AI detection
        ai_results = self.ai_detector.detect_ai_text(text)
 
-        # Calculate overall authenticity score based on AI detection
+        # calculate authenticity score (inverse of AI probability)
         ai_penalty = ai_results['confidence']
         authenticity_score = 1.0 - ai_penalty
 
-        # Determine overall assessment
+        # determine risk level based on authenticity
         if authenticity_score < 0.3:
             overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
             risk_level = "high"
@@ -129,9 +122,8 @@ class TextAuthenticityAnalyzer:
         }
 
 
+# test code - runs when script is executed directly
 if __name__ == "__main__":
-    # Example usage
     analyzer = TextAuthenticityAnalyzer()
     print("Text authenticity analyzer initialized.")
     print("Components: Plagiarism Detector + AI Text Detector")
-