Ranam Hamoud committed
Commit 4ec806c · 1 Parent(s): 1089eaf

Add audio authenticity detection app with all components

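For orientation, a minimal end-to-end usage sketch of the components added in this commit (a sketch only: it assumes the repository root is on the Python path, and `sample.wav` is a placeholder for a real recording):

    from pipeline import AuthenticityDetectionPipeline

    # Three-stage pipeline: CNN audio classifier + Whisper ASR + text authenticity analysis
    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")

    # Run all stages on one file and read off the combined verdict
    results = pipeline.analyze_audio("sample.wav")
    final = results["final_assessment"]
    print(final["verdict"], f"{final['composite_authenticity_score'] * 100:.0f}%")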
app.py ADDED
@@ -0,0 +1,420 @@
+ import traceback
+ 
+ import gradio as gr
+ 
+ from pipeline import AuthenticityDetectionPipeline
+ 
+ # Build the pipeline once at import time; the UI degrades gracefully if this fails.
+ try:
+     pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
+     pipeline_ready = True
+ except Exception:
+     pipeline_ready = False
+ 
+ 
+ def analyze_audio_file(audio_file):
+     # The interface wires five Markdown outputs, so every branch returns a 5-tuple.
+     if not pipeline_ready:
+         return (
+             "Error: Pipeline not initialized. Please check the installation.",
+             "", "", "", ""
+         )
+ 
+     if audio_file is None:
+         return (
+             "Please upload an audio file.",
+             "", "", "", ""
+         )
+ 
+     try:
+         language_code = None  # let Whisper auto-detect the language
+         results = pipeline.analyze_audio(audio_file, language=language_code)
+ 
+         audio_class = results['audio_classification']
+         asr = results['speech_recognition']
+         text_auth = results['text_authenticity']
+         final = results['final_assessment']
+ 
+         verdict_color = {
+             "AUTHENTIC": "#10b981",
+             "LIKELY AUTHENTIC": "#3b82f6",
+             "QUESTIONABLE": "#f59e0b",
+             "LIKELY INAUTHENTIC": "#ef4444"
+         }
+ 
+         color = verdict_color.get(final['verdict'], '#6b7280')
+ 
+         overall_status = f"""
+         <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
+             <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
+                 {final['verdict']}
+             </h2>
+             <div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin: 15px 0;'>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: {color};'>{final['composite_authenticity_score']*100:.0f}%</div>
+                     <div style='color: #666; margin-top: 5px;'>Authenticity Score</div>
+                 </div>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: {color};'>{final['risk_level'].upper()}</div>
+                     <div style='color: #666; margin-top: 5px;'>Risk Level</div>
+                 </div>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: #667eea;'>{results['processing_time']:.1f}s</div>
+                     <div style='color: #666; margin-top: 5px;'>Processing Time</div>
+                 </div>
+             </div>
+             <div style='background: white; padding: 15px; border-radius: 10px; margin-top: 15px;'>
+                 <em style='color: #555;'>{final['recommendation']}</em>
+             </div>
+         </div>
+         """
+ 
+         acoustic_output = audio_class['interpretation']
+ 
+         transcription_output = "### Speech Transcription\n\n"
+         transcription_output += "| Metric | Value |\n"
+         transcription_output += "|--------|-------|\n"
+         transcription_output += f"| **Language** | {asr['language'].upper()} |\n"
+         transcription_output += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
+         transcription_output += f"| **Word Count** | {asr['word_count']} words |\n"
+         transcription_output += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
+ 
+         if asr['speech_rate'] > 160:
+             transcription_output += "**Fast speech rate** - Above average speaking speed\n\n"
+         elif asr['speech_rate'] < 120:
+             transcription_output += "**Slow speech rate** - Below average speaking speed\n\n"
+         else:
+             transcription_output += "**Normal speech rate** - Average conversational pace\n\n"
+ 
+         transcription_output += "---\n\n"
+         transcription_output += "#### Full Transcription\n\n"
+         transcription_output += f"> {asr['transcription']}"
+ 
+         # Initialize up front so the appends below are safe even if the
+         # Kopparapu features are missing from the ASR results.
+         speech_patterns = ""
+ 
+         if 'kopparapu_score' in asr:
+             classification = asr['kopparapu_classification'].upper()
+             confidence = asr['kopparapu_score'] if asr['kopparapu_score'] >= 0.5 else (1 - asr['kopparapu_score'])
+ 
+             speech_patterns = f"### **Classification: {classification} SPEECH**\n\n"
+             speech_patterns += f"**Score:** {asr['kopparapu_score']:.3f} (0=spontaneous, 1=read)\n"
+             speech_patterns += f"**Confidence:** {confidence*100:.1f}%\n\n"
+ 
+             speech_patterns += "---\n\n"
+             speech_patterns += "#### Linguistic Metrics\n\n"
+             kf = asr['kopparapu_features']
+ 
+             speech_patterns += "| Feature | Value | Interpretation |\n"
+             speech_patterns += "|---------|-------|----------------|\n"
+             speech_patterns += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
+             if kf['chars_per_word'] > 5.5:
+                 speech_patterns += "Complex vocabulary |\n"
+             elif kf['chars_per_word'] < 4.5:
+                 speech_patterns += "Simple vocabulary |\n"
+             else:
+                 speech_patterns += "Average complexity |\n"
+ 
+             speech_patterns += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
+             if kf['words_per_sec'] > 3:
+                 speech_patterns += "Fast pacing |\n"
+             elif kf['words_per_sec'] < 2:
+                 speech_patterns += "Slow pacing |\n"
+             else:
+                 speech_patterns += "Normal pacing |\n"
+ 
+             speech_patterns += f"| **Non-alpha chars/sec** | {kf['nonalpha_per_sec']:.2f} | "
+             if kf['nonalpha_per_sec'] > 2.5:
+                 speech_patterns += "High (disfluent) |\n"
+             elif kf['nonalpha_per_sec'] < 1.5:
+                 speech_patterns += "Low (fluent) |\n"
+             else:
+                 speech_patterns += "Moderate |\n"
+ 
+             speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
+             if kf['filler_rate'] > 0.05:
+                 speech_patterns += "High (spontaneous) |\n"
+             elif kf['filler_rate'] < 0.02:
+                 speech_patterns += "Low (scripted) |\n"
+             else:
+                 speech_patterns += "Moderate |\n"
+ 
+             speech_patterns += f"| **Repetitions** | {kf['repetition_count']} | "
+             if kf['repetition_count'] > 3:
+                 speech_patterns += "Multiple (thinking aloud) |\n"
+             elif kf['repetition_count'] == 0:
+                 speech_patterns += "None (prepared) |\n"
+             else:
+                 speech_patterns += "Few |\n"
+ 
+             speech_patterns += f"| **Alpha Ratio** | {kf['alpha_ratio']:.2f} | "
+             if kf['alpha_ratio'] > 0.85:
+                 speech_patterns += "Clean text |\n"
+             else:
+                 speech_patterns += "With artifacts |\n"
+ 
+             speech_patterns += "\n"
+ 
+         speech_patterns += "---\n\n"
+         speech_patterns += "#### Filler Words & Disfluencies\n\n"
+         filler_ratio = asr['filler_words']['ratio']
+         speech_patterns += f"**Count:** {asr['filler_words']['count']} filler words\n"
+         speech_patterns += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
+ 
+         if asr['filler_words']['details']:
+             speech_patterns += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
+ 
+         if filler_ratio > 0.05:
+             speech_patterns += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
+         elif filler_ratio < 0.02:
+             speech_patterns += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
+         else:
+             speech_patterns += "**Moderate filler usage** - Normal conversational pattern\n\n"
+ 
+         speech_patterns += "---\n\n"
+         speech_patterns += "#### Pause Patterns\n\n"
+         pause_var = asr['pause_patterns']['pause_variability']
+ 
+         speech_patterns += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
+         speech_patterns += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
+         speech_patterns += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
+         speech_patterns += f"**Variability:** {pause_var:.2f}\n\n"
+ 
+         if pause_var < 0.3:
+             speech_patterns += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
+         elif pause_var > 0.6:
+             speech_patterns += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
+         else:
+             speech_patterns += "**Moderate variability** - Mixed pattern\n\n"
+ 
+         is_ai = text_auth['ai_detection']['ai_generated']
+         ai_prob = text_auth['ai_detection']['confidence']
+ 
+         if is_ai:
+             ai_output = "### **AI-GENERATED LIKELY**\n\n"
+         else:
+             ai_output = "### **HUMAN-WRITTEN LIKELY**\n\n"
+ 
+         ai_output += "**Confidence:**\n\n"
+         bar_length = 30
+         ai_bars = int(ai_prob * bar_length)
+         human_bars = bar_length - ai_bars
+         ai_output += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
+         ai_output += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
+ 
+         ai_output += "---\n\n"
+         ai_output += "#### Interpretation\n\n"
+         ai_output += text_auth['ai_detection'].get('interpretation') or "No interpretation available."
+ 
+         return (
+             overall_status,
+             acoustic_output,
+             transcription_output,
+             speech_patterns,
+             ai_output,
+         )
+ 
+     except Exception as e:
+         error_msg = f"Error during analysis:\n\n{str(e)}\n\n{traceback.format_exc()}"
+         return (error_msg, "", "", "", "")
+ 
+ 
+ def create_interface():
+     """Create and configure the Gradio interface."""
+ 
+     custom_css = """
+     @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
+ 
+     .gradio-container {
+         font-family: 'IBM Plex Sans', sans-serif !important;
+         background: #f9fafb !important;
+     }
+     .contain {
+         max-width: 1280px;
+         margin: 0 auto;
+         background: white;
+         border-radius: 16px;
+         box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+         padding: 24px;
+     }
+     .tab-nav button {
+         font-family: 'IBM Plex Sans', sans-serif;
+         font-size: 14px;
+         font-weight: 500;
+         padding: 10px 16px;
+         border-radius: 8px 8px 0 0;
+         transition: all 0.2s;
+     }
+     .tab-nav button.selected {
+         background: #2563eb;
+         color: white;
+         font-weight: 600;
+     }
+     button.primary, .primary {
+         background: #2563eb !important;
+         color: white !important;
+         border: none !important;
+         font-size: 16px !important;
+         font-weight: 600 !important;
+         padding: 12px 24px !important;
+         border-radius: 8px !important;
+         transition: all 0.2s !important;
+     }
+     button.primary:hover, .primary:hover {
+         background: #1d4ed8 !important;
+     }
+     .markdown-text {
+         font-family: 'IBM Plex Sans', sans-serif;
+         line-height: 1.7;
+     }
+     h1, h2, h3, h4 {
+         font-family: 'IBM Plex Sans', sans-serif;
+         font-weight: 600;
+     }
+     """
+ 
+     with gr.Blocks(css=custom_css, title="Authenticity Detection System") as demo:
+ 
+         gr.HTML("""
+         <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
+             <div style='padding: 16px 0;'>
+                 <div style='display: flex; align-items: center; gap: 12px;'>
+                     <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="32" height="32">
+                         <defs>
+                             <linearGradient id="g" x1="0" y1="0" x2="64" y2="0" gradientUnits="userSpaceOnUse">
+                                 <stop offset="0" stop-color="#1d4ed8" />
+                                 <stop offset="1" stop-color="#0ea5e9" />
+                             </linearGradient>
+                         </defs>
+                         <rect x="0" y="0" width="64" height="64" rx="12" fill="#ffffff"/>
+                         <path d="M4 32 C 10 18, 18 46, 24 32 S 36 18, 40 32 52 46, 60 32"
+                               fill="none" stroke="url(#g)" stroke-width="4" stroke-linecap="round" stroke-linejoin="round"/>
+                     </svg>
+                     <div>
+                         <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
+                             LEIDEN UNIVERSITY · LIACS
+                         </p>
+                         <h1 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>
+                             Audio Processing & Indexing Project
+                         </h1>
+                     </div>
+                 </div>
+             </div>
+         </header>
+ 
+         <section style='background: linear-gradient(to bottom, white, #f9fafb); margin-bottom: 40px;'>
+             <div style='padding: 32px 0;'>
+                 <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
+                     Detecting AI-Assisted Responses in Online Settings
+                 </h2>
+                 <p style='font-size: 18px; color: #374151; margin: 0 0 24px 0;'>
+                 </p>
+                 <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
+                     <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
+                         Multi-Modal Analysis
+                     </span>
+                     <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #fef3c7; color: #92400e; border-radius: 8px; font-size: 14px; font-weight: 500;'>
+                         Acoustic + Linguistic
+                     </span>
+                 </div>
+             </div>
+         </section>
+         """)
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
+                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
+                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
+                 </div>
+                 """)
+ 
+                 audio_input = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label="Audio File",
+                     show_label=False
+                 )
+ 
+                 analyze_btn = gr.Button(
+                     "Analyze Audio",
+                     variant="primary",
+                     size="lg"
+                 )
+ 
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
+                     <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
+                     <ul style='margin: 0; padding-left: 20px; font-size: 13px; color: #6b7280; line-height: 1.8;'>
+                         <li><strong>Formats:</strong> WAV, MP3, M4A, FLAC, OGG</li>
+                         <li><strong>Duration:</strong> 30 sec - 5 min</li>
+                     </ul>
+                 </div>
+ 
+                 <div style='background: #fef3c7; border: 1px solid #fbbf24; padding: 16px; border-radius: 12px; margin-top: 16px;'>
+                     <div style='font-size: 12px; color: #92400e; line-height: 1.6;'>
+                         <strong>Note:</strong> This tool provides probabilistic assessments.
+                         Use it as one factor in evaluation.
+                     </div>
+                 </div>
+                 """)
+ 
+             with gr.Column(scale=2):
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
+                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
+                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
+                 </div>
+                 """)
+ 
+                 overall_output = gr.Markdown()
+ 
+                 with gr.Tabs():
+                     with gr.Tab("Acoustic Features"):
+                         acoustic_output = gr.Markdown()
+ 
+                     with gr.Tab("Transcription"):
+                         transcription_output = gr.Markdown()
+ 
+                     with gr.Tab("Speech Patterns"):
+                         speech_output = gr.Markdown()
+ 
+                     with gr.Tab("AI Detection"):
+                         ai_output = gr.Markdown()
+ 
+         analyze_btn.click(
+             fn=analyze_audio_file,
+             inputs=[audio_input],
+             outputs=[
+                 overall_output,
+                 acoustic_output,
+                 transcription_output,
+                 speech_output,
+                 ai_output,
+             ]
+         )
+ 
+         gr.HTML("""
+         <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
+             <div style='text-align: center;'>
+                 <p style='margin: 0; font-size: 14px; color: #6b7280;'>
+                 </p>
+                 <p style='margin: 8px 0 0 0; font-size: 13px; color: #9ca3af;'>
+                 </p>
+             </div>
+         </footer>
+         """)
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
audio-wave.svg ADDED
audio_classifier.py ADDED
@@ -0,0 +1,361 @@
+ import os
+ from typing import Any, Dict
+ 
+ import librosa
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ 
+ class BasicBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, stride=1, downsample=None):
+         super(BasicBlock, self).__init__()
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
+                                stride=stride, padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(out_channels)
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
+                                stride=1, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(out_channels)
+         self.downsample = downsample
+ 
+     def forward(self, x):
+         identity = x
+         out = F.relu(self.bn1(self.conv1(x)))
+         out = self.bn2(self.conv2(out))
+ 
+         if self.downsample is not None:
+             identity = self.downsample(x)
+ 
+         out += identity
+         out = F.relu(out)
+         return out
+ 
+ 
+ class SpeechStyleCNN(nn.Module):
+     def __init__(self, num_classes=2):
+         super(SpeechStyleCNN, self).__init__()
+ 
+         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+         self.bn1 = nn.BatchNorm2d(64)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ 
+         self.layer1 = self._make_layer(64, 64, 2, stride=1)
+         self.layer2 = self._make_layer(64, 128, 2, stride=2)
+         self.layer3 = self._make_layer(128, 256, 2, stride=2)
+         self.layer4 = self._make_layer(256, 512, 2, stride=2)
+ 
+         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         self.fc = nn.Linear(512, num_classes)
+ 
+     def _make_layer(self, in_channels, out_channels, blocks, stride=1):
+         downsample = None
+         if stride != 1 or in_channels != out_channels:
+             downsample = nn.Sequential(
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(out_channels)
+             )
+ 
+         layers = []
+         layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
+         for _ in range(1, blocks):
+             layers.append(BasicBlock(out_channels, out_channels))
+ 
+         return nn.Sequential(*layers)
+ 
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.relu(self.bn1(self.conv1(x)))
+         x = self.maxpool(x)
+ 
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+ 
+         x = self.avgpool(x)
+         x = torch.flatten(x, 1)
+         x = self.fc(x)
+ 
+         return x
+ 
+ 
+ class AudioClassifier:
+     AVAILABLE_MODELS = {
+         '3s_window': 'spectrogram_cnn_3s_window (1).pth',
+         # '4s_window': 'spectrogram_cnn_4s_window.pth',
+         # '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
+     }
+ 
+     @classmethod
+     def get_model_path(cls, model_name: str = '3s_window') -> str:
+         # Default is '3s_window': it is the only checkpoint currently
+         # enabled in AVAILABLE_MODELS.
+         if model_name not in cls.AVAILABLE_MODELS:
+             raise ValueError(f"Unknown model: {model_name}. Available: {list(cls.AVAILABLE_MODELS.keys())}")
+         return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
+ 
+     def __init__(self, model_path: str = None, device: str = None):
+         if device is None:
+             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         else:
+             self.device = torch.device(device)
+ 
+         self.model = SpeechStyleCNN().to(self.device)
+ 
+         if model_path is None:
+             model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_4s_window.pth')
+ 
+         try:
+             state_dict = torch.load(model_path, map_location=self.device)
+             self.model.load_state_dict(state_dict)
+             print(f"Successfully loaded model from: {model_path}")
+         except FileNotFoundError:
+             print(f"Warning: Model file not found at {model_path}. Using untrained model.")
+         except Exception as e:
+             print(f"Warning: Error loading model from {model_path}: {e}. Using untrained model.")
+ 
+         self.model.eval()
+ 
+         self.sample_rate = 16000
+         self.n_mels = 128
+         self.n_fft = 2048
+         self.hop_length = 512
+ 
+     def extract_mel_spectrogram(self, audio_path: str) -> np.ndarray:
+         audio, sr = librosa.load(audio_path, sr=self.sample_rate)
+ 
+         mel_spec = librosa.feature.melspectrogram(
+             y=audio,
+             sr=sr,
+             n_mels=self.n_mels,
+             n_fft=self.n_fft,
+             hop_length=self.hop_length
+         )
+ 
+         mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+ 
+         # Min-max normalize; the small epsilon guards against division by
+         # zero on silent (constant) input.
+         spec_range = mel_spec_db.max() - mel_spec_db.min()
+         mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (spec_range + 1e-8)
+         mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
+ 
+         return mel_spec_3ch
+ 
+     def extract_acoustic_features(self, audio_path: str) -> Dict[str, float]:
+         audio, sr = librosa.load(audio_path, sr=self.sample_rate)
+ 
+         features = {}
+ 
+         onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
+         tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
+         features['tempo'] = float(tempo)
+ 
+         pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
+         pitch_values = []
+         for t in range(pitches.shape[1]):
+             index = magnitudes[:, t].argmax()
+             pitch = pitches[index, t]
+             if pitch > 0:
+                 pitch_values.append(pitch)
+ 
+         if pitch_values:
+             features['pitch_mean'] = float(np.mean(pitch_values))
+             features['pitch_std'] = float(np.std(pitch_values))
+             features['pitch_range'] = float(np.max(pitch_values) - np.min(pitch_values))
+         else:
+             features['pitch_mean'] = 0.0
+             features['pitch_std'] = 0.0
+             features['pitch_range'] = 0.0
+ 
+         rms = librosa.feature.rms(y=audio)[0]
+         features['energy_mean'] = float(np.mean(rms))
+         features['energy_std'] = float(np.std(rms))
+ 
+         zcr = librosa.feature.zero_crossing_rate(audio)[0]
+         features['zcr_mean'] = float(np.mean(zcr))
+         features['zcr_std'] = float(np.std(zcr))
+ 
+         spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
+         features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
+         features['spectral_centroid_std'] = float(np.std(spectral_centroids))
+ 
+         return features
+ 
+     def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
+         individual_scores = {}
+ 
+         # Pitch variation score (0 = variable/spontaneous, 1 = monotone/read)
+         if features['pitch_mean'] > 0:
+             if features['pitch_std'] < 30:
+                 pitch_score = 0.9  # Very monotone -> read
+             elif features['pitch_std'] < 50:
+                 pitch_score = 0.7  # Somewhat monotone -> likely read
+             elif features['pitch_std'] < 70:
+                 pitch_score = 0.5  # Moderate variation
+             elif features['pitch_std'] < 90:
+                 pitch_score = 0.3  # Variable -> likely spontaneous
+             else:
+                 pitch_score = 0.1  # Very variable -> spontaneous
+         else:
+             pitch_score = 0.5  # Unknown
+ 
+         individual_scores['pitch_variation'] = {
+             'score': pitch_score,
+             'value': features['pitch_std'],
+             'interpretation': 'monotone (read)' if pitch_score > 0.6 else 'variable (spontaneous)' if pitch_score < 0.4 else 'moderate'
+         }
+ 
+         # Energy consistency score (0 = variable/spontaneous, 1 = consistent/read)
+         if features['energy_std'] < 0.015:
+             energy_score = 0.9  # Very consistent -> read
+         elif features['energy_std'] < 0.025:
+             energy_score = 0.6  # Somewhat consistent -> likely read
+         elif features['energy_std'] < 0.035:
+             energy_score = 0.4  # Moderate
+         else:
+             energy_score = 0.1  # Variable -> spontaneous
+ 
+         individual_scores['energy_consistency'] = {
+             'score': energy_score,
+             'value': features['energy_std'],
+             'interpretation': 'consistent (read)' if energy_score > 0.6 else 'variable (spontaneous)' if energy_score < 0.4 else 'moderate'
+         }
+ 
+         # Tempo score (0 = slow/thoughtful/spontaneous, 1 = fast/consistent/read)
+         if features['tempo'] > 140:
+             tempo_score = 0.8  # Very fast -> likely read
+         elif features['tempo'] > 110:
+             tempo_score = 0.6  # Fast -> possibly read
+         elif features['tempo'] > 80:
+             tempo_score = 0.4  # Normal conversational
+         else:
+             tempo_score = 0.2  # Slow -> thoughtful/spontaneous
+ 
+         individual_scores['tempo'] = {
+             'score': tempo_score,
+             'value': features['tempo'],
+             'interpretation': 'fast/steady (read)' if tempo_score > 0.6 else 'slow/varied (spontaneous)' if tempo_score < 0.4 else 'moderate'
+         }
+ 
+         # Spectral consistency (voice quality stability)
+         if features['spectral_centroid_std'] < 300:
+             spectral_score = 0.8  # Very stable -> read
+         elif features['spectral_centroid_std'] < 500:
+             spectral_score = 0.5  # Moderate
+         else:
+             spectral_score = 0.2  # Variable -> spontaneous
+ 
+         individual_scores['spectral_stability'] = {
+             'score': spectral_score,
+             'value': features['spectral_centroid_std'],
+             'interpretation': 'stable (read)' if spectral_score > 0.6 else 'variable (spontaneous)' if spectral_score < 0.4 else 'moderate'
+         }
+ 
+         weights = {
+             'pitch_variation': 0.35,
+             'energy_consistency': 0.30,
+             'tempo': 0.20,
+             'spectral_stability': 0.15
+         }
+ 
+         overall_score = (
+             pitch_score * weights['pitch_variation'] +
+             energy_score * weights['energy_consistency'] +
+             tempo_score * weights['tempo'] +
+             spectral_score * weights['spectral_stability']
+         )
+ 
+         if overall_score > 0.65:
+             classification = 'read'
+             confidence = 0.5 + (overall_score - 0.5)  # Scale to confidence
+         elif overall_score < 0.35:
+             classification = 'spontaneous'
+             confidence = 0.5 + (0.5 - overall_score)  # Scale to confidence
+         else:
+             # Borderline case - go with the majority
+             classification = 'read' if overall_score >= 0.5 else 'spontaneous'
+             confidence = 0.5 + abs(overall_score - 0.5) * 0.5
+ 
+         return {
+             'classification': classification,
+             'confidence': confidence,
+             'overall_score': overall_score,
+             'individual_scores': individual_scores
+         }
+ 
+     def classify(self, audio_path: str) -> Dict[str, Any]:
+         mel_spec = self.extract_mel_spectrogram(audio_path)
+ 
+         mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
+ 
+         with torch.no_grad():
+             logits = self.model(mel_tensor)
+             probabilities = F.softmax(logits, dim=1)
+             predicted_class = torch.argmax(probabilities, dim=1).item()
+             cnn_confidence = probabilities[0, predicted_class].item()
+ 
+         acoustic_features = self.extract_acoustic_features(audio_path)
+ 
+         prosody_scores = self._compute_prosody_scores(acoustic_features)
+         prosody_classification = prosody_scores['classification']
+         prosody_confidence = prosody_scores['confidence']
+ 
+         cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
+ 
+         # Fuse the two branches: agreement boosts confidence; on disagreement,
+         # fall back to whichever branch is more confident.
+         if cnn_class_name == prosody_classification:
+             final_confidence = min(0.95, (cnn_confidence * 0.7 + prosody_confidence * 0.3))
+             final_classification = cnn_class_name
+         else:
+             final_confidence = 0.5 + abs(cnn_confidence - prosody_confidence) * 0.3
+             if cnn_confidence > prosody_confidence:
+                 final_classification = cnn_class_name
+             else:
+                 final_classification = prosody_classification
+ 
+         return {
+             'classification': final_classification,
+             'confidence': float(final_confidence),
+             'cnn_classification': cnn_class_name,
+             'cnn_confidence': float(cnn_confidence),
+             'prosody_classification': prosody_classification,
+             'prosody_confidence': float(prosody_confidence),
+             'prosody_scores': prosody_scores['individual_scores'],
+             'acoustic_features': acoustic_features,
+             'interpretation': self._interpret_classification(
+                 final_classification, final_confidence,
+                 cnn_class_name, cnn_confidence,
+                 prosody_classification, prosody_confidence,
+                 prosody_scores, acoustic_features
+             )
+         }
+ 
+     def _interpret_classification(
+         self,
+         final_class: str,
+         final_confidence: float,
+         cnn_class: str,
+         cnn_confidence: float,
+         prosody_class: str,
+         prosody_confidence: float,
+         prosody_scores: Dict,
+         features: Dict
+     ) -> str:
+         interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
+         interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
+ 
+         if final_class == 'read':
+             interpretation += "**Description:** The speech exhibits characteristics of read or scripted content. "
+             interpretation += "The audio shows consistent prosodic patterns typical of someone reading from prepared text, "
+             interpretation += "with steady pacing, uniform intonation, and regular energy levels.\n\n"
+         else:
+             interpretation += "**Description:** The speech exhibits characteristics of spontaneous speaking. "
+             interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
+             interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
+ 
+         return interpretation
+ 
+ 
+ if __name__ == "__main__":
+     classifier = AudioClassifier()
+     print("\nAvailable pre-trained models:")
+     for name, filename in AudioClassifier.AVAILABLE_MODELS.items():
+         print(f" - {name}: {filename}")
+ 
+     print("\nModel architecture:")
+     print(classifier.model)
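A short standalone sketch of the classifier above (assumptions: a checkpoint sits next to the module, otherwise the class falls back to an untrained model with a warning; `sample.wav` is a placeholder path):

    from audio_classifier import AudioClassifier

    clf = AudioClassifier()  # loads spectrogram_cnn_4s_window.pth by default
    result = clf.classify("sample.wav")

    # Fused CNN + prosody verdict, plus the per-branch outputs
    print(result["classification"], f"{result['confidence'] * 100:.1f}%")
    print("CNN:", result["cnn_classification"], "| Prosody:", result["prosody_classification"])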
pipeline.py ADDED
@@ -0,0 +1,189 @@
+ """
+ Multimodal Authenticity Detection Pipeline
+ Integrates CNN audio classification, Whisper ASR, and text authenticity analysis.
+ """
+ 
+ import time
+ from typing import Dict, Optional
+ 
+ from audio_classifier import AudioClassifier
+ from speech_recognizer import SpeechRecognizer
+ from text_analyzer import TextAuthenticityAnalyzer
+ 
+ 
+ class AuthenticityDetectionPipeline:
+     def __init__(
+         self,
+         audio_model_path: Optional[str] = None,
+         whisper_model_size: str = "base",
+         device: Optional[str] = None,
+         ai_detection_threshold: float = 0.78
+     ):
+         print("\n" + "=" * 60)
+         print("Initializing Multimodal Authenticity Detection Pipeline")
+         print("=" * 60 + "\n")
+ 
+         # Initialize components
+         print("📊 Loading Audio Classifier (CNN)...")
+         self.audio_classifier = AudioClassifier(
+             model_path=audio_model_path,
+             device=device
+         )
+ 
+         print("\n🎤 Loading Speech Recognizer (Whisper)...")
+         self.speech_recognizer = SpeechRecognizer(
+             model_size=whisper_model_size,
+             device=device
+         )
+ 
+         print("\n📝 Loading Text Authenticity Analyzer...")
+         self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
+ 
+         print("\n✅ Pipeline initialization complete!")
+         print("=" * 60 + "\n")
+ 
+     def analyze_audio(self, audio_path: str, language: Optional[str] = None) -> Dict:
+         print("\n" + "=" * 60)
+         print("MULTIMODAL AUTHENTICITY ANALYSIS")
+         print("=" * 60 + "\n")
+ 
+         start_time = time.time()
+ 
+         # Stage 1: Audio classification (CNN-based read vs. spontaneous detection)
+         print("Stage 1: CNN Audio Classification...")
+         print("-" * 40)
+         audio_results = self.audio_classifier.classify(audio_path)
+         print("✓ CNN classification complete")
+         print(f"  Classification: {audio_results['classification'].upper()}")
+         print(f"  Confidence: {audio_results['confidence']*100:.1f}%")
+ 
+         # Stage 2: Speech analysis (Whisper for linguistic analysis)
+         print("\nStage 2: Speech Analysis (Whisper)...")
+         print("-" * 40)
+         asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
+         print("✓ Speech analysis complete")
+         print(f"  Language: {asr_results['language']}")
+         print(f"  Word count: {asr_results['word_count']}")
+         print(f"  Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
+ 
+         # Stage 3: Text authenticity analysis
+         print("\nStage 3: Analyzing text authenticity...")
+         print("-" * 40)
+         text_results = self.text_analyzer.analyze(asr_results['transcription'])
+         print("✓ Text analysis complete")
+         print(f"  Authenticity score: {text_results['authenticity_score']*100:.1f}%")
+         print(f"  Risk level: {text_results['risk_level'].upper()}")
+ 
+         # Stage 4: Combined assessment
+         print("\nStage 4: Generating final assessment...")
+         print("-" * 40)
+         final_assessment = self._generate_final_assessment(
+             audio_results,
+             asr_results,
+             text_results
+         )
+ 
+         elapsed_time = time.time() - start_time
+ 
+         print(f"✓ Analysis complete in {elapsed_time:.2f} seconds")
+         print("\n" + "=" * 60 + "\n")
+ 
+         return {
+             'audio_classification': audio_results,
+             'speech_recognition': asr_results,
+             'text_authenticity': text_results,
+             'final_assessment': final_assessment,
+             'processing_time': elapsed_time
+         }
+ 
+     def _generate_final_assessment(
+         self,
+         audio_results: Dict,
+         asr_results: Dict,
+         text_results: Dict
+     ) -> Dict:
+         # Map each modality onto a 0-1 authenticity scale (1 = spontaneous/genuine).
+         if audio_results['classification'] == 'spontaneous':
+             audio_score = audio_results['confidence']
+         else:  # read
+             audio_score = 1.0 - audio_results['confidence']
+ 
+         # kopparapu_score runs from 0 (spontaneous) to 1 (read), so its
+         # complement is the authenticity contribution for either class.
+         speech_pattern_score = 1.0 - asr_results['kopparapu_score']
+ 
+         text_auth_score = text_results['authenticity_score']
+ 
+         composite_score = (
+             audio_score * 0.30 +           # CNN acoustic analysis
+             speech_pattern_score * 0.30 +  # Speech patterns (Kopparapu)
+             text_auth_score * 0.40         # Text authenticity (AI detection)
+         )
+ 
+         if composite_score >= 0.7:
+             verdict = "AUTHENTIC"
+             risk = "low"
+             recommendation = "Response appears genuine with strong authenticity indicators."
+         elif composite_score >= 0.5:
+             verdict = "LIKELY AUTHENTIC"
+             risk = "moderate"
+             recommendation = "Response shows mostly authentic characteristics but has some concerns."
+         elif composite_score >= 0.3:
+             verdict = "QUESTIONABLE"
+             risk = "high"
+             recommendation = "Response has multiple authenticity concerns. Further investigation recommended."
+         else:
+             verdict = "LIKELY INAUTHENTIC"
+             risk = "critical"
+             recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
+ 
+         concerns = []
+         strengths = []
+ 
+         if audio_results['classification'] == 'read':
+             concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
+         else:
+             strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
+ 
+         if asr_results['kopparapu_classification'] == 'read':
+             concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
+         else:
+             strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
+ 
+         filler_ratio = asr_results['filler_words']['ratio']
+         if filler_ratio < 0.02:
+             concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
+         else:
+             strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
+ 
+         if asr_results['pause_patterns']['pause_variability'] < 0.3:
+             concerns.append("Regular pause patterns suggest reading at punctuation")
+         else:
+             strengths.append("Irregular pause patterns indicate spontaneous thinking")
+ 
+         if text_results['ai_detection']['ai_generated']:
+             concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
+ 
+         if text_results['authenticity_score'] > 0.7:
+             strengths.append("Text shows strong originality indicators")
+ 
+         return {
+             'verdict': verdict,
+             'risk_level': risk,
+             'composite_authenticity_score': float(composite_score),
+             'concerns': concerns,
+             'strengths': strengths,
+             'recommendation': recommendation,
+         }
+ 
+ 
+ if __name__ == "__main__":
+     # Example usage
+     print("Initializing Authenticity Detection Pipeline...")
+     model_path = "spectrogram_cnn_3s_window (1).pth"  # checkpoint filename as committed to this repo
+     pipeline = AuthenticityDetectionPipeline(
+         audio_model_path=model_path,
+         whisper_model_size="base"
+     )
+     print("\nPipeline ready for audio analysis.")
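To make the 0.30/0.30/0.40 weighting in `_generate_final_assessment` concrete, a worked example (the numbers are illustrative only):

    # Suppose the CNN says "spontaneous" with confidence 0.80, the Kopparapu
    # score is 0.30 (0 = spontaneous, 1 = read), and the text analyzer
    # returns an authenticity score of 0.60.
    audio_score = 0.80                 # spontaneous -> confidence used directly
    speech_pattern_score = 1.0 - 0.30  # complement of the Kopparapu score
    text_auth_score = 0.60

    composite = audio_score * 0.30 + speech_pattern_score * 0.30 + text_auth_score * 0.40
    print(round(composite, 2))  # 0.69 -> in the 0.5-0.7 band: "LIKELY AUTHENTIC", moderate risk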
plagiarism_detection.py ADDED
@@ -0,0 +1,340 @@
+ import json
+ from pathlib import Path
+ 
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel
+ 
+ # nothing is random here, so no seed is set
+ 
+ 
+ # code adapted from https://huggingface.co/desklib/ai-text-detector-v1.01 and modified for this project
+ class DesklibAIDetectionModel(PreTrainedModel):
+     config_class = AutoConfig
+ 
+     def __init__(self, config):
+         # Initialize the PreTrainedModel
+         super().__init__(config)
+         # Initialize the base transformer model.
+         self.model = AutoModel.from_config(config)
+         # Define a classifier head.
+         self.classifier = nn.Linear(config.hidden_size, 1)
+         # Initialize weights (handled by PreTrainedModel)
+         self.init_weights()
+ 
+     def forward(self, input_ids, attention_mask=None, labels=None):
+         # Forward pass through the transformer
+         outputs = self.model(input_ids, attention_mask=attention_mask)
+         last_hidden_state = outputs[0]
+         # Mean pooling over the non-padded tokens
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+         sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
+         sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
+         pooled_output = sum_embeddings / sum_mask
+ 
+         # Classifier
+         logits = self.classifier(pooled_output)
+         loss = None
+         if labels is not None:
+             loss_fct = nn.BCEWithLogitsLoss()
+             loss = loss_fct(logits.view(-1), labels.float())
+ 
+         output = {"logits": logits}
+         if loss is not None:
+             output["loss"] = loss
+         return output
+ 
+ 
+ def predict_single_text(text, model, tokenizer, device, max_len=768, threshold=0.5):
+     encoded = tokenizer(
+         text,
+         padding='max_length',
+         truncation=True,
+         max_length=max_len,
+         return_tensors='pt'
+     )
+     input_ids = encoded['input_ids'].to(device)
+     attention_mask = encoded['attention_mask'].to(device)
+ 
+     model.eval()
+     with torch.no_grad():
+         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+         logits = outputs["logits"]
+         probability = torch.sigmoid(logits).item()
+ 
+     ai_detected = probability >= threshold
+     return probability, ai_detected
+ 
+ 
+ # own code to easily create text files and feed them to the model for predictions
+ def ai_plagiarism_detection(text, threshold=0.5, show_results=False):
+     """
+     Detect whether the given text is AI generated or human written.
+     Note: the model and tokenizer are loaded on every call; cache them for repeated use.
+     Args:
+         text (str): Input text to be classified.
+         threshold (float): Probability cutoff for the AI label.
+         show_results (bool): If True, prints the results.
+     Returns:
+         probability (float): Probability of being AI generated.
+         ai_detected (bool): True if AI generated, False if human written.
+     """
+     # Model and tokenizer directory
+     model_directory = "desklib/ai-text-detector-v1.01"
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_directory)
+     model = DesklibAIDetectionModel.from_pretrained(model_directory)
+     # Set up device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     # Predict
+     probability, ai_detected = predict_single_text(text, model, tokenizer, device, threshold=threshold)
+     # Optionally print the results
+     if show_results:
+         print(f"Probability of being AI generated: {probability:.4f}")
+         print(f"Predicted label: {'AI Generated' if ai_detected else 'Not AI Generated'}")
+     return probability, ai_detected
+ 
+ 
+ def make_textfile(file_path="text_folder/example.txt", content="This is an example text file.\nAnd this is the second line.\n"):
+     """
+     Create a text file with the given content.
+     Args:
+         file_path (str): Path to the text file to be created.
+         content (str): Content to write into the text file.
+     """
+     with open(file_path, "w") as f:
+         f.write(content)
+ 
+ 
+ def get_text_from_textfile(text_dir="text_folder"):
+     """
+     Read all text files from a directory and return a dictionary mapping filename to content.
+     Args:
+         text_dir (str): Directory containing text files.
+     Returns:
+         text_dict (dict): Dictionary with filename as key and file content as value.
+     """
+     text_dict = {}
+     text_file_list = list(Path(text_dir).glob("*.txt"))
+     for elem in text_file_list:
+         content = elem.read_text(encoding="utf-8")  # read file content
+         text_dict[elem.name] = content  # use filename as key
+     return text_dict
+ 
+ 
+ def classifying_plagiarism_using_textfiles(best_threshold=0.78):
+     """
+     Shows how the model can be used to detect AI-generated text in the files in the
+     text_folder folder. This is the form used in the pipeline.
+     """
+     # make sure the folder exists
+     Path("text_folder").mkdir(exist_ok=True)
+ 
+     # create example text files
+     make_textfile("text_folder/ai_text.txt", "AI detection refers to the process of identifying whether a given piece of content, such as text, images, or audio, has been generated by artificial intelligence. This is achieved using various machine learning techniques, including perplexity analysis, entropy measurements, linguistic pattern recognition, and neural network classifiers trained on human and AI-generated data. Advanced AI detection tools assess writing style, coherence, and statistical properties to determine the likelihood of AI involvement. These tools are widely used in academia, journalism, and content moderation to ensure originality, prevent misinformation, and maintain ethical standards. As AI-generated content becomes increasingly sophisticated, AI detection methods continue to evolve, integrating deep learning models and ensemble techniques for improved accuracy.")
+     make_textfile("text_folder/human_text.txt", "It is estimated that a major part of the content in the internet will be generated by AI / LLMs by 2025. This leads to a lot of misinformation and credibility related issues. That is why if is important to have accurate tools to identify if a content is AI generated or human written")
+     textfile_dict = get_text_from_textfile(text_dir="text_folder")  # dict of filename -> content; text_dir holds the files to classify
+ 
+     # get predictions for each text file
+     for textfile, text in textfile_dict.items():
+         print(f"Getting predictions for: {textfile}")
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=best_threshold, show_results=False)  # optimal threshold: 0.78
+         # print results
+         print(f"{textfile} Results:\n Probability of being AI generated: {probability:.4f}")
+         print(f" Predicted label: {'AI Generated' if ai_detected else 'Not AI Generated'}\n")
+ 
+ 
+ def get_texts_from_jsonfile(json_file_path, sample_size=100, ignore_warning=False):
+     """
+     Get text partitions from a JSON-lines file. Each partition is a text that can be
+     given as input to the ai_plagiarism_detection model.
+     Args:
+         json_file_path (str): Path of the json file.
+         sample_size (int): Determines how many texts are returned.
+         ignore_warning (bool): If True, do not raise when fewer texts than sample_size are found.
+     Returns:
+         text_list (list): The texts, in file order.
+     """
+     text_list = []
+     try:
+         with open(json_file_path, "r", encoding="utf-8") as f:
+             for i, line in enumerate(f):
+                 obj = json.loads(line)
+                 text_list.append(obj["text"])
+                 if i == sample_size - 1:
+                     break
+     except (OSError, json.JSONDecodeError) as exc:
+         raise ValueError(f"{json_file_path} does not exist or could not be parsed.") from exc
+     # raise if fewer texts were found than the sample size
+     if not ignore_warning:
+         if len(text_list) != sample_size:
+             raise ValueError(f"Warning: only {len(text_list)} texts found, fewer than sample size {sample_size}")
+ 
+     return text_list
+ 
+ 
+ def run_experiment_using_jsonfile(threshold=0.5):
+     """
+     Runs the experiment and saves the results in
+     ai_plagiarism_experiment/ai_plagiarism_detection_results.csv.
+     """
+     # Total sample size; two datasets (JSON files) are used, so sample_size//2 per dataset.
+     sample_size = 240
+     sample_size //= 2
+ 
+     # make sure the folders exist
+     Path("json_folder").mkdir(exist_ok=True)
+     Path("ai_plagiarism_experiment").mkdir(exist_ok=True)
+ 
+     # ------- GET TRUE NEGATIVE TEXTS (human thought and spoken) FROM JSON FILE -------
+     # load the JSON file with Whisper-transcribed text from the ML Commons dataset
+     text_list = get_texts_from_jsonfile("json_folder/ML_commons.json", sample_size)
+ 
+     # get predictions for each text
+     predictions = []
+     for i, text in enumerate(text_list):
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=threshold, show_results=False)
+         # save results
+         predictions.append({"ML_commons_text_index": i,
+                             "GPT_text_index": np.nan,
+                             "text_length": len(text),
+                             "topic": "unknown",
+                             "probability": probability,
+                             "ai_detected": ai_detected,
+                             "really_ai": False
+                             })
+     # convert to dataframe
+     df = pd.DataFrame(predictions)
+     print("-------- 50% of samples predicted of json experiment --------")
+ 
+     # ------- GET TRUE POSITIVE TEXTS (AI written) FROM JSON FILE -------
+     # load the JSON file with GPT-generated texts
+     text_list = get_texts_from_jsonfile("json_folder/gpt_generated.json", sample_size)
+ 
+     predictions = []
+     for i, text in enumerate(text_list):
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=threshold, show_results=False)
+ 
+         # save results; the generated texts are grouped by topic in blocks of 40
+         if i < 40:
+             topic = "astronomy"
+         elif i < 80:
+             topic = "quantum computing"
+         else:
+             topic = "daily life, personal growth, and everyday experiences"
+ 
+         predictions.append({"ML_commons_text_index": np.nan,
+                             "GPT_text_index": i,
+                             "text_length": len(text),
+                             "topic": topic,
+                             "probability": probability,
+                             "ai_detected": ai_detected,
+                             "really_ai": True
+                             })
+     # convert to dataframe and append
+     new_rows = pd.DataFrame(predictions)
+     df = pd.concat([df, new_rows], ignore_index=True)
+     print("------- 100% of samples predicted of json experiment --------")
+     # save to csv
+     df.to_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv", index=False)
+ 
+     # update metrics
+     get_metrics(threshold=threshold)
+ 
+ 
+ def get_metrics(df=None, threshold=0.5, save_to_csv=True):
+     """
+     Calculates the metrics and saves them in ai_plagiarism_experiment/res_metrics(t={threshold}).csv.
+     """
+     if df is None:
+         # read from csv
+         df = pd.read_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv")
+ 
+     # confusion-matrix counts; note that Whisper transcripts can make human
+     # speech look more AI-like, which inflates the false positives
+     fp = ((df["probability"] >= threshold) & (df["really_ai"] == False)).sum()  # false positives
+     tn = ((df["probability"] < threshold) & (df["really_ai"] == False)).sum()   # true negatives
+     tp = ((df["probability"] >= threshold) & (df["really_ai"] == True)).sum()   # true positives
+     fn = ((df["probability"] < threshold) & (df["really_ai"] == True)).sum()    # false negatives
+ 
+     recall = tp / (tp + fn) if (tp + fn) != 0 else 0
+     precision = tp / (tp + fp) if (tp + fp) != 0 else 0
+     accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0
+ 
+     # text-length statistics for both datasets
+     ML_commons_length_mean = df.loc[df["ML_commons_text_index"].notna(), "text_length"].mean()
+     ML_commons_length_std = df.loc[df["ML_commons_text_index"].notna(), "text_length"].std()
+     gpt_length_mean = df.loc[df["GPT_text_index"].notna(), "text_length"].mean()
+     gpt_length_std = df.loc[df["GPT_text_index"].notna(), "text_length"].std()
+ 
+     # collect the metrics in a dataframe
+     results = pd.DataFrame({
+         "Metric": ["TP", "TN", "FP", "FN", "Recall", "Precision", "Accuracy", "Total samples",
+                    "ML_commons_length_mean", "ML_commons_length_std", "gpt_length_mean", "gpt_length_std"],
+         "Value": [tp, tn, fp, fn, recall, precision, accuracy, len(df),
+                   ML_commons_length_mean, ML_commons_length_std, gpt_length_mean, gpt_length_std]
+     })
+     if save_to_csv:
+         results.to_csv(f"ai_plagiarism_experiment/res_metrics(t={threshold}).csv", index=False)
+     return results
+ 
+ 
+ def tune_threshold(metric="Accuracy"):
+     """Finds the threshold that maximises the given metric for the AI plagiarism detector, using the results CSV."""
+     df = pd.read_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv")
+     # search boundaries (named lo/hi to avoid shadowing the min/max builtins)
+     lo = 0.0
+     hi = 1.0
+     step = 0.01
+     # init
+     best_metric = 0
+     best_threshold = lo
+     metric_values = []
+     thresholds = []
+     for threshold in np.arange(lo, hi + step, step):
+         threshold = round(threshold, 2)
+         results = get_metrics(df, threshold, False)
+         opti_metric = results.loc[results["Metric"] == metric, "Value"].iloc[0]
+         # save for plotting
+         metric_values.append(opti_metric)
+         thresholds.append(threshold)
+         # update the best threshold
+         if opti_metric > best_metric:
+             best_metric = opti_metric
+             best_threshold = threshold
+ 
+     # plot the tuning curve
+     Path("ai_plagiarism_tuning_plots").mkdir(exist_ok=True)
+     plt.plot(thresholds, metric_values)
+     plt.xlabel("threshold")
+     plt.ylabel(metric)
+     plt.title(f"threshold vs {metric}")
+     plt.savefig(f"ai_plagiarism_tuning_plots/threshold_vs_{metric}.png")
+     plt.close()
+     return best_threshold
+ 
+ 
+ if __name__ == "__main__":
+     print("-------- Starting ai plagiarism experiment! --------\n")
+     # run the experiment using the json files, first with the default threshold
+     run_experiment_using_jsonfile(threshold=0.5)
+ 
+     # search for the threshold that maximises accuracy
+     metric = "Accuracy"
+     best_threshold_accuracy = tune_threshold(metric=metric)
+     print(f"Best threshold for {metric}: {best_threshold_accuracy}")
+     # search for the threshold that maximises precision
+     metric = "Precision"
+     best_threshold_precision = tune_threshold(metric=metric)
+     print(f"Best threshold for {metric}: {best_threshold_precision}")
+ 
+     # rerun the experiment with the accuracy-optimal threshold
+     run_experiment_using_jsonfile(threshold=best_threshold_accuracy)
+ 
+     # pipeline-style usage with the best-accuracy threshold (best_threshold=0.78);
+     # when optimising for precision, use best_threshold=0.97 instead
+     classifying_plagiarism_using_textfiles(best_threshold=best_threshold_accuracy)
+ 
+     print("\n-------- Done! --------")
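A minimal single-text sketch of the detector above (the desklib model is downloaded on first use; the sample sentence is illustrative, and 0.78 is the accuracy-tuned threshold reported in this module):

    from plagiarism_detection import ai_plagiarism_detection

    probability, ai_detected = ai_plagiarism_detection(
        text="It is estimated that a major part of the content on the internet will be generated by AI.",
        threshold=0.78,
        show_results=True,  # prints the probability and the predicted label
    )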
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ openai-whisper>=20230314
+ transformers>=4.30.0
+ gradio>=4.0.0
+ numpy>=1.24.0
+ scikit-learn>=1.3.0
+ librosa>=0.10.0
+ soundfile>=0.12.0
+ scipy>=1.11.0
+ requests>=2.31.0
+ pandas>=2.0.0
+ matplotlib>=3.7.0
spectrogram_cnn_3s_window (1).pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8335aa8c8932430ad456f12fe37eba28c7253f75dcda9d513ec3054f7b14f264
+ size 44788683
spectrogram_cnn_4s_window.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42534fa18a4df083acfae6dc6fe0bbe24af622a1a7b68938ff6eb1d10eb4b6e5
+ size 44790091
spectrogram_cnn_4s_window_488_x_488.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c8109600c503b699d10c02a29cf5bca147c1cf22ad428c618f70ade567d6aa0
+ size 44791371
speech_recognizer.py ADDED
@@ -0,0 +1,270 @@
+ import whisper
+ import torch
+ import numpy as np
+ import re
+ from typing import Any, Dict, Optional, List
+ import warnings
+ warnings.filterwarnings("ignore")
+
+
+ class SpeechRecognizer:
+     def __init__(self, model_size: str = "base", device: Optional[str] = None):
+         if device is None:
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         else:
+             self.device = device
+
+         print(f"Loading Whisper {model_size} model on {self.device}...")
+         self.model = whisper.load_model(model_size, device=self.device)
+         print("Whisper model loaded successfully.")
+
+         self.model_size = model_size
+
+     def transcribe(
+         self,
+         audio_path: str,
+         language: Optional[str] = None,
+         task: str = "transcribe"
+     ) -> Dict[str, Any]:
+         # transcribe with Whisper; word-level timestamps enable better pause detection
+         result = self.model.transcribe(
+             audio_path,
+             language=language,
+             task=task,
+             verbose=False,
+             word_timestamps=True
+         )
+
+         transcription = result['text'].strip()
+         detected_language = result.get('language', 'unknown')
+         segments = result.get('segments', [])
+
+         analysis = self._analyze_transcription(transcription, segments)
+
+         duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
+         kopparapu_features = self._extract_kopparapu_features(transcription, duration)
+         kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
+
+         return {
+             'transcription': transcription,
+             'language': detected_language,
+             'segments': segments,
+             'word_count': analysis['word_count'],
+             'duration': analysis['duration'],
+             'speech_rate': analysis['speech_rate'],
+             'pause_patterns': analysis['pause_patterns'],
+             'filler_words': analysis['filler_words'],
+             'kopparapu_features': kopparapu_features,
+             'kopparapu_score': kopparapu_score,
+             'kopparapu_classification': 'read' if kopparapu_score >= 0.5 else 'spontaneous',
+             'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
+         }
+
+     def _analyze_transcription(self, text: str, segments: List[Dict]) -> Dict:
+         words = text.split()
+         word_count = len(words)
+
+         duration = 0
+         if segments:
+             duration = segments[-1]['end'] - segments[0]['start']
+
+         speech_rate = (word_count / duration * 60) if duration > 0 else 0
+
+         filler_words_list = [
+             ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
+             ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
+             ('i mean', r'\bi mean\b'), ('actually', r'\bactually\b'),
+             ('basically', r'\bbasically\b'), ('literally', r'\bliterally\b'),
+             ('so', r'\bso\b'), ('well', r'\bwell\b'), ('okay', r'\bokay\b'),
+             ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
+         ]
+
+         text_lower = text.lower()
+         filler_count = {}
+         total_fillers = 0
+
+         for filler_name, filler_pattern in filler_words_list:
+             matches = re.findall(filler_pattern, text_lower, re.IGNORECASE)
+             count = len(matches)
+             if count > 0:
+                 filler_count[filler_name] = count
+                 total_fillers += count
+
+         filler_ratio = total_fillers / word_count if word_count > 0 else 0
+
+         pause_patterns = self._analyze_pauses(segments)
+
+         return {
+             'word_count': word_count,
+             'duration': duration,
+             'speech_rate': speech_rate,
+             'filler_words': {
+                 'count': total_fillers,
+                 'ratio': filler_ratio,
+                 'details': filler_count
+             },
+             'pause_patterns': pause_patterns
+         }
+
+     def _analyze_pauses(self, segments: List[Dict]) -> Dict:
+         pauses = []
+
+         if len(segments) >= 2:
+             for i in range(len(segments) - 1):
+                 pause = segments[i + 1]['start'] - segments[i]['end']
+                 if pause > 0.05:  # consider inter-segment pauses > 50ms (lowered threshold)
+                     pauses.append(pause)
+
+         for segment in segments:
+             if 'words' in segment and len(segment['words']) > 1:
+                 words = segment['words']
+                 for i in range(len(words) - 1):
+                     if 'start' in words[i] and 'end' in words[i] and 'start' in words[i + 1]:
+                         pause = words[i + 1]['start'] - words[i]['end']
+                         if pause > 0.15:  # word-level pauses > 150ms are significant
+                             pauses.append(pause)
+
+         if not pauses:
+             return {
+                 'avg_pause': 0.0,
+                 'max_pause': 0.0,
+                 'num_pauses': 0,
+                 'pause_variability': 0.0
+             }
+
+         return {
+             'avg_pause': float(np.mean(pauses)),
+             'max_pause': float(np.max(pauses)),
+             'num_pauses': len(pauses),
+             'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
+         }
+
+     def _extract_kopparapu_features(self, text: str, duration_sec: float) -> Dict:
+         """
+         Extract Kopparapu-like linguistic features from a transcription.
+         Based on: https://arxiv.org/pdf/2306.08012
+         """
+         text = text.strip()
+         if len(text) == 0:
+             return {
+                 'alpha_ratio': 0.0,
+                 'chars_per_word': 0.0,
+                 'words_per_sec': 0.0,
+                 'nonalpha_per_sec': 0.0,
+                 'repetition_count': 0,
+                 'filler_rate': 0.0
+             }
+
+         total_chars = len(text)
+         alpha_chars = sum(c.isalpha() for c in text)
+         nonalpha_chars = total_chars - alpha_chars
+
+         alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
+
+         words = text.split()
+         num_words = max(len(words), 1)
+         chars_per_word = alpha_chars / num_words
+
+         duration_sec = max(duration_sec, 1e-3)
+         words_per_sec = num_words / duration_sec
+         nonalpha_per_sec = nonalpha_chars / duration_sec
+
+         # runs of 3+ identical characters (e.g. "sooo") count as repetitions
+         char_reps = len(re.findall(r'(.)\1{2,}', text))
+
+         # immediate word repetitions ("the the") for words longer than 2 characters
+         words_list = text.lower().split()
+         word_reps = 0
+         for i in range(len(words_list) - 1):
+             if words_list[i] == words_list[i + 1] and len(words_list[i]) > 2:
+                 word_reps += 1
+
+         repetition_count = char_reps + word_reps
+
+         lower = text.lower()
+         filler_patterns = [
+             r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
+             r'\blike\b', r'\byou know\b', r'\bi mean\b',
+             r'\bactually\b', r'\bbasically\b', r'\bliterally\b',
+             r'\bso\b', r'\bwell\b', r'\bokay\b',
+             r'\bhmm+\b', r'\bmm+\b', r'\boh\b'
+         ]
+         filler_count = 0
+         for pattern in filler_patterns:
+             filler_count += len(re.findall(pattern, lower))
+         filler_rate = filler_count / num_words
+
+         return {
+             'alpha_ratio': float(alpha_ratio),
+             'chars_per_word': float(chars_per_word),
+             'words_per_sec': float(words_per_sec),
+             'nonalpha_per_sec': float(nonalpha_per_sec),
+             'repetition_count': int(repetition_count),
+             'filler_rate': float(filler_rate)
+         }
+
+     def _logistic(self, x: float, a: float, b: float) -> float:
+         # logistic squashing with midpoint a and scale b
+         return 1.0 / (1.0 + np.exp(-(x - a) / b))
+
+     def _calculate_kopparapu_score(self, features: Dict) -> float:
+         # L1: longer average words point towards read (scripted) speech
+         f1 = features['chars_per_word']
+         L1 = self._logistic(f1, a=5.0, b=1.5)
+
+         # L2: a faster word rate also points towards read speech
+         f2 = features['words_per_sec']
+         L2 = self._logistic(f2, a=2.0, b=0.7)
+
+         # L3: disfluencies (non-alpha characters, fillers) point the other way,
+         # so the combined disfluency signal is negated before squashing
+         f3_raw = features['nonalpha_per_sec'] + 10.0 * features['filler_rate']
+         L3 = self._logistic(-f3_raw, a=0.0, b=1.0)
+
+         # weighted blend; scores >= 0.5 are classified as 'read' upstream
+         score = 0.4 * L1 + 0.4 * L2 + 0.2 * L3
+
+         return float(score)
+
+     def _interpret_speech_patterns(self, analysis: Dict, kopparapu_features: Dict = None, kopparapu_score: float = None) -> str:
+         filler_ratio = analysis['filler_words']['ratio']
+         pause_patterns = analysis['pause_patterns']
+         speech_rate = analysis['speech_rate']
+
+         interpretation = "**Overall Assessment:**\n\n"
+
+         spontaneity_score = 0
+         indicators = []
+
+         if filler_ratio > 0.03:
+             spontaneity_score += 1
+             indicators.append(f"Filler words present ({filler_ratio*100:.1f}%)")
+
+         if pause_patterns['pause_variability'] > 0.5:
+             spontaneity_score += 1
+             indicators.append(f"Irregular pause patterns (variability: {pause_patterns['pause_variability']:.2f})")
+
+         if 120 <= speech_rate <= 180:
+             spontaneity_score += 1
+             indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
+
+         if spontaneity_score >= 2:
+             interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
+             if indicators:
+                 interpretation += "Key indicators:\n"
+                 for indicator in indicators:
+                     interpretation += f"- {indicator}\n"
+         else:
+             interpretation += "⚠ **Speech patterns suggest potentially scripted or read speech.**\n\n"
+             if filler_ratio < 0.02:
+                 interpretation += "- Very low filler word usage\n"
+             if pause_patterns['pause_variability'] < 0.3:
+                 interpretation += "- Regular, consistent pause patterns\n"
+             if speech_rate > 180:
+                 interpretation += "- Fast, steady speaking rate\n"
+
+         return interpretation
+
+     def get_detailed_segments(self, audio_path: str) -> List[Dict]:
+         result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
+         return result.get('segments', [])
+
+
+ if __name__ == "__main__":
+     recognizer = SpeechRecognizer(model_size="base")
+     print(f"Speech recognizer initialized with {recognizer.model_size} model")
+     print(f"Device: {recognizer.device}")
+
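
A brief usage sketch for this module (sample.wav is a placeholder path; any audio format Whisper accepts would do):

    from speech_recognizer import SpeechRecognizer

    recognizer = SpeechRecognizer(model_size="base")
    result = recognizer.transcribe("sample.wav")

    print(result['transcription'])
    print(f"{result['speech_rate']:.1f} words/min")
    print(result['kopparapu_classification'])  # 'read' or 'spontaneous'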
text_analyzer.py ADDED
@@ -0,0 +1,137 @@
+ import re
+ import requests
+ from typing import Dict, List, Tuple, Optional
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     RobertaTokenizer,
+     RobertaForSequenceClassification
+ )
+ import numpy as np
+ from collections import Counter
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ try:
+     from plagiarism_detection import ai_plagiarism_detection
+     DESKLIB_AVAILABLE = True
+ except ImportError:
+     DESKLIB_AVAILABLE = False
+     print("Warning: plagiarism_detection module not found. Using fallback AI detection.")
+
+
+ class AITextDetector:
+     def __init__(self, device: Optional[str] = None, threshold: float = 0.78):
+         self.threshold = threshold
+
+         if not DESKLIB_AVAILABLE:
+             print("Warning: plagiarism_detection module not found. AI detection will not be available.")
+             print("Ensure plagiarism_detection.py is in the same directory.")
+             self.available = False
+         else:
+             print(f"Using Desklib AI text detector (threshold: {self.threshold})")
+             self.available = True
+
+     def detect_ai_text(self, text: str) -> Dict:
+         if not self.available:
+             # return a neutral result if the Desklib detector is not available
+             return {
+                 'ai_generated': False,
+                 'confidence': 0.5,
+                 'indicators': [],
+                 'interpretation': "AI detection not available. Install plagiarism_detection module.",
+                 'model_used': 'N/A (module not found)'
+             }
+
+         # use the Desklib AI detector
+         try:
+             probability, ai_detected = ai_plagiarism_detection(
+                 text,
+                 threshold=self.threshold,
+                 show_results=False
+             )
+
+             return {
+                 'ai_generated': ai_detected,
+                 'confidence': float(probability),
+                 'indicators': self._identify_ai_indicators(probability),
+                 'interpretation': self._interpret_ai_detection(probability),
+                 'model_used': 'Desklib AI Detector v1.01'
+             }
+         except Exception as e:
+             print(f"Error in AI detection: {e}")
+             return {
+                 'ai_generated': False,
+                 'confidence': 0.5,
+                 'indicators': [],
+                 'interpretation': f"AI detection error: {str(e)}",
+                 'model_used': 'Error'
+             }
+
+     def _identify_ai_indicators(self, probability: float) -> List[str]:
+         indicators = []
+
+         if probability > 0.9:
+             indicators.append("Very high AI probability (>90%)")
+         elif probability > 0.7:
+             indicators.append("High AI probability (70-90%)")
+         elif probability > self.threshold:
+             indicators.append(f"AI detected above threshold ({self.threshold*100:.0f}%)")
+
+         return indicators
+
+     def _interpret_ai_detection(self, score: float) -> str:
+         interpretation = "**AI-Generated Text Detection:**\n\n"
+         interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
+         interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"
+
+         return interpretation
+
+
+ class TextAuthenticityAnalyzer:
+     def __init__(self, device: Optional[str] = None, ai_threshold: float = 0.78):
+         self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)
+
+     def analyze(self, text: str) -> Dict:
+         # run AI detection
+         ai_results = self.ai_detector.detect_ai_text(text)
+
+         # overall authenticity is the complement of the AI probability
+         ai_penalty = ai_results['confidence']
+         authenticity_score = 1.0 - ai_penalty
+
+         # map the score onto a risk band
+         if authenticity_score < 0.3:
+             overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
+             risk_level = "high"
+         elif authenticity_score < 0.5:
+             overall_assessment = "MODERATE RISK: Likely AI-generated"
+             risk_level = "moderate"
+         elif authenticity_score < 0.7:
+             overall_assessment = "LOW RISK: Some AI characteristics"
+             risk_level = "low"
+         else:
+             overall_assessment = "AUTHENTIC: Text appears human-written"
+             risk_level = "minimal"
+
+         return {
+             'authenticity_score': float(authenticity_score),
+             'risk_level': risk_level,
+             'overall_assessment': overall_assessment,
+             'ai_detection': ai_results,
+         }
+
+
+ if __name__ == "__main__":
+     # example usage
+     analyzer = TextAuthenticityAnalyzer()
+     print("Text authenticity analyzer initialized.")
+     print("Components: Desklib AI Text Detector")
+
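
And a matching usage sketch for the text analyzer (the input string is illustrative; scores depend on the Desklib detector being importable):

    from text_analyzer import TextAuthenticityAnalyzer

    analyzer = TextAuthenticityAnalyzer(ai_threshold=0.78)
    report = analyzer.analyze("Transcribed speech goes here.")

    print(report['risk_level'], f"{report['authenticity_score']:.2f}")
    print(report['overall_assessment'])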