Spaces: Ranam Hamoud committed · 0b42831
Parent(s): 8528e25
Update files and add .gitignore, remove pycache from tracking

Files changed:
- .gitignore +3 -0
- __pycache__/audio_classifier.cpython-313.pyc +0 -0
- __pycache__/pipeline.cpython-313.pyc +0 -0
- __pycache__/plagiarism_detection.cpython-313.pyc +0 -0
- __pycache__/speech_recognizer.cpython-313.pyc +0 -0
- __pycache__/text_analyzer.cpython-313.pyc +0 -0
- app.py +451 -261
- audio_classifier.py +189 -101
- examples/.DS_Store +0 -0
- examples/{spontaneous1.ogg → read1.wav} +2 -2
- examples/{read1.ogg → spontaneous1.wav} +2 -2
- pipeline.py +62 -30
- speech_recognizer.py +103 -84
- text_analyzer.py +23 -31
.gitignore
ADDED
@@ -0,0 +1,3 @@
+pipeline_scores.png
+plot_component_accuracy.py
+__pycache__/
__pycache__/audio_classifier.cpython-313.pyc DELETED
Binary file (19.2 kB)

__pycache__/pipeline.cpython-313.pyc DELETED
Binary file (8.3 kB)

__pycache__/plagiarism_detection.cpython-313.pyc DELETED
Binary file (15.7 kB)

__pycache__/speech_recognizer.cpython-313.pyc DELETED
Binary file (16 kB)

__pycache__/text_analyzer.cpython-313.pyc DELETED
Binary file (5.56 kB)
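The app.py diff below is driven entirely by the dictionary returned from AuthenticityDetectionPipeline.analyze_audio. As a reading aid, here is a minimal sketch of that interface, using only the constructor argument, method signature, and result keys visible in this diff; treat it as illustrative, not as the module's documented API.

    # Sketch of the pipeline interface app.py consumes (names taken from the diff).
    from pipeline import AuthenticityDetectionPipeline

    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    results = pipeline.analyze_audio("examples/read1.wav", language=None)

    audio_class = results['audio_classification']  # CNN + prosody classification
    asr = results['speech_recognition']            # transcription, rates, pauses
    text_auth = results['text_authenticity']       # ai_detection confidence
    final = results['final_assessment']            # 'verdict' string mapped to a color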
app.py
CHANGED
@@ -3,36 +3,430 @@ import os
 from pipeline import AuthenticityDetectionPipeline
 import traceback

+# initialize the pipeline on startup
 try:
     pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
     pipeline_ready = True
-except Exception:
+except Exception as e:
     pipeline_ready = False
+    pipeline_error = str(e)
+    import traceback
+    print(f"Could not start pipeline: {e}")
+    traceback.print_exc()


+# build the acoustic features display HTML
+def build_acoustic_features_display(audio_class):
+    classification = audio_class['classification']
+    confidence = audio_class['confidence']
+    cnn_class = audio_class['cnn_classification']
+    cnn_conf = audio_class['cnn_confidence']
+    prosody_class = audio_class['prosody_classification']
+    prosody_conf = audio_class['prosody_confidence']
+    prosody_scores = audio_class.get('prosody_scores', {})
+    acoustic_features = audio_class.get('acoustic_features', {})
+
+    # color scheme based on classification
+    if classification == 'spontaneous':
+        main_color = '#10b981'
+        bg_color = '#ecfdf5'
+        label = 'SPONTANEOUS'
+    else:
+        main_color = '#f59e0b'
+        bg_color = '#fffbeb'
+        label = 'READ'
+
+    cnn_color = '#10b981' if cnn_class == 'spontaneous' else '#f59e0b'
+    prosody_color = '#10b981' if prosody_class == 'spontaneous' else '#f59e0b'
+
+    # build main classification header
+    output = f"""
+    <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+        <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label} SPEECH</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Combined acoustic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Analysis Components</h4>
+        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">CNN Neural Network</div>
+                <div style="font-size: 20px; font-weight: 700; color: {cnn_color}; margin-bottom: 8px;">{cnn_class.upper()}</div>
+                <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                    <div style="height: 100%; width: {cnn_conf*100:.0f}%; background: {cnn_color}; border-radius: 6px;"></div>
+                </div>
+                <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{cnn_conf*100:.1f}% confidence</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">Prosody Analysis</div>
+                <div style="font-size: 20px; font-weight: 700; color: {prosody_color}; margin-bottom: 8px;">{prosody_class.upper()}</div>
+                <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                    <div style="height: 100%; width: {prosody_conf*100:.0f}%; background: {prosody_color}; border-radius: 6px;"></div>
+                </div>
+                <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{prosody_conf*100:.1f}% confidence</div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Prosody Feature Breakdown</h4>
+    """
+
+    # feature descriptions
+    feature_info = {
+        'spectral_variability': {'name': 'Spectral Variability', 'unit': 'Hz', 'description': 'Variation in frequency content over time'},
+        'zcr_mean': {'name': 'Zero Crossing Rate', 'unit': 'ratio', 'description': 'Rate of signal sign changes'},
+        'energy_level': {'name': 'Energy Level', 'unit': 'RMS', 'description': 'Overall loudness and intensity'},
+        'tempo': {'name': 'Speech Tempo', 'unit': 'BPM', 'description': 'Rhythmic pacing of speech'}
+    }
+
+    # add feature details
+    for key, info in feature_info.items():
+        if key in prosody_scores:
+            score_data = prosody_scores[key]
+            score = score_data['score']
+            value = score_data['value']
+            interp = score_data['interpretation']
+            unit = info['unit']
+
+            bar_color = '#10b981' if score < 0.4 else '#f59e0b' if score > 0.6 else '#6b7280'
+            indicator_position = score * 100
+
+            output += f"""
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; margin-bottom: 10px;">
+                <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 8px;">
+                    <div>
+                        <div style="font-weight: 600; color: #1f2937; font-size: 14px;">{info['name']}</div>
+                        <div style="font-size: 11px; color: #9ca3af;">{info['description']}</div>
+                    </div>
+                    <div style="text-align: right;">
+                        <div style="font-size: 13px; font-weight: 600; color: {bar_color};">{interp}</div>
+                        <div style="font-size: 11px; color: #6b7280;">{value:.3f} <span style="color: #9ca3af;">{unit}</span></div>
+                    </div>
+                </div>
+                <div style="position: relative; background: linear-gradient(to right, #10b981, #6b7280, #f59e0b); border-radius: 4px; height: 6px; margin: 10px 0 6px 0;">
+                    <div style="position: absolute; left: {indicator_position}%; top: -4px; transform: translateX(-50%); width: 14px; height: 14px; background: white; border: 2px solid {bar_color}; border-radius: 50%; box-shadow: 0 1px 3px rgba(0,0,0,0.15);"></div>
+                </div>
+                <div style="display: flex; justify-content: space-between; font-size: 10px; color: #9ca3af;">
+                    <span>Spontaneous</span>
+                    <span>Read</span>
+                </div>
+            </div>
+            """
+
+    output += "</div>"
+
+    # add raw acoustic measurements
+    output += """
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Raw Acoustic Measurements</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+    """
+
+    if acoustic_features:
+        metrics = [
+            ('Tempo', f"{acoustic_features.get('tempo', 0):.1f}", 'BPM'),
+            ('Pitch Mean', f"{acoustic_features.get('pitch_mean', 0):.1f}", 'Hz'),
+            ('Energy Mean', f"{acoustic_features.get('energy_mean', 0):.4f}", ''),
+            ('ZCR Mean', f"{acoustic_features.get('zcr_mean', 0):.4f}", ''),
+        ]
+        for name, value, unit in metrics:
+            output += f"""
+            <div style="background: #f9fafb; border-radius: 8px; padding: 12px; text-align: center;">
+                <div style="font-size: 16px; font-weight: 600; color: #1f2937;">{value}</div>
+                <div style="font-size: 10px; color: #6b7280; margin-top: 2px;">{name} {unit}</div>
+            </div>
+            """
+
+    output += """
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the transcription display HTML
+def build_transcription_display(asr):
+    # determine speech rate interpretation
+    if asr['speech_rate'] > 160:
+        rate_color = '#f59e0b'
+        rate_label = 'Fast'
+        rate_desc = 'Above average speaking speed'
+    elif asr['speech_rate'] < 120:
+        rate_color = '#3b82f6'
+        rate_label = 'Slow'
+        rate_desc = 'Below average speaking speed'
+    else:
+        rate_color = '#10b981'
+        rate_label = 'Normal'
+        rate_desc = 'Average conversational pace'
+
+    output = f"""
+    <div style="background: linear-gradient(135deg, #eff6ff 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid #3b82f633;">
+        <h3 style="margin: 0; color: #1e40af; font-size: 22px; font-weight: 700;">Speech Transcription</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Detected language: <strong>{asr['language'].upper()}</strong></p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Speech Metrics</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['duration']:.1f}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Duration (sec)</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['word_count']}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: {rate_color};">{asr['speech_rate']:.0f}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words/min</div>
+            </div>
+            <div style="background: {rate_color}15; border-radius: 10px; padding: 16px; text-align: center; border: 1px solid {rate_color}33;">
+                <div style="font-size: 18px; font-weight: 700; color: {rate_color};">{rate_label}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">{rate_desc}</div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Full Transcription</h4>
+        <div style="background: #f9fafb; border-radius: 10px; padding: 20px; border-left: 4px solid #3b82f6;">
+            <p style="margin: 0; font-size: 15px; line-height: 1.8; color: #374151; font-style: italic;">"{asr['transcription']}"</p>
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the speech patterns display HTML
+def build_speech_patterns_display(asr):
+    output = ""
+
+    # kopparapu classification section
+    if 'kopparapu_score' in asr:
+        classification = asr['kopparapu_classification'].upper()
+        kop_score = asr['kopparapu_score']
+        confidence = kop_score if kop_score >= 0.5 else (1 - kop_score)
+
+        if classification == 'SPONTANEOUS':
+            class_color = '#10b981'
+            class_bg = '#ecfdf5'
+        else:
+            class_color = '#f59e0b'
+            class_bg = '#fffbeb'
+
+        kf = asr['kopparapu_features']
+
+        output += f"""
+        <div style="background: linear-gradient(135deg, {class_bg} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {class_color}33;">
+            <h3 style="margin: 0; color: {class_color}; font-size: 22px; font-weight: 700;">{classification} SPEECH</h3>
+            <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Linguistic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+            <div style="margin-top: 12px; background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 8px;">
+                <div style="height: 100%; width: {kop_score*100:.0f}%; background: linear-gradient(to right, #10b981, #f59e0b); border-radius: 6px;"></div>
+            </div>
+        </div>
+
+        <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+            <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Linguistic Metrics</h4>
+            <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['chars_per_word']:.2f}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Chars/Word</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['words_per_sec']:.2f}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Words/Sec</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['filler_rate']*100:.1f}%</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Filler Rate</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['repetition_count']}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Repetitions</div>
+                </div>
+            </div>
+        </div>
+        """
+
+    # filler words section
+    filler_ratio = asr['filler_words']['ratio']
+    filler_count = asr['filler_words']['count']
+
+    if filler_ratio > 0.05:
+        filler_color = '#10b981'
+        filler_label = 'High filler usage'
+        filler_desc = 'Strong indicator of spontaneous speech'
+    elif filler_ratio < 0.02:
+        filler_color = '#f59e0b'
+        filler_label = 'Low filler usage'
+        filler_desc = 'May indicate reading or rehearsed speech'
+    else:
+        filler_color = '#6b7280'
+        filler_label = 'Moderate filler usage'
+        filler_desc = 'Normal conversational pattern'
+
+    output += f"""
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Filler Words</h4>
+        <div style="display: grid; grid-template-columns: 1fr 1fr 2fr; gap: 16px; align-items: center;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_count}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Filler Words</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_ratio*100:.1f}%</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Of Speech</div>
+            </div>
+            <div style="background: {filler_color}10; border-radius: 10px; padding: 16px; border: 1px solid {filler_color}33;">
+                <div style="font-weight: 600; color: {filler_color}; font-size: 14px;">{filler_label}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{filler_desc}</div>
+            </div>
+        </div>
+    </div>
+    """
+
+    # pause patterns section
+    pause_var = asr['pause_patterns']['pause_variability']
+
+    if pause_var < 0.3:
+        pause_color = '#f59e0b'
+        pause_label = 'Regular pauses'
+        pause_desc = 'Suggests reading at punctuation marks'
+    elif pause_var > 0.6:
+        pause_color = '#10b981'
+        pause_label = 'Irregular pauses'
+        pause_desc = 'Natural thinking breaks indicate spontaneous speech'
+    else:
+        pause_color = '#6b7280'
+        pause_label = 'Moderate variability'
+        pause_desc = 'Mixed pattern'
+
+    output += f"""
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Pause Patterns</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-bottom: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['num_pauses']}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Total Pauses</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['avg_pause']:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Avg Duration</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['max_pause']:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Longest Pause</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: {pause_color};">{pause_var:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Variability</div>
+            </div>
+        </div>
+        <div style="background: {pause_color}10; border-radius: 10px; padding: 14px; border: 1px solid {pause_color}33;">
+            <div style="font-weight: 600; color: {pause_color}; font-size: 14px;">{pause_label}</div>
+            <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{pause_desc}</div>
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the AI detection display HTML
+def build_ai_detection_display(text_auth):
+    is_ai = text_auth['ai_detection']['ai_generated']
+    ai_prob = text_auth['ai_detection']['confidence']
+    human_prob = 1 - ai_prob
+
+    if is_ai:
+        main_color = '#ef4444'
+        bg_color = '#fef2f2'
+        label = 'AI-GENERATED LIKELY'
+        desc = 'The text shows patterns consistent with AI-generated content'
+    else:
+        main_color = '#10b981'
+        bg_color = '#ecfdf5'
+        label = 'HUMAN-WRITTEN LIKELY'
+        desc = 'The text shows patterns consistent with human-written content'
+
+    output = f"""
+    <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+        <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label}</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">{desc}</p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 20px 0; color: #374151; font-size: 15px; font-weight: 600;">Confidence Analysis</h4>
+
+        <div style="margin-bottom: 20px;">
+            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                <span style="font-weight: 600; color: #ef4444; font-size: 14px;">AI Generated</span>
+                <span style="font-weight: 700; color: #ef4444; font-size: 18px;">{ai_prob*100:.0f}%</span>
+            </div>
+            <div style="background: #fee2e2; border-radius: 8px; overflow: hidden; height: 12px;">
+                <div style="height: 100%; width: {ai_prob*100:.0f}%; background: #ef4444; border-radius: 8px;"></div>
+            </div>
+        </div>
+
+        <div>
+            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                <span style="font-weight: 600; color: #10b981; font-size: 14px;">Human Written</span>
+                <span style="font-weight: 700; color: #10b981; font-size: 18px;">{human_prob*100:.0f}%</span>
+            </div>
+            <div style="background: #d1fae5; border-radius: 8px; overflow: hidden; height: 12px;">
+                <div style="height: 100%; width: {human_prob*100:.0f}%; background: #10b981; border-radius: 8px;"></div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: #fffbeb; border: 1px solid #fcd34d; border-radius: 10px; padding: 14px;">
+        <div style="font-size: 13px; color: #92400e; line-height: 1.5;">
+            <strong>Note:</strong> AI detection is probabilistic and should be used as one factor among many in your evaluation.
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# main function to analyze uploaded audio file
 def analyze_audio_file(audio_file):
+    # check if pipeline is ready
     if not pipeline_ready:
-
-
-
-
+        error_msg = pipeline_error if 'pipeline_error' in dir() else "Something went wrong"
+        error_html = f"""
+        <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+            <h3 style="margin: 0 0 8px 0; color: #dc2626; font-size: 16px;">Pipeline not ready</h3>
+            <p style="margin: 0; color: #7f1d1d; font-size: 14px;">{error_msg}</p>
+        </div>
+        """
+        return (error_html, "", "", "", "")

+    # check if audio file was provided
     if audio_file is None:
-
-
-
-
+        placeholder_html = """
+        <div style="background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 12px; padding: 40px; text-align: center;">
+            <p style="margin: 0; color: #6b7280; font-size: 15px;">Please upload an audio file to begin analysis.</p>
+        </div>
+        """
+        return (placeholder_html, "", "", "", "")

+    # run analysis
     try:
         language_code = None
         results = pipeline.analyze_audio(audio_file, language=language_code)

+        # extract results from each component
         audio_class = results['audio_classification']
         asr = results['speech_recognition']
         text_auth = results['text_authenticity']
         final = results['final_assessment']

-
+        # color mapping for verdict
         verdict_color = {
             "AUTHENTIC": "#10b981",
             "LIKELY AUTHENTIC": "#3b82f6",

@@ -42,6 +436,7 @@ def analyze_audio_file(audio_file):

         color = verdict_color.get(final['verdict'], '#6b7280')

+        # build overall status display
         overall_status = f"""
         <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
             <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>

@@ -66,175 +461,11 @@ def analyze_audio_file(audio_file):
         </div>
         </div>
         """
-
-
-        transcription_output =
-
-
-        transcription_output += f"| **Language** | {asr['language'].upper()} |\n"
-        transcription_output += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
-        transcription_output += f"| **Word Count** | {asr['word_count']} words |\n"
-        transcription_output += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
-        if asr['speech_rate'] > 160:
-            transcription_output += "**Fast speech rate** - Above average speaking speed\n\n"
-        elif asr['speech_rate'] < 120:
-            transcription_output += "**Slow speech rate** - Below average speaking speed\n\n"
-        else:
-            transcription_output += "**Normal speech rate** - Average conversational pace\n\n"
-
-        transcription_output += "---\n\n"
-        transcription_output += "#### Full Transcription\n\n"
-        transcription_output += f"> {asr['transcription']}"
-
-
-        if 'kopparapu_score' in asr:
-            classification = asr['kopparapu_classification'].upper()
-            confidence = asr['kopparapu_score'] if asr['kopparapu_score'] >= 0.5 else (1 - asr['kopparapu_score'])
-
-            speech_patterns = f" ### **Classification: {classification} SPEECH**\n\n"
-            speech_patterns += f"**Score:** {asr['kopparapu_score']:.3f} (0=spontaneous, 1=read)\n"
-            speech_patterns += f"**Confidence:** {confidence*100:.1f}%\n\n"
-
-            speech_patterns += "---\n\n"
-            speech_patterns += "#### Linguistic Metrics\n\n"
-            kf = asr['kopparapu_features']
-
-            speech_patterns += "| Feature | Value | Interpretation |\n"
-            speech_patterns += "|---------|-------|----------------|\n"
-            speech_patterns += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
-            if kf['chars_per_word'] > 5.5:
-                speech_patterns += "Complex vocabulary |\n"
-            elif kf['chars_per_word'] < 4.5:
-                speech_patterns += "Simple vocabulary |\n"
-            else:
-                speech_patterns += "Average complexity |\n"
-
-            speech_patterns += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
-            if kf['words_per_sec'] > 3:
-                speech_patterns += "Fast pacing |\n"
-            elif kf['words_per_sec'] < 2:
-                speech_patterns += "Slow pacing |\n"
-            else:
-                speech_patterns += "Normal pacing |\n"
-
-            speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
-            if kf['filler_rate'] > 0.05:
-                speech_patterns += "High (spontaneous) |\n"
-            elif kf['filler_rate'] < 0.02:
-                speech_patterns += "Low (scripted) |\n"
-            else:
-                speech_patterns += "Moderate |\n"
-
-            speech_patterns += f"| **Repetitions** | {kf['repetition_count']} | "
-            if kf['repetition_count'] > 3:
-                speech_patterns += "Multiple (thinking aloud) |\n"
-            elif kf['repetition_count'] == 0:
-                speech_patterns += "None (prepared) |\n"
-            else:
-                speech_patterns += "Few |\n"
-
-            speech_patterns += "\n---\n\n"
-            speech_patterns += "#### Reading Style Indicators\n\n"
-
-            speech_patterns += "| Feature | Value | Interpretation |\n"
-            speech_patterns += "|---------|-------|----------------|\n"
-
-            # Pause regularity
-            pause_reg = kf.get('pause_regularity', 0.5)
-            speech_patterns += f"| **Pause Regularity** | {pause_reg:.2f} | "
-            if pause_reg > 0.7:
-                speech_patterns += "Very regular (read) |\n"
-            elif pause_reg > 0.4:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Irregular (spontaneous) |\n"
-
-            # Speech rate variability
-            rate_var = kf.get('speech_rate_variability', 0.0)
-            speech_patterns += f"| **Rate Variability** | {rate_var:.2f} | "
-            if rate_var > 0.6:
-                speech_patterns += "High (spontaneous) |\n"
-            elif rate_var > 0.3:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Steady pace (read) |\n"
-
-            # Sentence variance
-            sent_var = kf.get('sentence_length_variance', 0.0)
-            speech_patterns += f"| **Sentence Variance** | {sent_var:.2f} | "
-            if sent_var > 0.5:
-                speech_patterns += "Variable (spontaneous) |\n"
-            elif sent_var > 0.25:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Uniform (read) |\n"
-
-            # Self-corrections
-            corrections = kf.get('self_correction_count', 0)
-            speech_patterns += f"| **Self-Corrections** | {corrections} | "
-            if corrections > 2:
-                speech_patterns += "Multiple (spontaneous) |\n"
-            elif corrections > 0:
-                speech_patterns += "Few |\n"
-            else:
-                speech_patterns += "None (scripted) |\n"
-
-            speech_patterns += "\n"
-
-        speech_patterns += "---\n\n"
-        speech_patterns += "#### Filler Words & Disfluencies\n\n"
-        filler_ratio = asr['filler_words']['ratio']
-        speech_patterns += f"**Count:** {asr['filler_words']['count']} filler words\n"
-        speech_patterns += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
-
-        if asr['filler_words']['details']:
-            speech_patterns += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
-
-        if filler_ratio > 0.05:
-            speech_patterns += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
-        elif filler_ratio < 0.02:
-            speech_patterns += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
-        else:
-            speech_patterns += "**Moderate filler usage** - Normal conversational pattern\n\n"
-
-        speech_patterns += "---\n\n"
-        speech_patterns += "#### Pause Patterns\n\n"
-        pause_var = asr['pause_patterns']['pause_variability']
-
-        speech_patterns += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
-        speech_patterns += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
-        speech_patterns += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
-        speech_patterns += f"**Variability:** {pause_var:.2f}\n\n"
-
-        if pause_var < 0.3:
-            speech_patterns += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
-        elif pause_var > 0.6:
-            speech_patterns += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
-        else:
-            speech_patterns += "**Moderate variability** - Mixed pattern\n\n"
-
-        is_ai = text_auth['ai_detection']['ai_generated']
-        ai_prob = text_auth['ai_detection']['confidence']
-
-        if is_ai:
-            ai_output = "### **AI-GENERATED LIKELY**\n\n"
-        else:
-            ai_output = "### **HUMAN-WRITTEN LIKELY**\n\n"
-
-        ai_output += "**Confidence:**\n\n"
-        bar_length = 30
-        ai_bars = int(ai_prob * bar_length)
-        human_bars = bar_length - ai_bars
-        ai_output += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
-        ai_output += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
-
-        ai_output += "---\n\n"
-        ai_output += "#### Interpretation\n\n"
-        ai_interpretation = text_auth['ai_detection'].get('interpretation', 'No interpretation available.')
-        if ai_interpretation:
-            ai_output += ai_interpretation
-        else:
-            ai_output += "No interpretation available."
+        # build tab outputs
+        acoustic_output = build_acoustic_features_display(audio_class)
+        transcription_output = build_transcription_display(asr)
+        speech_patterns = build_speech_patterns_display(asr)
+        ai_output = build_ai_detection_display(text_auth)

         return (
             overall_status,

@@ -245,11 +476,22 @@ def analyze_audio_file(audio_file):
         )

     except Exception as e:
-
-
+        error_html = f"""
+        <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+            <h3 style="margin: 0 0 12px 0; color: #dc2626; font-size: 16px;">Something went wrong</h3>
+            <p style="margin: 0 0 12px 0; color: #7f1d1d; font-size: 14px;">{str(e)}</p>
+            <details style="margin-top: 12px;">
+                <summary style="color: #6b7280; cursor: pointer; font-size: 13px;">More info</summary>
+                <pre style="background: #1f2937; color: #f3f4f6; padding: 12px; border-radius: 8px; margin-top: 8px; font-size: 11px; overflow-x: auto;">{traceback.format_exc()}</pre>
+            </details>
+        </div>
+        """
+        return (error_html, "", "", "", "")


+# create the gradio interface
 def create_interface():
+    # custom CSS for styling
     custom_css = """
     @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');

@@ -257,26 +499,6 @@ def create_interface():
     font-family: 'IBM Plex Sans', sans-serif !important;
     background: white !important;
     }
-    .contain {
-        max-width: 100% !important;
-        width: 100% !important;
-        margin: 0 auto !important;
-        background: white !important;
-        padding: 0 !important;
-    }
-    .tab-nav button {
-        font-family: 'IBM Plex Sans', sans-serif;
-        font-size: 14px;
-        font-weight: 500;
-        padding: 10px 16px;
-        border-radius: 8px 8px 0 0;
-        transition: all 0.2s;
-    }
-    .tab-nav button.selected {
-        background: #2563eb;
-        color: white;
-        font-weight: 600;
-    }
     button.primary, .primary {
         background: #2563eb !important;
         color: white !important;

@@ -285,23 +507,12 @@ def create_interface():
         font-weight: 600 !important;
         padding: 12px 24px !important;
         border-radius: 8px !important;
-        transition: all 0.2s !important;
-    }
-    button.primary:hover, .primary:hover {
-        background: #1d4ed8 !important;
-    }
-    .markdown-text {
-        font-family: 'IBM Plex Sans', sans-serif;
-        line-height: 1.7;
-    }
-    h1, h2, h3, h4 {
-        font-family: 'IBM Plex Sans', sans-serif;
-        font-weight: 600;
     }
     """

     with gr.Blocks(title="Authenticity Detection System") as demo:

+        # header section
         gr.HTML(f"""
         <style>
         {custom_css}

@@ -309,17 +520,6 @@ def create_interface():
         <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
             <div style='padding: 16px 0;'>
                 <div style='display: flex; align-items: center; gap: 12px;'>
-                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="32" height="32">
-                        <defs>
-                            <linearGradient id="g" x1="0" y1="0" x2="64" y2="0" gradientUnits="userSpaceOnUse">
-                                <stop offset="0" stop-color="#1d4ed8" />
-                                <stop offset="1" stop-color="#0ea5e9" />
-                            </linearGradient>
-                        </defs>
-                        <rect x="0" y="0" width="64" height="64" rx="12" fill="#ffffff"/>
-                        <path d="M4 32 C 10 18, 18 46, 24 32 S 36 18, 40 32 52 46, 60 32"
-                              fill="none" stroke="url(#g)" stroke-width="4" stroke-linecap="round" stroke-linejoin="round"/>
-                    </svg>
                     <div>
                         <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
                             LEIDEN UNIVERSITY · LIACS

@@ -337,8 +537,6 @@ def create_interface():
         <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
             Detecting AI-Assisted Responses in Online Settings
         </h2>
-        <p style='font-size: 18px; color: #374151; margin: 0 0 24px 0;'>
-        </p>
         <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
             <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
                 Multi-Modal Analysis

@@ -351,15 +549,17 @@ def create_interface():
         </section>
         """)

+        # main layout
         with gr.Row():
             with gr.Column(scale=1):
                 gr.HTML("""
-                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px;
+                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
                 </div>
                 """)

+                # audio input component
                 audio_input = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",

@@ -367,12 +567,14 @@ def create_interface():
                     show_label=False
                 )

+                # analyze button
                 analyze_btn = gr.Button(
                     "Analyze Audio",
                     variant="primary",
                     size="lg"
                 )

+                # requirements info
                 gr.HTML("""
                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
                     <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>

@@ -381,40 +583,34 @@ def create_interface():
                     <li><strong>Duration:</strong> 30 sec - 5 min</li>
                 </ul>
                 </div>
-
-                <div style='background: #fef3c7; border: 1px solid #fbbf24; padding: 16px; border-radius: 12px; margin-top: 16px;'>
-                    <div style='font-size: 12px; color: #92400e; line-height: 1.6;'>
-                        <strong>Note:</strong> Provides probabilistic assessments.
-                        Use as one factor in evaluation.
-                    </div>
-                </div>
                 """)

             with gr.Column(scale=2):
                 gr.HTML("""
-                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px;
+                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
                 </div>
                 """)

-
+                # overall output
+                overall_output = gr.HTML()

+                # results tabs
                 with gr.Tabs() as tabs:
                     with gr.Tab("Acoustic Features"):
-                        acoustic_output = gr.
+                        acoustic_output = gr.HTML()

                     with gr.Tab("Transcription"):
-                        transcription_output = gr.
+                        transcription_output = gr.HTML()

                     with gr.Tab("Speech Patterns"):
-                        speech_output = gr.
+                        speech_output = gr.HTML()

                     with gr.Tab("AI Detection"):
-                        ai_output = gr.
-
+                        ai_output = gr.HTML()

-                #
+                # example audio files
                 gr.HTML("""
                 <div style='margin-top: 20px; margin-bottom: 10px;'>
                     <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>

@@ -424,8 +620,8 @@ def create_interface():
                 examples_dir = os.path.join(os.path.dirname(__file__), "examples")
                 gr.Examples(
                     examples=[
-                        [os.path.join(examples_dir, "read1.
-                        [os.path.join(examples_dir, "spontaneous1.
+                        [os.path.join(examples_dir, "read1.wav")],
+                        [os.path.join(examples_dir, "spontaneous1.wav")]
                     ],
                     inputs=[audio_input],
                     outputs=[

@@ -438,31 +634,26 @@ def create_interface():
                     fn=analyze_audio_file,
                     label="",
                     examples_per_page=2,
-                    cache_examples=
+                    cache_examples="lazy"
                 )

+                # loading animation function
                 def show_loading():
                     loading_html = """
                     <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border: 2px solid #667eea; padding: 30px; border-radius: 16px; margin: 10px 0; text-align: center;'>
                         <h2 style='color: white; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
                             Analyzing...
                         </h2>
-                        <div style='margin-top: 20px;'>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out infinite;'></div>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.2s infinite;'></div>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.4s infinite;'></div>
-                        </div>
                     </div>
-                    <style>
-                    @keyframes pulse {
-                        0%, 100% { opacity: 0.3; transform: scale(0.8); }
-                        50% { opacity: 1; transform: scale(1.2); }
-                    }
-                    </style>
                     """
-
-
+                    loading_tab = """
+                    <div style='padding: 40px; text-align: center; color: #6b7280;'>
+                        <p style='margin-top: 16px; font-size: 14px;'>Processing...</p>
+                    </div>
+                    """
+                    return loading_html, loading_tab, loading_tab, loading_tab, loading_tab

+                # connect button to analysis function
                 analyze_btn.click(
                     fn=show_loading,
                     inputs=None,

@@ -486,13 +677,12 @@ def create_interface():
                     ]
                 )

+        # footer
         gr.HTML("""
         <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
             <div style='text-align: center;'>
                 <p style='margin: 0; font-size: 14px; color: #6b7280;'>
                 </p>
-                <p style='margin: 8px 0 0 0; font-size: 13px; color: #9ca3af;'>
-                </p>
             </div>
         </footer>
         """)

@@ -500,6 +690,7 @@ def create_interface():
     return demo


+# run the app when script is executed
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(

@@ -508,4 +699,3 @@ if __name__ == "__main__":
         share=False,
         show_error=True
     )
-
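The click wiring at new lines 657-659 binds show_loading first so the loading cards render immediately; the continuation that actually runs analyze_audio_file is elided in this view. A sketch of how that chain typically looks in Gradio, with the .then() call and output lists assumed rather than taken from the diff:

    outputs = [overall_output, acoustic_output, transcription_output,
               speech_output, ai_output]

    analyze_btn.click(
        fn=show_loading,            # swap in the loading cards instantly
        inputs=None,
        outputs=outputs,
    ).then(
        fn=analyze_audio_file,      # then run the full analysis
        inputs=[audio_input],
        outputs=outputs,
    )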
audio_classifier.py
CHANGED
|
@@ -3,72 +3,95 @@ import torch.nn as nn
|
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import librosa
|
| 5 |
import numpy as np
|
| 6 |
-
from typing import Dict
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
class BasicBlock(nn.Module):
|
| 9 |
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
|
| 10 |
super(BasicBlock, self).__init__()
|
|
|
|
| 11 |
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
|
| 12 |
stride=stride, padding=1, bias=False)
|
| 13 |
self.bn1 = nn.BatchNorm2d(out_channels)
|
|
|
|
| 14 |
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
|
| 15 |
stride=1, padding=1, bias=False)
|
| 16 |
self.bn2 = nn.BatchNorm2d(out_channels)
|
|
|
|
| 17 |
self.downsample = downsample
|
| 18 |
|
| 19 |
def forward(self, x):
|
|
|
|
| 20 |
identity = x
|
|
|
|
| 21 |
out = F.relu(self.bn1(self.conv1(x)))
|
|
|
|
| 22 |
out = self.bn2(self.conv2(out))
|
| 23 |
|
|
|
|
| 24 |
if self.downsample is not None:
|
| 25 |
identity = self.downsample(x)
|
| 26 |
|
|
|
|
| 27 |
out += identity
|
| 28 |
out = F.relu(out)
|
| 29 |
return out
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
| 32 |
class SpeechStyleCNN(nn.Module):
|
| 33 |
def __init__(self, num_classes=2):
|
| 34 |
super(SpeechStyleCNN, self).__init__()
|
| 35 |
|
|
|
|
| 36 |
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
| 37 |
self.bn1 = nn.BatchNorm2d(64)
|
| 38 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
| 39 |
|
|
|
|
| 40 |
self.layer1 = self._make_layer(64, 64, 2, stride=1)
|
| 41 |
self.layer2 = self._make_layer(64, 128, 2, stride=2)
|
| 42 |
self.layer3 = self._make_layer(128, 256, 2, stride=2)
|
| 43 |
self.layer4 = self._make_layer(256, 512, 2, stride=2)
|
| 44 |
|
|
|
|
| 45 |
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 46 |
self.fc = nn.Linear(512, num_classes)
|
| 47 |
|
|
|
|
| 48 |
def _make_layer(self, in_channels, out_channels, blocks, stride=1):
|
| 49 |
downsample = None
|
|
|
|
| 50 |
if stride != 1 or in_channels != out_channels:
|
| 51 |
downsample = nn.Sequential(
|
| 52 |
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
|
| 53 |
nn.BatchNorm2d(out_channels)
|
| 54 |
)
|
| 55 |
|
|
|
|
| 56 |
layers = []
|
|
|
|
| 57 |
layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
|
|
|
|
| 58 |
for _ in range(1, blocks):
|
| 59 |
layers.append(BasicBlock(out_channels, out_channels))
|
| 60 |
|
| 61 |
return nn.Sequential(*layers)
|
| 62 |
|
| 63 |
-    def forward(self, x…
|
|
|
|
| 64 |
x = F.relu(self.bn1(self.conv1(x)))
|
| 65 |
x = self.maxpool(x)
|
| 66 |
|
|
|
|
| 67 |
x = self.layer1(x)
|
| 68 |
x = self.layer2(x)
|
| 69 |
x = self.layer3(x)
|
| 70 |
x = self.layer4(x)
|
| 71 |
|
|
|
|
| 72 |
x = self.avgpool(x)
|
| 73 |
x = torch.flatten(x, 1)
|
| 74 |
x = self.fc(x)
|
|
@@ -76,70 +99,82 @@ class SpeechStyleCNN(nn.Module):
|
|
| 76 |
return x
|
| 77 |
|
| 78 |
|
|
|
|
| 79 |
class AudioClassifier:
|
|
|
|
| 80 |
AVAILABLE_MODELS = {
|
| 81 |
'3s_window': 'spectrogram_cnn_3s_window.pth',
|
| 82 |
-
# '4s_window': 'spectrogram_cnn_4s_window.pth',
|
| 83 |
-
# '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
|
| 84 |
}
|
| 85 |
|
| 86 |
@classmethod
|
| 87 |
-    def get_model_path(cls, model_name…
|
|
|
|
| 88 |
import os
|
| 89 |
if model_name not in cls.AVAILABLE_MODELS:
|
| 90 |
-
|
|
|
|
| 91 |
return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
|
| 92 |
|
| 93 |
-    def __init__(self, model_path…
|
|
|
|
| 94 |
if device is None:
|
| 95 |
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 96 |
else:
|
| 97 |
self.device = torch.device(device)
|
| 98 |
-
|
|
|
|
| 99 |
self.model = SpeechStyleCNN().to(self.device)
|
| 100 |
|
|
|
|
| 101 |
if model_path is None:
|
| 102 |
import os
|
| 103 |
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
|
| 104 |
|
|
|
|
| 105 |
try:
|
| 106 |
print(f"Attempting to load model from: {model_path}")
|
| 107 |
-
state_dict = torch.load(model_path, map_location=self.device)
|
| 108 |
self.model.load_state_dict(state_dict)
|
| 109 |
print(f"✓ Successfully loaded trained model from: {model_path}")
|
| 110 |
except FileNotFoundError:
|
| 111 |
-
|
|
|
|
| 112 |
except Exception as e:
|
| 113 |
-
|
| 114 |
|
|
|
|
| 115 |
self.model.eval()
|
| 116 |
|
|
|
|
| 117 |
self.sample_rate = 16000
|
| 118 |
self.n_mels = 128
|
| 119 |
self.n_fft = 2048
|
| 120 |
self.hop_length = 512
|
| 121 |
|
| 122 |
-
|
|
|
|
|
|
|
| 123 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 124 |
|
| 125 |
-
#
|
| 126 |
window_samples = int(window_size * sr)
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
hop_samples = window_samples // 2
|
| 131 |
windows = []
|
|
|
|
| 132 |
for start in range(0, len(audio) - window_samples, hop_samples):
|
| 133 |
window = audio[start:start + window_samples]
|
| 134 |
windows.append(window)
|
| 135 |
|
| 136 |
-
#
|
| 137 |
if len(audio) > window_samples:
|
| 138 |
windows.append(audio[-window_samples:])
|
| 139 |
|
| 140 |
-
#
|
| 141 |
mel_specs = []
|
| 142 |
-
for window in windows[:5]: #
|
| 143 |
mel_spec = librosa.feature.melspectrogram(
|
| 144 |
y=window,
|
| 145 |
sr=sr,
|
|
@@ -149,10 +184,10 @@ class AudioClassifier:
|
|
| 149 |
)
|
| 150 |
mel_specs.append(mel_spec)
|
| 151 |
|
| 152 |
-
#
|
| 153 |
mel_spec = np.mean(mel_specs, axis=0)
|
| 154 |
else:
|
| 155 |
-
#
|
| 156 |
if len(audio) < window_samples:
|
| 157 |
audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
|
| 158 |
else:
|
|
@@ -166,22 +201,28 @@ class AudioClassifier:
|
|
| 166 |
hop_length=self.hop_length
|
| 167 |
)
|
| 168 |
|
|
|
|
| 169 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 170 |
|
|
|
|
| 171 |
mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
|
|
|
|
| 172 |
mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
|
| 173 |
|
| 174 |
return mel_spec_3ch
|
| 175 |
|
| 176 |
-
|
|
|
|
| 177 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 178 |
|
| 179 |
features = {}
|
| 180 |
|
|
|
|
| 181 |
onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
|
| 182 |
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
|
| 183 |
features['tempo'] = float(tempo)
|
| 184 |
|
|
|
|
| 185 |
pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
|
| 186 |
pitch_values = []
|
| 187 |
for t in range(pitches.shape[1]):
|
|
@@ -190,6 +231,7 @@ class AudioClassifier:
|
|
| 190 |
if pitch > 0:
|
| 191 |
pitch_values.append(pitch)
|
| 192 |
|
|
|
|
| 193 |
if pitch_values:
|
| 194 |
features['pitch_mean'] = float(np.mean(pitch_values))
|
| 195 |
features['pitch_std'] = float(np.std(pitch_values))
|
|
@@ -199,34 +241,40 @@ class AudioClassifier:
|
|
| 199 |
features['pitch_std'] = 0.0
|
| 200 |
features['pitch_range'] = 0.0
|
| 201 |
|
|
|
|
| 202 |
rms = librosa.feature.rms(y=audio)[0]
|
| 203 |
features['energy_mean'] = float(np.mean(rms))
|
| 204 |
features['energy_std'] = float(np.std(rms))
|
| 205 |
|
|
|
|
| 206 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 207 |
features['zcr_mean'] = float(np.mean(zcr))
|
| 208 |
features['zcr_std'] = float(np.std(zcr))
|
| 209 |
|
|
|
|
| 210 |
spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 211 |
features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
|
| 212 |
features['spectral_centroid_std'] = float(np.std(spectral_centroids))
|
| 213 |
|
| 214 |
return features
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
| 217 |
individual_scores = {}
|
| 218 |
|
|
|
|
| 219 |
sc_std = features['spectral_centroid_std']
|
| 220 |
-
if sc_std >=
|
| 221 |
-
spectral_score = 0.9 #
|
| 222 |
-
elif sc_std >=
|
| 223 |
-
spectral_score = 0.7
|
| 224 |
elif sc_std >= 1000:
|
| 225 |
-
spectral_score = 0.5
|
| 226 |
-
elif sc_std >=
|
| 227 |
-
spectral_score = 0.3
|
| 228 |
else:
|
| 229 |
-
spectral_score = 0.1 #
|
| 230 |
|
| 231 |
individual_scores['spectral_variability'] = {
|
| 232 |
'score': spectral_score,
|
|
@@ -234,17 +282,18 @@ class AudioClassifier:
|
|
| 234 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 235 |
}
|
| 236 |
|
|
|
|
| 237 |
zcr = features['zcr_mean']
|
| 238 |
-
if zcr >= 0.
|
| 239 |
-
zcr_score = 0.9
|
| 240 |
-
elif zcr >= 0.
|
| 241 |
-
zcr_score = 0.7
|
| 242 |
-
elif zcr >= 0.
|
| 243 |
-
zcr_score = 0.5
|
| 244 |
-
elif zcr >= 0.
|
| 245 |
-
zcr_score = 0.3
|
| 246 |
else:
|
| 247 |
-
zcr_score = 0.1
|
| 248 |
|
| 249 |
individual_scores['zcr_mean'] = {
|
| 250 |
'score': zcr_score,
|
|
@@ -252,18 +301,16 @@ class AudioClassifier:
|
|
| 252 |
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 253 |
}
|
| 254 |
|
| 255 |
-
#
|
| 256 |
-
# Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
|
| 257 |
-
# Threshold: ~0.06, read < threshold
|
| 258 |
energy = features['energy_mean']
|
| 259 |
if energy < 0.055:
|
| 260 |
-
energy_score = 0.
|
| 261 |
-
elif energy < 0.
|
| 262 |
-
energy_score = 0.
|
| 263 |
-
elif energy < 0.
|
| 264 |
-
energy_score = 0.
|
| 265 |
else:
|
| 266 |
-
energy_score = 0.
|
| 267 |
|
| 268 |
individual_scores['energy_level'] = {
|
| 269 |
'score': energy_score,
|
|
@@ -271,45 +318,80 @@ class AudioClassifier:
|
|
| 271 |
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 272 |
}
|
| 273 |
|
| 274 |
-
#
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
else:
|
| 282 |
-
|
| 283 |
|
| 284 |
-
individual_scores['
|
| 285 |
-
'score':
|
| 286 |
-
'value':
|
| 287 |
-
'interpretation': '
|
| 288 |
}
|
| 289 |
|
| 290 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
weights = {
|
| 292 |
-
'spectral_variability': 0.
|
| 293 |
-
'zcr_mean': 0.
|
| 294 |
-
'energy_level': 0.20,
|
| 295 |
-
'
|
|
|
|
|
|
|
| 296 |
}
|
| 297 |
|
|
|
|
| 298 |
overall_score = (
|
| 299 |
spectral_score * weights['spectral_variability'] +
|
| 300 |
zcr_score * weights['zcr_mean'] +
|
| 301 |
energy_score * weights['energy_level'] +
|
| 302 |
-
|
|
|
|
|
|
|
| 303 |
)
|
| 304 |
|
| 305 |
-
|
|
|
|
| 306 |
classification = 'read'
|
| 307 |
-
confidence = 0.5 + (overall_score - 0.5) * 0.
|
| 308 |
-
elif overall_score < 0.
|
| 309 |
classification = 'spontaneous'
|
| 310 |
-
confidence = 0.5 + (0.5 - overall_score) * 0.
|
| 311 |
else:
|
| 312 |
-
classification = 'read' if overall_score >= 0.
|
| 313 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 314 |
|
| 315 |
return {
|
|
@@ -319,11 +401,15 @@ class AudioClassifier:
|
|
| 319 |
'individual_scores': individual_scores
|
| 320 |
}
|
| 321 |
|
| 322 |
-
|
|
|
|
|
|
|
| 323 |
mel_spec = self.extract_mel_spectrogram(audio_path)
|
| 324 |
|
|
|
|
| 325 |
mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
|
| 326 |
|
|
|
|
| 327 |
with torch.no_grad():
|
| 328 |
logits = self.model(mel_tensor)
|
| 329 |
probabilities = F.softmax(logits, dim=1)
|
|
@@ -334,35 +420,36 @@ class AudioClassifier:
|
|
| 334 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 335 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
| 336 |
|
|
|
|
| 337 |
acoustic_features = self.extract_acoustic_features(audio_path)
|
| 338 |
|
|
|
|
| 339 |
prosody_scores = self._compute_prosody_scores(acoustic_features)
|
| 340 |
prosody_classification = prosody_scores['classification']
|
| 341 |
prosody_confidence = prosody_scores['confidence']
|
| 342 |
|
| 343 |
-
#
|
| 344 |
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
|
|
|
|
|
|
| 345 |
print(f"CNN classification: {cnn_class_name}")
|
| 346 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
final_confidence = 0.5 + (0.5 - weighted_score)
|
| 364 |
-
|
| 365 |
-
final_confidence = min(0.95, final_confidence)
|
| 366 |
|
| 367 |
return {
|
| 368 |
'classification': final_classification,
|
|
@@ -381,17 +468,18 @@ class AudioClassifier:
|
|
| 381 |
)
|
| 382 |
}
|
| 383 |
|
|
|
|
| 384 |
def _interpret_classification(
|
| 385 |
self,
|
| 386 |
-
final_class
|
| 387 |
-
final_confidence
|
| 388 |
-
cnn_class
|
| 389 |
-
cnn_confidence
|
| 390 |
-
prosody_class
|
| 391 |
-
prosody_confidence
|
| 392 |
-
prosody_scores
|
| 393 |
-
features
|
| 394 |
-
)
|
| 395 |
interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
|
| 396 |
interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
|
| 397 |
|
|
@@ -404,10 +492,10 @@ class AudioClassifier:
|
|
| 404 |
interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
|
| 405 |
interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
|
| 406 |
|
| 407 |
-
|
| 408 |
return interpretation
|
| 409 |
|
| 410 |
|
|
|
|
| 411 |
if __name__ == "__main__":
|
| 412 |
classifier = AudioClassifier()
|
| 413 |
print("\nAvailable pre-trained models:")
|
|
|
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import librosa
|
| 5 |
import numpy as np
|
|
|
|
| 6 |
|
| 7 |
+
|
| 8 |
+
# Basic building block for the ResNet-style CNN
|
| 9 |
+
# Uses two convolutional layers with batch normalization
|
| 10 |
class BasicBlock(nn.Module):
|
| 11 |
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
|
| 12 |
super(BasicBlock, self).__init__()
|
| 13 |
+
# first conv layer with specified stride
|
| 14 |
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
|
| 15 |
stride=stride, padding=1, bias=False)
|
| 16 |
self.bn1 = nn.BatchNorm2d(out_channels)
|
| 17 |
+
# second conv layer always has stride 1
|
| 18 |
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
|
| 19 |
stride=1, padding=1, bias=False)
|
| 20 |
self.bn2 = nn.BatchNorm2d(out_channels)
|
| 21 |
+
# downsample is used when dimensions change
|
| 22 |
self.downsample = downsample
|
| 23 |
|
| 24 |
def forward(self, x):
|
| 25 |
+
# save input for skip connection
|
| 26 |
identity = x
|
| 27 |
+
# pass through first conv + batchnorm + relu
|
| 28 |
out = F.relu(self.bn1(self.conv1(x)))
|
| 29 |
+
# pass through second conv + batchnorm
|
| 30 |
out = self.bn2(self.conv2(out))
|
| 31 |
|
| 32 |
+
# apply downsample if needed to match dimensions
|
| 33 |
if self.downsample is not None:
|
| 34 |
identity = self.downsample(x)
|
| 35 |
|
| 36 |
+
# add skip connection and apply relu
|
| 37 |
out += identity
|
| 38 |
out = F.relu(out)
|
| 39 |
return out
|
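A quick sanity check of the residual block (an illustrative sketch, not part of the commit): with stride 1 and matching channels, no downsample is created and the skip connection adds cleanly, so the output shape equals the input shape.

    import torch
    block = BasicBlock(64, 64)        # stride=1, downsample=None
    x = torch.randn(1, 64, 32, 32)
    assert block(x).shape == x.shape  # identity and conv path line up
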
| 40 |
|
| 41 |
|
| 42 |
+
# Main CNN model for speech style classification
|
| 43 |
+
# Architecture based on ResNet with custom layer configuration
|
| 44 |
class SpeechStyleCNN(nn.Module):
|
| 45 |
def __init__(self, num_classes=2):
|
| 46 |
super(SpeechStyleCNN, self).__init__()
|
| 47 |
|
| 48 |
+
# initial convolution layer - takes 3 channel input (RGB spectrogram)
|
| 49 |
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
| 50 |
self.bn1 = nn.BatchNorm2d(64)
|
| 51 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
| 52 |
|
| 53 |
+
# stack of residual blocks with increasing channel sizes
|
| 54 |
self.layer1 = self._make_layer(64, 64, 2, stride=1)
|
| 55 |
self.layer2 = self._make_layer(64, 128, 2, stride=2)
|
| 56 |
self.layer3 = self._make_layer(128, 256, 2, stride=2)
|
| 57 |
self.layer4 = self._make_layer(256, 512, 2, stride=2)
|
| 58 |
|
| 59 |
+
# global average pooling and final classification layer
|
| 60 |
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 61 |
self.fc = nn.Linear(512, num_classes)
|
| 62 |
|
| 63 |
+
# helper function to create a layer of residual blocks
|
| 64 |
def _make_layer(self, in_channels, out_channels, blocks, stride=1):
|
| 65 |
downsample = None
|
| 66 |
+
# need downsample when stride changes or channels don't match
|
| 67 |
if stride != 1 or in_channels != out_channels:
|
| 68 |
downsample = nn.Sequential(
|
| 69 |
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
|
| 70 |
nn.BatchNorm2d(out_channels)
|
| 71 |
)
|
| 72 |
|
| 73 |
+
# create list of blocks
|
| 74 |
layers = []
|
| 75 |
+
# first block may have different stride
|
| 76 |
layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
|
| 77 |
+
# remaining blocks have stride 1
|
| 78 |
for _ in range(1, blocks):
|
| 79 |
layers.append(BasicBlock(out_channels, out_channels))
|
| 80 |
|
| 81 |
return nn.Sequential(*layers)
|
| 82 |
|
| 83 |
+
def forward(self, x):
|
| 84 |
+
# initial conv block
|
| 85 |
x = F.relu(self.bn1(self.conv1(x)))
|
| 86 |
x = self.maxpool(x)
|
| 87 |
|
| 88 |
+
# pass through all residual layers
|
| 89 |
x = self.layer1(x)
|
| 90 |
x = self.layer2(x)
|
| 91 |
x = self.layer3(x)
|
| 92 |
x = self.layer4(x)
|
| 93 |
|
| 94 |
+
# global pooling and classification
|
| 95 |
x = self.avgpool(x)
|
| 96 |
x = torch.flatten(x, 1)
|
| 97 |
x = self.fc(x)
|
|
|
|
| 99 |
return x
|
| 100 |
|
| 101 |
|
| 102 |
+
# Main classifier class that combines CNN with acoustic feature analysis
|
| 103 |
class AudioClassifier:
|
| 104 |
+
# dictionary of available pre-trained models
|
| 105 |
AVAILABLE_MODELS = {
|
| 106 |
'3s_window': 'spectrogram_cnn_3s_window.pth',
|
|
|
|
|
|
|
| 107 |
}
|
| 108 |
|
| 109 |
@classmethod
|
| 110 |
+
def get_model_path(cls, model_name='3s_window'):
|
| 111 |
+
# returns the full path to a model file
|
| 112 |
import os
|
| 113 |
if model_name not in cls.AVAILABLE_MODELS:
|
| 114 |
+
print(f"Model not found: {model_name}")
|
| 115 |
+
return None
|
| 116 |
return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
|
| 117 |
|
| 118 |
+
def __init__(self, model_path=None, device=None):
|
| 119 |
+
# set up device - use GPU if available
|
| 120 |
if device is None:
|
| 121 |
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 122 |
else:
|
| 123 |
self.device = torch.device(device)
|
| 124 |
+
|
| 125 |
+
# initialize the CNN model
|
| 126 |
self.model = SpeechStyleCNN().to(self.device)
|
| 127 |
|
| 128 |
+
# use default model path if not specified
|
| 129 |
if model_path is None:
|
| 130 |
import os
|
| 131 |
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
|
| 132 |
|
| 133 |
+
# load pre-trained weights
|
| 134 |
try:
|
| 135 |
print(f"Attempting to load model from: {model_path}")
|
| 136 |
+
state_dict = torch.load(model_path, map_location=self.device, weights_only=False)
|
| 137 |
self.model.load_state_dict(state_dict)
|
| 138 |
print(f"✓ Successfully loaded trained model from: {model_path}")
|
| 139 |
except FileNotFoundError:
|
| 140 |
+
print(f"Could not find model file at {model_path}")
|
| 141 |
+
print("Make sure the model file exists in the correct location")
|
| 142 |
except Exception as e:
|
| 143 |
+
print(f"Something went wrong loading the model: {e}")
|
| 144 |
|
| 145 |
+
# set model to evaluation mode
|
| 146 |
self.model.eval()
|
| 147 |
|
| 148 |
+
# audio processing parameters
|
| 149 |
self.sample_rate = 16000
|
| 150 |
self.n_mels = 128
|
| 151 |
self.n_fft = 2048
|
| 152 |
self.hop_length = 512
|
| 153 |
|
| 154 |
+
# extract mel spectrogram from audio file
|
| 155 |
+
def extract_mel_spectrogram(self, audio_path, window_size=3.0):
|
| 156 |
+
# load audio at target sample rate
|
| 157 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 158 |
|
| 159 |
+
# calculate window size in samples
|
| 160 |
window_samples = int(window_size * sr)
|
| 161 |
|
| 162 |
+
# for longer audio, use multiple overlapping windows
|
| 163 |
+
if len(audio) > window_samples * 1.5:
|
| 164 |
hop_samples = window_samples // 2
|
| 165 |
windows = []
|
| 166 |
+
# extract overlapping windows
|
| 167 |
for start in range(0, len(audio) - window_samples, hop_samples):
|
| 168 |
window = audio[start:start + window_samples]
|
| 169 |
windows.append(window)
|
| 170 |
|
| 171 |
+
# add the last window
|
| 172 |
if len(audio) > window_samples:
|
| 173 |
windows.append(audio[-window_samples:])
|
| 174 |
|
| 175 |
+
# compute mel spectrogram for each window
|
| 176 |
mel_specs = []
|
| 177 |
+
for window in windows[:5]: # limit to 5 windows
|
| 178 |
mel_spec = librosa.feature.melspectrogram(
|
| 179 |
y=window,
|
| 180 |
sr=sr,
|
|
|
|
| 184 |
)
|
| 185 |
mel_specs.append(mel_spec)
|
| 186 |
|
| 187 |
+
# average the spectrograms
|
| 188 |
mel_spec = np.mean(mel_specs, axis=0)
|
| 189 |
else:
|
| 190 |
+
# for short audio, pad or truncate
|
| 191 |
if len(audio) < window_samples:
|
| 192 |
audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
|
| 193 |
else:
|
|
|
|
| 201 |
hop_length=self.hop_length
|
| 202 |
)
|
| 203 |
|
| 204 |
+
# convert to decibels
|
| 205 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 206 |
|
| 207 |
+
# normalize to 0-1 range
|
| 208 |
mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
|
| 209 |
+
# stack into 3 channels for CNN input
|
| 210 |
mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
|
| 211 |
|
| 212 |
return mel_spec_3ch
|
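For reference, the returned array has shape (3, n_mels, frames); with the defaults above (16 kHz, n_mels=128, hop 512) a 3-second window comes out to roughly 94 frames. A minimal sketch, assuming a classifier instance and a hypothetical audio path:

    spec = classifier.extract_mel_spectrogram("sample.wav")  # hypothetical path
    print(spec.shape)  # approximately (3, 128, 94)
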
| 213 |
|
| 214 |
+
# extract acoustic features from audio
|
| 215 |
+
def extract_acoustic_features(self, audio_path):
|
| 216 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 217 |
|
| 218 |
features = {}
|
| 219 |
|
| 220 |
+
# tempo/rhythm estimation
|
| 221 |
onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
|
| 222 |
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
|
| 223 |
features['tempo'] = float(tempo)
|
| 224 |
|
| 225 |
+
# pitch tracking
|
| 226 |
pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
|
| 227 |
pitch_values = []
|
| 228 |
for t in range(pitches.shape[1]):
|
|
|
|
| 231 |
if pitch > 0:
|
| 232 |
pitch_values.append(pitch)
|
| 233 |
|
| 234 |
+
# calculate pitch statistics
|
| 235 |
if pitch_values:
|
| 236 |
features['pitch_mean'] = float(np.mean(pitch_values))
|
| 237 |
features['pitch_std'] = float(np.std(pitch_values))
|
|
|
|
| 241 |
features['pitch_std'] = 0.0
|
| 242 |
features['pitch_range'] = 0.0
|
| 243 |
|
| 244 |
+
# energy/loudness features
|
| 245 |
rms = librosa.feature.rms(y=audio)[0]
|
| 246 |
features['energy_mean'] = float(np.mean(rms))
|
| 247 |
features['energy_std'] = float(np.std(rms))
|
| 248 |
|
| 249 |
+
# zero crossing rate - indicates voice quality
|
| 250 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 251 |
features['zcr_mean'] = float(np.mean(zcr))
|
| 252 |
features['zcr_std'] = float(np.std(zcr))
|
| 253 |
|
| 254 |
+
# spectral centroid - brightness of sound
|
| 255 |
spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 256 |
features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
|
| 257 |
features['spectral_centroid_std'] = float(np.std(spectral_centroids))
|
| 258 |
|
| 259 |
return features
|
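An illustrative sketch of the dict this returns (the keys match the code above; the values are invented):

    feats = classifier.extract_acoustic_features("sample.wav")  # hypothetical path
    # {'tempo': 112.3, 'pitch_mean': 180.5, 'pitch_std': 42.1, 'pitch_range': 310.0,
    #  'energy_mean': 0.058, 'energy_std': 0.045, 'zcr_mean': 0.104, 'zcr_std': 0.109,
    #  'spectral_centroid_mean': 1650.2, 'spectral_centroid_std': 1042.7}
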
| 260 |
|
| 261 |
+
# compute prosody scores from acoustic features
|
| 262 |
+
# uses thresholds calibrated from training data
|
| 263 |
+
def _compute_prosody_scores(self, features):
|
| 264 |
individual_scores = {}
|
| 265 |
|
| 266 |
+
# spectral centroid variability - best discriminating feature
|
| 267 |
sc_std = features['spectral_centroid_std']
|
| 268 |
+
if sc_std >= 1080:
|
| 269 |
+
spectral_score = 0.9 # strongly indicates read
|
| 270 |
+
elif sc_std >= 1040:
|
| 271 |
+
spectral_score = 0.7
|
| 272 |
elif sc_std >= 1000:
|
| 273 |
+
spectral_score = 0.5
|
| 274 |
+
elif sc_std >= 970:
|
| 275 |
+
spectral_score = 0.3
|
| 276 |
else:
|
| 277 |
+
spectral_score = 0.1 # strongly spontaneous
|
| 278 |
|
| 279 |
individual_scores['spectral_variability'] = {
|
| 280 |
'score': spectral_score,
|
|
|
|
| 282 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 283 |
}
|
| 284 |
|
| 285 |
+
# zero crossing rate - second best feature
|
| 286 |
zcr = features['zcr_mean']
|
| 287 |
+
if zcr >= 0.125:
|
| 288 |
+
zcr_score = 0.9
|
| 289 |
+
elif zcr >= 0.110:
|
| 290 |
+
zcr_score = 0.7
|
| 291 |
+
elif zcr >= 0.100:
|
| 292 |
+
zcr_score = 0.5
|
| 293 |
+
elif zcr >= 0.092:
|
| 294 |
+
zcr_score = 0.3
|
| 295 |
else:
|
| 296 |
+
zcr_score = 0.1
|
| 297 |
|
| 298 |
individual_scores['zcr_mean'] = {
|
| 299 |
'score': zcr_score,
|
|
|
|
| 301 |
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 302 |
}
|
| 303 |
|
| 304 |
+
# energy level - read speech tends to be lower energy
|
|
|
|
|
|
|
| 305 |
energy = features['energy_mean']
|
| 306 |
if energy < 0.055:
|
| 307 |
+
energy_score = 0.85
|
| 308 |
+
elif energy < 0.062:
|
| 309 |
+
energy_score = 0.65
|
| 310 |
+
elif energy < 0.070:
|
| 311 |
+
energy_score = 0.4
|
| 312 |
else:
|
| 313 |
+
energy_score = 0.15
|
| 314 |
|
| 315 |
individual_scores['energy_level'] = {
|
| 316 |
'score': energy_score,
|
|
|
|
| 318 |
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 319 |
}
|
| 320 |
|
| 321 |
+
# pitch range feature
|
| 322 |
+
pitch_range = features.get('pitch_range', 3828)
|
| 323 |
+
if pitch_range < 3815:
|
| 324 |
+
pitch_range_score = 0.7
|
| 325 |
+
elif pitch_range < 3828:
|
| 326 |
+
pitch_range_score = 0.5
|
| 327 |
+
else:
|
| 328 |
+
pitch_range_score = 0.3
|
| 329 |
+
|
| 330 |
+
individual_scores['pitch_range'] = {
|
| 331 |
+
'score': pitch_range_score,
|
| 332 |
+
'value': pitch_range,
|
| 333 |
+
'interpretation': 'narrow (read)' if pitch_range_score > 0.6 else 'wide (spontaneous)' if pitch_range_score < 0.4 else 'moderate'
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
# energy variability
|
| 337 |
+
energy_std = features.get('energy_std', 0.047)
|
| 338 |
+
if energy_std < 0.042:
|
| 339 |
+
energy_std_score = 0.7
|
| 340 |
+
elif energy_std < 0.048:
|
| 341 |
+
energy_std_score = 0.5
|
| 342 |
else:
|
| 343 |
+
energy_std_score = 0.3
|
| 344 |
|
| 345 |
+
individual_scores['energy_std'] = {
|
| 346 |
+
'score': energy_std_score,
|
| 347 |
+
'value': energy_std,
|
| 348 |
+
'interpretation': 'steady (read)' if energy_std_score > 0.6 else 'variable (spontaneous)' if energy_std_score < 0.4 else 'moderate'
|
| 349 |
}
|
| 350 |
|
| 351 |
+
# zcr variability
|
| 352 |
+
zcr_std = features.get('zcr_std', 0.111)
|
| 353 |
+
if zcr_std >= 0.115:
|
| 354 |
+
zcr_std_score = 0.7
|
| 355 |
+
elif zcr_std >= 0.105:
|
| 356 |
+
zcr_std_score = 0.5
|
| 357 |
+
else:
|
| 358 |
+
zcr_std_score = 0.3
|
| 359 |
+
|
| 360 |
+
individual_scores['zcr_std'] = {
|
| 361 |
+
'score': zcr_std_score,
|
| 362 |
+
'value': zcr_std,
|
| 363 |
+
'interpretation': 'variable ZCR (read)' if zcr_std_score > 0.6 else 'steady ZCR (spontaneous)' if zcr_std_score < 0.4 else 'moderate'
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
# weights based on feature importance from analysis
|
| 367 |
weights = {
|
| 368 |
+
'spectral_variability': 0.30,
|
| 369 |
+
'zcr_mean': 0.25,
|
| 370 |
+
'energy_level': 0.20,
|
| 371 |
+
'pitch_range': 0.10,
|
| 372 |
+
'energy_std': 0.08,
|
| 373 |
+
'zcr_std': 0.07,
|
| 374 |
}
|
| 375 |
|
| 376 |
+
# calculate weighted overall score
|
| 377 |
overall_score = (
|
| 378 |
spectral_score * weights['spectral_variability'] +
|
| 379 |
zcr_score * weights['zcr_mean'] +
|
| 380 |
energy_score * weights['energy_level'] +
|
| 381 |
+
pitch_range_score * weights['pitch_range'] +
|
| 382 |
+
energy_std_score * weights['energy_std'] +
|
| 383 |
+
zcr_std_score * weights['zcr_std']
|
| 384 |
)
|
| 385 |
|
| 386 |
+
# determine classification based on thresholds
|
| 387 |
+
if overall_score > 0.58:
|
| 388 |
classification = 'read'
|
| 389 |
+
confidence = 0.5 + (overall_score - 0.5) * 0.9
|
| 390 |
+
elif overall_score < 0.42:
|
| 391 |
classification = 'spontaneous'
|
| 392 |
+
confidence = 0.5 + (0.5 - overall_score) * 0.9
|
| 393 |
else:
|
| 394 |
+
classification = 'read' if overall_score >= 0.50 else 'spontaneous'
|
| 395 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 396 |
|
| 397 |
return {
|
|
|
|
| 401 |
'individual_scores': individual_scores
|
| 402 |
}
|
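A worked example of the weighting above (the per-feature scores are invented for illustration): with spectral 0.9, ZCR 0.7, energy 0.85, pitch range 0.7, energy std 0.7 and ZCR std 0.7,

    overall = 0.9*0.30 + 0.7*0.25 + 0.85*0.20 + 0.7*0.10 + 0.7*0.08 + 0.7*0.07  # = 0.79

0.79 clears the 0.58 threshold, so the classification is 'read' with confidence 0.5 + (0.79 - 0.5) * 0.9 ≈ 0.76.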
| 403 |
|
| 404 |
+
# main classification method - combines CNN and prosody analysis
|
| 405 |
+
def classify(self, audio_path):
|
| 406 |
+
# extract mel spectrogram for CNN
|
| 407 |
mel_spec = self.extract_mel_spectrogram(audio_path)
|
| 408 |
|
| 409 |
+
# convert to tensor and add batch dimension
|
| 410 |
mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
|
| 411 |
|
| 412 |
+
# get CNN predictions
|
| 413 |
with torch.no_grad():
|
| 414 |
logits = self.model(mel_tensor)
|
| 415 |
probabilities = F.softmax(logits, dim=1)
|
|
|
|
| 420 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 421 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
| 422 |
|
| 423 |
+
# extract acoustic features for prosody analysis
|
| 424 |
acoustic_features = self.extract_acoustic_features(audio_path)
|
| 425 |
|
| 426 |
+
# compute prosody-based scores
|
| 427 |
prosody_scores = self._compute_prosody_scores(acoustic_features)
|
| 428 |
prosody_classification = prosody_scores['classification']
|
| 429 |
prosody_confidence = prosody_scores['confidence']
|
| 430 |
|
| 431 |
+
# map CNN class to label
|
| 432 |
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
| 433 |
+
read_prob = probabilities[0, 0].item()
|
| 434 |
+
|
| 435 |
print(f"CNN classification: {cnn_class_name}")
|
| 436 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 437 |
|
| 438 |
+
# combine CNN and prosody - prosody is more reliable
|
| 439 |
+
final_classification = prosody_classification
|
| 440 |
+
final_confidence = prosody_confidence
|
| 441 |
+
|
| 442 |
+
# boost confidence when both methods agree
|
| 443 |
+
if cnn_class_name == prosody_classification:
|
| 444 |
+
final_confidence = min(0.95, prosody_confidence * 1.15)
|
| 445 |
+
elif read_prob > 0.85 and cnn_class_name == 'read':
|
| 446 |
+
if prosody_confidence < 0.65:
|
| 447 |
+
final_classification = 'read'
|
| 448 |
+
final_confidence = 0.55
|
| 449 |
+
elif read_prob < 0.10 and cnn_class_name == 'spontaneous':
|
| 450 |
+
if prosody_confidence < 0.65:
|
| 451 |
+
final_classification = 'spontaneous'
|
| 452 |
+
final_confidence = 0.55
|
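Numerically: if the prosody side says 'read' at 0.76 and the CNN agrees, the fused confidence becomes min(0.95, 0.76 * 1.15) ≈ 0.87. The two elif branches only let a very confident CNN (read probability above 0.85 or below 0.10) override a weak prosody call, and even then the result is capped at a modest 0.55.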
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
return {
|
| 455 |
'classification': final_classification,
|
|
|
|
| 468 |
)
|
| 469 |
}
|
| 470 |
|
| 471 |
+
# generate human-readable interpretation of classification
|
| 472 |
def _interpret_classification(
|
| 473 |
self,
|
| 474 |
+
final_class,
|
| 475 |
+
final_confidence,
|
| 476 |
+
cnn_class,
|
| 477 |
+
cnn_confidence,
|
| 478 |
+
prosody_class,
|
| 479 |
+
prosody_confidence,
|
| 480 |
+
prosody_scores,
|
| 481 |
+
features
|
| 482 |
+
):
|
| 483 |
interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
|
| 484 |
interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
|
| 485 |
|
|
|
|
| 492 |
interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
|
| 493 |
interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
|
| 494 |
|
|
|
|
| 495 |
return interpretation
|
| 496 |
|
| 497 |
|
| 498 |
+
# test code - runs when script is executed directly
|
| 499 |
if __name__ == "__main__":
|
| 500 |
classifier = AudioClassifier()
|
| 501 |
print("\nAvailable pre-trained models:")
|
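Minimal usage sketch for the classifier (assumes the spectrogram_cnn_3s_window.pth weights sit next to the module):

    clf = AudioClassifier()
    result = clf.classify("examples/read1.wav")
    print(result['classification'], f"{result['confidence']:.2f}")
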
examples/.DS_Store ADDED
    Binary file (6.15 kB).

examples/{spontaneous1.ogg → read1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:…
- size …
+ oid sha256:9ca1f1a4aadad49ce045b41318eaf3e82b588231af2aee89596687731c0cef4d
+ size 1075710

examples/{read1.ogg → spontaneous1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:…
- size …
+ oid sha256:76f8f4a50cd6d10123058d060287ae1433b59087ad0b65b0fa6255716368d3ba
+ size 873470

pipeline.py
CHANGED
|
@@ -1,49 +1,52 @@
|
|
| 1 |
-
from typing import Dict, Optional
|
| 2 |
import time
|
| 3 |
from audio_classifier import AudioClassifier
|
| 4 |
from speech_recognizer import SpeechRecognizer
|
| 5 |
from text_analyzer import TextAuthenticityAnalyzer
|
| 6 |
|
| 7 |
|
|
|
|
| 8 |
class AuthenticityDetectionPipeline:
|
| 9 |
def __init__(
|
| 10 |
self,
|
| 11 |
-        audio_model_path…
|
| 12 |
-        whisper_model_size…
|
| 13 |
-        device…
|
| 14 |
-        ai_detection_threshold…
|
| 15 |
):
|
| 16 |
print("\n" + "="*60)
|
| 17 |
print("Initializing Multimodal Authenticity Detection Pipeline")
|
| 18 |
print("="*60 + "\n")
|
| 19 |
|
| 20 |
-
#
|
| 21 |
print("📊 Loading Audio Classifier (CNN)...")
|
| 22 |
self.audio_classifier = AudioClassifier(
|
| 23 |
model_path=audio_model_path,
|
| 24 |
device=device
|
| 25 |
)
|
| 26 |
|
|
|
|
| 27 |
print("\n🎤 Loading Speech Recognizer (Whisper)...")
|
| 28 |
self.speech_recognizer = SpeechRecognizer(
|
| 29 |
model_size=whisper_model_size,
|
| 30 |
device=device
|
| 31 |
)
|
| 32 |
|
|
|
|
| 33 |
print("\n📝 Loading Text Authenticity Analyzer...")
|
| 34 |
self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
|
| 35 |
|
| 36 |
print("\n✅ Pipeline initialization complete!")
|
| 37 |
print("="*60 + "\n")
|
| 38 |
|
| 39 |
-
|
|
|
|
| 40 |
print("\n" + "="*60)
|
| 41 |
print("MULTIMODAL AUTHENTICITY ANALYSIS")
|
| 42 |
print("="*60 + "\n")
|
| 43 |
|
| 44 |
start_time = time.time()
|
| 45 |
|
| 46 |
-
#
|
| 47 |
print("Stage 1: CNN Audio Classification...")
|
| 48 |
print("-" * 40)
|
| 49 |
audio_results = self.audio_classifier.classify(audio_path)
|
|
@@ -51,7 +54,7 @@ class AuthenticityDetectionPipeline:
|
|
| 51 |
print(f" ## Classification: {audio_results['classification'].upper()}")
|
| 52 |
print(f" Confidence: {audio_results['confidence']*100:.1f}%")
|
| 53 |
|
| 54 |
-
#
|
| 55 |
print("\nStage 2: Speech Analysis (Whisper)...")
|
| 56 |
print("-" * 40)
|
| 57 |
asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
|
|
@@ -60,7 +63,7 @@ class AuthenticityDetectionPipeline:
|
|
| 60 |
print(f" Word count: {asr_results['word_count']}")
|
| 61 |
print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
|
| 62 |
|
| 63 |
-
#
|
| 64 |
print("\nStage 3: Analyzing text authenticity...")
|
| 65 |
print("-" * 40)
|
| 66 |
text_results = self.text_analyzer.analyze(asr_results['transcription'])
|
|
@@ -68,7 +71,7 @@ class AuthenticityDetectionPipeline:
|
|
| 68 |
print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
|
| 69 |
print(f" Risk level: {text_results['risk_level'].upper()}")
|
| 70 |
|
| 71 |
-
#
|
| 72 |
print("\nStage 4: Generating final assessment...")
|
| 73 |
print("-" * 40)
|
| 74 |
final_assessment = self._generate_final_assessment(
|
|
@@ -85,46 +88,68 @@ class AuthenticityDetectionPipeline:
|
|
| 85 |
return {
|
| 86 |
'audio_classification': audio_results,
|
| 87 |
'speech_recognition': asr_results,
|
|
|
|
|
|
|
| 88 |
'text_authenticity': text_results,
|
| 89 |
'final_assessment': final_assessment,
|
| 90 |
'processing_time': elapsed_time
|
| 91 |
}
|
| 92 |
|
|
|
|
| 93 |
def _generate_final_assessment(
|
| 94 |
self,
|
| 95 |
-
audio_results
|
| 96 |
-
asr_results
|
| 97 |
-
text_results
|
| 98 |
-
)
|
| 99 |
|
| 100 |
-
#
|
| 101 |
if audio_results['classification'] == 'spontaneous':
|
| 102 |
audio_score = audio_results['confidence']
|
| 103 |
-
else:
|
| 104 |
audio_score = 1.0 - audio_results['confidence']
|
| 105 |
|
| 106 |
-
#
|
| 107 |
-
# Invert so spontaneous (low kopparapu) = high authenticity
|
| 108 |
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 109 |
|
| 110 |
-
#
|
| 111 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 112 |
-
filler_score = min(1.0, filler_ratio / 0.05)
|
| 113 |
|
| 114 |
-
#
|
| 115 |
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 116 |
-
pause_score = min(1.0, pause_var / 0.5)
|
| 117 |
|
|
|
|
| 118 |
text_auth_score = text_results['authenticity_score']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
|
|
|
|
|
|
| 120 |
composite_score = (
|
| 121 |
-
audio_score * 0.15 +
|
| 122 |
-
speech_pattern_score * 0.
|
| 123 |
-
filler_score * 0.
|
| 124 |
-
pause_score * 0.
|
| 125 |
-
|
|
|
|
| 126 |
)
|
| 127 |
|
|
|
|
| 128 |
if composite_score >= 0.7:
|
| 129 |
verdict = "AUTHENTIC"
|
| 130 |
risk = "low"
|
|
@@ -142,37 +167,43 @@ class AuthenticityDetectionPipeline:
|
|
| 142 |
risk = "critical"
|
| 143 |
recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
|
| 144 |
|
|
|
|
| 145 |
concerns = []
|
| 146 |
strengths = []
|
| 147 |
|
|
|
|
| 148 |
if audio_results['classification'] == 'read':
|
| 149 |
concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
|
| 150 |
else:
|
| 151 |
strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
|
| 152 |
|
|
|
|
| 153 |
if asr_results['kopparapu_classification'] == 'read':
|
| 154 |
concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 155 |
else:
|
| 156 |
strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 157 |
|
|
|
|
| 158 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 159 |
if filler_ratio < 0.02:
|
| 160 |
concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
|
| 161 |
else:
|
| 162 |
strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
|
| 163 |
|
|
|
|
| 164 |
if asr_results['pause_patterns']['pause_variability'] < 0.3:
|
| 165 |
concerns.append("Regular pause patterns suggest reading at punctuation")
|
| 166 |
else:
|
| 167 |
strengths.append("Irregular pause patterns indicate spontaneous thinking")
|
| 168 |
|
|
|
|
| 169 |
if text_results['ai_detection']['ai_generated']:
|
| 170 |
concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
|
| 171 |
|
|
|
|
| 172 |
if text_results['authenticity_score'] > 0.7:
|
| 173 |
strengths.append("Text shows strong originality indicators")
|
| 174 |
|
| 175 |
-
|
| 176 |
return {
|
| 177 |
'verdict': verdict,
|
| 178 |
'risk_level': risk,
|
|
@@ -182,8 +213,9 @@ class AuthenticityDetectionPipeline:
|
|
| 182 |
'recommendation': recommendation,
|
| 183 |
}
|
| 184 |
|
|
|
|
|
|
|
| 185 |
if __name__ == "__main__":
|
| 186 |
-
# Example usage
|
| 187 |
print("Initializing Authenticity Detection Pipeline...")
|
| 188 |
model_path = "spectrogram_cnn_3s_window.pth"
|
| 189 |
pipeline = AuthenticityDetectionPipeline(
|
|
|
|
|
|
|
| 1 |
import time
|
| 2 |
from audio_classifier import AudioClassifier
|
| 3 |
from speech_recognizer import SpeechRecognizer
|
| 4 |
from text_analyzer import TextAuthenticityAnalyzer
|
| 5 |
|
| 6 |
|
| 7 |
+
# Main pipeline class that orchestrates all analysis components
|
| 8 |
class AuthenticityDetectionPipeline:
|
| 9 |
def __init__(
|
| 10 |
self,
|
| 11 |
+
audio_model_path=None,
|
| 12 |
+
whisper_model_size="base",
|
| 13 |
+
device=None,
|
| 14 |
+
ai_detection_threshold=0.78
|
| 15 |
):
|
| 16 |
print("\n" + "="*60)
|
| 17 |
print("Initializing Multimodal Authenticity Detection Pipeline")
|
| 18 |
print("="*60 + "\n")
|
| 19 |
|
| 20 |
+
# load the CNN-based audio classifier
|
| 21 |
print("📊 Loading Audio Classifier (CNN)...")
|
| 22 |
self.audio_classifier = AudioClassifier(
|
| 23 |
model_path=audio_model_path,
|
| 24 |
device=device
|
| 25 |
)
|
| 26 |
|
| 27 |
+
# load whisper model for speech-to-text
|
| 28 |
print("\n🎤 Loading Speech Recognizer (Whisper)...")
|
| 29 |
self.speech_recognizer = SpeechRecognizer(
|
| 30 |
model_size=whisper_model_size,
|
| 31 |
device=device
|
| 32 |
)
|
| 33 |
|
| 34 |
+
# load text analyzer for AI detection
|
| 35 |
print("\n📝 Loading Text Authenticity Analyzer...")
|
| 36 |
self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
|
| 37 |
|
| 38 |
print("\n✅ Pipeline initialization complete!")
|
| 39 |
print("="*60 + "\n")
|
| 40 |
|
| 41 |
+
# main analysis function - runs all stages
|
| 42 |
+
def analyze_audio(self, audio_path, language=None):
|
| 43 |
print("\n" + "="*60)
|
| 44 |
print("MULTIMODAL AUTHENTICITY ANALYSIS")
|
| 45 |
print("="*60 + "\n")
|
| 46 |
|
| 47 |
start_time = time.time()
|
| 48 |
|
| 49 |
+
# stage 1: classify audio using CNN
|
| 50 |
print("Stage 1: CNN Audio Classification...")
|
| 51 |
print("-" * 40)
|
| 52 |
audio_results = self.audio_classifier.classify(audio_path)
|
|
|
|
| 54 |
print(f" ## Classification: {audio_results['classification'].upper()}")
|
| 55 |
print(f" Confidence: {audio_results['confidence']*100:.1f}%")
|
| 56 |
|
| 57 |
+
# stage 2: transcribe and analyze speech patterns
|
| 58 |
print("\nStage 2: Speech Analysis (Whisper)...")
|
| 59 |
print("-" * 40)
|
| 60 |
asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
|
|
|
|
| 63 |
print(f" Word count: {asr_results['word_count']}")
|
| 64 |
print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
|
| 65 |
|
| 66 |
+
# stage 3: analyze transcribed text for AI patterns
|
| 67 |
print("\nStage 3: Analyzing text authenticity...")
|
| 68 |
print("-" * 40)
|
| 69 |
text_results = self.text_analyzer.analyze(asr_results['transcription'])
|
|
|
|
| 71 |
print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
|
| 72 |
print(f" Risk level: {text_results['risk_level'].upper()}")
|
| 73 |
|
| 74 |
+
# stage 4: combine all results into final assessment
|
| 75 |
print("\nStage 4: Generating final assessment...")
|
| 76 |
print("-" * 40)
|
| 77 |
final_assessment = self._generate_final_assessment(
|
|
|
|
| 88 |
return {
|
| 89 |
'audio_classification': audio_results,
|
| 90 |
'speech_recognition': asr_results,
|
| 91 |
+
'asr': asr_results, # alias for backwards compatibility
|
| 92 |
+
'text_analysis': text_results,
|
| 93 |
'text_authenticity': text_results,
|
| 94 |
'final_assessment': final_assessment,
|
| 95 |
'processing_time': elapsed_time
|
| 96 |
}
|
| 97 |
|
| 98 |
+
# combine scores from all components into final verdict
|
| 99 |
def _generate_final_assessment(
|
| 100 |
self,
|
| 101 |
+
audio_results,
|
| 102 |
+
asr_results,
|
| 103 |
+
text_results
|
| 104 |
+
):
|
| 105 |
|
| 106 |
+
# calculate audio score - spontaneous = authentic
|
| 107 |
if audio_results['classification'] == 'spontaneous':
|
| 108 |
audio_score = audio_results['confidence']
|
| 109 |
+
else:
|
| 110 |
audio_score = 1.0 - audio_results['confidence']
|
| 111 |
|
| 112 |
+
# kopparapu score - invert so spontaneous = high authenticity
|
|
|
|
| 113 |
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 114 |
|
| 115 |
+
# filler words indicate spontaneous speech
|
| 116 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 117 |
+
filler_score = min(1.0, filler_ratio / 0.05)
|
| 118 |
|
| 119 |
+
# pause variability - higher = more spontaneous
|
| 120 |
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 121 |
+
pause_score = min(1.0, pause_var / 0.5)
|
| 122 |
|
| 123 |
+
# text authenticity from AI detector
|
| 124 |
text_auth_score = text_results['authenticity_score']
|
| 125 |
+
|
| 126 |
+
# get additional linguistic features
|
| 127 |
+
kf = asr_results['kopparapu_features']
|
| 128 |
+
|
| 129 |
+
# speech rate variability
|
| 130 |
+
rate_var = kf.get('speech_rate_variability', 0.0)
|
| 131 |
+
rate_var_score = min(1.0, rate_var / 0.15)
|
| 132 |
+
|
| 133 |
+
# pause regularity - lower = more spontaneous
|
| 134 |
+
pause_reg = kf.get('pause_regularity', 0.5)
|
| 135 |
+
pause_reg_score = 1.0 - pause_reg
|
| 136 |
+
|
| 137 |
+
# self-corrections indicate spontaneous speech
|
| 138 |
+
corrections = kf.get('self_correction_count', 0)
|
| 139 |
+
correction_score = min(1.0, corrections / 2.0)
|
| 140 |
|
| 141 |
+
# calculate weighted composite score
|
| 142 |
+
# weights: CNN+Prosody=15%, Linguistic=35%, AI Detection=50%
|
| 143 |
composite_score = (
|
| 144 |
+
audio_score * 0.15 +
|
| 145 |
+
speech_pattern_score * 0.25 +
|
| 146 |
+
filler_score * 0.05 +
|
| 147 |
+
pause_score * 0.03 +
|
| 148 |
+
rate_var_score * 0.02 +
|
| 149 |
+
text_auth_score * 0.50
|
| 150 |
)
|
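A worked example of this weighting (component scores invented for illustration): 0.8*0.15 + 0.7*0.25 + 1.0*0.05 + 0.6*0.03 + 0.5*0.02 + 0.9*0.50 = 0.823, which lands in the >= 0.7 band below and yields an "AUTHENTIC" verdict. Note the six weights sum to exactly 1.00.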
| 151 |
|
| 152 |
+
# determine verdict based on composite score
|
| 153 |
if composite_score >= 0.7:
|
| 154 |
verdict = "AUTHENTIC"
|
| 155 |
risk = "low"
|
|
|
|
| 167 |
risk = "critical"
|
| 168 |
recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
|
| 169 |
|
| 170 |
+
# collect concerns and strengths
|
| 171 |
concerns = []
|
| 172 |
strengths = []
|
| 173 |
|
| 174 |
+
# check CNN classification
|
| 175 |
if audio_results['classification'] == 'read':
|
| 176 |
concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
|
| 177 |
else:
|
| 178 |
strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
|
| 179 |
|
| 180 |
+
# check linguistic analysis
|
| 181 |
if asr_results['kopparapu_classification'] == 'read':
|
| 182 |
concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 183 |
else:
|
| 184 |
strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 185 |
|
| 186 |
+
# check filler words
|
| 187 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 188 |
if filler_ratio < 0.02:
|
| 189 |
concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
|
| 190 |
else:
|
| 191 |
strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
|
| 192 |
|
| 193 |
+
# check pause patterns
|
| 194 |
if asr_results['pause_patterns']['pause_variability'] < 0.3:
|
| 195 |
concerns.append("Regular pause patterns suggest reading at punctuation")
|
| 196 |
else:
|
| 197 |
strengths.append("Irregular pause patterns indicate spontaneous thinking")
|
| 198 |
|
| 199 |
+
# check AI detection
|
| 200 |
if text_results['ai_detection']['ai_generated']:
|
| 201 |
concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
|
| 202 |
|
| 203 |
+
# check text originality
|
| 204 |
if text_results['authenticity_score'] > 0.7:
|
| 205 |
strengths.append("Text shows strong originality indicators")
|
| 206 |
|
|
|
|
| 207 |
return {
|
| 208 |
'verdict': verdict,
|
| 209 |
'risk_level': risk,
|
|
|
|
| 213 |
'recommendation': recommendation,
|
| 214 |
}
|
| 215 |
|
| 216 |
+
|
| 217 |
+
# test code - runs when script is executed directly
|
| 218 |
if __name__ == "__main__":
|
|
|
|
| 219 |
print("Initializing Authenticity Detection Pipeline...")
|
| 220 |
model_path = "spectrogram_cnn_3s_window.pth"
|
| 221 |
pipeline = AuthenticityDetectionPipeline(
|
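End-to-end usage sketch for the pipeline (model files assumed available locally):

    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    report = pipeline.analyze_audio("examples/spontaneous1.wav")
    print(report['final_assessment']['verdict'])
    print(f"processed in {report['processing_time']:.1f}s")
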
speech_recognizer.py
CHANGED
|
@@ -2,58 +2,56 @@ import whisper
|
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
import re
|
| 5 |
-
from typing import Dict, Optional, List
|
| 6 |
import warnings
|
| 7 |
import librosa
|
| 8 |
warnings.filterwarnings("ignore")
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
class SpeechRecognizer:
|
| 12 |
-
def __init__(self, model_size
|
|
|
|
| 13 |
if device is None:
|
| 14 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
else:
|
| 16 |
self.device = device
|
| 17 |
-
|
|
|
|
| 18 |
print(f"Loading Whisper {model_size} model on {self.device}...")
|
| 19 |
self.model = whisper.load_model(model_size, device=self.device)
|
| 20 |
print(f"Whisper model loaded successfully.")
|
| 21 |
|
| 22 |
self.model_size = model_size
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
try:
|
| 27 |
-
#
|
| 28 |
audio, sr = librosa.load(audio_path, sr=16000)
|
| 29 |
duration = len(audio) / sr
|
| 30 |
|
| 31 |
-
#
|
| 32 |
if duration < 0.1:
|
| 33 |
-
return False, "Audio
|
| 34 |
|
| 35 |
-
#
|
| 36 |
if np.max(np.abs(audio)) < 0.001:
|
| 37 |
-
return False, "Audio
|
| 38 |
|
| 39 |
return True, "Valid", duration
|
| 40 |
|
| 41 |
except Exception as e:
|
| 42 |
-
return False, f"
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
language: Optional[str] = None,
|
| 48 |
-
task: str = "transcribe"
|
| 49 |
-
) -> Dict[str, any]:
|
| 50 |
-
# Validate audio first
|
| 51 |
is_valid, message, audio_duration = self._validate_audio(audio_path)
|
| 52 |
if not is_valid:
|
| 53 |
-
print(f"Audio
|
| 54 |
-
# Return minimal valid response for invalid audio
|
| 55 |
return self._get_empty_response(message, audio_duration)
|
| 56 |
|
|
|
|
| 57 |
try:
|
| 58 |
result = self.model.transcribe(
|
| 59 |
audio_path,
|
|
@@ -61,17 +59,17 @@ class SpeechRecognizer:
|
|
| 61 |
task=task,
|
| 62 |
verbose=False,
|
| 63 |
word_timestamps=True,
|
| 64 |
-
fp16=False #
|
| 65 |
)
|
| 66 |
except (KeyError, RuntimeError) as e:
|
| 67 |
error_msg = str(e)
|
| 68 |
-
#
|
| 69 |
if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
|
| 70 |
-
print(f"Audio
|
| 71 |
return self._get_empty_response("Audio too short or corrupted", audio_duration)
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
print(f"
|
| 75 |
try:
|
| 76 |
result = self.model.transcribe(
|
| 77 |
audio_path,
|
|
@@ -82,20 +80,23 @@ class SpeechRecognizer:
|
|
| 82 |
fp16=False
|
| 83 |
)
|
| 84 |
except Exception as e2:
|
| 85 |
-
print(f"
|
| 86 |
-
return self._get_empty_response(
|
| 87 |
|
|
|
|
| 88 |
transcription = result['text'].strip()
|
| 89 |
detected_language = result.get('language', 'unknown')
|
| 90 |
segments = result.get('segments', [])
|
| 91 |
|
| 92 |
-
#
|
| 93 |
if not transcription or len(transcription.strip()) == 0:
|
| 94 |
print("Warning: Transcription is empty")
|
| 95 |
return self._get_empty_response("No speech detected in audio", audio_duration)
|
| 96 |
|
|
|
|
| 97 |
analysis = self._analyze_transcription(transcription, segments)
|
| 98 |
|
|
|
|
| 99 |
duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
|
| 100 |
kopparapu_features = self._extract_kopparapu_features(
|
| 101 |
transcription, duration, segments, analysis['pause_patterns']
|
|
@@ -117,8 +118,8 @@ class SpeechRecognizer:
|
|
| 117 |
'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
|
| 118 |
}
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
return {
|
| 123 |
'transcription': f"[Error: {reason}]",
|
| 124 |
'language': 'unknown',
|
|
@@ -147,20 +148,23 @@ class SpeechRecognizer:
|
|
| 147 |
},
|
| 148 |
'kopparapu_score': 0.5,
|
| 149 |
'kopparapu_classification': 'unknown',
|
| 150 |
-
'interpretation': f"
|
| 151 |
}
|
| 152 |
|
| 153 |
-
|
|
|
|
| 154 |
words = text.split()
|
| 155 |
word_count = len(words)
|
| 156 |
|
|
|
|
| 157 |
duration = 0
|
| 158 |
if segments:
|
| 159 |
duration = segments[-1]['end'] - segments[0]['start']
|
| 160 |
|
|
|
|
| 161 |
speech_rate = (word_count / duration * 60) if duration > 0 else 0
|
| 162 |
-
|
| 163 |
|
|
|
|
| 164 |
filler_words_list = [
|
| 165 |
('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
|
| 166 |
('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
|
|
@@ -170,6 +174,7 @@ class SpeechRecognizer:
|
|
| 170 |
('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
|
| 171 |
]
|
| 172 |
|
|
|
|
| 173 |
text_lower = text.lower()
|
| 174 |
filler_count = {}
|
| 175 |
total_fillers = 0
|
|
@@ -181,8 +186,10 @@ class SpeechRecognizer:
|
|
| 181 |
filler_count[filler_name] = count
|
| 182 |
total_fillers += count
|
| 183 |
|
|
|
|
| 184 |
filler_ratio = total_fillers / word_count if word_count > 0 else 0
|
| 185 |
|
|
|
|
| 186 |
pause_patterns = self._analyze_pauses(segments)
|
| 187 |
|
| 188 |
return {
|
|
@@ -197,24 +204,28 @@ class SpeechRecognizer:
|
|
| 197 |
'pause_patterns': pause_patterns
|
| 198 |
}
|
| 199 |
|
| 200 |
-
|
|
|
|
| 201 |
pauses = []
|
| 202 |
|
|
|
|
| 203 |
if len(segments) >= 2:
|
| 204 |
for i in range(len(segments) - 1):
|
| 205 |
pause = segments[i + 1]['start'] - segments[i]['end']
|
| 206 |
-
if pause > 0.05: #
|
| 207 |
pauses.append(pause)
|
| 208 |
|
|
|
|
| 209 |
for segment in segments:
|
| 210 |
if 'words' in segment and len(segment['words']) > 1:
|
| 211 |
words = segment['words']
|
| 212 |
for i in range(len(words) - 1):
|
| 213 |
                    if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                        pause = words[i + 1]['start'] - words[i]['end']
-                       if pause > 0.15: #
                            pauses.append(pause)

        if not pauses:
            return {
                'avg_pause': 0.0,

@@ -230,11 +241,10 @@ class SpeechRecognizer:
            'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
        }

-
-
-        segments: List[Dict] = None, pause_patterns: Dict = None
-    ) -> Dict:
        text = text.strip()
        if len(text) == 0:
            return {
                'alpha_ratio': 0.0,

@@ -249,24 +259,28 @@ class SpeechRecognizer:
                'self_correction_count': 0
            }

        total_chars = len(text)
        alpha_chars = sum(c.isalpha() for c in text)
        nonalpha_chars = total_chars - alpha_chars

        alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0

        words = text.split()
        num_words = max(len(words), 1)
        chars_per_word = alpha_chars / num_words

        duration_sec = max(duration_sec, 1e-3)
        words_per_sec = num_words / duration_sec
        nonalpha_per_sec = nonalpha_chars / duration_sec

-       #
        char_reps = len(re.findall(r'(.)\1{2,}', text))

-       #
        words_list = text.lower().split()
        word_reps = 0
        for i in range(len(words_list) - 1):

@@ -275,7 +289,7 @@ class SpeechRecognizer:

        repetition_count = char_reps + word_reps

-       #
        lower = text.lower()
        filler_patterns = [
            r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',

@@ -289,23 +303,20 @@ class SpeechRecognizer:
            filler_count += len(re.findall(pattern, lower))
        filler_rate = filler_count / num_words

-       #
-
-       pause_regularity = 0.5  # neutral default
        if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
            pause_var = pause_patterns.get('pause_variability', 0.5)
-           #
-           # High variability (> 0.6) -> low regularity (close to 0)
            pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))

-       #
-       # Read speech has consistent pacing; spontaneous varies with thinking
        speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0

-       #
        sentence_length_variance = self._compute_sentence_variance(text)

-       #
        self_correction_patterns = [
            r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
            r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',

@@ -328,14 +339,15 @@ class SpeechRecognizer:
            'self_correction_count': int(self_correction_count)
        }

-
        if not segments or len(segments) < 3:
            return 0.0

        segment_rates = []
        for seg in segments:
            duration = seg.get('end', 0) - seg.get('start', 0)
-           if duration > 0.3: #
                words_in_seg = len(seg.get('text', '').split())
                rate = words_in_seg / duration
                if rate > 0:

@@ -344,42 +356,46 @@ class SpeechRecognizer:
        if len(segment_rates) < 3:
            return 0.0

        mean_rate = np.mean(segment_rates)
        std_rate = np.std(segment_rates)

-       # Coefficient of variation normalized to 0-1
        cv = std_rate / mean_rate if mean_rate > 0 else 0
-       return float(min(1.0, cv / 0.5))

-

-
-
        f1 = features['chars_per_word']
        L1 = self._logistic(f1, a=4.8, b=1.2)

-       # L2:
        f2 = features['words_per_sec']
        L2 = self._logistic(f2, a=2.2, b=0.6)

-       # L3:
-       # Combines filler rate, nonalpha, and repetitions
        disfluency = (
            features['nonalpha_per_sec'] +
            8.0 * features['filler_rate'] +

@@ -387,42 +403,43 @@ class SpeechRecognizer:
        )
        L3 = self._logistic(-disfluency, a=0.0, b=0.8)

-       # L4:
        L4 = features.get('pause_regularity', 0.5)

-       # L5:
        rate_var = features.get('speech_rate_variability', 0.0)
        L5 = 1.0 - rate_var

-       # L6:
        sent_var = features.get('sentence_length_variance', 0.0)
        L6 = 1.0 - sent_var

-       # L7:
        corrections = features.get('self_correction_count', 0)
        L7 = self._logistic(-corrections, a=0.0, b=1.5)

-       #
-       # Higher weights on pause regularity and rate consistency (key read markers)
        score = (
-           0.15 * L1 +  #
-           0.15 * L2 +  #
-           0.15 * L3 +  #
-           0.20 * L4 +  #
-           0.15 * L5 +  #
-           0.10 * L6 +  #
-           0.10 * L7    #
        )

        return float(score)

-
        filler_ratio = analysis['filler_words']['ratio']
        pause_patterns = analysis['pause_patterns']
        speech_rate = analysis['speech_rate']

        interpretation = "**Overall Assessment:**\n\n"

        spontaneity_score = 0
        indicators = []

@@ -437,7 +454,8 @@ class SpeechRecognizer:
        if 120 <= speech_rate <= 180:
            spontaneity_score += 1
            indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
-
        if spontaneity_score >= 2:
            interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
            if indicators:

@@ -455,13 +473,14 @@ class SpeechRecognizer:

        return interpretation

-
        result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
        return result.get('segments', [])


if __name__ == "__main__":
    recognizer = SpeechRecognizer(model_size="base")
    print(f"Speech recognizer initialized with {recognizer.model_size} model")
    print(f"Device: {recognizer.device}")
-
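The pause-regularity mapping kept by this commit clamps 1 - (pause std / 0.6) into [0, 1], so steady pacing scores near 1 and erratic pacing near 0. A quick standalone check, with invented pause lists:

import numpy as np

# illustration only: two made-up pause-length lists (seconds)
for pauses in ([0.30, 0.32, 0.29, 0.31],    # steady pacing, read-like
               [0.10, 0.90, 0.25, 1.40]):   # erratic pacing, spontaneous-like
    pause_var = float(np.std(pauses))
    pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))
    print(f"std={pause_var:.2f} -> regularity={pause_regularity:.2f}")

The steady list comes out around 0.98 and the erratic one around 0.13, matching the read/spontaneous intuition in the code comments.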
speech_recognizer.py, new version (lines added in this commit are marked "+"; unchanged runs are collapsed as "..."):

  import torch
  import numpy as np
  import re
  import warnings
  import librosa
  warnings.filterwarnings("ignore")


+ # Main class for speech recognition and analysis
  class SpeechRecognizer:
+     def __init__(self, model_size="base", device=None):
+         # set device - use GPU if available
          if device is None:
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
          else:
              self.device = device
+
+         # load whisper model
          print(f"Loading Whisper {model_size} model on {self.device}...")
          self.model = whisper.load_model(model_size, device=self.device)
          print(f"Whisper model loaded successfully.")

          self.model_size = model_size

+     # check if audio file is valid before processing
+     def _validate_audio(self, audio_path):
          try:
+             # load and check audio
              audio, sr = librosa.load(audio_path, sr=16000)
              duration = len(audio) / sr

+             # audio must be at least 0.1 seconds
              if duration < 0.1:
+                 return False, "Audio too short", duration

+             # check for silent audio
              if np.max(np.abs(audio)) < 0.001:
+                 return False, "Audio is silent", duration

              return True, "Valid", duration

          except Exception as e:
+             return False, f"Could not load audio file", 0.0
+
+     # main transcription function
+     def transcribe(self, audio_path, language=None, task="transcribe"):
+         # validate audio first
          is_valid, message, audio_duration = self._validate_audio(audio_path)
          if not is_valid:
+             print(f"Audio check failed: {message}")
              return self._get_empty_response(message, audio_duration)

+         # try to transcribe with word timestamps
          try:
              result = self.model.transcribe(
                  audio_path,
                  ...
                  task=task,
                  verbose=False,
                  word_timestamps=True,
+                 fp16=False  # avoid fp16 issues
              )
          except (KeyError, RuntimeError) as e:
              error_msg = str(e)
+             # handle specific errors
              if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
+                 print(f"Audio might be too short or corrupted")
                  return self._get_empty_response("Audio too short or corrupted", audio_duration)

+             # retry without word timestamps
+             print(f"First try failed, trying again...")
              try:
                  result = self.model.transcribe(
                      audio_path,
                      ...
                      fp16=False
                  )
              except Exception as e2:
+                 print(f"Could not transcribe audio: {e2}")
+                 return self._get_empty_response("Transcription failed", audio_duration)

+         # extract transcription results
          transcription = result['text'].strip()
          detected_language = result.get('language', 'unknown')
          segments = result.get('segments', [])

+         # handle empty transcription
          if not transcription or len(transcription.strip()) == 0:
              print("Warning: Transcription is empty")
              return self._get_empty_response("No speech detected in audio", audio_duration)

+         # analyze transcription for speech patterns
          analysis = self._analyze_transcription(transcription, segments)

+         # extract kopparapu features for read/spontaneous detection
          duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
          kopparapu_features = self._extract_kopparapu_features(
              transcription, duration, segments, analysis['pause_patterns']
          ...
              'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
          }

+     # return empty response when transcription fails
+     def _get_empty_response(self, reason, duration=0.0):
          return {
              'transcription': f"[Error: {reason}]",
              'language': 'unknown',
              ...
              },
              'kopparapu_score': 0.5,
              'kopparapu_classification': 'unknown',
+             'interpretation': f"Could not process audio: {reason}\n\nTips:\n- Make sure audio is at least 1 second\n- Check that there is actual speech\n- Try a different audio file"
          }

+     # analyze transcription for various speech metrics
+     def _analyze_transcription(self, text, segments):
          words = text.split()
          word_count = len(words)

+         # calculate duration from segments
          duration = 0
          if segments:
              duration = segments[-1]['end'] - segments[0]['start']

+         # calculate speaking rate (words per minute)
          speech_rate = (word_count / duration * 60) if duration > 0 else 0

+         # list of filler words to detect
          filler_words_list = [
              ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
              ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
              ...
              ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
          ]

+         # count filler words
          text_lower = text.lower()
          filler_count = {}
          total_fillers = 0
          ...
              filler_count[filler_name] = count
              total_fillers += count

+         # calculate filler ratio
          filler_ratio = total_fillers / word_count if word_count > 0 else 0

+         # analyze pause patterns
          pause_patterns = self._analyze_pauses(segments)

          return {
              ...
              'pause_patterns': pause_patterns
          }

+     # extract pause timing information from segments
+     def _analyze_pauses(self, segments):
          pauses = []

+         # find pauses between segments
          if len(segments) >= 2:
              for i in range(len(segments) - 1):
                  pause = segments[i + 1]['start'] - segments[i]['end']
+                 if pause > 0.05:  # pauses > 50ms
                      pauses.append(pause)

+         # find pauses between words within segments
          for segment in segments:
              if 'words' in segment and len(segment['words']) > 1:
                  words = segment['words']
                  for i in range(len(words) - 1):
                      if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                          pause = words[i + 1]['start'] - words[i]['end']
+                         if pause > 0.15:  # word-level pauses > 150ms
                              pauses.append(pause)

+         # return empty stats if no pauses found
          if not pauses:
              return {
                  'avg_pause': 0.0,
                  ...
              'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
          }

+     # extract features based on kopparapu's method for read vs spontaneous detection
+     def _extract_kopparapu_features(self, text, duration_sec, segments=None, pause_patterns=None):
          text = text.strip()
+         # handle empty text
          if len(text) == 0:
              return {
                  'alpha_ratio': 0.0,
                  ...
                  'self_correction_count': 0
              }

+         # count character types
          total_chars = len(text)
          alpha_chars = sum(c.isalpha() for c in text)
          nonalpha_chars = total_chars - alpha_chars

+         # ratio of alphabetic characters
          alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0

+         # average word length
          words = text.split()
          num_words = max(len(words), 1)
          chars_per_word = alpha_chars / num_words

+         # speaking rate features
          duration_sec = max(duration_sec, 1e-3)
          words_per_sec = num_words / duration_sec
          nonalpha_per_sec = nonalpha_chars / duration_sec

+         # detect character repetitions like "sooo" or "ummmm"
          char_reps = len(re.findall(r'(.)\1{2,}', text))

+         # detect word repetitions like "I I think"
          words_list = text.lower().split()
          word_reps = 0
          for i in range(len(words_list) - 1):
              ...

          repetition_count = char_reps + word_reps

+         # count filler words
          lower = text.lower()
          filler_patterns = [
              r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
              ...
              filler_count += len(re.findall(pattern, lower))
          filler_rate = filler_count / num_words

+         # pause regularity - read speech has regular pauses at punctuation
+         pause_regularity = 0.5
          if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
              pause_var = pause_patterns.get('pause_variability', 0.5)
+             # low variability = regular pauses = likely read
              pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))

+         # speech rate variability across segments
          speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0

+         # sentence length variance - uniform = likely read
          sentence_length_variance = self._compute_sentence_variance(text)

+         # count self-corrections and false starts
          self_correction_patterns = [
              r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
              r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
              ...
              'self_correction_count': int(self_correction_count)
          }

+     # compute variability in speaking rate across segments
+     def _compute_rate_variability(self, segments):
          if not segments or len(segments) < 3:
              return 0.0

          segment_rates = []
          for seg in segments:
              duration = seg.get('end', 0) - seg.get('start', 0)
+             if duration > 0.3:  # only segments > 300ms
                  words_in_seg = len(seg.get('text', '').split())
                  rate = words_in_seg / duration
                  if rate > 0:
                      ...
          if len(segment_rates) < 3:
              return 0.0

+         # calculate coefficient of variation
          mean_rate = np.mean(segment_rates)
          std_rate = np.std(segment_rates)

          cv = std_rate / mean_rate if mean_rate > 0 else 0
+         return float(min(1.0, cv / 0.5))

+     # compute variance in sentence lengths
+     def _compute_sentence_variance(self, text):
+         # split into sentences
          sentences = re.split(r'[.!?]+', text)
          sentences = [s.strip() for s in sentences if s.strip()]

          if len(sentences) < 2:
              return 0.0

+         # get word counts per sentence
          lengths = [len(s.split()) for s in sentences]
          mean_len = np.mean(lengths)
          std_len = np.std(lengths)

+         # coefficient of variation normalized
          cv = std_len / mean_len if mean_len > 0 else 0
+         return float(min(1.0, cv / 0.6))

+     # logistic function for smooth score transitions
+     def _logistic(self, x, a, b):
+         return 1.0 / (1.0 + np.exp(-(x - a) / b))

+     # calculate overall kopparapu score for read vs spontaneous
+     def _calculate_kopparapu_score(self, features):
+         # L1: vocabulary complexity - higher = more formal = read
          f1 = features['chars_per_word']
          L1 = self._logistic(f1, a=4.8, b=1.2)

+         # L2: speaking rate - faster, steadier = read
          f2 = features['words_per_sec']
          L2 = self._logistic(f2, a=2.2, b=0.6)

+         # L3: disfluency - less disfluency = more read
          disfluency = (
              features['nonalpha_per_sec'] +
              8.0 * features['filler_rate'] +
              ...
          )
          L3 = self._logistic(-disfluency, a=0.0, b=0.8)

+         # L4: pause regularity - regular = read
          L4 = features.get('pause_regularity', 0.5)

+         # L5: rate variability - low = read
          rate_var = features.get('speech_rate_variability', 0.0)
          L5 = 1.0 - rate_var

+         # L6: sentence variance - uniform = read
          sent_var = features.get('sentence_length_variance', 0.0)
          L6 = 1.0 - sent_var

+         # L7: self-corrections - fewer = read
          corrections = features.get('self_correction_count', 0)
          L7 = self._logistic(-corrections, a=0.0, b=1.5)

+         # weighted combination
          score = (
+             0.15 * L1 +  # vocabulary complexity
+             0.15 * L2 +  # speaking rate
+             0.15 * L3 +  # disfluency
+             0.20 * L4 +  # pause regularity
+             0.15 * L5 +  # rate variability
+             0.10 * L6 +  # sentence uniformity
+             0.10 * L7    # self-corrections
          )

          return float(score)

+     # generate human-readable interpretation of speech patterns
+     def _interpret_speech_patterns(self, analysis, kopparapu_features=None, kopparapu_score=None):
          filler_ratio = analysis['filler_words']['ratio']
          pause_patterns = analysis['pause_patterns']
          speech_rate = analysis['speech_rate']

          interpretation = "**Overall Assessment:**\n\n"

+         # calculate spontaneity score
          spontaneity_score = 0
          indicators = []

          ...
          if 120 <= speech_rate <= 180:
              spontaneity_score += 1
              indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
+
+         # generate interpretation based on score
          if spontaneity_score >= 2:
              interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
              if indicators:
                  ...

          return interpretation

+     # get detailed segment information
+     def get_detailed_segments(self, audio_path):
          result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
          return result.get('segments', [])


+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      recognizer = SpeechRecognizer(model_size="base")
      print(f"Speech recognizer initialized with {recognizer.model_size} model")
      print(f"Device: {recognizer.device}")
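To make the weighted combination concrete, here is a minimal standalone sketch of the score computed by _calculate_kopparapu_score above. The feature values are hypothetical (chosen to mimic read-aloud speech), and the disfluency sum keeps only the two terms visible in the diff, since one term of that sum is collapsed above.

import numpy as np

def logistic(x, a, b):
    # same smooth 0-1 mapping as SpeechRecognizer._logistic
    return 1.0 / (1.0 + np.exp(-(x - a) / b))

# hypothetical features, chosen to look like read speech
features = {
    'chars_per_word': 5.2,            # longer words than casual speech
    'words_per_sec': 2.6,             # brisk, steady delivery
    'nonalpha_per_sec': 0.4,
    'filler_rate': 0.01,              # almost no "um"/"uh"
    'pause_regularity': 0.8,          # pauses land evenly, as at punctuation
    'speech_rate_variability': 0.2,
    'sentence_length_variance': 0.3,
    'self_correction_count': 0,
}

L1 = logistic(features['chars_per_word'], a=4.8, b=1.2)
L2 = logistic(features['words_per_sec'], a=2.2, b=0.6)
disfluency = features['nonalpha_per_sec'] + 8.0 * features['filler_rate']  # third term collapsed in the diff
L3 = logistic(-disfluency, a=0.0, b=0.8)
L4 = features['pause_regularity']
L5 = 1.0 - features['speech_rate_variability']
L6 = 1.0 - features['sentence_length_variance']
L7 = logistic(-features['self_correction_count'], a=0.0, b=1.5)

score = (0.15 * L1 + 0.15 * L2 + 0.15 * L3 +
         0.20 * L4 + 0.15 * L5 + 0.10 * L6 + 0.10 * L7)
print(f"kopparapu score: {score:.2f}")  # about 0.64 here; nearer 1 leans 'read', nearer 0 'spontaneous'

With its 0.20 weight, the pause-regularity channel moves the score more than any other single cue, matching the old comment about pause regularity and rate consistency being key read markers.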
text_analyzer.py
CHANGED
Old version (lines removed in this commit are marked "-"):

@@ -1,18 +1,7 @@
- import re
- import requests
- from typing import Dict, List, Tuple, Optional
- import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSequenceClassification,
-     RobertaTokenizer,
-     RobertaForSequenceClassification
- )
- import numpy as np
- from collections import Counter
  import warnings
  warnings.filterwarnings("ignore")

  try:
      from plagiarism_detection import ai_plagiarism_detection
      DESKLIB_AVAILABLE = True

@@ -21,12 +10,12 @@ except ImportError:
      print("Warning: plagiarism_detection module not found. Using fallback AI detection.")


-
-
  class AITextDetector:
-     def __init__(self, device
          self.threshold = threshold

          if not DESKLIB_AVAILABLE:
              print("Warning: plagiarism_detection module not found. AI detection will not be available.")
              print("Ensure plagiarism_detection.py is in the same directory.")

@@ -35,10 +24,11 @@ class AITextDetector:
              print(f"Using Desklib AI text detector (threshold: {self.threshold})")
              self.available = True

-

          if not self.available:
-             # Return neutral result if Desklib not available
              return {
                  'ai_generated': False,
                  'confidence': 0.5,

@@ -47,7 +37,7 @@ class AITextDetector:
                  'model_used': 'N/A (module not found)'
              }

-         #
          try:
              probability, ai_detected = ai_plagiarism_detection(
                  text,

@@ -63,17 +53,17 @@ class AITextDetector:
                  'model_used': 'Desklib AI Detector v1.01'
              }
          except Exception as e:
-             print(f"
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  'indicators': [],
-                 'interpretation':
                  'model_used': 'Error'
              }

-
-     def _identify_ai_indicators(self, probability
          indicators = []

          if probability > 0.9:

@@ -85,7 +75,8 @@ class AITextDetector:

          return indicators

-
          interpretation = f"**AI-Generated Text Detection:**\n\n"
          interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
          interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"

@@ -93,21 +84,23 @@ class AITextDetector:
          return interpretation


  class TextAuthenticityAnalyzer:

-     def __init__(self, device
-
          self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)

-
-
          ai_results = self.ai_detector.detect_ai_text(text)

-         #
          ai_penalty = ai_results['confidence']
          authenticity_score = 1.0 - ai_penalty

-         #
          if authenticity_score < 0.3:
              overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
              risk_level = "high"

@@ -129,9 +122,8 @@ class TextAuthenticityAnalyzer:
          }


  if __name__ == "__main__":
-     # Example usage
      analyzer = TextAuthenticityAnalyzer()
      print("Text authenticity analyzer initialized.")
      print("Components: Plagiarism Detector + AI Text Detector")
-
New version (lines added in this commit are marked "+"; unchanged runs are collapsed as "..."):

  import warnings
  warnings.filterwarnings("ignore")

+ # try to import the desklib AI detector
  try:
      from plagiarism_detection import ai_plagiarism_detection
      DESKLIB_AVAILABLE = True
  ...
      print("Warning: plagiarism_detection module not found. Using fallback AI detection.")


+ # class for detecting AI-generated text
  class AITextDetector:
+     def __init__(self, device=None, threshold=0.78):
          self.threshold = threshold

+         # check if desklib model is available
          if not DESKLIB_AVAILABLE:
              print("Warning: plagiarism_detection module not found. AI detection will not be available.")
              print("Ensure plagiarism_detection.py is in the same directory.")
          ...
              print(f"Using Desklib AI text detector (threshold: {self.threshold})")
              self.available = True

+     # main detection function
+     def detect_ai_text(self, text):

+         # return neutral result if detector not available
          if not self.available:
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  ...
                  'model_used': 'N/A (module not found)'
              }

+         # run detection using desklib model
          try:
              probability, ai_detected = ai_plagiarism_detection(
                  text,
                  ...
                  'model_used': 'Desklib AI Detector v1.01'
              }
          except Exception as e:
+             print(f"Something went wrong with AI detection: {e}")
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  'indicators': [],
+                 'interpretation': "Could not run AI detection",
                  'model_used': 'Error'
              }

+     # identify specific indicators based on probability
+     def _identify_ai_indicators(self, probability):
          indicators = []

          if probability > 0.9:
              ...

          return indicators

+     # generate interpretation text
+     def _interpret_ai_detection(self, score):
          interpretation = f"**AI-Generated Text Detection:**\n\n"
          interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
          interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"
          ...
          return interpretation


+ # main analyzer class that combines all text analysis
  class TextAuthenticityAnalyzer:

+     def __init__(self, device=None, ai_threshold=0.78):
+         # initialize AI detector
          self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)

+     # analyze text for authenticity
+     def analyze(self, text):
+         # run AI detection
          ai_results = self.ai_detector.detect_ai_text(text)

+         # calculate authenticity score (inverse of AI probability)
          ai_penalty = ai_results['confidence']
          authenticity_score = 1.0 - ai_penalty

+         # determine risk level based on authenticity
          if authenticity_score < 0.3:
              overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
              risk_level = "high"
          ...
          }


+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      analyzer = TextAuthenticityAnalyzer()
      print("Text authenticity analyzer initialized.")
      print("Components: Plagiarism Detector + AI Text Detector")
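The authenticity score is simply one minus the detector's AI probability, and only the 0.3 cutoff for the "high" bucket is visible above (the remaining buckets are collapsed in the diff). A minimal numeric sketch with a made-up detector output:

# illustration only: ai_confidence is a hypothetical detector output
ai_confidence = 0.85                       # probability the text is AI-generated
authenticity_score = 1.0 - ai_confidence   # -> 0.15

if authenticity_score < 0.3:               # the one cutoff visible in the diff
    risk_level = "high"
else:
    risk_level = "lower"                   # remaining buckets are collapsed above

print(f"authenticity={authenticity_score:.2f}, risk={risk_level}")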