import gradio as gr
import os
from pipeline import AuthenticityDetectionPipeline
import traceback
# Build the analysis pipeline once at import time so every request reuses
# the loaded Whisper model.  `pipeline_ready` gates the UI handler.
pipeline = None  # stays None when initialization fails
try:
    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    pipeline_ready = True
except Exception:
    # Log the failure instead of swallowing it silently; the UI falls back
    # to a friendly error message via the pipeline_ready flag.
    traceback.print_exc()
    pipeline_ready = False
def analyze_audio_file(audio_file):
    """Run the full authenticity analysis on one uploaded audio file.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the uploaded/recorded audio
        (Gradio ``Audio(type="filepath")``).

    Returns
    -------
    tuple[str, str, str, str, str]
        Markdown for the five UI panes: overall status, acoustic features,
        transcription, speech patterns, AI detection.  On failure the first
        element carries the error text and the rest are empty strings.
    """
    if not pipeline_ready:
        return (
            "Error: Pipeline not initialized. Please check the installation.",
            "", "", "", ""
        )
    if audio_file is None:
        return (
            "Please upload an audio file.",
            "", "", "", ""
        )
    try:
        # language=None lets the pipeline auto-detect the spoken language.
        results = pipeline.analyze_audio(audio_file, language=None)

        asr = results['speech_recognition']
        final = results['final_assessment']

        return (
            _overall_markdown(results, final),
            results['audio_classification']['interpretation'],
            _transcription_markdown(asr),
            _speech_patterns_markdown(asr),
            _ai_detection_markdown(results['text_authenticity']),
        )
    except Exception as e:
        error_msg = f"Error during analysis:\n\n{str(e)}\n\n{traceback.format_exc()}"
        # Exactly five elements, matching the five output components wired
        # up in create_interface (a 6-tuple here breaks Gradio's rendering).
        return (error_msg, "", "", "", "")


def _overall_markdown(results, final):
    """Headline verdict block rendered above the result tabs."""
    # Content is kept flush-left: 4+ leading spaces would become a
    # markdown code block.
    return f"""
{final['verdict']}
{final['composite_authenticity_score']*100:.0f}%
Authenticity Score
{final['risk_level'].upper()}
Risk Level
{results['processing_time']:.1f}s
Processing Time
{final['recommendation']}
"""


def _transcription_markdown(asr):
    """ASR metrics table, a speech-rate note, and the full transcript."""
    md = "### Speech Transcription\n\n"
    md += "| Metric | Value |\n"
    md += "|--------|-------|\n"
    md += f"| **Language** | {asr['language'].upper()} |\n"
    md += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
    md += f"| **Word Count** | {asr['word_count']} words |\n"
    md += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
    # 120-160 words/min is treated as normal conversational pace.
    if asr['speech_rate'] > 160:
        md += "**Fast speech rate** - Above average speaking speed\n\n"
    elif asr['speech_rate'] < 120:
        md += "**Slow speech rate** - Below average speaking speed\n\n"
    else:
        md += "**Normal speech rate** - Average conversational pace\n\n"
    md += "---\n\n"
    md += "#### Full Transcription\n\n"
    md += f"> {asr['transcription']}"
    return md


def _speech_patterns_markdown(asr):
    """Read-vs-spontaneous analysis pane (Kopparapu-style features).

    Returns a short fallback message when the pipeline produced no
    ``kopparapu_score`` for this recording, so the pane is always defined.
    """
    if 'kopparapu_score' not in asr:
        return "Speech-pattern analysis is not available for this recording."

    classification = asr['kopparapu_classification'].upper()
    score = asr['kopparapu_score']
    # Confidence = distance from the 0.5 decision boundary, folded to [0.5, 1].
    confidence = score if score >= 0.5 else (1 - score)

    md = f" ### **Classification: {classification} SPEECH**\n\n"
    md += f"**Score:** {score:.3f} (0=spontaneous, 1=read)\n"
    md += f"**Confidence:** {confidence*100:.1f}%\n\n"
    md += "---\n\n"
    md += "#### Linguistic Metrics\n\n"
    kf = asr['kopparapu_features']
    md += "| Feature | Value | Interpretation |\n"
    md += "|---------|-------|----------------|\n"
    md += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
    if kf['chars_per_word'] > 5.5:
        md += "Complex vocabulary |\n"
    elif kf['chars_per_word'] < 4.5:
        md += "Simple vocabulary |\n"
    else:
        md += "Average complexity |\n"
    md += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
    if kf['words_per_sec'] > 3:
        md += "Fast pacing |\n"
    elif kf['words_per_sec'] < 2:
        md += "Slow pacing |\n"
    else:
        md += "Normal pacing |\n"
    md += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
    if kf['filler_rate'] > 0.05:
        md += "High (spontaneous) |\n"
    elif kf['filler_rate'] < 0.02:
        md += "Low (scripted) |\n"
    else:
        md += "Moderate |\n"
    md += f"| **Repetitions** | {kf['repetition_count']} | "
    if kf['repetition_count'] > 3:
        md += "Multiple (thinking aloud) |\n"
    elif kf['repetition_count'] == 0:
        md += "None (prepared) |\n"
    else:
        md += "Few |\n"

    md += "\n---\n\n"
    md += "#### Reading Style Indicators\n\n"
    md += "| Feature | Value | Interpretation |\n"
    md += "|---------|-------|----------------|\n"
    # Read speech pauses at punctuation, producing very regular pauses.
    pause_reg = kf.get('pause_regularity', 0.5)
    md += f"| **Pause Regularity** | {pause_reg:.2f} | "
    if pause_reg > 0.7:
        md += "Very regular (read) |\n"
    elif pause_reg > 0.4:
        md += "Moderate |\n"
    else:
        md += "Irregular (spontaneous) |\n"
    # Spontaneous speakers speed up and slow down; readers hold a steady rate.
    rate_var = kf.get('speech_rate_variability', 0.0)
    md += f"| **Rate Variability** | {rate_var:.2f} | "
    if rate_var > 0.6:
        md += "High (spontaneous) |\n"
    elif rate_var > 0.3:
        md += "Moderate |\n"
    else:
        md += "Steady pace (read) |\n"
    # Scripted text tends toward uniform sentence lengths.
    sent_var = kf.get('sentence_length_variance', 0.0)
    md += f"| **Sentence Variance** | {sent_var:.2f} | "
    if sent_var > 0.5:
        md += "Variable (spontaneous) |\n"
    elif sent_var > 0.25:
        md += "Moderate |\n"
    else:
        md += "Uniform (read) |\n"
    # Self-corrections are a strong marker of unscripted speech.
    corrections = kf.get('self_correction_count', 0)
    md += f"| **Self-Corrections** | {corrections} | "
    if corrections > 2:
        md += "Multiple (spontaneous) |\n"
    elif corrections > 0:
        md += "Few |\n"
    else:
        md += "None (scripted) |\n"
    md += "\n"

    md += "---\n\n"
    md += "#### Filler Words & Disfluencies\n\n"
    filler_ratio = asr['filler_words']['ratio']
    md += f"**Count:** {asr['filler_words']['count']} filler words\n"
    md += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
    if asr['filler_words']['details']:
        md += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
    if filler_ratio > 0.05:
        md += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
    elif filler_ratio < 0.02:
        md += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
    else:
        md += "**Moderate filler usage** - Normal conversational pattern\n\n"

    md += "---\n\n"
    md += "#### Pause Patterns\n\n"
    pause_var = asr['pause_patterns']['pause_variability']
    md += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
    md += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
    md += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
    md += f"**Variability:** {pause_var:.2f}\n\n"
    if pause_var < 0.3:
        md += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
    elif pause_var > 0.6:
        md += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
    else:
        md += "**Moderate variability** - Mixed pattern\n\n"
    return md


def _ai_detection_markdown(text_auth):
    """AI-vs-human verdict with an ASCII confidence bar."""
    detection = text_auth['ai_detection']
    ai_prob = detection['confidence']
    if detection['ai_generated']:
        md = "### **AI-GENERATED LIKELY**\n\n"
    else:
        md = "### **HUMAN-WRITTEN LIKELY**\n\n"
    md += "**Confidence:**\n\n"
    bar_length = 30
    ai_bars = int(ai_prob * bar_length)
    human_bars = bar_length - ai_bars
    md += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
    md += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
    md += "---\n\n"
    md += "#### Interpretation\n\n"
    # .get() covers a missing key; the truthiness check additionally covers
    # an explicitly empty/None interpretation value.
    interpretation = detection.get('interpretation', 'No interpretation available.')
    md += interpretation if interpretation else "No interpretation available."
    return md
def create_interface():
    """Assemble and return the Gradio Blocks UI.

    Layout: header, a two-column row (audio input controls | tabbed results),
    cached example clips, and a click handler that first paints a loading
    placeholder and then runs the analysis.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) interface.
    """
    # App-wide stylesheet: IBM Plex Sans everywhere plus a blue primary button.
    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
    .gradio-container {
        font-family: 'IBM Plex Sans', sans-serif !important;
        background: white !important;
    }
    .contain {
        max-width: 100% !important;
        width: 100% !important;
        margin: 0 auto !important;
        background: white !important;
        padding: 0 !important;
    }
    .tab-nav button {
        font-family: 'IBM Plex Sans', sans-serif;
        font-size: 14px;
        font-weight: 500;
        padding: 10px 16px;
        border-radius: 8px 8px 0 0;
        transition: all 0.2s;
    }
    .tab-nav button.selected {
        background: #2563eb;
        color: white;
        font-weight: 600;
    }
    button.primary, .primary {
        background: #2563eb !important;
        color: white !important;
        border: none !important;
        font-size: 16px !important;
        font-weight: 600 !important;
        padding: 12px 24px !important;
        border-radius: 8px !important;
        transition: all 0.2s !important;
    }
    button.primary:hover, .primary:hover {
        background: #1d4ed8 !important;
    }
    .markdown-text {
        font-family: 'IBM Plex Sans', sans-serif;
        line-height: 1.7;
    }
    h1, h2, h3, h4 {
        font-family: 'IBM Plex Sans', sans-serif;
        font-weight: 600;
    }
    """
    # Bug fix: custom_css was previously built but never handed to gr.Blocks,
    # so none of the styling above was applied.
    with gr.Blocks(title="Authenticity Detection System", css=custom_css) as demo:
        gr.HTML("""
LEIDEN UNIVERSITY · LIACS
Audio Processing & Indexing Project
Detecting AI-Assisted Responses in Online Settings
Multi-Modal Analysis
Acoustic + Linguistic
""")
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
Audio Input
Upload or record your audio file
""")
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Audio File",
                    show_label=False
                )
                analyze_btn = gr.Button(
                    "Analyze Audio",
                    variant="primary",
                    size="lg"
                )
                gr.HTML("""
Requirements
Formats: WAV, MP3, M4A, FLAC, OGG
Duration: 30 sec - 5 min
Note: Provides probabilistic assessments.
Use as one factor in evaluation.
""")
            with gr.Column(scale=2):
                gr.HTML("""
Analysis Results
You'll see results here
""")
                overall_output = gr.Markdown()
                with gr.Tabs() as tabs:
                    with gr.Tab("Acoustic Features"):
                        acoustic_output = gr.Markdown()
                    with gr.Tab("Transcription"):
                        transcription_output = gr.Markdown()
                    with gr.Tab("Speech Patterns"):
                        speech_output = gr.Markdown()
                    with gr.Tab("AI Detection"):
                        ai_output = gr.Markdown()

        # Example audio files; cache_examples=True pre-runs the analysis at
        # build time so clicking an example shows results instantly.
        gr.HTML("""
Try these examples:
""")
        examples_dir = os.path.join(os.path.dirname(__file__), "examples")
        gr.Examples(
            examples=[
                [os.path.join(examples_dir, "read1.ogg")],
                [os.path.join(examples_dir, "spontaneous1.ogg")]
            ],
            inputs=[audio_input],
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ],
            fn=analyze_audio_file,
            label="",
            examples_per_page=2,
            cache_examples=True
        )

        def show_loading():
            """Return placeholder content for all five panes while analyzing."""
            loading_html = """
"""
            loading_msg = " **Processing...**"
            return loading_html, loading_msg, loading_msg, loading_msg, loading_msg

        # Two-step handler: the queue=False step paints the loading state
        # immediately; .then() runs the (slow) analysis and overwrites it.
        analyze_btn.click(
            fn=show_loading,
            inputs=None,
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ],
            queue=False
        ).then(
            fn=analyze_audio_file,
            inputs=[audio_input],
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ]
        )

        gr.HTML("""
""")
    return demo
if __name__ == "__main__":
    # Serve on all interfaces at port 7860; errors are surfaced in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)