import gradio as gr
import os
from pipeline import AuthenticityDetectionPipeline
import traceback
# Build the analysis pipeline once at import time so every request reuses
# the loaded Whisper model.  `pipeline_ready` gates the UI handler.
pipeline = None  # stays None when initialization fails
try:
    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    pipeline_ready = True
except Exception:
    # Log the failure instead of swallowing it silently; the UI falls back
    # to a friendly error message via the pipeline_ready flag.
    traceback.print_exc()
    pipeline_ready = False
def analyze_audio_file(audio_file):
    """Run the full authenticity analysis on one uploaded audio file.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the uploaded/recorded audio
        (Gradio ``Audio(type="filepath")``).

    Returns
    -------
    tuple[str, str, str, str, str]
        Markdown for the five UI panes: overall status, acoustic features,
        transcription, speech patterns, AI detection.  On failure the first
        element carries the error text and the rest are empty strings.
    """
    if not pipeline_ready:
        return (
            "Error: Pipeline not initialized. Please check the installation.",
            "", "", "", ""
        )
    if audio_file is None:
        return (
            "Please upload an audio file.",
            "", "", "", ""
        )
    try:
        # language=None lets the pipeline auto-detect the spoken language.
        results = pipeline.analyze_audio(audio_file, language=None)

        asr = results['speech_recognition']
        final = results['final_assessment']

        return (
            _overall_markdown(results, final),
            results['audio_classification']['interpretation'],
            _transcription_markdown(asr),
            _speech_patterns_markdown(asr),
            _ai_detection_markdown(results['text_authenticity']),
        )
    except Exception as e:
        error_msg = f"Error during analysis:\n\n{str(e)}\n\n{traceback.format_exc()}"
        # Exactly five elements, matching the five output components wired
        # up in create_interface (a 6-tuple here breaks Gradio's rendering).
        return (error_msg, "", "", "", "")


def _overall_markdown(results, final):
    """Headline verdict block rendered above the result tabs."""
    # Content is kept flush-left: 4+ leading spaces would become a
    # markdown code block.
    return f"""
{final['verdict']}
{final['composite_authenticity_score']*100:.0f}%
Authenticity Score
{final['risk_level'].upper()}
Risk Level
{results['processing_time']:.1f}s
Processing Time
{final['recommendation']}
"""


def _transcription_markdown(asr):
    """ASR metrics table, a speech-rate note, and the full transcript."""
    md = "### Speech Transcription\n\n"
    md += "| Metric | Value |\n"
    md += "|--------|-------|\n"
    md += f"| **Language** | {asr['language'].upper()} |\n"
    md += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
    md += f"| **Word Count** | {asr['word_count']} words |\n"
    md += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
    # 120-160 words/min is treated as normal conversational pace.
    if asr['speech_rate'] > 160:
        md += "**Fast speech rate** - Above average speaking speed\n\n"
    elif asr['speech_rate'] < 120:
        md += "**Slow speech rate** - Below average speaking speed\n\n"
    else:
        md += "**Normal speech rate** - Average conversational pace\n\n"
    md += "---\n\n"
    md += "#### Full Transcription\n\n"
    md += f"> {asr['transcription']}"
    return md


def _speech_patterns_markdown(asr):
    """Read-vs-spontaneous analysis pane (Kopparapu-style features).

    Returns a short fallback message when the pipeline produced no
    ``kopparapu_score`` for this recording, so the pane is always defined.
    """
    if 'kopparapu_score' not in asr:
        return "Speech-pattern analysis is not available for this recording."

    classification = asr['kopparapu_classification'].upper()
    score = asr['kopparapu_score']
    # Confidence = distance from the 0.5 decision boundary, folded to [0.5, 1].
    confidence = score if score >= 0.5 else (1 - score)

    md = f" ### **Classification: {classification} SPEECH**\n\n"
    md += f"**Score:** {score:.3f} (0=spontaneous, 1=read)\n"
    md += f"**Confidence:** {confidence*100:.1f}%\n\n"
    md += "---\n\n"
    md += "#### Linguistic Metrics\n\n"
    kf = asr['kopparapu_features']
    md += "| Feature | Value | Interpretation |\n"
    md += "|---------|-------|----------------|\n"
    md += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
    if kf['chars_per_word'] > 5.5:
        md += "Complex vocabulary |\n"
    elif kf['chars_per_word'] < 4.5:
        md += "Simple vocabulary |\n"
    else:
        md += "Average complexity |\n"
    md += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
    if kf['words_per_sec'] > 3:
        md += "Fast pacing |\n"
    elif kf['words_per_sec'] < 2:
        md += "Slow pacing |\n"
    else:
        md += "Normal pacing |\n"
    md += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
    if kf['filler_rate'] > 0.05:
        md += "High (spontaneous) |\n"
    elif kf['filler_rate'] < 0.02:
        md += "Low (scripted) |\n"
    else:
        md += "Moderate |\n"
    md += f"| **Repetitions** | {kf['repetition_count']} | "
    if kf['repetition_count'] > 3:
        md += "Multiple (thinking aloud) |\n"
    elif kf['repetition_count'] == 0:
        md += "None (prepared) |\n"
    else:
        md += "Few |\n"

    md += "\n---\n\n"
    md += "#### Reading Style Indicators\n\n"
    md += "| Feature | Value | Interpretation |\n"
    md += "|---------|-------|----------------|\n"
    # Read speech pauses at punctuation, producing very regular pauses.
    pause_reg = kf.get('pause_regularity', 0.5)
    md += f"| **Pause Regularity** | {pause_reg:.2f} | "
    if pause_reg > 0.7:
        md += "Very regular (read) |\n"
    elif pause_reg > 0.4:
        md += "Moderate |\n"
    else:
        md += "Irregular (spontaneous) |\n"
    # Spontaneous speakers speed up and slow down; readers hold a steady rate.
    rate_var = kf.get('speech_rate_variability', 0.0)
    md += f"| **Rate Variability** | {rate_var:.2f} | "
    if rate_var > 0.6:
        md += "High (spontaneous) |\n"
    elif rate_var > 0.3:
        md += "Moderate |\n"
    else:
        md += "Steady pace (read) |\n"
    # Scripted text tends toward uniform sentence lengths.
    sent_var = kf.get('sentence_length_variance', 0.0)
    md += f"| **Sentence Variance** | {sent_var:.2f} | "
    if sent_var > 0.5:
        md += "Variable (spontaneous) |\n"
    elif sent_var > 0.25:
        md += "Moderate |\n"
    else:
        md += "Uniform (read) |\n"
    # Self-corrections are a strong marker of unscripted speech.
    corrections = kf.get('self_correction_count', 0)
    md += f"| **Self-Corrections** | {corrections} | "
    if corrections > 2:
        md += "Multiple (spontaneous) |\n"
    elif corrections > 0:
        md += "Few |\n"
    else:
        md += "None (scripted) |\n"
    md += "\n"

    md += "---\n\n"
    md += "#### Filler Words & Disfluencies\n\n"
    filler_ratio = asr['filler_words']['ratio']
    md += f"**Count:** {asr['filler_words']['count']} filler words\n"
    md += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
    if asr['filler_words']['details']:
        md += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
    if filler_ratio > 0.05:
        md += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
    elif filler_ratio < 0.02:
        md += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
    else:
        md += "**Moderate filler usage** - Normal conversational pattern\n\n"

    md += "---\n\n"
    md += "#### Pause Patterns\n\n"
    pause_var = asr['pause_patterns']['pause_variability']
    md += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
    md += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
    md += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
    md += f"**Variability:** {pause_var:.2f}\n\n"
    if pause_var < 0.3:
        md += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
    elif pause_var > 0.6:
        md += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
    else:
        md += "**Moderate variability** - Mixed pattern\n\n"
    return md


def _ai_detection_markdown(text_auth):
    """AI-vs-human verdict with an ASCII confidence bar."""
    detection = text_auth['ai_detection']
    ai_prob = detection['confidence']
    if detection['ai_generated']:
        md = "### **AI-GENERATED LIKELY**\n\n"
    else:
        md = "### **HUMAN-WRITTEN LIKELY**\n\n"
    md += "**Confidence:**\n\n"
    bar_length = 30
    ai_bars = int(ai_prob * bar_length)
    human_bars = bar_length - ai_bars
    md += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
    md += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
    md += "---\n\n"
    md += "#### Interpretation\n\n"
    # .get() covers a missing key; the truthiness check additionally covers
    # an explicitly empty/None interpretation value.
    interpretation = detection.get('interpretation', 'No interpretation available.')
    md += interpretation if interpretation else "No interpretation available."
    return md
def create_interface():
    """Assemble and return the Gradio Blocks UI.

    Layout: header, a two-column row (audio input controls | tabbed results),
    cached example clips, and a click handler that first paints a loading
    placeholder and then runs the analysis.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) interface.
    """
    # App-wide stylesheet: IBM Plex Sans everywhere plus a blue primary button.
    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
    .gradio-container {
        font-family: 'IBM Plex Sans', sans-serif !important;
        background: white !important;
    }
    .contain {
        max-width: 100% !important;
        width: 100% !important;
        margin: 0 auto !important;
        background: white !important;
        padding: 0 !important;
    }
    .tab-nav button {
        font-family: 'IBM Plex Sans', sans-serif;
        font-size: 14px;
        font-weight: 500;
        padding: 10px 16px;
        border-radius: 8px 8px 0 0;
        transition: all 0.2s;
    }
    .tab-nav button.selected {
        background: #2563eb;
        color: white;
        font-weight: 600;
    }
    button.primary, .primary {
        background: #2563eb !important;
        color: white !important;
        border: none !important;
        font-size: 16px !important;
        font-weight: 600 !important;
        padding: 12px 24px !important;
        border-radius: 8px !important;
        transition: all 0.2s !important;
    }
    button.primary:hover, .primary:hover {
        background: #1d4ed8 !important;
    }
    .markdown-text {
        font-family: 'IBM Plex Sans', sans-serif;
        line-height: 1.7;
    }
    h1, h2, h3, h4 {
        font-family: 'IBM Plex Sans', sans-serif;
        font-weight: 600;
    }
    """
    # Bug fix: custom_css was previously built but never handed to gr.Blocks,
    # so none of the styling above was applied.
    with gr.Blocks(title="Authenticity Detection System", css=custom_css) as demo:
        gr.HTML("""
LEIDEN UNIVERSITY · LIACS
Audio Processing & Indexing Project
Detecting AI-Assisted Responses in Online Settings
Multi-Modal Analysis
Acoustic + Linguistic
""")
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
Audio Input
Upload or record your audio file
""")
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Audio File",
                    show_label=False
                )
                analyze_btn = gr.Button(
                    "Analyze Audio",
                    variant="primary",
                    size="lg"
                )
                gr.HTML("""
Requirements
Formats: WAV, MP3, M4A, FLAC, OGG
Duration: 30 sec - 5 min
Note: Provides probabilistic assessments.
Use as one factor in evaluation.
""")
            with gr.Column(scale=2):
                gr.HTML("""
Analysis Results
You'll see results here
""")
                overall_output = gr.Markdown()
                with gr.Tabs() as tabs:
                    with gr.Tab("Acoustic Features"):
                        acoustic_output = gr.Markdown()
                    with gr.Tab("Transcription"):
                        transcription_output = gr.Markdown()
                    with gr.Tab("Speech Patterns"):
                        speech_output = gr.Markdown()
                    with gr.Tab("AI Detection"):
                        ai_output = gr.Markdown()

        # Example audio files; cache_examples=True pre-runs the analysis at
        # build time so clicking an example shows results instantly.
        gr.HTML("""
Try these examples:
""")
        examples_dir = os.path.join(os.path.dirname(__file__), "examples")
        gr.Examples(
            examples=[
                [os.path.join(examples_dir, "read1.ogg")],
                [os.path.join(examples_dir, "spontaneous1.ogg")]
            ],
            inputs=[audio_input],
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ],
            fn=analyze_audio_file,
            label="",
            examples_per_page=2,
            cache_examples=True
        )

        def show_loading():
            """Return placeholder content for all five panes while analyzing."""
            loading_html = """
"""
            loading_msg = " **Processing...**"
            return loading_html, loading_msg, loading_msg, loading_msg, loading_msg

        # Two-step handler: the queue=False step paints the loading state
        # immediately; .then() runs the (slow) analysis and overwrites it.
        analyze_btn.click(
            fn=show_loading,
            inputs=None,
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ],
            queue=False
        ).then(
            fn=analyze_audio_file,
            inputs=[audio_input],
            outputs=[
                overall_output,
                acoustic_output,
                transcription_output,
                speech_output,
                ai_output,
            ]
        )

        gr.HTML("""
""")
    return demo
if __name__ == "__main__":
    # Serve on all interfaces at port 7860; errors are surfaced in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)