Spaces: Ranam Hamoud committed · 0b42831
Parent(s): 8528e25
Update files and add .gitignore, remove pycache from tracking

Files changed:
- .gitignore +3 -0
- __pycache__/audio_classifier.cpython-313.pyc +0 -0
- __pycache__/pipeline.cpython-313.pyc +0 -0
- __pycache__/plagiarism_detection.cpython-313.pyc +0 -0
- __pycache__/speech_recognizer.cpython-313.pyc +0 -0
- __pycache__/text_analyzer.cpython-313.pyc +0 -0
- app.py +451 -261
- audio_classifier.py +189 -101
- examples/.DS_Store +0 -0
- examples/{spontaneous1.ogg → read1.wav} +2 -2
- examples/{read1.ogg → spontaneous1.wav} +2 -2
- pipeline.py +62 -30
- speech_recognizer.py +103 -84
- text_analyzer.py +23 -31
.gitignore
ADDED
@@ -0,0 +1,3 @@
+pipeline_scores.png
+plot_component_accuracy.py
+__pycache__/
__pycache__/audio_classifier.cpython-313.pyc DELETED
Binary file (19.2 kB)

__pycache__/pipeline.cpython-313.pyc DELETED
Binary file (8.3 kB)

__pycache__/plagiarism_detection.cpython-313.pyc DELETED
Binary file (15.7 kB)

__pycache__/speech_recognizer.cpython-313.pyc DELETED
Binary file (16 kB)

__pycache__/text_analyzer.cpython-313.pyc DELETED
Binary file (5.56 kB)
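The app.py diff below is driven entirely by the dictionary returned from AuthenticityDetectionPipeline.analyze_audio. As a reading aid, here is a minimal sketch of that interface, using only the constructor argument, method signature, and result keys visible in this diff; treat it as illustrative, not as the module's documented API.

    # Sketch of the pipeline interface app.py consumes (names taken from the diff).
    from pipeline import AuthenticityDetectionPipeline

    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    results = pipeline.analyze_audio("examples/read1.wav", language=None)

    audio_class = results['audio_classification']  # CNN + prosody classification
    asr = results['speech_recognition']            # transcription, rates, pauses
    text_auth = results['text_authenticity']       # ai_detection confidence
    final = results['final_assessment']            # 'verdict' string mapped to a color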
app.py
CHANGED
@@ -3,36 +3,430 @@ import os
 from pipeline import AuthenticityDetectionPipeline
 import traceback

+# initialize the pipeline on startup
 try:
     pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
     pipeline_ready = True
-except Exception:
+except Exception as e:
     pipeline_ready = False
+    pipeline_error = str(e)
+    import traceback
+    print(f"Could not start pipeline: {e}")
+    traceback.print_exc()


+# build the acoustic features display HTML
+def build_acoustic_features_display(audio_class):
+    classification = audio_class['classification']
+    confidence = audio_class['confidence']
+    cnn_class = audio_class['cnn_classification']
+    cnn_conf = audio_class['cnn_confidence']
+    prosody_class = audio_class['prosody_classification']
+    prosody_conf = audio_class['prosody_confidence']
+    prosody_scores = audio_class.get('prosody_scores', {})
+    acoustic_features = audio_class.get('acoustic_features', {})
+
+    # color scheme based on classification
+    if classification == 'spontaneous':
+        main_color = '#10b981'
+        bg_color = '#ecfdf5'
+        label = 'SPONTANEOUS'
+    else:
+        main_color = '#f59e0b'
+        bg_color = '#fffbeb'
+        label = 'READ'
+
+    cnn_color = '#10b981' if cnn_class == 'spontaneous' else '#f59e0b'
+    prosody_color = '#10b981' if prosody_class == 'spontaneous' else '#f59e0b'
+
+    # build main classification header
+    output = f"""
+    <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+        <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label} SPEECH</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Combined acoustic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Analysis Components</h4>
+        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">CNN Neural Network</div>
+                <div style="font-size: 20px; font-weight: 700; color: {cnn_color}; margin-bottom: 8px;">{cnn_class.upper()}</div>
+                <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                    <div style="height: 100%; width: {cnn_conf*100:.0f}%; background: {cnn_color}; border-radius: 6px;"></div>
+                </div>
+                <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{cnn_conf*100:.1f}% confidence</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px;">
+                <div style="font-size: 12px; color: #6b7280; margin-bottom: 8px; font-weight: 500;">Prosody Analysis</div>
+                <div style="font-size: 20px; font-weight: 700; color: {prosody_color}; margin-bottom: 8px;">{prosody_class.upper()}</div>
+                <div style="background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 6px;">
+                    <div style="height: 100%; width: {prosody_conf*100:.0f}%; background: {prosody_color}; border-radius: 6px;"></div>
+                </div>
+                <div style="font-size: 11px; color: #9ca3af; margin-top: 6px;">{prosody_conf*100:.1f}% confidence</div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Prosody Feature Breakdown</h4>
+    """
+
+    # feature descriptions
+    feature_info = {
+        'spectral_variability': {'name': 'Spectral Variability', 'unit': 'Hz', 'description': 'Variation in frequency content over time'},
+        'zcr_mean': {'name': 'Zero Crossing Rate', 'unit': 'ratio', 'description': 'Rate of signal sign changes'},
+        'energy_level': {'name': 'Energy Level', 'unit': 'RMS', 'description': 'Overall loudness and intensity'},
+        'tempo': {'name': 'Speech Tempo', 'unit': 'BPM', 'description': 'Rhythmic pacing of speech'}
+    }
+
+    # add feature details
+    for key, info in feature_info.items():
+        if key in prosody_scores:
+            score_data = prosody_scores[key]
+            score = score_data['score']
+            value = score_data['value']
+            interp = score_data['interpretation']
+            unit = info['unit']
+
+            bar_color = '#10b981' if score < 0.4 else '#f59e0b' if score > 0.6 else '#6b7280'
+            indicator_position = score * 100
+
+            output += f"""
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; margin-bottom: 10px;">
+                <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 8px;">
+                    <div>
+                        <div style="font-weight: 600; color: #1f2937; font-size: 14px;">{info['name']}</div>
+                        <div style="font-size: 11px; color: #9ca3af;">{info['description']}</div>
+                    </div>
+                    <div style="text-align: right;">
+                        <div style="font-size: 13px; font-weight: 600; color: {bar_color};">{interp}</div>
+                        <div style="font-size: 11px; color: #6b7280;">{value:.3f} <span style="color: #9ca3af;">{unit}</span></div>
+                    </div>
+                </div>
+                <div style="position: relative; background: linear-gradient(to right, #10b981, #6b7280, #f59e0b); border-radius: 4px; height: 6px; margin: 10px 0 6px 0;">
+                    <div style="position: absolute; left: {indicator_position}%; top: -4px; transform: translateX(-50%); width: 14px; height: 14px; background: white; border: 2px solid {bar_color}; border-radius: 50%; box-shadow: 0 1px 3px rgba(0,0,0,0.15);"></div>
+                </div>
+                <div style="display: flex; justify-content: space-between; font-size: 10px; color: #9ca3af;">
+                    <span>Spontaneous</span>
+                    <span>Read</span>
+                </div>
+            </div>
+            """
+
+    output += "</div>"
+
+    # add raw acoustic measurements
+    output += """
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Raw Acoustic Measurements</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+    """
+
+    if acoustic_features:
+        metrics = [
+            ('Tempo', f"{acoustic_features.get('tempo', 0):.1f}", 'BPM'),
+            ('Pitch Mean', f"{acoustic_features.get('pitch_mean', 0):.1f}", 'Hz'),
+            ('Energy Mean', f"{acoustic_features.get('energy_mean', 0):.4f}", ''),
+            ('ZCR Mean', f"{acoustic_features.get('zcr_mean', 0):.4f}", ''),
+        ]
+        for name, value, unit in metrics:
+            output += f"""
+            <div style="background: #f9fafb; border-radius: 8px; padding: 12px; text-align: center;">
+                <div style="font-size: 16px; font-weight: 600; color: #1f2937;">{value}</div>
+                <div style="font-size: 10px; color: #6b7280; margin-top: 2px;">{name} {unit}</div>
+            </div>
+            """
+
+    output += """
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the transcription display HTML
+def build_transcription_display(asr):
+    # determine speech rate interpretation
+    if asr['speech_rate'] > 160:
+        rate_color = '#f59e0b'
+        rate_label = 'Fast'
+        rate_desc = 'Above average speaking speed'
+    elif asr['speech_rate'] < 120:
+        rate_color = '#3b82f6'
+        rate_label = 'Slow'
+        rate_desc = 'Below average speaking speed'
+    else:
+        rate_color = '#10b981'
+        rate_label = 'Normal'
+        rate_desc = 'Average conversational pace'
+
+    output = f"""
+    <div style="background: linear-gradient(135deg, #eff6ff 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid #3b82f633;">
+        <h3 style="margin: 0; color: #1e40af; font-size: 22px; font-weight: 700;">Speech Transcription</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Detected language: <strong>{asr['language'].upper()}</strong></p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Speech Metrics</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['duration']:.1f}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Duration (sec)</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: #1e40af;">{asr['word_count']}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 24px; font-weight: 700; color: {rate_color};">{asr['speech_rate']:.0f}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Words/min</div>
+            </div>
+            <div style="background: {rate_color}15; border-radius: 10px; padding: 16px; text-align: center; border: 1px solid {rate_color}33;">
+                <div style="font-size: 18px; font-weight: 700; color: {rate_color};">{rate_label}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">{rate_desc}</div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Full Transcription</h4>
+        <div style="background: #f9fafb; border-radius: 10px; padding: 20px; border-left: 4px solid #3b82f6;">
+            <p style="margin: 0; font-size: 15px; line-height: 1.8; color: #374151; font-style: italic;">"{asr['transcription']}"</p>
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the speech patterns display HTML
+def build_speech_patterns_display(asr):
+    output = ""
+
+    # kopparapu classification section
+    if 'kopparapu_score' in asr:
+        classification = asr['kopparapu_classification'].upper()
+        kop_score = asr['kopparapu_score']
+        confidence = kop_score if kop_score >= 0.5 else (1 - kop_score)
+
+        if classification == 'SPONTANEOUS':
+            class_color = '#10b981'
+            class_bg = '#ecfdf5'
+        else:
+            class_color = '#f59e0b'
+            class_bg = '#fffbeb'
+
+        kf = asr['kopparapu_features']
+
+        output += f"""
+        <div style="background: linear-gradient(135deg, {class_bg} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {class_color}33;">
+            <h3 style="margin: 0; color: {class_color}; font-size: 22px; font-weight: 700;">{classification} SPEECH</h3>
+            <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">Linguistic analysis confidence: <strong>{confidence*100:.1f}%</strong></p>
+            <div style="margin-top: 12px; background: #e5e7eb; border-radius: 6px; overflow: hidden; height: 8px;">
+                <div style="height: 100%; width: {kop_score*100:.0f}%; background: linear-gradient(to right, #10b981, #f59e0b); border-radius: 6px;"></div>
+            </div>
+        </div>
+
+        <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+            <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Linguistic Metrics</h4>
+            <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['chars_per_word']:.2f}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Chars/Word</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['words_per_sec']:.2f}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Words/Sec</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['filler_rate']*100:.1f}%</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Filler Rate</div>
+                </div>
+                <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                    <div style="font-size: 20px; font-weight: 700; color: #6b7280;">{kf['repetition_count']}</div>
+                    <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Repetitions</div>
+                </div>
+            </div>
+        </div>
+        """
+
+    # filler words section
+    filler_ratio = asr['filler_words']['ratio']
+    filler_count = asr['filler_words']['count']
+
+    if filler_ratio > 0.05:
+        filler_color = '#10b981'
+        filler_label = 'High filler usage'
+        filler_desc = 'Strong indicator of spontaneous speech'
+    elif filler_ratio < 0.02:
+        filler_color = '#f59e0b'
+        filler_label = 'Low filler usage'
+        filler_desc = 'May indicate reading or rehearsed speech'
+    else:
+        filler_color = '#6b7280'
+        filler_label = 'Moderate filler usage'
+        filler_desc = 'Normal conversational pattern'
+
+    output += f"""
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Filler Words</h4>
+        <div style="display: grid; grid-template-columns: 1fr 1fr 2fr; gap: 16px; align-items: center;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_count}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Filler Words</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 16px; text-align: center;">
+                <div style="font-size: 28px; font-weight: 700; color: {filler_color};">{filler_ratio*100:.1f}%</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">Of Speech</div>
+            </div>
+            <div style="background: {filler_color}10; border-radius: 10px; padding: 16px; border: 1px solid {filler_color}33;">
+                <div style="font-weight: 600; color: {filler_color}; font-size: 14px;">{filler_label}</div>
+                <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{filler_desc}</div>
+            </div>
+        </div>
+    </div>
+    """
+
+    # pause patterns section
+    pause_var = asr['pause_patterns']['pause_variability']
+
+    if pause_var < 0.3:
+        pause_color = '#f59e0b'
+        pause_label = 'Regular pauses'
+        pause_desc = 'Suggests reading at punctuation marks'
+    elif pause_var > 0.6:
+        pause_color = '#10b981'
+        pause_label = 'Irregular pauses'
+        pause_desc = 'Natural thinking breaks indicate spontaneous speech'
+    else:
+        pause_color = '#6b7280'
+        pause_label = 'Moderate variability'
+        pause_desc = 'Mixed pattern'
+
+    output += f"""
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px;">
+        <h4 style="margin: 0 0 16px 0; color: #374151; font-size: 15px; font-weight: 600;">Pause Patterns</h4>
+        <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-bottom: 16px;">
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['num_pauses']}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Total Pauses</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['avg_pause']:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Avg Duration</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: #374151;">{asr['pause_patterns']['max_pause']:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Longest Pause</div>
+            </div>
+            <div style="background: #f9fafb; border-radius: 10px; padding: 14px; text-align: center;">
+                <div style="font-size: 20px; font-weight: 700; color: {pause_color};">{pause_var:.2f}</div>
+                <div style="font-size: 11px; color: #6b7280; margin-top: 4px;">Variability</div>
+            </div>
+        </div>
+        <div style="background: {pause_color}10; border-radius: 10px; padding: 14px; border: 1px solid {pause_color}33;">
+            <div style="font-weight: 600; color: {pause_color}; font-size: 14px;">{pause_label}</div>
+            <div style="font-size: 12px; color: #6b7280; margin-top: 4px;">{pause_desc}</div>
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# build the AI detection display HTML
+def build_ai_detection_display(text_auth):
+    is_ai = text_auth['ai_detection']['ai_generated']
+    ai_prob = text_auth['ai_detection']['confidence']
+    human_prob = 1 - ai_prob
+
+    if is_ai:
+        main_color = '#ef4444'
+        bg_color = '#fef2f2'
+        label = 'AI-GENERATED LIKELY'
+        desc = 'The text shows patterns consistent with AI-generated content'
+    else:
+        main_color = '#10b981'
+        bg_color = '#ecfdf5'
+        label = 'HUMAN-WRITTEN LIKELY'
+        desc = 'The text shows patterns consistent with human-written content'
+
+    output = f"""
+    <div style="background: linear-gradient(135deg, {bg_color} 0%, white 100%); border-radius: 16px; padding: 24px; margin-bottom: 20px; border: 1px solid {main_color}33;">
+        <h3 style="margin: 0; color: {main_color}; font-size: 22px; font-weight: 700;">{label}</h3>
+        <p style="margin: 8px 0 0 0; color: #6b7280; font-size: 14px;">{desc}</p>
+    </div>
+
+    <div style="background: white; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 16px;">
+        <h4 style="margin: 0 0 20px 0; color: #374151; font-size: 15px; font-weight: 600;">Confidence Analysis</h4>
+
+        <div style="margin-bottom: 20px;">
+            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                <span style="font-weight: 600; color: #ef4444; font-size: 14px;">AI Generated</span>
+                <span style="font-weight: 700; color: #ef4444; font-size: 18px;">{ai_prob*100:.0f}%</span>
+            </div>
+            <div style="background: #fee2e2; border-radius: 8px; overflow: hidden; height: 12px;">
+                <div style="height: 100%; width: {ai_prob*100:.0f}%; background: #ef4444; border-radius: 8px;"></div>
+            </div>
+        </div>
+
+        <div>
+            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
+                <span style="font-weight: 600; color: #10b981; font-size: 14px;">Human Written</span>
+                <span style="font-weight: 700; color: #10b981; font-size: 18px;">{human_prob*100:.0f}%</span>
+            </div>
+            <div style="background: #d1fae5; border-radius: 8px; overflow: hidden; height: 12px;">
+                <div style="height: 100%; width: {human_prob*100:.0f}%; background: #10b981; border-radius: 8px;"></div>
+            </div>
+        </div>
+    </div>
+
+    <div style="background: #fffbeb; border: 1px solid #fcd34d; border-radius: 10px; padding: 14px;">
+        <div style="font-size: 13px; color: #92400e; line-height: 1.5;">
+            <strong>Note:</strong> AI detection is probabilistic and should be used as one factor among many in your evaluation.
+        </div>
+    </div>
+    """
+
+    return output
+
+
+# main function to analyze uploaded audio file
 def analyze_audio_file(audio_file):
+    # check if pipeline is ready
     if not pipeline_ready:
-
-
-
-
+        error_msg = pipeline_error if 'pipeline_error' in dir() else "Something went wrong"
+        error_html = f"""
+        <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+            <h3 style="margin: 0 0 8px 0; color: #dc2626; font-size: 16px;">Pipeline not ready</h3>
+            <p style="margin: 0; color: #7f1d1d; font-size: 14px;">{error_msg}</p>
+        </div>
+        """
+        return (error_html, "", "", "", "")

+    # check if audio file was provided
     if audio_file is None:
-
-
-
-
+        placeholder_html = """
+        <div style="background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 12px; padding: 40px; text-align: center;">
+            <p style="margin: 0; color: #6b7280; font-size: 15px;">Please upload an audio file to begin analysis.</p>
+        </div>
+        """
+        return (placeholder_html, "", "", "", "")

+    # run analysis
     try:
         language_code = None
         results = pipeline.analyze_audio(audio_file, language=language_code)

+        # extract results from each component
         audio_class = results['audio_classification']
         asr = results['speech_recognition']
         text_auth = results['text_authenticity']
         final = results['final_assessment']

-
+        # color mapping for verdict
         verdict_color = {
             "AUTHENTIC": "#10b981",
             "LIKELY AUTHENTIC": "#3b82f6",

@@ -42,6 +436,7 @@ def analyze_audio_file(audio_file):

         color = verdict_color.get(final['verdict'], '#6b7280')

+        # build overall status display
         overall_status = f"""
         <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
             <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>

@@ -66,175 +461,11 @@ def analyze_audio_file(audio_file):
         </div>
         </div>
         """
-
-
-        transcription_output =
-
-
-        transcription_output += f"| **Language** | {asr['language'].upper()} |\n"
-        transcription_output += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
-        transcription_output += f"| **Word Count** | {asr['word_count']} words |\n"
-        transcription_output += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
-        if asr['speech_rate'] > 160:
-            transcription_output += "**Fast speech rate** - Above average speaking speed\n\n"
-        elif asr['speech_rate'] < 120:
-            transcription_output += "**Slow speech rate** - Below average speaking speed\n\n"
-        else:
-            transcription_output += "**Normal speech rate** - Average conversational pace\n\n"
-
-        transcription_output += "---\n\n"
-        transcription_output += "#### Full Transcription\n\n"
-        transcription_output += f"> {asr['transcription']}"
-
-
-        if 'kopparapu_score' in asr:
-            classification = asr['kopparapu_classification'].upper()
-            confidence = asr['kopparapu_score'] if asr['kopparapu_score'] >= 0.5 else (1 - asr['kopparapu_score'])
-
-            speech_patterns = f" ### **Classification: {classification} SPEECH**\n\n"
-            speech_patterns += f"**Score:** {asr['kopparapu_score']:.3f} (0=spontaneous, 1=read)\n"
-            speech_patterns += f"**Confidence:** {confidence*100:.1f}%\n\n"
-
-            speech_patterns += "---\n\n"
-            speech_patterns += "#### Linguistic Metrics\n\n"
-            kf = asr['kopparapu_features']
-
-            speech_patterns += "| Feature | Value | Interpretation |\n"
-            speech_patterns += "|---------|-------|----------------|\n"
-            speech_patterns += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
-            if kf['chars_per_word'] > 5.5:
-                speech_patterns += "Complex vocabulary |\n"
-            elif kf['chars_per_word'] < 4.5:
-                speech_patterns += "Simple vocabulary |\n"
-            else:
-                speech_patterns += "Average complexity |\n"
-
-            speech_patterns += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
-            if kf['words_per_sec'] > 3:
-                speech_patterns += "Fast pacing |\n"
-            elif kf['words_per_sec'] < 2:
-                speech_patterns += "Slow pacing |\n"
-            else:
-                speech_patterns += "Normal pacing |\n"
-
-            speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
-            if kf['filler_rate'] > 0.05:
-                speech_patterns += "High (spontaneous) |\n"
-            elif kf['filler_rate'] < 0.02:
-                speech_patterns += "Low (scripted) |\n"
-            else:
-                speech_patterns += "Moderate |\n"
-
-            speech_patterns += f"| **Repetitions** | {kf['repetition_count']} | "
-            if kf['repetition_count'] > 3:
-                speech_patterns += "Multiple (thinking aloud) |\n"
-            elif kf['repetition_count'] == 0:
-                speech_patterns += "None (prepared) |\n"
-            else:
-                speech_patterns += "Few |\n"
-
-            speech_patterns += "\n---\n\n"
-            speech_patterns += "#### Reading Style Indicators\n\n"
-
-            speech_patterns += "| Feature | Value | Interpretation |\n"
-            speech_patterns += "|---------|-------|----------------|\n"
-
-            # Pause regularity
-            pause_reg = kf.get('pause_regularity', 0.5)
-            speech_patterns += f"| **Pause Regularity** | {pause_reg:.2f} | "
-            if pause_reg > 0.7:
-                speech_patterns += "Very regular (read) |\n"
-            elif pause_reg > 0.4:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Irregular (spontaneous) |\n"
-
-            # Speech rate variability
-            rate_var = kf.get('speech_rate_variability', 0.0)
-            speech_patterns += f"| **Rate Variability** | {rate_var:.2f} | "
-            if rate_var > 0.6:
-                speech_patterns += "High (spontaneous) |\n"
-            elif rate_var > 0.3:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Steady pace (read) |\n"
-
-            # Sentence variance
-            sent_var = kf.get('sentence_length_variance', 0.0)
-            speech_patterns += f"| **Sentence Variance** | {sent_var:.2f} | "
-            if sent_var > 0.5:
-                speech_patterns += "Variable (spontaneous) |\n"
-            elif sent_var > 0.25:
-                speech_patterns += "Moderate |\n"
-            else:
-                speech_patterns += "Uniform (read) |\n"
-
-            # Self-corrections
-            corrections = kf.get('self_correction_count', 0)
-            speech_patterns += f"| **Self-Corrections** | {corrections} | "
-            if corrections > 2:
-                speech_patterns += "Multiple (spontaneous) |\n"
-            elif corrections > 0:
-                speech_patterns += "Few |\n"
-            else:
-                speech_patterns += "None (scripted) |\n"
-
-            speech_patterns += "\n"
-
-        speech_patterns += "---\n\n"
-        speech_patterns += "#### Filler Words & Disfluencies\n\n"
-        filler_ratio = asr['filler_words']['ratio']
-        speech_patterns += f"**Count:** {asr['filler_words']['count']} filler words\n"
-        speech_patterns += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
-
-        if asr['filler_words']['details']:
-            speech_patterns += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
-
-        if filler_ratio > 0.05:
-            speech_patterns += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
-        elif filler_ratio < 0.02:
-            speech_patterns += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
-        else:
-            speech_patterns += "**Moderate filler usage** - Normal conversational pattern\n\n"
-
-        speech_patterns += "---\n\n"
-        speech_patterns += "#### Pause Patterns\n\n"
-        pause_var = asr['pause_patterns']['pause_variability']
-
-        speech_patterns += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
-        speech_patterns += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
-        speech_patterns += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
-        speech_patterns += f"**Variability:** {pause_var:.2f}\n\n"
-
-        if pause_var < 0.3:
-            speech_patterns += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
-        elif pause_var > 0.6:
-            speech_patterns += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
-        else:
-            speech_patterns += "**Moderate variability** - Mixed pattern\n\n"
-
-        is_ai = text_auth['ai_detection']['ai_generated']
-        ai_prob = text_auth['ai_detection']['confidence']
-
-        if is_ai:
-            ai_output = "### **AI-GENERATED LIKELY**\n\n"
-        else:
-            ai_output = "### **HUMAN-WRITTEN LIKELY**\n\n"
-
-        ai_output += "**Confidence:**\n\n"
-        bar_length = 30
-        ai_bars = int(ai_prob * bar_length)
-        human_bars = bar_length - ai_bars
-        ai_output += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
-        ai_output += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
-
-        ai_output += "---\n\n"
-        ai_output += "#### Interpretation\n\n"
-        ai_interpretation = text_auth['ai_detection'].get('interpretation', 'No interpretation available.')
-        if ai_interpretation:
-            ai_output += ai_interpretation
-        else:
-            ai_output += "No interpretation available."
+        # build tab outputs
+        acoustic_output = build_acoustic_features_display(audio_class)
+        transcription_output = build_transcription_display(asr)
+        speech_patterns = build_speech_patterns_display(asr)
+        ai_output = build_ai_detection_display(text_auth)

         return (
             overall_status,

@@ -245,11 +476,22 @@ def analyze_audio_file(audio_file):
         )

     except Exception as e:
-
-
+        error_html = f"""
+        <div style="background: #fef2f2; border: 1px solid #ef4444; border-radius: 12px; padding: 20px;">
+            <h3 style="margin: 0 0 12px 0; color: #dc2626; font-size: 16px;">Something went wrong</h3>
+            <p style="margin: 0 0 12px 0; color: #7f1d1d; font-size: 14px;">{str(e)}</p>
+            <details style="margin-top: 12px;">
+                <summary style="color: #6b7280; cursor: pointer; font-size: 13px;">More info</summary>
+                <pre style="background: #1f2937; color: #f3f4f6; padding: 12px; border-radius: 8px; margin-top: 8px; font-size: 11px; overflow-x: auto;">{traceback.format_exc()}</pre>
+            </details>
+        </div>
+        """
+        return (error_html, "", "", "", "")


+# create the gradio interface
 def create_interface():
+    # custom CSS for styling
     custom_css = """
     @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');

@@ -257,26 +499,6 @@ def create_interface():
     font-family: 'IBM Plex Sans', sans-serif !important;
     background: white !important;
     }
-    .contain {
-        max-width: 100% !important;
-        width: 100% !important;
-        margin: 0 auto !important;
-        background: white !important;
-        padding: 0 !important;
-    }
-    .tab-nav button {
-        font-family: 'IBM Plex Sans', sans-serif;
-        font-size: 14px;
-        font-weight: 500;
-        padding: 10px 16px;
-        border-radius: 8px 8px 0 0;
-        transition: all 0.2s;
-    }
-    .tab-nav button.selected {
-        background: #2563eb;
-        color: white;
-        font-weight: 600;
-    }
     button.primary, .primary {
         background: #2563eb !important;
         color: white !important;

@@ -285,23 +507,12 @@ def create_interface():
         font-weight: 600 !important;
         padding: 12px 24px !important;
         border-radius: 8px !important;
-        transition: all 0.2s !important;
-    }
-    button.primary:hover, .primary:hover {
-        background: #1d4ed8 !important;
-    }
-    .markdown-text {
-        font-family: 'IBM Plex Sans', sans-serif;
-        line-height: 1.7;
-    }
-    h1, h2, h3, h4 {
-        font-family: 'IBM Plex Sans', sans-serif;
-        font-weight: 600;
     }
     """

     with gr.Blocks(title="Authenticity Detection System") as demo:

+        # header section
         gr.HTML(f"""
         <style>
         {custom_css}

@@ -309,17 +520,6 @@ def create_interface():
         <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
             <div style='padding: 16px 0;'>
                 <div style='display: flex; align-items: center; gap: 12px;'>
-                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="32" height="32">
-                        <defs>
-                            <linearGradient id="g" x1="0" y1="0" x2="64" y2="0" gradientUnits="userSpaceOnUse">
-                                <stop offset="0" stop-color="#1d4ed8" />
-                                <stop offset="1" stop-color="#0ea5e9" />
-                            </linearGradient>
-                        </defs>
-                        <rect x="0" y="0" width="64" height="64" rx="12" fill="#ffffff"/>
-                        <path d="M4 32 C 10 18, 18 46, 24 32 S 36 18, 40 32 52 46, 60 32"
-                              fill="none" stroke="url(#g)" stroke-width="4" stroke-linecap="round" stroke-linejoin="round"/>
-                    </svg>
                     <div>
                         <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
                             LEIDEN UNIVERSITY · LIACS

@@ -337,8 +537,6 @@ def create_interface():
         <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
             Detecting AI-Assisted Responses in Online Settings
         </h2>
-        <p style='font-size: 18px; color: #374151; margin: 0 0 24px 0;'>
-        </p>
         <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
             <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
                 Multi-Modal Analysis

@@ -351,15 +549,17 @@ def create_interface():
         </section>
         """)

+        # main layout
         with gr.Row():
             with gr.Column(scale=1):
                 gr.HTML("""
-                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px;
+                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
                 </div>
                 """)

+                # audio input component
                 audio_input = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",

@@ -367,12 +567,14 @@ def create_interface():
                     show_label=False
                 )

+                # analyze button
                 analyze_btn = gr.Button(
                     "Analyze Audio",
                     variant="primary",
                     size="lg"
                 )

+                # requirements info
                 gr.HTML("""
                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
                     <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>

@@ -381,40 +583,34 @@ def create_interface():
                     <li><strong>Duration:</strong> 30 sec - 5 min</li>
                 </ul>
                 </div>
-
-                <div style='background: #fef3c7; border: 1px solid #fbbf24; padding: 16px; border-radius: 12px; margin-top: 16px;'>
-                    <div style='font-size: 12px; color: #92400e; line-height: 1.6;'>
-                        <strong>Note:</strong> Provides probabilistic assessments.
-                        Use as one factor in evaluation.
-                    </div>
-                </div>
                 """)

             with gr.Column(scale=2):
                 gr.HTML("""
-                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px;
+                <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-bottom: 20px;'>
                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
                 </div>
                 """)

-
+                # overall output
+                overall_output = gr.HTML()

+                # results tabs
                 with gr.Tabs() as tabs:
                     with gr.Tab("Acoustic Features"):
-                        acoustic_output = gr.
+                        acoustic_output = gr.HTML()

                     with gr.Tab("Transcription"):
-                        transcription_output = gr.
+                        transcription_output = gr.HTML()

                     with gr.Tab("Speech Patterns"):
-                        speech_output = gr.
+                        speech_output = gr.HTML()

                     with gr.Tab("AI Detection"):
-                        ai_output = gr.
-
+                        ai_output = gr.HTML()

-                #
+                # example audio files
                 gr.HTML("""
                 <div style='margin-top: 20px; margin-bottom: 10px;'>
                     <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>

@@ -424,8 +620,8 @@ def create_interface():
                 examples_dir = os.path.join(os.path.dirname(__file__), "examples")
                 gr.Examples(
                     examples=[
-                        [os.path.join(examples_dir, "read1.
-                        [os.path.join(examples_dir, "spontaneous1.
+                        [os.path.join(examples_dir, "read1.wav")],
+                        [os.path.join(examples_dir, "spontaneous1.wav")]
                     ],
                     inputs=[audio_input],
                     outputs=[

@@ -438,31 +634,26 @@ def create_interface():
                     fn=analyze_audio_file,
                     label="",
                     examples_per_page=2,
-                    cache_examples=
+                    cache_examples="lazy"
                 )

+                # loading animation function
                 def show_loading():
                     loading_html = """
                     <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border: 2px solid #667eea; padding: 30px; border-radius: 16px; margin: 10px 0; text-align: center;'>
                         <h2 style='color: white; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
                             Analyzing...
                         </h2>
-                        <div style='margin-top: 20px;'>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out infinite;'></div>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.2s infinite;'></div>
-                            <div style='display: inline-block; width: 12px; height: 12px; border-radius: 50%; background: white; margin: 0 4px; animation: pulse 1.5s ease-in-out 0.4s infinite;'></div>
-                        </div>
                     </div>
-                    <style>
-                    @keyframes pulse {
-                        0%, 100% { opacity: 0.3; transform: scale(0.8); }
-                        50% { opacity: 1; transform: scale(1.2); }
-                    }
-                    </style>
                     """
-
-
+                    loading_tab = """
+                    <div style='padding: 40px; text-align: center; color: #6b7280;'>
+                        <p style='margin-top: 16px; font-size: 14px;'>Processing...</p>
+                    </div>
+                    """
+                    return loading_html, loading_tab, loading_tab, loading_tab, loading_tab

+                # connect button to analysis function
                 analyze_btn.click(
                     fn=show_loading,
                     inputs=None,

@@ -486,13 +677,12 @@ def create_interface():
                     ]
                 )

+        # footer
         gr.HTML("""
         <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
             <div style='text-align: center;'>
                 <p style='margin: 0; font-size: 14px; color: #6b7280;'>
                 </p>
-                <p style='margin: 8px 0 0 0; font-size: 13px; color: #9ca3af;'>
-                </p>
             </div>
         </footer>
         """)

@@ -500,6 +690,7 @@ def create_interface():
     return demo


+# run the app when script is executed
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(

@@ -508,4 +699,3 @@ if __name__ == "__main__":
         share=False,
         show_error=True
     )
-
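The click wiring at new lines 657-659 binds show_loading first so the loading cards render immediately; the continuation that actually runs analyze_audio_file is elided in this view. A sketch of how that chain typically looks in Gradio, with the .then() call and output lists assumed rather than taken from the diff:

    outputs = [overall_output, acoustic_output, transcription_output,
               speech_output, ai_output]

    analyze_btn.click(
        fn=show_loading,            # swap in the loading cards instantly
        inputs=None,
        outputs=outputs,
    ).then(
        fn=analyze_audio_file,      # then run the full analysis
        inputs=[audio_input],
        outputs=outputs,
    )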
audio_classifier.py
CHANGED
|
@@ -3,72 +3,95 @@ import torch.nn as nn
|
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import librosa
|
| 5 |
import numpy as np
|
| 6 |
-
from typing import Dict
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
class BasicBlock(nn.Module):
|
| 9 |
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
|
| 10 |
super(BasicBlock, self).__init__()
|
|
|
|
| 11 |
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
|
| 12 |
stride=stride, padding=1, bias=False)
|
| 13 |
self.bn1 = nn.BatchNorm2d(out_channels)
|
|
|
|
| 14 |
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
|
| 15 |
stride=1, padding=1, bias=False)
|
| 16 |
self.bn2 = nn.BatchNorm2d(out_channels)
|
|
|
|
| 17 |
self.downsample = downsample
|
| 18 |
|
| 19 |
def forward(self, x):
|
|
|
|
| 20 |
identity = x
|
|
|
|
| 21 |
out = F.relu(self.bn1(self.conv1(x)))
|
|
|
|
| 22 |
out = self.bn2(self.conv2(out))
|
| 23 |
|
|
|
|
| 24 |
if self.downsample is not None:
|
| 25 |
identity = self.downsample(x)
|
| 26 |
|
|
|
|
| 27 |
out += identity
|
| 28 |
out = F.relu(out)
|
| 29 |
return out
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
| 32 |
class SpeechStyleCNN(nn.Module):
|
| 33 |
def __init__(self, num_classes=2):
|
| 34 |
super(SpeechStyleCNN, self).__init__()
|
| 35 |
|
|
|
|
| 36 |
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
| 37 |
self.bn1 = nn.BatchNorm2d(64)
|
| 38 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
| 39 |
|
|
|
|
| 40 |
self.layer1 = self._make_layer(64, 64, 2, stride=1)
|
| 41 |
self.layer2 = self._make_layer(64, 128, 2, stride=2)
|
| 42 |
self.layer3 = self._make_layer(128, 256, 2, stride=2)
|
| 43 |
self.layer4 = self._make_layer(256, 512, 2, stride=2)
|
| 44 |
|
|
|
|
| 45 |
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 46 |
self.fc = nn.Linear(512, num_classes)
|
| 47 |
|
|
|
|
| 48 |
def _make_layer(self, in_channels, out_channels, blocks, stride=1):
|
| 49 |
downsample = None
|
|
|
|
| 50 |
if stride != 1 or in_channels != out_channels:
|
| 51 |
downsample = nn.Sequential(
|
| 52 |
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
|
| 53 |
nn.BatchNorm2d(out_channels)
|
| 54 |
)
|
| 55 |
|
|
|
|
| 56 |
layers = []
|
|
|
|
| 57 |
layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
|
|
|
|
| 58 |
for _ in range(1, blocks):
|
| 59 |
layers.append(BasicBlock(out_channels, out_channels))
|
| 60 |
|
| 61 |
return nn.Sequential(*layers)
|
| 62 |
|
| 63 |
-    def forward(self, x…
|
|
|
|
| 64 |
x = F.relu(self.bn1(self.conv1(x)))
|
| 65 |
x = self.maxpool(x)
|
| 66 |
|
|
|
|
| 67 |
x = self.layer1(x)
|
| 68 |
x = self.layer2(x)
|
| 69 |
x = self.layer3(x)
|
| 70 |
x = self.layer4(x)
|
| 71 |
|
|
|
|
| 72 |
x = self.avgpool(x)
|
| 73 |
x = torch.flatten(x, 1)
|
| 74 |
x = self.fc(x)
|
|
@@ -76,70 +99,82 @@ class SpeechStyleCNN(nn.Module):
|
|
| 76 |
return x
|
| 77 |
|
| 78 |
|
|
|
|
| 79 |
class AudioClassifier:
|
|
|
|
| 80 |
AVAILABLE_MODELS = {
|
| 81 |
'3s_window': 'spectrogram_cnn_3s_window.pth',
|
| 82 |
-
# '4s_window': 'spectrogram_cnn_4s_window.pth',
|
| 83 |
-
# '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
|
| 84 |
}
|
| 85 |
|
| 86 |
@classmethod
|
| 87 |
-    def get_model_path(cls, model_name…
|
|
|
|
| 88 |
import os
|
| 89 |
if model_name not in cls.AVAILABLE_MODELS:
|
| 90 |
-
|
|
|
|
| 91 |
return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
|
| 92 |
|
| 93 |
-    def __init__(self, model_path…
|
|
|
|
| 94 |
if device is None:
|
| 95 |
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 96 |
else:
|
| 97 |
self.device = torch.device(device)
|
| 98 |
-
|
|
|
|
| 99 |
self.model = SpeechStyleCNN().to(self.device)
|
| 100 |
|
|
|
|
| 101 |
if model_path is None:
|
| 102 |
import os
|
| 103 |
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
|
| 104 |
|
|
|
|
| 105 |
try:
|
| 106 |
print(f"Attempting to load model from: {model_path}")
|
| 107 |
-
state_dict = torch.load(model_path, map_location=self.device)
|
| 108 |
self.model.load_state_dict(state_dict)
|
| 109 |
print(f"✓ Successfully loaded trained model from: {model_path}")
|
| 110 |
except FileNotFoundError:
|
| 111 |
-
|
|
|
|
| 112 |
except Exception as e:
|
| 113 |
-
|
| 114 |
|
|
|
|
| 115 |
self.model.eval()
|
| 116 |
|
|
|
|
| 117 |
self.sample_rate = 16000
|
| 118 |
self.n_mels = 128
|
| 119 |
self.n_fft = 2048
|
| 120 |
self.hop_length = 512
|
| 121 |
|
| 122 |
-
|
|
|
|
|
|
|
| 123 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 124 |
|
| 125 |
-
#
|
| 126 |
window_samples = int(window_size * sr)
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
hop_samples = window_samples // 2
|
| 131 |
windows = []
|
|
|
|
| 132 |
for start in range(0, len(audio) - window_samples, hop_samples):
|
| 133 |
window = audio[start:start + window_samples]
|
| 134 |
windows.append(window)
|
| 135 |
|
| 136 |
-
#
|
| 137 |
if len(audio) > window_samples:
|
| 138 |
windows.append(audio[-window_samples:])
|
| 139 |
|
| 140 |
-
#
|
| 141 |
mel_specs = []
|
| 142 |
-
for window in windows[:5]: #
|
| 143 |
mel_spec = librosa.feature.melspectrogram(
|
| 144 |
y=window,
|
| 145 |
sr=sr,
|
|
@@ -149,10 +184,10 @@ class AudioClassifier:
|
|
| 149 |
)
|
| 150 |
mel_specs.append(mel_spec)
|
| 151 |
|
| 152 |
-
#
|
| 153 |
mel_spec = np.mean(mel_specs, axis=0)
|
| 154 |
else:
|
| 155 |
-
#
|
| 156 |
if len(audio) < window_samples:
|
| 157 |
audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
|
| 158 |
else:
|
|
@@ -166,22 +201,28 @@ class AudioClassifier:
|
|
| 166 |
hop_length=self.hop_length
|
| 167 |
)
|
| 168 |
|
|
|
|
| 169 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 170 |
|
|
|
|
| 171 |
mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
|
|
|
|
| 172 |
mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
|
| 173 |
|
| 174 |
return mel_spec_3ch
|
| 175 |
|
| 176 |
-
|
|
|
|
| 177 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 178 |
|
| 179 |
features = {}
|
| 180 |
|
|
|
|
| 181 |
onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
|
| 182 |
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
|
| 183 |
features['tempo'] = float(tempo)
|
| 184 |
|
|
|
|
| 185 |
pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
|
| 186 |
pitch_values = []
|
| 187 |
for t in range(pitches.shape[1]):
|
|
@@ -190,6 +231,7 @@ class AudioClassifier:
|
|
| 190 |
if pitch > 0:
|
| 191 |
pitch_values.append(pitch)
|
| 192 |
|
|
|
|
| 193 |
if pitch_values:
|
| 194 |
features['pitch_mean'] = float(np.mean(pitch_values))
|
| 195 |
features['pitch_std'] = float(np.std(pitch_values))
|
|
@@ -199,34 +241,40 @@ class AudioClassifier:
|
|
| 199 |
features['pitch_std'] = 0.0
|
| 200 |
features['pitch_range'] = 0.0
|
| 201 |
|
|
|
|
| 202 |
rms = librosa.feature.rms(y=audio)[0]
|
| 203 |
features['energy_mean'] = float(np.mean(rms))
|
| 204 |
features['energy_std'] = float(np.std(rms))
|
| 205 |
|
|
|
|
| 206 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 207 |
features['zcr_mean'] = float(np.mean(zcr))
|
| 208 |
features['zcr_std'] = float(np.std(zcr))
|
| 209 |
|
|
|
|
| 210 |
spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 211 |
features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
|
| 212 |
features['spectral_centroid_std'] = float(np.std(spectral_centroids))
|
| 213 |
|
| 214 |
return features
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
| 217 |
individual_scores = {}
|
| 218 |
|
|
|
|
| 219 |
sc_std = features['spectral_centroid_std']
|
| 220 |
-
if sc_std >=
|
| 221 |
-
spectral_score = 0.9 #
|
| 222 |
-
elif sc_std >=
|
| 223 |
-
spectral_score = 0.7
|
| 224 |
elif sc_std >= 1000:
|
| 225 |
-
spectral_score = 0.5
|
| 226 |
-
elif sc_std >=
|
| 227 |
-
spectral_score = 0.3
|
| 228 |
else:
|
| 229 |
-
spectral_score = 0.1 #
|
| 230 |
|
| 231 |
individual_scores['spectral_variability'] = {
|
| 232 |
'score': spectral_score,
|
|
@@ -234,17 +282,18 @@ class AudioClassifier:
|
|
| 234 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 235 |
}
|
| 236 |
|
|
|
|
| 237 |
zcr = features['zcr_mean']
|
| 238 |
-
if zcr >= 0.
|
| 239 |
-
zcr_score = 0.9
|
| 240 |
-
elif zcr >= 0.
|
| 241 |
-
zcr_score = 0.7
|
| 242 |
-
elif zcr >= 0.
|
| 243 |
-
zcr_score = 0.5
|
| 244 |
-
elif zcr >= 0.
|
| 245 |
-
zcr_score = 0.3
|
| 246 |
else:
|
| 247 |
-
zcr_score = 0.1
|
| 248 |
|
| 249 |
individual_scores['zcr_mean'] = {
|
| 250 |
'score': zcr_score,
|
|
@@ -252,18 +301,16 @@ class AudioClassifier:
|
|
| 252 |
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 253 |
}
|
| 254 |
|
| 255 |
-
#
|
| 256 |
-
# Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
|
| 257 |
-
# Threshold: ~0.06, read < threshold
|
| 258 |
energy = features['energy_mean']
|
| 259 |
if energy < 0.055:
|
| 260 |
-
energy_score = 0.
|
| 261 |
-
elif energy < 0.
|
| 262 |
-
energy_score = 0.
|
| 263 |
-
elif energy < 0.
|
| 264 |
-
energy_score = 0.
|
| 265 |
else:
|
| 266 |
-
energy_score = 0.
|
| 267 |
|
| 268 |
individual_scores['energy_level'] = {
|
| 269 |
'score': energy_score,
|
|
@@ -271,45 +318,80 @@ class AudioClassifier:
|
|
| 271 |
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 272 |
}
|
| 273 |
|
| 274 |
-
#
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
else:
|
| 282 |
-
|
| 283 |
|
| 284 |
-
individual_scores['
|
| 285 |
-
'score':
|
| 286 |
-
'value':
|
| 287 |
-
'interpretation': '
|
| 288 |
}
|
| 289 |
|
| 290 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
weights = {
|
| 292 |
-
'spectral_variability': 0.
|
| 293 |
-
'zcr_mean': 0.
|
| 294 |
-
'energy_level': 0.20,
|
| 295 |
-
'
|
|
|
|
|
|
|
| 296 |
}
|
| 297 |
|
|
|
|
| 298 |
overall_score = (
|
| 299 |
spectral_score * weights['spectral_variability'] +
|
| 300 |
zcr_score * weights['zcr_mean'] +
|
| 301 |
energy_score * weights['energy_level'] +
|
| 302 |
-
|
|
|
|
|
|
|
| 303 |
)
|
| 304 |
|
| 305 |
-
|
|
|
|
| 306 |
classification = 'read'
|
| 307 |
-
confidence = 0.5 + (overall_score - 0.5) * 0.
|
| 308 |
-
elif overall_score < 0.
|
| 309 |
classification = 'spontaneous'
|
| 310 |
-
confidence = 0.5 + (0.5 - overall_score) * 0.
|
| 311 |
else:
|
| 312 |
-
classification = 'read' if overall_score >= 0.
|
| 313 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 314 |
|
| 315 |
return {
|
|
@@ -319,11 +401,15 @@ class AudioClassifier:
|
|
| 319 |
'individual_scores': individual_scores
|
| 320 |
}
|
| 321 |
|
| 322 |
-
|
|
|
|
|
|
|
| 323 |
mel_spec = self.extract_mel_spectrogram(audio_path)
|
| 324 |
|
|
|
|
| 325 |
mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
|
| 326 |
|
|
|
|
| 327 |
with torch.no_grad():
|
| 328 |
logits = self.model(mel_tensor)
|
| 329 |
probabilities = F.softmax(logits, dim=1)
|
|
@@ -334,35 +420,36 @@ class AudioClassifier:
|
|
| 334 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 335 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
| 336 |
|
|
|
|
| 337 |
acoustic_features = self.extract_acoustic_features(audio_path)
|
| 338 |
|
|
|
|
| 339 |
prosody_scores = self._compute_prosody_scores(acoustic_features)
|
| 340 |
prosody_classification = prosody_scores['classification']
|
| 341 |
prosody_confidence = prosody_scores['confidence']
|
| 342 |
|
| 343 |
-
#
|
| 344 |
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
|
|
|
|
|
|
| 345 |
print(f"CNN classification: {cnn_class_name}")
|
| 346 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
final_confidence = 0.5 + (0.5 - weighted_score)
|
| 364 |
-
|
| 365 |
-
final_confidence = min(0.95, final_confidence)
|
| 366 |
|
| 367 |
return {
|
| 368 |
'classification': final_classification,
|
|
@@ -381,17 +468,18 @@ class AudioClassifier:
|
|
| 381 |
)
|
| 382 |
}
|
| 383 |
|
|
|
|
| 384 |
def _interpret_classification(
|
| 385 |
self,
|
| 386 |
-
final_class
|
| 387 |
-
final_confidence
|
| 388 |
-
cnn_class
|
| 389 |
-
cnn_confidence
|
| 390 |
-
prosody_class
|
| 391 |
-
prosody_confidence
|
| 392 |
-
prosody_scores
|
| 393 |
-
features
|
| 394 |
-
)
|
| 395 |
interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
|
| 396 |
interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
|
| 397 |
|
|
@@ -404,10 +492,10 @@ class AudioClassifier:
|
|
| 404 |
interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
|
| 405 |
interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
|
| 406 |
|
| 407 |
-
|
| 408 |
return interpretation
|
| 409 |
|
| 410 |
|
|
|
|
| 411 |
if __name__ == "__main__":
|
| 412 |
classifier = AudioClassifier()
|
| 413 |
print("\nAvailable pre-trained models:")
|
|
|
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import librosa
|
| 5 |
import numpy as np
|
|
|
|
| 6 |
|
| 7 |
+
|
| 8 |
+
# Basic building block for the ResNet-style CNN
|
| 9 |
+
# Uses two convolutional layers with batch normalization
|
| 10 |
class BasicBlock(nn.Module):
|
| 11 |
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
|
| 12 |
super(BasicBlock, self).__init__()
|
| 13 |
+
# first conv layer with specified stride
|
| 14 |
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
|
| 15 |
stride=stride, padding=1, bias=False)
|
| 16 |
self.bn1 = nn.BatchNorm2d(out_channels)
|
| 17 |
+
# second conv layer always has stride 1
|
| 18 |
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
|
| 19 |
stride=1, padding=1, bias=False)
|
| 20 |
self.bn2 = nn.BatchNorm2d(out_channels)
|
| 21 |
+
# downsample is used when dimensions change
|
| 22 |
self.downsample = downsample
|
| 23 |
|
| 24 |
def forward(self, x):
|
| 25 |
+
# save input for skip connection
|
| 26 |
identity = x
|
| 27 |
+
# pass through first conv + batchnorm + relu
|
| 28 |
out = F.relu(self.bn1(self.conv1(x)))
|
| 29 |
+
# pass through second conv + batchnorm
|
| 30 |
out = self.bn2(self.conv2(out))
|
| 31 |
|
| 32 |
+
# apply downsample if needed to match dimensions
|
| 33 |
if self.downsample is not None:
|
| 34 |
identity = self.downsample(x)
|
| 35 |
|
| 36 |
+
# add skip connection and apply relu
|
| 37 |
out += identity
|
| 38 |
out = F.relu(out)
|
| 39 |
return out
|
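A quick sanity check of the residual block (an illustrative sketch, not part of the commit): with stride 1 and matching channels, no downsample is created and the skip connection adds cleanly, so the output shape equals the input shape.

    import torch
    block = BasicBlock(64, 64)        # stride=1, downsample=None
    x = torch.randn(1, 64, 32, 32)
    assert block(x).shape == x.shape  # identity and conv path line up
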
| 40 |
|
| 41 |
|
| 42 |
+
# Main CNN model for speech style classification
|
| 43 |
+
# Architecture based on ResNet with custom layer configuration
|
| 44 |
class SpeechStyleCNN(nn.Module):
|
| 45 |
def __init__(self, num_classes=2):
|
| 46 |
super(SpeechStyleCNN, self).__init__()
|
| 47 |
|
| 48 |
+
# initial convolution layer - takes 3 channel input (RGB spectrogram)
|
| 49 |
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
| 50 |
self.bn1 = nn.BatchNorm2d(64)
|
| 51 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
| 52 |
|
| 53 |
+
# stack of residual blocks with increasing channel sizes
|
| 54 |
self.layer1 = self._make_layer(64, 64, 2, stride=1)
|
| 55 |
self.layer2 = self._make_layer(64, 128, 2, stride=2)
|
| 56 |
self.layer3 = self._make_layer(128, 256, 2, stride=2)
|
| 57 |
self.layer4 = self._make_layer(256, 512, 2, stride=2)
|
| 58 |
|
| 59 |
+
# global average pooling and final classification layer
|
| 60 |
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 61 |
self.fc = nn.Linear(512, num_classes)
|
| 62 |
|
| 63 |
+
# helper function to create a layer of residual blocks
|
| 64 |
def _make_layer(self, in_channels, out_channels, blocks, stride=1):
|
| 65 |
downsample = None
|
| 66 |
+
# need downsample when stride changes or channels don't match
|
| 67 |
if stride != 1 or in_channels != out_channels:
|
| 68 |
downsample = nn.Sequential(
|
| 69 |
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
|
| 70 |
nn.BatchNorm2d(out_channels)
|
| 71 |
)
|
| 72 |
|
| 73 |
+
# create list of blocks
|
| 74 |
layers = []
|
| 75 |
+
# first block may have different stride
|
| 76 |
layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
|
| 77 |
+
# remaining blocks have stride 1
|
| 78 |
for _ in range(1, blocks):
|
| 79 |
layers.append(BasicBlock(out_channels, out_channels))
|
| 80 |
|
| 81 |
return nn.Sequential(*layers)
|
| 82 |
|
| 83 |
+
def forward(self, x):
|
| 84 |
+
# initial conv block
|
| 85 |
x = F.relu(self.bn1(self.conv1(x)))
|
| 86 |
x = self.maxpool(x)
|
| 87 |
|
| 88 |
+
# pass through all residual layers
|
| 89 |
x = self.layer1(x)
|
| 90 |
x = self.layer2(x)
|
| 91 |
x = self.layer3(x)
|
| 92 |
x = self.layer4(x)
|
| 93 |
|
| 94 |
+
# global pooling and classification
|
| 95 |
x = self.avgpool(x)
|
| 96 |
x = torch.flatten(x, 1)
|
| 97 |
x = self.fc(x)
|
|
|
|
| 99 |
return x
|
| 100 |
|
| 101 |
|
| 102 |
+
# Main classifier class that combines CNN with acoustic feature analysis
|
| 103 |
class AudioClassifier:
|
| 104 |
+
# dictionary of available pre-trained models
|
| 105 |
AVAILABLE_MODELS = {
|
| 106 |
'3s_window': 'spectrogram_cnn_3s_window.pth',
|
|
|
|
|
|
|
| 107 |
}
|
| 108 |
|
| 109 |
@classmethod
|
| 110 |
+
def get_model_path(cls, model_name='3s_window'):
|
| 111 |
+
# returns the full path to a model file
|
| 112 |
import os
|
| 113 |
if model_name not in cls.AVAILABLE_MODELS:
|
| 114 |
+
print(f"Model not found: {model_name}")
|
| 115 |
+
return None
|
| 116 |
return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
|
| 117 |
|
| 118 |
+
def __init__(self, model_path=None, device=None):
|
| 119 |
+
# set up device - use GPU if available
|
| 120 |
if device is None:
|
| 121 |
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 122 |
else:
|
| 123 |
self.device = torch.device(device)
|
| 124 |
+
|
| 125 |
+
# initialize the CNN model
|
| 126 |
self.model = SpeechStyleCNN().to(self.device)
|
| 127 |
|
| 128 |
+
# use default model path if not specified
|
| 129 |
if model_path is None:
|
| 130 |
import os
|
| 131 |
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
|
| 132 |
|
| 133 |
+
# load pre-trained weights
|
| 134 |
try:
|
| 135 |
print(f"Attempting to load model from: {model_path}")
|
| 136 |
+
state_dict = torch.load(model_path, map_location=self.device, weights_only=False)
|
| 137 |
self.model.load_state_dict(state_dict)
|
| 138 |
print(f"✓ Successfully loaded trained model from: {model_path}")
|
| 139 |
except FileNotFoundError:
|
| 140 |
+
print(f"Could not find model file at {model_path}")
|
| 141 |
+
print("Make sure the model file exists in the correct location")
|
| 142 |
except Exception as e:
|
| 143 |
+
print(f"Something went wrong loading the model: {e}")
|
| 144 |
|
| 145 |
+
# set model to evaluation mode
|
| 146 |
self.model.eval()
|
| 147 |
|
| 148 |
+
# audio processing parameters
|
| 149 |
self.sample_rate = 16000
|
| 150 |
self.n_mels = 128
|
| 151 |
self.n_fft = 2048
|
| 152 |
self.hop_length = 512
|
| 153 |
|
| 154 |
+
# extract mel spectrogram from audio file
|
| 155 |
+
def extract_mel_spectrogram(self, audio_path, window_size=3.0):
|
| 156 |
+
# load audio at target sample rate
|
| 157 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 158 |
|
| 159 |
+
# calculate window size in samples
|
| 160 |
window_samples = int(window_size * sr)
|
| 161 |
|
| 162 |
+
# for longer audio, use multiple overlapping windows
|
| 163 |
+
if len(audio) > window_samples * 1.5:
|
| 164 |
hop_samples = window_samples // 2
|
| 165 |
windows = []
|
| 166 |
+
# extract overlapping windows
|
| 167 |
for start in range(0, len(audio) - window_samples, hop_samples):
|
| 168 |
window = audio[start:start + window_samples]
|
| 169 |
windows.append(window)
|
| 170 |
|
| 171 |
+
# add the last window
|
| 172 |
if len(audio) > window_samples:
|
| 173 |
windows.append(audio[-window_samples:])
|
| 174 |
|
| 175 |
+
# compute mel spectrogram for each window
|
| 176 |
mel_specs = []
|
| 177 |
+
for window in windows[:5]: # limit to 5 windows
|
| 178 |
mel_spec = librosa.feature.melspectrogram(
|
| 179 |
y=window,
|
| 180 |
sr=sr,
|
|
|
|
| 184 |
)
|
| 185 |
mel_specs.append(mel_spec)
|
| 186 |
|
| 187 |
+
# average the spectrograms
|
| 188 |
mel_spec = np.mean(mel_specs, axis=0)
|
| 189 |
else:
|
| 190 |
+
# for short audio, pad or truncate
|
| 191 |
if len(audio) < window_samples:
|
| 192 |
audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
|
| 193 |
else:
|
|
|
|
| 201 |
hop_length=self.hop_length
|
| 202 |
)
|
| 203 |
|
| 204 |
+
# convert to decibels
|
| 205 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 206 |
|
| 207 |
+
# normalize to 0-1 range
|
| 208 |
mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
|
| 209 |
+
# stack into 3 channels for CNN input
|
| 210 |
mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
|
| 211 |
|
| 212 |
return mel_spec_3ch
|
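For reference, the returned array has shape (3, n_mels, frames); with the defaults above (16 kHz, n_mels=128, hop 512) a 3-second window comes out to roughly 94 frames. A minimal sketch, assuming a classifier instance and a hypothetical audio path:

    spec = classifier.extract_mel_spectrogram("sample.wav")  # hypothetical path
    print(spec.shape)  # approximately (3, 128, 94)
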
| 213 |
|
| 214 |
+
# extract acoustic features from audio
|
| 215 |
+
def extract_acoustic_features(self, audio_path):
|
| 216 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 217 |
|
| 218 |
features = {}
|
| 219 |
|
| 220 |
+
# tempo/rhythm estimation
|
| 221 |
onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
|
| 222 |
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
|
| 223 |
features['tempo'] = float(tempo)
|
| 224 |
|
| 225 |
+
# pitch tracking
|
| 226 |
pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
|
| 227 |
pitch_values = []
|
| 228 |
for t in range(pitches.shape[1]):
|
|
|
|
| 231 |
if pitch > 0:
|
| 232 |
pitch_values.append(pitch)
|
| 233 |
|
| 234 |
+
# calculate pitch statistics
|
| 235 |
if pitch_values:
|
| 236 |
features['pitch_mean'] = float(np.mean(pitch_values))
|
| 237 |
features['pitch_std'] = float(np.std(pitch_values))
|
|
|
|
| 241 |
features['pitch_std'] = 0.0
|
| 242 |
features['pitch_range'] = 0.0
|
| 243 |
|
| 244 |
+
# energy/loudness features
|
| 245 |
rms = librosa.feature.rms(y=audio)[0]
|
| 246 |
features['energy_mean'] = float(np.mean(rms))
|
| 247 |
features['energy_std'] = float(np.std(rms))
|
| 248 |
|
| 249 |
+
# zero crossing rate - indicates voice quality
|
| 250 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 251 |
features['zcr_mean'] = float(np.mean(zcr))
|
| 252 |
features['zcr_std'] = float(np.std(zcr))
|
| 253 |
|
| 254 |
+
# spectral centroid - brightness of sound
|
| 255 |
spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 256 |
features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
|
| 257 |
features['spectral_centroid_std'] = float(np.std(spectral_centroids))
|
| 258 |
|
| 259 |
return features
|
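An illustrative sketch of the dict this returns (the keys match the code above; the values are invented):

    feats = classifier.extract_acoustic_features("sample.wav")  # hypothetical path
    # {'tempo': 112.3, 'pitch_mean': 180.5, 'pitch_std': 42.1, 'pitch_range': 310.0,
    #  'energy_mean': 0.058, 'energy_std': 0.045, 'zcr_mean': 0.104, 'zcr_std': 0.109,
    #  'spectral_centroid_mean': 1650.2, 'spectral_centroid_std': 1042.7}
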
| 260 |
|
| 261 |
+
# compute prosody scores from acoustic features
|
| 262 |
+
# uses thresholds calibrated from training data
|
| 263 |
+
def _compute_prosody_scores(self, features):
|
| 264 |
individual_scores = {}
|
| 265 |
|
| 266 |
+
# spectral centroid variability - best discriminating feature
|
| 267 |
sc_std = features['spectral_centroid_std']
|
| 268 |
+
if sc_std >= 1080:
|
| 269 |
+
spectral_score = 0.9 # strongly indicates read
|
| 270 |
+
elif sc_std >= 1040:
|
| 271 |
+
spectral_score = 0.7
|
| 272 |
elif sc_std >= 1000:
|
| 273 |
+
spectral_score = 0.5
|
| 274 |
+
elif sc_std >= 970:
|
| 275 |
+
spectral_score = 0.3
|
| 276 |
else:
|
| 277 |
+
spectral_score = 0.1 # strongly spontaneous
|
| 278 |
|
| 279 |
individual_scores['spectral_variability'] = {
|
| 280 |
'score': spectral_score,
|
|
|
|
| 282 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 283 |
}
|
| 284 |
|
| 285 |
+
# zero crossing rate - second best feature
|
| 286 |
zcr = features['zcr_mean']
|
| 287 |
+
if zcr >= 0.125:
|
| 288 |
+
zcr_score = 0.9
|
| 289 |
+
elif zcr >= 0.110:
|
| 290 |
+
zcr_score = 0.7
|
| 291 |
+
elif zcr >= 0.100:
|
| 292 |
+
zcr_score = 0.5
|
| 293 |
+
elif zcr >= 0.092:
|
| 294 |
+
zcr_score = 0.3
|
| 295 |
else:
|
| 296 |
+
zcr_score = 0.1
|
| 297 |
|
| 298 |
individual_scores['zcr_mean'] = {
|
| 299 |
'score': zcr_score,
|
|
|
|
| 301 |
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 302 |
}
|
| 303 |
|
| 304 |
+
# energy level - read speech tends to be lower energy
|
|
|
|
|
|
|
| 305 |
energy = features['energy_mean']
|
| 306 |
if energy < 0.055:
|
| 307 |
+
energy_score = 0.85
|
| 308 |
+
elif energy < 0.062:
|
| 309 |
+
energy_score = 0.65
|
| 310 |
+
elif energy < 0.070:
|
| 311 |
+
energy_score = 0.4
|
| 312 |
else:
|
| 313 |
+
energy_score = 0.15
|
| 314 |
|
| 315 |
individual_scores['energy_level'] = {
|
| 316 |
'score': energy_score,
|
|
|
|
| 318 |
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 319 |
}
|
| 320 |
|
| 321 |
+
# pitch range feature
|
| 322 |
+
pitch_range = features.get('pitch_range', 3828)
|
| 323 |
+
if pitch_range < 3815:
|
| 324 |
+
pitch_range_score = 0.7
|
| 325 |
+
elif pitch_range < 3828:
|
| 326 |
+
pitch_range_score = 0.5
|
| 327 |
+
else:
|
| 328 |
+
pitch_range_score = 0.3
|
| 329 |
+
|
| 330 |
+
individual_scores['pitch_range'] = {
|
| 331 |
+
'score': pitch_range_score,
|
| 332 |
+
'value': pitch_range,
|
| 333 |
+
'interpretation': 'narrow (read)' if pitch_range_score > 0.6 else 'wide (spontaneous)' if pitch_range_score < 0.4 else 'moderate'
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
# energy variability
|
| 337 |
+
energy_std = features.get('energy_std', 0.047)
|
| 338 |
+
if energy_std < 0.042:
|
| 339 |
+
energy_std_score = 0.7
|
| 340 |
+
elif energy_std < 0.048:
|
| 341 |
+
energy_std_score = 0.5
|
| 342 |
else:
|
| 343 |
+
energy_std_score = 0.3
|
| 344 |
|
| 345 |
+
individual_scores['energy_std'] = {
|
| 346 |
+
'score': energy_std_score,
|
| 347 |
+
'value': energy_std,
|
| 348 |
+
'interpretation': 'steady (read)' if energy_std_score > 0.6 else 'variable (spontaneous)' if energy_std_score < 0.4 else 'moderate'
|
| 349 |
}
|
| 350 |
|
| 351 |
+
# zcr variability
|
| 352 |
+
zcr_std = features.get('zcr_std', 0.111)
|
| 353 |
+
if zcr_std >= 0.115:
|
| 354 |
+
zcr_std_score = 0.7
|
| 355 |
+
elif zcr_std >= 0.105:
|
| 356 |
+
zcr_std_score = 0.5
|
| 357 |
+
else:
|
| 358 |
+
zcr_std_score = 0.3
|
| 359 |
+
|
| 360 |
+
individual_scores['zcr_std'] = {
|
| 361 |
+
'score': zcr_std_score,
|
| 362 |
+
'value': zcr_std,
|
| 363 |
+
'interpretation': 'variable ZCR (read)' if zcr_std_score > 0.6 else 'steady ZCR (spontaneous)' if zcr_std_score < 0.4 else 'moderate'
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
# weights based on feature importance from analysis
|
| 367 |
weights = {
|
| 368 |
+
'spectral_variability': 0.30,
|
| 369 |
+
'zcr_mean': 0.25,
|
| 370 |
+
'energy_level': 0.20,
|
| 371 |
+
'pitch_range': 0.10,
|
| 372 |
+
'energy_std': 0.08,
|
| 373 |
+
'zcr_std': 0.07,
|
| 374 |
}
|
| 375 |
|
| 376 |
+
# calculate weighted overall score
|
| 377 |
overall_score = (
|
| 378 |
spectral_score * weights['spectral_variability'] +
|
| 379 |
zcr_score * weights['zcr_mean'] +
|
| 380 |
energy_score * weights['energy_level'] +
|
| 381 |
+
pitch_range_score * weights['pitch_range'] +
|
| 382 |
+
energy_std_score * weights['energy_std'] +
|
| 383 |
+
zcr_std_score * weights['zcr_std']
|
| 384 |
)
|
| 385 |
|
| 386 |
+
# determine classification based on thresholds
|
| 387 |
+
if overall_score > 0.58:
|
| 388 |
classification = 'read'
|
| 389 |
+
confidence = 0.5 + (overall_score - 0.5) * 0.9
|
| 390 |
+
elif overall_score < 0.42:
|
| 391 |
classification = 'spontaneous'
|
| 392 |
+
confidence = 0.5 + (0.5 - overall_score) * 0.9
|
| 393 |
else:
|
| 394 |
+
classification = 'read' if overall_score >= 0.50 else 'spontaneous'
|
| 395 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 396 |
|
| 397 |
return {
|
|
|
|
| 401 |
'individual_scores': individual_scores
|
| 402 |
}
|
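A worked example of the weighting above (the per-feature scores are invented for illustration): with spectral 0.9, ZCR 0.7, energy 0.85, pitch range 0.7, energy std 0.7 and ZCR std 0.7,

    overall = 0.9*0.30 + 0.7*0.25 + 0.85*0.20 + 0.7*0.10 + 0.7*0.08 + 0.7*0.07  # = 0.79

0.79 clears the 0.58 threshold, so the classification is 'read' with confidence 0.5 + (0.79 - 0.5) * 0.9 ≈ 0.76.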
| 403 |
|
| 404 |
+
# main classification method - combines CNN and prosody analysis
|
| 405 |
+
def classify(self, audio_path):
|
| 406 |
+
# extract mel spectrogram for CNN
|
| 407 |
mel_spec = self.extract_mel_spectrogram(audio_path)
|
| 408 |
|
| 409 |
+
# convert to tensor and add batch dimension
|
| 410 |
mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
|
| 411 |
|
| 412 |
+
# get CNN predictions
|
| 413 |
with torch.no_grad():
|
| 414 |
logits = self.model(mel_tensor)
|
| 415 |
probabilities = F.softmax(logits, dim=1)
|
|
|
|
| 420 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 421 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
| 422 |
|
| 423 |
+
# extract acoustic features for prosody analysis
|
| 424 |
acoustic_features = self.extract_acoustic_features(audio_path)
|
| 425 |
|
| 426 |
+
# compute prosody-based scores
|
| 427 |
prosody_scores = self._compute_prosody_scores(acoustic_features)
|
| 428 |
prosody_classification = prosody_scores['classification']
|
| 429 |
prosody_confidence = prosody_scores['confidence']
|
| 430 |
|
| 431 |
+
# map CNN class to label
|
| 432 |
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
| 433 |
+
read_prob = probabilities[0, 0].item()
|
| 434 |
+
|
| 435 |
print(f"CNN classification: {cnn_class_name}")
|
| 436 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 437 |
|
| 438 |
+
# combine CNN and prosody - prosody is more reliable
|
| 439 |
+
final_classification = prosody_classification
|
| 440 |
+
final_confidence = prosody_confidence
|
| 441 |
+
|
| 442 |
+
# boost confidence when both methods agree
|
| 443 |
+
if cnn_class_name == prosody_classification:
|
| 444 |
+
final_confidence = min(0.95, prosody_confidence * 1.15)
|
| 445 |
+
elif read_prob > 0.85 and cnn_class_name == 'read':
|
| 446 |
+
if prosody_confidence < 0.65:
|
| 447 |
+
final_classification = 'read'
|
| 448 |
+
final_confidence = 0.55
|
| 449 |
+
elif read_prob < 0.10 and cnn_class_name == 'spontaneous':
|
| 450 |
+
if prosody_confidence < 0.65:
|
| 451 |
+
final_classification = 'spontaneous'
|
| 452 |
+
final_confidence = 0.55
|
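Numerically: if the prosody side says 'read' at 0.76 and the CNN agrees, the fused confidence becomes min(0.95, 0.76 * 1.15) ≈ 0.87. The two elif branches only let a very confident CNN (read probability above 0.85 or below 0.10) override a weak prosody call, and even then the result is capped at a modest 0.55.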
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
return {
|
| 455 |
'classification': final_classification,
|
|
|
|
| 468 |
)
|
| 469 |
}
|
| 470 |
|
| 471 |
+
# generate human-readable interpretation of classification
|
| 472 |
def _interpret_classification(
|
| 473 |
self,
|
| 474 |
+
final_class,
|
| 475 |
+
final_confidence,
|
| 476 |
+
cnn_class,
|
| 477 |
+
cnn_confidence,
|
| 478 |
+
prosody_class,
|
| 479 |
+
prosody_confidence,
|
| 480 |
+
prosody_scores,
|
| 481 |
+
features
|
| 482 |
+
):
|
| 483 |
interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
|
| 484 |
interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
|
| 485 |
|
|
|
|
| 492 |
interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
|
| 493 |
interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
|
| 494 |
|
|
|
|
| 495 |
return interpretation
|
| 496 |
|
| 497 |
|
| 498 |
+
# test code - runs when script is executed directly
|
| 499 |
if __name__ == "__main__":
|
| 500 |
classifier = AudioClassifier()
|
| 501 |
print("\nAvailable pre-trained models:")
|
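Minimal usage sketch for the classifier (assumes the spectrogram_cnn_3s_window.pth weights sit next to the module):

    clf = AudioClassifier()
    result = clf.classify("examples/read1.wav")
    print(result['classification'], f"{result['confidence']:.2f}")
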
examples/.DS_Store ADDED
    Binary file (6.15 kB).

examples/{spontaneous1.ogg → read1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:…
- size …
+ oid sha256:9ca1f1a4aadad49ce045b41318eaf3e82b588231af2aee89596687731c0cef4d
+ size 1075710

examples/{read1.ogg → spontaneous1.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:…
- size …
+ oid sha256:76f8f4a50cd6d10123058d060287ae1433b59087ad0b65b0fa6255716368d3ba
+ size 873470

pipeline.py
CHANGED
|
@@ -1,49 +1,52 @@
|
|
| 1 |
-
from typing import Dict, Optional
|
| 2 |
import time
|
| 3 |
from audio_classifier import AudioClassifier
|
| 4 |
from speech_recognizer import SpeechRecognizer
|
| 5 |
from text_analyzer import TextAuthenticityAnalyzer
|
| 6 |
|
| 7 |
|
|
|
|
| 8 |
class AuthenticityDetectionPipeline:
|
| 9 |
def __init__(
|
| 10 |
self,
|
| 11 |
-        audio_model_path…
|
| 12 |
-        whisper_model_size…
|
| 13 |
-        device…
|
| 14 |
-        ai_detection_threshold…
|
| 15 |
):
|
| 16 |
print("\n" + "="*60)
|
| 17 |
print("Initializing Multimodal Authenticity Detection Pipeline")
|
| 18 |
print("="*60 + "\n")
|
| 19 |
|
| 20 |
-
#
|
| 21 |
print("📊 Loading Audio Classifier (CNN)...")
|
| 22 |
self.audio_classifier = AudioClassifier(
|
| 23 |
model_path=audio_model_path,
|
| 24 |
device=device
|
| 25 |
)
|
| 26 |
|
|
|
|
| 27 |
print("\n🎤 Loading Speech Recognizer (Whisper)...")
|
| 28 |
self.speech_recognizer = SpeechRecognizer(
|
| 29 |
model_size=whisper_model_size,
|
| 30 |
device=device
|
| 31 |
)
|
| 32 |
|
|
|
|
| 33 |
print("\n📝 Loading Text Authenticity Analyzer...")
|
| 34 |
self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
|
| 35 |
|
| 36 |
print("\n✅ Pipeline initialization complete!")
|
| 37 |
print("="*60 + "\n")
|
| 38 |
|
| 39 |
-
|
|
|
|
| 40 |
print("\n" + "="*60)
|
| 41 |
print("MULTIMODAL AUTHENTICITY ANALYSIS")
|
| 42 |
print("="*60 + "\n")
|
| 43 |
|
| 44 |
start_time = time.time()
|
| 45 |
|
| 46 |
-
#
|
| 47 |
print("Stage 1: CNN Audio Classification...")
|
| 48 |
print("-" * 40)
|
| 49 |
audio_results = self.audio_classifier.classify(audio_path)
|
|
@@ -51,7 +54,7 @@ class AuthenticityDetectionPipeline:
|
|
| 51 |
print(f" ## Classification: {audio_results['classification'].upper()}")
|
| 52 |
print(f" Confidence: {audio_results['confidence']*100:.1f}%")
|
| 53 |
|
| 54 |
-
#
|
| 55 |
print("\nStage 2: Speech Analysis (Whisper)...")
|
| 56 |
print("-" * 40)
|
| 57 |
asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
|
|
@@ -60,7 +63,7 @@ class AuthenticityDetectionPipeline:
|
|
| 60 |
print(f" Word count: {asr_results['word_count']}")
|
| 61 |
print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
|
| 62 |
|
| 63 |
-
#
|
| 64 |
print("\nStage 3: Analyzing text authenticity...")
|
| 65 |
print("-" * 40)
|
| 66 |
text_results = self.text_analyzer.analyze(asr_results['transcription'])
|
|
@@ -68,7 +71,7 @@ class AuthenticityDetectionPipeline:
|
|
| 68 |
print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
|
| 69 |
print(f" Risk level: {text_results['risk_level'].upper()}")
|
| 70 |
|
| 71 |
-
#
|
| 72 |
print("\nStage 4: Generating final assessment...")
|
| 73 |
print("-" * 40)
|
| 74 |
final_assessment = self._generate_final_assessment(
|
|
@@ -85,46 +88,68 @@ class AuthenticityDetectionPipeline:
|
|
| 85 |
return {
|
| 86 |
'audio_classification': audio_results,
|
| 87 |
'speech_recognition': asr_results,
|
|
|
|
|
|
|
| 88 |
'text_authenticity': text_results,
|
| 89 |
'final_assessment': final_assessment,
|
| 90 |
'processing_time': elapsed_time
|
| 91 |
}
|
| 92 |
|
|
|
|
| 93 |
def _generate_final_assessment(
|
| 94 |
self,
|
| 95 |
-
audio_results
|
| 96 |
-
asr_results
|
| 97 |
-
text_results
|
| 98 |
-
)
|
| 99 |
|
| 100 |
-
#
|
| 101 |
if audio_results['classification'] == 'spontaneous':
|
| 102 |
audio_score = audio_results['confidence']
|
| 103 |
-
else:
|
| 104 |
audio_score = 1.0 - audio_results['confidence']
|
| 105 |
|
| 106 |
-
#
|
| 107 |
-
# Invert so spontaneous (low kopparapu) = high authenticity
|
| 108 |
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 109 |
|
| 110 |
-
#
|
| 111 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 112 |
-
filler_score = min(1.0, filler_ratio / 0.05)
|
| 113 |
|
| 114 |
-
#
|
| 115 |
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 116 |
-
pause_score = min(1.0, pause_var / 0.5)
|
| 117 |
|
|
|
|
| 118 |
text_auth_score = text_results['authenticity_score']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
|
|
|
|
|
|
| 120 |
composite_score = (
|
| 121 |
-
audio_score * 0.15 +
|
| 122 |
-
speech_pattern_score * 0.
|
| 123 |
-
filler_score * 0.
|
| 124 |
-
pause_score * 0.
|
| 125 |
-
|
|
|
|
| 126 |
)
|
| 127 |
|
|
|
|
| 128 |
if composite_score >= 0.7:
|
| 129 |
verdict = "AUTHENTIC"
|
| 130 |
risk = "low"
|
|
@@ -142,37 +167,43 @@ class AuthenticityDetectionPipeline:
|
|
| 142 |
risk = "critical"
|
| 143 |
recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
|
| 144 |
|
|
|
|
| 145 |
concerns = []
|
| 146 |
strengths = []
|
| 147 |
|
|
|
|
| 148 |
if audio_results['classification'] == 'read':
|
| 149 |
concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
|
| 150 |
else:
|
| 151 |
strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
|
| 152 |
|
|
|
|
| 153 |
if asr_results['kopparapu_classification'] == 'read':
|
| 154 |
concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 155 |
else:
|
| 156 |
strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 157 |
|
|
|
|
| 158 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 159 |
if filler_ratio < 0.02:
|
| 160 |
concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
|
| 161 |
else:
|
| 162 |
strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
|
| 163 |
|
|
|
|
| 164 |
if asr_results['pause_patterns']['pause_variability'] < 0.3:
|
| 165 |
concerns.append("Regular pause patterns suggest reading at punctuation")
|
| 166 |
else:
|
| 167 |
strengths.append("Irregular pause patterns indicate spontaneous thinking")
|
| 168 |
|
|
|
|
| 169 |
if text_results['ai_detection']['ai_generated']:
|
| 170 |
concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
|
| 171 |
|
|
|
|
| 172 |
if text_results['authenticity_score'] > 0.7:
|
| 173 |
strengths.append("Text shows strong originality indicators")
|
| 174 |
|
| 175 |
-
|
| 176 |
return {
|
| 177 |
'verdict': verdict,
|
| 178 |
'risk_level': risk,
|
|
@@ -182,8 +213,9 @@ class AuthenticityDetectionPipeline:
|
|
| 182 |
'recommendation': recommendation,
|
| 183 |
}
|
| 184 |
|
|
|
|
|
|
|
| 185 |
if __name__ == "__main__":
|
| 186 |
-
# Example usage
|
| 187 |
print("Initializing Authenticity Detection Pipeline...")
|
| 188 |
model_path = "spectrogram_cnn_3s_window.pth"
|
| 189 |
pipeline = AuthenticityDetectionPipeline(
|
|
|
|
|
|
|
| 1 |
import time
|
| 2 |
from audio_classifier import AudioClassifier
|
| 3 |
from speech_recognizer import SpeechRecognizer
|
| 4 |
from text_analyzer import TextAuthenticityAnalyzer
|
| 5 |
|
| 6 |
|
| 7 |
+
# Main pipeline class that orchestrates all analysis components
|
| 8 |
class AuthenticityDetectionPipeline:
|
| 9 |
def __init__(
|
| 10 |
self,
|
| 11 |
+
audio_model_path=None,
|
| 12 |
+
whisper_model_size="base",
|
| 13 |
+
device=None,
|
| 14 |
+
ai_detection_threshold=0.78
|
| 15 |
):
|
| 16 |
print("\n" + "="*60)
|
| 17 |
print("Initializing Multimodal Authenticity Detection Pipeline")
|
| 18 |
print("="*60 + "\n")
|
| 19 |
|
| 20 |
+
# load the CNN-based audio classifier
|
| 21 |
print("📊 Loading Audio Classifier (CNN)...")
|
| 22 |
self.audio_classifier = AudioClassifier(
|
| 23 |
model_path=audio_model_path,
|
| 24 |
device=device
|
| 25 |
)
|
| 26 |
|
| 27 |
+
# load whisper model for speech-to-text
|
| 28 |
print("\n🎤 Loading Speech Recognizer (Whisper)...")
|
| 29 |
self.speech_recognizer = SpeechRecognizer(
|
| 30 |
model_size=whisper_model_size,
|
| 31 |
device=device
|
| 32 |
)
|
| 33 |
|
| 34 |
+
# load text analyzer for AI detection
|
| 35 |
print("\n📝 Loading Text Authenticity Analyzer...")
|
| 36 |
self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
|
| 37 |
|
| 38 |
print("\n✅ Pipeline initialization complete!")
|
| 39 |
print("="*60 + "\n")
|
| 40 |
|
| 41 |
+
# main analysis function - runs all stages
|
| 42 |
+
def analyze_audio(self, audio_path, language=None):
|
| 43 |
print("\n" + "="*60)
|
| 44 |
print("MULTIMODAL AUTHENTICITY ANALYSIS")
|
| 45 |
print("="*60 + "\n")
|
| 46 |
|
| 47 |
start_time = time.time()
|
| 48 |
|
| 49 |
+
# stage 1: classify audio using CNN
|
| 50 |
print("Stage 1: CNN Audio Classification...")
|
| 51 |
print("-" * 40)
|
| 52 |
audio_results = self.audio_classifier.classify(audio_path)
|
|
|
|
| 54 |
print(f" ## Classification: {audio_results['classification'].upper()}")
|
| 55 |
print(f" Confidence: {audio_results['confidence']*100:.1f}%")
|
| 56 |
|
| 57 |
+
# stage 2: transcribe and analyze speech patterns
|
| 58 |
print("\nStage 2: Speech Analysis (Whisper)...")
|
| 59 |
print("-" * 40)
|
| 60 |
asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
|
|
|
|
| 63 |
print(f" Word count: {asr_results['word_count']}")
|
| 64 |
print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
|
| 65 |
|
| 66 |
+
# stage 3: analyze transcribed text for AI patterns
|
| 67 |
print("\nStage 3: Analyzing text authenticity...")
|
| 68 |
print("-" * 40)
|
| 69 |
text_results = self.text_analyzer.analyze(asr_results['transcription'])
|
|
|
|
| 71 |
print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
|
| 72 |
print(f" Risk level: {text_results['risk_level'].upper()}")
|
| 73 |
|
| 74 |
+
# stage 4: combine all results into final assessment
|
| 75 |
print("\nStage 4: Generating final assessment...")
|
| 76 |
print("-" * 40)
|
| 77 |
final_assessment = self._generate_final_assessment(
|
|
|
|
| 88 |
return {
|
| 89 |
'audio_classification': audio_results,
|
| 90 |
'speech_recognition': asr_results,
|
| 91 |
+
'asr': asr_results, # alias for backwards compatibility
|
| 92 |
+
'text_analysis': text_results,
|
| 93 |
'text_authenticity': text_results,
|
| 94 |
'final_assessment': final_assessment,
|
| 95 |
'processing_time': elapsed_time
|
| 96 |
}
|
| 97 |
|
| 98 |
+
# combine scores from all components into final verdict
|
| 99 |
def _generate_final_assessment(
|
| 100 |
self,
|
| 101 |
+
audio_results,
|
| 102 |
+
asr_results,
|
| 103 |
+
text_results
|
| 104 |
+
):
|
| 105 |
|
| 106 |
+
# calculate audio score - spontaneous = authentic
|
| 107 |
if audio_results['classification'] == 'spontaneous':
|
| 108 |
audio_score = audio_results['confidence']
|
| 109 |
+
else:
|
| 110 |
audio_score = 1.0 - audio_results['confidence']
|
| 111 |
|
| 112 |
+
# kopparapu score - invert so spontaneous = high authenticity
|
|
|
|
| 113 |
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 114 |
|
| 115 |
+
# filler words indicate spontaneous speech
|
| 116 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 117 |
+
filler_score = min(1.0, filler_ratio / 0.05)
|
| 118 |
|
| 119 |
+
# pause variability - higher = more spontaneous
|
| 120 |
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 121 |
+
pause_score = min(1.0, pause_var / 0.5)
|
| 122 |
|
| 123 |
+
# text authenticity from AI detector
|
| 124 |
text_auth_score = text_results['authenticity_score']
|
| 125 |
+
|
| 126 |
+
# get additional linguistic features
|
| 127 |
+
kf = asr_results['kopparapu_features']
|
| 128 |
+
|
| 129 |
+
# speech rate variability
|
| 130 |
+
rate_var = kf.get('speech_rate_variability', 0.0)
|
| 131 |
+
rate_var_score = min(1.0, rate_var / 0.15)
|
| 132 |
+
|
| 133 |
+
# pause regularity - lower = more spontaneous
|
| 134 |
+
pause_reg = kf.get('pause_regularity', 0.5)
|
| 135 |
+
pause_reg_score = 1.0 - pause_reg
|
| 136 |
+
|
| 137 |
+
# self-corrections indicate spontaneous speech
|
| 138 |
+
corrections = kf.get('self_correction_count', 0)
|
| 139 |
+
correction_score = min(1.0, corrections / 2.0)
|
| 140 |
|
| 141 |
+
# calculate weighted composite score
|
| 142 |
+
# weights: CNN+Prosody=15%, Linguistic=35%, AI Detection=50%
|
| 143 |
composite_score = (
|
| 144 |
+
audio_score * 0.15 +
|
| 145 |
+
speech_pattern_score * 0.25 +
|
| 146 |
+
filler_score * 0.05 +
|
| 147 |
+
pause_score * 0.03 +
|
| 148 |
+
rate_var_score * 0.02 +
|
| 149 |
+
text_auth_score * 0.50
|
| 150 |
)
|
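A worked example of this weighting (component scores invented for illustration): 0.8*0.15 + 0.7*0.25 + 1.0*0.05 + 0.6*0.03 + 0.5*0.02 + 0.9*0.50 = 0.823, which lands in the >= 0.7 band below and yields an "AUTHENTIC" verdict. Note the six weights sum to exactly 1.00.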
| 151 |
|
| 152 |
+
# determine verdict based on composite score
|
| 153 |
if composite_score >= 0.7:
|
| 154 |
verdict = "AUTHENTIC"
|
| 155 |
risk = "low"
|
|
|
|
| 167 |
risk = "critical"
|
| 168 |
recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
|
| 169 |
|
| 170 |
+
# collect concerns and strengths
|
| 171 |
concerns = []
|
| 172 |
strengths = []
|
| 173 |
|
| 174 |
+
# check CNN classification
|
| 175 |
if audio_results['classification'] == 'read':
|
| 176 |
concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
|
| 177 |
else:
|
| 178 |
strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
|
| 179 |
|
| 180 |
+
# check linguistic analysis
|
| 181 |
if asr_results['kopparapu_classification'] == 'read':
|
| 182 |
concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 183 |
else:
|
| 184 |
strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
|
| 185 |
|
| 186 |
+
# check filler words
|
| 187 |
filler_ratio = asr_results['filler_words']['ratio']
|
| 188 |
if filler_ratio < 0.02:
|
| 189 |
concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
|
| 190 |
else:
|
| 191 |
strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
|
| 192 |
|
| 193 |
+
# check pause patterns
|
| 194 |
if asr_results['pause_patterns']['pause_variability'] < 0.3:
|
| 195 |
concerns.append("Regular pause patterns suggest reading at punctuation")
|
| 196 |
else:
|
| 197 |
strengths.append("Irregular pause patterns indicate spontaneous thinking")
|
| 198 |
|
| 199 |
+
# check AI detection
|
| 200 |
if text_results['ai_detection']['ai_generated']:
|
| 201 |
concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
|
| 202 |
|
| 203 |
+
# check text originality
|
| 204 |
if text_results['authenticity_score'] > 0.7:
|
| 205 |
strengths.append("Text shows strong originality indicators")
|
| 206 |
|
|
|
|
| 207 |
return {
|
| 208 |
'verdict': verdict,
|
| 209 |
'risk_level': risk,
|
|
|
|
| 213 |
'recommendation': recommendation,
|
| 214 |
}
|
| 215 |
|
| 216 |
+
|
| 217 |
+
# test code - runs when script is executed directly
|
| 218 |
if __name__ == "__main__":
|
|
|
|
| 219 |
print("Initializing Authenticity Detection Pipeline...")
|
| 220 |
model_path = "spectrogram_cnn_3s_window.pth"
|
| 221 |
pipeline = AuthenticityDetectionPipeline(
|
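End-to-end usage sketch for the pipeline (model files assumed available locally):

    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
    report = pipeline.analyze_audio("examples/spontaneous1.wav")
    print(report['final_assessment']['verdict'])
    print(f"processed in {report['processing_time']:.1f}s")
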
speech_recognizer.py
CHANGED
|
@@ -2,58 +2,56 @@ import whisper
|
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
import re
|
| 5 |
-
from typing import Dict, Optional, List
|
| 6 |
import warnings
|
| 7 |
import librosa
|
| 8 |
warnings.filterwarnings("ignore")
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
class SpeechRecognizer:
|
| 12 |
-
def __init__(self, model_size
|
|
|
|
| 13 |
if device is None:
|
| 14 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
else:
|
| 16 |
self.device = device
|
| 17 |
-
|
|
|
|
| 18 |
print(f"Loading Whisper {model_size} model on {self.device}...")
|
| 19 |
self.model = whisper.load_model(model_size, device=self.device)
|
| 20 |
print(f"Whisper model loaded successfully.")
|
| 21 |
|
| 22 |
self.model_size = model_size
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
try:
|
| 27 |
-
#
|
| 28 |
audio, sr = librosa.load(audio_path, sr=16000)
|
| 29 |
duration = len(audio) / sr
|
| 30 |
|
| 31 |
-
#
|
| 32 |
if duration < 0.1:
|
| 33 |
-
return False, "Audio
|
| 34 |
|
| 35 |
-
#
|
| 36 |
if np.max(np.abs(audio)) < 0.001:
|
| 37 |
-
return False, "Audio
|
| 38 |
|
| 39 |
return True, "Valid", duration
|
| 40 |
|
| 41 |
except Exception as e:
|
| 42 |
-
return False, f"
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
language: Optional[str] = None,
|
| 48 |
-
task: str = "transcribe"
|
| 49 |
-
) -> Dict[str, any]:
|
| 50 |
-
# Validate audio first
|
| 51 |
is_valid, message, audio_duration = self._validate_audio(audio_path)
|
| 52 |
if not is_valid:
|
| 53 |
-
print(f"Audio
|
| 54 |
-
# Return minimal valid response for invalid audio
|
| 55 |
return self._get_empty_response(message, audio_duration)
|
| 56 |
|
|
|
|
| 57 |
try:
|
| 58 |
result = self.model.transcribe(
|
| 59 |
audio_path,
|
|
@@ -61,17 +59,17 @@ class SpeechRecognizer:
|
|
| 61 |
task=task,
|
| 62 |
verbose=False,
|
| 63 |
word_timestamps=True,
|
| 64 |
-
fp16=False #
|
| 65 |
)
|
| 66 |
except (KeyError, RuntimeError) as e:
|
| 67 |
error_msg = str(e)
|
| 68 |
-
#
|
| 69 |
if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
|
| 70 |
-
print(f"Audio
|
| 71 |
return self._get_empty_response("Audio too short or corrupted", audio_duration)
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
print(f"
|
| 75 |
try:
|
| 76 |
result = self.model.transcribe(
|
| 77 |
audio_path,
|
|
@@ -82,20 +80,23 @@ class SpeechRecognizer:
|
|
| 82 |
fp16=False
|
| 83 |
)
|
| 84 |
except Exception as e2:
|
| 85 |
-
print(f"
|
| 86 |
-
return self._get_empty_response(
|
| 87 |
|
|
|
|
| 88 |
transcription = result['text'].strip()
|
| 89 |
detected_language = result.get('language', 'unknown')
|
| 90 |
segments = result.get('segments', [])
|
| 91 |
|
| 92 |
-
#
|
| 93 |
if not transcription or len(transcription.strip()) == 0:
|
| 94 |
print("Warning: Transcription is empty")
|
| 95 |
return self._get_empty_response("No speech detected in audio", audio_duration)
|
| 96 |
|
|
|
|
| 97 |
analysis = self._analyze_transcription(transcription, segments)
|
| 98 |
|
|
|
|
| 99 |
duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
|
| 100 |
kopparapu_features = self._extract_kopparapu_features(
|
| 101 |
transcription, duration, segments, analysis['pause_patterns']
|
|
@@ -117,8 +118,8 @@ class SpeechRecognizer:
|
|
| 117 |
'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
|
| 118 |
}
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
return {
|
| 123 |
'transcription': f"[Error: {reason}]",
|
| 124 |
'language': 'unknown',
|
|
@@ -147,20 +148,23 @@ class SpeechRecognizer:
|
|
| 147 |
},
|
| 148 |
'kopparapu_score': 0.5,
|
| 149 |
'kopparapu_classification': 'unknown',
|
| 150 |
-
'interpretation': f"
|
| 151 |
}
|
| 152 |
|
| 153 |
-
|
|
|
|
| 154 |
words = text.split()
|
| 155 |
word_count = len(words)
|
| 156 |
|
|
|
|
| 157 |
duration = 0
|
| 158 |
if segments:
|
| 159 |
duration = segments[-1]['end'] - segments[0]['start']
|
| 160 |
|
|
|
|
| 161 |
speech_rate = (word_count / duration * 60) if duration > 0 else 0
|
| 162 |
-
|
| 163 |
|
|
|
|
| 164 |
filler_words_list = [
|
| 165 |
('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
|
| 166 |
('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
|
|
@@ -170,6 +174,7 @@ class SpeechRecognizer:
|
|
| 170 |
('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
|
| 171 |
]
|
| 172 |
|
|
|
|
| 173 |
text_lower = text.lower()
|
| 174 |
filler_count = {}
|
| 175 |
total_fillers = 0
|
|
@@ -181,8 +186,10 @@ class SpeechRecognizer:
|
|
| 181 |
filler_count[filler_name] = count
|
| 182 |
total_fillers += count
|
| 183 |
|
|
|
|
| 184 |
filler_ratio = total_fillers / word_count if word_count > 0 else 0
|
| 185 |
|
|
|
|
| 186 |
pause_patterns = self._analyze_pauses(segments)
|
| 187 |
|
| 188 |
return {
|
|
@@ -197,24 +204,28 @@ class SpeechRecognizer:
|
|
| 197 |
'pause_patterns': pause_patterns
|
| 198 |
}
|
| 199 |
|
| 200 |
-
|
|
|
|
| 201 |
pauses = []
|
| 202 |
|
|
|
|
| 203 |
if len(segments) >= 2:
|
| 204 |
for i in range(len(segments) - 1):
|
| 205 |
pause = segments[i + 1]['start'] - segments[i]['end']
|
| 206 |
-
if pause > 0.05: #
|
| 207 |
pauses.append(pause)
|
| 208 |
|
|
|
|
| 209 |
for segment in segments:
|
| 210 |
if 'words' in segment and len(segment['words']) > 1:
|
| 211 |
words = segment['words']
|
| 212 |
for i in range(len(words) - 1):
|
| 213 |
                    if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                        pause = words[i + 1]['start'] - words[i]['end']
-                       if pause > 0.15: #
                            pauses.append(pause)

        if not pauses:
            return {
                'avg_pause': 0.0,

@@ -230,11 +241,10 @@ class SpeechRecognizer:
            'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
        }

-
-
-        segments: List[Dict] = None, pause_patterns: Dict = None
-    ) -> Dict:
        text = text.strip()
        if len(text) == 0:
            return {
                'alpha_ratio': 0.0,

@@ -249,24 +259,28 @@ class SpeechRecognizer:
                'self_correction_count': 0
            }

        total_chars = len(text)
        alpha_chars = sum(c.isalpha() for c in text)
        nonalpha_chars = total_chars - alpha_chars

        alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0

        words = text.split()
        num_words = max(len(words), 1)
        chars_per_word = alpha_chars / num_words

        duration_sec = max(duration_sec, 1e-3)
        words_per_sec = num_words / duration_sec
        nonalpha_per_sec = nonalpha_chars / duration_sec

-       #
        char_reps = len(re.findall(r'(.)\1{2,}', text))

-       #
        words_list = text.lower().split()
        word_reps = 0
        for i in range(len(words_list) - 1):

@@ -275,7 +289,7 @@ class SpeechRecognizer:

        repetition_count = char_reps + word_reps

-       #
        lower = text.lower()
        filler_patterns = [
            r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',

@@ -289,23 +303,20 @@ class SpeechRecognizer:
            filler_count += len(re.findall(pattern, lower))
        filler_rate = filler_count / num_words

-       #
-
-       pause_regularity = 0.5  # neutral default
        if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
            pause_var = pause_patterns.get('pause_variability', 0.5)
-           #
-           # High variability (> 0.6) -> low regularity (close to 0)
            pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))

-       #
-       # Read speech has consistent pacing; spontaneous varies with thinking
        speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0

-       #
        sentence_length_variance = self._compute_sentence_variance(text)

-       #
        self_correction_patterns = [
            r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
            r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',

@@ -328,14 +339,15 @@ class SpeechRecognizer:
            'self_correction_count': int(self_correction_count)
        }

-
        if not segments or len(segments) < 3:
            return 0.0

        segment_rates = []
        for seg in segments:
            duration = seg.get('end', 0) - seg.get('start', 0)
-           if duration > 0.3: #
                words_in_seg = len(seg.get('text', '').split())
                rate = words_in_seg / duration
                if rate > 0:

@@ -344,42 +356,46 @@ class SpeechRecognizer:
        if len(segment_rates) < 3:
            return 0.0

        mean_rate = np.mean(segment_rates)
        std_rate = np.std(segment_rates)

-       # Coefficient of variation normalized to 0-1
        cv = std_rate / mean_rate if mean_rate > 0 else 0
-       return float(min(1.0, cv / 0.5))

-

-
-
        f1 = features['chars_per_word']
        L1 = self._logistic(f1, a=4.8, b=1.2)

-       # L2:
        f2 = features['words_per_sec']
        L2 = self._logistic(f2, a=2.2, b=0.6)

-       # L3:
-       # Combines filler rate, nonalpha, and repetitions
        disfluency = (
            features['nonalpha_per_sec'] +
            8.0 * features['filler_rate'] +

@@ -387,42 +403,43 @@ class SpeechRecognizer:
        )
        L3 = self._logistic(-disfluency, a=0.0, b=0.8)

-       # L4:
        L4 = features.get('pause_regularity', 0.5)

-       # L5:
        rate_var = features.get('speech_rate_variability', 0.0)
        L5 = 1.0 - rate_var

-       # L6:
        sent_var = features.get('sentence_length_variance', 0.0)
        L6 = 1.0 - sent_var

-       # L7:
        corrections = features.get('self_correction_count', 0)
        L7 = self._logistic(-corrections, a=0.0, b=1.5)

-       #
-       # Higher weights on pause regularity and rate consistency (key read markers)
        score = (
-           0.15 * L1 +  #
-           0.15 * L2 +  #
-           0.15 * L3 +  #
-           0.20 * L4 +  #
-           0.15 * L5 +  #
-           0.10 * L6 +  #
-           0.10 * L7    #
        )

        return float(score)

-
        filler_ratio = analysis['filler_words']['ratio']
        pause_patterns = analysis['pause_patterns']
        speech_rate = analysis['speech_rate']

        interpretation = "**Overall Assessment:**\n\n"

        spontaneity_score = 0
        indicators = []

@@ -437,7 +454,8 @@ class SpeechRecognizer:
        if 120 <= speech_rate <= 180:
            spontaneity_score += 1
            indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
-
        if spontaneity_score >= 2:
            interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
            if indicators:

@@ -455,13 +473,14 @@ class SpeechRecognizer:

        return interpretation

-
        result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
        return result.get('segments', [])


if __name__ == "__main__":
    recognizer = SpeechRecognizer(model_size="base")
    print(f"Speech recognizer initialized with {recognizer.model_size} model")
    print(f"Device: {recognizer.device}")
-
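The pause-regularity mapping kept by this commit clamps 1 - (pause std / 0.6) into [0, 1], so steady pacing scores near 1 and erratic pacing near 0. A quick standalone check, with invented pause lists:

import numpy as np

# illustration only: two made-up pause-length lists (seconds)
for pauses in ([0.30, 0.32, 0.29, 0.31],    # steady pacing, read-like
               [0.10, 0.90, 0.25, 1.40]):   # erratic pacing, spontaneous-like
    pause_var = float(np.std(pauses))
    pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))
    print(f"std={pause_var:.2f} -> regularity={pause_regularity:.2f}")

The steady list comes out around 0.98 and the erratic one around 0.13, matching the read/spontaneous intuition in the code comments.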
speech_recognizer.py, new version (lines added in this commit are marked "+"; unchanged runs are collapsed as "..."):

  import torch
  import numpy as np
  import re
  import warnings
  import librosa
  warnings.filterwarnings("ignore")


+ # Main class for speech recognition and analysis
  class SpeechRecognizer:
+     def __init__(self, model_size="base", device=None):
+         # set device - use GPU if available
          if device is None:
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
          else:
              self.device = device
+
+         # load whisper model
          print(f"Loading Whisper {model_size} model on {self.device}...")
          self.model = whisper.load_model(model_size, device=self.device)
          print(f"Whisper model loaded successfully.")

          self.model_size = model_size

+     # check if audio file is valid before processing
+     def _validate_audio(self, audio_path):
          try:
+             # load and check audio
              audio, sr = librosa.load(audio_path, sr=16000)
              duration = len(audio) / sr

+             # audio must be at least 0.1 seconds
              if duration < 0.1:
+                 return False, "Audio too short", duration

+             # check for silent audio
              if np.max(np.abs(audio)) < 0.001:
+                 return False, "Audio is silent", duration

              return True, "Valid", duration

          except Exception as e:
+             return False, f"Could not load audio file", 0.0
+
+     # main transcription function
+     def transcribe(self, audio_path, language=None, task="transcribe"):
+         # validate audio first
          is_valid, message, audio_duration = self._validate_audio(audio_path)
          if not is_valid:
+             print(f"Audio check failed: {message}")
              return self._get_empty_response(message, audio_duration)

+         # try to transcribe with word timestamps
          try:
              result = self.model.transcribe(
                  audio_path,
                  ...
                  task=task,
                  verbose=False,
                  word_timestamps=True,
+                 fp16=False  # avoid fp16 issues
              )
          except (KeyError, RuntimeError) as e:
              error_msg = str(e)
+             # handle specific errors
              if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
+                 print(f"Audio might be too short or corrupted")
                  return self._get_empty_response("Audio too short or corrupted", audio_duration)

+             # retry without word timestamps
+             print(f"First try failed, trying again...")
              try:
                  result = self.model.transcribe(
                      audio_path,
                      ...
                      fp16=False
                  )
              except Exception as e2:
+                 print(f"Could not transcribe audio: {e2}")
+                 return self._get_empty_response("Transcription failed", audio_duration)

+         # extract transcription results
          transcription = result['text'].strip()
          detected_language = result.get('language', 'unknown')
          segments = result.get('segments', [])

+         # handle empty transcription
          if not transcription or len(transcription.strip()) == 0:
              print("Warning: Transcription is empty")
              return self._get_empty_response("No speech detected in audio", audio_duration)

+         # analyze transcription for speech patterns
          analysis = self._analyze_transcription(transcription, segments)

+         # extract kopparapu features for read/spontaneous detection
          duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
          kopparapu_features = self._extract_kopparapu_features(
              transcription, duration, segments, analysis['pause_patterns']
          ...
              'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
          }

+     # return empty response when transcription fails
+     def _get_empty_response(self, reason, duration=0.0):
          return {
              'transcription': f"[Error: {reason}]",
              'language': 'unknown',
              ...
              },
              'kopparapu_score': 0.5,
              'kopparapu_classification': 'unknown',
+             'interpretation': f"Could not process audio: {reason}\n\nTips:\n- Make sure audio is at least 1 second\n- Check that there is actual speech\n- Try a different audio file"
          }

+     # analyze transcription for various speech metrics
+     def _analyze_transcription(self, text, segments):
          words = text.split()
          word_count = len(words)

+         # calculate duration from segments
          duration = 0
          if segments:
              duration = segments[-1]['end'] - segments[0]['start']

+         # calculate speaking rate (words per minute)
          speech_rate = (word_count / duration * 60) if duration > 0 else 0

+         # list of filler words to detect
          filler_words_list = [
              ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
              ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
              ...
              ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
          ]

+         # count filler words
          text_lower = text.lower()
          filler_count = {}
          total_fillers = 0
          ...
              filler_count[filler_name] = count
              total_fillers += count

+         # calculate filler ratio
          filler_ratio = total_fillers / word_count if word_count > 0 else 0

+         # analyze pause patterns
          pause_patterns = self._analyze_pauses(segments)

          return {
              ...
              'pause_patterns': pause_patterns
          }

+     # extract pause timing information from segments
+     def _analyze_pauses(self, segments):
          pauses = []

+         # find pauses between segments
          if len(segments) >= 2:
              for i in range(len(segments) - 1):
                  pause = segments[i + 1]['start'] - segments[i]['end']
+                 if pause > 0.05:  # pauses > 50ms
                      pauses.append(pause)

+         # find pauses between words within segments
          for segment in segments:
              if 'words' in segment and len(segment['words']) > 1:
                  words = segment['words']
                  for i in range(len(words) - 1):
                      if 'start' in words[i] and 'end' in words[i] and 'start' in words[i+1]:
                          pause = words[i + 1]['start'] - words[i]['end']
+                         if pause > 0.15:  # word-level pauses > 150ms
                              pauses.append(pause)

+         # return empty stats if no pauses found
          if not pauses:
              return {
                  'avg_pause': 0.0,
                  ...
              'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
          }

+     # extract features based on kopparapu's method for read vs spontaneous detection
+     def _extract_kopparapu_features(self, text, duration_sec, segments=None, pause_patterns=None):
          text = text.strip()
+         # handle empty text
          if len(text) == 0:
              return {
                  'alpha_ratio': 0.0,
                  ...
                  'self_correction_count': 0
              }

+         # count character types
          total_chars = len(text)
          alpha_chars = sum(c.isalpha() for c in text)
          nonalpha_chars = total_chars - alpha_chars

+         # ratio of alphabetic characters
          alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0

+         # average word length
          words = text.split()
          num_words = max(len(words), 1)
          chars_per_word = alpha_chars / num_words

+         # speaking rate features
          duration_sec = max(duration_sec, 1e-3)
          words_per_sec = num_words / duration_sec
          nonalpha_per_sec = nonalpha_chars / duration_sec

+         # detect character repetitions like "sooo" or "ummmm"
          char_reps = len(re.findall(r'(.)\1{2,}', text))

+         # detect word repetitions like "I I think"
          words_list = text.lower().split()
          word_reps = 0
          for i in range(len(words_list) - 1):
              ...

          repetition_count = char_reps + word_reps

+         # count filler words
          lower = text.lower()
          filler_patterns = [
              r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
              ...
              filler_count += len(re.findall(pattern, lower))
          filler_rate = filler_count / num_words

+         # pause regularity - read speech has regular pauses at punctuation
+         pause_regularity = 0.5
          if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
              pause_var = pause_patterns.get('pause_variability', 0.5)
+             # low variability = regular pauses = likely read
              pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))

+         # speech rate variability across segments
          speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0

+         # sentence length variance - uniform = likely read
          sentence_length_variance = self._compute_sentence_variance(text)

+         # count self-corrections and false starts
          self_correction_patterns = [
              r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
              r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
              ...
              'self_correction_count': int(self_correction_count)
          }

+     # compute variability in speaking rate across segments
+     def _compute_rate_variability(self, segments):
          if not segments or len(segments) < 3:
              return 0.0

          segment_rates = []
          for seg in segments:
              duration = seg.get('end', 0) - seg.get('start', 0)
+             if duration > 0.3:  # only segments > 300ms
                  words_in_seg = len(seg.get('text', '').split())
                  rate = words_in_seg / duration
                  if rate > 0:
                      ...
          if len(segment_rates) < 3:
              return 0.0

+         # calculate coefficient of variation
          mean_rate = np.mean(segment_rates)
          std_rate = np.std(segment_rates)

          cv = std_rate / mean_rate if mean_rate > 0 else 0
+         return float(min(1.0, cv / 0.5))

+     # compute variance in sentence lengths
+     def _compute_sentence_variance(self, text):
+         # split into sentences
          sentences = re.split(r'[.!?]+', text)
          sentences = [s.strip() for s in sentences if s.strip()]

          if len(sentences) < 2:
              return 0.0

+         # get word counts per sentence
          lengths = [len(s.split()) for s in sentences]
          mean_len = np.mean(lengths)
          std_len = np.std(lengths)

+         # coefficient of variation normalized
          cv = std_len / mean_len if mean_len > 0 else 0
+         return float(min(1.0, cv / 0.6))

+     # logistic function for smooth score transitions
+     def _logistic(self, x, a, b):
+         return 1.0 / (1.0 + np.exp(-(x - a) / b))

+     # calculate overall kopparapu score for read vs spontaneous
+     def _calculate_kopparapu_score(self, features):
+         # L1: vocabulary complexity - higher = more formal = read
          f1 = features['chars_per_word']
          L1 = self._logistic(f1, a=4.8, b=1.2)

+         # L2: speaking rate - faster, steadier = read
          f2 = features['words_per_sec']
          L2 = self._logistic(f2, a=2.2, b=0.6)

+         # L3: disfluency - less disfluency = more read
          disfluency = (
              features['nonalpha_per_sec'] +
              8.0 * features['filler_rate'] +
              ...
          )
          L3 = self._logistic(-disfluency, a=0.0, b=0.8)

+         # L4: pause regularity - regular = read
          L4 = features.get('pause_regularity', 0.5)

+         # L5: rate variability - low = read
          rate_var = features.get('speech_rate_variability', 0.0)
          L5 = 1.0 - rate_var

+         # L6: sentence variance - uniform = read
          sent_var = features.get('sentence_length_variance', 0.0)
          L6 = 1.0 - sent_var

+         # L7: self-corrections - fewer = read
          corrections = features.get('self_correction_count', 0)
          L7 = self._logistic(-corrections, a=0.0, b=1.5)

+         # weighted combination
          score = (
+             0.15 * L1 +  # vocabulary complexity
+             0.15 * L2 +  # speaking rate
+             0.15 * L3 +  # disfluency
+             0.20 * L4 +  # pause regularity
+             0.15 * L5 +  # rate variability
+             0.10 * L6 +  # sentence uniformity
+             0.10 * L7    # self-corrections
          )

          return float(score)

+     # generate human-readable interpretation of speech patterns
+     def _interpret_speech_patterns(self, analysis, kopparapu_features=None, kopparapu_score=None):
          filler_ratio = analysis['filler_words']['ratio']
          pause_patterns = analysis['pause_patterns']
          speech_rate = analysis['speech_rate']

          interpretation = "**Overall Assessment:**\n\n"

+         # calculate spontaneity score
          spontaneity_score = 0
          indicators = []

          ...
          if 120 <= speech_rate <= 180:
              spontaneity_score += 1
              indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
+
+         # generate interpretation based on score
          if spontaneity_score >= 2:
              interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
              if indicators:
                  ...

          return interpretation

+     # get detailed segment information
+     def get_detailed_segments(self, audio_path):
          result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
          return result.get('segments', [])


+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      recognizer = SpeechRecognizer(model_size="base")
      print(f"Speech recognizer initialized with {recognizer.model_size} model")
      print(f"Device: {recognizer.device}")
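To make the weighted combination concrete, here is a minimal standalone sketch of the score computed by _calculate_kopparapu_score above. The feature values are hypothetical (chosen to mimic read-aloud speech), and the disfluency sum keeps only the two terms visible in the diff, since one term of that sum is collapsed above.

import numpy as np

def logistic(x, a, b):
    # same smooth 0-1 mapping as SpeechRecognizer._logistic
    return 1.0 / (1.0 + np.exp(-(x - a) / b))

# hypothetical features, chosen to look like read speech
features = {
    'chars_per_word': 5.2,            # longer words than casual speech
    'words_per_sec': 2.6,             # brisk, steady delivery
    'nonalpha_per_sec': 0.4,
    'filler_rate': 0.01,              # almost no "um"/"uh"
    'pause_regularity': 0.8,          # pauses land evenly, as at punctuation
    'speech_rate_variability': 0.2,
    'sentence_length_variance': 0.3,
    'self_correction_count': 0,
}

L1 = logistic(features['chars_per_word'], a=4.8, b=1.2)
L2 = logistic(features['words_per_sec'], a=2.2, b=0.6)
disfluency = features['nonalpha_per_sec'] + 8.0 * features['filler_rate']  # third term collapsed in the diff
L3 = logistic(-disfluency, a=0.0, b=0.8)
L4 = features['pause_regularity']
L5 = 1.0 - features['speech_rate_variability']
L6 = 1.0 - features['sentence_length_variance']
L7 = logistic(-features['self_correction_count'], a=0.0, b=1.5)

score = (0.15 * L1 + 0.15 * L2 + 0.15 * L3 +
         0.20 * L4 + 0.15 * L5 + 0.10 * L6 + 0.10 * L7)
print(f"kopparapu score: {score:.2f}")  # about 0.64 here; nearer 1 leans 'read', nearer 0 'spontaneous'

With its 0.20 weight, the pause-regularity channel moves the score more than any other single cue, matching the old comment about pause regularity and rate consistency being key read markers.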
text_analyzer.py
CHANGED
Old version (lines removed in this commit are marked "-"):

@@ -1,18 +1,7 @@
- import re
- import requests
- from typing import Dict, List, Tuple, Optional
- import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSequenceClassification,
-     RobertaTokenizer,
-     RobertaForSequenceClassification
- )
- import numpy as np
- from collections import Counter
  import warnings
  warnings.filterwarnings("ignore")

  try:
      from plagiarism_detection import ai_plagiarism_detection
      DESKLIB_AVAILABLE = True

@@ -21,12 +10,12 @@ except ImportError:
      print("Warning: plagiarism_detection module not found. Using fallback AI detection.")


-
-
  class AITextDetector:
-     def __init__(self, device
          self.threshold = threshold

          if not DESKLIB_AVAILABLE:
              print("Warning: plagiarism_detection module not found. AI detection will not be available.")
              print("Ensure plagiarism_detection.py is in the same directory.")

@@ -35,10 +24,11 @@ class AITextDetector:
              print(f"Using Desklib AI text detector (threshold: {self.threshold})")
              self.available = True

-

          if not self.available:
-             # Return neutral result if Desklib not available
              return {
                  'ai_generated': False,
                  'confidence': 0.5,

@@ -47,7 +37,7 @@ class AITextDetector:
                  'model_used': 'N/A (module not found)'
              }

-         #
          try:
              probability, ai_detected = ai_plagiarism_detection(
                  text,

@@ -63,17 +53,17 @@ class AITextDetector:
                  'model_used': 'Desklib AI Detector v1.01'
              }
          except Exception as e:
-             print(f"
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  'indicators': [],
-                 'interpretation':
                  'model_used': 'Error'
              }

-
-     def _identify_ai_indicators(self, probability
          indicators = []

          if probability > 0.9:

@@ -85,7 +75,8 @@ class AITextDetector:

          return indicators

-
          interpretation = f"**AI-Generated Text Detection:**\n\n"
          interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
          interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"

@@ -93,21 +84,23 @@ class AITextDetector:
          return interpretation


  class TextAuthenticityAnalyzer:

-     def __init__(self, device
-
          self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)

-
-
          ai_results = self.ai_detector.detect_ai_text(text)

-         #
          ai_penalty = ai_results['confidence']
          authenticity_score = 1.0 - ai_penalty

-         #
          if authenticity_score < 0.3:
              overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
              risk_level = "high"

@@ -129,9 +122,8 @@ class TextAuthenticityAnalyzer:
          }


  if __name__ == "__main__":
-     # Example usage
      analyzer = TextAuthenticityAnalyzer()
      print("Text authenticity analyzer initialized.")
      print("Components: Plagiarism Detector + AI Text Detector")
-
New version (lines added in this commit are marked "+"; unchanged runs are collapsed as "..."):

  import warnings
  warnings.filterwarnings("ignore")

+ # try to import the desklib AI detector
  try:
      from plagiarism_detection import ai_plagiarism_detection
      DESKLIB_AVAILABLE = True
  ...
      print("Warning: plagiarism_detection module not found. Using fallback AI detection.")


+ # class for detecting AI-generated text
  class AITextDetector:
+     def __init__(self, device=None, threshold=0.78):
          self.threshold = threshold

+         # check if desklib model is available
          if not DESKLIB_AVAILABLE:
              print("Warning: plagiarism_detection module not found. AI detection will not be available.")
              print("Ensure plagiarism_detection.py is in the same directory.")
          ...
              print(f"Using Desklib AI text detector (threshold: {self.threshold})")
              self.available = True

+     # main detection function
+     def detect_ai_text(self, text):

+         # return neutral result if detector not available
          if not self.available:
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  ...
                  'model_used': 'N/A (module not found)'
              }

+         # run detection using desklib model
          try:
              probability, ai_detected = ai_plagiarism_detection(
                  text,
                  ...
                  'model_used': 'Desklib AI Detector v1.01'
              }
          except Exception as e:
+             print(f"Something went wrong with AI detection: {e}")
              return {
                  'ai_generated': False,
                  'confidence': 0.5,
                  'indicators': [],
+                 'interpretation': "Could not run AI detection",
                  'model_used': 'Error'
              }

+     # identify specific indicators based on probability
+     def _identify_ai_indicators(self, probability):
          indicators = []

          if probability > 0.9:
              ...

          return indicators

+     # generate interpretation text
+     def _interpret_ai_detection(self, score):
          interpretation = f"**AI-Generated Text Detection:**\n\n"
          interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
          interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"
          ...
          return interpretation


+ # main analyzer class that combines all text analysis
  class TextAuthenticityAnalyzer:

+     def __init__(self, device=None, ai_threshold=0.78):
+         # initialize AI detector
          self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)

+     # analyze text for authenticity
+     def analyze(self, text):
+         # run AI detection
          ai_results = self.ai_detector.detect_ai_text(text)

+         # calculate authenticity score (inverse of AI probability)
          ai_penalty = ai_results['confidence']
          authenticity_score = 1.0 - ai_penalty

+         # determine risk level based on authenticity
          if authenticity_score < 0.3:
              overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
              risk_level = "high"
          ...
          }


+ # test code - runs when script is executed directly
  if __name__ == "__main__":
      analyzer = TextAuthenticityAnalyzer()
      print("Text authenticity analyzer initialized.")
      print("Components: Plagiarism Detector + AI Text Detector")
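The authenticity score is simply one minus the detector's AI probability, and only the 0.3 cutoff for the "high" bucket is visible above (the remaining buckets are collapsed in the diff). A minimal numeric sketch with a made-up detector output:

# illustration only: ai_confidence is a hypothetical detector output
ai_confidence = 0.85                       # probability the text is AI-generated
authenticity_score = 1.0 - ai_confidence   # -> 0.15

if authenticity_score < 0.3:               # the one cutoff visible in the diff
    risk_level = "high"
else:
    risk_level = "lower"                   # remaining buckets are collapsed above

print(f"authenticity={authenticity_score:.2f}, risk={risk_level}")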