Ranam Hamoud committed
Commit 4ec806c · 1 Parent(s): 1089eaf

Add audio authenticity detection app with all components

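For orientation, a minimal end-to-end usage sketch of the components added in this commit (a sketch only: it assumes the repository root is on the Python path, and `sample.wav` is a placeholder for a real recording):

    from pipeline import AuthenticityDetectionPipeline

    # Three-stage pipeline: CNN audio classifier + Whisper ASR + text authenticity analysis
    pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")

    # Run all stages on one file and read off the combined verdict
    results = pipeline.analyze_audio("sample.wav")
    final = results["final_assessment"]
    print(final["verdict"], f"{final['composite_authenticity_score'] * 100:.0f}%")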
app.py ADDED
@@ -0,0 +1,420 @@
+ import traceback
+ 
+ import gradio as gr
+ 
+ from pipeline import AuthenticityDetectionPipeline
+ 
+ # Build the pipeline once at import time; the UI degrades gracefully if this fails.
+ try:
+     pipeline = AuthenticityDetectionPipeline(whisper_model_size="base")
+     pipeline_ready = True
+ except Exception:
+     pipeline_ready = False
+ 
+ 
+ def analyze_audio_file(audio_file):
+     # The interface wires five Markdown outputs, so every branch returns a 5-tuple.
+     if not pipeline_ready:
+         return (
+             "Error: Pipeline not initialized. Please check the installation.",
+             "", "", "", ""
+         )
+ 
+     if audio_file is None:
+         return (
+             "Please upload an audio file.",
+             "", "", "", ""
+         )
+ 
+     try:
+         language_code = None  # let Whisper auto-detect the language
+         results = pipeline.analyze_audio(audio_file, language=language_code)
+ 
+         audio_class = results['audio_classification']
+         asr = results['speech_recognition']
+         text_auth = results['text_authenticity']
+         final = results['final_assessment']
+ 
+         verdict_color = {
+             "AUTHENTIC": "#10b981",
+             "LIKELY AUTHENTIC": "#3b82f6",
+             "QUESTIONABLE": "#f59e0b",
+             "LIKELY INAUTHENTIC": "#ef4444"
+         }
+ 
+         color = verdict_color.get(final['verdict'], '#6b7280')
+ 
+         overall_status = f"""
+         <div style='background: white; border: 2px solid {color}; padding: 25px; border-radius: 16px; margin: 10px 0;'>
+             <h2 style='color: {color}; margin: 0 0 15px 0; font-size: 24px; font-weight: 700;'>
+                 {final['verdict']}
+             </h2>
+             <div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin: 15px 0;'>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: {color};'>{final['composite_authenticity_score']*100:.0f}%</div>
+                     <div style='color: #666; margin-top: 5px;'>Authenticity Score</div>
+                 </div>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: {color};'>{final['risk_level'].upper()}</div>
+                     <div style='color: #666; margin-top: 5px;'>Risk Level</div>
+                 </div>
+                 <div style='text-align: center; padding: 15px; background: white; border-radius: 10px;'>
+                     <div style='font-size: 2em; font-weight: bold; color: #667eea;'>{results['processing_time']:.1f}s</div>
+                     <div style='color: #666; margin-top: 5px;'>Processing Time</div>
+                 </div>
+             </div>
+             <div style='background: white; padding: 15px; border-radius: 10px; margin-top: 15px;'>
+                 <em style='color: #555;'>{final['recommendation']}</em>
+             </div>
+         </div>
+         """
+ 
+         acoustic_output = audio_class['interpretation']
+ 
+         transcription_output = "### Speech Transcription\n\n"
+         transcription_output += "| Metric | Value |\n"
+         transcription_output += "|--------|-------|\n"
+         transcription_output += f"| **Language** | {asr['language'].upper()} |\n"
+         transcription_output += f"| **Duration** | {asr['duration']:.1f} seconds |\n"
+         transcription_output += f"| **Word Count** | {asr['word_count']} words |\n"
+         transcription_output += f"| **Speech Rate** | {asr['speech_rate']:.1f} words/min |\n\n"
+ 
+         if asr['speech_rate'] > 160:
+             transcription_output += "**Fast speech rate** - Above average speaking speed\n\n"
+         elif asr['speech_rate'] < 120:
+             transcription_output += "**Slow speech rate** - Below average speaking speed\n\n"
+         else:
+             transcription_output += "**Normal speech rate** - Average conversational pace\n\n"
+ 
+         transcription_output += "---\n\n"
+         transcription_output += "#### Full Transcription\n\n"
+         transcription_output += f"> {asr['transcription']}"
+ 
+         # Initialize up front so the appends below are safe even if the
+         # Kopparapu features are missing from the ASR results.
+         speech_patterns = ""
+ 
+         if 'kopparapu_score' in asr:
+             classification = asr['kopparapu_classification'].upper()
+             confidence = asr['kopparapu_score'] if asr['kopparapu_score'] >= 0.5 else (1 - asr['kopparapu_score'])
+ 
+             speech_patterns = f"### **Classification: {classification} SPEECH**\n\n"
+             speech_patterns += f"**Score:** {asr['kopparapu_score']:.3f} (0=spontaneous, 1=read)\n"
+             speech_patterns += f"**Confidence:** {confidence*100:.1f}%\n\n"
+ 
+             speech_patterns += "---\n\n"
+             speech_patterns += "#### Linguistic Metrics\n\n"
+             kf = asr['kopparapu_features']
+ 
+             speech_patterns += "| Feature | Value | Interpretation |\n"
+             speech_patterns += "|---------|-------|----------------|\n"
+             speech_patterns += f"| **Characters/Word** | {kf['chars_per_word']:.2f} | "
+             if kf['chars_per_word'] > 5.5:
+                 speech_patterns += "Complex vocabulary |\n"
+             elif kf['chars_per_word'] < 4.5:
+                 speech_patterns += "Simple vocabulary |\n"
+             else:
+                 speech_patterns += "Average complexity |\n"
+ 
+             speech_patterns += f"| **Words/Second** | {kf['words_per_sec']:.2f} | "
+             if kf['words_per_sec'] > 3:
+                 speech_patterns += "Fast pacing |\n"
+             elif kf['words_per_sec'] < 2:
+                 speech_patterns += "Slow pacing |\n"
+             else:
+                 speech_patterns += "Normal pacing |\n"
+ 
+             speech_patterns += f"| **Non-alpha chars/sec** | {kf['nonalpha_per_sec']:.2f} | "
+             if kf['nonalpha_per_sec'] > 2.5:
+                 speech_patterns += "High (disfluent) |\n"
+             elif kf['nonalpha_per_sec'] < 1.5:
+                 speech_patterns += "Low (fluent) |\n"
+             else:
+                 speech_patterns += "Moderate |\n"
+ 
+             speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
+             if kf['filler_rate'] > 0.05:
+                 speech_patterns += "High (spontaneous) |\n"
+             elif kf['filler_rate'] < 0.02:
+                 speech_patterns += "Low (scripted) |\n"
+             else:
+                 speech_patterns += "Moderate |\n"
+ 
+             speech_patterns += f"| **Repetitions** | {kf['repetition_count']} | "
+             if kf['repetition_count'] > 3:
+                 speech_patterns += "Multiple (thinking aloud) |\n"
+             elif kf['repetition_count'] == 0:
+                 speech_patterns += "None (prepared) |\n"
+             else:
+                 speech_patterns += "Few |\n"
+ 
+             speech_patterns += f"| **Alpha Ratio** | {kf['alpha_ratio']:.2f} | "
+             if kf['alpha_ratio'] > 0.85:
+                 speech_patterns += "Clean text |\n"
+             else:
+                 speech_patterns += "With artifacts |\n"
+ 
+             speech_patterns += "\n"
+ 
+         speech_patterns += "---\n\n"
+         speech_patterns += "#### Filler Words & Disfluencies\n\n"
+         filler_ratio = asr['filler_words']['ratio']
+         speech_patterns += f"**Count:** {asr['filler_words']['count']} filler words\n"
+         speech_patterns += f"**Ratio:** {filler_ratio*100:.2f}% of speech\n\n"
+ 
+         if asr['filler_words']['details']:
+             speech_patterns += "**Found:** " + ', '.join([f"*{k}* ({v})" for k, v in asr['filler_words']['details'].items()]) + "\n\n"
+ 
+         if filler_ratio > 0.05:
+             speech_patterns += "**High filler usage** - Strong indicator of spontaneous, unscripted speech\n\n"
+         elif filler_ratio < 0.02:
+             speech_patterns += "**Low filler usage** - May indicate reading or highly rehearsed speech\n\n"
+         else:
+             speech_patterns += "**Moderate filler usage** - Normal conversational pattern\n\n"
+ 
+         speech_patterns += "---\n\n"
+         speech_patterns += "#### Pause Patterns\n\n"
+         pause_var = asr['pause_patterns']['pause_variability']
+ 
+         speech_patterns += f"**Total Pauses:** {asr['pause_patterns']['num_pauses']}\n"
+         speech_patterns += f"**Average Duration:** {asr['pause_patterns']['avg_pause']:.2f}s\n"
+         speech_patterns += f"**Longest Pause:** {asr['pause_patterns']['max_pause']:.2f}s\n"
+         speech_patterns += f"**Variability:** {pause_var:.2f}\n\n"
+ 
+         if pause_var < 0.3:
+             speech_patterns += "**Regular pauses** - Consistent pattern suggests reading at punctuation marks\n\n"
+         elif pause_var > 0.6:
+             speech_patterns += "**Irregular pauses** - Natural thinking breaks indicate spontaneous speech\n\n"
+         else:
+             speech_patterns += "**Moderate variability** - Mixed pattern\n\n"
+ 
+         is_ai = text_auth['ai_detection']['ai_generated']
+         ai_prob = text_auth['ai_detection']['confidence']
+ 
+         if is_ai:
+             ai_output = "### **AI-GENERATED LIKELY**\n\n"
+         else:
+             ai_output = "### **HUMAN-WRITTEN LIKELY**\n\n"
+ 
+         ai_output += "**Confidence:**\n\n"
+         bar_length = 30
+         ai_bars = int(ai_prob * bar_length)
+         human_bars = bar_length - ai_bars
+         ai_output += f"```\nAI: [{'█' * ai_bars}{'░' * human_bars}] {ai_prob*100:.0f}%\n"
+         ai_output += f"Human: [{'█' * human_bars}{'░' * ai_bars}] {(1-ai_prob)*100:.0f}%\n```\n\n"
+ 
+         ai_output += "---\n\n"
+         ai_output += "#### Interpretation\n\n"
+         ai_output += text_auth['ai_detection'].get('interpretation') or "No interpretation available."
+ 
+         return (
+             overall_status,
+             acoustic_output,
+             transcription_output,
+             speech_patterns,
+             ai_output,
+         )
+ 
+     except Exception as e:
+         error_msg = f"Error during analysis:\n\n{str(e)}\n\n{traceback.format_exc()}"
+         return (error_msg, "", "", "", "")
+ 
+ 
+ def create_interface():
+     """Create and configure the Gradio interface."""
+ 
+     custom_css = """
+     @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
+ 
+     .gradio-container {
+         font-family: 'IBM Plex Sans', sans-serif !important;
+         background: #f9fafb !important;
+     }
+     .contain {
+         max-width: 1280px;
+         margin: 0 auto;
+         background: white;
+         border-radius: 16px;
+         box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+         padding: 24px;
+     }
+     .tab-nav button {
+         font-family: 'IBM Plex Sans', sans-serif;
+         font-size: 14px;
+         font-weight: 500;
+         padding: 10px 16px;
+         border-radius: 8px 8px 0 0;
+         transition: all 0.2s;
+     }
+     .tab-nav button.selected {
+         background: #2563eb;
+         color: white;
+         font-weight: 600;
+     }
+     button.primary, .primary {
+         background: #2563eb !important;
+         color: white !important;
+         border: none !important;
+         font-size: 16px !important;
+         font-weight: 600 !important;
+         padding: 12px 24px !important;
+         border-radius: 8px !important;
+         transition: all 0.2s !important;
+     }
+     button.primary:hover, .primary:hover {
+         background: #1d4ed8 !important;
+     }
+     .markdown-text {
+         font-family: 'IBM Plex Sans', sans-serif;
+         line-height: 1.7;
+     }
+     h1, h2, h3, h4 {
+         font-family: 'IBM Plex Sans', sans-serif;
+         font-weight: 600;
+     }
+     """
+ 
+     with gr.Blocks(css=custom_css, title="Authenticity Detection System") as demo:
+ 
+         gr.HTML("""
+         <header style='background: white; border-bottom: 1px solid #e5e7eb; margin-bottom: 32px;'>
+             <div style='padding: 16px 0;'>
+                 <div style='display: flex; align-items: center; gap: 12px;'>
+                     <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="32" height="32">
+                         <defs>
+                             <linearGradient id="g" x1="0" y1="0" x2="64" y2="0" gradientUnits="userSpaceOnUse">
+                                 <stop offset="0" stop-color="#1d4ed8" />
+                                 <stop offset="1" stop-color="#0ea5e9" />
+                             </linearGradient>
+                         </defs>
+                         <rect x="0" y="0" width="64" height="64" rx="12" fill="#ffffff"/>
+                         <path d="M4 32 C 10 18, 18 46, 24 32 S 36 18, 40 32 52 46, 60 32"
+                               fill="none" stroke="url(#g)" stroke-width="4" stroke-linecap="round" stroke-linejoin="round"/>
+                     </svg>
+                     <div>
+                         <p style='margin: 0; font-size: 11px; text-transform: uppercase; letter-spacing: 1.5px; color: #6b7280; font-weight: 500;'>
+                             LEIDEN UNIVERSITY · LIACS
+                         </p>
+                         <h1 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>
+                             Audio Processing & Indexing Project
+                         </h1>
+                     </div>
+                 </div>
+             </div>
+         </header>
+ 
+         <section style='background: linear-gradient(to bottom, white, #f9fafb); margin-bottom: 40px;'>
+             <div style='padding: 32px 0;'>
+                 <h2 style='font-size: 32px; font-weight: 700; line-height: 1.2; color: #111827; margin: 0 0 16px 0;'>
+                     Detecting AI-Assisted Responses in Online Settings
+                 </h2>
+                 <p style='font-size: 18px; color: #374151; margin: 0 0 24px 0;'>
+                 </p>
+                 <div style='display: flex; flex-wrap: wrap; gap: 12px;'>
+                     <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #eff6ff; color: #1e40af; border-radius: 8px; font-size: 14px; font-weight: 500;'>
+                         Multi-Modal Analysis
+                     </span>
+                     <span style='display: inline-flex; align-items: center; padding: 8px 16px; background: #fef3c7; color: #92400e; border-radius: 8px; font-size: 14px; font-weight: 500;'>
+                         Acoustic + Linguistic
+                     </span>
+                 </div>
+             </div>
+         </section>
+         """)
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
+                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Audio Input</h3>
+                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>Upload or record your audio file</p>
+                 </div>
+                 """)
+ 
+                 audio_input = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label="Audio File",
+                     show_label=False
+                 )
+ 
+                 analyze_btn = gr.Button(
+                     "Analyze Audio",
+                     variant="primary",
+                     size="lg"
+                 )
+ 
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
+                     <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
+                     <ul style='margin: 0; padding-left: 20px; font-size: 13px; color: #6b7280; line-height: 1.8;'>
+                         <li><strong>Formats:</strong> WAV, MP3, M4A, FLAC, OGG</li>
+                         <li><strong>Duration:</strong> 30 sec - 5 min</li>
+                     </ul>
+                 </div>
+ 
+                 <div style='background: #fef3c7; border: 1px solid #fbbf24; padding: 16px; border-radius: 12px; margin-top: 16px;'>
+                     <div style='font-size: 12px; color: #92400e; line-height: 1.6;'>
+                         <strong>Note:</strong> This tool provides probabilistic assessments.
+                         Use it as one factor in evaluation.
+                     </div>
+                 </div>
+                 """)
+ 
+             with gr.Column(scale=2):
+                 gr.HTML("""
+                 <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 20px;'>
+                     <h3 style='margin: 0; font-size: 18px; font-weight: 600; color: #111827;'>Analysis Results</h3>
+                     <p style='margin: 8px 0 0 0; font-size: 14px; color: #6b7280;'>You'll see results here</p>
+                 </div>
+                 """)
+ 
+                 overall_output = gr.Markdown()
+ 
+                 with gr.Tabs():
+                     with gr.Tab("Acoustic Features"):
+                         acoustic_output = gr.Markdown()
+ 
+                     with gr.Tab("Transcription"):
+                         transcription_output = gr.Markdown()
+ 
+                     with gr.Tab("Speech Patterns"):
+                         speech_output = gr.Markdown()
+ 
+                     with gr.Tab("AI Detection"):
+                         ai_output = gr.Markdown()
+ 
+         analyze_btn.click(
+             fn=analyze_audio_file,
+             inputs=[audio_input],
+             outputs=[
+                 overall_output,
+                 acoustic_output,
+                 transcription_output,
+                 speech_output,
+                 ai_output,
+             ]
+         )
+ 
+         gr.HTML("""
+         <footer style='border-top: 1px solid #e5e7eb; background: white; margin-top: 48px; padding: 32px 0;'>
+             <div style='text-align: center;'>
+                 <p style='margin: 0; font-size: 14px; color: #6b7280;'>
+                 </p>
+                 <p style='margin: 8px 0 0 0; font-size: 13px; color: #9ca3af;'>
+                 </p>
+             </div>
+         </footer>
+         """)
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
audio-wave.svg ADDED
audio_classifier.py ADDED
@@ -0,0 +1,361 @@
+ import os
+ from typing import Any, Dict
+ 
+ import librosa
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ 
+ class BasicBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, stride=1, downsample=None):
+         super(BasicBlock, self).__init__()
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
+                                stride=stride, padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(out_channels)
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
+                                stride=1, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(out_channels)
+         self.downsample = downsample
+ 
+     def forward(self, x):
+         identity = x
+         out = F.relu(self.bn1(self.conv1(x)))
+         out = self.bn2(self.conv2(out))
+ 
+         if self.downsample is not None:
+             identity = self.downsample(x)
+ 
+         out += identity
+         out = F.relu(out)
+         return out
+ 
+ 
+ class SpeechStyleCNN(nn.Module):
+     def __init__(self, num_classes=2):
+         super(SpeechStyleCNN, self).__init__()
+ 
+         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+         self.bn1 = nn.BatchNorm2d(64)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ 
+         self.layer1 = self._make_layer(64, 64, 2, stride=1)
+         self.layer2 = self._make_layer(64, 128, 2, stride=2)
+         self.layer3 = self._make_layer(128, 256, 2, stride=2)
+         self.layer4 = self._make_layer(256, 512, 2, stride=2)
+ 
+         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         self.fc = nn.Linear(512, num_classes)
+ 
+     def _make_layer(self, in_channels, out_channels, blocks, stride=1):
+         downsample = None
+         if stride != 1 or in_channels != out_channels:
+             downsample = nn.Sequential(
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(out_channels)
+             )
+ 
+         layers = []
+         layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
+         for _ in range(1, blocks):
+             layers.append(BasicBlock(out_channels, out_channels))
+ 
+         return nn.Sequential(*layers)
+ 
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.relu(self.bn1(self.conv1(x)))
+         x = self.maxpool(x)
+ 
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+ 
+         x = self.avgpool(x)
+         x = torch.flatten(x, 1)
+         x = self.fc(x)
+ 
+         return x
+ 
+ 
+ class AudioClassifier:
+     AVAILABLE_MODELS = {
+         '3s_window': 'spectrogram_cnn_3s_window (1).pth',
+         # '4s_window': 'spectrogram_cnn_4s_window.pth',
+         # '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
+     }
+ 
+     @classmethod
+     def get_model_path(cls, model_name: str = '3s_window') -> str:
+         # Default is '3s_window': it is the only checkpoint currently
+         # enabled in AVAILABLE_MODELS.
+         if model_name not in cls.AVAILABLE_MODELS:
+             raise ValueError(f"Unknown model: {model_name}. Available: {list(cls.AVAILABLE_MODELS.keys())}")
+         return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
+ 
+     def __init__(self, model_path: str = None, device: str = None):
+         if device is None:
+             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         else:
+             self.device = torch.device(device)
+ 
+         self.model = SpeechStyleCNN().to(self.device)
+ 
+         if model_path is None:
+             model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_4s_window.pth')
+ 
+         try:
+             state_dict = torch.load(model_path, map_location=self.device)
+             self.model.load_state_dict(state_dict)
+             print(f"Successfully loaded model from: {model_path}")
+         except FileNotFoundError:
+             print(f"Warning: Model file not found at {model_path}. Using untrained model.")
+         except Exception as e:
+             print(f"Warning: Error loading model from {model_path}: {e}. Using untrained model.")
+ 
+         self.model.eval()
+ 
+         self.sample_rate = 16000
+         self.n_mels = 128
+         self.n_fft = 2048
+         self.hop_length = 512
+ 
+     def extract_mel_spectrogram(self, audio_path: str) -> np.ndarray:
+         audio, sr = librosa.load(audio_path, sr=self.sample_rate)
+ 
+         mel_spec = librosa.feature.melspectrogram(
+             y=audio,
+             sr=sr,
+             n_mels=self.n_mels,
+             n_fft=self.n_fft,
+             hop_length=self.hop_length
+         )
+ 
+         mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+ 
+         # Min-max normalize; the small epsilon guards against division by
+         # zero on silent (constant) input.
+         spec_range = mel_spec_db.max() - mel_spec_db.min()
+         mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (spec_range + 1e-8)
+         mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
+ 
+         return mel_spec_3ch
+ 
+     def extract_acoustic_features(self, audio_path: str) -> Dict[str, float]:
+         audio, sr = librosa.load(audio_path, sr=self.sample_rate)
+ 
+         features = {}
+ 
+         onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
+         tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
+         features['tempo'] = float(tempo)
+ 
+         pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
+         pitch_values = []
+         for t in range(pitches.shape[1]):
+             index = magnitudes[:, t].argmax()
+             pitch = pitches[index, t]
+             if pitch > 0:
+                 pitch_values.append(pitch)
+ 
+         if pitch_values:
+             features['pitch_mean'] = float(np.mean(pitch_values))
+             features['pitch_std'] = float(np.std(pitch_values))
+             features['pitch_range'] = float(np.max(pitch_values) - np.min(pitch_values))
+         else:
+             features['pitch_mean'] = 0.0
+             features['pitch_std'] = 0.0
+             features['pitch_range'] = 0.0
+ 
+         rms = librosa.feature.rms(y=audio)[0]
+         features['energy_mean'] = float(np.mean(rms))
+         features['energy_std'] = float(np.std(rms))
+ 
+         zcr = librosa.feature.zero_crossing_rate(audio)[0]
+         features['zcr_mean'] = float(np.mean(zcr))
+         features['zcr_std'] = float(np.std(zcr))
+ 
+         spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
+         features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
+         features['spectral_centroid_std'] = float(np.std(spectral_centroids))
+ 
+         return features
+ 
+     def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
+         individual_scores = {}
+ 
+         # Pitch variation score (0 = variable/spontaneous, 1 = monotone/read)
+         if features['pitch_mean'] > 0:
+             if features['pitch_std'] < 30:
+                 pitch_score = 0.9  # Very monotone -> read
+             elif features['pitch_std'] < 50:
+                 pitch_score = 0.7  # Somewhat monotone -> likely read
+             elif features['pitch_std'] < 70:
+                 pitch_score = 0.5  # Moderate variation
+             elif features['pitch_std'] < 90:
+                 pitch_score = 0.3  # Variable -> likely spontaneous
+             else:
+                 pitch_score = 0.1  # Very variable -> spontaneous
+         else:
+             pitch_score = 0.5  # Unknown
+ 
+         individual_scores['pitch_variation'] = {
+             'score': pitch_score,
+             'value': features['pitch_std'],
+             'interpretation': 'monotone (read)' if pitch_score > 0.6 else 'variable (spontaneous)' if pitch_score < 0.4 else 'moderate'
+         }
+ 
+         # Energy consistency score (0 = variable/spontaneous, 1 = consistent/read)
+         if features['energy_std'] < 0.015:
+             energy_score = 0.9  # Very consistent -> read
+         elif features['energy_std'] < 0.025:
+             energy_score = 0.6  # Somewhat consistent -> likely read
+         elif features['energy_std'] < 0.035:
+             energy_score = 0.4  # Moderate
+         else:
+             energy_score = 0.1  # Variable -> spontaneous
+ 
+         individual_scores['energy_consistency'] = {
+             'score': energy_score,
+             'value': features['energy_std'],
+             'interpretation': 'consistent (read)' if energy_score > 0.6 else 'variable (spontaneous)' if energy_score < 0.4 else 'moderate'
+         }
+ 
+         # Tempo score (0 = slow/thoughtful/spontaneous, 1 = fast/consistent/read)
+         if features['tempo'] > 140:
+             tempo_score = 0.8  # Very fast -> likely read
+         elif features['tempo'] > 110:
+             tempo_score = 0.6  # Fast -> possibly read
+         elif features['tempo'] > 80:
+             tempo_score = 0.4  # Normal conversational
+         else:
+             tempo_score = 0.2  # Slow -> thoughtful/spontaneous
+ 
+         individual_scores['tempo'] = {
+             'score': tempo_score,
+             'value': features['tempo'],
+             'interpretation': 'fast/steady (read)' if tempo_score > 0.6 else 'slow/varied (spontaneous)' if tempo_score < 0.4 else 'moderate'
+         }
+ 
+         # Spectral consistency (voice quality stability)
+         if features['spectral_centroid_std'] < 300:
+             spectral_score = 0.8  # Very stable -> read
+         elif features['spectral_centroid_std'] < 500:
+             spectral_score = 0.5  # Moderate
+         else:
+             spectral_score = 0.2  # Variable -> spontaneous
+ 
+         individual_scores['spectral_stability'] = {
+             'score': spectral_score,
+             'value': features['spectral_centroid_std'],
+             'interpretation': 'stable (read)' if spectral_score > 0.6 else 'variable (spontaneous)' if spectral_score < 0.4 else 'moderate'
+         }
+ 
+         weights = {
+             'pitch_variation': 0.35,
+             'energy_consistency': 0.30,
+             'tempo': 0.20,
+             'spectral_stability': 0.15
+         }
+ 
+         overall_score = (
+             pitch_score * weights['pitch_variation'] +
+             energy_score * weights['energy_consistency'] +
+             tempo_score * weights['tempo'] +
+             spectral_score * weights['spectral_stability']
+         )
+ 
+         if overall_score > 0.65:
+             classification = 'read'
+             confidence = 0.5 + (overall_score - 0.5)  # Scale to confidence
+         elif overall_score < 0.35:
+             classification = 'spontaneous'
+             confidence = 0.5 + (0.5 - overall_score)  # Scale to confidence
+         else:
+             # Borderline case - go with the majority
+             classification = 'read' if overall_score >= 0.5 else 'spontaneous'
+             confidence = 0.5 + abs(overall_score - 0.5) * 0.5
+ 
+         return {
+             'classification': classification,
+             'confidence': confidence,
+             'overall_score': overall_score,
+             'individual_scores': individual_scores
+         }
+ 
+     def classify(self, audio_path: str) -> Dict[str, Any]:
+         mel_spec = self.extract_mel_spectrogram(audio_path)
+ 
+         mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
+ 
+         with torch.no_grad():
+             logits = self.model(mel_tensor)
+             probabilities = F.softmax(logits, dim=1)
+             predicted_class = torch.argmax(probabilities, dim=1).item()
+             cnn_confidence = probabilities[0, predicted_class].item()
+ 
+         acoustic_features = self.extract_acoustic_features(audio_path)
+ 
+         prosody_scores = self._compute_prosody_scores(acoustic_features)
+         prosody_classification = prosody_scores['classification']
+         prosody_confidence = prosody_scores['confidence']
+ 
+         cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
+ 
+         # Fuse the two branches: agreement boosts confidence; on disagreement,
+         # fall back to whichever branch is more confident.
+         if cnn_class_name == prosody_classification:
+             final_confidence = min(0.95, (cnn_confidence * 0.7 + prosody_confidence * 0.3))
+             final_classification = cnn_class_name
+         else:
+             final_confidence = 0.5 + abs(cnn_confidence - prosody_confidence) * 0.3
+             if cnn_confidence > prosody_confidence:
+                 final_classification = cnn_class_name
+             else:
+                 final_classification = prosody_classification
+ 
+         return {
+             'classification': final_classification,
+             'confidence': float(final_confidence),
+             'cnn_classification': cnn_class_name,
+             'cnn_confidence': float(cnn_confidence),
+             'prosody_classification': prosody_classification,
+             'prosody_confidence': float(prosody_confidence),
+             'prosody_scores': prosody_scores['individual_scores'],
+             'acoustic_features': acoustic_features,
+             'interpretation': self._interpret_classification(
+                 final_classification, final_confidence,
+                 cnn_class_name, cnn_confidence,
+                 prosody_classification, prosody_confidence,
+                 prosody_scores, acoustic_features
+             )
+         }
+ 
+     def _interpret_classification(
+         self,
+         final_class: str,
+         final_confidence: float,
+         cnn_class: str,
+         cnn_confidence: float,
+         prosody_class: str,
+         prosody_confidence: float,
+         prosody_scores: Dict,
+         features: Dict
+     ) -> str:
+         interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
+         interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
+ 
+         if final_class == 'read':
+             interpretation += "**Description:** The speech exhibits characteristics of read or scripted content. "
+             interpretation += "The audio shows consistent prosodic patterns typical of someone reading from prepared text, "
+             interpretation += "with steady pacing, uniform intonation, and regular energy levels.\n\n"
+         else:
+             interpretation += "**Description:** The speech exhibits characteristics of spontaneous speaking. "
+             interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
+             interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
+ 
+         return interpretation
+ 
+ 
+ if __name__ == "__main__":
+     classifier = AudioClassifier()
+     print("\nAvailable pre-trained models:")
+     for name, filename in AudioClassifier.AVAILABLE_MODELS.items():
+         print(f" - {name}: {filename}")
+ 
+     print("\nModel architecture:")
+     print(classifier.model)
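A short standalone sketch of the classifier above (assumptions: a checkpoint sits next to the module, otherwise the class falls back to an untrained model with a warning; `sample.wav` is a placeholder path):

    from audio_classifier import AudioClassifier

    clf = AudioClassifier()  # loads spectrogram_cnn_4s_window.pth by default
    result = clf.classify("sample.wav")

    # Fused CNN + prosody verdict, plus the per-branch outputs
    print(result["classification"], f"{result['confidence'] * 100:.1f}%")
    print("CNN:", result["cnn_classification"], "| Prosody:", result["prosody_classification"])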
pipeline.py ADDED
@@ -0,0 +1,189 @@
+ """
+ Multimodal Authenticity Detection Pipeline
+ Integrates CNN audio classification, Whisper ASR, and text authenticity analysis.
+ """
+ 
+ import time
+ from typing import Dict, Optional
+ 
+ from audio_classifier import AudioClassifier
+ from speech_recognizer import SpeechRecognizer
+ from text_analyzer import TextAuthenticityAnalyzer
+ 
+ 
+ class AuthenticityDetectionPipeline:
+     def __init__(
+         self,
+         audio_model_path: Optional[str] = None,
+         whisper_model_size: str = "base",
+         device: Optional[str] = None,
+         ai_detection_threshold: float = 0.78
+     ):
+         print("\n" + "=" * 60)
+         print("Initializing Multimodal Authenticity Detection Pipeline")
+         print("=" * 60 + "\n")
+ 
+         # Initialize components
+         print("📊 Loading Audio Classifier (CNN)...")
+         self.audio_classifier = AudioClassifier(
+             model_path=audio_model_path,
+             device=device
+         )
+ 
+         print("\n🎤 Loading Speech Recognizer (Whisper)...")
+         self.speech_recognizer = SpeechRecognizer(
+             model_size=whisper_model_size,
+             device=device
+         )
+ 
+         print("\n📝 Loading Text Authenticity Analyzer...")
+         self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
+ 
+         print("\n✅ Pipeline initialization complete!")
+         print("=" * 60 + "\n")
+ 
+     def analyze_audio(self, audio_path: str, language: Optional[str] = None) -> Dict:
+         print("\n" + "=" * 60)
+         print("MULTIMODAL AUTHENTICITY ANALYSIS")
+         print("=" * 60 + "\n")
+ 
+         start_time = time.time()
+ 
+         # Stage 1: Audio classification (CNN-based read vs. spontaneous detection)
+         print("Stage 1: CNN Audio Classification...")
+         print("-" * 40)
+         audio_results = self.audio_classifier.classify(audio_path)
+         print("✓ CNN classification complete")
+         print(f"  Classification: {audio_results['classification'].upper()}")
+         print(f"  Confidence: {audio_results['confidence']*100:.1f}%")
+ 
+         # Stage 2: Speech analysis (Whisper for linguistic analysis)
+         print("\nStage 2: Speech Analysis (Whisper)...")
+         print("-" * 40)
+         asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
+         print("✓ Speech analysis complete")
+         print(f"  Language: {asr_results['language']}")
+         print(f"  Word count: {asr_results['word_count']}")
+         print(f"  Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
+ 
+         # Stage 3: Text authenticity analysis
+         print("\nStage 3: Analyzing text authenticity...")
+         print("-" * 40)
+         text_results = self.text_analyzer.analyze(asr_results['transcription'])
+         print("✓ Text analysis complete")
+         print(f"  Authenticity score: {text_results['authenticity_score']*100:.1f}%")
+         print(f"  Risk level: {text_results['risk_level'].upper()}")
+ 
+         # Stage 4: Combined assessment
+         print("\nStage 4: Generating final assessment...")
+         print("-" * 40)
+         final_assessment = self._generate_final_assessment(
+             audio_results,
+             asr_results,
+             text_results
+         )
+ 
+         elapsed_time = time.time() - start_time
+ 
+         print(f"✓ Analysis complete in {elapsed_time:.2f} seconds")
+         print("\n" + "=" * 60 + "\n")
+ 
+         return {
+             'audio_classification': audio_results,
+             'speech_recognition': asr_results,
+             'text_authenticity': text_results,
+             'final_assessment': final_assessment,
+             'processing_time': elapsed_time
+         }
+ 
+     def _generate_final_assessment(
+         self,
+         audio_results: Dict,
+         asr_results: Dict,
+         text_results: Dict
+     ) -> Dict:
+         # Map each modality onto a 0-1 authenticity scale (1 = spontaneous/genuine).
+         if audio_results['classification'] == 'spontaneous':
+             audio_score = audio_results['confidence']
+         else:  # read
+             audio_score = 1.0 - audio_results['confidence']
+ 
+         # kopparapu_score runs from 0 (spontaneous) to 1 (read), so its
+         # complement is the authenticity contribution for either class.
+         speech_pattern_score = 1.0 - asr_results['kopparapu_score']
+ 
+         text_auth_score = text_results['authenticity_score']
+ 
+         composite_score = (
+             audio_score * 0.30 +           # CNN acoustic analysis
+             speech_pattern_score * 0.30 +  # Speech patterns (Kopparapu)
+             text_auth_score * 0.40         # Text authenticity (AI detection)
+         )
+ 
+         if composite_score >= 0.7:
+             verdict = "AUTHENTIC"
+             risk = "low"
+             recommendation = "Response appears genuine with strong authenticity indicators."
+         elif composite_score >= 0.5:
+             verdict = "LIKELY AUTHENTIC"
+             risk = "moderate"
+             recommendation = "Response shows mostly authentic characteristics but has some concerns."
+         elif composite_score >= 0.3:
+             verdict = "QUESTIONABLE"
+             risk = "high"
+             recommendation = "Response has multiple authenticity concerns. Further investigation recommended."
+         else:
+             verdict = "LIKELY INAUTHENTIC"
+             risk = "critical"
+             recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
+ 
+         concerns = []
+         strengths = []
+ 
+         if audio_results['classification'] == 'read':
+             concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
+         else:
+             strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
+ 
+         if asr_results['kopparapu_classification'] == 'read':
+             concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
+         else:
+             strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
+ 
+         filler_ratio = asr_results['filler_words']['ratio']
+         if filler_ratio < 0.02:
+             concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
+         else:
+             strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
+ 
+         if asr_results['pause_patterns']['pause_variability'] < 0.3:
+             concerns.append("Regular pause patterns suggest reading at punctuation")
+         else:
+             strengths.append("Irregular pause patterns indicate spontaneous thinking")
+ 
+         if text_results['ai_detection']['ai_generated']:
+             concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
+ 
+         if text_results['authenticity_score'] > 0.7:
+             strengths.append("Text shows strong originality indicators")
+ 
+         return {
+             'verdict': verdict,
+             'risk_level': risk,
+             'composite_authenticity_score': float(composite_score),
+             'concerns': concerns,
+             'strengths': strengths,
+             'recommendation': recommendation,
+         }
+ 
+ 
+ if __name__ == "__main__":
+     # Example usage
+     print("Initializing Authenticity Detection Pipeline...")
+     model_path = "spectrogram_cnn_3s_window (1).pth"  # checkpoint filename as committed to this repo
+     pipeline = AuthenticityDetectionPipeline(
+         audio_model_path=model_path,
+         whisper_model_size="base"
+     )
+     print("\nPipeline ready for audio analysis.")
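To make the 0.30/0.30/0.40 weighting in `_generate_final_assessment` concrete, a worked example (the numbers are illustrative only):

    # Suppose the CNN says "spontaneous" with confidence 0.80, the Kopparapu
    # score is 0.30 (0 = spontaneous, 1 = read), and the text analyzer
    # returns an authenticity score of 0.60.
    audio_score = 0.80                 # spontaneous -> confidence used directly
    speech_pattern_score = 1.0 - 0.30  # complement of the Kopparapu score
    text_auth_score = 0.60

    composite = audio_score * 0.30 + speech_pattern_score * 0.30 + text_auth_score * 0.40
    print(round(composite, 2))  # 0.69 -> in the 0.5-0.7 band: "LIKELY AUTHENTIC", moderate risk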
plagiarism_detection.py ADDED
@@ -0,0 +1,340 @@
+ import json
+ from pathlib import Path
+ 
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel
+ 
+ # nothing is random here, so no seed is set
+ 
+ 
+ # code adapted from https://huggingface.co/desklib/ai-text-detector-v1.01 and modified for this project
+ class DesklibAIDetectionModel(PreTrainedModel):
+     config_class = AutoConfig
+ 
+     def __init__(self, config):
+         # Initialize the PreTrainedModel
+         super().__init__(config)
+         # Initialize the base transformer model.
+         self.model = AutoModel.from_config(config)
+         # Define a classifier head.
+         self.classifier = nn.Linear(config.hidden_size, 1)
+         # Initialize weights (handled by PreTrainedModel)
+         self.init_weights()
+ 
+     def forward(self, input_ids, attention_mask=None, labels=None):
+         # Forward pass through the transformer
+         outputs = self.model(input_ids, attention_mask=attention_mask)
+         last_hidden_state = outputs[0]
+         # Mean pooling over the non-padded tokens
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+         sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
+         sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
+         pooled_output = sum_embeddings / sum_mask
+ 
+         # Classifier
+         logits = self.classifier(pooled_output)
+         loss = None
+         if labels is not None:
+             loss_fct = nn.BCEWithLogitsLoss()
+             loss = loss_fct(logits.view(-1), labels.float())
+ 
+         output = {"logits": logits}
+         if loss is not None:
+             output["loss"] = loss
+         return output
+ 
+ 
+ def predict_single_text(text, model, tokenizer, device, max_len=768, threshold=0.5):
+     encoded = tokenizer(
+         text,
+         padding='max_length',
+         truncation=True,
+         max_length=max_len,
+         return_tensors='pt'
+     )
+     input_ids = encoded['input_ids'].to(device)
+     attention_mask = encoded['attention_mask'].to(device)
+ 
+     model.eval()
+     with torch.no_grad():
+         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+         logits = outputs["logits"]
+         probability = torch.sigmoid(logits).item()
+ 
+     ai_detected = probability >= threshold
+     return probability, ai_detected
+ 
+ 
+ # own code to easily create text files and feed them to the model for predictions
+ def ai_plagiarism_detection(text, threshold=0.5, show_results=False):
+     """
+     Detect whether the given text is AI generated or human written.
+     Note: the model and tokenizer are loaded on every call; cache them for repeated use.
+     Args:
+         text (str): Input text to be classified.
+         threshold (float): Probability cutoff for the AI label.
+         show_results (bool): If True, prints the results.
+     Returns:
+         probability (float): Probability of being AI generated.
+         ai_detected (bool): True if AI generated, False if human written.
+     """
+     # Model and tokenizer directory
+     model_directory = "desklib/ai-text-detector-v1.01"
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_directory)
+     model = DesklibAIDetectionModel.from_pretrained(model_directory)
+     # Set up device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     # Predict
+     probability, ai_detected = predict_single_text(text, model, tokenizer, device, threshold=threshold)
+     # Optionally print the results
+     if show_results:
+         print(f"Probability of being AI generated: {probability:.4f}")
+         print(f"Predicted label: {'AI Generated' if ai_detected else 'Not AI Generated'}")
+     return probability, ai_detected
+ 
+ 
+ def make_textfile(file_path="text_folder/example.txt", content="This is an example text file.\nAnd this is the second line.\n"):
+     """
+     Create a text file with the given content.
+     Args:
+         file_path (str): Path to the text file to be created.
+         content (str): Content to write into the text file.
+     """
+     with open(file_path, "w") as f:
+         f.write(content)
+ 
+ 
+ def get_text_from_textfile(text_dir="text_folder"):
+     """
+     Read all text files from a directory and return a dictionary mapping filename to content.
+     Args:
+         text_dir (str): Directory containing text files.
+     Returns:
+         text_dict (dict): Dictionary with filename as key and file content as value.
+     """
+     text_dict = {}
+     text_file_list = list(Path(text_dir).glob("*.txt"))
+     for elem in text_file_list:
+         content = elem.read_text(encoding="utf-8")  # read file content
+         text_dict[elem.name] = content  # use filename as key
+     return text_dict
+ 
+ 
+ def classifying_plagiarism_using_textfiles(best_threshold=0.78):
+     """
+     Shows how the model can be used to detect AI-generated text in the files in the
+     text_folder folder. This is the form used in the pipeline.
+     """
+     # make sure the folder exists
+     Path("text_folder").mkdir(exist_ok=True)
+ 
+     # create example text files
+     make_textfile("text_folder/ai_text.txt", "AI detection refers to the process of identifying whether a given piece of content, such as text, images, or audio, has been generated by artificial intelligence. This is achieved using various machine learning techniques, including perplexity analysis, entropy measurements, linguistic pattern recognition, and neural network classifiers trained on human and AI-generated data. Advanced AI detection tools assess writing style, coherence, and statistical properties to determine the likelihood of AI involvement. These tools are widely used in academia, journalism, and content moderation to ensure originality, prevent misinformation, and maintain ethical standards. As AI-generated content becomes increasingly sophisticated, AI detection methods continue to evolve, integrating deep learning models and ensemble techniques for improved accuracy.")
+     make_textfile("text_folder/human_text.txt", "It is estimated that a major part of the content in the internet will be generated by AI / LLMs by 2025. This leads to a lot of misinformation and credibility related issues. That is why if is important to have accurate tools to identify if a content is AI generated or human written")
+     textfile_dict = get_text_from_textfile(text_dir="text_folder")  # dict of filename -> content; text_dir holds the files to classify
+ 
+     # get predictions for each text file
+     for textfile, text in textfile_dict.items():
+         print(f"Getting predictions for: {textfile}")
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=best_threshold, show_results=False)  # optimal threshold: 0.78
+         # print results
+         print(f"{textfile} Results:\n Probability of being AI generated: {probability:.4f}")
+         print(f" Predicted label: {'AI Generated' if ai_detected else 'Not AI Generated'}\n")
+ 
+ 
+ def get_texts_from_jsonfile(json_file_path, sample_size=100, ignore_warning=False):
+     """
+     Get text partitions from a JSON-lines file. Each partition is a text that can be
+     given as input to the ai_plagiarism_detection model.
+     Args:
+         json_file_path (str): Path of the json file.
+         sample_size (int): Determines how many texts are returned.
+         ignore_warning (bool): If True, do not raise when fewer texts than sample_size are found.
+     Returns:
+         text_list (list): The texts, in file order.
+     """
+     text_list = []
+     try:
+         with open(json_file_path, "r", encoding="utf-8") as f:
+             for i, line in enumerate(f):
+                 obj = json.loads(line)
+                 text_list.append(obj["text"])
+                 if i == sample_size - 1:
+                     break
+     except (OSError, json.JSONDecodeError) as exc:
+         raise ValueError(f"{json_file_path} does not exist or could not be parsed.") from exc
+     # raise if fewer texts were found than the sample size
+     if not ignore_warning:
+         if len(text_list) != sample_size:
+             raise ValueError(f"Warning: only {len(text_list)} texts found, fewer than sample size {sample_size}")
+ 
+     return text_list
+ 
+ 
+ def run_experiment_using_jsonfile(threshold=0.5):
+     """
+     Runs the experiment and saves the results in
+     ai_plagiarism_experiment/ai_plagiarism_detection_results.csv.
+     """
+     # Total sample size; two datasets (JSON files) are used, so sample_size//2 per dataset.
+     sample_size = 240
+     sample_size //= 2
+ 
+     # make sure the folders exist
+     Path("json_folder").mkdir(exist_ok=True)
+     Path("ai_plagiarism_experiment").mkdir(exist_ok=True)
+ 
+     # ------- GET TRUE NEGATIVE TEXTS (human thought and spoken) FROM JSON FILE -------
+     # load the JSON file with Whisper-transcribed text from the ML Commons dataset
+     text_list = get_texts_from_jsonfile("json_folder/ML_commons.json", sample_size)
+ 
+     # get predictions for each text
+     predictions = []
+     for i, text in enumerate(text_list):
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=threshold, show_results=False)
+         # save results
+         predictions.append({"ML_commons_text_index": i,
+                             "GPT_text_index": np.nan,
+                             "text_length": len(text),
+                             "topic": "unknown",
+                             "probability": probability,
+                             "ai_detected": ai_detected,
+                             "really_ai": False
+                             })
+     # convert to dataframe
+     df = pd.DataFrame(predictions)
+     print("-------- 50% of samples predicted of json experiment --------")
+ 
+     # ------- GET TRUE POSITIVE TEXTS (AI written) FROM JSON FILE -------
+     # load the JSON file with GPT-generated texts
+     text_list = get_texts_from_jsonfile("json_folder/gpt_generated.json", sample_size)
+ 
+     predictions = []
+     for i, text in enumerate(text_list):
+         # ---------- GET PREDICTIONS ----------
+         probability, ai_detected = ai_plagiarism_detection(text=text, threshold=threshold, show_results=False)
+ 
+         # save results; the generated texts are grouped by topic in blocks of 40
+         if i < 40:
+             topic = "astronomy"
+         elif i < 80:
+             topic = "quantum computing"
+         else:
+             topic = "daily life, personal growth, and everyday experiences"
+ 
+         predictions.append({"ML_commons_text_index": np.nan,
+                             "GPT_text_index": i,
+                             "text_length": len(text),
+                             "topic": topic,
+                             "probability": probability,
+                             "ai_detected": ai_detected,
+                             "really_ai": True
+                             })
+     # convert to dataframe and append
+     new_rows = pd.DataFrame(predictions)
+     df = pd.concat([df, new_rows], ignore_index=True)
+     print("------- 100% of samples predicted of json experiment --------")
+     # save to csv
+     df.to_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv", index=False)
+ 
+     # update metrics
+     get_metrics(threshold=threshold)
+ 
+ 
+ def get_metrics(df=None, threshold=0.5, save_to_csv=True):
+     """
+     Calculates the metrics and saves them in ai_plagiarism_experiment/res_metrics(t={threshold}).csv.
+     """
+     if df is None:
+         # read from csv
+         df = pd.read_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv")
+ 
+     # confusion-matrix counts; note that Whisper transcripts can make human
+     # speech look more AI-like, which inflates the false positives
+     fp = ((df["probability"] >= threshold) & (df["really_ai"] == False)).sum()  # false positives
+     tn = ((df["probability"] < threshold) & (df["really_ai"] == False)).sum()   # true negatives
+     tp = ((df["probability"] >= threshold) & (df["really_ai"] == True)).sum()   # true positives
+     fn = ((df["probability"] < threshold) & (df["really_ai"] == True)).sum()    # false negatives
+ 
+     recall = tp / (tp + fn) if (tp + fn) != 0 else 0
+     precision = tp / (tp + fp) if (tp + fp) != 0 else 0
+     accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0
+ 
+     # text-length statistics for both datasets
+     ML_commons_length_mean = df.loc[df["ML_commons_text_index"].notna(), "text_length"].mean()
+     ML_commons_length_std = df.loc[df["ML_commons_text_index"].notna(), "text_length"].std()
+     gpt_length_mean = df.loc[df["GPT_text_index"].notna(), "text_length"].mean()
+     gpt_length_std = df.loc[df["GPT_text_index"].notna(), "text_length"].std()
+ 
+     # collect the metrics in a dataframe
+     results = pd.DataFrame({
+         "Metric": ["TP", "TN", "FP", "FN", "Recall", "Precision", "Accuracy", "Total samples",
+                    "ML_commons_length_mean", "ML_commons_length_std", "gpt_length_mean", "gpt_length_std"],
+         "Value": [tp, tn, fp, fn, recall, precision, accuracy, len(df),
+                   ML_commons_length_mean, ML_commons_length_std, gpt_length_mean, gpt_length_std]
+     })
+     if save_to_csv:
+         results.to_csv(f"ai_plagiarism_experiment/res_metrics(t={threshold}).csv", index=False)
+     return results
+ 
+ 
+ def tune_threshold(metric="Accuracy"):
+     """Finds the threshold that maximises the given metric for the AI plagiarism detector, using the results CSV."""
+     df = pd.read_csv("ai_plagiarism_experiment/ai_plagiarism_detection_results.csv")
+     # search boundaries (named lo/hi to avoid shadowing the min/max builtins)
+     lo = 0.0
+     hi = 1.0
+     step = 0.01
+     # init
+     best_metric = 0
+     best_threshold = lo
+     metric_values = []
+     thresholds = []
+     for threshold in np.arange(lo, hi + step, step):
+         threshold = round(threshold, 2)
+         results = get_metrics(df, threshold, False)
+         opti_metric = results.loc[results["Metric"] == metric, "Value"].iloc[0]
+         # save for plotting
+         metric_values.append(opti_metric)
+         thresholds.append(threshold)
+         # update the best threshold
+         if opti_metric > best_metric:
+             best_metric = opti_metric
+             best_threshold = threshold
+ 
+     # plot the tuning curve
+     Path("ai_plagiarism_tuning_plots").mkdir(exist_ok=True)
+     plt.plot(thresholds, metric_values)
+     plt.xlabel("threshold")
+     plt.ylabel(metric)
+     plt.title(f"threshold vs {metric}")
+     plt.savefig(f"ai_plagiarism_tuning_plots/threshold_vs_{metric}.png")
+     plt.close()
+     return best_threshold
+ 
+ 
+ if __name__ == "__main__":
+     print("-------- Starting ai plagiarism experiment! --------\n")
+     # run the experiment using the json files, first with the default threshold
+     run_experiment_using_jsonfile(threshold=0.5)
+ 
+     # search for the threshold that maximises accuracy
+     metric = "Accuracy"
+     best_threshold_accuracy = tune_threshold(metric=metric)
+     print(f"Best threshold for {metric}: {best_threshold_accuracy}")
+     # search for the threshold that maximises precision
+     metric = "Precision"
+     best_threshold_precision = tune_threshold(metric=metric)
+     print(f"Best threshold for {metric}: {best_threshold_precision}")
+ 
+     # rerun the experiment with the accuracy-optimal threshold
+     run_experiment_using_jsonfile(threshold=best_threshold_accuracy)
+ 
+     # pipeline-style usage with the best-accuracy threshold (best_threshold=0.78);
+     # when optimising for precision, use best_threshold=0.97 instead
+     classifying_plagiarism_using_textfiles(best_threshold=best_threshold_accuracy)
+ 
+     print("\n-------- Done! --------")
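A minimal single-text sketch of the detector above (the desklib model is downloaded on first use; the sample sentence is illustrative, and 0.78 is the accuracy-tuned threshold reported in this module):

    from plagiarism_detection import ai_plagiarism_detection

    probability, ai_detected = ai_plagiarism_detection(
        text="It is estimated that a major part of the content on the internet will be generated by AI.",
        threshold=0.78,
        show_results=True,  # prints the probability and the predicted label
    )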
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ openai-whisper>=20230314
+ transformers>=4.30.0
+ gradio>=4.0.0
+ numpy>=1.24.0
+ scikit-learn>=1.3.0
+ librosa>=0.10.0
+ soundfile>=0.12.0
+ scipy>=1.11.0
+ requests>=2.31.0
+ pandas>=2.0.0
+ matplotlib>=3.7.0
spectrogram_cnn_3s_window (1).pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8335aa8c8932430ad456f12fe37eba28c7253f75dcda9d513ec3054f7b14f264
+ size 44788683
spectrogram_cnn_4s_window.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42534fa18a4df083acfae6dc6fe0bbe24af622a1a7b68938ff6eb1d10eb4b6e5
+ size 44790091
spectrogram_cnn_4s_window_488_x_488.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c8109600c503b699d10c02a29cf5bca147c1cf22ad428c618f70ade567d6aa0
+ size 44791371
speech_recognizer.py ADDED
@@ -0,0 +1,270 @@
+ import whisper
+ import torch
+ import numpy as np
+ import re
+ from typing import Any, Dict, Optional, List
+ import warnings
+ warnings.filterwarnings("ignore")
+
+
+ class SpeechRecognizer:
+     def __init__(self, model_size: str = "base", device: Optional[str] = None):
+         if device is None:
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         else:
+             self.device = device
+
+         print(f"Loading Whisper {model_size} model on {self.device}...")
+         self.model = whisper.load_model(model_size, device=self.device)
+         print("Whisper model loaded successfully.")
+
+         self.model_size = model_size
+
+     def transcribe(
+         self,
+         audio_path: str,
+         language: Optional[str] = None,
+         task: str = "transcribe"
+     ) -> Dict[str, Any]:
+         # transcribe with Whisper; word-level timestamps enable better pause detection
+         result = self.model.transcribe(
+             audio_path,
+             language=language,
+             task=task,
+             verbose=False,
+             word_timestamps=True
+         )
+
+         transcription = result['text'].strip()
+         detected_language = result.get('language', 'unknown')
+         segments = result.get('segments', [])
+
+         analysis = self._analyze_transcription(transcription, segments)
+
+         duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
+         kopparapu_features = self._extract_kopparapu_features(transcription, duration)
+         kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
+
+         return {
+             'transcription': transcription,
+             'language': detected_language,
+             'segments': segments,
+             'word_count': analysis['word_count'],
+             'duration': analysis['duration'],
+             'speech_rate': analysis['speech_rate'],
+             'pause_patterns': analysis['pause_patterns'],
+             'filler_words': analysis['filler_words'],
+             'kopparapu_features': kopparapu_features,
+             'kopparapu_score': kopparapu_score,
+             'kopparapu_classification': 'read' if kopparapu_score >= 0.5 else 'spontaneous',
+             'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
+         }
+
+     def _analyze_transcription(self, text: str, segments: List[Dict]) -> Dict:
+         words = text.split()
+         word_count = len(words)
+
+         duration = 0
+         if segments:
+             duration = segments[-1]['end'] - segments[0]['start']
+
+         speech_rate = (word_count / duration * 60) if duration > 0 else 0
+
+         filler_words_list = [
+             ('um', r'\bum\b'), ('uh', r'\buh\b'), ('er', r'\ber\b'),
+             ('ah', r'\bah\b'), ('like', r'\blike\b'), ('you know', r'\byou know\b'),
+             ('i mean', r'\bi mean\b'), ('actually', r'\bactually\b'),
+             ('basically', r'\bbasically\b'), ('literally', r'\bliterally\b'),
+             ('so', r'\bso\b'), ('well', r'\bwell\b'), ('okay', r'\bokay\b'),
+             ('hmm', r'\bhmm+\b'), ('mm', r'\bmm+\b')
+         ]
+
+         text_lower = text.lower()
+         filler_count = {}
+         total_fillers = 0
+
+         for filler_name, filler_pattern in filler_words_list:
+             matches = re.findall(filler_pattern, text_lower, re.IGNORECASE)
+             count = len(matches)
+             if count > 0:
+                 filler_count[filler_name] = count
+                 total_fillers += count
+
+         filler_ratio = total_fillers / word_count if word_count > 0 else 0
+
+         pause_patterns = self._analyze_pauses(segments)
+
+         return {
+             'word_count': word_count,
+             'duration': duration,
+             'speech_rate': speech_rate,
+             'filler_words': {
+                 'count': total_fillers,
+                 'ratio': filler_ratio,
+                 'details': filler_count
+             },
+             'pause_patterns': pause_patterns
+         }
+
+     def _analyze_pauses(self, segments: List[Dict]) -> Dict:
+         pauses = []
+
+         if len(segments) >= 2:
+             for i in range(len(segments) - 1):
+                 pause = segments[i + 1]['start'] - segments[i]['end']
+                 if pause > 0.05:  # consider inter-segment pauses > 50ms (lowered threshold)
+                     pauses.append(pause)
+
+         for segment in segments:
+             if 'words' in segment and len(segment['words']) > 1:
+                 words = segment['words']
+                 for i in range(len(words) - 1):
+                     if 'start' in words[i] and 'end' in words[i] and 'start' in words[i + 1]:
+                         pause = words[i + 1]['start'] - words[i]['end']
+                         if pause > 0.15:  # word-level pauses > 150ms are significant
+                             pauses.append(pause)
+
+         if not pauses:
+             return {
+                 'avg_pause': 0.0,
+                 'max_pause': 0.0,
+                 'num_pauses': 0,
+                 'pause_variability': 0.0
+             }
+
+         return {
+             'avg_pause': float(np.mean(pauses)),
+             'max_pause': float(np.max(pauses)),
+             'num_pauses': len(pauses),
+             'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
+         }
+
+     def _extract_kopparapu_features(self, text: str, duration_sec: float) -> Dict:
+         """
+         Extract Kopparapu-like linguistic features from a transcription.
+         Based on: https://arxiv.org/pdf/2306.08012
+         """
+         text = text.strip()
+         if len(text) == 0:
+             return {
+                 'alpha_ratio': 0.0,
+                 'chars_per_word': 0.0,
+                 'words_per_sec': 0.0,
+                 'nonalpha_per_sec': 0.0,
+                 'repetition_count': 0,
+                 'filler_rate': 0.0
+             }
+
+         total_chars = len(text)
+         alpha_chars = sum(c.isalpha() for c in text)
+         nonalpha_chars = total_chars - alpha_chars
+
+         alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
+
+         words = text.split()
+         num_words = max(len(words), 1)
+         chars_per_word = alpha_chars / num_words
+
+         duration_sec = max(duration_sec, 1e-3)
+         words_per_sec = num_words / duration_sec
+         nonalpha_per_sec = nonalpha_chars / duration_sec
+
+         # runs of 3+ identical characters (e.g. "sooo") count as repetitions
+         char_reps = len(re.findall(r'(.)\1{2,}', text))
+
+         # immediate word repetitions ("the the") for words longer than 2 characters
+         words_list = text.lower().split()
+         word_reps = 0
+         for i in range(len(words_list) - 1):
+             if words_list[i] == words_list[i + 1] and len(words_list[i]) > 2:
+                 word_reps += 1
+
+         repetition_count = char_reps + word_reps
+
+         lower = text.lower()
+         filler_patterns = [
+             r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
+             r'\blike\b', r'\byou know\b', r'\bi mean\b',
+             r'\bactually\b', r'\bbasically\b', r'\bliterally\b',
+             r'\bso\b', r'\bwell\b', r'\bokay\b',
+             r'\bhmm+\b', r'\bmm+\b', r'\boh\b'
+         ]
+         filler_count = 0
+         for pattern in filler_patterns:
+             filler_count += len(re.findall(pattern, lower))
+         filler_rate = filler_count / num_words
+
+         return {
+             'alpha_ratio': float(alpha_ratio),
+             'chars_per_word': float(chars_per_word),
+             'words_per_sec': float(words_per_sec),
+             'nonalpha_per_sec': float(nonalpha_per_sec),
+             'repetition_count': int(repetition_count),
+             'filler_rate': float(filler_rate)
+         }
+
+     def _logistic(self, x: float, a: float, b: float) -> float:
+         # logistic squashing with midpoint a and scale b
+         return 1.0 / (1.0 + np.exp(-(x - a) / b))
+
+     def _calculate_kopparapu_score(self, features: Dict) -> float:
+         # L1: longer average words point towards read (scripted) speech
+         f1 = features['chars_per_word']
+         L1 = self._logistic(f1, a=5.0, b=1.5)
+
+         # L2: a faster word rate also points towards read speech
+         f2 = features['words_per_sec']
+         L2 = self._logistic(f2, a=2.0, b=0.7)
+
+         # L3: disfluencies (non-alpha characters, fillers) point the other way,
+         # so the combined disfluency signal is negated before squashing
+         f3_raw = features['nonalpha_per_sec'] + 10.0 * features['filler_rate']
+         L3 = self._logistic(-f3_raw, a=0.0, b=1.0)
+
+         # weighted blend; scores >= 0.5 are classified as 'read' upstream
+         score = 0.4 * L1 + 0.4 * L2 + 0.2 * L3
+
+         return float(score)
+
+     def _interpret_speech_patterns(self, analysis: Dict, kopparapu_features: Dict = None, kopparapu_score: float = None) -> str:
+         filler_ratio = analysis['filler_words']['ratio']
+         pause_patterns = analysis['pause_patterns']
+         speech_rate = analysis['speech_rate']
+
+         interpretation = "**Overall Assessment:**\n\n"
+
+         spontaneity_score = 0
+         indicators = []
+
+         if filler_ratio > 0.03:
+             spontaneity_score += 1
+             indicators.append(f"Filler words present ({filler_ratio*100:.1f}%)")
+
+         if pause_patterns['pause_variability'] > 0.5:
+             spontaneity_score += 1
+             indicators.append(f"Irregular pause patterns (variability: {pause_patterns['pause_variability']:.2f})")
+
+         if 120 <= speech_rate <= 180:
+             spontaneity_score += 1
+             indicators.append(f"Natural speech rate ({speech_rate:.1f} words/min)")
+
+         if spontaneity_score >= 2:
+             interpretation += "✓ **Speech patterns suggest spontaneous, natural speaking.**\n\n"
+             if indicators:
+                 interpretation += "Key indicators:\n"
+                 for indicator in indicators:
+                     interpretation += f"- {indicator}\n"
+         else:
+             interpretation += "⚠ **Speech patterns suggest potentially scripted or read speech.**\n\n"
+             if filler_ratio < 0.02:
+                 interpretation += "- Very low filler word usage\n"
+             if pause_patterns['pause_variability'] < 0.3:
+                 interpretation += "- Regular, consistent pause patterns\n"
+             if speech_rate > 180:
+                 interpretation += "- Fast, steady speaking rate\n"
+
+         return interpretation
+
+     def get_detailed_segments(self, audio_path: str) -> List[Dict]:
+         result = self.model.transcribe(audio_path, word_timestamps=True, verbose=False)
+         return result.get('segments', [])
+
+
+ if __name__ == "__main__":
+     recognizer = SpeechRecognizer(model_size="base")
+     print(f"Speech recognizer initialized with {recognizer.model_size} model")
+     print(f"Device: {recognizer.device}")
+
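
A brief usage sketch for this module (sample.wav is a placeholder path; any audio format Whisper accepts would do):

    from speech_recognizer import SpeechRecognizer

    recognizer = SpeechRecognizer(model_size="base")
    result = recognizer.transcribe("sample.wav")

    print(result['transcription'])
    print(f"{result['speech_rate']:.1f} words/min")
    print(result['kopparapu_classification'])  # 'read' or 'spontaneous'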
text_analyzer.py ADDED
@@ -0,0 +1,137 @@
+ import re
+ import requests
+ from typing import Dict, List, Tuple, Optional
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     RobertaTokenizer,
+     RobertaForSequenceClassification
+ )
+ import numpy as np
+ from collections import Counter
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ try:
+     from plagiarism_detection import ai_plagiarism_detection
+     DESKLIB_AVAILABLE = True
+ except ImportError:
+     DESKLIB_AVAILABLE = False
+     print("Warning: plagiarism_detection module not found. Using fallback AI detection.")
+
+
+ class AITextDetector:
+     def __init__(self, device: Optional[str] = None, threshold: float = 0.78):
+         self.threshold = threshold
+
+         if not DESKLIB_AVAILABLE:
+             print("Warning: plagiarism_detection module not found. AI detection will not be available.")
+             print("Ensure plagiarism_detection.py is in the same directory.")
+             self.available = False
+         else:
+             print(f"Using Desklib AI text detector (threshold: {self.threshold})")
+             self.available = True
+
+     def detect_ai_text(self, text: str) -> Dict:
+         if not self.available:
+             # return a neutral result if the Desklib detector is not available
+             return {
+                 'ai_generated': False,
+                 'confidence': 0.5,
+                 'indicators': [],
+                 'interpretation': "AI detection not available. Install plagiarism_detection module.",
+                 'model_used': 'N/A (module not found)'
+             }
+
+         # use the Desklib AI detector
+         try:
+             probability, ai_detected = ai_plagiarism_detection(
+                 text,
+                 threshold=self.threshold,
+                 show_results=False
+             )
+
+             return {
+                 'ai_generated': ai_detected,
+                 'confidence': float(probability),
+                 'indicators': self._identify_ai_indicators(probability),
+                 'interpretation': self._interpret_ai_detection(probability),
+                 'model_used': 'Desklib AI Detector v1.01'
+             }
+         except Exception as e:
+             print(f"Error in AI detection: {e}")
+             return {
+                 'ai_generated': False,
+                 'confidence': 0.5,
+                 'indicators': [],
+                 'interpretation': f"AI detection error: {str(e)}",
+                 'model_used': 'Error'
+             }
+
+     def _identify_ai_indicators(self, probability: float) -> List[str]:
+         indicators = []
+
+         if probability > 0.9:
+             indicators.append("Very high AI probability (>90%)")
+         elif probability > 0.7:
+             indicators.append("High AI probability (70-90%)")
+         elif probability > self.threshold:
+             indicators.append(f"AI detected above threshold ({self.threshold*100:.0f}%)")
+
+         return indicators
+
+     def _interpret_ai_detection(self, score: float) -> str:
+         interpretation = "**AI-Generated Text Detection:**\n\n"
+         interpretation += f"- AI Probability Score: {score*100:.1f}%\n"
+         interpretation += f"- Detection Threshold: {self.threshold*100:.0f}%\n"
+
+         return interpretation
+
+
+ class TextAuthenticityAnalyzer:
+     def __init__(self, device: Optional[str] = None, ai_threshold: float = 0.78):
+         self.ai_detector = AITextDetector(device=device, threshold=ai_threshold)
+
+     def analyze(self, text: str) -> Dict:
+         # run AI detection
+         ai_results = self.ai_detector.detect_ai_text(text)
+
+         # overall authenticity is the complement of the AI probability
+         ai_penalty = ai_results['confidence']
+         authenticity_score = 1.0 - ai_penalty
+
+         # map the score onto a risk band
+         if authenticity_score < 0.3:
+             overall_assessment = "HIGH RISK: Strong AI-generated text indicators"
+             risk_level = "high"
+         elif authenticity_score < 0.5:
+             overall_assessment = "MODERATE RISK: Likely AI-generated"
+             risk_level = "moderate"
+         elif authenticity_score < 0.7:
+             overall_assessment = "LOW RISK: Some AI characteristics"
+             risk_level = "low"
+         else:
+             overall_assessment = "AUTHENTIC: Text appears human-written"
+             risk_level = "minimal"
+
+         return {
+             'authenticity_score': float(authenticity_score),
+             'risk_level': risk_level,
+             'overall_assessment': overall_assessment,
+             'ai_detection': ai_results,
+         }
+
+
+ if __name__ == "__main__":
+     # example usage
+     analyzer = TextAuthenticityAnalyzer()
+     print("Text authenticity analyzer initialized.")
+     print("Components: Desklib AI Text Detector")
+
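
And a matching usage sketch for the text analyzer (the input string is illustrative; scores depend on the Desklib detector being importable):

    from text_analyzer import TextAuthenticityAnalyzer

    analyzer = TextAuthenticityAnalyzer(ai_threshold=0.78)
    report = analyzer.analyze("Transcribed speech goes here.")

    print(report['risk_level'], f"{report['authenticity_score']:.2f}")
    print(report['overall_assessment'])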