File size: 9,162 Bytes
4ec806c
 
 
 
 
 
0b42831
4ec806c
 
 
0b42831
 
 
 
4ec806c
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
0b42831
 
4ec806c
 
 
 
 
 
0b42831
4ec806c
 
 
 
4c2ceb8
4ec806c
 
0b42831
4ec806c
 
 
 
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b42831
 
4ec806c
 
 
 
 
0b42831
4ec806c
 
0b42831
 
 
 
4ec806c
0b42831
4ec806c
 
0b42831
4ec806c
 
0b42831
95ad43e
4ec806c
0b42831
95ad43e
0b42831
4ec806c
0b42831
95ad43e
0b42831
95ad43e
0b42831
95ad43e
0b42831
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95ad43e
0b42831
 
4ec806c
0b42831
 
 
 
 
 
4ec806c
 
0b42831
4ec806c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b42831
4ec806c
 
 
0b42831
4ec806c
 
 
 
 
0b42831
4ec806c
 
 
 
 
0b42831
4ec806c
 
 
 
 
 
0b42831
4ec806c
 
 
 
 
0b42831
4ec806c
 
 
0b42831
4ec806c
 
 
 
 
 
 
 
 
 
 
 
0b42831
 
4ec806c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import time
from audio_classifier import AudioClassifier
from speech_recognizer import SpeechRecognizer
from text_analyzer import TextAuthenticityAnalyzer


# Main pipeline class that orchestrates all analysis components
class AuthenticityDetectionPipeline:
    """Multimodal authenticity detection pipeline.

    Chains three components over a single audio file:
      1. AudioClassifier (CNN)        - read vs. spontaneous speech from audio.
      2. SpeechRecognizer (Whisper)   - transcription + prosodic/linguistic features.
      3. TextAuthenticityAnalyzer     - AI-generation detection on the transcript.

    The per-stage results are fused by ``_generate_final_assessment`` into a
    weighted composite score and a human-readable verdict.
    """

    def __init__(
        self,
        audio_model_path=None,
        whisper_model_size="base",
        device=None,
        ai_detection_threshold=0.78
    ):
        """Load all three models.

        Args:
            audio_model_path: optional path to CNN weights for the audio classifier.
            whisper_model_size: Whisper checkpoint size (e.g. "base").
            device: device identifier forwarded to each component; None lets
                each component pick its own default.
            ai_detection_threshold: probability above which transcribed text is
                flagged as AI-generated.
        """
        print("\n" + "="*60)
        print("Initializing Multimodal Authenticity Detection Pipeline")
        print("="*60 + "\n")
        
        # load the CNN-based audio classifier
        print("πŸ“Š Loading Audio Classifier (CNN)...")
        self.audio_classifier = AudioClassifier(
            model_path=audio_model_path,
            device=device
        )
        
        # load whisper model for speech-to-text
        print("\n🎀 Loading Speech Recognizer (Whisper)...")
        self.speech_recognizer = SpeechRecognizer(
            model_size=whisper_model_size,
            device=device
        )
        
        # load text analyzer for AI detection
        print("\nπŸ“ Loading Text Authenticity Analyzer...")
        self.text_analyzer = TextAuthenticityAnalyzer(device=device, ai_threshold=ai_detection_threshold)
        
        print("\nβœ… Pipeline initialization complete!")
        print("="*60 + "\n")
    
    def analyze_audio(self, audio_path, language=None):
        """Run the full four-stage analysis on one audio file.

        Args:
            audio_path: path to the audio file to analyze.
            language: optional language hint forwarded to Whisper; None lets
                Whisper auto-detect.

        Returns:
            dict with per-stage results ('audio_classification',
            'speech_recognition' / 'asr' alias, 'text_analysis' /
            'text_authenticity' alias), the fused 'final_assessment',
            and 'processing_time' in seconds.
        """
        print("\n" + "="*60)
        print("MULTIMODAL AUTHENTICITY ANALYSIS")
        print("="*60 + "\n")
        
        start_time = time.time()
        
        # stage 1: classify audio using CNN
        print("Stage 1: CNN Audio Classification...")
        print("-" * 40)
        audio_results = self.audio_classifier.classify(audio_path)
        # no placeholders needed below, so plain strings (were f-strings; ruff F541)
        print("βœ“ CNN classification complete")
        # "## " prefix removed from the label to match the sibling print lines
        print(f"  Classification: {audio_results['classification'].upper()}")
        print(f"  Confidence: {audio_results['confidence']*100:.1f}%")
        
        # stage 2: transcribe and analyze speech patterns
        print("\nStage 2: Speech Analysis (Whisper)...")
        print("-" * 40)
        asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
        print("βœ“ Speech analysis complete")
        print(f"  Language: {asr_results['language']}")
        print(f"  Word count: {asr_results['word_count']}")
        print(f"  Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")
        
        # stage 3: analyze transcribed text for AI patterns
        print("\nStage 3: Analyzing text authenticity...")
        print("-" * 40)
        text_results = self.text_analyzer.analyze(asr_results['transcription'])
        print("βœ“ Text analysis complete")
        print(f"  Authenticity score: {text_results['authenticity_score']*100:.1f}%")
        print(f"  Risk level: {text_results['risk_level'].upper()}")
        
        # stage 4: combine all results into final assessment
        print("\nStage 4: Generating final assessment...")
        print("-" * 40)
        final_assessment = self._generate_final_assessment(
            audio_results,
            asr_results,
            text_results
        )
        
        elapsed_time = time.time() - start_time
        
        print(f"βœ“ Analysis complete in {elapsed_time:.2f} seconds")
        print("\n" + "="*60 + "\n")
        
        return {
            'audio_classification': audio_results,
            'speech_recognition': asr_results,
            'asr': asr_results,  # alias for backwards compatibility
            'text_analysis': text_results,
            'text_authenticity': text_results,
            'final_assessment': final_assessment,
            'processing_time': elapsed_time
        }
    
    def _generate_final_assessment(
        self,
        audio_results,
        asr_results,
        text_results
    ):
        """Fuse per-stage results into a composite score and verdict.

        Args:
            audio_results: dict with 'classification' ('spontaneous'/'read')
                and 'confidence' in [0, 1].
            asr_results: dict with 'kopparapu_score', 'kopparapu_classification',
                'filler_words'['ratio'], 'pause_patterns'['pause_variability'],
                and 'kopparapu_features'.
            text_results: dict with 'authenticity_score' and
                'ai_detection' ('ai_generated', 'confidence').

        Returns:
            dict with 'verdict', 'risk_level', 'composite_authenticity_score',
            'concerns', 'strengths', and 'recommendation'.
        """
        # audio score - spontaneous = authentic, so invert confidence for 'read'
        if audio_results['classification'] == 'spontaneous':
            audio_score = audio_results['confidence']
        else:
            audio_score = 1.0 - audio_results['confidence']
        
        # kopparapu score measures "read-ness"; invert so spontaneous = high authenticity
        speech_pattern_score = 1.0 - asr_results['kopparapu_score']
        
        # filler words indicate spontaneous speech; a ~5% ratio saturates the score
        filler_ratio = asr_results['filler_words']['ratio']
        filler_score = min(1.0, filler_ratio / 0.05)
        
        # pause variability - higher = more spontaneous (saturates at 0.5)
        pause_var = asr_results['pause_patterns']['pause_variability']
        pause_score = min(1.0, pause_var / 0.5)
        
        # text authenticity from the AI detector
        text_auth_score = text_results['authenticity_score']
        
        # speech rate variability from the linguistic feature set (saturates at 0.15)
        kf = asr_results['kopparapu_features']
        rate_var = kf.get('speech_rate_variability', 0.0)
        rate_var_score = min(1.0, rate_var / 0.15)
        
        # NOTE(review): pause_regularity and self_correction_count were previously
        # computed here but never entered the composite; dropped as dead code.
        
        # weighted composite:
        # CNN = 15%, linguistic = 35% (0.25 + 0.05 + 0.03 + 0.02), AI detection = 50%
        composite_score = (
            audio_score * 0.15 +
            speech_pattern_score * 0.25 +
            filler_score * 0.05 +
            pause_score * 0.03 +
            rate_var_score * 0.02 +
            text_auth_score * 0.50
        )
        
        # determine verdict band from the composite score
        if composite_score >= 0.7:
            verdict = "AUTHENTIC"
            risk = "low"
            recommendation = "Response appears genuine with strong authenticity indicators."
        elif composite_score >= 0.5:
            verdict = "LIKELY AUTHENTIC"
            risk = "moderate"
            recommendation = "Response shows mostly authentic characteristics but has some concerns."
        elif composite_score >= 0.3:
            verdict = "QUESTIONABLE"
            risk = "high"
            recommendation = "Response has multiple authenticity concerns. Further investigation recommended."
        else:
            verdict = "LIKELY INAUTHENTIC"
            risk = "critical"
            recommendation = "Response shows strong indicators of inauthenticity. Manual review required."
        
        # collect human-readable concerns and strengths for the report
        concerns = []
        strengths = []
        
        # CNN classification
        if audio_results['classification'] == 'read':
            concerns.append(f"CNN detected read speech pattern ({audio_results['confidence']*100:.0f}% confidence)")
        else:
            strengths.append(f"CNN detected spontaneous speech ({audio_results['confidence']*100:.0f}% confidence)")
        
        # linguistic analysis
        if asr_results['kopparapu_classification'] == 'read':
            concerns.append(f"Linguistic analysis suggests read speech (score: {asr_results['kopparapu_score']:.2f})")
        else:
            strengths.append(f"Linguistic analysis suggests spontaneous speech (score: {asr_results['kopparapu_score']:.2f})")
        
        # filler words (reuse the ratio computed above instead of re-reading it)
        if filler_ratio < 0.02:
            concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
        else:
            strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")
        
        # pause patterns
        if asr_results['pause_patterns']['pause_variability'] < 0.3:
            concerns.append("Regular pause patterns suggest reading at punctuation")
        else:
            strengths.append("Irregular pause patterns indicate spontaneous thinking")
        
        # AI detection (concern only; absence of AI text is not listed as a strength)
        if text_results['ai_detection']['ai_generated']:
            concerns.append(f"AI-generated text detected ({text_results['ai_detection']['confidence']*100:.0f}% probability)")
        
        # text originality
        if text_results['authenticity_score'] > 0.7:
            strengths.append("Text shows strong originality indicators")
        
        return {
            'verdict': verdict,
            'risk_level': risk,
            'composite_authenticity_score': float(composite_score),
            'concerns': concerns,
            'strengths': strengths,
            'recommendation': recommendation,
        }
    

# test code - runs when script is executed directly
if __name__ == "__main__":
    # smoke test: build the pipeline with the bundled CNN checkpoint
    print("Initializing Authenticity Detection Pipeline...")
    pipeline = AuthenticityDetectionPipeline(
        audio_model_path="spectrogram_cnn_3s_window.pth",
        whisper_model_size="base",
    )
    print("\nPipeline ready for audio analysis.")