Spaces:
Sleeping
Sleeping
| import time | |
| from audio_classifier import AudioClassifier | |
| from speech_recognizer import SpeechRecognizer | |
| from text_analyzer import TextAuthenticityAnalyzer | |
class AuthenticityDetectionPipeline:
    """Orchestrate the audio, speech and text analyzers into one verdict.

    ``__init__`` builds three components (``AudioClassifier``,
    ``SpeechRecognizer`` and ``TextAuthenticityAnalyzer``).
    ``analyze_audio`` runs them in sequence on one audio file and merges
    their outputs through ``_generate_final_assessment``.
    """

    # Weights of the composite authenticity score. They sum to 1.0:
    # audio classifier = 15%, linguistic features = 35%, AI detection = 50%.
    _AUDIO_WEIGHT = 0.15
    _SPEECH_PATTERN_WEIGHT = 0.25
    _FILLER_WEIGHT = 0.05
    _PAUSE_WEIGHT = 0.03
    _RATE_VARIABILITY_WEIGHT = 0.02
    _TEXT_WEIGHT = 0.50

    # Raw feature value at which the matching sub-score saturates at 1.0.
    _FILLER_RATIO_CAP = 0.05
    _PAUSE_VARIABILITY_CAP = 0.5
    _RATE_VARIABILITY_CAP = 0.15

    # Cut-offs used when sorting observations into concerns and strengths.
    _LOW_FILLER_RATIO = 0.02
    _LOW_PAUSE_VARIABILITY = 0.3
    _STRONG_TEXT_AUTHENTICITY = 0.7

    # (minimum composite score, verdict, risk level, recommendation),
    # ordered from the highest band to the lowest.
    _VERDICT_BANDS = (
        (
            0.7,
            "AUTHENTIC",
            "low",
            "Response appears genuine with strong authenticity indicators.",
        ),
        (
            0.5,
            "LIKELY AUTHENTIC",
            "moderate",
            "Response shows mostly authentic characteristics but has some concerns.",
        ),
        (
            0.3,
            "QUESTIONABLE",
            "high",
            "Response has multiple authenticity concerns. Further investigation recommended.",
        ),
    )

    # Used when the score reaches none of the bands above.
    _FALLBACK_VERDICT = (
        "LIKELY INAUTHENTIC",
        "critical",
        "Response shows strong indicators of inauthenticity. Manual review required.",
    )

    def __init__(
        self,
        audio_model_path=None,
        whisper_model_size="base",
        device=None,
        ai_detection_threshold=0.78
    ):
        """Build the three analysis components and print progress messages.

        Args:
            audio_model_path: Forwarded to ``AudioClassifier`` as
                ``model_path``.
            whisper_model_size: Forwarded to ``SpeechRecognizer`` as
                ``model_size``.
            device: Forwarded unchanged to all three components.
            ai_detection_threshold: Forwarded to
                ``TextAuthenticityAnalyzer`` as ``ai_threshold``.
        """
        banner = "=" * 60
        print("\n" + banner)
        print("Initializing Multimodal Authenticity Detection Pipeline")
        print(banner + "\n")

        # NOTE(review): the leading glyphs in the status messages of this
        # class look like mis-encoded emoji. They are reproduced unchanged;
        # confirm the intended characters before altering them.

        # load the CNN-based audio classifier
        print("π Loading Audio Classifier (CNN)...")
        self.audio_classifier = AudioClassifier(
            model_path=audio_model_path,
            device=device,
        )

        # load whisper model for speech-to-text
        print("\nπ€ Loading Speech Recognizer (Whisper)...")
        self.speech_recognizer = SpeechRecognizer(
            model_size=whisper_model_size,
            device=device,
        )

        # load text analyzer for AI detection
        print("\nπ Loading Text Authenticity Analyzer...")
        self.text_analyzer = TextAuthenticityAnalyzer(
            device=device,
            ai_threshold=ai_detection_threshold,
        )

        print("\nβ Pipeline initialization complete!")
        print(banner + "\n")

    def analyze_audio(self, audio_path, language=None):
        """Run all analysis stages on one audio file and return the results.

        Args:
            audio_path: Passed to ``self.audio_classifier.classify`` and
                ``self.speech_recognizer.transcribe``.
            language: Forwarded to ``transcribe`` as ``language``.

        Returns:
            A dict with the raw result of every stage
            (``audio_classification``, ``speech_recognition`` and its alias
            ``asr``, ``text_analysis`` and its alias ``text_authenticity``),
            the merged ``final_assessment`` and ``processing_time`` in
            seconds.

        Raises:
            KeyError: If a component result lacks a key this method or
                ``_generate_final_assessment`` reads.
        """
        banner = "=" * 60
        rule = "-" * 40

        print("\n" + banner)
        print("MULTIMODAL AUTHENTICITY ANALYSIS")
        print(banner + "\n")

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments made while the stages run.
        start_time = time.perf_counter()

        # stage 1: classify audio using CNN
        print("Stage 1: CNN Audio Classification...")
        print(rule)
        audio_results = self.audio_classifier.classify(audio_path)
        print("β CNN classification complete")
        print(f" ## Classification: {audio_results['classification'].upper()}")
        print(f" Confidence: {audio_results['confidence']*100:.1f}%")

        # stage 2: transcribe and analyze speech patterns
        print("\nStage 2: Speech Analysis (Whisper)...")
        print(rule)
        asr_results = self.speech_recognizer.transcribe(audio_path, language=language)
        print("β Speech analysis complete")
        print(f" Language: {asr_results['language']}")
        print(f" Word count: {asr_results['word_count']}")
        print(f" Kopparapu classification: {asr_results['kopparapu_classification'].upper()}")

        # stage 3: analyze transcribed text for AI patterns
        print("\nStage 3: Analyzing text authenticity...")
        print(rule)
        text_results = self.text_analyzer.analyze(asr_results['transcription'])
        print("β Text analysis complete")
        print(f" Authenticity score: {text_results['authenticity_score']*100:.1f}%")
        print(f" Risk level: {text_results['risk_level'].upper()}")

        # stage 4: combine all results into final assessment
        print("\nStage 4: Generating final assessment...")
        print(rule)
        final_assessment = self._generate_final_assessment(
            audio_results,
            asr_results,
            text_results,
        )

        elapsed_time = time.perf_counter() - start_time
        print(f"β Analysis complete in {elapsed_time:.2f} seconds")
        print("\n" + banner + "\n")

        return {
            'audio_classification': audio_results,
            'speech_recognition': asr_results,
            'asr': asr_results,  # alias for backwards compatibility
            'text_analysis': text_results,
            'text_authenticity': text_results,
            'final_assessment': final_assessment,
            'processing_time': elapsed_time,
        }

    def _generate_final_assessment(
        self,
        audio_results,
        asr_results,
        text_results
    ):
        """Combine the component results into the final verdict.

        Args:
            audio_results: Dict with ``classification`` and ``confidence``.
            asr_results: Dict with ``kopparapu_score``,
                ``kopparapu_classification``, ``filler_words['ratio']``,
                ``pause_patterns['pause_variability']`` and
                ``kopparapu_features``.
            text_results: Dict with ``authenticity_score`` and
                ``ai_detection`` (``ai_generated``, ``confidence``).

        Returns:
            Dict with ``verdict``, ``risk_level``,
            ``composite_authenticity_score`` (float), ``concerns``,
            ``strengths`` and ``recommendation``.

        Raises:
            KeyError: If any of the keys listed above is missing.
        """
        composite_score = self._composite_score(
            audio_results, asr_results, text_results
        )
        verdict, risk, recommendation = self._verdict_for(composite_score)
        concerns, strengths = self._collect_evidence(
            audio_results, asr_results, text_results
        )

        return {
            'verdict': verdict,
            'risk_level': risk,
            'composite_authenticity_score': float(composite_score),
            'concerns': concerns,
            'strengths': strengths,
            'recommendation': recommendation,
        }

    def _composite_score(self, audio_results, asr_results, text_results):
        """Return the weighted authenticity score of all components."""
        # audio score - a 'spontaneous' classification counts as authentic,
        # anything else contributes the complement of the confidence
        if audio_results['classification'] == 'spontaneous':
            audio_score = audio_results['confidence']
        else:
            audio_score = 1.0 - audio_results['confidence']

        # kopparapu score - inverted so spontaneous = high authenticity
        speech_pattern_score = 1.0 - asr_results['kopparapu_score']

        # filler words indicate spontaneous speech
        filler_ratio = asr_results['filler_words']['ratio']
        filler_score = min(1.0, filler_ratio / self._FILLER_RATIO_CAP)

        # pause variability - higher = more spontaneous
        pause_var = asr_results['pause_patterns']['pause_variability']
        pause_score = min(1.0, pause_var / self._PAUSE_VARIABILITY_CAP)

        # text authenticity from AI detector
        text_auth_score = text_results['authenticity_score']

        # speech rate variability; a missing feature counts as 0.0.
        # Only this entry of 'kopparapu_features' carries a weight, so no
        # other feature is read here.
        features = asr_results['kopparapu_features']
        rate_var = features.get('speech_rate_variability', 0.0)
        rate_var_score = min(1.0, rate_var / self._RATE_VARIABILITY_CAP)

        # Terms are added in a fixed order so the floating-point result
        # stays reproducible.
        return (
            audio_score * self._AUDIO_WEIGHT +
            speech_pattern_score * self._SPEECH_PATTERN_WEIGHT +
            filler_score * self._FILLER_WEIGHT +
            pause_score * self._PAUSE_WEIGHT +
            rate_var_score * self._RATE_VARIABILITY_WEIGHT +
            text_auth_score * self._TEXT_WEIGHT
        )

    def _verdict_for(self, composite_score):
        """Map a composite score to ``(verdict, risk, recommendation)``."""
        for threshold, verdict, risk, recommendation in self._VERDICT_BANDS:
            if composite_score >= threshold:
                return verdict, risk, recommendation
        return self._FALLBACK_VERDICT

    def _collect_evidence(self, audio_results, asr_results, text_results):
        """Return ``(concerns, strengths)`` message lists for the report."""
        concerns = []
        strengths = []

        # check CNN classification
        cnn_confidence = f"{audio_results['confidence']*100:.0f}% confidence"
        if audio_results['classification'] == 'read':
            concerns.append(f"CNN detected read speech pattern ({cnn_confidence})")
        else:
            strengths.append(f"CNN detected spontaneous speech ({cnn_confidence})")

        # check linguistic analysis
        linguistic_score = f"score: {asr_results['kopparapu_score']:.2f}"
        if asr_results['kopparapu_classification'] == 'read':
            concerns.append(f"Linguistic analysis suggests read speech ({linguistic_score})")
        else:
            strengths.append(f"Linguistic analysis suggests spontaneous speech ({linguistic_score})")

        # check filler words
        filler_ratio = asr_results['filler_words']['ratio']
        if filler_ratio < self._LOW_FILLER_RATIO:
            concerns.append(f"Low filler word usage ({filler_ratio*100:.1f}%) suggests scripted speech")
        else:
            strengths.append(f"Natural filler word usage ({filler_ratio*100:.1f}%) indicates spontaneity")

        # check pause patterns
        pause_var = asr_results['pause_patterns']['pause_variability']
        if pause_var < self._LOW_PAUSE_VARIABILITY:
            concerns.append("Regular pause patterns suggest reading at punctuation")
        else:
            strengths.append("Irregular pause patterns indicate spontaneous thinking")

        # check AI detection
        ai_detection = text_results['ai_detection']
        if ai_detection['ai_generated']:
            concerns.append(f"AI-generated text detected ({ai_detection['confidence']*100:.0f}% probability)")

        # check text originality
        if text_results['authenticity_score'] > self._STRONG_TEXT_AUTHENTICITY:
            strengths.append("Text shows strong originality indicators")

        return concerns, strengths
# Smoke test - only runs when this file is executed as a script.
if __name__ == "__main__":
    print("Initializing Authenticity Detection Pipeline...")

    # Constructor options for the smoke test: the CNN checkpoint file name
    # and the Whisper model size.
    init_options = {
        "audio_model_path": "spectrogram_cnn_3s_window.pth",
        "whisper_model_size": "base",
    }
    pipeline = AuthenticityDetectionPipeline(**init_options)

    print("\nPipeline ready for audio analysis.")