# File size: 20,168 Bytes

# b5e57ee

# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# import torch
# import numpy as np
# from typing import List, Tuple

# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10  # seconds
# OUTPUT_FILE = "arabic_recording.wav"

# class RepetitionAwareTranscriber:
#     def __init__(self, model_path: str):
#         """Initialize ASR model with repetition-aware configuration"""
#         print("📥 Loading Arabic ASR model...")
#         self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
#         self._configure_decoding()
    
#     def _configure_decoding(self):
#         """Configure advanced decoding strategy"""
#         decoding_cfg = self.asr_model.cfg.decoding
        
#         # Use beam search for better sequence modeling
#         decoding_cfg.strategy = "beam"
#         decoding_cfg.beam.beam_size = 128  # Larger beam for more candidates
#         decoding_cfg.beam.return_best_hypothesis = False  # Get multiple hypotheses
        
#         # Language model parameters (if available)
#         if hasattr(decoding_cfg.beam, 'beam_alpha'):
#             decoding_cfg.beam.beam_alpha = 0.3  # LM weight (lower = less LM influence)
#         if hasattr(decoding_cfg.beam, 'beam_beta'):
#             decoding_cfg.beam.beam_beta = 0.5   # Word insertion bonus
        
#         self.asr_model.change_decoding_strategy(decoding_cfg)
    
#     def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
#         """
#         Transcribe with log probabilities and temperature scaling
        
#         Args:
#             audio_file: Path to audio file
#             temperature: Controls randomness (lower = more conservative, higher = more diverse)
#                         0.5 = more deterministic
#                         1.0 = standard
#                         1.5 = more exploratory
#         """
#         print(f"🔍 Transcribing with temperature={temperature}...")
        
#         # Update temperature in decoding config
#         if hasattr(self.asr_model.cfg.decoding, 'temperature'):
#             self.asr_model.cfg.decoding.temperature = temperature
#         if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
#             self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
        
#         self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
        
#         # Get multiple hypotheses with their scores
#         hypotheses = self.asr_model.transcribe(
#             [audio_file],
#             batch_size=1,
#             return_hypotheses=True,
#             num_workers=0
#         )
        
#         # Handle different return types
#         if isinstance(hypotheses, list) and len(hypotheses) > 0:
#             hyp = hypotheses[0]
            
#             # Check if it's a Hypothesis object or a list
#             if isinstance(hyp, list):
#                 # It's already a list of transcriptions
#                 best_text = hyp[0] if len(hyp) > 0 else ""
#                 print(f"\n📊 Top hypothesis: {best_text}")
#                 return best_text
#             elif hasattr(hyp, 'text'):
#                 # It's a Hypothesis object
#                 text = hyp.text
                
#                 # Check for nbest hypotheses
#                 if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
#                     print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
#                     for i, nbest_hyp in enumerate(hyp.nbest[:5]):
#                         score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
#                         hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
#                         print(f"  {i+1}. [{score}] {hyp_text}")
                
#                 return text
#             else:
#                 # Fallback: convert to string
#                 return str(hyp)
        
#         return ""
    
#     def transcribe_with_frame_analysis(self, audio_file: str):
#         """
#         Analyze frame-level predictions to detect repetitions
#         This examines the raw CTC outputs before collapsing
#         """
#         print("🔍 Performing frame-level analysis...")
        
#         # Get log probabilities at frame level
#         log_probs = self.asr_model.transcribe(
#             [audio_file],
#             batch_size=1,
#             logprobs=True
#         )
        
#         # Standard transcription
#         transcription = self.asr_model.transcribe([audio_file])
        
#         return transcription[0], log_probs
    
#     def transcribe_with_all_methods(self, audio_file: str):
#         """Try multiple decoding strategies and return all results"""
#         results = {}
        
#         # Method 1: Standard beam search
#         print("\n--- Method 1: Standard Beam Search ---")
#         results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
        
#         # Method 2: Lower temperature (more conservative)
#         print("\n--- Method 2: Conservative (temp=0.5) ---")
#         results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
        
#         # Method 3: Higher temperature (more exploratory)
#         print("\n--- Method 3: Exploratory (temp=1.5) ---")
#         results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
        
#         # Method 4: Frame-level analysis
#         print("\n--- Method 4: Frame-level Analysis ---")
#         results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)
        
#         return results


# def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
#     """
#     Heuristic post-processing to restore repetitions
    
#     Args:
#         text: Transcribed text
#         audio_duration: Duration of audio in seconds
#         expected_word_count: Expected number of words (if known)
#     """
#     words = text.split()
    
#     # Calculate speaking rate (words per second)
#     speaking_rate = len(words) / audio_duration
    
#     # Normal Arabic speaking rate is 2-3 words per second
#     # For numbers, it's often slower (1-2 words per second)
#     # If rate is too high, likely missing repetitions
    
#     if speaking_rate > 3.0 and expected_word_count:
#         print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
#         print(f"   Expected ~{expected_word_count} words, got {len(words)}")
#         print("   Possible missing repetitions detected")
    
#     return text


# def detect_number_patterns(text: str) -> List[str]:
#     """Detect if text contains Arabic number words"""
#     arabic_numbers = [
#         'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة', 
#         'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة'
#     ]
    
#     words = text.split()
#     detected = [w for w in words if w in arabic_numbers]
    
#     if detected:
#         print(f"🔢 Detected number words: {' '.join(detected)}")
    
#     return detected


# # ===== MAIN EXECUTION =====
# if __name__ == "__main__":
#     # ===== STEP 1: Record audio =====
#     print("🎙️ Recording... Speak Arabic now!")
#     print("💡 TIP: For repeated numbers, pause slightly between each repetition")
#     print("   Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")
    
#     audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
#     sd.wait()
#     wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
#     print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")
    
#     # ===== STEP 2: Initialize transcriber =====
#     model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
#     transcriber = RepetitionAwareTranscriber(model_path)
    
#     # ===== STEP 3: Transcribe with all methods =====
#     results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)
    
#     # ===== STEP 4: Display all results =====
#     print("\n" + "="*60)
#     print("📝 FINAL RESULTS:")
#     print("="*60)
    
#     for method, transcription in results.items():
#         print(f"\n{method.upper()}:")
#         print(f"  {transcription}")
#         detect_number_patterns(transcription)
    
#     # ===== STEP 5: Post-processing analysis =====
#     print("\n" + "="*60)
#     print("🔍 POST-PROCESSING ANALYSIS:")
#     print("="*60)
    
#     best_transcription = results['beam_standard']
#     processed = post_process_repetitions(best_transcription, DURATION)
    
#     print(f"\nBest transcription: {best_transcription}")
#     print(f"Word count: {len(best_transcription.split())}")
#     print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")
    
#     # ===== STEP 6: Recommendations =====
#     print("\n" + "="*60)
#     print("💡 RECOMMENDATIONS:")
#     print("="*60)
#     print("1. Compare all method outputs above")
#     print("2. If all methods miss repetitions, the issue is in the trained model")
#     print("3. Consider retraining with more repetitive sequences in training data")
#     print("4. When speaking, add slight pauses between repeated words")
#     print("5. If transcribing phone numbers, use digit-by-digit model instead")


import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
import torch
import numpy as np
from typing import List, Tuple

# ===== SETTINGS =====
SAMPLE_RATE = 16000
DURATION = 10  # seconds
OUTPUT_FILE = "arabic_recording.wav"

class RepetitionAwareTranscriber:
    """Load a NeMo ASR checkpoint and decode audio with several beam settings.

    Attributes:
        asr_model: The restored NeMo model object.
        model_type: "hybrid_rnnt_ctc", "rnnt" or "ctc", naming the model
            class that managed to restore the checkpoint.
    """

    def __init__(self, model_path: str):
        """Restore the checkpoint at ``model_path`` and configure decoding.

        The checkpoint is tried as a Hybrid RNNT-CTC model first, then as an
        RNNT BPE model, and finally as a CTC model. If the last attempt fails
        its exception propagates to the caller.
        """
        print("📥 Loading Arabic ASR model...")
        # `except Exception` (not a bare `except`) so that Ctrl-C or
        # SystemExit during the slow load is not mistaken for "wrong model
        # class" and silently retried.
        try:
            self.asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
            self.model_type = "hybrid_rnnt_ctc"
            print("✅ Loaded as Hybrid RNNT-CTC model (excellent for repetitions!)")
        except Exception:
            try:
                self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(model_path)
                self.model_type = "rnnt"
                print("✅ Loaded as RNNT model")
            except Exception:
                self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
                self.model_type = "ctc"
                print("✅ Loaded as CTC model")

        self._configure_decoding()

    def _configure_decoding(self):
        """Switch the model to beam-search decoding with a beam of 128."""
        decoding_cfg = self.asr_model.cfg.decoding

        # Use beam search for better sequence modeling
        decoding_cfg.strategy = "beam"
        decoding_cfg.beam.beam_size = 128  # Larger beam for more candidates
        decoding_cfg.beam.return_best_hypothesis = False  # Get multiple hypotheses

        # Language model parameters are only set when the config already
        # exposes them, so configs without these keys are left untouched.
        if hasattr(decoding_cfg.beam, 'beam_alpha'):
            decoding_cfg.beam.beam_alpha = 0.3  # LM weight (lower = less LM influence)
        if hasattr(decoding_cfg.beam, 'beam_beta'):
            decoding_cfg.beam.beam_beta = 0.5   # Word insertion bonus

        self.asr_model.change_decoding_strategy(decoding_cfg)

    @staticmethod
    def _hypothesis_text(candidate) -> str:
        """Return the text of one hypothesis, whether a string or an object."""
        if isinstance(candidate, str):
            return candidate
        text = getattr(candidate, 'text', None)
        return text if text is not None else str(candidate)

    def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0) -> str:
        """Transcribe one file after writing ``temperature`` into the decoding config.

        Args:
            audio_file: Path to the audio file.
            temperature: Value written to ``cfg.decoding.temperature`` and
                ``cfg.decoding.beam.softmax_temperature`` when those keys
                exist. Intended meaning (per the original author): lower is
                more conservative, higher is more diverse; 1.0 is standard.

        Returns:
            The text of the top hypothesis, or "" when the model returns no
            usable result.
        """
        print(f"🔍 Transcribing with temperature={temperature}...")

        # Update temperature in decoding config (only keys that already exist)
        decoding_cfg = self.asr_model.cfg.decoding
        if hasattr(decoding_cfg, 'temperature'):
            decoding_cfg.temperature = temperature
        if hasattr(decoding_cfg.beam, 'softmax_temperature'):
            decoding_cfg.beam.softmax_temperature = temperature

        self.asr_model.change_decoding_strategy(decoding_cfg)

        # Get multiple hypotheses with their scores
        hypotheses = self.asr_model.transcribe(
            [audio_file],
            batch_size=1,
            return_hypotheses=True,
            num_workers=0
        )
        print(hypotheses)  # raw dump of whatever the model returned

        # NOTE(review): some NeMo releases appear to return a
        # (best_hypotheses, all_hypotheses) tuple for RNNT-style models;
        # use the "best" list in that case instead of returning "".
        if isinstance(hypotheses, tuple):
            hypotheses = hypotheses[0] if hypotheses else []

        if not isinstance(hypotheses, list) or not hypotheses:
            return ""

        hyp = hypotheses[0]

        if isinstance(hyp, list):
            # Per-file list of candidates: entries may be plain strings or
            # hypothesis objects, so always reduce the first one to its text.
            best_text = self._hypothesis_text(hyp[0]) if hyp else ""
            print(f"\n📊 Top hypothesis: {best_text}")
            return best_text

        if hasattr(hyp, 'text'):
            # Single hypothesis object; `nbest` may be absent or None.
            nbest = getattr(hyp, 'nbest', None)
            if nbest is not None and len(nbest) > 1:
                print(f"\n📊 Top {min(5, len(nbest))} hypotheses:")
                for i, nbest_hyp in enumerate(nbest[:5]):
                    score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
                    hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
                    print(f"  {i+1}. [{score}] {hyp_text}")
            return hyp.text

        # Fallback: convert to string
        return str(hyp)

    def transcribe_with_frame_analysis(self, audio_file: str):
        """Return the first transcription together with frame-level log-probs.

        The model is run twice: once with ``logprobs=True`` and once as a
        plain transcription.

        Returns:
            A ``(transcription[0], log_probs)`` tuple.
        """
        print("🔍 Performing frame-level analysis...")

        # NOTE(review): the `logprobs` keyword may not be accepted by every
        # NeMo version -- confirm against the installed release.
        log_probs = self.asr_model.transcribe(
            [audio_file],
            batch_size=1,
            logprobs=True
        )

        # Standard transcription
        transcription = self.asr_model.transcribe([audio_file])

        return transcription[0], log_probs

    def transcribe_with_all_methods(self, audio_file: str):
        """Transcribe ``audio_file`` at temperatures 1.0, 0.5 and 1.5.

        Returns:
            A dict with keys 'beam_standard', 'beam_conservative' and
            'beam_exploratory' mapping to the respective transcriptions.
        """
        results = {}

        # Method 1: Standard beam search
        print("\n--- Method 1: Standard Beam Search ---")
        results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
        print(f"Results with Temp 1.0 : {results['beam_standard']}")

        # Method 2: Lower temperature (more conservative)
        print("\n--- Method 2: Conservative (temp=0.5) ---")
        results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
        print(f"Results with Temp 0.5 : {results['beam_conservative']}")

        # Method 3: Higher temperature (more exploratory)
        print("\n--- Method 3: Exploratory (temp=1.5) ---")
        results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
        print(f"Results with Temp 1.5 : {results['beam_exploratory']}")

        # Method 4: Frame-level analysis (currently disabled)
        # print("\n--- Method 4: Frame-level Analysis ---")
        # results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)

        return results


def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
    """Warn when the speaking rate suggests dropped repetitions.

    This is a diagnostic only: the text is always returned unchanged. A
    warning is printed when the rate exceeds 3 words per second and an
    ``expected_word_count`` was supplied.

    Args:
        text: Transcribed text.
        audio_duration: Duration of the audio in seconds. A value of zero
            or below skips the rate check.
        expected_word_count: Expected number of words, if known. When it is
            None (or 0) no warning is ever printed.

    Returns:
        ``text``, unmodified.
    """
    words = text.split()

    # A non-positive duration gives no meaningful rate, and zero would raise
    # ZeroDivisionError, so there is nothing to analyse.
    if audio_duration <= 0:
        return text

    # Calculate speaking rate (words per second)
    speaking_rate = len(words) / audio_duration

    # Original author's heuristic: normal Arabic speech is 2-3 words per
    # second (often 1-2 for digits), so more than 3 is treated as suspicious.
    if speaking_rate > 3.0 and expected_word_count:
        print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
        print(f"   Expected ~{expected_word_count} words, got {len(words)}")
        print("   Possible missing repetitions detected")

    return text


def detect_number_patterns(text: str) -> List[str]:
    """Collect the tokens of ``text`` that are known Arabic digit words.

    ``text`` is split on whitespace and each token is compared exactly
    against a fixed vocabulary. Matches keep their original order and
    duplicates are preserved. When at least one match exists, the matches
    are also printed on one line.

    Returns:
        The list of matching tokens (empty when none match).
    """
    number_vocabulary = frozenset((
        'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
        'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة',
    ))

    matches = list(filter(number_vocabulary.__contains__, text.split()))

    if len(matches) > 0:
        joined = ' '.join(matches)
        print(f"🔢 Detected number words: {joined}")

    return matches


# ===== MAIN EXECUTION =====
if __name__ == "__main__":
    # Script entry point: record DURATION seconds from the microphone, save
    # the recording, transcribe it with every beam setting, and print a report.
    import sys  # local import: only the script entry point reads argv

    # ===== STEP 1: Record audio =====
    print("🎙️ Recording... Speak Arabic now!")
    print("💡 TIP: For repeated numbers, pause slightly between each repetition")
    print("   Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")

    audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    sd.wait()  # block until the recording has finished
    wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
    print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")

    # ===== STEP 2: Initialize transcriber =====
    # The default checkpoint is a machine-specific absolute path; an optional
    # first command-line argument overrides it so the script runs elsewhere.
    default_model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
    model_path = sys.argv[1] if len(sys.argv) > 1 else default_model_path
    transcriber = RepetitionAwareTranscriber(model_path)

    # ===== STEP 3: Transcribe with all methods =====
    results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)

    # ===== STEP 4: Display all results =====
    print("\n" + "="*60)
    print("📝 FINAL RESULTS:")
    print("="*60)

    for method, transcription in results.items():
        print(f"\n{method.upper()}:")
        print(f"  {transcription}")
        detect_number_patterns(transcription)

    # ===== STEP 5: Post-processing analysis =====
    print("\n" + "="*60)
    print("🔍 POST-PROCESSING ANALYSIS:")
    print("="*60)

    best_transcription = results['beam_standard']
    # Called for its printed diagnostics only; it returns the text unchanged.
    post_process_repetitions(best_transcription, DURATION)

    print(f"\nBest transcription: {best_transcription}")
    print(f"Word count: {len(best_transcription.split())}")
    print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")

    # ===== STEP 6: Recommendations =====
    print("\n" + "="*60)
    print("💡 RECOMMENDATIONS:")
    print("="*60)
    print("1. Compare all method outputs above")
    print("2. If all methods miss repetitions, the issue is in the trained model")
    print("3. Consider retraining with more repetitive sequences in training data")
    print("4. When speaking, add slight pauses between repeated words")
    print("5. If transcribing phone numbers, use digit-by-digit model instead")