# import sounddevice as sd # import scipy.io.wavfile as wav # import nemo.collections.asr as nemo_asr # # ===== SETTINGS ===== # SAMPLE_RATE = 16000 # DURATION = 10 # seconds # OUTPUT_FILE = "arabic_recording.wav" # # ===== STEP 1: Record audio ===== # print("🎙️ Recording... Speak Arabic now!") # audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16') # sd.wait() # wav.write(OUTPUT_FILE, SAMPLE_RATE, audio) # print(f"✅ Recording finished. Saved as {OUTPUT_FILE}") # # ===== STEP 2: Load ASR model ===== # print("📥 Loading Arabic ASR model...") # asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from( # "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo" # ) # # ===== STEP 3: Configure Decoding ===== # print("🔍 Configuring decoding strategy...") # # Get decoding config # decoding_cfg = asr_model.cfg.decoding # # Print available parameters to debug # print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}") # # After loading the model, add this to inspect the config: # print("🔍 Beam config structure:") # print(decoding_cfg.beam) # # Set beam search strategy # decoding_cfg.strategy = "beam" # decoding_cfg.beam.beam_size = 128 # decoding_cfg.beam.return_best_hypothesis = True # # Only set parameters that exist # if hasattr(decoding_cfg.beam, 'beam_alpha'): # decoding_cfg.beam.beam_alpha = 0.3 # print("✓ Set beam_alpha") # if hasattr(decoding_cfg.beam, 'beam_beta'): # decoding_cfg.beam.beam_beta = 0.5 # print("✓ Set beam_beta") # # Remove softmax_temperature - it's not supported in this config # # If you need temperature sampling, you might need to use a different strategy # # Apply the decoding configuration # asr_model.change_decoding_strategy(decoding_cfg) # # ===== STEP 4: Transcribe ===== # print("🔍 Transcribing...") 
# transcription = asr_model.transcribe( # [OUTPUT_FILE], # batch_size=1, # num_workers=0 # ) # print("📝 Transcription:", transcription[0]) # import sounddevice as sd # import scipy.io.wavfile as wav # import nemo.collections.asr as nemo_asr # # ===== SETTINGS ===== # SAMPLE_RATE = 16000 # DURATION = 10 # OUTPUT_FILE = "arabic_recording.wav" # # ===== STEP 1: Record audio ===== # print("🎙️ Recording... Speak Arabic now!") # audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16') # sd.wait() # wav.write(OUTPUT_FILE, SAMPLE_RATE, audio) # print(f"✅ Recording finished. Saved as {OUTPUT_FILE}") # # ===== STEP 2: Load ASR model ===== # print("📥 Loading Arabic ASR model...") # asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from( # "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo" # ) # # ===== STEP 3: Configure for LITERAL transcription ===== # print("🔍 Configuring greedy decoding for literal output...") # decoding_cfg = asr_model.cfg.decoding # decoding_cfg.strategy = "greedy" # # CRITICAL: Increase max_symbols to avoid truncating repetitions # # The default is only 10, which is very restrictive! 
# decoding_cfg.greedy.max_symbols = 1000  # Allow much longer sequences
# decoding_cfg.beam.beam_size = 64
# decoding_cfg.beam.search_type = "beam"
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
# print("Updated config:", decoding_cfg)
#
# # Apply configuration
# asr_model.change_decoding_strategy(decoding_cfg)
#
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
#     [OUTPUT_FILE],
#     batch_size=1,
#     num_workers=0
# )
# print("📝 Literal Transcription:", transcription[0])

# ---------------------------------------------------------------------------
# Active script: record DURATION seconds from the default microphone, save the
# clip as a WAV file, and transcribe it with a fine-tuned NeMo hybrid RNNT/CTC
# model.
#
# The steps run in the order 2, 3, 1, 4: the model is loaded and its decoding
# configured *before* the recording starts (presumably so the slow model load
# does not sit between speaking and seeing the transcript).
# ---------------------------------------------------------------------------

import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf

# ===== SETTINGS =====
SAMPLE_RATE = 16000  # Hz, used for both recording and the WAV header
DURATION = 10  # seconds of audio to record
OUTPUT_FILE = "arabic_recording.wav"
MODEL_PATH = (
    "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/"
    "Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/"
    "4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/"
    "output_finetuned/finetuned_model_best.nemo"
)


def _first_transcript_text(result):
    """Return the text of the first transcription in *result*.

    ``transcribe()`` output differs between NeMo releases: RNNT/hybrid models
    may return a ``(best_hypotheses, all_hypotheses)`` tuple, a list of plain
    strings, or a list of hypothesis objects exposing a ``text`` attribute.
    Plain strings are returned unchanged.
    """
    if isinstance(result, tuple):
        # (best, all) pair -> keep only the best hypotheses.
        result = result[0]
    first = result[0]
    return getattr(first, "text", first)


# ===== STEP 2: Load ASR model =====
print("📥 Loading Arabic ASR model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)

# Dump the greedy section so the parameters actually present in this
# checkpoint's config are visible before they are modified below.
print("Available greedy parameters:")
print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

# ===== STEP 3: Configure for LITERAL transcription =====
print("🔍 Configuring greedy decoding for literal output...")

# Set struct mode to False temporarily to allow adding keys that the loaded
# config does not already define.
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

decoding_cfg = asr_model.cfg.decoding
# NOTE(review): the messages above/below talk about greedy decoding, but the
# strategy selected here is "maes" (a beam-search variant); the greedy.*
# values set below are presumably ignored by that strategy - confirm which
# one is intended.
decoding_cfg.strategy = "maes"

# Best-effort: this key may not be accepted by every config version, so a
# failure is reported and the script carries on.
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
except Exception as exc:  # deliberately broad, but not KeyboardInterrupt/SystemExit
    print(f"⚠ Could not set max_symbols_per_step: {exc}")
else:
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")

decoding_cfg.greedy.max_symbols = 500
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True

decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3

decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
decoding_cfg.beam.search_type = "beam"

print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")

# Re-enable struct mode
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)

# ===== STEP 1: Record audio =====
print("🎙️ Recording... Speak Arabic now!")
audio = sd.rec(
    int(SAMPLE_RATE * DURATION),
    samplerate=SAMPLE_RATE,
    channels=1,
    dtype='int16',
)
sd.wait()  # block until the recording buffer has been filled
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")

# ===== STEP 4: Transcribe =====
print("🔍 Transcribing...")
transcription = asr_model.transcribe(
    [OUTPUT_FILE],
    batch_size=1,
    num_workers=0,
)
print("📝 Literal Transcription:", _first_transcript_text(transcription))