# [Archived attempt 1: beam-search decoding -- kept commented out for reference]
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10 # seconds
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure Decoding =====
# print("🔍 Configuring decoding strategy...")
# # Get decoding config
# decoding_cfg = asr_model.cfg.decoding
# # Print available parameters to debug
# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
# # After loading the model, add this to inspect the config:
# print("🔍 Beam config structure:")
# print(decoding_cfg.beam)
# # Set beam search strategy
# decoding_cfg.strategy = "beam"
# decoding_cfg.beam.beam_size = 128
# decoding_cfg.beam.return_best_hypothesis = True
# # Only set parameters that exist
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
#     decoding_cfg.beam.beam_alpha = 0.3
#     print("✓ Set beam_alpha")
# if hasattr(decoding_cfg.beam, 'beam_beta'):
#     decoding_cfg.beam.beam_beta = 0.5
#     print("✓ Set beam_beta")
# # Remove softmax_temperature - it's not supported in this config
# # If you need temperature sampling, you might need to use a different strategy
# # Apply the decoding configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
#     [OUTPUT_FILE],
#     batch_size=1,
#     num_workers=0
# )
# print("📝 Transcription:", transcription[0])
# [Archived attempt 2: greedy decoding -- kept commented out for reference]
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure for LITERAL transcription =====
# print("🔍 Configuring greedy decoding for literal output...")
# decoding_cfg = asr_model.cfg.decoding
# decoding_cfg.strategy = "greedy"
# # CRITICAL: Increase max_symbols to avoid truncating repetitions
# # The default is only 10, which is very restrictive!
# decoding_cfg.greedy.max_symbols = 1000 # Allow much longer sequences
# decoding_cfg.beam.beam_size = 64
# decoding_cfg.beam.search_type = "beam"
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
# print("Updated config:", decoding_cfg)
# # Apply configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
#     [OUTPUT_FILE],
#     batch_size=1,
#     num_workers=0
# )
# print("📝 Literal Transcription:", transcription[0])
# Record a short microphone clip and transcribe it with a fine-tuned NeMo
# hybrid RNNT/CTC model.
#
# Execution order (step numbers keep the labels used in the print-outs):
#   STEP 2 - restore the model from MODEL_PATH
#   STEP 3 - override the decoding configuration and apply it to the model
#   STEP 1 - record DURATION seconds of mono int16 audio and save it as WAV
#   STEP 4 - transcribe the saved file and print the first result
# The model is loaded and configured *before* recording starts, so the
# "Speak Arabic now!" prompt only appears once everything else is ready.
import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf

# ===== SETTINGS =====
SAMPLE_RATE = 16000  # Hz; used for the capture and for the WAV header
DURATION = 10  # seconds of audio to record
OUTPUT_FILE = "arabic_recording.wav"
# Fine-tuned checkpoint restored in STEP 2 (machine-specific absolute path).
MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"

# ===== STEP 2: Load ASR model =====
print("📥 Loading Arabic ASR model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)

# Dump the greedy sub-config so the keys this checkpoint actually exposes
# are visible before they are overridden below.
print("Available greedy parameters:")
print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

# ===== STEP 3: Configure for LITERAL transcription =====
# NOTE(review): the message below says "greedy", but the strategy selected
# further down is "maes" while mostly `greedy.*` keys are tuned -- confirm
# which decoding strategy is actually intended.
print("🔍 Configuring greedy decoding for literal output...")

# Struct mode is switched off so that keys which are not already present in
# the checkpoint's decoding config can be assigned; it is re-enabled below.
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

decoding_cfg = asr_model.cfg.decoding
decoding_cfg.strategy = "maes"

# Best-effort override: a failure here is reported and the script continues.
# `Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate.
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
except Exception as exc:
    print(f"⚠ Could not set max_symbols_per_step: {exc}")
else:
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")

# Greedy sub-config overrides.
decoding_cfg.greedy.max_symbols = 500
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True

# Top-level decoding overrides.
decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3

# Beam sub-config overrides.
# NOTE(review): presumably only some of these keys are honoured for the
# selected strategy -- verify against the installed NeMo version.
decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
decoding_cfg.beam.search_type = "beam"

print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")

# Re-enable struct mode
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)

# ===== STEP 1: Record audio =====
print("🎙️ Recording... Speak Arabic now!")
audio = sd.rec(
    int(SAMPLE_RATE * DURATION),  # total number of frames to capture
    samplerate=SAMPLE_RATE,
    channels=1,
    dtype='int16',
)
sd.wait()  # block until the capture started by sd.rec() has finished
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")

# ===== STEP 4: Transcribe =====
print("🔍 Transcribing...")
transcription = asr_model.transcribe(
    [OUTPUT_FILE],
    batch_size=1,
    num_workers=0,
)
# Only one file was passed in, so only the first entry is printed.
# NOTE(review): the element type returned by transcribe() depends on the
# NeMo version -- confirm that index 0 is the text you expect.
print("📝 Literal Transcription:", transcription[0])