# Arabic_Finetuned_ASR_Nemo / testing_main.py
# alaatiger989's picture
# Add files using upload-large-folder tool
# b5e57ee verified
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10 # seconds
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure Decoding =====
# print("🔍 Configuring decoding strategy...")
# # Get decoding config
# decoding_cfg = asr_model.cfg.decoding
# # Print available parameters to debug
# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
# # After loading the model, add this to inspect the config:
# print("🔍 Beam config structure:")
# print(decoding_cfg.beam)
# # Set beam search strategy
# decoding_cfg.strategy = "beam"
# decoding_cfg.beam.beam_size = 128
# decoding_cfg.beam.return_best_hypothesis = True
# # Only set parameters that exist
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
# decoding_cfg.beam.beam_alpha = 0.3
# print("✓ Set beam_alpha")
# if hasattr(decoding_cfg.beam, 'beam_beta'):
# decoding_cfg.beam.beam_beta = 0.5
# print("✓ Set beam_beta")
# # Remove softmax_temperature - it's not supported in this config
# # If you need temperature sampling, you might need to use a different strategy
# # Apply the decoding configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
# [OUTPUT_FILE],
# batch_size=1,
# num_workers=0
# )
# print("📝 Transcription:", transcription[0])
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure for LITERAL transcription =====
# print("🔍 Configuring greedy decoding for literal output...")
# decoding_cfg = asr_model.cfg.decoding
# decoding_cfg.strategy = "greedy"
# # CRITICAL: Increase max_symbols to avoid truncating repetitions
# # The default is only 10, which is very restrictive!
# decoding_cfg.greedy.max_symbols = 1000 # Allow much longer sequences
# decoding_cfg.beam.beam_size = 64
# decoding_cfg.beam.search_type = "beam"
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
# print("Updated config:", decoding_cfg)
# # Apply configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
# [OUTPUT_FILE],
# batch_size=1,
# num_workers=0
# )
# print("📝 Literal Transcription:", transcription[0])
import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
# ===== SETTINGS =====
SAMPLE_RATE = 16000  # Hz; 16 kHz mono is the rate the recording below is captured at
DURATION = 10  # recording length in seconds
OUTPUT_FILE = "arabic_recording.wav"  # WAV file written by the recording step and read by transcribe()
# ===== STEP 2: Load ASR model =====
# Absolute path to the fine-tuned NeMo checkpoint. Hoisted out of the call so it
# is easy to find and change when running on a different machine.
MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
print("📥 Loading Arabic ASR model...")
# Hybrid RNNT+CTC model restored from a local .nemo checkpoint.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)
# Add this right after loading the model to see what's actually available:
# (dumps the greedy-decoding sub-config so we know which knobs this checkpoint exposes)
print("Available greedy parameters:")
print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))
# ===== STEP 3: Configure for LITERAL transcription =====
print("🔍 Configuring greedy decoding for literal output...")
# Set struct mode to False temporarily to allow modifications
# (OmegaConf struct mode rejects assignment of keys not already in the schema).
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
decoding_cfg = asr_model.cfg.decoding
# NOTE(review): strategy is set to "maes" although the log message above says
# "greedy", and both greedy.* and beam.* knobs are configured below — only the
# settings matching the active strategy will take effect. Confirm which decoder
# is actually intended.
decoding_cfg.strategy = "maes"
# Now try setting the parameters
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    print("⚠ Could not set max_symbols_per_step")
decoding_cfg.greedy.max_symbols = 500  # allow long label sequences per timestep
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True
decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3
decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
decoding_cfg.beam.search_type = "beam"
print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")
# Re-enable struct mode
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)
# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)
# ===== STEP 1: Record audio =====
# Capture DURATION seconds of 16-bit mono audio from the default input device
# and persist it as a WAV file for the transcription step below.
print("🎙️ Recording... Speak Arabic now!")
num_frames = int(SAMPLE_RATE * DURATION)
audio = sd.rec(num_frames, samplerate=SAMPLE_RATE, channels=1, dtype='int16')
sd.wait()  # block until the capture buffer is full
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# ===== STEP 4: Transcribe =====
print("🔍 Transcribing...")
# Run inference on the just-recorded WAV; num_workers=0 keeps data loading
# in-process (avoids multiprocessing issues on Windows).
transcription = asr_model.transcribe(
[OUTPUT_FILE],
batch_size=1,
num_workers=0
)
# NOTE(review): depending on the NeMo version, transcribe() may return plain
# strings or Hypothesis objects — confirm transcription[0] prints as expected.
print("📝 Literal Transcription:", transcription[0])