File size: 6,771 Bytes

b5e57ee

# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr

# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10  # seconds
# OUTPUT_FILE = "arabic_recording.wav"

# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")

# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )

# # ===== STEP 3: Configure Decoding =====
# print("🔍 Configuring decoding strategy...")

# # Get decoding config
# decoding_cfg = asr_model.cfg.decoding

# # Print available parameters to debug
# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
# # After loading the model, add this to inspect the config:
# print("🔍 Beam config structure:")
# print(decoding_cfg.beam)
# # Set beam search strategy
# decoding_cfg.strategy = "beam"
# decoding_cfg.beam.beam_size = 128
# decoding_cfg.beam.return_best_hypothesis = True

# # Only set parameters that exist
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
#     decoding_cfg.beam.beam_alpha = 0.3
#     print("✓ Set beam_alpha")

# if hasattr(decoding_cfg.beam, 'beam_beta'):
#     decoding_cfg.beam.beam_beta = 0.5
#     print("✓ Set beam_beta")

# # Remove softmax_temperature - it's not supported in this config
# # If you need temperature sampling, you might need to use a different strategy

# # Apply the decoding configuration
# asr_model.change_decoding_strategy(decoding_cfg)

# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
#     [OUTPUT_FILE],
#     batch_size=1,
#     num_workers=0
# )

# print("📝 Transcription:", transcription[0])



# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr

# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10
# OUTPUT_FILE = "arabic_recording.wav"

# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")

# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )

# # ===== STEP 3: Configure for LITERAL transcription =====
# print("🔍 Configuring greedy decoding for literal output...")

# decoding_cfg = asr_model.cfg.decoding
# decoding_cfg.strategy = "greedy"

# # CRITICAL: Increase max_symbols to avoid truncating repetitions
# # The default is only 10, which is very restrictive!
# decoding_cfg.greedy.max_symbols = 1000  # Allow much longer sequences
# decoding_cfg.beam.beam_size = 64
# decoding_cfg.beam.search_type = "beam"
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
# print("Updated config:", decoding_cfg)

# # Apply configuration
# asr_model.change_decoding_strategy(decoding_cfg)

# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
#     [OUTPUT_FILE],
#     batch_size=1,
#     num_workers=0
# )

# print("📝 Literal Transcription:", transcription[0])


import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf

# ===== SETTINGS =====
OUTPUT_FILE = "arabic_recording.wav"  # WAV file the recording is written to
DURATION = 10                         # multiplied by SAMPLE_RATE to size the recording
SAMPLE_RATE = 16000                   # sampling rate handed to the recorder and the WAV writer

# ===== STEP 2: Load ASR model =====
# Location of the fine-tuned .nemo checkpoint that gets restored below.
MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"

print("📥 Loading Arabic ASR model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)

# Dump the greedy decoding section of the restored config so the keys that
# actually exist in this checkpoint are visible before they are modified.
greedy_yaml = OmegaConf.to_yaml(asr_model.cfg.decoding.greedy)
print("Available greedy parameters:")
print(greedy_yaml)

# ===== STEP 3: Configure decoding for LITERAL transcription =====
# The decoding section of the model config is edited in place and then handed
# back to the model via change_decoding_strategy().
decoding_cfg = asr_model.cfg.decoding

# Set struct mode to False temporarily to allow modifications
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

decoding_cfg.strategy = "maes"
# Log the strategy that is really selected; the message used to say "greedy"
# even though the strategy assigned above is "maes".
print(f"🔍 Configuring '{decoding_cfg.strategy}' decoding for literal output...")

# Best-effort setting: report why it failed instead of hiding the reason, and
# do not use a bare `except`, which would also swallow Ctrl+C / SystemExit.
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
except Exception as err:
    print(f"⚠ Could not set max_symbols_per_step: {err}")
else:
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")

# NOTE(review): the greedy.* values below are presumably only consulted when a
# greedy strategy is selected; with "maes" the beam.* values are the ones that
# likely take effect - confirm against the installed NeMo version.
decoding_cfg.greedy.max_symbols = 500
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True
decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3

decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
decoding_cfg.beam.search_type = "beam"
print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")

# Re-enable struct mode
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)


# ===== Record audio =====
# Captures one mono int16 clip of SAMPLE_RATE * DURATION frames and stores it
# as a WAV file at OUTPUT_FILE.
print("🎙️ Recording... Speak Arabic now!")
frame_count = int(SAMPLE_RATE * DURATION)
audio = sd.rec(
    frame_count,
    samplerate=SAMPLE_RATE,
    channels=1,
    dtype='int16',
)
sd.wait()  # wait for the recording started by sd.rec() before saving it
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")



# ===== STEP 4: Transcribe =====
def _first_text(result):
    """Return the text of the first transcription contained in *result*.

    Nested lists/tuples are unwrapped by repeatedly taking their first
    element; an empty container yields "". If the remaining item exposes a
    ``text`` attribute that attribute is returned, otherwise the item itself.

    NOTE(review): depending on the NeMo version, ``transcribe()`` may return
    plain strings, a (best, all) tuple of lists, or hypothesis objects with a
    ``.text`` field - confirm against the installed version.
    """
    item = result
    while isinstance(item, (list, tuple)):
        if not item:
            # Nothing was transcribed; avoid an IndexError on [0].
            return ""
        item = item[0]
    # A plain str has no `.text`, so it falls through unchanged.
    return getattr(item, "text", item)


print("🔍 Transcribing...")
transcription = asr_model.transcribe(
    [OUTPUT_FILE],
    batch_size=1,
    num_workers=0,
)

# Print the recognized text itself rather than the repr of a wrapper
# list/tuple/object around it.
print("📝 Literal Transcription:", _first_text(transcription))