# Arabic_Finetuned_ASR_Nemo / testing_main.py
# alaatiger989's picture
# Add files using upload-large-folder tool
# b5e57ee verified
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10 # seconds
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure Decoding =====
# print("🔍 Configuring decoding strategy...")
# # Get decoding config
# decoding_cfg = asr_model.cfg.decoding
# # Print available parameters to debug
# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
# # After loading the model, add this to inspect the config:
# print("🔍 Beam config structure:")
# print(decoding_cfg.beam)
# # Set beam search strategy
# decoding_cfg.strategy = "beam"
# decoding_cfg.beam.beam_size = 128
# decoding_cfg.beam.return_best_hypothesis = True
# # Only set parameters that exist
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
# decoding_cfg.beam.beam_alpha = 0.3
# print("✓ Set beam_alpha")
# if hasattr(decoding_cfg.beam, 'beam_beta'):
# decoding_cfg.beam.beam_beta = 0.5
# print("✓ Set beam_beta")
# # Remove softmax_temperature - it's not supported in this config
# # If you need temperature sampling, you might need to use a different strategy
# # Apply the decoding configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
# [OUTPUT_FILE],
# batch_size=1,
# num_workers=0
# )
# print("📝 Transcription:", transcription[0])
# import sounddevice as sd
# import scipy.io.wavfile as wav
# import nemo.collections.asr as nemo_asr
# # ===== SETTINGS =====
# SAMPLE_RATE = 16000
# DURATION = 10
# OUTPUT_FILE = "arabic_recording.wav"
# # ===== STEP 1: Record audio =====
# print("🎙️ Recording... Speak Arabic now!")
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
# sd.wait()
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# # ===== STEP 2: Load ASR model =====
# print("📥 Loading Arabic ASR model...")
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
# )
# # ===== STEP 3: Configure for LITERAL transcription =====
# print("🔍 Configuring greedy decoding for literal output...")
# decoding_cfg = asr_model.cfg.decoding
# decoding_cfg.strategy = "greedy"
# # CRITICAL: Increase max_symbols to avoid truncating repetitions
# # The default is only 10, which is very restrictive!
# decoding_cfg.greedy.max_symbols = 1000 # Allow much longer sequences
# decoding_cfg.beam.beam_size = 64
# decoding_cfg.beam.search_type = "beam"
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
# print("Updated config:", decoding_cfg)
# # Apply configuration
# asr_model.change_decoding_strategy(decoding_cfg)
# # ===== STEP 4: Transcribe =====
# print("🔍 Transcribing...")
# transcription = asr_model.transcribe(
# [OUTPUT_FILE],
# batch_size=1,
# num_workers=0
# )
# print("📝 Literal Transcription:", transcription[0])
import sounddevice as sd
import scipy.io.wavfile as wav
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
# ===== SETTINGS =====
SAMPLE_RATE = 16000  # Hz; 16 kHz mono is the rate the recording below is captured at
DURATION = 10  # recording length in seconds
OUTPUT_FILE = "arabic_recording.wav"  # WAV file written by the recording step and read by transcribe()
# ===== STEP 2: Load ASR model =====
# Absolute path to the fine-tuned NeMo checkpoint. Hoisted out of the call so it
# is easy to find and change when running on a different machine.
MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
print("📥 Loading Arabic ASR model...")
# Hybrid RNNT+CTC model restored from a local .nemo checkpoint.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)
# Add this right after loading the model to see what's actually available:
# (dumps the greedy-decoding sub-config so we know which knobs this checkpoint exposes)
print("Available greedy parameters:")
print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))
# ===== STEP 3: Configure for LITERAL transcription =====
print("🔍 Configuring greedy decoding for literal output...")
# Set struct mode to False temporarily to allow modifications
# (OmegaConf struct mode rejects assignment of keys not already in the schema).
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
decoding_cfg = asr_model.cfg.decoding
# NOTE(review): strategy is set to "maes" although the log message above says
# "greedy", and both greedy.* and beam.* knobs are configured below — only the
# settings matching the active strategy will take effect. Confirm which decoder
# is actually intended.
decoding_cfg.strategy = "maes"
# Now try setting the parameters
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    print("⚠ Could not set max_symbols_per_step")
decoding_cfg.greedy.max_symbols = 500  # allow long label sequences per timestep
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True
decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3
decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
decoding_cfg.beam.search_type = "beam"
print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")
# Re-enable struct mode
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)
# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)
# ===== STEP 1: Record audio =====
# Capture DURATION seconds of 16-bit mono audio from the default input device
# and persist it as a WAV file for the transcription step below.
print("🎙️ Recording... Speak Arabic now!")
num_frames = int(SAMPLE_RATE * DURATION)
audio = sd.rec(num_frames, samplerate=SAMPLE_RATE, channels=1, dtype='int16')
sd.wait()  # block until the capture buffer is full
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
# ===== STEP 4: Transcribe =====
print("🔍 Transcribing...")
# Run inference on the just-recorded WAV; num_workers=0 keeps data loading
# in-process (avoids multiprocessing issues on Windows).
transcription = asr_model.transcribe(
[OUTPUT_FILE],
batch_size=1,
num_workers=0
)
# NOTE(review): depending on the NeMo version, transcribe() may return plain
# strings or Hypothesis objects — confirm transcription[0] prints as expected.
print("📝 Literal Transcription:", transcription[0])