ipa-recognizer

Sleeping

App Files Files Community

ipa-recognizer / app.py

gagndeep

Update app.py

3b991b8 verified 5 months ago

Raw

History Blame Contribute Delete

3.99 kB

	import gradio as gr
	import torch
	import shutil
	from pathlib import Path
	from huggingface_hub import hf_hub_download
	import spaces


	# NeMo is an optional import here: when it is missing, ASRModel is set to
	# None so the module still imports, and setup_model() raises a clear error.
	try:
	    from nemo.collections.asr.models import ASRModel
	except ImportError:
	    ASRModel = None
	    print("Warning: NeMo toolkit not found. Please ensure it is installed via requirements.txt")

	def setup_model():
	    """Download, assemble and return the fine-tuned IPA phoneme recognizer.

	    Steps, in order:
	      1. Download the checkpoint and tokenizer files from the Hugging Face Hub.
	      2. Copy the tokenizer files into a local ``tokenizer`` directory.
	      3. Load the base NeMo ASR model and switch it to that tokenizer.
	      4. Overlay the fine-tuned weights from the checkpoint.
	      5. Adjust the greedy decoding config and the encoder attention context.
	      6. Move the model to CUDA when available (CPU otherwise) and set eval mode.

	    Returns:
	        The configured model, in eval mode, on the selected device.

	    Raises:
	        ImportError: If NeMo could not be imported (``ASRModel`` is ``None``).
	    """
	    if ASRModel is None:
	        raise ImportError("NeMo toolkit is required but not installed.")

	    repo_id = "boldvoice/nemotron-phoneme-ipa-v1"

	    print("Downloading model files...")
	    checkpoint_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.ckpt")
	    tokenizer_model = hf_hub_download(repo_id=repo_id, filename="tokenizer/tokenizer.model")
	    vocab_txt = hf_hub_download(repo_id=repo_id, filename="tokenizer/vocab.txt")

	    print("Setting up tokenizer...")
	    # change_vocabulary() is given a directory, so gather both tokenizer
	    # files side by side in a local folder.
	    tokenizer_dir = Path("tokenizer")
	    tokenizer_dir.mkdir(exist_ok=True)
	    shutil.copy(tokenizer_model, tokenizer_dir / "tokenizer.model")
	    shutil.copy(vocab_txt, tokenizer_dir / "vocab.txt")

	    print("Loading base model...")
	    # Load base model and change vocabulary
	    model = ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
	    model.change_vocabulary(new_tokenizer_dir=str(tokenizer_dir), new_tokenizer_type="bpe")

	    print("Loading fine-tuned weights...")
	    # Using weights_only=False as per model card instructions.
	    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so
	    # this is only safe while the checkpoint repository is trusted.
	    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
	    state_dict = checkpoint.get("state_dict", checkpoint)

	    # Drop a single leading "model." from checkpoint keys; other keys are
	    # kept unchanged.
	    prefix = "model."
	    cleaned_state_dict = {
	        (key[len(prefix):] if key.startswith(prefix) else key): value
	        for key, value in state_dict.items()
	    }

	    # strict=False tolerates key mismatches; report them so a checkpoint that
	    # does not line up with the model is not silently ignored.
	    load_result = model.load_state_dict(cleaned_state_dict, strict=False)
	    missing_keys = getattr(load_result, "missing_keys", None) or []
	    unexpected_keys = getattr(load_result, "unexpected_keys", None) or []
	    if missing_keys:
	        print(f"Warning: {len(missing_keys)} model parameter(s) were not found in the checkpoint.")
	    if unexpected_keys:
	        print(f"Warning: {len(unexpected_keys)} checkpoint entry(ies) did not match any model parameter.")

	    # Disable CUDA graphs for inference
	    if getattr(model, "decoding", None) is not None:
	        decoding_cfg = model.cfg.decoding
	        decoding_cfg.greedy.loop_labels = False
	        decoding_cfg.greedy.use_cuda_graph_decoder = False
	        model.change_decoding_strategy(decoding_cfg)

	    # Configure streaming latency (Lowest latency option as default)
	    model.encoder.set_default_att_context_size([70, 0])

	    # Move to GPU when one is available and set eval mode
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)
	    model.eval()

	    return model

	# Global model variable: stays None until setup_model() succeeds, either at
	# startup in the __main__ block or lazily on the first predict() call.
	model = None

	@spaces.GPU
	def predict(audio_file):
	    """Transcribe one audio file into an IPA string.

	    Args:
	        audio_file: Path of the audio file to transcribe (the Gradio input
	            is declared with ``type="filepath"``). A falsy value means no
	            audio was provided.

	    Returns:
	        The transcription with SentencePiece "▁" markers removed and
	        surrounding whitespace stripped; an empty string when there is no
	        input or no prediction; or a message starting with
	        "Error during transcription:" when transcription raises.
	    """
	    global model

	    # Nothing to transcribe: return before paying for a model download/load.
	    if not audio_file:
	        return ""

	    # Lazy load, in case the startup load did not run or failed.
	    if model is None:
	        model = setup_model()

	    try:
	        # Transcribe audio
	        predictions = model.transcribe([audio_file])
	        if not predictions:
	            return ""

	        # Entries may expose the transcript as `.text`; otherwise fall back
	        # to the entry's string form.
	        first = predictions[0]
	        text = first.text if hasattr(first, "text") else str(first)

	        # Remove SentencePiece artifacts
	        return text.replace("▁", "").strip()
	    except Exception as e:
	        # Return the failure as text so it is shown in the output box.
	        return f"Error during transcription: {e}"

	# Gradio UI: one audio input, passed to predict() as a file path, and one
	# text box that shows the returned IPA transcription.
	iface = gr.Interface(
	    title="Nemotron Phoneme IPA Recognition",
	    description="Nemotron-based phoneme recognition model fine-tuned for raw IPA transcription with full diacritics preserved. Based on boldvoice/nemotron-phoneme-ipa-v1.",
	    fn=predict,
	    inputs=gr.Audio(type="filepath", label="Input Audio"),
	    outputs=gr.Textbox(label="IPA Transcription"),
	)

	if __name__ == "__main__":
	    # Try to load the model before serving. A failure here is not fatal:
	    # `model` stays None and predict() attempts the load on its first call.
	    try:
	        model = setup_model()
	    except Exception as load_error:
	        print(f"Model loading will happen on first request. Error: {load_error}")
	    else:
	        print("Model loaded successfully.")

	    # Serve on all interfaces, port 7860, without a public share link.
	    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)