# PhoneticXeus / app.py
# Author: Shikhar
# Enable interctc_use_conditioning for clean checkpoint load (commit a5801f3)
import os
import sys
# Ensure vendored src/ is importable
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import gradio as gr
import torch
import torchaudio
import soundfile as sf
from huggingface_hub import hf_hub_download
from src.model.xeusphoneme.builders import build_xeus_pr_inference
# Hard cap on input length in seconds; longer audio is truncated in transcribe().
MAX_SECONDS = 60
# Sampling rate the model expects; inputs at other rates are resampled to this.
SAMPLE_RATE = 16000
# Lazily-built inference callable; populated on the first transcribe() call so
# the checkpoint download does not block app startup.
inference = None
def load_model():
    """Download the checkpoint from the Hub and build a CPU inference object.

    Returns the callable produced by ``build_xeus_pr_inference``, wired to the
    vendored vocabulary and model-config files under
    ``src/model/xeusphoneme/resources``.
    """
    checkpoint_path = hf_hub_download(
        "changelinglab/PhoneticXeus", "checkpoint-22000.ckpt"
    )
    resource_dir = os.path.join(
        os.path.dirname(__file__),
        "src", "model", "xeusphoneme", "resources",
    )
    return build_xeus_pr_inference(
        work_dir="/tmp/cache/xeus",
        checkpoint=checkpoint_path,
        vocab_file=os.path.join(resource_dir, "ipa_vocab.json"),
        config_file=os.path.join(resource_dir, "xeus_config.yaml"),
        device="cpu",
        interctc_use_conditioning=True,
    )
def _prepare_waveform(audio_path):
    """Load *audio_path* as a mono float32 tensor at SAMPLE_RATE.

    Multi-channel audio is averaged to mono; the result is truncated to
    MAX_SECONDS *after* resampling so the cap is exact in model samples.
    """
    data, sr = sf.read(audio_path, dtype="float32")
    waveform = torch.from_numpy(data)
    # soundfile returns (frames, channels) for multi-channel input.
    if waveform.dim() == 2:
        waveform = waveform.mean(dim=1)
    if sr != SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
    return waveform[: SAMPLE_RATE * MAX_SECONDS]


def _format_phones(predicted):
    """Turn a '/'-delimited transcript into space-separated IPA phones.

    Drops special tokens like "<blank>" AND empty tokens produced by leading,
    trailing, or doubled '/' delimiters (previously empty tokens were kept,
    which left stray spaces in the output).
    """
    return " ".join(
        token
        for token in predicted.split("/")
        if token and not (token.startswith("<") and token.endswith(">"))
    )


def transcribe(audio_path):
    """Run phone recognition on uploaded/recorded audio.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path handed over by the Gradio Audio component, or None
        when no audio was provided.

    Returns
    -------
    tuple[str, str]
        (space-separated IPA phones, raw processed transcript); both empty
        when there is no usable audio.
    """
    global inference
    if audio_path is None:
        return "", ""
    # Build the model lazily on first use to keep startup fast.
    if inference is None:
        inference = load_model()
    waveform = _prepare_waveform(audio_path)
    if waveform.numel() == 0:
        return "", ""
    results = inference(waveform)
    processed = results[0]["processed_transcript"]
    predicted = results[0]["predicted_transcript"]
    return _format_phones(predicted), processed
# Gradio UI: one input row (mic/upload + button), one output row (two copyable
# text boxes), plus header and footer notes. `demo` is the conventional name
# Hugging Face Spaces looks for.
with gr.Blocks(title="PhoneticXeus") as demo:
    gr.Markdown(
        "# PhoneticXeus\n"
        "Multilingual phone recognition -- record or upload the multilingual speech "
        "to get an IPA transcription.\n\n"
        "Model: [changelinglab/PhoneticXeus]"
        "(https://huggingface.co/changelinglab/PhoneticXeus) "
        "| Paper: [arXiv 2603.29042]"
        "(https://arxiv.org/abs/2603.29042)"
    )

    with gr.Row():
        mic_or_file = gr.Audio(
            label="Input Audio",
            type="filepath",
            sources=["microphone", "upload"],
        )
        run_button = gr.Button("Transcribe", variant="primary")

    with gr.Row():
        spaced_box = gr.Textbox(
            label="IPA Phones (space-separated)",
            show_copy_button=True,
            lines=3,
        )
        concat_box = gr.Textbox(
            label="Raw output (concatenated)",
            show_copy_button=True,
            lines=3,
        )

    run_button.click(
        fn=transcribe,
        inputs=[mic_or_file],
        outputs=[spaced_box, concat_box],
    )

    gr.Markdown(
        f"---\nMax audio length: {MAX_SECONDS}s. Audio is resampled to 16 kHz mono."
    )

demo.launch()