Spaces:

hetchyy
/

Quran-multi-aligner

Running on Zero

App Files Files Community

Quran-multi-aligner / src /ui /interface.py

hetchyy

perf: eliminate double round-trip delay on audio input

681991f verified about 15 hours ago

raw

history blame contribute delete

13.8 kB

	"""Gradio UI — layout orchestrator."""
	import json
	from pathlib import Path
	from types import SimpleNamespace

	import gradio as gr

	from config import (
	DELETE_CACHE_FREQUENCY, DELETE_CACHE_AGE,
	DEV_TAB_VISIBLE,
	ANIM_WORD_COLOR, ANIM_STYLE_ROW_SCALES,
	ANIM_DISPLAY_MODES, ANIM_DISPLAY_MODE_DEFAULT,
	ANIM_OPACITY_PREV_DEFAULT, ANIM_OPACITY_AFTER_DEFAULT, ANIM_OPACITY_STEP,
	ANIM_PRESETS,
	ANIM_GRANULARITIES, ANIM_GRANULARITY_DEFAULT,
	ANIM_WINDOW_PREV_DEFAULT, ANIM_WINDOW_AFTER_DEFAULT,
	ANIM_WINDOW_PREV_MIN, ANIM_WINDOW_PREV_MAX,
	ANIM_WINDOW_AFTER_MIN, ANIM_WINDOW_AFTER_MAX,
	MEGA_WORD_SPACING_MIN, MEGA_WORD_SPACING_MAX, MEGA_WORD_SPACING_STEP, MEGA_WORD_SPACING_DEFAULT,
	MEGA_TEXT_SIZE_MIN, MEGA_TEXT_SIZE_MAX, MEGA_TEXT_SIZE_STEP, MEGA_TEXT_SIZE_DEFAULT,
	MEGA_LINE_SPACING_MIN, MEGA_LINE_SPACING_MAX, MEGA_LINE_SPACING_STEP, MEGA_LINE_SPACING_DEFAULT,
	LEFT_COLUMN_SCALE, RIGHT_COLUMN_SCALE,
	DEFAULT_INPUT_MODE,
	)
	from src.ui.styles import build_css
	from src.ui.js_config import build_js_head
	from src.ui.handlers import create_segmentation_settings
	from src.ui.event_wiring import wire_events

	# Load the surah name ligature map from the "data" directory three levels
	# above this module. JSON text is UTF-8, so decode it explicitly instead of
	# depending on the platform's locale encoding.
	with open(Path(__file__).parent.parent.parent / "data" / "ligatures.json", encoding="utf-8") as _f:
	    _SURAH_LIGATURES = json.load(_f)


	def build_interface():
	    """Build the Gradio interface.

	    Creates the ``gr.Blocks`` app, renders the intro text and the two
	    documentation accordions, lays out the left/right columns (inside a
	    "Results" tab next to a "Dev" tab when ``DEV_TAB_VISIBLE`` is truthy),
	    registers the state and hidden API components on a shared namespace, and
	    passes the app and that namespace to ``wire_events``.

	    The Markdown docs are decoded as UTF-8 explicitly so that loading them
	    does not depend on the host's locale encoding.

	    Returns:
	        The constructed ``gr.Blocks`` application.
	    """
	    c = SimpleNamespace()
	    css = build_css()
	    js = build_js_head(_SURAH_LIGATURES)
	    # Bundled Markdown documents live in "docs", three levels above this module.
	    docs_dir = Path(__file__).parent.parent.parent / "docs"

	    with gr.Blocks(title="Quran Multi-Aligner", css=css, head=js, delete_cache=(DELETE_CACHE_FREQUENCY, DELETE_CACHE_AGE)) as app:
	        gr.Markdown("# \U0001f399\ufe0f Quran Multi-Aligner")
	        gr.Markdown("""
	        - Transcribe and split any recitation by pauses within 1-2 minutes
	        - Get precise pause-, verse-, word- and character-level timestamps, exportable as JSON
	        - GPU-powered <a href="https://github.com/Wider-Community/quranic-universal-audio/blob/main/quranic_universal_aligner/docs/client_api.md" target="_blank">API usage</a> with daily quotas, and unlimited CPU usage
	        - Reliable confidence system to flag uncertain segments and missed words — no silent errors
	        - Robust tolerance to noise, speaker variation and low audio quality, particularly with the large model
	        - <a href="https://github.com/Wider-Community/quranic-universal-audio/issues" target="_blank">Feedback/contributions are welcome</a>
	        """)

	        # API Documentation accordion
	        _api_doc = (docs_dir / "client_api.md").read_text(encoding="utf-8")
	        with gr.Accordion("\U0001f4e1 API Usage", open=False):
	            gr.Markdown(_api_doc)

	        # Changelog accordion
	        _changelog = (docs_dir / "CHANGELOG.md").read_text(encoding="utf-8")
	        with gr.Accordion("📋 Changelog", open=False):
	            gr.Markdown(_changelog)

	        if DEV_TAB_VISIBLE:
	            # Dev mode: the main row moves into a "Results" tab beside "Dev".
	            with gr.Tabs():
	                with gr.Tab("Results"):
	                    with gr.Row(elem_id="main-row"):
	                        _build_left_column(c)
	                        _build_right_column(c)
	                with gr.Tab("Dev"):
	                    _build_dev_tab(c)
	        else:
	            with gr.Row(elem_id="main-row"):
	                _build_left_column(c)
	                _build_right_column(c)

	        # State components for caching VAD data between runs
	        c.cached_speech_intervals = gr.State(value=None)
	        c.cached_is_complete = gr.State(value=None)
	        c.cached_audio = gr.State(value=None)
	        c.cached_sample_rate = gr.State(value=None)
	        c.cached_intervals = gr.State(value=None)
	        c.cached_model_name = gr.State(value=None)
	        c.cached_segment_dir = gr.State(value=None)
	        c.cached_log_row = gr.State(value=None)
	        c.is_preset = gr.State(value=False)
	        c.resegment_panel_visible = gr.State(value=False)

	        # Session API components (hidden, API-only)
	        c.api_audio = gr.Audio(visible=False, type="numpy")
	        c.api_audio_id = gr.Textbox(visible=False)
	        c.api_silence = gr.Number(visible=False, precision=0)
	        c.api_speech = gr.Number(visible=False, precision=0)
	        c.api_pad = gr.Number(visible=False, precision=0)
	        c.api_model = gr.Textbox(visible=False)
	        c.api_device = gr.Textbox(visible=False)
	        c.api_timestamps = gr.JSON(visible=False)
	        c.api_mfa_segments = gr.JSON(visible=False)
	        c.api_mfa_granularity = gr.Textbox(visible=False)
	        c.api_estimate_endpoint = gr.Textbox(visible=False)
	        c.api_estimate_audio_duration = gr.Number(visible=False)
	        c.api_url = gr.Textbox(visible=False)
	        c.api_result = gr.JSON(visible=False)

	        # Wire events last, once every component exists on ``c``.
	        wire_events(app, c)

	    return app


	def _build_left_column(c):
	    """Build the left input column.

	    Creates the input-mode toggle buttons, the Link / Upload / Record panels
	    (only the one matching ``DEFAULT_INPUT_MODE`` starts visible), the
	    animation settings, the model and segmentation settings accordions, and
	    the JSON download slot. Every component is stored as an attribute on ``c``.

	    Args:
	        c: Namespace object that receives the created components.
	    """
	    with gr.Column(scale=LEFT_COLUMN_SCALE, elem_id="left-col"):
	        _is_link = DEFAULT_INPUT_MODE == "Link"
	        _is_upload = DEFAULT_INPUT_MODE == "Upload"
	        _is_record = DEFAULT_INPUT_MODE == "Record"

	        # Input mode toggle; the default mode's button gets the "mode-active" class
	        with gr.Row(elem_id="input-mode-row"):
	            c.mode_link = gr.Button("Link", size="sm", min_width=0,
	                                    elem_classes=["mode-active"] if _is_link else [])
	            c.mode_upload = gr.Button("Upload", size="sm", min_width=0,
	                                      elem_classes=["mode-active"] if _is_upload else [])
	            c.mode_record = gr.Button("Record", size="sm", min_width=0,
	                                      elem_classes=["mode-active"] if _is_record else [])

	        # Link panel
	        with gr.Column(visible=_is_link, elem_id="link-panel") as c.link_panel:
	            c.url_input = gr.Textbox(
	                label="Paste a link",
	                info='e.g. TikTok · SoundCloud · [MP3Quran](https://www.mp3quran.net/) · [all supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)',
	                lines=1,
	            )
	            c.url_download_btn = gr.Button("Download", size="sm", variant="secondary", interactive=False)
	            c.url_audio_player = gr.Audio(label="Downloaded Audio", visible=False, interactive=False)

	        # Upload panel
	        with gr.Column(visible=_is_upload, elem_id="upload-panel") as c.upload_panel:
	            with gr.Row(elem_id="example-row"):
	                c.btn_ex_112 = gr.Button("112", size="sm", min_width=0)
	                c.btn_ex_84 = gr.Button("84", size="sm", min_width=0)
	                c.btn_ex_7 = gr.Button("7", size="sm", min_width=0)
	                c.btn_ex_juz30 = gr.Button("Juz' 30", size="sm", min_width=0)
	            c.audio_upload = gr.Audio(label="Upload Recitation", sources=["upload"], type="filepath")

	        # Record panel
	        with gr.Column(visible=_is_record, elem_id="record-panel") as c.record_panel:
	            c.audio_record = gr.Audio(label="Record Recitation", sources=["microphone"], type="filepath")

	        # Hidden unified audio state (fed by upload, record, or URL download)
	        # gr.State avoids cascading .change events that gr.Audio would fire
	        c.audio_input = gr.State(value=None)

	        _build_animation_settings(c)

	        c.anim_cached_settings = gr.JSON(value=None, visible=False)
	        with gr.Accordion("Model Settings", open=True) as c.model_accordion:
	            with gr.Row():
	                c.model_radio = gr.Radio(
	                    choices=["Base", "Large"],
	                    value="Base",
	                    label="ASR Model",
	                    info="Large: more robust to noisy/non-studio recitations but slower"
	                )
	                c.device_radio = gr.Radio(
	                    choices=["GPU", "CPU"],
	                    value="GPU",
	                    label="Device",
	                    info="Daily GPU usage limits. Unlimited CPU usage but slower"
	                )

	        with gr.Accordion("Segmentation Settings", open=True) as c.seg_accordion:
	            c.min_silence_slider, c.min_speech_slider, c.pad_slider, \
	                c.preset_mujawwad, c.preset_murattal, c.preset_fast = create_segmentation_settings()

	        # JSON download appears here after extraction
	        c.export_file = gr.File(label="\U0001f4e5 Download JSON", visible=True, interactive=False)


	def _build_animation_settings(c):
	    """Create the collapsed "Animation Settings" accordion and its controls.

	    Adds a style row (granularity, animation style, verse-only checkbox,
	    colour), two rows of opacity / word-window sliders whose initial values
	    come from the preset of the default display mode, and a row of text
	    styling sliders. Each control is stored on ``c``.

	    Args:
	        c: Namespace object that receives the created components.
	    """
	    # The opacity and window sliders are editable only when the default
	    # display mode is "Custom"; otherwise they start from that mode's preset.
	    custom_mode = ANIM_DISPLAY_MODE_DEFAULT == "Custom"
	    preset = ANIM_PRESETS.get(ANIM_DISPLAY_MODE_DEFAULT, {})

	    opacity_kwargs = dict(minimum=0, maximum=1, step=ANIM_OPACITY_STEP, interactive=custom_mode)
	    window_kwargs = dict(step=1, interactive=custom_mode)

	    with gr.Accordion("Animation Settings", open=False, elem_id="anim-settings-accordion"):
	        # Row 1: what to animate and how it looks
	        with gr.Row(elem_id="anim-style-row"):
	            c.anim_granularity_radio = gr.Radio(
	                label="Granularity",
	                choices=ANIM_GRANULARITIES,
	                value=ANIM_GRANULARITY_DEFAULT,
	                scale=ANIM_STYLE_ROW_SCALES[0],
	            )
	            c.anim_mode_radio = gr.Radio(
	                label="Animation Style",
	                choices=ANIM_DISPLAY_MODES,
	                value=ANIM_DISPLAY_MODE_DEFAULT,
	                scale=ANIM_STYLE_ROW_SCALES[1],
	            )
	            c.anim_verse_checkbox = gr.Checkbox(
	                label="Verse Only",
	                value=False,
	                elem_id="anim-verse-mode",
	                scale=ANIM_STYLE_ROW_SCALES[2],
	                min_width=90,
	            )
	            c.anim_color_picker = gr.ColorPicker(
	                label="Color",
	                value=ANIM_WORD_COLOR,
	                scale=ANIM_STYLE_ROW_SCALES[3],
	            )

	        # Row 2: opacity before / after the active word
	        with gr.Row():
	            c.anim_opacity_prev_slider = gr.Slider(
	                label="Before Opacity",
	                value=preset.get("prev_opacity", ANIM_OPACITY_PREV_DEFAULT),
	                elem_id="anim-opacity-prev",
	                **opacity_kwargs,
	            )
	            c.anim_opacity_after_slider = gr.Slider(
	                label="After Opacity",
	                value=preset.get("after_opacity", ANIM_OPACITY_AFTER_DEFAULT),
	                elem_id="anim-opacity-after",
	                **opacity_kwargs,
	            )

	        # Row 3: number of words before / after
	        with gr.Row():
	            c.anim_window_prev_slider = gr.Slider(
	                label="Before Words",
	                minimum=ANIM_WINDOW_PREV_MIN,
	                maximum=ANIM_WINDOW_PREV_MAX,
	                value=preset.get("prev_words", ANIM_WINDOW_PREV_DEFAULT),
	                elem_id="anim-window-prev",
	                **window_kwargs,
	            )
	            c.anim_window_after_slider = gr.Slider(
	                label="After Words",
	                minimum=ANIM_WINDOW_AFTER_MIN,
	                maximum=ANIM_WINDOW_AFTER_MAX,
	                value=preset.get("after_words", ANIM_WINDOW_AFTER_DEFAULT),
	                elem_id="anim-window-after",
	                **window_kwargs,
	            )

	        # Row 4: text styling
	        with gr.Row(elem_id="mega-styling-row"):
	            c.anim_word_spacing_slider = gr.Slider(
	                label="Word Spacing",
	                minimum=MEGA_WORD_SPACING_MIN,
	                maximum=MEGA_WORD_SPACING_MAX,
	                step=MEGA_WORD_SPACING_STEP,
	                value=MEGA_WORD_SPACING_DEFAULT,
	                elem_id="anim-word-spacing",
	            )
	            c.anim_text_size_slider = gr.Slider(
	                label="Text Size",
	                minimum=MEGA_TEXT_SIZE_MIN,
	                maximum=MEGA_TEXT_SIZE_MAX,
	                step=MEGA_TEXT_SIZE_STEP,
	                value=MEGA_TEXT_SIZE_DEFAULT,
	                elem_id="anim-text-size",
	            )
	            c.anim_line_spacing_slider = gr.Slider(
	                label="Line Spacing",
	                minimum=MEGA_LINE_SPACING_MIN,
	                maximum=MEGA_LINE_SPACING_MAX,
	                step=MEGA_LINE_SPACING_STEP,
	                value=MEGA_LINE_SPACING_DEFAULT,
	                elem_id="anim-line-spacing",
	            )


	def _build_right_column(c):
	    """Create the right-hand output column and fill it with the results UI.

	    Args:
	        c: Namespace object that receives the created components.
	    """
	    right_col = gr.Column(scale=RIGHT_COLUMN_SCALE)
	    with right_col:
	        _build_results_content(c)


	def _build_results_content(c):
	"""Build the main results content (extract/resegment/output).

	Creates the extract button, progress HTML slots, the resegment /
	retranscribe / compute-timestamps buttons, the resegment settings panel,
	and the HTML and JSON output components, storing each one on ``c``.

	Args:
	c: Namespace object that receives the created components.
	"""
	# NOTE(review): indentation of this block was lost in this copy of the file,
	# so the exact contents of each ``with`` container must be confirmed against
	# the repository version before relying on the layout.
	# Created non-interactive; nothing in this function enables it.
	c.extract_btn = gr.Button("Extract Segments", variant="secondary", size="lg", interactive=False)
	c.pipeline_progress = gr.HTML(value="", visible=False)
	# Follow-up actions, both created hidden.
	with gr.Row(elem_id="action-btns-row"):
	c.resegment_toggle_btn = gr.Button(
	"Resegment with New Settings", variant="primary", size="lg", visible=False
	)
	c.retranscribe_btn = gr.Button(
	"Retranscribe with Large Model", variant="primary", size="lg", visible=False
	)
	with gr.Row(elem_id="ts-row"):
	c.compute_ts_btn = gr.Button(
	"Compute Timestamps", variant="secondary", size="lg", interactive=False, visible=False
	)
	c.compute_ts_progress = gr.HTML(value="", visible=False)
	c.animate_all_html = gr.HTML(value="", visible=False)

	# Resegment panel, created hidden; its controls come from the same
	# create_segmentation_settings factory, called here with id_suffix="-rs".
	with gr.Column(visible=False) as c.resegment_panel:
	gr.Markdown(
	"Uses cached data, skipping the heavy computation, "
	"so it's much faster. Useful if results are over-segmented "
	"or under-segmented"
	)
	c.rs_silence, c.rs_speech, c.rs_pad, \
	c.rs_btn_muj, c.rs_btn_mur, c.rs_btn_fast = create_segmentation_settings(id_suffix="-rs")
	c.resegment_btn = gr.Button("Resegment", variant="primary", size="lg")

	# Output area; starts with a placeholder prompt.
	c.output_html = gr.HTML(
	value='<div style="text-align: center; color: #666; padding: 60px;">Upload audio and click "Extract Segments" to begin</div>',
	elem_classes=["output-html"]
	)
	# Hidden JSON output for API consumers
	c.output_json = gr.JSON(visible=False, label="JSON Output")


	def _build_dev_tab(c):
	    """Populate the Dev tab by delegating to ``src.ui.dev_tools``.

	    The import is done inside the function, so ``dev_tools`` is loaded only
	    when this builder is actually called.

	    Args:
	        c: Namespace object passed through to ``build_dev_tab_ui``.
	    """
	    from src.ui.dev_tools import build_dev_tab_ui as _render_dev_tab

	    _render_dev_tab(c)