tts-evaluation

Sleeping

App Files Files Community

tts-evaluation / app.py

asuni

Upload app.py

367255e verified 6 months ago

raw

history blame contribute delete

14.4 kB

	import gradio as gr
	import os
	import csv
	import fcntl
	from datetime import datetime

	# --- Start of Local Mode Implementation ---
	# Setting GRADIO_LOCAL_MODE to "true" or "1" (case-insensitive) switches the app
	# into a local-only mode, e.g.: GRADIO_LOCAL_MODE=true python your_script.py
	IS_LOCAL_MODE = os.environ.get("GRADIO_LOCAL_MODE", "false").lower() in ("true", "1")

	# Both names stay None unless the optional imports below succeed; None is the
	# marker for "Hugging Face push is not available".
	create_repo = None
	Dataset = None

	if IS_LOCAL_MODE:
	    print("Running in LOCAL mode. Hugging Face functionalities are disabled.")
	else:
	    # Optional: Hugging Face dataset push
	    try:
	        from huggingface_hub import create_repo
	        from datasets import Dataset
	    except ImportError:
	        # The first import may already have bound create_repo; reset both so
	        # the two names are always available (or unavailable) together.
	        create_repo = None
	        Dataset = None
	        print("Hugging Face libraries not found. HF push functionality will be disabled.")
	    else:
	        print("Hugging Face libraries found. HF push functionality is available.")
	# --- End of Local Mode Implementation ---


	# Configuration
	# Directory that is scanned for the audio samples to be rated.
	SAMPLES_DIR = "sample-audios"
	# CSV file to which every submitted rating is appended (relative to the
	# process working directory).
	OUTPUT_CSV = "responses.csv"

	# Detailed explanations for each criterion (1..5 scale).
	# Every criterion lists its five descriptions from the best score (5) down to
	# the worst (1); they are paired with the integer scores below so the result is
	# a {criterion: {score: description}} mapping.
	_SCORES_BEST_FIRST = (5, 4, 3, 2, 1)

	CRITERIA_EXPLANATIONS = {
	    criterion: dict(zip(_SCORES_BEST_FIRST, descriptions))
	    for criterion, descriptions in (
	        (
	            "Clarity & Intelligibility",
	            (
	                "Speech is clear, easy to understand (at all speeds).",
	                "Mostly clear, minor issues (with fast/slow playback).",
	                "Understandable but requires effort; some words unclear.",
	                "Often unclear or distorted; difficult to follow.",
	                "Unacceptable.",
	            ),
	        ),
	        (
	            "Accent & Pronunciation",
	            (
	                "Pronunciation is natural and appropriate for the target dialect.",
	                "Minor pronunciation quirks but overall fine.",
	                "Some mispronunciations that require effort to interpret.",
	                "Frequent pronunciation issues that impede understanding.",
	                "Severe pronunciation problems; largely unintelligible.",
	            ),
	        ),
	        (
	            "Tone & Suitability",
	            (
	                "Tone fits the content and use-case perfectly.",
	                "Generally appropriate tone with small mismatches.",
	                "Tone is acceptable but occasionally inappropriate.",
	                "Tone often feels off or distracting from the content.",
	                "Tone is inappropriate or harmful for the content.",
	            ),
	        ),
	        (
	            "Voice quality",
	            (
	                "Natural, pleasant voice with no artifacts.",
	                "Minor artifacts but overall high quality.",
	                "Noticeable quality issues but still usable.",
	                "Poor quality with frequent artifacts.",
	                "Unusable voice quality.",
	            ),
	        ),
	        (
	            "Customization & Flexibility",
	            (
	                "Highly flexible and customizable for different styles.",
	                "Some customization available; works well for most cases.",
	                "Limited customization; acceptable for simple use-cases.",
	                "Very limited or brittle customization options.",
	                "No useful customization; inflexible.",
	            ),
	        ),
	        (
	            "Listening comfort",
	            (
	                "Comfortable to listen to for extended periods.",
	                "Mostly comfortable with occasional sharpness or fatigue.",
	                "Some listening fatigue; tolerable for short durations.",
	                "Often fatiguing or distracting to listen to.",
	                "Uncomfortable or painful to listen to.",
	            ),
	        ),
	    )
	}


	def list_samples():
	# Return sorted list of audio filenames
	if not os.path.isdir(SAMPLES_DIR):
	return []
	files = [f for f in os.listdir(SAMPLES_DIR) if f.lower().endswith(('.wav', '.mp3', '.ogg', '.flac'))]
	files.sort()
	return files


	def save_response(sample, system_path, annotator, clarity, accent, tone, voice_quality, customization, comfort, comment, session_id=None, user_email=None):
	    """Append one rating to the responses CSV and forward it to Hugging Face.

	    Args:
	        sample: File name of the rated sample.
	        system_path: Path of the system output that was rated.
	        annotator: Annotator identifier as entered in the UI.
	        clarity, accent, tone, voice_quality, customization, comfort: The six
	            scores, written to the CSV as given.
	        comment: Free-text comment.
	        session_id: Optional session identifier; stored as "" when falsy.
	        user_email: Optional e-mail address; stored as "" when falsy.

	    Returns:
	        ``{"status": "saved", "sample": sample, "hf": hf_result}`` where
	        ``hf_result`` is ``None`` in local mode, otherwise the dict returned by
	        ``save_responses_to_hf`` or an ``hf_error`` dict if that call raised.

	    Raises:
	        OSError: If the CSV file cannot be created, opened or written.
	    """
	    # Function-scope import so the module's top-level import block is untouched.
	    from datetime import timezone

	    os.makedirs(os.path.dirname(OUTPUT_CSV) or '.', exist_ok=True)
	    header = [
	        "timestamp",
	        "sample",
	        "system_path",
	        "annotator",
	        "session_id",
	        "user_email",
	        "clarity",
	        "accent",
	        "tone",
	        "voice_quality",
	        "customization",
	        "comfort",
	        "comment",
	    ]
	    row = [
	        # Naive UTC timestamp, same text format as the deprecated
	        # datetime.utcnow().isoformat() so existing rows stay comparable.
	        datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
	        sample,
	        system_path,
	        annotator,
	        session_id or "",
	        user_email or "",
	        clarity,
	        accent,
	        tone,
	        voice_quality,
	        customization,
	        comfort,
	        comment,
	    ]

	    # Append under an advisory lock.
	    with open(OUTPUT_CSV, "a", newline='', encoding='utf-8') as f:
	        locked = False
	        try:
	            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
	            locked = True
	        except OSError as lock_err:
	            # Locking is best-effort: still record the rating, but say so.
	            print(f"Warning: could not lock {OUTPUT_CSV} ({lock_err}); writing without a lock.")
	        try:
	            # Decide about the header only once the lock is held, so two
	            # concurrent writers cannot both see "no file yet". An existing
	            # but empty file also gets its header this way.
	            f.seek(0, os.SEEK_END)
	            writer = csv.writer(f)
	            if f.tell() == 0:
	                writer.writerow(header)
	            writer.writerow(row)
	            # Push the buffered text to the OS before unlocking; otherwise the
	            # actual write would happen at close(), after the lock is gone.
	            f.flush()
	        finally:
	            if locked:
	                try:
	                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
	                except OSError:
	                    # Closing the file right after this releases the lock anyway.
	                    pass

	    hf_result = None
	    if not IS_LOCAL_MODE:
	        try:
	            # NOTE(review): only this single record is handed to
	            # save_responses_to_hf, not the accumulated CSV contents.
	            hf_record = dict(zip(header, row))
	            hf_result = save_responses_to_hf([hf_record])
	        except Exception as e:
	            # A failed upload must not lose the rating already saved locally.
	            hf_result = {"status": "hf_error", "error": str(e)}

	    return {"status": "saved", "sample": sample, "hf": hf_result}


	def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
	    """Push response rows to a private Hugging Face dataset repository.

	    Args:
	        rows: List of dicts, one per response, passed to ``Dataset.from_list``.
	        repo_id: Target dataset repo; falls back to the ``HF_DATASET_ID``
	            environment variable.
	        token: Access token; falls back to the ``HF_TOKEN`` environment
	            variable.

	    Returns:
	        A status dict. ``status`` is one of:
	        ``"hf_unavailable"`` (HF libraries missing or local mode),
	        ``"hf_skipped"`` (no token/repo configured, or ``rows`` is empty),
	        ``"hf_push_error"`` (the push raised; ``error`` holds the message), or
	        ``"hf_pushed"`` (success; includes ``rows`` count and ``repo``).
	        The last two also carry ``repo_error``: the message of a failed
	        ``create_repo`` call, or ``None``.
	    """
	    if create_repo is None or Dataset is None:
	        return {"status": "hf_unavailable", "reason": "missing_packages_or_local_mode"}

	    token = token or os.environ.get("HF_TOKEN")
	    repo_id = repo_id or os.environ.get("HF_DATASET_ID")
	    if not token or not repo_id:
	        return {"status": "hf_skipped", "reason": "missing_token_or_repo_env"}

	    if not rows:
	        # Nothing to upload; avoid pushing an empty dataset.
	        return {"status": "hf_skipped", "reason": "no_rows"}

	    # Repo creation is best-effort: the push below is still attempted and the
	    # creation error is only reported alongside its outcome.
	    repo_err = None
	    try:
	        create_repo(repo_id=repo_id, repo_type="dataset", token=token, private=True, exist_ok=True)
	    except Exception as e:
	        repo_err = str(e)

	    ds = Dataset.from_list(rows)
	    try:
	        # NOTE(review): push_to_hub appears to replace the target split rather
	        # than append to it, so rows pushed by earlier calls may be lost —
	        # confirm against the `datasets` documentation.
	        ds.push_to_hub(repo_id, token=token)
	    except Exception as e:
	        return {"status": "hf_push_error", "error": str(e), "repo_error": repo_err}

	    return {"status": "hf_pushed", "rows": len(rows), "repo": repo_id, "repo_error": repo_err}


	def make_ui():
	    """Build the Gradio Blocks app for rating TTS samples one at a time.

	    The page shows a reference clip and the matching system output, six 1-5
	    sliders, an optional comment, and a "Save & Next" button that stores the
	    rating through ``save_response`` and advances to the next sample. When all
	    samples are rated the rating widgets are hidden and a completion message
	    is shown.

	    Returns:
	        The constructed ``gr.Blocks`` instance (not launched).
	    """
	    criteria = [
	        "Clarity & Intelligibility",
	        "Accent & Pronunciation",
	        "Tone & Suitability",
	        "Voice quality",
	        "Customization & Flexibility",
	        "Listening comfort",
	    ]

	    def make_explainer_fn(crit):
	        """Return a slider callback describing the selected score for ``crit``."""
	        mapping = CRITERIA_EXPLANATIONS.get(crit, {})

	        def expl(val):
	            # Sliders may deliver the value as a float or string; the rubric is
	            # keyed by int, so normalise where possible.
	            try:
	                iv = int(val)
	            except (ValueError, TypeError):
	                iv = val
	            text = mapping.get(iv, "Select a score to see its meaning.")
	            return f"{crit} ({iv}/5): {text}"

	        return expl

	    def system_output_path(sample):
	        """Return the path of the system output that corresponds to ``sample``."""
	        return os.path.join("system-outputs", "system_a", sample)

	    with gr.Blocks() as demo:
	        samples_list = gr.State(list_samples())
	        current_index = gr.State(0)

	        gr.Markdown("# TTS Rubric — Compact Evaluation")

	        with gr.Accordion("Scoring guide & Annotator Info", open=False):
	            with gr.Row():
	                annotator_global = gr.Textbox(label="Annotator ID", lines=1, scale=1)
	                session_id_global = gr.Textbox(label="Session ID", lines=1, scale=1)
	                user_email_global = gr.Textbox(label="User email", lines=1, scale=1)
	            # Render the whole rubric as markdown, best score first.
	            guide_lines = []
	            for crit, mapping in CRITERIA_EXPLANATIONS.items():
	                guide_lines.append(f"### {crit}")
	                for score in sorted(mapping.keys(), reverse=True):
	                    guide_lines.append(f"- {score} points: {mapping[score]}")
	                guide_lines.append("")
	            guide_md = "\n".join(guide_lines)
	            gr.Markdown(guide_md)

	        progress_md = gr.Markdown("Sample 1 of X")

	        # Main evaluation layout
	        with gr.Row(equal_height=True):
	            # Left Column: audio players, save button and status
	            with gr.Column(scale=1):
	                sample_name_md = gr.Markdown("### Sample Filename")
	                reference_audio = gr.Audio(label="Reference audio")
	                system_audio = gr.Audio(label="Evaluation output")
	                reference_missing_md = gr.Markdown("(reference audio missing)", visible=False)
	                system_missing_md = gr.Markdown("(system output missing)", visible=False)
	                submit_btn = gr.Button("Save & Next", variant="primary", scale=1)
	                status = gr.Textbox(label="Status", interactive=False, scale=2)
	            # Right Column: All evaluation controls
	            with gr.Column(scale=1):
	                # Sliders stacked in a single column
	                with gr.Column():
	                    clarity = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[0], value=3)
	                    accent = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[1], value=3)
	                    tone = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[2], value=3)
	                    voice_quality = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[3], value=3)
	                    customization = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[4], value=3)
	                    comfort = gr.Slider(minimum=1, maximum=5, step=1, label=criteria[5], value=3)

	                # Single explanation box
	                slider_explanation_md = gr.Markdown("Select a score to see its meaning.")

	                comment = gr.Textbox(label="Comments (optional)", lines=2, value="")

	        with gr.Row():
	            export_btn = gr.Button("Export responses to CSV")
	            export_file = gr.File(label="Download responses.csv", interactive=False)

	        # --- LOGIC & EVENTS ---

	        # Components refreshed whenever a sample is loaded or saved. This list
	        # is the single source of truth for the order of event outputs.
	        ui_elements = [
	            progress_md, sample_name_md, reference_audio, reference_missing_md,
	            system_audio, system_missing_md, clarity, accent, tone, voice_quality,
	            customization, comfort, slider_explanation_md, comment, submit_btn, status
	        ]

	        def load_sample(samples, index):
	            """Return a {component: update} dict that displays ``samples[index]``."""
	            total_samples = len(samples)
	            if index >= total_samples:
	                # End of evaluation session: hide everything except the
	                # progress line and the status box.
	                completion_msg = f"All {total_samples} samples completed! You can close this window."
	                updates = {component: gr.update(visible=False) for component in ui_elements}
	                updates[progress_md] = gr.update(value=completion_msg)
	                updates[status] = gr.update(value="Finished.")
	                return updates

	            sample = samples[index]
	            sample_path = os.path.join(SAMPLES_DIR, sample)
	            sys_path = system_output_path(sample)
	            ref_exists = os.path.exists(sample_path)
	            sys_exists = os.path.exists(sys_path)

	            return {
	                progress_md: gr.update(value=f"Sample {index + 1} of {total_samples}"),
	                sample_name_md: gr.update(value=f"### {sample}", visible=True),
	                # Show either the player or its "missing" notice, never both.
	                reference_audio: gr.update(value=sample_path if ref_exists else None, visible=ref_exists),
	                reference_missing_md: gr.update(visible=not ref_exists),
	                system_audio: gr.update(value=sys_path if sys_exists else None, visible=sys_exists),
	                system_missing_md: gr.update(visible=not sys_exists),
	                # Reset every score to the neutral midpoint for the new sample.
	                clarity: gr.update(value=3), accent: gr.update(value=3), tone: gr.update(value=3),
	                voice_quality: gr.update(value=3), customization: gr.update(value=3), comfort: gr.update(value=3),
	                slider_explanation_md: gr.update(value="Select a score to see its meaning."),
	                comment: gr.update(value=""),
	                submit_btn: gr.update(visible=True),
	                status: gr.update(value="Ready."),
	            }

	        def save_and_next(index, samples, annotator, sid, email, cl, ac, to, vq, cu, co, comm):
	            """Persist the current rating, then return updates for the next sample."""
	            if index >= len(samples):
	                # Nothing left to rate; samples[index] would raise IndexError.
	                updates = load_sample(samples, index)
	                return [index] + [updates[component] for component in ui_elements]

	            sample = samples[index]
	            sys_path = system_output_path(sample)
	            save_status = save_response(sample, sys_path, annotator, cl, ac, to, vq, cu, co, comm, session_id=sid, user_email=email)

	            next_index = index + 1
	            updates = load_sample(samples, next_index)
	            # Report the outcome of the save instead of the default status text.
	            updates[status] = gr.update(value=str(save_status['status']))

	            # Emit the updates in the declared output order rather than relying
	            # on the insertion order of the dict built by load_sample.
	            return [next_index] + [updates[component] for component in ui_elements]

	        # Wire up slider explanations to the single markdown box
	        all_sliders = [clarity, accent, tone, voice_quality, customization, comfort]
	        for i, slider in enumerate(all_sliders):
	            slider.change(make_explainer_fn(criteria[i]), inputs=[slider], outputs=[slider_explanation_md])

	        # Initial load
	        demo.load(load_sample, inputs=[samples_list, current_index], outputs=ui_elements)

	        # Button click event
	        submit_btn.click(
	            save_and_next,
	            inputs=[current_index, samples_list, annotator_global, session_id_global, user_email_global, clarity, accent, tone, voice_quality, customization, comfort, comment],
	            outputs=[current_index, *ui_elements],
	        )

	        # Offer the CSV for download only once it exists.
	        export_btn.click(lambda: OUTPUT_CSV if os.path.exists(OUTPUT_CSV) else None, inputs=[], outputs=[export_file])

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the UI and serve it on port 7860 with the host
	    # set to 0.0.0.0.
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    app = make_ui()
	    app.launch(**launch_options)