Spaces:

aankitdas
/

tts-eval-framework

Sleeping

App Files Files Community

tts-eval-framework / app /storage.py

aankitdas

added storage limit guard

229a3e3 about 1 month ago

raw

history blame contribute delete

7.53 kB

	# app/storage.py
	# Supabase Storage integration for persisting generated audio files.
	# Uploads audio to the tts-audio public bucket and returns a public URL.
	# Called as a background thread from run_synthesis — non-blocking.

	import os
	import threading
	from pathlib import Path
	from supabase import create_client, Client


	# --- cleanup config ---
	# Soft limit on the combined size of the audio files in the tts-audio bucket.
	# cleanup_bucket_if_needed() compares the bucket total against this value and
	# starts deleting the oldest files once the total exceeds it.
	_BUCKET_SIZE_LIMIT_BYTES = 800 * 1024 * 1024 # 800MB


	# --- client setup ---
	_client: Client | None = None


	def _get_client() -> Client:
	    """Return the module-wide Supabase client, building it on first use.

	    The client is created from the SUPABASE_URL and SUPABASE_ANON_KEY
	    environment variables and cached in the module-level ``_client``.

	    Raises:
	        ValueError: if either environment variable is unset or empty.
	    """
	    global _client

	    # fast path: client already built by an earlier call
	    if _client is not None:
	        return _client

	    supabase_url = os.getenv("SUPABASE_URL")
	    anon_key = os.getenv("SUPABASE_ANON_KEY")
	    if not (supabase_url and anon_key):
	        raise ValueError(
	            "SUPABASE_URL and SUPABASE_ANON_KEY must be set in .env"
	        )

	    _client = create_client(supabase_url, anon_key)
	    return _client


	def upload_audio(local_path: str, filename: str) -> str | None:
	    """
	    Push a local audio file into the Supabase tts-audio bucket.

	    Args:
	        local_path: full path to the audio file on disk
	        filename: object name to store it under in the bucket
	            (e.g. '2026-04-14_kokoro_K-2.wav')

	    Returns:
	        The public URL of the uploaded object, or None if any step failed
	        (the error is printed rather than raised).
	    """
	    try:
	        client = _get_client()
	        payload = Path(local_path).read_bytes()

	        # only .mp3 is special-cased; every other extension is sent as WAV
	        if Path(local_path).suffix.lower() == ".mp3":
	            mime_type = "audio/mpeg"
	        else:
	            mime_type = "audio/wav"

	        bucket = client.storage.from_("tts-audio")
	        upload_options = {"content-type": mime_type, "upsert": "true"}
	        bucket.upload(path=filename, file=payload, file_options=upload_options)

	        # the bucket is public, so the URL can be derived from the object name
	        return bucket.get_public_url(filename)

	    except Exception as e:
	        print(f"[Storage] Upload failed for {filename}: {e}")
	        return None


	def upload_audio_background(local_path: str, filename: str, callback=None) -> None:
	    """
	    Start upload_audio() on a daemon thread and return immediately.

	    Args:
	        local_path: full path to local audio file
	        filename: destination filename in bucket
	        callback: optional function(url: str | None) invoked on the worker
	            thread once the upload finishes; url is None when it failed
	    """
	    def _upload_then_notify():
	        public_url = upload_audio(local_path, filename)
	        if not callback:
	            return
	        callback(public_url)

	    threading.Thread(target=_upload_then_notify, daemon=True).start()

	def upload_csv(local_path: str) -> bool:
	    """
	    Send the local CSV to the tts-audio bucket as eval_log.csv.

	    The upload is an upsert, so an existing eval_log.csv in the bucket is
	    overwritten. Errors are printed, not raised.

	    Returns:
	        True if the upload call completed, False if anything raised.
	    """
	    try:
	        client = _get_client()
	        with open(local_path, "rb") as csv_file:
	            csv_bytes = csv_file.read()

	        bucket = client.storage.from_("tts-audio")
	        options = {"content-type": "text/csv", "upsert": "true"}
	        bucket.upload(path="eval_log.csv", file=csv_bytes, file_options=options)
	    except Exception as e:
	        print(f"[Storage] CSV upload failed: {e}")
	        return False

	    print("[Storage] eval_log.csv uploaded to Supabase")
	    return True


	def download_csv(local_path: str) -> bool:
	    """
	    Download eval_log.csv from the Supabase tts-audio bucket to local_path.

	    The parent directory of local_path is created when it does not exist.
	    Errors are printed, not raised, so callers can fall back to a local copy.

	    Args:
	        local_path: destination path for the CSV; may be a bare filename

	    Returns:
	        True on success, False on failure.
	    """
	    try:
	        client = _get_client()
	        response = client.storage.from_("tts-audio").download("eval_log.csv")

	        # os.path.dirname() is "" for a bare filename and os.makedirs("")
	        # raises FileNotFoundError, which would turn a successful download
	        # into a reported failure — only create a directory when there is one.
	        parent_dir = os.path.dirname(local_path)
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	        with open(local_path, "wb") as f:
	            f.write(response)

	        print("[Storage] eval_log.csv downloaded from Supabase")
	        return True

	    except Exception as e:
	        print(f"[Storage] CSV download failed (will use local fallback): {e}")
	        return False


	def upload_csv_background(local_path: str) -> None:
	    """Run upload_csv(local_path) on a daemon thread and return immediately."""
	    worker = threading.Thread(
	        target=upload_csv,
	        args=(local_path,),
	        daemon=True,
	    )
	    worker.start()


	def cleanup_bucket_if_needed(csv_local_path: str) -> None:
	    """
	    Keep the audio stored in the tts-audio bucket under the size limit.

	    Sums the size of every root-level audio file in the bucket. If the total
	    exceeds _BUCKET_SIZE_LIMIT_BYTES, deletes files in ascending filename
	    order (the timestamp prefix makes that oldest-first) until the total is
	    back under the limit, then removes the matching rows from the local CSV
	    and re-uploads it. eval_log.csv is never counted or deleted.

	    All errors are printed, not raised.

	    Args:
	        csv_local_path: path of the local eval log CSV to prune
	    """
	    try:
	        client = _get_client()

	        # a single list() call only returns the first page of results, so
	        # walk every page or the total below is under-counted
	        files = _list_all_bucket_files(client)
	        if not files:
	            return

	        # Only real audio objects take part: skip the CSV, and skip entries
	        # whose metadata is null (folder placeholders) — they have no size
	        # and deleting them would wrongly drop CSV rows for files inside.
	        audio_files = [
	            f for f in files
	            if f["name"] != "eval_log.csv" and f.get("metadata", {}) is not None
	        ]

	        total_bytes = sum(_entry_size(f) for f in audio_files)

	        if total_bytes <= _BUCKET_SIZE_LIMIT_BYTES:
	            return

	        print(
	            f"[Storage] Cleanup triggered: {total_bytes / 1024 / 1024:.1f}MB "
	            f"exceeds {_BUCKET_SIZE_LIMIT_BYTES / 1024 / 1024:.0f}MB limit"
	        )

	        # sort by filename (timestamp prefix ensures chronological order)
	        audio_files.sort(key=lambda f: f["name"])

	        # delete oldest files until under limit
	        freed_bytes = 0
	        deleted_names = []

	        for f in audio_files:
	            if total_bytes - freed_bytes <= _BUCKET_SIZE_LIMIT_BYTES:
	                break
	            name = f["name"]
	            size = _entry_size(f)
	            try:
	                client.storage.from_("tts-audio").remove([name])
	                freed_bytes += size
	                deleted_names.append(name)
	                print(f"[Storage] Cleanup: deleted {name} ({size / 1024 / 1024:.2f}MB)")
	            except Exception as e:
	                # one failed delete should not stop the rest of the cleanup
	                print(f"[Storage] Cleanup: failed to delete {name}: {e}")

	        print(f"[Storage] Cleanup: deleted {len(deleted_names)} files, freed {freed_bytes / 1024 / 1024:.1f}MB")

	        if not deleted_names:
	            return

	        _drop_deleted_rows_from_csv(csv_local_path, deleted_names)

	    except Exception as e:
	        print(f"[Storage] Cleanup check failed: {e}")


	def _entry_size(entry) -> int:
	    """Return the byte size recorded in a bucket listing entry, 0 if absent."""
	    return (entry.get("metadata") or {}).get("size", 0) or 0


	def _list_all_bucket_files(client, page_size: int = 100) -> list:
	    """Return every root-level entry of the tts-audio bucket, page by page."""
	    bucket = client.storage.from_("tts-audio")
	    entries = []
	    offset = 0
	    while True:
	        page = bucket.list(
	            "",
	            {
	                "limit": page_size,
	                "offset": offset,
	                # fixed ordering keeps offset-based paging stable
	                "sortBy": {"column": "name", "order": "asc"},
	            },
	        )
	        if not page:
	            break
	        entries.extend(page)
	        if len(page) < page_size:
	            # short page means this was the last one
	            break
	        offset += len(page)
	    return entries


	def _drop_deleted_rows_from_csv(csv_local_path: str, deleted_names: list) -> None:
	    """Remove CSV rows whose audio_url points at a deleted file, then re-upload."""
	    try:
	        import pandas as pd
	        if not os.path.exists(csv_local_path):
	            return

	        df = pd.read_csv(csv_local_path, dtype={"audio_url": str})

	        # public URLs end in "tts-audio/<filename>", so match on that fragment
	        deleted_urls = {f"tts-audio/{name}" for name in deleted_names}

	        # drop rows whose audio_url contains a deleted filename
	        original_len = len(df)
	        df = df[~df["audio_url"].apply(
	            lambda url: any(d in str(url) for d in deleted_urls)
	        )]

	        rows_removed = original_len - len(df)
	        df.to_csv(csv_local_path, index=False)
	        print(f"[Storage] Cleanup: removed {rows_removed} rows from CSV")

	        # re-upload cleaned CSV
	        upload_csv(csv_local_path)

	    except Exception as e:
	        print(f"[Storage] Cleanup: CSV update failed: {e}")


	def cleanup_bucket_background(csv_local_path: str) -> None:
	    """Start cleanup_bucket_if_needed(csv_local_path) on a daemon thread and return at once."""
	    worker_args = (csv_local_path,)
	    worker = threading.Thread(
	        target=cleanup_bucket_if_needed, args=worker_args, daemon=True
	    )
	    worker.start()