Spaces:

schroneko
/

irodori-tts-stackchan-api

Sleeping

schroneko

Expose Hugging Face token status

fec5b89 7 days ago

10.2 kB

	from __future__ import annotations

	import hashlib
	import json
	import os
	import shutil
	import time
	import uuid
	from pathlib import Path
	from typing import Any

	import requests
	from fastapi import FastAPI, HTTPException, Query, Request
	from fastapi.responses import FileResponse, JSONResponse

	# Runtime configuration. Every value below is read from the environment once,
	# at import time.

	# Upstream synthesis Space: either an "owner/name" id or a full http(s) URL.
	ZERO_SPACE = os.getenv("ZERO_SPACE", "schroneko/irodori-tts-zerogpu")
	# Bearer token sent upstream; HF_TOKEN wins, HUGGING_FACE_HUB_TOKEN is the fallback.
	HF_TOKEN = os.getenv("HF_TOKEN", "") or os.getenv("HUGGING_FACE_HUB_TOKEN", "")
	# Shared secret compared against the `key` query parameter; empty disables the check.
	TTS_API_KEY = os.getenv("TTS_API_KEY", "")
	# Base URL used when building audio links; trailing slashes are removed.
	PUBLIC_BASE_URL = os.getenv("PUBLIC_BASE_URL", "").rstrip("/")
	# Maximum accepted length of the stripped `text` parameter.
	MAX_TEXT_LENGTH = int(os.getenv("MAX_TEXT_LENGTH", "240"))
	# "", "auto" and "none" (case-insensitive) all collapse to the empty string;
	# any other value is kept verbatim as the default `seconds` parameter.
	DEFAULT_SECONDS_RAW = os.getenv("DEFAULT_SECONDS", "").strip()
	DEFAULT_SECONDS = "" if DEFAULT_SECONDS_RAW.lower() in {"", "auto", "none"} else DEFAULT_SECONDS_RAW
	# Defaults for the `duration_scale` and `steps` query parameters.
	DEFAULT_DURATION_SCALE = float(os.getenv("DEFAULT_DURATION_SCALE", "0.95"))
	DEFAULT_STEPS = int(os.getenv("DEFAULT_STEPS", "18"))
	# Hashed together with the speaker id when the caller supplies no seed.
	DEFAULT_SEED = int(os.getenv("DEFAULT_SEED", "3407"))
	# Default voice caption, in Japanese (roughly: a young, energetic female
	# voice speaking at close range, brightly, softly and naturally).
	DEFAULT_CAPTION = os.getenv(
	"DEFAULT_CAPTION",
	"若く元気な女性の声。近い距離感で、明るくやわらかく自然に話している。",
	)
	# Number of cache_*.mp3 files that survive a cache prune.
	MAX_CACHE_ENTRIES = int(os.getenv("MAX_CACHE_ENTRIES", "256"))

	# Directory holding generated and cached audio; created on import if missing.
	OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "/tmp/stackchan-audio"))
	OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	# ASGI application object; the route decorators below register against it.
	app = FastAPI(title="Irodori TTS StackChan API")


	def _base_url(request: Request) -> str:
	    """Return the base URL used to build links, without a trailing slash.

	    PUBLIC_BASE_URL wins when it is set. Otherwise the URL is assembled from
	    the ``x-forwarded-proto`` and ``x-forwarded-host`` headers, falling back
	    to the ``host`` header and finally to the request URL's own scheme and
	    netloc.
	    """
	    if not PUBLIC_BASE_URL:
	        headers = request.headers
	        scheme = headers.get("x-forwarded-proto", request.url.scheme)
	        direct_host = headers.get("host", request.url.netloc)
	        authority = headers.get("x-forwarded-host", direct_host)
	        return f"{scheme}://{authority}".rstrip("/")
	    return PUBLIC_BASE_URL


	def _check_key(key: str | None) -> None:
	    """Reject the request unless ``key`` matches the configured API key.

	    When TTS_API_KEY is empty the check is disabled and every key (including
	    ``None``) is accepted.

	    Raises:
	        HTTPException: 401 with detail "invalid key" when a key is configured
	            and the supplied one is missing or different.
	    """
	    import hmac

	    if not TTS_API_KEY:
	        return
	    # Compare as bytes in constant time so response timing does not reveal
	    # how many leading characters of the secret were guessed correctly.
	    # Bytes are used because compare_digest rejects non-ASCII str inputs.
	    supplied = (key or "").encode("utf-8", "surrogatepass")
	    expected = TTS_API_KEY.encode("utf-8", "surrogatepass")
	    if not hmac.compare_digest(supplied, expected):
	        raise HTTPException(status_code=401, detail="invalid key")


	def _seed_for_speaker(speaker: str) -> str:
	    """Derive a stable decimal seed string for ``speaker``.

	    The seed is the first eight bytes (big-endian) of
	    SHA-256("<DEFAULT_SEED>:<speaker>") with the top bit cleared, so it
	    always fits in a non-negative signed 64-bit integer.
	    """
	    material = f"{DEFAULT_SEED}:{speaker}".encode("utf-8")
	    leading_bytes = hashlib.sha256(material).digest()[:8]
	    mask_63_bits = (1 << 63) - 1
	    return str(int.from_bytes(leading_bytes, "big") & mask_63_bits)


	def _cache_key(payload: dict[str, Any]) -> str:
	    """Return the SHA-256 hex digest of a canonical JSON form of ``payload``.

	    Keys are sorted and separators are compact, so two payloads with the
	    same content produce the same key regardless of insertion order.
	    Non-ASCII text is hashed as UTF-8 rather than as ``\\u`` escapes.
	    """
	    canonical = json.dumps(
	        payload,
	        sort_keys=True,
	        ensure_ascii=False,
	        separators=(",", ":"),
	    )
	    hasher = hashlib.sha256()
	    hasher.update(canonical.encode("utf-8"))
	    return hasher.hexdigest()


	def _cache_path(cache_key: str) -> Path:
	    """Return the path inside OUTPUT_DIR where audio for ``cache_key`` is cached."""
	    filename = f"cache_{cache_key}.mp3"
	    return OUTPUT_DIR.joinpath(filename)


	def _prune_cache() -> None:
	    """Delete the oldest cached audio files beyond MAX_CACHE_ENTRIES.

	    Files matching ``cache_*.mp3`` in OUTPUT_DIR are ordered newest-first by
	    modification time; everything after the first MAX_CACHE_ENTRIES entries
	    is removed. Files that vanish while the listing is being built (for
	    example because another request is pruning at the same time) are
	    skipped instead of aborting the prune.
	    """
	    dated: list[tuple[float, Path]] = []
	    for path in OUTPUT_DIR.glob("cache_*.mp3"):
	        try:
	            dated.append((path.stat().st_mtime, path))
	        except FileNotFoundError:
	            # Already removed by someone else between glob() and stat().
	            continue
	    dated.sort(key=lambda entry: entry[0], reverse=True)
	    for _, path in dated[MAX_CACHE_ENTRIES:]:
	        path.unlink(missing_ok=True)


	def _zero_space_base_url() -> str:
	    """Return the base URL of the upstream Space named by ZERO_SPACE.

	    A value that already starts with http:// or https:// is used as-is
	    (minus surrounding whitespace and trailing slashes). Anything else is
	    treated as an id whose slashes become hyphens in an ``*.hf.space`` host.
	    """
	    target = ZERO_SPACE.strip().rstrip("/")
	    if target.startswith(("http://", "https://")):
	        return target
	    subdomain = target.replace("/", "-")
	    return f"https://{subdomain}.hf.space"


	def _zero_headers(x_ip_token: str = "") -> dict[str, str]:
	    """Build the HTTP headers for requests to the upstream Space.

	    Always sets a JSON content type. Adds a bearer ``Authorization`` header
	    when HF_TOKEN is non-empty and an ``x-ip-token`` header when
	    ``x_ip_token`` is non-empty.
	    """
	    optional = {
	        "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
	        "x-ip-token": x_ip_token,
	    }
	    headers = {"Content-Type": "application/json"}
	    # Only headers that actually carry a value are sent.
	    headers.update({name: value for name, value in optional.items() if value})
	    return headers


	def _raise_for_response(response: requests.Response) -> None:
	    """Raise RuntimeError when ``response`` is not OK.

	    The error message carries the status code and the whitespace-trimmed
	    response body. Nothing happens for an OK response.
	    """
	    if not response.ok:
	        body = response.text.strip()
	        raise RuntimeError(f"ZeroGPU request failed: {response.status_code} {body}")


	def _predict_zero_space(payload: dict[str, Any], x_ip_token: str = "") -> Any:
	    """Submit a synthesis job to the upstream Space and wait for its result.

	    The job is posted to the ``v2/synthesize`` endpoint first; if that
	    answers 405 the positional ``synthesize`` endpoint is tried instead. The
	    returned ``event_id`` is then followed on the server-sent event stream
	    until a ``complete`` event arrives.

	    Args:
	        payload: Synthesis parameters, forwarded to _post_zero_space.
	        x_ip_token: Optional token forwarded as the ``x-ip-token`` header.

	    Returns:
	        The JSON-decoded data of the ``complete`` event.

	    Raises:
	        RuntimeError: On a non-OK HTTP response, a missing ``event_id``, an
	            ``error`` event, or a stream that ends without completing.
	    """
	    base_url = _zero_space_base_url()
	    response = _post_zero_space(base_url, "v2/synthesize", payload, x_ip_token)
	    if response.status_code == 405:
	        response = _post_zero_space(base_url, "synthesize", payload, x_ip_token)
	    _raise_for_response(response)
	    event_id = response.json().get("event_id")
	    if not event_id:
	        raise RuntimeError(f"ZeroGPU did not return event_id: {response.text}")

	    # NOTE(review): the stream is always read from the "synthesize" route,
	    # even when the job was accepted by "v2/synthesize" - confirm that the
	    # upstream resolves the event by id alone.
	    # The context manager releases the streamed connection on every exit
	    # path (return, raise, or exhausted stream).
	    with requests.get(
	        f"{base_url}/gradio_api/call/synthesize/{event_id}",
	        headers=_zero_headers(x_ip_token),
	        stream=True,
	        timeout=(10, 300),
	    ) as stream:
	        _raise_for_response(stream)
	        # Decode the event stream as UTF-8 explicitly: without a charset in
	        # the Content-Type, requests would either guess ISO-8859-1 (garbling
	        # non-ASCII text) or yield undecoded bytes.
	        stream.encoding = "utf-8"

	        event_name = ""
	        for line in stream.iter_lines(decode_unicode=True):
	            if not line:
	                continue
	            if line.startswith("event:"):
	                event_name = line.split(":", 1)[1].strip()
	                continue
	            if not line.startswith("data:"):
	                continue
	            data = line.split(":", 1)[1].strip()
	            if event_name == "error":
	                raise RuntimeError(data)
	            # Only the final payload is needed, so intermediate events are
	            # skipped without being parsed.
	            if event_name == "complete":
	                return json.loads(data)

	    raise RuntimeError("ZeroGPU stream ended before completion")


	def _post_zero_space(base_url: str, endpoint: str, payload: dict[str, Any], x_ip_token: str = "") -> requests.Response:
	    """POST a synthesis request to ``endpoint`` on the upstream Space.

	    For the ``synthesize`` endpoint the payload is repackaged as a
	    positional ``{"data": [...]}`` list in the order text, speaker, seconds,
	    duration_scale, steps, seed, caption. Every other endpoint receives the
	    payload dict unchanged. The response is returned without status checks.
	    """
	    if endpoint == "synthesize":
	        positional_fields = (
	            "text",
	            "speaker",
	            "seconds",
	            "duration_scale",
	            "steps",
	            "seed",
	            "caption",
	        )
	        body: dict[str, Any] = {"data": [payload[field] for field in positional_fields]}
	    else:
	        body = payload
	    url = f"{base_url}/gradio_api/call/{endpoint}"
	    return requests.post(url, json=body, headers=_zero_headers(x_ip_token), timeout=30)


	def _copy_result_file(result: Any, x_ip_token: str = "") -> Path:
	    """Store the audio referenced by ``result`` as a new file in OUTPUT_DIR.

	    ``result`` may be a non-empty list/tuple (its first item is used), a
	    dict, or a string. A dict - at the top level or as the first list item -
	    is resolved through its ``url``, ``path`` and ``name`` keys, in that
	    order. An http(s) source is downloaded; anything else is treated as a
	    local file path and copied.

	    Args:
	        result: Value produced by the upstream synthesis call.
	        x_ip_token: Optional token forwarded as the ``x-ip-token`` header
	            when downloading.

	    Returns:
	        Path of the newly written ``<millis>_<uuid>.mp3`` file.

	    Raises:
	        RuntimeError: If no source can be found in ``result``, the download
	            fails, or the local source file does not exist.
	    """
	    source: Any = None
	    if isinstance(result, (list, tuple)) and result:
	        source = result[0]
	    elif isinstance(result, dict):
	        # Prefer "url" here too, matching the nested-dict handling below:
	        # a "path" reported by a remote service is not readable locally.
	        source = result.get("url") or result.get("path") or result.get("name")
	    elif isinstance(result, str):
	        source = result

	    if isinstance(source, dict):
	        source = source.get("url") or source.get("path") or source.get("name")
	    if not source:
	        raise RuntimeError(f"Could not find generated audio path in result: {result!r}")

	    output_path = OUTPUT_DIR / f"{int(time.time() * 1000)}_{uuid.uuid4().hex}.mp3"
	    source_text = str(source)
	    if source_text.startswith(("http://", "https://")):
	        # NOTE(review): the upstream auth headers are sent to whatever URL
	        # the result names - confirm it can only point at the trusted Space.
	        response = requests.get(source_text, headers=_zero_headers(x_ip_token), timeout=60)
	        _raise_for_response(response)
	        output_path.write_bytes(response.content)
	        return output_path

	    source_path = Path(source_text)
	    if not source_path.is_file():
	        raise RuntimeError(f"Generated audio file is missing: {source_path}")

	    shutil.copyfile(source_path, output_path)
	    return output_path


	def _result_metadata(result: Any) -> dict[str, Any]:
	    """Return the metadata dict carried as the second item of ``result``.

	    An empty dict is returned when ``result`` is not a list/tuple, has fewer
	    than two items, or its second item is not a dict.
	    """
	    if not isinstance(result, (list, tuple)):
	        return {}
	    if len(result) < 2:
	        return {}
	    second = result[1]
	    return second if isinstance(second, dict) else {}


	@app.get("/")
	def root() -> dict[str, str]:
	    """Return a fixed liveness marker together with the service name."""
	    return dict(ok="true", service="irodori-tts-stackchan-api")


	@app.get("/health")
	def health() -> dict[str, str]:
	    """Report the service configuration and cache usage, all values as strings.

	    Only the presence of the Hugging Face token is exposed ("true"/"false"),
	    never the token itself.
	    """
	    cached_files = sum(1 for _ in OUTPUT_DIR.glob("cache_*.mp3"))
	    token_flag = "true" if HF_TOKEN else "false"
	    return {
	        "ok": "true",
	        "zero_space": ZERO_SPACE,
	        "duration_scale": str(DEFAULT_DURATION_SCALE),
	        "has_hf_token": token_flag,
	        "cache_entries": str(cached_files),
	        "max_cache_entries": str(MAX_CACHE_ENTRIES),
	    }


	@app.get("/audio/{filename}")
	def audio(filename: str) -> FileResponse:
	    """Serve a generated MP3 from OUTPUT_DIR.

	    Args:
	        filename: Bare file name of the audio file, as embedded in the URLs
	            returned by the synthesis endpoint.

	    Raises:
	        HTTPException: 404 when ``filename`` is not a bare file name or no
	            such file exists in OUTPUT_DIR.
	    """
	    # Accept only a single path component. Joining OUTPUT_DIR with an
	    # absolute path or one containing separators would resolve outside the
	    # audio directory and expose arbitrary files.
	    if filename in {".", ".."} or Path(filename).name != filename:
	        raise HTTPException(status_code=404, detail="audio not found")
	    path = OUTPUT_DIR / filename
	    if not path.is_file():
	        raise HTTPException(status_code=404, detail="audio not found")
	    return FileResponse(path, media_type="audio/mpeg", filename=filename)


	def _synthesis_success_response(
	    request: Request,
	    *,
	    speaker: str,
	    seed_value: str,
	    duration_scale: float,
	    metadata: dict[str, Any],
	    filename: str,
	) -> JSONResponse:
	    """Build the success body shared by the cache-hit and cache-miss paths."""
	    base = _base_url(request)
	    url = f"{base}/audio/{filename}"
	    return JSONResponse(
	        {
	            "success": True,
	            "isApiKeyValid": True,
	            "speaker": str(speaker),
	            "seed": seed_value,
	            "durationScale": float(duration_scale),
	            "metadata": metadata,
	            "mp3StreamingUrl": url,
	            "mp3DownloadUrl": url,
	            "audioStatusUrl": f"{base}/health",
	        }
	    )


	@app.get("/synthesis")
	def synthesis(
	    request: Request,
	    key: str | None = Query(default=None),
	    text: str = Query(..., min_length=1),
	    speaker: str = Query(default="3"),
	    seconds: str = Query(default=DEFAULT_SECONDS),
	    duration_scale: float = Query(default=DEFAULT_DURATION_SCALE, gt=0.0, le=2.0),
	    steps: int = Query(default=DEFAULT_STEPS, ge=1, le=80),
	    seed: str = Query(default=""),
	    caption: str = Query(default=DEFAULT_CAPTION),
	) -> JSONResponse:
	    """Synthesize ``text`` and return URLs of the resulting MP3.

	    Identical requests (same text, speaker, seconds, duration scale, steps,
	    seed and caption) are answered from the on-disk cache. On a miss the
	    upstream Space is called and the audio is stored under its cache key.

	    Returns:
	        A JSON body with ``success``, ``isApiKeyValid``, ``speaker``,
	        ``seed``, ``durationScale``, ``metadata`` and the audio/status URLs.
	        Upstream or storage failures yield status 502 with ``success`` false
	        and the error text.

	    Raises:
	        HTTPException: 401 for a wrong API key; 400 when the stripped text
	            is empty or longer than MAX_TEXT_LENGTH.
	    """
	    _check_key(key)
	    text = text.strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="text is required")
	    if len(text) > MAX_TEXT_LENGTH:
	        raise HTTPException(status_code=400, detail=f"text is too long: max {MAX_TEXT_LENGTH}")

	    # Without an explicit seed each speaker gets its own deterministic one.
	    seed_value = str(seed).strip() or _seed_for_speaker(str(speaker))
	    payload = {
	        "text": text,
	        "speaker": str(speaker),
	        "seconds": str(seconds),
	        "duration_scale": float(duration_scale),
	        "steps": int(steps),
	        "seed": seed_value,
	        "caption": str(caption).strip() or DEFAULT_CAPTION,
	    }
	    cache_key = _cache_key(payload)
	    cached_path = _cache_path(cache_key)
	    if cached_path.is_file():
	        return _synthesis_success_response(
	            request,
	            speaker=speaker,
	            seed_value=seed_value,
	            duration_scale=duration_scale,
	            metadata={"cacheHit": True, "cacheKey": cache_key},
	            filename=cached_path.name,
	        )

	    try:
	        x_ip_token = request.headers.get("x-ip-token", "").strip()
	        result = _predict_zero_space(payload, x_ip_token=x_ip_token)
	        metadata = _result_metadata(result)
	        output_path = _copy_result_file(result, x_ip_token=x_ip_token)
	        served_path = output_path
	        if MAX_CACHE_ENTRIES > 0:
	            # Move (not copy) the file into the cache. The rename is atomic
	            # within OUTPUT_DIR, so a concurrent cache hit never sees a
	            # half-written file, and no per-request scratch file is left
	            # behind to grow the directory without bound.
	            output_path.replace(cached_path)
	            served_path = cached_path
	            _prune_cache()
	        # With caching disabled the freshly generated file is served directly.
	    except Exception as exc:
	        return JSONResponse(
	            status_code=502,
	            content={
	                "success": False,
	                "isApiKeyValid": bool(not TTS_API_KEY or key == TTS_API_KEY),
	                "error": str(exc),
	            },
	        )

	    return _synthesis_success_response(
	        request,
	        speaker=speaker,
	        seed_value=seed_value,
	        duration_scale=duration_scale,
	        metadata={**metadata, "cacheHit": False, "cacheKey": cache_key},
	        filename=served_path.name,
	    )