Spaces:
Running on Zero
Running on Zero
| """Small client for submitting VoiceGate workflows to a local ComfyUI API.""" | |
| from __future__ import annotations | |
| import argparse | |
| import copy | |
| import json | |
| import os | |
| import time | |
| import uuid | |
| from pathlib import Path | |
| from typing import Any | |
| import requests | |
# Directory two levels above this file (the parent of the folder holding it).
ROOT = Path(__file__).resolve().parents[1]

# Default workflow definition, looked up under ``<ROOT>/workflows``.
WORKFLOW_PATH = ROOT.joinpath("workflows", "voicegate_api.json")

# Separate UTF-8 text writer on file descriptor 1; ``closefd=False`` keeps the
# descriptor open when this wrapper is closed or garbage-collected.
STDOUT = open(1, mode="w", encoding="utf-8", closefd=False)
def load_workflow(path: Path = WORKFLOW_PATH) -> dict[str, Any]:
    """Read the UTF-8 JSON file at ``path`` and return the decoded document.

    Args:
        path: Location of the workflow JSON file. Defaults to
            ``WORKFLOW_PATH``.

    Returns:
        Whatever ``json`` decodes from the file (expected to be a JSON
        object; this function does not verify that).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
| def patch_voicegate_workflow( | |
| workflow: dict[str, Any], | |
| *, | |
| audio_filename: str, | |
| target_language: str, | |
| api_key: str | None, | |
| api_baseurl: str, | |
| llm_model: str, | |
| job_id: str | None = None, | |
| tts_trim_start: float = 0.0, | |
| ) -> dict[str, Any]: | |
| patched = copy.deepcopy(workflow) | |
| job = job_id or uuid.uuid4().hex[:12] | |
| trim_start = min(1.0, max(0.0, float(tts_trim_start))) | |
| patched["16"]["inputs"]["audio"] = audio_filename | |
| patched["105"]["inputs"]["api_baseurl"] = api_baseurl | |
| patched["105"]["inputs"]["api_key"] = api_key or "" | |
| patched["105"]["inputs"]["model"] = llm_model | |
| patched["110"]["inputs"]["value"] = target_language | |
| patched["180"]["inputs"]["filename_prefix"] = f"audio/voicegate_{job}" | |
| patched["214"]["inputs"]["filename_prefix"] = f"VoiceBridge/subtitle_{job}" | |
| patched["31"]["inputs"].setdefault("source", "HuggingFace") | |
| patched["31"]["inputs"]["attention"] = "sdpa" | |
| patched["31"]["inputs"]["max_new_tokens"] = 256 | |
| patched["206"]["inputs"]["inference_steps"] = 4 | |
| patched["268"]["inputs"]["start_index"] = trim_start | |
| return patched | |
def upload_audio(
    server: str,
    audio_path: Path,
    *,
    overwrite: bool = True,
) -> str:
    """Send an audio file to ComfyUI and return the filename to reference.

    The file is posted as multipart form data to ``{server}/upload/image``
    under the ``image`` field. Recent ComfyUI builds accept that endpoint for
    input uploads of several media types; should that change, only this
    function ought to need updating.

    Args:
        server: Base URL of the ComfyUI server.
        audio_path: Local file to upload.
        overwrite: Sent as the lowercase string ``"true"`` or ``"false"`` in
            the ``overwrite`` form field.

    Returns:
        The ``name`` reported in the JSON response, or the local file name
        when the response has no truthy ``name``.

    Raises:
        OSError: If the local file cannot be opened.
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint = f"{server}/upload/image"
    form_fields = {"overwrite": str(overwrite).lower(), "type": "input"}

    # The handle has to stay open while requests streams the multipart body.
    with audio_path.open("rb") as handle:
        reply = requests.post(
            endpoint,
            files={"image": (audio_path.name, handle, "application/octet-stream")},
            data=form_fields,
            timeout=120,
        )

    reply.raise_for_status()
    uploaded_name = reply.json().get("name")
    if uploaded_name:
        return uploaded_name
    return audio_path.name
def submit_prompt(server: str, workflow: dict[str, Any]) -> str:
    """Post ``workflow`` to ``{server}/prompt`` and return its prompt id.

    A fresh random UUID is sent as ``client_id`` with every submission.

    Args:
        server: Base URL of the ComfyUI server.
        workflow: Workflow graph sent as the ``prompt`` field of the JSON body.

    Returns:
        The ``prompt_id`` value from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response carries no ``prompt_id``.
    """
    request_body = {"prompt": workflow, "client_id": str(uuid.uuid4())}
    reply = requests.post(f"{server}/prompt", json=request_body, timeout=120)
    reply.raise_for_status()
    return reply.json()["prompt_id"]
def wait_for_history(server: str, prompt_id: str, timeout: float = 1800) -> dict[str, Any]:
    """Poll ``{server}/history/{prompt_id}`` until an entry for the prompt appears.

    The endpoint is queried at least once, even for a zero or negative
    ``timeout``, and once more when the deadline is reached, so a result that
    lands during the last pause is still picked up.

    Args:
        server: Base URL of the ComfyUI server.
        prompt_id: Identifier returned when the prompt was submitted.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        The value stored under ``prompt_id`` in the JSON response.

    Raises:
        requests.HTTPError: If a poll answers with an error status.
        TimeoutError: If no entry shows up before the deadline.
    """
    poll_interval = 2.0
    history_url = f"{server}/history/{prompt_id}"

    # Monotonic clock: the deadline must not move when the system wall clock
    # is adjusted (NTP sync, DST, manual change) during a long wait.
    deadline = time.monotonic() + timeout

    while True:
        response = requests.get(history_url, timeout=30)
        response.raise_for_status()
        payload = response.json()
        if prompt_id in payload:
            return payload[prompt_id]

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the next iteration is the final poll.
        time.sleep(min(poll_interval, remaining))

    raise TimeoutError(f"Timed out waiting for prompt {prompt_id}")
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of this script.

    Defaults for ``--server``, ``--api-baseurl`` and ``--llm-model`` come
    from the ``COMFYUI_URL``, ``DEEPSEEK_BASE_URL`` and ``DEEPSEEK_MODEL``
    environment variables when set, with built-in fallbacks otherwise.

    Returns:
        The namespace produced by parsing ``sys.argv``.
    """
    env = os.environ

    # Options are registered in this order, which is also the order shown by
    # ``--help``.
    options = (
        ("--server", {"default": env.get("COMFYUI_URL", "http://127.0.0.1:8188")}),
        ("--workflow", {"type": Path, "default": WORKFLOW_PATH}),
        ("--audio", {"type": Path}),
        ("--target-language", {"default": "English"}),
        ("--tts-trim-start", {"type": float, "default": 0.0}),
        ("--api-baseurl", {"default": env.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")}),
        ("--llm-model", {"default": env.get("DEEPSEEK_MODEL", "deepseek-v4-flash")}),
        ("--dry-run", {"action": "store_true"}),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def main() -> None:
    """Run the command-line client: patch the workflow, submit it, print the result.

    With ``--dry-run`` the patched workflow is written to standard output as
    JSON and nothing is sent to the server. Otherwise the audio file (if
    given) is uploaded, the workflow is submitted, and the history entry for
    the prompt is written to standard output as JSON once it is available.

    The LLM API key is read from the ``DEEPSEEK_API_KEY`` environment
    variable.
    """
    args = parse_args()

    # A trailing slash in --server / COMFYUI_URL would otherwise produce
    # request paths such as "//prompt".
    server = args.server.rstrip("/")

    workflow = load_workflow(args.workflow)

    # NOTE(review): without --audio the workflow is still submitted with the
    # literal name "placeholder.mp3" -- confirm this is intended outside of
    # --dry-run.
    audio_filename = args.audio.name if args.audio else "placeholder.mp3"
    if args.audio and not args.dry_run:
        audio_filename = upload_audio(server, args.audio)

    patched = patch_voicegate_workflow(
        workflow,
        audio_filename=audio_filename,
        target_language=args.target_language,
        api_key=os.environ.get("DEEPSEEK_API_KEY"),
        api_baseurl=args.api_baseurl,
        llm_model=args.llm_model,
        tts_trim_start=args.tts_trim_start,
    )

    if args.dry_run:
        json.dump(patched, STDOUT, ensure_ascii=False, indent=2)
        STDOUT.write("\n")
        STDOUT.flush()
        return

    prompt_id = submit_prompt(server, patched)
    # print() buffers in sys.stdout while the JSON below goes through the
    # separate STDOUT wrapper on the same descriptor. Flushing here keeps the
    # two in order and shows the status line before the long wait.
    print(f"Submitted prompt {prompt_id}", flush=True)

    history = wait_for_history(server, prompt_id)
    json.dump(history, STDOUT, ensure_ascii=False, indent=2)
    STDOUT.write("\n")
    # Do not rely on interpreter shutdown to push the buffered JSON out.
    STDOUT.flush()


if __name__ == "__main__":
    main()