# Spaces:
#
# ruslanmv
# /
#
# avatar-renderer
#
# Paused
#
# File size: 21,894 Bytes
#
# 15d3766

"""
api.py – FastAPI front-door for the Avatar Renderer Pod
==========================================================
* POST /render          → returns {jobId, statusUrl, async} (expects server-side file paths)
* POST /render-upload   → upload avatar + audio, returns {jobId, statusUrl, async} (browser-friendly)
* GET  /status/{id}     → returns either {"state": "..."} or the MP4 file
* GET  /avatars         → list available models and system capabilities
* GET  /health/live     → liveness probe (200 OK)
* GET  /health/ready    → readiness probe (pings Celery when a broker is configured)
* GET  /health/tts      → health check of the Chatterbox TTS service
* POST /text-to-audio   → synthesize text to speech via Chatterbox
"""

from __future__ import annotations

import json
import shutil
import uuid
from pathlib import Path
from typing import Optional

from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field

from .settings import Settings  # pydantic-based env loader

# Module-wide configuration object. The code below reads the Celery URLs,
# DEFAULT_ENHANCEMENTS, the Chatterbox defaults and the model checkpoint
# locations from it.
settings = Settings()

# ─────────────────── TTS imports ───────────────────────────────────────── #
# The Chatterbox client is optional: if its import fails the module still
# loads, and `tts_available` is what the TTS endpoints check before touching
# any of the names imported here.
try:
    from .tts.chatterbox_client import (
        ChatterboxTtsError,
        tts_wav_bytes_async,
        tts_wav_base64_async,
        chatterbox_health_async,
    )
    tts_available = True
except ImportError:
    tts_available = False

# ─────────────────── Celery optional ────────────────────────────────────── #
# Celery is optional. `celery_available` only becomes True when the package
# imports AND a broker URL is configured; every Celery code path below is
# guarded by that flag.
celery_available = False
try:
    from celery import Celery
    from celery.result import AsyncResult

    celery_app = Celery(
        "avatar_renderer",
        broker=settings.CELERY_BROKER_URL,
        # fall back to the broker URL when no dedicated result backend is set
        backend=settings.CELERY_BACKEND_URL or settings.CELERY_BROKER_URL,
    )
    celery_available = bool(settings.CELERY_BROKER_URL)
except ImportError:
    # NOTE: `Celery` and `AsyncResult` stay undefined on this path; they are
    # only referenced behind `celery_available` checks.
    celery_app = None  # type: ignore

# import pipeline after Celery to avoid GPU init on health checks
from .pipeline import render_pipeline  # noqa: E402

# ───────────────────────── FastAPI setup ────────────────────────────────── #
# ASGI application object; every route in this module is registered on it.
app = FastAPI(
    title="avatar-renderer-svc",
    version="0.1.0",
    description="Generate a lip-synced avatar video (REST façade)",
)

# ───────────────────────────── CORS setup ────────────────────────────────── #
# NOTE(review): "*" is listed together with allow_credentials=True. FastAPI's
# CORS documentation advises against pairing wildcard origins with
# credentials — confirm whether credentialed cross-origin requests are really
# needed; if not, allow_credentials=False would be the safer setting.
# NOTE(review): "https://*.vercel.app" looks like a glob pattern. Entries in
# allow_origins are presumably compared literally, so sub-domain matching
# would need allow_origin_regex instead — confirm against the middleware docs.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "*",
        "http://localhost:3000",
        "http://localhost:3001",
        "https://*.vercel.app",
        "https://vercel.app",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ───────────────── Static frontend (HF Spaces / Docker) ─────────────────── #
# Directory of the bundled web frontend: a "static" folder one level above
# the package that contains this file.
_STATIC_DIR = Path(__file__).resolve().parent.parent / "static"
if _STATIC_DIR.is_dir():
    # The routes below are registered only when the folder exists.
    app.mount("/static", StaticFiles(directory=str(_STATIC_DIR)), name="static")

    @app.get("/", response_class=HTMLResponse, include_in_schema=False)
    def serve_frontend():
        """Serve the bundled ``index.html``, or a small placeholder page.

        Returns:
            HTMLResponse: the contents of ``static/index.html`` when that
            file exists, otherwise a short page linking to ``/docs``.
        """
        index = _STATIC_DIR / "index.html"
        if index.exists():
            # Decode explicitly as UTF-8 so the page does not depend on the
            # locale encoding of the process serving it.
            return HTMLResponse(index.read_text(encoding="utf-8"))
        return HTMLResponse("<h1>Avatar Renderer MCP</h1><p>API is running. Visit <a href='/docs'>/docs</a></p>")

# Root directory for per-job working folders (and the `tts-*` folders written
# by /text-to-audio). Created at import time so handlers can assume it exists.
WORK_ROOT = Path("/tmp/avatar-jobs")
WORK_ROOT.mkdir(parents=True, exist_ok=True)


# ─────────────────────────── Pydantic models ────────────────────────────── #
class RenderBody(BaseModel):
    # JSON body accepted by POST /render. The two path fields are required;
    # everything else is optional and forwarded to the render pipeline as-is.

    avatarPath: str = Field(
        default=...,
        alias="avatarPath",
        description="Path to PNG/JPG portrait image",
    )
    audioPath: str = Field(
        default=...,
        alias="audioPath",
        description="Path to WAV/MP3 audio file",
    )
    driverVideo: Optional[str] = Field(
        default=None,
        alias="driverVideo",
        description="Optional MP4 for head pose",
    )
    visemeJson: Optional[str] = Field(
        default=None,
        alias="visemeJson",
        description="Optional phoneme alignment JSON",
    )
    qualityMode: str = Field(
        "auto",
        alias="qualityMode",
        description=(
            "Rendering quality mode: "
            "'real_time' (Wav2Lip/MuseTalk), "
            "'high_quality' (FOMM+Diff2Lip/LatentSync+GFPGAN), "
            "'cinematic' (Hallo3 DiT), "
            "'3d' (Gaussian Splatting), or 'auto'"
        ),
    )
    # None means "not specified"; POST /render then falls back to
    # settings.DEFAULT_ENHANCEMENTS.
    enhancements: Optional[list] = Field(
        None,
        description=(
            "List of enhancement names to apply. Options: "
            "'emotion_expressions', 'musetalk_lipsync', 'eye_gaze_blink', "
            "'liveportrait_driver', 'latentsync_lipsync', 'hallo3_cinematic', "
            "'cosyvoice_tts', 'viseme_guided', 'gesture_animation', "
            "'gaussian_splatting'. "
            "Use ['all'] for all available."
        ),
    )
    transcript: Optional[str] = Field(
        default=None,
        description="Optional text transcript of the audio (enables emotion detection and gesture sync)",
    )


class TextToAudioRequest(BaseModel):
    # Request body for POST /text-to-audio. Only `text` is required; the
    # numeric knobs carry range constraints that are validated on input.

    text: str = Field(
        default=...,
        description="Text to synthesize into speech",
    )
    voice: Optional[str] = Field(
        default=None,
        description="Voice profile: 'female', 'male', or 'neutral'",
    )
    language: Optional[str] = Field(
        default=None,
        description="Language code (ISO 639-1, e.g., 'en', 'it', 'fr')",
    )
    temperature: Optional[float] = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Temperature for TTS generation (0.0-1.0)",
    )
    cfg_weight: Optional[float] = Field(
        default=0.4,
        ge=0.0,
        le=1.0,
        description="CFG weight for TTS generation (0.0-1.0)",
    )
    exaggeration: Optional[float] = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Exaggeration for TTS generation (0.0-1.0)",
    )
    speed: Optional[float] = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="Speed for TTS generation (0.5-2.0)",
    )
    output_format: Optional[str] = Field(
        default="file",
        description="Output format: 'file' (WAV file) or 'base64' (base64-encoded WAV)",
    )


class TextToAudioResponse(BaseModel):
    # Response envelope of POST /text-to-audio. `status` is always set; the
    # remaining fields are filled depending on the outcome and output format.

    status: str = Field(
        default=...,
        description="Status of the request ('success' or 'error')",
    )
    audio_path: Optional[str] = Field(
        default=None,
        description="Path to the generated WAV file (if output_format='file')",
    )
    audio_base64: Optional[str] = Field(
        default=None,
        description="Base64-encoded WAV audio (if output_format='base64')",
    )
    error: Optional[str] = Field(
        default=None,
        description="Error message (if status='error')",
    )


# ───────────────────────── Celery vs Thread task ─────────────────────────── #
def _run_render_pipeline(payload: dict) -> None:
    """Translate a job payload dict into a ``render_pipeline`` call.

    ``avatar_path``, ``audio_path`` and ``out_path`` are required keys; the
    remaining keys are optional and default to ``None`` (``quality_mode``
    defaults to ``"auto"``).
    """
    render_pipeline(
        face_image=payload["avatar_path"],
        audio=payload["audio_path"],
        reference_video=payload.get("driver_video"),
        viseme_json=payload.get("viseme_json"),
        quality_mode=payload.get("quality_mode", "auto"),
        out_path=payload["out_path"],
        enhancements=payload.get("enhancements"),
        transcript=payload.get("transcript"),
    )


if celery_available:

    @celery_app.task(name="render_video_task")
    def _render_video_task(payload: dict):
        """Celery task: render the job described by ``payload``."""
        _run_render_pipeline(payload)

else:

    def _render_video_thread(payload: dict):
        """Background-task variant: render, then drop a ``done`` marker.

        The marker is what GET /status/{id} looks for when Celery is not in
        use. It is written only after the pipeline returns, so a pipeline
        exception leaves the job without a marker.
        """
        _run_render_pipeline(payload)
        job_folder = WORK_ROOT / payload["job_id"]
        (job_folder / "done").touch()


# ───────────────────────────── REST endpoints ────────────────────────────── #
@app.post("/render")
def render_job(body: RenderBody, bg: BackgroundTasks):
    """Queue a render for files that already live on the server.

    A fresh job folder is created under ``WORK_ROOT``, the request is saved
    there as ``request.json``, and the work is handed either to Celery (the
    task id is stored in ``celery_id``) or to FastAPI background tasks.

    Returns:
        dict: ``jobId``, ``statusUrl`` and ``async`` (True only when the job
        was dispatched to Celery).
    """
    job_id = str(uuid.uuid4())
    job_dir = WORK_ROOT / job_id
    job_dir.mkdir(parents=True, exist_ok=True)

    # An explicit list in the request wins; otherwise fall back to the
    # comma-separated default from settings (if any).
    chosen = body.enhancements
    if chosen is None:
        fallback_spec = settings.DEFAULT_ENHANCEMENTS
        if fallback_spec:
            chosen = [part.strip() for part in fallback_spec.split(",") if part.strip()]

    payload = dict(
        job_id=job_id,
        avatar_path=body.avatarPath,
        audio_path=body.audioPath,
        driver_video=body.driverVideo,
        viseme_json=body.visemeJson,
        quality_mode=body.qualityMode,
        out_path=str(job_dir / "out.mp4"),
        enhancements=chosen,
        transcript=body.transcript,
    )

    # Keep a copy of the request exactly as the client phrased it.
    request_dump = json.dumps(body.dict(by_alias=True), indent=2)
    (job_dir / "request.json").write_text(request_dump)

    dispatched_to_celery = bool(celery_available)
    if dispatched_to_celery:
        task = _render_video_task.delay(payload)  # type: ignore
        (job_dir / "celery_id").write_text(task.id)
    else:
        bg.add_task(_render_video_thread, payload)

    return {
        "jobId": job_id,
        "statusUrl": f"/status/{job_id}",
        "async": dispatched_to_celery,
    }


def _safe_upload_name(filename: Optional[str], fallback: str) -> str:
    """Reduce a client-supplied filename to a single path component.

    Directory parts (with either slash style) and NUL bytes are dropped.
    ``fallback`` is returned when nothing usable remains.
    """
    leaf = (filename or "").replace("\\", "/").rsplit("/", 1)[-1]
    leaf = leaf.replace("\x00", "").strip()
    if leaf in ("", ".", ".."):
        return fallback
    return leaf


@app.post("/render-upload")
async def render_upload(
    bg: BackgroundTasks,
    avatar: UploadFile = File(...),
    audio: UploadFile = File(...),
    qualityMode: str = Form("auto"),
    enhancements: Optional[str] = Form(None),
    transcript: Optional[str] = Form(None),
):
    """Upload avatar image + audio, start render job, return jobId + status URL.

    enhancements: Comma-separated list of enhancement names (e.g., 'emotion_expressions,eye_gaze_blink')
    transcript: Optional text transcript of the audio

    The uploads are stored in a new job folder under ``WORK_ROOT``. Only the
    final path component of each client-supplied filename is used for the
    stored file, so names carrying directory parts can neither escape the job
    folder nor point at a non-existent sub-directory.

    Returns:
        dict: ``jobId``, ``statusUrl`` and ``async`` (True only when the job
        was dispatched to Celery).
    """
    job_id = str(uuid.uuid4())
    job_dir = WORK_ROOT / job_id
    job_dir.mkdir(parents=True, exist_ok=True)

    # Save uploaded files under sanitised names
    avatar_path = job_dir / f"avatar_{_safe_upload_name(avatar.filename, 'upload')}"
    audio_path = job_dir / f"audio_{_safe_upload_name(audio.filename, 'upload')}"
    out_mp4 = job_dir / "out.mp4"

    # NOTE(review): these copies are synchronous and run inside an async
    # handler, so a large upload occupies the event loop while it is written.
    with avatar_path.open("wb") as f:
        shutil.copyfileobj(avatar.file, f)
    with audio_path.open("wb") as f:
        shutil.copyfileobj(audio.file, f)

    # Parse enhancements from the comma-separated form field, falling back to
    # the configured default when the field is absent or empty.
    enhancement_spec = enhancements or settings.DEFAULT_ENHANCEMENTS
    active_enhancements = None
    if enhancement_spec:
        active_enhancements = [e.strip() for e in enhancement_spec.split(",") if e.strip()]

    payload = {
        "job_id": job_id,
        "avatar_path": str(avatar_path),
        "audio_path": str(audio_path),
        "quality_mode": qualityMode,
        "out_path": str(out_mp4),
        "enhancements": active_enhancements,
        "transcript": transcript,
    }

    # save upload metadata (original, unsanitised filenames are kept here for
    # reference only; they are never used as paths)
    (job_dir / "upload.json").write_text(
        json.dumps(
            {
                "avatar_filename": avatar.filename,
                "audio_filename": audio.filename,
                "quality_mode": qualityMode,
                "enhancements": active_enhancements,
            },
            indent=2,
        )
    )

    if celery_available:
        task = _render_video_task.delay(payload)  # type: ignore
        (job_dir / "celery_id").write_text(task.id)
        async_mode = True
    else:
        bg.add_task(_render_video_thread, payload)
        async_mode = False

    return {
        "jobId": job_id,
        "statusUrl": f"/status/{job_id}",
        "async": async_mode,
    }


@app.get("/status/{job_id}")
def get_status(job_id: str):
    """Fetch job state or return the completed MP4.

    Args:
        job_id: identifier returned by POST /render or POST /render-upload.

    Returns:
        The rendered ``out.mp4`` as a file response when it exists, otherwise
        ``{"state": ...}``: the Celery task state when Celery is enabled, or
        ``"finished"`` / ``"processing"`` based on the ``done`` marker.

    Raises:
        HTTPException: 404 when ``job_id`` is not a canonical UUID or no such
            job folder exists; 500 when Celery is enabled but the job has no
            recorded task id.
    """
    # Job ids are minted with str(uuid.uuid4()). Rejecting anything else
    # keeps values such as "..", "." or "tts-<id>" from being resolved
    # against WORK_ROOT as if they were job folders.
    try:
        canonical_id = str(uuid.UUID(job_id))
    except ValueError:
        canonical_id = None
    if canonical_id != job_id:
        raise HTTPException(404, "job not found")

    job_dir = WORK_ROOT / job_id
    if not job_dir.is_dir():
        raise HTTPException(404, "job not found")

    # NOTE(review): the file is served as soon as it exists; whether the
    # pipeline writes it atomically is not visible from this module — confirm.
    out_mp4 = job_dir / "out.mp4"
    if out_mp4.exists():
        return FileResponse(out_mp4, media_type="video/mp4")

    if celery_available:
        celery_id_path = job_dir / "celery_id"
        if not celery_id_path.exists():
            raise HTTPException(500, "job metadata missing")
        # strip() guards against stray whitespace in the stored id
        task_id = celery_id_path.read_text().strip()
        task = AsyncResult(task_id, app=celery_app)
        return {"state": task.state}

    done_marker = job_dir / "done"
    state = "finished" if done_marker.exists() else "processing"
    return {"state": state}


# ─────────────────────── Text-to-Audio endpoint ────────────────────────────── #
@app.post("/text-to-audio", response_model=TextToAudioResponse)
async def text_to_audio(body: TextToAudioRequest):
    """
    Convert text to speech using the Chatterbox TTS service.

    This endpoint generates audio from text and returns either:
    - A WAV file path (if output_format='file')
    - Base64-encoded WAV audio (if output_format='base64')

    The generated audio can be used with the avatar rendering pipeline.

    Numeric options fall back to their defaults only when they are ``None``,
    so an explicit ``0.0`` is forwarded to the TTS client unchanged.

    Raises:
        HTTPException: 503 when the TTS client module could not be imported.

    Failures during synthesis are not raised; they are returned as a response
    whose ``status`` is ``"error"``.
    """
    if not tts_available:
        raise HTTPException(
            503,
            "TTS service is not available. Please check the Chatterbox TTS server configuration."
        )

    def _or_default(value, default):
        # `value or default` would also replace a legitimate 0.0
        return default if value is None else value

    try:
        # Determine output format
        output_format = body.output_format or "file"

        # Get voice and language (use defaults from settings if not specified)
        voice = body.voice or settings.CHATTERBOX_DEFAULT_VOICE
        language = body.language or settings.CHATTERBOX_DEFAULT_LANGUAGE

        # Options shared by both synthesis calls
        synth_options = {
            "voice": voice,
            "language": language,
            "temperature": _or_default(body.temperature, 0.7),
            "cfg_weight": _or_default(body.cfg_weight, 0.4),
            "exaggeration": _or_default(body.exaggeration, 0.3),
            "speed": _or_default(body.speed, 1.0),
        }

        if output_format == "base64":
            # Return base64-encoded WAV
            audio_base64 = await tts_wav_base64_async(body.text, **synth_options)
            return TextToAudioResponse(
                status="success",
                audio_base64=audio_base64,
            )

        # Any other format value: generate WAV and save it to a file
        wav_bytes = await tts_wav_bytes_async(body.text, **synth_options)

        # Save to a file in the work directory
        audio_id = str(uuid.uuid4())
        audio_dir = WORK_ROOT / f"tts-{audio_id}"
        audio_dir.mkdir(parents=True, exist_ok=True)
        audio_path = audio_dir / "audio.wav"
        audio_path.write_bytes(wav_bytes)

        return TextToAudioResponse(
            status="success",
            audio_path=str(audio_path),
        )

    except ChatterboxTtsError as exc:
        return TextToAudioResponse(
            status="error",
            error=f"TTS generation failed: {str(exc)}",
        )
    except Exception as exc:
        return TextToAudioResponse(
            status="error",
            error=f"Unexpected error: {str(exc)}",
        )


# ────────────────────── Health & Readiness probes ────────────────────────── #
@app.get("/health/live")
def liveness():
    """Liveness probe: always answers with ``{"status": "alive"}``."""
    alive_body = {"status": "alive"}
    return JSONResponse(alive_body)


@app.get("/health/ready")
def readiness():
    """Readiness probe.

    Without Celery the service is reported ready immediately. With Celery a
    control ping (1 second timeout) is issued first, and any exception it
    raises is turned into a 503.
    """
    if not celery_available:
        return JSONResponse({"status": "ready"})

    try:
        celery_app.control.ping(timeout=1)
    except Exception as err:
        raise HTTPException(503, f"celery ping failed: {err}") from err

    return JSONResponse({"status": "ready"})


@app.get("/health/tts")
async def tts_health():
    """Report whether the Chatterbox TTS backend is reachable.

    Responds with a JSON document containing the configured server URL and
    the details returned by the client's health call. Every failure path
    (client module missing, client error, any other exception) yields 503.
    """
    if not tts_available:
        raise HTTPException(
            status_code=503,
            detail="TTS service is not available. TTS module could not be imported.",
        )

    try:
        details = await chatterbox_health_async()
        report = {
            "status": "healthy",
            "tts_server": settings.CHATTERBOX_URL,
            "details": details,
        }
        return JSONResponse(report)
    except ChatterboxTtsError as exc:
        message = f"TTS health check failed: {exc!s}"
        raise HTTPException(503, message) from exc
    except Exception as exc:
        message = f"Unexpected TTS health check error: {exc!s}"
        raise HTTPException(503, message) from exc


@app.get("/avatars")
def list_avatars():
    """List available avatar models and their status.

    This endpoint provides health status for all avatar rendering models,
    indicating which models are available and ready for use.

    The response also reports CUDA/GPU details, which rendering modes are
    usable, the registered enhancements (when that module can be imported)
    and the TTS configuration. If ``torch`` cannot be imported the system is
    reported as having no CUDA support instead of failing the request.
    """
    import logging

    # torch is imported lazily so that merely loading this module does not
    # pull it in; a missing install degrades to "no GPU".
    try:
        import torch
    except ImportError:
        torch = None

    # Model checkpoint paths from settings
    model_checks = {
        "fomm": {
            "name": "First Order Motion Model",
            "path": settings.FOMM_CKPT_DIR / "vox-cpk.pth.tar",
            "purpose": "Head pose and expression generation",
            "required_for": "high_quality"
        },
        "diff2lip": {
            "name": "Diff2Lip",
            "path": settings.DIFF2LIP_CKPT_DIR / "Diff2Lip.pth",
            "purpose": "Photorealistic lip synchronization",
            "required_for": "high_quality"
        },
        "sadtalker": {
            "name": "SadTalker",
            "path": settings.SADTALKER_CKPT_DIR / "sadtalker.pth",
            "purpose": "Talking head generation (fallback)",
            "required_for": "real_time"
        },
        "wav2lip": {
            "name": "Wav2Lip",
            "path": settings.WAV2LIP_CKPT,
            "purpose": "Lip synchronization GAN (fallback)",
            "required_for": "real_time"
        },
        "gfpgan": {
            "name": "GFPGAN",
            "path": settings.GFPGAN_CKPT,
            "purpose": "Face enhancement",
            "required_for": "both"
        }
    }

    # A model counts as available when its checkpoint path exists on disk.
    models_status = {}
    for model_id, info in model_checks.items():
        path = Path(info["path"])
        models_status[model_id] = {
            "name": info["name"],
            "purpose": info["purpose"],
            "required_for": info["required_for"],
            "available": path.exists(),
            "path": str(path)
        }

    # System capabilities
    cuda_available = torch.cuda.is_available() if torch is not None else False
    gpu_count = torch.cuda.device_count() if cuda_available else 0
    gpu_info = []
    for i in range(gpu_count):
        try:
            gpu_info.append({
                "id": i,
                "name": torch.cuda.get_device_name(i),
                "memory_total_gb": round(torch.cuda.get_device_properties(i).total_memory / 1024**3, 2)
            })
        except Exception:
            # Best effort: the device is left out of the listing, but the
            # failure is logged instead of being silently discarded.
            logging.getLogger(__name__).warning(
                "could not query CUDA device %d", i, exc_info=True
            )

    # Determine rendering modes available
    high_quality_ready = (
        models_status["fomm"]["available"] and
        models_status["diff2lip"]["available"] and
        cuda_available
    )
    real_time_ready = (
        models_status["sadtalker"]["available"] and
        models_status["wav2lip"]["available"]
    )

    # Enhancements status (optional module; absent means an empty list)
    enhancements_info = []
    try:
        from .enhancements import registry as enhancement_registry
        enhancements_info = enhancement_registry.get_info_all()
    except ImportError:
        pass

    return JSONResponse({
        "status": "ready",
        "models": models_status,
        "system": {
            "cuda_available": cuda_available,
            "gpu_count": gpu_count,
            "gpus": gpu_info,
            "celery_enabled": celery_available,
            "tts_enabled": tts_available
        },
        "rendering_modes": {
            "high_quality": {
                "available": high_quality_ready,
                "description": "FOMM + Diff2Lip pipeline for best quality (requires GPU)",
                "models": ["fomm", "diff2lip", "gfpgan"]
            },
            "real_time": {
                "available": real_time_ready,
                "description": "SadTalker + Wav2Lip pipeline for faster processing",
                "models": ["sadtalker", "wav2lip", "gfpgan"]
            },
            "cinematic": {
                "available": any(e["name"] == "hallo3_cinematic" and e["available"] for e in enhancements_info),
                "description": "Hallo3 Diffusion Transformer for cinematic quality (slow, GPU intensive)",
                "models": ["hallo3"]
            },
            "3d": {
                "available": any(e["name"] == "gaussian_splatting" and e["available"] for e in enhancements_info),
                "description": "InsTaG 3D Gaussian Splatting for real-time 3D avatars",
                "models": ["instag"]
            }
        },
        "enhancements": enhancements_info,
        "tts": {
            "available": tts_available,
            "server_url": settings.CHATTERBOX_URL if tts_available else None,
            "default_voice": settings.CHATTERBOX_DEFAULT_VOICE if tts_available else None,
            "default_language": settings.CHATTERBOX_DEFAULT_LANGUAGE if tts_available else None,
            "description": "Chatterbox TTS for text-to-speech synthesis"
        }
    })