Spaces:

betatestacct
/

api

Running on CPU Upgrade

App Files Files Community

Ftps committed on Jan 25

Commit

08e674b

1 Parent(s): 05fde9a

Replace WebSocket with Gradio Streaming

Browse files

Files changed (3) hide show

.DS_Store +0 -0
app.py +38 -36
tabs/api/realtime_api.py +87 -433

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -97,6 +97,41 @@ with gr.Blocks(
     with gr.Tab(i18n("Settings")):
         settings_tab()
     gr.Markdown(
         """
     <div style="text-align: center; font-size: 0.9em; text-color: a3a3a3;">
@@ -106,41 +141,8 @@ with gr.Blocks(
     )
-def create_app():
-    """Create FastAPI app with Gradio and Realtime API integrated"""
-    from fastapi import FastAPI
-    from fastapi.middleware.cors import CORSMiddleware
-    from tabs.api.realtime_api import router as realtime_router
-    from tabs.api.realtime_api import websocket_realtime
-    app = FastAPI(title="Applio API")
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=["*"],
-        allow_credentials=True,
-        allow_methods=["*"],
-        allow_headers=["*"],
-    )
-    app.include_router(realtime_router)
-    app.add_api_websocket_route("/ws/realtime/{session_id}", websocket_realtime)
-    app = gr.mount_gradio_app(
-        app,
-        Applio,
-        path="/",
         allowed_paths=["/app/assets/audios/", "/home/user/app/assets/audios/"],
     )
-    return app
-app = create_app()
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

     with gr.Tab(i18n("Settings")):
         settings_tab()
+    with gr.Tab("Realtime API"):
+        from tabs.api.realtime_api import (
+            get_available_models,
+            process_audio_stream,
+        )
+        gr.Markdown("### Realtime Voice Conversion (Streaming)")
+        with gr.Row():
+            rt_model = gr.Dropdown(
+                label="Model",
+                choices=get_available_models(),
+                value=None,
+            )
+            rt_pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch")
+            rt_index_rate = gr.Slider(0, 1, value=0.75, step=0.05, label="Index Rate")
+        rt_state = gr.State(None)
+        rt_input = gr.Audio(
+            sources=["microphone"],
+            streaming=True,
+            label="Input (Microphone)",
+        )
+        rt_output = gr.Audio(
+            streaming=True,
+            label="Output",
+            autoplay=True,
+        )
+        rt_input.stream(
+            fn=process_audio_stream,
+            inputs=[rt_state, rt_input, rt_model, rt_pitch, rt_index_rate],
+            outputs=[rt_state, rt_output],
+            api_name="realtime_convert",
+        )
     gr.Markdown(
         """
     <div style="text-align: center; font-size: 0.9em; text-color: a3a3a3;">
     )
+if __name__ == "__main__":
+    Applio.launch(
+        server_name="0.0.0.0",
         allowed_paths=["/app/assets/audios/", "/home/user/app/assets/audios/"],
     )

tabs/api/realtime_api.py CHANGED Viewed

@@ -1,205 +1,36 @@
-"""
-Realtime Voice Conversion WebSocket API
-This module provides WebSocket-based realtime voice conversion API
-for integration with Flutter and other client applications.
-API Endpoints:
-- WS /ws/realtime/{session_id} - WebSocket for realtime audio streaming
-- POST /api/realtime/start - Start a new session
-- POST /api/realtime/stop - Stop and cleanup a session
-- GET /api/realtime/models - List available models
-"""
 import os
 import sys
-import uuid
-import base64
 import numpy as np
-from typing import Dict, Optional, Any
-from dataclasses import dataclass, field
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException, APIRouter
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-import asyncio
-import time
-import json
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-# Constants
-AUDIO_SAMPLE_RATE = 48000  # Will be validated against actual value on first use
-# Session storage
-@dataclass
-class RealtimeSession:
-    session_id: str
-    voice_changer: Any  # VoiceChanger instance
-    model_name: str
-    created_at: float
-    last_active: float
-    settings: dict = field(default_factory=dict)
-class SessionManager:
-    def __init__(self, max_sessions: int = 10, session_timeout: int = 300):
-        self.sessions: Dict[str, RealtimeSession] = {}
-        self.max_sessions = max_sessions
-        self.session_timeout = session_timeout
-        self._lock = asyncio.Lock()
-    async def create_session(
-        self,
-        model_path: str,
-        index_path: str = "",
-        model_name: str = "",
-        f0_method: str = "rmvpe",
-        chunk_size_ms: float = 100,
-        cross_fade_size: float = 0.05,
-        extra_convert_size: float = 0.5,
-        **kwargs
-    ) -> str:
-        # Lazy import VoiceChanger
-        from rvc.realtime.core import VoiceChanger, AUDIO_SAMPLE_RATE as ACTUAL_SAMPLE_RATE
-        async with self._lock:
-            # Cleanup expired sessions
-            await self._cleanup_expired()
-            if len(self.sessions) >= self.max_sessions:
-                raise RuntimeError(f"Maximum sessions ({self.max_sessions}) reached")
-            session_id = str(uuid.uuid4())[:8]
-            # Convert chunk_size_ms to read_chunk_size
-            read_chunk_size = int(chunk_size_ms * ACTUAL_SAMPLE_RATE / 1000 / 128)
-            voice_changer = VoiceChanger(
-                read_chunk_size=read_chunk_size,
-                cross_fade_overlap_size=cross_fade_size,
-                extra_convert_size=extra_convert_size,
-                model_path=model_path,
-                index_path=index_path,
-                f0_method=f0_method,
-                embedder_model=kwargs.get("embedder_model", "contentvec"),
-                silent_threshold=kwargs.get("silent_threshold", -60),
-                vad_enabled=kwargs.get("vad_enabled", True),
-                sid=kwargs.get("sid", 0),
-            )
-            now = time.time()
-            self.sessions[session_id] = RealtimeSession(
-                session_id=session_id,
-                voice_changer=voice_changer,
-                model_name=model_name,
-                created_at=now,
-                last_active=now,
-                settings={
-                    "pitch": kwargs.get("pitch", 0),
-                    "index_rate": kwargs.get("index_rate", 0.75),
-                    "protect": kwargs.get("protect", 0.5),
-                    "volume_envelope": kwargs.get("volume_envelope", 1.0),
-                    "f0_autotune": kwargs.get("f0_autotune", False),
-                    "f0_autotune_strength": kwargs.get("f0_autotune_strength", 1.0),
-                }
-            )
-            return session_id
-    async def get_session(self, session_id: str) -> Optional[RealtimeSession]:
-        session = self.sessions.get(session_id)
-        if session:
-            session.last_active = time.time()
-        return session
-    async def remove_session(self, session_id: str) -> bool:
-        async with self._lock:
-            if session_id in self.sessions:
-                session = self.sessions.pop(session_id)
-                del session.voice_changer
-                return True
-            return False
-    async def _cleanup_expired(self):
-        now = time.time()
-        expired = [
-            sid for sid, session in self.sessions.items()
-            if now - session.last_active > self.session_timeout
-        ]
-        for sid in expired:
-            session = self.sessions.pop(sid)
-            del session.voice_changer
-# Global session manager
-session_manager = SessionManager()
-# Pydantic models for API
-class StartSessionRequest(BaseModel):
-    model_zip_link: Optional[str] = None
-    model_name: str
-    pitch: int = 0
-    index_rate: float = 0.75
-    f0_method: str = "rmvpe"
-    chunk_size_ms: float = 100
-    cross_fade_size: float = 0.05
-    extra_convert_size: float = 0.5
-    protect: float = 0.5
-    volume_envelope: float = 1.0
-    f0_autotune: bool = False
-    f0_autotune_strength: float = 1.0
-    vad_enabled: bool = True
-    silent_threshold: int = -60
-    sid: int = 0
-    embedder_model: str = "contentvec"
-class StartSessionResponse(BaseModel):
-    session_id: str
-    message: str
-    websocket_url: str
-    sample_rate: int
-    chunk_size_samples: int
-class StopSessionRequest(BaseModel):
-    session_id: str
-class UpdateSettingsRequest(BaseModel):
-    session_id: str
-    pitch: Optional[int] = None
-    index_rate: Optional[float] = None
-    protect: Optional[float] = None
-    volume_envelope: Optional[float] = None
-    f0_autotune: Optional[bool] = None
-    f0_autotune_strength: Optional[float] = None
-class ConvertRequest(BaseModel):
-    session_id: str
-    audio_base64: str
-# Create API Router
-router = APIRouter(prefix="/api/realtime", tags=["realtime"])
-LOGS_DIR = os.path.join(now_dir, "logs")
-def get_model_paths(model_name: str):
-    """Get model paths from model name"""
     model_dir = os.path.join(LOGS_DIR, model_name)
     if not os.path.exists(model_dir):
-        return None, None, f"Model directory not found: {model_dir}"
     pth_path = next(
         (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".pth")),
         None,
     )
     if not pth_path:
-        return None, None, ".pth file not found for the selected model."
     index_path = next(
         (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".index")),
@@ -208,272 +39,95 @@ def get_model_paths(model_name: str):
     return pth_path, index_path, None
-@router.get("/models")
-async def list_models():
-    """List available voice models"""
-    if not os.path.exists(LOGS_DIR):
-        return {"models": []}
-    models = [
-        d for d in os.listdir(LOGS_DIR)
-        if os.path.isdir(os.path.join(LOGS_DIR, d))
-    ]
-    return {"models": models}
-@router.get("/sessions")
-async def list_sessions():
-    """List active sessions"""
-    sessions = [
-        {
-            "session_id": s.session_id,
-            "model_name": s.model_name,
-            "created_at": s.created_at,
-            "last_active": s.last_active,
         }
-        for s in session_manager.sessions.values()
-    ]
-    return {"sessions": sessions}
-@router.post("/start", response_model=StartSessionResponse)
-async def start_session(request: StartSessionRequest):
-    """Start a new realtime voice conversion session"""
-    # Get model paths
-    pth_path, index_path, error = get_model_paths(request.model_name)
-    if error:
-        raise HTTPException(status_code=404, detail=error)
-    try:
-        session_id = await session_manager.create_session(
             model_path=pth_path,
             index_path=index_path,
-            model_name=request.model_name,
-            f0_method=request.f0_method,
-            chunk_size_ms=request.chunk_size_ms,
-            cross_fade_size=request.cross_fade_size,
-            extra_convert_size=request.extra_convert_size,
-            pitch=request.pitch,
-            index_rate=request.index_rate,
-            protect=request.protect,
-            volume_envelope=request.volume_envelope,
-            f0_autotune=request.f0_autotune,
-            f0_autotune_strength=request.f0_autotune_strength,
-            vad_enabled=request.vad_enabled,
-            silent_threshold=request.silent_threshold,
-            sid=request.sid,
-            embedder_model=request.embedder_model,
         )
-        chunk_size_samples = int(request.chunk_size_ms * AUDIO_SAMPLE_RATE / 1000)
-        return StartSessionResponse(
-            session_id=session_id,
-            message=f"Session started with model '{request.model_name}'",
-            websocket_url=f"/ws/realtime/{session_id}",
-            sample_rate=AUDIO_SAMPLE_RATE,
-            chunk_size_samples=chunk_size_samples,
-        )
-    except RuntimeError as e:
-        raise HTTPException(status_code=503, detail=str(e))
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to start session: {str(e)}")
-@router.post("/stop")
-async def stop_session(request: StopSessionRequest):
-    """Stop and cleanup a realtime session"""
-    removed = await session_manager.remove_session(request.session_id)
-    if removed:
-        return {"message": f"Session {request.session_id} stopped"}
-    else:
-        raise HTTPException(status_code=404, detail="Session not found")
-@router.post("/settings")
-async def update_settings(request: UpdateSettingsRequest):
-    """Update session settings without restarting"""
-    session = await session_manager.get_session(request.session_id)
-    if not session:
-        raise HTTPException(status_code=404, detail="Session not found")
-    # Update only provided settings
-    if request.pitch is not None:
-        session.settings["pitch"] = request.pitch
-    if request.index_rate is not None:
-        session.settings["index_rate"] = request.index_rate
-    if request.protect is not None:
-        session.settings["protect"] = request.protect
-    if request.volume_envelope is not None:
-        session.settings["volume_envelope"] = request.volume_envelope
-    if request.f0_autotune is not None:
-        session.settings["f0_autotune"] = request.f0_autotune
-    if request.f0_autotune_strength is not None:
-        session.settings["f0_autotune_strength"] = request.f0_autotune_strength
-    return {"message": "Settings updated", "settings": session.settings}
-@router.post("/convert")
-async def convert_audio_http(request: ConvertRequest):
-    """
-    HTTP fallback for audio conversion (higher latency than WebSocket)
-    Args:
-        session_id: Active session ID
-        audio_base64: Base64 encoded float32 PCM audio data
-    Returns:
-        Base64 encoded converted audio
-    """
-    session = await session_manager.get_session(request.session_id)
-    if not session:
-        raise HTTPException(status_code=404, detail="Session not found")
-    try:
-        # Decode base64 audio
-        audio_bytes = base64.b64decode(request.audio_base64)
-        audio_input = np.frombuffer(audio_bytes, dtype=np.float32)
-        if len(audio_input) == 0:
-            raise HTTPException(status_code=400, detail="Empty audio data")
-        # Process audio
-        start_time = time.perf_counter()
-        result, vol, latency_info = session.voice_changer.on_request(
-            audio_input,
-            f0_up_key=session.settings["pitch"],
-            index_rate=session.settings["index_rate"],
-            protect=session.settings["protect"],
-            volume_envelope=session.settings["volume_envelope"],
-            f0_autotune=session.settings["f0_autotune"],
-            f0_autotune_strength=session.settings["f0_autotune_strength"],
         )
-        process_time = (time.perf_counter() - start_time) * 1000
-        # Encode result
-        if result is not None:
-            result_base64 = base64.b64encode(result.astype(np.float32).tobytes()).decode()
-        else:
-            silence = np.zeros(len(audio_input), dtype=np.float32)
-            result_base64 = base64.b64encode(silence.tobytes()).decode()
-        return {
-            "audio_base64": result_base64,
-            "volume": float(vol),
-            "process_time_ms": process_time,
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
-# WebSocket endpoint (separate from router for path flexibility)
-async def websocket_realtime(websocket: WebSocket, session_id: str):
-    """
-    WebSocket endpoint for realtime voice conversion
-    Protocol:
-    - Client sends: Binary audio data (float32 PCM, 48kHz, mono)
-    - Server sends: Binary converted audio data (float32 PCM, 48kHz, mono)
-    Message format:
-    - Binary frames: Raw audio samples as float32
-    - Text frames: JSON commands (e.g., {"cmd": "ping"}, {"cmd": "settings", ...})
-    """
-    session = await session_manager.get_session(session_id)
-    if not session:
-        await websocket.close(code=4004, reason="Session not found")
-        return
-    await websocket.accept()
-    try:
-        while True:
-            message = await websocket.receive()
-            if message["type"] == "websocket.disconnect":
-                break
-            if "bytes" in message:
-                # Binary audio data
-                audio_bytes = message["bytes"]
-                # Convert bytes to numpy array (float32)
-                audio_input = np.frombuffer(audio_bytes, dtype=np.float32)
-                if len(audio_input) == 0:
-                    continue
-                # Process audio
-                result, vol, latency_info = session.voice_changer.on_request(
-                    audio_input,
-                    f0_up_key=session.settings["pitch"],
-                    index_rate=session.settings["index_rate"],
-                    protect=session.settings["protect"],
-                    volume_envelope=session.settings["volume_envelope"],
-                    f0_autotune=session.settings["f0_autotune"],
-                    f0_autotune_strength=session.settings["f0_autotune_strength"],
-                )
-                # Send converted audio
-                if result is not None:
-                    await websocket.send_bytes(result.astype(np.float32).tobytes())
-                else:
-                    # Send silence if no audio
-                    silence = np.zeros(len(audio_input), dtype=np.float32)
-                    await websocket.send_bytes(silence.tobytes())
-            elif "text" in message:
-                # JSON command
-                try:
-                    cmd = json.loads(message["text"])
-                    if cmd.get("cmd") == "ping":
-                        await websocket.send_text(json.dumps({"cmd": "pong", "time": time.time()}))
-                    elif cmd.get("cmd") == "settings":
-                        # Update settings
-                        for key in ["pitch", "index_rate", "protect", "volume_envelope", "f0_autotune", "f0_autotune_strength"]:
-                            if key in cmd:
-                                session.settings[key] = cmd[key]
-                        await websocket.send_text(json.dumps({"cmd": "settings_updated", "settings": session.settings}))
-                    elif cmd.get("cmd") == "status":
-                        await websocket.send_text(json.dumps({
-                            "cmd": "status",
-                            "session_id": session_id,
-                            "model": session.model_name,
-                            "settings": session.settings,
-                        }))
-                except json.JSONDecodeError:
-                    pass
-    except WebSocketDisconnect:
-        pass
-    except Exception as e:
-        print(f"WebSocket error: {e}")
-    finally:
-        # Keep session alive for reconnection
-        pass
-# Create FastAPI app with router included
-app = FastAPI(title="Realtime Voice Conversion API")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Include router
-app.include_router(router)
-# Add WebSocket route
-app.add_api_websocket_route("/ws/realtime/{session_id}", websocket_realtime)

 import os
 import sys
 import numpy as np
+from typing import Optional, Tuple, Any
 now_dir = os.getcwd()
 sys.path.append(now_dir)
+LOGS_DIR = os.path.join(now_dir, "logs")
+SAMPLE_RATE = 48000
+def get_available_models() -> list:
+    """Return the names of model folders under LOGS_DIR that contain a .pth file.
+
+    Returns:
+        A sorted list of sub-directory names. The list is empty when LOGS_DIR
+        is missing or is not a directory.
+    """
+    # isdir (not exists) so a stray file named like LOGS_DIR cannot reach listdir.
+    if not os.path.isdir(LOGS_DIR):
+        return []
+    models = []
+    # os.listdir returns entries in arbitrary order; sort for a stable result.
+    for entry in sorted(os.listdir(LOGS_DIR)):
+        model_dir = os.path.join(LOGS_DIR, entry)
+        if not os.path.isdir(model_dir):
+            continue
+        if any(name.endswith(".pth") for name in os.listdir(model_dir)):
+            models.append(entry)
+    return models
+def get_model_paths(model_name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
     model_dir = os.path.join(LOGS_DIR, model_name)
     if not os.path.exists(model_dir):
+        return None, None, f"Model not found: {model_name}"
     pth_path = next(
         (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".pth")),
         None,
     )
     if not pth_path:
+        return None, None, ".pth file not found"
     index_path = next(
         (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".index")),
     return pth_path, index_path, None
+class RealtimeVoiceChanger:
+    """Hold one VoiceChanger instance together with its conversion settings."""
+    def __init__(self):
+        # Nothing is loaded until load_model() succeeds.
+        self.voice_changer = None
+        self.model_name = None
+        self.settings = {
+            "pitch": 0,
+            "index_rate": 0.75,
+            "protect": 0.5,
+            "volume_envelope": 1.0,
+            "f0_autotune": False,
+            "f0_autotune_strength": 1.0,
         }
+    def load_model(
+        self,
+        model_name: str,
+        f0_method: str = "rmvpe",
+        pitch: int = 0,
+        index_rate: float = 0.75,
+    ) -> str:
+        """Create a VoiceChanger for model_name and store pitch and index_rate.
+
+        Returns:
+            The error text from get_model_paths when the model files cannot be
+            resolved (the instance is left untouched), otherwise a
+            confirmation message.
+        """
+        # Imported at call time rather than at module import time.
+        from rvc.realtime.core import VoiceChanger
+        pth_path, index_path, problem = get_model_paths(model_name)
+        if problem:
+            return problem
+        self.voice_changer = VoiceChanger(
+            read_chunk_size=4,
+            cross_fade_overlap_size=0.05,
+            extra_convert_size=0.5,
             model_path=pth_path,
             index_path=index_path,
+            f0_method=f0_method,
         )
+        self.model_name = model_name
+        self.settings.update(pitch=pitch, index_rate=index_rate)
+        return f"Model '{model_name}' loaded"
+    def convert(self, audio: np.ndarray) -> Optional[np.ndarray]:
+        """Run audio through the loaded VoiceChanger using the current settings.
+
+        Returns:
+            None when no model has been loaded, otherwise the first element of
+            the three-item result of VoiceChanger.on_request.
+        """
+        engine = self.voice_changer
+        if engine is None:
+            return None
+        opts = self.settings
+        converted, _, _ = engine.on_request(
+            audio,
+            f0_up_key=opts["pitch"],
+            index_rate=opts["index_rate"],
+            protect=opts["protect"],
+            volume_envelope=opts["volume_envelope"],
+            f0_autotune=opts["f0_autotune"],
+            f0_autotune_strength=opts["f0_autotune_strength"],
         )
+        return converted
+def create_voice_changer_state() -> RealtimeVoiceChanger:
+    """Build a new RealtimeVoiceChanger to serve as per-stream state."""
+    fresh_state = RealtimeVoiceChanger()
+    return fresh_state
+def process_audio_stream(
+    state: Optional[RealtimeVoiceChanger],
+    audio_chunk: Optional[Tuple[int, np.ndarray]],
+    model_name: str,
+    pitch: int,
+    index_rate: float,
+) -> Tuple[RealtimeVoiceChanger, Optional[Tuple[int, np.ndarray]]]:
+    """Convert one streamed audio chunk with the selected voice model.
+
+    Args:
+        state: Changer carried between calls; created on the first call.
+        audio_chunk: (sample_rate, samples) pair, or None when there is no audio.
+        model_name: Model to use; it is (re)loaded when it differs from the
+            one currently held by state. An empty value keeps the current model.
+        pitch: Pitch setting stored into the changer's settings.
+        index_rate: Index rate setting stored into the changer's settings.
+
+    Returns:
+        (state, (SAMPLE_RATE, converted)) on success, or (state, None) when
+        there is no audio, the chunk is empty, or nothing was converted.
+    """
+    if state is None:
+        state = create_voice_changer_state()
+    if audio_chunk is None:
+        return state, None
+    # NOTE(review): the incoming sample rate is ignored and the output is always
+    # tagged SAMPLE_RATE - confirm the client records at that rate or resample.
+    _input_rate, audio = audio_chunk
+    if state.model_name != model_name and model_name:
+        state.load_model(model_name, pitch=pitch, index_rate=index_rate)
+    else:
+        state.settings["pitch"] = pitch
+        state.settings["index_rate"] = index_rate
+    audio = np.asarray(audio)
+    # A zero-length chunk has no peak to inspect and nothing to convert.
+    if audio.size == 0:
+        return state, None
+    # Decide scaling from the dtype BEFORE casting: a peak test on the data
+    # misses silent or all-negative integer PCM chunks.
+    is_integer_pcm = np.issubdtype(audio.dtype, np.signedinteger)
+    full_scale = float(np.iinfo(audio.dtype).max) + 1.0 if is_integer_pcm else 32768.0
+    if audio.ndim > 1:
+        # Mix multi-channel input down to mono.
+        audio = audio.mean(axis=1)
+    audio = audio.astype(np.float32)
+    # Non-integer input keeps the legacy rule: values beyond +/-1.0 are treated
+    # as 16-bit-range samples.
+    if is_integer_pcm or np.abs(audio).max() > 1.0:
+        audio = audio / full_scale
+    converted = state.convert(audio)
+    if converted is not None:
+        return state, (SAMPLE_RATE, converted)
+    return state, None