Ftps committed on
Commit
08e674b
·
1 Parent(s): 05fde9a

Replace WebSocket with Gradio Streaming

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. app.py +38 -36
  3. tabs/api/realtime_api.py +87 -433
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -97,6 +97,41 @@ with gr.Blocks(
97
  with gr.Tab(i18n("Settings")):
98
  settings_tab()
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  gr.Markdown(
101
  """
102
  <div style="text-align: center; font-size: 0.9em; text-color: a3a3a3;">
@@ -106,41 +141,8 @@ with gr.Blocks(
106
  )
107
 
108
 
109
- def create_app():
110
- """Create FastAPI app with Gradio and Realtime API integrated"""
111
- from fastapi import FastAPI
112
- from fastapi.middleware.cors import CORSMiddleware
113
-
114
- from tabs.api.realtime_api import router as realtime_router
115
- from tabs.api.realtime_api import websocket_realtime
116
-
117
- app = FastAPI(title="Applio API")
118
-
119
- app.add_middleware(
120
- CORSMiddleware,
121
- allow_origins=["*"],
122
- allow_credentials=True,
123
- allow_methods=["*"],
124
- allow_headers=["*"],
125
- )
126
-
127
- app.include_router(realtime_router)
128
- app.add_api_websocket_route("/ws/realtime/{session_id}", websocket_realtime)
129
-
130
- app = gr.mount_gradio_app(
131
- app,
132
- Applio,
133
- path="/",
134
  allowed_paths=["/app/assets/audios/", "/home/user/app/assets/audios/"],
135
  )
136
-
137
- return app
138
-
139
-
140
- app = create_app()
141
-
142
-
143
- if __name__ == "__main__":
144
- import uvicorn
145
-
146
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
97
  with gr.Tab(i18n("Settings")):
98
  settings_tab()
99
 
100
+ with gr.Tab("Realtime API"):
101
+ from tabs.api.realtime_api import (
102
+ get_available_models,
103
+ process_audio_stream,
104
+ )
105
+
106
+ gr.Markdown("### Realtime Voice Conversion (Streaming)")
107
+ with gr.Row():
108
+ rt_model = gr.Dropdown(
109
+ label="Model",
110
+ choices=get_available_models(),
111
+ value=None,
112
+ )
113
+ rt_pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch")
114
+ rt_index_rate = gr.Slider(0, 1, value=0.75, step=0.05, label="Index Rate")
115
+
116
+ rt_state = gr.State(None)
117
+ rt_input = gr.Audio(
118
+ sources=["microphone"],
119
+ streaming=True,
120
+ label="Input (Microphone)",
121
+ )
122
+ rt_output = gr.Audio(
123
+ streaming=True,
124
+ label="Output",
125
+ autoplay=True,
126
+ )
127
+
128
+ rt_input.stream(
129
+ fn=process_audio_stream,
130
+ inputs=[rt_state, rt_input, rt_model, rt_pitch, rt_index_rate],
131
+ outputs=[rt_state, rt_output],
132
+ api_name="realtime_convert",
133
+ )
134
+
135
  gr.Markdown(
136
  """
137
  <div style="text-align: center; font-size: 0.9em; text-color: a3a3a3;">
 
141
  )
142
 
143
 
144
+ if __name__ == "__main__":
145
+ Applio.launch(
146
+ server_name="0.0.0.0",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  allowed_paths=["/app/assets/audios/", "/home/user/app/assets/audios/"],
148
  )
 
 
 
 
 
 
 
 
 
 
 
tabs/api/realtime_api.py CHANGED
@@ -1,205 +1,36 @@
1
- """
2
- Realtime Voice Conversion WebSocket API
3
-
4
- This module provides WebSocket-based realtime voice conversion API
5
- for integration with Flutter and other client applications.
6
-
7
- API Endpoints:
8
- - WS /ws/realtime/{session_id} - WebSocket for realtime audio streaming
9
- - POST /api/realtime/start - Start a new session
10
- - POST /api/realtime/stop - Stop and cleanup a session
11
- - GET /api/realtime/models - List available models
12
- """
13
-
14
  import os
15
  import sys
16
- import uuid
17
- import base64
18
  import numpy as np
19
- from typing import Dict, Optional, Any
20
- from dataclasses import dataclass, field
21
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException, APIRouter
22
- from fastapi.middleware.cors import CORSMiddleware
23
- from pydantic import BaseModel
24
- import asyncio
25
- import time
26
- import json
27
 
28
  now_dir = os.getcwd()
29
  sys.path.append(now_dir)
30
 
31
- # Constants
32
- AUDIO_SAMPLE_RATE = 48000 # Will be validated against actual value on first use
33
-
34
- # Session storage
35
- @dataclass
36
- class RealtimeSession:
37
- session_id: str
38
- voice_changer: Any # VoiceChanger instance
39
- model_name: str
40
- created_at: float
41
- last_active: float
42
- settings: dict = field(default_factory=dict)
43
-
44
-
45
- class SessionManager:
46
- def __init__(self, max_sessions: int = 10, session_timeout: int = 300):
47
- self.sessions: Dict[str, RealtimeSession] = {}
48
- self.max_sessions = max_sessions
49
- self.session_timeout = session_timeout
50
- self._lock = asyncio.Lock()
51
-
52
- async def create_session(
53
- self,
54
- model_path: str,
55
- index_path: str = "",
56
- model_name: str = "",
57
- f0_method: str = "rmvpe",
58
- chunk_size_ms: float = 100,
59
- cross_fade_size: float = 0.05,
60
- extra_convert_size: float = 0.5,
61
- **kwargs
62
- ) -> str:
63
- # Lazy import VoiceChanger
64
- from rvc.realtime.core import VoiceChanger, AUDIO_SAMPLE_RATE as ACTUAL_SAMPLE_RATE
65
-
66
- async with self._lock:
67
- # Cleanup expired sessions
68
- await self._cleanup_expired()
69
-
70
- if len(self.sessions) >= self.max_sessions:
71
- raise RuntimeError(f"Maximum sessions ({self.max_sessions}) reached")
72
-
73
- session_id = str(uuid.uuid4())[:8]
74
-
75
- # Convert chunk_size_ms to read_chunk_size
76
- read_chunk_size = int(chunk_size_ms * ACTUAL_SAMPLE_RATE / 1000 / 128)
77
-
78
- voice_changer = VoiceChanger(
79
- read_chunk_size=read_chunk_size,
80
- cross_fade_overlap_size=cross_fade_size,
81
- extra_convert_size=extra_convert_size,
82
- model_path=model_path,
83
- index_path=index_path,
84
- f0_method=f0_method,
85
- embedder_model=kwargs.get("embedder_model", "contentvec"),
86
- silent_threshold=kwargs.get("silent_threshold", -60),
87
- vad_enabled=kwargs.get("vad_enabled", True),
88
- sid=kwargs.get("sid", 0),
89
- )
90
-
91
- now = time.time()
92
- self.sessions[session_id] = RealtimeSession(
93
- session_id=session_id,
94
- voice_changer=voice_changer,
95
- model_name=model_name,
96
- created_at=now,
97
- last_active=now,
98
- settings={
99
- "pitch": kwargs.get("pitch", 0),
100
- "index_rate": kwargs.get("index_rate", 0.75),
101
- "protect": kwargs.get("protect", 0.5),
102
- "volume_envelope": kwargs.get("volume_envelope", 1.0),
103
- "f0_autotune": kwargs.get("f0_autotune", False),
104
- "f0_autotune_strength": kwargs.get("f0_autotune_strength", 1.0),
105
- }
106
- )
107
-
108
- return session_id
109
-
110
- async def get_session(self, session_id: str) -> Optional[RealtimeSession]:
111
- session = self.sessions.get(session_id)
112
- if session:
113
- session.last_active = time.time()
114
- return session
115
-
116
- async def remove_session(self, session_id: str) -> bool:
117
- async with self._lock:
118
- if session_id in self.sessions:
119
- session = self.sessions.pop(session_id)
120
- del session.voice_changer
121
- return True
122
- return False
123
-
124
- async def _cleanup_expired(self):
125
- now = time.time()
126
- expired = [
127
- sid for sid, session in self.sessions.items()
128
- if now - session.last_active > self.session_timeout
129
- ]
130
- for sid in expired:
131
- session = self.sessions.pop(sid)
132
- del session.voice_changer
133
-
134
-
135
- # Global session manager
136
- session_manager = SessionManager()
137
-
138
- # Pydantic models for API
139
- class StartSessionRequest(BaseModel):
140
- model_zip_link: Optional[str] = None
141
- model_name: str
142
- pitch: int = 0
143
- index_rate: float = 0.75
144
- f0_method: str = "rmvpe"
145
- chunk_size_ms: float = 100
146
- cross_fade_size: float = 0.05
147
- extra_convert_size: float = 0.5
148
- protect: float = 0.5
149
- volume_envelope: float = 1.0
150
- f0_autotune: bool = False
151
- f0_autotune_strength: float = 1.0
152
- vad_enabled: bool = True
153
- silent_threshold: int = -60
154
- sid: int = 0
155
- embedder_model: str = "contentvec"
156
-
157
-
158
- class StartSessionResponse(BaseModel):
159
- session_id: str
160
- message: str
161
- websocket_url: str
162
- sample_rate: int
163
- chunk_size_samples: int
164
-
165
-
166
- class StopSessionRequest(BaseModel):
167
- session_id: str
168
-
169
-
170
- class UpdateSettingsRequest(BaseModel):
171
- session_id: str
172
- pitch: Optional[int] = None
173
- index_rate: Optional[float] = None
174
- protect: Optional[float] = None
175
- volume_envelope: Optional[float] = None
176
- f0_autotune: Optional[bool] = None
177
- f0_autotune_strength: Optional[float] = None
178
-
179
-
180
- class ConvertRequest(BaseModel):
181
- session_id: str
182
- audio_base64: str
183
-
184
 
185
- # Create API Router
186
- router = APIRouter(prefix="/api/realtime", tags=["realtime"])
187
 
188
- LOGS_DIR = os.path.join(now_dir, "logs")
 
 
 
 
 
 
 
189
 
190
 
191
- def get_model_paths(model_name: str):
192
- """Get model paths from model name"""
193
  model_dir = os.path.join(LOGS_DIR, model_name)
194
  if not os.path.exists(model_dir):
195
- return None, None, f"Model directory not found: {model_dir}"
196
 
197
  pth_path = next(
198
  (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".pth")),
199
  None,
200
  )
201
  if not pth_path:
202
- return None, None, ".pth file not found for the selected model."
203
 
204
  index_path = next(
205
  (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".index")),
@@ -208,272 +39,95 @@ def get_model_paths(model_name: str):
208
  return pth_path, index_path, None
209
 
210
 
211
- @router.get("/models")
212
- async def list_models():
213
- """List available voice models"""
214
- if not os.path.exists(LOGS_DIR):
215
- return {"models": []}
216
-
217
- models = [
218
- d for d in os.listdir(LOGS_DIR)
219
- if os.path.isdir(os.path.join(LOGS_DIR, d))
220
- ]
221
- return {"models": models}
222
-
223
-
224
- @router.get("/sessions")
225
- async def list_sessions():
226
- """List active sessions"""
227
- sessions = [
228
- {
229
- "session_id": s.session_id,
230
- "model_name": s.model_name,
231
- "created_at": s.created_at,
232
- "last_active": s.last_active,
233
  }
234
- for s in session_manager.sessions.values()
235
- ]
236
- return {"sessions": sessions}
237
 
 
 
 
 
 
 
 
 
238
 
239
- @router.post("/start", response_model=StartSessionResponse)
240
- async def start_session(request: StartSessionRequest):
241
- """Start a new realtime voice conversion session"""
242
- # Get model paths
243
- pth_path, index_path, error = get_model_paths(request.model_name)
244
- if error:
245
- raise HTTPException(status_code=404, detail=error)
246
 
247
- try:
248
- session_id = await session_manager.create_session(
 
 
249
  model_path=pth_path,
250
  index_path=index_path,
251
- model_name=request.model_name,
252
- f0_method=request.f0_method,
253
- chunk_size_ms=request.chunk_size_ms,
254
- cross_fade_size=request.cross_fade_size,
255
- extra_convert_size=request.extra_convert_size,
256
- pitch=request.pitch,
257
- index_rate=request.index_rate,
258
- protect=request.protect,
259
- volume_envelope=request.volume_envelope,
260
- f0_autotune=request.f0_autotune,
261
- f0_autotune_strength=request.f0_autotune_strength,
262
- vad_enabled=request.vad_enabled,
263
- silent_threshold=request.silent_threshold,
264
- sid=request.sid,
265
- embedder_model=request.embedder_model,
266
  )
267
-
268
- chunk_size_samples = int(request.chunk_size_ms * AUDIO_SAMPLE_RATE / 1000)
269
-
270
- return StartSessionResponse(
271
- session_id=session_id,
272
- message=f"Session started with model '{request.model_name}'",
273
- websocket_url=f"/ws/realtime/{session_id}",
274
- sample_rate=AUDIO_SAMPLE_RATE,
275
- chunk_size_samples=chunk_size_samples,
276
- )
277
- except RuntimeError as e:
278
- raise HTTPException(status_code=503, detail=str(e))
279
- except Exception as e:
280
- raise HTTPException(status_code=500, detail=f"Failed to start session: {str(e)}")
281
-
282
-
283
- @router.post("/stop")
284
- async def stop_session(request: StopSessionRequest):
285
- """Stop and cleanup a realtime session"""
286
- removed = await session_manager.remove_session(request.session_id)
287
- if removed:
288
- return {"message": f"Session {request.session_id} stopped"}
289
- else:
290
- raise HTTPException(status_code=404, detail="Session not found")
291
-
292
-
293
- @router.post("/settings")
294
- async def update_settings(request: UpdateSettingsRequest):
295
- """Update session settings without restarting"""
296
- session = await session_manager.get_session(request.session_id)
297
- if not session:
298
- raise HTTPException(status_code=404, detail="Session not found")
299
-
300
- # Update only provided settings
301
- if request.pitch is not None:
302
- session.settings["pitch"] = request.pitch
303
- if request.index_rate is not None:
304
- session.settings["index_rate"] = request.index_rate
305
- if request.protect is not None:
306
- session.settings["protect"] = request.protect
307
- if request.volume_envelope is not None:
308
- session.settings["volume_envelope"] = request.volume_envelope
309
- if request.f0_autotune is not None:
310
- session.settings["f0_autotune"] = request.f0_autotune
311
- if request.f0_autotune_strength is not None:
312
- session.settings["f0_autotune_strength"] = request.f0_autotune_strength
313
-
314
- return {"message": "Settings updated", "settings": session.settings}
315
-
316
-
317
- @router.post("/convert")
318
- async def convert_audio_http(request: ConvertRequest):
319
- """
320
- HTTP fallback for audio conversion (higher latency than WebSocket)
321
-
322
- Args:
323
- session_id: Active session ID
324
- audio_base64: Base64 encoded float32 PCM audio data
325
-
326
- Returns:
327
- Base64 encoded converted audio
328
- """
329
- session = await session_manager.get_session(request.session_id)
330
- if not session:
331
- raise HTTPException(status_code=404, detail="Session not found")
332
-
333
- try:
334
- # Decode base64 audio
335
- audio_bytes = base64.b64decode(request.audio_base64)
336
- audio_input = np.frombuffer(audio_bytes, dtype=np.float32)
337
-
338
- if len(audio_input) == 0:
339
- raise HTTPException(status_code=400, detail="Empty audio data")
340
-
341
- # Process audio
342
- start_time = time.perf_counter()
343
-
344
- result, vol, latency_info = session.voice_changer.on_request(
345
- audio_input,
346
- f0_up_key=session.settings["pitch"],
347
- index_rate=session.settings["index_rate"],
348
- protect=session.settings["protect"],
349
- volume_envelope=session.settings["volume_envelope"],
350
- f0_autotune=session.settings["f0_autotune"],
351
- f0_autotune_strength=session.settings["f0_autotune_strength"],
352
  )
 
353
 
354
- process_time = (time.perf_counter() - start_time) * 1000
355
-
356
- # Encode result
357
- if result is not None:
358
- result_base64 = base64.b64encode(result.astype(np.float32).tobytes()).decode()
359
- else:
360
- silence = np.zeros(len(audio_input), dtype=np.float32)
361
- result_base64 = base64.b64encode(silence.tobytes()).decode()
362
-
363
- return {
364
- "audio_base64": result_base64,
365
- "volume": float(vol),
366
- "process_time_ms": process_time,
367
- }
368
-
369
- except Exception as e:
370
- raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
371
-
372
-
373
- # WebSocket endpoint (separate from router for path flexibility)
374
- async def websocket_realtime(websocket: WebSocket, session_id: str):
375
- """
376
- WebSocket endpoint for realtime voice conversion
377
-
378
- Protocol:
379
- - Client sends: Binary audio data (float32 PCM, 48kHz, mono)
380
- - Server sends: Binary converted audio data (float32 PCM, 48kHz, mono)
381
-
382
- Message format:
383
- - Binary frames: Raw audio samples as float32
384
- - Text frames: JSON commands (e.g., {"cmd": "ping"}, {"cmd": "settings", ...})
385
- """
386
- session = await session_manager.get_session(session_id)
387
- if not session:
388
- await websocket.close(code=4004, reason="Session not found")
389
- return
390
 
391
- await websocket.accept()
 
392
 
393
- try:
394
- while True:
395
- message = await websocket.receive()
396
 
397
- if message["type"] == "websocket.disconnect":
398
- break
 
 
 
 
 
 
 
399
 
400
- if "bytes" in message:
401
- # Binary audio data
402
- audio_bytes = message["bytes"]
403
 
404
- # Convert bytes to numpy array (float32)
405
- audio_input = np.frombuffer(audio_bytes, dtype=np.float32)
406
-
407
- if len(audio_input) == 0:
408
- continue
409
-
410
- # Process audio
411
- result, vol, latency_info = session.voice_changer.on_request(
412
- audio_input,
413
- f0_up_key=session.settings["pitch"],
414
- index_rate=session.settings["index_rate"],
415
- protect=session.settings["protect"],
416
- volume_envelope=session.settings["volume_envelope"],
417
- f0_autotune=session.settings["f0_autotune"],
418
- f0_autotune_strength=session.settings["f0_autotune_strength"],
419
- )
420
-
421
- # Send converted audio
422
- if result is not None:
423
- await websocket.send_bytes(result.astype(np.float32).tobytes())
424
- else:
425
- # Send silence if no audio
426
- silence = np.zeros(len(audio_input), dtype=np.float32)
427
- await websocket.send_bytes(silence.tobytes())
428
-
429
- elif "text" in message:
430
- # JSON command
431
- try:
432
- cmd = json.loads(message["text"])
433
-
434
- if cmd.get("cmd") == "ping":
435
- await websocket.send_text(json.dumps({"cmd": "pong", "time": time.time()}))
436
-
437
- elif cmd.get("cmd") == "settings":
438
- # Update settings
439
- for key in ["pitch", "index_rate", "protect", "volume_envelope", "f0_autotune", "f0_autotune_strength"]:
440
- if key in cmd:
441
- session.settings[key] = cmd[key]
442
- await websocket.send_text(json.dumps({"cmd": "settings_updated", "settings": session.settings}))
443
-
444
- elif cmd.get("cmd") == "status":
445
- await websocket.send_text(json.dumps({
446
- "cmd": "status",
447
- "session_id": session_id,
448
- "model": session.model_name,
449
- "settings": session.settings,
450
- }))
451
-
452
- except json.JSONDecodeError:
453
- pass
454
-
455
- except WebSocketDisconnect:
456
- pass
457
- except Exception as e:
458
- print(f"WebSocket error: {e}")
459
- finally:
460
- # Keep session alive for reconnection
461
- pass
462
 
 
 
 
 
 
463
 
464
- # Create FastAPI app with router included
465
- app = FastAPI(title="Realtime Voice Conversion API")
466
 
467
- app.add_middleware(
468
- CORSMiddleware,
469
- allow_origins=["*"],
470
- allow_credentials=True,
471
- allow_methods=["*"],
472
- allow_headers=["*"],
473
- )
474
 
475
- # Include router
476
- app.include_router(router)
477
 
478
- # Add WebSocket route
479
- app.add_api_websocket_route("/ws/realtime/{session_id}", websocket_realtime)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
 
 
3
  import numpy as np
4
+ from typing import Optional, Tuple, Any
 
 
 
 
 
 
 
5
 
6
  now_dir = os.getcwd()
7
  sys.path.append(now_dir)
8
 
9
+ LOGS_DIR = os.path.join(now_dir, "logs")
10
+ SAMPLE_RATE = 48000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
 
12
 
13
def get_available_models(logs_dir: Optional[str] = None) -> list:
    """Return names of model directories containing at least one ``.pth`` file.

    Args:
        logs_dir: Directory to scan. Defaults to the module-level ``LOGS_DIR``
            so existing callers are unaffected.

    Returns:
        List of qualifying subdirectory names (in ``os.listdir`` order);
        empty list when the directory does not exist.
    """
    root = LOGS_DIR if logs_dir is None else logs_dir
    if not os.path.exists(root):
        return []
    models = []
    for entry in os.listdir(root):
        entry_path = os.path.join(root, entry)
        # A usable "model" is a directory holding at least one trained
        # .pth weight file; stray files and empty dirs are skipped.
        if os.path.isdir(entry_path) and any(
            f.endswith(".pth") for f in os.listdir(entry_path)
        ):
            models.append(entry)
    return models
21
 
22
 
23
+ def get_model_paths(model_name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
 
24
  model_dir = os.path.join(LOGS_DIR, model_name)
25
  if not os.path.exists(model_dir):
26
+ return None, None, f"Model not found: {model_name}"
27
 
28
  pth_path = next(
29
  (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".pth")),
30
  None,
31
  )
32
  if not pth_path:
33
+ return None, None, ".pth file not found"
34
 
35
  index_path = next(
36
  (os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith(".index")),
 
39
  return pth_path, index_path, None
40
 
41
 
42
class RealtimeVoiceChanger:
    """Holds a lazily-loaded RVC voice changer plus per-stream settings.

    The underlying model is only instantiated when ``load_model`` succeeds;
    until then ``convert`` is a no-op returning ``None``.
    """

    def __init__(self):
        # No model loaded yet.
        self.voice_changer = None
        self.model_name = None
        # Tunable conversion parameters, applied on every convert() call.
        self.settings = {
            "pitch": 0,
            "index_rate": 0.75,
            "protect": 0.5,
            "volume_envelope": 1.0,
            "f0_autotune": False,
            "f0_autotune_strength": 1.0,
        }

    def load_model(
        self,
        model_name: str,
        f0_method: str = "rmvpe",
        pitch: int = 0,
        index_rate: float = 0.75,
    ) -> str:
        """Instantiate the underlying VoiceChanger for ``model_name``.

        Returns a human-readable status string; on lookup failure the error
        message from ``get_model_paths`` is returned and nothing is loaded.
        """
        # Imported lazily so merely importing this module stays cheap.
        from rvc.realtime.core import VoiceChanger

        pth_path, index_path, error = get_model_paths(model_name)
        if error:
            return error

        self.voice_changer = VoiceChanger(
            read_chunk_size=4,
            cross_fade_overlap_size=0.05,
            extra_convert_size=0.5,
            model_path=pth_path,
            index_path=index_path,
            f0_method=f0_method,
        )
        self.model_name = model_name
        self.settings.update(pitch=pitch, index_rate=index_rate)
        return f"Model '{model_name}' loaded"

    def convert(self, audio: np.ndarray) -> Optional[np.ndarray]:
        """Run one audio chunk through the loaded model; ``None`` if unloaded."""
        changer = self.voice_changer
        if changer is None:
            return None

        opts = self.settings
        result, _, _ = changer.on_request(
            audio,
            f0_up_key=opts["pitch"],
            index_rate=opts["index_rate"],
            protect=opts["protect"],
            volume_envelope=opts["volume_envelope"],
            f0_autotune=opts["f0_autotune"],
            f0_autotune_strength=opts["f0_autotune_strength"],
        )
        return result
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
def create_voice_changer_state() -> RealtimeVoiceChanger:
    """Factory for the per-session converter used as the gr.State initial value."""
    return RealtimeVoiceChanger()
99
 
 
 
 
100
 
101
+ def process_audio_stream(
102
+ state: Optional[RealtimeVoiceChanger],
103
+ audio_chunk: Optional[Tuple[int, np.ndarray]],
104
+ model_name: str,
105
+ pitch: int,
106
+ index_rate: float,
107
+ ) -> Tuple[RealtimeVoiceChanger, Optional[Tuple[int, np.ndarray]]]:
108
+ if state is None:
109
+ state = create_voice_changer_state()
110
 
111
+ if audio_chunk is None:
112
+ return state, None
 
113
 
114
+ sr, audio = audio_chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ if state.model_name != model_name and model_name:
117
+ state.load_model(model_name, pitch=pitch, index_rate=index_rate)
118
+ else:
119
+ state.settings["pitch"] = pitch
120
+ state.settings["index_rate"] = index_rate
121
 
122
+ if audio.ndim > 1:
123
+ audio = audio.mean(axis=1)
124
 
125
+ audio = audio.astype(np.float32)
126
+ if audio.max() > 1.0:
127
+ audio = audio / 32768.0
 
 
 
 
128
 
129
+ converted = state.convert(audio)
 
130
 
131
+ if converted is not None:
132
+ return state, (SAMPLE_RATE, converted)
133
+ return state, None