Commit ·
2e0989d
1
Parent(s): 447c73c
Added engine-specific parameter issue
Browse files
app.py
CHANGED
|
@@ -17,7 +17,7 @@ from pathlib import Path
|
|
| 17 |
from fastapi import FastAPI, Request, HTTPException
|
| 18 |
from fastapi.responses import Response, JSONResponse, HTMLResponse
|
| 19 |
from pydantic import BaseModel, Field
|
| 20 |
-
from typing import Optional
|
| 21 |
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger("chatterbox-engine")
|
|
@@ -300,6 +300,7 @@ class ConvertRequest(BaseModel):
|
|
| 300 |
volume: int = Field(default=75, ge=1, le=100)
|
| 301 |
speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
|
| 302 |
pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
@app.post("/GetEngineDetails")
|
|
@@ -316,6 +317,32 @@ async def get_engine_details(request: Request):
|
|
| 316 |
"supports_voice_cloning": True,
|
| 317 |
"builtin_voices": [],
|
| 318 |
"supported_emotions": CANONICAL_EMOTIONS,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
"extra_properties": {
|
| 320 |
"model": "ResembleAI/chatterbox",
|
| 321 |
"max_characters": MAX_CHARS,
|
|
@@ -411,13 +438,25 @@ async def convert_text_to_speech(request: Request):
|
|
| 411 |
|
| 412 |
dominant_emotion = req.emotion_set[0].lower(
|
| 413 |
) if req.emotion_set else "neutral"
|
| 414 |
-
base_exaggeration = EMOTION_EXAGGERATION_MAP.get(dominant_emotion, 0.5)
|
| 415 |
intensity_factor = req.intensity / 50.0
|
| 416 |
-
exaggeration = min(1.0, max(0.0, base_exaggeration * intensity_factor))
|
| 417 |
|
| 418 |
-
|
| 419 |
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
emotion_speed = EMOTION_SPEED_MAP.get(dominant_emotion, 1.0)
|
| 423 |
emotion_pitch = EMOTION_PITCH_MAP.get(dominant_emotion, 0.0)
|
|
@@ -425,11 +464,13 @@ async def convert_text_to_speech(request: Request):
|
|
| 425 |
emotion_speed = 1.0 + (emotion_speed - 1.0) * intensity_factor
|
| 426 |
emotion_pitch = emotion_pitch * intensity_factor
|
| 427 |
|
|
|
|
| 428 |
logger.info(
|
| 429 |
f"Generating with Chatterbox: emotion={dominant_emotion}, "
|
| 430 |
f"exaggeration={exaggeration:.2f}, cfg={cfg_weight:.2f}, "
|
| 431 |
f"temperature={temperature:.2f}, emotion_speed={emotion_speed:.3f}, "
|
| 432 |
-
f"emotion_pitch={emotion_pitch:.2f}, text_len={len(text)}"
|
|
|
|
| 433 |
|
| 434 |
wav = tts_model.generate(
|
| 435 |
text,
|
|
|
|
| 17 |
from fastapi import FastAPI, Request, HTTPException
|
| 18 |
from fastapi.responses import Response, JSONResponse, HTMLResponse
|
| 19 |
from pydantic import BaseModel, Field
|
| 20 |
+
from typing import Optional, Dict, Any
|
| 21 |
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger("chatterbox-engine")
|
|
|
|
| 300 |
volume: int = Field(default=75, ge=1, le=100)
|
| 301 |
speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
|
| 302 |
pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
|
| 303 |
+
engine_options: Optional[Dict[str, Any]] = None
|
| 304 |
|
| 305 |
|
| 306 |
@app.post("/GetEngineDetails")
|
|
|
|
| 317 |
"supports_voice_cloning": True,
|
| 318 |
"builtin_voices": [],
|
| 319 |
"supported_emotions": CANONICAL_EMOTIONS,
|
| 320 |
+
"engine_params": [
|
| 321 |
+
{
|
| 322 |
+
"short_name": "exaggeration",
|
| 323 |
+
"friendly_name": "Exaggeration",
|
| 324 |
+
"data_type": "float",
|
| 325 |
+
"min_value": 0.25,
|
| 326 |
+
"max_value": 2.0,
|
| 327 |
+
"default_value": 0.5,
|
| 328 |
+
},
|
| 329 |
+
{
|
| 330 |
+
"short_name": "cfg_weight",
|
| 331 |
+
"friendly_name": "CFG Weight",
|
| 332 |
+
"data_type": "float",
|
| 333 |
+
"min_value": 0.0,
|
| 334 |
+
"max_value": 1.0,
|
| 335 |
+
"default_value": 0.5,
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"short_name": "temperature",
|
| 339 |
+
"friendly_name": "Temperature",
|
| 340 |
+
"data_type": "float",
|
| 341 |
+
"min_value": 0.05,
|
| 342 |
+
"max_value": 5.0,
|
| 343 |
+
"default_value": 0.8,
|
| 344 |
+
},
|
| 345 |
+
],
|
| 346 |
"extra_properties": {
|
| 347 |
"model": "ResembleAI/chatterbox",
|
| 348 |
"max_characters": MAX_CHARS,
|
|
|
|
| 438 |
|
| 439 |
dominant_emotion = req.emotion_set[0].lower(
|
| 440 |
) if req.emotion_set else "neutral"
|
|
|
|
| 441 |
intensity_factor = req.intensity / 50.0
|
|
|
|
| 442 |
|
| 443 |
+
opts = req.engine_options or {}
|
| 444 |
|
| 445 |
+
if "exaggeration" in opts:
|
| 446 |
+
exaggeration = float(max(0.25, min(2.0, opts["exaggeration"])))
|
| 447 |
+
else:
|
| 448 |
+
base_exaggeration = EMOTION_EXAGGERATION_MAP.get(dominant_emotion, 0.5)
|
| 449 |
+
exaggeration = min(1.0, max(0.0, base_exaggeration * intensity_factor))
|
| 450 |
+
|
| 451 |
+
if "cfg_weight" in opts:
|
| 452 |
+
cfg_weight = float(max(0.0, min(1.0, opts["cfg_weight"])))
|
| 453 |
+
else:
|
| 454 |
+
cfg_weight = EMOTION_CFG_MAP.get(dominant_emotion, 0.5)
|
| 455 |
+
|
| 456 |
+
if "temperature" in opts:
|
| 457 |
+
temperature = float(max(0.05, min(5.0, opts["temperature"])))
|
| 458 |
+
else:
|
| 459 |
+
temperature = EMOTION_TEMPERATURE_MAP.get(dominant_emotion, 0.8)
|
| 460 |
|
| 461 |
emotion_speed = EMOTION_SPEED_MAP.get(dominant_emotion, 1.0)
|
| 462 |
emotion_pitch = EMOTION_PITCH_MAP.get(dominant_emotion, 0.0)
|
|
|
|
| 464 |
emotion_speed = 1.0 + (emotion_speed - 1.0) * intensity_factor
|
| 465 |
emotion_pitch = emotion_pitch * intensity_factor
|
| 466 |
|
| 467 |
+
override_keys = [k for k in ("exaggeration", "cfg_weight", "temperature") if k in opts]
|
| 468 |
logger.info(
|
| 469 |
f"Generating with Chatterbox: emotion={dominant_emotion}, "
|
| 470 |
f"exaggeration={exaggeration:.2f}, cfg={cfg_weight:.2f}, "
|
| 471 |
f"temperature={temperature:.2f}, emotion_speed={emotion_speed:.3f}, "
|
| 472 |
+
f"emotion_pitch={emotion_pitch:.2f}, text_len={len(text)}"
|
| 473 |
+
+ (f", overrides={override_keys}" if override_keys else ""))
|
| 474 |
|
| 475 |
wav = tts_model.generate(
|
| 476 |
text,
|