CherithCutestory committed on
Commit
2e0989d
·
1 Parent(s): 447c73c

Added engine-specific parameter issue

Browse files
Files changed (1) hide show
  1. app.py +47 -6
app.py CHANGED
@@ -17,7 +17,7 @@ from pathlib import Path
17
  from fastapi import FastAPI, Request, HTTPException
18
  from fastapi.responses import Response, JSONResponse, HTMLResponse
19
  from pydantic import BaseModel, Field
20
- from typing import Optional
21
 
22
  logging.basicConfig(level=logging.INFO)
23
  logger = logging.getLogger("chatterbox-engine")
@@ -300,6 +300,7 @@ class ConvertRequest(BaseModel):
300
  volume: int = Field(default=75, ge=1, le=100)
301
  speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
302
  pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
 
303
 
304
 
305
  @app.post("/GetEngineDetails")
@@ -316,6 +317,32 @@ async def get_engine_details(request: Request):
316
  "supports_voice_cloning": True,
317
  "builtin_voices": [],
318
  "supported_emotions": CANONICAL_EMOTIONS,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  "extra_properties": {
320
  "model": "ResembleAI/chatterbox",
321
  "max_characters": MAX_CHARS,
@@ -411,13 +438,25 @@ async def convert_text_to_speech(request: Request):
411
 
412
  dominant_emotion = req.emotion_set[0].lower(
413
  ) if req.emotion_set else "neutral"
414
- base_exaggeration = EMOTION_EXAGGERATION_MAP.get(dominant_emotion, 0.5)
415
  intensity_factor = req.intensity / 50.0
416
- exaggeration = min(1.0, max(0.0, base_exaggeration * intensity_factor))
417
 
418
- cfg_weight = EMOTION_CFG_MAP.get(dominant_emotion, 0.5)
419
 
420
- temperature = EMOTION_TEMPERATURE_MAP.get(dominant_emotion, 0.8)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
  emotion_speed = EMOTION_SPEED_MAP.get(dominant_emotion, 1.0)
423
  emotion_pitch = EMOTION_PITCH_MAP.get(dominant_emotion, 0.0)
@@ -425,11 +464,13 @@ async def convert_text_to_speech(request: Request):
425
  emotion_speed = 1.0 + (emotion_speed - 1.0) * intensity_factor
426
  emotion_pitch = emotion_pitch * intensity_factor
427
 
 
428
  logger.info(
429
  f"Generating with Chatterbox: emotion={dominant_emotion}, "
430
  f"exaggeration={exaggeration:.2f}, cfg={cfg_weight:.2f}, "
431
  f"temperature={temperature:.2f}, emotion_speed={emotion_speed:.3f}, "
432
- f"emotion_pitch={emotion_pitch:.2f}, text_len={len(text)}")
 
433
 
434
  wav = tts_model.generate(
435
  text,
 
17
  from fastapi import FastAPI, Request, HTTPException
18
  from fastapi.responses import Response, JSONResponse, HTMLResponse
19
  from pydantic import BaseModel, Field
20
+ from typing import Optional, Dict, Any
21
 
22
  logging.basicConfig(level=logging.INFO)
23
  logger = logging.getLogger("chatterbox-engine")
 
300
  volume: int = Field(default=75, ge=1, le=100)
301
  speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
302
  pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
303
+ engine_options: Optional[Dict[str, Any]] = None
304
 
305
 
306
  @app.post("/GetEngineDetails")
 
317
  "supports_voice_cloning": True,
318
  "builtin_voices": [],
319
  "supported_emotions": CANONICAL_EMOTIONS,
320
+ "engine_params": [
321
+ {
322
+ "short_name": "exaggeration",
323
+ "friendly_name": "Exaggeration",
324
+ "data_type": "float",
325
+ "min_value": 0.25,
326
+ "max_value": 2.0,
327
+ "default_value": 0.5,
328
+ },
329
+ {
330
+ "short_name": "cfg_weight",
331
+ "friendly_name": "CFG Weight",
332
+ "data_type": "float",
333
+ "min_value": 0.0,
334
+ "max_value": 1.0,
335
+ "default_value": 0.5,
336
+ },
337
+ {
338
+ "short_name": "temperature",
339
+ "friendly_name": "Temperature",
340
+ "data_type": "float",
341
+ "min_value": 0.05,
342
+ "max_value": 5.0,
343
+ "default_value": 0.8,
344
+ },
345
+ ],
346
  "extra_properties": {
347
  "model": "ResembleAI/chatterbox",
348
  "max_characters": MAX_CHARS,
 
438
 
439
  dominant_emotion = req.emotion_set[0].lower(
440
  ) if req.emotion_set else "neutral"
 
441
  intensity_factor = req.intensity / 50.0
 
442
 
443
+ opts = req.engine_options or {}
444
 
445
+ if "exaggeration" in opts:
446
+ exaggeration = float(max(0.25, min(2.0, opts["exaggeration"])))
447
+ else:
448
+ base_exaggeration = EMOTION_EXAGGERATION_MAP.get(dominant_emotion, 0.5)
449
+ exaggeration = min(1.0, max(0.0, base_exaggeration * intensity_factor))
450
+
451
+ if "cfg_weight" in opts:
452
+ cfg_weight = float(max(0.0, min(1.0, opts["cfg_weight"])))
453
+ else:
454
+ cfg_weight = EMOTION_CFG_MAP.get(dominant_emotion, 0.5)
455
+
456
+ if "temperature" in opts:
457
+ temperature = float(max(0.05, min(5.0, opts["temperature"])))
458
+ else:
459
+ temperature = EMOTION_TEMPERATURE_MAP.get(dominant_emotion, 0.8)
460
 
461
  emotion_speed = EMOTION_SPEED_MAP.get(dominant_emotion, 1.0)
462
  emotion_pitch = EMOTION_PITCH_MAP.get(dominant_emotion, 0.0)
 
464
  emotion_speed = 1.0 + (emotion_speed - 1.0) * intensity_factor
465
  emotion_pitch = emotion_pitch * intensity_factor
466
 
467
+ override_keys = [k for k in ("exaggeration", "cfg_weight", "temperature") if k in opts]
468
  logger.info(
469
  f"Generating with Chatterbox: emotion={dominant_emotion}, "
470
  f"exaggeration={exaggeration:.2f}, cfg={cfg_weight:.2f}, "
471
  f"temperature={temperature:.2f}, emotion_speed={emotion_speed:.3f}, "
472
+ f"emotion_pitch={emotion_pitch:.2f}, text_len={len(text)}"
473
+ + (f", overrides={override_keys}" if override_keys else ""))
474
 
475
  wav = tts_model.generate(
476
  text,