CherithCutestory committed on
Commit
d66f3dd
·
1 Parent(s): 0c19c8e

Updated emotion handling

Browse files
Files changed (1) hide show
  1. app.py +85 -24
app.py CHANGED
@@ -26,17 +26,67 @@ BIT_DEPTH = 16
26
  CHANNELS = 1
27
  MAX_SECONDS = 60
28
 
 
 
 
 
 
 
29
  EMOTION_PRESETS = {
30
- "neutral": {"alpha": 0.3, "beta": 0.7, "embedding_scale": 1, "diffusion_steps": 5},
31
- "happy": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
32
- "sad": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
33
- "angry": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
34
- "fear": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
35
- "excited": {"alpha": 0.05, "beta": 0.95, "embedding_scale": 2.5, "diffusion_steps": 10},
36
- "calm": {"alpha": 0.5, "beta": 0.5, "embedding_scale": 1, "diffusion_steps": 5},
37
- "surprise": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
38
- "surprised": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
39
- "whisper": {"alpha": 0.5, "beta": 0.3, "embedding_scale": 0.5, "diffusion_steps": 10},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }
41
 
42
  tts_engine = None
@@ -110,10 +160,10 @@ class ConvertRequest(BaseModel):
110
  voice_to_clone_sample: Optional[str] = None
111
  random_seed: Optional[int] = None
112
  emotion_set: list[str] = Field(default_factory=lambda: ["neutral"])
113
- intensity: int = 50
114
- volume: int = 75
115
- speed_adjust: float = 0.0
116
- pitch_adjust: float = 0.0
117
 
118
 
119
  @app.post("/GetEngineDetails")
@@ -131,7 +181,7 @@ async def get_engine_details(request: Request):
131
  "max_seconds_per_conversion": MAX_SECONDS,
132
  "supports_voice_cloning": True,
133
  "builtin_voices": [],
134
- "supported_emotions": ["neutral", "happy", "sad", "angry", "fear", "excited", "calm", "surprise", "whisper"],
135
  "extra_properties": {
136
  "architecture": "Style diffusion + adversarial training with large SLMs",
137
  "model": "LibriTTS multi-speaker",
@@ -181,11 +231,22 @@ async def convert_text_to_speech(request: Request):
181
 
182
  preset = EMOTION_PRESETS[emotion].copy()
183
 
 
 
184
  if req.intensity != 50:
185
- scale_factor = req.intensity / 50.0
186
- preset["embedding_scale"] = preset["embedding_scale"] * scale_factor
187
  preset["embedding_scale"] = max(0.1, min(5.0, preset["embedding_scale"]))
188
 
 
 
 
 
 
 
 
 
 
 
189
  ref_wav_path = None
190
  if req.voice_to_clone_sample:
191
  try:
@@ -248,14 +309,14 @@ async def convert_text_to_speech(request: Request):
248
  if max_val > 0:
249
  audio_np = audio_np / max_val
250
 
251
- if req.speed_adjust != 0.0:
252
- speed_factor = 1.0 + (req.speed_adjust / 100.0)
253
- speed_factor = max(0.5, min(2.0, speed_factor))
254
- audio_np = pyrb.time_stretch(audio_np, SAMPLE_RATE, speed_factor)
255
 
256
- if req.pitch_adjust != 0.0:
257
- semitones = req.pitch_adjust * 0.24
258
- audio_np = pyrb.pitch_shift(audio_np, SAMPLE_RATE, semitones)
259
 
260
  vol_factor = req.volume / 75.0
261
  audio_np = audio_np * vol_factor
 
26
  CHANNELS = 1
27
  MAX_SECONDS = 60
28
 
29
+ CANONICAL_EMOTIONS = [
30
+ "neutral", "happy", "sad", "angry", "fear",
31
+ "surprise", "disgust", "excited", "calm", "confused",
32
+ "anxious", "hopeful", "melancholy", "fearful",
33
+ ]
34
+
35
  EMOTION_PRESETS = {
36
+ "neutral": {"alpha": 0.3, "beta": 0.7, "embedding_scale": 1, "diffusion_steps": 5},
37
+ "happy": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
38
+ "sad": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
39
+ "angry": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
40
+ "fear": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
41
+ "excited": {"alpha": 0.05, "beta": 0.95, "embedding_scale": 2.5, "diffusion_steps": 10},
42
+ "calm": {"alpha": 0.5, "beta": 0.5, "embedding_scale": 1, "diffusion_steps": 5},
43
+ "surprise": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
44
+ "surprised": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
45
+ "whisper": {"alpha": 0.5, "beta": 0.3, "embedding_scale": 0.5, "diffusion_steps": 10},
46
+ "confused": {"alpha": 0.2, "beta": 0.8, "embedding_scale": 1.5, "diffusion_steps": 8},
47
+ "anxious": {"alpha": 0.15, "beta": 0.85, "embedding_scale": 1.8, "diffusion_steps": 10},
48
+ "hopeful": {"alpha": 0.2, "beta": 0.8, "embedding_scale": 1.8, "diffusion_steps": 8},
49
+ "melancholy":{"alpha": 0.15, "beta": 0.85, "embedding_scale": 1.8, "diffusion_steps": 10},
50
+ "fearful": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
51
+ "disgust": {"alpha": 0.1, "beta": 0.9, "embedding_scale": 2, "diffusion_steps": 10},
52
+ }
53
+
54
+ EMOTION_SPEED_MAP = {
55
+ "neutral": 1.0,
56
+ "happy": 1.04,
57
+ "sad": 0.94,
58
+ "angry": 1.06,
59
+ "fear": 1.05,
60
+ "excited": 1.08,
61
+ "calm": 0.94,
62
+ "surprise": 1.05,
63
+ "surprised": 1.05,
64
+ "whisper": 0.92,
65
+ "confused": 0.97,
66
+ "anxious": 1.04,
67
+ "hopeful": 1.02,
68
+ "melancholy": 0.93,
69
+ "fearful": 1.05,
70
+ "disgust": 0.98,
71
+ }
72
+
73
+ EMOTION_PITCH_MAP = {
74
+ "neutral": 0.0,
75
+ "happy": 0.5,
76
+ "sad": -0.4,
77
+ "angry": -0.3,
78
+ "fear": 0.3,
79
+ "excited": 0.7,
80
+ "calm": 0.0,
81
+ "surprise": 0.6,
82
+ "surprised": 0.6,
83
+ "whisper": -0.2,
84
+ "confused": 0.2,
85
+ "anxious": 0.3,
86
+ "hopeful": 0.3,
87
+ "melancholy":-0.3,
88
+ "fearful": 0.3,
89
+ "disgust": -0.2,
90
  }
91
 
92
  tts_engine = None
 
160
  voice_to_clone_sample: Optional[str] = None
161
  random_seed: Optional[int] = None
162
  emotion_set: list[str] = Field(default_factory=lambda: ["neutral"])
163
+ intensity: int = Field(default=50, ge=1, le=100)
164
+ volume: int = Field(default=75, ge=1, le=100)
165
+ speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
166
+ pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
167
 
168
 
169
  @app.post("/GetEngineDetails")
 
181
  "max_seconds_per_conversion": MAX_SECONDS,
182
  "supports_voice_cloning": True,
183
  "builtin_voices": [],
184
+ "supported_emotions": CANONICAL_EMOTIONS,
185
  "extra_properties": {
186
  "architecture": "Style diffusion + adversarial training with large SLMs",
187
  "model": "LibriTTS multi-speaker",
 
231
 
232
  preset = EMOTION_PRESETS[emotion].copy()
233
 
234
+ intensity_scale = req.intensity / 50.0
235
+
236
  if req.intensity != 50:
237
+ preset["embedding_scale"] = preset["embedding_scale"] * intensity_scale
 
238
  preset["embedding_scale"] = max(0.1, min(5.0, preset["embedding_scale"]))
239
 
240
+ base_emotion_speed = EMOTION_SPEED_MAP.get(emotion, 1.0)
241
+ emotion_speed = 1.0 + (base_emotion_speed - 1.0) * intensity_scale
242
+ base_emotion_pitch = EMOTION_PITCH_MAP.get(emotion, 0.0)
243
+ emotion_pitch = base_emotion_pitch * intensity_scale
244
+
245
+ logger.info(
246
+ f"StyleTTS2 emotion={emotion}, intensity={req.intensity}, "
247
+ f"preset={preset}, emotion_speed={emotion_speed:.3f}, emotion_pitch={emotion_pitch:.2f}"
248
+ )
249
+
250
  ref_wav_path = None
251
  if req.voice_to_clone_sample:
252
  try:
 
309
  if max_val > 0:
310
  audio_np = audio_np / max_val
311
 
312
+ combined_speed = emotion_speed * (1.0 + (req.speed_adjust / 100.0))
313
+ combined_speed = max(0.5, min(2.0, combined_speed))
314
+ if abs(combined_speed - 1.0) > 0.01:
315
+ audio_np = pyrb.time_stretch(audio_np, SAMPLE_RATE, combined_speed)
316
 
317
+ combined_pitch = emotion_pitch + (req.pitch_adjust * 0.24)
318
+ if abs(combined_pitch) > 0.01:
319
+ audio_np = pyrb.pitch_shift(audio_np, SAMPLE_RATE, combined_pitch)
320
 
321
  vol_factor = req.volume / 75.0
322
  audio_np = audio_np * vol_factor