yukee1992 committed on
Commit
125c3d1
·
verified ·
1 Parent(s): 48cbdbe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -252
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - Enhanced with multiple Parler-TTS loading strategies
2
  from fastapi import FastAPI, HTTPException
3
  from fastapi.responses import JSONResponse
4
  from fastapi.middleware.cors import CORSMiddleware
@@ -11,7 +11,7 @@ import os
11
  import torch
12
  import numpy as np
13
  import soundfile as sf
14
- import importlib
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
@@ -31,10 +31,7 @@ app.add_middleware(
31
  OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
32
 
33
  # Global variables
34
- parler_model = None
35
- parler_processor = None
36
- bark_model = None
37
- bark_processor = None
38
  model_loaded = False
39
  model_type = "none"
40
 
@@ -44,278 +41,177 @@ class VoiceoverRequest(BaseModel):
44
  voiceover_scenes: List[str]
45
  upload_to_oci: Optional[bool] = False
46
 
47
- class ModelRequest(BaseModel):
48
- action: str # "reload", "status", "switch"
49
-
50
  @app.on_event("startup")
51
  async def startup_event():
52
- """Initialize the application with multiple loading strategies"""
53
- global parler_model, parler_processor, bark_model, bark_processor, model_loaded, model_type
54
 
55
  logger.info("=== TTS API Starting ===")
56
- logger.info("Attempting to load Parler-TTS with enhanced strategies...")
57
-
58
- # Try multiple loading strategies for Parler-TTS
59
- parler_loaded = await load_parler_tts_enhanced()
60
 
61
- if parler_loaded:
 
62
  model_loaded = True
63
- model_type = "parler-tts"
64
- logger.info("✅ Parler-TTS loaded successfully!")
65
- else:
66
- # Fallback to Bark
67
- logger.info("Parler-TTS failed, loading Bark as fallback...")
68
- bark_loaded = await load_bark_model()
69
-
70
- if bark_loaded:
71
- model_loaded = True
72
- model_type = "bark"
73
- logger.info("✅ Bark model loaded as fallback!")
74
- else:
75
- logger.error("❌ All models failed to load")
76
- model_loaded = False
77
-
78
- async def load_parler_tts_enhanced():
79
- """Enhanced Parler-TTS loading with multiple strategies"""
80
- strategies = [
81
- try_strategy_1, # Direct import with trust_remote_code
82
- try_strategy_2, # Force model download
83
- try_strategy_3, # Manual configuration
84
- try_strategy_4 # Alternative import approach
85
- ]
86
 
87
- for i, strategy in enumerate(strategies, 1):
88
- logger.info(f"Trying Parler-TTS loading strategy {i}...")
89
- success = strategy()
90
- if success:
91
- return True
92
- logger.warning(f"Strategy {i} failed")
93
 
94
- return False
95
-
96
- def try_strategy_1():
97
- """Strategy 1: Direct import with trust_remote_code"""
98
- try:
99
- from transformers import AutoProcessor, AutoModel
100
-
101
- global parler_processor, parler_model
102
-
103
- # Clear cache and force download
104
- import transformers
105
- transformers.utils.move_cache()
106
-
107
- parler_processor = AutoProcessor.from_pretrained(
108
- "parler-tts/parler-tts-mini-v1",
109
- trust_remote_code=True,
110
- force_download=True,
111
- resume_download=False,
112
- local_files_only=False
113
- )
114
-
115
- parler_model = AutoModel.from_pretrained(
116
- "parler-tts/parler-tts-mini-v1",
117
- trust_remote_code=True,
118
- force_download=True,
119
- resume_download=False,
120
- local_files_only=False
121
- )
122
-
123
- # Test the model
124
- test_inputs = parler_processor(
125
- text="Test",
126
- description="A test voice",
127
- return_tensors="pt"
128
- )
129
-
130
- with torch.no_grad():
131
- test_output = parler_model.generate(**test_inputs)
132
-
133
- logger.info("✅ Strategy 1 successful!")
134
- return True
135
-
136
- except Exception as e:
137
- logger.warning(f"Strategy 1 failed: {e}")
138
- return False
139
 
140
- def try_strategy_2():
141
- """Strategy 2: Manual model configuration"""
142
  try:
143
- from transformers import AutoConfig
144
- import torch
145
-
146
- # First get the config to understand the model
147
- config = AutoConfig.from_pretrained(
148
- "parler-tts/parler-tts-mini-v1",
149
- trust_remote_code=True
150
- )
151
 
152
- logger.info(f"Model config: {config.model_type}")
153
-
154
- # Now try to load with explicit architecture
155
- if hasattr(config, 'architectures') and config.architectures:
156
- model_class = getattr(
157
- importlib.import_module('transformers'),
158
- config.architectures[0]
159
- )
160
 
161
- global parler_processor, parler_model
 
162
 
163
- parler_processor = AutoProcessor.from_pretrained(
164
- "parler-tts/parler-tts-mini-v1",
165
- trust_remote_code=True
166
- )
167
 
168
- parler_model = model_class.from_pretrained(
169
- "parler-tts/parler-tts-mini-v1",
170
- trust_remote_code=True,
171
- config=config
172
- )
173
 
174
- logger.info("✅ Strategy 2 successful!")
 
 
 
 
 
175
  return True
176
 
 
 
 
177
  except Exception as e:
178
- logger.warning(f"Strategy 2 failed: {e}")
179
- return False
180
 
181
  return False
182
 
183
- def try_strategy_3():
184
- """Strategy 3: Use model-specific classes"""
185
- try:
186
- # Try to import ParlerTTS specific classes
187
- try:
188
- from transformers import ParlerTTSForConditionalGeneration, ParlerTTSProcessor
189
- model_class = ParlerTTSForConditionalGeneration
190
- processor_class = ParlerTTSProcessor
191
- except ImportError:
192
- # If specific classes don't exist, try to create them dynamically
193
- from transformers import AutoModel, AutoProcessor
194
- model_class = AutoModel
195
- processor_class = AutoProcessor
196
-
197
- global parler_processor, parler_model
198
-
199
- parler_processor = processor_class.from_pretrained(
200
- "parler-tts/parler-tts-mini-v1",
201
- trust_remote_code=True
202
- )
203
-
204
- parler_model = model_class.from_pretrained(
205
- "parler-tts/parler-tts-mini-v1",
206
- trust_remote_code=True
207
- )
208
-
209
- logger.info("✅ Strategy 3 successful!")
210
- return True
211
-
212
- except Exception as e:
213
- logger.warning(f"Strategy 3 failed: {e}")
214
- return False
215
-
216
- def try_strategy_4():
217
- """Strategy 4: Alternative approach with different parameters"""
218
- try:
219
- from transformers import AutoProcessor, AutoModel
220
-
221
- global parler_processor, parler_model
222
-
223
- # Try with different parameters
224
- parler_processor = AutoProcessor.from_pretrained(
225
- "parler-tts/parler-tts-mini-v1",
226
- trust_remote_code=True,
227
- use_fast=True,
228
- revision="main"
229
- )
230
-
231
- parler_model = AutoModel.from_pretrained(
232
- "parler-tts/parler-tts-mini-v1",
233
- trust_remote_code=True,
234
- low_cpu_mem_usage=True,
235
- torch_dtype=torch.float32,
236
- revision="main"
237
- )
238
-
239
- logger.info("✅ Strategy 4 successful!")
240
- return True
241
-
242
- except Exception as e:
243
- logger.warning(f"Strategy 4 failed: {e}")
244
- return False
245
-
246
  async def load_bark_model():
247
  """Load Bark model as fallback"""
248
  try:
249
  from transformers import AutoProcessor, AutoModel
250
 
251
- global bark_processor, bark_model
252
-
253
- bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
254
- bark_model = AutoModel.from_pretrained("suno/bark-small")
255
 
 
 
256
  return True
257
 
258
  except Exception as e:
259
  logger.error(f"Bark model loading failed: {e}")
260
  return False
261
 
262
- def generate_with_parler(text, description="A male speaker with a low-pitched voice"):
263
- """Generate voiceover using Parler-TTS"""
264
  try:
265
- inputs = parler_processor(
266
- text=text,
267
- description=description,
268
- return_tensors="pt"
269
- )
270
-
271
- with torch.no_grad():
272
- speech = parler_model.generate(**inputs)
273
-
274
- # Save audio
275
- speech = speech.cpu().numpy().squeeze()
276
- temp_dir = tempfile.gettempdir()
277
- temp_file = os.path.join(temp_dir, "parler_generated.wav")
278
-
279
- sample_rate = getattr(parler_model.config, "sampling_rate", 16000)
280
- sf.write(temp_file, speech, sample_rate)
281
-
282
- return temp_file, None
283
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  except Exception as e:
285
  return None, str(e)
 
 
286
 
287
- def generate_with_bark(text):
288
- """Generate voiceover using Bark"""
289
  try:
290
- inputs = bark_processor(text=[text], return_tensors="pt")
291
-
292
- with torch.no_grad():
293
- speech_values = bark_model.generate(**inputs, do_sample=True)
294
-
295
- # Convert and save
296
- audio_array = speech_values.cpu().numpy().squeeze()
297
- temp_dir = tempfile.gettempdir()
298
- temp_file = os.path.join(temp_dir, "bark_generated.wav")
299
-
300
- sf.write(temp_file, audio_array, 24000)
301
-
302
- return temp_file, None
303
-
 
 
 
 
 
304
  except Exception as e:
305
  return None, str(e)
306
 
307
  @app.get("/")
308
  async def root():
309
  return {
310
- "message": "TTS API with Enhanced Parler-TTS Loading",
311
  "model_loaded": model_loaded,
312
  "model_type": model_type,
313
- "strategies_tested": "4 enhanced strategies",
314
  "endpoints": {
315
  "health": "/health",
316
  "model_status": "/api/model-status",
317
- "generate_voiceovers": "/api/generate-voiceovers",
318
- "reload_model": "/api/reload-model"
319
  }
320
  }
321
 
@@ -325,8 +221,7 @@ async def health():
325
  "status": "healthy" if model_loaded else "degraded",
326
  "model_loaded": model_loaded,
327
  "model_type": model_type,
328
- "parler_loaded": parler_model is not None,
329
- "bark_loaded": bark_model is not None
330
  }
331
 
332
  @app.get("/api/model-status")
@@ -335,21 +230,11 @@ async def model_status():
335
  return {
336
  "model_loaded": model_loaded,
337
  "model_type": model_type,
338
- "parler_model_available": parler_model is not None,
339
- "bark_model_available": bark_model is not None,
340
- "loading_strategies": "4 enhanced strategies implemented"
341
  }
342
 
343
- @app.post("/api/reload-model")
344
- async def reload_model(request: ModelRequest):
345
- """Reload model with different strategy"""
346
- if request.action == "reload":
347
- # Re-initialize
348
- await startup_event()
349
- return {"status": "reloaded", "model_type": model_type}
350
- else:
351
- return {"status": "unknown_action"}
352
-
353
  @app.post("/api/generate-voiceovers")
354
  async def generate_voiceovers_endpoint(request: VoiceoverRequest):
355
  """Main API endpoint"""
@@ -364,13 +249,7 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
364
  filename = f"voiceover_{i:02d}.wav"
365
 
366
  logger.info(f"Generating voiceover {i} with {model_type}...")
367
-
368
- if model_type == "parler-tts" and parler_model is not None:
369
- temp_file, error = generate_with_parler(scene_text)
370
- elif model_type == "bark" and bark_model is not None:
371
- temp_file, error = generate_with_bark(scene_text)
372
- else:
373
- error = "No valid model available"
374
 
375
  if error:
376
  results.append({
@@ -408,7 +287,8 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
408
  "filename": filename,
409
  "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
410
  "uploaded_to_oci": bool(upload_result),
411
- "model": model_type
 
412
  })
413
 
414
  except Exception as e:
@@ -426,6 +306,7 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
426
  "successful": len([r for r in results if r['status'] == 'success']),
427
  "failed": len([r for r in results if r['status'] != 'success']),
428
  "model_type": model_type,
 
429
  "results": results
430
  }
431
 
 
1
+ # app.py - Using Coqui XTTS instead of Parler-TTS
2
  from fastapi import FastAPI, HTTPException
3
  from fastapi.responses import JSONResponse
4
  from fastapi.middleware.cors import CORSMiddleware
 
11
  import torch
12
  import numpy as np
13
  import soundfile as sf
14
+ import io
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
 
31
  OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
32
 
33
  # Global variables
34
+ tts_model = None
 
 
 
35
  model_loaded = False
36
  model_type = "none"
37
 
 
41
  voiceover_scenes: List[str]
42
  upload_to_oci: Optional[bool] = False
43
 
 
 
 
44
@app.on_event("startup")
async def startup_event():
    """Pick and load a TTS backend when the application starts.

    Backends are attempted in priority order: Coqui XTTS first, Bark second.
    The first loader that reports success sets the module-level
    ``model_loaded`` / ``model_type`` flags and ends startup. If no loader
    succeeds, an error is logged and ``model_loaded`` is set to ``False``.
    """
    global tts_model, model_loaded, model_type

    logger.info("=== TTS API Starting ===")

    # (loader coroutine, value stored in model_type, success log line),
    # listed from most to least preferred.
    backends = (
        (load_coqui_xtts, "coqui-xtts", "✅ Coqui XTTS loaded successfully!"),
        (load_bark_model, "bark", "✅ Bark model loaded as fallback!"),
    )

    for load_backend, backend_name, success_message in backends:
        if not await load_backend():
            continue
        model_loaded = True
        model_type = backend_name
        logger.info(success_message)
        return

    logger.error("❌ All models failed to load")
    model_loaded = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
async def load_coqui_xtts():
    """Try to load Coqui XTTS v2 and publish it as the global ``tts_model``.

    Two strategies are attempted in order; the first one that works wins:

    1. Hugging Face ``transformers`` (``AutoProcessor`` / ``AutoModel``) on
       ``coqui/XTTS-v2``. Stored as
       ``{"processor": ..., "model": ..., "type": "transformers"}``.
    2. The Coqui ``TTS`` package on
       ``tts_models/multilingual/multi-dataset/xtts_v2``. Stored as
       ``{"tts": ..., "type": "coqui"}``.

    Returns:
        bool: ``True`` if a strategy succeeded and ``tts_model`` was set,
        ``False`` if both failed. Failures are logged as warnings and are
        never raised to the caller.
    """
    # Declared exactly once, before any assignment. Repeating ``global`` after
    # ``tts_model`` has already been assigned in this function is a
    # SyntaxError, which would prevent the whole module from importing.
    global tts_model

    logger.info("Loading Coqui XTTS model...")

    # Method 1: Try using transformers
    try:
        from transformers import AutoProcessor, AutoModel

        processor = AutoProcessor.from_pretrained("coqui/XTTS-v2")
        model = AutoModel.from_pretrained("coqui/XTTS-v2")
    except Exception as e:
        # Best-effort: fall through to the next strategy.
        logger.warning(f"Transformers XTTS failed: {e}")
    else:
        tts_model = {"processor": processor, "model": model, "type": "transformers"}
        return True

    # Method 2: Try using TTS package
    try:
        from TTS.api import TTS

        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    except Exception as e:
        logger.warning(f"Coqui TTS package failed: {e}")
    else:
        tts_model = {"tts": tts, "type": "coqui"}
        return True

    return False
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
async def load_bark_model():
    """Load the ``suno/bark-small`` checkpoint as the fallback TTS backend.

    On success the processor and model are stored in the global
    ``tts_model`` dict under ``"type": "bark"``.

    Returns:
        bool: ``True`` when both processor and model loaded, ``False`` when
        anything failed (the failure is logged, not raised).
    """
    global tts_model

    try:
        from transformers import AutoModel, AutoProcessor

        bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
        bark_model = AutoModel.from_pretrained("suno/bark-small")
        tts_model = {
            "processor": bark_processor,
            "model": bark_model,
            "type": "bark",
        }
    except Exception as load_error:
        logger.error(f"Bark model loading failed: {load_error}")
        return False

    return True
118
 
119
def generate_voiceover(text, speaker_wav=None):
    """Synthesize ``text`` to a WAV file with whichever backend is loaded.

    The backend is selected by ``tts_model["type"]``:

    * ``"coqui"``: calls ``tts_to_file`` on the stored Coqui ``TTS`` object.
    * ``"transformers"`` / ``"bark"``: runs the stored processor and
      ``model.generate`` (Bark with ``do_sample=True``) and writes the
      resulting array with ``soundfile`` at 24000 Hz.

    Args:
        text: Text to speak.
        speaker_wav: Optional reference-voice path. Only forwarded to the
            ``"coqui"`` backend; the other backends ignore it.

    Returns:
        tuple: ``(path, None)`` on success, where ``path`` is a newly created
        WAV file in the system temp directory, or ``(None, message)`` on
        failure. Backend exceptions are converted to ``message`` instead of
        being raised.

    Every call writes to its own uniquely named file, so overlapping calls
    (for example several workers sharing one temp directory) cannot overwrite
    each other's audio. The caller owns the returned file and should delete
    it once it is no longer needed.
    """
    if tts_model is None:
        return None, "No model loaded"

    # File-name prefix per backend; also serves as the set of known backends.
    prefixes = {
        "coqui": "coqui_generated_",
        "transformers": "xtts_generated_",
        "bark": "bark_generated_",
    }

    output_path = None
    try:
        backend = tts_model["type"]
        prefix = prefixes.get(backend)
        if prefix is None:
            return None, "Unknown model type"

        # mkstemp guarantees a fresh name; the descriptor is closed right away
        # because the backends reopen the file by path.
        fd, output_path = tempfile.mkstemp(prefix=prefix, suffix=".wav")
        os.close(fd)

        if backend == "coqui":
            # NOTE(review): XTTS is a voice-cloning model; confirm the Coqui
            # backend accepts speaker_wav=None, which is what callers pass
            # by default.
            tts_model["tts"].tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                file_path=output_path,
            )
        else:
            processor = tts_model["processor"]
            model = tts_model["model"]

            inputs = processor(text=[text], return_tensors="pt")
            # Only Bark is asked to sample; XTTS uses generate() defaults.
            generate_kwargs = {"do_sample": True} if backend == "bark" else {}
            with torch.no_grad():
                output = model.generate(**inputs, **generate_kwargs)

            audio_array = output.cpu().numpy().squeeze()
            sf.write(output_path, audio_array, 24000)

        return output_path, None

    except Exception as e:
        # Do not leave an empty or partial file behind on failure.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass  # best-effort cleanup; the original error is what matters
        return None, str(e)
178
 
179
def upload_to_oci(file_path, filename, project_id):
    """Send a generated file to the OCI upload service.

    The file is posted as multipart form data to
    ``{OCI_UPLOAD_API_URL}/api/upload`` along with ``project_id`` and a fixed
    ``subfolder`` of ``voiceover``, using a 30 second timeout.

    Returns:
        tuple: ``(parsed_json, None)`` when the service answers HTTP 200,
        otherwise ``(None, message)``. Any exception (I/O, network, JSON
        decoding) is returned as its string rather than raised.
    """
    try:
        form_fields = {
            'project_id': project_id,
            'subfolder': 'voiceover',
        }
        endpoint = f"{OCI_UPLOAD_API_URL}/api/upload"

        # The handle must stay open for the duration of the request.
        with open(file_path, 'rb') as audio:
            reply = requests.post(
                endpoint,
                files={'file': (filename, audio)},
                data=form_fields,
                timeout=30,
            )

        if reply.status_code != 200:
            return None, f"Upload failed: {reply.status_code}"
        return reply.json(), None

    except Exception as upload_error:
        return None, str(upload_error)
203
 
204
@app.get("/")
async def root():
    """Describe the service: current model state and the available routes."""
    routes = {
        "health": "/health",
        "model_status": "/api/model-status",
        "generate_voiceovers": "/api/generate-voiceovers",
    }
    summary = {
        "message": "TTS API with High-Quality Voice Generation",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "supported_models": ["coqui-xtts", "bark"],
        "endpoints": routes,
    }
    return summary
217
 
 
221
  "status": "healthy" if model_loaded else "degraded",
222
  "model_loaded": model_loaded,
223
  "model_type": model_type,
224
+ "quality": "high" if model_type == "coqui-xtts" else "good"
 
225
  }
226
 
227
  @app.get("/api/model-status")
 
230
  return {
231
  "model_loaded": model_loaded,
232
  "model_type": model_type,
233
+ "model_quality": "high" if model_type == "coqui-xtts" else "good",
234
+ "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
235
+ "message": "Using Coqui XTTS for high-quality voice generation"
236
  }
237
 
 
 
 
 
 
 
 
 
 
 
238
  @app.post("/api/generate-voiceovers")
239
  async def generate_voiceovers_endpoint(request: VoiceoverRequest):
240
  """Main API endpoint"""
 
249
  filename = f"voiceover_{i:02d}.wav"
250
 
251
  logger.info(f"Generating voiceover {i} with {model_type}...")
252
+ temp_file, error = generate_voiceover(scene_text)
 
 
 
 
 
 
253
 
254
  if error:
255
  results.append({
 
287
  "filename": filename,
288
  "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
289
  "uploaded_to_oci": bool(upload_result),
290
+ "model": model_type,
291
+ "quality": "high" if model_type == "coqui-xtts" else "good"
292
  })
293
 
294
  except Exception as e:
 
306
  "successful": len([r for r in results if r['status'] == 'success']),
307
  "failed": len([r for r in results if r['status'] != 'success']),
308
  "model_type": model_type,
309
+ "voice_quality": "high" if model_type == "coqui-xtts" else "good",
310
  "results": results
311
  }
312