Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -27,21 +27,25 @@ Segment: Any = None
|
|
| 27 |
|
| 28 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
COMPUTE_TYPE = "float16" if device == "cuda" else "float32"
|
|
|
|
| 30 |
token = os.environ.get("HF_TOKEN")
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
pyannote_device = torch.device(device)
|
| 34 |
-
|
| 35 |
"pyannote/speaker-diarization-3.1",
|
| 36 |
use_auth_token=token
|
| 37 |
).to(pyannote_device)
|
| 38 |
-
print("Pyannote pipeline loaded
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
diarization_pipeline = None
|
| 44 |
-
global_diarizer = diarization_pipeline
|
| 45 |
model_name = "medium"
|
| 46 |
ALIGN_MODEL_MAP = {
|
| 47 |
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"}
|
|
@@ -197,6 +201,9 @@ def analyze_audio(audio_file: str,
|
|
| 197 |
temp_preproc = None
|
| 198 |
|
| 199 |
start_ml_time = time.time()
|
|
|
|
|
|
|
|
|
|
| 200 |
try:
|
| 201 |
print(f"Loading Whisper model '{model_name}' on {device}...")
|
| 202 |
model = whisperx.load_model(model_name, device, compute_type="float32")
|
|
@@ -250,16 +257,19 @@ def analyze_audio(audio_file: str,
|
|
| 250 |
warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
|
| 251 |
print("Cleaning up Whisper model memory...")
|
| 252 |
del model
|
|
|
|
| 253 |
del audio_loaded
|
|
|
|
| 254 |
if device == "cuda":
|
| 255 |
torch.cuda.empty_cache()
|
| 256 |
gc.collect()
|
| 257 |
print("Memory cleanup complete.")
|
| 258 |
diarize_output = None
|
| 259 |
-
|
|
|
|
| 260 |
print("Performing speaker diarization (Requires HF_TOKEN)...")
|
| 261 |
try:
|
| 262 |
-
diarize_output =
|
| 263 |
for segment, _, label in diarize_output.itertracks(yield_label=True):
|
| 264 |
print(f"start={segment.start:.1f}s stop={segment.end:.1f}s {label}")
|
| 265 |
except Exception as e:
|
|
@@ -267,7 +277,15 @@ def analyze_audio(audio_file: str,
|
|
| 267 |
diarize_output = None
|
| 268 |
else:
|
| 269 |
warn(results, "DIAR_SKIP", "HF_TOKEN not set or Diarization Pipeline failed to load globally. Skipping speaker diarization.")
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
try:
|
| 272 |
diarize_segments_for_assignment = []
|
| 273 |
if diarize_output is not None and hasattr(diarize_output, "itertracks"):
|
|
|
|
| 27 |
|
| 28 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
COMPUTE_TYPE = "float16" if device == "cuda" else "float32"
|
| 30 |
+
BATCH_SIZE = 16
|
| 31 |
token = os.environ.get("HF_TOKEN")
|
| 32 |
+
global_diarizer = None
|
| 33 |
+
def load_pyannote_pipeline():
    """Load the pyannote speaker-diarization pipeline on demand.

    Reads the module-level ``token`` (the ``HF_TOKEN`` environment value)
    and ``device`` ("cuda" or "cpu"). Every failure mode is reported with
    ``print`` and collapsed to ``None`` so the caller can simply test the
    result and skip diarization.

    Returns:
        The loaded pipeline moved onto ``device``, or ``None`` when no
        token is configured or the pipeline could not be loaded.
    """
    if not token:
        print("HF_TOKEN not set. Diarization is unavailable.")
        return None

    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=token,
        )
        # NOTE(review): pyannote.audio 3.x is reported to return None rather
        # than raise when the gated model cannot be downloaded (e.g. the
        # model's user conditions were not accepted for this token) --
        # confirm against the installed version. Checking here avoids the
        # misleading "'NoneType' object has no attribute 'to'" error that
        # chaining .to() directly onto the call would produce.
        if pipeline is None:
            print(
                "Pyannote pipeline could not be loaded (from_pretrained returned None). "
                "Check that HF_TOKEN has access to the gated model. Diarization will be skipped."
            )
            return None
        # Keep the value returned by .to(), matching the original chained call.
        pipeline = pipeline.to(torch.device(device))
    except Exception as e:
        # Deliberate best-effort boundary: diarization is optional, so any
        # load failure degrades to "no diarization" instead of crashing.
        print(f"Error loading pyannote pipeline dynamically: {type(e).__name__}: {e}. Diarization will be skipped.")
        return None

    print("Pyannote pipeline loaded dynamically.")
    return pipeline
|
|
|
|
|
|
|
| 49 |
model_name = "medium"
|
| 50 |
ALIGN_MODEL_MAP = {
|
| 51 |
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"}
|
|
|
|
| 201 |
temp_preproc = None
|
| 202 |
|
| 203 |
start_ml_time = time.time()
|
| 204 |
+
model = None
|
| 205 |
+
audio_loaded = None
|
| 206 |
+
diarization_pipeline = None
|
| 207 |
try:
|
| 208 |
print(f"Loading Whisper model '{model_name}' on {device}...")
|
| 209 |
model = whisperx.load_model(model_name, device, compute_type="float32")
|
|
|
|
| 257 |
warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
|
| 258 |
print("Cleaning up Whisper model memory...")
|
| 259 |
del model
|
| 260 |
+
model = None
|
| 261 |
del audio_loaded
|
| 262 |
+
audio_loaded = None
|
| 263 |
if device == "cuda":
|
| 264 |
torch.cuda.empty_cache()
|
| 265 |
gc.collect()
|
| 266 |
print("Memory cleanup complete.")
|
| 267 |
diarize_output = None
|
| 268 |
+
diarization_pipeline = load_pyannote_pipeline()
|
| 269 |
+
if diarization_pipeline is not None:
|
| 270 |
print("Performing speaker diarization (Requires HF_TOKEN)...")
|
| 271 |
try:
|
| 272 |
+
diarize_output = diarization_pipeline(audio_for_model)
|
| 273 |
for segment, _, label in diarize_output.itertracks(yield_label=True):
|
| 274 |
print(f"start={segment.start:.1f}s stop={segment.end:.1f}s {label}")
|
| 275 |
except Exception as e:
|
|
|
|
| 277 |
diarize_output = None
|
| 278 |
else:
|
| 279 |
warn(results, "DIAR_SKIP", "HF_TOKEN not set or Diarization Pipeline failed to load globally. Skipping speaker diarization.")
|
| 280 |
+
if diarization_pipeline is not None:
|
| 281 |
+
print("Cleaning up Pyannote model memory...")
|
| 282 |
+
del diarization_pipeline
|
| 283 |
+
diarization_pipeline = None
|
| 284 |
+
if device == "cuda":
|
| 285 |
+
torch.cuda.empty_cache()
|
| 286 |
+
gc.collect()
|
| 287 |
+
print("Pyannote cleanup complete.")
|
| 288 |
+
print("Assigning speakers to words...")
|
| 289 |
try:
|
| 290 |
diarize_segments_for_assignment = []
|
| 291 |
if diarize_output is not None and hasattr(diarize_output, "itertracks"):
|