Spaces:
Running
Running
Luis J Camargo committed on
Commit ·
90f1441
1
Parent(s): 4e7deef
refactor: Remove unused imports and commented code, add detailed logging for audio processing, and update Gradio launch parameters.
Browse files
app.py
CHANGED
|
@@ -1,13 +1,9 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
import os
|
| 3 |
import gradio as gr
|
| 4 |
import torch
|
| 5 |
import numpy as np
|
| 6 |
from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
|
| 7 |
from transformers.models.whisper.modeling_whisper import WhisperEncoder
|
| 8 |
import torch.nn as nn
|
| 9 |
-
from safetensors.torch import load_file
|
| 10 |
-
from huggingface_hub import hf_hub_download
|
| 11 |
|
| 12 |
# === CUSTOM MODEL CLASSES ===
|
| 13 |
class WhisperEncoderOnlyConfig(WhisperConfig):
|
|
@@ -79,13 +75,7 @@ MODEL_REPO = "tachiwin/language_classification_enconly_model_2"
|
|
| 79 |
|
| 80 |
print("Loading model on CPU...")
|
| 81 |
processor = WhisperProcessor.from_pretrained(MODEL_REPO)
|
| 82 |
-
#config = WhisperEncoderOnlyConfig.from_pretrained(MODEL_REPO)
|
| 83 |
model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
|
| 84 |
-
|
| 85 |
-
# Load weights from safetensors
|
| 86 |
-
#weights_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors")
|
| 87 |
-
#state_dict = load_file(weights_path)
|
| 88 |
-
#model.load_state_dict(state_dict)
|
| 89 |
model.eval()
|
| 90 |
|
| 91 |
print("Model loaded successfully!")
|
|
@@ -96,8 +86,12 @@ def predict_language(audio):
|
|
| 96 |
return "⚠️ No audio provided", "", ""
|
| 97 |
|
| 98 |
sample_rate, audio_array = audio
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Normalization
|
|
|
|
| 101 |
if audio_array.dtype == np.int16:
|
| 102 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 103 |
elif audio_array.dtype == np.int32:
|
|
@@ -105,10 +99,12 @@ def predict_language(audio):
|
|
| 105 |
|
| 106 |
# Resampling
|
| 107 |
if sample_rate != 16000:
|
|
|
|
| 108 |
import librosa
|
| 109 |
audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
|
| 110 |
|
| 111 |
# Preprocessing
|
|
|
|
| 112 |
inputs = processor(
|
| 113 |
audio_array,
|
| 114 |
sampling_rate=16000,
|
|
@@ -116,10 +112,12 @@ def predict_language(audio):
|
|
| 116 |
)
|
| 117 |
|
| 118 |
# Inference
|
|
|
|
| 119 |
with torch.no_grad():
|
| 120 |
outputs = model(input_features=inputs.input_features)
|
| 121 |
|
| 122 |
# Post-processing
|
|
|
|
| 123 |
fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
|
| 124 |
super_probs = torch.softmax(outputs["super_logits"], dim=-1)
|
| 125 |
code_probs = torch.softmax(outputs["code_logits"], dim=-1)
|
|
@@ -132,6 +130,9 @@ def predict_language(audio):
|
|
| 132 |
super_conf = super_probs[0, super_idx].item()
|
| 133 |
code_conf = code_probs[0, code_idx].item()
|
| 134 |
|
|
|
|
|
|
|
|
|
|
| 135 |
# Formatting results
|
| 136 |
return (
|
| 137 |
{f"{fam_idx}": fam_conf},
|
|
@@ -196,5 +197,9 @@ with gr.Blocks() as demo:
|
|
| 196 |
)
|
| 197 |
|
| 198 |
if __name__ == "__main__":
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
|
| 5 |
from transformers.models.whisper.modeling_whisper import WhisperEncoder
|
| 6 |
import torch.nn as nn
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# === CUSTOM MODEL CLASSES ===
|
| 9 |
class WhisperEncoderOnlyConfig(WhisperConfig):
|
|
|
|
| 75 |
|
| 76 |
print("Loading model on CPU...")
|
| 77 |
processor = WhisperProcessor.from_pretrained(MODEL_REPO)
|
|
|
|
| 78 |
model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
model.eval()
|
| 80 |
|
| 81 |
print("Model loaded successfully!")
|
|
|
|
| 86 |
return "⚠️ No audio provided", "", ""
|
| 87 |
|
| 88 |
sample_rate, audio_array = audio
|
| 89 |
+
audio_len_sec = len(audio_array) / sample_rate
|
| 90 |
+
print(f"\n--- [LOG] New Request ---")
|
| 91 |
+
print(f"[LOG] Audio length: {audio_len_sec:.2f}s, SR: {sample_rate}")
|
| 92 |
|
| 93 |
# Normalization
|
| 94 |
+
print("[LOG] Step 1: Normalizing audio...")
|
| 95 |
if audio_array.dtype == np.int16:
|
| 96 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 97 |
elif audio_array.dtype == np.int32:
|
|
|
|
| 99 |
|
| 100 |
# Resampling
|
| 101 |
if sample_rate != 16000:
|
| 102 |
+
print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
|
| 103 |
import librosa
|
| 104 |
audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
|
| 105 |
|
| 106 |
# Preprocessing
|
| 107 |
+
print("[LOG] Step 3: Extracting features...")
|
| 108 |
inputs = processor(
|
| 109 |
audio_array,
|
| 110 |
sampling_rate=16000,
|
|
|
|
| 112 |
)
|
| 113 |
|
| 114 |
# Inference
|
| 115 |
+
print("[LOG] Step 4: Running model inference (CPU intensive)...")
|
| 116 |
with torch.no_grad():
|
| 117 |
outputs = model(input_features=inputs.input_features)
|
| 118 |
|
| 119 |
# Post-processing
|
| 120 |
+
print("[LOG] Step 5: Post-processing results...")
|
| 121 |
fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
|
| 122 |
super_probs = torch.softmax(outputs["super_logits"], dim=-1)
|
| 123 |
code_probs = torch.softmax(outputs["code_logits"], dim=-1)
|
|
|
|
| 130 |
super_conf = super_probs[0, super_idx].item()
|
| 131 |
code_conf = code_probs[0, code_idx].item()
|
| 132 |
|
| 133 |
+
print(f"[LOG] Prediction successful: Family {fam_idx}")
|
| 134 |
+
print(f"--- [LOG] Request Finished ---\n")
|
| 135 |
+
|
| 136 |
# Formatting results
|
| 137 |
return (
|
| 138 |
{f"{fam_idx}": fam_conf},
|
|
|
|
| 197 |
)
|
| 198 |
|
| 199 |
if __name__ == "__main__":
|
| 200 |
+
# Increased concurrency for CPU stability
|
| 201 |
+
demo.launch(
|
| 202 |
+
theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"),
|
| 203 |
+
ssr_mode=False,
|
| 204 |
+
show_error=True
|
| 205 |
+
)
|