Luis J Camargo committed on
Commit
90f1441
·
1 Parent(s): 4e7deef

refactor: Remove unused imports and commented code, add detailed logging for audio processing, and update Gradio launch parameters.

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -1,13 +1,9 @@
1
- # app.py
2
- import os
3
  import gradio as gr
4
  import torch
5
  import numpy as np
6
  from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
7
  from transformers.models.whisper.modeling_whisper import WhisperEncoder
8
  import torch.nn as nn
9
- from safetensors.torch import load_file
10
- from huggingface_hub import hf_hub_download
11
 
12
  # === CUSTOM MODEL CLASSES ===
13
  class WhisperEncoderOnlyConfig(WhisperConfig):
@@ -79,13 +75,7 @@ MODEL_REPO = "tachiwin/language_classification_enconly_model_2"
79
 
80
  print("Loading model on CPU...")
81
  processor = WhisperProcessor.from_pretrained(MODEL_REPO)
82
- #config = WhisperEncoderOnlyConfig.from_pretrained(MODEL_REPO)
83
  model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
84
-
85
- # Load weights from safetensors
86
- #weights_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors")
87
- #state_dict = load_file(weights_path)
88
- #model.load_state_dict(state_dict)
89
  model.eval()
90
 
91
  print("Model loaded successfully!")
@@ -96,8 +86,12 @@ def predict_language(audio):
96
  return "⚠️ No audio provided", "", ""
97
 
98
  sample_rate, audio_array = audio
 
 
 
99
 
100
  # Normalization
 
101
  if audio_array.dtype == np.int16:
102
  audio_array = audio_array.astype(np.float32) / 32768.0
103
  elif audio_array.dtype == np.int32:
@@ -105,10 +99,12 @@ def predict_language(audio):
105
 
106
  # Resampling
107
  if sample_rate != 16000:
 
108
  import librosa
109
  audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
110
 
111
  # Preprocessing
 
112
  inputs = processor(
113
  audio_array,
114
  sampling_rate=16000,
@@ -116,10 +112,12 @@ def predict_language(audio):
116
  )
117
 
118
  # Inference
 
119
  with torch.no_grad():
120
  outputs = model(input_features=inputs.input_features)
121
 
122
  # Post-processing
 
123
  fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
124
  super_probs = torch.softmax(outputs["super_logits"], dim=-1)
125
  code_probs = torch.softmax(outputs["code_logits"], dim=-1)
@@ -132,6 +130,9 @@ def predict_language(audio):
132
  super_conf = super_probs[0, super_idx].item()
133
  code_conf = code_probs[0, code_idx].item()
134
 
 
 
 
135
  # Formatting results
136
  return (
137
  {f"{fam_idx}": fam_conf},
@@ -196,5 +197,9 @@ with gr.Blocks() as demo:
196
  )
197
 
198
  if __name__ == "__main__":
199
- demo.launch(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"))
200
-
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
4
  from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
5
  from transformers.models.whisper.modeling_whisper import WhisperEncoder
6
  import torch.nn as nn
 
 
7
 
8
  # === CUSTOM MODEL CLASSES ===
9
  class WhisperEncoderOnlyConfig(WhisperConfig):
 
75
 
76
  print("Loading model on CPU...")
77
  processor = WhisperProcessor.from_pretrained(MODEL_REPO)
 
78
  model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
 
 
 
 
 
79
  model.eval()
80
 
81
  print("Model loaded successfully!")
 
86
  return "⚠️ No audio provided", "", ""
87
 
88
  sample_rate, audio_array = audio
89
+ audio_len_sec = len(audio_array) / sample_rate
90
+ print(f"\n--- [LOG] New Request ---")
91
+ print(f"[LOG] Audio length: {audio_len_sec:.2f}s, SR: {sample_rate}")
92
 
93
  # Normalization
94
+ print("[LOG] Step 1: Normalizing audio...")
95
  if audio_array.dtype == np.int16:
96
  audio_array = audio_array.astype(np.float32) / 32768.0
97
  elif audio_array.dtype == np.int32:
 
99
 
100
  # Resampling
101
  if sample_rate != 16000:
102
+ print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
103
  import librosa
104
  audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
105
 
106
  # Preprocessing
107
+ print("[LOG] Step 3: Extracting features...")
108
  inputs = processor(
109
  audio_array,
110
  sampling_rate=16000,
 
112
  )
113
 
114
  # Inference
115
+ print("[LOG] Step 4: Running model inference (CPU intensive)...")
116
  with torch.no_grad():
117
  outputs = model(input_features=inputs.input_features)
118
 
119
  # Post-processing
120
+ print("[LOG] Step 5: Post-processing results...")
121
  fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
122
  super_probs = torch.softmax(outputs["super_logits"], dim=-1)
123
  code_probs = torch.softmax(outputs["code_logits"], dim=-1)
 
130
  super_conf = super_probs[0, super_idx].item()
131
  code_conf = code_probs[0, code_idx].item()
132
 
133
+ print(f"[LOG] Prediction successful: Family {fam_idx}")
134
+ print(f"--- [LOG] Request Finished ---\n")
135
+
136
  # Formatting results
137
  return (
138
  {f"{fam_idx}": fam_conf},
 
197
  )
198
 
199
  if __name__ == "__main__":
200
+ # Increased concurrency for CPU stability
201
+ demo.launch(
202
+ theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"),
203
+ ssr_mode=False,
204
+ show_error=True
205
+ )