EurekaPotato committed on
Commit
c431263
·
verified ·
1 Parent(s): d56c2d4

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +10 -4
  2. handler.py +21 -67
  3. requirements.txt +6 -5
  4. upload.py +29 -0
Dockerfile CHANGED
@@ -2,18 +2,24 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # System dependencies for audio processing
6
  RUN apt-get update && apt-get install -y \
7
  libsndfile1 \
8
  ffmpeg \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
  COPY requirements.txt .
12
- RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cpu
 
 
 
 
 
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
15
- COPY handler.py .
16
 
17
  EXPOSE 7860
18
 
19
- CMD ["python", "handler.py"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # System dependencies for audio processing + git for torch.hub
6
  RUN apt-get update && apt-get install -y \
7
  libsndfile1 \
8
  ffmpeg \
9
+ git \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  COPY requirements.txt .
13
+
14
+ # Install CPU-only torch first (prevents CUDA downloads)
15
+ RUN pip install --no-cache-dir torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu \
16
+ --extra-index-url https://download.pytorch.org/whl/cpu
17
+
18
+ # Install other dependencies
19
  RUN pip install --no-cache-dir -r requirements.txt
20
 
21
+ COPY . .
22
 
23
  EXPOSE 7860
24
 
25
+ CMD ["uvicorn", "handler:app", "--host", "0.0.0.0", "--port", "7860"]
handler.py CHANGED
@@ -110,81 +110,35 @@ class AudioFeatureExtractorEndpoint:
110
  self.sr = 16000
111
  self.emotion_cnn = EmotionCNN()
112
 
113
- # Load Silero VAD from HuggingFace Hub (more reliable on HF Spaces)
114
  try:
115
- from huggingface_hub import hf_hub_download
 
116
 
117
- # Download the model file from HF
118
- model_path = hf_hub_download(
119
- repo_id="snakers4/silero-vad",
120
- filename="files/silero_vad.jit"
 
 
 
 
121
  )
122
 
123
- # Load the JIT model
124
- self.vad_model = torch.jit.load(model_path)
125
  self.vad_model.eval()
126
 
127
- # Get the get_speech_timestamps function
128
- # The model itself has this as a method in newer versions
129
- def get_speech_timestamps(audio, model, sampling_rate=16000, **kwargs):
130
- """Wrapper for VAD speech detection"""
131
- if not isinstance(audio, torch.Tensor):
132
- audio = torch.FloatTensor(audio)
133
-
134
- # Get speech timestamps using the model
135
- speech_probs = []
136
- chunk_size = 512
137
- for i in range(0, len(audio), chunk_size):
138
- chunk = audio[i:i + chunk_size]
139
- if len(chunk) < chunk_size:
140
- chunk = torch.nn.functional.pad(chunk, (0, chunk_size - len(chunk)))
141
- with torch.no_grad():
142
- speech_prob = model(chunk, sampling_rate).item()
143
- speech_probs.append((i, speech_prob))
144
-
145
- # Convert probabilities to timestamps
146
- threshold = kwargs.get('threshold', 0.5)
147
- min_speech_duration_ms = kwargs.get('min_speech_duration_ms', 250)
148
- min_silence_duration_ms = kwargs.get('min_silence_duration_ms', 100)
149
-
150
- timestamps = []
151
- in_speech = False
152
- speech_start = 0
153
-
154
- for i, prob in speech_probs:
155
- if prob > threshold and not in_speech:
156
- speech_start = i
157
- in_speech = True
158
- elif prob <= threshold and in_speech:
159
- duration_ms = (i - speech_start) / sampling_rate * 1000
160
- if duration_ms >= min_speech_duration_ms:
161
- timestamps.append({'start': speech_start, 'end': i})
162
- in_speech = False
163
-
164
- # Close last segment if still in speech
165
- if in_speech:
166
- timestamps.append({'start': speech_start, 'end': len(audio)})
167
-
168
- return timestamps
169
 
170
- self.get_speech_timestamps = get_speech_timestamps
171
- print("✓ Silero VAD loaded from HuggingFace Hub")
172
  except Exception as e:
173
- print(f"⚠ Silero VAD failed to load from HF Hub: {e}")
174
- print(f" Trying fallback torch.hub.load...")
175
- try:
176
- # Fallback to torch.hub
177
- self.vad_model, self.vad_utils = torch.hub.load(
178
- repo_or_dir="snakers4/silero-vad",
179
- model="silero_vad",
180
- trust_repo=True,
181
- force_reload=False
182
- )
183
- self.get_speech_timestamps = self.vad_utils[0]
184
- print("✓ Silero VAD loaded via torch.hub (fallback)")
185
- except Exception as e2:
186
- print(f"⚠ Both HF Hub and torch.hub failed for Silero VAD: {e2}")
187
- self.vad_model = None
188
 
189
  # -------- V1: SNR --------
190
  def extract_snr(self, audio: np.ndarray) -> float:
 
110
  self.sr = 16000
111
  self.emotion_cnn = EmotionCNN()
112
 
113
+ # Load Silero VAD - optimized for CPU-only HF Spaces
114
  try:
115
+ # Limit torch to a single CPU thread (HF Free Spaces have no GPU); this does not select a device
116
+ torch.set_num_threads(1)
117
 
118
+ # Load from torch.hub (most reliable method)
119
+ print("[INFO] Loading Silero VAD from torch.hub...")
120
+ self.vad_model, self.vad_utils = torch.hub.load(
121
+ repo_or_dir='snakers4/silero-vad',
122
+ model='silero_vad',
123
+ force_reload=False,
124
+ trust_repo=True,
125
+ verbose=False
126
  )
127
 
128
+ # Force model to CPU
129
+ self.vad_model = self.vad_model.cpu()
130
  self.vad_model.eval()
131
 
132
+ # Extract the get_speech_timestamps utility
133
+ self.get_speech_timestamps = self.vad_utils[0]
134
+
135
+ print(" Silero VAD loaded successfully (CPU mode)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
 
 
137
  except Exception as e:
138
+ print(f"⚠ Silero VAD failed to load: {e}")
139
+ print(f" Audio features will use fallback values for pause detection")
140
+ self.vad_model = None
141
+ self.get_speech_timestamps = None
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # -------- V1: SNR --------
144
  def extract_snr(self, audio: np.ndarray) -> float:
requirements.txt CHANGED
@@ -1,13 +1,14 @@
1
- # Core audio
2
  librosa==0.10.1
3
  soundfile==0.12.1
4
  numpy==1.24.3
5
  scipy==1.11.2
6
 
7
- # ML (Install torch manually if not using docker, or uncomment below)
8
- torch
9
- torchvision
10
- torchaudio
 
11
 
12
  # API
13
  fastapi==0.95.2
 
1
+ # Core audio processing
2
  librosa==0.10.1
3
  soundfile==0.12.1
4
  numpy==1.24.3
5
  scipy==1.11.2
6
 
7
+ # ML - CPU-only versions (for HF Free Spaces without GPU)
8
+ --extra-index-url https://download.pytorch.org/whl/cpu
9
+ torch==2.1.0+cpu
10
+ torchvision==0.16.0+cpu
11
+ torchaudio==2.1.0+cpu
12
 
13
  # API
14
  fastapi==0.95.2
upload.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload audio endpoint to HF Spaces
3
+ """
4
+ from huggingface_hub import HfApi
5
+ import sys
6
+
7
+ try:
8
+ api = HfApi()
9
+
10
+ print("Uploading audio endpoint to HF Spaces...")
11
+ print("This may take 1-2 minutes...")
12
+
13
+ api.upload_folder(
14
+ folder_path=".",
15
+ repo_id="divAIne/busy-module-audio",
16
+ repo_type="space",
17
+ )
18
+
19
+ print("\n" + "="*60)
20
+ print("✓ Upload successful!")
21
+ print("="*60)
22
+ print("\nSpace URL: https://huggingface.co/spaces/divAIne/busy-module-audio")
23
+ print("API URL: https://divAIne-busy-module-audio.hf.space")
24
+ print("\nThe space will rebuild now (2-5 minutes).")
25
+ print("Check logs at: https://huggingface.co/spaces/divAIne/busy-module-audio/logs")
26
+
27
+ except Exception as e:
28
+ print(f"Error: {e}")
29
+ sys.exit(1)