Spaces:

divAIne
/

busy-module-audio

Sleeping

App Files Files Community

EurekaPotato committed on Feb 17

Commit

c431263

verified ·

1 Parent(s): d56c2d4

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

Dockerfile +10 -4
handler.py +21 -67
requirements.txt +6 -5
upload.py +29 -0

Dockerfile CHANGED Viewed

@@ -2,18 +2,24 @@ FROM python:3.10-slim
 WORKDIR /app
-# System dependencies for audio processing
 RUN apt-get update && apt-get install -y \
     libsndfile1 \
     ffmpeg \
     && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
-RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cpu
 RUN pip install --no-cache-dir -r requirements.txt
-COPY handler.py .
 EXPOSE 7860
-CMD ["python", "handler.py"]

 WORKDIR /app
+# System dependencies for audio processing + git for torch.hub
 RUN apt-get update && apt-get install -y \
     libsndfile1 \
     ffmpeg \
+    git \
     && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
+# Install CPU-only torch first (prevents CUDA downloads)
+RUN pip install --no-cache-dir torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu \
+    --extra-index-url https://download.pytorch.org/whl/cpu
+# Install other dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
 EXPOSE 7860
+CMD ["uvicorn", "handler:app", "--host", "0.0.0.0", "--port", "7860"]

handler.py CHANGED Viewed

@@ -110,81 +110,35 @@ class AudioFeatureExtractorEndpoint:
         self.sr = 16000
         self.emotion_cnn = EmotionCNN()
-        # Load Silero VAD from HuggingFace Hub (more reliable on HF Spaces)
         try:
-            from huggingface_hub import hf_hub_download
-            # Download the model file from HF
-            model_path = hf_hub_download(
-                repo_id="snakers4/silero-vad",
-                filename="files/silero_vad.jit"
             )
-            # Load the JIT model
-            self.vad_model = torch.jit.load(model_path)
             self.vad_model.eval()
-            # Get the get_speech_timestamps function
-            # The model itself has this as a method in newer versions
-            def get_speech_timestamps(audio, model, sampling_rate=16000, **kwargs):
-                """Wrapper for VAD speech detection"""
-                if not isinstance(audio, torch.Tensor):
-                    audio = torch.FloatTensor(audio)
-                # Get speech timestamps using the model
-                speech_probs = []
-                chunk_size = 512
-                for i in range(0, len(audio), chunk_size):
-                    chunk = audio[i:i + chunk_size]
-                    if len(chunk) < chunk_size:
-                        chunk = torch.nn.functional.pad(chunk, (0, chunk_size - len(chunk)))
-                    with torch.no_grad():
-                        speech_prob = model(chunk, sampling_rate).item()
-                        speech_probs.append((i, speech_prob))
-                # Convert probabilities to timestamps
-                threshold = kwargs.get('threshold', 0.5)
-                min_speech_duration_ms = kwargs.get('min_speech_duration_ms', 250)
-                min_silence_duration_ms = kwargs.get('min_silence_duration_ms', 100)
-                timestamps = []
-                in_speech = False
-                speech_start = 0
-                for i, prob in speech_probs:
-                    if prob > threshold and not in_speech:
-                        speech_start = i
-                        in_speech = True
-                    elif prob <= threshold and in_speech:
-                        duration_ms = (i - speech_start) / sampling_rate * 1000
-                        if duration_ms >= min_speech_duration_ms:
-                            timestamps.append({'start': speech_start, 'end': i})
-                        in_speech = False
-                # Close last segment if still in speech
-                if in_speech:
-                    timestamps.append({'start': speech_start, 'end': len(audio)})
-                return timestamps
-            self.get_speech_timestamps = get_speech_timestamps
-            print("✓ Silero VAD loaded from HuggingFace Hub")
         except Exception as e:
-            print(f"⚠ Silero VAD failed to load from HF Hub: {e}")
-            print(f"   Trying fallback torch.hub.load...")
-            try:
-                # Fallback to torch.hub
-                self.vad_model, self.vad_utils = torch.hub.load(
-                    repo_or_dir="snakers4/silero-vad",
-                    model="silero_vad",
-                    trust_repo=True,
-                    force_reload=False
-                )
-                self.get_speech_timestamps = self.vad_utils[0]
-                print("✓ Silero VAD loaded via torch.hub (fallback)")
-            except Exception as e2:
-                print(f"⚠ Both HF Hub and torch.hub failed for Silero VAD: {e2}")
-                self.vad_model = None
     # -------- V1: SNR --------
     def extract_snr(self, audio: np.ndarray) -> float:

         self.sr = 16000
         self.emotion_cnn = EmotionCNN()
+        # Load Silero VAD - optimized for CPU-only HF Spaces
         try:
+            # Limit PyTorch to a single intra-op thread (HF Free Spaces are CPU-only)
+            torch.set_num_threads(1)
+            # Load from torch.hub (most reliable method)
+            print("[INFO] Loading Silero VAD from torch.hub...")
+            self.vad_model, self.vad_utils = torch.hub.load(
+                repo_or_dir='snakers4/silero-vad',
+                model='silero_vad',
+                force_reload=False,
+                trust_repo=True,
+                verbose=False
             )
+            # Force model to CPU
+            self.vad_model = self.vad_model.cpu()
             self.vad_model.eval()
+            # Extract the get_speech_timestamps utility
+            self.get_speech_timestamps = self.vad_utils[0]
+            print("✅ Silero VAD loaded successfully (CPU mode)")
         except Exception as e:
+            print(f"⚠️ Silero VAD failed to load: {e}")
+            print(f"   Audio features will use fallback values for pause detection")
+            self.vad_model = None
+            self.get_speech_timestamps = None
     # -------- V1: SNR --------
     def extract_snr(self, audio: np.ndarray) -> float:

requirements.txt CHANGED Viewed

@@ -1,13 +1,14 @@
-# Core audio
 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3
 scipy==1.11.2
-# ML (Install torch manually if not using docker, or uncomment below)
-torch
-torchvision
-torchaudio
 # API
 fastapi==0.95.2

+# Core audio processing
 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3
 scipy==1.11.2
+# ML - CPU-only versions (for HF Free Spaces without GPU)
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.1.0+cpu
+torchvision==0.16.0+cpu
+torchaudio==2.1.0+cpu
 # API
 fastapi==0.95.2

upload.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+Upload audio endpoint to HF Spaces
+"""
+from huggingface_hub import HfApi
+import sys
+try:  # the upload runs at module top level; there is no __main__ guard
+    api = HfApi()  # NOTE(review): no token is passed here; presumably a cached/env HF token is used — confirm
+    print("Uploading audio endpoint to HF Spaces...")
+    print("This may take 1-2 minutes...")
+    api.upload_folder(
+        folder_path=".",  # NOTE(review): no allow/ignore patterns are given, so the whole current directory appears to be uploaded — confirm nothing private is included
+        repo_id="divAIne/busy-module-audio",
+        repo_type="space",  # the target repo is a Space
+    )
+    print("\n" + "="*60)
+    print("✓ Upload successful!")
+    print("="*60)
+    print("\nSpace URL: https://huggingface.co/spaces/divAIne/busy-module-audio")
+    print("API URL: https://divAIne-busy-module-audio.hf.space")
+    print("\nThe space will rebuild now (2-5 minutes).")
+    print("Check logs at: https://huggingface.co/spaces/divAIne/busy-module-audio/logs")
+except Exception as e:  # any failure above is printed and converted into exit status 1
+    print(f"Error: {e}")
+    sys.exit(1)