daasime Claude Opus 4.6 committed on
Commit
9109931
·
1 Parent(s): 1bd19e4

Fix permission denied: store models/data outside /app mount

Browse files

HF Spaces mounts the repo content over /app/ at runtime, overriding the
permissions set at Docker build time. Move model downloads to /home/user/models/
and data storage to /home/user/data/, which are outside the mount and so persist
from the build. All source files now use the MODEL_DIR and DATA_DIR env vars for path resolution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Dockerfile CHANGED
@@ -15,26 +15,30 @@ COPY requirements.txt .
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application code
19
- COPY . .
20
 
21
- # Create necessary directories with proper permissions
22
- RUN mkdir -p data/db data/clips pretrained_models
 
 
23
 
24
- # Pre-download SpeechBrain models during build
 
25
  RUN python -c "\
26
  from speechbrain.inference.VAD import VAD; \
27
- VAD.from_hparams(source='speechbrain/vad-crdnn-libriparty', savedir='pretrained_models/vad'); \
28
  print('VAD model downloaded')"
29
 
30
  RUN python -c "\
31
  from speechbrain.inference.speaker import SpeakerRecognition; \
32
- SpeakerRecognition.from_hparams(source='speechbrain/spkrec-ecapa-voxceleb', savedir='pretrained_models/spkrec'); \
33
  print('Speaker Recognition model downloaded')"
34
 
35
- # HF Spaces runs as user 1000 - set permissions
36
- RUN useradd -m -u 1000 user
37
- RUN chown -R user:user /app && chmod -R 775 /app/pretrained_models /app/data
 
38
  USER user
39
 
40
  # Expose port (HF Spaces uses 7860)
@@ -46,6 +50,8 @@ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
46
  ENV HOME=/home/user
47
  ENV MPLCONFIGDIR=/tmp/matplotlib
48
  ENV HF_HOME=/tmp/hf_home
 
 
49
 
50
  # Run Streamlit
51
  CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true"]
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
+ # HF Spaces runs as user 1000 - create user before downloads
19
+ RUN useradd -m -u 1000 user
20
 
21
+ # Create model & data dirs under /home/user (NOT /app, which HF mounts over)
22
+ RUN mkdir -p /home/user/models/vad /home/user/models/spkrec \
23
+ /home/user/data/db /home/user/data/clips \
24
+ && chown -R user:user /home/user
25
 
26
+ # Pre-download models AS the user so files are owned by user
27
+ USER user
28
  RUN python -c "\
29
  from speechbrain.inference.VAD import VAD; \
30
+ VAD.from_hparams(source='speechbrain/vad-crdnn-libriparty', savedir='/home/user/models/vad'); \
31
  print('VAD model downloaded')"
32
 
33
  RUN python -c "\
34
  from speechbrain.inference.speaker import SpeakerRecognition; \
35
+ SpeakerRecognition.from_hparams(source='speechbrain/spkrec-ecapa-voxceleb', savedir='/home/user/models/spkrec'); \
36
  print('Speaker Recognition model downloaded')"
37
 
38
+ # Copy application code (switch back to root, then back to user)
39
+ USER root
40
+ COPY . .
41
+ RUN chown -R user:user /app
42
  USER user
43
 
44
  # Expose port (HF Spaces uses 7860)
 
50
  ENV HOME=/home/user
51
  ENV MPLCONFIGDIR=/tmp/matplotlib
52
  ENV HF_HOME=/tmp/hf_home
53
+ ENV MODEL_DIR=/home/user/models
54
+ ENV DATA_DIR=/home/user/data
55
 
56
  # Run Streamlit
57
  CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true"]
src/analyzer.py CHANGED
@@ -145,8 +145,8 @@ class AnalysisResult:
145
  class AudioAnalyzer:
146
  """Main analyzer that orchestrates all phases."""
147
 
148
- def __init__(self, db_path: str = "data/db/voiceprints.db",
149
- clips_dir: str = "data/clips",
150
  device: str = None):
151
  """
152
  Initialize analyzer.
@@ -157,9 +157,14 @@ class AudioAnalyzer:
157
  device: torch device (cuda/cpu)
158
  """
159
  self.device = device
 
 
 
 
 
160
  self.clips_dir = clips_dir
161
  os.makedirs(clips_dir, exist_ok=True)
162
-
163
  # Initialize database
164
  self.db = Database(db_path)
165
 
 
145
  class AudioAnalyzer:
146
  """Main analyzer that orchestrates all phases."""
147
 
148
+ def __init__(self, db_path: str = None,
149
+ clips_dir: str = None,
150
  device: str = None):
151
  """
152
  Initialize analyzer.
 
157
  device: torch device (cuda/cpu)
158
  """
159
  self.device = device
160
+ data_dir = os.environ.get("DATA_DIR", "data")
161
+ if db_path is None:
162
+ db_path = os.path.join(data_dir, "db", "voiceprints.db")
163
+ if clips_dir is None:
164
+ clips_dir = os.path.join(data_dir, "clips")
165
  self.clips_dir = clips_dir
166
  os.makedirs(clips_dir, exist_ok=True)
167
+
168
  # Initialize database
169
  self.db = Database(db_path)
170
 
src/database/models.py CHANGED
@@ -77,7 +77,10 @@ class TestAnalysis(Base):
77
  class Database:
78
  """Database manager."""
79
 
80
- def __init__(self, db_path: str = "data/db/voiceprints.db"):
 
 
 
81
  self.db_path = db_path
82
  os.makedirs(os.path.dirname(db_path), exist_ok=True)
83
  self.engine = create_engine(f'sqlite:///{db_path}')
 
77
  class Database:
78
  """Database manager."""
79
 
80
+ def __init__(self, db_path: str = None):
81
+ if db_path is None:
82
+ data_dir = os.environ.get("DATA_DIR", "data")
83
+ db_path = os.path.join(data_dir, "db", "voiceprints.db")
84
  self.db_path = db_path
85
  os.makedirs(os.path.dirname(db_path), exist_ok=True)
86
  self.engine = create_engine(f'sqlite:///{db_path}')
src/phase1_foundation/diarization.py CHANGED
@@ -45,9 +45,11 @@ class SpeakerDiarizer:
45
  """Lazy load embedding model."""
46
  if self._embedding_model is None:
47
  from speechbrain.inference.speaker import SpeakerRecognition
 
 
48
  self._embedding_model = SpeakerRecognition.from_hparams(
49
  source="speechbrain/spkrec-ecapa-voxceleb",
50
- savedir="pretrained_models/spkrec",
51
  run_opts={"device": self.device}
52
  )
53
  return self._embedding_model
 
45
  """Lazy load embedding model."""
46
  if self._embedding_model is None:
47
  from speechbrain.inference.speaker import SpeakerRecognition
48
+ import os
49
+ model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
50
  self._embedding_model = SpeakerRecognition.from_hparams(
51
  source="speechbrain/spkrec-ecapa-voxceleb",
52
+ savedir=os.path.join(model_dir, "spkrec"),
53
  run_opts={"device": self.device}
54
  )
55
  return self._embedding_model
src/phase1_foundation/vad.py CHANGED
@@ -33,9 +33,11 @@ class VoiceActivityDetector:
33
  # Suppress the use_auth_token deprecation warning from speechbrain
34
  with warnings.catch_warnings():
35
  warnings.filterwarnings("ignore", message=".*use_auth_token.*")
 
 
36
  self._model = VAD.from_hparams(
37
  source="speechbrain/vad-crdnn-libriparty",
38
- savedir="pretrained_models/vad",
39
  run_opts={"device": self.device}
40
  )
41
  return self._model
 
33
  # Suppress the use_auth_token deprecation warning from speechbrain
34
  with warnings.catch_warnings():
35
  warnings.filterwarnings("ignore", message=".*use_auth_token.*")
36
+ import os
37
+ model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
38
  self._model = VAD.from_hparams(
39
  source="speechbrain/vad-crdnn-libriparty",
40
+ savedir=os.path.join(model_dir, "vad"),
41
  run_opts={"device": self.device}
42
  )
43
  return self._model
src/phase1_foundation/voiceprint.py CHANGED
@@ -47,9 +47,11 @@ class VoiceprintExtractor:
47
  """Lazy load model."""
48
  if self._model is None:
49
  from speechbrain.inference.speaker import SpeakerRecognition
 
 
50
  self._model = SpeakerRecognition.from_hparams(
51
  source="speechbrain/spkrec-ecapa-voxceleb",
52
- savedir="pretrained_models/spkrec",
53
  run_opts={"device": self.device}
54
  )
55
  return self._model
 
47
  """Lazy load model."""
48
  if self._model is None:
49
  from speechbrain.inference.speaker import SpeakerRecognition
50
+ import os
51
+ model_dir = os.environ.get("MODEL_DIR", "pretrained_models")
52
  self._model = SpeakerRecognition.from_hparams(
53
  source="speechbrain/spkrec-ecapa-voxceleb",
54
+ savedir=os.path.join(model_dir, "spkrec"),
55
  run_opts={"device": self.device}
56
  )
57
  return self._model