Shouvik599 committed on
Commit
ddf4091
·
1 Parent(s): 1ce0537

revert to mlp voice model

Browse files
.gitignore CHANGED
@@ -156,10 +156,16 @@ npm-debug.log
156
  *.tmp
157
  *.temp
158
 
159
- # Model files - will be trained at runtime in Docker
160
- models/*.joblib
161
  models/*.h5
162
  models/*.keras
 
 
 
 
 
 
 
163
 
164
  # Keep FFmpeg binaries but ignore temporary files
165
  ffmpeg/bin/*.log
 
156
  *.tmp
157
  *.temp
158
 
159
+ # Model files - CNN training artifacts (ignored)
 
160
  models/*.h5
161
  models/*.keras
162
+ models/*.pkl
163
+ models/ravdess_cnn_model.*
164
+ models/label_encoder.joblib
165
+
166
+ # Keep MLP model files (needed for Docker build)
167
+ # models/mlp_emotion_model.joblib
168
+ # models/scaler.joblib
169
 
170
  # Keep FFmpeg binaries but ignore temporary files
171
  ffmpeg/bin/*.log
Dockerfile CHANGED
@@ -10,7 +10,7 @@ RUN corepack enable && pnpm install --frozen-lockfile
10
  COPY frontend/ .
11
  RUN pnpm build
12
 
13
- # Stage 2: Python backend with model training at runtime
14
  FROM python:3.11-slim
15
 
16
  WORKDIR /app
@@ -18,7 +18,6 @@ WORKDIR /app
18
  # Install system dependencies
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  ffmpeg \
21
- git \
22
  && rm -rf /var/lib/apt/lists/*
23
 
24
  # Copy and install Python dependencies
@@ -26,7 +25,7 @@ COPY backend/pyproject.toml backend/uv.lock* ./
26
  # Grab the uv binary from the official image
27
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
28
 
29
- # Install Python dependencies (includes tensorflow and kagglehub for model training)
30
  RUN uv sync --frozen --no-dev
31
 
32
  # Create necessary directories
@@ -35,8 +34,9 @@ RUN mkdir -p ./uploads ./models
35
  # Copy backend code
36
  COPY backend/ .
37
 
38
- # Copy training script
39
- COPY models/train_cnn.py ./models/train_cnn.py
 
40
 
41
  # Copy frontend build
42
  COPY --from=frontend-build /app/frontend/dist ./static
@@ -45,5 +45,4 @@ ENV PORT=7860
45
  EXPOSE 7860
46
 
47
  # Use PORT environment variable (defaults to 7860 for Hugging Face Space compatibility)
48
- # Model training happens at first startup if models don't exist (see app/main.py)
49
  CMD ["sh", "-c", "uv run uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
 
10
  COPY frontend/ .
11
  RUN pnpm build
12
 
13
+ # Stage 2: Python backend
14
  FROM python:3.11-slim
15
 
16
  WORKDIR /app
 
18
  # Install system dependencies
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  ffmpeg \
 
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
  # Copy and install Python dependencies
 
25
  # Grab the uv binary from the official image
26
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
27
 
28
+ # Install Python dependencies
29
  RUN uv sync --frozen --no-dev
30
 
31
  # Create necessary directories
 
34
  # Copy backend code
35
  COPY backend/ .
36
 
37
+ # Copy model files (MLP model and scaler)
38
+ COPY models/mlp_emotion_model.joblib ./models/
39
+ COPY models/scaler.joblib ./models/
40
 
41
  # Copy frontend build
42
  COPY --from=frontend-build /app/frontend/dist ./static
 
45
  EXPOSE 7860
46
 
47
  # Use PORT environment variable (defaults to 7860 for Hugging Face Space compatibility)
 
48
  CMD ["sh", "-c", "uv run uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
backend/app/main.py CHANGED
@@ -5,7 +5,6 @@ FastAPI Main Application with LangGraph Multi-Agent System
5
 
6
  import os
7
  import re
8
- import logging
9
  from dotenv import load_dotenv
10
  from fastapi import FastAPI
11
  from fastapi.middleware.cors import CORSMiddleware
@@ -14,47 +13,6 @@ from fastapi.staticfiles import StaticFiles
14
  # Load environment variables
15
  load_dotenv()
16
 
17
- logger = logging.getLogger(__name__)
18
-
19
- def train_model_if_missing():
20
- """Train the CNN model at startup if model files don't exist (Docker/HF only)."""
21
- # Only train in Docker environment, not locally
22
- # In Docker, the backend directory is at /app, locally it's at C:\...\backend
23
- backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
24
- is_docker = backend_dir == "/app"
25
-
26
- if not is_docker:
27
- logger.info("Running locally, skipping model training. Using existing model files.")
28
- return
29
-
30
- # In Docker: check if model exists, if not train
31
- model_path = os.path.join(backend_dir, "models", "ravdess_cnn_model.h5")
32
-
33
- if not os.path.exists(model_path):
34
- logger.info("Model not found in Docker. Starting model training...")
35
- try:
36
- import subprocess
37
- env = os.environ.copy()
38
- env["PYTHONPATH"] = backend_dir
39
- result = subprocess.run(
40
- ["python", "models/train_cnn.py"],
41
- capture_output=True,
42
- text=True,
43
- env=env,
44
- cwd=backend_dir
45
- )
46
- if result.returncode == 0:
47
- logger.info("Model training completed successfully")
48
- else:
49
- logger.error(f"Model training failed: {result.stderr}")
50
- except Exception as e:
51
- logger.error(f"Error during model training: {e}")
52
- else:
53
- logger.info("Model already exists in Docker, skipping training")
54
-
55
- # Train model at startup if missing
56
- train_model_if_missing()
57
-
58
  # Import routes
59
  from app.routes import router
60
 
 
5
 
6
  import os
7
  import re
 
8
  from dotenv import load_dotenv
9
  from fastapi import FastAPI
10
  from fastapi.middleware.cors import CORSMiddleware
 
13
  # Load environment variables
14
  load_dotenv()
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Import routes
17
  from app.routes import router
18
 
backend/app/voice_analysis.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Voice Emotion Analysis API for ShantiView
3
- Uses the pre-trained CNN model with MFCC sequence features from the RAVDESS dataset
4
  """
5
 
6
  import os
@@ -14,63 +14,29 @@ warnings.filterwarnings("ignore")
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
- # Model type configuration
18
- # Set to "cnn" or "mlp" - CNN is preferred, MLP is fallback
19
- MODEL_TYPE = "cnn" # Can be "cnn" or "mlp"
20
-
21
- # Constants for CNN
22
- MAX_SEQ_LENGTH = 130
23
  N_MFCC = 40
24
- EMOTION_LABELS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
25
 
26
  # Paths to models
27
  # __file__ is /app/app/voice_analysis.py in Docker, or C:\...\backend\app\voice_analysis.py locally
28
- # - os.path.dirname(__file__) = .../backend/app
29
- # - os.path.dirname(os.path.dirname(__file__)) = .../backend (project root)
30
  BACKEND_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
31
- # Check if we're in Docker (/app) or local (C:\...\backend)
32
  if BACKEND_DIR == "/app":
33
  MODEL_DIR = os.path.join(BACKEND_DIR, "models")
34
  else:
35
  # Local: models are in parent directory of backend
36
  MODEL_DIR = os.path.join(os.path.dirname(BACKEND_DIR), "models")
37
 
38
- # CNN model paths
39
- CNN_MODEL_PATH = os.path.join(MODEL_DIR, "ravdess_cnn_model.h5")
40
- LABEL_ENCODER_PATH = os.path.join(MODEL_DIR, "label_encoder.joblib")
41
- CNN_SCALER_PATH = os.path.join(MODEL_DIR, "scaler.joblib")
42
-
43
- # MLP model paths (fallback)
44
  MLP_MODEL_PATH = os.path.join(MODEL_DIR, "mlp_emotion_model.joblib")
45
  MLP_SCALER_PATH = os.path.join(MODEL_DIR, "scaler.joblib")
46
 
47
  # Model and scaler cache
48
- _cnn_model = None
49
- _mlp_model = None
50
  _scaler = None
51
- _label_encoder = None
52
-
53
-
54
- def _extract_mfcc_features(file_path, max_len=MAX_SEQ_LENGTH, n_mfcc=N_MFCC):
55
- """Extract MFCC sequence features for CNN model."""
56
- try:
57
- y, sr = librosa.load(file_path, duration=3, offset=0.5, sr=22050)
58
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
59
- mfcc = mfcc.T
60
-
61
- if mfcc.shape[0] < max_len:
62
- pad_width = max_len - mfcc.shape[0]
63
- mfcc = np.pad(mfcc, pad_width=((0, pad_width), (0, 0)), mode='constant')
64
- else:
65
- mfcc = mfcc[:max_len]
66
-
67
- return mfcc
68
- except Exception as e:
69
- logger.error(f"Error extracting MFCC features from {file_path}: {e}")
70
- return None
71
 
72
 
73
- def _extract_mfcc_mean(file_path, n_mfcc=40):
74
  """Extract mean MFCC features for MLP model."""
75
  try:
76
  y, sr = librosa.load(file_path, duration=3, offset=0.5, sr=22050)
@@ -81,48 +47,12 @@ def _extract_mfcc_mean(file_path, n_mfcc=40):
81
  return None
82
 
83
 
84
- def load_cnn_model():
85
- """Load the CNN emotion model, scaler, and label encoder."""
86
- global _cnn_model, _scaler, _label_encoder
87
-
88
- if _cnn_model is not None:
89
- return _cnn_model, _scaler, _label_encoder
90
-
91
- if not os.path.exists(CNN_MODEL_PATH):
92
- logger.error(f"CNN model file not found at {CNN_MODEL_PATH}")
93
- return None, None, None
94
- if not os.path.exists(CNN_SCALER_PATH):
95
- logger.error(f"Scaler file not found at {CNN_SCALER_PATH}")
96
- return None, None, None
97
- if not os.path.exists(LABEL_ENCODER_PATH):
98
- logger.error(f"Label encoder file not found at {LABEL_ENCODER_PATH}")
99
- return None, None, None
100
-
101
- try:
102
- import tensorflow as tf
103
- # Try loading with custom_objects to handle compatibility issues
104
- _cnn_model = tf.keras.models.load_model(
105
- CNN_MODEL_PATH,
106
- custom_objects=None,
107
- compile=False # Skip compilation to avoid deserialization issues
108
- )
109
- # Recompile the model since we skipped compilation
110
- _cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
111
- _scaler = joblib.load(CNN_SCALER_PATH)
112
- _label_encoder = joblib.load(LABEL_ENCODER_PATH)
113
- logger.info("CNN voice emotion model loaded successfully")
114
- return _cnn_model, _scaler, _label_encoder
115
- except Exception as e:
116
- logger.error(f"Error loading CNN model: {e}")
117
- return None, None, None
118
-
119
-
120
- def load_mlp_model():
121
- """Load the MLP emotion model and scaler (fallback)."""
122
- global _mlp_model, _scaler
123
 
124
- if _mlp_model is not None:
125
- return _mlp_model, _scaler
126
 
127
  if not os.path.exists(MLP_MODEL_PATH):
128
  logger.error(f"MLP model file not found at {MLP_MODEL_PATH}")
@@ -132,63 +62,18 @@ def load_mlp_model():
132
  return None, None
133
 
134
  try:
135
- _mlp_model = joblib.load(MLP_MODEL_PATH)
136
  _scaler = joblib.load(MLP_SCALER_PATH)
137
  logger.info("MLP voice emotion model loaded successfully")
138
- return _mlp_model, _scaler
139
  except Exception as e:
140
  logger.error(f"Error loading MLP model: {e}")
141
  return None, None
142
 
143
 
144
- def load_model():
145
- """Load the appropriate model based on configuration."""
146
- if MODEL_TYPE == "cnn":
147
- model, scaler, le = load_cnn_model()
148
- if model is not None:
149
- return model, scaler, le, "cnn"
150
- logger.warning("CNN model failed to load, falling back to MLP")
151
-
152
- # Fallback to MLP
153
- model, scaler = load_mlp_model()
154
- if model is not None:
155
- return model, scaler, None, "mlp"
156
-
157
- logger.error("Both CNN and MLP models failed to load")
158
- return None, None, None, None
159
-
160
-
161
- def extract_mfcc_features(file_path, n_mfcc=40):
162
- """
163
- Extract Mel-frequency cepstral coefficients (MFCCs) from an audio file.
164
-
165
- Args:
166
- file_path: Path to the audio file
167
- n_mfcc: Number of MFCCs to extract
168
-
169
- Returns:
170
- numpy array of MFCC features, or None if extraction fails
171
- """
172
- try:
173
- # Load the audio file with parameters matching the training
174
- y, sr = librosa.load(file_path, duration=3, offset=0.5, sr=22050)
175
-
176
- # Extract MFCCs
177
- mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
178
-
179
- # Calculate mean across time axis
180
- mfccs_mean = np.mean(mfccs.T, axis=0)
181
-
182
- return mfccs_mean
183
- except Exception as e:
184
- logger.error(f"Error extracting MFCC features from {file_path}: {e}")
185
- return None
186
-
187
-
188
  async def predict_voice_emotion(audio_file_path: str) -> dict:
189
  """
190
- Predict the emotion of an audio file using the trained model.
191
- Supports both CNN and MLP models.
192
 
193
  Args:
194
  audio_file_path: Path to the audio file
@@ -198,8 +83,7 @@ async def predict_voice_emotion(audio_file_path: str) -> dict:
198
  """
199
  try:
200
  # Load model
201
- result = load_model()
202
- model, scaler, label_encoder, model_type = result
203
 
204
  if model is None:
205
  return {
@@ -208,75 +92,38 @@ async def predict_voice_emotion(audio_file_path: str) -> dict:
208
  "message": "Voice emotion model is not loaded. Please ensure model files exist."
209
  }
210
 
211
- if model_type == "cnn":
212
- # CNN prediction
213
- features = _extract_mfcc_features(audio_file_path)
214
-
215
- if features is None:
216
- return {
217
- "error": True,
218
- "emotion": "Feature extraction failed",
219
- "message": "Could not extract features from audio file"
220
- }
221
-
222
- # Scale features (reshape for 2D scaler, then reshape back)
223
- original_shape = features.shape
224
- features_scaled = scaler.transform(features.reshape(-1, original_shape[1])).reshape(original_shape)
225
-
226
- # Add batch dimension and predict
227
- features_batch = np.expand_dims(features_scaled, axis=0)
228
- predictions = model.predict(features_batch, verbose=0)[0]
229
-
230
- # Get emotion from predictions
231
- predicted_class = np.argmax(predictions)
232
- confidence = float(predictions[predicted_class])
233
-
234
- # Use label encoder if available, otherwise use emotion labels
235
- if label_encoder is not None:
236
- emotion_display = label_encoder.inverse_transform([predicted_class])[0]
237
- emotion_labels = label_encoder.classes_
238
- else:
239
- emotion_display = EMOTION_LABELS[predicted_class]
240
- emotion_labels = EMOTION_LABELS
241
-
242
- # Build all_emotions dict using the correct label order (convert to native Python types)
243
- emotion_probs = {str(label): float(predictions[i]) for i, label in enumerate(emotion_labels)}
244
-
245
- else:
246
- # MLP prediction (fallback)
247
- features = _extract_mfcc_mean(audio_file_path)
248
-
249
- if features is None:
250
- return {
251
- "error": True,
252
- "emotion": "Feature extraction failed",
253
- "message": "Could not extract features from audio file"
254
- }
255
-
256
- # Scale features and predict
257
- features_scaled = scaler.transform(features.reshape(1, -1))
258
- prediction = model.predict(features_scaled)[0]
259
-
260
- # Get probabilities if available
261
- try:
262
- probabilities = model.predict_proba(features_scaled)[0]
263
- emotion_probs = {label: float(prob) for label, prob in zip(model.classes_, probabilities)}
264
- confidence = float(max(probabilities))
265
- except Exception:
266
- emotion_probs = {}
267
- confidence = 1.0
268
-
269
- emotion_display = prediction.capitalize()
270
 
271
- emotion_display = emotion_display.capitalize()
272
- logger.info(f"Voice emotion prediction ({model_type}): {emotion_display} (confidence: {confidence:.3f})")
273
 
274
  return {
275
  "error": False,
276
  "emotion": emotion_display,
277
  "confidence": confidence,
278
  "all_emotions": emotion_probs,
279
- "model_type": model_type
280
  }
281
 
282
  except Exception as e:
@@ -285,4 +132,4 @@ async def predict_voice_emotion(audio_file_path: str) -> dict:
285
  "error": True,
286
  "emotion": "Error",
287
  "message": str(e)
288
- }
 
1
  """
2
  Voice Emotion Analysis API for ShantiView
3
+ Uses the pre-trained MLP model with MFCC features from the RAVDESS dataset
4
  """
5
 
6
  import os
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
+ # Constants for MLP
 
 
 
 
 
18
  N_MFCC = 40
 
19
 
20
  # Paths to models
21
  # __file__ is /app/app/voice_analysis.py in Docker, or C:\...\backend\app\voice_analysis.py locally
 
 
22
  BACKEND_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
23
+ # Check if we're in Docker (/app) or local
24
  if BACKEND_DIR == "/app":
25
  MODEL_DIR = os.path.join(BACKEND_DIR, "models")
26
  else:
27
  # Local: models are in parent directory of backend
28
  MODEL_DIR = os.path.join(os.path.dirname(BACKEND_DIR), "models")
29
 
30
+ # MLP model paths
 
 
 
 
 
31
  MLP_MODEL_PATH = os.path.join(MODEL_DIR, "mlp_emotion_model.joblib")
32
  MLP_SCALER_PATH = os.path.join(MODEL_DIR, "scaler.joblib")
33
 
34
  # Model and scaler cache
35
+ _model = None
 
36
  _scaler = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
 
39
+ def _extract_mfcc_mean(file_path, n_mfcc=N_MFCC):
40
  """Extract mean MFCC features for MLP model."""
41
  try:
42
  y, sr = librosa.load(file_path, duration=3, offset=0.5, sr=22050)
 
47
  return None
48
 
49
 
50
+ def load_model():
51
+ """Load the MLP emotion model and scaler."""
52
+ global _model, _scaler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ if _model is not None:
55
+ return _model, _scaler
56
 
57
  if not os.path.exists(MLP_MODEL_PATH):
58
  logger.error(f"MLP model file not found at {MLP_MODEL_PATH}")
 
62
  return None, None
63
 
64
  try:
65
+ _model = joblib.load(MLP_MODEL_PATH)
66
  _scaler = joblib.load(MLP_SCALER_PATH)
67
  logger.info("MLP voice emotion model loaded successfully")
68
+ return _model, _scaler
69
  except Exception as e:
70
  logger.error(f"Error loading MLP model: {e}")
71
  return None, None
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  async def predict_voice_emotion(audio_file_path: str) -> dict:
75
  """
76
+ Predict the emotion of an audio file using the trained MLP model.
 
77
 
78
  Args:
79
  audio_file_path: Path to the audio file
 
83
  """
84
  try:
85
  # Load model
86
+ model, scaler = load_model()
 
87
 
88
  if model is None:
89
  return {
 
92
  "message": "Voice emotion model is not loaded. Please ensure model files exist."
93
  }
94
 
95
+ # Extract features
96
+ features = _extract_mfcc_mean(audio_file_path)
97
+
98
+ if features is None:
99
+ return {
100
+ "error": True,
101
+ "emotion": "Feature extraction failed",
102
+ "message": "Could not extract features from audio file"
103
+ }
104
+
105
+ # Scale features and predict
106
+ features_scaled = scaler.transform(features.reshape(1, -1))
107
+ prediction = model.predict(features_scaled)[0]
108
+
109
+ # Get probabilities if available
110
+ try:
111
+ probabilities = model.predict_proba(features_scaled)[0]
112
+ emotion_probs = {str(label): float(prob) for label, prob in zip(model.classes_, probabilities)}
113
+ confidence = float(max(probabilities))
114
+ except Exception:
115
+ emotion_probs = {}
116
+ confidence = 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ emotion_display = str(prediction).capitalize()
119
+ logger.info(f"Voice emotion prediction: {emotion_display} (confidence: {confidence:.3f})")
120
 
121
  return {
122
  "error": False,
123
  "emotion": emotion_display,
124
  "confidence": confidence,
125
  "all_emotions": emotion_probs,
126
+ "model_type": "mlp"
127
  }
128
 
129
  except Exception as e:
 
132
  "error": True,
133
  "emotion": "Error",
134
  "message": str(e)
135
+ }
backend/pyproject.toml CHANGED
@@ -22,9 +22,6 @@ dependencies = [
22
  "Pillow>=11.0.0",
23
  "aiohttp>=3.10.0",
24
  "deepface>=0.0.93",
25
- "tensorflow>=2.15.0",
26
- "tf-keras>=2.18.0",
27
- "kagglehub>=0.3.0",
28
  ]
29
 
30
  [dependency-groups]
 
22
  "Pillow>=11.0.0",
23
  "aiohttp>=3.10.0",
24
  "deepface>=0.0.93",
 
 
 
25
  ]
26
 
27
  [dependency-groups]
models/train_cnn.py CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:615316b4c5ea006dcf082eb520c35275fcb6fdc48eb686110d8d66baf50d8495
3
- size 6752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299eed1888b90f9425435d33b185544857a5ab53af4c9badbc79ad9bb460bf99
3
+ size 6842