Spaces:

HackerMOne
/

slaq-ai-engine

Sleeping

App Files Files Community

HackerMOne committed on Dec 14, 2025

Commit

695193c

verified ·

1 Parent(s): 88d16a3

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +27 -0
app.py +134 -0
requirements.txt +10 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+# Use Python 3.9 as the base image
+FROM python:3.9
+# Set the working directory
+WORKDIR /app
+# Install system dependencies (FFmpeg is required for audio processing).
+# Recommended extras are skipped and the apt package lists are removed in the
+# same layer so they are not baked into the final image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first so the dependency layer is cached independently of
+# application code changes, then install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application code
+COPY . .
+# Create a cache directory for Hugging Face models and set permissions
+# This prevents permission errors when the model tries to download
+RUN mkdir -p /app/cache && chmod 777 /app/cache
+ENV TRANSFORMERS_CACHE=/app/cache
+ENV HF_HOME=/app/cache
+# Expose the port (Hugging Face Spaces uses 7860)
+EXPOSE 7860
+# Command to run the application using Uvicorn
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import os
+import torch
+import librosa
+import numpy as np
+from fastapi import FastAPI, File, UploadFile, Form
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from typing import Optional
+import Levenshtein
+app = FastAPI()
+# --- CONFIGURATION ---
+# Using the 300m model for a balance of speed and Indian language support.
+MODEL_ID = "facebook/mms-300m"
+print(f"🔄 Loading AI Model: {MODEL_ID}...")
+try:
+    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    print("✅ Model loaded successfully!")
+except Exception as e:
+    print(f"❌ Failed to load model: {e}")
+    raise e
+# Language Code Mapping (Must match your Django app's expectations)
+LANG_MAP = {
+    'hindi': 'hin', 'tamil': 'tam', 'telugu': 'tel', 'marathi': 'mar',
+    'bengali': 'ben', 'gujarati': 'guj', 'kannada': 'kan', 'malayalam': 'mal',
+    'punjabi': 'pan', 'urdu': 'urd', 'assamese': 'asm', 'odia': 'ory',
+    'english': 'eng'
+}
+@app.get("/")
+def home():
+    return {"status": "running", "service": "SLAQ AI Engine", "model": MODEL_ID}
+@app.get("/health")
+def health():
+    return {"status": "healthy"}
+@app.post("/analyze")
+async def analyze_audio(
+    audio: UploadFile = File(...),
+    transcript: Optional[str] = Form(""),
+    language: Optional[str] = Form("eng")
+):
+    print(f"📥 Received analysis request. Language: {language}")
+    temp_filename = f"temp_{audio.filename}"
+    try:
+        # 1. Save uploaded file temporarily
+        with open(temp_filename, "wb") as buffer:
+            buffer.write(await audio.read())
+        # 2. Load and resample audio (16kHz required for Wav2Vec2)
+        speech, sr = librosa.load(temp_filename, sr=16000)
+        # 3. Configure Language Adapter
+        target_lang = LANG_MAP.get(str(language).lower(), 'eng')
+        try:
+            # MMS requires loading the specific language adapter
+            processor.tokenizer.set_target_lang(target_lang)
+            model.load_adapter(target_lang)
+        except Exception as e:
+            print(f"⚠️ Language adapter error for '{target_lang}': {e}. Falling back to English.")
+            target_lang = 'eng'
+            processor.tokenizer.set_target_lang('eng')
+            model.load_adapter('eng')
+        # 4. Run Inference (The AI part)
+        inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+        # Decode the output to text
+        predicted_ids = torch.argmax(logits, dim=-1)
+        actual_transcript = processor.batch_decode(predicted_ids)[0]
+        print(f"📝 Transcribed: {actual_transcript[:50]}...")
+        # 5. Calculate Metrics
+        confidence = float(torch.mean(torch.nn.functional.softmax(logits, dim=-1).max(dim=-1).values))
+        mismatched_chars = []
+        mismatch_pct = 0.0
+        # Calculate mismatch if a target transcript was provided
+        if transcript:
+            dist = Levenshtein.distance(actual_transcript, transcript)
+            max_len = max(len(transcript), 1)
+            mismatch_pct = (dist / max_len) * 100
+            # Simple character mismatch finding
+            import difflib
+            matcher = difflib.SequenceMatcher(None, actual_transcript, transcript)
+            for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+                if tag in ['replace', 'insert']:
+                    mismatched_chars.extend(list(transcript[j1:j2]))
+        # Determine Severity based on mismatch percentage
+        severity = "none"
+        if mismatch_pct > 10: severity = "mild"
+        if mismatch_pct > 25: severity = "moderate"
+        if mismatch_pct > 45: severity = "severe"
+        # 6. Format Response
+        response_data = {
+            "actual_transcript": actual_transcript,
+            "target_transcript": transcript or "",
+            "mismatched_chars": mismatched_chars,
+            "mismatch_percentage": round(mismatch_pct, 2),
+            "ctc_loss_score": 0.0,
+            "stutter_timestamps": [],
+            "total_stutter_duration": 0.0,
+            "stutter_frequency": 0.0,
+            "severity": severity,
+            "confidence_score": round(confidence, 2),
+            "model_version": MODEL_ID,
+            "language_detected": target_lang
+        }
+        return response_data
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return {"error": str(e)}, 500
+    finally:
+        # Cleanup: Delete the temporary file
+        if os.path.exists(temp_filename):
+            os.remove(temp_filename)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi==0.104.1
+uvicorn==0.24.0
+python-multipart==0.0.6
+torch==2.1.0
+transformers==4.35.2
+librosa==0.10.1
+numpy==1.26.2
+scipy==1.11.4
+soundfile==0.12.1
+python-Levenshtein==0.23.0