Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- Dockerfile +26 -0
- main.py +87 -0
- model_service.py +92 -0
- requirements.txt +10 -0
- schemas.py +9 -0
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Python 3.10+ is required: schemas.py uses the PEP 604 union syntax
# ("str | None"), which raises TypeError at import time on Python 3.9.
FROM python:3.10

# Working directory for the application code.
WORKDIR /code

# Copy requirements first so the dependency layer is cached across code edits.
COPY ./requirements.txt /code/requirements.txt

# Install dependencies without keeping the pip cache in the image.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the application.
COPY . /code

# Hugging Face Spaces runs the container as a non-root user (uid 1000) that
# cannot write to the default cache locations, so point every cache at a
# world-writable directory.
RUN mkdir -p /code/cache && chmod -R 777 /code/cache
ENV XDG_CACHE_HOME=/code/cache
# transformers / huggingface_hub download the model here at startup; without
# this they try ~/.cache, which is not writable on Spaces.
ENV HF_HOME=/code/cache

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

# Start the FastAPI app with uvicorn.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64
import binascii
import os

from fastapi import FastAPI, HTTPException, Depends, Header

from model_service import get_model_service, ModelService
from schemas import AudioInput, DetectionResult
| 6 |
+
|
| 7 |
+
# FastAPI application instance; the metadata below populates the auto-generated
# OpenAPI docs at /docs.
app = FastAPI(
    title="AI Voice Detection API",
    description="Detects whether a voice sample is AI-generated or Human-spoken.",
    version="1.0.0"
)
| 12 |
+
|
| 13 |
+
# Eagerly construct the singleton ModelService when the server boots so the
# first /detect request does not pay the model download/load latency.
# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favor of the lifespan context manager — consider migrating when upgrading.
@app.on_event("startup")
async def startup_event():
    # Initialize model on startup
    get_model_service()
|
| 17 |
+
|
| 18 |
+
# Expected value of the X-API-Key request header. Prefer the API_KEY
# environment variable; fall back to the original hardcoded value so existing
# clients keep working.
# NOTE(review): a secret committed in source is acceptable only for this demo
# submission — move it to the Space's secret settings for anything real.
API_KEY = os.environ.get("API_KEY", "my_secret_key_123")

async def verify_api_key(x_api_key: str = Header(...)):
    """FastAPI dependency: reject requests whose X-API-Key header is wrong.

    Raises HTTPException(401) on mismatch; returns the key otherwise.
    """
    if x_api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API Key")
    return x_api_key
|
| 24 |
+
|
| 25 |
+
from fastapi import FastAPI, HTTPException, Depends, Header, Request
|
| 26 |
+
|
| 27 |
+
# ... (Previous imports stay, schema can stay unused or updated)
|
| 28 |
+
|
| 29 |
+
@app.post("/detect", response_model=DetectionResult)
async def detect_voice(
    request: Request,
    service: ModelService = Depends(get_model_service),
    api_key: str = Depends(verify_api_key)
):
    """Detect whether a base64-encoded voice sample is AI-generated or human.

    The JSON body is parsed manually so clients may send the audio under any
    of several common key names, or — as a last resort — as any sufficiently
    long string value. Returns a DetectionResult with the normalized label
    and the model's confidence.
    """
    try:
        # 1. Parse JSON body manually to be flexible about the payload shape.
        body = await request.json()
        print(f"DEBUG: Received Body Keys: {list(body.keys())}")

        audio_b64 = None

        # 2. Check well-known key names first, in priority order.
        possible_keys = ["audio_base64", "audio", "data", "file", "encoded_audio", "mp3"]
        for k in possible_keys:
            if k in body and body[k]:
                audio_b64 = body[k]
                print(f"DEBUG: Found audio in key: '{k}'")
                break

        # Fallback: treat any long string value as the payload.
        if not audio_b64:
            for k, v in body.items():
                if isinstance(v, str) and len(v) > 100:
                    audio_b64 = v
                    print(f"DEBUG: Found audio in generic key: '{k}'")
                    break

        if not audio_b64:
            raise HTTPException(status_code=422, detail=f"Could not find audio data. Received keys: {list(body.keys())}")

        # Strip a data-URI prefix (e.g. "data:audio/mp3;base64,...") if present.
        # Split only once so a stray comma later in the string cannot truncate
        # the payload.
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",", 1)[1]

        audio_bytes = base64.b64decode(audio_b64)
    except HTTPException:
        # Bug fix: the 422 raised above was previously caught by the broad
        # handler below and re-raised as a generic 400. Let it propagate.
        raise
    except Exception as e:
        print(f"Error parsing request: {e}")
        raise HTTPException(status_code=400, detail=f"Invalid Request: {str(e)}")

    try:
        label, confidence = service.predict(audio_bytes)
        return DetectionResult(
            label=label,
            confidence=confidence,
            message="Analysis successful"
        )
    except ValueError as ve:
        # ModelService raises ValueError for undecodable/corrupt audio.
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
| 84 |
+
|
| 85 |
+
@app.get("/")
def read_root():
    """Landing/health-check endpoint with a usage hint."""
    hint = "AI Voice Detection API is running. Use /detect endpoint."
    return {"message": hint}
|
model_service.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import io
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
|
| 9 |
+
# Configuration
MODEL_NAME = "Hemgg/Deepfake-audio-detection"  # Using a known fine-tuned model
# Alternative: "mo-thecreator/Deepfake-audio-detection" if the above fails or is private
# But usually public models are fine.

class ModelService:
    """Wraps a Hugging Face audio-classification model for deepfake detection.

    Loads the model and feature extractor once at construction time; callers
    use predict() with raw audio bytes.
    """

    def __init__(self):
        print(f"Loading model: {MODEL_NAME}...")
        # Prefer GPU when available; otherwise run on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
            self.model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(self.device)
            print(f"Model loaded on {self.device}")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Bare `raise` preserves the original traceback (`raise e` resets it).
            raise

    def preprocess_audio(self, audio_bytes):
        """
        Load audio bytes, resample to 16000 Hz (required by Wav2Vec2).

        Returns a mono float waveform array; raises ValueError when the bytes
        cannot be decoded as audio (or decode to an empty signal).
        """
        try:
            # librosa.load accepts file-like objects, resamples to 16 kHz and
            # downmixes to mono (mono=True is the default).
            audio_file = io.BytesIO(audio_bytes)
            speech, sr = librosa.load(audio_file, sr=16000)
        except Exception as e:
            print(f"Error processing audio: {e}")
            # Chain the cause so the underlying decode error is not lost.
            raise ValueError("Invalid audio format or corrupted file.") from e
        if len(speech) == 0:
            # A zero-length waveform would crash the feature extractor later.
            raise ValueError("Invalid audio format or corrupted file.")
        return speech

    @staticmethod
    def _normalize_label(predicted_label):
        """Map a model-specific label string to "HUMAN" / "AI_GENERATED".

        Falls back to the raw label when it matches no known keyword, so
        obscure label sets are surfaced rather than guessed at.
        """
        lower_label = predicted_label.lower()
        if "real" in lower_label or "human" in lower_label or "bonafide" in lower_label:
            return "HUMAN"
        if "fake" in lower_label or "spoof" in lower_label or "ai" in lower_label:
            return "AI_GENERATED"
        return predicted_label

    def predict(self, audio_bytes):
        """Classify raw audio bytes; return (normalized_label, confidence)."""
        speech = self.preprocess_audio(audio_bytes)

        # Extract model input features from the waveform.
        inputs = self.feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        with torch.no_grad():
            logits = self.model(**inputs).logits

        # Convert logits to probabilities.
        probs = F.softmax(logits, dim=-1)

        # The label mapping comes from the model's own config (id2label),
        # e.g. {0: 'real', 1: 'fake'} — do not assume an ordering.
        id2label = self.model.config.id2label
        predicted_id = torch.argmax(probs, dim=-1).item()
        predicted_label = id2label[predicted_id]
        confidence = probs[0][predicted_id].item()

        return self._normalize_label(predicted_label), confidence
|
| 84 |
+
|
| 85 |
+
# Module-level singleton: the model is heavy, so it is loaded at most once
# per process and shared by all requests.
model_service = None

def get_model_service():
    """Return the shared ModelService, constructing it lazily on first call."""
    global model_service
    if model_service is not None:
        return model_service
    model_service = ModelService()
    return model_service
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
torch
|
| 4 |
+
transformers
|
| 5 |
+
librosa
|
| 6 |
+
soundfile
|
| 7 |
+
python-multipart
|
| 8 |
+
numpy
|
| 9 |
+
requests
|
| 10 |
+
gTTS
|
schemas.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic request/response schemas for the voice-detection API."""

from typing import Optional

from pydantic import BaseModel

class AudioInput(BaseModel):
    # Base64-encoded audio payload (optionally a data URI).
    audio_base64: str

class DetectionResult(BaseModel):
    label: str  # "AI_GENERATED" or "HUMAN"
    confidence: float
    # Optional[str] instead of "str | None": the X | Y union syntax is a
    # Python 3.10+ feature and fails at import time on the python:3.9 image
    # the Dockerfile ships with. Optional works on every supported version.
    message: Optional[str] = None
|