File size: 3,900 Bytes
151ed35
 
 
 
 
 
 
 
4e86f46
151ed35
7fb32a3
151ed35
4e86f46
 
151ed35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af0b8e
 
 
 
151ed35
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Standard library
import os
import tempfile
from pathlib import Path

# Third-party
import torch
import torch.nn.functional as F
import torchaudio
import uvicorn
from fastapi import FastAPI, File, UploadFile
from pydantic import BaseModel
from transformers import AutoProcessor, AutoModelForAudioClassification

# Directory containing this file; used to resolve the bundled model path.
app_dir = Path(__file__).parent
# Model setup: weights are expected on disk under Deepfake/model next to
# this script (no hub download — local_files_only below enforces this).
model_path = app_dir / "Deepfake" / "model"

# Feature extractor / tokenizer matching the classification model.
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForAudioClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
    local_files_only=True,
)

def prepare_audio(file_path, sampling_rate=16000, duration=10):
    """
    Load an audio file, downmix to mono, resample, and split into chunks.

    Args:
        file_path: Path to an audio file readable by torchaudio.
        sampling_rate: Target sample rate in Hz (default 16 kHz).
        duration: Chunk length in seconds (default 10 s).

    Returns:
        List of 1-D numpy arrays, each exactly ``sampling_rate * duration``
        samples long; the trailing chunk is zero-padded. Always contains at
        least one chunk, even for a zero-length input.
    """
    waveform, original_sampling_rate = torchaudio.load(file_path)

    # Downmix multi-channel audio to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample only when the file's native rate differs from the target.
    if original_sampling_rate != sampling_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=original_sampling_rate, new_freq=sampling_rate
        )
        waveform = resampler(waveform)

    chunk_size = sampling_rate * duration
    audio_chunks = []

    # max(..., 1) guarantees at least one (fully zero-padded) chunk for a
    # zero-length file, so downstream aggregation never divides by zero.
    for start in range(0, max(waveform.shape[1], 1), chunk_size):
        chunk = waveform[:, start:start + chunk_size]

        # Zero-pad a short trailing chunk to the full chunk length.
        # Use the already-imported F alias rather than the fully-qualified
        # torch.nn.functional path, for consistency with the rest of the file.
        if chunk.shape[1] < chunk_size:
            chunk = F.pad(chunk, (0, chunk_size - chunk.shape[1]))

        audio_chunks.append(chunk.squeeze().numpy())

    return audio_chunks

def predict_audio(file_path):
    """
    Classify an audio file by running the model over fixed-length chunks
    and aggregating the per-chunk predictions.

    Args:
        file_path: Path to the audio file to classify.

    Returns:
        Dict with ``"predicted_label"`` (majority-vote class name) and
        ``"average_confidence"`` (mean softmax confidence across chunks).

    Raises:
        ValueError: If no audio chunks could be produced from the file.
    """
    audio_chunks = prepare_audio(file_path)
    # Explicit guard: without it an empty chunk list would surface as an
    # opaque ZeroDivisionError in the average below.
    if not audio_chunks:
        raise ValueError(f"No audio data could be extracted from {file_path}")

    predictions = []
    confidences = []

    for chunk in audio_chunks:
        inputs = processor(
            chunk, sampling_rate=16000, return_tensors="pt", padding=True
        )

        # Inference only — no gradient tracking needed.
        with torch.no_grad():
            logits = model(**inputs).logits
            # Softmax turns logits into per-class probabilities.
            probabilities = F.softmax(logits, dim=1)
            confidence, predicted_class = torch.max(probabilities, dim=1)

        predictions.append(predicted_class.item())
        confidences.append(confidence.item())

    # Majority vote across chunks; ties resolve arbitrarily via max().
    aggregated_prediction_id = max(set(predictions), key=predictions.count)
    predicted_label = model.config.id2label[aggregated_prediction_id]

    average_confidence = sum(confidences) / len(confidences)

    return {
        "predicted_label": predicted_label,
        "average_confidence": average_confidence,
    }

# Initialize FastAPI application; route handlers below attach to this object.
app = FastAPI()

@app.post("/infer")
async def infer(file: UploadFile = File(...)):
    """
    Accept an uploaded audio file and return the predicted label and
    average confidence.

    The upload is spooled to a real temporary file because torchaudio
    loads from a filesystem path; the file is always removed afterwards.
    """
    # SECURITY: the original f"temp_{file.filename}" pattern trusted the
    # client-supplied filename — a name like "../../x.wav" escapes the
    # working directory, and concurrent requests with the same name
    # collide. mkstemp creates a unique, safe path instead. The original
    # suffix is kept so torchaudio can infer the container format.
    suffix = Path(file.filename or "upload").suffix
    fd, temp_file_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as temp_file:
            temp_file.write(await file.read())
        # Perform inference on the spooled file.
        predictions = predict_audio(temp_file_path)
    finally:
        # Clean up the temporary file even if inference raises.
        os.remove(temp_file_path)

    return predictions

@app.get("/health")
async def health():
    """Liveness probe: reports status and the torchaudio backends available."""
    backends = torchaudio.list_audio_backends()
    return {
        "message": "ok",
        "Sound": str(backends),
    }