"""Gradio app that classifies an audio clip as real or AI-generated."""

import os

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
)

# Indexing (rather than .get) makes a missing HF_TOKEN fail at startup with a
# KeyError naming the variable, instead of a later authentication error.
TOKEN = os.environ['HF_TOKEN']
MODEL_ID = "aitf-komdigi/KomdigiITS-86M-DFK-DeepfakeAudioClassification"

# Every clip is resampled to this rate, and the same value is declared to the
# feature extractor so the two can never drift apart.
TARGET_SAMPLE_RATE = 16000

# Returned to the Label output when there is nothing to classify.
NO_AUDIO_MESSAGE = "No audio uploaded"

feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_ID,
    token=TOKEN,
)
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_ID,
    token=TOKEN,
)
model.eval()


def _to_mono_float32(waveform):
    """Return *waveform* as a 1-D float32 array.

    Integer PCM samples are divided by the full-scale value of their dtype so
    the result lies in [-1.0, 1.0]; floating-point input is only cast.
    Arrays with more than one dimension are averaged over axis 1.
    """
    samples = np.asarray(waveform)
    if np.issubdtype(samples.dtype, np.integer):
        # A plain astype() would keep raw PCM magnitudes (e.g. +/-32768 for
        # int16) rather than the unit-range floats audio models expect.
        limits = np.iinfo(samples.dtype)
        full_scale = float(max(abs(int(limits.min)), int(limits.max)))
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # NOTE(review): assumes a (samples, channels) layout, as Gradio's
        # numpy audio is documented to use -- confirm for other callers.
        samples = samples.mean(axis=1)
    return samples


def predict(audio):
    """Score an audio clip as real or fake.

    Args:
        audio: ``None`` when nothing was provided, otherwise a
            ``(sample_rate, samples)`` pair where ``samples`` is a NumPy
            array.

    Returns:
        The string ``"No audio uploaded"`` when ``audio`` is ``None`` or
        holds no samples; otherwise a dict with keys ``"Real"`` and
        ``"Fake"`` mapping to scores rounded to four decimals.
    """
    if audio is None:
        return NO_AUDIO_MESSAGE

    sr, waveform = audio
    waveform = _to_mono_float32(waveform)
    if waveform.size == 0:
        # An empty recording carries nothing to resample or classify.
        return NO_AUDIO_MESSAGE

    waveform = librosa.resample(
        waveform,
        orig_sr=sr,
        target_sr=TARGET_SAMPLE_RATE,
    )
    inputs = feature_extractor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # .item() only succeeds on a one-element tensor, so this relies on the
    # model emitting a single logit, read here as the "fake" score.
    # NOTE(review): confirm against the checkpoint's classification head.
    score = torch.sigmoid(logits).item()

    return {
        "Real": round(1.0 - score, 4),
        "Fake": round(score, 4),
    }


demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
    ),
    outputs=gr.Label(),
    title="Audio Deepfake Detection",
    description="Detect whether an audio clip is real or AI-generated.",
)

if __name__ == "__main__":
    # Guarded so importing this module (tests, reload tooling) does not start
    # a server; running the file as a script behaves as before.
    demo.launch()