| import os |
| import torch |
| import librosa |
| import numpy as np |
| from transformers import ( |
| AutoFeatureExtractor, |
| AutoModelForAudioClassification, |
| ) |
| import gradio as gr |
|
|
# Hugging Face access token, read from the environment at import time;
# a missing HF_TOKEN variable raises KeyError right here.
TOKEN = os.environ["HF_TOKEN"]
MODEL_ID = "aitf-komdigi/KomdigiITS-86M-DFK-DeepfakeAudioClassification"

# The preprocessor and the classifier are both loaded from the same checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID, token=TOKEN)
model = AutoModelForAudioClassification.from_pretrained(MODEL_ID, token=TOKEN)

# Put the network in evaluation mode before it serves any prediction.
model.eval()
|
|
|
|
def predict(audio):
    """Score an audio clip as real or fake.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            provided. Multi-dimensional ``samples`` are assumed to be laid
            out as ``(frames, channels)``.

    Returns:
        A message string when there is no audio to score; otherwise a dict
        mapping ``"Real"`` and ``"Fake"`` to probabilities rounded to four
        decimal places.
    """
    if audio is None:
        return "No audio uploaded"

    # Single source of truth for the rate used by both the resampler and
    # the feature extractor call below.
    target_sr = 16000

    sr, waveform = audio
    waveform = np.asarray(waveform)

    # A zero-length recording cannot be resampled or classified.
    if waveform.size == 0:
        return "Audio clip is empty"

    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM (e.g. int16) must be rescaled to roughly [-1, 1];
        # a bare float cast would leave amplitudes in the tens of thousands.
        full_scale = float(np.iinfo(waveform.dtype).max)
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Down-mix multi-channel audio to mono by averaging the channel axis.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    if sr != target_sr:
        waveform = librosa.resample(
            waveform,
            orig_sr=sr,
            target_sr=target_sr,
        )

    inputs = feature_extractor(
        waveform,
        sampling_rate=target_sr,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # NOTE(review): .item() requires exactly one logit; this presumes the
    # checkpoint has a single-output head where sigmoid gives P(fake).
    # Confirm against the model config.
    score = torch.sigmoid(logits).item()

    return {
        "Real": round(1.0 - score, 4),
        "Fake": round(score, 4),
    }
|
|
|
|
# Input widget: accepts an uploaded file or a microphone recording and hands
# the clip to predict() in NumPy form.
audio_input = gr.Audio(
    type="numpy",
    sources=["upload", "microphone"],
)

# Output widget that displays whatever predict() returns.
label_output = gr.Label()

demo = gr.Interface(
    fn=predict,
    inputs=audio_input,
    outputs=label_output,
    title="Audio Deepfake Detection",
    description="Detect whether an audio clip is real or AI-generated.",
)

# Start the web UI as soon as the module is executed.
demo.launch()
|
|