File size: 1,443 Bytes
2bfde2c
db83cda
 
 
 
 
 
 
 
 
2bfde2c
db83cda
 
 
2bfde2c
 
db83cda
 
 
2bfde2c
 
db83cda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import torch
import librosa
import numpy as np
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
)
import gradio as gr

# Hub repository holding both the feature extractor and the classifier weights.
MODEL_ID = "aitf-komdigi/KomdigiITS-86M-DFK-DeepfakeAudioClassification"

# Access token read from the environment; a missing HF_TOKEN raises KeyError
# at import time, before anything is downloaded.
TOKEN = os.environ["HF_TOKEN"]

# Both objects are loaded once at start-up and reused by every request.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID, token=TOKEN)
model = AutoModelForAudioClassification.from_pretrained(MODEL_ID, token=TOKEN)

# Inference only: switch the module to evaluation mode.
model.eval()


def predict(audio):
    """Score an audio clip as real or AI-generated.

    Args:
        audio: ``None`` when the user supplied nothing, otherwise a
            ``(sample_rate, samples)`` pair as delivered by the
            ``gr.Audio(type="numpy")`` input. Samples with more than one
            dimension are averaged over axis 1 (assumed to be the channel
            axis, as Gradio documents).

    Returns:
        The string ``"No audio uploaded"`` when there is nothing to score,
        otherwise a dict with the keys ``"Real"`` and ``"Fake"`` mapped to
        probabilities rounded to four decimals (they sum to 1 before
        rounding).
    """
    if audio is None:
        return "No audio uploaded"

    target_sr = 16000  # rate passed to the feature extractor below

    sr, waveform = audio
    waveform = np.asarray(waveform)

    # A zero-length recording has nothing to resample or classify.
    if waveform.size == 0:
        return "No audio uploaded"

    if np.issubdtype(waveform.dtype, np.signedinteger):
        # Gradio hands over integer PCM (typically int16). A bare cast to
        # float32 would keep amplitudes around +/-32768, so rescale by the
        # dtype's full-scale value to land in [-1.0, 1.0).
        full_scale = -float(np.iinfo(waveform.dtype).min)
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Down-mix multi-channel audio to mono.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Skip the resampler entirely when the clip is already at the target rate.
    if sr != target_sr:
        waveform = librosa.resample(
            waveform,
            orig_sr=sr,
            target_sr=target_sr,
        )

    inputs = feature_extractor(
        waveform,
        sampling_rate=target_sr,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # NOTE(review): assumes the classifier emits a single logit whose sigmoid
    # is the "Fake" probability; ``.item()`` raises if there are several.
    score = torch.sigmoid(logits).item()

    return {
        "Real": round(1.0 - score, 4),
        "Fake": round(score, 4),
    }


# Web UI: one audio input (file upload or microphone) delivered to `predict`
# as a numpy pair, with the result rendered by a Label component.
demo = gr.Interface(
    title="Audio Deepfake Detection",
    description="Detect whether an audio clip is real or AI-generated.",
    fn=predict,
    inputs=gr.Audio(type="numpy", sources=["upload", "microphone"]),
    outputs=gr.Label(),
)

# Start serving as soon as the module is executed.
demo.launch()