File size: 4,487 Bytes
384f8dc
d40d6d4
722f84e
d40d6d4
f9f0314
85bca98
8773f6c
722f84e
e07795f
 
 
c4365d2
d40d6d4
e07795f
 
 
b30e39a
 
 
f9f0314
d40d6d4
e07795f
8773f6c
 
1f06dd4
d40d6d4
 
722f84e
d40d6d4
 
0b71a97
 
d40d6d4
8773f6c
 
1f06dd4
d40d6d4
 
e07795f
384f8dc
 
 
 
 
 
 
e07795f
 
722f84e
 
 
 
 
e75c5f3
7dfa0e2
e07795f
722f84e
f093f7f
722f84e
3f67209
722f84e
a03cd02
 
 
b30e39a
 
a03cd02
722f84e
 
 
85bca98
 
 
 
 
 
 
 
 
 
 
 
 
587e152
 
85bca98
 
 
 
 
 
 
d5d3072
85bca98
 
 
 
 
 
 
 
 
 
 
722f84e
 
1e4d7e8
ba6b417
e07795f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json
import torch
import gradio as gr
import models as MOD
import process_data as PD
from transformers import pipeline
from huggingface_hub import hf_hub_download

# Catalog of selectable spoof-detection models, keyed by UI display name.
# Each entry records: the EER decision threshold, the name of the
# preprocessing function in process_data, a user-facing note, the model
# class name in models, and the checkpoint filename on the Hugging Face Hub.
_ssl_aasist_config = {
    "eer_threshold": 3.3330237865448,
    "data_process_func": "process_ssl_assist_input",
    "note": "This model is trained only on ASVSpoof 2024 training data.",
    "model_class": "Model",
    "model_checkpoint": "ssl_aasist_epoch_7.pth",
}
_aasist_config = {
    "eer_threshold": 1.8018419742584229,
    "data_process_func": "process_assist_input",
    "note": "This model is trained on ASVSpoof 2024 training data.",
    "model_class": "AASIST_Model",
    "model_checkpoint": "orig_aasist_epoch_1.pth",
}
model_master = {
    "SSL-AASIST (Trained on ASV-Spoof5)": _ssl_aasist_config,
    "AASIST": _aasist_config,
}

# Eagerly load the default model (SSL-AASIST) on CPU at import time so the
# first request does not pay the download/initialization cost.
# `model` and `loaded_model` are module-level globals mutated by process()
# when the user selects a different model.
model = MOD.Model(None, "cpu")
# Fetch the checkpoint from the Hugging Face Hub (cached locally after the
# first download).
base_model_file = hf_hub_download("arnabdas8901/aasist-trained-asvspoof2024", filename="ssl_aasist_epoch_7.pth")
model.load_state_dict(torch.load(base_model_file, map_location="cpu"))
model.eval()
# Tracks which entry of model_master is currently resident in `model`.
loaded_model = "SSL-AASIST (Trained on ASV-Spoof5)"

def process(file, type):
    """Score an audio file with the selected anti-spoofing model.

    Parameters
    ----------
    file : str
        Filesystem path of the uploaded audio clip (``gr.Audio`` with
        ``type="filepath"``).
    type : str
        Display name of the model to use; must be a key of ``model_master``.
        (Parameter name kept as-is for backward compatibility even though it
        shadows the ``type`` builtin.)

    Returns
    -------
    str
        A JSON-formatted string with the decision score, the model's EER
        threshold, and an interpretation note.
    """
    global model
    global loaded_model
    # Preprocess with the model-specific routine registered in model_master.
    inp = getattr(PD, model_master[type]["data_process_func"])(file)
    # Lazily swap models: only rebuild and reload weights when the requested
    # model differs from the one currently resident.
    if loaded_model != type:
        model = getattr(MOD, model_master[type]["model_class"])(None, "cpu")
        model_file = hf_hub_download("arnabdas8901/aasist-trained-asvspoof2024",
                                     filename=model_master[type]["model_checkpoint"])
        model.load_state_dict(torch.load(model_file, map_location="cpu"))
        model.eval()
        loaded_model = type

    # Run inference without gradient tracking; index 1 is taken as the
    # bona-fide score (assumed from the two-class output — TODO confirm
    # against the training code).
    with torch.no_grad():
        op = model(inp).squeeze()[1].item()
    output_json = {
        "decision_score": str(op),
        "model_threshold": str(model_master[type]["eer_threshold"]),
        "optional_note": "1. Any score below threshold is indicative of fake. \n2. {}".format(model_master[type]["note"]),
    }
    return json.dumps(output_json, indent=4)


demo = gr.Blocks()

# Tab 1: audio deepfake analysis. Components are built up-front and the
# example grid is generated per model to keep the Interface call compact.
_audio_in = gr.Audio(sources=["upload"], label="Audio file", type="filepath")
_model_choice = gr.Radio(
    ["SSL-AASIST (Trained on ASV-Spoof5)", "AASIST"],
    label="Select Model",
    type="value",
)
_example_rows = [
    [clip, model_name]
    for model_name in ("SSL-AASIST (Trained on ASV-Spoof5)", "AASIST")
    for clip in ("./bonafide.flac", "./fake.flac")
]
file_proc = gr.Interface(
    fn=process,
    inputs=[_audio_in, _model_choice],
    outputs="text",
    title="Find the Fake: Analyze 'Real' or 'Fake'.",
    description="Analyze fake or real with a click of a button. Upload a .wav or .flac file.",
    examples=_example_rows,
    cache_examples=True,
    allow_flagging="never",
)
#####################################################################################
# For ASR interface
# Whisper Large V3 ASR pipeline for the transcription tab; runs on CPU and
# chunks long inputs into 30-second windows.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device="cpu",
)

def transcribe(inputs):
    """Identify the spoken language and transcribe an audio file with Whisper.

    ``inputs`` is the audio file path from the Gradio component (or None when
    nothing was submitted). Returns a ``(language, transcription)`` pair.
    """
    # Guard clause: Gradio passes None when no recording/upload was provided.
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=8,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=False,
        return_language=True,
    )
    # The detected language is reported on the first chunk; the full
    # transcript lives at the top level of the pipeline output.
    return result["chunks"][0]["language"], result["text"]

# Tab 2: language identification + transcription backed by Whisper Large V3.
transcribe_proc = gr.Interface(
    fn = transcribe,
    inputs = [
        gr.Audio(type="filepath", label="Speech file (<30s)", max_length=30, sources=["microphone", "upload"], show_download_button=True)
    ],
    outputs=[
        gr.Text(label="Predicted Language", info="Language identification is performed automatically."),
        gr.Text(label="Predicted transcription", info="Best hypothesis."),
    ],
    title="Transcribe Anything.",
    description=(
        # Fixed typo: "Automatactic" -> "Automatic".
        "Automatic language identification and transcription service by Whisper Large V3. Upload a .wav or .flac file."
    ),
    allow_flagging="never"
)

# Assemble both interfaces into a tabbed layout inside the Blocks container,
# then serve with a bounded request queue and a public share link.
with demo:
    gr.TabbedInterface([file_proc, transcribe_proc], ["Analyze Audio File", "Transcribe Audio File"])
demo.queue(max_size=10)
demo.launch(share=True)