Upload 5 files

Browse files

some files updated

Files changed (5) hide show

RawNet_model.onnx +3 -0
app.py +114 -0
best_model.pth +3 -0
data_utils.py +94 -0
inference_onnx.py +72 -0

RawNet_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64e9e09f132ecb8d4a4fc60ec29fab2a35e3b4cd8605e5489ba3a5d085d143e2
+size 70911020

app.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+import torch
+import librosa
+import numpy as np
+from fastapi import FastAPI, File, UploadFile
+from model import RawNet
+from data_utils import pad  # Import the pad function from data_utils.py
+import yaml
+import torch.nn.functional as F  # For softmax
+from fastapi.responses import JSONResponse
+from tempfile import NamedTemporaryFile
+import uvicorn
+import webbrowser
+# Initialize FastAPI app
+app = FastAPI()
+# Load the model
+model_config_path = 'C:\\\\Users\\\\GOOD\\\\Desktop\\\\TEST-2024\\\\2021\\\\LA\\\\Baseline-RawNet2\\\\model_config_RawNet.yaml'
+model_path = 'C:\\\\Users\\\\GOOD\\\\Desktop\\\\TEST-2024\\\\2021\\\\LA\\\\Baseline-RawNet2\\\\checkpoints\\\\best_model.pth'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+with open(model_config_path, 'r') as f:
+    model_config = yaml.safe_load(f)
+model = RawNet(model_config['model'], device).to(device)
+model.load_state_dict(torch.load(model_path, map_location=device))
+model.eval()
+def preprocess_audio_segment(segment, cut=64600):
+    """
+    Preprocess a single audio segment: pad or trim as required.
+    """
+    if len(segment) < cut:
+        segment = pad(segment, max_len=cut)  # Pad if shorter
+    else:
+        segment = segment[:cut]  # Trim if longer
+    return torch.tensor(segment, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
+def predict_with_sliding_window(waveform, model, device, window_size=64600, step_size=64600, sample_rate=16000):
+    """
+    Use a sliding window to predict if the audio is real or fake over the entire audio.
+    """
+    total_segments = []
+    total_probabilities = []
+    # Sliding window processing
+    for start in range(0, len(waveform), step_size):
+        end = start + window_size
+        segment = waveform[start:end]
+        # Preprocess the segment
+        audio_tensor = preprocess_audio_segment(segment).to(device)
+        # Perform inference
+        with torch.no_grad():
+            output = model(audio_tensor)
+            probabilities = F.softmax(output, dim=1)  # Compute probabilities
+            prediction = torch.argmax(probabilities, dim=1)
+        # Store the results
+        predicted_class = "Human voice" if prediction.item() == 1 else "AI generated voice (TTS)"
+        probability = probabilities[0, prediction.item()].item() * 100
+        total_segments.append(predicted_class)
+        total_probabilities.append(probability)
+    # Final aggregation
+    majority_class = max(set(total_segments), key=total_segments.count)  # Majority voting
+    avg_probability = np.mean(total_probabilities)  # Average probability
+    return majority_class, avg_probability
+@app.post("/predict")
+async def predict_audio(file: UploadFile = File(...)):
+    """
+    Endpoint to process audio and predict using the RawNet model.
+    """
+    try:
+        # Save uploaded file to a temporary file
+        with NamedTemporaryFile(delete=False) as temp_file:
+            temp_file.write(await file.read())
+            temp_filename = temp_file.name
+        # Load audio file
+        waveform, _ = librosa.load(temp_filename, sr=16000)
+        # Perform prediction
+        result, avg_probability = predict_with_sliding_window(waveform, model, device)
+        # Clean up temporary file
+        os.remove(temp_filename)
+        return JSONResponse({
+            "Your audio": result,
+            "average_probability": f"{avg_probability:.2f}%"
+        })
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)
+@app.get("/")
+async def root():
+    return {"message": "RawNet Sliding Window Prediction API"}
+# Automatically open docs or print URL when server starts
+if __name__ == "__main__":
+    url = "http://127.0.0.1:8000/docs"
+    print(f"API docs available at: {url}")
+    webbrowser.open(url)  # Open in the default browser
+    uvicorn.run(app, host="127.0.0.1", port=8000)

best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:940acc620962f2ce0e2b1f91c3c514bc9128240b5800612205aaead7b78c1c64
+size 70532085

data_utils.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import os
+import numpy as np
+import torch
+from torch import Tensor
+import librosa
+from torch.utils.data import Dataset
+# Audioni padding qilish
+def pad(x, max_len=64600):
+    x_len = x.shape[0]
+    if x_len >= max_len:
+        return x[:max_len]
+    # Padding kerak
+    num_repeats = (max_len // x_len) + 1
+    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
+    return padded_x
+def genSpoof_list(dir_meta, is_train=False, is_eval=False):
+    d_meta = {}
+    file_list = []
+    with open(dir_meta, 'r') as f:
+        l_meta = f.readlines()
+    if is_train:
+        for line in l_meta:
+            _, key, _, _, label = line.strip().split(' ')
+            file_list.append(key)
+            d_meta[key] = 1 if label == 'bonafide' else 0
+        return d_meta, file_list
+    elif is_eval:
+        for line in l_meta:
+            key = line.strip()
+            file_list.append(key)
+        return file_list
+    else:
+        for line in l_meta:
+            _, key, _, _, label = line.strip().split(' ')
+            file_list.append(key)
+            d_meta[key] = 1 if label == 'bonafide' else 0
+        return d_meta, file_list
+class Dataset_ASVspoof2019_train(Dataset):
+    def __init__(self, list_IDs, labels, base_dir, cut=64600):
+        """
+        Args:
+            list_IDs: Utts kalitlari ro'yxati (string).
+            labels: Kalitlar va tegishli yorliqlar lug'ati.
+            base_dir: Ma'lumotlar joylashgan katalog (flac katalogsiz).
+            cut: Maksimal uzunlik (standart: 64600).
+        """
+        self.list_IDs = list_IDs
+        self.labels = labels
+        self.base_dir = base_dir
+        self.cut = cut
+    def __len__(self):
+        return len(self.list_IDs)
+    def __getitem__(self, index):
+        key = self.list_IDs[index]
+        file_path = os.path.join(self.base_dir, f"{key}.flac")  # flac ni qayta qo‘shmang
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+        X, fs = librosa.load(file_path, sr=16000)
+        X_pad = pad(X, self.cut)
+        x_inp = Tensor(X_pad)
+        y = self.labels[key]
+        return x_inp, y
+# ASVspoof2021 baholash ma'lumotlar to'plami uchun Dataset sinfi
+class Dataset_ASVspoof2021_eval(Dataset):
+    def __init__(self, list_IDs, base_dir, cut=64600):
+        self.list_IDs = [x.replace(' ', '_') for x in list_IDs]  # Bo'sh joylarni almashtirish
+        self.base_dir = base_dir
+        self.cut = cut
+    def __len__(self):
+        return len(self.list_IDs)
+    def __getitem__(self, index):
+        key = self.list_IDs[index]
+        file_path = os.path.join(self.base_dir, f"{key}.flac")
+        if not os.path.exists(file_path):
+            print(f"Checking file: {file_path}, Exists: {os.path.exists(file_path)}")  # Fayl mavjudligini tekshirish
+            raise FileNotFoundError(f"File not found: {file_path}")
+        X, fs = librosa.load(file_path, sr=16000)
+        X_pad = pad(X, self.cut)
+        x_inp = Tensor(X_pad)
+        return x_inp, key

inference_onnx.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import librosa
+import numpy as np
+import onnxruntime as ort
+import torch
+from data_utils import pad  # Import the pad function from data_utils.py
+# Preprocess audio for a single segment
+def preprocess_audio_segment(segment, cut=64600):
+    """
+    Preprocess a single audio segment: pad or trim as required.
+    """
+    if len(segment) < cut:
+        segment = pad(segment, max_len=cut)  # Pad if shorter
+    else:
+        segment = segment[:cut]  # Trim if longer
+    return np.expand_dims(np.array(segment, dtype=np.float32), axis=0)  # Add batch dimension
+# Perform sliding window prediction
+def predict_with_sliding_window(audio_path, onnx_model_path, window_size=64600, step_size=64600, sample_rate=16000):
+    """
+    Use a sliding window to predict if the audio is real or fake over the entire audio.
+    """
+    # Load the ONNX runtime session
+    ort_session = ort.InferenceSession(onnx_model_path)
+    # Load the audio file
+    waveform, _ = librosa.load(audio_path, sr=sample_rate)
+    total_segments = []
+    total_probabilities = []
+    # Sliding window processing
+    for start in range(0, len(waveform), step_size):
+        end = start + window_size
+        segment = waveform[start:end]
+        # Preprocess the segment
+        audio_tensor = preprocess_audio_segment(segment)
+        # Perform inference
+        inputs = {ort_session.get_inputs()[0].name: audio_tensor}
+        outputs = ort_session.run(None, inputs)
+        probabilities = torch.tensor(outputs[0])  # Convert to torch tensor for processing
+        probabilities = torch.nn.functional.softmax(probabilities, dim=1)  # Compute probabilities
+        prediction = torch.argmax(probabilities, dim=1)
+        # Store the results
+        predicted_class = "Real" if prediction.item() == 1 else "Fake"
+        probability = probabilities[0, prediction.item()].item() * 100
+        total_segments.append(predicted_class)
+        total_probabilities.append(probability)
+        print(f"Segment {start//step_size + 1}: {predicted_class}, Probability: {probability:.2f}%")
+    # Final aggregation
+    majority_class = max(set(total_segments), key=total_segments.count)  # Majority voting
+    avg_probability = np.mean(total_probabilities)  # Average probability
+    return majority_class, avg_probability
+# Main script for inference
+if __name__ == "__main__":
+    # Path to the ONNX model
+    onnx_model_path = 'C:\\Users\\GOOD\\Desktop\\TEST-2024\\2021\\LA\\Baseline-RawNet2\\checkpoints\\RawNet_model.onnx'
+    # Specify the path to the audio file
+    audio_path = "C:\\Users\\GOOD\\Desktop\\TEST-2024\\2021\\LA\\Baseline-RawNet2\\audio\\KTA.mp3"  # Example .mp3 file
+    # Perform sliding window prediction
+    result, avg_probability = predict_with_sliding_window(audio_path, onnx_model_path)
+    print(f"Final Result: {result}, Average Probability: {avg_probability:.2f}%")