"""Hybrid voice scam detection demo.

Pipeline: an audio clip is transcribed with Whisper, the transcript is scored
by a keyword matcher and a TF-IDF + logistic-regression spam classifier, and
(when a checkpoint is available) the raw audio is scored by a small CNN over
MFCC features. The three scores are blended into one probability that is
shown in a Gradio UI.

Everything below runs at import time: the text classifier is trained, the
models are loaded, and the UI is launched.
"""

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import whisper
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

device = torch.device("cpu")

# ================= CONSTANTS =================
CNN_WEIGHTS_PATH = "scam_audio_model.pth"
SAMPLE_RATE = 16000
N_MFCC = 40

# Blend weights for the final score.
KEYWORD_WEIGHT = 0.2
ML_WEIGHT = 0.5
CNN_WEIGHT = 0.3

# Final-score thresholds separating low / medium / high risk.
MEDIUM_RISK_THRESHOLD = 0.40
HIGH_RISK_THRESHOLD = 0.65

# Textual label spellings mapped onto the 0 (ham) / 1 (spam) convention.
_LABEL_NAMES = {"ham": 0, "spam": 1}

# ================= LOAD DATASET =================
data1 = pd.read_csv("spam_dataset.csv")

data2 = load_dataset("ucirvine/sms_spam")
data2 = data2["train"].to_pandas()
data2 = data2.rename(columns={"sms": "text"})

data = pd.concat([data1, data2], ignore_index=True)


# ================= FIX LABELS =================
def _normalize_label(value):
    """Coerce one raw label to an int, accepting 'ham'/'spam' spellings.

    Numeric values (and numeric strings) are converted with ``int``; the
    strings ``"ham"`` and ``"spam"`` (any case, surrounding whitespace
    ignored) map to 0 and 1.

    Raises:
        ValueError: if a string label is neither a known name nor an integer.
    """
    if isinstance(value, str):
        cleaned = value.strip().lower()
        if cleaned in _LABEL_NAMES:
            return _LABEL_NAMES[cleaned]
        return int(cleaned)
    return int(value)


# Rows without text or label cannot be vectorised or cast to int, so drop
# them up front (and say so) instead of crashing later in training.
_rows_before = len(data)
data = data.dropna(subset=["text", "label"]).copy()
_rows_dropped = _rows_before - len(data)
if _rows_dropped:
    print(f"Dropped {_rows_dropped} dataset rows with missing text/label")

# Ensure labels are 0 (ham) and 1 (spam)
data["label"] = data["label"].map(_normalize_label).astype(int)
data["text"] = data["text"].astype(str)

# ================= BALANCE DATASET =================
# Down-sample the larger class so both classes have the same number of rows.
ham = data[data["label"] == 0]
spam = data[data["label"] == 1]

min_size = min(len(ham), len(spam))

ham_bal = resample(ham, replace=False, n_samples=min_size, random_state=42)
spam_bal = resample(spam, replace=False, n_samples=min_size, random_state=42)

data = pd.concat([ham_bal, spam_bal])

texts = data["text"]
labels = data["label"]

# ================= ML TRAINING =================
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)

ml_model = LogisticRegression(max_iter=200)
ml_model.fit(X, labels)


# ================= CNN MODEL =================
class ScamAudioCNN(nn.Module):
    """Two-layer convolutional classifier over a 1-channel MFCC map.

    Produces two logits per sample; ``analyze_audio`` reads index 1 as the
    scam class.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        # 32 channels x 10 x 25: a 40 x 100 input (see extract_features)
        # halved twice by the 2x2 pooling.
        self.fc1 = nn.Linear(32 * 10 * 25, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of inputs."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


cnn_model = ScamAudioCNN().to(device)

# ================= LOAD CNN =================
# Loading is best-effort: without a usable checkpoint the app still runs on
# the keyword and ML scores. Failures are reported with their real cause
# rather than being swallowed by a bare ``except``.
try:
    cnn_model.load_state_dict(torch.load(CNN_WEIGHTS_PATH, map_location=device))
except FileNotFoundError:
    cnn_loaded = False
    print("⚠️ CNN model not found, skipping CNN contribution")
except Exception as exc:  # e.g. corrupt file or mismatched state dict
    cnn_loaded = False
    print(
        "⚠️ CNN model could not be loaded "
        f"({type(exc).__name__}: {exc}), skipping CNN contribution"
    )
else:
    cnn_model.eval()
    cnn_loaded = True

# ================= WHISPER =================
whisper_model = whisper.load_model("tiny", device="cpu")


# ================= MFCC =================
def extract_features(file_path, max_len=100):
    """Load an audio file and return its MFCCs as a CNN-ready tensor.

    Args:
        file_path: path to an audio file readable by ``librosa.load``.
        max_len: number of MFCC frames to keep; shorter clips are
            zero-padded on the right, longer clips are truncated.

    Returns:
        A float32 tensor of shape ``(1, 1, N_MFCC, max_len)``.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - n_frames)))
    else:
        mfcc = mfcc[:, :max_len]

    # Add batch and channel axes.
    mfcc = mfcc[np.newaxis, np.newaxis, :, :]
    return torch.tensor(mfcc, dtype=torch.float32)


# ================= TRANSCRIPTION =================
def transcribe_audio(file_path):
    """Transcribe an audio file with Whisper and return lower-cased text."""
    # The model lives on the CPU, where Whisper falls back to FP32 anyway;
    # asking for it explicitly avoids the per-call FP16 warning.
    result = whisper_model.transcribe(file_path, fp16=False)
    return result["text"].lower()


# ================= KEYWORDS =================
scam_keywords = [
    "otp", "bank", "account", "verify", "urgent", "blocked", "suspend",
    "credit card", "loan", "refund", "investment", "crypto", "kyc",
    "password", "security", "congratulations", "won", "winner", "prize",
    "claim", "fee", "pay", "offer", "lottery", "jackpot", "gift", "free",
]


def keyword_score(text):
    """Score a transcript by how many scam keywords it contains.

    Args:
        text: transcript to scan (callers pass lower-cased text).

    Returns:
        ``(score, found)`` where ``found`` lists the matched keywords and
        ``score`` is ``len(found) / 3`` capped at 1.0 (0 when nothing matched).
    """
    # NOTE(review): this is a substring test, so "won" also matches
    # "wonderful" and "fee" matches "feel". Left as-is to keep scores stable.
    found = [word for word in scam_keywords if word in text]
    score = min(len(found) / 3, 1.0) if found else 0
    return score, found


# ================= ML PREDICTION =================
def ml_predict(text):
    """Return the text classifier's spam probability for one string."""
    features = vectorizer.transform([text])
    # Column 1 is the probability of label 1 (spam).
    return float(ml_model.predict_proba(features)[0][1])


# ================= MAIN =================
def _risk_level(final_score):
    """Map a blended score to its ``(risk, result)`` display labels."""
    if final_score < MEDIUM_RISK_THRESHOLD:
        return "Low Risk", "NOT SPAM"
    if final_score < HIGH_RISK_THRESHOLD:
        return "Medium Risk", "SPAM"
    return "High Scam Risk", "SPAM"


def analyze_audio(audio):
    """Run the full detection pipeline on one clip and format a report.

    Args:
        audio: path to the recorded/uploaded audio file, or ``None`` when the
            UI supplied nothing.

    Returns:
        A multi-line report string, ``"No audio detected."`` when ``audio``
        is ``None``, or ``"Error: <message>"`` if any stage raised.
    """
    if audio is None:
        return "No audio detected."

    try:
        # TRANSCRIBE
        transcript = transcribe_audio(audio)

        # KEYWORD
        k_score, words = keyword_score(transcript)

        # ML
        ml_score = ml_predict(transcript)

        # CNN (optional)
        cnn_score = 0
        if cnn_loaded:
            features = extract_features(audio).to(device)
            with torch.no_grad():
                logits = cnn_model(features)
                probs = torch.softmax(logits, dim=1)
                cnn_score = probs[0][1].item()

        # DEBUG PRINTS
        print("Transcript:", transcript)
        print("Keyword Score:", k_score)
        print("ML Score:", ml_score)
        print("CNN Score:", cnn_score)

        # FINAL SCORE (balanced weights)
        # NOTE(review): when the CNN is not loaded its score stays 0 but is
        # still weighted, so the final score cannot exceed 0.70. Confirm
        # whether the weights should be renormalised in that case.
        final_score = (
            (KEYWORD_WEIGHT * k_score)
            + (ML_WEIGHT * ml_score)
            + (CNN_WEIGHT * cnn_score)
        )

        risk, result = _risk_level(final_score)

        found_words = ", ".join(words) if words else "None"
        report_lines = [
            "",
            "Transcript:",
            f"{transcript}",
            "",
            "Spam Words Found:",
            found_words,
            "",
            "Scores:",
            f"Keyword: {k_score:.2f}",
            f"ML: {ml_score:.2f}",
            f"CNN: {cnn_score:.2f}",
            "",
            f"Final Probability: {final_score * 100:.2f}%",
            f"Risk Level: {risk}",
            f"Final Result: {result}",
            "",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        # UI boundary: surface the failure in the output box rather than
        # crashing the request.
        return f"Error: {str(e)}"


# ================= UI =================
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Hybrid Voice Scam Detection System")
    gr.Markdown("Speech + AI + Keyword Detection")

    audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
    output = gr.Textbox(lines=12)

    gr.Button("Analyze").click(
        analyze_audio,
        inputs=audio_input,
        outputs=output,
    )

demo.launch()