Spaces:
Sleeping
Sleeping
import re

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import whisper
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
# All torch work in this file is pinned to the CPU.
device = torch.device("cpu")

# ================= LOAD DATASET =================
# Local CSV. NOTE(review): presumably it provides "text" and "label" columns
# like the second source below - verify against the file.
data1 = pd.read_csv("spam_dataset.csv")

# Public SMS spam corpus: take its "train" split as a DataFrame and rename
# the "sms" column to "text" so both sources share one schema.
data2 = (
    load_dataset("ucirvine/sms_spam")["train"]
    .to_pandas()
    .rename(columns={"sms": "text"})
)

data = pd.concat([data1, data2], ignore_index=True)

# ================= FIX LABELS =================
# Cast labels to int; the balancing step below treats 0 as ham and 1 as spam.
data["label"] = data["label"].astype(int)

# ================= BALANCE DATASET =================
# Down-sample both classes (without replacement) to the size of the smaller
# one so the classifier is trained on a 50/50 split.
ham = data[data["label"] == 0]
spam = data[data["label"] == 1]
min_size = min(len(ham), len(spam))
ham_bal, spam_bal = (
    resample(subset, replace=False, n_samples=min_size, random_state=42)
    for subset in (ham, spam)
)

data = pd.concat([ham_bal, spam_bal])
texts, labels = data["text"], data["label"]

# ================= ML TRAINING =================
# TF-IDF features (English stop words removed) feeding a logistic regression.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)
ml_model = LogisticRegression(max_iter=200).fit(X, labels)
# ================= CNN MODEL =================
class ScamAudioCNN(nn.Module):
    """Small two-stage convolutional classifier that emits two logits per input.

    Each stage is conv(3x3, padding=1) -> ReLU -> 2x2 max-pool, so both spatial
    axes are halved twice. ``fc1`` expects 32 * 10 * 25 flattened features,
    which is what a (batch, 1, 40, 100) input produces.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=32 * 10 * 25, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=2)

    def forward(self, x):
        """Return raw (un-normalised) class logits of shape (batch, 2)."""
        # Both convolution stages share the same ReLU + pooling treatment.
        for conv in (self.conv1, self.conv2):
            x = self.pool(torch.relu(conv(x)))
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)
cnn_model = ScamAudioCNN().to(device)

# ================= LOAD CNN =================
# The CNN is optional: when its weights cannot be loaded the app keeps running
# and ``cnn_loaded`` stays False, which analyze_audio checks before using it.
# NOTE(review): torch.load unpickles the checkpoint - only ship a trusted
# "scam_audio_model.pth" (consider weights_only=True where the installed torch
# version supports it).
cnn_loaded = False
try:
    state_dict = torch.load("scam_audio_model.pth", map_location=device)
    cnn_model.load_state_dict(state_dict)
except FileNotFoundError:
    print("⚠️ CNN model not found, skipping CNN contribution")
except Exception as exc:
    # Corrupt checkpoint, architecture mismatch, etc. Report the real cause
    # instead of claiming the file is missing.
    print(f"⚠️ CNN model could not be loaded ({exc}), skipping CNN contribution")
else:
    cnn_model.eval()
    cnn_loaded = True

# ================= WHISPER =================
# Smallest Whisper checkpoint, run on the CPU.
whisper_model = whisper.load_model("tiny", device="cpu")
# ================= MFCC =================
def extract_features(file_path, max_len=100):
    """Load an audio file and return its MFCCs as a fixed-width float32 tensor.

    The audio is loaded at 16 kHz and 40 MFCCs are computed. The last axis is
    zero-padded or truncated to exactly ``max_len`` columns, then two leading
    singleton axes are added before conversion to a tensor.
    """
    signal, sample_rate = librosa.load(file_path, sr=16000)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)

    missing = max_len - coeffs.shape[1]
    if missing > 0:
        # Too short: right-pad the last axis with zeros.
        coeffs = np.pad(coeffs, ((0, 0), (0, missing)))
    else:
        # Long enough: keep only the first ``max_len`` columns.
        coeffs = coeffs[:, :max_len]

    return torch.tensor(coeffs[None, None, :, :], dtype=torch.float32)
# ================= TRANSCRIPTION =================
def transcribe_audio(file_path):
    """Transcribe ``file_path`` with the module-level Whisper model.

    Returns the transcript text converted to lower case.
    """
    transcription = whisper_model.transcribe(file_path)
    text = transcription["text"]
    return text.lower()
# ================= KEYWORDS =================
scam_keywords = [
    "otp", "bank", "account", "verify", "urgent", "blocked", "suspend",
    "credit card", "loan", "refund", "investment", "crypto", "kyc",
    "password", "security", "congratulations", "won", "winner", "prize",
    "claim", "fee", "pay", "offer", "lottery", "jackpot", "gift", "free",
]


def keyword_score(text):
    """Score ``text`` by how many distinct scam keywords it mentions.

    A keyword counts only when it appears as a whole word or phrase,
    optionally followed by a simple inflection ("s", "es", "ed", "ing").
    Plain substring matching produced false hits such as "won" in
    "wonderful" or "won't", "otp" in "laptop" and "fee" in "coffee".
    Other derived forms (e.g. "payment", "verification") are not matched and
    need their own entry in ``scam_keywords`` if they should count.

    Args:
        text: Text to scan; matching is case-insensitive. Empty or None
            yields no matches.

    Returns:
        A ``(score, found)`` tuple. ``found`` lists the matched keywords in
        ``scam_keywords`` order; ``score`` is ``len(found) / 3`` capped at
        1.0, or 0 when nothing matched.
    """
    if not text:
        return 0, []

    found = [
        word
        for word in scam_keywords
        if re.search(
            # No word character before the keyword; after it (plus optional
            # inflection) no word character and no "'t" contraction.
            rf"(?<!\w){re.escape(word)}(?:s|es|ed|ing)?(?!\w|['’]t\b)",
            text,
            re.IGNORECASE,
        )
    ]
    score = min(len(found) / 3, 1.0) if found else 0
    return score, found
# ================= ML PREDICTION =================
def ml_predict(text):
    """Return the text classifier's probability for the second class.

    ``text`` is vectorised with the fitted module-level TF-IDF vectorizer and
    scored by the module-level logistic regression; column 1 of
    ``predict_proba`` is returned (label 1, i.e. spam, given 0/1 labels).
    """
    features = vectorizer.transform([text])
    probabilities = ml_model.predict_proba(features)
    return probabilities[0][1]
# ================= MAIN =================
def _combine_scores(k_score, ml_score, cnn_score, use_cnn):
    """Blend the component scores into one final score.

    With the CNN available the weights are 0.2 / 0.5 / 0.3. Without it, the
    keyword and ML weights are renormalised to sum to 1; otherwise the final
    score could never exceed 0.7 and the risk thresholds would be skewed.
    """
    if use_cnn:
        return (0.2 * k_score) + (0.5 * ml_score) + (0.3 * cnn_score)
    return ((0.2 * k_score) + (0.5 * ml_score)) / (0.2 + 0.5)


def _classify_risk(final_score):
    """Map a final score to its ``(risk level, verdict)`` pair."""
    if final_score < 0.40:
        return "Low Risk", "NOT SPAM"
    if final_score < 0.65:
        return "Medium Risk", "SPAM"
    return "High Scam Risk", "SPAM"


def analyze_audio(audio):
    """Analyse a recording and return a human-readable scam report.

    The transcript is scored by keyword matching and by the text classifier;
    when the CNN weights were loaded, an audio-feature score is blended in.

    Args:
        audio: Path to the audio file (the UI passes a filepath), or None.

    Returns:
        The report text; "No audio detected." when no audio was supplied; or
        "Error: <message>" if any processing step raises.
    """
    if not audio:
        return "No audio detected."

    try:
        # TRANSCRIBE
        transcript = transcribe_audio(audio)

        # KEYWORD
        k_score, words = keyword_score(transcript)

        # ML
        ml_score = ml_predict(transcript)

        # CNN (optional)
        cnn_score = 0
        if cnn_loaded:
            features = extract_features(audio).to(device)
            with torch.no_grad():
                probs = torch.softmax(cnn_model(features), dim=1)
            cnn_score = probs[0][1].item()

        # DEBUG PRINTS
        print("Transcript:", transcript)
        print("Keyword Score:", k_score)
        print("ML Score:", ml_score)
        print("CNN Score:", cnn_score)

        final_score = _combine_scores(k_score, ml_score, cnn_score, cnn_loaded)
        risk, result = _classify_risk(final_score)

        # Do not present a made-up 0.00 when the CNN never ran.
        cnn_text = f"{cnn_score:.2f}" if cnn_loaded else "N/A (model not loaded)"

        report_lines = [
            f"Transcript: {transcript}",
            f"Spam Words Found: {', '.join(words) if words else 'None'}",
            "Scores:",
            f"Keyword: {k_score:.2f}",
            f"ML: {ml_score:.2f}",
            f"CNN: {cnn_text}",
            f"Final Probability: {final_score * 100:.2f}%",
            f"Risk Level: {risk}",
            f"Final Result: {result}",
        ]
        return "\n" + "\n".join(report_lines) + "\n"
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        return f"Error: {e}"
# ================= UI =================
# Single-page layout: audio in (microphone or upload), text report out.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Hybrid Voice Scam Detection System")
    gr.Markdown("Speech + AI + Keyword Detection")

    # type="filepath" makes Gradio hand analyze_audio a path on disk.
    audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
    output = gr.Textbox(lines=12)

    analyze_button = gr.Button("Analyze")
    analyze_button.click(fn=analyze_audio, inputs=audio_input, outputs=output)

demo.launch()