| import streamlit as st |
| import time |
| import tempfile |
| import re |
| from pydub import AudioSegment, silence |
| import speech_recognition as sr |
|
|
| |
| |
| |
|
|
# scikit-learn is an optional dependency. When it cannot be imported the ML
# classifier is skipped and SKLEARN_AVAILABLE tells the rest of the script so.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
except ImportError:
    # Only a missing/unimportable package is an expected condition here;
    # anything else should surface instead of being silently swallowed.
    SKLEARN_AVAILABLE = False
else:
    SKLEARN_AVAILABLE = True
|
|
# Small hand-written corpus of example phrases.

# Phrases typical of scam calls.
scam_samples = [
    "your bank account is blocked share otp immediately",
    "urgent kyc update otherwise account suspended",
    "police case registered transfer money now",
    "refund processed verify otp",
    "upi pin required immediately",
]

# Ordinary, harmless phrases.
normal_samples = [
    "hello how are you",
    "payment completed thank you",
    "meeting scheduled tomorrow",
    "family discussion about expenses",
    "salary credited message",
]
|
|
# Fit the scam/normal text classifier once, at import time, and only when
# scikit-learn could be imported.
if SKLEARN_AVAILABLE:
    X_train = [*scam_samples, *normal_samples]
    # Labels follow the order of X_train: 1 for scam samples, 0 for normal.
    y_train = [1 for _ in scam_samples] + [0 for _ in normal_samples]

    # TF-IDF over unigrams and bigrams.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_vec = vectorizer.fit_transform(X_train)

    # fit() returns the estimator itself, so construction and training chain.
    ml_model = LogisticRegression().fit(X_vec, y_train)
|
|
def ml_risk_score(text):
    """Return the classifier's scam probability for ``text``.

    Args:
        text: Transcript fragment to score.

    Returns:
        A plain ``float``: ``0.0`` for blank text, ``0.5`` when scikit-learn
        is unavailable, otherwise the fitted model's probability for class 1
        (the label given to the scam samples).
    """
    # A blank transcript carries no tokens, so the model would only echo its
    # prior; report "no evidence" instead of a mid-range probability.
    if not text or not text.strip():
        return 0.0
    if not SKLEARN_AVAILABLE:
        return 0.5
    features = vectorizer.transform([text])
    return float(ml_model.predict_proba(features)[0][1])
|
|
| |
| |
| |
|
|
# A standalone run of 3-6 digits. The lookarounds reject slices of longer
# digit runs (phone numbers, account numbers) while still accepting amounts
# glued to letters such as "rs5000".
_AMOUNT_RE = re.compile(r"(?<!\d)\d{3,6}(?!\d)")


def money_escalation_risk(text):
    """Flag transcripts in which the mentioned amounts jump sharply.

    Args:
        text: Transcript fragment to inspect.

    Returns:
        ``(4, "Rapid money escalation")`` when at least three amounts are
        mentioned and any amount exceeds the one before it by more than 5000;
        otherwise ``(0, None)``.
    """
    amounts = [int(match) for match in _AMOUNT_RE.findall(text)]
    if len(amounts) >= 3 and any(
        later - earlier > 5000 for earlier, later in zip(amounts, amounts[1:])
    ):
        return 4, "Rapid money escalation"
    return 0, None
|
|
def contextual_risk(text):
    """Score keyword pairs that are suspicious when they occur together.

    Matching is case-insensitive and anchored at the start of a word, so
    "unblocked" does not count as "blocked" while "blocked"/"accounts" do.

    Args:
        text: Transcript fragment to inspect.

    Returns:
        ``(score, reasons)`` where ``score`` is 3 per matching pair and
        ``reasons`` lists each matching pair as ``"first + second"``.
    """
    pairs = [
        ("bank", "otp"), ("account", "blocked"), ("verify", "otp"),
        ("police", "arrest"), ("transfer", "money"), ("kyc", "update"),
    ]
    lowered = text.lower()

    def mentions(keyword):
        # \b keeps the keyword from matching in the middle of another word.
        return re.search(r"\b" + re.escape(keyword), lowered) is not None

    reasons = [f"{a} + {b}" for a, b in pairs if mentions(a) and mentions(b)]
    return 3 * len(reasons), reasons
|
|
def pressure_risk(text):
    """Return 2 points for each distinct time-pressure phrase in ``text``.

    Matching is case-insensitive and anchored at the start of a word, so
    "insurgent" does not count as "urgent".

    Args:
        text: Transcript fragment to inspect.

    Returns:
        An ``int`` score: 2 for every listed phrase that occurs at least once.
    """
    words = ["urgent", "immediately", "today", "otherwise", "right now"]
    lowered = text.lower()  # lower-case once instead of once per phrase
    return sum(
        2 for w in words if re.search(r"\b" + re.escape(w), lowered)
    )
|
|
_AUTHORITY_TERMS = ("bank officer", "police", "rbi", "cyber crime", "government")
# Terms are anchored at a word start so that e.g. "arbitrary" is not read as
# a mention of "rbi"; suffixes ("policeman", "governments") still match.
_AUTHORITY_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(term) for term in _AUTHORITY_TERMS) + r")"
)


def authority_risk(text):
    """Return 3 when ``text`` invokes an authority figure, else 0.

    Args:
        text: Transcript fragment to inspect (matched case-insensitively).

    Returns:
        ``3`` if any authority term occurs at the start of a word, otherwise
        ``0``. Multiple mentions do not add up.
    """
    return 3 if _AUTHORITY_RE.search(text.lower()) else 0
|
|
def command_risk(text):
    """Return 1 point for each distinct instruction phrase in ``text``.

    Matching is case-insensitive and anchored at the start of a word, so
    "intelligent" does not count as "tell" and "express" not as "press".

    Args:
        text: Transcript fragment to inspect.

    Returns:
        An ``int``: the number of listed phrases that occur at least once.
    """
    cmds = ["share", "tell", "confirm", "provide", "press", "don't disconnect"]
    lowered = text.lower()  # lower-case once instead of once per phrase
    return sum(
        1 for c in cmds if re.search(r"\b" + re.escape(c), lowered)
    )
|
|
| |
| |
| |
|
|
def audio_behavior_risk(chunk):
    """Score loudness and lack of pauses in one audio chunk.

    Args:
        chunk: Audio segment exposing ``dBFS`` and ``len()`` (milliseconds),
            accepted by ``silence.detect_silence``.

    Returns:
        ``(score, reasons)``: 2 points when ``chunk.dBFS`` is above -20 and
        2 points when no silence of at least 400 ms below -40 dBFS is found,
        with one human-readable reason per rule that fired.
    """
    min_pause_ms = 400
    score, reasons = 0, []

    if chunk.dBFS > -20:
        score += 2
        reasons.append("High speaking intensity")

    # A chunk shorter than the minimum pause length can never contain a
    # detectable pause, so "no pauses" would be meaningless for it.
    if len(chunk) >= min_pause_ms and not silence.detect_silence(
        chunk, min_silence_len=min_pause_ms, silence_thresh=-40
    ):
        score += 2
        reasons.append("No natural pauses")

    return score, reasons
|
|
| |
| |
| |
|
|
# Page chrome: configuration, inline CSS, header banner, title and uploader.
st.set_page_config(page_title="Scam Call Detector", layout="centered")

# CSS classes used by the raw-HTML header, title and footer.
st.markdown("""
<style>
.header{display:flex;justify-content:space-between}
.left{font-size:14px;font-weight:bold}
.right{font-size:14px;font-style:italic;text-align:right}
.title{text-align:center;color:#ff4b4b}
.footer{text-align:center;margin-top:40px;font-weight:bold}
</style>
""", unsafe_allow_html=True)

# Header: author credit on the left, motto on the right.
st.markdown("""
<div class="header">
<div class="left">
Project By<br>
Abhishek Ranjan Tiwari (IIT Guwahati)
</div>
<div class="right">
“Be careful.<br>
Always open your ears and your senses.<br>
Scammers are not clever but we are foolish!”
</div>
</div>
""", unsafe_allow_html=True)

# NOTE(review): the title icon was lost to an encoding error in the source;
# a telephone emoji is substituted here - confirm the intended icon.
st.markdown("<h1 class='title'>📞 Scam Call Detector</h1>", unsafe_allow_html=True)

uploaded_file = st.file_uploader(
    "Upload a call recording (prototype simulation)",
    type=["wav", "mp3"],
)
|
|
| |
| |
| |
|
|
# Score thresholds shared by the live banner, the early stop and the final
# verdict, so the three can no longer disagree about what "high risk" means.
HIGH_RISK_SCORE = 7
MEDIUM_RISK_SCORE = 4
# A trailing remainder shorter than this (ms) is merged into the previous
# window instead of being analysed on its own.
MIN_TAIL_MS = 1000


def _load_uploaded_audio(upload):
    """Decode a Streamlit upload into a pydub ``AudioSegment``."""
    # Keep the real extension; the uploader only admits wav and mp3.
    suffix = ".mp3" if upload.name.lower().endswith(".mp3") else ".wav"
    # The scratch directory (and the copy inside it) is removed on exit.
    with tempfile.TemporaryDirectory() as workdir:
        path = f"{workdir}/upload{suffix}"
        with open(path, "wb") as fh:
            # getvalue() returns the full payload regardless of read position.
            fh.write(upload.getvalue())
        return AudioSegment.from_file(path)


def _transcribe_chunk(recognizer, chunk):
    """Return the recognised text for ``chunk``, or "" when recognition fails."""
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = f"{workdir}/chunk.wav"
        # export() hands back an open file handle; close it right away.
        chunk.export(wav_path, format="wav").close()
        try:
            with sr.AudioFile(wav_path) as source:
                return recognizer.recognize_google(recognizer.record(source))
        except Exception:
            # Best effort: silence, unintelligible speech or a failed API
            # call simply yields an empty transcript for this chunk.
            return ""


if uploaded_file is not None:
    audio = _load_uploaded_audio(uploaded_file)
    recognizer = sr.Recognizer()

    chunk_ms = 5000
    # Window start offsets. The final window runs to the end of the audio so
    # the tail is analysed too; a very short tail joins the previous window.
    starts = list(range(0, len(audio), chunk_ms))
    if len(starts) > 1 and len(audio) - starts[-1] < MIN_TAIL_MS:
        starts.pop()
    total_chunks = len(starts)

    st.subheader("🔴 Simulated Real-Time Analysis")
    progress = st.progress(0)
    transcript_box = st.empty()
    risk_box = st.empty()

    full_text = ""
    final_risk = 0

    for i, start in enumerate(starts):
        is_last = i == total_chunks - 1
        chunk = audio[start:] if is_last else audio[start:start + chunk_ms]

        text = _transcribe_chunk(recognizer, chunk)
        full_text += " " + text

        # Rule-based signals for this chunk.
        base = command_risk(text)
        context, creasons = contextual_risk(text)
        pressure = pressure_risk(text)
        authority = authority_risk(text)
        money, mreason = money_escalation_risk(text)
        audio_score, audio_reasons = audio_behavior_risk(chunk)

        # Map the classifier probability onto the same point scale.
        ml_prob = ml_risk_score(text)
        ml_score = 4 if ml_prob > 0.7 else 2 if ml_prob > 0.4 else 0

        total = base + context + pressure + authority + money + audio_score + ml_score
        # The call's risk is the worst chunk seen so far.
        final_risk = max(final_risk, total)

        transcript_box.markdown(f"**Transcript so far:**\n\n{full_text}")

        if final_risk >= HIGH_RISK_SCORE:
            risk_box.error("🚨 HIGH RISK SCAM DETECTED")
        elif final_risk >= MEDIUM_RISK_SCORE:
            risk_box.warning("⚠️ MEDIUM RISK")
        else:
            risk_box.success("✅ LOW RISK")

        if creasons:
            st.warning("Language pattern: " + ", ".join(creasons))
        if audio_reasons:
            st.info("Audio behaviour: " + ", ".join(audio_reasons))
        if mreason:
            st.error("💸 " + mreason)

        st.caption(f"🤖 ML Scam Probability: {ml_prob:.2f}")

        progress.progress((i + 1) / total_chunks)

        if final_risk >= HIGH_RISK_SCORE:
            st.error("🚨 Do NOT share OTP / PIN / click links")
            break

        # Pace the loop to mimic a live call.
        time.sleep(1)

    if final_risk < HIGH_RISK_SCORE:
        st.success("✅ No high-risk scam behaviour detected")
|
|
# Footer, styled by the .footer CSS class.
st.markdown("<div class='footer'>Jai Hind 🇮🇳</div>", unsafe_allow_html=True)
|
|