anish committed on
Commit
7015e1f
·
1 Parent(s): 06a727d
app/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # App module
2
+
app/main.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import uuid
import base64
from fastapi import FastAPI, Header, Body
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import librosa
from pydub import AudioSegment

# ML inference bridge
from ml.inference.predict import predict


# -------------------- CONFIGURATION --------------------

# Single FastAPI application instance; all routes in this module attach to it.
app = FastAPI(title="AI Voice Detection API")

# API Key (use ENV in production)
# Shared secret compared against the x-api-key request header; falls back to
# a hackathon default only when the API_KEY env var is unset.
API_KEY = os.getenv("API_KEY", "hackathon-secret")

# Languages accepted by /api/voice-detection. This list is also interpolated
# verbatim into the 400 error message for unsupported languages.
SUPPORTED_LANGUAGES = [
    "Tamil",
    "English",
    "Hindi",
    "Malayalam",
    "Telugu"
]
28
+
29
+
30
+ # -------------------- REQUEST MODEL --------------------
31
+
class VoiceRequest(BaseModel):
    """Request body for the /api/voice-detection endpoint."""

    language: str      # expected to be one of SUPPORTED_LANGUAGES (validated in the endpoint)
    audioFormat: str   # only "mp3" is accepted (compared case-insensitively)
    audioBase64: str   # base64-encoded mp3 bytes
36
+
37
+
38
+ # -------------------- HELPER FUNCTIONS --------------------
39
+
def generate_explanation(classification: str, confidence: float, language: str) -> str:
    """Generates a human-readable explanation for the result.

    Picks one of four canned sentences based on the predicted class and
    whether the confidence clears the 0.8 "high confidence" threshold.
    """
    is_ai = classification == "AI_GENERATED"
    is_high_confidence = confidence >= 0.8

    if is_ai and is_high_confidence:
        return (
            f"High-confidence detection of synthetic voice patterns, "
            f"including unnatural pitch consistency in the {language} sample."
        )
    if is_ai:
        return (
            f"Minor digital artifacts detected in the {language} speech, "
            f"suggesting possible AI generation."
        )
    if is_high_confidence:
        return (
            f"Natural prosody, breathing patterns, and organic speech flow "
            f"detected, consistent with a human {language} speaker."
        )
    return (
        f"Speech characteristics align with human vocal patterns "
        f"for the {language} language."
    )
66
+
67
+
68
+ # -------------------- HEALTH CHECK --------------------
69
+
70
@app.get("/health")
def health_check():
    """Liveness probe: reports that the API process is up and serving."""
    payload = {
        "status": "ok",
        "message": "AI Voice Detection API is running",
    }
    return payload
76
+
77
+
78
+ # -------------------- MAIN API --------------------
79
+
80
@app.post("/api/voice-detection")
def detect_voice(
    request: VoiceRequest = Body(...),
    x_api_key: str = Header(...)
):
    """Classify a base64-encoded mp3 sample as AI-generated or human speech.

    Pipeline: validate API key, language, and audio format; decode the
    payload to a temp mp3; trim it to at most 30 seconds; run the ML model;
    return the strict success-response format (or a JSON error response).
    Temp files are always removed in the finally block.
    """
    # 1. API KEY VALIDATION (shared-secret header check)
    if x_api_key != API_KEY:
        return JSONResponse(
            status_code=401,
            content={"status": "error", "message": "Invalid API key"}
        )

    # 2. LANGUAGE VALIDATION
    if request.language not in SUPPORTED_LANGUAGES:
        return JSONResponse(
            status_code=400,
            content={"status": "error", "message": f"Unsupported language. Allowed values: {SUPPORTED_LANGUAGES}"}
        )

    # 3. AUDIO FORMAT VALIDATION — only mp3 payloads are accepted.
    if request.audioFormat.lower() != "mp3":
        return JSONResponse(
            status_code=400,
            content={"status": "error", "message": "Only mp3 audio format is supported"}
        )

    # Unique temp file name so concurrent requests never collide.
    temp_mp3 = f"temp_{uuid.uuid4()}.mp3"
    original_temp_mp3 = temp_mp3  # kept so both raw and trimmed files get cleaned up

    try:
        # 4. BASE64 DECODE (validate=True rejects non-base64 characters)
        try:
            audio_bytes = base64.b64decode(
                request.audioBase64,
                validate=True
            )
        except Exception:
            return JSONResponse(
                status_code=400,
                content={"status": "error", "message": "Invalid Base64 audio string"}
            )

        # Reject empty or trivially small payloads (cannot be a real mp3).
        if len(audio_bytes) < 1000:
            return JSONResponse(
                status_code=400,
                content={"status": "error", "message": "Audio data is too small or empty"}
            )

        # 5. SAVE MP3 FILE so the audio libraries can read it from disk.
        with open(temp_mp3, "wb") as f:
            f.write(audio_bytes)

        # 5.5. CHECK AND TRIM AUDIO DURATION (max 30 seconds).
        # BUGFIX: the original comments claimed a 60-second limit while the
        # code enforced 30 seconds; comments now match the actual behavior.
        y, sr = librosa.load(temp_mp3, sr=None)
        duration = len(y) / sr
        if duration > 30:
            # Trim to the first 30 seconds (pydub slices in milliseconds).
            audio = AudioSegment.from_file(temp_mp3, format="mp3")
            trimmed_audio = audio[:30000]  # 30 seconds in milliseconds
            trimmed_mp3 = temp_mp3.replace(".mp3", "_trimmed.mp3")
            trimmed_audio.export(trimmed_mp3, format="mp3")
            temp_mp3 = trimmed_mp3  # run inference on the trimmed file

        # 6. ML INFERENCE (Member-1 implementation)
        result = predict(temp_mp3)

        classification = result.get("classification")
        confidence = result.get("confidenceScore")

        # Defensive check: the model contract only allows these two labels.
        if classification not in ("AI_GENERATED", "HUMAN"):
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Invalid classification returned by ML model"}
            )

        explanation = generate_explanation(
            classification,
            confidence,
            request.language
        )

        # 8. SUCCESS RESPONSE (STRICT FORMAT)
        return {
            "status": "success",
            "language": request.language,
            "classification": classification,
            "confidenceScore": confidence,
            "explanation": explanation
        }

    except Exception as e:
        # Catch-all boundary for unexpected failures (codec errors, model
        # errors, disk errors) — converted to a 500 with a diagnostic message.
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": f"Processing error: {str(e)}"}
        )

    finally:
        # 9. CLEANUP TEMP FILES — the set removes the duplicate entry when no
        # trimming happened (original and current path are the same file).
        for path in {original_temp_mp3, temp_mp3}:
            if os.path.exists(path):
                try:
                    # Narrowed from a bare except: only swallow filesystem
                    # errors, never KeyboardInterrupt/SystemExit.
                    os.remove(path)
                except OSError:
                    pass
ml/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # ML module for AI Voice Detection
2
+
ml/inference/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # ML inference module
2
+ from .predict import predict
3
+
4
+ __all__ = ["predict"]
5
+
ml/inference/predict.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from joblib import load

# All audio is resampled to 16 kHz — the rate facebook/wav2vec2-base expects.
TARGET_SR = 16000

# Get the directory where this script is located
_current_dir = os.path.dirname(os.path.abspath(__file__))
# Try multiple paths for model location (works in different deployment scenarios)
# NOTE(review): the first two candidates resolve to the same absolute path
# (".." from inference/ vs dirname of inference/) — one is redundant but harmless.
_model_paths = [
    os.path.join(_current_dir, "..", "models"),  # Relative from inference/
    os.path.join(os.path.dirname(_current_dir), "models"),  # From ml/models
    "ml/models",  # From root directory
    os.path.join(os.getcwd(), "ml", "models"),  # From current working directory
]

# First candidate directory that actually contains voice_classifier.joblib wins.
_model_dir = None
for path in _model_paths:
    abs_path = os.path.abspath(path)
    if os.path.exists(abs_path) and os.path.exists(os.path.join(abs_path, "voice_classifier.joblib")):
        _model_dir = abs_path
        break

# Fail fast at import time with a diagnostic listing every path that was tried.
if _model_dir is None:
    raise FileNotFoundError(
        f"Could not find model files. Tried paths: {_model_paths}. "
        f"Current directory: {os.getcwd()}, Script directory: {_current_dir}"
    )

# Load model + scaler (classifier over wav2vec embeddings + signal features;
# both are loaded once at import and shared by every predict() call)
clf = load(os.path.join(_model_dir, "voice_classifier.joblib"))
scaler = load(os.path.join(_model_dir, "signal_scaler.joblib"))

# Load wav2vec (frozen feature extractor; eval() disables dropout)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
model.eval()

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
45
+
46
+
47
def extract_embedding(audio_path):
    """Return the mean-pooled wav2vec2 hidden states for one audio file.

    The result is a 1-D numpy vector (hidden size of wav2vec2-base).
    """
    waveform, _ = librosa.load(audio_path, sr=TARGET_SR, mono=True)

    batch = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
        padding=True,
    )
    # Move every input tensor onto the same device as the model.
    batch = {key: tensor.to(device) for key, tensor in batch.items()}

    with torch.no_grad():
        hidden = model(**batch)

    # Average over the time axis, drop the batch dim, hand back numpy on CPU.
    return hidden.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
64
+
65
+
66
def extract_signal_features(audio_path):
    """Return four hand-crafted signal statistics as a numpy vector.

    Order: [pitch variance, spectral-centroid mean, spectral-centroid
    variance, mean zero-crossing rate].
    """
    samples, rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)

    # Fundamental-frequency track restricted to 50-300 Hz.
    pitch_track = librosa.yin(samples, fmin=50, fmax=300)
    centroids = librosa.feature.spectral_centroid(y=samples, sr=rate)
    crossings = librosa.feature.zero_crossing_rate(samples)

    return np.array([
        np.var(pitch_track),
        np.mean(centroids),
        np.var(centroids),
        np.mean(crossings),
    ])
80
+
81
+
82
def generate_explanation(sig_feats, is_ai):
    """Build a short human-readable rationale from the raw signal features.

    sig_feats is the 4-vector from extract_signal_features; is_ai selects
    between the synthetic-speech and natural-speech phrasings.
    """
    pitch_var, _spec_mean, spec_var, zcr = sig_feats

    if not is_ai:
        return "Natural pitch variation and spectral dynamics detected"

    # Threshold checks paired with their phrasing; list order fixes the
    # order of clauses in the joined sentence.
    checks = [
        (pitch_var < 3000, "unnaturally stable pitch"),
        (spec_var < 8e5, "overly smooth spectral profile"),
        (zcr < 0.1, "robotic waveform structure"),
    ]
    reasons = [phrase for hit, phrase in checks if hit]

    if not reasons:
        return "Acoustic patterns consistent with synthetic speech detected"
    return " and ".join(reasons).capitalize() + " detected"
102
+
103
+
104
def predict(audio_path):
    """Classify one audio file as AI-generated or human speech.

    Returns a dict with 'classification' ("AI_GENERATED" or "HUMAN"),
    'confidenceScore' (probability of the predicted class, rounded to 3
    decimals) and a human-readable 'explanation'.
    """
    embedding = extract_embedding(audio_path)
    signal_feats = extract_signal_features(audio_path)

    # Concatenate wav2vec embedding with the scaled hand-crafted features
    # into a single (1, n_features) row for the classifier.
    features = np.concatenate(
        [embedding.reshape(1, -1), scaler.transform(signal_feats.reshape(1, -1))],
        axis=1,
    )

    ai_probability = clf.predict_proba(features)[0][1]
    ai_detected = ai_probability >= 0.5

    # Confidence is reported for the *predicted* class, not always class 1.
    confidence = ai_probability if ai_detected else 1 - ai_probability

    return {
        "classification": "AI_GENERATED" if ai_detected else "HUMAN",
        "confidenceScore": round(float(confidence), 3),
        "explanation": generate_explanation(signal_feats, ai_detected),
    }
ml/models/signal_scaler.joblib ADDED
Binary file (711 Bytes). View file
 
ml/models/voice_classifier.joblib ADDED
Binary file (7.06 kB). View file