kimnamjoon0007 committed on
Commit
551ad23
·
verified ·
1 Parent(s): aba1139

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +134 -42
app.py CHANGED
@@ -1,22 +1,27 @@
1
  """
2
- AI Voice Detection - Hugging Face Spaces
3
- Detects AI-generated vs Human voices
4
  """
5
 
6
  import os
 
7
  import tempfile
8
  import numpy as np
9
  import torch
10
  import torch.nn as nn
11
- import gradio as gr
 
 
12
  from transformers import Wav2Vec2Model
13
  from pydub import AudioSegment
14
  import librosa
 
15
 
16
  # Configuration
17
  MODEL_REPO = "kimnamjoon0007/lkht-v440"
18
  TARGET_SR = 16000
19
  MAX_DURATION = 10.0
 
20
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
 
22
 
@@ -37,20 +42,19 @@ class W2VBertDeepfakeDetector(nn.Module):
37
  return logits
38
 
39
 
40
- # Load model at startup
41
- print("Loading Wav2Vec2 backbone...")
42
  backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
43
  model = W2VBertDeepfakeDetector(backbone, num_labels=2)
44
 
45
- print(f"Loading classifier weights from {MODEL_REPO}...")
46
  try:
47
  from huggingface_hub import hf_hub_download
48
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.pt")
49
  state_dict = torch.load(model_path, map_location="cpu")
50
  model.load_state_dict(state_dict)
51
- print("✓ Model loaded successfully")
52
  except Exception as e:
53
- print(f"Error loading model: {e}")
54
  raise
55
 
56
  model.to(DEVICE)
@@ -58,34 +62,130 @@ model.eval()
58
  print(f"Ready on {DEVICE}")
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
def load_audio(audio_path):
    """Decode an audio file to a mono float32 waveform tensor at TARGET_SR.

    Args:
        audio_path: Path to any container/codec that pydub (ffmpeg) can read.

    Returns:
        1-D ``torch.FloatTensor`` of samples in [-1, 1], resampled to
        TARGET_SR and truncated to at most MAX_DURATION seconds.
    """
    audio_segment = AudioSegment.from_file(audio_path)
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

    # Downmix multi-channel audio to mono by averaging the channels.
    if audio_segment.channels > 1:
        samples = samples.reshape(-1, audio_segment.channels).mean(axis=1)

    # Normalize by the true full-scale value for this file's sample width
    # (32767 for 16-bit audio). The previous code hard-coded 32767.0, which
    # mis-scales 8-bit and 32-bit sources.
    full_scale = float(2 ** (8 * audio_segment.sample_width - 1) - 1)
    samples /= full_scale

    sr = audio_segment.frame_rate
    if sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

    # Truncate to the model's maximum supported clip length.
    max_len = int(MAX_DURATION * TARGET_SR)
    if len(samples) > max_len:
        samples = samples[:max_len]

    return torch.from_numpy(samples).float()
80
 
81
 
82
- def classify(audio_path):
83
- """Classify audio as AI or Human."""
84
- if audio_path is None:
85
- return "Please upload an audio file"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  try:
88
- waveform = load_audio(audio_path)
 
 
 
 
 
 
 
 
 
 
 
89
  input_values = waveform.unsqueeze(0).to(DEVICE)
90
 
91
  with torch.no_grad():
@@ -94,32 +194,24 @@ def classify(audio_path):
94
  pred = torch.argmax(probs, dim=-1).item()
95
  conf = probs[0, pred].item()
96
 
97
- human_pct = probs[0, 0].item() * 100
98
- ai_pct = probs[0, 1].item() * 100
99
 
100
- if pred == 1:
101
- result = f"🤖 **AI-GENERATED** ({conf:.1%} confidence)"
102
  else:
103
- result = f"👤 **HUMAN** ({conf:.1%} confidence)"
104
-
105
- details = f"\n\n**Scores:** Human {human_pct:.1f}% | AI {ai_pct:.1f}%"
106
 
107
- return result + details
108
-
109
- except Exception as e:
110
- return f"Error: {str(e)}"
111
-
 
 
 
 
 
112
 
113
- # Create Gradio app
114
- demo = gr.Interface(
115
- fn=classify,
116
- inputs=gr.Audio(type="filepath", label="Upload Audio"),
117
- outputs=gr.Textbox(label="Result", lines=3),
118
- title="🎤 AI Voice Detection",
119
- description="Upload an audio file to detect if it's AI-generated or human speech.\n\nSupports: Tamil, English, Hindi, Malayalam, Telugu",
120
- examples=[],
121
- cache_examples=False,
122
- )
123
 
124
  if __name__ == "__main__":
125
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  """
2
+ AI Voice Detection API - HuggingFace Spaces
3
+ Pure FastAPI - No Gradio
4
  """
5
 
6
  import os
7
+ import base64
8
  import tempfile
9
  import numpy as np
10
  import torch
11
  import torch.nn as nn
12
+ from fastapi import FastAPI, Header, HTTPException
13
+ from fastapi.responses import HTMLResponse
14
+ from pydantic import BaseModel
15
  from transformers import Wav2Vec2Model
16
  from pydub import AudioSegment
17
  import librosa
18
+ import uvicorn
19
 
20
  # Configuration
21
  MODEL_REPO = "kimnamjoon0007/lkht-v440"
22
  TARGET_SR = 16000
23
  MAX_DURATION = 10.0
24
+ API_KEY = "sk_test_123456789"
25
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
 
27
 
 
42
  return logits
43
 
44
 
45
+ # Load model
46
+ print("Loading model...")
47
  backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
48
  model = W2VBertDeepfakeDetector(backbone, num_labels=2)
49
 
 
50
  try:
51
  from huggingface_hub import hf_hub_download
52
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.pt")
53
  state_dict = torch.load(model_path, map_location="cpu")
54
  model.load_state_dict(state_dict)
55
+ print(f"✓ Model loaded from {MODEL_REPO}")
56
  except Exception as e:
57
+ print(f"Error: {e}")
58
  raise
59
 
60
  model.to(DEVICE)
 
62
  print(f"Ready on {DEVICE}")
63
 
64
 
65
# FastAPI application object; the route handlers below register on it.
app = FastAPI(title="AI Voice Detection API", version="2.0")
67
+
68
+
69
class DetectionRequest(BaseModel):
    """Request payload for POST /api/voice-detection."""

    # Spoken language of the clip; echoed back in the response.
    language: str
    # Container format of the encoded audio; only "mp3" is accepted.
    audioFormat: str
    # Raw audio bytes, base64-encoded.
    audioBase64: str
73
+
74
+
75
class DetectionResponse(BaseModel):
    """Response payload returned by POST /api/voice-detection."""

    # "success" on the happy path; failures raise HTTPException instead.
    status: str
    # Echo of the language supplied in the request.
    language: str
    # Either "AI_GENERATED" or "HUMAN".
    classification: str
    # Softmax probability of the predicted class.
    confidenceScore: float
    # Human-readable rationale matching the classification.
    explanation: str
81
+
82
+
83
def load_audio(audio_path):
    """Decode an audio file to a mono float32 waveform tensor at TARGET_SR.

    Args:
        audio_path: Path to any container/codec that pydub (ffmpeg) can read.

    Returns:
        1-D ``torch.FloatTensor`` of samples in [-1, 1], resampled to
        TARGET_SR and truncated to at most MAX_DURATION seconds.
    """
    audio_segment = AudioSegment.from_file(audio_path)
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

    # Downmix multi-channel audio to mono by averaging the channels.
    if audio_segment.channels > 1:
        samples = samples.reshape(-1, audio_segment.channels).mean(axis=1)

    # Normalize by the true full-scale value for this file's sample width
    # (32767 for 16-bit audio). The previous code hard-coded 32767.0, which
    # mis-scales 8-bit and 32-bit sources.
    full_scale = float(2 ** (8 * audio_segment.sample_width - 1) - 1)
    samples /= full_scale

    sr = audio_segment.frame_rate
    if sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

    # Truncate to the model's maximum supported clip length.
    max_len = int(MAX_DURATION * TARGET_SR)
    if len(samples) > max_len:
        samples = samples[:max_len]

    return torch.from_numpy(samples).float()
96
 
97
 
98
@app.get("/", response_class=HTMLResponse)
def home():
    """Serve a self-documenting HTML landing page for the API.

    SECURITY: the previous version interpolated API_KEY directly into this
    public page, which defeats the x-api-key check for anyone who loads it.
    The key is no longer rendered; examples use a YOUR_API_KEY placeholder.
    """
    # SPACE_HOST is injected by Hugging Face Spaces; fall back for local runs.
    space_url = os.getenv("SPACE_HOST", "localhost:7860")
    return f"""
    <!DOCTYPE html>
    <html>
    <head>
    <title>AI Voice Detection API</title>
    <style>
    body {{ font-family: system-ui; max-width: 800px; margin: 50px auto; padding: 20px; background: #1a1a2e; color: #eee; }}
    h1 {{ color: #00d4ff; }}
    .box {{ background: #16213e; padding: 20px; border-radius: 10px; margin: 20px 0; }}
    code {{ background: #0f3460; padding: 2px 8px; border-radius: 4px; }}
    pre {{ background: #0f3460; padding: 15px; border-radius: 8px; overflow-x: auto; white-space: pre-wrap; }}
    .key {{ color: #00ff88; font-size: 1.2em; }}
    </style>
    </head>
    <body>
    <h1>🎤 AI Voice Detection API</h1>

    <div class="box">
    <h2>API Endpoint</h2>
    <p><code>POST https://{space_url}/api/voice-detection</code></p>
    </div>

    <div class="box">
    <h2>API Key</h2>
    <p class="key">Issued out of band — contact the Space owner for access.</p>
    </div>

    <div class="box">
    <h2>CURL Example</h2>
    <pre>curl -X POST "https://{space_url}/api/voice-detection" \\
    -H "Content-Type: application/json" \\
    -H "x-api-key: YOUR_API_KEY" \\
    -d '{{
    "language": "English",
    "audioFormat": "mp3",
    "audioBase64": "YOUR_BASE64_AUDIO"
    }}'</pre>
    </div>

    <div class="box">
    <h2>Response Format</h2>
    <pre>{{
    "status": "success",
    "language": "English",
    "classification": "AI_GENERATED" or "HUMAN",
    "confidenceScore": 0.97,
    "explanation": "Detected synthetic voice characteristics"
    }}</pre>
    </div>

    <div class="box">
    <h2>Supported Languages</h2>
    <p>Tamil, English, Hindi, Malayalam, Telugu</p>
    </div>
    </body>
    </html>
    """
158
+
159
+
160
@app.get("/health")
def health():
    """Liveness probe: reports service status, model presence, and device."""
    payload = {"status": "healthy", "model_loaded": True}
    payload["device"] = str(DEVICE)
    return payload
163
+
164
+
165
@app.post("/api/voice-detection")
def detect_voice(request: DetectionRequest, x_api_key: str = Header(None)):
    """Classify a base64-encoded mp3 clip as AI-generated or human speech.

    Args:
        request: Language, audio format, and base64-encoded audio payload.
        x_api_key: Caller's API key, taken from the ``x-api-key`` header.

    Returns:
        DetectionResponse with the classification, confidence, and rationale.

    Raises:
        HTTPException: 401 for a bad API key; 400 for an unsupported format
            or an undecodable base64 payload.
    """
    # Validate API key
    if x_api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Validate format
    if request.audioFormat.lower() != "mp3":
        raise HTTPException(status_code=400, detail="Only mp3 format supported")

    # Decode audio. The narrow except replaces a bare `except:` that also
    # swallowed SystemExit/KeyboardInterrupt. (Consider validate=True to
    # reject, rather than skip, non-alphabet characters.)
    try:
        audio_bytes = base64.b64decode(request.audioBase64)
    except (ValueError, TypeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid base64") from exc

    # Persist to a temp file so pydub/ffmpeg can read it by path. The write
    # now happens inside the try so the file is removed even if it fails.
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    try:
        temp_file.write(audio_bytes)
        temp_file.close()

        # Preprocess and run the detector.
        waveform = load_audio(temp_file.name)
        input_values = waveform.unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            logits = model(input_values)
            probs = torch.softmax(logits, dim=-1)

        pred = torch.argmax(probs, dim=-1).item()
        conf = probs[0, pred].item()

        # Label index 1 is the AI class, index 0 the human class.
        classification = "AI_GENERATED" if pred == 1 else "HUMAN"
        if classification == "AI_GENERATED":
            explanation = "Detected synthetic voice characteristics and artificial patterns"
        else:
            explanation = "Detected natural speech patterns and organic voice characteristics"

        return DetectionResponse(
            status="success",
            language=request.language,
            classification=classification,
            confidenceScore=round(conf, 2),
            explanation=explanation,
        )
    finally:
        temp_file.close()
        os.remove(temp_file.name)
214
 
 
 
 
 
 
 
 
 
 
 
215
 
216
# Launch Uvicorn directly when executed as a script; 7860 is the port
# Hugging Face Spaces expects the app to listen on.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)