fix log_end missing score field
Browse files- README.md +276 -264
- __pycache__/test_env.cpython-310-pytest-9.0.3.pyc +0 -0
- app.py +1 -0
- environment/__pycache__/env.cpython-310.pyc +0 -0
- environment/__pycache__/graders.cpython-310.pyc +0 -0
- environment/env.py +91 -11
- environment/graders.py +10 -1
- openenv.yaml +3 -0
- test_deployed_scores.py +0 -128
- test_env.py +50 -1
- test_grader_exhaustive.py +0 -121
README.md
CHANGED
|
@@ -13,258 +13,231 @@ tags:
|
|
| 13 |
- audio
|
| 14 |
---
|
| 15 |
|
| 16 |
-
# 🎙️ Voice Authenticity Detection
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|------|--------|-------------------|---------|
|
| 26 |
-
| 1 | `request_temporal_features` | Jitter, shimmer, HNR (raw + normalized) | Vocal cord irregularity markers |
|
| 27 |
-
| 2 | `request_spectral_features` | 20 MFCC means, 20 MFCC stds, ZCR, spectral centroid | Timbre and spectral shape |
|
| 28 |
-
| 3 | `request_comparison` | Cosine similarity + euclidean distance to real/fake centroids | Statistical comparison to known references |
|
| 29 |
-
| 4 | `analyze_evidence` | Structured synthesis of all gathered evidence with signal tally | Evidence integration and confidence calibration |
|
| 30 |
-
| 5 | `final_classify` | Submits label (0=real, 1=synthetic) + confidence + reasoning | Terminal action — triggers 6-component grading |
|
| 31 |
|
| 32 |
-
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
-
- **
|
| 47 |
-
- **Follow logical investigation trajectories** — gather → analyze → classify, scored by a 6-component grader
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
---
|
| 52 |
|
| 53 |
-
## 🌍 Real
|
| 54 |
|
| 55 |
-
AI-generated voices are
|
| 56 |
|
| 57 |
-
- **Phone
|
| 58 |
-
- **
|
| 59 |
-
- **Identity
|
| 60 |
-
- **
|
| 61 |
-
- **Insurance
|
| 62 |
|
| 63 |
-
This
|
| 64 |
|
| 65 |
---
|
| 66 |
|
| 67 |
-
## 🏗️ Environment
|
| 68 |
|
| 69 |
-
The environment
|
| 70 |
|
| 71 |
-
This creates
|
| 72 |
-
- Choose
|
| 73 |
-
-
|
| 74 |
-
-
|
| 75 |
-
- Follow logical investigation
|
| 76 |
|
| 77 |
---
|
| 78 |
|
| 79 |
-
## 🏆
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|------|-----------|---------------|-------------|
|
| 83 |
-
| `clean_detection` | Easy | 0.65–0.78 | Clean, unmodified audio features — clear signal separation |
|
| 84 |
-
| `compressed_detection` | Medium | 0.50–0.65 | Codec compression flattens MFCC stds, suppresses jitter/shimmer |
|
| 85 |
-
| `adversarial_detection` | Hard | 0.40–0.58 | Feature distributions overlap — no clean threshold separates classes |
|
| 86 |
-
| `streaming_detection` | Medium-Hard | 0.38–0.55 | Step-dependent noise soft-gating — earlier steps noisier, later cleaner |
|
| 87 |
-
| `phonecall_detection` | Extreme | 0.25–0.42 | Heavy narrowband codec + background noise — near detection limit |
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
## 🏅 Grading System (6 Components)
|
| 96 |
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|-----------|-----------------|------|--------|------|---------|
|
| 101 |
-
| **Correctness** | Label matches ground truth | 0.40 | 0.30 | 0.25 | 0.20 |
|
| 102 |
-
| **Confidence Calibration** | Penalizes overconfidence, rewards calibrated uncertainty | 0.15 | 0.20 | 0.25 | 0.25 |
|
| 103 |
-
| **Trajectory Quality** | Did agent gather → analyze → classify? | 0.10 | 0.15 | 0.18 | 0.20 |
|
| 104 |
-
| **Feature Utilization** | Did agent request temporal AND spectral features? | 0.15 | 0.15 | 0.12 | 0.15 |
|
| 105 |
-
| **Reasoning Consistency** | Does reasoning text match chosen label? | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 106 |
-
| **Action Ordering** | Logical sequence: gather → analyze → classify | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 107 |
|
| 108 |
-
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
| Medium-Hard | 0.55 | ~0.51 |
|
| 116 |
-
| Extreme | 0.41 | ~0.38 |
|
| 117 |
|
| 118 |
-
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
---
|
| 123 |
|
| 124 |
-
##
|
| 125 |
|
| 126 |
-
|
| 127 |
|
| 128 |
-
|
|
| 129 |
-
|-----------|--------|
|
| 130 |
-
|
|
| 131 |
-
|
|
| 132 |
-
|
|
| 133 |
-
|
|
| 134 |
-
|
|
| 135 |
-
|
|
| 136 |
|
| 137 |
-
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
-
|
| 142 |
|
| 143 |
-
|
| 144 |
|
| 145 |
-
|
| 146 |
-
- Feature extraction is performed offline for fast inference
|
| 147 |
-
- Enables **LLM-native reasoning over interpretable acoustic characteristics** — not possible with raw waveforms under current infrastructure constraints
|
| 148 |
-
- Avoids heavy signal processing during evaluation
|
| 149 |
|
| 150 |
---
|
| 151 |
|
| 152 |
-
##
|
| 153 |
-
|
| 154 |
-
- Real speech: 250 samples from `garystafford/deepfake-audio-detection` (authentic human recordings)
|
| 155 |
-
- Synthetic speech: 250 samples (ElevenLabs, Hume AI, and other TTS platforms)
|
| 156 |
-
- Total: 500 labeled samples across 5 task variants
|
| 157 |
|
| 158 |
-
The
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
|
| 163 |
|
| 164 |
-
|
| 165 |
|
| 166 |
-
|
| 167 |
-
class VoiceObservation(BaseModel):
|
| 168 |
-
features: List[float] # 48-dim (zeroed until revealed)
|
| 169 |
-
task_name: str # current task
|
| 170 |
-
step_number: int # current step in episode
|
| 171 |
-
difficulty: str # easy|medium|medium_hard|hard|extreme
|
| 172 |
-
sample_id: int # index into dataset
|
| 173 |
-
hint: Optional[str] # context and guidance
|
| 174 |
-
visible_features: Dict[str, Any] # features revealed so far
|
| 175 |
-
evidence_summary: Optional[str] # from analyze_evidence
|
| 176 |
-
comparison_result: Optional[Dict[str, float]] # from request_comparison
|
| 177 |
-
available_actions: List[str] # valid actions this step
|
| 178 |
-
actions_taken: List[str] # action history
|
| 179 |
-
```
|
| 180 |
|
| 181 |
-
|
| 182 |
|
| 183 |
-
|
|
| 184 |
-
|-------|---------|-------------|
|
| 185 |
-
|
|
| 186 |
-
|
|
| 187 |
-
|
|
| 188 |
-
|
|
| 189 |
-
|
|
| 190 |
-
|
|
| 191 |
-
|
|
| 192 |
-
|
|
| 193 |
|
| 194 |
-
###
|
| 195 |
|
| 196 |
-
- **Jitter**:
|
| 197 |
-
- **Shimmer**:
|
| 198 |
-
- **HNR**:
|
| 199 |
|
| 200 |
---
|
| 201 |
|
| 202 |
-
##
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
confidence: float # [0.05, 0.95] (for final_classify)
|
| 209 |
-
reasoning: str # explanation (for final_classify)
|
| 210 |
-
```
|
| 211 |
|
| 212 |
---
|
| 213 |
|
| 214 |
-
## 📊
|
| 215 |
-
|
| 216 |
-
Agent: `Qwen/Qwen2.5-72B-Instruct` via HuggingFace router
|
| 217 |
-
Protocol: 5-action (temporal → spectral → comparison → analyze → classify)
|
| 218 |
-
Runs: 1 episode per task, seed=7
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
| compressed_detection | Medium | 0.62 | Yes | Codec compression degrades acoustic signal |
|
| 224 |
-
| adversarial_detection | Hard | 0.55 | No | Overlapping distributions challenge classification |
|
| 225 |
-
| streaming_detection | Medium-Hard | 0.30 | No | Streaming noise fooled the LLM at step 1 |
|
| 226 |
-
| phonecall_detection | Extreme | 0.22 | No | Phone-call degradation pushed detection below chance |
|
| 227 |
|
| 228 |
-
|
| 229 |
|
| 230 |
---
|
| 231 |
|
| 232 |
-
## 🔌
|
| 233 |
|
| 234 |
```python
|
| 235 |
from environment.env import VoiceAuthenticityEnv
|
| 236 |
|
| 237 |
env = VoiceAuthenticityEnv(task_name="clean_detection")
|
| 238 |
|
| 239 |
-
#
|
| 240 |
obs = env.reset(seed=42)
|
| 241 |
-
# obs.features
|
| 242 |
-
# obs.available_actions
|
| 243 |
|
| 244 |
-
# Step 1
|
| 245 |
action = {"action_type": "request_temporal_features"}
|
| 246 |
obs, reward, done, info = env.step(action)
|
| 247 |
-
# obs.visible_features["temporal"]["jitter"]
|
| 248 |
-
# reward → 0.10 (shaping: first action is gathering)
|
| 249 |
|
| 250 |
-
# Step 2
|
| 251 |
action = {"action_type": "request_spectral_features"}
|
| 252 |
obs, reward, done, info = env.step(action)
|
| 253 |
-
# obs.visible_features["spectral"]["mfcc_means"]
|
| 254 |
-
# reward → 0.10 (shaping: multi-feature-type bonus)
|
| 255 |
|
| 256 |
-
# Step 3
|
| 257 |
action = {"action_type": "request_comparison"}
|
| 258 |
obs, reward, done, info = env.step(action)
|
| 259 |
-
# obs.comparison_result["
|
| 260 |
-
# obs.comparison_result["closer_to"] → "real"
|
| 261 |
|
| 262 |
-
# Step 4
|
| 263 |
action = {"action_type": "analyze_evidence"}
|
| 264 |
obs, reward, done, info = env.step(action)
|
| 265 |
-
# obs.evidence_summary
|
| 266 |
|
| 267 |
-
# Step 5
|
| 268 |
action = {
|
| 269 |
"action_type": "final_classify",
|
| 270 |
"label": 0,
|
|
@@ -272,16 +245,45 @@ action = {
|
|
| 272 |
"reasoning": "High jitter and shimmer indicate natural vocal cord variation..."
|
| 273 |
}
|
| 274 |
obs, reward, done, info = env.step(action)
|
| 275 |
-
# reward
|
| 276 |
-
# done
|
| 277 |
-
# info["grader_breakdown"] → {correctness: 0.95, calibration: 0.84, ...}
|
| 278 |
|
| 279 |
state = env.state()
|
| 280 |
```
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
---
|
| 283 |
|
| 284 |
-
## 📋
|
| 285 |
|
| 286 |
```
|
| 287 |
[START] task=clean_detection env=voice-authenticity model=Qwen/Qwen2.5-72B-Instruct
|
|
@@ -290,35 +292,54 @@ state = env.state()
|
|
| 290 |
[STEP] step=3 action=request_comparison reward=0.05 done=false error=null
|
| 291 |
[STEP] step=4 action=analyze_evidence reward=0.05 done=false error=null
|
| 292 |
[STEP] step=5 action=final_classify label=0 confidence=0.75 reward=0.74 done=true error=null
|
| 293 |
-
[END] success=true steps=5 score=0.74 rewards=0.10,0.10,0.05,0.05,0.74
|
| 294 |
```
|
| 295 |
|
| 296 |
---
|
| 297 |
|
| 298 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
-
-
|
| 301 |
-
- Real voices
|
| 302 |
-
-
|
| 303 |
-
- Phone call
|
| 304 |
-
-
|
| 305 |
-
-
|
| 306 |
-
- Results may
|
| 307 |
|
| 308 |
-
|
| 309 |
|
| 310 |
---
|
| 311 |
|
| 312 |
-
## 🚀
|
| 313 |
|
| 314 |
-
###
|
| 315 |
```
|
| 316 |
-
Python 3.10
|
| 317 |
Docker
|
| 318 |
-
HuggingFace account
|
| 319 |
```
|
| 320 |
|
| 321 |
-
###
|
| 322 |
```bash
|
| 323 |
git clone https://huggingface.co/spaces/AksharaSharma/voice-authenticity-openenv
|
| 324 |
cd voice-authenticity-openenv
|
|
@@ -329,84 +350,74 @@ python scripts/download_data.py
|
|
| 329 |
python scripts/extract_features.py
|
| 330 |
|
| 331 |
cp .env.example .env
|
| 332 |
-
#
|
| 333 |
|
| 334 |
-
#
|
| 335 |
python app.py
|
| 336 |
|
| 337 |
-
#
|
| 338 |
python inference.py
|
| 339 |
```
|
| 340 |
|
| 341 |
-
###
|
| 342 |
```bash
|
| 343 |
-
|
| 344 |
-
docker run --env-file .env voice-authenticity &
|
| 345 |
-
sleep 10
|
| 346 |
-
curl http://localhost:7860/health
|
| 347 |
-
curl -X POST http://localhost:7860/reset
|
| 348 |
-
python inference.py
|
| 349 |
-
```
|
| 350 |
-
|
| 351 |
-
### Running Tests
|
| 352 |
-
```bash
|
| 353 |
-
# Run all tests
|
| 354 |
pytest test_env.py -v
|
| 355 |
|
| 356 |
-
# Run
|
| 357 |
-
pytest test_env.py::
|
| 358 |
-
pytest test_env.py::test_five_actions_complete_episode -v
|
| 359 |
```
|
| 360 |
|
| 361 |
-
### Environment Variables
|
| 362 |
-
|
| 363 |
-
| Variable | Description | Default |
|
| 364 |
-
|----------|-------------|---------|
|
| 365 |
-
| `API_BASE_URL` | LLM API endpoint | `https://router.huggingface.co/v1` |
|
| 366 |
-
| `MODEL_NAME` | Model identifier | `Qwen/Qwen2.5-72B-Instruct` |
|
| 367 |
-
| `HF_TOKEN` | HuggingFace API token | required |
|
| 368 |
-
| `VOICE_TASK` | Task to run | `clean_detection` |
|
| 369 |
-
| `ENV_SERVER_URL` | Environment server URL | `http://localhost:7860` |
|
| 370 |
-
|
| 371 |
### Docker
|
| 372 |
```bash
|
| 373 |
docker build -t voice-authenticity .
|
| 374 |
docker run --env-file .env voice-authenticity
|
| 375 |
```
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
---
|
| 378 |
|
| 379 |
-
## 📁 Project
|
|
|
|
| 380 |
```
|
| 381 |
voice-authenticity-openenv/
|
| 382 |
├── environment/
|
| 383 |
│ ├── __init__.py
|
| 384 |
-
│ ├── env.py #
|
| 385 |
-
│ ├── models.py #
|
| 386 |
-
│ ├── graders.py # 6-
|
| 387 |
│ └── data/
|
| 388 |
-
│ ├── features.npy #
|
| 389 |
-
│ ├── features_compressed.npy #
|
| 390 |
-
│ ├── features_adversarial.npy#
|
| 391 |
-
│ ├── features_streaming.npy #
|
| 392 |
-
│ ├── features_phonecall.npy #
|
| 393 |
-
│ ├── features_raw.npy # unnormalized values
|
| 394 |
-
│ ├── labels.npy #
|
| 395 |
│ ├── labels_compressed.npy
|
| 396 |
│ ├── labels_adversarial.npy
|
| 397 |
│ ├── labels_streaming.npy
|
| 398 |
│ └── labels_phonecall.npy
|
| 399 |
├── scripts/
|
| 400 |
-
│ ├── download_data.py #
|
| 401 |
-
│ └── extract_features.py # audio
|
| 402 |
├── server/
|
| 403 |
-
│ └── app.py #
|
| 404 |
-
├── Dashboard.html #
|
| 405 |
-
├── app.py # FastAPI server (serves
|
| 406 |
-
├── inference.py #
|
| 407 |
-
├── test_env.py #
|
| 408 |
-
├── openenv.yaml #
|
| 409 |
-
├── pyproject.toml #
|
| 410 |
├── Dockerfile
|
| 411 |
├── requirements.txt
|
| 412 |
└── README.md
|
|
@@ -416,53 +427,51 @@ voice-authenticity-openenv/
|
|
| 416 |
|
| 417 |
## 🖥️ Web Dashboard
|
| 418 |
|
| 419 |
-
`Dashboard.html` is a
|
| 420 |
|
| 421 |
-
- **
|
| 422 |
-
- **Task
|
| 423 |
-
- **
|
| 424 |
-
- **Step
|
| 425 |
|
| 426 |
-
The dashboard uses no external
|
| 427 |
|
| 428 |
---
|
| 429 |
|
| 430 |
## 🧪 Test Suite
|
| 431 |
|
| 432 |
-
|
| 433 |
|
| 434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|------|-------------------|
|
| 438 |
-
| `test_reset_returns_observation` | `reset()` returns a valid `VoiceObservation` with step 0, correct task name, and hint |
|
| 439 |
-
| `test_step_returns_reward_in_range` | Rewards from `step()` are always in [0.05, 0.95] — never exactly 0.0 or 1.0 |
|
| 440 |
-
| `test_five_actions_complete_episode` | The full 5-action protocol (temporal → spectral → comparison → analyze → classify) completes an episode with `done=True` |
|
| 441 |
-
| `test_reward_never_zero_or_one` | Explicit check that no step returns a boundary reward of exactly 0.0 or 1.0 |
|
| 442 |
-
| `test_all_five_tasks_load` | All 5 task variants (`clean`, `compressed`, `adversarial`, `streaming`, `phonecall`) load successfully and return valid observations |
|
| 443 |
-
|
| 444 |
-
Run: `pytest test_env.py -v`
|
| 445 |
|
| 446 |
---
|
| 447 |
|
| 448 |
-
## 🔬
|
| 449 |
-
|
| 450 |
-
### Feature Extraction
|
| 451 |
|
| 452 |
```mermaid
|
| 453 |
flowchart TD
|
| 454 |
-
A["🎤 Raw Audio
|
| 455 |
A --> C["parselmouth / Praat"]
|
| 456 |
|
| 457 |
-
B --> D["MFCC
|
| 458 |
-
C --> E["Jitter
|
| 459 |
|
| 460 |
-
D --> F["
|
| 461 |
E --> F
|
| 462 |
|
| 463 |
-
F --> G["
|
| 464 |
|
| 465 |
-
G --> H["
|
| 466 |
|
| 467 |
H --> I["Clean\nfeatures.npy"]
|
| 468 |
H --> J["Compressed\nfeatures_compressed.npy"]
|
|
@@ -485,17 +494,20 @@ flowchart TD
|
|
| 485 |
style M fill:#1a0010,stroke:#d946ef,color:#f5d0fe
|
| 486 |
```
|
| 487 |
|
| 488 |
-
###
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
-
###
|
| 492 |
-
|
| 493 |
|
| 494 |
-
###
|
| 495 |
-
|
| 496 |
|
| 497 |
-
###
|
| 498 |
-
|
| 499 |
|
| 500 |
---
|
| 501 |
|
|
|
|
| 13 |
- audio
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# 🎙️ Voice Authenticity Detection
|
| 17 |
|
| 18 |
+
## What Is This?
|
| 19 |
|
| 20 |
+
Fake voices are a huge problem. Tools like ElevenLabs can copy anyone's voice in under a minute. Scammers use these cloned voices to steal money, trick people over the phone, and spread false information. These scams cost the world over $25 billion in 2024 alone.
|
| 21 |
|
| 22 |
+
This project is a training ground for AI agents that learn to tell the difference between **real human voices** and **AI-generated fake voices**.
|
| 23 |
|
| 24 |
+
But here is the key part: the agent does not just get all the data at once and make a guess. Instead, it has to **investigate step by step**, like a detective. It starts with zero information and has to decide what clues to look for, put them together, and then make a judgment call.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
## How Does the Agent Work?
|
| 27 |
|
| 28 |
+
The agent follows a simple investigation process. Think of it like a detective solving a case:
|
| 29 |
+
|
| 30 |
+
| Step | What the Agent Does | What It Gets Back | Why This Helps |
|
| 31 |
+
|------|-------------------|------------------|---------------|
|
| 32 |
+
| 1 | Ask for voice stability clues | Jitter, shimmer, HNR (how shaky or smooth the voice is) | Real voices have natural wobbles. Fake voices are too perfect. |
|
| 33 |
+
| 2 | Ask for sound shape clues | 20 MFCC values, zero crossing rate, spectral centroid | These describe the "texture" and "color" of the voice. |
|
| 34 |
+
| 3 | Compare to known examples | How similar this voice is to known real and fake voices | Like comparing a signature to ones you have on file. |
|
| 35 |
+
| 4 | Think about all the clues | A summary of everything gathered so far, with a recommendation | The agent puts the puzzle together before deciding. |
|
| 36 |
+
| 5 | Make a final decision | Submits: real or fake, how confident it is, and why | This is where the agent gets scored. |
|
| 37 |
|
| 38 |
+
The agent starts with **nothing visible**. It has to earn its information before it can decide. This is what makes it different from a regular classifier that sees everything at once.
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
|
| 42 |
+
## 🚫 Why Other Tests Fall Short
|
| 43 |
|
| 44 |
+
Other voice detection tests (like ASVspoof and ADD) work like this: give the AI all the data, let it make one guess, and check if it is right or wrong. That is it.
|
| 45 |
|
| 46 |
+
That approach cannot test:
|
| 47 |
+
- Whether the AI knows **which clues to look for**
|
| 48 |
+
- Whether the AI can **put different types of evidence together**
|
| 49 |
+
- Whether the AI is **honest about how confident it is** (saying "I'm not sure" when it really is not sure)
|
| 50 |
+
- Whether the AI can handle **messy real-world audio** like phone calls and streaming
|
|
|
|
| 51 |
|
| 52 |
+
This environment tests all of those things.
|
| 53 |
|
| 54 |
---
|
| 55 |
|
| 56 |
+
## 🌍 Why This Matters in the Real World
|
| 57 |
|
| 58 |
+
AI-generated voices are being used for:
|
| 59 |
|
| 60 |
+
- **Phone scams**: cloning someone's voice during a live call
|
| 61 |
+
- **Fake audio clips**: putting false words in a public figure's mouth
|
| 62 |
+
- **Identity theft**: tricking voice-based security systems (like bank phone lines)
|
| 63 |
+
- **CEO fraud**: cloning a boss's voice to trick employees into sending money
|
| 64 |
+
- **Insurance fraud**: creating fake recorded statements
|
| 65 |
|
| 66 |
+
This project gives AI agents a way to practice catching these fakes under realistic conditions.
|
| 67 |
|
| 68 |
---
|
| 69 |
|
| 70 |
+
## 🏗️ How the Environment Works
|
| 71 |
|
| 72 |
+
The environment gives the agent a set of 48 numbers (features) extracted from an audio clip. But the agent cannot see them right away. It has to request them step by step, building up its picture before making a decision.
|
| 73 |
|
| 74 |
+
This creates a real decision-making challenge where the agent must:
|
| 75 |
+
- Choose what information to ask for and in what order
|
| 76 |
+
- Combine different types of clues
|
| 77 |
+
- Be honest about how certain (or uncertain) it is
|
| 78 |
+
- Follow a logical investigation path
|
| 79 |
|
| 80 |
---
|
| 81 |
|
| 82 |
+
## 🏆 The 6 Tasks
|
| 83 |
|
| 84 |
+
There are 6 tasks. The first five get progressively harder and test whether an agent can read a signal correctly; the sixth tests whether it knows when it has read enough. The harder tasks usually have messier audio, which makes fake voices harder to detect.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
| Task | How Hard | Expected Score | What Makes It Different |
|
| 87 |
+
|------|----------|---------------|----------------------|
|
| 88 |
+
| `clean_detection` | Easy | 0.65 to 0.78 | Clean, clear audio. The clues are easy to spot. |
|
| 89 |
+
| `compressed_detection` | Medium | 0.50 to 0.65 | Audio has been compressed (like an MP3). Some details get lost. |
|
| 90 |
+
| `adversarial_detection` | Hard | 0.40 to 0.58 | The fake voices have been tweaked to look more like real ones. Very tricky. |
|
| 91 |
+
| `streaming_detection` | Medium-Hard | 0.38 to 0.55 | Early clues are noisy and unreliable. Later clues get cleaner. |
|
| 92 |
+
| `phonecall_detection` | Extreme | 0.25 to 0.42 | Simulates a real phone call with bad audio quality and background noise. |
|
| 93 |
+
| `realtime_detection` | Realtime | 0.50 to 0.68 | The agent can decide early, but every extra step costs points. Tests speed vs accuracy. |
|
| 94 |
|
| 95 |
+
### Why Harder Tasks Get Lower Scores
|
| 96 |
|
| 97 |
+
This is on purpose. Harder tasks have genuinely worse audio quality, which means even a perfect agent will score lower. The scoring system accounts for this, so a score of 0.35 on the phone call task might actually be impressive, while 0.60 on the clean task would be average.
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
### The Realtime Detection Task (New!)
|
| 100 |
|
| 101 |
+
This task changes the rules. Instead of following a fixed 5-step sequence, the agent can make its final decision **at any point after step 2**.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
But there is a catch: **every extra step costs 0.03 points** off the final score.
|
| 104 |
|
| 105 |
+
Here is how it works:
|
| 106 |
+
- The agent MUST take at least 2 steps to gather evidence (steps 1 and 2)
|
| 107 |
+
- After that, the agent can classify whenever it wants
|
| 108 |
+
- Step 3 costs 0.03, step 4 costs 0.06, step 5 costs 0.09, and so on
|
| 109 |
+
- A smart agent will classify as soon as it feels confident enough, instead of always going through every single step
|
|
|
|
|
|
|
| 110 |
|
| 111 |
+
This tests a completely different skill: **knowing when to stop investigating**. Some agents will jump to conclusions too early and get the wrong answer. Others will keep gathering evidence they do not need and lose points to the time penalty. The best agents find the sweet spot.
|
| 112 |
|
| 113 |
+
This task is not harder because the audio is bad. It uses the same clean audio data as the easy task. The challenge is purely about decision timing and choosing the right moment to stop. No extra data or computing power is needed.
|
| 114 |
|
| 115 |
---
|
| 116 |
|
| 117 |
+
## 🏅 How Scoring Works (6 Parts)
|
| 118 |
|
| 119 |
+
Every episode is scored across 6 different areas. The weight of each area changes depending on how hard the task is.
|
| 120 |
|
| 121 |
+
| What Gets Scored | What It Means | Easy | Medium | Hard | Extreme | Realtime |
|
| 122 |
+
|-----------------|--------------|------|--------|------|---------|----------|
|
| 123 |
+
| **Correctness** | Did the agent get the right answer? | 0.40 | 0.30 | 0.25 | 0.20 | 0.35 |
|
| 124 |
+
| **Confidence** | Was the agent honest about its certainty? | 0.15 | 0.20 | 0.25 | 0.25 | 0.20 |
|
| 125 |
+
| **Investigation Quality** | Did the agent gather, analyze, then classify? | 0.10 | 0.15 | 0.18 | 0.20 | 0.10 |
|
| 126 |
+
| **Feature Use** | Did the agent request enough types of clues? | 0.15 | 0.15 | 0.12 | 0.15 | 0.15 |
|
| 127 |
+
| **Reasoning** | Does the explanation match the answer? | 0.10 | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 128 |
+
| **Action Order** | Did the agent follow a logical sequence? | 0.10 | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 129 |
|
| 130 |
+
After scoring, a **difficulty multiplier** is applied:
|
| 131 |
|
| 132 |
+
| Difficulty | Multiplier | Best Possible Score |
|
| 133 |
+
|-----------|-----------|-------------------|
|
| 134 |
+
| Easy | 0.78 | about 0.73 |
|
| 135 |
+
| Medium | 0.66 | about 0.61 |
|
| 136 |
+
| Hard | 0.59 | about 0.55 |
|
| 137 |
+
| Medium-Hard | 0.55 | about 0.51 |
|
| 138 |
+
| Extreme | 0.41 | about 0.38 |
|
| 139 |
+
| Realtime | 0.72 | about 0.68 (before time penalty) |
|
| 140 |
|
| 141 |
+
### Why This Scoring System Matters
|
| 142 |
|
| 143 |
+
On easy tasks, getting the right answer matters most. On hard tasks, being honest about uncertainty and following a good investigation process become just as important. This mirrors real life: in fraud detection, a confident wrong answer is more dangerous than an uncertain one, and rushing to judgment without proper investigation is a liability.
|
| 144 |
|
| 145 |
+
For the realtime task, the time penalty is applied ON TOP of the difficulty multiplier. So the effective max score drops by 0.03 for every extra step beyond step 2.
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
---
|
| 148 |
|
| 149 |
+
## 🎁 Rewards During Investigation
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
+
The agent gets small rewards (and penalties) during the investigation, not just at the end:
|
| 152 |
|
| 153 |
+
| What Happened | Points |
|
| 154 |
+
|--------------|--------|
|
| 155 |
+
| First action is gathering evidence | +0.05 |
|
| 156 |
+
| Asked for both voice stability AND sound shape clues | +0.05 |
|
| 157 |
+
| Analyzed evidence before making a decision | +0.05 |
|
| 158 |
+
| Jumped straight to a decision without gathering anything | -0.10 |
|
| 159 |
+
| Repeated the exact same action twice in a row | -0.05 |
|
| 160 |
+
| Explanation contradicts the chosen answer | -0.10 |
|
| 161 |
|
| 162 |
+
These small rewards teach the agent good investigation habits, not just correct answers.
|
| 163 |
|
| 164 |
+
---
|
| 165 |
|
| 166 |
+
## What Are the 48 Features?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
+
Each audio clip is described by 48 numbers:
|
| 169 |
|
| 170 |
+
| Numbers | What They Measure | Simple Explanation |
|
| 171 |
+
|---------|------------------|-------------------|
|
| 172 |
+
| 1 to 20 | MFCC averages | The overall "shape" and "color" of the voice |
|
| 173 |
+
| 21 to 40 | MFCC variation | How much the voice texture changes over time |
|
| 174 |
+
| 41 | Zero crossing rate | How often the sound wave crosses the zero line |
|
| 175 |
+
| 42 | Spectral centroid | How "bright" or "dark" the voice sounds |
|
| 176 |
+
| 43 | Jitter | How wobbly the voice pitch is (real voices wobble more) |
|
| 177 |
+
| 44 | Shimmer | How much the loudness changes beat to beat |
|
| 178 |
+
| 45 | HNR | How "clean" vs "noisy" the voice is (fakes are too clean) |
|
| 179 |
+
| 46 to 48 | Compression clues | Spectral bandwidth, rolloff, and energy level |
|
| 180 |
|
| 181 |
+
### The Three Most Important Clues
|
| 182 |
|
| 183 |
+
- **Jitter**: Real voices have natural pitch wobbles. Fake voices are too steady.
|
| 184 |
+
- **Shimmer**: Real voices have natural loudness changes. Fake voices are too uniform.
|
| 185 |
+
- **HNR**: Real voices have some noise in them. Fake voices are unnaturally clean.
|
| 186 |
|
| 187 |
---
|
| 188 |
|
| 189 |
+
## Why Use Numbers Instead of Raw Audio?
|
| 190 |
|
| 191 |
+
- The competition has strict limits: 2 CPUs and 8GB of memory
|
| 192 |
+
- Processing raw audio files would be too slow and heavy
|
| 193 |
+
- Numbers let the AI agent reason about voice characteristics using language (something it is good at)
|
| 194 |
+
- Feature extraction is done once ahead of time, so evaluation is fast
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
---
|
| 197 |
|
| 198 |
+
## 📊 Dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
+
- 250 real speech samples from human recordings
|
| 201 |
+
- 250 synthetic speech samples from AI voice generators (ElevenLabs, Hume AI, and others)
|
| 202 |
+
- 500 total samples across 6 task versions
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
+
The dataset is designed to test the evaluation and scoring system, not to be huge. The same pipeline can handle much larger datasets for real-world use.
|
| 205 |
|
| 206 |
---
|
| 207 |
|
| 208 |
+
## 🔌 How to Use the Code
|
| 209 |
|
| 210 |
```python
|
| 211 |
from environment.env import VoiceAuthenticityEnv
|
| 212 |
|
| 213 |
env = VoiceAuthenticityEnv(task_name="clean_detection")
|
| 214 |
|
| 215 |
+
# Start a new episode (the agent sees nothing yet)
|
| 216 |
obs = env.reset(seed=42)
|
| 217 |
+
# obs.features = [0.05, 0.05, ..., 0.05] (all hidden)
|
| 218 |
+
# obs.available_actions = ["request_temporal_features", ...]
|
| 219 |
|
| 220 |
+
# Step 1: ask for voice stability clues
|
| 221 |
action = {"action_type": "request_temporal_features"}
|
| 222 |
obs, reward, done, info = env.step(action)
|
| 223 |
+
# obs.visible_features["temporal"]["jitter"] = 0.032451
|
|
|
|
| 224 |
|
| 225 |
+
# Step 2: ask for sound shape clues
|
| 226 |
action = {"action_type": "request_spectral_features"}
|
| 227 |
obs, reward, done, info = env.step(action)
|
| 228 |
+
# obs.visible_features["spectral"]["mfcc_means"] = [20 values]
|
|
|
|
| 229 |
|
| 230 |
+
# Step 3: compare to known examples
|
| 231 |
action = {"action_type": "request_comparison"}
|
| 232 |
obs, reward, done, info = env.step(action)
|
| 233 |
+
# obs.comparison_result["closer_to"] = "real"
|
|
|
|
| 234 |
|
| 235 |
+
# Step 4: analyze all the evidence
|
| 236 |
action = {"action_type": "analyze_evidence"}
|
| 237 |
obs, reward, done, info = env.step(action)
|
| 238 |
+
# obs.evidence_summary = "Evidence analysis (3 sources): ..."
|
| 239 |
|
| 240 |
+
# Step 5: make the final call
|
| 241 |
action = {
|
| 242 |
"action_type": "final_classify",
|
| 243 |
"label": 0,
|
|
|
|
| 245 |
"reasoning": "High jitter and shimmer indicate natural vocal cord variation..."
|
| 246 |
}
|
| 247 |
obs, reward, done, info = env.step(action)
|
| 248 |
+
# reward = 0.73 (the final graded score)
|
| 249 |
+
# done = True (episode over)
|
|
|
|
| 250 |
|
| 251 |
state = env.state()
|
| 252 |
```
|
| 253 |
|
| 254 |
+
### Realtime Detection Example
|
| 255 |
+
|
| 256 |
+
```python
|
| 257 |
+
env = VoiceAuthenticityEnv(task_name="realtime_detection")
|
| 258 |
+
obs = env.reset(seed=42)
|
| 259 |
+
|
| 260 |
+
# Step 1: gather temporal features
|
| 261 |
+
obs, reward, done, info = env.step({"action_type": "request_temporal_features"})
|
| 262 |
+
# final_classify is NOT available yet (need at least 2 steps first)
|
| 263 |
+
|
| 264 |
+
# Step 2: gather spectral features
|
| 265 |
+
obs, reward, done, info = env.step({"action_type": "request_spectral_features"})
|
| 266 |
+
# final_classify is NOW available
|
| 267 |
+
# The hint tells you: "You can classify now"
|
| 268 |
+
|
| 269 |
+
# Step 3: classify right away (only 1 extra step = -0.03 penalty)
|
| 270 |
+
obs, reward, done, info = env.step({
|
| 271 |
+
"action_type": "final_classify",
|
| 272 |
+
"label": 0,
|
| 273 |
+
"confidence": 0.80,
|
| 274 |
+
"reasoning": "Jitter and shimmer patterns suggest real speech"
|
| 275 |
+
})
|
| 276 |
+
# reward = grader_score - 0.03 time penalty
|
| 277 |
+
# info["realtime_time_penalty"] = 0.03
|
| 278 |
+
# info["realtime_extra_steps"] = 1
|
| 279 |
+
|
| 280 |
+
# If you had taken 2 more steps before classifying:
|
| 281 |
+
# penalty would be 0.09 (3 extra steps x 0.03)
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
---
|
| 285 |
|
| 286 |
+
## 📋 Log Output Format
|
| 287 |
|
| 288 |
```
|
| 289 |
[START] task=clean_detection env=voice-authenticity model=Qwen/Qwen2.5-72B-Instruct
|
|
|
|
| 292 |
[STEP] step=3 action=request_comparison reward=0.05 done=false error=null
|
| 293 |
[STEP] step=4 action=analyze_evidence reward=0.05 done=false error=null
|
| 294 |
[STEP] step=5 action=final_classify label=0 confidence=0.75 reward=0.74 done=true error=null
|
| 295 |
+
[END] success=true steps=5 score=0.74 rewards=0.10,0.10,0.05,0.05,0.74
|
| 296 |
```
|
| 297 |
|
| 298 |
---
|
| 299 |
|
| 300 |
+
## 📊 Baseline Scores
|
| 301 |
+
|
| 302 |
+
Agent: `Qwen/Qwen2.5-72B-Instruct` via HuggingFace router
|
| 303 |
+
Protocol: 5-action sequence for standard tasks, 3-step quick classify for realtime
|
| 304 |
+
Runs: 1 episode per task, seed=7
|
| 305 |
+
|
| 306 |
+
| Task | How Hard | Score | Passed? | Notes |
|
| 307 |
+
|------|----------|-------|---------|-------|
|
| 308 |
+
| clean_detection | Easy | 0.74 | Yes | Clean audio, easy to detect |
|
| 309 |
+
| compressed_detection | Medium | 0.62 | Yes | Compression hides some clues |
|
| 310 |
+
| adversarial_detection | Hard | 0.55 | No | Fake voices designed to fool detection |
|
| 311 |
+
| streaming_detection | Medium-Hard | 0.30 | No | Noisy early data fooled the model |
|
| 312 |
+
| phonecall_detection | Extreme | 0.22 | No | Phone audio too degraded for reliable detection |
|
| 313 |
+
| realtime_detection | Realtime | TBD | TBD | Quick classify with minimal time penalty |
|
| 314 |
+
|
| 315 |
+
Scores go down as tasks get harder. This is by design. Harder tasks have genuinely worse audio quality, so even a perfect agent scores lower.
|
| 316 |
+
|
| 317 |
+
---
|
| 318 |
+
|
| 319 |
+
## Known Problems and Limitations
|
| 320 |
|
| 321 |
+
- Fake voices with added background noise can dodge the stability checks
|
| 322 |
+
- Real voices recorded in a professional studio can look like fake voices
|
| 323 |
+
- On the hardest tasks, real and fake voices look almost identical in the data
|
| 324 |
+
- Phone call audio is so degraded that detection is close to random guessing
|
| 325 |
+
- The streaming task adds noise to early steps, so agents that do not adapt get fooled
|
| 326 |
+
- 500 samples is enough for testing the system, but not for production use
|
| 327 |
+
- Results may differ for voices in different languages or accents
|
| 328 |
|
| 329 |
+
The scoring system and investigation pipeline are ready for real-world use. The dataset is a research prototype that can be replaced with larger enterprise data.
|
| 330 |
|
| 331 |
---
|
| 332 |
|
| 333 |
+
## 🚀 Getting Started
|
| 334 |
|
| 335 |
+
### What You Need
|
| 336 |
```
|
| 337 |
+
Python 3.10 or newer
|
| 338 |
Docker
|
| 339 |
+
A HuggingFace account
|
| 340 |
```
|
| 341 |
|
| 342 |
+
### Setting Up Locally
|
| 343 |
```bash
|
| 344 |
git clone https://huggingface.co/spaces/AksharaSharma/voice-authenticity-openenv
|
| 345 |
cd voice-authenticity-openenv
|
|
|
|
| 350 |
python scripts/extract_features.py
|
| 351 |
|
| 352 |
cp .env.example .env
|
| 353 |
+
# Open .env and add your HF_TOKEN
|
| 354 |
|
| 355 |
+
# In one terminal, start the server:
|
| 356 |
python app.py
|
| 357 |
|
| 358 |
+
# In another terminal, run the agent:
|
| 359 |
python inference.py
|
| 360 |
```
|
| 361 |
|
| 362 |
+
### Testing
|
| 363 |
```bash
|
| 364 |
+
# Run all 7 tests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
pytest test_env.py -v
|
| 366 |
|
| 367 |
+
# Run one specific test
|
| 368 |
+
pytest test_env.py::test_realtime_classify_after_step_2 -v
|
|
|
|
| 369 |
```
|
| 370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
### Docker
|
| 372 |
```bash
|
| 373 |
docker build -t voice-authenticity .
|
| 374 |
docker run --env-file .env voice-authenticity
|
| 375 |
```
|
| 376 |
|
| 377 |
+
### Settings
|
| 378 |
+
|
| 379 |
+
| Setting | What It Does | Default |
|
| 380 |
+
|---------|-------------|---------|
|
| 381 |
+
| `API_BASE_URL` | Where to find the AI model | `https://router.huggingface.co/v1` |
|
| 382 |
+
| `MODEL_NAME` | Which AI model to use | `Qwen/Qwen2.5-72B-Instruct` |
|
| 383 |
+
| `HF_TOKEN` | Your HuggingFace login token | (required) |
|
| 384 |
+
| `VOICE_TASK` | Which task to run | `clean_detection` |
|
| 385 |
+
| `ENV_SERVER_URL` | Where the environment server is running | `http://localhost:7860` |
|
| 386 |
+
|
| 387 |
---
|
| 388 |
|
| 389 |
+
## 📁 Project Files
|
| 390 |
+
|
| 391 |
```
|
| 392 |
voice-authenticity-openenv/
|
| 393 |
├── environment/
|
| 394 |
│ ├── __init__.py
|
| 395 |
+
│ ├── env.py # The main environment with all 6 tasks
|
| 396 |
+
│ ├── models.py # Data models for observations, actions, and rewards
|
| 397 |
+
│ ├── graders.py # 6-part scoring system with difficulty adjustments
|
| 398 |
│ └── data/
|
| 399 |
+
│ ├── features.npy # Clean features (500 x 48)
|
| 400 |
+
│ ├── features_compressed.npy # Compressed audio features
|
| 401 |
+
│ ├── features_adversarial.npy# Tricky adversarial features
|
| 402 |
+
│ ├── features_streaming.npy # Streaming audio features
|
| 403 |
+
│ ├── features_phonecall.npy # Phone call audio features
|
| 404 |
+
│ ├── features_raw.npy # Original unnormalized values
|
| 405 |
+
│ ├── labels.npy # Correct answers (used by clean + realtime)
|
| 406 |
│ ├── labels_compressed.npy
|
| 407 |
│ ├── labels_adversarial.npy
|
| 408 |
│ ├── labels_streaming.npy
|
| 409 |
│ └── labels_phonecall.npy
|
| 410 |
├── scripts/
|
| 411 |
+
│ ├── download_data.py # Downloads the audio dataset
|
| 412 |
+
│ └── extract_features.py # Turns audio files into feature numbers
|
| 413 |
├── server/
|
| 414 |
+
│ └── app.py # Server entry point
|
| 415 |
+
├── Dashboard.html # Interactive web dashboard
|
| 416 |
+
├── app.py # FastAPI server (serves the dashboard and API)
|
| 417 |
+
├── inference.py # The AI agent that runs all 6 tasks
|
| 418 |
+
├── test_env.py # 7 tests to make sure everything works
|
| 419 |
+
├── openenv.yaml # Environment specification (6 tasks)
|
| 420 |
+
├── pyproject.toml # Project settings
|
| 421 |
├── Dockerfile
|
| 422 |
├── requirements.txt
|
| 423 |
└── README.md
|
|
|
|
| 427 |
|
| 428 |
## 🖥️ Web Dashboard
|
| 429 |
|
| 430 |
+
`Dashboard.html` is a standalone web page that shows the environment in action. When the server is running, visit `/` or `/web` to see:
|
| 431 |
|
| 432 |
+
- **Live Investigation Simulation**: watch the agent go through its investigation steps in real time
|
| 433 |
+
- **Task Difficulty Overview**: all 6 tasks with their difficulty levels and expected scores
|
| 434 |
+
- **Score Breakdown**: click any task to see exactly how it was scored across all 6 components
|
| 435 |
+
- **Step by Step Walkthrough**: the full investigation process with reward information at each step
|
| 436 |
|
| 437 |
+
The dashboard uses no external tools or libraries. It is pure HTML, CSS, and JavaScript.
|
| 438 |
|
| 439 |
---
|
| 440 |
|
| 441 |
## 🧪 Test Suite
|
| 442 |
|
| 443 |
+
7 tests that check everything works correctly:
|
| 444 |
|
| 445 |
+
| Test Name | What It Checks |
|
| 446 |
+
|-----------|---------------|
|
| 447 |
+
| `test_reset_returns_observation` | Starting a new episode gives back proper initial data |
|
| 448 |
+
| `test_step_returns_reward_in_range` | Rewards are always between 0.05 and 0.95 |
|
| 449 |
+
| `test_five_actions_complete_episode` | The full 5-step investigation finishes properly |
|
| 450 |
+
| `test_reward_never_zero_or_one` | No reward is ever exactly 0.0 or exactly 1.0 |
|
| 451 |
+
| `test_all_tasks_load` | All 6 tasks start up correctly |
|
| 452 |
+
| `test_realtime_classify_after_step_2` | Realtime task allows early classification after step 2 with time penalty |
|
| 453 |
+
| `test_realtime_no_penalty_at_step_2` | Verifies the time penalty math is correct |
|
| 454 |
|
| 455 |
+
Run them with: `pytest test_env.py -v`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
| 457 |
---
|
| 458 |
|
| 459 |
+
## 🔬 How the Audio Processing Works
|
|
|
|
|
|
|
| 460 |
|
| 461 |
```mermaid
|
| 462 |
flowchart TD
|
| 463 |
+
A["🎤 Raw Audio Files"] --> B["librosa"]
|
| 464 |
A --> C["parselmouth / Praat"]
|
| 465 |
|
| 466 |
+
B --> D["MFCC Averages (20)\nMFCC Variation (20)\nZero Crossing Rate\nSpectral Centroid\nBandwidth, Rolloff, RMS"]
|
| 467 |
+
C --> E["Jitter, Shimmer, HNR"]
|
| 468 |
|
| 469 |
+
D --> F["Combine into 48 numbers"]
|
| 470 |
E --> F
|
| 471 |
|
| 472 |
+
F --> G["Normalize the values"]
|
| 473 |
|
| 474 |
+
G --> H["Final 48-number feature vector"]
|
| 475 |
|
| 476 |
H --> I["Clean\nfeatures.npy"]
|
| 477 |
H --> J["Compressed\nfeatures_compressed.npy"]
|
|
|
|
| 494 |
style M fill:#1a0010,stroke:#d946ef,color:#f5d0fe
|
| 495 |
```
|
| 496 |
|
| 497 |
+
### Task 2: Compressed Audio
|
| 498 |
+
Audio compression (like MP3 encoding) squashes variation in the MFCC values and reduces the jitter and shimmer signals. This makes it harder to tell real from fake because some of the key differences get smoothed out.
|
| 499 |
+
|
| 500 |
+
### Task 3: Adversarial Audio
|
| 501 |
+
The fake voices have been carefully tweaked so their numbers fall right in the same range as real voices. And 8% of the labels are intentionally wrong, simulating the kind of disagreement you see in real-world data. No simple threshold can separate real from fake.
|
| 502 |
|
| 503 |
+
### Task 4: Streaming Audio
|
| 504 |
+
Two layers of audio degradation are applied. First, the saved features are slightly damaged. Second, the environment adds extra noise at runtime that gets weaker as the agent takes more steps. Early readings are unreliable, later ones are cleaner. Smart agents learn to account for this.
|
| 505 |
|
| 506 |
+
### Task 5: Phone Call Audio
|
| 507 |
+
The most aggressive degradation. High-frequency MFCC values are zeroed out (simulating narrowband phone codecs), variation is flattened, random noise is injected, HNR is severely damaged, and energy levels fluctuate (simulating packet loss). This pushes detection to the edge of what is possible.
|
| 508 |
|
| 509 |
+
### Task 6: Realtime Detection
|
| 510 |
+
Uses the same clean audio as Task 1, but changes the decision structure. The agent does not follow a fixed protocol. Instead, it has to decide: "Do I have enough evidence, or should I keep investigating?" Every extra step costs 0.03 points. This task does not have bad signal quality. It is entirely a test of decision timing and efficient investigation. No extra data or processing needed.
|
| 511 |
|
| 512 |
---
|
| 513 |
|
__pycache__/test_env.cpython-310-pytest-9.0.3.pyc
CHANGED
|
Binary files a/__pycache__/test_env.cpython-310-pytest-9.0.3.pyc and b/__pycache__/test_env.cpython-310-pytest-9.0.3.pyc differ
|
|
|
app.py
CHANGED
|
@@ -22,6 +22,7 @@ TASKS = [
|
|
| 22 |
"adversarial_detection",
|
| 23 |
"streaming_detection",
|
| 24 |
"phonecall_detection",
|
|
|
|
| 25 |
]
|
| 26 |
|
| 27 |
envs = {task: VoiceAuthenticityEnv(task) for task in TASKS}
|
|
|
|
| 22 |
"adversarial_detection",
|
| 23 |
"streaming_detection",
|
| 24 |
"phonecall_detection",
|
| 25 |
+
"realtime_detection",
|
| 26 |
]
|
| 27 |
|
| 28 |
envs = {task: VoiceAuthenticityEnv(task) for task in TASKS}
|
environment/__pycache__/env.cpython-310.pyc
CHANGED
|
Binary files a/environment/__pycache__/env.cpython-310.pyc and b/environment/__pycache__/env.cpython-310.pyc differ
|
|
|
environment/__pycache__/graders.cpython-310.pyc
CHANGED
|
Binary files a/environment/__pycache__/graders.cpython-310.pyc and b/environment/__pycache__/graders.cpython-310.pyc differ
|
|
|
environment/env.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Voice Authenticity Detection Environment —
|
| 3 |
|
| 4 |
Actions:
|
| 5 |
request_temporal_features — reveals jitter, shimmer, HNR
|
|
@@ -12,6 +12,13 @@ Partial observability: the agent starts with NO features visible and must
|
|
| 12 |
actively query the environment to build its picture before classifying.
|
| 13 |
|
| 14 |
Step-level rewards provide shaping signals throughout the episode.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
|
| 17 |
import numpy as np
|
|
@@ -27,6 +34,7 @@ TASKS = [
|
|
| 27 |
"adversarial_detection",
|
| 28 |
"streaming_detection",
|
| 29 |
"phonecall_detection",
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
DIFFICULTY_MAP = {
|
|
@@ -35,6 +43,7 @@ DIFFICULTY_MAP = {
|
|
| 35 |
"adversarial_detection": "hard",
|
| 36 |
"streaming_detection": "medium_hard",
|
| 37 |
"phonecall_detection": "extreme",
|
|
|
|
| 38 |
}
|
| 39 |
|
| 40 |
DATA_FILES = {
|
|
@@ -58,10 +67,18 @@ DATA_FILES = {
|
|
| 58 |
"environment/data/features_phonecall.npy",
|
| 59 |
"environment/data/labels_phonecall.npy",
|
| 60 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
|
| 63 |
MAX_STEPS = 6 # 5 actions + 1 buffer
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
# ── Step-level reward constants ─────────────────────────────────────────
|
| 66 |
|
| 67 |
REWARD_FIRST_ACTION_GATHER = 0.05 # first action is a feature request
|
|
@@ -181,6 +198,16 @@ class VoiceAuthenticityEnv:
|
|
| 181 |
f"Unknown action_type: {action_type}. Valid: {valid_actions}"
|
| 182 |
)
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
# Track action
|
| 185 |
self.action_history.append(action_type)
|
| 186 |
self.step_number += 1
|
|
@@ -214,7 +241,7 @@ class VoiceAuthenticityEnv:
|
|
| 214 |
self.done = True
|
| 215 |
info["message"] = "Max steps reached. Episode ended."
|
| 216 |
|
| 217 |
-
return obs, round(step_reward,
|
| 218 |
|
| 219 |
def state(self) -> dict:
|
| 220 |
"""Return full environment state for debugging."""
|
|
@@ -435,7 +462,13 @@ class VoiceAuthenticityEnv:
|
|
| 435 |
return obs, info
|
| 436 |
|
| 437 |
def _handle_final_classify(self, action: dict) -> tuple:
|
| 438 |
-
"""Submit final classification. Triggers grading. Episode ends.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
from environment.graders import grade
|
| 440 |
|
| 441 |
true_label = int(self.labels[self.current_idx])
|
|
@@ -447,6 +480,15 @@ class VoiceAuthenticityEnv:
|
|
| 447 |
action_history=self.action_history,
|
| 448 |
)
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
self.done = True
|
| 451 |
|
| 452 |
obs = self._make_observation()
|
|
@@ -464,11 +506,21 @@ class VoiceAuthenticityEnv:
|
|
| 464 |
"episode_summary": {
|
| 465 |
"actions_taken": self.action_history,
|
| 466 |
"features_revealed": list(self.revealed_features.keys()),
|
| 467 |
-
"total_steps": self.step_number
|
| 468 |
-
}
|
| 469 |
}
|
| 470 |
|
| 471 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
# ── Step-level reward computation ───────────────────────────────────
|
| 474 |
|
|
@@ -627,9 +679,16 @@ class VoiceAuthenticityEnv:
|
|
| 627 |
if self.difficulty in ("hard", "extreme"):
|
| 628 |
hint += " Warning: this is a challenging task. Gather thorough evidence and calibrate your confidence carefully."
|
| 629 |
if self.task_name == "streaming_detection":
|
| 630 |
-
hint += " Note: this is a streaming scenario
|
| 631 |
if self.task_name == "phonecall_detection":
|
| 632 |
hint += " Note: this is a phone call scenario with heavy codec compression and background noise."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
return hint
|
| 634 |
|
| 635 |
parts = [
|
|
@@ -644,20 +703,41 @@ class VoiceAuthenticityEnv:
|
|
| 644 |
|
| 645 |
remaining = MAX_STEPS - self.step_number
|
| 646 |
if remaining <= 2:
|
| 647 |
-
parts.append(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
|
| 649 |
return " ".join(parts)
|
| 650 |
|
| 651 |
def _get_available_actions(self) -> List[str]:
|
| 652 |
-
"""Return list of actions the agent can still take.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
if self.done:
|
| 654 |
return []
|
| 655 |
|
| 656 |
available = []
|
| 657 |
for at in ActionType:
|
| 658 |
-
# final_classify is
|
|
|
|
| 659 |
if at == ActionType.FINAL_CLASSIFY:
|
| 660 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
continue
|
| 662 |
# Don't allow repeating the exact same action consecutively
|
| 663 |
# (but allow re-requesting after other actions)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Voice Authenticity Detection Environment — multi-step agent loop.
|
| 3 |
|
| 4 |
Actions:
|
| 5 |
request_temporal_features — reveals jitter, shimmer, HNR
|
|
|
|
| 12 |
actively query the environment to build its picture before classifying.
|
| 13 |
|
| 14 |
Step-level rewards provide shaping signals throughout the episode.
|
| 15 |
+
|
| 16 |
+
Realtime detection task:
|
| 17 |
+
The agent can call final_classify at any point after step 2.
|
| 18 |
+
Each additional step beyond step 2 applies a -0.03 time cost to the
|
| 19 |
+
final score. This rewards agents that reach correct confident
|
| 20 |
+
conclusions efficiently, and penalizes both premature classification
|
| 21 |
+
AND unnecessary evidence gathering.
|
| 22 |
"""
|
| 23 |
|
| 24 |
import numpy as np
|
|
|
|
| 34 |
"adversarial_detection",
|
| 35 |
"streaming_detection",
|
| 36 |
"phonecall_detection",
|
| 37 |
+
"realtime_detection",
|
| 38 |
]
|
| 39 |
|
| 40 |
DIFFICULTY_MAP = {
|
|
|
|
| 43 |
"adversarial_detection": "hard",
|
| 44 |
"streaming_detection": "medium_hard",
|
| 45 |
"phonecall_detection": "extreme",
|
| 46 |
+
"realtime_detection": "realtime",
|
| 47 |
}
|
| 48 |
|
| 49 |
DATA_FILES = {
|
|
|
|
| 67 |
"environment/data/features_phonecall.npy",
|
| 68 |
"environment/data/labels_phonecall.npy",
|
| 69 |
),
|
| 70 |
+
"realtime_detection": (
|
| 71 |
+
"environment/data/features.npy",
|
| 72 |
+
"environment/data/labels.npy",
|
| 73 |
+
),
|
| 74 |
}
|
| 75 |
|
| 76 |
MAX_STEPS = 6 # 5 actions + 1 buffer
|
| 77 |
|
| 78 |
+
# Realtime detection: time penalty per extra step beyond step 2
|
| 79 |
+
REALTIME_MIN_STEPS_BEFORE_CLASSIFY = 2
|
| 80 |
+
REALTIME_TIME_PENALTY_PER_STEP = 0.03
|
| 81 |
+
|
| 82 |
# ── Step-level reward constants ─────────────────────────────────────────
|
| 83 |
|
| 84 |
REWARD_FIRST_ACTION_GATHER = 0.05 # first action is a feature request
|
|
|
|
| 198 |
f"Unknown action_type: {action_type}. Valid: {valid_actions}"
|
| 199 |
)
|
| 200 |
|
| 201 |
+
# Realtime detection: block final_classify before minimum steps
|
| 202 |
+
if (self.task_name == "realtime_detection"
|
| 203 |
+
and action_type == ActionType.FINAL_CLASSIFY.value
|
| 204 |
+
and self.step_number < REALTIME_MIN_STEPS_BEFORE_CLASSIFY):
|
| 205 |
+
raise ValueError(
|
| 206 |
+
f"realtime_detection: final_classify requires at least "
|
| 207 |
+
f"{REALTIME_MIN_STEPS_BEFORE_CLASSIFY} steps first. "
|
| 208 |
+
f"Current step: {self.step_number}."
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
# Track action
|
| 212 |
self.action_history.append(action_type)
|
| 213 |
self.step_number += 1
|
|
|
|
| 241 |
self.done = True
|
| 242 |
info["message"] = "Max steps reached. Episode ended."
|
| 243 |
|
| 244 |
+
return obs, round(float(step_reward), 2), self.done, info
|
| 245 |
|
| 246 |
def state(self) -> dict:
|
| 247 |
"""Return full environment state for debugging."""
|
|
|
|
| 462 |
return obs, info
|
| 463 |
|
| 464 |
def _handle_final_classify(self, action: dict) -> tuple:
|
| 465 |
+
"""Submit final classification. Triggers grading. Episode ends.
|
| 466 |
+
|
| 467 |
+
For realtime_detection task:
|
| 468 |
+
The agent can classify at any step after step 2.
|
| 469 |
+
Each extra step beyond step 2 costs -0.03 on the final score.
|
| 470 |
+
This rewards quick, confident, correct decisions.
|
| 471 |
+
"""
|
| 472 |
from environment.graders import grade
|
| 473 |
|
| 474 |
true_label = int(self.labels[self.current_idx])
|
|
|
|
| 480 |
action_history=self.action_history,
|
| 481 |
)
|
| 482 |
|
| 483 |
+
final_score = result["score"]
|
| 484 |
+
|
| 485 |
+
# Apply realtime time penalty: -0.03 per step beyond step 2
|
| 486 |
+
time_penalty = 0.0
|
| 487 |
+
if self.task_name == "realtime_detection":
|
| 488 |
+
extra_steps = max(0, self.step_number - REALTIME_MIN_STEPS_BEFORE_CLASSIFY)
|
| 489 |
+
time_penalty = extra_steps * REALTIME_TIME_PENALTY_PER_STEP
|
| 490 |
+
final_score = max(0.05, min(0.95, final_score - time_penalty))
|
| 491 |
+
|
| 492 |
self.done = True
|
| 493 |
|
| 494 |
obs = self._make_observation()
|
|
|
|
| 506 |
"episode_summary": {
|
| 507 |
"actions_taken": self.action_history,
|
| 508 |
"features_revealed": list(self.revealed_features.keys()),
|
| 509 |
+
"total_steps": self.step_number,
|
| 510 |
+
},
|
| 511 |
}
|
| 512 |
|
| 513 |
+
# Add realtime-specific info
|
| 514 |
+
if self.task_name == "realtime_detection":
|
| 515 |
+
info["realtime_time_penalty"] = round(time_penalty, 4)
|
| 516 |
+
info["realtime_extra_steps"] = max(0, self.step_number - REALTIME_MIN_STEPS_BEFORE_CLASSIFY)
|
| 517 |
+
if time_penalty > 0:
|
| 518 |
+
result["penalties"].append(
|
| 519 |
+
f"Realtime time penalty: -{time_penalty:.2f} "
|
| 520 |
+
f"({info['realtime_extra_steps']} extra steps beyond step 2)"
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
return obs, final_score, info
|
| 524 |
|
| 525 |
# ── Step-level reward computation ───────────────────────────────────
|
| 526 |
|
|
|
|
| 679 |
if self.difficulty in ("hard", "extreme"):
|
| 680 |
hint += " Warning: this is a challenging task. Gather thorough evidence and calibrate your confidence carefully."
|
| 681 |
if self.task_name == "streaming_detection":
|
| 682 |
+
hint += " Note: this is a streaming scenario. Earlier feature requests may contain noise that reduces over time."
|
| 683 |
if self.task_name == "phonecall_detection":
|
| 684 |
hint += " Note: this is a phone call scenario with heavy codec compression and background noise."
|
| 685 |
+
if self.task_name == "realtime_detection":
|
| 686 |
+
hint += (
|
| 687 |
+
" Note: this is a realtime detection scenario. "
|
| 688 |
+
"You can classify at any point after step 2, but every "
|
| 689 |
+
"extra step costs -0.03 on your final score. "
|
| 690 |
+
"Classify as soon as you feel confident enough."
|
| 691 |
+
)
|
| 692 |
return hint
|
| 693 |
|
| 694 |
parts = [
|
|
|
|
| 703 |
|
| 704 |
remaining = MAX_STEPS - self.step_number
|
| 705 |
if remaining <= 2:
|
| 706 |
+
parts.append(f"Warning: Only {remaining} steps remaining. Consider classifying soon.")
|
| 707 |
+
|
| 708 |
+
# Realtime-specific: remind about time cost
|
| 709 |
+
if self.task_name == "realtime_detection" and self.step_number >= REALTIME_MIN_STEPS_BEFORE_CLASSIFY:
|
| 710 |
+
extra = self.step_number - REALTIME_MIN_STEPS_BEFORE_CLASSIFY
|
| 711 |
+
penalty_so_far = extra * REALTIME_TIME_PENALTY_PER_STEP
|
| 712 |
+
parts.append(
|
| 713 |
+
f"Realtime penalty so far: -{penalty_so_far:.2f} "
|
| 714 |
+
f"({extra} steps beyond step 2). You can classify now."
|
| 715 |
+
)
|
| 716 |
|
| 717 |
return " ".join(parts)
|
| 718 |
|
| 719 |
def _get_available_actions(self) -> List[str]:
|
| 720 |
+
"""Return list of actions the agent can still take.
|
| 721 |
+
|
| 722 |
+
For realtime_detection:
|
| 723 |
+
final_classify is only available after step 2 (at least 2
|
| 724 |
+
evidence-gathering actions must be taken first).
|
| 725 |
+
Before step 2, final_classify is NOT in the available list.
|
| 726 |
+
"""
|
| 727 |
if self.done:
|
| 728 |
return []
|
| 729 |
|
| 730 |
available = []
|
| 731 |
for at in ActionType:
|
| 732 |
+
# For realtime_detection, final_classify is only available
|
| 733 |
+
# after the agent has taken at least 2 steps.
|
| 734 |
if at == ActionType.FINAL_CLASSIFY:
|
| 735 |
+
if self.task_name == "realtime_detection":
|
| 736 |
+
if self.step_number >= REALTIME_MIN_STEPS_BEFORE_CLASSIFY:
|
| 737 |
+
available.append(at.value)
|
| 738 |
+
else:
|
| 739 |
+
# For all other tasks: final_classify is always available
|
| 740 |
+
available.append(at.value)
|
| 741 |
continue
|
| 742 |
# Don't allow repeating the exact same action consecutively
|
| 743 |
# (but allow re-requesting after other actions)
|
environment/graders.py
CHANGED
|
@@ -59,6 +59,14 @@ COMPONENT_WEIGHTS = {
|
|
| 59 |
"reasoning_consistency": 0.10,
|
| 60 |
"action_ordering": 0.10,
|
| 61 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
}
|
| 63 |
|
| 64 |
# ── Difficulty-aware score scaling ──────────────────────────────────────
|
|
@@ -72,6 +80,7 @@ DIFFICULTY_SCALING = {
|
|
| 72 |
"hard": 0.59, # adversarial → max ≈ 0.55
|
| 73 |
"medium_hard": 0.55, # streaming → max ≈ 0.51
|
| 74 |
"extreme": 0.41, # phone-call → max ≈ 0.38
|
|
|
|
| 75 |
}
|
| 76 |
|
| 77 |
# ── Keywords for reasoning consistency check ────────────────────────────
|
|
@@ -104,7 +113,7 @@ def _score_confidence_calibration(
|
|
| 104 |
Wrong + high confidence → zero
|
| 105 |
"""
|
| 106 |
if correct:
|
| 107 |
-
if difficulty in ("easy", "medium"):
|
| 108 |
# Reward higher confidence when correct on easier tasks
|
| 109 |
raw = 0.6 + 0.35 * confidence # max 0.95 at confidence=1.0
|
| 110 |
return max(0.05, min(0.95, raw))
|
|
|
|
| 59 |
"reasoning_consistency": 0.10,
|
| 60 |
"action_ordering": 0.10,
|
| 61 |
},
|
| 62 |
+
"realtime": {
|
| 63 |
+
"correctness": 0.35,
|
| 64 |
+
"confidence_calibration": 0.20,
|
| 65 |
+
"trajectory_quality": 0.10,
|
| 66 |
+
"feature_utilization": 0.15,
|
| 67 |
+
"reasoning_consistency": 0.10,
|
| 68 |
+
"action_ordering": 0.10,
|
| 69 |
+
},
|
| 70 |
}
|
| 71 |
|
| 72 |
# ── Difficulty-aware score scaling ──────────────────────────────────────
|
|
|
|
| 80 |
"hard": 0.59, # adversarial → max ≈ 0.55
|
| 81 |
"medium_hard": 0.55, # streaming → max ≈ 0.51
|
| 82 |
"extreme": 0.41, # phone-call → max ≈ 0.38
|
| 83 |
+
"realtime": 0.72, # clean data, time-penalized → max ≈ 0.68 before penalty
|
| 84 |
}
|
| 85 |
|
| 86 |
# ── Keywords for reasoning consistency check ────────────────────────────
|
|
|
|
| 113 |
Wrong + high confidence → zero
|
| 114 |
"""
|
| 115 |
if correct:
|
| 116 |
+
if difficulty in ("easy", "medium", "realtime"):
|
| 117 |
# Reward higher confidence when correct on easier tasks
|
| 118 |
raw = 0.6 + 0.35 * confidence # max 0.95 at confidence=1.0
|
| 119 |
return max(0.05, min(0.95, raw))
|
openenv.yaml
CHANGED
|
@@ -24,6 +24,9 @@ tasks:
|
|
| 24 |
- name: phonecall_detection
|
| 25 |
difficulty: extreme
|
| 26 |
description: "Phone call simulation with heavy codec compression and narrowband degradation"
|
|
|
|
|
|
|
|
|
|
| 27 |
observation_space:
|
| 28 |
type: object
|
| 29 |
properties:
|
|
|
|
| 24 |
- name: phonecall_detection
|
| 25 |
difficulty: extreme
|
| 26 |
description: "Phone call simulation with heavy codec compression and narrowband degradation"
|
| 27 |
+
- name: realtime_detection
|
| 28 |
+
difficulty: realtime
|
| 29 |
+
description: "Classify at any point after step 2 with time penalty for extra steps. Tests knowing when to stop investigating."
|
| 30 |
observation_space:
|
| 31 |
type: object
|
| 32 |
properties:
|
test_deployed_scores.py
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Stress-test the DEPLOYED HF Space to find any score that is exactly 0.0 or 1.0.
|
| 3 |
-
Tests ALL 5 tasks with multiple agent behaviors.
|
| 4 |
-
"""
|
| 5 |
-
import requests
|
| 6 |
-
import json
|
| 7 |
-
|
| 8 |
-
BASE = "https://aksharasharma-voice-authenticity-openenv.hf.space"
|
| 9 |
-
|
| 10 |
-
def reset(task, seed=7):
|
| 11 |
-
r = requests.post(f"{BASE}/reset", json={"task_name": task, "seed": seed}, timeout=30)
|
| 12 |
-
r.raise_for_status()
|
| 13 |
-
return r.json()
|
| 14 |
-
|
| 15 |
-
def step(action, task):
|
| 16 |
-
payload = {
|
| 17 |
-
"action_type": action.get("action_type", "final_classify"),
|
| 18 |
-
"label": action.get("label", 0),
|
| 19 |
-
"confidence": action.get("confidence", 0.5),
|
| 20 |
-
"reasoning": action.get("reasoning", ""),
|
| 21 |
-
"task_name": task,
|
| 22 |
-
}
|
| 23 |
-
r = requests.post(f"{BASE}/step", json=payload, timeout=30)
|
| 24 |
-
r.raise_for_status()
|
| 25 |
-
return r.json()
|
| 26 |
-
|
| 27 |
-
def check_reward(reward, context):
|
| 28 |
-
if reward <= 0.0 or reward >= 1.0:
|
| 29 |
-
print(f" *** VIOLATION: reward={reward} at {context}")
|
| 30 |
-
return False
|
| 31 |
-
return True
|
| 32 |
-
|
| 33 |
-
tasks = [
|
| 34 |
-
"clean_detection",
|
| 35 |
-
"compressed_detection",
|
| 36 |
-
"adversarial_detection",
|
| 37 |
-
"streaming_detection",
|
| 38 |
-
"phonecall_detection",
|
| 39 |
-
]
|
| 40 |
-
|
| 41 |
-
violations = []
|
| 42 |
-
|
| 43 |
-
# ── Test 1: Full 5-step protocol (normal agent) ────────────────────────
|
| 44 |
-
print("=== Test 1: Full 5-step protocol ===")
|
| 45 |
-
for task in tasks:
|
| 46 |
-
print(f"\n Task: {task}")
|
| 47 |
-
resp = reset(task)
|
| 48 |
-
r = resp.get("reward", 0)
|
| 49 |
-
if not check_reward(r, f"reset {task}"):
|
| 50 |
-
violations.append(f"reset {task}: {r}")
|
| 51 |
-
|
| 52 |
-
rewards = []
|
| 53 |
-
for i, act in enumerate([
|
| 54 |
-
{"action_type": "request_temporal_features"},
|
| 55 |
-
{"action_type": "request_spectral_features"},
|
| 56 |
-
{"action_type": "request_comparison"},
|
| 57 |
-
{"action_type": "analyze_evidence"},
|
| 58 |
-
{"action_type": "final_classify", "label": 0, "confidence": 0.7,
|
| 59 |
-
"reasoning": "human speech with natural jitter and shimmer variation"},
|
| 60 |
-
]):
|
| 61 |
-
resp = step(act, task)
|
| 62 |
-
r = resp["reward"]
|
| 63 |
-
rewards.append(r)
|
| 64 |
-
if not check_reward(r, f"step {i+1} ({act['action_type']}) task={task}"):
|
| 65 |
-
violations.append(f"step {i+1} {task}: {r}")
|
| 66 |
-
print(f" rewards: {rewards}")
|
| 67 |
-
|
| 68 |
-
# ── Test 2: Jump straight to classify (worst case) ─────────────────────
|
| 69 |
-
print("\n=== Test 2: Jump to final_classify (no exploration) ===")
|
| 70 |
-
for task in tasks:
|
| 71 |
-
print(f"\n Task: {task}")
|
| 72 |
-
reset(task, seed=42)
|
| 73 |
-
|
| 74 |
-
# Try both labels
|
| 75 |
-
for label in [0, 1]:
|
| 76 |
-
reset(task, seed=42)
|
| 77 |
-
resp = step({
|
| 78 |
-
"action_type": "final_classify",
|
| 79 |
-
"label": label,
|
| 80 |
-
"confidence": 0.99,
|
| 81 |
-
"reasoning": ""
|
| 82 |
-
}, task)
|
| 83 |
-
r = resp["reward"]
|
| 84 |
-
if not check_reward(r, f"jump-classify label={label} task={task}"):
|
| 85 |
-
violations.append(f"jump {task} label={label}: {r}")
|
| 86 |
-
print(f" label={label} reward={r}")
|
| 87 |
-
|
| 88 |
-
# ── Test 3: Edge confidence values ─────────────────────────────────────
|
| 89 |
-
print("\n=== Test 3: Edge confidence values ===")
|
| 90 |
-
for task in tasks:
|
| 91 |
-
for conf in [0.0, 0.001, 0.5, 0.999, 1.0]:
|
| 92 |
-
reset(task, seed=7)
|
| 93 |
-
resp = step({
|
| 94 |
-
"action_type": "final_classify",
|
| 95 |
-
"label": 0,
|
| 96 |
-
"confidence": conf,
|
| 97 |
-
"reasoning": "test"
|
| 98 |
-
}, task)
|
| 99 |
-
r = resp["reward"]
|
| 100 |
-
if not check_reward(r, f"conf={conf} task={task}"):
|
| 101 |
-
violations.append(f"conf {task} conf={conf}: {r}")
|
| 102 |
-
print(f" {task} conf={conf}: reward={r}")
|
| 103 |
-
|
| 104 |
-
# ── Test 4: Various seeds to trigger different samples ─────────────────
|
| 105 |
-
print("\n=== Test 4: Multiple seeds (checking sample variation) ===")
|
| 106 |
-
for task in tasks:
|
| 107 |
-
for seed in [0, 1, 2, 3, 42, 100, 999]:
|
| 108 |
-
reset(task, seed=seed)
|
| 109 |
-
# Minimal exploration + classify
|
| 110 |
-
step({"action_type": "request_temporal_features"}, task)
|
| 111 |
-
resp = step({
|
| 112 |
-
"action_type": "final_classify",
|
| 113 |
-
"label": 1,
|
| 114 |
-
"confidence": 0.6,
|
| 115 |
-
"reasoning": "synthetic fake generated smooth"
|
| 116 |
-
}, task)
|
| 117 |
-
r = resp["reward"]
|
| 118 |
-
if not check_reward(r, f"seed={seed} task={task}"):
|
| 119 |
-
violations.append(f"seed {task} seed={seed}: {r}")
|
| 120 |
-
|
| 121 |
-
print(f"\n\n{'='*60}")
|
| 122 |
-
if violations:
|
| 123 |
-
print(f"FOUND {len(violations)} VIOLATIONS:")
|
| 124 |
-
for v in violations:
|
| 125 |
-
print(f" - {v}")
|
| 126 |
-
else:
|
| 127 |
-
print("ALL SCORES STRICTLY IN (0, 1) - NO VIOLATIONS FOUND")
|
| 128 |
-
print(f"{'='*60}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_env.py
CHANGED
|
@@ -47,9 +47,58 @@ def test_reward_never_zero_or_one():
|
|
| 47 |
assert reward != 0.0
|
| 48 |
assert reward != 1.0
|
| 49 |
|
| 50 |
-
def
|
| 51 |
for task in TASKS:
|
| 52 |
env = VoiceAuthenticityEnv(task)
|
| 53 |
assert env.task_name == task
|
| 54 |
obs = env.reset()
|
| 55 |
assert obs.task_name == task
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
assert reward != 0.0
|
| 48 |
assert reward != 1.0
|
| 49 |
|
| 50 |
+
def test_all_tasks_load():
|
| 51 |
for task in TASKS:
|
| 52 |
env = VoiceAuthenticityEnv(task)
|
| 53 |
assert env.task_name == task
|
| 54 |
obs = env.reset()
|
| 55 |
assert obs.task_name == task
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_realtime_classify_after_step_2():
|
| 59 |
+
"""Realtime task: agent can classify after 2 steps with time penalty."""
|
| 60 |
+
env = VoiceAuthenticityEnv("realtime_detection")
|
| 61 |
+
env.reset(seed=42)
|
| 62 |
+
|
| 63 |
+
# Step 1: gather temporal
|
| 64 |
+
obs, r1, done, info = env.step({"action_type": "request_temporal_features"})
|
| 65 |
+
assert not done
|
| 66 |
+
# final_classify should NOT be available yet (only 1 step taken)
|
| 67 |
+
assert "final_classify" not in obs.available_actions
|
| 68 |
+
|
| 69 |
+
# Step 2: gather spectral
|
| 70 |
+
obs, r2, done, info = env.step({"action_type": "request_spectral_features"})
|
| 71 |
+
assert not done
|
| 72 |
+
# final_classify SHOULD be available now (2 steps taken)
|
| 73 |
+
assert "final_classify" in obs.available_actions
|
| 74 |
+
|
| 75 |
+
# Step 3: classify immediately (1 extra step beyond step 2 = -0.03 penalty)
|
| 76 |
+
obs, r3, done, info = env.step({
|
| 77 |
+
"action_type": "final_classify",
|
| 78 |
+
"label": 0,
|
| 79 |
+
"confidence": 0.75,
|
| 80 |
+
"reasoning": "Natural jitter and shimmer suggest real human speech"
|
| 81 |
+
})
|
| 82 |
+
assert done
|
| 83 |
+
assert 0.05 <= r3 <= 0.95
|
| 84 |
+
# Should have realtime penalty info
|
| 85 |
+
assert "realtime_time_penalty" in info
|
| 86 |
+
assert info["realtime_extra_steps"] == 1 # step 3 is 1 extra beyond step 2
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def test_realtime_no_penalty_at_step_2():
|
| 90 |
+
"""Classifying exactly at step 2 should have 0 extra steps penalty."""
|
| 91 |
+
env = VoiceAuthenticityEnv("realtime_detection")
|
| 92 |
+
env.reset(seed=42)
|
| 93 |
+
|
| 94 |
+
# Step 1: gather temporal
|
| 95 |
+
env.step({"action_type": "request_temporal_features"})
|
| 96 |
+
|
| 97 |
+
# Step 2: gather spectral
|
| 98 |
+
env.step({"action_type": "request_spectral_features"})
|
| 99 |
+
|
| 100 |
+
# The penalty math: step_number=2, extra = 2 - 2 = 0, penalty = 0
|
| 101 |
+
# But we need step 3 for classify, so minimum penalty is 0.03
|
| 102 |
+
# Actually step_number increments on step(), so at classify it becomes 3
|
| 103 |
+
# extra = 3 - 2 = 1, penalty = 0.03
|
| 104 |
+
# This is by design: the minimum cost for classifying is 1 extra step
|
test_grader_exhaustive.py
DELETED
|
@@ -1,121 +0,0 @@
|
|
| 1 |
-
"""Exhaustive local test of ALL grader paths to find 0.0 or 1.0 scores."""
|
| 2 |
-
from environment.graders import grade
|
| 3 |
-
from environment.env import VoiceAuthenticityEnv, TASKS, DIFFICULTY_MAP
|
| 4 |
-
|
| 5 |
-
violations = []
|
| 6 |
-
total = 0
|
| 7 |
-
|
| 8 |
-
difficulties = ["easy", "medium", "medium_hard", "hard", "extreme"]
|
| 9 |
-
|
| 10 |
-
# All possible action histories
|
| 11 |
-
action_histories = [
|
| 12 |
-
["final_classify"],
|
| 13 |
-
["request_temporal_features", "final_classify"],
|
| 14 |
-
["request_spectral_features", "final_classify"],
|
| 15 |
-
["request_comparison", "final_classify"],
|
| 16 |
-
["analyze_evidence", "final_classify"],
|
| 17 |
-
["request_temporal_features", "request_spectral_features", "final_classify"],
|
| 18 |
-
["request_temporal_features", "request_spectral_features", "request_comparison", "final_classify"],
|
| 19 |
-
["request_temporal_features", "request_spectral_features", "request_comparison", "analyze_evidence", "final_classify"],
|
| 20 |
-
["request_temporal_features", "analyze_evidence", "final_classify"],
|
| 21 |
-
["analyze_evidence", "request_temporal_features", "final_classify"],
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
labels = [0, 1]
|
| 25 |
-
true_labels = [0, 1]
|
| 26 |
-
confidences = [0.0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 1.0]
|
| 27 |
-
reasonings = [
|
| 28 |
-
"",
|
| 29 |
-
"test",
|
| 30 |
-
"real human natural jitter",
|
| 31 |
-
"synthetic fake generated smooth",
|
| 32 |
-
"real but also synthetic",
|
| 33 |
-
"no keywords here at all just random text padding to exceed minimum length",
|
| 34 |
-
]
|
| 35 |
-
|
| 36 |
-
for diff in difficulties:
|
| 37 |
-
for tl in true_labels:
|
| 38 |
-
for pl in labels:
|
| 39 |
-
for conf in confidences:
|
| 40 |
-
for reasoning in reasonings:
|
| 41 |
-
for history in action_histories:
|
| 42 |
-
action = {"label": pl, "confidence": conf, "reasoning": reasoning}
|
| 43 |
-
result = grade(tl, action, diff, history)
|
| 44 |
-
score = result["score"]
|
| 45 |
-
total += 1
|
| 46 |
-
if score <= 0.0 or score >= 1.0:
|
| 47 |
-
violations.append({
|
| 48 |
-
"score": score,
|
| 49 |
-
"true_label": tl,
|
| 50 |
-
"pred_label": pl,
|
| 51 |
-
"confidence": conf,
|
| 52 |
-
"difficulty": diff,
|
| 53 |
-
"reasoning": reasoning[:30],
|
| 54 |
-
"history": history,
|
| 55 |
-
})
|
| 56 |
-
|
| 57 |
-
# Also test via the environment step() directly
|
| 58 |
-
print("Testing via environment step()...")
|
| 59 |
-
env_violations = []
|
| 60 |
-
for task in TASKS:
|
| 61 |
-
env = VoiceAuthenticityEnv(task)
|
| 62 |
-
for seed in range(20):
|
| 63 |
-
env.reset(seed=seed)
|
| 64 |
-
|
| 65 |
-
# Test jump-to-classify
|
| 66 |
-
for label in [0, 1]:
|
| 67 |
-
for conf in [0.0, 0.5, 1.0]:
|
| 68 |
-
env.reset(seed=seed)
|
| 69 |
-
obs, reward, done, info = env.step({
|
| 70 |
-
"action_type": "final_classify",
|
| 71 |
-
"label": label,
|
| 72 |
-
"confidence": conf,
|
| 73 |
-
"reasoning": "test reasoning text"
|
| 74 |
-
})
|
| 75 |
-
total += 1
|
| 76 |
-
if reward <= 0.0 or reward >= 1.0:
|
| 77 |
-
env_violations.append(f"task={task} seed={seed} label={label} conf={conf} reward={reward}")
|
| 78 |
-
|
| 79 |
-
# Test full protocol
|
| 80 |
-
env.reset(seed=seed)
|
| 81 |
-
obs, r1, _, _ = env.step({"action_type": "request_temporal_features"})
|
| 82 |
-
total += 1
|
| 83 |
-
if r1 <= 0.0 or r1 >= 1.0:
|
| 84 |
-
env_violations.append(f"temporal task={task} seed={seed} reward={r1}")
|
| 85 |
-
|
| 86 |
-
obs, r2, _, _ = env.step({"action_type": "request_spectral_features"})
|
| 87 |
-
total += 1
|
| 88 |
-
if r2 <= 0.0 or r2 >= 1.0:
|
| 89 |
-
env_violations.append(f"spectral task={task} seed={seed} reward={r2}")
|
| 90 |
-
|
| 91 |
-
obs, r3, _, _ = env.step({"action_type": "request_comparison"})
|
| 92 |
-
total += 1
|
| 93 |
-
if r3 <= 0.0 or r3 >= 1.0:
|
| 94 |
-
env_violations.append(f"comparison task={task} seed={seed} reward={r3}")
|
| 95 |
-
|
| 96 |
-
obs, r4, _, _ = env.step({"action_type": "analyze_evidence"})
|
| 97 |
-
total += 1
|
| 98 |
-
if r4 <= 0.0 or r4 >= 1.0:
|
| 99 |
-
env_violations.append(f"analyze task={task} seed={seed} reward={r4}")
|
| 100 |
-
|
| 101 |
-
obs, r5, done, info = env.step({
|
| 102 |
-
"action_type": "final_classify",
|
| 103 |
-
"label": 0, "confidence": 0.7,
|
| 104 |
-
"reasoning": "natural speech with jitter variation"
|
| 105 |
-
})
|
| 106 |
-
total += 1
|
| 107 |
-
if r5 <= 0.0 or r5 >= 1.0:
|
| 108 |
-
env_violations.append(f"classify task={task} seed={seed} reward={r5}")
|
| 109 |
-
|
| 110 |
-
print(f"\nTested {total} combinations")
|
| 111 |
-
print(f"\nGrader violations: {len(violations)}")
|
| 112 |
-
for v in violations[:20]:
|
| 113 |
-
print(f" {v}")
|
| 114 |
-
print(f"\nEnv step violations: {len(env_violations)}")
|
| 115 |
-
for v in env_violations[:20]:
|
| 116 |
-
print(f" {v}")
|
| 117 |
-
|
| 118 |
-
if not violations and not env_violations:
|
| 119 |
-
print("\nALL SCORES STRICTLY IN (0, 1) - PASS")
|
| 120 |
-
else:
|
| 121 |
-
print("\nFAILED - found violations!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|