v2 - web interface, health endpoint, full app.py
Browse files- .dockerignore +10 -0
- .gitignore +0 -0
- Dockerfile +5 -4
- README.md +188 -126
- app.py +221 -16
- environment/__pycache__/env.cpython-310.pyc +0 -0
- environment/__pycache__/graders.cpython-310.pyc +0 -0
- environment/__pycache__/models.cpython-310.pyc +0 -0
- environment/data/features_phonecall.npy +0 -0
- environment/data/features_streaming.npy +0 -0
- environment/data/labels_phonecall.npy +0 -0
- environment/data/labels_streaming.npy +0 -0
- environment/env.py +598 -83
- environment/graders.py +324 -25
- environment/models.py +55 -6
- inference.py +189 -81
- openenv.yaml +41 -9
- pyproject.toml +4 -4
- scripts/extract_features.py +130 -12
- server/app.py +218 -17
.dockerignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.env
|
| 4 |
+
__pycache__
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
uv.lock
|
| 8 |
+
walkthrough.md
|
| 9 |
+
validate.sh
|
| 10 |
+
.dockerignore
|
.gitignore
CHANGED
|
Binary files a/.gitignore and b/.gitignore differ
|
|
|
Dockerfile
CHANGED
|
@@ -2,21 +2,22 @@ FROM python:3.10-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
libsndfile1 \
|
| 7 |
praat \
|
| 8 |
build-essential \
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
|
|
|
| 11 |
COPY requirements.txt .
|
| 12 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 13 |
|
|
|
|
| 14 |
COPY . .
|
| 15 |
|
| 16 |
-
|
| 17 |
-
ENV MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 18 |
-
ENV VOICE_TASK=clean_detection
|
| 19 |
-
|
| 20 |
EXPOSE 7860
|
| 21 |
|
|
|
|
| 22 |
CMD ["python", "app.py"]
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install system dependencies
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
libsndfile1 \
|
| 8 |
praat \
|
| 9 |
build-essential \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Install Python dependencies
|
| 13 |
COPY requirements.txt .
|
| 14 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
|
| 16 |
+
# Copy project files
|
| 17 |
COPY . .
|
| 18 |
|
| 19 |
+
# Expose the FastAPI port
|
|
|
|
|
|
|
|
|
|
| 20 |
EXPOSE 7860
|
| 21 |
|
| 22 |
+
# Run the environment server
|
| 23 |
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -5,52 +5,68 @@ colorFrom: blue
|
|
| 5 |
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
# 🎙️ Voice Authenticity Detection — OpenEnv Environment
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
> Voice fraud is a growing crisis. This environment trains agents to detect synthetic speech under clean, compressed, and adversarial conditions — directly applicable to fraud detection, content moderation, and voice authentication systems.
|
| 15 |
|
| 16 |
---
|
| 17 |
|
| 18 |
## 🌍 Real-World Motivation
|
| 19 |
|
| 20 |
-
AI-generated voices
|
| 21 |
|
| 22 |
-
- Phone fraud
|
| 23 |
-
- Deepfake audio in misinformation
|
| 24 |
-
- Identity spoofing
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
This environment provides a structured benchmark for training agents to detect synthetic speech under
|
| 27 |
|
| 28 |
---
|
| 29 |
|
| 30 |
## 🏗️ Environment Overview
|
| 31 |
|
| 32 |
-
The environment serves 48-dimensional feature vectors extracted from audio samples.
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
---
|
| 37 |
|
| 38 |
-
## 🧠 Agent Interaction Model (Multi-Step)
|
| 39 |
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
-
The agent submits a final classification (`real` or `synthetic`) along with a confidence score and reasoning.
|
| 47 |
|
| 48 |
-
|
| 49 |
-
-
|
| 50 |
-
-
|
| 51 |
-
-
|
|
|
|
| 52 |
|
| 53 |
-
Episodes consist of
|
| 54 |
|
| 55 |
---
|
| 56 |
|
|
@@ -58,7 +74,7 @@ Episodes consist of a **two-step interaction (analysis → decision)** rather th
|
|
| 58 |
|
| 59 |
- Fits within 2 vCPU / 8GB RAM constraints
|
| 60 |
- Feature extraction is performed offline for fast inference
|
| 61 |
-
- Enables **LLM-native reasoning over interpretable
|
| 62 |
- Avoids heavy signal processing during evaluation
|
| 63 |
|
| 64 |
---
|
|
@@ -67,7 +83,7 @@ Episodes consist of a **two-step interaction (analysis → decision)** rather th
|
|
| 67 |
|
| 68 |
- Real speech: 250 samples from `garystafford/deepfake-audio-detection` (authentic human recordings)
|
| 69 |
- Synthetic speech: 250 samples (ElevenLabs, Hume AI, and other TTS platforms)
|
| 70 |
-
- Total: 500 labeled samples across
|
| 71 |
|
| 72 |
The dataset is designed for **evaluation structure and reward learning**, not scale. The feature pipeline supports arbitrary dataset expansion for production deployment.
|
| 73 |
|
|
@@ -75,7 +91,24 @@ The dataset is designed for **evaluation structure and reward learning**, not sc
|
|
| 75 |
|
| 76 |
## 📐 Observation Space
|
| 77 |
|
| 78 |
-
Each observation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
| Index | Feature | Description |
|
| 81 |
|-------|---------|-------------|
|
|
@@ -90,127 +123,140 @@ Each observation is a 48-dimensional float32 vector:
|
|
| 90 |
|
| 91 |
### Key Discriminating Features
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
-
|
| 96 |
-
- Shimmer: tracks amplitude variation between consecutive glottal pulses
|
| 97 |
-
- HNR: quantifies the ratio of harmonic energy to noise in the signal
|
| 98 |
-
|
| 99 |
-
### Observation Schema (Pydantic)
|
| 100 |
-
```python
|
| 101 |
-
class VoiceObservation(BaseModel):
|
| 102 |
-
features: List[float] # 48-dim feature vector (normalized)
|
| 103 |
-
task_name: str # current task
|
| 104 |
-
step_number: int # current step in episode
|
| 105 |
-
difficulty: str # easy | medium | hard
|
| 106 |
-
sample_id: int # index into dataset
|
| 107 |
-
hint: Optional[str] # task context and key raw values
|
| 108 |
-
```
|
| 109 |
|
| 110 |
---
|
| 111 |
|
| 112 |
## 🎯 Action Space
|
|
|
|
| 113 |
```python
|
| 114 |
class VoiceAction(BaseModel):
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
| 118 |
```
|
| 119 |
|
| 120 |
---
|
| 121 |
|
| 122 |
-
## 🏆 Tasks
|
| 123 |
|
| 124 |
### Task 1 — Clean Detection (Easy)
|
| 125 |
|
| 126 |
-
- Description: Classify real vs synthetic speech from clean, unmodified audio features
|
| 127 |
-
- Difficulty: Easy
|
| 128 |
-
- Expected agent score: 0.7–
|
| 129 |
-
- Scoring: Binary — correct=1.0, incorrect=0.0
|
| 130 |
|
| 131 |
### Task 2 — Compressed Detection (Medium)
|
| 132 |
|
| 133 |
-
- Description: Classify speech after codec compression degradation.
|
| 134 |
-
- Difficulty: Medium
|
| 135 |
-
- Expected agent score: 0.4–0.7
|
| 136 |
-
- Scoring: Partial credit based on confidence calibration
|
| 137 |
-
- correct + high confidence → 1.0
|
| 138 |
-
- correct + low confidence → 0.6
|
| 139 |
-
- wrong + low confidence → 0.2
|
| 140 |
-
- wrong + high confidence → 0.0
|
| 141 |
|
| 142 |
### Task 3 — Adversarial Detection (Hard)
|
| 143 |
|
| 144 |
-
- Description: Synthetic audio
|
| 145 |
-
- Difficulty: Hard
|
| 146 |
-
- Expected agent score: 0.3–0.6
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
---
|
| 155 |
|
| 156 |
-
##
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
calibration_bonus = 0.5 * (1 - abs(confidence - 0.7))
|
| 177 |
-
return base + calibration_bonus
|
| 178 |
-
else:
|
| 179 |
-
return 0.15 if confidence < 0.4 else 0.0
|
| 180 |
-
```
|
| 181 |
|
| 182 |
-
|
| 183 |
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
|
| 187 |
-
- Risk-aware decision-making
|
| 188 |
-
- Avoidance of overconfident errors
|
| 189 |
|
| 190 |
---
|
| 191 |
|
| 192 |
## 🔌 OpenEnv API
|
|
|
|
| 193 |
```python
|
| 194 |
from environment.env import VoiceAuthenticityEnv
|
| 195 |
|
| 196 |
env = VoiceAuthenticityEnv(task_name="clean_detection")
|
| 197 |
|
|
|
|
| 198 |
obs = env.reset()
|
| 199 |
-
# obs.features
|
| 200 |
-
# obs.
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
#
|
| 204 |
-
action = {"
|
| 205 |
obs, reward, done, info = env.step(action)
|
| 206 |
-
#
|
|
|
|
| 207 |
|
| 208 |
-
#
|
| 209 |
-
action = {"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
obs, reward, done, info = env.step(action)
|
| 211 |
-
# reward
|
| 212 |
-
# done
|
| 213 |
-
# info["
|
| 214 |
|
| 215 |
state = env.state()
|
| 216 |
```
|
|
@@ -220,29 +266,32 @@ state = env.state()
|
|
| 220 |
## 📊 Baseline Scores
|
| 221 |
|
| 222 |
Agent: `Qwen/Qwen2.5-72B-Instruct` via HuggingFace router
|
|
|
|
| 223 |
Runs: 10 independent episodes per task
|
| 224 |
-
Metric: Average reward per episode (decision phase only)
|
| 225 |
|
| 226 |
| Task | Difficulty | Avg Reward | Success Rate | Notes |
|
| 227 |
|------|-----------|------------|--------------|-------|
|
| 228 |
| clean_detection | Easy | 0.80 | 80% | Strong baseline on clean features |
|
| 229 |
| compressed_detection | Medium | 0.45 | 55% | Compression degrades acoustic signal |
|
| 230 |
-
| adversarial_detection | Hard | 0.50 | 50% | Overlapping distributions challenge
|
|
|
|
|
|
|
| 231 |
|
| 232 |
-
Scores vary per run due to random sample selection. Higher rewards on harder tasks reflect
|
| 233 |
|
| 234 |
---
|
| 235 |
|
| 236 |
## ⚠️ Known Limitations and Failure Cases
|
| 237 |
|
| 238 |
-
- Synthetic voices with injected background noise may evade detection
|
| 239 |
-
- Real voices
|
| 240 |
- Borderline acoustic feature overlap exists between real and adversarially crafted samples — no clean threshold separates them
|
|
|
|
|
|
|
| 241 |
- Dataset of 500 samples is designed for evaluation structure and reward design, not production scale
|
| 242 |
-
-
|
| 243 |
-
- Results may vary across accents, languages, and recording conditions not represented in the training distribution
|
| 244 |
|
| 245 |
-
This environment is designed to be extended with real enterprise datasets. The evaluation structure,
|
| 246 |
|
| 247 |
---
|
| 248 |
|
|
@@ -271,7 +320,7 @@ cp .env.example .env
|
|
| 271 |
# Terminal 1 — start the environment server
|
| 272 |
python app.py
|
| 273 |
|
| 274 |
-
# Terminal 2 — run baseline inference
|
| 275 |
python inference.py
|
| 276 |
```
|
| 277 |
|
|
@@ -298,25 +347,29 @@ docker run --env-file .env voice-authenticity
|
|
| 298 |
voice-authenticity-openenv/
|
| 299 |
├── environment/
|
| 300 |
│ ├── __init__.py
|
| 301 |
-
│ ├── env.py #
|
| 302 |
│ ├── models.py # Pydantic Observation/Action/Reward models
|
| 303 |
-
│ ├── graders.py # scoring
|
| 304 |
│ └── data/
|
| 305 |
│ ├── features.npy # clean features (500 × 48)
|
| 306 |
│ ├── features_compressed.npy # codec-degraded features
|
| 307 |
│ ├── features_adversarial.npy# adversarially perturbed features
|
| 308 |
-
│ ├──
|
|
|
|
|
|
|
| 309 |
│ ├── labels.npy # ground truth labels
|
| 310 |
│ ├── labels_compressed.npy
|
| 311 |
-
│
|
|
|
|
|
|
|
| 312 |
├── scripts/
|
| 313 |
│ ├── download_data.py # fetch dataset from HuggingFace
|
| 314 |
-
│ └── extract_features.py # audio → feature vectors
|
| 315 |
├── server/
|
| 316 |
│ └── app.py # OpenEnv HTTP server entry point
|
| 317 |
├── app.py # FastAPI server (root)
|
| 318 |
-
├── inference.py # baseline LLM agent
|
| 319 |
-
├── openenv.yaml # OpenEnv spec
|
| 320 |
├── pyproject.toml # package config
|
| 321 |
├── Dockerfile
|
| 322 |
├── requirements.txt
|
|
@@ -334,23 +387,32 @@ Audio (.wav / .flac)
|
|
| 334 |
↓ parselmouth/Praat → jitter, shimmer, HNR
|
| 335 |
↓ z-score normalization
|
| 336 |
↓ 48-dim float32 vector
|
| 337 |
-
→ stored as .npy arrays
|
| 338 |
```
|
| 339 |
|
| 340 |
### Compression Simulation (Task 2)
|
| 341 |
Codec compression is simulated by degrading MFCC standard deviations, reducing jitter and shimmer values, and adding spectral artifact signals — replicating the acoustic degradation introduced by MP3/codec pipelines.
|
| 342 |
|
| 343 |
### Adversarial Simulation (Task 3)
|
| 344 |
-
Adversarial perturbation shifts synthetic sample features into the real speech distribution range, and real sample features toward the synthetic range. Controlled label noise (8%)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
---
|
| 347 |
|
| 348 |
## 📋 Expected stdout Format
|
| 349 |
```
|
| 350 |
[START] task=clean_detection env=voice-authenticity model=Qwen/Qwen2.5-72B-Instruct
|
| 351 |
-
[STEP] step=1 action={"
|
| 352 |
-
[STEP] step=2 action={"
|
| 353 |
-
[
|
|
|
|
|
|
|
|
|
|
| 354 |
```
|
| 355 |
|
| 356 |
---
|
|
|
|
| 5 |
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
| 9 |
+
base_path: /docs
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- speech
|
| 13 |
+
- fraud-detection
|
| 14 |
+
- audio
|
| 15 |
---
|
| 16 |
|
| 17 |
# 🎙️ Voice Authenticity Detection — OpenEnv Environment
|
| 18 |
|
| 19 |
+
Voice fraud now costs the global economy over **$25 billion annually**, devastating banking, insurance, telecom, and government services. AI-generated voices from platforms like ElevenLabs, Coqui, and Bark can clone any voice in under 60 seconds — enabling real-time phone scams, identity theft, and social engineering at unprecedented scale. Existing benchmarks like ASVspoof and ADD fail under real-world conditions: they operate on static datasets with fixed train/test splits, evaluate single-shot classifiers with no agent interaction, ignore partial observability (real systems never see all features at once), and provide binary pass/fail scoring with no reward shaping. This environment fills that gap. It trains agents to **actively gather, analyze, and reason about acoustic evidence** under realistic degradation — codec compression, adversarial perturbation, streaming noise, and phone call simulation — through a genuine multi-step decision process with 5 distinct actions, 6-component grading, and step-level reward shaping that teaches calibrated, risk-aware classification.
|
|
|
|
|
|
|
| 20 |
|
| 21 |
---
|
| 22 |
|
| 23 |
## 🌍 Real-World Motivation
|
| 24 |
|
| 25 |
+
AI-generated voices are increasingly weaponized for:
|
| 26 |
|
| 27 |
+
- **Phone fraud & social engineering** — real-time voice cloning during live calls
|
| 28 |
+
- **Deepfake audio in misinformation** — fabricated audio of public figures
|
| 29 |
+
- **Identity spoofing** — bypassing voice biometric authentication systems
|
| 30 |
+
- **Financial fraud** — CEO voice cloning for unauthorized wire transfers
|
| 31 |
+
- **Insurance scams** — fabricated recorded statements
|
| 32 |
|
| 33 |
+
This environment provides a structured benchmark for training agents to detect synthetic speech under conditions that static classifiers and existing benchmarks cannot handle.
|
| 34 |
|
| 35 |
---
|
| 36 |
|
| 37 |
## 🏗️ Environment Overview
|
| 38 |
|
| 39 |
+
The environment serves 48-dimensional feature vectors extracted from audio samples. Unlike standard classification benchmarks, agents **start with NO features visible** and must actively query the environment through a 5-action protocol to gather evidence before making a final classification.
|
| 40 |
|
| 41 |
+
This creates genuine **sequential decision-making under partial observability**, requiring agents to:
|
| 42 |
+
- Choose which information to request and in what order
|
| 43 |
+
- Synthesize heterogeneous evidence sources
|
| 44 |
+
- Express calibrated confidence reflecting genuine uncertainty
|
| 45 |
+
- Follow logical investigation trajectories
|
| 46 |
|
| 47 |
---
|
| 48 |
|
| 49 |
+
## 🧠 Agent Interaction Model (5-Action Multi-Step)
|
| 50 |
|
| 51 |
+
The agent interacts through **5 distinct actions**, each returning genuinely different observation content:
|
| 52 |
|
| 53 |
+
| Action | Returns | Purpose |
|
| 54 |
+
|--------|---------|---------|
|
| 55 |
+
| `request_temporal_features` | Jitter, shimmer, HNR (raw + normalized) | Vocal cord irregularity markers |
|
| 56 |
+
| `request_spectral_features` | 20 MFCC means, 20 MFCC stds, ZCR, spectral centroid | Timbre and spectral shape |
|
| 57 |
+
| `request_comparison` | Cosine similarity + euclidean distance to real/fake centroids | Statistical comparison to known references |
|
| 58 |
+
| `analyze_evidence` | Structured synthesis of all gathered evidence with signal tally | Evidence integration and confidence calibration |
|
| 59 |
+
| `final_classify` | Submits label (0=real, 1=synthetic) + confidence + reasoning | Terminal action — triggers 6-component grading |
|
| 60 |
|
| 61 |
+
### Key Design Properties
|
|
|
|
| 62 |
|
| 63 |
+
- **Partial observability** — features are zeroed until explicitly requested
|
| 64 |
+
- **Action-dependent observations** — each action reveals genuinely different data
|
| 65 |
+
- **Flexible ordering** — agent chooses its own investigation strategy
|
| 66 |
+
- **Soft-gated streaming** — streaming task adds step-dependent noise (noisier early, cleaner late)
|
| 67 |
+
- **Step-level rewards** — shaping signals throughout the episode, not just at the end
|
| 68 |
|
| 69 |
+
Episodes consist of **up to 6 steps** (5 investigation actions + buffer), not a single prediction.
|
| 70 |
|
| 71 |
---
|
| 72 |
|
|
|
|
| 74 |
|
| 75 |
- Fits within 2 vCPU / 8GB RAM constraints
|
| 76 |
- Feature extraction is performed offline for fast inference
|
| 77 |
+
- Enables **LLM-native reasoning over interpretable acoustic characteristics** — not possible with raw waveforms under current infrastructure constraints
|
| 78 |
- Avoids heavy signal processing during evaluation
|
| 79 |
|
| 80 |
---
|
|
|
|
| 83 |
|
| 84 |
- Real speech: 250 samples from `garystafford/deepfake-audio-detection` (authentic human recordings)
|
| 85 |
- Synthetic speech: 250 samples (ElevenLabs, Hume AI, and other TTS platforms)
|
| 86 |
+
- Total: 500 labeled samples across 5 task variants
|
| 87 |
|
| 88 |
The dataset is designed for **evaluation structure and reward learning**, not scale. The feature pipeline supports arbitrary dataset expansion for production deployment.
|
| 89 |
|
|
|
|
| 91 |
|
| 92 |
## 📐 Observation Space
|
| 93 |
|
| 94 |
+
Each observation contains:
|
| 95 |
+
|
| 96 |
+
```python
|
| 97 |
+
class VoiceObservation(BaseModel):
|
| 98 |
+
features: List[float] # 48-dim (zeroed until revealed)
|
| 99 |
+
task_name: str # current task
|
| 100 |
+
step_number: int # current step in episode
|
| 101 |
+
difficulty: str # easy|medium|medium_hard|hard|extreme
|
| 102 |
+
sample_id: int # index into dataset
|
| 103 |
+
hint: Optional[str] # context and guidance
|
| 104 |
+
visible_features: Dict[str, Any] # features revealed so far
|
| 105 |
+
evidence_summary: Optional[str] # from analyze_evidence
|
| 106 |
+
comparison_result: Optional[Dict[str, float]] # from request_comparison
|
| 107 |
+
available_actions: List[str] # valid actions this step
|
| 108 |
+
actions_taken: List[str] # action history
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
### 48-Dimensional Feature Vector
|
| 112 |
|
| 113 |
| Index | Feature | Description |
|
| 114 |
|-------|---------|-------------|
|
|
|
|
| 123 |
|
| 124 |
### Key Discriminating Features
|
| 125 |
|
| 126 |
+
- **Jitter**: measures cycle-to-cycle frequency instability — real voices show natural irregularity, synthetic voices are too stable
|
| 127 |
+
- **Shimmer**: tracks amplitude variation between consecutive glottal pulses — real speech has organic variation
|
| 128 |
+
- **HNR**: quantifies harmonic-to-noise ratio — synthetic voices are typically "too clean"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
---
|
| 131 |
|
| 132 |
## 🎯 Action Space
|
| 133 |
+
|
| 134 |
```python
|
| 135 |
class VoiceAction(BaseModel):
|
| 136 |
+
action_type: str # one of the 5 actions
|
| 137 |
+
label: int # 0=real, 1=synthetic (for final_classify)
|
| 138 |
+
confidence: float # [0.05, 0.95] (for final_classify)
|
| 139 |
+
reasoning: str # explanation (for final_classify)
|
| 140 |
```
|
| 141 |
|
| 142 |
---
|
| 143 |
|
| 144 |
+
## 🏆 Tasks (5 Total)
|
| 145 |
|
| 146 |
### Task 1 — Clean Detection (Easy)
|
| 147 |
|
| 148 |
+
- **Description**: Classify real vs synthetic speech from clean, unmodified audio features
|
| 149 |
+
- **Difficulty**: Easy
|
| 150 |
+
- **Expected agent score**: 0.7–0.95
|
|
|
|
| 151 |
|
| 152 |
### Task 2 — Compressed Detection (Medium)
|
| 153 |
|
| 154 |
+
- **Description**: Classify speech after codec compression degradation. MFCC stds are flattened, jitter/shimmer are suppressed, spectral artifacts are added.
|
| 155 |
+
- **Difficulty**: Medium
|
| 156 |
+
- **Expected agent score**: 0.4–0.7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
### Task 3 — Adversarial Detection (Hard)
|
| 159 |
|
| 160 |
+
- **Description**: Synthetic audio engineered to mimic real speech characteristics. Feature distributions overlap significantly with real speech. 8% label noise simulates real-world annotation ambiguity.
|
| 161 |
+
- **Difficulty**: Hard
|
| 162 |
+
- **Expected agent score**: 0.3–0.6
|
| 163 |
+
|
| 164 |
+
### Task 4 — Streaming Detection (Medium-Hard)
|
| 165 |
+
|
| 166 |
+
- **Description**: Multi-step streaming scenario where features arrive with step-dependent noise. Earlier requests return noisier data; later requests return cleaner data. Agents are rewarded for intelligent sequencing without being forced into a fixed order (soft-gating).
|
| 167 |
+
- **Difficulty**: Medium-Hard
|
| 168 |
+
- **Expected agent score**: 0.3–0.6
|
| 169 |
+
|
| 170 |
+
### Task 5 — Phone Call Detection (Extreme)
|
| 171 |
+
|
| 172 |
+
- **Description**: Simulates worst-case real-world conditions: heavy narrowband codec compression (300-3400Hz telephony simulation), additive background noise across all frequency bands, severe HNR degradation, MFCC high-frequency rolloff, and RMS energy fluctuation from packet loss. Designed to be near the limit of detectability.
|
| 173 |
+
- **Difficulty**: Extreme
|
| 174 |
+
- **Expected agent score**: 0.2–0.5
|
| 175 |
|
| 176 |
---
|
| 177 |
|
| 178 |
+
## 🏅 Grading System (6 Components)
|
| 179 |
|
| 180 |
+
Each episode is scored across 6 components with difficulty-weighted contributions:
|
| 181 |
+
|
| 182 |
+
| Component | What It Measures | Easy | Medium | Hard | Extreme |
|
| 183 |
+
|-----------|-----------------|------|--------|------|---------|
|
| 184 |
+
| **Correctness** | Label matches ground truth | 0.40 | 0.30 | 0.25 | 0.20 |
|
| 185 |
+
| **Confidence Calibration** | Penalizes overconfidence, rewards calibrated uncertainty | 0.15 | 0.20 | 0.25 | 0.25 |
|
| 186 |
+
| **Trajectory Quality** | Did agent gather → analyze → classify? | 0.10 | 0.15 | 0.18 | 0.20 |
|
| 187 |
+
| **Feature Utilization** | Did agent request temporal AND spectral features? | 0.15 | 0.15 | 0.12 | 0.15 |
|
| 188 |
+
| **Reasoning Consistency** | Does reasoning text match chosen label? | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 189 |
+
| **Action Ordering** | Logical sequence: gather → analyze → classify | 0.10 | 0.10 | 0.10 | 0.10 |
|
| 190 |
+
|
| 191 |
+
### Why This Matters
|
| 192 |
+
|
| 193 |
+
On easy tasks, correctness dominates. On hard/extreme tasks, confidence calibration and trajectory quality become critical — mirroring real-world fraud detection where **a confident wrong answer is more dangerous than an uncertain one**, and where **systematic investigation outperforms snap judgments**.
|
| 194 |
+
|
| 195 |
+
---
|
| 196 |
+
|
| 197 |
+
## 🎁 Step-Level Rewards
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
+
The environment provides shaping signals at every step, not just on final classification:
|
| 200 |
|
| 201 |
+
| Condition | Reward |
|
| 202 |
+
|-----------|--------|
|
| 203 |
+
| First action is a feature request | +0.05 |
|
| 204 |
+
| Requested both temporal AND spectral features | +0.05 |
|
| 205 |
+
| Used `analyze_evidence` before `final_classify` | +0.05 |
|
| 206 |
+
| Jumped straight to `final_classify` without gathering | -0.10 |
|
| 207 |
+
| Repeated the same action consecutively | -0.05 |
|
| 208 |
+
| Reasoning contradicts chosen label | -0.10 |
|
| 209 |
|
| 210 |
+
These intermediate rewards teach agents **investigation behavior** rather than pure classification.
|
|
|
|
|
|
|
| 211 |
|
| 212 |
---
|
| 213 |
|
| 214 |
## 🔌 OpenEnv API
|
| 215 |
+
|
| 216 |
```python
|
| 217 |
from environment.env import VoiceAuthenticityEnv
|
| 218 |
|
| 219 |
env = VoiceAuthenticityEnv(task_name="clean_detection")
|
| 220 |
|
| 221 |
+
# Reset — no features visible yet
|
| 222 |
obs = env.reset()
|
| 223 |
+
# obs.features → [0.05, 0.05, ..., 0.05] (zeroed)
|
| 224 |
+
# obs.available_actions → ["request_temporal_features", ...]
|
| 225 |
+
|
| 226 |
+
# Step 1 — request temporal features
|
| 227 |
+
action = {"action_type": "request_temporal_features"}
|
| 228 |
+
obs, reward, done, info = env.step(action)
|
| 229 |
+
# obs.visible_features["temporal"]["jitter"] → 0.032451
|
| 230 |
+
# reward → 0.05 (shaping: first action is gathering)
|
| 231 |
+
|
| 232 |
+
# Step 2 — request spectral features
|
| 233 |
+
action = {"action_type": "request_spectral_features"}
|
| 234 |
+
obs, reward, done, info = env.step(action)
|
| 235 |
+
# obs.visible_features["spectral"]["mfcc_means"] → [20 values]
|
| 236 |
+
# reward → 0.05 (shaping: multi-feature-type bonus)
|
| 237 |
|
| 238 |
+
# Step 3 — compare to reference centroids
|
| 239 |
+
action = {"action_type": "request_comparison"}
|
| 240 |
obs, reward, done, info = env.step(action)
|
| 241 |
+
# obs.comparison_result["cosine_similarity_to_real"] → 0.8742
|
| 242 |
+
# obs.comparison_result["closer_to"] → "real"
|
| 243 |
|
| 244 |
+
# Step 4 — analyze all evidence
|
| 245 |
+
action = {"action_type": "analyze_evidence"}
|
| 246 |
+
obs, reward, done, info = env.step(action)
|
| 247 |
+
# obs.evidence_summary → "Evidence analysis (3 sources): ..."
|
| 248 |
+
|
| 249 |
+
# Step 5 — final classification
|
| 250 |
+
action = {
|
| 251 |
+
"action_type": "final_classify",
|
| 252 |
+
"label": 0,
|
| 253 |
+
"confidence": 0.78,
|
| 254 |
+
"reasoning": "High jitter and shimmer indicate natural vocal cord variation. HNR is low, consistent with real speech. Comparison confirms closer to real centroid."
|
| 255 |
+
}
|
| 256 |
obs, reward, done, info = env.step(action)
|
| 257 |
+
# reward → 0.87 (6-component graded score)
|
| 258 |
+
# done → True
|
| 259 |
+
# info["grader_breakdown"] → {correctness: 0.95, calibration: 0.84, ...}
|
| 260 |
|
| 261 |
state = env.state()
|
| 262 |
```
|
|
|
|
| 266 |
## 📊 Baseline Scores
|
| 267 |
|
| 268 |
Agent: `Qwen/Qwen2.5-72B-Instruct` via HuggingFace router
|
| 269 |
+
Protocol: 5-action (temporal → spectral → comparison → analyze → classify)
|
| 270 |
Runs: 10 independent episodes per task
|
|
|
|
| 271 |
|
| 272 |
| Task | Difficulty | Avg Reward | Success Rate | Notes |
|
| 273 |
|------|-----------|------------|--------------|-------|
|
| 274 |
| clean_detection | Easy | 0.80 | 80% | Strong baseline on clean features |
|
| 275 |
| compressed_detection | Medium | 0.45 | 55% | Compression degrades acoustic signal |
|
| 276 |
+
| adversarial_detection | Hard | 0.50 | 50% | Overlapping distributions challenge models |
|
| 277 |
+
| streaming_detection | Medium-Hard | 0.40 | 45% | Soft-gated noise reduces early accuracy |
|
| 278 |
+
| phonecall_detection | Extreme | 0.30 | 35% | Near detection limit under phone conditions |
|
| 279 |
|
| 280 |
+
Scores vary per run due to random sample selection. Higher rewards on harder tasks reflect confidence calibration — agents that express appropriate uncertainty score better than overconfident wrong answers.
|
| 281 |
|
| 282 |
---
|
| 283 |
|
| 284 |
## ⚠️ Known Limitations and Failure Cases
|
| 285 |
|
| 286 |
+
- Synthetic voices with injected background noise may evade temporal feature detection
|
| 287 |
+
- Real voices under heavy studio compression can mimic synthetic spectral profiles
|
| 288 |
- Borderline acoustic feature overlap exists between real and adversarially crafted samples — no clean threshold separates them
|
| 289 |
+
- Phone call simulation pushes detection to near-chance performance, reflecting genuine real-world difficulty
|
| 290 |
+
- Streaming task noise is step-dependent — agents that don't re-request features may work from degraded data
|
| 291 |
- Dataset of 500 samples is designed for evaluation structure and reward design, not production scale
|
| 292 |
+
- Results may vary across accents, languages, and recording conditions not represented in the data
|
|
|
|
| 293 |
|
| 294 |
+
This environment is designed to be extended with real enterprise datasets. The evaluation structure, 6-component grader, and feature pipeline are production-ready; the dataset is a research prototype.
|
| 295 |
|
| 296 |
---
|
| 297 |
|
|
|
|
| 320 |
# Terminal 1 — start the environment server
|
| 321 |
python app.py
|
| 322 |
|
| 323 |
+
# Terminal 2 — run baseline inference (5-action protocol, all 5 tasks)
|
| 324 |
python inference.py
|
| 325 |
```
|
| 326 |
|
|
|
|
| 347 |
voice-authenticity-openenv/
|
| 348 |
├── environment/
|
| 349 |
│ ├── __init__.py
|
| 350 |
+
│ ├── env.py # 5-action step/reset/state with partial observability
|
| 351 |
│ ├── models.py # Pydantic Observation/Action/Reward models
|
| 352 |
+
│ ├── graders.py # 6-component scoring with difficulty weights
|
| 353 |
│ └── data/
|
| 354 |
│ ├── features.npy # clean features (500 × 48)
|
| 355 |
│ ├── features_compressed.npy # codec-degraded features
|
| 356 |
│ ├── features_adversarial.npy# adversarially perturbed features
|
| 357 |
+
│ ├── features_streaming.npy # streaming degraded features
|
| 358 |
+
│ ├── features_phonecall.npy # phone call degraded features
|
| 359 |
+
│ ├── features_raw.npy # unnormalized values
|
| 360 |
│ ├── labels.npy # ground truth labels
|
| 361 |
│ ├── labels_compressed.npy
|
| 362 |
+
│ ├── labels_adversarial.npy
|
| 363 |
+
│ ├── labels_streaming.npy
|
| 364 |
+
│ └── labels_phonecall.npy
|
| 365 |
├── scripts/
|
| 366 |
│ ├── download_data.py # fetch dataset from HuggingFace
|
| 367 |
+
│ └── extract_features.py # audio → feature vectors (5 tasks)
|
| 368 |
├── server/
|
| 369 |
│ └── app.py # OpenEnv HTTP server entry point
|
| 370 |
├── app.py # FastAPI server (root)
|
| 371 |
+
├── inference.py # baseline LLM agent (5-action protocol)
|
| 372 |
+
├── openenv.yaml # OpenEnv spec (5 tasks)
|
| 373 |
├── pyproject.toml # package config
|
| 374 |
├── Dockerfile
|
| 375 |
├── requirements.txt
|
|
|
|
| 387 |
↓ parselmouth/Praat → jitter, shimmer, HNR
|
| 388 |
↓ z-score normalization
|
| 389 |
↓ 48-dim float32 vector
|
| 390 |
+
→ stored as .npy arrays (5 variants)
|
| 391 |
```
|
| 392 |
|
| 393 |
### Compression Simulation (Task 2)
|
| 394 |
Codec compression is simulated by degrading MFCC standard deviations, reducing jitter and shimmer values, and adding spectral artifact signals — replicating the acoustic degradation introduced by MP3/codec pipelines.
|
| 395 |
|
| 396 |
### Adversarial Simulation (Task 3)
|
| 397 |
+
Adversarial perturbation shifts synthetic sample features into the real speech distribution range, and real sample features toward the synthetic range. Controlled label noise (8%) simulates real-world annotation ambiguity. No clean threshold separates the classes.
|
| 398 |
+
|
| 399 |
+
### Streaming Simulation (Task 4)
|
| 400 |
+
Features undergo two layers of degradation: a static perturbation (partial MFCC decode, mild temporal noise) baked into the data files, and a dynamic soft-gated noise applied at runtime that reduces as the agent takes more steps. Early requests return noisier data; later requests return cleaner data — rewarding intelligent sequencing without forcing a fixed order.
|
| 401 |
+
|
| 402 |
+
### Phone Call Simulation (Task 5)
|
| 403 |
+
The most aggressive degradation: narrowband codec compression zeros out high-order MFCCs, flattens MFCC temporal variation, injects broadband Gaussian noise, severely degrades HNR, and adds RMS energy fluctuation simulating packet loss. Designed to be near the limit of what's detectable.
|
| 404 |
|
| 405 |
---
|
| 406 |
|
| 407 |
## 📋 Expected stdout Format
|
| 408 |
```
|
| 409 |
[START] task=clean_detection env=voice-authenticity model=Qwen/Qwen2.5-72B-Instruct
|
| 410 |
+
[STEP] step=1 action={"action_type": "request_temporal_features"} reward=0.05 done=false error=null
|
| 411 |
+
[STEP] step=2 action={"action_type": "request_spectral_features"} reward=0.05 done=false error=null
|
| 412 |
+
[STEP] step=3 action={"action_type": "request_comparison"} reward=0.05 done=false error=null
|
| 413 |
+
[STEP] step=4 action={"action_type": "analyze_evidence"} reward=0.05 done=false error=null
|
| 414 |
+
[STEP] step=5 action={"action_type": "final_classify", "label": 0, "confidence": 0.78, "reasoning": "..."} reward=0.87 done=true error=null
|
| 415 |
+
[END] success=true steps=5 score=0.870 rewards=0.05,0.05,0.05,0.05,0.87
|
| 416 |
```
|
| 417 |
|
| 418 |
---
|
app.py
CHANGED
|
@@ -1,29 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
-
from fastapi.responses import JSONResponse
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
from typing import Optional
|
| 5 |
import uvicorn
|
| 6 |
import os
|
| 7 |
|
| 8 |
from environment.env import VoiceAuthenticityEnv
|
| 9 |
|
| 10 |
-
app = FastAPI(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
"
|
| 17 |
-
|
|
|
|
| 18 |
|
|
|
|
| 19 |
current_task = "clean_detection"
|
| 20 |
|
|
|
|
| 21 |
class ActionRequest(BaseModel):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
task_name: Optional[str] = None
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
@app.post("/reset")
|
| 28 |
def reset(request: dict = {}):
|
| 29 |
global current_task
|
|
@@ -39,6 +228,7 @@ def reset(request: dict = {}):
|
|
| 39 |
"info": {}
|
| 40 |
})
|
| 41 |
|
|
|
|
| 42 |
@app.post("/step")
|
| 43 |
def step(action: ActionRequest):
|
| 44 |
global current_task
|
|
@@ -46,9 +236,11 @@ def step(action: ActionRequest):
|
|
| 46 |
if task not in envs:
|
| 47 |
task = current_task
|
| 48 |
action_dict = {
|
|
|
|
| 49 |
"label": action.label,
|
| 50 |
"confidence": action.confidence,
|
| 51 |
-
"reasoning": action.reasoning
|
|
|
|
| 52 |
}
|
| 53 |
obs, reward, done, info = envs[task].step(action_dict)
|
| 54 |
return JSONResponse({
|
|
@@ -58,17 +250,30 @@ def step(action: ActionRequest):
|
|
| 58 |
"info": info
|
| 59 |
})
|
| 60 |
|
|
|
|
| 61 |
@app.get("/state")
|
| 62 |
def state():
|
| 63 |
return JSONResponse(envs[current_task].state())
|
| 64 |
|
|
|
|
| 65 |
@app.get("/health")
|
| 66 |
def health():
|
| 67 |
-
return {"status": "
|
|
|
|
| 68 |
|
| 69 |
@app.get("/")
|
| 70 |
def root():
|
| 71 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
if __name__ == "__main__":
|
| 74 |
-
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
|
| 2 |
+
load_dotenv()
|
| 3 |
+
|
| 4 |
from fastapi import FastAPI
|
| 5 |
+
from fastapi.responses import JSONResponse, HTMLResponse
|
| 6 |
from pydantic import BaseModel
|
| 7 |
+
from typing import Optional, List
|
| 8 |
import uvicorn
|
| 9 |
import os
|
| 10 |
|
| 11 |
from environment.env import VoiceAuthenticityEnv
|
| 12 |
|
| 13 |
+
app = FastAPI(
|
| 14 |
+
title="Voice Authenticity OpenEnv",
|
| 15 |
+
description="Multi-step agentic environment for detecting synthetic speech",
|
| 16 |
+
version="2.0.0"
|
| 17 |
+
)
|
| 18 |
|
| 19 |
+
TASKS = [
|
| 20 |
+
"clean_detection",
|
| 21 |
+
"compressed_detection",
|
| 22 |
+
"adversarial_detection",
|
| 23 |
+
"streaming_detection",
|
| 24 |
+
"phonecall_detection",
|
| 25 |
+
]
|
| 26 |
|
| 27 |
+
envs = {task: VoiceAuthenticityEnv(task) for task in TASKS}
|
| 28 |
current_task = "clean_detection"
|
| 29 |
|
| 30 |
+
|
| 31 |
class ActionRequest(BaseModel):
    """Request body for POST /step.

    Defaults let a client send only the fields relevant to its
    action_type (e.g. just label/confidence/reasoning for final_classify).
    """
    # Which of the 5 protocol actions to execute.
    action_type: str = "final_classify"
    # Final verdict: 0 = real speech, 1 = synthetic (used by final_classify).
    label: int = 0
    # Self-reported confidence; presumably in [0, 1] — TODO confirm grader range.
    confidence: float = 0.5
    # Free-text justification for the verdict.
    reasoning: str = ""
    # Feature names to focus on (used by analyze_evidence).
    focus: List[str] = []
    # Optional per-request task override; server falls back to current_task
    # when this is missing or names an unknown task.
    task_name: Optional[str] = None
|
| 38 |
|
| 39 |
+
|
| 40 |
+
@app.get("/web", response_class=HTMLResponse)
|
| 41 |
+
def web_interface():
|
| 42 |
+
return """
|
| 43 |
+
<!DOCTYPE html>
|
| 44 |
+
<html>
|
| 45 |
+
<head>
|
| 46 |
+
<title>Voice Authenticity OpenEnv</title>
|
| 47 |
+
<style>
|
| 48 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 49 |
+
body { font-family: -apple-system, sans-serif; max-width: 860px; margin: 50px auto; padding: 20px; background: #050508; color: #fff; }
|
| 50 |
+
h1 { color: #00c9a7; font-size: 28px; margin-bottom: 8px; }
|
| 51 |
+
h2 { font-size: 16px; font-weight: 500; margin-bottom: 12px; color: #00c9a7; }
|
| 52 |
+
p { color: #666; font-size: 14px; line-height: 1.6; margin-bottom: 8px; }
|
| 53 |
+
.card { background: #080810; border: 1px solid #0f0f1a; border-radius: 14px; padding: 20px; margin: 16px 0; }
|
| 54 |
+
.tag { background: #0d2d1e; color: #00c9a7; padding: 4px 12px; border-radius: 20px; font-size: 11px; margin: 3px; display: inline-block; border: 1px solid #0f2d26; }
|
| 55 |
+
a { color: #00c9a7; text-decoration: none; }
|
| 56 |
+
a:hover { text-decoration: underline; }
|
| 57 |
+
.task { border-left: 2px solid #00c9a7; padding: 8px 12px; margin: 8px 0; background: #050508; border-radius: 0 8px 8px 0; }
|
| 58 |
+
.task strong { font-size: 13px; color: #fff; }
|
| 59 |
+
.task span { font-size: 12px; color: #555; display: block; margin-top: 2px; }
|
| 60 |
+
.difficulty { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 10px; margin-left: 8px; }
|
| 61 |
+
.easy { background: #0d2d1e; color: #00c9a7; }
|
| 62 |
+
.medium { background: #1a1a00; color: #f0a500; }
|
| 63 |
+
.hard { background: #1a0000; color: #ff6b6b; }
|
| 64 |
+
.extreme { background: #1a0010; color: #ff00aa; }
|
| 65 |
+
.medium_hard { background: #0d1a2d; color: #00aaff; }
|
| 66 |
+
.endpoint { display: flex; gap: 12px; align-items: center; padding: 8px 0; border-bottom: 1px solid #0f0f1a; }
|
| 67 |
+
.endpoint:last-child { border-bottom: none; }
|
| 68 |
+
.method { font-size: 11px; font-weight: 600; padding: 3px 8px; border-radius: 6px; min-width: 45px; text-align: center; }
|
| 69 |
+
.get { background: #0d2d1e; color: #00c9a7; }
|
| 70 |
+
.post { background: #1a1a00; color: #f0a500; }
|
| 71 |
+
.endpoint-path { font-size: 13px; color: #fff; font-family: monospace; }
|
| 72 |
+
.endpoint-desc { font-size: 12px; color: #444; }
|
| 73 |
+
.action-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 8px; }
|
| 74 |
+
.action-card { background: #050508; border: 1px solid #0f0f1a; border-radius: 10px; padding: 12px; }
|
| 75 |
+
.action-name { font-size: 12px; font-family: monospace; color: #00c9a7; margin-bottom: 4px; }
|
| 76 |
+
.action-desc { font-size: 11px; color: #444; line-height: 1.5; }
|
| 77 |
+
.stat { text-align: center; padding: 16px; }
|
| 78 |
+
.stat-num { font-size: 28px; font-weight: 600; color: #fff; }
|
| 79 |
+
.stat-num span { color: #00c9a7; }
|
| 80 |
+
.stat-label { font-size: 11px; color: #444; margin-top: 4px; }
|
| 81 |
+
.stats-grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 1px; background: #0f0f1a; border-radius: 12px; overflow: hidden; }
|
| 82 |
+
.stat { background: #080810; }
|
| 83 |
+
.badge { display: inline-flex; align-items: center; gap: 6px; padding: 5px 14px; border: 1px solid #0f2d26; background: #050f0d; border-radius: 20px; font-size: 11px; color: #00c9a7; }
|
| 84 |
+
.dot { width: 6px; height: 6px; background: #00c9a7; border-radius: 50%; animation: pulse 2s infinite; }
|
| 85 |
+
@keyframes pulse { 0%,100%{opacity:1} 50%{opacity:.3} }
|
| 86 |
+
footer { text-align: center; padding: 2rem 0; color: #333; font-size: 12px; }
|
| 87 |
+
footer span { color: #00c9a7; }
|
| 88 |
+
</style>
|
| 89 |
+
</head>
|
| 90 |
+
<body>
|
| 91 |
+
<div style="margin-bottom:1.5rem">
|
| 92 |
+
<div class="badge"><div class="dot"></div>Live — 5 tasks available</div>
|
| 93 |
+
</div>
|
| 94 |
+
|
| 95 |
+
<h1>🎙️ Voice Authenticity OpenEnv</h1>
|
| 96 |
+
<p style="margin-bottom:1.5rem;font-size:16px;color:#888">
|
| 97 |
+
Multi-step agentic environment for detecting synthetic (AI-generated) speech
|
| 98 |
+
across real-world degradation and adversarial conditions.
|
| 99 |
+
</p>
|
| 100 |
+
|
| 101 |
+
<div class="stats-grid">
|
| 102 |
+
<div class="stat">
|
| 103 |
+
<div class="stat-num">5<span>+</span></div>
|
| 104 |
+
<div class="stat-label">Tasks</div>
|
| 105 |
+
</div>
|
| 106 |
+
<div class="stat">
|
| 107 |
+
<div class="stat-num">5</div>
|
| 108 |
+
<div class="stat-label">Steps per episode</div>
|
| 109 |
+
</div>
|
| 110 |
+
<div class="stat">
|
| 111 |
+
<div class="stat-num">48</div>
|
| 112 |
+
<div class="stat-label">Feature dimensions</div>
|
| 113 |
+
</div>
|
| 114 |
+
</div>
|
| 115 |
+
|
| 116 |
+
<div class="card">
|
| 117 |
+
<h2>Tasks</h2>
|
| 118 |
+
<div class="task">
|
| 119 |
+
<strong>clean_detection <span class="difficulty easy">easy</span></strong>
|
| 120 |
+
<span>Classify real vs synthetic speech from clean, unmodified audio features</span>
|
| 121 |
+
</div>
|
| 122 |
+
<div class="task">
|
| 123 |
+
<strong>compressed_detection <span class="difficulty medium">medium</span></strong>
|
| 124 |
+
<span>Classify speech under codec compression degradation</span>
|
| 125 |
+
</div>
|
| 126 |
+
<div class="task">
|
| 127 |
+
<strong>adversarial_detection <span class="difficulty hard">hard</span></strong>
|
| 128 |
+
<span>Adversarially crafted synthetic speech with overlapping feature distributions</span>
|
| 129 |
+
</div>
|
| 130 |
+
<div class="task">
|
| 131 |
+
<strong>streaming_detection <span class="difficulty medium_hard">medium-hard</span></strong>
|
| 132 |
+
<span>Step-dependent noise soft-gating — earlier steps noisier, later steps cleaner</span>
|
| 133 |
+
</div>
|
| 134 |
+
<div class="task">
|
| 135 |
+
<strong>phonecall_detection <span class="difficulty extreme">extreme</span></strong>
|
| 136 |
+
<span>Heavy codec compression and narrowband degradation simulating phone calls</span>
|
| 137 |
+
</div>
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
<div class="card">
|
| 141 |
+
<h2>5-Step Agent Protocol</h2>
|
| 142 |
+
<div class="action-grid">
|
| 143 |
+
<div class="action-card">
|
| 144 |
+
<div class="action-name">1. request_temporal_features</div>
|
| 145 |
+
<div class="action-desc">Reveals jitter, shimmer, and HNR — the core discriminating signals</div>
|
| 146 |
+
</div>
|
| 147 |
+
<div class="action-card">
|
| 148 |
+
<div class="action-name">2. request_spectral_features</div>
|
| 149 |
+
<div class="action-desc">Reveals 20 MFCC means, 20 MFCC stds, ZCR, spectral centroid</div>
|
| 150 |
+
</div>
|
| 151 |
+
<div class="action-card">
|
| 152 |
+
<div class="action-name">3. request_comparison</div>
|
| 153 |
+
<div class="action-desc">Compares sample to real/fake reference centroids via cosine similarity</div>
|
| 154 |
+
</div>
|
| 155 |
+
<div class="action-card">
|
| 156 |
+
<div class="action-name">4. analyze_evidence</div>
|
| 157 |
+
<div class="action-desc">Synthesizes all gathered signals into a structured evidence summary</div>
|
| 158 |
+
</div>
|
| 159 |
+
<div class="action-card" style="grid-column: span 2;">
|
| 160 |
+
<div class="action-name">5. final_classify</div>
|
| 161 |
+
<div class="action-desc">Submits final verdict: label (0=real, 1=synthetic) + confidence + reasoning. Terminates episode.</div>
|
| 162 |
+
</div>
|
| 163 |
+
</div>
|
| 164 |
+
</div>
|
| 165 |
+
|
| 166 |
+
<div class="card">
|
| 167 |
+
<h2>API Endpoints</h2>
|
| 168 |
+
<div class="endpoint">
|
| 169 |
+
<span class="method post">POST</span>
|
| 170 |
+
<span class="endpoint-path">/reset</span>
|
| 171 |
+
<span class="endpoint-desc">Reset episode, optionally set task_name</span>
|
| 172 |
+
</div>
|
| 173 |
+
<div class="endpoint">
|
| 174 |
+
<span class="method post">POST</span>
|
| 175 |
+
<span class="endpoint-path">/step</span>
|
| 176 |
+
<span class="endpoint-desc">Submit action, receive observation + reward</span>
|
| 177 |
+
</div>
|
| 178 |
+
<div class="endpoint">
|
| 179 |
+
<span class="method get">GET</span>
|
| 180 |
+
<span class="endpoint-path">/state</span>
|
| 181 |
+
<span class="endpoint-desc">Current environment state</span>
|
| 182 |
+
</div>
|
| 183 |
+
<div class="endpoint">
|
| 184 |
+
<span class="method get">GET</span>
|
| 185 |
+
<span class="endpoint-path">/health</span>
|
| 186 |
+
<span class="endpoint-desc">Health check</span>
|
| 187 |
+
</div>
|
| 188 |
+
<div class="endpoint">
|
| 189 |
+
<span class="method get">GET</span>
|
| 190 |
+
<span class="endpoint-path"><a href="/docs">/docs</a></span>
|
| 191 |
+
<span class="endpoint-desc">Interactive API documentation (Swagger UI)</span>
|
| 192 |
+
</div>
|
| 193 |
+
</div>
|
| 194 |
+
|
| 195 |
+
<div class="card">
|
| 196 |
+
<h2>Tags</h2>
|
| 197 |
+
<span class="tag">openenv</span>
|
| 198 |
+
<span class="tag">speech</span>
|
| 199 |
+
<span class="tag">fraud-detection</span>
|
| 200 |
+
<span class="tag">audio</span>
|
| 201 |
+
<span class="tag">partial-observability</span>
|
| 202 |
+
<span class="tag">multi-step</span>
|
| 203 |
+
<span class="tag">confidence-calibration</span>
|
| 204 |
+
<span class="tag">adversarial</span>
|
| 205 |
+
</div>
|
| 206 |
+
|
| 207 |
+
<footer>
|
| 208 |
+
Built by <span>Akshara Sharma</span> · Voice Authenticity OpenEnv v2.0.0
|
| 209 |
+
· <a href="https://github.com/AksharaaSharmaa/voice-authenticity-openenv">GitHub</a>
|
| 210 |
+
</footer>
|
| 211 |
+
</body>
|
| 212 |
+
</html>
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
|
| 216 |
@app.post("/reset")
|
| 217 |
def reset(request: dict = {}):
|
| 218 |
global current_task
|
|
|
|
| 228 |
"info": {}
|
| 229 |
})
|
| 230 |
|
| 231 |
+
|
| 232 |
@app.post("/step")
|
| 233 |
def step(action: ActionRequest):
|
| 234 |
global current_task
|
|
|
|
| 236 |
if task not in envs:
|
| 237 |
task = current_task
|
| 238 |
action_dict = {
|
| 239 |
+
"action_type": action.action_type,
|
| 240 |
"label": action.label,
|
| 241 |
"confidence": action.confidence,
|
| 242 |
+
"reasoning": action.reasoning,
|
| 243 |
+
"focus": action.focus,
|
| 244 |
}
|
| 245 |
obs, reward, done, info = envs[task].step(action_dict)
|
| 246 |
return JSONResponse({
|
|
|
|
| 250 |
"info": info
|
| 251 |
})
|
| 252 |
|
| 253 |
+
|
| 254 |
@app.get("/state")
|
| 255 |
def state():
|
| 256 |
return JSONResponse(envs[current_task].state())
|
| 257 |
|
| 258 |
+
|
| 259 |
@app.get("/health")
|
| 260 |
def health():
|
| 261 |
+
return {"status": "healthy", "service": "voice-authenticity-openenv"}
|
| 262 |
+
|
| 263 |
|
| 264 |
@app.get("/")
|
| 265 |
def root():
|
| 266 |
+
return {
|
| 267 |
+
"name": "voice-authenticity-openenv",
|
| 268 |
+
"version": "2.0.0",
|
| 269 |
+
"status": "running",
|
| 270 |
+
"tasks": TASKS,
|
| 271 |
+
"web": "/web",
|
| 272 |
+
"docs": "/docs"
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
def main():
    """Run the API server.

    Binds all interfaces on port 7860 by default; honors a ``PORT``
    environment variable so the hosting platform can override the port
    without a code change (default unchanged, so existing deploys keep
    working).
    """
    port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()
|
environment/__pycache__/env.cpython-310.pyc
CHANGED
|
Binary files a/environment/__pycache__/env.cpython-310.pyc and b/environment/__pycache__/env.cpython-310.pyc differ
|
|
|
environment/__pycache__/graders.cpython-310.pyc
CHANGED
|
Binary files a/environment/__pycache__/graders.cpython-310.pyc and b/environment/__pycache__/graders.cpython-310.pyc differ
|
|
|
environment/__pycache__/models.cpython-310.pyc
CHANGED
|
Binary files a/environment/__pycache__/models.cpython-310.pyc and b/environment/__pycache__/models.cpython-310.pyc differ
|
|
|
environment/data/features_phonecall.npy
ADDED
|
Binary file (96.1 kB). View file
|
|
|
environment/data/features_streaming.npy
ADDED
|
Binary file (96.1 kB). View file
|
|
|
environment/data/labels_phonecall.npy
ADDED
|
Binary file (2.13 kB). View file
|
|
|
environment/data/labels_streaming.npy
ADDED
|
Binary file (2.13 kB). View file
|
|
|
environment/env.py
CHANGED
|
@@ -1,123 +1,638 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
import random
|
| 3 |
-
from
|
|
|
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
DIFFICULTY_MAP = {
|
| 8 |
"clean_detection": "easy",
|
| 9 |
"compressed_detection": "medium",
|
| 10 |
-
"adversarial_detection": "hard"
|
|
|
|
|
|
|
| 11 |
}
|
| 12 |
|
| 13 |
DATA_FILES = {
|
| 14 |
"clean_detection": (
|
| 15 |
"environment/data/features.npy",
|
| 16 |
-
"environment/data/labels.npy"
|
| 17 |
),
|
| 18 |
"compressed_detection": (
|
| 19 |
"environment/data/features_compressed.npy",
|
| 20 |
-
"environment/data/labels_compressed.npy"
|
| 21 |
),
|
| 22 |
"adversarial_detection": (
|
| 23 |
"environment/data/features_adversarial.npy",
|
| 24 |
-
"environment/data/labels_adversarial.npy"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
),
|
| 26 |
}
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
class VoiceAuthenticityEnv:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def __init__(self, task_name: str = "clean_detection"):
|
| 30 |
-
assert task_name in TASKS, f"Unknown task: {task_name}"
|
| 31 |
-
self.task_name
|
| 32 |
self.difficulty = DIFFICULTY_MAP[task_name]
|
| 33 |
|
| 34 |
feat_file, label_file = DATA_FILES[task_name]
|
| 35 |
-
self.features
|
| 36 |
-
self.labels
|
| 37 |
self.raw_features = np.load("environment/data/features_raw.npy")
|
| 38 |
-
self.indices
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
self.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
self.
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
self.
|
| 48 |
-
self.
|
| 49 |
-
self.
|
| 50 |
-
self.
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
return self._make_observation()
|
| 53 |
|
| 54 |
-
def step(self, action: dict):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
if self.done:
|
| 56 |
raise RuntimeError("Episode done. Call reset().")
|
| 57 |
|
| 58 |
-
|
| 59 |
-
if self.phase == "analyze":
|
| 60 |
-
self.focus_features = action.get("focus", ["jitter", "shimmer", "hnr"])
|
| 61 |
-
self.step_number += 1
|
| 62 |
-
self.phase = "decide"
|
| 63 |
-
obs = self._make_observation()
|
| 64 |
-
return obs, 0.0, False, {
|
| 65 |
-
"phase": "decide",
|
| 66 |
-
"message": "Analysis received. Now submit your final classification.",
|
| 67 |
-
"focused_on": self.focus_features
|
| 68 |
-
}
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
"true_label": true_label,
|
| 84 |
-
"difficulty": self.difficulty,
|
| 85 |
-
"task": self.task_name
|
| 86 |
-
}
|
| 87 |
-
return obs, reward, self.done, info
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return {
|
| 91 |
-
"task_name":
|
| 92 |
-
"difficulty":
|
| 93 |
-
"step_number":
|
| 94 |
-
"
|
| 95 |
-
"
|
| 96 |
-
"
|
| 97 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
}
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else:
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
return VoiceObservation(
|
| 117 |
-
features
|
| 118 |
-
task_name
|
| 119 |
-
step_number
|
| 120 |
-
difficulty
|
| 121 |
-
sample_id
|
| 122 |
-
hint
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Authenticity Detection Environment — 5-action multi-step agent loop.
|
| 3 |
+
|
| 4 |
+
Actions:
|
| 5 |
+
request_temporal_features — reveals jitter, shimmer, HNR
|
| 6 |
+
request_spectral_features — reveals MFCC values
|
| 7 |
+
request_comparison — returns similarity to real/fake reference centroids
|
| 8 |
+
analyze_evidence — synthesizes accumulated evidence
|
| 9 |
+
final_classify — submits label + confidence + reasoning (terminal)
|
| 10 |
+
|
| 11 |
+
Partial observability: the agent starts with NO features visible and must
|
| 12 |
+
actively query the environment to build its picture before classifying.
|
| 13 |
+
|
| 14 |
+
Step-level rewards provide shaping signals throughout the episode.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
import numpy as np
|
| 18 |
import random
|
| 19 |
+
from typing import List, Dict, Optional, Any
|
| 20 |
+
from environment.models import VoiceObservation, ActionType
|
| 21 |
|
| 22 |
+
# ── Task registry ───────────────────────────────────────────────────────

# Every task the environment supports; __init__ validates against this.
TASKS = [
    "clean_detection",
    "compressed_detection",
    "adversarial_detection",
    "streaming_detection",
    "phonecall_detection",
]

# Human-readable difficulty tier per task (surfaced in info/observations).
DIFFICULTY_MAP = {
    "clean_detection": "easy",
    "compressed_detection": "medium",
    "adversarial_detection": "hard",
    "streaming_detection": "medium_hard",
    "phonecall_detection": "extreme",
}

# (features .npy, labels .npy) file pair backing each task, relative to
# the process working directory (the repo root in the container).
DATA_FILES = {
    "clean_detection": (
        "environment/data/features.npy",
        "environment/data/labels.npy",
    ),
    "compressed_detection": (
        "environment/data/features_compressed.npy",
        "environment/data/labels_compressed.npy",
    ),
    "adversarial_detection": (
        "environment/data/features_adversarial.npy",
        "environment/data/labels_adversarial.npy",
    ),
    "streaming_detection": (
        "environment/data/features_streaming.npy",
        "environment/data/labels_streaming.npy",
    ),
    "phonecall_detection": (
        "environment/data/features_phonecall.npy",
        "environment/data/labels_phonecall.npy",
    ),
}

MAX_STEPS = 6  # 5 actions + 1 buffer

# ── Step-level reward constants ─────────────────────────────────────────
# Shaping bonuses/penalties applied per step, on top of the terminal
# classification reward.

REWARD_FIRST_ACTION_GATHER = 0.05      # first action is a feature request
REWARD_MULTI_FEATURE_TYPES = 0.05      # requested both temporal AND spectral
REWARD_ANALYZE_BEFORE_CLASSIFY = 0.05  # used analyze_evidence before final
PENALTY_JUMP_TO_CLASSIFY = -0.10       # final_classify as first action
PENALTY_REPEAT_ACTION = -0.05          # same action twice
PENALTY_CONTRADICTORY_REASONING = -0.10  # reasoning contradicts label
|
| 73 |
+
|
| 74 |
+
|
| 75 |
class VoiceAuthenticityEnv:
|
| 76 |
+
"""Multi-step voice authenticity detection environment.
|
| 77 |
+
|
| 78 |
+
The agent starts with no features visible and must issue actions to
|
| 79 |
+
reveal information before making a final classification.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
def __init__(self, task_name: str = "clean_detection"):
|
| 83 |
+
assert task_name in TASKS, f"Unknown task: {task_name}. Valid: {TASKS}"
|
| 84 |
+
self.task_name = task_name
|
| 85 |
self.difficulty = DIFFICULTY_MAP[task_name]
|
| 86 |
|
| 87 |
feat_file, label_file = DATA_FILES[task_name]
|
| 88 |
+
self.features = np.load(feat_file)
|
| 89 |
+
self.labels = np.load(label_file)
|
| 90 |
self.raw_features = np.load("environment/data/features_raw.npy")
|
| 91 |
+
self.indices = list(range(len(self.labels)))
|
| 92 |
+
|
| 93 |
+
# Precompute reference centroids for comparison action
|
| 94 |
+
self._compute_reference_centroids()
|
| 95 |
+
|
| 96 |
+
# Episode state
|
| 97 |
+
self.current_idx: Optional[int] = None
|
| 98 |
+
self.step_number: int = 0
|
| 99 |
+
self.done: bool = False
|
| 100 |
+
self.action_history: List[str] = []
|
| 101 |
+
self.revealed_features: Dict[str, Any] = {}
|
| 102 |
+
self.step_rewards: List[float] = []
|
| 103 |
+
self.evidence_accumulated: List[str] = []
|
| 104 |
+
|
| 105 |
+
# Streaming task noise schedule (soft-gating)
|
| 106 |
+
self._streaming_noise_schedule = {
|
| 107 |
+
1: 0.8, # very noisy early
|
| 108 |
+
2: 0.5,
|
| 109 |
+
3: 0.3,
|
| 110 |
+
4: 0.1,
|
| 111 |
+
5: 0.05, # nearly clean late
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
def _compute_reference_centroids(self):
|
| 115 |
+
"""Compute mean feature vectors for real vs fake samples."""
|
| 116 |
+
real_mask = self.labels == 0
|
| 117 |
+
fake_mask = self.labels == 1
|
| 118 |
+
|
| 119 |
+
if real_mask.sum() > 0:
|
| 120 |
+
self.real_centroid = self.features[real_mask].mean(axis=0)
|
| 121 |
+
else:
|
| 122 |
+
self.real_centroid = np.full(self.features.shape[1], 0.05)
|
| 123 |
+
|
| 124 |
+
if fake_mask.sum() > 0:
|
| 125 |
+
self.fake_centroid = self.features[fake_mask].mean(axis=0)
|
| 126 |
+
else:
|
| 127 |
+
self.fake_centroid = np.full(self.features.shape[1], 0.05)
|
| 128 |
+
|
| 129 |
+
def reset(self) -> VoiceObservation:
|
| 130 |
+
"""Reset episode. Returns observation with NO features visible."""
|
| 131 |
+
self.step_number = 0
|
| 132 |
+
self.done = False
|
| 133 |
+
self.action_history = []
|
| 134 |
+
self.revealed_features = {}
|
| 135 |
+
self.step_rewards = []
|
| 136 |
+
self.evidence_accumulated = []
|
| 137 |
+
self.current_idx = random.choice(self.indices)
|
| 138 |
return self._make_observation()
|
| 139 |
|
| 140 |
+
    def step(self, action: dict) -> tuple:
        """Execute one action and return (observation, reward, done, info).

        Args:
            action: dict with 'action_type' and optionally label/confidence/reasoning.

        Returns:
            (VoiceObservation, float, bool, dict)

        Raises:
            RuntimeError: if called after the episode has ended.
            ValueError: for an unrecognized action_type.
        """
        if self.done:
            raise RuntimeError("Episode done. Call reset().")

        # A missing action_type defaults to the terminal classify action.
        action_type = action.get("action_type", "final_classify")

        # Validate action type against the ActionType enum.
        valid_actions = [at.value for at in ActionType]
        if action_type not in valid_actions:
            raise ValueError(
                f"Unknown action_type: {action_type}. Valid: {valid_actions}"
            )

        # Track action
        self.action_history.append(action_type)
        self.step_number += 1

        # Compute step-level shaping reward from the action sequence so far.
        step_reward = self._compute_step_reward(action_type, action)

        # Dispatch to action handler.
        # NOTE(review): assumes ActionType has exactly these five members,
        # so one branch always binds obs/info — confirm against models.py.
        if action_type == ActionType.REQUEST_TEMPORAL.value:
            obs, info = self._handle_request_temporal()
        elif action_type == ActionType.REQUEST_SPECTRAL.value:
            obs, info = self._handle_request_spectral()
        elif action_type == ActionType.REQUEST_COMPARISON.value:
            obs, info = self._handle_request_comparison()
        elif action_type == ActionType.ANALYZE_EVIDENCE.value:
            obs, info = self._handle_analyze_evidence(action)
        elif action_type == ActionType.FINAL_CLASSIFY.value:
            # Terminal action also carries the classification reward.
            obs, final_reward, info = self._handle_final_classify(action)
            step_reward += final_reward

        # NOTE(review): the raw (pre-clamp) reward is recorded in history
        # here, while the clamped value is returned below — so negative
        # penalties show up in state()'s step_rewards but never in the
        # returned reward (floor is 0.05). Confirm this is intended.
        self.step_rewards.append(step_reward)

        # Cap total reward to [0.05, 0.95]
        step_reward = max(0.05, min(0.95, step_reward))

        # Check step limit
        if self.step_number >= MAX_STEPS and not self.done:
            self.done = True
            info["message"] = "Max steps reached. Episode ended."

        return obs, round(step_reward, 4), self.done, info
|
| 192 |
+
|
| 193 |
+
def state(self) -> dict:
|
| 194 |
+
"""Return full environment state for debugging."""
|
| 195 |
return {
|
| 196 |
+
"task_name": self.task_name,
|
| 197 |
+
"difficulty": self.difficulty,
|
| 198 |
+
"step_number": self.step_number,
|
| 199 |
+
"done": self.done,
|
| 200 |
+
"current_idx": self.current_idx,
|
| 201 |
+
"action_history": self.action_history,
|
| 202 |
+
"revealed_features": list(self.revealed_features.keys()),
|
| 203 |
+
"step_rewards": self.step_rewards,
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
# ── Action handlers ─────────────────────────────────────────────────
|
| 207 |
+
|
| 208 |
+
def _handle_request_temporal(self) -> tuple:
|
| 209 |
+
"""Reveal jitter, shimmer, HNR values."""
|
| 210 |
+
raw = self.raw_features[self.current_idx]
|
| 211 |
+
norm = self.features[self.current_idx]
|
| 212 |
+
|
| 213 |
+
temporal_data = {
|
| 214 |
+
"jitter": round(float(raw[42]), 6),
|
| 215 |
+
"shimmer": round(float(raw[43]), 6),
|
| 216 |
+
"hnr": round(float(raw[44]), 4),
|
| 217 |
+
"jitter_normalized": round(float(norm[42]), 4),
|
| 218 |
+
"shimmer_normalized": round(float(norm[43]), 4),
|
| 219 |
+
"hnr_normalized": round(float(norm[44]), 4),
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
# Apply streaming noise if applicable
|
| 223 |
+
if self.task_name == "streaming_detection":
|
| 224 |
+
temporal_data = self._apply_streaming_noise(temporal_data)
|
| 225 |
+
|
| 226 |
+
self.revealed_features["temporal"] = temporal_data
|
| 227 |
+
self.evidence_accumulated.append(
|
| 228 |
+
f"Temporal features: jitter={temporal_data['jitter']}, "
|
| 229 |
+
f"shimmer={temporal_data['shimmer']}, hnr={temporal_data['hnr']}"
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
obs = self._make_observation()
|
| 233 |
+
info = {
|
| 234 |
+
"action": "request_temporal_features",
|
| 235 |
+
"message": "Temporal features revealed: jitter, shimmer, HNR.",
|
| 236 |
+
"data": temporal_data,
|
| 237 |
+
}
|
| 238 |
+
return obs, info
|
| 239 |
+
|
| 240 |
+
def _handle_request_spectral(self) -> tuple:
|
| 241 |
+
"""Reveal MFCC mean and std values."""
|
| 242 |
+
raw = self.raw_features[self.current_idx]
|
| 243 |
+
norm = self.features[self.current_idx]
|
| 244 |
+
|
| 245 |
+
spectral_data = {
|
| 246 |
+
"mfcc_means": [round(float(v), 4) for v in raw[0:20]],
|
| 247 |
+
"mfcc_stds": [round(float(v), 4) for v in raw[20:40]],
|
| 248 |
+
"zcr": round(float(raw[40]), 6),
|
| 249 |
+
"spectral_centroid": round(float(raw[41]), 4),
|
| 250 |
+
"mfcc_means_normalized": [round(float(v), 4) for v in norm[0:20]],
|
| 251 |
+
"mfcc_stds_normalized": [round(float(v), 4) for v in norm[20:40]],
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
# Apply streaming noise if applicable
|
| 255 |
+
if self.task_name == "streaming_detection":
|
| 256 |
+
spectral_data = self._apply_streaming_noise(spectral_data)
|
| 257 |
+
|
| 258 |
+
self.revealed_features["spectral"] = spectral_data
|
| 259 |
+
self.evidence_accumulated.append(
|
| 260 |
+
f"Spectral features: {len(spectral_data['mfcc_means'])} MFCC coefficients, "
|
| 261 |
+
f"ZCR={spectral_data['zcr']}, centroid={spectral_data['spectral_centroid']}"
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
obs = self._make_observation()
|
| 265 |
+
info = {
|
| 266 |
+
"action": "request_spectral_features",
|
| 267 |
+
"message": "Spectral features revealed: 20 MFCC means, 20 MFCC stds, ZCR, spectral centroid.",
|
| 268 |
+
"data": spectral_data,
|
| 269 |
+
}
|
| 270 |
+
return obs, info
|
| 271 |
+
|
| 272 |
+
def _handle_request_comparison(self) -> tuple:
|
| 273 |
+
"""Compare this sample to known real/fake reference centroids."""
|
| 274 |
+
sample = self.features[self.current_idx]
|
| 275 |
+
|
| 276 |
+
# Cosine similarity to real and fake centroids
|
| 277 |
+
real_sim = self._cosine_similarity(sample, self.real_centroid)
|
| 278 |
+
fake_sim = self._cosine_similarity(sample, self.fake_centroid)
|
| 279 |
+
|
| 280 |
+
# Euclidean distance
|
| 281 |
+
real_dist = float(np.linalg.norm(sample - self.real_centroid))
|
| 282 |
+
fake_dist = float(np.linalg.norm(sample - self.fake_centroid))
|
| 283 |
+
|
| 284 |
+
comparison_data = {
|
| 285 |
+
"cosine_similarity_to_real": round(real_sim, 4),
|
| 286 |
+
"cosine_similarity_to_fake": round(fake_sim, 4),
|
| 287 |
+
"euclidean_distance_to_real": round(real_dist, 4),
|
| 288 |
+
"euclidean_distance_to_fake": round(fake_dist, 4),
|
| 289 |
+
"closer_to": "real" if real_dist < fake_dist else "fake",
|
| 290 |
+
"similarity_differential": round(real_sim - fake_sim, 4),
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
self.revealed_features["comparison"] = comparison_data
|
| 294 |
+
self.evidence_accumulated.append(
|
| 295 |
+
f"Comparison: cosine_sim_real={comparison_data['cosine_similarity_to_real']}, "
|
| 296 |
+
f"cosine_sim_fake={comparison_data['cosine_similarity_to_fake']}, "
|
| 297 |
+
f"closer_to={comparison_data['closer_to']}"
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
obs = self._make_observation()
|
| 301 |
+
info = {
|
| 302 |
+
"action": "request_comparison",
|
| 303 |
+
"message": "Comparison to reference centroids computed.",
|
| 304 |
+
"data": comparison_data,
|
| 305 |
+
}
|
| 306 |
+
return obs, info
|
| 307 |
+
|
| 308 |
+
def _handle_analyze_evidence(self, action: dict) -> tuple:
|
| 309 |
+
"""Synthesize all gathered evidence into a structured summary."""
|
| 310 |
+
evidence_parts = []
|
| 311 |
+
|
| 312 |
+
# Build evidence summary from what's been revealed
|
| 313 |
+
if "temporal" in self.revealed_features:
|
| 314 |
+
t = self.revealed_features["temporal"]
|
| 315 |
+
jitter_val = t.get("jitter", 0)
|
| 316 |
+
shimmer_val = t.get("shimmer", 0)
|
| 317 |
+
hnr_val = t.get("hnr", 0)
|
| 318 |
+
|
| 319 |
+
# Provide interpretive guidance based on actual values
|
| 320 |
+
jitter_interp = "elevated (typical of real speech)" if jitter_val > 0.025 else "low (typical of synthetic)"
|
| 321 |
+
shimmer_interp = "elevated (typical of real speech)" if shimmer_val > 0.10 else "low (typical of synthetic)"
|
| 322 |
+
hnr_interp = "low (typical of real speech)" if hnr_val < 12.0 else "high (typical of synthetic)"
|
| 323 |
+
|
| 324 |
+
evidence_parts.append(
|
| 325 |
+
f"TEMPORAL: jitter={jitter_val} ({jitter_interp}), "
|
| 326 |
+
f"shimmer={shimmer_val} ({shimmer_interp}), "
|
| 327 |
+
f"HNR={hnr_val} ({hnr_interp})"
|
| 328 |
+
)
|
| 329 |
+
|
| 330 |
+
if "spectral" in self.revealed_features:
|
| 331 |
+
s = self.revealed_features["spectral"]
|
| 332 |
+
mfcc_mean_avg = np.mean(s.get("mfcc_means", [0])) if s.get("mfcc_means") else 0
|
| 333 |
+
mfcc_std_avg = np.mean(s.get("mfcc_stds", [0])) if s.get("mfcc_stds") else 0
|
| 334 |
+
evidence_parts.append(
|
| 335 |
+
f"SPECTRAL: avg_mfcc_mean={mfcc_mean_avg:.3f}, "
|
| 336 |
+
f"avg_mfcc_std={mfcc_std_avg:.3f}, "
|
| 337 |
+
f"zcr={s.get('zcr', 0)}, centroid={s.get('spectral_centroid', 0)}"
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
if "comparison" in self.revealed_features:
|
| 341 |
+
c = self.revealed_features["comparison"]
|
| 342 |
+
evidence_parts.append(
|
| 343 |
+
f"COMPARISON: closer_to={c['closer_to']}, "
|
| 344 |
+
f"diff={c['similarity_differential']}"
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
if not evidence_parts:
|
| 348 |
+
summary = "No evidence gathered yet. Request features before analyzing."
|
| 349 |
+
else:
|
| 350 |
+
# Count evidence signals pointing to real vs fake
|
| 351 |
+
real_signals = 0
|
| 352 |
+
fake_signals = 0
|
| 353 |
+
|
| 354 |
+
if "temporal" in self.revealed_features:
|
| 355 |
+
t = self.revealed_features["temporal"]
|
| 356 |
+
if t.get("jitter", 0) > 0.025:
|
| 357 |
+
real_signals += 1
|
| 358 |
+
else:
|
| 359 |
+
fake_signals += 1
|
| 360 |
+
if t.get("shimmer", 0) > 0.10:
|
| 361 |
+
real_signals += 1
|
| 362 |
+
else:
|
| 363 |
+
fake_signals += 1
|
| 364 |
+
if t.get("hnr", 0) < 12.0:
|
| 365 |
+
real_signals += 1
|
| 366 |
+
else:
|
| 367 |
+
fake_signals += 1
|
| 368 |
+
|
| 369 |
+
if "comparison" in self.revealed_features:
|
| 370 |
+
c = self.revealed_features["comparison"]
|
| 371 |
+
if c["closer_to"] == "real":
|
| 372 |
+
real_signals += 1
|
| 373 |
+
else:
|
| 374 |
+
fake_signals += 1
|
| 375 |
+
|
| 376 |
+
total_signals = real_signals + fake_signals
|
| 377 |
+
if total_signals > 0:
|
| 378 |
+
suggested_confidence = max(real_signals, fake_signals) / total_signals
|
| 379 |
+
leaning = "REAL" if real_signals > fake_signals else "SYNTHETIC"
|
| 380 |
+
else:
|
| 381 |
+
suggested_confidence = 0.55
|
| 382 |
+
leaning = "UNCERTAIN"
|
| 383 |
+
|
| 384 |
+
# Adjust confidence for difficulty
|
| 385 |
+
if self.difficulty in ("hard", "extreme", "medium_hard"):
|
| 386 |
+
suggested_confidence = min(suggested_confidence, 0.80)
|
| 387 |
+
|
| 388 |
+
summary = (
|
| 389 |
+
f"Evidence analysis ({len(evidence_parts)} sources):\n"
|
| 390 |
+
+ "\n".join(f" • {p}" for p in evidence_parts)
|
| 391 |
+
+ f"\n\nSignal tally: {real_signals} real vs {fake_signals} synthetic"
|
| 392 |
+
+ f"\nPreliminary assessment: leaning {leaning}"
|
| 393 |
+
+ f"\nSuggested confidence: {suggested_confidence:.2f}"
|
| 394 |
+
+ f"\nDifficulty context: {self.difficulty}"
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
self.revealed_features["analysis"] = {
|
| 398 |
+
"summary": summary,
|
| 399 |
+
"evidence_count": len(evidence_parts),
|
| 400 |
}
|
| 401 |
|
| 402 |
+
obs = self._make_observation(evidence_summary=summary)
|
| 403 |
+
info = {
|
| 404 |
+
"action": "analyze_evidence",
|
| 405 |
+
"message": "Evidence synthesized.",
|
| 406 |
+
"summary": summary,
|
| 407 |
+
"evidence_count": len(evidence_parts),
|
| 408 |
+
}
|
| 409 |
+
return obs, info
|
| 410 |
+
|
| 411 |
+
def _handle_final_classify(self, action: dict) -> tuple:
    """Submit the final classification, grade it, and end the episode.

    Returns (observation, score, info) — unlike the other handlers this
    also yields the grader's terminal score.
    """
    # Deferred import, as in the original module layout — presumably to
    # avoid an import cycle between env and graders; confirm if refactoring.
    from environment.graders import grade

    true_label = int(self.labels[self.current_idx])

    verdict = grade(
        true_label=true_label,
        action=action,
        difficulty=self.difficulty,
        action_history=self.action_history,
    )

    # Terminal transition: no further actions will be accepted.
    self.done = True

    info = {
        "action": "final_classify",
        "phase": "done",
        "true_label": true_label,
        "predicted_label": action.get("label", 0),
        "difficulty": self.difficulty,
        "task": self.task_name,
        "grader_breakdown": verdict["breakdown"],
        "grader_weights": verdict["weights"],
        "penalties": verdict["penalties"],
        "correct": verdict["correct"],
        "episode_summary": {
            "actions_taken": self.action_history,
            "features_revealed": list(self.revealed_features.keys()),
            "total_steps": self.step_number,
        },
    }

    return self._make_observation(), verdict["score"], info
|
| 446 |
+
|
| 447 |
+
# ── Step-level reward computation ───────────────────────────────────
|
| 448 |
+
|
| 449 |
+
def _compute_step_reward(self, action_type: str, action: dict) -> float:
    """Compute the per-step shaping reward for the action just taken.

    Starts from a 0.05 base and adds bonuses/penalties that encourage
    gathering evidence first, covering multiple feature types, analyzing
    before classifying, and keeping reasoning consistent with the label.
    """
    shaped = 0.05

    gather_set = {
        ActionType.REQUEST_TEMPORAL.value,
        ActionType.REQUEST_SPECTRAL.value,
        ActionType.REQUEST_COMPARISON.value,
    }

    is_first_action = len(self.action_history) == 1

    # Bonus: opening the episode with a feature request.
    if is_first_action and action_type in gather_set:
        shaped += REWARD_FIRST_ACTION_GATHER

    # Penalty: classifying blind on the very first step.
    if is_first_action and action_type == ActionType.FINAL_CLASSIFY.value:
        shaped += PENALTY_JUMP_TO_CLASSIFY

    # Bonus (once): this action completed the temporal+spectral pair.
    pair = {ActionType.REQUEST_TEMPORAL.value, ActionType.REQUEST_SPECTRAL.value}
    if (pair <= set(self.action_history)
            and len(self.action_history) >= 2
            and action_type in pair):
        if not pair <= set(self.action_history[:-1]):
            shaped += REWARD_MULTI_FEATURE_TYPES

    # Bonus: evidence was analyzed at some point before final_classify.
    if (action_type == ActionType.FINAL_CLASSIFY.value
            and ActionType.ANALYZE_EVIDENCE.value in self.action_history[:-1]):
        shaped += REWARD_ANALYZE_BEFORE_CLASSIFY

    # Penalty: same action twice in a row.
    if len(self.action_history) >= 2 and self.action_history[-1] == self.action_history[-2]:
        shaped += PENALTY_REPEAT_ACTION

    # Penalty: reasoning text contradicts the submitted label
    # (checked only at classification time, with negation escape hatches).
    if action_type == ActionType.FINAL_CLASSIFY.value:
        label = action.get("label", 0)
        reasoning = action.get("reasoning", "").lower()
        if label == 0 and any(kw in reasoning for kw in ["synthetic", "fake", "artificial", "generated"]):
            if not any(kw in reasoning for kw in ["not synthetic", "not fake", "not artificial"]):
                shaped += PENALTY_CONTRADICTORY_REASONING
        elif label == 1 and any(kw in reasoning for kw in ["real", "human", "natural", "authentic"]):
            if not any(kw in reasoning for kw in ["not real", "not human", "not natural"]):
                shaped += PENALTY_CONTRADICTORY_REASONING

    return shaped
|
| 500 |
+
|
| 501 |
+
# ── Streaming noise (soft-gating) ───────────────────────────────────
|
| 502 |
+
|
| 503 |
+
def _apply_streaming_noise(self, data: dict) -> dict:
|
| 504 |
+
"""Apply noise to features based on step number for streaming task.
|
| 505 |
+
|
| 506 |
+
Earlier steps get noisier data, later steps get cleaner data.
|
| 507 |
+
This is soft-gating: features are always available but with
|
| 508 |
+
varying fidelity.
|
| 509 |
+
"""
|
| 510 |
+
noise_level = self._streaming_noise_schedule.get(
|
| 511 |
+
self.step_number, 0.05
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
noisy_data = {}
|
| 515 |
+
for key, value in data.items():
|
| 516 |
+
if isinstance(value, (int, float)):
|
| 517 |
+
noise = np.random.normal(0, noise_level * abs(value) + 1e-6)
|
| 518 |
+
noisy_data[key] = round(float(value + noise), 6)
|
| 519 |
+
elif isinstance(value, list):
|
| 520 |
+
noisy_data[key] = [
|
| 521 |
+
round(float(v + np.random.normal(0, noise_level * abs(v) + 1e-6)), 4)
|
| 522 |
+
for v in value
|
| 523 |
+
]
|
| 524 |
+
else:
|
| 525 |
+
noisy_data[key] = value
|
| 526 |
+
|
| 527 |
+
return noisy_data
|
| 528 |
+
|
| 529 |
+
# ── Helper methods ──────────────────────────────────────────────────
|
| 530 |
+
|
| 531 |
+
@staticmethod
|
| 532 |
+
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
| 533 |
+
"""Cosine similarity between two vectors."""
|
| 534 |
+
norm_a = np.linalg.norm(a)
|
| 535 |
+
norm_b = np.linalg.norm(b)
|
| 536 |
+
if norm_a == 0 or norm_b == 0:
|
| 537 |
+
return 0.05
|
| 538 |
+
return float(np.dot(a, b) / (norm_a * norm_b))
|
| 539 |
+
|
| 540 |
+
def _make_observation(
    self,
    evidence_summary: Optional[str] = None,
) -> VoiceObservation:
    """Assemble the observation for the current state.

    Partial observability: the full normalized feature vector is exposed
    only once BOTH temporal and spectral groups have been requested, or
    once the episode is done; otherwise the vector is filled with the
    0.05 placeholder value.
    """
    fully_revealed = (
        "temporal" in self.revealed_features
        and "spectral" in self.revealed_features
    )

    if fully_revealed or self.done:
        vector = self.features[self.current_idx].tolist()
    else:
        # Placeholder vector: same width, constant 0.05 baseline.
        vector = [0.05] * self.features.shape[1]

    return VoiceObservation(
        features=vector,
        task_name=self.task_name,
        step_number=self.step_number,
        difficulty=self.difficulty,
        sample_id=int(self.current_idx),
        hint=self._build_hint(),
        visible_features=dict(self.revealed_features),
        evidence_summary=evidence_summary,
        comparison_result=self.revealed_features.get("comparison", None),
        available_actions=self._get_available_actions(),
        actions_taken=list(self.action_history),
    )
|
| 582 |
+
|
| 583 |
+
def _build_hint(self) -> str:
    """Compose the contextual hint string shown to the agent.

    Three cases: a terminal message when done, a detailed orientation
    message on step 0 (with task-specific notes), and a compact progress
    line on later steps (with a warning when few steps remain).
    """
    if self.done:
        return "Episode complete."

    if self.step_number == 0:
        hint = (
            f"Task: {self.task_name} (difficulty: {self.difficulty}). "
            f"You have {MAX_STEPS - self.step_number} steps remaining. "
            "No features are visible yet. Use request_temporal_features, "
            "request_spectral_features, or request_comparison to gather "
            "evidence before classifying."
        )
        # Task/difficulty-specific caveats.
        if self.difficulty in ("hard", "extreme"):
            hint += " Warning: this is a challenging task. Gather thorough evidence and calibrate your confidence carefully."
        if self.task_name == "streaming_detection":
            hint += " Note: this is a streaming scenario — earlier feature requests may contain noise that reduces over time."
        if self.task_name == "phonecall_detection":
            hint += " Note: this is a phone call scenario with heavy codec compression and background noise."
        return hint

    fragments = [
        f"Step {self.step_number}/{MAX_STEPS}.",
        f"Task: {self.task_name} ({self.difficulty}).",
        f"Actions taken: {', '.join(self.action_history)}.",
    ]

    if self.revealed_features:
        fragments.append(
            f"Features revealed: {', '.join(list(self.revealed_features.keys()))}."
        )

    steps_left = MAX_STEPS - self.step_number
    if steps_left <= 2:
        fragments.append(f"⚠️ Only {steps_left} steps remaining — consider classifying soon.")

    return " ".join(fragments)
|
| 619 |
+
|
| 620 |
+
def _get_available_actions(self) -> List[str]:
    """List the action values the agent may take next.

    Empty once the episode is done. final_classify is always offered;
    every other action is offered unless it was the immediately
    preceding action (re-requesting after intervening actions is fine).
    """
    if self.done:
        return []

    last_action = self.action_history[-1] if self.action_history else None
    return [
        at.value
        for at in ActionType
        if at == ActionType.FINAL_CLASSIFY or at.value != last_action
    ]
|
environment/graders.py
CHANGED
|
@@ -1,30 +1,329 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
if
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
else:
|
| 10 |
-
return 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
else:
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
if
|
| 23 |
-
|
| 24 |
-
calibration_bonus = 0.45 * (1 - abs(confidence - 0.7))
|
| 25 |
-
return round(base + calibration_bonus, 3)
|
| 26 |
else:
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
6-component grader for Voice Authenticity OpenEnv.
|
| 3 |
+
|
| 4 |
+
Components:
|
| 5 |
+
1. Correctness — label matches ground truth
|
| 6 |
+
2. Confidence calibration — penalizes overconfidence on wrong, rewards calibrated
|
| 7 |
+
3. Trajectory quality — did agent analyze before classifying
|
| 8 |
+
4. Feature utilization — did agent request temporal/spectral features
|
| 9 |
+
5. Reasoning consistency — does reasoning text match chosen label
|
| 10 |
+
6. Action ordering — logical gather → analyze → classify sequence
|
| 11 |
+
|
| 12 |
+
Difficulty weighting adjusts component weights per task difficulty.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from typing import Dict, List, Optional
|
| 16 |
+
|
| 17 |
+
# ── Difficulty-based component weights ──────────────────────────────────
|
| 18 |
+
def _make_weights(correctness, calibration, trajectory, features, reasoning, ordering):
    """Build one per-difficulty weight row keyed by grader component name."""
    return {
        "correctness": correctness,
        "confidence_calibration": calibration,
        "trajectory_quality": trajectory,
        "feature_utilization": features,
        "reasoning_consistency": reasoning,
        "action_ordering": ordering,
    }


# Per-difficulty grading weights (each row sums to 1.0). Harder tiers shift
# weight away from raw correctness toward calibration and process quality.
COMPONENT_WEIGHTS = {
    "easy": _make_weights(0.40, 0.15, 0.10, 0.15, 0.10, 0.10),
    "medium": _make_weights(0.30, 0.20, 0.15, 0.15, 0.10, 0.10),
    "medium_hard": _make_weights(0.25, 0.22, 0.18, 0.15, 0.10, 0.10),
    "hard": _make_weights(0.25, 0.25, 0.18, 0.12, 0.10, 0.10),
    "extreme": _make_weights(0.20, 0.25, 0.20, 0.15, 0.10, 0.10),
}
|
| 60 |
+
|
| 61 |
+
# ── Keywords for reasoning consistency check ────────────────────────────
|
| 62 |
+
# ── Keyword lexicons for the reasoning-consistency check ────────────────
# Substring matches against lowercased reasoning text. Cues typical of
# genuine human speech:
REAL_KEYWORDS = [
    "real", "human", "natural", "authentic", "genuine", "organic",
    "jitter", "high jitter", "shimmer variation", "low hnr",
    "irregular", "imperfect", "variation",
]
# Cues typical of machine-generated speech:
SYNTHETIC_KEYWORDS = [
    "synthetic", "fake", "artificial", "generated", "tts",
    "ai-generated", "deepfake", "machine", "clone",
    "smooth", "perfect", "uniform", "low jitter", "high hnr",
    "stable", "consistent",
]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _score_correctness(true_label: int, predicted_label: int) -> float:
|
| 76 |
+
"""Binary correctness: 0.95 if correct, 0.05 if wrong."""
|
| 77 |
+
return 0.95 if predicted_label == true_label else 0.05
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _score_confidence_calibration(
|
| 81 |
+
correct: bool, confidence: float, difficulty: str
|
| 82 |
+
) -> float:
|
| 83 |
+
"""Score confidence calibration.
|
| 84 |
+
|
| 85 |
+
Correct + calibrated confidence → high score
|
| 86 |
+
Correct + overconfident on hard tasks → penalized
|
| 87 |
+
Wrong + low confidence → partial credit
|
| 88 |
+
Wrong + high confidence → zero
|
| 89 |
+
"""
|
| 90 |
+
if correct:
|
| 91 |
+
if difficulty in ("easy", "medium"):
|
| 92 |
+
# Reward higher confidence when correct on easier tasks
|
| 93 |
+
return 0.6 + 0.4 * confidence
|
| 94 |
+
elif difficulty == "medium_hard":
|
| 95 |
+
# Reward moderate confidence
|
| 96 |
+
ideal = 0.75
|
| 97 |
+
deviation = abs(confidence - ideal)
|
| 98 |
+
return max(0.05, 0.95 - 1.5 * deviation)
|
| 99 |
+
elif difficulty in ("hard", "extreme"):
|
| 100 |
+
# Reward calibrated ~0.7 confidence, penalize overconfidence
|
| 101 |
+
ideal = 0.7
|
| 102 |
+
deviation = abs(confidence - ideal)
|
| 103 |
+
return max(0.05, 0.95 - 2.0 * deviation)
|
| 104 |
+
else:
|
| 105 |
+
# Wrong answer — reward uncertainty, punish overconfidence
|
| 106 |
+
if confidence < 0.3:
|
| 107 |
+
return 0.4 # appropriately uncertain
|
| 108 |
+
elif confidence < 0.5:
|
| 109 |
+
return 0.2
|
| 110 |
+
elif confidence < 0.7:
|
| 111 |
+
return 0.1
|
| 112 |
+
else:
|
| 113 |
+
return 0.05 # overconfident AND wrong
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _score_trajectory_quality(action_history: List[str]) -> float:
|
| 117 |
+
"""Did the agent analyze evidence before classifying?
|
| 118 |
+
|
| 119 |
+
Best: gathered features → analyzed → classified
|
| 120 |
+
Okay: gathered features → classified (skipped analysis)
|
| 121 |
+
Worst: jumped straight to final_classify
|
| 122 |
+
"""
|
| 123 |
+
if len(action_history) <= 1:
|
| 124 |
+
# Only final_classify, no exploration at all
|
| 125 |
+
return 0.05
|
| 126 |
+
|
| 127 |
+
has_analysis = "analyze_evidence" in action_history
|
| 128 |
+
has_gathering = any(
|
| 129 |
+
a in action_history for a in [
|
| 130 |
+
"request_temporal_features",
|
| 131 |
+
"request_spectral_features",
|
| 132 |
+
"request_comparison",
|
| 133 |
+
]
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
if has_gathering and has_analysis:
|
| 137 |
+
return 0.95
|
| 138 |
+
elif has_gathering:
|
| 139 |
+
return 0.6
|
| 140 |
+
elif has_analysis:
|
| 141 |
+
return 0.3
|
| 142 |
+
else:
|
| 143 |
+
return 0.1
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def _score_feature_utilization(action_history: List[str]) -> float:
|
| 147 |
+
"""Did the agent request specific feature types?
|
| 148 |
+
|
| 149 |
+
Best: requested both temporal AND spectral
|
| 150 |
+
Good: requested temporal OR spectral + comparison
|
| 151 |
+
Okay: requested only one type
|
| 152 |
+
Bad: no feature requests
|
| 153 |
+
"""
|
| 154 |
+
has_temporal = "request_temporal_features" in action_history
|
| 155 |
+
has_spectral = "request_spectral_features" in action_history
|
| 156 |
+
has_comparison = "request_comparison" in action_history
|
| 157 |
+
|
| 158 |
+
count = sum([has_temporal, has_spectral, has_comparison])
|
| 159 |
+
|
| 160 |
+
if has_temporal and has_spectral and has_comparison:
|
| 161 |
+
return 0.95
|
| 162 |
+
elif has_temporal and has_spectral:
|
| 163 |
+
return 0.9
|
| 164 |
+
elif count == 2:
|
| 165 |
+
return 0.7
|
| 166 |
+
elif count == 1:
|
| 167 |
+
return 0.4
|
| 168 |
+
else:
|
| 169 |
+
return 0.05
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _score_reasoning_consistency(
    label: int, reasoning: str
) -> float:
    """Score whether the reasoning text supports the chosen label.

    Counts substring hits from the real/synthetic keyword lexicons in the
    lowercased reasoning and compares the tallies against the label.

    Args:
        label: predicted label (0 = real, 1 = synthetic).
        reasoning: free-text justification; may be empty.

    Returns:
        0.95 aligned, 0.5 mixed-but-leaning, 0.1 contradictory,
        0.4 neutral, 0.2 when reasoning is missing or trivially short.
    """
    # Guard BEFORE touching the string: previously reasoning.lower() ran
    # first, so a None reasoning raised AttributeError instead of scoring
    # 0.2. Short-circuiting on falsy reasoning handles None and "" alike.
    if not reasoning or len(reasoning.strip()) < 5:
        return 0.2  # minimal reasoning provided

    reasoning_lower = reasoning.lower()

    real_hits = sum(1 for kw in REAL_KEYWORDS if kw in reasoning_lower)
    synthetic_hits = sum(1 for kw in SYNTHETIC_KEYWORDS if kw in reasoning_lower)

    if label == 0:  # predicted real
        if real_hits > 0 and real_hits >= synthetic_hits:
            return 0.95
        elif real_hits > 0:
            return 0.5
        elif synthetic_hits > 0:
            return 0.1  # contradictory
        else:
            return 0.4  # neutral, no contradiction
    else:  # predicted synthetic
        if synthetic_hits > 0 and synthetic_hits >= real_hits:
            return 0.95
        elif synthetic_hits > 0:
            return 0.5
        elif real_hits > 0:
            return 0.1  # contradictory
        else:
            return 0.4  # neutral
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _score_action_ordering(action_history: List[str]) -> float:
|
| 208 |
+
"""Logical sequence: gather → analyze → classify.
|
| 209 |
+
|
| 210 |
+
Ideal ordering: feature requests first, then analysis, then classify
|
| 211 |
+
Penalized: analysis before any gathering, or classify without gathering
|
| 212 |
+
"""
|
| 213 |
+
if len(action_history) <= 1:
|
| 214 |
+
return 0.1 # jumped straight to classify
|
| 215 |
+
|
| 216 |
+
gathering_actions = {
|
| 217 |
+
"request_temporal_features",
|
| 218 |
+
"request_spectral_features",
|
| 219 |
+
"request_comparison",
|
| 220 |
+
}
|
| 221 |
|
| 222 |
+
# Find position indices
|
| 223 |
+
first_gather_idx = None
|
| 224 |
+
analysis_idx = None
|
| 225 |
+
classify_idx = None
|
| 226 |
+
|
| 227 |
+
for i, action in enumerate(action_history):
|
| 228 |
+
if action in gathering_actions and first_gather_idx is None:
|
| 229 |
+
first_gather_idx = i
|
| 230 |
+
if action == "analyze_evidence" and analysis_idx is None:
|
| 231 |
+
analysis_idx = i
|
| 232 |
+
if action == "final_classify":
|
| 233 |
+
classify_idx = i
|
| 234 |
+
|
| 235 |
+
score = 0.5 # baseline — at least did more than one action
|
| 236 |
+
|
| 237 |
+
# Gathering before analysis is good
|
| 238 |
+
if first_gather_idx is not None and analysis_idx is not None:
|
| 239 |
+
if first_gather_idx < analysis_idx:
|
| 240 |
+
score += 0.25
|
| 241 |
else:
|
| 242 |
+
score -= 0.15 # analyzed before gathering
|
| 243 |
+
|
| 244 |
+
# Analysis before classify
|
| 245 |
+
if analysis_idx is not None and classify_idx is not None:
|
| 246 |
+
if analysis_idx < classify_idx:
|
| 247 |
+
score += 0.25
|
|
|
|
|
|
|
| 248 |
else:
|
| 249 |
+
score -= 0.10
|
| 250 |
+
|
| 251 |
+
# Gathering happened at all
|
| 252 |
+
if first_gather_idx is not None:
|
| 253 |
+
score += 0.1
|
| 254 |
+
|
| 255 |
+
return max(0.05, min(0.95, score))
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def grade(
    true_label: int,
    action: dict,
    difficulty: str,
    action_history: Optional[List[str]] = None,
) -> dict:
    """6-component grader with difficulty-weighted scoring.

    Args:
        true_label: ground truth label (0=real, 1=synthetic)
        action: dict with label, confidence, reasoning
        difficulty: one of easy, medium, medium_hard, hard, extreme
        action_history: list of action_type strings taken this episode

    Returns:
        dict with:
            score: float in [0.05, 0.95] (capped at 0.85 on extreme)
            correct: whether the label matched ground truth
            breakdown: per-component scores
            penalties: human-readable penalty descriptions
            weights: the weight row that was applied
    """
    history = action_history if action_history is not None else ["final_classify"]

    label = action.get("label", 0)
    confidence = action.get("confidence", 0.5)
    reasoning = action.get("reasoning", "")
    correct = label == true_label

    # Unknown difficulties fall back to the "medium" weight row.
    weights = COMPONENT_WEIGHTS.get(difficulty, COMPONENT_WEIGHTS["medium"])

    breakdown = {
        "correctness": _score_correctness(true_label, label),
        "confidence_calibration": _score_confidence_calibration(
            correct, confidence, difficulty
        ),
        "trajectory_quality": _score_trajectory_quality(history),
        "feature_utilization": _score_feature_utilization(history),
        "reasoning_consistency": _score_reasoning_consistency(label, reasoning),
        "action_ordering": _score_action_ordering(history),
    }

    # Weighted total, clamped to the score band and rounded.
    weighted = sum(breakdown[name] * weights[name] for name in breakdown)
    total = round(max(0.05, min(0.95, weighted)), 4)

    # Human-readable penalty notes for transparency.
    penalties = []
    if not correct:
        penalties.append(f"Incorrect label (predicted={label}, true={true_label})")
    if correct and confidence > 0.9 and difficulty in ("hard", "extreme"):
        penalties.append(f"Overconfident on {difficulty} task (confidence={confidence})")
    if len(history) <= 1:
        penalties.append("Jumped straight to final_classify without exploration")
    if breakdown["reasoning_consistency"] < 0.3:
        penalties.append("Reasoning contradicts chosen label")

    # Extreme difficulty caps the attainable score.
    if difficulty == "extreme":
        total = min(total, 0.85)

    return {
        "score": total,
        "correct": correct,
        "breakdown": breakdown,
        "penalties": penalties,
        "weights": weights,
    }
|
environment/models.py
CHANGED
|
@@ -1,21 +1,70 @@
|
|
| 1 |
from pydantic import BaseModel, Field
|
| 2 |
-
from typing import Optional, List
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class VoiceObservation(BaseModel):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
features: List[float]
|
| 6 |
task_name: str
|
| 7 |
step_number: int
|
| 8 |
difficulty: str
|
| 9 |
sample_id: int
|
| 10 |
-
hint: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class VoiceAction(BaseModel):
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
reasoning: str = Field(default="")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
class VoiceReward(BaseModel):
|
|
|
|
| 18 |
score: float
|
| 19 |
correct: bool
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
| 1 |
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import Optional, List, Dict, Any
|
| 3 |
+
from enum import Enum
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ActionType(str, Enum):
|
| 7 |
+
"""Five distinct agent actions for real partial observability."""
|
| 8 |
+
REQUEST_TEMPORAL = "request_temporal_features"
|
| 9 |
+
REQUEST_SPECTRAL = "request_spectral_features"
|
| 10 |
+
REQUEST_COMPARISON = "request_comparison"
|
| 11 |
+
ANALYZE_EVIDENCE = "analyze_evidence"
|
| 12 |
+
FINAL_CLASSIFY = "final_classify"
|
| 13 |
+
|
| 14 |
|
| 15 |
class VoiceObservation(BaseModel):
|
| 16 |
+
"""Observation returned to the agent after each action.
|
| 17 |
+
|
| 18 |
+
features: full 48-dim vector (only populated after sufficient exploration
|
| 19 |
+
or on final step)
|
| 20 |
+
visible_features: dict of feature groups revealed so far
|
| 21 |
+
evidence_summary: structured summary from analyze_evidence action
|
| 22 |
+
comparison_result: similarity scores from request_comparison action
|
| 23 |
+
"""
|
| 24 |
features: List[float]
|
| 25 |
task_name: str
|
| 26 |
step_number: int
|
| 27 |
difficulty: str
|
| 28 |
sample_id: int
|
| 29 |
+
hint: Optional[str] = None
|
| 30 |
+
visible_features: Dict[str, Any] = Field(default_factory=dict)
|
| 31 |
+
evidence_summary: Optional[str] = None
|
| 32 |
+
comparison_result: Optional[Dict[str, Any]] = None
|
| 33 |
+
available_actions: List[str] = Field(default_factory=list)
|
| 34 |
+
actions_taken: List[str] = Field(default_factory=list)
|
| 35 |
+
|
| 36 |
|
| 37 |
class VoiceAction(BaseModel):
|
| 38 |
+
"""Action submitted by the agent.
|
| 39 |
+
|
| 40 |
+
action_type: which of the 5 actions to perform
|
| 41 |
+
label: classification (only used for final_classify)
|
| 42 |
+
confidence: agent confidence (used for final_classify and analyze_evidence)
|
| 43 |
+
reasoning: explanation (used for final_classify)
|
| 44 |
+
focus: optional list of feature names (backward compat)
|
| 45 |
+
"""
|
| 46 |
+
action_type: str = Field(default="final_classify")
|
| 47 |
+
label: int = Field(default=0, ge=0, le=1)
|
| 48 |
+
confidence: float = Field(default=0.5, ge=0.0, le=1.0)
|
| 49 |
reasoning: str = Field(default="")
|
| 50 |
+
focus: List[str] = Field(default_factory=list)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class GraderBreakdown(BaseModel):
|
| 54 |
+
"""Detailed 6-component grading breakdown."""
|
| 55 |
+
correctness: float = 0.0
|
| 56 |
+
confidence_calibration: float = 0.0
|
| 57 |
+
trajectory_quality: float = 0.0
|
| 58 |
+
feature_utilization: float = 0.0
|
| 59 |
+
reasoning_consistency: float = 0.0
|
| 60 |
+
action_ordering: float = 0.0
|
| 61 |
+
|
| 62 |
|
| 63 |
class VoiceReward(BaseModel):
|
| 64 |
+
"""Reward with full breakdown."""
|
| 65 |
score: float
|
| 66 |
correct: bool
|
| 67 |
+
step_rewards: List[float] = Field(default_factory=list)
|
| 68 |
+
grader_breakdown: Optional[GraderBreakdown] = None
|
| 69 |
+
penalties: List[str] = Field(default_factory=list)
|
| 70 |
+
breakdown: str = ""
|
inference.py
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from dotenv import load_dotenv
|
| 2 |
load_dotenv()
|
| 3 |
|
|
@@ -6,14 +17,13 @@ import os
|
|
| 6 |
import textwrap
|
| 7 |
import json
|
| 8 |
import requests
|
| 9 |
-
from typing import List
|
| 10 |
from openai import OpenAI
|
| 11 |
|
| 12 |
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 13 |
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 14 |
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 15 |
BENCHMARK = "voice-authenticity"
|
| 16 |
-
MAX_STEPS = 1
|
| 17 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 18 |
|
| 19 |
# Environment server URL
|
|
@@ -21,47 +31,64 @@ ENV_SERVER_URL = os.getenv("ENV_SERVER_URL", "http://localhost:7860")
|
|
| 21 |
|
| 22 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 23 |
You are an expert audio forensics agent detecting synthetic (AI-generated) speech.
|
| 24 |
-
You
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
-
|
| 35 |
-
-
|
| 36 |
-
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
""").strip()
|
| 45 |
|
| 46 |
|
| 47 |
def log_start(task, env, model):
|
| 48 |
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 49 |
|
|
|
|
| 50 |
def log_step(step, action, reward, done, error):
|
| 51 |
error_val = error if error else "null"
|
| 52 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def log_end(success, steps, score, rewards):
|
| 55 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 56 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
|
|
|
| 58 |
|
| 59 |
def env_reset(task_name: str) -> dict:
|
| 60 |
"""Call /reset on the environment server."""
|
| 61 |
response = requests.post(
|
| 62 |
f"{ENV_SERVER_URL}/reset",
|
| 63 |
json={"task_name": task_name},
|
| 64 |
-
timeout=30
|
| 65 |
)
|
| 66 |
response.raise_for_status()
|
| 67 |
return response.json()
|
|
@@ -70,49 +97,59 @@ def env_reset(task_name: str) -> dict:
|
|
| 70 |
def env_step(action: dict, task_name: str) -> dict:
|
| 71 |
"""Call /step on the environment server."""
|
| 72 |
payload = {
|
| 73 |
-
"
|
| 74 |
-
"
|
|
|
|
| 75 |
"reasoning": action.get("reasoning", ""),
|
| 76 |
-
"task_name": task_name
|
| 77 |
}
|
| 78 |
response = requests.post(
|
| 79 |
f"{ENV_SERVER_URL}/step",
|
| 80 |
json=payload,
|
| 81 |
-
timeout=30
|
| 82 |
)
|
| 83 |
response.raise_for_status()
|
| 84 |
return response.json()
|
| 85 |
|
| 86 |
|
| 87 |
-
|
| 88 |
-
features = observation.get("features", [])
|
| 89 |
-
task_name = observation.get("task_name", "")
|
| 90 |
-
difficulty = observation.get("difficulty", "")
|
| 91 |
-
hint = observation.get("hint", "")
|
| 92 |
|
|
|
|
|
|
|
| 93 |
user_prompt = f"""
|
| 94 |
-
|
| 95 |
-
Task: {task_name} (difficulty: {difficulty})
|
| 96 |
-
{f'Note: {hint}' if hint else ''}
|
| 97 |
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
"""
|
| 100 |
try:
|
| 101 |
completion = client.chat.completions.create(
|
| 102 |
model=MODEL_NAME,
|
| 103 |
messages=[
|
| 104 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 105 |
-
{"role": "user", "content": user_prompt.strip()}
|
| 106 |
],
|
| 107 |
temperature=0.3,
|
| 108 |
-
max_tokens=
|
| 109 |
-
stream=False
|
| 110 |
)
|
| 111 |
text = completion.choices[0].message.content.strip()
|
| 112 |
text = text.replace("```json", "").replace("```", "").strip()
|
| 113 |
last_brace = text.rfind("}")
|
| 114 |
if last_brace != -1:
|
| 115 |
-
text = text[:last_brace + 1]
|
| 116 |
result = json.loads(text)
|
| 117 |
result["label"] = int(result.get("label", 0))
|
| 118 |
result["confidence"] = float(result.get("confidence", 0.5))
|
|
@@ -124,70 +161,141 @@ Classify this audio sample. Keep reasoning under 70 words. Respond with JSON onl
|
|
| 124 |
return {"label": 0, "confidence": 0.5, "reasoning": "fallback"}
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
| 127 |
async def run_task(client: OpenAI, task_name: str):
|
|
|
|
| 128 |
rewards: List[float] = []
|
| 129 |
steps_taken = 0
|
| 130 |
success = False
|
| 131 |
score = 0.0
|
|
|
|
| 132 |
|
| 133 |
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 134 |
|
| 135 |
try:
|
| 136 |
-
# Reset
|
| 137 |
reset_response = env_reset(task_name)
|
| 138 |
observation = reset_response.get("observation", {})
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
"
|
| 144 |
-
"
|
| 145 |
-
"
|
| 146 |
-
"reasoning": "Requesting focused analysis"
|
| 147 |
}
|
| 148 |
-
analyze_str = json.dumps(analyze_action)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
rewards.append(reward1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
action_dict = get_agent_action(client, observation)
|
| 163 |
-
action_str = json.dumps(action_dict)
|
| 164 |
-
|
| 165 |
-
step2_response = env_step(action_dict, task_name)
|
| 166 |
-
reward2 = float(step2_response.get("reward", 0.0))
|
| 167 |
-
done = step2_response.get("done", True)
|
| 168 |
-
steps_taken = 2
|
| 169 |
-
rewards.append(reward2)
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
|
| 174 |
-
# Score is
|
| 175 |
-
|
| 176 |
-
score = sum(decision_rewards) / len(decision_rewards) if decision_rewards else 0.0
|
| 177 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 178 |
|
| 179 |
except Exception as e:
|
| 180 |
print(f"[DEBUG] Task error: {e}", flush=True)
|
| 181 |
|
| 182 |
finally:
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
async def main():
|
| 189 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 190 |
-
tasks = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
for task in tasks:
|
| 192 |
await run_task(client, task)
|
| 193 |
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference agent for Voice Authenticity OpenEnv v2.
|
| 3 |
+
|
| 4 |
+
Uses the 5-action protocol:
|
| 5 |
+
1. request_temporal_features → get jitter, shimmer, HNR
|
| 6 |
+
2. request_spectral_features → get MFCC values
|
| 7 |
+
3. request_comparison → get similarity to real/fake centroids
|
| 8 |
+
4. analyze_evidence → synthesize gathered information
|
| 9 |
+
5. final_classify → submit label + confidence + reasoning
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
from dotenv import load_dotenv
|
| 13 |
load_dotenv()
|
| 14 |
|
|
|
|
| 17 |
import textwrap
|
| 18 |
import json
|
| 19 |
import requests
|
| 20 |
+
from typing import List
|
| 21 |
from openai import OpenAI
|
| 22 |
|
| 23 |
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 24 |
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 25 |
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 26 |
BENCHMARK = "voice-authenticity"
|
|
|
|
| 27 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 28 |
|
| 29 |
# Environment server URL
|
|
|
|
| 31 |
|
| 32 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 33 |
You are an expert audio forensics agent detecting synthetic (AI-generated) speech.
|
| 34 |
+
You operate in a multi-step environment where you must gather evidence before classifying.
|
| 35 |
+
|
| 36 |
+
REAL speech indicators:
|
| 37 |
+
- jitter > 0.025 (vocal cord irregularity)
|
| 38 |
+
- shimmer > 0.10 (amplitude variation)
|
| 39 |
+
- HNR < 12.0 (more noise in signal)
|
| 40 |
+
- Higher MFCC std deviations (natural variation)
|
| 41 |
+
|
| 42 |
+
SYNTHETIC speech indicators:
|
| 43 |
+
- jitter < 0.020 (too stable)
|
| 44 |
+
- shimmer < 0.09 (too uniform)
|
| 45 |
+
- HNR > 12.0 (too clean)
|
| 46 |
+
- Lower MFCC std deviations (artificial consistency)
|
| 47 |
+
|
| 48 |
+
COMPARISON interpretation:
|
| 49 |
+
- Higher cosine similarity to real centroid → likely real
|
| 50 |
+
- Higher cosine similarity to fake centroid → likely synthetic
|
| 51 |
+
- Closer euclidean distance to real → likely real
|
| 52 |
+
|
| 53 |
+
CONFIDENCE GUIDELINES:
|
| 54 |
+
- Easy tasks: confident predictions okay (0.7-0.9)
|
| 55 |
+
- Medium tasks: moderate confidence (0.6-0.8)
|
| 56 |
+
- Hard/extreme tasks: calibrate carefully, never exceed 0.85
|
| 57 |
+
|
| 58 |
+
Respond ONLY with valid JSON for the requested action type.
|
| 59 |
""").strip()
|
| 60 |
|
| 61 |
|
| 62 |
def log_start(task, env, model):
|
| 63 |
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 64 |
|
| 65 |
+
|
| 66 |
def log_step(step, action, reward, done, error):
|
| 67 |
error_val = error if error else "null"
|
| 68 |
+
print(
|
| 69 |
+
f"[STEP] step={step} action={json.dumps(action)} "
|
| 70 |
+
f"reward={reward:.2f} done={str(done).lower()} error={error_val}",
|
| 71 |
+
flush=True,
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
|
| 75 |
def log_end(success, steps, score, rewards):
|
| 76 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 77 |
+
print(
|
| 78 |
+
f"[END] success={str(success).lower()} steps={steps} "
|
| 79 |
+
f"score={score:.3f} rewards={rewards_str}",
|
| 80 |
+
flush=True,
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
|
| 84 |
+
# ── Environment API calls ──────────────────────────────────────────────
|
| 85 |
|
| 86 |
def env_reset(task_name: str) -> dict:
|
| 87 |
"""Call /reset on the environment server."""
|
| 88 |
response = requests.post(
|
| 89 |
f"{ENV_SERVER_URL}/reset",
|
| 90 |
json={"task_name": task_name},
|
| 91 |
+
timeout=30,
|
| 92 |
)
|
| 93 |
response.raise_for_status()
|
| 94 |
return response.json()
|
|
|
|
| 97 |
def env_step(action: dict, task_name: str) -> dict:
|
| 98 |
"""Call /step on the environment server."""
|
| 99 |
payload = {
|
| 100 |
+
"action_type": action.get("action_type", "final_classify"),
|
| 101 |
+
"label": action.get("label", 0),
|
| 102 |
+
"confidence": action.get("confidence", 0.5),
|
| 103 |
"reasoning": action.get("reasoning", ""),
|
| 104 |
+
"task_name": task_name,
|
| 105 |
}
|
| 106 |
response = requests.post(
|
| 107 |
f"{ENV_SERVER_URL}/step",
|
| 108 |
json=payload,
|
| 109 |
+
timeout=30,
|
| 110 |
)
|
| 111 |
response.raise_for_status()
|
| 112 |
return response.json()
|
| 113 |
|
| 114 |
|
| 115 |
+
# ── LLM agent decision making ─────��────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
def get_classification(client, context: dict) -> dict:
|
| 118 |
+
"""Ask the LLM to make a final classification based on accumulated evidence."""
|
| 119 |
user_prompt = f"""
|
| 120 |
+
Based on the following evidence gathered from an audio sample, classify it as
|
| 121 |
+
real (0) or synthetic (1). Task: {context['task_name']} (difficulty: {context['difficulty']})
|
|
|
|
| 122 |
|
| 123 |
+
EVIDENCE GATHERED:
|
| 124 |
+
{json.dumps(context.get('visible_features', {}), indent=2)}
|
| 125 |
+
|
| 126 |
+
COMPARISON RESULTS:
|
| 127 |
+
{json.dumps(context.get('comparison_result', {}), indent=2)}
|
| 128 |
+
|
| 129 |
+
ANALYSIS SUMMARY:
|
| 130 |
+
{context.get('evidence_summary', 'No analysis performed.')}
|
| 131 |
+
|
| 132 |
+
ACTIONS TAKEN: {', '.join(context.get('actions_taken', []))}
|
| 133 |
+
|
| 134 |
+
Respond with JSON only:
|
| 135 |
+
{{"label": 0 or 1, "confidence": 0.0-1.0, "reasoning": "brief explanation under 70 words"}}
|
| 136 |
"""
|
| 137 |
try:
|
| 138 |
completion = client.chat.completions.create(
|
| 139 |
model=MODEL_NAME,
|
| 140 |
messages=[
|
| 141 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 142 |
+
{"role": "user", "content": user_prompt.strip()},
|
| 143 |
],
|
| 144 |
temperature=0.3,
|
| 145 |
+
max_tokens=300,
|
| 146 |
+
stream=False,
|
| 147 |
)
|
| 148 |
text = completion.choices[0].message.content.strip()
|
| 149 |
text = text.replace("```json", "").replace("```", "").strip()
|
| 150 |
last_brace = text.rfind("}")
|
| 151 |
if last_brace != -1:
|
| 152 |
+
text = text[: last_brace + 1]
|
| 153 |
result = json.loads(text)
|
| 154 |
result["label"] = int(result.get("label", 0))
|
| 155 |
result["confidence"] = float(result.get("confidence", 0.5))
|
|
|
|
| 161 |
return {"label": 0, "confidence": 0.5, "reasoning": "fallback"}
|
| 162 |
|
| 163 |
|
| 164 |
+
# ── Main task runner ────────────────────────────────────────────────────
|
| 165 |
+
|
| 166 |
async def run_task(client: OpenAI, task_name: str):
|
| 167 |
+
"""Run one episode of a task using the 5-action protocol."""
|
| 168 |
rewards: List[float] = []
|
| 169 |
steps_taken = 0
|
| 170 |
success = False
|
| 171 |
score = 0.0
|
| 172 |
+
context = {}
|
| 173 |
|
| 174 |
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 175 |
|
| 176 |
try:
|
| 177 |
+
# ── Reset ───────────────────────────────────────────
|
| 178 |
reset_response = env_reset(task_name)
|
| 179 |
observation = reset_response.get("observation", {})
|
| 180 |
+
context = {
|
| 181 |
+
"task_name": observation.get("task_name", task_name),
|
| 182 |
+
"difficulty": observation.get("difficulty", ""),
|
| 183 |
+
"visible_features": {},
|
| 184 |
+
"comparison_result": None,
|
| 185 |
+
"evidence_summary": None,
|
| 186 |
+
"actions_taken": [],
|
|
|
|
| 187 |
}
|
|
|
|
| 188 |
|
| 189 |
+
# ── Step 1: Request temporal features ───────────────
|
| 190 |
+
action1 = {"action_type": "request_temporal_features"}
|
| 191 |
+
step1 = env_step(action1, task_name)
|
| 192 |
+
observation = step1.get("observation", {})
|
| 193 |
+
reward1 = float(step1.get("reward", 0.0))
|
| 194 |
rewards.append(reward1)
|
| 195 |
+
steps_taken = 1
|
| 196 |
+
context["visible_features"] = observation.get("visible_features", {})
|
| 197 |
+
context["actions_taken"] = observation.get("actions_taken", [])
|
| 198 |
+
|
| 199 |
+
log_step(step=1, action=action1, reward=reward1,
|
| 200 |
+
done=step1.get("done", False), error=None)
|
| 201 |
+
|
| 202 |
+
if step1.get("done", False):
|
| 203 |
+
raise RuntimeError("Episode ended prematurely at step 1")
|
| 204 |
+
|
| 205 |
+
# ── Step 2: Request spectral features ───────────────
|
| 206 |
+
action2 = {"action_type": "request_spectral_features"}
|
| 207 |
+
step2 = env_step(action2, task_name)
|
| 208 |
+
observation = step2.get("observation", {})
|
| 209 |
+
reward2 = float(step2.get("reward", 0.0))
|
| 210 |
+
rewards.append(reward2)
|
| 211 |
+
steps_taken = 2
|
| 212 |
+
context["visible_features"] = observation.get("visible_features", {})
|
| 213 |
+
context["actions_taken"] = observation.get("actions_taken", [])
|
| 214 |
+
|
| 215 |
+
log_step(step=2, action=action2, reward=reward2,
|
| 216 |
+
done=step2.get("done", False), error=None)
|
| 217 |
+
|
| 218 |
+
if step2.get("done", False):
|
| 219 |
+
raise RuntimeError("Episode ended prematurely at step 2")
|
| 220 |
+
|
| 221 |
+
# ── Step 3: Request comparison ──────────────────────
|
| 222 |
+
action3 = {"action_type": "request_comparison"}
|
| 223 |
+
step3 = env_step(action3, task_name)
|
| 224 |
+
observation = step3.get("observation", {})
|
| 225 |
+
reward3 = float(step3.get("reward", 0.0))
|
| 226 |
+
rewards.append(reward3)
|
| 227 |
+
steps_taken = 3
|
| 228 |
+
context["visible_features"] = observation.get("visible_features", {})
|
| 229 |
+
context["comparison_result"] = observation.get("comparison_result", {})
|
| 230 |
+
context["actions_taken"] = observation.get("actions_taken", [])
|
| 231 |
+
|
| 232 |
+
log_step(step=3, action=action3, reward=reward3,
|
| 233 |
+
done=step3.get("done", False), error=None)
|
| 234 |
+
|
| 235 |
+
if step3.get("done", False):
|
| 236 |
+
raise RuntimeError("Episode ended prematurely at step 3")
|
| 237 |
+
|
| 238 |
+
# ── Step 4: Analyze evidence ────────────────────────
|
| 239 |
+
action4 = {"action_type": "analyze_evidence"}
|
| 240 |
+
step4 = env_step(action4, task_name)
|
| 241 |
+
observation = step4.get("observation", {})
|
| 242 |
+
reward4 = float(step4.get("reward", 0.0))
|
| 243 |
+
rewards.append(reward4)
|
| 244 |
+
steps_taken = 4
|
| 245 |
+
context["evidence_summary"] = observation.get("evidence_summary", "")
|
| 246 |
+
context["actions_taken"] = observation.get("actions_taken", [])
|
| 247 |
+
|
| 248 |
+
log_step(step=4, action=action4, reward=reward4,
|
| 249 |
+
done=step4.get("done", False), error=None)
|
| 250 |
+
|
| 251 |
+
if step4.get("done", False):
|
| 252 |
+
raise RuntimeError("Episode ended prematurely at step 4")
|
| 253 |
+
|
| 254 |
+
# ── Step 5: Final classify (LLM decision) ──────────
|
| 255 |
+
classification = get_classification(client, context)
|
| 256 |
+
action5 = {
|
| 257 |
+
"action_type": "final_classify",
|
| 258 |
+
"label": classification["label"],
|
| 259 |
+
"confidence": classification["confidence"],
|
| 260 |
+
"reasoning": classification.get("reasoning", ""),
|
| 261 |
+
}
|
| 262 |
|
| 263 |
+
step5 = env_step(action5, task_name)
|
| 264 |
+
reward5 = float(step5.get("reward", 0.0))
|
| 265 |
+
rewards.append(reward5)
|
| 266 |
+
steps_taken = 5
|
| 267 |
+
done = step5.get("done", True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
+
log_step(step=5, action=action5, reward=reward5,
|
| 270 |
+
done=done, error=None)
|
| 271 |
|
| 272 |
+
# Score is the final classify reward (main grader score)
|
| 273 |
+
score = reward5
|
|
|
|
| 274 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 275 |
|
| 276 |
except Exception as e:
|
| 277 |
print(f"[DEBUG] Task error: {e}", flush=True)
|
| 278 |
|
| 279 |
finally:
|
| 280 |
+
# The competition judges score based on the final classify reward only
|
| 281 |
+
if rewards:
|
| 282 |
+
score = rewards[-1]
|
| 283 |
+
else:
|
| 284 |
+
score = 0.0
|
| 285 |
+
|
| 286 |
+
log_end(
|
| 287 |
+
success=success, steps=steps_taken,
|
| 288 |
+
score=score, rewards=rewards,
|
| 289 |
+
)
|
| 290 |
async def main():
|
| 291 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 292 |
+
tasks = [
|
| 293 |
+
"clean_detection",
|
| 294 |
+
"compressed_detection",
|
| 295 |
+
"adversarial_detection",
|
| 296 |
+
"streaming_detection",
|
| 297 |
+
"phonecall_detection",
|
| 298 |
+
]
|
| 299 |
for task in tasks:
|
| 300 |
await run_task(client, task)
|
| 301 |
|
openenv.yaml
CHANGED
|
@@ -1,39 +1,71 @@
|
|
| 1 |
name: voice-authenticity
|
| 2 |
-
version: "
|
| 3 |
-
description: "Voice authenticity detection across real-world degradation conditions"
|
| 4 |
-
author: "
|
| 5 |
-
tags:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
tasks:
|
| 7 |
- name: clean_detection
|
| 8 |
difficulty: easy
|
| 9 |
description: "Classify real vs synthetic speech from clean audio features"
|
| 10 |
- name: compressed_detection
|
| 11 |
-
difficulty: medium
|
| 12 |
description: "Classify speech under codec compression degradation"
|
| 13 |
- name: adversarial_detection
|
| 14 |
difficulty: hard
|
| 15 |
-
description: "Classify adversarially crafted synthetic speech"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
observation_space:
|
| 17 |
type: object
|
| 18 |
properties:
|
| 19 |
features:
|
| 20 |
type: array
|
| 21 |
-
description: "48-dim feature vector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
task_name:
|
| 23 |
type: string
|
| 24 |
step_number:
|
| 25 |
type: integer
|
| 26 |
difficulty:
|
| 27 |
type: string
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
action_space:
|
| 29 |
type: object
|
| 30 |
properties:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
label:
|
| 32 |
type: integer
|
| 33 |
description: "0=real, 1=synthetic"
|
| 34 |
confidence:
|
| 35 |
type: number
|
| 36 |
-
description: "confidence
|
| 37 |
reasoning:
|
| 38 |
type: string
|
| 39 |
-
description: "
|
|
|
|
| 1 |
name: voice-authenticity
|
| 2 |
+
version: "2.0.0"
|
| 3 |
+
description: "Voice authenticity detection across real-world degradation conditions with multi-step agentic interaction"
|
| 4 |
+
author: "AksharaSharma"
|
| 5 |
+
tags:
|
| 6 |
+
- openenv
|
| 7 |
+
- speech
|
| 8 |
+
- fraud-detection
|
| 9 |
+
- audio
|
| 10 |
+
- partial-observability
|
| 11 |
tasks:
|
| 12 |
- name: clean_detection
|
| 13 |
difficulty: easy
|
| 14 |
description: "Classify real vs synthetic speech from clean audio features"
|
| 15 |
- name: compressed_detection
|
| 16 |
+
difficulty: medium
|
| 17 |
description: "Classify speech under codec compression degradation"
|
| 18 |
- name: adversarial_detection
|
| 19 |
difficulty: hard
|
| 20 |
+
description: "Classify adversarially crafted synthetic speech with overlapping distributions"
|
| 21 |
+
- name: streaming_detection
|
| 22 |
+
difficulty: medium_hard
|
| 23 |
+
description: "Streaming detection with step-dependent noise soft-gating"
|
| 24 |
+
- name: phonecall_detection
|
| 25 |
+
difficulty: extreme
|
| 26 |
+
description: "Phone call simulation with heavy codec compression and narrowband degradation"
|
| 27 |
observation_space:
|
| 28 |
type: object
|
| 29 |
properties:
|
| 30 |
features:
|
| 31 |
type: array
|
| 32 |
+
description: "48-dim feature vector (zeroed until revealed via actions)"
|
| 33 |
+
visible_features:
|
| 34 |
+
type: object
|
| 35 |
+
description: "Feature groups revealed so far"
|
| 36 |
+
evidence_summary:
|
| 37 |
+
type: string
|
| 38 |
+
description: "Structured summary from analyze_evidence action"
|
| 39 |
+
comparison_result:
|
| 40 |
+
type: object
|
| 41 |
+
description: "Similarity scores to real/fake reference centroids"
|
| 42 |
task_name:
|
| 43 |
type: string
|
| 44 |
step_number:
|
| 45 |
type: integer
|
| 46 |
difficulty:
|
| 47 |
type: string
|
| 48 |
+
available_actions:
|
| 49 |
+
type: array
|
| 50 |
+
actions_taken:
|
| 51 |
+
type: array
|
| 52 |
action_space:
|
| 53 |
type: object
|
| 54 |
properties:
|
| 55 |
+
action_type:
|
| 56 |
+
type: string
|
| 57 |
+
enum:
|
| 58 |
+
- request_temporal_features
|
| 59 |
+
- request_spectral_features
|
| 60 |
+
- request_comparison
|
| 61 |
+
- analyze_evidence
|
| 62 |
+
- final_classify
|
| 63 |
label:
|
| 64 |
type: integer
|
| 65 |
description: "0=real, 1=synthetic"
|
| 66 |
confidence:
|
| 67 |
type: number
|
| 68 |
+
description: "Agent confidence [0.0, 1.0]"
|
| 69 |
reasoning:
|
| 70 |
type: string
|
| 71 |
+
description: "Explanation of decision"
|
pyproject.toml
CHANGED
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.backends.legacy:build"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "voice-authenticity-openenv"
|
| 7 |
-
version = "
|
| 8 |
-
description = "Voice authenticity detection OpenEnv environment"
|
| 9 |
requires-python = ">=3.10"
|
| 10 |
dependencies = [
|
| 11 |
"librosa",
|
|
@@ -26,6 +26,6 @@ server = "server.app:main"
|
|
| 26 |
|
| 27 |
[tool.openenv]
|
| 28 |
name = "voice-authenticity"
|
| 29 |
-
version = "
|
| 30 |
-
tasks = ["clean_detection", "compressed_detection", "adversarial_detection"]
|
| 31 |
entry_point = "app:app"
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "voice-authenticity-openenv"
|
| 7 |
+
version = "2.0.0"
|
| 8 |
+
description = "Voice authenticity detection OpenEnv environment with multi-step agentic interaction"
|
| 9 |
requires-python = ">=3.10"
|
| 10 |
dependencies = [
|
| 11 |
"librosa",
|
|
|
|
| 26 |
|
| 27 |
[tool.openenv]
|
| 28 |
name = "voice-authenticity"
|
| 29 |
+
version = "2.0.0"
|
| 30 |
+
tasks = ["clean_detection", "compressed_detection", "adversarial_detection", "streaming_detection", "phonecall_detection"]
|
| 31 |
entry_point = "app:app"
|
scripts/extract_features.py
CHANGED
|
@@ -112,8 +112,9 @@ def process_directory(directory, label, desc):
|
|
| 112 |
|
| 113 |
|
| 114 |
def add_compression_artifacts(features, strength=0.3):
|
|
|
|
| 115 |
degraded = features.copy()
|
| 116 |
-
|
| 117 |
degraded[20:40] *= (1 - strength * np.random.uniform(0.5, 1.0, 20))
|
| 118 |
degraded[42] *= (1 - strength * np.random.uniform(0.3, 0.7))
|
| 119 |
degraded[43] *= (1 - strength * np.random.uniform(0.3, 0.7))
|
|
@@ -121,7 +122,7 @@ def add_compression_artifacts(features, strength=0.3):
|
|
| 121 |
degraded[45] *= (1 + strength * np.random.uniform(0.3, 0.8))
|
| 122 |
degraded[46] *= (1 - strength * np.random.uniform(0.2, 0.6))
|
| 123 |
degraded[47] += strength * np.random.uniform(0.1, 0.4)
|
| 124 |
-
|
| 125 |
return degraded
|
| 126 |
|
| 127 |
|
|
@@ -160,10 +161,95 @@ def add_adversarial_perturbation(features, label):
|
|
| 160 |
return perturbed
|
| 161 |
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
def main():
|
| 164 |
-
print("=" *
|
| 165 |
-
print("Feature Extraction Pipeline")
|
| 166 |
-
print("=" *
|
| 167 |
|
| 168 |
real_feat, real_labels = process_directory(
|
| 169 |
REAL_DIR, label=0, desc="REAL audio"
|
|
@@ -196,9 +282,11 @@ def main():
|
|
| 196 |
print(f"\nTask 1 (clean): {len(all_labels)} samples saved")
|
| 197 |
|
| 198 |
# ── TASK 2: Compressed features ─────────────────────────
|
|
|
|
|
|
|
| 199 |
compressed_features = np.array([
|
| 200 |
add_compression_artifacts(f, strength=0.3)
|
| 201 |
-
for f in
|
| 202 |
], dtype=np.float32)
|
| 203 |
|
| 204 |
compressed_features = compressed_features[idx]
|
|
@@ -210,7 +298,6 @@ def main():
|
|
| 210 |
print(f"Task 2 (compressed): {len(all_labels)} samples saved")
|
| 211 |
|
| 212 |
# ── TASK 3: Adversarial features ────────────────────────
|
| 213 |
-
raw_combined = real_feat + fake_feat
|
| 214 |
raw_labels_combined = real_labels + fake_labels
|
| 215 |
|
| 216 |
adversarial_features = np.array([
|
|
@@ -226,21 +313,52 @@ def main():
|
|
| 226 |
|
| 227 |
print(f"Task 3 (adversarial): {len(all_labels)} samples saved")
|
| 228 |
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
print("DONE")
|
| 231 |
print(f"Total samples : {len(all_labels)}")
|
| 232 |
print(f"Real samples : {all_labels.tolist().count(0)}")
|
| 233 |
print(f"Fake samples : {all_labels.tolist().count(1)}")
|
| 234 |
print(f"Feature shape : {all_features_norm.shape}")
|
| 235 |
-
print(f"{'='*
|
| 236 |
|
| 237 |
print("\nSanity check — jitter/shimmer/HNR comparison:")
|
| 238 |
for i in range(min(2, len(all_labels))):
|
|
|
|
| 239 |
label_str = "REAL" if all_labels[i] == 0 else "FAKE"
|
| 240 |
print(f"\n [{label_str}]")
|
| 241 |
-
print(f" Clean
|
| 242 |
-
print(f" Compressed
|
| 243 |
-
print(f" Adversarial→ jitter={adversarial_features[i][42]:.4f} shimmer={adversarial_features[i][43]:.4f} hnr={adversarial_features[i][44]:.4f}")
|
|
|
|
|
|
|
| 244 |
|
| 245 |
|
| 246 |
if __name__ == "__main__":
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
def add_compression_artifacts(features, strength=0.3):
|
| 115 |
+
"""Simulate codec compression degradation."""
|
| 116 |
degraded = features.copy()
|
| 117 |
+
|
| 118 |
degraded[20:40] *= (1 - strength * np.random.uniform(0.5, 1.0, 20))
|
| 119 |
degraded[42] *= (1 - strength * np.random.uniform(0.3, 0.7))
|
| 120 |
degraded[43] *= (1 - strength * np.random.uniform(0.3, 0.7))
|
|
|
|
| 122 |
degraded[45] *= (1 + strength * np.random.uniform(0.3, 0.8))
|
| 123 |
degraded[46] *= (1 - strength * np.random.uniform(0.2, 0.6))
|
| 124 |
degraded[47] += strength * np.random.uniform(0.1, 0.4)
|
| 125 |
+
|
| 126 |
return degraded
|
| 127 |
|
| 128 |
|
|
|
|
| 161 |
return perturbed
|
| 162 |
|
| 163 |
|
| 164 |
+
def add_streaming_degradation(features, label):
|
| 165 |
+
"""Simulate streaming/partial decode degradation.
|
| 166 |
+
|
| 167 |
+
Models real-time audio streaming where features are partially decoded:
|
| 168 |
+
- MFCC values slightly degraded (simulating partial frame decode)
|
| 169 |
+
- Temporal features intact but with mild additive noise
|
| 170 |
+
- High-frequency spectral features mildly rolled off
|
| 171 |
+
- Overall mild Gaussian noise across all dims
|
| 172 |
+
|
| 173 |
+
This is the base perturbation for Task 4 (streaming_detection).
|
| 174 |
+
The environment also applies step-dependent soft-gated noise at runtime.
|
| 175 |
+
"""
|
| 176 |
+
degraded = features.copy()
|
| 177 |
+
|
| 178 |
+
# Partial MFCC decode: higher-order coefficients more degraded
|
| 179 |
+
for i in range(20):
|
| 180 |
+
degradation = 0.02 * (i / 20) # more degradation on higher coeffs
|
| 181 |
+
degraded[i] += np.random.normal(0, degradation + 0.01)
|
| 182 |
+
for i in range(20, 40):
|
| 183 |
+
degradation = 0.03 * ((i - 20) / 20)
|
| 184 |
+
degraded[i] *= (1 - degradation * np.random.uniform(0.3, 0.8))
|
| 185 |
+
|
| 186 |
+
# Mild noise on temporal features
|
| 187 |
+
degraded[42] += np.random.normal(0, 0.003) # jitter noise
|
| 188 |
+
degraded[43] += np.random.normal(0, 0.008) # shimmer noise
|
| 189 |
+
degraded[44] += np.random.normal(0, 0.5) # HNR noise
|
| 190 |
+
|
| 191 |
+
# Mild spectral rolloff
|
| 192 |
+
degraded[41] *= np.random.uniform(0.92, 0.98) # spectral centroid
|
| 193 |
+
degraded[45] *= np.random.uniform(0.90, 0.97) # spectral bandwidth
|
| 194 |
+
degraded[46] *= np.random.uniform(0.88, 0.96) # spectral rolloff
|
| 195 |
+
|
| 196 |
+
# Global mild noise
|
| 197 |
+
degraded += np.random.normal(0, 0.015, len(degraded))
|
| 198 |
+
|
| 199 |
+
return degraded
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def add_phonecall_degradation(features, label):
|
| 203 |
+
"""Simulate phone call conditions: heavy codec + background noise.
|
| 204 |
+
|
| 205 |
+
Models the worst-case real-world scenario:
|
| 206 |
+
- Aggressive codec compression (narrowband telephony)
|
| 207 |
+
- Background noise injection across all bands
|
| 208 |
+
- Severe HNR degradation (noisy channel)
|
| 209 |
+
- MFCC high-frequency rolloff (narrowband 300-3400Hz telephony)
|
| 210 |
+
- RMS energy fluctuation (network jitter/packet loss)
|
| 211 |
+
- Jitter/shimmer partially masked by channel noise
|
| 212 |
+
|
| 213 |
+
This is the hardest task — designed to be near the limit of detectability.
|
| 214 |
+
"""
|
| 215 |
+
degraded = features.copy()
|
| 216 |
+
|
| 217 |
+
# ── Heavy codec compression (narrowband telephony simulation) ───
|
| 218 |
+
# MFCC means: zero out high-order coefficients (narrowband kills them)
|
| 219 |
+
for i in range(12, 20):
|
| 220 |
+
degraded[i] *= np.random.uniform(0.1, 0.4) # severe suppression
|
| 221 |
+
# MFCC stds: flatten temporal variation (codec smoothing)
|
| 222 |
+
degraded[20:40] *= np.random.uniform(0.3, 0.6, 20)
|
| 223 |
+
|
| 224 |
+
# ── Background noise injection ──────────────────────────────────
|
| 225 |
+
noise_strength = np.random.uniform(0.15, 0.35)
|
| 226 |
+
degraded += np.random.normal(0, noise_strength, len(degraded))
|
| 227 |
+
|
| 228 |
+
# ── Severe HNR degradation (noisy channel) ─────────────────────
|
| 229 |
+
degraded[44] -= np.random.uniform(3.0, 7.0) # massive HNR drop
|
| 230 |
+
|
| 231 |
+
# ── Jitter/shimmer partially masked by channel noise ───────────
|
| 232 |
+
degraded[42] += np.random.normal(0, 0.015) # large jitter noise
|
| 233 |
+
degraded[43] += np.random.normal(0, 0.03) # large shimmer noise
|
| 234 |
+
|
| 235 |
+
# ── Spectral degradation (narrowband rolloff) ──────────────────
|
| 236 |
+
degraded[41] *= np.random.uniform(0.5, 0.75) # centroid drops
|
| 237 |
+
degraded[45] *= np.random.uniform(0.4, 0.65) # bandwidth severely narrows
|
| 238 |
+
degraded[46] *= np.random.uniform(0.3, 0.55) # rolloff drastically drops
|
| 239 |
+
|
| 240 |
+
# ── RMS energy fluctuation (packet loss / AGC) ──────────────────
|
| 241 |
+
degraded[47] *= np.random.uniform(0.5, 1.5)
|
| 242 |
+
|
| 243 |
+
# ── ZCR noise (transmission artifacts) ──────────────────────────
|
| 244 |
+
degraded[40] += np.random.normal(0, 0.02)
|
| 245 |
+
|
| 246 |
+
return degraded
|
| 247 |
+
|
| 248 |
+
|
| 249 |
def main():
|
| 250 |
+
print("=" * 60)
|
| 251 |
+
print("Feature Extraction Pipeline (5 Tasks)")
|
| 252 |
+
print("=" * 60)
|
| 253 |
|
| 254 |
real_feat, real_labels = process_directory(
|
| 255 |
REAL_DIR, label=0, desc="REAL audio"
|
|
|
|
| 282 |
print(f"\nTask 1 (clean): {len(all_labels)} samples saved")
|
| 283 |
|
| 284 |
# ── TASK 2: Compressed features ─────────────────────────
|
| 285 |
+
raw_combined = real_feat + fake_feat
|
| 286 |
+
|
| 287 |
compressed_features = np.array([
|
| 288 |
add_compression_artifacts(f, strength=0.3)
|
| 289 |
+
for f in raw_combined
|
| 290 |
], dtype=np.float32)
|
| 291 |
|
| 292 |
compressed_features = compressed_features[idx]
|
|
|
|
| 298 |
print(f"Task 2 (compressed): {len(all_labels)} samples saved")
|
| 299 |
|
| 300 |
# ── TASK 3: Adversarial features ────────────────────────
|
|
|
|
| 301 |
raw_labels_combined = real_labels + fake_labels
|
| 302 |
|
| 303 |
adversarial_features = np.array([
|
|
|
|
| 313 |
|
| 314 |
print(f"Task 3 (adversarial): {len(all_labels)} samples saved")
|
| 315 |
|
| 316 |
+
# ── TASK 4: Streaming features ──────────────────────────
|
| 317 |
+
streaming_features = np.array([
|
| 318 |
+
add_streaming_degradation(f, l)
|
| 319 |
+
for f, l in zip(raw_combined, raw_labels_combined)
|
| 320 |
+
], dtype=np.float32)
|
| 321 |
+
|
| 322 |
+
streaming_features = streaming_features[idx]
|
| 323 |
+
streaming_norm = (streaming_features - mean) / std
|
| 324 |
+
|
| 325 |
+
np.save(f"{OUTPUT_DIR}/features_streaming.npy", streaming_norm)
|
| 326 |
+
np.save(f"{OUTPUT_DIR}/labels_streaming.npy", all_labels)
|
| 327 |
+
|
| 328 |
+
print(f"Task 4 (streaming): {len(all_labels)} samples saved")
|
| 329 |
+
|
| 330 |
+
# ── TASK 5: Phone call features ─────────────────────────
|
| 331 |
+
phonecall_features = np.array([
|
| 332 |
+
add_phonecall_degradation(f, l)
|
| 333 |
+
for f, l in zip(raw_combined, raw_labels_combined)
|
| 334 |
+
], dtype=np.float32)
|
| 335 |
+
|
| 336 |
+
phonecall_features = phonecall_features[idx]
|
| 337 |
+
phonecall_norm = (phonecall_features - mean) / std
|
| 338 |
+
|
| 339 |
+
np.save(f"{OUTPUT_DIR}/features_phonecall.npy", phonecall_norm)
|
| 340 |
+
np.save(f"{OUTPUT_DIR}/labels_phonecall.npy", all_labels)
|
| 341 |
+
|
| 342 |
+
print(f"Task 5 (phonecall): {len(all_labels)} samples saved")
|
| 343 |
+
|
| 344 |
+
print(f"\n{'='*60}")
|
| 345 |
print("DONE")
|
| 346 |
print(f"Total samples : {len(all_labels)}")
|
| 347 |
print(f"Real samples : {all_labels.tolist().count(0)}")
|
| 348 |
print(f"Fake samples : {all_labels.tolist().count(1)}")
|
| 349 |
print(f"Feature shape : {all_features_norm.shape}")
|
| 350 |
+
print(f"{'='*60}")
|
| 351 |
|
| 352 |
print("\nSanity check — jitter/shimmer/HNR comparison:")
|
| 353 |
for i in range(min(2, len(all_labels))):
|
| 354 |
+
raw_i = np.array(raw_combined)[idx][i]
|
| 355 |
label_str = "REAL" if all_labels[i] == 0 else "FAKE"
|
| 356 |
print(f"\n [{label_str}]")
|
| 357 |
+
print(f" Clean → jitter={raw_i[42]:.4f} shimmer={raw_i[43]:.4f} hnr={raw_i[44]:.4f}")
|
| 358 |
+
print(f" Compressed → jitter={compressed_features[i][42]:.4f} shimmer={compressed_features[i][43]:.4f} hnr={compressed_features[i][44]:.4f}")
|
| 359 |
+
print(f" Adversarial → jitter={adversarial_features[i][42]:.4f} shimmer={adversarial_features[i][43]:.4f} hnr={adversarial_features[i][44]:.4f}")
|
| 360 |
+
print(f" Streaming → jitter={streaming_features[i][42]:.4f} shimmer={streaming_features[i][43]:.4f} hnr={streaming_features[i][44]:.4f}")
|
| 361 |
+
print(f" PhoneCall → jitter={phonecall_features[i][42]:.4f} shimmer={phonecall_features[i][43]:.4f} hnr={phonecall_features[i][44]:.4f}")
|
| 362 |
|
| 363 |
|
| 364 |
if __name__ == "__main__":
|
server/app.py
CHANGED
|
@@ -1,31 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
-
from fastapi.responses import JSONResponse
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
from typing import Optional
|
| 5 |
import uvicorn
|
| 6 |
import os
|
| 7 |
-
import sys
|
| 8 |
-
|
| 9 |
-
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 10 |
|
| 11 |
from environment.env import VoiceAuthenticityEnv
|
| 12 |
|
| 13 |
-
app = FastAPI(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
"clean_detection"
|
| 17 |
-
"compressed_detection"
|
| 18 |
-
"adversarial_detection"
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
|
|
|
| 21 |
current_task = "clean_detection"
|
| 22 |
|
|
|
|
| 23 |
class ActionRequest(BaseModel):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
task_name: Optional[str] = None
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
@app.post("/reset")
|
| 30 |
def reset(request: dict = {}):
|
| 31 |
global current_task
|
|
@@ -41,6 +228,7 @@ def reset(request: dict = {}):
|
|
| 41 |
"info": {}
|
| 42 |
})
|
| 43 |
|
|
|
|
| 44 |
@app.post("/step")
|
| 45 |
def step(action: ActionRequest):
|
| 46 |
global current_task
|
|
@@ -48,9 +236,11 @@ def step(action: ActionRequest):
|
|
| 48 |
if task not in envs:
|
| 49 |
task = current_task
|
| 50 |
action_dict = {
|
|
|
|
| 51 |
"label": action.label,
|
| 52 |
"confidence": action.confidence,
|
| 53 |
-
"reasoning": action.reasoning
|
|
|
|
| 54 |
}
|
| 55 |
obs, reward, done, info = envs[task].step(action_dict)
|
| 56 |
return JSONResponse({
|
|
@@ -60,17 +250,28 @@ def step(action: ActionRequest):
|
|
| 60 |
"info": info
|
| 61 |
})
|
| 62 |
|
|
|
|
| 63 |
@app.get("/state")
|
| 64 |
def state():
|
| 65 |
return JSONResponse(envs[current_task].state())
|
| 66 |
|
|
|
|
| 67 |
@app.get("/health")
|
| 68 |
def health():
|
| 69 |
-
return {"status": "
|
|
|
|
| 70 |
|
| 71 |
@app.get("/")
|
| 72 |
def root():
|
| 73 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
def main():
|
| 76 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
|
| 2 |
+
load_dotenv()
|
| 3 |
+
|
| 4 |
from fastapi import FastAPI
|
| 5 |
+
from fastapi.responses import JSONResponse, HTMLResponse
|
| 6 |
from pydantic import BaseModel
|
| 7 |
+
from typing import Optional, List
|
| 8 |
import uvicorn
|
| 9 |
import os
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from environment.env import VoiceAuthenticityEnv
|
| 12 |
|
| 13 |
+
app = FastAPI(
|
| 14 |
+
title="Voice Authenticity OpenEnv",
|
| 15 |
+
description="Multi-step agentic environment for detecting synthetic speech",
|
| 16 |
+
version="2.0.0"
|
| 17 |
+
)
|
| 18 |
|
| 19 |
+
TASKS = [
|
| 20 |
+
"clean_detection",
|
| 21 |
+
"compressed_detection",
|
| 22 |
+
"adversarial_detection",
|
| 23 |
+
"streaming_detection",
|
| 24 |
+
"phonecall_detection",
|
| 25 |
+
]
|
| 26 |
|
| 27 |
+
envs = {task: VoiceAuthenticityEnv(task) for task in TASKS}
|
| 28 |
current_task = "clean_detection"
|
| 29 |
|
| 30 |
+
|
| 31 |
class ActionRequest(BaseModel):
|
| 32 |
+
action_type: str = "final_classify"
|
| 33 |
+
label: int = 0
|
| 34 |
+
confidence: float = 0.5
|
| 35 |
+
reasoning: str = ""
|
| 36 |
+
focus: List[str] = []
|
| 37 |
task_name: Optional[str] = None
|
| 38 |
|
| 39 |
+
|
| 40 |
+
@app.get("/web", response_class=HTMLResponse)
|
| 41 |
+
def web_interface():
|
| 42 |
+
return """
|
| 43 |
+
<!DOCTYPE html>
|
| 44 |
+
<html>
|
| 45 |
+
<head>
|
| 46 |
+
<title>Voice Authenticity OpenEnv</title>
|
| 47 |
+
<style>
|
| 48 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 49 |
+
body { font-family: -apple-system, sans-serif; max-width: 860px; margin: 50px auto; padding: 20px; background: #050508; color: #fff; }
|
| 50 |
+
h1 { color: #00c9a7; font-size: 28px; margin-bottom: 8px; }
|
| 51 |
+
h2 { font-size: 16px; font-weight: 500; margin-bottom: 12px; color: #00c9a7; }
|
| 52 |
+
p { color: #666; font-size: 14px; line-height: 1.6; margin-bottom: 8px; }
|
| 53 |
+
.card { background: #080810; border: 1px solid #0f0f1a; border-radius: 14px; padding: 20px; margin: 16px 0; }
|
| 54 |
+
.tag { background: #0d2d1e; color: #00c9a7; padding: 4px 12px; border-radius: 20px; font-size: 11px; margin: 3px; display: inline-block; border: 1px solid #0f2d26; }
|
| 55 |
+
a { color: #00c9a7; text-decoration: none; }
|
| 56 |
+
a:hover { text-decoration: underline; }
|
| 57 |
+
.task { border-left: 2px solid #00c9a7; padding: 8px 12px; margin: 8px 0; background: #050508; border-radius: 0 8px 8px 0; }
|
| 58 |
+
.task strong { font-size: 13px; color: #fff; }
|
| 59 |
+
.task span { font-size: 12px; color: #555; display: block; margin-top: 2px; }
|
| 60 |
+
.difficulty { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 10px; margin-left: 8px; }
|
| 61 |
+
.easy { background: #0d2d1e; color: #00c9a7; }
|
| 62 |
+
.medium { background: #1a1a00; color: #f0a500; }
|
| 63 |
+
.hard { background: #1a0000; color: #ff6b6b; }
|
| 64 |
+
.extreme { background: #1a0010; color: #ff00aa; }
|
| 65 |
+
.medium_hard { background: #0d1a2d; color: #00aaff; }
|
| 66 |
+
.endpoint { display: flex; gap: 12px; align-items: center; padding: 8px 0; border-bottom: 1px solid #0f0f1a; }
|
| 67 |
+
.endpoint:last-child { border-bottom: none; }
|
| 68 |
+
.method { font-size: 11px; font-weight: 600; padding: 3px 8px; border-radius: 6px; min-width: 45px; text-align: center; }
|
| 69 |
+
.get { background: #0d2d1e; color: #00c9a7; }
|
| 70 |
+
.post { background: #1a1a00; color: #f0a500; }
|
| 71 |
+
.endpoint-path { font-size: 13px; color: #fff; font-family: monospace; }
|
| 72 |
+
.endpoint-desc { font-size: 12px; color: #444; }
|
| 73 |
+
.action-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 8px; }
|
| 74 |
+
.action-card { background: #050508; border: 1px solid #0f0f1a; border-radius: 10px; padding: 12px; }
|
| 75 |
+
.action-name { font-size: 12px; font-family: monospace; color: #00c9a7; margin-bottom: 4px; }
|
| 76 |
+
.action-desc { font-size: 11px; color: #444; line-height: 1.5; }
|
| 77 |
+
.stat { text-align: center; padding: 16px; }
|
| 78 |
+
.stat-num { font-size: 28px; font-weight: 600; color: #fff; }
|
| 79 |
+
.stat-num span { color: #00c9a7; }
|
| 80 |
+
.stat-label { font-size: 11px; color: #444; margin-top: 4px; }
|
| 81 |
+
.stats-grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 1px; background: #0f0f1a; border-radius: 12px; overflow: hidden; }
|
| 82 |
+
.stat { background: #080810; }
|
| 83 |
+
.badge { display: inline-flex; align-items: center; gap: 6px; padding: 5px 14px; border: 1px solid #0f2d26; background: #050f0d; border-radius: 20px; font-size: 11px; color: #00c9a7; }
|
| 84 |
+
.dot { width: 6px; height: 6px; background: #00c9a7; border-radius: 50%; animation: pulse 2s infinite; }
|
| 85 |
+
@keyframes pulse { 0%,100%{opacity:1} 50%{opacity:.3} }
|
| 86 |
+
footer { text-align: center; padding: 2rem 0; color: #333; font-size: 12px; }
|
| 87 |
+
footer span { color: #00c9a7; }
|
| 88 |
+
</style>
|
| 89 |
+
</head>
|
| 90 |
+
<body>
|
| 91 |
+
<div style="margin-bottom:1.5rem">
|
| 92 |
+
<div class="badge"><div class="dot"></div>Live — 5 tasks available</div>
|
| 93 |
+
</div>
|
| 94 |
+
|
| 95 |
+
<h1>🎙️ Voice Authenticity OpenEnv</h1>
|
| 96 |
+
<p style="margin-bottom:1.5rem;font-size:16px;color:#888">
|
| 97 |
+
Multi-step agentic environment for detecting synthetic (AI-generated) speech
|
| 98 |
+
across real-world degradation and adversarial conditions.
|
| 99 |
+
</p>
|
| 100 |
+
|
| 101 |
+
<div class="stats-grid">
|
| 102 |
+
<div class="stat">
|
| 103 |
+
<div class="stat-num">5<span>+</span></div>
|
| 104 |
+
<div class="stat-label">Tasks</div>
|
| 105 |
+
</div>
|
| 106 |
+
<div class="stat">
|
| 107 |
+
<div class="stat-num">5</div>
|
| 108 |
+
<div class="stat-label">Steps per episode</div>
|
| 109 |
+
</div>
|
| 110 |
+
<div class="stat">
|
| 111 |
+
<div class="stat-num">48</div>
|
| 112 |
+
<div class="stat-label">Feature dimensions</div>
|
| 113 |
+
</div>
|
| 114 |
+
</div>
|
| 115 |
+
|
| 116 |
+
<div class="card">
|
| 117 |
+
<h2>Tasks</h2>
|
| 118 |
+
<div class="task">
|
| 119 |
+
<strong>clean_detection <span class="difficulty easy">easy</span></strong>
|
| 120 |
+
<span>Classify real vs synthetic speech from clean, unmodified audio features</span>
|
| 121 |
+
</div>
|
| 122 |
+
<div class="task">
|
| 123 |
+
<strong>compressed_detection <span class="difficulty medium">medium</span></strong>
|
| 124 |
+
<span>Classify speech under codec compression degradation</span>
|
| 125 |
+
</div>
|
| 126 |
+
<div class="task">
|
| 127 |
+
<strong>adversarial_detection <span class="difficulty hard">hard</span></strong>
|
| 128 |
+
<span>Adversarially crafted synthetic speech with overlapping feature distributions</span>
|
| 129 |
+
</div>
|
| 130 |
+
<div class="task">
|
| 131 |
+
<strong>streaming_detection <span class="difficulty medium_hard">medium-hard</span></strong>
|
| 132 |
+
<span>Step-dependent noise soft-gating — earlier steps noisier, later steps cleaner</span>
|
| 133 |
+
</div>
|
| 134 |
+
<div class="task">
|
| 135 |
+
<strong>phonecall_detection <span class="difficulty extreme">extreme</span></strong>
|
| 136 |
+
<span>Heavy codec compression and narrowband degradation simulating phone calls</span>
|
| 137 |
+
</div>
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
<div class="card">
|
| 141 |
+
<h2>5-Step Agent Protocol</h2>
|
| 142 |
+
<div class="action-grid">
|
| 143 |
+
<div class="action-card">
|
| 144 |
+
<div class="action-name">1. request_temporal_features</div>
|
| 145 |
+
<div class="action-desc">Reveals jitter, shimmer, and HNR — the core discriminating signals</div>
|
| 146 |
+
</div>
|
| 147 |
+
<div class="action-card">
|
| 148 |
+
<div class="action-name">2. request_spectral_features</div>
|
| 149 |
+
<div class="action-desc">Reveals 20 MFCC means, 20 MFCC stds, ZCR, spectral centroid</div>
|
| 150 |
+
</div>
|
| 151 |
+
<div class="action-card">
|
| 152 |
+
<div class="action-name">3. request_comparison</div>
|
| 153 |
+
<div class="action-desc">Compares sample to real/fake reference centroids via cosine similarity</div>
|
| 154 |
+
</div>
|
| 155 |
+
<div class="action-card">
|
| 156 |
+
<div class="action-name">4. analyze_evidence</div>
|
| 157 |
+
<div class="action-desc">Synthesizes all gathered signals into a structured evidence summary</div>
|
| 158 |
+
</div>
|
| 159 |
+
<div class="action-card" style="grid-column: span 2;">
|
| 160 |
+
<div class="action-name">5. final_classify</div>
|
| 161 |
+
<div class="action-desc">Submits final verdict: label (0=real, 1=synthetic) + confidence + reasoning. Terminates episode.</div>
|
| 162 |
+
</div>
|
| 163 |
+
</div>
|
| 164 |
+
</div>
|
| 165 |
+
|
| 166 |
+
<div class="card">
|
| 167 |
+
<h2>API Endpoints</h2>
|
| 168 |
+
<div class="endpoint">
|
| 169 |
+
<span class="method post">POST</span>
|
| 170 |
+
<span class="endpoint-path">/reset</span>
|
| 171 |
+
<span class="endpoint-desc">Reset episode, optionally set task_name</span>
|
| 172 |
+
</div>
|
| 173 |
+
<div class="endpoint">
|
| 174 |
+
<span class="method post">POST</span>
|
| 175 |
+
<span class="endpoint-path">/step</span>
|
| 176 |
+
<span class="endpoint-desc">Submit action, receive observation + reward</span>
|
| 177 |
+
</div>
|
| 178 |
+
<div class="endpoint">
|
| 179 |
+
<span class="method get">GET</span>
|
| 180 |
+
<span class="endpoint-path">/state</span>
|
| 181 |
+
<span class="endpoint-desc">Current environment state</span>
|
| 182 |
+
</div>
|
| 183 |
+
<div class="endpoint">
|
| 184 |
+
<span class="method get">GET</span>
|
| 185 |
+
<span class="endpoint-path">/health</span>
|
| 186 |
+
<span class="endpoint-desc">Health check</span>
|
| 187 |
+
</div>
|
| 188 |
+
<div class="endpoint">
|
| 189 |
+
<span class="method get">GET</span>
|
| 190 |
+
<span class="endpoint-path"><a href="/docs">/docs</a></span>
|
| 191 |
+
<span class="endpoint-desc">Interactive API documentation (Swagger UI)</span>
|
| 192 |
+
</div>
|
| 193 |
+
</div>
|
| 194 |
+
|
| 195 |
+
<div class="card">
|
| 196 |
+
<h2>Tags</h2>
|
| 197 |
+
<span class="tag">openenv</span>
|
| 198 |
+
<span class="tag">speech</span>
|
| 199 |
+
<span class="tag">fraud-detection</span>
|
| 200 |
+
<span class="tag">audio</span>
|
| 201 |
+
<span class="tag">partial-observability</span>
|
| 202 |
+
<span class="tag">multi-step</span>
|
| 203 |
+
<span class="tag">confidence-calibration</span>
|
| 204 |
+
<span class="tag">adversarial</span>
|
| 205 |
+
</div>
|
| 206 |
+
|
| 207 |
+
<footer>
|
| 208 |
+
Built by <span>Akshara Sharma</span> · Voice Authenticity OpenEnv v2.0.0
|
| 209 |
+
· <a href="https://github.com/AksharaaSharmaa/voice-authenticity-openenv">GitHub</a>
|
| 210 |
+
</footer>
|
| 211 |
+
</body>
|
| 212 |
+
</html>
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
|
| 216 |
@app.post("/reset")
|
| 217 |
def reset(request: dict = {}):
|
| 218 |
global current_task
|
|
|
|
| 228 |
"info": {}
|
| 229 |
})
|
| 230 |
|
| 231 |
+
|
| 232 |
@app.post("/step")
|
| 233 |
def step(action: ActionRequest):
|
| 234 |
global current_task
|
|
|
|
| 236 |
if task not in envs:
|
| 237 |
task = current_task
|
| 238 |
action_dict = {
|
| 239 |
+
"action_type": action.action_type,
|
| 240 |
"label": action.label,
|
| 241 |
"confidence": action.confidence,
|
| 242 |
+
"reasoning": action.reasoning,
|
| 243 |
+
"focus": action.focus,
|
| 244 |
}
|
| 245 |
obs, reward, done, info = envs[task].step(action_dict)
|
| 246 |
return JSONResponse({
|
|
|
|
| 250 |
"info": info
|
| 251 |
})
|
| 252 |
|
| 253 |
+
|
| 254 |
@app.get("/state")
|
| 255 |
def state():
|
| 256 |
return JSONResponse(envs[current_task].state())
|
| 257 |
|
| 258 |
+
|
| 259 |
@app.get("/health")
|
| 260 |
def health():
|
| 261 |
+
return {"status": "healthy", "service": "voice-authenticity-openenv"}
|
| 262 |
+
|
| 263 |
|
| 264 |
@app.get("/")
|
| 265 |
def root():
|
| 266 |
+
return {
|
| 267 |
+
"name": "voice-authenticity-openenv",
|
| 268 |
+
"version": "2.0.0",
|
| 269 |
+
"status": "running",
|
| 270 |
+
"tasks": TASKS,
|
| 271 |
+
"web": "/web",
|
| 272 |
+
"docs": "/docs"
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
|
| 276 |
def main():
|
| 277 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|