Commit · 72983a7
Parent(s):
voice authenticity openenv - initial submission

Files changed:
- Dockerfile +18 -0
- README.md +341 -0
- environment/__init__.py +0 -0
- environment/data/features.npy +0 -0
- environment/data/features_adversarial.npy +0 -0
- environment/data/features_compressed.npy +0 -0
- environment/data/features_raw.npy +0 -0
- environment/data/labels.npy +0 -0
- environment/data/labels_adversarial.npy +0 -0
- environment/data/labels_compressed.npy +0 -0
- environment/data/mean.npy +0 -0
- environment/data/std.npy +0 -0
- environment/env.py +98 -0
- environment/graders.py +33 -0
- environment/models.py +21 -0
- inference.py +132 -0
- openenv.yaml +39 -0
- requirements.txt +8 -0
- scripts/download_data.py +30 -0
- scripts/extract_features.py +223 -0
Dockerfile
ADDED
@@ -0,0 +1,18 @@
FROM python:3.10-slim

WORKDIR /app

RUN apt-get update && apt-get install -y \
    libsndfile1 \
    praat \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV API_BASE_URL=https://router.huggingface.co/v1
ENV MODEL_NAME=Qwen/Qwen2.5-72B-Instruct

CMD ["python", "inference.py"]
README.md
ADDED
@@ -0,0 +1,341 @@
# Voice Authenticity Detection – OpenEnv Environment

A reinforcement learning environment for training and evaluating AI agents
that detect synthetic (AI-generated) speech across real-world degradation
conditions.

> Voice fraud is a growing crisis. This environment trains agents to detect
> synthetic speech under clean, compressed, and adversarial conditions,
> directly applicable to fraud detection, content moderation, and voice
> authentication systems.

---

## Real-World Motivation

AI-generated voices (ElevenLabs, Coqui, etc.) are increasingly used for:
- **Phone fraud** and social engineering attacks
- **Deepfake audio** in misinformation campaigns
- **Identity spoofing** in voice authentication systems

This environment provides a structured benchmark for training agents to
detect synthetic speech under realistic degradation conditions that existing
classifiers struggle with.

---

## Environment Overview

The environment serves **48-dimensional feature vectors** extracted from
audio samples. Agents must classify each sample as real or synthetic,
with a confidence score.

### Why Feature Vectors, Not Raw Audio?
- Fits within 2 vCPU / 8 GB RAM constraints
- Feature extraction is done offline, so inference is fast
- Interpretable observations for LLM-based agents

### Dataset
- **Real speech**: 250 samples from
  `garystafford/deepfake-audio-detection` (authentic human recordings)
- **Synthetic speech**: 250 samples (ElevenLabs, Hume AI, and other
  TTS platforms)
- **Total**: 500 labeled samples across 3 task variants

---

## Observation Space

Each observation is a **48-dimensional float32 vector**:

| Index | Feature | Description |
|-------|---------|-------------|
| 0–19  | MFCC means | Timbre and spectral shape |
| 20–39 | MFCC std devs | Variation; synthetic voices are too stable |
| 40    | Zero crossing rate | Signal sign changes per frame |
| 41    | Spectral centroid | Brightness of the sound |
| 42    | Jitter | Frequency instability; real voices wobble slightly |
| 43    | Shimmer | Amplitude instability; real voices vary naturally |
| 44    | HNR | Harmonics-to-noise ratio; synthetic voices are too clean |
| 45–47 | Compression artifacts | Spectral bandwidth, rolloff, RMS energy |

### Key Discriminators
```
Real speech:      jitter > 0.025, shimmer > 0.10, hnr < 12.0
Synthetic speech: jitter < 0.020, shimmer < 0.09, hnr > 12.0
```
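For intuition, here is a minimal rule-of-thumb baseline that uses only the three raw values surfaced in the observation hint. It is a sketch for illustration, not part of the environment: `parse_hint` is a hypothetical helper, and it assumes the `jitter=… shimmer=… hnr=…` hint format produced by `environment/env.py` in this commit.

```python
import re

def parse_hint(hint: str) -> dict:
    """Pull jitter/shimmer/hnr out of the hint string produced by env.py."""
    pairs = re.findall(r"(jitter|shimmer|hnr)=([-\d.]+)", hint)
    return {name: float(value) for name, value in pairs}

def rule_of_thumb(hint: str) -> dict:
    """Threshold baseline: majority vote over the three key discriminators."""
    v = parse_hint(hint)
    synthetic_votes = sum([
        v["jitter"] < 0.020,
        v["shimmer"] < 0.09,
        v["hnr"] > 12.0,
    ])
    label = 1 if synthetic_votes >= 2 else 0
    votes_for_label = synthetic_votes if label == 1 else 3 - synthetic_votes
    confidence = min(0.5 + 0.1 * votes_for_label, 0.85)
    return {"label": label, "confidence": confidence, "reasoning": "threshold vote"}
```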
### Observation Schema (Pydantic)
```python
class VoiceObservation(BaseModel):
    features: List[float]    # 48-dim feature vector (normalized)
    task_name: str           # current task
    step_number: int         # current step
    difficulty: str          # easy | medium | hard
    sample_id: int           # index into dataset
    hint: Optional[str]      # key raw values + task warning
```

---

## Action Space

```python
class VoiceAction(BaseModel):
    label: int          # 0 = real, 1 = synthetic
    confidence: float   # confidence in [0.0, 1.0]
    reasoning: str      # brief explanation
```

---

## Tasks

### Task 1 – Clean Detection (Easy)
- **Description**: Classify real vs synthetic speech from clean,
  unmodified audio features
- **Difficulty**: Easy
- **Expected agent score**: 0.7–1.0
- **Scoring**: Binary; correct = 1.0, incorrect = 0.0

### Task 2 – Compressed Detection (Medium)
- **Description**: Classify speech after codec compression degradation.
  Jitter and shimmer are reduced, compression artifacts added.
- **Difficulty**: Medium
- **Expected agent score**: 0.5–0.9
- **Scoring**: Partial credit based on confidence calibration
```
correct + high confidence → 1.0
correct + low confidence  → 0.6
wrong + low confidence    → 0.2
wrong + high confidence   → 0.0
```

### Task 3 – Adversarial Detection (Hard)
- **Description**: Synthetic audio specifically crafted to mimic real
  speech features. Jitter and shimmer are artificially elevated.
- **Difficulty**: Hard
- **Expected agent score**: 0.3–0.97
- **Scoring**: Rewards correct classification AND penalizes overconfidence
```
correct + calibrated confidence (~0.7) → ~1.0
correct + overconfident (0.9–1.0)      → 0.85–0.90
wrong + appropriately uncertain        → 0.15
wrong + overconfident                  → 0.0
```

---

## Reward Function

The reward function provides **partial, meaningful signals**, not just a
binary win/lose.

```python
def grade(true_label, action, difficulty):
    correct = (action["label"] == true_label)
    confidence = action["confidence"]

    if difficulty == "easy":
        return 1.0 if correct else 0.0

    elif difficulty == "medium":
        if correct:
            return 0.6 + 0.4 * confidence
        else:
            return max(0.0, 0.2 - 0.3 * confidence)

    elif difficulty == "hard":
        if correct:
            base = 0.5
            calibration_bonus = 0.5 * (1 - abs(confidence - 0.7))
            return base + calibration_bonus
        else:
            return 0.15 if confidence < 0.4 else 0.0
```

### Why Confidence Calibration Matters
An agent that is **wrong but uncertain** is more useful than one that is
**wrong but confident**. This reward design teaches agents to express
appropriate uncertainty, which is critical for real-world fraud detection systems.
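To see the shape of this incentive, the short sketch below tabulates the hard-task reward from the `grade` function above across confidence levels. It is illustrative only and simply re-implements the formula quoted above.

```python
# Sketch: hard-task reward as a function of confidence, for correct vs wrong answers.
def hard_reward(correct: bool, confidence: float) -> float:
    if correct:
        return 0.5 + 0.5 * (1 - abs(confidence - 0.7))
    return 0.15 if confidence < 0.4 else 0.0

for c in (0.3, 0.5, 0.7, 0.9, 1.0):
    print(f"confidence={c:.1f}  correct={hard_reward(True, c):.2f}  wrong={hard_reward(False, c):.2f}")
# The correct-answer reward peaks at confidence ~0.7;
# being wrong with confidence >= 0.4 scores 0.
```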
---

## OpenEnv API

```python
from environment.env import VoiceAuthenticityEnv

env = VoiceAuthenticityEnv(task_name="clean_detection")

# Reset episode
obs = env.reset()
# obs.features   → 48-dim list
# obs.hint       → key raw values for interpretation
# obs.difficulty → "easy"

# Take action
action = {"label": 1, "confidence": 0.8, "reasoning": "low jitter"}
obs, reward, done, info = env.step(action)
# reward → float in [0.0, 1.0]
# done   → True (one classification per episode)
# info["true_label"] → ground truth

# Get state
state = env.state()
```

---

## Baseline Scores

Scores from `Qwen/Qwen2.5-72B-Instruct` across multiple runs:

| Task | Difficulty | Avg Reward | Notes |
|------|-----------|------------|-------|
| clean_detection | Easy | ~0.80 | Strong baseline |
| compressed_detection | Medium | ~0.70 | Compression reduces confidence |
| adversarial_detection | Hard | ~0.75 | Calibration reward helps |

---

## Setup & Usage

### Requirements
```
Python 3.10+
Docker
HuggingFace account
```

### Local Setup

```bash
# Clone the repo
git clone https://huggingface.co/spaces/YOUR_USERNAME/voice-authenticity-openenv
cd voice-authenticity-openenv

# Install dependencies
pip install -r requirements.txt

# Download dataset and extract features
python scripts/download_data.py
python scripts/extract_features.py

# Set environment variables
cp .env.example .env
# Edit .env with your HF_TOKEN

# Run baseline inference
python inference.py
```

### Environment Variables

| Variable | Description | Default |
|----------|-------------|---------|
| `API_BASE_URL` | LLM API endpoint | `https://router.huggingface.co/v1` |
| `MODEL_NAME` | Model identifier | `Qwen/Qwen2.5-72B-Instruct` |
| `HF_TOKEN` | HuggingFace API token | required |
| `VOICE_TASK` | Task to run | `clean_detection` |

### Docker

```bash
# Build
docker build -t voice-authenticity .

# Run
docker run --env-file .env voice-authenticity
```

---

## Project Structure

```
voice-authenticity-openenv/
├── environment/
│   ├── __init__.py
│   ├── env.py                       # step() / reset() / state()
│   ├── models.py                    # Pydantic Observation/Action/Reward
│   ├── graders.py                   # scoring logic per task
│   └── data/
│       ├── features.npy             # clean features (500 × 48)
│       ├── features_compressed.npy  # codec-degraded features
│       ├── features_adversarial.npy # adversarially perturbed
│       ├── features_raw.npy         # unnormalized, for hints
│       └── labels.npy               # ground-truth labels
├── scripts/
│   ├── download_data.py             # fetch dataset from HuggingFace
│   └── extract_features.py          # audio → feature vectors
├── inference.py                     # baseline LLM agent
├── openenv.yaml                     # OpenEnv spec
├── Dockerfile
├── requirements.txt
└── README.md
```

---

## Technical Details

### Feature Extraction Pipeline
```
Audio (.wav / .flac)
  → librosa (MFCCs, spectral features)
  → parselmouth/Praat (jitter, shimmer, HNR)
  → z-score normalization
  → 48-dim float32 vector
  → stored as .npy arrays
```

### Compression Simulation (Task 2)
Codec compression is simulated (see `scripts/extract_features.py`) by:
- Degrading MFCC standard deviations (compression flattens variation)
- Reducing jitter and shimmer values
- Adding spectral artifact signals to indices 45–47

### Adversarial Simulation (Task 3)
Adversarial perturbation is applied to synthetic samples only:
- Artificially elevates jitter (+0.005 to +0.02)
- Artificially elevates shimmer (+0.01 to +0.05)
- Slightly reduces HNR to mimic real speech

---

## Expected stdout Format

```
[START] task=clean_detection env=voice-authenticity model=Qwen/Qwen2.5-72B-Instruct
[STEP] step=1 action={"label":0,"confidence":0.95,"reasoning":"..."} reward=1.00 done=true error=null
[END] success=true steps=1 score=1.000 rewards=1.00
```
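If you need to post-process these logs, a small regex-based parser along the following lines would work; this is a hypothetical helper, not shipped with the environment, and it assumes the exact field order shown above.

```python
import re

STEP_RE = re.compile(r"\[STEP\] step=(\d+) action=(\{.*\}) reward=([\d.]+) done=(\w+) error=(\S+)")
END_RE = re.compile(r"\[END\] success=(\w+) steps=(\d+) score=([\d.]+) rewards=(\S+)")

def parse_line(line: str):
    """Turn a [STEP] or [END] line into a dict; return None for anything else."""
    if m := STEP_RE.match(line):
        return {"type": "step", "step": int(m[1]), "reward": float(m[3]), "done": m[4] == "true"}
    if m := END_RE.match(line):
        return {"type": "end", "success": m[1] == "true", "score": float(m[3])}
    return None
```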
---

## License
MIT

---

## .env.example

Create a `.env.example` in the project root (safe to commit, no real token):

```
API_BASE_URL=https://router.huggingface.co/v1
MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
HF_TOKEN=your_huggingface_token_here
VOICE_TASK=clean_detection
```

---
environment/__init__.py
ADDED
File without changes
environment/data/features.npy
ADDED
Binary file (96.1 kB)

environment/data/features_adversarial.npy
ADDED
Binary file (96.1 kB)

environment/data/features_compressed.npy
ADDED
Binary file (96.1 kB)

environment/data/features_raw.npy
ADDED
Binary file (96.1 kB)

environment/data/labels.npy
ADDED
Binary file (2.13 kB)

environment/data/labels_adversarial.npy
ADDED
Binary file (2.13 kB)

environment/data/labels_compressed.npy
ADDED
Binary file (2.13 kB)

environment/data/mean.npy
ADDED
Binary file (320 Bytes)

environment/data/std.npy
ADDED
Binary file (320 Bytes)
environment/env.py
ADDED
@@ -0,0 +1,98 @@
import numpy as np
import random
from environment.models import VoiceObservation

TASKS = ["clean_detection", "compressed_detection", "adversarial_detection"]

DIFFICULTY_MAP = {
    "clean_detection": "easy",
    "compressed_detection": "medium",
    "adversarial_detection": "hard",
}

DATA_FILES = {
    "clean_detection": (
        "environment/data/features.npy",
        "environment/data/labels.npy",
    ),
    "compressed_detection": (
        "environment/data/features_compressed.npy",
        "environment/data/labels_compressed.npy",
    ),
    "adversarial_detection": (
        "environment/data/features_adversarial.npy",
        "environment/data/labels_adversarial.npy",
    ),
}


class VoiceAuthenticityEnv:
    def __init__(self, task_name: str = "clean_detection"):
        assert task_name in TASKS, f"Unknown task: {task_name}"
        self.task_name = task_name
        self.difficulty = DIFFICULTY_MAP[task_name]

        feat_file, label_file = DATA_FILES[task_name]
        self.features = np.load(feat_file)
        self.labels = np.load(label_file)

        # Load raw features for interpretable key values
        self.raw_features = np.load("environment/data/features_raw.npy")

        self.indices = list(range(len(self.labels)))
        self.current_idx = None
        self.step_number = 0
        self.done = False
        self.max_steps = 1

    def reset(self):
        self.step_number = 0
        self.done = False
        self.current_idx = random.choice(self.indices)
        return self._make_observation()

    def step(self, action: dict):
        if self.done:
            raise RuntimeError("Episode done. Call reset().")

        from environment.graders import grade
        true_label = int(self.labels[self.current_idx])
        reward = grade(true_label, action, self.difficulty)

        self.step_number += 1
        self.done = True

        obs = self._make_observation()
        info = {
            "true_label": true_label,
            "difficulty": self.difficulty,
            "task": self.task_name,
        }
        return obs, reward, self.done, info

    def state(self):
        return {
            "task_name": self.task_name,
            "difficulty": self.difficulty,
            "step_number": self.step_number,
            "done": self.done,
            "current_idx": self.current_idx,
        }

    def _make_observation(self) -> VoiceObservation:
        feat = self.features[self.current_idx].tolist()
        raw = self.raw_features[self.current_idx]

        hint = None
        if self.difficulty == "medium":
            hint = "Audio has been codec-compressed. Features may be degraded."
        elif self.difficulty == "hard":
            hint = "Warning: adversarial sample - synthetic audio crafted to mimic real speech."

        return VoiceObservation(
            features=feat,
            task_name=self.task_name,
            step_number=self.step_number,
            difficulty=self.difficulty,
            sample_id=int(self.current_idx),
            hint=(hint or "") + f" | Key values: jitter={raw[42]:.5f} shimmer={raw[43]:.5f} hnr={raw[44]:.4f}",
        )
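As a complement to the single-episode API shown in the README, a minimal evaluation loop over several episodes might look like the sketch below. It is illustrative only; the always-predict-synthetic policy is a placeholder, not the shipped baseline agent.

```python
# Sketch: average reward of a trivial agent over a handful of episodes.
from environment.env import VoiceAuthenticityEnv

env = VoiceAuthenticityEnv(task_name="clean_detection")
rewards = []
for _ in range(10):
    obs = env.reset()
    # Placeholder policy: always guess "synthetic" with moderate confidence.
    action = {"label": 1, "confidence": 0.6, "reasoning": "placeholder policy"}
    obs, reward, done, info = env.step(action)
    rewards.append(reward)

print(f"mean reward over {len(rewards)} episodes: {sum(rewards) / len(rewards):.2f}")
```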
environment/graders.py
ADDED
@@ -0,0 +1,33 @@
def grade(true_label: int, action: dict, difficulty: str) -> float:
    label = action.get("label")
    confidence = action.get("confidence", 0.5)
    correct = (label == true_label)

    if difficulty == "easy":
        if correct:
            return 1.0
        else:
            return 0.0

    elif difficulty == "medium":
        if correct:
            # reward confidence when correct
            base = 0.6
            bonus = 0.4 * confidence
            return round(base + bonus, 3)
        else:
            # penalize overconfidence when wrong
            penalty = 0.3 * confidence
            return round(max(0.0, 0.2 - penalty), 3)

    elif difficulty == "hard":
        if correct:
            # correct, but penalize overconfidence (hard task, be humble)
            base = 0.5
            calibration_bonus = 0.5 * (1 - abs(confidence - 0.7))
            return round(base + calibration_bonus, 3)
        else:
            if confidence < 0.4:
                return 0.15  # wrong but appropriately uncertain
            else:
                return 0.0   # wrong + overconfident = worst case
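A few sanity checks against the grading rules above, with expected values computed directly from the formulas (a sketch; the commit does not include a test suite):

```python
from environment.graders import grade

# Easy: binary.
assert grade(1, {"label": 1, "confidence": 0.9}, "easy") == 1.0
assert grade(1, {"label": 0, "confidence": 0.9}, "easy") == 0.0

# Medium: confidence-weighted partial credit.
assert grade(1, {"label": 1, "confidence": 0.9}, "medium") == 0.96  # 0.6 + 0.4*0.9
assert grade(1, {"label": 0, "confidence": 0.9}, "medium") == 0.0   # max(0, 0.2 - 0.3*0.9)

# Hard: calibration around 0.7 confidence.
assert grade(0, {"label": 0, "confidence": 0.7}, "hard") == 1.0     # 0.5 + 0.5*1.0
assert grade(0, {"label": 1, "confidence": 0.3}, "hard") == 0.15    # wrong but uncertain
```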
environment/models.py
ADDED
@@ -0,0 +1,21 @@
from pydantic import BaseModel, Field
from typing import Optional, List


class VoiceObservation(BaseModel):
    features: List[float]
    task_name: str
    step_number: int
    difficulty: str
    sample_id: int
    hint: Optional[str] = None  # extra context for hard task


class VoiceAction(BaseModel):
    label: int = Field(..., ge=0, le=1)  # 0=real, 1=synthetic
    confidence: float = Field(..., ge=0.0, le=1.0)
    reasoning: str = Field(default="")


class VoiceReward(BaseModel):
    score: float
    correct: bool
    confidence_penalty: float
    breakdown: str
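These models let the environment reject malformed agent output. A minimal sketch of how the `Field` constraints behave (requirements.txt does not pin a Pydantic major version, so the v1/v2 difference is noted in the comments):

```python
from pydantic import ValidationError
from environment.models import VoiceAction

# Valid action: label in {0, 1}, confidence in [0, 1].
action = VoiceAction(label=1, confidence=0.8, reasoning="low jitter, high HNR")
print(action.model_dump())  # pydantic v2; use .dict() on pydantic v1

# Out-of-range confidence is rejected by the Field(ge=..., le=...) constraints.
try:
    VoiceAction(label=1, confidence=1.5)
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])
```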
inference.py
ADDED
@@ -0,0 +1,132 @@
from dotenv import load_dotenv
load_dotenv()
import asyncio
import os
import textwrap
import json
from typing import List, Optional
from openai import OpenAI
from environment.env import VoiceAuthenticityEnv
from environment.models import VoiceAction

API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
TASK_NAME = os.getenv("VOICE_TASK", "clean_detection")
BENCHMARK = "voice-authenticity"
MAX_STEPS = 1
SUCCESS_SCORE_THRESHOLD = 0.5

SYSTEM_PROMPT = textwrap.dedent("""
    You are an expert audio forensics agent detecting synthetic (AI-generated) speech.
    You receive a 48-dimensional normalized feature vector AND key raw values in the hint.

    Always use the KEY VALUES in the hint for classification:

    REAL speech thresholds (from dataset):
    - jitter > 0.025
    - shimmer > 0.10
    - hnr < 12.0

    SYNTHETIC speech thresholds:
    - jitter < 0.020
    - shimmer < 0.09
    - hnr > 12.0

    When in doubt, lower your confidence. Never exceed 0.85 confidence on hard tasks.

    Respond ONLY with valid JSON:
    {"label": 0 or 1, "confidence": 0.0-1.0, "reasoning": "brief"}
    0 = real human speech
    1 = synthetic/AI-generated speech
""").strip()


def log_start(task, env, model):
    print(f"[START] task={task} env={env} model={model}", flush=True)

def log_step(step, action, reward, done, error):
    error_val = error if error else "null"
    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error_val}", flush=True)

def log_end(success, steps, score, rewards):
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)


def get_agent_action(client, observation) -> dict:
    user_prompt = f"""
    Audio sample features: {observation.features}
    Task: {observation.task_name} (difficulty: {observation.difficulty})
    {f'Note: {observation.hint}' if observation.hint else ''}

    Classify this audio sample. Respond with JSON only. Keep reasoning under 100 characters.
    """
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt.strip()}
            ],
            temperature=0.3,
            max_tokens=120,
            stream=False
        )
        text = completion.choices[0].message.content.strip()
        text = text.replace("```json", "").replace("```", "").strip()
        last_brace = text.rfind("}")
        if last_brace != -1:
            text = text[:last_brace + 1]
        result = json.loads(text)
        result["label"] = int(result.get("label", 0))
        result["confidence"] = float(result.get("confidence", 0.5))
        result["label"] = result["label"] if result["label"] in [0, 1] else 0
        result["confidence"] = max(0.0, min(1.0, result["confidence"]))
        return result
    except Exception as e:
        print(f"[DEBUG] Model error: {e}", flush=True)
        return {"label": 0, "confidence": 0.5, "reasoning": "fallback"}


async def run_task(client, task_name: str):
    env = VoiceAuthenticityEnv(task_name=task_name)
    rewards = []
    steps_taken = 0
    success = False
    score = 0.0

    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    try:
        obs = env.reset()

        for step in range(1, MAX_STEPS + 1):
            action_dict = get_agent_action(client, obs)
            action_str = json.dumps(action_dict)

            obs, reward, done, info = env.step(action_dict)

            rewards.append(reward)
            steps_taken = step
            error = None

            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        score = sum(rewards) / len(rewards) if rewards else 0.0
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        score_val = sum(rewards) / len(rewards) if rewards else 0.0
        log_end(success=success, steps=steps_taken, score=score_val, rewards=rewards)


async def main():
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    tasks = ["clean_detection", "compressed_detection", "adversarial_detection"]
    for task in tasks:
        await run_task(client, task)


if __name__ == "__main__":
    asyncio.run(main())
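Note that `inference.py` imports `VoiceAction` but steps the environment with the plain parsed dict. A hedged sketch of how the Pydantic model could be used to validate the model reply before stepping (illustrative only, not part of the committed script):

```python
from pydantic import ValidationError
from environment.models import VoiceAction

def validate_action(raw: dict) -> dict:
    """Coerce the parsed JSON through VoiceAction; fall back to a safe default on failure."""
    try:
        return VoiceAction(**raw).model_dump()  # .dict() on pydantic v1
    except (ValidationError, TypeError):
        return {"label": 0, "confidence": 0.5, "reasoning": "fallback"}
```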
openenv.yaml
ADDED
@@ -0,0 +1,39 @@
name: voice-authenticity
version: "1.0.0"
description: "Voice authenticity detection across real-world degradation conditions"
author: "Akshara-Sharma"
tags: ["speech", "fraud-detection", "content-moderation", "audio"]
tasks:
  - name: clean_detection
    difficulty: easy
    description: "Classify real vs synthetic speech from clean audio features"
  - name: compressed_detection
    difficulty: medium
    description: "Classify speech under codec compression degradation"
  - name: adversarial_detection
    difficulty: hard
    description: "Classify adversarially crafted synthetic speech"
observation_space:
  type: object
  properties:
    features:
      type: array
      description: "48-dim feature vector: MFCCs, jitter, shimmer, HNR"
    task_name:
      type: string
    step_number:
      type: integer
    difficulty:
      type: string
action_space:
  type: object
  properties:
    label:
      type: integer
      description: "0=real, 1=synthetic"
    confidence:
      type: number
      description: "confidence in [0.0, 1.0]"
    reasoning:
      type: string
      description: "brief explanation of decision"
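A quick way to sanity-check the spec is to load it and list the declared tasks. Note that `pyyaml` is not in `requirements.txt`, so this sketch assumes you install it separately.

```python
# Sketch: load openenv.yaml and list its tasks.
# Assumes: pip install pyyaml (not listed in requirements.txt)
import yaml

with open("openenv.yaml") as f:
    spec = yaml.safe_load(f)

print(spec["name"], spec["version"])
for task in spec["tasks"]:
    print(f"  {task['name']:24s} difficulty={task['difficulty']}")
```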
requirements.txt
ADDED
@@ -0,0 +1,8 @@
librosa
praat-parselmouth
scikit-learn
numpy
openai
pydantic
python-dotenv
soundfile
scripts/download_data.py
ADDED
@@ -0,0 +1,30 @@
from datasets import load_dataset
import soundfile as sf
import os

os.makedirs("data/real", exist_ok=True)
os.makedirs("data/fake", exist_ok=True)

dataset = load_dataset("garystafford/deepfake-audio-detection", split="train")

real_count = 0
fake_count = 0

for item in dataset:
    audio = item["audio"]
    label = item["label"]  # 0=real, 1=fake

    if label == 0 and real_count < 250:
        sf.write(f"data/real/real_{real_count:04d}.wav",
                 audio["array"], audio["sampling_rate"])
        real_count += 1

    elif label == 1 and fake_count < 250:
        sf.write(f"data/fake/fake_{fake_count:04d}.wav",
                 audio["array"], audio["sampling_rate"])
        fake_count += 1

    if real_count >= 250 and fake_count >= 250:
        break

print(f"Downloaded: {real_count} real, {fake_count} fake")
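Note that this script imports `datasets`, which is not listed in `requirements.txt`, so install it separately. If downloading the whole split up front is too heavy, a hedged variant using the `datasets` streaming mode is sketched below; `streaming=True` is a real `load_dataset` option, but whether audio decoding behaves identically for this particular dataset is an assumption worth verifying.

```python
# Sketch: stream the dataset instead of materializing the full train split.
# Assumes: pip install datasets soundfile
from datasets import load_dataset
import soundfile as sf
import os

os.makedirs("data/real", exist_ok=True)
os.makedirs("data/fake", exist_ok=True)

stream = load_dataset("garystafford/deepfake-audio-detection", split="train", streaming=True)

counts = {0: 0, 1: 0}  # label -> number of clips written
for item in stream:
    label = item["label"]
    if counts[label] >= 250:
        if all(c >= 250 for c in counts.values()):
            break
        continue
    subdir, prefix = ("data/real", "real") if label == 0 else ("data/fake", "fake")
    audio = item["audio"]
    sf.write(f"{subdir}/{prefix}_{counts[label]:04d}.wav", audio["array"], audio["sampling_rate"])
    counts[label] += 1

print(f"Downloaded: {counts[0]} real, {counts[1]} fake")
```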
scripts/extract_features.py
ADDED
@@ -0,0 +1,223 @@
import numpy as np
import librosa
import parselmouth
from parselmouth.praat import call
import os
import warnings
warnings.filterwarnings("ignore")

REAL_DIR = "data/real"
FAKE_DIR = "data/fake"
OUTPUT_DIR = "environment/data"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def extract_features(file_path):
    """
    Extract a 48-dim feature vector from an audio file.
    Returns None if the file fails.
    """
    try:
        # Load audio
        y, sr = librosa.load(file_path, sr=16000, duration=5.0)

        if len(y) < 1600:  # skip clips shorter than 0.1 s
            return None

        # -- MFCC (40 features) --------------------------------------
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfcc_mean = mfcc.mean(axis=1)  # 20 values
        mfcc_std = mfcc.std(axis=1)    # 20 values

        # -- Spectral features (2 features) ----------------------------
        zcr = librosa.feature.zero_crossing_rate(y).mean()
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()

        # -- Voice authenticity features (3 features) ------------------
        # These are the KEY discriminators between real and fake
        try:
            snd = parselmouth.Sound(file_path)
            pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)

            jitter = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
            shimmer = call([snd, pp], "Get shimmer (local)",
                           0, 0, 0.0001, 0.02, 1.3, 1.6)
            harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            hnr = call(harmonicity, "Get mean", 0, 0)

            # Replace NaN/inf with 0
            jitter = float(jitter) if np.isfinite(jitter) else 0.0
            shimmer = float(shimmer) if np.isfinite(shimmer) else 0.0
            hnr = float(hnr) if np.isfinite(hnr) else 0.0

        except Exception:
            jitter, shimmer, hnr = 0.0, 0.0, 0.0

        # -- Compression artifact features (3 features) ----------------
        # Used to simulate codec degradation for task 2
        spec_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
        rms = librosa.feature.rms(y=y).mean()

        # -- Assemble final 48-dim vector -------------------------------
        features = np.concatenate([
            mfcc_mean,                            # 0-19
            mfcc_std,                             # 20-39
            [zcr, spec_centroid],                 # 40-41
            [jitter, shimmer, hnr],               # 42-44
            [spec_bandwidth, spec_rolloff, rms]   # 45-47
        ])

        return features.astype(np.float32)

    except Exception as e:
        print(f"  ERROR on {file_path}: {e}")
        return None


def process_directory(directory, label, desc):
    files = [
        f for f in os.listdir(directory)
        if f.endswith((".wav", ".flac", ".mp3"))
    ]
    print(f"\nProcessing {desc}: {len(files)} files found")

    features_list = []
    labels_list = []
    failed = 0

    for i, fname in enumerate(files):
        path = os.path.join(directory, fname)
        feat = extract_features(path)

        if feat is not None:
            features_list.append(feat)
            labels_list.append(label)
            if (i + 1) % 50 == 0:
                print(f"  {i+1}/{len(files)} done...")
        else:
            failed += 1

    print(f"  Success: {len(features_list)}, Failed: {failed}")
    return features_list, labels_list


def add_compression_artifacts(features, strength=0.3):
    degraded = features.copy()

    degraded[20:40] *= (1 - strength * np.random.uniform(0.5, 1.0, 20))
    degraded[42] *= (1 - strength * np.random.uniform(0.3, 0.7))
    degraded[43] *= (1 - strength * np.random.uniform(0.3, 0.7))
    degraded[44] *= (1 + strength * np.random.uniform(0.1, 0.4))
    degraded[45] *= (1 + strength * np.random.uniform(0.3, 0.8))
    degraded[46] *= (1 - strength * np.random.uniform(0.2, 0.6))
    degraded[47] += strength * np.random.uniform(0.1, 0.4)

    return degraded


def add_adversarial_perturbation(features, label):
    perturbed = features.copy()

    if label == 1:
        perturbed[42] += np.random.uniform(0.005, 0.02)
        perturbed[43] += np.random.uniform(0.01, 0.05)
        perturbed[44] -= np.random.uniform(1.0, 3.0)

    return perturbed


def main():
    print("=" * 50)
    print("Feature Extraction Pipeline")
    print("=" * 50)

    real_feat, real_labels = process_directory(
        REAL_DIR, label=0, desc="REAL audio"
    )

    fake_feat, fake_labels = process_directory(
        FAKE_DIR, label=1, desc="FAKE audio"
    )

    all_features = np.array(real_feat + fake_feat, dtype=np.float32)
    all_labels = np.array(real_labels + fake_labels, dtype=np.int32)

    idx = np.random.permutation(len(all_labels))
    all_features = all_features[idx]
    all_labels = all_labels[idx]

    mean = all_features.mean(axis=0)
    std = all_features.std(axis=0) + 1e-8
    all_features_norm = (all_features - mean) / std

    np.save(f"{OUTPUT_DIR}/features.npy", all_features_norm)

    # Save raw unnormalized features for the env to use in hints
    np.save(f"{OUTPUT_DIR}/features_raw.npy", all_features)

    np.save(f"{OUTPUT_DIR}/labels.npy", all_labels)
    np.save(f"{OUTPUT_DIR}/mean.npy", mean)
    np.save(f"{OUTPUT_DIR}/std.npy", std)

    print(f"\nTask 1 (clean): {len(all_labels)} samples saved")

    # -- TASK 2: Compressed features --------------------------------
    compressed_features = np.array([
        add_compression_artifacts(f, strength=0.3)
        for f in (real_feat + fake_feat)
    ], dtype=np.float32)

    compressed_features = compressed_features[idx]
    compressed_norm = (compressed_features - mean) / std

    np.save(f"{OUTPUT_DIR}/features_compressed.npy", compressed_norm)
    np.save(f"{OUTPUT_DIR}/labels_compressed.npy", all_labels)

    print(f"Task 2 (compressed): {len(all_labels)} samples saved")

    # -- TASK 3: Adversarial features --------------------------------
    raw_combined = real_feat + fake_feat
    raw_labels_combined = real_labels + fake_labels

    adversarial_features = np.array([
        add_adversarial_perturbation(f, l)
        for f, l in zip(raw_combined, raw_labels_combined)
    ], dtype=np.float32)

    adversarial_features = adversarial_features[idx]
    adversarial_norm = (adversarial_features - mean) / std

    np.save(f"{OUTPUT_DIR}/features_adversarial.npy", adversarial_norm)
    np.save(f"{OUTPUT_DIR}/labels_adversarial.npy", all_labels)

    print(f"Task 3 (adversarial): {len(all_labels)} samples saved")

    print(f"\n{'='*50}")
    print("DONE")
    print(f"Total samples : {len(all_labels)}")
    print(f"Real samples  : {all_labels.tolist().count(0)}")
    print(f"Fake samples  : {all_labels.tolist().count(1)}")
    print(f"Feature shape : {all_features_norm.shape}")
    print(f"{'='*50}")

    print("\nSanity check - jitter/shimmer/HNR comparison:")
    for i in range(min(2, len(all_labels))):
        label_str = "REAL" if all_labels[i] == 0 else "FAKE"
        print(f"\n  [{label_str}]")
        print(f"  Clean       jitter={all_features[i][42]:.4f} shimmer={all_features[i][43]:.4f} hnr={all_features[i][44]:.4f}")
        print(f"  Compressed  jitter={compressed_features[i][42]:.4f} shimmer={compressed_features[i][43]:.4f} hnr={compressed_features[i][44]:.4f}")
        print(f"  Adversarial jitter={adversarial_features[i][42]:.4f} shimmer={adversarial_features[i][43]:.4f} hnr={adversarial_features[i][44]:.4f}")


if __name__ == "__main__":
    main()
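After running this pipeline, a quick check that the generated arrays line up is useful; the sketch below assumes the script completed and wrote its outputs to `environment/data/`.

```python
import numpy as np

# Sketch: verify the saved arrays have matching shapes and both classes are present.
feats = np.load("environment/data/features.npy")
raw = np.load("environment/data/features_raw.npy")
labels = np.load("environment/data/labels.npy")

assert feats.shape == raw.shape and feats.shape[1] == 48
assert feats.shape[0] == labels.shape[0]
print("samples:", feats.shape[0],
      "real:", int((labels == 0).sum()),
      "fake:", int((labels == 1).sum()))
```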