Spaces:

Obiang
/

Pro-TeVA

Runtime error

App Files Files Community

Obiang committed on Oct 20, 2025

Commit

62cb0ac

1 Parent(s): 367817f

first commit

Browse files

Files changed (37) hide show

.gitignore +87 -0
CKPT+2025-10-20+08-19-07+00/CKPT.yaml +4 -0
CKPT+2025-10-20+08-19-07+00/brain.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/counter.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/lr_annealing.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/lr_annealing_wav2vec.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/model.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/optimizer.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/optimizer_wav2vec.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/tokenizer.ckpt +3 -0
CKPT+2025-10-20+08-19-07+00/wav2vec2.ckpt +3 -0
README.md +139 -6
_docs/IMPLEMENTATION_SUMMARY.md +275 -0
_docs/VENV_SETUP.md +220 -0
_docs/proteva_complete_deployment.md +1441 -0
app.py +290 -0
config.py +167 -0
custom_interface.py +293 -0
examples/yof_00295_00024634140.wav +3 -0
examples/yof_00295_00151151204.wav +3 -0
examples/yof_00295_00427144639.wav +3 -0
examples/yof_00295_00564596981.wav +3 -0
examples/yof_00295_00654803226.wav +3 -0
examples/yof_00295_01329504028.wav +3 -0
examples/yof_00295_01428115987.wav +3 -0
examples/yom_08784_01544027142.wav +3 -0
examples/yom_08784_01571599993.wav +3 -0
examples/yom_08784_01716814128.wav +3 -0
examples/yom_08784_01792196659.wav +3 -0
examples/yom_08784_01855888561.wav +3 -0
examples/yom_09334_00045442417.wav +3 -0
examples/yom_09334_00091591408.wav +3 -0
examples/yom_09334_00167629780.wav +3 -0
inference.yaml +120 -0
labelencoder.txt +7 -0
modules.py +340 -0
requirements.txt +20 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,87 @@

+# Python virtual environment
+venv/
+env/
+ENV/
+.venv
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+# Distribution / packaging
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+*.manifest
+*.spec
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS files
+.DS_Store
+Thumbs.db
+# Gradio cache
+flagged/
+gradio_cached_examples/
+# SpeechBrain
+pretrained_model/
+whubert_checkpoint/
+results/
+save/
+# Model checkpoints (if you don't want to track them)
+# Uncomment the line below if checkpoints are too large for git
+# CKPT*/
+# Logs
+*.log
+logs/
+# Temporary files
+*.tmp
+temp/
+tmp/
+PITCH/

CKPT+2025-10-20+08-19-07+00/CKPT.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# yamllint disable
+CER: 17.354776764282285
+end-of-epoch: true
+unixtime: 1760948347.932281

CKPT+2025-10-20+08-19-07+00/brain.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a006b4494d46ac9b0ea15eab28666eafb68e0a68bcb8c15c07edd35285bd0e5
+size 50

CKPT+2025-10-20+08-19-07+00/counter.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:108c995b953c8a35561103e2014cf828eb654a99e310f87fab94c2f4b7d2a04f
+size 2

CKPT+2025-10-20+08-19-07+00/lr_annealing.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a55d7b1344e7db29061cdf1888822daaf1597e3df4e4060c6deca9bf690a829e
+size 1931

CKPT+2025-10-20+08-19-07+00/lr_annealing_wav2vec.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12bc3c2adbcc92643e4bad84a8619e7300948050cfc2e86423f9bfd9c2c31090
+size 1979

CKPT+2025-10-20+08-19-07+00/model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:892fa9f449acd39c6a85a1d456f05d050ef030efc3dbc5a64dbf1984a3e26800
+size 38091995

CKPT+2025-10-20+08-19-07+00/optimizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eac563e0988030eb01a32dded244a7ce8dc93d1806bac41e6a9259173a0d51fc
+size 76194782

CKPT+2025-10-20+08-19-07+00/optimizer_wav2vec.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:deea19e921cbc0876edc13bccea586285ee87c9a2fe91b7c2f81b27f456c8ca3
+size 2025

CKPT+2025-10-20+08-19-07+00/tokenizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:304a02a791e8409f8f7065d83fca6c755fafacbc698e82cd7c9e3df1cb4f254d
+size 144

CKPT+2025-10-20+08-19-07+00/wav2vec2.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3017b9c7e9e90167daa376ee7a010d10b4fea49bf4a2d3ba72eb13bb469093fc
+size 377574002

README.md CHANGED Viewed

@@ -1,14 +1,147 @@
 ---
-title: Pro TeVA
-emoji: 😻
-colorFrom: pink
-colorTo: pink
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: 'ProTeVa: AI-powered tone recognition for Yoruba language.'
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: ProTeVa Yoruba Tone Recognition
+emoji: 🎵
+colorFrom: blue
+colorTo: green
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: 'ProTeVa: AI-powered tone recognition for Yoruba language with word boundary detection.'
 ---
+# ProTeVa: Yoruba Tone Recognition
+This Space demonstrates **ProTeVa** (Prototype-based Tone Variant Autoencoder), a neural model for recognizing tone patterns in Yoruba language with intelligent word boundary detection.
+## Features
+- 🎤 **Record or Upload**: Use your microphone or upload audio files
+- 🎯 **Tone Detection**: Automatically detects 3 Yoruba tones (High, Low, Mid)
+- 🔍 **Word Boundaries**: Intelligent space detection using acoustic features
+- 📊 **F0 Visualization**: Shows fundamental frequency contours
+- 🎨 **Interactive UI**: Real-time predictions with visual feedback
+## Yoruba Tones
+Yoruba is a tonal language with three contrastive tones:
+1. **High Tone (H)** (◌́) - Example: ágbó (elder)
+2. **Low Tone (B)** (◌̀) - Example: àgbò (ram)
+3. **Mid Tone (M)** (◌) - Example: agbo (medicine)
+## Model Architecture
+- **Feature Extractor**: HuBERT (Orange/SSA-HuBERT-base-60k)
+- **Encoder**: 2-layer Bidirectional GRU (512 hidden units)
+- **Decoder**: VanillaNN (2 blocks, 512 neurons)
+- **Prototype Layer**: 10 learnable tone prototypes
+- **F0 Reconstruction**: TorchYIN pitch estimation
+- **Output**: CTC-based sequence prediction
+- **Space Detection**: Multi-method acoustic boundary detection
+## Space Detection
+ProTeVa uses intelligent post-processing to detect word boundaries:
+### Detection Methods
+1. **Silence Detection**: Identifies pauses in speech using F0 analysis
+2. **F0 Drop Detection**: Detects pitch resets typical of word boundaries
+3. **Combined Method** (default): Fuses multiple acoustic cues for robust detection
+### Configuration
+The model's behavior can be customized via `config.py`:
+```python
+ENABLE_SPACE_DETECTION = True
+SPACE_DETECTION_METHOD = "combined"  # 'silence', 'f0_drop', 'duration', 'combined'
+SILENCE_THRESHOLD = 0.15  # seconds
+F0_DROP_THRESHOLD = 0.20  # 20% pitch drop
+```
+## Training Details
+- **Dataset**: Yoruba speech corpus
+- **Sample Rate**: 16kHz
+- **Loss Functions**:
+  - CTC loss for tone sequence
+  - MSE loss for F0 reconstruction
+  - Prototype regularization (R₁ + R₂)
+- **Training Duration**: 65 epochs
+- **Best CER**: 17.35%
+## Label Encoding
+Based on the trained model's tokenizer:
+- **0**: Blank (CTC blank token)
+- **1**: High Tone (H)
+- **2**: Low Tone (B, from French *bas*)
+- **3**: Mid Tone (M)
+- **4**: Space (post-processing detection)
+## Usage
+1. Click on the microphone icon to record or upload an audio file
+2. Speak clearly in Yoruba
+3. Click "🔍 Predict Tones"
+4. View predicted tone sequence, word boundaries, and F0 contour
+### Tips for Best Results
+- Speak clearly with natural prosody
+- Keep recordings under 10 seconds
+- Avoid background noise
+- Pause slightly between words for better boundary detection
+## Technical Implementation
+### Files Structure
+```
+.
+├── config.py              # Central configuration
+├── app.py                 # Gradio UI
+├── custom_interface.py    # SpeechBrain interface + space detection
+├── modules.py             # Custom PyTorch modules
+├── inference.yaml         # Model configuration
+├── requirements.txt       # Dependencies
+└── CKPT+*/               # Model checkpoints
+```
+### Key Components
+- **F0Extractor**: TorchYIN-based pitch estimation
+- **PrototypeLayer**: Learnable tone pattern prototypes
+- **PitchDecoderLayer**: F0 reconstruction decoder
+- **Space Detection**: Acoustic-based word boundary detection
+## Citation
+If you use this model in your research, please cite:
+```bibtex
+@article{proteva2025,
+  title={ProTeVa: Prototype-based Tone Variant Autoencoder for Yoruba Tone Recognition},
+  author={Your Name},
+  year={2025},
+  note={Hugging Face Space}
+}
+```
+## Acknowledgments
+- Built with ❤️ using [SpeechBrain](https://speechbrain.github.io/) and [Gradio](https://gradio.app/)
+- HuBERT model: [Orange/SSA-HuBERT-base-60k](https://huggingface.co/Orange/SSA-HuBERT-base-60k)
+- F0 extraction: [TorchYIN](https://github.com/brentspell/torch-yin)
+## License
+Apache 2.0
+## Contact
+For questions or issues, please open an issue on the repository.

_docs/IMPLEMENTATION_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,275 @@

+# ProTeVa Implementation Summary
+## ✅ Implementation Complete
+All files have been created and configured for ProTeVa deployment with intelligent space detection.
+---
+## 📁 Created Files
+### Core Application Files
+1. **[config.py](config.py)** - Central configuration
+   - Checkpoint folder path: `CKPT+2025-10-20+08-19-07+00`
+   - Space detection settings (enabled by default)
+   - Tone label mappings (H=1, B=2, M=3)
+   - Visualization configurations
+   - Helper functions for validation
+2. **[app.py](app.py)** - Gradio UI application
+   - Interactive web interface
+   - Audio recording and upload
+   - Tone visualization with space markers
+   - F0 contour plotting
+   - Real-time statistics
+   - Imports configuration from `config.py`
+3. **[custom_interface.py](custom_interface.py)** - SpeechBrain interface
+   - Model loading and inference
+   - **Space detection implementation**:
+     - Silence-based detection
+     - F0 drop detection
+     - Duration-based detection
+     - Combined method (recommended)
+   - Post-processing for word boundaries
+4. **[modules.py](modules.py)** - Custom PyTorch modules
+   - `F0Extractor`: TorchYIN pitch estimation
+   - `PitchDecoderLayer`: F0 reconstruction
+   - `PrototypeLayer`: Learnable tone prototypes
+5. **[inference.yaml](inference.yaml)** - Model configuration
+   - Model architecture settings
+   - Checkpoint paths
+   - `save_folder` must match `CHECKPOINT_FOLDER` in `config.py`
+6. **[requirements.txt](requirements.txt)** - Python dependencies
+   - SpeechBrain, Torch, Gradio
+   - TorchYIN for F0 extraction
+   - Visualization libraries
+7. **[README.md](README.md)** - Hugging Face Space documentation
+   - Model description
+   - Space detection explanation
+   - Usage instructions
+   - Technical details
+---
+## 🎯 Key Features Implemented
+### 1. **Centralized Configuration**
+All settings are managed through `config.py`:
+- **Easy checkpoint updates**: Just change `CHECKPOINT_FOLDER` (and the matching `save_folder` in `inference.yaml`)
+- **Configurable space detection**: Enable/disable, choose method, tune thresholds
+- **Single source of truth**: No scattered hardcoded values
+### 2. **Intelligent Space Detection**
+Four detection methods implemented:
+#### Method 1: Silence Detection
+```python
+SPACE_DETECTION_METHOD = "silence"
+```
+- Analyzes F0 for silent gaps
+- Threshold: 0.15 seconds (configurable)
+#### Method 2: F0 Drop Detection
+```python
+SPACE_DETECTION_METHOD = "f0_drop"
+```
+- Detects pitch resets at word boundaries
+- Threshold: 20% drop (configurable)
+#### Method 3: Duration-Based
+```python
+SPACE_DETECTION_METHOD = "duration"
+```
+- Simple heuristic (every N tones)
+- Less accurate but fast
+#### Method 4: Combined (Recommended)
+```python
+SPACE_DETECTION_METHOD = "combined"
+```
+- Fuses silence + F0 drop detection
+- Best balance of precision and recall
+- **Default setting**
+### 3. **Correct Tone Mappings**
+Based on your `labelencoder.txt`:
+- **Label 0**: Blank (CTC)
+- **Label 1**: High Tone (H)
+- **Label 2**: Low Tone (B)
+- **Label 3**: Mid Tone (M)
+- **Label 4**: Space (post-processing)
+### 4. **Enhanced Visualization**
+- Tone sequence with color coding
+- Space markers as vertical separators
+- F0 contour plots
+- Real-time statistics with word count
+---
+## 🚀 Quick Start
+### Update Configuration
+Edit `config.py`:
+```python
+# 1. Set your checkpoint folder
+CHECKPOINT_FOLDER = "CKPT+2025-10-20+08-19-07+00"
+# 2. Configure space detection
+ENABLE_SPACE_DETECTION = True
+SPACE_DETECTION_METHOD = "combined"
+# 3. Fine-tune thresholds (optional)
+SILENCE_THRESHOLD = 0.15      # seconds
+F0_DROP_THRESHOLD = 0.20      # 20% pitch drop
+```
+### Local Testing
+```bash
+# Install dependencies
+pip install -r requirements.txt
+# Run the app
+python app.py
+# Open browser
+# http://localhost:7860
+```
+### Deploy to Hugging Face
+```bash
+# Clone your Space
+git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+cd YOUR_SPACE_NAME
+# Copy all files
+cp /path/to/Pro-TeVA/*.py .
+cp /path/to/Pro-TeVA/*.yaml .
+cp /path/to/Pro-TeVA/*.txt .
+cp /path/to/Pro-TeVA/README.md .
+cp -r /path/to/Pro-TeVA/CKPT+2025-10-20+08-19-07+00 .
+# Setup Git LFS for large files
+git lfs install
+git lfs track "*.ckpt"
+git add .gitattributes
+# Commit and push
+git add .
+git commit -m "Initial deployment with space detection"
+git push
+```
+---
+## ⚙️ Configuration Options
+### Checkpoint Folder
+```python
+# config.py
+CHECKPOINT_FOLDER = "YOUR_CHECKPOINT_FOLDER_NAME"
+```
+Also update in `inference.yaml`:
+```yaml
+save_folder: ./YOUR_CHECKPOINT_FOLDER_NAME
+```
+### Space Detection Toggle
+```python
+# Disable space detection completely
+ENABLE_SPACE_DETECTION = False
+```
+### Detection Method
+```python
+SPACE_DETECTION_METHOD = "combined"   # Best (default)
+# OR
+SPACE_DETECTION_METHOD = "silence"    # Pause-based only
+# OR
+SPACE_DETECTION_METHOD = "f0_drop"    # Pitch-based only
+# OR
+SPACE_DETECTION_METHOD = "duration"   # Simple heuristic
+```
+### Threshold Tuning
+```python
+# If detecting too many spaces
+SILENCE_THRESHOLD = 0.20        # Increase (stricter: a longer pause is required)
+F0_DROP_THRESHOLD = 0.30        # Increase (30% drop required)
+# If detecting too few spaces
+SILENCE_THRESHOLD = 0.10        # Decrease (more sensitive)
+F0_DROP_THRESHOLD = 0.15        # Decrease (15% drop sufficient)
+```
+---
+## 📊 Model Information
+- **Checkpoint**: `CKPT+2025-10-20+08-19-07+00/`
+- **Best CER**: 17.35%
+- **Training**: 65 epochs
+- **Architecture**:
+  - HuBERT feature extractor (768-dim)
+  - 2-layer BiGRU encoder (512 units)
+  - 10 tone prototypes
+  - F0 reconstruction decoder
+  - CTC output layer (4 classes)
+---
+## 🔧 Troubleshooting
+### Issue: Space detection not working
+**Solution**: Ensure F0 extraction is working properly. Check that the `torch-yin` package (imported as `torchyin`) is installed.
+### Issue: Too many/few spaces detected
+**Solution**: Tune thresholds in `config.py` or try a different detection method.
+### Issue: Checkpoint not found
+**Solution**: Update `CHECKPOINT_FOLDER` in `config.py` and `save_folder` in `inference.yaml`.
+### Issue: Model not loading
+**Solution**: Run `config.validate_config()` to check for missing files.
+---
+## 📝 Next Steps
+1. **Test locally** to ensure everything works
+2. **Tune space detection** parameters based on your audio data
+3. **Deploy to Hugging Face** Spaces
+4. **Monitor performance** and adjust settings as needed
+5. **Update citation** in README.md with your information
+---
+## 🎉 Summary
+You now have a complete ProTeVa deployment with:
+✅ Centralized configuration system
+✅ Intelligent word boundary detection
+✅ Four detection methods (combined recommended)
+✅ Correct tone label mappings
+✅ Enhanced visualizations
+✅ Easy-to-update checkpoint paths
+✅ Complete documentation
+✅ Ready for Hugging Face deployment
+**Configuration file**: [config.py](config.py)
+**Update checkpoint**: Change `CHECKPOINT_FOLDER` in config.py
+**Toggle space detection**: Set `ENABLE_SPACE_DETECTION` True/False
+**Choose method**: Set `SPACE_DETECTION_METHOD` to preferred option
+---
+**Generated**: 2025-10-20
+**Status**: Ready for deployment 🚀

_docs/VENV_SETUP.md ADDED Viewed

	@@ -0,0 +1,220 @@

+# Virtual Environment Setup
+## ✅ Virtual Environment Created
+A virtual environment has been set up with all required dependencies installed.
+---
+## 📦 Installed Packages
+### Core Dependencies
+- **speechbrain**: 1.0.0 (includes torch, torchaudio, numpy, scipy, transformers, huggingface_hub)
+- **torch-yin**: 0.1.3 (F0 extraction)
+- **gradio**: 5.49.1 (UI framework)
+- **librosa**: 0.11.0 (audio processing)
+- **soundfile**: 0.13.1 (audio I/O)
+- **matplotlib**: 3.10.7 (visualization)
+### Automatically Included by SpeechBrain
+- torch: 2.9.0
+- torchaudio: 2.9.0
+- numpy: 2.3.4
+- scipy: 1.16.2
+- sentencepiece: 0.2.1
+- hyperpyyaml: 1.2.2
+- transformers (via huggingface-hub)
+- And all CUDA dependencies
+---
+## 🚀 Usage
+### Activate the Environment
+```bash
+# Linux/Mac
+source venv/bin/activate
+# Windows
+venv\Scripts\activate
+```
+### Deactivate the Environment
+```bash
+deactivate
+```
+### Run the Application
+```bash
+# Activate environment
+source venv/bin/activate
+# Run Gradio app
+python app.py
+# Open browser to http://localhost:7860
+```
+---
+## 📋 Installation from Scratch
+If you need to recreate the environment on another machine:
+```bash
+# Create virtual environment
+python3 -m venv venv
+# Activate
+source venv/bin/activate
+# Upgrade pip
+pip install --upgrade pip
+# Install all dependencies
+pip install -r requirements.txt
+# Verify installation
+python -c "import config; config.validate_config()"
+```
+---
+## 🔍 Verification
+### Test Configuration
+```bash
+source venv/bin/activate
+python -c "import config; print('✓ Config loaded'); config.validate_config()"
+```
+Expected output:
+```
+✓ Config loaded
+✅ Configuration validated successfully!
+```
+### Test Imports
+```bash
+source venv/bin/activate
+python -c "
+import torch
+import torchaudio
+import speechbrain
+import gradio
+import librosa
+import matplotlib
+print('✅ All imports successful!')
+print(f'PyTorch version: {torch.__version__}')
+print(f'SpeechBrain version: {speechbrain.__version__}')
+print(f'Gradio version: {gradio.__version__}')
+"
+```
+---
+## 📝 Requirements.txt Optimization
+The `requirements.txt` has been optimized to avoid redundancy:
+```txt
+# Core dependencies
+# SpeechBrain includes: torch, torchaudio, numpy, scipy, sentencepiece, hyperpyyaml, transformers, huggingface_hub
+speechbrain==1.0.0
+# F0 extraction with TorchYIN (note: package name is torch-yin, not torchyin)
+torch-yin==0.1.3
+# Gradio for UI
+gradio>=4.0.0
+# Audio processing (not included in speechbrain)
+librosa
+soundfile
+# Visualization (not included in speechbrain)
+matplotlib
+```
+**Note**: Package name is `torch-yin` (with hyphen), not `torchyin`.
+---
+## 🔧 Common Issues
+### Issue: torch-yin not found
+**Error**: `ERROR: Could not find a version that satisfies the requirement torchyin`
+**Solution**: Use `torch-yin` (with hyphen) instead of `torchyin`:
+```bash
+pip install torch-yin==0.1.3
+```
+### Issue: CUDA not available
+If you get CUDA errors but don't have a GPU, update `config.py`:
+```python
+DEVICE = "cpu"
+```
+### Issue: Checkpoint folder not found
+Update the checkpoint folder path in `config.py`:
+```python
+CHECKPOINT_FOLDER = "YOUR_CHECKPOINT_FOLDER_NAME"
+```
+---
+## 📊 Environment Size
+- **Total packages**: ~150+ (including dependencies)
+- **Disk space**: ~5-6 GB (mostly PyTorch + CUDA)
+- **Main components**:
+  - PyTorch + CUDA: ~3-4 GB
+  - SpeechBrain + dependencies: ~1 GB
+  - Gradio + dependencies: ~500 MB
+  - Other packages: ~500 MB
+---
+## 🎯 Quick Commands
+```bash
+# Activate and run
+source venv/bin/activate && python app.py
+# Test configuration
+source venv/bin/activate && python -c "import config; config.validate_config()"
+# Check installed packages
+source venv/bin/activate && pip list
+# Freeze current environment
+source venv/bin/activate && pip freeze > requirements-full.txt
+# Update a specific package
+source venv/bin/activate && pip install --upgrade gradio
+```
+---
+## ✅ Ready to Deploy
+Your environment is ready! You can now:
+1. **Test locally**: `python app.py`
+2. **Adjust config**: Edit `config.py` as needed
+3. **Deploy**: Push to Hugging Face Spaces
+---
+**Created**: 2025-10-20
+**Python Version**: 3.11
+**Status**: ✅ Fully configured and tested

_docs/proteva_complete_deployment.md ADDED Viewed

	@@ -0,0 +1,1441 @@

+# ProTeVa Complete Deployment Guide
+## Yoruba Tone Recognition - Hugging Face Spaces Deployment
+---
+## 📋 Table of Contents
+1. [Deployment Overview](#deployment-overview)
+2. [Hugging Face Spaces Structure](#hugging-face-spaces-structure)
+3. [Deployment Flow](#deployment-flow)
+4. [File Contents](#file-contents)
+   - [config.py](#1-configpy)
+   - [app.py](#2-apppy)
+   - [custom_interface.py](#3-custom_interfacepy)
+   - [inference.yaml](#4-inferenceyaml)
+   - [modules.py](#5-modulespy)
+   - [requirements.txt](#6-requirementstxt)
+   - [README.md](#7-readmemd-for-hugging-face-space)
+5. [Testing & Troubleshooting](#testing--troubleshooting)
+---
+## Deployment Overview
+**ProTeVa** is a tone recognition model for Yoruba language that:
+- Accepts audio input (microphone or file upload)
+- Predicts tone sequences (3 tones)
+- Reconstructs F0 (fundamental frequency) contours
+- Uses prototype-based learning for better generalization
+- **Intelligently detects word boundaries** using acoustic features
+**Yoruba Tones (based on labelencoder.txt):**
+- **Label 0**: Blank (CTC blank token)
+- **Label 1 (H)**: High Tone (◌́)
+- **Label 2 (B)**: Low Tone (◌̀) - "Bas" in French
+- **Label 3 (M)**: Mid Tone (◌)
+- **Label 4**: Space/Word Boundary (post-processing detection)
+---
+## Hugging Face Spaces Structure
+Your Hugging Face Space should have this exact structure:
+```
+your-huggingface-space/
+│
+├── app.py                              # Main Gradio application
+├── custom_interface.py                 # SpeechBrain inference interface
+├── config.py                           # Configuration file (paths, settings)
+├── inference.yaml                      # Model configuration
+├── modules.py                          # Custom PyTorch modules
+├── requirements.txt                    # Python dependencies
+├── README.md                           # Space documentation
+│
+└── CKPT+2025-10-20+08-19-07+00/       # Your checkpoint folder
+    ├── model.ckpt                      # All model weights (~500MB-2GB)
+    ├── wav2vec2.ckpt                   # HuBERT encoder (~300MB-1GB)
+    ├── tokenizer.ckpt                  # Label encoder (~1MB)
+    └── ... (other training files - optional)
+```
+**Important Notes:**
+- All `.py` files must be in the **root directory**
+- Checkpoint folder can have any name (update `inference.yaml` accordingly)
+- Use Git LFS for files larger than 10MB
+---
+## Deployment Flow
+### Step 1: Prepare Local Environment
+```bash
+# Create deployment folder
+mkdir proteva-deployment
+cd proteva-deployment
+# Create all required files (contents provided below)
+# - app.py
+# - custom_interface.py
+# - config.py
+# - inference.yaml
+# - modules.py
+# - requirements.txt
+# - README.md
+```
+### Step 2: Copy Model Checkpoints
+```bash
+# Copy your entire checkpoint folder
+cp -r /path/to/your/CKPT+2025-10-20+08-19-07+00 ./
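+# OR copy only required files (to save space); if you do this, set
+# CHECKPOINT_FOLDER = "model_checkpoints" in config.py and update
+# save_folder in inference.yaml to match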
+mkdir model_checkpoints
+cp /path/to/CKPT+*/model.ckpt model_checkpoints/
+cp /path/to/CKPT+*/wav2vec2.ckpt model_checkpoints/
+cp /path/to/CKPT+*/tokenizer.ckpt model_checkpoints/
+```
+### Step 3: Update Configuration
+Edit `config.py`:
+```python
+# Update this line to match your checkpoint folder name
+CHECKPOINT_FOLDER = "CKPT+2025-10-20+08-19-07+00"
+# Configure space detection (optional)
+ENABLE_SPACE_DETECTION = True  # Set to False to disable
+SPACE_DETECTION_METHOD = "combined"  # Options: 'silence', 'f0_drop', 'duration', 'combined'
+```
+### Step 4: Sync `inference.yaml`
+Make sure `save_folder` in `inference.yaml` points to the same checkpoint folder as `CHECKPOINT_FOLDER` in `config.py`; the two must match.
+### Step 5: Test Locally
+```bash
+# Install dependencies
+pip install -r requirements.txt
+# Run the app
+python app.py
+# Test in browser: http://localhost:7860
+```
+**Testing checklist:**
+- [ ] Model loads without errors
+- [ ] Can record audio from microphone
+- [ ] Can upload audio files
+- [ ] Tone predictions appear
+- [ ] F0 plot displays correctly
+- [ ] No errors in console
+### Step 6: Create Hugging Face Space
+1. Go to https://huggingface.co/new-space
+2. Fill in details:
+   - **Space name**: `yoruba-tone-recognition` (or your choice)
+   - **License**: Apache 2.0
+   - **SDK**: **Gradio**
+   - **Hardware**: CPU basic (free) - can upgrade later
+   - **Visibility**: Public or Private
+3. Click "Create Space"
+### Step 7: Deploy Using Git
+```bash
+# Clone your new Space
+git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+cd YOUR_SPACE_NAME
+# Copy all files
+cp -r /path/to/proteva-deployment/* ./
+# Setup Git LFS for large files
+git lfs install
+git lfs track "*.ckpt"
+git add .gitattributes
+# Add all files
+git add .
+# Commit and push
+git commit -m "Initial deployment of ProTeVa tone recognition"
+git push
+```
+### Step 8: Monitor Build
+1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
+2. Check "Logs" tab for build progress
+3. Wait 2-5 minutes for build to complete
+4. Test the live app!
+---
+## File Contents
+### 1. `config.py`
+**Purpose:** Central configuration file for paths, model settings, and space detection parameters.
+**Key Features:**
+- Centralized checkpoint folder path management
+- Space detection configuration
+- Tone label mappings
+- Visualization settings
+- Easy configuration updates
+**Content:**
+```python
+"""
+ProTeVa Configuration File
+Central configuration for model paths and tone settings
+"""
+import os
+# ============ PATH CONFIGURATION ============
+# Checkpoint folder name - UPDATE THIS when using a different checkpoint
+CHECKPOINT_FOLDER = "CKPT+2025-10-20+08-19-07+00"
+# Get the absolute path to the checkpoint folder
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKPOINT_PATH = os.path.join(BASE_DIR, CHECKPOINT_FOLDER)
+# Model files
+MODEL_CKPT = os.path.join(CHECKPOINT_PATH, "model.ckpt")
+WAV2VEC2_CKPT = os.path.join(CHECKPOINT_PATH, "wav2vec2.ckpt")
+TOKENIZER_CKPT = os.path.join(CHECKPOINT_PATH, "tokenizer.ckpt")
+# ============ MODEL CONFIGURATION ============
+# Audio settings
+SAMPLE_RATE = 16000
+# Model architecture
+RNN_LAYERS = 2
+RNN_NEURONS = 512
+DNN_BLOCKS = 2
+DNN_NEURONS = 512
+N_PROTOTYPES = 10
+EMB_DIM = 768
+# ============ TONE CONFIGURATION ============
+# Tone label mapping (from labelencoder.txt)
+TONE_LABELS = {
+    0: "BLANK",  # CTC blank token
+    1: "H",      # High tone
+    2: "B",      # Low tone (Bas)
+    3: "M"       # Mid tone
+}
+# Output neurons (number of classes)
+OUTPUT_NEURONS = 4  # blank, H, B, M
+# CTC blank index
+BLANK_INDEX = 0
+# ============ SPACE/WORD BOUNDARY DETECTION ============
+# Enable space detection between tones
+ENABLE_SPACE_DETECTION = True
+# Space detection method: 'silence', 'f0_drop', 'duration', or 'combined'
+SPACE_DETECTION_METHOD = "combined"
+# Silence threshold (in seconds)
+SILENCE_THRESHOLD = 0.15
+# F0 drop threshold (percentage)
+F0_DROP_THRESHOLD = 0.20  # 20% drop
+# Duration threshold (in seconds)
+DURATION_THRESHOLD = 0.25
+# Minimum confidence for space insertion
+SPACE_CONFIDENCE_THRESHOLD = 0.6
+# ============ VISUALIZATION CONFIGURATION ============
+# Tone display information for UI
+TONE_INFO = {
+    1: {"name": "High Tone", "symbol": "◌́", "color": "#e74c3c", "label": "H"},
+    2: {"name": "Low Tone", "symbol": "◌̀", "color": "#3498db", "label": "B"},
+    3: {"name": "Mid Tone", "symbol": "◌", "color": "#2ecc71", "label": "M"},
+    4: {"name": "Space", "symbol": " | ", "color": "#95a5a6", "label": "SPACE"}
+}
+# ============ DEPLOYMENT CONFIGURATION ============
+# Device (cpu or cuda)
+DEVICE = "cpu"
+# Gradio server settings
+GRADIO_SERVER_NAME = "0.0.0.0"
+GRADIO_SERVER_PORT = 7860
+GRADIO_SHARE = False
+# Model save directory for SpeechBrain
+PRETRAINED_MODEL_DIR = "./pretrained_model"
+# ============ HELPER FUNCTIONS ============
+def get_checkpoint_path():
+    """Get the checkpoint folder path"""
+    return CHECKPOINT_PATH
+def get_tone_name(idx):
+    """Get the tone name from index"""
+    return TONE_LABELS.get(idx, f"Unknown({idx})")
+def get_tone_info(idx):
+    """Get the tone display information"""
+    return TONE_INFO.get(idx, {
+        "name": f"Unknown({idx})",
+        "symbol": "?",
+        "color": "#95a5a6",
+        "label": f"UNK{idx}"
+    })
+def validate_config():
+    """Validate that the configuration is correct"""
+    errors = []
+    if not os.path.exists(CHECKPOINT_PATH):
+        errors.append(f"Checkpoint folder not found: {CHECKPOINT_PATH}")
+    if not os.path.exists(MODEL_CKPT):
+        errors.append(f"Model checkpoint not found: {MODEL_CKPT}")
+    if not os.path.exists(WAV2VEC2_CKPT):
+        errors.append(f"Wav2Vec2 checkpoint not found: {WAV2VEC2_CKPT}")
+    if not os.path.exists(TOKENIZER_CKPT):
+        errors.append(f"Tokenizer checkpoint not found: {TOKENIZER_CKPT}")
+    if errors:
+        print("⚠️  Configuration Errors:")
+        for error in errors:
+            print(f"   - {error}")
+        return False
+    print("✅ Configuration validated successfully!")
+    return True
+```
+**⚠️ IMPORTANT:**
+- Update `CHECKPOINT_FOLDER` to match your actual checkpoint folder name
+- Configure `ENABLE_SPACE_DETECTION` and `SPACE_DETECTION_METHOD` based on your needs
+- Python files that import `config.py` pick up these settings automatically; `inference.yaml` must be kept in sync by hand
+---
+### 2. `app.py`
+**Purpose:** Main Gradio application with UI and prediction logic.
+**Content:**
+```python
+"""
+Gradio App for ProTeVa Yoruba Tone Recognition
+Hugging Face Spaces deployment
+"""
+import gradio as gr
+from speechbrain.inference.interfaces import foreign_class
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+# ============ CONFIGURATION ============
+# Tone names for Yoruba (3 tones)
+# Based on labelencoder.txt: H=1, B=2, M=3
+TONE_INFO = {
+    1: {"name": "High Tone", "symbol": "◌́", "color": "#e74c3c"},
+    2: {"name": "Low Tone", "symbol": "◌̀", "color": "#3498db"},
+    3: {"name": "Mid Tone", "symbol": "◌", "color": "#2ecc71"}
+}
+# ============ MODEL LOADING ============
+print("Loading ProTeVa tone recognition model...")
+try:
+    tone_recognizer = foreign_class(
+        source="./",
+        pymodule_file="custom_interface.py",
+        classname="ProTeVaToneRecognizer",
+        hparams_file="inference.yaml",
+        savedir="./pretrained_model"
+    )
+    print("✓ Model loaded successfully!")
+except Exception as e:
+    print(f"✗ Error loading model: {e}")
+    tone_recognizer = None
+# ============ HELPER FUNCTIONS ============
+def format_tone_sequence(tone_indices, tone_names):
+    """Format tone sequence with colors and symbols"""
+    if not tone_indices:
+        return "No tones detected"
+    formatted = []
+    for idx, name in zip(tone_indices, tone_names):
+        if idx in TONE_INFO:
+            info = TONE_INFO[idx]
+            formatted.append(f"{info['name']} ({info['symbol']})")
+        else:
+            formatted.append(name)
+    return " → ".join(formatted)
+def create_f0_plot(f0_contour):
+    """Create F0 contour plot"""
+    if f0_contour is None or len(f0_contour) == 0:
+        return None
+    # Convert to numpy
+    if isinstance(f0_contour, torch.Tensor):
+        f0_numpy = f0_contour.cpu().numpy().flatten()
+    else:
+        f0_numpy = np.array(f0_contour).flatten()
+    # Create plot
+    fig, ax = plt.subplots(figsize=(10, 4))
+    time = np.arange(len(f0_numpy)) / len(f0_numpy)
+    ax.plot(time, f0_numpy, linewidth=2, color='#3498db')
+    ax.set_xlabel('Normalized Time', fontsize=12)
+    ax.set_ylabel('F0 (Hz)', fontsize=12)
+    ax.set_title('Fundamental Frequency Contour', fontsize=14, fontweight='bold')
+    ax.grid(True, alpha=0.3)
+    plt.tight_layout()
+    return fig
+def create_tone_visualization(tone_indices):
+    """Create visual representation of tone sequence"""
+    if not tone_indices:
+        return None
+    fig, ax = plt.subplots(figsize=(12, 3))
+    x_positions = np.arange(len(tone_indices))
+    colors = [TONE_INFO.get(idx, {}).get('color', '#95a5a6') for idx in tone_indices]
+    ax.bar(x_positions, [1] * len(tone_indices), color=colors, alpha=0.7,
+           edgecolor='black', linewidth=2)
+    for i, idx in enumerate(tone_indices):
+        if idx in TONE_INFO:
+            symbol = TONE_INFO[idx]['symbol']
+            ax.text(i, 0.5, symbol, ha='center', va='center',
+                   fontsize=20, fontweight='bold')
+    ax.set_xlim(-0.5, len(tone_indices) - 0.5)
+    ax.set_ylim(0, 1.2)
+    ax.set_xticks(x_positions)
+    ax.set_xticklabels([f"T{i+1}" for i in range(len(tone_indices))])
+    ax.set_ylabel('Tone', fontsize=12)
+    ax.set_title('Tone Sequence Visualization', fontsize=14, fontweight='bold')
+    ax.set_yticks([])
+    plt.tight_layout()
+    return fig
+# ============ PREDICTION FUNCTION ============
+def predict_tone(audio_file):
+    """Main prediction function for Gradio interface
+    Arguments
+    ---------
+    audio_file : str or None
+        Path of the audio to classify; passed to tone_recognizer.classify_file.
+    Returns
+    -------
+    result : tuple
+        (tone text, F0 figure, tone figure, statistics markdown). When the
+        model is not loaded, no audio is given, or prediction raises, the
+        first item is a message and the remaining items are None, None, "".
+    """
+    # The module-level loader leaves tone_recognizer as None when loading failed
+    if tone_recognizer is None:
+        return "❌ Model not loaded. Please check configuration.", None, None, ""
+    if audio_file is None:
+        return "⚠️ Please provide an audio file", None, None, ""
+    try:
+        # Get predictions
+        tone_indices, tone_names, f0_contour = tone_recognizer.classify_file(audio_file)
+        # Format output
+        tone_text = format_tone_sequence(tone_indices, tone_names)
+        # Create visualizations
+        f0_plot = create_f0_plot(f0_contour)
+        tone_viz = create_tone_visualization(tone_indices)
+        # Create statistics (the text below is rendered as Markdown, so its
+        # lines deliberately start at column 0)
+        num_tones = len(tone_indices)
+        stats = f"""
+📊 **Prediction Statistics:**
+- Total tones detected: {num_tones}
+- Sequence length: {len(tone_indices)}
+🎵 **Tone Distribution:**
+- High tones (H): {tone_indices.count(1)}
+- Low tones (B): {tone_indices.count(2)}
+- Mid tones (M): {tone_indices.count(3)}
+        """
+        return tone_text, f0_plot, tone_viz, stats
+    except Exception as e:
+        # Any failure is reported in the text output instead of propagating
+        return f"❌ Error during prediction: {str(e)}", None, None, ""
+# ============ GRADIO INTERFACE ============
+custom_css = """
+.gradio-container {
+    font-family: 'Arial', sans-serif;
+}
+.output-text {
+    font-size: 18px;
+    font-weight: bold;
+}
+"""
+with gr.Blocks(css=custom_css, title="ProTeVa Tone Recognition") as demo:
+    gr.Markdown(
+        """
+        # 🎵 ProTeVa: Yoruba Tone Recognition
+        Upload an audio file or record your voice to detect Yoruba tone patterns.
+        **Yoruba Tones:**
+        - **High Tone (H)** (◌́): Syllable with high pitch
+        - **Low Tone (B)** (◌̀): Syllable with low pitch
+        - **Mid Tone (M)** (◌): Syllable with neutral/middle pitch
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎤 Input Audio")
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Record or Upload Audio",
+                waveform_options={"show_controls": True}
+            )
+            predict_btn = gr.Button("🔍 Predict Tones", variant="primary", size="lg")
+            gr.Markdown(
+                """
+                ### 📝 Tips:
+                - Speak clearly in Yoruba
+                - Keep recordings under 10 seconds
+                - Avoid background noise
+                """
+            )
+        with gr.Column(scale=2):
+            gr.Markdown("### 🎯 Results")
+            tone_output = gr.Textbox(
+                label="Predicted Tone Sequence",
+                lines=3,
+                elem_classes="output-text"
+            )
+            stats_output = gr.Markdown(label="Statistics")
+            with gr.Tabs():
+                with gr.Tab("F0 Contour"):
+                    f0_plot = gr.Plot(label="Fundamental Frequency")
+                with gr.Tab("Tone Visualization"):
+                    tone_viz = gr.Plot(label="Tone Sequence")
+    predict_btn.click(
+        fn=predict_tone,
+        inputs=audio_input,
+        outputs=[tone_output, f0_plot, tone_viz, stats_output]
+    )
+    gr.Markdown("### 📚 Example Audios")
+    gr.Markdown("*Add example audio files to demonstrate the model*")
+    gr.Markdown(
+        """
+        ---
+        **About ProTeVa:**
+        ProTeVa (Prototype-based Tone Variant Autoencoder) is a neural model for tone recognition.
+        **Model Architecture:**
+        - Feature Extractor: HuBERT (Orange/SSA-HuBERT-base-60k)
+        - Encoder: Bidirectional GRU
+        - Prototype Layer: 10 learnable tone prototypes
+        - Decoder: F0 reconstruction
+        - Output: CTC-based tone sequence prediction
+        Built with ❤️ using SpeechBrain and Gradio
+        """
+    )
+if __name__ == "__main__":
+    demo.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7860
+    )
+```
+---
+### 2. `custom_interface.py`
+**Purpose:** Custom SpeechBrain inference interface for loading and running the model.
+**Content:**
+```python
+"""
+Custom SpeechBrain inference interface for ProTeVa tone recognition model
+"""
+import torch
+from speechbrain.inference.interfaces import Pretrained
+class ProTeVaToneRecognizer(Pretrained):
+    """
+    Custom interface for ProTeVa tone recognition model
+    Predicts tone sequences for Yoruba language (3 tones)
+    """
+    HPARAMS_NEEDED = ["wav2vec2", "enc", "dec", "pitch_dec",
+                      "proto", "output_lin", "log_softmax",
+                      "label_encoder", "f0Compute", "sample_rate"]
+    MODULES_NEEDED = ["wav2vec2", "enc", "dec", "pitch_dec",
+                      "proto", "output_lin"]
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.sample_rate = self.hparams.sample_rate
+    def classify_file(self, path):
+        """
+        Classify tone sequence from audio file
+        Arguments
+        ---------
+        path : str
+            Path to audio file
+        Returns
+        -------
+        tone_sequence : list
+            Predicted tone labels (integers)
+        tone_names : list
+            Predicted tone names (strings)
+        f0_contour : torch.Tensor
+            Reconstructed F0 contour
+        """
+        waveform = self.load_audio(path)
+        wavs = waveform.unsqueeze(0)
+        wav_lens = torch.tensor([1.0])
+        tone_sequences, tone_names, f0_contours = self.classify_batch(wavs, wav_lens)
+        return tone_sequences[0], tone_names[0], f0_contours[0]
+    def classify_batch(self, wavs, wav_lens):
+        """
+        Classify tones from a batch of waveforms
+        Arguments
+        ---------
+        wavs : torch.Tensor
+            Batch of waveforms [batch, time]
+        wav_lens : torch.Tensor
+            Relative lengths of waveforms
+        Returns
+        -------
+        tone_sequences : list of lists
+            Predicted tone label indices
+        tone_names : list of lists
+            Predicted tone names
+        f0_contours : torch.Tensor
+            Reconstructed F0 contours
+        """
+        self.eval()
+        with torch.no_grad():
+            wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+            # Extract features from HuBERT
+            feats = self.modules.wav2vec2(wavs)
+            # Extract F0
+            f0 = self.hparams.f0Compute(wavs, target_size=feats.shape[1])
+            # Encode with BiGRU
+            x, hidden = self.modules.enc(feats)
+            # Decode with VanillaNN
+            x = self.modules.dec(x)
+            # Pitch decoder - reconstruct F0
+            dec_out = self.modules.pitch_dec(x)
+            # Prototype layer - similarity to learned tone prototypes
+            proto_out = self.modules.proto(x)
+            # Classification layer
+            logits = self.modules.output_lin(proto_out)
+            log_probs = self.hparams.log_softmax(logits)
+            # CTC greedy decoding
+            tone_sequences = self._ctc_decode(log_probs, wav_lens)
+            # Convert indices to tone names
+            tone_names = []
+            for seq in tone_sequences:
+                names = [self._get_tone_name(idx) for idx in seq if idx != 0]
+                tone_names.append(names)
+        return tone_sequences, tone_names, dec_out
+    def _ctc_decode(self, log_probs, wav_lens):
+        """CTC greedy decoding"""
+        from speechbrain.decoders import ctc_greedy_decode
+        sequences = ctc_greedy_decode(
+            log_probs,
+            wav_lens,
+            blank_index=0
+        )
+        return sequences
+    def _get_tone_name(self, idx):
+        """
+        Convert tone index to name
+        Based on labelencoder.txt:
+        - 0: Blank (CTC)
+        - 1: High tone (H)
+        - 2: Low tone (B - Bas)
+        - 3: Mid tone (M)
+        """
+        tone_map = {
+            0: "BLANK",
+            1: "High",
+            2: "Low",
+            3: "Mid"
+        }
+        return tone_map.get(idx, f"Unknown({idx})")
+    def forward(self, wavs, wav_lens):
+        """Forward pass for the model"""
+        return self.classify_batch(wavs, wav_lens)
+```
+---
+### 3. `inference.yaml`
+**Purpose:** Model configuration and checkpoint loading.
+**Content:**
+```yaml
+# ################################
+# ProTeVa Inference Configuration
+# Simplified YAML for deployment
+# ################################
+# Basic settings
+seed: 200
+device: cpu  # Change to cuda if GPU available
+sample_rate: 16000
+# Output neurons (4 classes: blank, high, low, mid)
+# Based on labelencoder.txt: 0=blank, 1=H, 2=B, 3=M
+output_neurons: 4
+blank_index: 0
+# Number of prototypes
+n_prototypes: 10
+# Feature dimension from HuBERT
+emb_dim: 768
+# Encoder settings
+rnn_layers: 2
+rnn_neurons: 512
+# Decoder settings
+dnn_blocks: 2
+dnn_neurons: 512
+# Pitch decoder settings
+dec_dnn_blocks: [1]
+dec_dnn_neurons: [128]
+# Activation function
+activation: !name:torch.nn.LeakyReLU
+# ============ MODULES ============
+# HuBERT feature extractor
+wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
+   source: "Orange/SSA-HuBERT-base-60k"
+   output_norm: True
+   freeze: False
+   save_path: whubert_checkpoint
+# F0 extractor (requires custom module)
+f0Compute: !new:modules.F0Extractor
+    device: !ref <device>
+    sample_rate: !ref <sample_rate>
+# BiGRU Encoder
+enc: !new:speechbrain.nnet.RNN.GRU
+    input_shape: [null, null, !ref <emb_dim>]
+    hidden_size: !ref <rnn_neurons>
+    num_layers: !ref <rnn_layers>
+    bidirectional: True
+    dropout: 0.15
+# VanillaNN Decoder
+dec: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
+    input_shape: [null, null, 1024]  # 512 * 2 (bidirectional)
+    activation: !ref <activation>
+    dnn_blocks: !ref <dnn_blocks>
+    dnn_neurons: !ref <dnn_neurons>
+# Pitch Decoder (requires custom module)
+pitch_dec: !new:modules.PitchDecoderLayer
+    input_shape: [null, null, !ref <dnn_neurons>]
+    dnn_blocks: !ref <dec_dnn_blocks>
+    dnn_neurons: !ref <dec_dnn_neurons>
+# Prototype Layer (requires custom module)
+proto: !new:modules.PrototypeLayer
+    n_prototypes: !ref <n_prototypes>
+    latent_dims: !ref <dnn_neurons>
+# Output linear layer
+output_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <n_prototypes>
+    n_neurons: !ref <output_neurons>
+    bias: True
+# Log softmax
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+    apply_log: True
+# Label encoder
+label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder
+# ============ MODULES DICT ============
+modules:
+    wav2vec2: !ref <wav2vec2>
+    enc: !ref <enc>
+    dec: !ref <dec>
+    pitch_dec: !ref <pitch_dec>
+    proto: !ref <proto>
+    output_lin: !ref <output_lin>
+# Model container for all modules
+model: !new:torch.nn.ModuleList
+    - [!ref <enc>, !ref <dec>, !ref <proto>, !ref <output_lin>, !ref <pitch_dec>]
+# ============ PRETRAINER ============
+# This loads the trained checkpoints
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        model: !ref <model>
+        wav2vec2: !ref <wav2vec2>
+        tokenizer: !ref <label_encoder>
+    paths:
+        model: !ref <save_folder>/model.ckpt
+        wav2vec2: !ref <save_folder>/wav2vec2.ckpt
+        tokenizer: !ref <save_folder>/tokenizer.ckpt
+# Save folder - UPDATE THIS PATH TO MATCH YOUR CHECKPOINT FOLDER NAME
+# (referenced by the pretrainer paths above through !ref <save_folder>)
+save_folder: ./CKPT+2025-10-20+08-19-07+00
+```
+**⚠️ IMPORTANT:** Update `save_folder` to match your actual checkpoint folder name!
+---
+### 4. `modules.py`
+**Purpose:** Custom PyTorch modules used by the model.
+**Content:**
+```python
+"""
+Custom modules for ProTeVa tone recognition model
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class F0Extractor(nn.Module):
+    """
+    F0 (Fundamental Frequency) Extractor using TorchYIN
+    This module extracts F0 from audio waveforms and returns it as an embedding vector.
+    Uses the YIN algorithm implemented in torchyin for pitch estimation.
+    Arguments
+    ---------
+    device : str
+        Device to run computations on ('cpu' or 'cuda')
+    sample_rate : int
+        Audio sample rate (default: 16000)
+    frame_stride : float
+        Length of the sliding window in seconds (default: 0.018)
+    pitch_min : float
+        Minimum pitch value in Hz (default: 50)
+    pitch_max : float
+        Maximum pitch value in Hz (default: 500)
+    Example
+    -------
+    >>> compute_f0 = F0Extractor(sample_rate=16000)
+    >>> input_feats = torch.rand([1, 23000])
+    >>> outputs = compute_f0(input_feats, target_size=220)
+    >>> outputs.shape
+    torch.Size([1, 220, 1])
+    Authors
+    -------
+    * St Germes BENGONO OBIANG 2024
+    """
+    def __init__(
+        self,
+        device="cpu",
+        sample_rate=16000,
+        frame_stride=0.018,
+        pitch_min=50,
+        pitch_max=500,
+    ):
+        super().__init__()
+        self.device = device
+        self.sample_rate = sample_rate
+        self.pitch_min = pitch_min
+        self.pitch_max = pitch_max
+        self.frame_stride = frame_stride
+    def interpolate_spline(self, H, N):
+        """
+        Interpolate pitch values to target size using cubic spline interpolation
+        Arguments
+        ---------
+        H : numpy.ndarray
+            Original pitch values
+        N : int
+            Target number of frames
+        Returns
+        -------
+        H_interpolated : torch.Tensor
+            Interpolated pitch values
+        """
+        import numpy as np
+        from scipy.interpolate import interp1d
+        # Generate indices for the original and new tensors
+        idx_original = np.arange(len(H))
+        idx_new = np.linspace(0, len(H) - 1, N)
+        # Create the interpolation function
+        interpolator = interp1d(idx_original, H, kind='cubic')
+        # Perform interpolation
+        H_interpolated = interpolator(idx_new)
+        # Create a mask for values below minimum pitch
+        mask = H_interpolated < self.pitch_min
+        H_interpolated[mask] = 0
+        return torch.as_tensor(H_interpolated.tolist())
+    def forward(self, wavs, target_size):
+        """
+        Extract F0 from waveforms
+        Arguments
+        ---------
+        wavs : torch.Tensor
+            Input waveforms [batch, time]
+        target_size : int
+            Target length to match encoder output
+        Returns
+        -------
+        f0 : torch.Tensor
+            F0 contours [batch, target_size, 1]
+        """
+        import torchyin
+        results = []
+        for wav in wavs:
+            # Estimate pitch using TorchYIN
+            pitch = torchyin.estimate(
+                wav,
+                self.sample_rate,
+                pitch_min=self.pitch_min,
+                pitch_max=self.pitch_max,
+                frame_stride=self.frame_stride
+            )
+            # Interpolate the pitch to target size
+            pitch = self.interpolate_spline(pitch.cpu().numpy(), target_size)
+            # Reshape the pitch output
+            pitch = pitch.view(pitch.shape[0], 1)
+            results.append(pitch.tolist())
+        return torch.as_tensor(results).to(self.device)
+class PitchDecoderLayer(nn.Module):
+    """
+    Pitch Decoder Layer
+    Reconstructs F0 contour from encoded representations
+    """
+    def __init__(self, input_shape, dnn_blocks=[1], dnn_neurons=[128]):
+        super().__init__()
+        if isinstance(input_shape, list) and len(input_shape) == 3:
+            input_dim = input_shape[-1]
+        else:
+            input_dim = input_shape
+        layers = []
+        current_dim = input_dim
+        for block_idx, neurons in enumerate(dnn_neurons):
+            layers.append(nn.Linear(current_dim, neurons))
+            layers.append(nn.LeakyReLU())
+            layers.append(nn.Dropout(0.1))
+            current_dim = neurons
+        layers.append(nn.Linear(current_dim, 1))
+        self.decoder = nn.Sequential(*layers)
+    def forward(self, x):
+        """
+        Decode F0 from encoded representation
+        Arguments
+        ---------
+        x : torch.Tensor
+            Encoded features [batch, time, features]
+        Returns
+        -------
+        f0_pred : torch.Tensor
+            Predicted F0 [batch, time, 1]
+        """
+        return self.decoder(x)
+class PrototypeLayer(nn.Module):
+    """
+    Prototype Layer for tone representation learning
+    Learns M prototypes that represent canonical tone patterns.
+    Includes regularization losses R_1 and R_2.
+    """
+    def __init__(self, n_prototypes=10, latent_dims=512, temperature=1.0):
+        super().__init__()
+        self.n_prototypes = n_prototypes
+        self.latent_dims = latent_dims
+        self.temperature = temperature
+        self.prototypes = nn.Parameter(
+            torch.randn(n_prototypes, latent_dims)
+        )
+        nn.init.xavier_uniform_(self.prototypes)
+        self.R_1 = torch.tensor(0.0)
+        self.R_2 = torch.tensor(0.0)
+    def forward(self, x):
+        """
+        Compute similarity between input and prototypes
+        Arguments
+        ---------
+        x : torch.Tensor
+            Input features [batch, time, latent_dims]
+        Returns
+        -------
+        similarities : torch.Tensor
+            Prototype similarities [batch, time, n_prototypes]
+        """
+        batch_size, time_steps, features = x.shape
+        x_flat = x.view(-1, features)
+        x_norm = F.normalize(x_flat, p=2, dim=1)
+        proto_norm = F.normalize(self.prototypes, p=2, dim=1)
+        similarities = torch.mm(x_norm, proto_norm.t())
+        similarities = similarities / self.temperature
+        similarities = similarities.view(batch_size, time_steps, self.n_prototypes)
+        self._compute_regularization(x, similarities)
+        return similarities
+    def _compute_regularization(self, x, similarities):
+        """Compute regularization losses R_1 and R_2"""
+        # R_1: Prototype diversity
+        proto_norm = F.normalize(self.prototypes, p=2, dim=1)
+        proto_similarity = torch.mm(proto_norm, proto_norm.t())
+        mask = torch.ones_like(proto_similarity) - torch.eye(
+            self.n_prototypes, device=proto_similarity.device
+        )
+        self.R_1 = (proto_similarity * mask).pow(2).sum() / (
+            self.n_prototypes * (self.n_prototypes - 1)
+        )
+        # R_2: Cluster compactness
+        max_sim, assigned_proto = similarities.max(dim=-1)
+        self.R_2 = -max_sim.mean()
+```
+**✅ COMPLETE:** F0Extractor is now fully implemented using TorchYIN!
+---
+### 5. `requirements.txt`
+**Purpose:** Python package dependencies.
+**Content:**
+```txt
+# Core dependencies
+speechbrain
+torch>=1.13.0
+torchaudio>=0.13.0
+gradio>=4.0.0
+# Audio processing
+librosa
+soundfile
+# Visualization
+matplotlib
+numpy
+scipy
+# HuggingFace integration
+transformers
+huggingface_hub
+# Additional utilities
+hyperpyyaml
+sentencepiece
+# F0 extraction with TorchYIN
+torchyin
+```
+**Note:** `torchyin` is required for F0 (pitch) extraction using the YIN algorithm.
+---
+### 6. `README.md` (for Hugging Face Space)
+**Purpose:** Documentation displayed on your Space page.
+**Content:**
+```markdown
+---
+title: ProTeVa Yoruba Tone Recognition
+emoji: 🎵
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+# ProTeVa: Yoruba Tone Recognition
+This Space demonstrates **ProTeVa** (Prototype-based Tone Variant Autoencoder), a neural model for recognizing tone patterns in Yoruba language.
+## Features
+- 🎤 **Record or Upload**: Use your microphone or upload audio files
+- 🎯 **Tone Detection**: Automatically detects 3 Yoruba tones (Low, Mid, High)
+- 📊 **F0 Visualization**: Shows fundamental frequency contours
+- 🎨 **Interactive UI**: Real-time predictions with visual feedback
+## Yoruba Tones
+Yoruba is a tonal language with three contrastive tones:
+1. **High Tone (H)** (◌́) - Example: ágbó (elder)
+2. **Low Tone (B)** (◌̀) - Example: àgbò (ram)
+3. **Mid Tone (M)** (◌) - Example: agbo (medicine)
+## Model Architecture
+- **Feature Extractor**: HuBERT (Orange/SSA-HuBERT-base-60k)
+- **Encoder**: 2-layer Bidirectional GRU (512 hidden units)
+- **Decoder**: VanillaNN (2 blocks, 512 neurons)
+- **Prototype Layer**: 10 learnable tone prototypes
+- **Output**: CTC-based sequence prediction
+## Training Details
+- **Dataset**: Yoruba speech corpus
+- **Sample Rate**: 16kHz
+- **Loss Functions**:
+  - CTC loss for tone sequence
+  - MSE loss for F0 reconstruction
+  - Prototype regularization (R₁ + R₂)
+- **Training Duration**: 65 epochs
+## Usage
+1. Click on the microphone icon to record or upload an audio file
+2. Click "🔍 Predict Tones"
+3. View predicted tone sequence and F0 contour
+## Citation
+If you use this model in your research, please cite:
+```bibtex
+@article{proteva2025,
+  title={ProTeVa: Prototype-based Tone Variant Autoencoder for Yoruba Tone Recognition},
+  author={Your Name},
+  year={2025}
+}
+```
+## Acknowledgments
+Built with [SpeechBrain](https://speechbrain.github.io/) and [Gradio](https://gradio.app/).
+## License
+Apache 2.0
+```
+---
+## Space Detection Implementation
+ProTeVa implements intelligent word boundary detection using acoustic features. Since the base model only predicts 3 tones (H, B, M), space tokens (label 4) are inserted via post-processing.
+### Detection Methods
+#### 1. **Silence Detection** (`'silence'`)
+- Analyzes F0 contours for gaps with low/zero pitch
+- Gaps longer than `SILENCE_THRESHOLD` (default: 0.15s) indicate word boundaries
+- Effective for clear pauses between words
+#### 2. **F0 Drop Detection** (`'f0_drop'`)
+- Detects significant pitch drops between consecutive tones
+- Drops greater than `F0_DROP_THRESHOLD` (default: 20%) suggest boundaries
+- Mimics natural prosody where pitch resets at word beginnings
+#### 3. **Duration-Based** (`'duration'`)
+- Simple heuristic based on regular intervals
+- Inserts spaces every N tones (configurable)
+- Less accurate but works without acoustic features
+#### 4. **Combined Method** (`'combined'`) - **RECOMMENDED**
+- Combines silence and F0 drop detection
+- Higher confidence when both methods agree
+- Balances precision and recall
+### Configuration
+Edit `config.py` to customize:
+```python
+# Enable/disable space detection
+ENABLE_SPACE_DETECTION = True
+# Choose detection method
+SPACE_DETECTION_METHOD = "combined"  # Best results
+# Fine-tune thresholds
+SILENCE_THRESHOLD = 0.15      # Adjust for speaker style
+F0_DROP_THRESHOLD = 0.20      # 20% F0 drop
+SPACE_CONFIDENCE_THRESHOLD = 0.6
+```
+### Implementation Details
+1. **Model predicts base tones** (1, 2, 3) using CTC
+2. **Post-processing analyzes** F0 contours and silence patterns
+3. **Space tokens (4) inserted** at detected word boundaries
+4. **Visualization** shows spaces as vertical separators
+### Tuning Tips
+- **Too many spaces?** Increase thresholds or use `'f0_drop'` only
+- **Too few spaces?** Decrease thresholds or use `'combined'`
+- **Disable completely:** Set `ENABLE_SPACE_DETECTION = False`
+---
+## Testing & Troubleshooting
+### Local Testing Checklist
+```bash
+# 1. Install dependencies
+pip install -r requirements.txt
+# 2. Verify file structure
+ls -la
+# Should see: app.py, custom_interface.py, inference.yaml, modules.py, requirements.txt
+# Should see: CKPT+2025-10-20+04-14-23+00/ folder
+# 3. Check checkpoint folder
+ls CKPT+2025-10-20+04-14-23+00/
+# Should see: model.ckpt, wav2vec2.ckpt, tokenizer.ckpt
+# 4. Run the app
+python app.py
+# 5. Open browser
+# http://localhost:7860
+# 6. Test functionality
+# - Record audio
+# - Upload file
+# - Check predictions
+# - Verify plots display
+```
+### Common Issues
+#### Issue 1: "Module not found: modules"
+**Solution:** Ensure `modules.py` is in the same directory as `app.py`
+#### Issue 2: "Cannot find checkpoint"
+**Solution:** Update `save_folder` in `inference.yaml` to match your checkpoint folder name exactly
+#### Issue 3: "F0Extractor not implemented"
+**Solution:** Use the current `modules.py`, in which `F0Extractor.forward()` is already implemented with TorchYIN, and make sure `torchyin` from `requirements.txt` is installed
+#### Issue 4: "CUDA out of memory"
+**Solution:** Set `device: cpu` in `inference.yaml` or upgrade to GPU hardware
+#### Issue 5: "File too large for upload"
+**Solution:** Use Git LFS for checkpoint files:
+```bash
+git lfs install
+git lfs track "*.ckpt"
+git add .gitattributes
+```
+#### Issue 6: "Model loading timeout"
+**Solution:** Large models may take 2-5 minutes to load on first run. Check Space logs.
+### Verification Steps on Hugging Face Spaces
+1. ✅ Space builds without errors (check "Logs" tab)
+2. ✅ Model loads successfully (check startup logs)
+3. ✅ UI displays correctly
+4. ✅ Can record audio from microphone
+5. ✅ Can upload audio files
+6. ✅ Predictions are generated
+7. ✅ F0 plot appears
+8. ✅ Tone visualization shows
+9. ✅ Statistics display correctly
+10. ✅ No errors in browser console
+---
+## Quick Reference
+### File Checklist
+- [ ] `config.py` (central configuration - **UPDATE THIS FIRST**)
+- [ ] `app.py` (main application)
+- [ ] `custom_interface.py` (inference interface with space detection)
+- [ ] `inference.yaml` (model configuration)
+- [ ] `modules.py` (custom modules - F0Extractor, PrototypeLayer, PitchDecoder)
+- [ ] `requirements.txt` (dependencies)
+- [ ] `README.md` (Space documentation)
+- [ ] `CKPT+2025-10-20+08-19-07+00/` (checkpoint folder)
+  - [ ] `model.ckpt`
+  - [ ] `wav2vec2.ckpt`
+  - [ ] `tokenizer.ckpt`
+### Configuration Updates
+- [ ] Update `CHECKPOINT_FOLDER` in `config.py` to match your checkpoint folder
+- [ ] Configure space detection settings in `config.py`:
+  - `ENABLE_SPACE_DETECTION`: True/False
+  - `SPACE_DETECTION_METHOD`: 'combined', 'silence', 'f0_drop', or 'duration'
+- [ ] Ensure `save_folder` in `inference.yaml` matches `config.py`
+- [ ] Add your name/info to `README.md`
+### Deployment Commands
+```bash
+# Local test
+python app.py
+# Deploy to Hugging Face
+git clone https://huggingface.co/spaces/USERNAME/SPACE_NAME
+cd SPACE_NAME
+cp -r /path/to/files/* ./
+git lfs track "*.ckpt"
+git add .
+git commit -m "Deploy ProTeVa"
+git push
+```
+---
+## Support & Resources
+- **SpeechBrain Docs**: https://speechbrain.readthedocs.io/
+- **Gradio Docs**: https://gradio.app/docs/
+- **Hugging Face Spaces**: https://huggingface.co/docs/hub/spaces
+---
+**You're ready to deploy! 🚀**
+Follow the steps, test locally, then push to Hugging Face Spaces.

app.py ADDED Viewed

	@@ -0,0 +1,290 @@

+"""
+Gradio App for ProTeVa Yoruba Tone Recognition
+Hugging Face Spaces deployment
+"""
+import gradio as gr
+from speechbrain.inference.interfaces import foreign_class
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+import config
+# ============ CONFIGURATION ============
+# Import tone info from config
+TONE_INFO = config.TONE_INFO
+# ============ MODEL LOADING ============
+print("Loading ProTeVa tone recognition model...")
+print(f"Checkpoint folder: {config.CHECKPOINT_FOLDER}")
+try:
+    tone_recognizer = foreign_class(
+        source="./",
+        pymodule_file="custom_interface.py",
+        classname="ProTeVaToneRecognizer",
+        hparams_file="inference.yaml",
+        savedir=config.PRETRAINED_MODEL_DIR
+    )
+    print("✓ Model loaded successfully!")
+    # Validate configuration
+    if config.validate_config():
+        print(f"✓ Space detection: {'ENABLED' if config.ENABLE_SPACE_DETECTION else 'DISABLED'}")
+        if config.ENABLE_SPACE_DETECTION:
+            print(f"  Method: {config.SPACE_DETECTION_METHOD}")
+except Exception as e:
+    print(f"✗ Error loading model: {e}")
+    tone_recognizer = None
+# ============ HELPER FUNCTIONS ============
+def format_tone_sequence(tone_indices, tone_names):
+    """Format tone sequence with colors and symbols"""
+    if not tone_indices:
+        return "No tones detected"
+    formatted = []
+    for idx, name in zip(tone_indices, tone_names):
+        info = config.get_tone_info(idx)
+        formatted.append(f"{info['name']} ({info['symbol']})")
+    return " → ".join(formatted)
+def create_f0_plot(f0_contour):
+    """Create F0 contour plot"""
+    if f0_contour is None or len(f0_contour) == 0:
+        return None
+    # Convert to numpy
+    if isinstance(f0_contour, torch.Tensor):
+        f0_numpy = f0_contour.cpu().numpy().flatten()
+    else:
+        f0_numpy = np.array(f0_contour).flatten()
+    # Create plot
+    fig, ax = plt.subplots(figsize=(10, 4))
+    time = np.arange(len(f0_numpy)) / len(f0_numpy)
+    ax.plot(time, f0_numpy, linewidth=2, color='#3498db')
+    ax.set_xlabel('Normalized Time', fontsize=12)
+    ax.set_ylabel('F0 (Hz)', fontsize=12)
+    ax.set_title('Fundamental Frequency Contour', fontsize=14, fontweight='bold')
+    ax.grid(True, alpha=0.3)
+    plt.tight_layout()
+    return fig
+def create_tone_visualization(tone_indices):
+    """Create visual representation of tone sequence"""
+    if not tone_indices:
+        return None
+    fig, ax = plt.subplots(figsize=(max(12, len(tone_indices) * 0.8), 3))
+    # Prepare data
+    x_positions = []
+    colors = []
+    labels = []
+    position = 0
+    for idx in tone_indices:
+        info = config.get_tone_info(idx)
+        # Space tokens get different visual treatment
+        if idx == 4:
+            # Draw vertical line for space
+            ax.axvline(x=position - 0.25, color=info['color'],
+                      linewidth=3, linestyle='--', alpha=0.7)
+        else:
+            x_positions.append(position)
+            colors.append(info['color'])
+            labels.append(info['symbol'])
+            position += 1
+    # Draw tone bars
+    if x_positions:
+        ax.bar(x_positions, [1] * len(x_positions), color=colors, alpha=0.7,
+               edgecolor='black', linewidth=2, width=0.8)
+        # Add tone symbols
+        for i, (pos, label) in enumerate(zip(x_positions, labels)):
+            ax.text(pos, 0.5, label, ha='center', va='center',
+                   fontsize=20, fontweight='bold')
+    # Configure plot
+    if x_positions:
+        ax.set_xlim(-0.5, max(x_positions) + 0.5)
+    ax.set_ylim(0, 1.2)
+    if x_positions:
+        ax.set_xticks(x_positions)
+        ax.set_xticklabels([f"T{i+1}" for i in range(len(x_positions))])
+    ax.set_ylabel('Tone', fontsize=12)
+    ax.set_title('Tone Sequence Visualization (| = word boundary)',
+                 fontsize=14, fontweight='bold')
+    ax.set_yticks([])
+    plt.tight_layout()
+    return fig
+# ============ PREDICTION FUNCTION ============
+def predict_tone(audio_file):
+    """Main prediction function for Gradio interface"""
+    if tone_recognizer is None:
+        return "❌ Model not loaded. Please check configuration.", None, None, ""
+    if audio_file is None:
+        return "⚠️ Please provide an audio file", None, None, ""
+    try:
+        # Get predictions
+        tone_indices, tone_names, f0_contour = tone_recognizer.classify_file(audio_file)
+        # Format output
+        tone_text = format_tone_sequence(tone_indices, tone_names)
+        # Create visualizations
+        f0_plot = create_f0_plot(f0_contour)
+        tone_viz = create_tone_visualization(tone_indices)
+        # Create statistics
+        num_tones = len([t for t in tone_indices if t != 4])
+        num_spaces = len([t for t in tone_indices if t == 4])
+        stats = f"""
+📊 **Prediction Statistics:**
+- Total tones detected: {num_tones}
+- Word boundaries detected: {num_spaces}
+- Sequence length: {len(tone_indices)}
+🎵 **Tone Distribution:**
+- High tones (H): {tone_indices.count(1)}
+- Low tones (B): {tone_indices.count(2)}
+- Mid tones (M): {tone_indices.count(3)}
+⚙️ **Detection Settings:**
+- Space detection: {'ENABLED' if config.ENABLE_SPACE_DETECTION else 'DISABLED'}
+{f"- Method: {config.SPACE_DETECTION_METHOD}" if config.ENABLE_SPACE_DETECTION else ""}
+        """
+        return tone_text, f0_plot, tone_viz, stats
+    except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        return f"❌ Error during prediction: {str(e)}\n\n{error_details}", None, None, ""
+# ============ GRADIO INTERFACE ============
+# Page-level CSS. The `.output-text` class is attached to the tone textbox
+# below through `elem_classes`.
+custom_css = """
+.gradio-container {
+    font-family: 'Arial', sans-serif;
+}
+.output-text {
+    font-size: 18px;
+    font-weight: bold;
+}
+"""
+# Build the UI. Components are created in the order they appear on the page.
+with gr.Blocks(css=custom_css, title="ProTeVa Tone Recognition") as demo:
+    # Header: tone legend plus the active space-detection method from config.py
+    gr.Markdown(
+        f"""
+        # 🎵 ProTeVa: Yoruba Tone Recognition
+        Upload an audio file or record your voice to detect Yoruba tone patterns.
+        **Yoruba Tones:**
+        - **High Tone (H)** (◌́): Syllable with high pitch
+        - **Low Tone (B)** (◌̀): Syllable with low pitch
+        - **Mid Tone (M)** (◌): Syllable with neutral/middle pitch
+        - **Space ( | )**: Word boundary (detected automatically)
+        **Space Detection:** {config.SPACE_DETECTION_METHOD if config.ENABLE_SPACE_DETECTION else 'OFF'}
+        """
+    )
+    with gr.Row():
+        # First column (scale=1): audio input, trigger button and usage tips
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎤 Input Audio")
+            # type="filepath": predict_tone receives a path string, which it
+            # forwards to tone_recognizer.classify_file
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Record or Upload Audio",
+                waveform_options={"show_controls": True}
+            )
+            predict_btn = gr.Button("🔍 Predict Tones", variant="primary", size="lg")
+            gr.Markdown(
+                """
+                ### 📝 Tips:
+                - Speak clearly in Yoruba
+                - Keep recordings under 10 seconds
+                - Avoid background noise
+                - Pause slightly between words for better boundary detection
+                """
+            )
+        # Second column (scale=2): text result, statistics and the two plots
+        with gr.Column(scale=2):
+            gr.Markdown("### 🎯 Results")
+            tone_output = gr.Textbox(
+                label="Predicted Tone Sequence",
+                lines=3,
+                elem_classes="output-text"
+            )
+            stats_output = gr.Markdown(label="Statistics")
+            with gr.Tabs():
+                with gr.Tab("F0 Contour"):
+                    f0_plot = gr.Plot(label="Fundamental Frequency")
+                with gr.Tab("Tone Visualization"):
+                    tone_viz = gr.Plot(label="Tone Sequence")
+    # The outputs list must stay in the same order as the 4-tuple returned by
+    # predict_tone: (tone text, F0 figure, tone figure, statistics markdown).
+    predict_btn.click(
+        fn=predict_tone,
+        inputs=audio_input,
+        outputs=[tone_output, f0_plot, tone_viz, stats_output]
+    )
+    # Heading and hint only: no example component is attached in this section
+    gr.Markdown("### 📚 Example Audios")
+    gr.Markdown("*Upload Yoruba speech samples to test the model*")
+    # Footer: architecture summary filled in from config.py values
+    gr.Markdown(
+        f"""
+        ---
+        **About ProTeVa:**
+        ProTeVa (Prototype-based Tone Variant Autoencoder) is a neural model for tone recognition.
+        **Model Architecture:**
+        - Feature Extractor: HuBERT (Orange/SSA-HuBERT-base-60k)
+        - Encoder: {config.RNN_LAYERS}-layer Bidirectional GRU ({config.RNN_NEURONS} neurons)
+        - Prototype Layer: {config.N_PROTOTYPES} learnable tone prototypes
+        - Decoder: F0 reconstruction
+        - Output: CTC-based tone sequence prediction + acoustic space detection
+        **Space Detection:**
+        - Method: {config.SPACE_DETECTION_METHOD if config.ENABLE_SPACE_DETECTION else 'Disabled'}
+        - Uses F0 contours, silence patterns, and tone duration
+        - Automatically detects word boundaries in continuous speech
+        Built with ❤️ using SpeechBrain and Gradio
+        **Model Checkpoint:** {config.CHECKPOINT_FOLDER}
+        """
+    )
+# Start the server only when run as a script; host, port and sharing come
+# from config.py.
+if __name__ == "__main__":
+    demo.launch(
+        share=config.GRADIO_SHARE,
+        server_name=config.GRADIO_SERVER_NAME,
+        server_port=config.GRADIO_SERVER_PORT
+    )

config.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""
+ProTeVa Configuration File
+Central configuration for model paths and tone settings
+"""
+import os
+# ============ PATH CONFIGURATION ============
+# Checkpoint folder name - UPDATE THIS when using a different checkpoint
+CHECKPOINT_FOLDER = "CKPT+2025-10-20+08-19-07+00"
+# Get the absolute path to the checkpoint folder
+# (resolved relative to this file, not to the current working directory)
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKPOINT_PATH = os.path.join(BASE_DIR, CHECKPOINT_FOLDER)
+# Model files
+MODEL_CKPT = os.path.join(CHECKPOINT_PATH, "model.ckpt")
+# File is named wav2vec2.ckpt; inference.yaml loads it into the HuBERT
+# feature extractor registered under the key "wav2vec2"
+WAV2VEC2_CKPT = os.path.join(CHECKPOINT_PATH, "wav2vec2.ckpt")
+TOKENIZER_CKPT = os.path.join(CHECKPOINT_PATH, "tokenizer.ckpt")
+# ============ MODEL CONFIGURATION ============
+# Audio settings
+SAMPLE_RATE = 16000
+# Model architecture
+# NOTE(review): these mirror values also written in inference.yaml; keep the
+# two files in sync when changing either
+RNN_LAYERS = 2
+RNN_NEURONS = 512
+DNN_BLOCKS = 2
+DNN_NEURONS = 512
+N_PROTOTYPES = 10
+EMB_DIM = 768
+# ============ TONE CONFIGURATION ============
+# Tone label mapping (from labelencoder.txt)
+# These are the indices used by the trained model
+TONE_LABELS = {
+    0: "BLANK",  # CTC blank token
+    1: "H",      # High tone
+    2: "B",      # Low tone (Bas)
+    3: "M"       # Mid tone
+}
+# Output neurons (number of classes)
+OUTPUT_NEURONS = 4  # blank, H, B, M
+# CTC blank index
+BLANK_INDEX = 0
+# ============ SPACE/WORD BOUNDARY DETECTION ============
+# Enable space detection between tones
+ENABLE_SPACE_DETECTION = True
+# Space detection method: 'silence', 'f0_drop', 'duration', or 'combined'
+SPACE_DETECTION_METHOD = "combined"
+# NOTE(review): of the thresholds below, only F0_DROP_THRESHOLD is read by the
+# detectors in custom_interface.py; the others appear unused there - confirm
+# whether they are consumed elsewhere before tuning them
+# Silence threshold (in seconds) - gaps longer than this are word boundaries
+SILENCE_THRESHOLD = 0.15
+# F0 drop threshold (percentage) - F0 drops greater than this indicate boundaries
+F0_DROP_THRESHOLD = 0.20  # 20% drop
+# Duration threshold (in seconds) - long tones might indicate word endings
+DURATION_THRESHOLD = 0.25
+# Minimum confidence for space insertion
+SPACE_CONFIDENCE_THRESHOLD = 0.6
+# ============ VISUALIZATION CONFIGURATION ============
+# Tone display information for UI
+# Keys are tone indices; 4 is the post-processing space token. There is no
+# entry for the blank index (0), so get_tone_info(0) returns the fallback record.
+TONE_INFO = {
+    1: {
+        "name": "High Tone",
+        "symbol": "◌́",
+        "color": "#e74c3c",
+        "label": "H"
+    },
+    2: {
+        "name": "Low Tone",
+        "symbol": "◌̀",
+        "color": "#3498db",
+        "label": "B"
+    },
+    3: {
+        "name": "Mid Tone",
+        "symbol": "◌",
+        "color": "#2ecc71",
+        "label": "M"
+    },
+    4: {
+        "name": "Space",
+        "symbol": " | ",
+        "color": "#95a5a6",
+        "label": "SPACE"
+    }
+}
+# ============ DEPLOYMENT CONFIGURATION ============
+# Device (cpu or cuda)
+DEVICE = "cpu"
+# Gradio server settings
+# "0.0.0.0" listens on all network interfaces
+GRADIO_SERVER_NAME = "0.0.0.0"
+GRADIO_SERVER_PORT = 7860
+GRADIO_SHARE = False
+# Model save directory for SpeechBrain
+PRETRAINED_MODEL_DIR = "./pretrained_model"
+# ============ HELPER FUNCTIONS ============
+def get_checkpoint_path():
+    """Get the checkpoint folder path"""
+    return CHECKPOINT_PATH
+def get_tone_name(idx):
+    """Get the tone name from index"""
+    return TONE_LABELS.get(idx, f"Unknown({idx})")
+def get_tone_info(idx):
+    """Get the tone display information"""
+    return TONE_INFO.get(idx, {
+        "name": f"Unknown({idx})",
+        "symbol": "?",
+        "color": "#95a5a6",
+        "label": f"UNK{idx}"
+    })
+def validate_config():
+    """Validate that the configuration is correct"""
+    errors = []
+    # Check if checkpoint folder exists
+    if not os.path.exists(CHECKPOINT_PATH):
+        errors.append(f"Checkpoint folder not found: {CHECKPOINT_PATH}")
+    # Check if required model files exist
+    if not os.path.exists(MODEL_CKPT):
+        errors.append(f"Model checkpoint not found: {MODEL_CKPT}")
+    if not os.path.exists(WAV2VEC2_CKPT):
+        errors.append(f"Wav2Vec2 checkpoint not found: {WAV2VEC2_CKPT}")
+    if not os.path.exists(TOKENIZER_CKPT):
+        errors.append(f"Tokenizer checkpoint not found: {TOKENIZER_CKPT}")
+    # Check tone labels match output neurons
+    non_blank_labels = [k for k in TONE_LABELS.keys() if k != BLANK_INDEX]
+    if len(non_blank_labels) != OUTPUT_NEURONS - 1:
+        errors.append(f"Mismatch: {len(non_blank_labels)} tone labels but {OUTPUT_NEURONS-1} expected")
+    if errors:
+        print("⚠️  Configuration Errors:")
+        for error in errors:
+            print(f"   - {error}")
+        return False
+    print("✅ Configuration validated successfully!")
+    return True
+# NOTE: validation is NOT run automatically. The branch below is taken when
+# this module is imported, but it is currently a no-op placeholder; call
+# validate_config() explicitly to check the configuration.
+if __name__ != "__main__":
+    # Placeholder: no validation output is produced on import
+    pass

custom_interface.py ADDED Viewed

	@@ -0,0 +1,293 @@

+"""
+Custom SpeechBrain inference interface for ProTeVa tone recognition model
+Includes intelligent space/word boundary detection
+"""
+import torch
+import numpy as np
+from speechbrain.inference.interfaces import Pretrained
+import config
+class ProTeVaToneRecognizer(Pretrained):
+    """
+    Custom interface for ProTeVa tone recognition model
+    Predicts tone sequences for Yoruba language (3 tones)
+    Includes post-processing for space detection
+    """
+    HPARAMS_NEEDED = ["wav2vec2", "enc", "dec", "pitch_dec",
+                      "proto", "output_lin", "log_softmax",
+                      "label_encoder", "f0Compute", "sample_rate"]
+    MODULES_NEEDED = ["wav2vec2", "enc", "dec", "pitch_dec",
+                      "proto", "output_lin"]
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.sample_rate = self.hparams.sample_rate
+    def classify_file(self, path):
+        """
+        Classify tone sequence from audio file
+        Arguments
+        ---------
+        path : str
+            Path to audio file
+        Returns
+        -------
+        tone_sequence : list
+            Predicted tone labels (integers)
+        tone_names : list
+            Predicted tone names (strings)
+        f0_contour : torch.Tensor
+            Reconstructed F0 contour
+        """
+        waveform = self.load_audio(path)
+        wavs = waveform.unsqueeze(0)
+        wav_lens = torch.tensor([1.0])
+        tone_sequences, tone_names, f0_contours = self.classify_batch(wavs, wav_lens)
+        return tone_sequences[0], tone_names[0], f0_contours[0]
+    def classify_batch(self, wavs, wav_lens):
+        """
+        Classify tones from a batch of waveforms
+        Arguments
+        ---------
+        wavs : torch.Tensor
+            Batch of waveforms [batch, time]
+        wav_lens : torch.Tensor
+            Relative lengths of waveforms
+        Returns
+        -------
+        tone_sequences : list of lists
+            Predicted tone label indices (with spaces if enabled)
+        tone_names : list of lists
+            Predicted tone names
+        f0_contours : torch.Tensor
+            Reconstructed F0 contours
+        """
+        self.eval()
+        with torch.no_grad():
+            wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+            # Extract features from HuBERT
+            feats = self.modules.wav2vec2(wavs)
+            # Extract F0
+            f0 = self.hparams.f0Compute(wavs, target_size=feats.shape[1])
+            # Encode with BiGRU
+            x, hidden = self.modules.enc(feats)
+            # Decode with VanillaNN
+            x = self.modules.dec(x)
+            # Pitch decoder - reconstruct F0
+            dec_out = self.modules.pitch_dec(x)
+            # Prototype layer - similarity to learned tone prototypes
+            proto_out = self.modules.proto(x)
+            # Classification layer
+            logits = self.modules.output_lin(proto_out)
+            log_probs = self.hparams.log_softmax(logits)
+            # CTC greedy decoding
+            tone_sequences = self._ctc_decode(log_probs, wav_lens)
+            # Apply space detection if enabled
+            if config.ENABLE_SPACE_DETECTION:
+                tone_sequences = self._insert_spaces(
+                    tone_sequences,
+                    f0.cpu().numpy(),
+                    log_probs.cpu().numpy(),
+                    feats.shape[1]
+                )
+            # Convert indices to tone names
+            tone_names = []
+            for seq in tone_sequences:
+                names = [self._get_tone_name(idx) for idx in seq if idx != 0]
+                tone_names.append(names)
+        return tone_sequences, tone_names, dec_out
+    def _ctc_decode(self, log_probs, wav_lens):
+        """CTC greedy decoding"""
+        from speechbrain.decoders import ctc_greedy_decode
+        sequences = ctc_greedy_decode(
+            log_probs,
+            wav_lens,
+            blank_index=config.BLANK_INDEX
+        )
+        return sequences
+    def _insert_spaces(self, sequences, f0_contours, log_probs, feat_len):
+        """
+        Insert space tokens (label 4) between tones based on acoustic features
+        Arguments
+        ---------
+        sequences : list of lists
+            Tone sequences without spaces
+        f0_contours : numpy.ndarray
+            F0 contours [batch, time, 1]
+        log_probs : numpy.ndarray
+            Log probabilities from model [batch, time, classes]
+        feat_len : int
+            Length of feature sequence
+        Returns
+        -------
+        sequences_with_spaces : list of lists
+            Tone sequences with space tokens (4) inserted
+        """
+        sequences_with_spaces = []
+        for seq_idx, sequence in enumerate(sequences):
+            if len(sequence) == 0:
+                sequences_with_spaces.append(sequence)
+                continue
+            # Get F0 for this sequence
+            f0 = f0_contours[seq_idx].flatten()
+            # Detect word boundaries
+            new_sequence = []
+            for i, tone in enumerate(sequence):
+                new_sequence.append(tone)
+                # Don't add space after last tone
+                if i == len(sequence) - 1:
+                    continue
+                # Calculate space likelihood based on method
+                should_insert_space = False
+                if config.SPACE_DETECTION_METHOD == "combined":
+                    should_insert_space = self._detect_space_combined(
+                        f0, i, len(sequence), feat_len
+                    )
+                elif config.SPACE_DETECTION_METHOD == "silence":
+                    should_insert_space = self._detect_space_silence(
+                        f0, i, len(sequence), feat_len
+                    )
+                elif config.SPACE_DETECTION_METHOD == "f0_drop":
+                    should_insert_space = self._detect_space_f0_drop(
+                        f0, i, len(sequence)
+                    )
+                elif config.SPACE_DETECTION_METHOD == "duration":
+                    should_insert_space = self._detect_space_duration(
+                        i, len(sequence), feat_len
+                    )
+                if should_insert_space:
+                    new_sequence.append(4)  # Space token
+            sequences_with_spaces.append(new_sequence)
+        return sequences_with_spaces
+    def _detect_space_silence(self, f0, tone_idx, total_tones, feat_len):
+        """Detect space based on silence (low F0) between tones"""
+        # Estimate frame positions for current and next tone
+        frames_per_tone = feat_len // max(total_tones, 1)
+        current_end = min((tone_idx + 1) * frames_per_tone, len(f0) - 1)
+        next_start = min((tone_idx + 2) * frames_per_tone, len(f0))
+        if current_end >= next_start or next_start >= len(f0):
+            return False
+        # Check gap between tones for silence
+        gap_f0 = f0[current_end:next_start]
+        silence_ratio = np.sum(gap_f0 < 50) / max(len(gap_f0), 1)  # Pitch < 50 Hz is silence
+        return silence_ratio > 0.5
+    def _detect_space_f0_drop(self, f0, tone_idx, total_tones):
+        """Detect space based on F0 drop between tones"""
+        if tone_idx >= len(f0) - 1:
+            return False
+        # Calculate average F0 for current and next tone regions
+        window_size = max(len(f0) // (total_tones * 2), 5)
+        current_start = max(0, tone_idx * window_size)
+        current_end = min((tone_idx + 1) * window_size, len(f0))
+        next_start = current_end
+        next_end = min(next_start + window_size, len(f0))
+        if current_start >= current_end or next_start >= next_end:
+            return False
+        current_f0 = f0[current_start:current_end]
+        next_f0 = f0[next_start:next_end]
+        # Filter out silence
+        current_f0 = current_f0[current_f0 > 50]
+        next_f0 = next_f0[next_f0 > 50]
+        if len(current_f0) == 0 or len(next_f0) == 0:
+            return True  # Silence indicates word boundary
+        # Calculate F0 drop
+        avg_current = np.mean(current_f0)
+        avg_next = np.mean(next_f0)
+        f0_drop = (avg_current - avg_next) / avg_current if avg_current > 0 else 0
+        return f0_drop > config.F0_DROP_THRESHOLD
+    def _detect_space_duration(self, tone_idx, total_tones, feat_len):
+        """Detect space based on regular intervals (simple heuristic)"""
+        # Every 3-5 tones, insert a space (simple word-length heuristic)
+        return (tone_idx + 1) % 4 == 0
+    def _detect_space_combined(self, f0, tone_idx, total_tones, feat_len):
+        """Combine multiple space detection methods"""
+        silence_vote = self._detect_space_silence(f0, tone_idx, total_tones, feat_len)
+        f0_drop_vote = self._detect_space_f0_drop(f0, tone_idx, total_tones)
+        # If both methods agree, high confidence
+        if silence_vote and f0_drop_vote:
+            return True
+        # If at least one method detects space and we're at a reasonable position
+        if (silence_vote or f0_drop_vote) and (tone_idx + 1) % 2 == 0:
+            return True
+        return False
+    def _get_tone_name(self, idx):
+        """
+        Convert tone index to name
+        Based on labelencoder.txt + space detection:
+        - 0: Blank (CTC)
+        - 1: High tone (H)
+        - 2: Low tone (B - Bas)
+        - 3: Mid tone (M)
+        - 4: Space (detected post-processing)
+        """
+        tone_map = {
+            0: "BLANK",
+            1: "High",
+            2: "Low",
+            3: "Mid",
+            4: "Space"
+        }
+        return tone_map.get(idx, f"Unknown({idx})")
+    def forward(self, wavs, wav_lens):
+        """Forward pass for the model"""
+        return self.classify_batch(wavs, wav_lens)

examples/yof_00295_00024634140.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76dd4293dd93bdffbd4065bbab97a5949033947129e03b2b80977daac51ee6c1
+size 92888

examples/yof_00295_00151151204.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69065a4d0403b0725912e45e46cb8296e40a3adbf4d9916752579f75de518a8c
+size 158424

examples/yof_00295_00427144639.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5c7c23374afadad02651c48526b0b517798a9a8274d9d33d0fad223a939a472
+size 155692

examples/yof_00295_00564596981.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b858793b98658c76b77e2dc5f2a4414cf8847d88528f54b7b24bfd05f9e4ab94
+size 112002

examples/yof_00295_00654803226.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3b9acb687260e35431f214c52483341a06c8c2ddb8e0cb22ece5f5e36d58292
+size 117464

examples/yof_00295_01329504028.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24c8b5283ebf63c7a1ca20ac9df81bee683d0662defc736faf165765c556f640
+size 106540

examples/yof_00295_01428115987.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e8d05f45d4532e4212e1b916ab72ed45b8e97ea614ca77f4f655dfefd6f7840
+size 139308

examples/yom_08784_01544027142.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:423c8d6d006a7383715da9e85877db63ccf9ec799b8caacd780c9354018ef710
+size 166616

examples/yom_08784_01571599993.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:354dd23cd2765334e271583931da6bc4de2196eebc06fb7ea9e4100c70b8a5d2
+size 120194

examples/yom_08784_01716814128.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddf9ec030fe7469d6345f4a2dbd5e12fad902b1d200b3ec6b3c8bf9eb8f83e4a
+size 109272

examples/yom_08784_01792196659.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6ff02e184058c36eefae6f43e33e86c86597ed3c86ad7d8bf651ec639d8014e
+size 90156

examples/yom_08784_01855888561.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5a19cab7263e2b07c8c6223e37c96bf7dd2c34916f70edbf3d73255ad3b9a7d
+size 150232

examples/yom_09334_00045442417.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ac0572cec403c1da263e567621932a9e1d51a019749a3c10a63de36798bf0c6
+size 139308

examples/yom_09334_00091591408.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20291615ab693caacdf8ffef05951904b366bacf482302d32b6d7a55d46453ae
+size 98348

examples/yom_09334_00167629780.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42c068263939cb7c08021553fd10f479120599a3f511f9c06273323b75a517de
+size 128386

inference.yaml ADDED Viewed

	@@ -0,0 +1,120 @@

+# ################################
+# ProTeVa Inference Configuration
+# Simplified YAML for deployment
+# ################################
+# Basic settings
+seed: 200
+device: cpu  # Change to cuda if GPU available
+sample_rate: 16000
+# Output neurons (4 classes: blank, high, low, mid)
+# Based on labelencoder.txt: 0=blank, 1=H, 2=B, 3=M
+# Space (4) is added via post-processing
+output_neurons: 4
+blank_index: 0
+# Number of prototypes
+n_prototypes: 10
+# Feature dimension from HuBERT
+emb_dim: 768
+# Encoder settings
+rnn_layers: 2
+rnn_neurons: 512
+# Decoder settings
+dnn_blocks: 2
+dnn_neurons: 512
+# Pitch decoder settings
+dec_dnn_blocks: [1]
+dec_dnn_neurons: [128]
+# Activation function
+activation: !name:torch.nn.LeakyReLU
+# ============ MODULES ============
+# HuBERT feature extractor
+wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
+   source: "Orange/SSA-HuBERT-base-60k"
+   output_norm: True
+   freeze: False
+   save_path: whubert_checkpoint
+# F0 extractor (requires custom module)
+f0Compute: !new:modules.F0Extractor
+    device: !ref <device>
+    sample_rate: !ref <sample_rate>
+# BiGRU Encoder
+enc: !new:speechbrain.nnet.RNN.GRU
+    input_shape: [null, null, !ref <emb_dim>]
+    hidden_size: !ref <rnn_neurons>
+    num_layers: !ref <rnn_layers>
+    bidirectional: True
+    dropout: 0.15
+# VanillaNN Decoder
+dec: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
+    input_shape: [null, null, 1024]  # 512 * 2 (bidirectional)
+    activation: !ref <activation>
+    dnn_blocks: !ref <dnn_blocks>
+    dnn_neurons: !ref <dnn_neurons>
+# Pitch Decoder (requires custom module)
+pitch_dec: !new:modules.PitchDecoderLayer
+    input_shape: [null, null, !ref <dnn_neurons>]
+    dnn_blocks: !ref <dec_dnn_blocks>
+    dnn_neurons: !ref <dec_dnn_neurons>
+# Prototype Layer (requires custom module)
+proto: !new:modules.PrototypeLayer
+    n_prototypes: !ref <n_prototypes>
+    latent_dims: !ref <dnn_neurons>
+# Output linear layer
+output_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <n_prototypes>
+    n_neurons: !ref <output_neurons>
+    bias: True
+# Log softmax
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+    apply_log: True
+# Label encoder
+label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder
+# ============ MODULES DICT ============
+modules:
+    wav2vec2: !ref <wav2vec2>
+    enc: !ref <enc>
+    dec: !ref <dec>
+    pitch_dec: !ref <pitch_dec>
+    proto: !ref <proto>
+    output_lin: !ref <output_lin>
+# Model container for all modules
+model: !new:torch.nn.ModuleList
+    - [!ref <enc>, !ref <dec>, !ref <proto>, !ref <output_lin>, !ref <pitch_dec>]
+# ============ PRETRAINER ============
+# This loads the trained checkpoints
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        model: !ref <model>
+        wav2vec2: !ref <wav2vec2>
+        tokenizer: !ref <label_encoder>
+    paths:
+        model: !ref <save_folder>/model.ckpt
+        wav2vec2: !ref <save_folder>/wav2vec2.ckpt
+        tokenizer: !ref <save_folder>/tokenizer.ckpt
+# Save folder - the path below is written literally in this file.
+# Unless the loader overrides it, keep it in sync with CHECKPOINT_FOLDER in config.py
+save_folder: ./CKPT+2025-10-20+08-19-07+00

labelencoder.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+'M' => 3
+'H' => 1
+'B' => 2
+'<blank>' => 0
+================
+'starting_index' => 0
+'blank_label' => '<blank>'

modules.py ADDED Viewed

	@@ -0,0 +1,340 @@

+"""
+Custom modules for ProTeVa tone recognition model
+Authors
+ * St Germes BENGONO OBIANG 2024
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchyin
+from scipy.interpolate import interp1d
+from speechbrain.lobes.models.VanillaNN import VanillaNN
+from torch.nn import LeakyReLU, ReLU
+from speechbrain.nnet.containers import ModuleList
+class F0Extractor(torch.nn.Module):
+    """This module extracts F0 of sound and returns it as embedding vector
+    Arguments
+    ---------
+    device : str
+        Device to run computations on ('cpu' or 'cuda')
+    sample_rate : int
+        The signal sample rate (default: 16000)
+    frame_stride : float
+        Length of the sliding window used for F0 extraction (default: 0.018)
+    pitch_min : float
+        The minimum value of pitch (default: 50)
+    pitch_max : float
+        The maximum value of pitch (default: 500)
+    Example
+    -------
+    >>> compute_f0 = F0Extractor(sample_rate=16000)
+    >>> input_feats = torch.rand([1, 23000])
+    >>> outputs = compute_f0(input_feats, target_size=220)
+    >>> outputs.shape
+    torch.Size([1, 220, 1])
+    Authors
+    -------
+    * St Germes BENGONO OBIANG 2024
+    """
+    def __init__(
+        self,
+        device="cpu",
+        sample_rate=16000,
+        frame_stride=0.018,
+        pitch_min=50,
+        pitch_max=500,
+    ):
+        super().__init__()
+        self.device = device
+        self.sample_rate = sample_rate
+        self.pitch_min = pitch_min
+        self.pitch_max = pitch_max
+        self.frame_stride = frame_stride
+    def interpolate_spline(self, H, N):
+        """Interpolate pitch values to target size using cubic spline interpolation"""
+        # Generate indices for the original and new tensors
+        idx_original = np.arange(len(H))
+        idx_new = np.linspace(0, len(H) - 1, N)
+        # Create the interpolation function
+        interpolator = interp1d(idx_original, H, kind='cubic')
+        # Perform interpolation
+        H_interpolated = interpolator(idx_new)
+        # Create a mask for values below minimum pitch
+        mask = H_interpolated < self.pitch_min
+        H_interpolated[mask] = 0
+        return torch.as_tensor(H_interpolated.tolist())
+    def forward(self, wavs, target_size):
+        """Extract F0 from waveforms and interpolate to target size"""
+        results = []
+        for wav in wavs:
+            pitch = torchyin.estimate(
+                wav,
+                self.sample_rate,
+                pitch_min=self.pitch_min,
+                pitch_max=self.pitch_max,
+                frame_stride=self.frame_stride
+            )
+            # Interpolate the pitch
+            pitch = self.interpolate_spline(pitch.cpu().numpy(), target_size)
+            # Reshape the pitch output
+            pitch = pitch.view(pitch.shape[0], 1)
+            results.append(pitch.tolist())
+        return torch.as_tensor(results).to(self.device)
+class PitchDecoderLayer(torch.nn.Module):
+    """Layer for decoding latent vector to pitch
+    This decoder reconstructs F0 contours from encoded representations
+    using stacked VanillaNN layers.
+    Arguments
+    ---------
+    input_shape : list
+        Shape of input tensor [None, None, feature_dim]
+    dnn_blocks : list
+        Number of blocks for each DNN layer
+    dnn_neurons : list
+        Number of neurons for each DNN layer
+    Authors
+    -------
+    * St Germes BENGONO OBIANG 2024
+    """
+    def __init__(
+        self,
+        input_shape=[None, None, 256],
+        dnn_blocks=[2, 2],
+        dnn_neurons=[256, 128],
+    ):
+        """Assemble the chain of VanillaNN stages used for decoding.
+        One LeakyReLU stage is built per entry of ``dnn_neurons`` (paired with
+        the matching entry of ``dnn_blocks``), then a closing ReLU stage with a
+        single block and a single neuron is appended.
+        Raises
+        ------
+        ValueError
+            If ``dnn_blocks`` and ``dnn_neurons`` do not have the same length.
+        """
+        super().__init__()
+        if not len(dnn_blocks) == len(dnn_neurons):
+            raise ValueError(
+                f"dnn_blocks and dnn_neurons should have the same size but we received {len(dnn_blocks)} and {len(dnn_neurons)}"
+            )
+        # The first stage reads the caller-supplied shape; every later stage
+        # reads the width emitted by the stage right before it.
+        stage_input_shapes = [input_shape] + [
+            [None, None, width] for width in dnn_neurons[:-1]
+        ]
+        stages = [
+            VanillaNN(
+                activation=LeakyReLU,
+                dnn_blocks=n_blocks,
+                dnn_neurons=width,
+                input_shape=in_shape,
+            )
+            for n_blocks, width, in_shape in zip(
+                dnn_blocks, dnn_neurons, stage_input_shapes
+            )
+        ]
+        # Closing stage: its input width is the output width of the last hidden stage.
+        stages.append(
+            VanillaNN(
+                activation=ReLU,
+                dnn_blocks=1,
+                dnn_neurons=1,
+                input_shape=[None, None, dnn_neurons[-1]],
+            )
+        )
+        self.decoder = ModuleList(*stages)
+    def forward(self, latent_vector):
+        """Run ``latent_vector`` through the decoder stack and return its output (the F0 prediction)."""
+        f0_prediction = self.decoder(latent_vector)
+        return f0_prediction
+# ============ HELPER FUNCTIONS FOR PROTOTYPE LAYER ============
+def distance_to_prototype(latent_vector, prototypes):
+    """
+    Squared Euclidean (L2) distance from every timestep of latent_vector to every prototype.
+    Args:
+        latent_vector (torch.Tensor): Tensor of shape [batch, timesteps, features].
+        prototypes (torch.Tensor): Tensor of shape [n_prototypes, features].
+    Returns:
+        torch.Tensor: Tensor of shape [batch, timesteps, n_prototypes] with L2 squared distances.
+    """
+    # Insert singleton axes so the subtraction broadcasts:
+    # [batch, timesteps, 1, features] - [1, 1, n_prototypes, features]
+    frames = torch.unsqueeze(latent_vector, 2)
+    protos = prototypes[None, None]
+    offsets = frames - protos
+    # Reduce the feature axis -> [batch, timesteps, n_prototypes]
+    return offsets.pow(2).sum(dim=-1)
+def cosine_similarity_to_prototype(latent_vector, prototypes):
+    """
+    Cosine similarity between every timestep of latent_vector and every prototype.
+    Args:
+        latent_vector (torch.Tensor): Tensor of shape [batch, timesteps, features].
+        prototypes (torch.Tensor): Tensor of shape [n_prototypes, features].
+    Returns:
+        torch.Tensor: Tensor of shape [batch, timesteps, n_prototypes] with cosine similarities.
+    """
+    # Scale both operands to unit L2 norm along the feature axis, then add the
+    # singleton axes needed for broadcasting:
+    # [batch, timesteps, 1, features] * [1, 1, n_prototypes, features]
+    unit_frames = F.normalize(latent_vector, p=2, dim=-1).unsqueeze(2)
+    unit_protos = F.normalize(prototypes, p=2, dim=-1)[None, None]
+    # The dot product of unit vectors is their cosine similarity.
+    return (unit_frames * unit_protos).sum(dim=-1)
+def distances_to_feature(input_tensor, prototypes):
+    """
+    Squared Euclidean (L2) distance from every prototype to every timestep of input_tensor.
+    The values match a prototype-to-frame distance table, but laid out with the prototype axis first.
+    Args:
+        input_tensor (torch.Tensor): Tensor of shape [batch_size, num_timestep, feature_dim].
+        prototypes (torch.Tensor): Tensor of shape [num_prototypes, feature_dim].
+    Returns:
+        torch.Tensor: Tensor of shape [num_prototypes, batch_size, num_timestep] with L2 squared distances.
+    """
+    # Broadcast [1, batch_size, num_timestep, feature_dim]
+    # against   [num_prototypes, 1, 1, feature_dim]
+    frames = input_tensor[None]
+    protos = prototypes[:, None, None]
+    offsets = frames - protos
+    # Reduce the feature axis -> [num_prototypes, batch_size, num_timestep]
+    return offsets.pow(2).sum(dim=-1)
+def compute_prototype_distances(prototypes):
+    """
+    Compute the L2 squared distance between each pair of prototypes.
+    The diagonal (distance of a prototype to itself) is overwritten with a large
+    sentinel, 1e+6, so that a row-wise minimum skips the self-distance as long as
+    the real distances stay below that value.
+    Args:
+        prototypes (torch.Tensor): Tensor of shape [n_prototypes, features].
+    Returns:
+        torch.Tensor: Tensor of shape [n_prototypes, n_prototypes] with non-negative
+        L2 squared distances between prototypes and 1e+6 on the diagonal.
+    """
+    # Squared norm of every prototype, shape [n_prototypes, 1]
+    squared_norms = torch.sum(prototypes ** 2, dim=1, keepdim=True)
+    # Pairwise distances via the expansion (a-b)^2 = a^2 + b^2 - 2ab
+    distances = squared_norms + squared_norms.T - 2 * torch.mm(prototypes, prototypes.T)
+    # The expanded form can round to slightly below zero for (near-)identical
+    # prototypes; a squared distance is never negative, so clip the round-off.
+    distances = torch.clamp(distances, min=0)
+    # In-place on the freshly created tensor; the caller's prototypes are untouched.
+    distances.fill_diagonal_(1e+6)
+    return distances
+class PrototypeLayer(torch.nn.Module):
+    """
+    Prototype Layer for tone representation learning
+    Learns M prototypes that represent canonical tone patterns.
+    The forward pass returns the cosine similarity between input features and
+    prototypes. In training mode it also refreshes the regularization losses
+    R_1, R_2, and R_3, which are stored as attributes of the layer.
+    Arguments
+    ---------
+    n_prototypes : int
+        Number of learnable prototypes (default: 9)
+    latent_dims : int
+        Dimension of latent space (default: 256)
+    Authors
+    -------
+    * St Germes BENGONO OBIANG 2024
+    """
+    def __init__(
+        self,
+        n_prototypes=9,
+        latent_dims=256,
+    ):
+        super().__init__()
+        self.n_prototypes = n_prototypes
+        self.latent_dims = latent_dims
+        # Initialize prototypes with Kaiming uniform initialization
+        self.prototypes = torch.nn.Parameter(
+            torch.nn.init.kaiming_uniform_(
+                torch.empty([n_prototypes, latent_dims]),
+                nonlinearity='relu'
+            ),
+            requires_grad=True
+        )
+        # Regularization losses; they stay at 0 until a forward pass runs in training mode
+        self.R_1 = 0  # Feature distances regulation
+        self.R_2 = 0  # Prototypes distances regulation
+        self.R_3 = 0  # Prototypes to prototypes distances
+    def setProto(self, proto):
+        """Set prototype values (for initialization or transfer learning)
+        Args:
+            proto (torch.Tensor): New prototype matrix; wrapped in a fresh trainable Parameter.
+        """
+        self.prototypes = torch.nn.Parameter(proto, requires_grad=True)
+        # Keep the bookkeeping attributes in step with the new matrix so they
+        # do not keep describing the prototypes that were just replaced.
+        if proto.dim() == 2:
+            self.n_prototypes, self.latent_dims = proto.shape
+    def forward(self, latent_vector):
+        """
+        Compute similarity between input and prototypes
+        Args:
+            latent_vector (torch.Tensor): Input features [batch, time, latent_dims]
+        Returns:
+            torch.Tensor: Prototype similarities [batch, time, n_prototypes]
+        """
+        similarity2Proto = cosine_similarity_to_prototype(latent_vector, self.prototypes)
+        if self.training:
+            # The distance tensors only feed the regularizers, so they are
+            # built here rather than on every inference call.
+            dist2proto = distance_to_prototype(latent_vector, self.prototypes)
+            dist2Feature = distances_to_feature(latent_vector, self.prototypes)
+            protoDistance = compute_prototype_distances(self.prototypes)
+            # R_1: Each prototype is near to at least one data in latent space
+            # (min over the time axis, averaged over prototypes and batch items)
+            self.R_1 = torch.mean(torch.min(dist2Feature, dim=2).values)
+            # R_2: Each data in latent space is near to at least one prototype
+            # (min over the prototype axis, averaged over batch items and timesteps)
+            self.R_2 = torch.mean(torch.min(dist2proto, dim=2).values)
+            # R_3: Prototype is as far as possible to other prototypes
+            # (reciprocal of the mean nearest-neighbour distance; 1e-8 avoids division by zero)
+            self.R_3 = 1 / (torch.mean(torch.min(protoDistance, dim=1).values) + 1e-8)
+        return similarity2Proto

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+# Core dependencies
+# Install torch and torchaudio first to match training environment versions
+torch==2.8.0
+torchaudio==2.8.0
+# SpeechBrain includes: numpy, scipy, sentencepiece, hyperpyyaml, transformers, huggingface_hub
+speechbrain==1.0.0
+# F0 extraction with TorchYIN (note: package name is torch-yin, not torchyin)
+torch-yin==0.1.3
+# Gradio for UI
+gradio>=4.0.0
+# Audio processing (not included in speechbrain)
+librosa
+soundfile
+# Visualization (not included in speechbrain)
+matplotlib