Spaces:
Sleeping
Sleeping
Sidak Singh
commited on
Commit
Β·
7b7db64
1
Parent(s):
66a7fab
question boundary works
Browse files- .env.example +29 -0
- CUDA_SETUP.md +280 -0
- README.md +0 -12
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/gpt.cpython-310.pyc +0 -0
- __pycache__/transcriber.cpython-310.pyc +0 -0
- app.py +79 -20
- components/__init__.py +20 -0
- components/__pycache__/__init__.cpython-310.pyc +0 -0
- components/__pycache__/gpt.cpython-310.pyc +0 -0
- components/__pycache__/streaming.cpython-310.pyc +0 -0
- components/__pycache__/transcriber.cpython-310.pyc +0 -0
- components/gpt.py +113 -0
- components/streaming.py +226 -0
- components/struct.json +0 -0
- transcriber.py β components/transcriber.py +42 -4
- config.py +77 -0
- nodemon.json +2 -5
- requirements.txt +38 -11
- test_cuda.py +301 -0
- testing.py +10 -0
.env.example
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Configuration for Speech Transcription App
|
| 2 |
+
# Copy this file to .env and modify as needed
|
| 3 |
+
|
| 4 |
+
# CUDA Configuration
|
| 5 |
+
# Set to 'true' to use CUDA/GPU acceleration for all models
|
| 6 |
+
# Set to 'false' to use CPU for all models
|
| 7 |
+
# Default: false (CPU)
|
| 8 |
+
USE_CUDA=false
|
| 9 |
+
|
| 10 |
+
# Example configurations:
|
| 11 |
+
# USE_CUDA=true # Use GPU acceleration (requires CUDA-compatible GPU)
|
| 12 |
+
# USE_CUDA=false # Use CPU (works on all systems)
|
| 13 |
+
|
| 14 |
+
# Note: When USE_CUDA=true, the following models will use GPU:
|
| 15 |
+
# - Whisper (speech-to-text)
|
| 16 |
+
# - RoBERTa (question classification)
|
| 17 |
+
# - Sentence Boundary Detection
|
| 18 |
+
#
|
| 19 |
+
# GPU acceleration provides:
|
| 20 |
+
# β
Faster processing (2-10x speedup)
|
| 21 |
+
# β
Better real-time performance
|
| 22 |
+
# β Higher memory usage
|
| 23 |
+
# β Requires CUDA-compatible GPU
|
| 24 |
+
#
|
| 25 |
+
# CPU processing provides:
|
| 26 |
+
# β
Works on all systems
|
| 27 |
+
# β
Lower memory usage
|
| 28 |
+
# β
More stable
|
| 29 |
+
# β Slower processing
|
CUDA_SETUP.md
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CUDA Configuration Guide
|
| 2 |
+
|
| 3 |
+
This guide explains how to configure the Speech Transcription App to use GPU acceleration with CUDA.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The app supports both CPU and GPU processing for all AI models:
|
| 8 |
+
- **Whisper** (speech-to-text)
|
| 9 |
+
- **RoBERTa** (question classification)
|
| 10 |
+
- **Sentence Boundary Detection**
|
| 11 |
+
|
| 12 |
+
GPU acceleration can provide **2-10x faster processing** for real-time transcription.
|
| 13 |
+
|
| 14 |
+
## Quick Setup
|
| 15 |
+
|
| 16 |
+
### 1. Check CUDA Availability
|
| 17 |
+
```bash
|
| 18 |
+
python test_cuda.py
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### 2. Configure Device
|
| 22 |
+
Create a `.env` file:
|
| 23 |
+
```bash
|
| 24 |
+
cp .env.example .env
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
Edit `.env`:
|
| 28 |
+
```bash
|
| 29 |
+
# For GPU acceleration
|
| 30 |
+
USE_CUDA=true
|
| 31 |
+
|
| 32 |
+
# For CPU processing (default)
|
| 33 |
+
USE_CUDA=false
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
### 3. Run the App
|
| 37 |
+
```bash
|
| 38 |
+
python app.py
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Detailed Configuration
|
| 42 |
+
|
| 43 |
+
### Environment Variables
|
| 44 |
+
|
| 45 |
+
| Variable | Values | Description |
|
| 46 |
+
|----------|--------|-------------|
|
| 47 |
+
| `USE_CUDA` | `true`/`false` | Enable/disable GPU acceleration |
|
| 48 |
+
|
| 49 |
+
### Device Selection Logic
|
| 50 |
+
|
| 51 |
+
```
|
| 52 |
+
1. If USE_CUDA=true AND CUDA available β Use GPU
|
| 53 |
+
2. If USE_CUDA=true AND CUDA not available β Fallback to CPU (with warning)
|
| 54 |
+
3. If USE_CUDA=false β Use CPU
|
| 55 |
+
4. If no .env file β Default to CPU
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Model Configurations
|
| 59 |
+
|
| 60 |
+
| Device | Whisper | RoBERTa | Compute Type |
|
| 61 |
+
|--------|---------|---------|--------------|
|
| 62 |
+
| **CPU** | `device="cpu"` | `device=-1` | `int8` |
|
| 63 |
+
| **GPU** | `device="cuda"` | `device=0` | `float16` |
|
| 64 |
+
|
| 65 |
+
## CUDA Requirements
|
| 66 |
+
|
| 67 |
+
### System Requirements
|
| 68 |
+
- NVIDIA GPU with CUDA Compute Capability 3.5+
|
| 69 |
+
- CUDA Toolkit 11.8+ or 12.x
|
| 70 |
+
- cuDNN 8.x
|
| 71 |
+
- 4GB+ GPU memory recommended
|
| 72 |
+
|
| 73 |
+
### Python Dependencies
|
| 74 |
+
```bash
|
| 75 |
+
# Install PyTorch with CUDA support first
|
| 76 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 77 |
+
|
| 78 |
+
# Then install other requirements
|
| 79 |
+
pip install -r requirements.txt
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## Performance Comparison
|
| 83 |
+
|
| 84 |
+
### Typical Speedups with GPU
|
| 85 |
+
|
| 86 |
+
| Model | CPU Time | GPU Time | Speedup |
|
| 87 |
+
|-------|----------|----------|---------|
|
| 88 |
+
| Whisper (base) | ~2-5s | ~0.5-1s | 3-5x |
|
| 89 |
+
| RoBERTa | ~100ms | ~20ms | 5x |
|
| 90 |
+
| Overall | Real-time lag | Near instant | 3-8x |
|
| 91 |
+
|
| 92 |
+
### Memory Usage
|
| 93 |
+
|
| 94 |
+
| Configuration | RAM | GPU Memory |
|
| 95 |
+
|---------------|-----|------------|
|
| 96 |
+
| CPU Only | 2-4GB | 0GB |
|
| 97 |
+
| GPU Accelerated | 1-2GB | 2-6GB |
|
| 98 |
+
|
| 99 |
+
## Troubleshooting
|
| 100 |
+
|
| 101 |
+
### Common Issues
|
| 102 |
+
|
| 103 |
+
#### 1. "CUDA requested but not available"
|
| 104 |
+
```
|
| 105 |
+
β οΈ Warning: CUDA requested but not available, falling back to CPU
|
| 106 |
+
```
|
| 107 |
+
**Solution:** Install CUDA toolkit and PyTorch with CUDA support
|
| 108 |
+
|
| 109 |
+
#### 2. "Out of memory" errors
|
| 110 |
+
**Solutions:**
|
| 111 |
+
- Reduce model size (e.g., `tiny.en` β `base.en`)
|
| 112 |
+
- Set `USE_CUDA=false` to use CPU
|
| 113 |
+
- Close other GPU applications
|
| 114 |
+
|
| 115 |
+
#### 3. Models not loading on GPU
|
| 116 |
+
**Check:**
|
| 117 |
+
```python
|
| 118 |
+
import torch
|
| 119 |
+
print(f"CUDA available: {torch.cuda.is_available()}")
|
| 120 |
+
print(f"CUDA version: {torch.version.cuda}")
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Testing Your Setup
|
| 124 |
+
|
| 125 |
+
Run the comprehensive test:
|
| 126 |
+
```bash
|
| 127 |
+
python test_cuda.py
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
This will test:
|
| 131 |
+
- β
PyTorch CUDA detection
|
| 132 |
+
- β
Transformers device support
|
| 133 |
+
- β
Whisper model loading
|
| 134 |
+
- β
GPU memory availability
|
| 135 |
+
- β
Performance benchmark
|
| 136 |
+
|
| 137 |
+
### Debug Mode
|
| 138 |
+
|
| 139 |
+
For detailed device information, check the app startup:
|
| 140 |
+
```
|
| 141 |
+
π§ Configuration:
|
| 142 |
+
Device: CUDA
|
| 143 |
+
Compute type: float16
|
| 144 |
+
CUDA available: True
|
| 145 |
+
GPU: NVIDIA GeForce RTX 3080
|
| 146 |
+
GPU Memory: 10.0 GB
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
## Installation Examples
|
| 150 |
+
|
| 151 |
+
### Ubuntu/Linux with CUDA
|
| 152 |
+
```bash
|
| 153 |
+
# Install CUDA toolkit
|
| 154 |
+
sudo apt update
|
| 155 |
+
sudo apt install nvidia-cuda-toolkit
|
| 156 |
+
|
| 157 |
+
# Install PyTorch with CUDA
|
| 158 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 159 |
+
|
| 160 |
+
# Install app dependencies
|
| 161 |
+
pip install -r requirements.txt
|
| 162 |
+
|
| 163 |
+
# Configure for GPU
|
| 164 |
+
echo "USE_CUDA=true" > .env
|
| 165 |
+
|
| 166 |
+
# Test setup
|
| 167 |
+
python test_cuda.py
|
| 168 |
+
|
| 169 |
+
# Run app
|
| 170 |
+
python app.py
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
### Windows with CUDA
|
| 174 |
+
```bash
|
| 175 |
+
# Install CUDA toolkit from NVIDIA website
|
| 176 |
+
# https://developer.nvidia.com/cuda-downloads
|
| 177 |
+
|
| 178 |
+
# Install PyTorch with CUDA
|
| 179 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 180 |
+
|
| 181 |
+
# Install app dependencies
|
| 182 |
+
pip install -r requirements.txt
|
| 183 |
+
|
| 184 |
+
# Configure for GPU
|
| 185 |
+
echo USE_CUDA=true > .env
|
| 186 |
+
|
| 187 |
+
# Test setup
|
| 188 |
+
python test_cuda.py
|
| 189 |
+
|
| 190 |
+
# Run app
|
| 191 |
+
python app.py
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
### CPU-Only Installation
|
| 195 |
+
```bash
|
| 196 |
+
# Install PyTorch CPU version
|
| 197 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 198 |
+
|
| 199 |
+
# Install app dependencies
|
| 200 |
+
pip install -r requirements.txt
|
| 201 |
+
|
| 202 |
+
# Configure for CPU
|
| 203 |
+
echo "USE_CUDA=false" > .env
|
| 204 |
+
|
| 205 |
+
# Run app
|
| 206 |
+
python app.py
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
## Advanced Configuration
|
| 210 |
+
|
| 211 |
+
### Custom Device Settings
|
| 212 |
+
|
| 213 |
+
You can override device settings in code:
|
| 214 |
+
```python
|
| 215 |
+
# Force specific device
|
| 216 |
+
from components.transcriber import AudioProcessor
|
| 217 |
+
processor = AudioProcessor(model_size="base.en", device="cuda", compute_type="float16")
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
### Mixed Precision
|
| 221 |
+
|
| 222 |
+
GPU configurations automatically use optimal precision:
|
| 223 |
+
- **CPU:** `int8` quantization for speed
|
| 224 |
+
- **GPU:** `float16` for memory efficiency
|
| 225 |
+
|
| 226 |
+
### Multiple GPUs
|
| 227 |
+
|
| 228 |
+
For systems with multiple GPUs:
|
| 229 |
+
```python
|
| 230 |
+
# Use specific GPU
|
| 231 |
+
import os
|
| 232 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "1" # Use second GPU
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
## Performance Tuning
|
| 236 |
+
|
| 237 |
+
### For Maximum Speed (GPU)
|
| 238 |
+
```bash
|
| 239 |
+
USE_CUDA=true
|
| 240 |
+
```
|
| 241 |
+
- Use `base.en` or `small.en` Whisper model
|
| 242 |
+
- Ensure 4GB+ GPU memory available
|
| 243 |
+
- Close other GPU applications
|
| 244 |
+
|
| 245 |
+
### For Maximum Compatibility (CPU)
|
| 246 |
+
```bash
|
| 247 |
+
USE_CUDA=false
|
| 248 |
+
```
|
| 249 |
+
- Use `tiny.en` Whisper model
|
| 250 |
+
- Works on any system
|
| 251 |
+
- Lower memory requirements
|
| 252 |
+
|
| 253 |
+
### Balanced Performance
|
| 254 |
+
```bash
|
| 255 |
+
USE_CUDA=true # with fallback to CPU
|
| 256 |
+
```
|
| 257 |
+
- Use `base.en` Whisper model
|
| 258 |
+
- Automatic device detection
|
| 259 |
+
- Best of both worlds
|
| 260 |
+
|
| 261 |
+
## Support
|
| 262 |
+
|
| 263 |
+
### Getting Help
|
| 264 |
+
|
| 265 |
+
1. Run diagnostic test: `python test_cuda.py`
|
| 266 |
+
2. Check device info in app startup logs
|
| 267 |
+
3. Verify .env configuration
|
| 268 |
+
4. Test with minimal example
|
| 269 |
+
|
| 270 |
+
### Reporting Issues
|
| 271 |
+
|
| 272 |
+
Include this information:
|
| 273 |
+
- Output of `python test_cuda.py`
|
| 274 |
+
- Your `.env` file contents
|
| 275 |
+
- GPU model and memory
|
| 276 |
+
- Error messages from app startup
|
| 277 |
+
|
| 278 |
+
---
|
| 279 |
+
|
| 280 |
+
**Note:** CPU processing works perfectly for most use cases. GPU acceleration is optional for enhanced performance.
|
README.md
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Testing
|
| 3 |
-
emoji: π’
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.41.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/app.cpython-310.pyc
ADDED
|
Binary file (2.97 kB). View file
|
|
|
__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
__pycache__/gpt.cpython-310.pyc
ADDED
|
Binary file (622 Bytes). View file
|
|
|
__pycache__/transcriber.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/transcriber.cpython-310.pyc and b/__pycache__/transcriber.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -1,41 +1,54 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
processor = AudioProcessor(model_size="tiny.en", device="cpu")
|
| 8 |
|
| 9 |
# Adjust some settings for better quality
|
| 10 |
-
processor.min_process_length =
|
| 11 |
-
processor.process_interval = 1
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def process_mic_audio(audio):
|
| 14 |
"""Process audio from Gradio microphone and update transcription"""
|
| 15 |
if audio is None:
|
| 16 |
-
return gr.update(), gr.update()
|
| 17 |
|
| 18 |
sr, y = audio
|
| 19 |
|
| 20 |
# Add to processor and possibly trigger transcription
|
| 21 |
buffer_size = processor.add_audio(y, sr)
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
# Get current transcription
|
| 24 |
transcription = processor.get_transcription()
|
| 25 |
-
print(transcription)
|
| 26 |
-
transcription = str(transcription)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
# Return status update and
|
| 30 |
buffer_seconds = buffer_size / processor.sample_rate
|
| 31 |
return (
|
| 32 |
f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
|
| 33 |
-
transcription
|
|
|
|
| 34 |
)
|
| 35 |
|
| 36 |
def clear_audio_buffer():
|
| 37 |
"""Clear the audio buffer"""
|
| 38 |
-
return processor.clear_buffer(), gr.update(), ""
|
| 39 |
|
| 40 |
def get_current_buffer():
|
| 41 |
"""Get the current buffer for playback"""
|
|
@@ -43,12 +56,24 @@ def get_current_buffer():
|
|
| 43 |
|
| 44 |
def force_transcribe():
|
| 45 |
"""Force transcription of current buffer"""
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# Create Gradio interface
|
| 50 |
with gr.Blocks(title="Live Speech Transcription") as demo:
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
with gr.Row():
|
| 54 |
audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
|
|
@@ -63,19 +88,53 @@ with gr.Blocks(title="Live Speech Transcription") as demo:
|
|
| 63 |
force_btn = gr.Button("Force Transcribe")
|
| 64 |
|
| 65 |
with gr.Row():
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
# Connect components
|
| 69 |
audio_input.stream(
|
| 70 |
process_mic_audio,
|
| 71 |
audio_input,
|
| 72 |
-
[status_output, transcription_output]
|
| 73 |
)
|
| 74 |
|
| 75 |
-
clear_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
play_btn.click(get_current_buffer, None, buffer_audio)
|
| 77 |
-
force_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
# Launch the interface
|
| 81 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
+
import threading
|
| 4 |
+
import time
|
| 5 |
+
from components.transcriber import AudioProcessor
|
| 6 |
+
from components.gpt import gen_llm_response
|
| 7 |
+
from components.streaming import StreamingManager, create_streaming_interface
|
| 8 |
+
from config import config
|
| 9 |
|
| 10 |
+
# Create processor instance with configuration-based device settings
|
| 11 |
+
processor = AudioProcessor(model_size="base.en")
|
|
|
|
| 12 |
|
| 13 |
# Adjust some settings for better quality
|
| 14 |
+
processor.min_process_length = 1 * processor.sample_rate # Need at least 2 seconds before processing
|
| 15 |
+
processor.process_interval = 1 # Process at most every 1.5 seconds
|
| 16 |
+
|
| 17 |
+
# Create streaming manager
|
| 18 |
+
streaming_manager = StreamingManager(processor)
|
| 19 |
|
| 20 |
def process_mic_audio(audio):
|
| 21 |
"""Process audio from Gradio microphone and update transcription"""
|
| 22 |
if audio is None:
|
| 23 |
+
return gr.update(), gr.update(), gr.update()
|
| 24 |
|
| 25 |
sr, y = audio
|
| 26 |
|
| 27 |
# Add to processor and possibly trigger transcription
|
| 28 |
buffer_size = processor.add_audio(y, sr)
|
| 29 |
|
| 30 |
+
# Wait for any pending processing to complete before getting transcription
|
| 31 |
+
processor.wait_for_processing_complete(1.0)
|
| 32 |
+
|
| 33 |
# Get current transcription
|
| 34 |
transcription = processor.get_transcription()
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
# Send transcription to LLM and get response
|
| 37 |
+
llm_response = ""
|
| 38 |
+
if transcription and len(transcription) > 0:
|
| 39 |
+
llm_response = gen_llm_response(transcription)
|
| 40 |
|
| 41 |
+
# Return status update, original transcription, and LLM response
|
| 42 |
buffer_seconds = buffer_size / processor.sample_rate
|
| 43 |
return (
|
| 44 |
f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
|
| 45 |
+
transcription,
|
| 46 |
+
llm_response
|
| 47 |
)
|
| 48 |
|
| 49 |
def clear_audio_buffer():
|
| 50 |
"""Clear the audio buffer"""
|
| 51 |
+
return processor.clear_buffer(), gr.update(), "", ""
|
| 52 |
|
| 53 |
def get_current_buffer():
|
| 54 |
"""Get the current buffer for playback"""
|
|
|
|
| 56 |
|
| 57 |
def force_transcribe():
|
| 58 |
"""Force transcription of current buffer"""
|
| 59 |
+
# Force complete processing of all remaining audio
|
| 60 |
+
transcription = processor.force_complete_processing()
|
| 61 |
+
|
| 62 |
+
# Send to LLM and get response
|
| 63 |
+
llm_response = ""
|
| 64 |
+
if transcription and len(transcription) > 0:
|
| 65 |
+
llm_response = gen_llm_response(transcription)
|
| 66 |
+
|
| 67 |
+
return transcription, llm_response
|
| 68 |
|
| 69 |
# Create Gradio interface
|
| 70 |
with gr.Blocks(title="Live Speech Transcription") as demo:
|
| 71 |
+
device_info = config.get_device_info()
|
| 72 |
+
device_status = f"π₯οΈ **Device:** {device_info['device'].upper()}"
|
| 73 |
+
if device_info['cuda_available'] and device_info['device'] == 'cuda':
|
| 74 |
+
device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"
|
| 75 |
+
|
| 76 |
+
gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")
|
| 77 |
|
| 78 |
with gr.Row():
|
| 79 |
audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
|
|
|
|
| 88 |
force_btn = gr.Button("Force Transcribe")
|
| 89 |
|
| 90 |
with gr.Row():
|
| 91 |
+
with gr.Column():
|
| 92 |
+
transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
|
| 93 |
+
with gr.Column():
|
| 94 |
+
llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)
|
| 95 |
+
|
| 96 |
+
# Create streaming interface
|
| 97 |
+
streaming_components = create_streaming_interface(streaming_manager)
|
| 98 |
|
| 99 |
+
# Connect main interface components
|
| 100 |
audio_input.stream(
|
| 101 |
process_mic_audio,
|
| 102 |
audio_input,
|
| 103 |
+
[status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
|
| 104 |
)
|
| 105 |
|
| 106 |
+
clear_btn.click(
|
| 107 |
+
clear_audio_buffer,
|
| 108 |
+
None,
|
| 109 |
+
[status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
|
| 110 |
+
)
|
| 111 |
play_btn.click(get_current_buffer, None, buffer_audio)
|
| 112 |
+
force_btn.click(
|
| 113 |
+
force_transcribe,
|
| 114 |
+
None,
|
| 115 |
+
[streaming_components['transcription_output'], streaming_components['llm_output']]
|
| 116 |
+
)
|
| 117 |
|
| 118 |
if __name__ == "__main__":
|
| 119 |
+
print("π€ Live Speech Transcription App with LLM")
|
| 120 |
+
print("=" * 40)
|
| 121 |
+
|
| 122 |
+
# Display device configuration
|
| 123 |
+
device_info = config.get_device_info()
|
| 124 |
+
print("π§ Configuration:")
|
| 125 |
+
print(f" Device: {device_info['device'].upper()}")
|
| 126 |
+
print(f" Compute type: {device_info['compute_type']}")
|
| 127 |
+
print(f" CUDA available: {device_info['cuda_available']}")
|
| 128 |
+
if device_info['cuda_available'] and device_info['device'] == 'cuda':
|
| 129 |
+
print(f" GPU: {device_info.get('cuda_device_name', 'Unknown')}")
|
| 130 |
+
memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3)
|
| 131 |
+
print(f" GPU Memory: {memory_gb:.1f} GB")
|
| 132 |
+
|
| 133 |
+
print("\nFeatures:")
|
| 134 |
+
print("β’ Real-time microphone transcription")
|
| 135 |
+
print("β’ Audio buffer playback")
|
| 136 |
+
print("β’ LLM responses displayed in UI")
|
| 137 |
+
print("β’ RoBERTa+ hybrid question detection")
|
| 138 |
+
|
| 139 |
# Launch the interface
|
| 140 |
demo.launch()
|
components/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Components package for the Live Speech Transcription App.
|
| 3 |
+
|
| 4 |
+
This package contains modular components for:
|
| 5 |
+
- Audio transcription (transcriber.py)
|
| 6 |
+
- GPT/LLM processing (gpt.py)
|
| 7 |
+
- Audio streaming functionality (streaming.py)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from .transcriber import AudioProcessor
|
| 11 |
+
from .gpt import gen_llm_response, detect_question
|
| 12 |
+
from .streaming import StreamingManager, create_streaming_interface
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
'AudioProcessor',
|
| 16 |
+
'gen_llm_response',
|
| 17 |
+
'detect_question',
|
| 18 |
+
'StreamingManager',
|
| 19 |
+
'create_streaming_interface'
|
| 20 |
+
]
|
components/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (636 Bytes). View file
|
|
|
components/__pycache__/gpt.cpython-310.pyc
ADDED
|
Binary file (3.22 kB). View file
|
|
|
components/__pycache__/streaming.cpython-310.pyc
ADDED
|
Binary file (6.1 kB). View file
|
|
|
components/__pycache__/transcriber.cpython-310.pyc
ADDED
|
Binary file (8.61 kB). View file
|
|
|
components/gpt.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
from config import config
|
| 7 |
+
|
| 8 |
+
# Initialize the pipeline with RoBERTa for better accuracy on edge cases
|
| 9 |
+
# Using a proven RoBERTa model for text classification with device config
|
| 10 |
+
device = config.get_transformers_device()
|
| 11 |
+
pipe = pipeline("text-classification", model="roberta-base", device=device)
|
| 12 |
+
print(f"RoBERTa model initialized on device: {config.device}")
|
| 13 |
+
|
| 14 |
+
def rule_based_question_detection(text):
|
| 15 |
+
"""Fast rule-based question detection for obvious cases"""
|
| 16 |
+
if not text or not isinstance(text, str):
|
| 17 |
+
return None
|
| 18 |
+
|
| 19 |
+
text = text.strip()
|
| 20 |
+
|
| 21 |
+
# Question words at the beginning
|
| 22 |
+
question_words = [
|
| 23 |
+
'what', 'when', 'where', 'who', 'whom', 'whose', 'why', 'how',
|
| 24 |
+
'which', 'can', 'could', 'would', 'should', 'will', 'shall',
|
| 25 |
+
'do', 'does', 'did', 'is', 'are', 'am', 'was', 'were',
|
| 26 |
+
'have', 'has', 'had'
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
first_word = text.lower().split()[0] if text.split() else ""
|
| 30 |
+
|
| 31 |
+
# Clear question indicators
|
| 32 |
+
if text.endswith('?'):
|
| 33 |
+
return "QUESTION"
|
| 34 |
+
elif first_word in question_words:
|
| 35 |
+
return "QUESTION"
|
| 36 |
+
elif text.endswith('.') or text.endswith('!'):
|
| 37 |
+
return "STATEMENT"
|
| 38 |
+
|
| 39 |
+
# If unclear, return None to use ML model
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
def classify_single_text(text):
|
| 43 |
+
"""Classify a single text string"""
|
| 44 |
+
text = text.strip()
|
| 45 |
+
|
| 46 |
+
# Try rule-based first (faster)
|
| 47 |
+
rule_result = rule_based_question_detection(text)
|
| 48 |
+
if rule_result:
|
| 49 |
+
return f"'{text}' β {rule_result} (rule-based)"
|
| 50 |
+
|
| 51 |
+
# Fall back to ML model for unclear cases
|
| 52 |
+
try:
|
| 53 |
+
ml_result = pipe(text)
|
| 54 |
+
# Convert to string to avoid type issues
|
| 55 |
+
result_str = str(ml_result)
|
| 56 |
+
|
| 57 |
+
# For RoBERTa base model, use structural analysis as the primary method
|
| 58 |
+
# since it's a general model, not specifically trained for question classification
|
| 59 |
+
|
| 60 |
+
# Enhanced structural analysis for edge cases
|
| 61 |
+
text_lower = text.lower().strip()
|
| 62 |
+
|
| 63 |
+
# Check for auxiliary verb patterns (strong question indicators)
|
| 64 |
+
aux_verbs_start = ['do', 'does', 'did', 'can', 'could', 'will', 'would', 'should', 'may', 'might', 'must']
|
| 65 |
+
be_verbs_start = ['is', 'are', 'am', 'was', 'were']
|
| 66 |
+
have_verbs_start = ['have', 'has', 'had']
|
| 67 |
+
|
| 68 |
+
# Question patterns
|
| 69 |
+
if any(text_lower.startswith(word + ' ') for word in aux_verbs_start + be_verbs_start + have_verbs_start):
|
| 70 |
+
simple_label = "QUESTION"
|
| 71 |
+
elif text_lower.startswith(('tell me', 'let me know', 'i wonder')):
|
| 72 |
+
simple_label = "QUESTION"
|
| 73 |
+
elif ' whether ' in text_lower or ((' or ' in text_lower) and any(text_lower.startswith(word) for word in aux_verbs_start + be_verbs_start + have_verbs_start)):
|
| 74 |
+
# Choice questions (only when starting with question words)
|
| 75 |
+
simple_label = "QUESTION"
|
| 76 |
+
elif text_lower.startswith('either ') and ' or ' in text_lower:
|
| 77 |
+
# Either...or statements are typically declarative
|
| 78 |
+
simple_label = "STATEMENT"
|
| 79 |
+
elif text.count(' ') >= 2 and not any(text_lower.startswith(word) for word in ['the', 'this', 'that', 'it', 'i', 'you', 'we', 'they', 'either']):
|
| 80 |
+
# Longer phrases not starting with typical statement words might be questions
|
| 81 |
+
simple_label = "QUESTION"
|
| 82 |
+
else:
|
| 83 |
+
# Default to statement for declarative patterns
|
| 84 |
+
simple_label = "STATEMENT"
|
| 85 |
+
|
| 86 |
+
return f"'{text}' β {simple_label} (RoBERTa+)"
|
| 87 |
+
|
| 88 |
+
except Exception as e:
|
| 89 |
+
return f"'{text}' β ERROR: {str(e)}"
|
| 90 |
+
|
| 91 |
+
def classify_statement_question(text):
|
| 92 |
+
"""Enhanced classification combining rule-based and ML approaches"""
|
| 93 |
+
if not text:
|
| 94 |
+
return "No text to analyze"
|
| 95 |
+
|
| 96 |
+
# Handle both string and list inputs
|
| 97 |
+
if isinstance(text, list):
|
| 98 |
+
results = []
|
| 99 |
+
for i, sentence in enumerate(text):
|
| 100 |
+
if sentence and str(sentence).strip():
|
| 101 |
+
classification = classify_single_text(str(sentence))
|
| 102 |
+
results.append(f"Sentence {i+1}: {classification}")
|
| 103 |
+
return "\n".join(results) if results else "No valid sentences"
|
| 104 |
+
else:
|
| 105 |
+
return classify_single_text(text)
|
| 106 |
+
|
| 107 |
+
def detect_question(text):
|
| 108 |
+
"""Legacy function for backward compatibility"""
|
| 109 |
+
return classify_statement_question(text)
|
| 110 |
+
|
| 111 |
+
def gen_llm_response(text):
|
| 112 |
+
"""Generate LLM response for the given transcription"""
|
| 113 |
+
return classify_statement_question(text)
|
components/streaming.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import numpy as np
|
| 3 |
+
import librosa
|
| 4 |
+
import time
|
| 5 |
+
from typing import Dict, Any, Optional, Tuple
|
| 6 |
+
from .gpt import gen_llm_response
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class StreamingManager:
    """Manages audio file streaming functionality for testing purposes.

    Simulates live microphone input by slicing an uploaded audio file into
    0.5 s chunks and drip-feeding them to the shared AudioProcessor, so the
    transcription + LLM pipeline can be exercised without real audio capture.
    """

    def __init__(self, processor):
        """Initialize streaming manager with audio processor.

        Args:
            processor: AudioProcessor instance that buffers audio and
                produces incremental transcriptions.
        """
        self.processor = processor
        # Per-session streaming state; reset by start_file_streaming_test().
        self.streaming_data = {
            'active': False,        # True while a file is being streamed
            'audio_data': None,     # full waveform loaded via librosa
            'sr': None,             # sample rate of the loaded file
            'chunk_index': 0,       # next chunk to feed to the processor
            'total_chunks': 0,
            'chunk_duration': 0.5,  # seconds of audio per chunk
            'chunk_size': 0         # samples per chunk
        }

        # Store original processor settings for restoration
        # (start_file_streaming_test() temporarily tightens them).
        self.original_min_process_length = processor.min_process_length
        self.original_process_interval = processor.process_interval

    def start_file_streaming_test(self, audio_file: str) -> Tuple[str, str, str]:
        """Start streaming an audio file in chunks.

        Args:
            audio_file: Filesystem path of the uploaded audio file, or None.

        Returns:
            (status message, transcription, llm response); the latter two
            are empty at start time.
        """
        if audio_file is None:
            return "Please upload an audio file first", "", ""

        try:
            # Clear buffer and reset state
            self.processor.clear_buffer()

            # Adjust processor settings for streaming test so partial results
            # appear quickly while chunks are drip-fed.
            self.processor.min_process_length = 0.5 * self.processor.sample_rate  # Process every 0.5 seconds
            self.processor.process_interval = 0.3  # Check for processing every 0.3 seconds

            # Load audio file at its native sample rate (sr=None).
            audio_data, sr = librosa.load(audio_file, sr=None)

            # Calculate chunks; a trailing partial chunk counts as one chunk.
            chunk_duration = 0.5  # 0.5 second chunks
            chunk_size = int(chunk_duration * sr)
            total_chunks = len(audio_data) // chunk_size + (1 if len(audio_data) % chunk_size > 0 else 0)

            # Store streaming data
            self.streaming_data.update({
                'active': True,
                'audio_data': audio_data,
                'sr': sr,
                'chunk_index': 0,
                'total_chunks': total_chunks,
                'chunk_duration': chunk_duration,
                'chunk_size': chunk_size
            })

            return f"Started streaming {len(audio_data)/sr:.1f}s audio file in {total_chunks} chunks", "", ""

        except Exception as e:
            return f"Error loading audio file: {e}", "", ""

    def stop_file_streaming_test(self) -> Tuple[str, str, str]:
        """Stop the streaming test early and flush all pending audio.

        Restores the processor's original timing settings, then forces the
        processor to transcribe whatever audio is still buffered.

        Returns:
            (status message, final transcription, llm response).
        """
        self.streaming_data['active'] = False

        # Restore original processor settings
        self.processor.min_process_length = self.original_min_process_length
        self.processor.process_interval = self.original_process_interval

        # Force complete processing of all remaining audio
        final_transcription = self.processor.force_complete_processing()
        llm_response = ""
        if final_transcription and len(final_transcription) > 0:
            llm_response = gen_llm_response(final_transcription)

        return "Streaming stopped", final_transcription, llm_response

    def update_streaming_test(self) -> Tuple[str, str, str]:
        """Update function called periodically during streaming.

        Feeds exactly one chunk per call; once every chunk has been fed it
        flushes the processor, restores its settings, and produces the final
        transcription and LLM response.

        Returns:
            (status message, transcription so far, llm response).
        """
        if not self.streaming_data['active']:
            current_transcription = self.processor.get_transcription()
            return "Not streaming", current_transcription, ""

        try:
            # Check if we've processed all chunks
            if self.streaming_data['chunk_index'] >= self.streaming_data['total_chunks']:
                # Finished streaming
                self.streaming_data['active'] = False

                # Force complete processing of all remaining audio
                final_transcription = self.processor.force_complete_processing()

                # Restore settings after processing is complete
                self.processor.min_process_length = self.original_min_process_length
                self.processor.process_interval = self.original_process_interval

                # Send final transcription to LLM and get response
                llm_response = ""
                if final_transcription and len(final_transcription) > 0:
                    llm_response = gen_llm_response(final_transcription)

                return f"Streaming complete! Processed {self.streaming_data['total_chunks']} chunks", str(final_transcription), llm_response

            # Get current chunk info
            chunk_size = self.streaming_data['chunk_size']
            current_chunk = self.streaming_data['chunk_index']
            start_idx = current_chunk * chunk_size
            end_idx = min((current_chunk + 1) * chunk_size, len(self.streaming_data['audio_data']))

            # Extract and process chunk
            chunk = self.streaming_data['audio_data'][start_idx:end_idx]

            # Add chunk to processor
            buffer_size = self.processor.add_audio(chunk, self.streaming_data['sr'])

            # Wait for any pending processing to complete before getting transcription
            self.processor.wait_for_processing_complete(2.0)

            # Get current transcription
            transcription = self.processor.get_transcription()

            # Send transcription to LLM and get response (for real-time updates)
            llm_response = ""
            if transcription and len(transcription) > 0:
                llm_response = gen_llm_response(transcription)

            # Update status
            buffer_seconds = buffer_size / self.processor.sample_rate
            status = f"Chunk {current_chunk+1}/{self.streaming_data['total_chunks']} | Buffer: {buffer_seconds:.1f}s | Processed: {self.processor.processed_length/self.processor.sample_rate:.1f}s"

            # Move to next chunk
            self.streaming_data['chunk_index'] += 1

            # Check if this was the last chunk
            if self.streaming_data['chunk_index'] >= self.streaming_data['total_chunks']:
                print(f"✅ All {self.streaming_data['total_chunks']} chunks processed!")

            return status, str(transcription), llm_response

        except Exception as e:
            # Any failure aborts the stream so the UI timer can stop.
            self.streaming_data['active'] = False
            return f"Streaming error: {e}", "", ""

    def is_active(self) -> bool:
        """Check if streaming is currently active."""
        return self.streaming_data['active']

    def get_streaming_data(self) -> Dict[str, Any]:
        """Get a shallow copy of the current streaming state."""
        return self.streaming_data.copy()
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def create_streaming_interface(streaming_manager: StreamingManager) -> Dict[str, Any]:
    """Create Gradio interface components for streaming functionality.

    Lays out the file-upload / start / stop controls plus the live output
    boxes, wires them to *streaming_manager*, and returns every component
    keyed by its canonical name so callers can reference them later.
    """
    with gr.Row():
        audio_upload = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File for Testing")

    with gr.Row():
        start_button = gr.Button("🎵 Start Streaming Test", variant="primary")
        stop_button = gr.Button("⏹️ Stop Streaming", variant="stop")

    with gr.Row():
        status_box = gr.Textbox(label="Streaming Status", interactive=False, placeholder="Upload an audio file and click 'Start Streaming Test'")

    with gr.Row():
        with gr.Column():
            transcript_box = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_box = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Polls for new chunks twice a second while a stream is running.
    poll_timer = gr.Timer(value=0.5, active=False)

    def _handle_start(audio_file):
        # Kick off the stream; the timer is enabled only when the stream
        # actually started (is_active() is False on load errors).
        status, transcription, llm_response = streaming_manager.start_file_streaming_test(audio_file)
        return status, transcription, llm_response, gr.Timer(active=streaming_manager.is_active())

    def _handle_stop():
        status, transcription, llm_response = streaming_manager.stop_file_streaming_test()
        return status, transcription, llm_response, gr.Timer(active=False)

    def _handle_tick():
        status, transcription, llm_response = streaming_manager.update_streaming_test()
        # Keep the timer alive only while chunks remain to be fed.
        return status, transcription, llm_response, gr.Timer(active=streaming_manager.is_active())

    start_button.click(
        _handle_start,
        inputs=[audio_upload],
        outputs=[status_box, transcript_box, llm_box, poll_timer]
    )

    stop_button.click(
        _handle_stop,
        outputs=[status_box, transcript_box, llm_box, poll_timer]
    )

    # Timer tick updates with automatic deactivation when done.
    poll_timer.tick(
        _handle_tick,
        outputs=[status_box, transcript_box, llm_box, poll_timer]
    )

    return {
        'test_audio_file': audio_upload,
        'test_stream_btn': start_button,
        'test_stop_btn': stop_button,
        'test_status': status_box,
        'transcription_output': transcript_box,
        'llm_output': llm_box,
        'streaming_timer': poll_timer
    }
|
components/struct.json
ADDED
|
File without changes
|
transcriber.py β components/transcriber.py
RENAMED
|
@@ -5,9 +5,14 @@ from faster_whisper import WhisperModel
|
|
| 5 |
import scipy.signal as signal
|
| 6 |
from typing import List
|
| 7 |
from punctuators.models import SBDModelONNX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class AudioProcessor:
|
| 10 |
-
def __init__(self, model_size="tiny.en", device=
|
| 11 |
"""Initialize the audio processor with configurable parameters"""
|
| 12 |
self.audio_buffer = np.array([]) # Stores raw audio for playback
|
| 13 |
self.processed_length = 0 # Length of audio already processed
|
|
@@ -17,18 +22,27 @@ class AudioProcessor:
|
|
| 17 |
self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
|
| 18 |
self.overlap_size = 3 * self.sample_rate # Keep 3 seconds of overlap when trimming
|
| 19 |
self.last_process_time = time.time()
|
| 20 |
-
self.process_interval =
|
| 21 |
self.is_processing = False # Flag to prevent concurrent processing
|
| 22 |
|
| 23 |
self.full_transcription = "" # Complete history of transcription
|
| 24 |
self.last_segment_text = "" # Last segment that was transcribed
|
| 25 |
self.confirmed_transcription = "" # Transcription that won't change (beyond overlap zone)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Initialize the whisper model
|
| 28 |
self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
|
| 29 |
-
print(f"Initialized {model_size} model on {device}")
|
| 30 |
|
|
|
|
| 31 |
self.sentence_end_detect = SBDModelONNX.from_pretrained("sbd_multi_lang")
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def _trim_buffer_intelligently(self):
|
| 34 |
"""
|
|
@@ -259,10 +273,34 @@ class AudioProcessor:
|
|
| 259 |
self.last_process_time = time.time()
|
| 260 |
self.is_processing = True
|
| 261 |
# Process in a separate thread
|
| 262 |
-
threading.Thread(target=self._process_audio_chunk, daemon=
|
| 263 |
|
| 264 |
return len(self.audio_buffer)
|
| 265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
def clear_buffer(self):
|
| 267 |
"""Clear the audio buffer and transcription"""
|
| 268 |
with self.lock:
|
|
|
|
| 5 |
import scipy.signal as signal
|
| 6 |
from typing import List
|
| 7 |
from punctuators.models import SBDModelONNX
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
from config import config
|
| 12 |
+
|
| 13 |
|
| 14 |
class AudioProcessor:
|
| 15 |
+
def __init__(self, model_size="tiny.en", device=None, compute_type=None):
|
| 16 |
"""Initialize the audio processor with configurable parameters"""
|
| 17 |
self.audio_buffer = np.array([]) # Stores raw audio for playback
|
| 18 |
self.processed_length = 0 # Length of audio already processed
|
|
|
|
| 22 |
self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
|
| 23 |
self.overlap_size = 3 * self.sample_rate # Keep 3 seconds of overlap when trimming
|
| 24 |
self.last_process_time = time.time()
|
| 25 |
+
self.process_interval = 0.5 # Process every 1 second
|
| 26 |
self.is_processing = False # Flag to prevent concurrent processing
|
| 27 |
|
| 28 |
self.full_transcription = "" # Complete history of transcription
|
| 29 |
self.last_segment_text = "" # Last segment that was transcribed
|
| 30 |
self.confirmed_transcription = "" # Transcription that won't change (beyond overlap zone)
|
| 31 |
|
| 32 |
+
# Use config for device and compute type if not specified
|
| 33 |
+
if device is None or compute_type is None:
|
| 34 |
+
whisper_config = config.get_whisper_config()
|
| 35 |
+
device = device or whisper_config["device"]
|
| 36 |
+
compute_type = compute_type or whisper_config["compute_type"]
|
| 37 |
+
|
| 38 |
# Initialize the whisper model
|
| 39 |
self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
|
| 40 |
+
print(f"Initialized {model_size} model on {device} with {compute_type}")
|
| 41 |
|
| 42 |
+
# Initialize sentence boundary detection with device config
|
| 43 |
self.sentence_end_detect = SBDModelONNX.from_pretrained("sbd_multi_lang")
|
| 44 |
+
if config.device == "cuda":
|
| 45 |
+
print("SBD model initialized with CUDA support")
|
| 46 |
|
| 47 |
def _trim_buffer_intelligently(self):
|
| 48 |
"""
|
|
|
|
| 273 |
self.last_process_time = time.time()
|
| 274 |
self.is_processing = True
|
| 275 |
# Process in a separate thread
|
| 276 |
+
threading.Thread(target=self._process_audio_chunk, daemon=False).start()
|
| 277 |
|
| 278 |
return len(self.audio_buffer)
|
| 279 |
|
| 280 |
+
def wait_for_processing_complete(self, timeout=5.0):
    """Block until background transcription finishes or *timeout* elapses.

    Polls the ``is_processing`` flag every 50 ms.

    Args:
        timeout: Maximum number of seconds to wait.

    Returns:
        True if processing is idle on return, False if the timeout expired
        while work was still in flight.
    """
    deadline = time.time() + timeout
    while self.is_processing:
        if time.time() >= deadline:
            break
        time.sleep(0.05)
    return not self.is_processing
|
| 286 |
+
|
| 287 |
+
def force_complete_processing(self):
    """Force completion of any pending processing - ensures sequential execution.

    Drains the pipeline: waits for the in-flight chunk to finish, then
    synchronously transcribes whatever audio remains in the buffer, and
    waits once more before returning the accumulated transcription.

    Returns:
        The transcription text produced by ``get_transcription()``.
    """
    # Wait for any current processing to complete
    self.wait_for_processing_complete(10.0)

    # Process any remaining audio in buffer
    with self.lock:
        if len(self.audio_buffer) > self.processed_length:
            # Force process remaining audio synchronously (no thread).
            self.is_processing = True
            # NOTE(review): _process_audio_chunk runs here while self.lock is
            # held — if it acquires the same lock internally this deadlocks
            # unless self.lock is an RLock. TODO confirm.
            self._process_audio_chunk()

    # Final wait to ensure everything is complete
    self.wait_for_processing_complete(2.0)

    return self.get_transcription()
|
| 303 |
+
|
| 304 |
def clear_buffer(self):
|
| 305 |
"""Clear the audio buffer and transcription"""
|
| 306 |
with self.lock:
|
config.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
# Load environment variables from .env file
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
class Config:
    """Configuration class for device and model settings.

    Resolves the USE_CUDA environment flag (loaded from .env by the
    module-level load_dotenv() call) against actual CUDA availability and
    exposes the device / compute-type settings every model in the app uses.
    """

    # Case-insensitive spellings accepted as "enable CUDA".
    _TRUTHY = {"true", "1", "yes", "on"}

    def __init__(self):
        """Read USE_CUDA from the environment and resolve device settings."""
        # Generalized truthy parsing: accepts 'true', '1', 'yes', 'on'
        # (previously only the exact string 'true' enabled CUDA). Anything
        # else — including unset — means CPU, so existing configs behave
        # identically.
        self.use_cuda = os.getenv('USE_CUDA', 'false').strip().lower() in self._TRUTHY

        # Determine device based on CUDA availability and config
        self.device = self._get_device()

        # Set compute type based on device
        self.compute_type = self._get_compute_type()

        print(f"🔧 Config initialized:")
        print(f"   USE_CUDA environment variable: {os.getenv('USE_CUDA', 'false')}")
        print(f"   CUDA available: {torch.cuda.is_available()}")
        print(f"   Selected device: {self.device}")
        print(f"   Compute type: {self.compute_type}")

    def _get_device(self):
        """Return 'cuda' only when requested AND available, else 'cpu'."""
        if self.use_cuda and torch.cuda.is_available():
            return "cuda"
        elif self.use_cuda and not torch.cuda.is_available():
            # Requested but impossible: warn and degrade gracefully.
            print("⚠️ Warning: CUDA requested but not available, falling back to CPU")
            return "cpu"
        else:
            return "cpu"

    def _get_compute_type(self):
        """Get the appropriate faster-whisper compute type for the device."""
        if self.device == "cuda":
            return "float16"  # More efficient for CUDA
        else:
            return "int8"  # More efficient for CPU

    def get_whisper_config(self):
        """Get the keyword configuration dict for WhisperModel."""
        return {
            "device": self.device,
            "compute_type": self.compute_type
        }

    def get_transformers_device(self):
        """Device index for transformers pipelines: 0 = first GPU, -1 = CPU."""
        if self.device == "cuda":
            return 0  # Use first CUDA device
        else:
            return -1  # Use CPU

    def get_device_info(self):
        """Return a dict describing the resolved device configuration.

        Includes CUDA device count/name/memory only when CUDA is available.
        """
        info = {
            "device": self.device,
            "compute_type": self.compute_type,
            "cuda_available": torch.cuda.is_available(),
            "use_cuda_requested": self.use_cuda
        }

        if torch.cuda.is_available():
            info.update({
                "cuda_device_count": torch.cuda.device_count(),
                "cuda_device_name": torch.cuda.get_device_name(0) if torch.cuda.device_count() > 0 else None,
                "cuda_memory_total": torch.cuda.get_device_properties(0).total_memory if torch.cuda.device_count() > 0 else None
            })

        return info

# Create global config instance
config = Config()
|
nodemon.json
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"watch": [
|
| 3 |
-
"*.py",
|
| 4 |
-
"**/*.py"
|
| 5 |
-
],
|
| 6 |
"ext": "py",
|
| 7 |
"ignore": [
|
| 8 |
"__pycache__/",
|
|
@@ -14,7 +11,7 @@
|
|
| 14 |
".pytest_cache/",
|
| 15 |
"*.log"
|
| 16 |
],
|
| 17 |
-
"exec": "python3
|
| 18 |
"env": {
|
| 19 |
"PYTHONPATH": ".",
|
| 20 |
"PYTHONUNBUFFERED": "1"
|
|
|
|
| 1 |
{
|
| 2 |
+
"watch": ["*.py", "**/*.py"],
|
|
|
|
|
|
|
|
|
|
| 3 |
"ext": "py",
|
| 4 |
"ignore": [
|
| 5 |
"__pycache__/",
|
|
|
|
| 11 |
".pytest_cache/",
|
| 12 |
"*.log"
|
| 13 |
],
|
| 14 |
+
"exec": "python3 app.py",
|
| 15 |
"env": {
|
| 16 |
"PYTHONPATH": ".",
|
| 17 |
"PYTHONUNBUFFERED": "1"
|
requirements.txt
CHANGED
|
@@ -1,11 +1,38 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
#
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies for speech transcription
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
numpy>=1.21.0
|
| 4 |
+
scipy>=1.7.0
|
| 5 |
+
|
| 6 |
+
# Speech processing
|
| 7 |
+
faster-whisper>=0.9.0
|
| 8 |
+
librosa>=0.9.0
|
| 9 |
+
|
| 10 |
+
# ML models and transformers
|
| 11 |
+
transformers>=4.20.0
|
| 12 |
+
torch>=1.12.0
|
| 13 |
+
tokenizers>=0.13.0
|
| 14 |
+
|
| 15 |
+
# Question classification and sentence boundary detection
|
| 16 |
+
punctuators>=0.1.0
|
| 17 |
+
|
| 18 |
+
# Environment configuration
|
| 19 |
+
python-dotenv>=0.19.0
|
| 20 |
+
|
| 21 |
+
# Optional CUDA support (install manually if needed)
|
| 22 |
+
# torchaudio>=0.12.0  # For CUDA audio processing (PyPI name is 'torchaudio'; 'torch-audio' does not exist)
|
| 23 |
+
# torchaudio>=0.12.0 # Alternative audio processing
|
| 24 |
+
|
| 25 |
+
# Development dependencies (optional)
|
| 26 |
+
# jupyter>=1.0.0
|
| 27 |
+
# matplotlib>=3.5.0
|
| 28 |
+
# seaborn>=0.11.0
|
| 29 |
+
|
| 30 |
+
# System dependencies
|
| 31 |
+
# Note: Some packages may require additional system libraries:
|
| 32 |
+
# - For audio processing: libsndfile, ffmpeg
|
| 33 |
+
# - For CUDA: CUDA toolkit, cuDNN
|
| 34 |
+
#
|
| 35 |
+
# Installation notes:
|
| 36 |
+
# 1. For CPU-only: pip install -r requirements.txt
|
| 37 |
+
# 2. For CUDA: Install PyTorch with CUDA support first, then: pip install -r requirements.txt
|
| 38 |
+
# 3. Create .env file with USE_CUDA=true for GPU acceleration
|
test_cuda.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CUDA Test Script for Speech Transcription App
|
| 4 |
+
|
| 5 |
+
This script helps users verify their CUDA setup and test performance
|
| 6 |
+
between CPU and GPU configurations.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python test_cuda.py
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
import torch
|
| 16 |
+
import numpy as np
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
def print_header(title):
    """Print *title* framed above and below by a 60-character '=' rule."""
    rule = "=" * 60
    print("\n" + rule)
    print(f"  {title}")
    print(rule)
|
| 24 |
+
|
| 25 |
+
def print_section(title):
    """Print a section banner: emoji + *title* over a 40-char dashed rule."""
    print(f"\n📋 {title}")
    print("-" * 40)
|
| 29 |
+
|
| 30 |
+
def test_pytorch_cuda():
    """Print PyTorch/CUDA diagnostics; return True iff CUDA is available."""
    print_section("PyTorch CUDA Test")

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    # Guard clause: nothing more to report without CUDA.
    if not torch.cuda.is_available():
        print("❌ CUDA not available")
        return False

    print(f"CUDA version: {torch.version.cuda}")
    print(f"cuDNN version: {torch.backends.cudnn.version()}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")

    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        print(f"Device {idx}: {props.name}")
        print(f"  Memory: {props.total_memory / 1e9:.1f} GB")
        print(f"  Compute capability: {props.major}.{props.minor}")

    return True
|
| 52 |
+
|
| 53 |
+
def test_transformers_device():
    """Load a small text-classification pipeline on CPU (and GPU if present),
    timing each load+inference run; return True on success.

    Requires network access to download the model on first run.
    """
    print_section("Transformers Device Test")

    try:
        from transformers import pipeline

        # CPU run (device=-1 selects CPU in transformers).
        print("Testing CPU pipeline...")
        began = time.time()
        cpu_pipe = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device=-1)
        cpu_result = cpu_pipe("This is a test sentence")
        cpu_elapsed = time.time() - began
        print(f"✅ CPU pipeline loaded in {cpu_elapsed:.2f}s")
        print(f"Result: {cpu_result}")

        if torch.cuda.is_available():
            # GPU run (device=0 selects the first CUDA device).
            print("\nTesting CUDA pipeline...")
            began = time.time()
            gpu_pipe = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device=0)
            gpu_result = gpu_pipe("This is a test sentence")
            gpu_elapsed = time.time() - began
            print(f"✅ CUDA pipeline loaded in {gpu_elapsed:.2f}s")
            print(f"Result: {gpu_result}")

            ratio = cpu_elapsed / gpu_elapsed if gpu_elapsed > 0 else 0
            print(f"\n🚀 Speedup: {ratio:.2f}x faster with CUDA")

        return True

    except Exception as e:
        print(f"❌ Error testing transformers: {e}")
        return False
|
| 87 |
+
|
| 88 |
+
def test_whisper_models():
    """Load the tiny.en faster-whisper model on CPU (and CUDA if present),
    timing each load; return True when every attempted load succeeds."""
    print_section("Whisper Model Test")

    try:
        from faster_whisper import WhisperModel

        print("Testing Whisper on CPU...")
        began = time.time()
        _cpu_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
        cpu_elapsed = time.time() - began
        print(f"✅ CPU model loaded in {cpu_elapsed:.2f}s")

        if torch.cuda.is_available():
            print("\nTesting Whisper on CUDA...")
            began = time.time()
            try:
                _gpu_model = WhisperModel("tiny.en", device="cuda", compute_type="float16")
                gpu_elapsed = time.time() - began
                print(f"✅ CUDA model loaded in {gpu_elapsed:.2f}s")

                ratio = cpu_elapsed / gpu_elapsed if gpu_elapsed > 0 else 0
                print(f"🚀 Load speedup: {ratio:.2f}x faster with CUDA")

            except Exception as e:
                # CUDA load failures (e.g. missing cuDNN) are fatal here.
                print(f"❌ Error loading CUDA model: {e}")
                return False

        return True

    except ImportError:
        print("❌ faster-whisper not installed")
        return False
    except Exception as e:
        print(f"❌ Error testing Whisper: {e}")
        return False
|
| 126 |
+
|
| 127 |
+
def test_memory_usage():
    """Probe GPU memory by allocating and freeing a 1000x1000 tensor;
    return False when CUDA is unavailable or the probe fails."""
    print_section("GPU Memory Test")

    if not torch.cuda.is_available():
        print("❌ CUDA not available for memory test")
        return False

    # Baseline usage after clearing the caching allocator.
    torch.cuda.empty_cache()
    baseline = torch.cuda.memory_allocated()
    capacity = torch.cuda.get_device_properties(0).total_memory

    print(f"Total GPU memory: {capacity / 1e9:.1f} GB")
    print(f"Initial memory usage: {baseline / 1e6:.1f} MB")

    try:
        probe = torch.randn(1000, 1000, device="cuda")
        in_use = torch.cuda.memory_allocated()
        print(f"Memory after tensor allocation: {in_use / 1e6:.1f} MB")
        print(f"Available memory: {(capacity - in_use) / 1e9:.1f} GB")

        # Release the probe tensor and return its memory to the allocator.
        del probe
        torch.cuda.empty_cache()
        print("✅ Memory test completed")
        return True

    except Exception as e:
        print(f"❌ Memory test failed: {e}")
        return False
|
| 159 |
+
|
| 160 |
+
def test_environment_config():
    """Check the .env / USE_CUDA setup and that the project config module
    imports cleanly; return True on success."""
    print_section("Environment Configuration Test")

    # Load the .env sitting next to this script, if present.
    dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
    if os.path.exists(dotenv_path):
        load_dotenv(dotenv_path)
        print(f"✅ Found .env file: {dotenv_path}")
    else:
        print(f"ℹ️ No .env file found at: {dotenv_path}")
        print("   Create one from .env.example to configure CUDA usage")

    # Mirror config.py's parsing of the flag for display purposes.
    cuda_flag = os.getenv('USE_CUDA', 'false').lower() == 'true'
    print(f"USE_CUDA environment variable: {os.getenv('USE_CUDA', 'false')}")
    print(f"Parsed USE_CUDA value: {cuda_flag}")

    try:
        sys.path.append(os.path.dirname(__file__))
        from config import config
        print("✅ Config module imported successfully")

        details = config.get_device_info()
        print(f"Selected device: {details['device']}")
        print(f"Compute type: {details['compute_type']}")

        return True

    except Exception as e:
        print(f"❌ Error importing config: {e}")
        return False
|
| 193 |
+
|
| 194 |
+
def run_performance_benchmark():
    """Benchmark 2000x2000 matrix multiplication on CPU vs CUDA and print
    the average times plus the resulting speedup. No return value."""
    print_section("Performance Benchmark")

    if not torch.cuda.is_available():
        print("❌ CUDA not available for benchmark")
        return

    size = 2000
    iterations = 5
    print(f"Running {iterations} matrix multiplications ({size}x{size})...")

    def _time_matmuls(device=None):
        # Time `iterations` matmuls on the given device, printing each run.
        samples = []
        for step in range(iterations):
            if device is None:
                lhs = torch.randn(size, size)
                rhs = torch.randn(size, size)
            else:
                lhs = torch.randn(size, size, device=device)
                rhs = torch.randn(size, size, device=device)

            if device == "cuda":
                torch.cuda.synchronize()  # Wait for GPU before timing
            began = time.time()
            _ = torch.mm(lhs, rhs)
            if device == "cuda":
                torch.cuda.synchronize()  # Wait for GPU to finish the matmul
            elapsed = time.time() - began

            samples.append(elapsed)
            print(f"  Iteration {step+1}: {elapsed:.3f}s")
        return samples

    print("\nCPU benchmark:")
    cpu_samples = _time_matmuls()
    cpu_avg = sum(cpu_samples) / len(cpu_samples)
    print(f"Average CPU time: {cpu_avg:.3f}s")

    print("\nCUDA benchmark:")
    cuda_samples = _time_matmuls("cuda")
    cuda_avg = sum(cuda_samples) / len(cuda_samples)
    print(f"Average CUDA time: {cuda_avg:.3f}s")

    print(f"\n🚀 Overall speedup: {cpu_avg / cuda_avg:.2f}x faster with CUDA")
|
| 244 |
+
|
| 245 |
+
def main():
    """Entry point: run every CUDA configuration check and print a summary."""
    print_header("CUDA Configuration Test for Speech Transcription App")

    print("This script will test your CUDA setup and help you configure")
    print("the speech transcription app for optimal performance.")

    # Run every check in order; each returns truthy on success.
    checks = (
        test_pytorch_cuda,
        test_transformers_device,
        test_whisper_models,
        test_memory_usage,
        test_environment_config,
    )
    total_tests = len(checks)
    tests_passed = sum(1 for check in checks if check())

    # Performance benchmark (optional)
    if torch.cuda.is_available():
        try:
            run_performance_benchmark()
        except Exception as e:
            print(f"β Benchmark failed: {e}")

    # Summary
    print_header("Test Summary")
    print(f"Tests passed: {tests_passed}/{total_tests}")

    if torch.cuda.is_available() and tests_passed == total_tests:
        print("π All tests passed! Your CUDA setup is working correctly.")
        print("\nTo enable CUDA acceleration:")
        print("1. Create a .env file (copy from .env.example)")
        print("2. Set USE_CUDA=true in the .env file")
        print("3. Run the speech transcription app")
    elif torch.cuda.is_available():
        print("β οΈ Some tests failed. Check the error messages above.")
        print("You may still be able to use CUDA, but with potential issues.")
    else:
        print("βΉοΈ CUDA not available. The app will run on CPU.")
        print("This is perfectly fine for most use cases!")

    print("\nFor CPU usage (always works):")
    print("1. Create a .env file (copy from .env.example)")
    print("2. Set USE_CUDA=false in the .env file")
    print("3. Run the speech transcription app")
| 300 |
+
# Run the full test suite only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
testing.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use a pipeline as a high-level helper
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
|
| 4 |
+
# Initialize the pipeline
|
| 5 |
+
pipe = pipeline("text-classification", model="FedericoDamboreana/chained_question_classification_es")
|
| 6 |
+
sentence1 = 'how are you doing'
|
| 7 |
+
sentence2 = 'that dog is black'
|
| 8 |
+
|
| 9 |
+
print(pipe(sentence1))
|
| 10 |
+
print(pipe(sentence2))
|