Spaces:
Sleeping
Sleeping
Commit
·
f5d5446
1
Parent(s):
36fbe52
Refactor Dockerfile and requirements for improved dependency management and clarity
Browse files- Dockerfile +2 -2
- TRAINING_GUIDE.md +13 -3
- requirements-hf.txt +0 -14
- requirements.txt +7 -1
Dockerfile
CHANGED
|
@@ -9,8 +9,8 @@ RUN apt-get update && apt-get install -y \
|
|
| 9 |
curl \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
-
# Copy requirements first for better caching
|
| 13 |
-
COPY requirements
|
| 14 |
|
| 15 |
# Install Python dependencies
|
| 16 |
RUN pip install --no-cache-dir -r requirements.txt
|
|
|
|
| 9 |
curl \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Copy requirements file first for better caching
|
| 13 |
+
COPY requirements.txt ./requirements.txt
|
| 14 |
|
| 15 |
# Install Python dependencies
|
| 16 |
RUN pip install --no-cache-dir -r requirements.txt
|
TRAINING_GUIDE.md
CHANGED
|
@@ -111,27 +111,31 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 111 |
|
| 112 |
## 🎯 Training Configurations
|
| 113 |
|
| 114 |
-
### For Different Environments
|
| 115 |
|
| 116 |
1. **Local Development** (Single GPU):
|
|
|
|
| 117 |
```bash
|
| 118 |
export CUDA_VISIBLE_DEVICES="0"
|
| 119 |
python speech/train.py --config speech/config.yaml --model llm ...
|
| 120 |
```
|
| 121 |
|
| 122 |
2. **Multi-GPU Training**:
|
|
|
|
| 123 |
```bash
|
| 124 |
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 125 |
torchrun --nproc_per_node=4 speech/train.py ...
|
| 126 |
```
|
| 127 |
|
| 128 |
3. **Cloud Training** (Google Colab/Kaggle):
|
|
|
|
| 129 |
```python
|
| 130 |
# Use config_hf.yaml for resource-constrained environments
|
| 131 |
!python speech/train.py --config speech/config_hf.yaml ...
|
| 132 |
```
|
| 133 |
|
| 134 |
4. **Hugging Face Spaces**:
|
|
|
|
| 135 |
```bash
|
| 136 |
# For direct training on HF infrastructure
|
| 137 |
python speech/train.py --config speech/config_hf.yaml --timeout 1800 ...
|
|
@@ -140,6 +144,7 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 140 |
## 📊 Monitoring Training
|
| 141 |
|
| 142 |
1. **Comet ML** (Recommended):
|
|
|
|
| 143 |
```bash
|
| 144 |
# Set up Comet ML for experiment tracking
|
| 145 |
export COMET_API_KEY="your_api_key"
|
|
@@ -147,11 +152,13 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 147 |
```
|
| 148 |
|
| 149 |
2. **Tensorboard**:
|
|
|
|
| 150 |
```bash
|
| 151 |
tensorboard --logdir ./tensorboard
|
| 152 |
```
|
| 153 |
|
| 154 |
3. **Command Line**:
|
|
|
|
| 155 |
```bash
|
| 156 |
# Monitor log files
|
| 157 |
tail -f checkpoints/llm/train.log
|
|
@@ -159,7 +166,7 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 159 |
|
| 160 |
## 🔧 Troubleshooting
|
| 161 |
|
| 162 |
-
### Common Issues
|
| 163 |
|
| 164 |
1. **Out of Memory**:
|
| 165 |
- Reduce batch size in config
|
|
@@ -176,9 +183,10 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 176 |
- Verify data preprocessing
|
| 177 |
- Use pretrained checkpoints
|
| 178 |
|
| 179 |
-
### Performance Tips
|
| 180 |
|
| 181 |
1. **Data Loading Optimization**:
|
|
|
|
| 182 |
```yaml
|
| 183 |
# In config.yaml
|
| 184 |
num_workers: 24
|
|
@@ -187,12 +195,14 @@ def synthesize_speech(text, speaker_id=0):
|
|
| 187 |
```
|
| 188 |
|
| 189 |
2. **Memory Optimization**:
|
|
|
|
| 190 |
```bash
|
| 191 |
# Use mixed precision (AMP) and gradient accumulation
|
| 192 |
--use_amp --accum_grad 2
|
| 193 |
```
|
| 194 |
|
| 195 |
3. **Speed Optimization**:
|
|
|
|
| 196 |
```bash
|
| 197 |
# Compile model for faster training (PyTorch 2.0+)
|
| 198 |
export TORCH_COMPILE=1
|
|
|
|
| 111 |
|
| 112 |
## 🎯 Training Configurations
|
| 113 |
|
| 114 |
+
### For Different Environments
|
| 115 |
|
| 116 |
1. **Local Development** (Single GPU):
|
| 117 |
+
|
| 118 |
```bash
|
| 119 |
export CUDA_VISIBLE_DEVICES="0"
|
| 120 |
python speech/train.py --config speech/config.yaml --model llm ...
|
| 121 |
```
|
| 122 |
|
| 123 |
2. **Multi-GPU Training**:
|
| 124 |
+
|
| 125 |
```bash
|
| 126 |
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 127 |
torchrun --nproc_per_node=4 speech/train.py ...
|
| 128 |
```
|
| 129 |
|
| 130 |
3. **Cloud Training** (Google Colab/Kaggle):
|
| 131 |
+
|
| 132 |
```python
|
| 133 |
# Use config_hf.yaml for resource-constrained environments
|
| 134 |
!python speech/train.py --config speech/config_hf.yaml ...
|
| 135 |
```
|
| 136 |
|
| 137 |
4. **Hugging Face Spaces**:
|
| 138 |
+
|
| 139 |
```bash
|
| 140 |
# For direct training on HF infrastructure
|
| 141 |
python speech/train.py --config speech/config_hf.yaml --timeout 1800 ...
|
|
|
|
| 144 |
## 📊 Monitoring Training
|
| 145 |
|
| 146 |
1. **Comet ML** (Recommended):
|
| 147 |
+
|
| 148 |
```bash
|
| 149 |
# Set up Comet ML for experiment tracking
|
| 150 |
export COMET_API_KEY="your_api_key"
|
|
|
|
| 152 |
```
|
| 153 |
|
| 154 |
2. **Tensorboard**:
|
| 155 |
+
|
| 156 |
```bash
|
| 157 |
tensorboard --logdir ./tensorboard
|
| 158 |
```
|
| 159 |
|
| 160 |
3. **Command Line**:
|
| 161 |
+
|
| 162 |
```bash
|
| 163 |
# Monitor log files
|
| 164 |
tail -f checkpoints/llm/train.log
|
|
|
|
| 166 |
|
| 167 |
## 🔧 Troubleshooting
|
| 168 |
|
| 169 |
+
### Common Issues
|
| 170 |
|
| 171 |
1. **Out of Memory**:
|
| 172 |
- Reduce batch size in config
|
|
|
|
| 183 |
- Verify data preprocessing
|
| 184 |
- Use pretrained checkpoints
|
| 185 |
|
| 186 |
+
### Performance Tips
|
| 187 |
|
| 188 |
1. **Data Loading Optimization**:
|
| 189 |
+
|
| 190 |
```yaml
|
| 191 |
# In config.yaml
|
| 192 |
num_workers: 24
|
|
|
|
| 195 |
```
|
| 196 |
|
| 197 |
2. **Memory Optimization**:
|
| 198 |
+
|
| 199 |
```bash
|
| 200 |
# Use mixed precision (AMP) and gradient accumulation
|
| 201 |
--use_amp --accum_grad 2
|
| 202 |
```
|
| 203 |
|
| 204 |
3. **Speed Optimization**:
|
| 205 |
+
|
| 206 |
```bash
|
| 207 |
# Compile model for faster training (PyTorch 2.0+)
|
| 208 |
export TORCH_COMPILE=1
|
requirements-hf.txt
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
gradio==3.50.2
|
| 2 |
-
torch==2.1.0
|
| 3 |
-
torchaudio==2.1.0
|
| 4 |
-
numpy==1.24.3
|
| 5 |
-
soundfile==0.12.1
|
| 6 |
-
librosa==0.10.1
|
| 7 |
-
transformers==4.36.0
|
| 8 |
-
omegaconf==2.3.0
|
| 9 |
-
hydra-core==1.3.2
|
| 10 |
-
|
| 11 |
-
# Optional: Add these if you need the full training pipeline
|
| 12 |
-
# deepspeed==0.12.6
|
| 13 |
-
# tensorboard==2.14.0
|
| 14 |
-
# matplotlib==3.7.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -17,6 +17,7 @@ lightning==2.2.4
|
|
| 17 |
matplotlib==3.7.5
|
| 18 |
modelscope==1.20.0
|
| 19 |
networkx==3.1
|
|
|
|
| 20 |
omegaconf==2.3.0
|
| 21 |
onnx==1.16.0
|
| 22 |
onnxruntime-gpu==1.18.0; sys_platform == 'linux'
|
|
@@ -41,4 +42,9 @@ wget==3.2
|
|
| 41 |
flatten_dict
|
| 42 |
julius
|
| 43 |
importlib_resources
|
| 44 |
-
randomname
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
matplotlib==3.7.5
|
| 18 |
modelscope==1.20.0
|
| 19 |
networkx==3.1
|
| 20 |
+
numpy==1.24.3
|
| 21 |
omegaconf==2.3.0
|
| 22 |
onnx==1.16.0
|
| 23 |
onnxruntime-gpu==1.18.0; sys_platform == 'linux'
|
|
|
|
| 42 |
flatten_dict
|
| 43 |
julius
|
| 44 |
importlib_resources
|
| 45 |
+
randomname
|
| 46 |
+
|
| 47 |
+
# Optional: Add these if you need the full training pipeline
|
| 48 |
+
# deepspeed==0.12.6
|
| 49 |
+
# tensorboard==2.14.0
|
| 50 |
+
# matplotlib==3.7.2  (note: matplotlib==3.7.5 is already pinned earlier in this file)
|