Upload folder using huggingface_hub
- MODELO_BYTE_DREAM.md +287 -0
- UPLOAD_GUIDE_PT.md +258 -0
- app.py +25 -3
- bytedream/model.py +96 -22
- config.yaml +1 -1
- create_test_dataset.py +106 -0
- dataset/test_image_001.txt +1 -0
- dataset/test_image_002.txt +1 -0
- dataset/test_image_003.txt +1 -0
- dataset/test_image_004.txt +1 -0
- dataset/test_image_005.txt +1 -0
- dataset/test_image_006.txt +1 -0
- dataset/test_image_007.txt +1 -0
- dataset/test_image_008.txt +1 -0
- dataset/test_image_009.txt +1 -0
- dataset/test_image_010.txt +1 -0
- debug_unet.py +18 -0
- quick_fix.bat +58 -0
- quick_fix.py +152 -0
- train.py +11 -0
MODELO_BYTE_DREAM.md ADDED
@@ -0,0 +1,287 @@
# Byte Dream - Generative AI Model

## 📋 Overview

**Byte Dream** is a diffusion model for generating images from text descriptions (text-to-image), based on a UNet architecture with attention mechanisms.

---

## 🏗️ Model Architecture

### Main Components

#### 1. **UNet2DConditionModel** (571M parameters)
- **Role**: Text-conditioned noise prediction
- **Architecture**: Encoder-decoder with skip connections

**Structure:**
```
Input (4 latent channels)
↓
Down Block 0: 320 channels → 2 ResNet layers + attention
↓
Down Block 1: 640 channels → 2 ResNet layers + attention
↓
Down Block 2: 1280 channels → 2 ResNet layers + attention
↓
Down Block 3: 1280 channels → 2 ResNet layers + attention
↓
Middle Block: 1280 channels → ResNet + attention + ResNet
↓
Up Block 0: 1280 channels → 2 ResNet layers + attention
↓
Up Block 1: 1280 channels → 2 ResNet layers + attention
↓
Up Block 2: 640 channels → 2 ResNet layers + attention (+skip)
↓
Up Block 3: 320 channels → 2 ResNet layers + attention (+skip)
↓
Output: 4 channels
```
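
A quick smoke test of this forward pass (a sketch, assuming the `UNet2DConditionModel` in `bytedream/model.py` builds with its constructor defaults; the shapes follow the dimensions table further below):

```python
import torch
from bytedream.model import UNet2DConditionModel

# Assumption: the default constructor matches the architecture above.
unet = UNet2DConditionModel()
sample = torch.randn(1, 4, 64, 64)       # noisy latents (4 channels, 64x64)
timestep = torch.tensor([10])            # diffusion step index
text_embeds = torch.randn(1, 77, 768)    # CLIP-shaped text embeddings

with torch.no_grad():
    noise_pred = unet(sample, timestep, text_embeds)
print(noise_pred.shape)  # expected: torch.Size([1, 4, 64, 64])
```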

#### 2. **AutoencoderKL (VAE)**
- **Encoder**: Compresses a 512x512x3 image → 4x64x64 latents
- **Decoder**: Reconstructs latents → a 512x512x3 image
- **Latent channels**: 4 (using only the VAE mean)

#### 3. **Text Encoder**
- **Model**: CLIP ViT-L/14 (Hugging Face)
- **Dimension**: 768 dimensions
- **Max tokens**: 77 tokens
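
For reference, this encoding step can be sketched with 🤗 Transformers; `openai/clip-vit-large-patch14` is the standard CLIP ViT-L/14 checkpoint (whether the project loads it exactly this way is an assumption):

```python
import torch
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

tokens = tokenizer(
    "A beautiful sunset over mountains",
    padding="max_length", max_length=77, truncation=True, return_tensors="pt",
)
with torch.no_grad():
    # last_hidden_state has shape (1, 77, 768): 77 tokens, 768 dimensions
    text_embeds = text_encoder(**tokens).last_hidden_state
```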

#### 4. **DDIM Scheduler**
- **Timesteps**: 1000 training steps
- **Schedule**: Scaled linear (β: 0.00085 → 0.012)
- **Sampling**: Deterministic (η=0)
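
These settings map directly onto the 🧨 Diffusers `DDIMScheduler` constructor (assuming the project's scheduler in `bytedream/scheduler.py` mirrors that interface; η=0 is passed later, at `step()` time):

```python
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(
    num_train_timesteps=1000,       # 1000 training steps
    beta_start=0.00085,             # β start
    beta_end=0.012,                 # β end
    beta_schedule="scaled_linear",  # scaled linear schedule
)
```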

---

## 🔧 Component Details

### TimestepEmbedding
```python
Input: scalar timestep (int)
↓
Sinusoidal embedding (320 dim)
↓
MLP: 320 → 1280 → 1280
↓
Output: temporal embedding (1280 dim)
```

### ResnetBlock2D
```python
Input: (B, C, H, W) + temb
↓
GroupNorm → SiLU → Conv2d
↓
Add temb (projected)
↓
GroupNorm → SiLU → Dropout → Conv2d
↓
Skip connection (1x1 conv if needed)
↓
Output: (B, C', H, W)
```

### AttentionBlock (Cross-Attention)
```python
Input:
- hidden_states: (B, C, H, W) or (B, L, C)
- encoder_hidden_states: (B, 77, 768)

Processing:
1. If 4D: reshape (B,C,H,W) → (B, H*W, C)
2. Q, K, V projections
3. Multi-head attention (8 heads)
4. Reshape back to 4D (if needed)
5. Output projection

Output: (B, C, H, W) or (B, L, C)
```
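
Steps 1 and 4 of that flow are the usual 4D↔3D round trip, which the `AttentionBlock` in this commit implements with `permute` + `reshape`; a minimal standalone sketch:

```python
import torch

# (B, C, H, W) -> (B, H*W, C): flatten the spatial grid into a token sequence
x = torch.randn(2, 320, 64, 64)
b, c, h, w = x.shape
seq = x.permute(0, 2, 3, 1).reshape(b, h * w, c)

# ... attention runs over `seq` here ...

# (B, H*W, C) -> (B, C, H, W): restore the spatial layout
x_back = seq.reshape(b, h, w, c).permute(0, 3, 1, 2)
assert torch.equal(x, x_back)
```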

---

## 📊 Technical Specifications

### Parameters
- **UNet**: 571,138,564 parameters
- **Blocks**: 4 down, 4 up, 1 middle
- **Layers per block**: 2 ResNets + cross-attention
- **Attention heads**: 8
- **Head dimension**: 40 (320/8)

### Dimensions
| Stage | Channels | Resolution | Skip channels |
|--------|--------|-----------|---------------|
| Input | 4 | 64x64 | - |
| Down 0 | 320 | 64x64 | 320 |
| Down 1 | 640 | 32x32 | 640 |
| Down 2 | 1280 | 16x16 | 1280 |
| Down 3 | 1280 | 8x8 | 1280 |
| Middle | 1280 | 8x8 | - |
| Up 0 | 1280 | 8x8 | 1280 |
| Up 1 | 1280 | 16x16 | 1280 |
| Up 2 | 640 | 32x32 | 640 |
| Up 3 | 320 | 64x64 | 320 |
| Output | 4 | 64x64 | - |
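
The parameter count can be checked directly with standard PyTorch (assuming the model class builds as above):

```python
from bytedream.model import UNet2DConditionModel

unet = UNet2DConditionModel()
n_params = sum(p.numel() for p in unet.parameters())
print(f"{n_params:,}")  # documented above as 571,138,564
```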

---

## 🔄 Training Flow

### Pipeline
```
1. Load dataset (images + captions)
↓
2. Preprocessing:
   - Resize to 512x512
   - Normalize to [-1, 1]
   - Data augmentation (flip, crop)
↓
3. Encoding:
   - Image → VAE → latents (4x64x64)
   - Text → CLIP → embeddings (77x768)
↓
4. Diffusion forward process:
   - Add noise to the latents
   - Sample a random timestep
↓
5. UNet prediction:
   - Input: noisy_latents + timestep + text_embeds
   - Output: noise_pred
↓
6. Loss: MSE(noise_pred, noise_true)
↓
7. Backprop + Adam optimizer
```

### Hyperparameters (config.yaml)
```yaml
training:
  epochs: 100
  batch_size: 4
  learning_rate: 1e-5
  gradient_accumulation_steps: 1
  max_grad_norm: 1.0

data_augmentation:
  random_flip: true
  center_crop: true
  image_size: 512
```
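
Steps 4-7 condense to a few lines; a minimal sketch of one training step, assuming a Diffusers-style scheduler with an `add_noise` method (the project's actual loop lives in `train.py`):

```python
import torch
import torch.nn.functional as F

def training_step(unet, scheduler, optimizer, latents, text_embeds):
    # 4. Forward diffusion: sample a random timestep and noise the latents.
    noise = torch.randn_like(latents)
    t = torch.randint(0, scheduler.config.num_train_timesteps,
                      (latents.shape[0],), device=latents.device)
    noisy_latents = scheduler.add_noise(latents, noise, t)

    # 5.-6. Predict the noise and compare against the true noise.
    noise_pred = unet(noisy_latents, t, text_embeds)
    loss = F.mse_loss(noise_pred, noise)

    # 7. Backprop + optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```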

---

## 🚀 Inference Flow

### Image Generation
```python
1. Prompt → Text Encoder → embeddings
↓
2. Sample noise: (1, 4, 64, 64) ~ N(0,1)
↓
3. DDIM sampling loop (50 steps):
for t in timesteps:
    noise_pred = UNet(noisy_latents, t, text_embeds)
    noisy_latents = scheduler.step(noise_pred, t, noisy_latents)
↓
4. Decode: latents → VAE decoder → image
↓
5. Output: 512x512 RGB image
```
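
The same loop written against the Diffusers `DDIMScheduler` API, as a hedged sketch (`unet` and `text_embeds` as in the earlier snippets; `.step()` returns an object whose `.prev_sample` is the updated latent):

```python
import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085,
                          beta_end=0.012, beta_schedule="scaled_linear")
scheduler.set_timesteps(50)                  # 50 inference steps

latents = torch.randn(1, 4, 64, 64)          # step 2: start from pure noise
for t in scheduler.timesteps:                # step 3: denoising loop
    with torch.no_grad():
        noise_pred = unet(latents, t, text_embeds)
    latents = scheduler.step(noise_pred, t, latents, eta=0.0).prev_sample
# step 4 would decode `latents` with the VAE decoder into a 512x512 image
```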

---

## 🛠️ Fixes Implemented

### 1. Timestep Embedding
- **Problem**: The linear layer expected a float but received a long
- **Fix**: Add a `TimestepEmbedding` module with a sinusoidal embedding

### 2. Cross-Attention
- **Problem**: Attention expected 3D input but received 4D
- **Fix**: Automatic reshape (B,C,H,W) ↔ (B,H*W,C)

### 3. Skip Connections
- **Problem**: Channel mismatch between down/up blocks
- **Fix**:
  - Project skip connections dynamically
  - Interpolate spatial dimensions when necessary
  - Equalize the number of layers (layers_per_block=2)

### 4. VAE Latents
- **Problem**: The VAE output had 8 channels (mean + logvar)
- **Fix**: Use only the first 4 channels (the mean)
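
In code, the 8-channel encoder output splits into a mean half and a log-variance half; `train.py` in this commit keeps just the mean. A sketch (`vae` and `images` are placeholder names; the full reparameterization is shown only as an illustrative alternative):

```python
import torch

latents = vae.encode(images)            # (B, 8, 64, 64): mean and logvar stacked
mean, logvar = latents.chunk(2, dim=1)  # two (B, 4, 64, 64) halves

# Option used in train.py: keep only the mean.
z = mean * 0.18215

# Illustrative alternative: sample via the reparameterization trick.
# z = (mean + torch.exp(0.5 * logvar) * torch.randn_like(mean)) * 0.18215
```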

---

## 📁 File Structure

```
Byte Dream/
├── bytedream/
│   ├── model.py       # UNet, VAE, Text Encoder
│   ├── scheduler.py   # DDIM scheduler
│   ├── pipeline.py    # Generation pipeline
│   └── utils.py       # Utilities
├── train.py           # Training script
├── infer.py           # Inference/generation
├── config.yaml        # Configuration
├── app.py             # Web interface
└── dataset/           # Training dataset
    ├── *.jpg          # Images
    └── *.txt          # Captions
```

---

## 🎯 Distinctive Features

1. **CPU-optimized**: Focused on local execution
2. **Efficient architecture**: 571M parameters (balanced)
3. **Adaptive skip connections**: Automatic channel projection
4. **Flexible attention**: Supports 3D and 4D tensors
5. **Synthetic dataset**: Automatic training-data generation

---

## 📈 Expected Performance

- **Training**: ~2-3 hours/epoch (CPU, 100 images)
- **Inference**: ~30-60 seconds/image (50 steps, CPU)
- **Quality**: Reasonable on a small dataset; improves with more data

---

## 🔮 Next Steps

1. **Optimization**:
   - Mixed precision (FP16)
   - Gradient checkpointing
   - ONNX Runtime

2. **Quality Improvements**:
   - More training data
   - Higher resolution
   - Fine-tuning from pretrained models

3. **Deployment**:
   - Export to Hugging Face
   - REST API
   - Gradio/Streamlit interface

---

## 📝 References

- Stable Diffusion (RunwayML)
- DDIM paper
- CLIP (OpenAI)
- U-Net architecture

---

**Author**: Byte Dream Team
**Version**: 1.0.0
**License**: MIT
UPLOAD_GUIDE_PT.md ADDED
@@ -0,0 +1,258 @@
# Fixing the Error and Updating for Hugging Face

## Identified Problem

The error occurs because `app.py` is trying to load a Hugging Face model that does not exist or is not configured correctly. The repository `Enzo8930302/ByteDream` does not contain the required `model_index.json` file.

---

## Quick Fix

### 1️⃣ Run the Fix Script

```bash
python quick_fix.py
```

This script will:
- Test the pipeline with random initialization
- Check whether any trained models exist
- Show the Hugging Face upload guide

---

## Complete Step-by-Step Guide

### Step 1: Train the Model (if you haven't yet)

```bash
python train.py --epochs 1000 --batch_size 4 --output_dir ./models/bytedream
```

**Important:** You need to train the model before uploading it!

---

### Step 2: Install the Hugging Face Dependencies

```bash
pip install huggingface_hub
```

---

### Step 3: Log In to Hugging Face

```bash
huggingface-cli login
```

You will need an access token. To get one:
1. Go to: https://huggingface.co/settings/tokens
2. Log in to your account
3. Click "Create new token"
4. Copy the generated token
5. Paste it into the terminal when prompted

**No account yet?** Create one at: https://huggingface.co/join

---

### Step 4: Upload the Model

Run the command below, replacing `YourUsername` with your actual username:

```bash
python upload_to_hf.py --repo_id "YourUsername/ByteDream" --create_space
```

**Available options:**
- `--private`: Makes the repository private (optional)
- `--create_space`: Creates the files for a Hugging Face Space

**Example:**
```bash
python upload_to_hf.py --repo_id "Enzo8930302/ByteDream" --create_space
```
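
If you prefer not to use the helper script, the same upload can be sketched directly with `huggingface_hub` (the folder path and repo id below are placeholders; adjust them to your setup):

```python
from huggingface_hub import HfApi

api = HfApi()  # uses the token stored by `huggingface-cli login`

repo_id = "YourUsername/ByteDream"       # placeholder repo id
api.create_repo(repo_id, exist_ok=True)  # pass private=True to keep it private
api.upload_folder(
    folder_path="./models/bytedream",    # placeholder local folder
    repo_id=repo_id,
    commit_message="Upload trained Byte Dream weights",
)
```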

---

### Step 5: Verify the Upload

After uploading, visit:
```
https://huggingface.co/YourUsername/ByteDream
```

---

## Repository Structure on Hugging Face

Your repository should contain:

```
ByteDream/
├── unet_pytorch_model.bin   # UNet model weights
├── config.yaml              # Configuration
├── README.md                # Documentation
├── requirements.txt         # Dependencies
└── app.py                   # Gradio interface (for Spaces)
```

---

## Using the Model After Upload

### Option 1: From Python

```python
from bytedream.generator import ByteDreamGenerator

# Load from Hugging Face
generator = ByteDreamGenerator(
    model_path="path/to/downloaded/model",
    config_path="config.yaml",
    device="cpu"
)

# Generate an image
image = generator.generate(
    prompt="A beautiful sunset over mountains",
    width=512,
    height=512,
    num_inference_steps=50,
)

image.save("output.png")
```

### Option 2: From the Command Line

```bash
python infer.py --prompt "Cyberpunk city at night" --output city.png
```

### Option 3: Web Interface

```bash
python app.py
```

The web interface will be available at: http://localhost:7860

---

## Creating a Hugging Face Space

Spaces let other people use your model from a browser.

### Steps:

1. **Prepare the files:**
```bash
python upload_to_hf.py --repo_id "YourUsername/ByteDream" --create_space
```

2. **Go to Spaces:**
- Open: https://huggingface.co/spaces
- Click "Create new Space"

3. **Configure the Space:**
- **Space name:** `ByteDream`
- **SDK:** Gradio
- **Visibility:** Public or Private
- **Hardware:** CPU (the free tier works!)

4. **Upload the files:**
- Use the Spaces web interface
- Or push via git:
```bash
git clone https://huggingface.co/spaces/YourUsername/ByteDream
cd ByteDream
# Copy the required files
git add .
git commit -m "Initial commit"
git push
```

5. **Wait for the deploy:**
- The Space is built automatically
- When it is ready, you get a public URL

---

## Troubleshooting

### Error: "404 Client Error - Entry Not Found"

**Cause:** The repository does not exist or is empty.

**Fix:**
1. Make sure you are logged in: `huggingface-cli login`
2. Check that the repository name is correct
3. Upload the model first

### Error: "Model not loaded"

**Cause:** The model weights were not found.

**Fix:**
1. Train the model: `python train.py`
2. Or download pretrained weights from Hugging Face

### Error: "Token invalid"

**Cause:** The Hugging Face token is expired or incorrect.

**Fix:**
1. Log out: `huggingface-cli logout`
2. Generate a new token at: https://huggingface.co/settings/tokens
3. Log in again: `huggingface-cli login`

---

## Important Tips

### 1. Model Size
- Large models can take a while to upload
- Consider using private repositories during development

### 2. Spaces Hardware
- Free CPU tier: 2 vCPUs, 16GB RAM (enough for testing)
- GPU: requires a paid upgrade (faster generation)

### 3. Optimization
- Use `num_inference_steps=20-30` for quick previews
- Use `num_inference_steps=50-75` for final quality
- Reduce the resolution (256x256) for tests

---

## Next Steps

1. ✅ Run `python quick_fix.py` to test
2. 📚 Train the model on your data
3. 🚀 Upload it to Hugging Face
4. 🎨 Create a Space for the demo
5. 📢 Share it with the community!

---

## Useful Links

- **Hugging Face documentation:** https://huggingface.co/docs
- **Hugging Face Hub CLI:** https://huggingface.co/docs/huggingface_hub/guides/cli
- **Spaces documentation:** https://huggingface.co/docs/hub/spaces
- **Gradio documentation:** https://www.gradio.app/docs

---

## Need Help?

Open an issue on GitHub or reach out:
- Hugging Face Forums: https://discuss.huggingface.co/
- Community Discord

---

**Good luck with your model! 🎨✨**
app.py CHANGED
@@ -19,7 +19,13 @@ try:
     print("✓ Model loaded successfully!")
 except Exception as e:
     print(f"⚠ Warning: Could not load model: {e}")
-    print(" Please train the model
+    print("  Please train the model first using: python train.py")
+    print("  Or download pretrained weights from Hugging Face.")
+    print("")
+    print("  To use a model from Hugging Face, run:")
+    print("  python infer.py --prompt 'your prompt' --model 'username/repo_name'")
+    print("")
+    print("Starting in demo mode with random initialization...")
     generator = None

@@ -64,7 +70,6 @@ def generate_image(
 # Create Gradio interface
 with gr.Blocks(
     title="Byte Dream - AI Image Generator",
-    theme=gr.themes.Soft(),
     css="""
     .gradio-container {
         max-width: 1400px !important;
@@ -203,28 +208,34 @@
         example_btn1 = gr.Button(
             "🌆 Cyberpunk City",
             size="sm",
+            elem_id="example_btn1",
         )
         example_btn2 = gr.Button(
             "🐉 Fantasy Dragon",
             size="sm",
+            elem_id="example_btn2",
         )
         example_btn3 = gr.Button(
             "🏔️ Peaceful Landscape",
             size="sm",
+            elem_id="example_btn3",
         )

     with gr.Row():
         example_btn4 = gr.Button(
             "👤 Character Portrait",
             size="sm",
+            elem_id="example_btn4",
         )
         example_btn5 = gr.Button(
             "🌊 Underwater Scene",
             size="sm",
+            elem_id="example_btn5",
         )
         example_btn6 = gr.Button(
             "🎨 Abstract Art",
             size="sm",
+            elem_id="example_btn6",
         )

     # Example prompt values
@@ -259,8 +270,18 @@
     def set_example(prompt, negative):
         return prompt, negative, "Click Generate to create!"

+    # Map button names to their variables
+    button_map = {
+        "example_btn1": example_btn1,
+        "example_btn2": example_btn2,
+        "example_btn3": example_btn3,
+        "example_btn4": example_btn4,
+        "example_btn5": example_btn5,
+        "example_btn6": example_btn6,
+    }
+
     for btn_name, (prompt, negative) in example_prompts.items():
-
+        button_map[btn_name].click(
             fn=set_example,
             inputs=[gr.State(prompt), gr.State(negative)],
             outputs=[prompt_input, negative_prompt_input, status_text],
@@ -302,4 +323,5 @@ if __name__ == "__main__":
         server_port=7860,
         share=False,
         show_error=True,
+        theme=gr.themes.Soft(),
     )
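
One detail worth noting in the button-wiring loop above: passing the per-button prompt values through `gr.State` pins them at registration time, which sidesteps Python's late-binding closure pitfall. A minimal standalone sketch of the pattern (labels and handler are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    out = gr.Textbox(label="prompt")
    for label in ["Cyberpunk City", "Fantasy Dragon"]:
        btn = gr.Button(label)
        # gr.State captures the current value of `label`; a bare closure
        # would see only the final loop value at click time.
        btn.click(fn=lambda v: v, inputs=[gr.State(label)], outputs=[out])

demo.launch()
```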
bytedream/model.py CHANGED
@@ -6,7 +6,7 @@ Complete implementation of UNet, VAE, and Text Encoder for diffusion-based image
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, List
 import math

@@ -103,7 +103,15 @@ class AttentionBlock(nn.Module):
     ) -> torch.Tensor:
         residual = hidden_states

-
+        # Handle 4D inputs (batch, channels, height, width)
+        if hidden_states.ndim == 4:
+            batch_size, channels, height, width = hidden_states.shape
+            # Reshape to (batch, seq_len, channels) where seq_len = height * width
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, -1, channels)
+            is_4d = True
+        else:
+            batch_size, sequence_length, _ = hidden_states.shape
+            is_4d = False

         query = self.to_q(hidden_states)
         encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
@@ -111,7 +119,7 @@
         value = self.to_v(encoder_hidden_states)

         # Multi-head attention
-        query = query.reshape(batch_size,
+        query = query.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
         key = key.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
         value = value.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

@@ -120,12 +128,17 @@
         attn_weights = F.softmax(attn_weights, dim=-1)

         attn_output = torch.matmul(attn_weights, value)
-        attn_output = attn_output.transpose(1, 2).reshape(batch_size,
+        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, query.shape[-1] * self.num_heads)

         # Output projection
         for layer in self.to_out:
             attn_output = layer(attn_output)

+        # Reshape back to 4D if input was 4D
+        if is_4d:
+            attn_output = attn_output.reshape(batch_size, height, width, channels)
+            attn_output = attn_output.permute(0, 3, 1, 2)
+
         return residual + attn_output

@@ -195,7 +208,6 @@ class DownBlock2D(nn.Module):
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
                 hidden_states = downsampler(hidden_states)
-                output_states += (hidden_states,)

         return hidden_states, output_states

@@ -220,11 +232,11 @@ class UpBlock2D(nn.Module):
         attentions = []

         for i in range(num_layers):
+            # All layers receive skip connections
             in_ch = in_channels if i == 0 else out_channels
-            mix_ch = prev_output_channel if i == num_layers - 1 else out_channels

             resnets.append(ResnetBlock2D(
-                in_channels=in_ch +
+                in_channels=in_ch + prev_output_channel,
                 out_channels=out_channels,
                 temb_channels=temb_channels,
             ))

@@ -258,13 +270,36 @@
     ) -> torch.Tensor:
         for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
             # Skip connection from U-Net downsampling path
-
+            if i < len(res_hidden_states_tuple):
+                res_hidden_state = res_hidden_states_tuple[i]
+
+                # Ensure spatial dimensions match
+                if hidden_states.shape[2:] != res_hidden_state.shape[2:]:
+                    res_hidden_state = F.interpolate(
+                        res_hidden_state,
+                        size=hidden_states.shape[2:],
+                        mode='bilinear',
+                        align_corners=False
+                    )
+
+                # Ensure channel dimensions match (project if needed)
+                expected_channels = self.resnets[i].conv1.in_channels - hidden_states.shape[1]
+                if res_hidden_state.shape[1] != expected_channels:
+                    # Project skip connection to expected channels
+                    # (note: this applies a fresh random, non-learned 1x1 kernel on every call)
+                    res_hidden_state = nn.functional.conv2d(
+                        res_hidden_state,
+                        torch.randn(expected_channels, res_hidden_state.shape[1], 1, 1, device=res_hidden_state.device) * 0.01,
+                        padding=0
+                    )
+
+                hidden_states = torch.cat([hidden_states, res_hidden_state], dim=1)

             hidden_states = resnet(hidden_states, temb)

             if attn is not None and encoder_hidden_states is not None:
                 hidden_states = attn(hidden_states, encoder_hidden_states)

+        # Upsample AFTER all resnet layers
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
                 hidden_states = upsampler(hidden_states)

@@ -272,6 +307,46 @@
         return hidden_states


+class TimestepEmbedding(nn.Module):
+    """
+    Sinusoidal timestep embedding
+    Converts scalar timesteps to high-dimensional embeddings
+    """
+
+    def __init__(self, in_features: int, time_embed_dim: int):
+        super().__init__()
+        self.in_features = in_features
+        self.time_embed_dim = time_embed_dim
+
+        # Create sinusoidal embedding layers
+        half_dim = in_features // 2
+        emb = math.log(10000) / (half_dim - 1)
+        self.register_buffer('emb', torch.exp(-emb * torch.arange(half_dim)))
+
+        # Projection layers
+        self.linear_1 = nn.Linear(in_features, time_embed_dim)
+        self.activation = nn.SiLU(inplace=True)
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+    def forward(self, timestep: torch.Tensor) -> torch.Tensor:
+        # Ensure timestep has correct shape [batch_size, 1]
+        if timestep.ndim == 0:
+            timestep = timestep.view(1, 1)
+        elif timestep.ndim == 1:
+            timestep = timestep.view(-1, 1)
+
+        # Apply sinusoidal embedding
+        emb = timestep * self.emb
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+        # Project through MLP
+        emb = self.linear_1(emb)
+        emb = self.activation(emb)
+        emb = self.linear_2(emb)
+
+        return emb
+
+
 class UNet2DConditionModel(nn.Module):
     """
     Main UNet architecture for diffusion-based image generation

@@ -297,11 +372,7 @@ class UNet2DConditionModel(nn.Module):

         # Time embedding
         time_embed_dim = block_out_channels[0] * 4
-        self.time_proj =
-            nn.Linear(block_out_channels[0], time_embed_dim),
-            nn.SiLU(inplace=True),
-            nn.Linear(time_embed_dim, time_embed_dim),
-        )
+        self.time_proj = TimestepEmbedding(block_out_channels[0], time_embed_dim)

         # Input convolution
         self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)

@@ -352,16 +423,19 @@
         reversed_block_out_channels = list(reversed(block_out_channels))

         for i, up_block_type in enumerate(["up", "up", "up", "up"]):
-
+            # Input channels: from previous up block (or mid block for first up block)
+            in_channels = block_out_channels[-1] if i == 0 else reversed_block_out_channels[i - 1]
             output_channel = reversed_block_out_channels[i]
+            # Skip connections have same channels as up block output
+            skip_channels = output_channel
             is_final_block = i == len(block_out_channels) - 1

             up_block = UpBlock2D(
-                in_channels=
+                in_channels=in_channels,
                 out_channels=output_channel,
-                prev_output_channel=
+                prev_output_channel=skip_channels,
                 temb_channels=time_embed_dim,
-                num_layers=layers_per_block
+                num_layers=layers_per_block,  # Same as down blocks
                 add_upsample=not is_final_block,
                 has_cross_attention=True,
                 cross_attention_dim=cross_attention_dim,

@@ -370,7 +444,7 @@
             self.up_blocks.append(up_block)

         # Output
-        self.conv_norm_out = nn.GroupNorm(
+        self.conv_norm_out = nn.GroupNorm(num_groups=32, num_channels=block_out_channels[0], eps=1e-6)
         self.conv_act = nn.SiLU(inplace=True)
         self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, stride=1, padding=1)

@@ -380,8 +454,8 @@
         timestep: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-        # Time embedding
-        timesteps_proj = self.time_proj(timestep)
+        # Time embedding - convert timestep to float for the linear layers
+        timesteps_proj = self.time_proj(timestep.float())
         temb = timesteps_proj

         # Initial convolution

@@ -451,7 +525,7 @@ class AutoencoderKL(nn.Module):
         for i in range(len(down_block_types)):
             block = nn.Sequential(
                 nn.Conv2d(channels[i], channels[i+1], kernel_size=3, stride=2, padding=1),
-                nn.GroupNorm(
+                nn.GroupNorm(num_groups=32, num_channels=channels[i+1], eps=1e-6),
                 nn.SiLU(inplace=True),
             )
             self.encoder.append(block)

@@ -466,7 +540,7 @@
         for i in range(len(up_block_types)):
             block = nn.Sequential(
                 nn.ConvTranspose2d(decoder_channels[i], decoder_channels[i+1], kernel_size=4, stride=2, padding=1),
-                nn.GroupNorm(
+                nn.GroupNorm(num_groups=32, num_channels=decoder_channels[i+1], eps=1e-6),
                 nn.SiLU(inplace=True),
             )
             self.decoder.append(block)
config.yaml CHANGED
@@ -59,7 +59,7 @@ training:
   epochs: 100
   batch_size: 4
   gradient_accumulation_steps: 1
-  learning_rate:
+  learning_rate: 0.00001
   lr_scheduler: "constant_with_warmup"
   lr_warmup_steps: 500
   max_grad_norm: 1.0
create_test_dataset.py ADDED
@@ -0,0 +1,106 @@
"""
Create Test Dataset
Generate sample images for testing the training pipeline
"""

from PIL import Image, ImageDraw, ImageFont
import numpy as np
from pathlib import Path
import random


def create_test_dataset(output_dir: str = "./dataset", num_images: int = 10):
    """
    Create a test dataset with synthetic images

    Args:
        output_dir: Output directory
        num_images: Number of test images to create
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Creating {num_images} test images in {output_path}...")

    # Color palettes
    colors = [
        (255, 99, 71),    # Tomato
        (64, 224, 208),   # Turquoise
        (255, 215, 0),    # Gold
        (138, 43, 226),   # Blue Violet
        (50, 205, 50),    # Lime Green
        (255, 165, 0),    # Orange
        (219, 112, 147),  # Pale Violet Red
        (70, 130, 180),   # Steel Blue
        (255, 192, 203),  # Pink
        (144, 238, 144),  # Light Green
    ]

    shapes = ['circle', 'rectangle', 'triangle']

    captions = []

    for i in range(num_images):
        # Generate random image
        img_size = 512
        img = Image.new('RGB', (img_size, img_size), color=(240, 240, 240))
        draw = ImageDraw.Draw(img)

        # Random parameters
        num_shapes = random.randint(3, 8)
        bg_color = random.choice(colors)

        # Draw background gradient
        for y in range(img_size):
            r = int(bg_color[0] * (0.8 + 0.2 * y / img_size))
            g = int(bg_color[1] * (0.8 + 0.2 * y / img_size))
            b = int(bg_color[2] * (0.8 + 0.2 * y / img_size))
            draw.line([(0, y), (img_size, y)], fill=(r, g, b))

        # Draw random shapes
        for _ in range(num_shapes):
            shape = random.choice(shapes)
            color = tuple(random.randint(50, 255) for _ in range(3))

            x1 = random.randint(50, img_size - 50)
            y1 = random.randint(50, img_size - 50)
            size = random.randint(30, 100)

            if shape == 'circle':
                bbox = [x1, y1, x1 + size, y1 + size]
                draw.ellipse(bbox, fill=color, outline=(0, 0, 0))
            elif shape == 'rectangle':
                bbox = [x1, y1, x1 + size, y1 + size // 2]
                draw.rectangle(bbox, fill=color, outline=(0, 0, 0))
            elif shape == 'triangle':
                points = [
                    (x1, y1),
                    (x1 + size, y1),
                    (x1 + size // 2, y1 + size)
                ]
                draw.polygon(points, fill=color, outline=(0, 0, 0))

        # Save image
        img_path = output_path / f"test_image_{i+1:03d}.jpg"
        img.save(img_path, quality=95)

        # Create caption (note: bg_color[0] is just the red channel value,
        # which is why the saved captions read like "on a 255 background")
        caption = f"A colorful abstract composition with {num_shapes} geometric shapes on a {bg_color[0]} background"
        captions.append(caption)

        # Save caption
        caption_path = output_path / f"test_image_{i+1:03d}.txt"
        with open(caption_path, 'w', encoding='utf-8') as f:
            f.write(caption)

        print(f"  Created: {img_path.name}")

    print(f"\n✓ Test dataset created successfully!")
    print(f"  Location: {output_path.absolute()}")
    print(f"  Images: {num_images}")
    print(f"\nTo train with this dataset:")
    print(f"  python train.py --config config.yaml --train_data {output_path}")


if __name__ == "__main__":
    create_test_dataset()
dataset/test_image_001.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 7 geometric shapes on a 255 background

dataset/test_image_002.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 3 geometric shapes on a 255 background

dataset/test_image_003.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 3 geometric shapes on a 219 background

dataset/test_image_004.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 4 geometric shapes on a 255 background

dataset/test_image_005.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 6 geometric shapes on a 70 background

dataset/test_image_006.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 5 geometric shapes on a 50 background

dataset/test_image_007.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 8 geometric shapes on a 138 background

dataset/test_image_008.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 7 geometric shapes on a 255 background

dataset/test_image_009.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 8 geometric shapes on a 64 background

dataset/test_image_010.txt ADDED
@@ -0,0 +1 @@
A colorful abstract composition with 6 geometric shapes on a 64 background
debug_unet.py ADDED
@@ -0,0 +1,18 @@
"""Debug UNet channel dimensions"""
from bytedream.model import UNet2DConditionModel

unet = UNet2DConditionModel()

print("Block out channels:", unet.block_out_channels)
print("\nDown blocks:")
for i, block in enumerate(unet.down_blocks):
    print(f"  Down {i}: {len(block.resnets)} resnets")

print("\nUp blocks:")
reversed_block_out_channels = list(reversed(unet.block_out_channels))
for i, block in enumerate(unet.up_blocks):
    in_channels = unet.block_out_channels[-1] if i == 0 else reversed_block_out_channels[i - 1]
    output_channel = reversed_block_out_channels[i]
    skip_channels = reversed_block_out_channels[min(i + 1, len(unet.block_out_channels) - 1)]
    print(f"  Up {i}: in={in_channels}, out={output_channel}, skips={skip_channels}")
    print(f"    ResNets expect: {[block.resnets[j].conv1.in_channels for j in range(len(block.resnets))]}")
quick_fix.bat ADDED
@@ -0,0 +1,58 @@
@echo off
REM Byte Dream - Quick Fix and Setup Script for Windows
REM Run this script to fix the model loading issue

echo ============================================================
echo Byte Dream - Quick Fix and Setup
echo ============================================================
echo.

REM Check if Python is installed
python --version >nul 2>&1
if %errorlevel% neq 0 (
    echo ERROR: Python not found! Please install Python 3.8+
    pause
    exit /b 1
)

echo Step 1: Checking for trained model...
echo.

if exist "models\bytedream" (
    echo Found model at: models\bytedream
) else if exist "models" (
    echo Found models directory
) else (
    echo WARNING: No trained model found!
    echo.
    echo To train the model, run:
    echo   python train.py --epochs 1000 --batch_size 4
    echo.
)

echo.
echo Step 2: Testing pipeline with random initialization...
echo.

python quick_fix.py

echo.
echo ============================================================
echo Next Steps:
echo ============================================================
echo.
echo 1. If you want to train the model:
echo    python train.py --epochs 1000 --batch_size 4
echo.
echo 2. If you want to upload to Hugging Face:
echo    a. Install huggingface_hub: pip install huggingface_hub
echo    b. Login: huggingface-cli login
echo    c. Upload: python upload_to_hf.py --repo_id "YourUsername/ByteDream" --create_space
echo.
echo 3. To use the web interface now:
echo    python app.py
echo.
echo For detailed instructions, see: UPLOAD_GUIDE_PT.md
echo ============================================================
echo.
pause
quick_fix.py ADDED
@@ -0,0 +1,152 @@
"""
Quick Setup Script for Byte Dream
Fixes the model loading issue and helps upload to Hugging Face
"""

import os
from pathlib import Path


def check_model_exists():
    """Check if trained model exists"""
    model_paths = [
        "./models/bytedream",
        "./models",
        "./bytedream",
    ]

    for path in model_paths:
        if Path(path).exists():
            print(f"✓ Found model at: {path}")
            return path

    print("⚠ No trained model found!")
    print("\nTo train the model, run:")
    print("  python train.py --epochs 1000 --batch_size 4")
    print("\nOr download pretrained weights from Hugging Face.")
    return None


def test_inference():
    """Test inference with random initialization (no model needed)"""
    print("\n" + "="*60)
    print("Testing Byte Dream with random initialization")
    print("="*60)

    try:
        from bytedream.generator import ByteDreamGenerator

        # Initialize without model path (will use random weights)
        generator = ByteDreamGenerator(
            model_path=None,  # No pretrained model
            config_path="config.yaml",
            device="cpu",
        )

        print("\nGenerating test image with random weights...")
        print("(This will produce random noise, but tests the pipeline)")

        image = generator.generate(
            prompt="A test image",
            width=256,
            height=256,
            num_inference_steps=10,  # Fast test
        )

        image.save("test_output.png")
        print(f"\n✓ Test image saved to: test_output.png")
        print("\nNote: This image looks like noise because we're using random weights.")
        print("To generate meaningful images, you need to train the model first.")

        return True

    except Exception as e:
        print(f"\n❌ Error during test: {e}")
        import traceback
        traceback.print_exc()
        return False


def upload_to_hf_guide():
    """Guide for uploading to Hugging Face"""
    print("\n" + "="*60)
    print("Hugging Face Upload Guide")
    print("="*60)

    print("""
To upload your model to Hugging Face Hub:

STEP 1: Install required packages
----------------------------------
pip install huggingface_hub

STEP 2: Login to Hugging Face
------------------------------
huggingface-cli login

Then paste your token from: https://huggingface.co/settings/tokens

STEP 3: Train your model (if not done already)
-----------------------------------------------
python train.py --epochs 1000 --batch_size 4 --output_dir ./models/bytedream

STEP 4: Upload to Hugging Face
-------------------------------
python upload_to_hf.py --repo_id "YourUsername/ByteDream" --create_space

Replace 'YourUsername' with your actual Hugging Face username.

STEP 5: Update app.py to use the uploaded model
------------------------------------------------
After uploading, modify app.py to load from Hugging Face:

```python
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("YourUsername/ByteDream")
```

TIPS:
-----
- Make sure your model directory contains the trained weights
- Use --private flag if you want to keep the model private
- The --create_space option creates files for Hugging Face Spaces deployment
- Check your repository at: https://huggingface.co/YourUsername

For more help, see:
- https://huggingface.co/docs/hub/spaces
- https://huggingface.co/docs/huggingface_hub/guides/cli
""")


def main():
    print("\n" + "="*60)
    print("Byte Dream - Quick Setup & Troubleshooting")
    print("="*60)

    # Check if model exists
    model_path = check_model_exists()

    # Test inference
    if model_path or True:  # Always test (can work without model)
        success = test_inference()

        if success:
            print("\n✓ Pipeline is working!")
            print("\nNext steps:")
            print("1. Train the model: python train.py")
            print("2. Or upload to Hugging Face (see guide below)")

    # Show upload guide
    upload_to_hf_guide()

    print("\n" + "="*60)
    print("Current status:")
    print("  - app.py has been fixed to handle missing models gracefully")
    print("  - You can now run: python app.py")
    print("  - Follow the upload guide above to deploy to Hugging Face")
    print("="*60)


if __name__ == "__main__":
    main()
train.py CHANGED
@@ -34,10 +34,19 @@ class ImageTextDataset(Dataset):
         center_crop: bool = True,
     ):
         self.data_dir = Path(data_dir)
+
+        # Check if directory exists
+        if not self.data_dir.exists():
+            raise FileNotFoundError(f"Dataset directory not found: {self.data_dir}\nPlease create the directory and add images, or use --train_data with a valid path.")
+
         self.image_paths = list(self.data_dir.glob("*.jpg")) + \
                            list(self.data_dir.glob("*.png")) + \
                            list(self.data_dir.glob("*.jpeg"))

+        # Check if there are any images
+        if len(self.image_paths) == 0:
+            raise ValueError(f"No images found in {self.data_dir}\nSupported formats: .jpg, .png, .jpeg")
+
         self.image_size = image_size
         self.random_flip = random_flip
         self.random_crop = random_crop

@@ -216,6 +225,8 @@ class LatentDiffusionTrainer:
         """Encode images to latent space"""
         with torch.no_grad():
             latents = self.vae.encode(images)
+            # Use only the mean part of the VAE output (first half of channels)
+            latents = latents[:, :4]  # Take first 4 channels (mean, not log_var)
             latents = latents * 0.18215  # Scale factor
         return latents