trainer: skip completed phases / resume mid-phase + update context
src/training/mindi_trainer.py:
- train(): skip phases with global_step >= phase.end_step
- train(): when resuming mid-phase, set _resume_step_offset so
train_phase() starts from the correct step instead of 0
- train_phase(): honor _resume_step_offset and clear after use
This fixes the resume bug surfaced in Session 2 where Phase 1 resume
restarted from step 0 instead of step 4250. Bug discovered on droplet
165.245.141.141 and fixed remotely; this commit lands the fix locally.
context.md: bring up to date with sessions 2-4
- Session 2 (Apr 16): rate-limit / git-clone data fix, all phases dry
run passed, checkpoint upload to HF, auto-push script
- Session 3 (Apr 19): Phase 1 finish on droplet 165.245.141.141
- Session 4 (Apr 30): frontend bugs, Gradio 5.x SSE v3, ZeroGPU quota,
agent system, training summary across MI300X + Modal A100
- Add bash history-expansion gotcha and data dir-already-exists fix
- Droplet history table
- context.md +270 -32
- src/training/mindi_trainer.py +13 -1
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# MINDI 1.5 Vision-Coder — Complete Project Context
|
| 2 |
|
| 3 |
-
> **Last updated:** April
|
| 4 |
> **Purpose:** This file contains ALL context needed to continue development with any AI assistant.
|
| 5 |
> It covers architecture decisions, errors encountered, fixes applied, training state, and exact next steps.
|
| 6 |
|
|
@@ -285,7 +285,7 @@ Also needed: `apt-get install -y git-lfs && git lfs install`
|
|
| 285 |
2. `echo 1 > /sys/bus/pci/devices/0000:83:00.0/reset` (PCI address from `lspci | grep AMD`)
|
| 286 |
3. If GPU% still 100%: `modprobe -r amdgpu && modprobe amdgpu`
|
| 287 |
4. Verify `rocm-smi` shows GPU% = 0% before restarting Docker
|
| 288 |
-
**Status:** Droplet was deleted.
|
| 289 |
|
| 290 |
### 6.8 HuggingFace Upload Limits
|
| 291 |
|
|
@@ -313,6 +313,7 @@ Also needed: `apt-get install -y git-lfs && git lfs install`
|
|
| 313 |
export HF_TOKEN=<your-hf-token> # Get from HF settings page
|
| 314 |
export HF_HUB_DISABLE_PROGRESS_BARS=1
|
| 315 |
export PYTORCH_ROCM_ARCH=gfx942
|
|
|
|
| 316 |
# DO NOT SET: HSA_OVERRIDE_GFX_VERSION (causes GPU hang on ROCm 7.0)
|
| 317 |
```
|
| 318 |
|
|
@@ -322,57 +323,56 @@ export PYTORCH_ROCM_ARCH=gfx942
|
|
| 322 |
# 1. SSH into droplet
|
| 323 |
ssh root@<DROPLET_IP>
|
| 324 |
|
| 325 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
| 326 |
docker start rocm
|
| 327 |
docker exec -it rocm /bin/bash
|
| 328 |
|
| 329 |
-
#
|
| 330 |
export HF_TOKEN=<your-hf-token> # Get from HF settings page
|
| 331 |
export HF_HUB_DISABLE_PROGRESS_BARS=1
|
| 332 |
export PYTORCH_ROCM_ARCH=gfx942
|
|
|
|
| 333 |
|
| 334 |
-
#
|
| 335 |
python3 -c "import torch; print('GPU:', torch.cuda.get_device_name(0)); x=torch.randn(100,device='cuda'); print('OK:', x.sum().item())"
|
| 336 |
|
| 337 |
-
#
|
| 338 |
apt-get update && apt-get install -y git-lfs
|
| 339 |
git lfs install
|
| 340 |
|
| 341 |
-
#
|
| 342 |
cd /workspace
|
| 343 |
git clone https://$HF_TOKEN:$HF_TOKEN@huggingface.co/Mindigenous/MINDI-1.5-Vision-Coder.git
|
| 344 |
cd MINDI-1.5-Vision-Coder
|
| 345 |
|
| 346 |
-
#
|
| 347 |
pip install -r requirements-training.txt
|
| 348 |
|
| 349 |
-
#
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
)
|
| 360 |
-
print('Data download complete!')
|
| 361 |
-
"
|
| 362 |
|
| 363 |
-
#
|
| 364 |
-
|
| 365 |
-
ls -la data/websight/
|
| 366 |
-
ls data/websight/images/ | head
|
| 367 |
|
| 368 |
-
#
|
| 369 |
python3 scripts/gpu_diagnostic.py
|
| 370 |
|
| 371 |
-
#
|
| 372 |
python3 scripts/train.py --dry_run --no_wandb
|
| 373 |
|
| 374 |
-
#
|
| 375 |
-
python3 scripts/train.py --no_wandb
|
| 376 |
```
|
| 377 |
|
| 378 |
### 7.4 GPU Hang Recovery (if it happens again)
|
|
@@ -388,6 +388,32 @@ rocm-smi # Should show 0% now
|
|
| 388 |
docker start rocm
|
| 389 |
```
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
---
|
| 392 |
|
| 393 |
## 8. HF DATASET REPO STRUCTURE
|
|
@@ -461,12 +487,30 @@ cdc806e Fix: register LLM as nn.Module submodule so optimizer finds LoRA params
|
|
| 461 |
|
| 462 |
## 11. WHAT REMAINS (TODO) ❌
|
| 463 |
|
| 464 |
-
1. **Complete WebSight upload to HF** —
|
| 465 |
2. **Full 3-phase dry run** — Phase 2 (WebSight) and Phase 3 (mixed) NOT yet tested with the vision pipeline
|
| 466 |
3. **Full production training** — 10,000 steps total (Phase 1: 5K, Phase 2: 2.5K, Phase 3: 2.5K)
|
| 467 |
4. **Inference testing** — Generate code from screenshots after training
|
| 468 |
5. **Commit `upload_websight_images.py` and `context.md`** — These new files need to be pushed
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
---
|
| 471 |
|
| 472 |
## 12. KNOWN ISSUES & GOTCHAS
|
|
@@ -548,8 +592,10 @@ When continuing with a new AI assistant:
|
|
| 548 |
```
|
| 549 |
6. **Spin up fresh MI300X droplet** on DigitalOcean
|
| 550 |
7. **Follow Section 7.3** for setup procedure
|
| 551 |
-
8. **
|
| 552 |
-
9. **
|
|
|
|
|
|
|
| 553 |
|
| 554 |
---
|
| 555 |
|
|
@@ -570,4 +616,196 @@ The `snapshot_download(local_dir='data')` call places everything correctly becau
|
|
| 570 |
|
| 571 |
---
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
*This context file was created on April 16, 2026 during Claude Opus 4.6 session to ensure project continuity.*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# MINDI 1.5 Vision-Coder — Complete Project Context
|
| 2 |
|
| 3 |
+
> **Last updated:** April 30, 2026 (Session 4)
|
| 4 |
> **Purpose:** This file contains ALL context needed to continue development with any AI assistant.
|
| 5 |
> It covers architecture decisions, errors encountered, fixes applied, training state, and exact next steps.
|
| 6 |
|
|
|
|
| 285 |
2. `echo 1 > /sys/bus/pci/devices/0000:83:00.0/reset` (PCI address from `lspci | grep AMD`)
|
| 286 |
3. If GPU% still 100%: `modprobe -r amdgpu && modprobe amdgpu`
|
| 287 |
4. Verify `rocm-smi` shows GPU% = 0% before restarting Docker
|
| 288 |
+
**Status:** Droplet was deleted. Session 2 is on `134.199.197.198`.
|
| 289 |
|
| 290 |
### 6.8 HuggingFace Upload Limits
|
| 291 |
|
|
|
|
| 313 |
export HF_TOKEN=<your-hf-token> # Get from HF settings page
|
| 314 |
export HF_HUB_DISABLE_PROGRESS_BARS=1
|
| 315 |
export PYTORCH_ROCM_ARCH=gfx942
|
| 316 |
+
export TOKENIZERS_PARALLELISM=false
|
| 317 |
# DO NOT SET: HSA_OVERRIDE_GFX_VERSION (causes GPU hang on ROCm 7.0)
|
| 318 |
```
|
| 319 |
|
|
|
|
| 323 |
# 1. SSH into droplet
|
| 324 |
ssh root@<DROPLET_IP>
|
| 325 |
|
| 326 |
+
# 2. Verify GPU health on host (must show 0% GPU)
|
| 327 |
+
rocm-smi
|
| 328 |
+
|
| 329 |
+
# 3. Start Docker
|
| 330 |
docker start rocm
|
| 331 |
docker exec -it rocm /bin/bash
|
| 332 |
|
| 333 |
+
# 4. Set environment (inside Docker)
|
| 334 |
export HF_TOKEN=<your-hf-token> # Get from HF settings page
|
| 335 |
export HF_HUB_DISABLE_PROGRESS_BARS=1
|
| 336 |
export PYTORCH_ROCM_ARCH=gfx942
|
| 337 |
+
export TOKENIZERS_PARALLELISM=false
|
| 338 |
|
| 339 |
+
# 5. Quick GPU test
|
| 340 |
python3 -c "import torch; print('GPU:', torch.cuda.get_device_name(0)); x=torch.randn(100,device='cuda'); print('OK:', x.sum().item())"
|
| 341 |
|
| 342 |
+
# 6. Install git-lfs (ignore AMD artifactory DNS warning — harmless)
|
| 343 |
apt-get update && apt-get install -y git-lfs
|
| 344 |
git lfs install
|
| 345 |
|
| 346 |
+
# 7. Clone code repo
|
| 347 |
cd /workspace
|
| 348 |
git clone https://$HF_TOKEN:$HF_TOKEN@huggingface.co/Mindigenous/MINDI-1.5-Vision-Coder.git
|
| 349 |
cd MINDI-1.5-Vision-Coder
|
| 350 |
|
| 351 |
+
# 8. Install requirements
|
| 352 |
pip install -r requirements-training.txt
|
| 353 |
|
| 354 |
+
# 9. Download training data from HF dataset repo
|
| 355 |
+
# NOTE: Use git clone, NOT snapshot_download (which hits HTTP 429 rate limits)
|
| 356 |
+
# NOTE: Must rm -rf data first — code repo creates an empty data/ directory
|
| 357 |
+
rm -rf data
|
| 358 |
+
git clone https://$HF_TOKEN:$HF_TOKEN@huggingface.co/datasets/Mindigenous/MINDI-1.5-training-data data
|
| 359 |
+
|
| 360 |
+
# 10. Verify data
|
| 361 |
+
wc -l data/processed/train.jsonl data/processed/val.jsonl
|
| 362 |
+
wc -l data/websight/train.jsonl data/websight/val.jsonl
|
| 363 |
+
for d in data/websight/images/0*/; do echo "$d: $(ls $d | wc -l) files"; done
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
+
# 11. Create output directories
|
| 366 |
+
mkdir -p checkpoints/training checkpoints/best logs/training
|
|
|
|
|
|
|
| 367 |
|
| 368 |
+
# 12. Run GPU diagnostic
|
| 369 |
python3 scripts/gpu_diagnostic.py
|
| 370 |
|
| 371 |
+
# 13. Dry run (test all 3 phases before full training)
|
| 372 |
python3 scripts/train.py --dry_run --no_wandb
|
| 373 |
|
| 374 |
+
# 14. Full training (background, survives SSH disconnect)
|
| 375 |
+
nohup python3 scripts/train.py --no_wandb > /workspace/training.log 2>&1 &
|
| 376 |
```
|
| 377 |
|
| 378 |
### 7.4 GPU Hang Recovery (if it happens again)
|
|
|
|
| 388 |
docker start rocm
|
| 389 |
```
|
| 390 |
|
| 391 |
+
### 6.9 HuggingFace snapshot_download Rate Limit (HTTP 429)
|
| 392 |
+
|
| 393 |
+
**Symptom:** `HTTP Error 429 thrown while requesting GET .../tree/main` during `snapshot_download()`. Retries endlessly.
|
| 394 |
+
**Root cause:** The dataset has 52,500+ image files. `snapshot_download` paginates through the HF tree API listing all files, causing rate limiting.
|
| 395 |
+
**Fix:** Use `git clone` instead of `snapshot_download` for the dataset:
|
| 396 |
+
```bash
|
| 397 |
+
rm -rf data
|
| 398 |
+
git clone https://$HF_TOKEN:$HF_TOKEN@huggingface.co/datasets/Mindigenous/MINDI-1.5-training-data data
|
| 399 |
+
```
|
| 400 |
+
This downloads everything in a single git connection without hitting the API rate limiter.
|
| 401 |
+
**Discovered:** April 16, 2026 — Session 2
|
| 402 |
+
|
| 403 |
+
### 6.10 Bash History Expansion with Exclamation Mark
|
| 404 |
+
|
| 405 |
+
**Symptom:** `bash: !': event not found` when running `python3 -c "...print('Done!')"` in a single line.
|
| 406 |
+
**Root cause:** Bash interprets `!'` inside double quotes as history expansion.
|
| 407 |
+
**Fix:** Use multi-line python commands (with actual newlines between double quotes) instead of single-line. Or use single quotes around the python code.
|
| 408 |
+
**Discovered:** April 16, 2026 — Session 2
|
| 409 |
+
|
| 410 |
+
### 6.11 Data Directory Already Exists on Clone
|
| 411 |
+
|
| 412 |
+
**Symptom:** `fatal: destination path 'data' already exists and is not an empty directory` when trying to `git clone ... data`.
|
| 413 |
+
**Root cause:** The code repo clone creates an empty `data/` directory structure.
|
| 414 |
+
**Fix:** `rm -rf data` before cloning the dataset repo.
|
| 415 |
+
**Discovered:** April 16, 2026 — Session 2
|
| 416 |
+
|
| 417 |
---
|
| 418 |
|
| 419 |
## 8. HF DATASET REPO STRUCTURE
|
|
|
|
| 487 |
|
| 488 |
## 11. WHAT REMAINS (TODO) ❌
|
| 489 |
|
| 490 |
+
1. ~~**Complete WebSight upload to HF**~~ — Check if subdirs 04 and 05 are uploaded; re-run upload script if needed
|
| 491 |
2. **Full 3-phase dry run** — Phase 2 (WebSight) and Phase 3 (mixed) NOT yet tested with the vision pipeline
|
| 492 |
3. **Full production training** — 10,000 steps total (Phase 1: 5K, Phase 2: 2.5K, Phase 3: 2.5K)
|
| 493 |
4. **Inference testing** — Generate code from screenshots after training
|
| 494 |
5. **Commit `upload_websight_images.py` and `context.md`** — These new files need to be pushed
|
| 495 |
|
| 496 |
+
### Session 2 Status (April 16, 2026)
|
| 497 |
+
- ✅ Fresh droplet spun up at `134.199.197.198`
|
| 498 |
+
- ✅ Docker container started, GPU healthy (0% util, 45°C)
|
| 499 |
+
- ✅ Code repo cloned, dependencies installed
|
| 500 |
+
- ✅ GPU diagnostic: All 6 tests passed (bf16 matmul, 1GB alloc, forward pass)
|
| 501 |
+
- ⚠️ Data download: multiple rate limits (snapshot_download → git clone → git-lfs → hf_hub_download retries)
|
| 502 |
+
- ✅ All data downloaded: 1.3M text + 50K WebSight JSONL + 52,500 images
|
| 503 |
+
- ✅ Phase 1 dry run PASSED: loss 18.87 → 8.05 in 10 steps (10.8 min)
|
| 504 |
+
- ✅ Phase 2 dry run PASSED: loss 1.46 → 1.19, val_loss 1.32 in 10 steps (6.2 min)
|
| 505 |
+
- ✅ Phase 3 dry run PASSED: loss 14.10 → 9.71, val_loss 9.72 in 10 steps (8.2 min)
|
| 506 |
+
- ✅ Checkpoint upload to HF fixed (.gitignore was blocking *.pt, *.safetensors — removed model file patterns)
|
| 507 |
+
- ✅ Auto-push script running (pushes latest checkpoint to HF every 2 hours — fixed alphabetic sorting bug)
|
| 508 |
+
- ✅ Resume bug fixed: train() now skips completed phases and resumes mid-phase correctly
|
| 509 |
+
- ⏳ Phase 1 training: step 4500/5000, val_loss 0.5372 — on 3rd droplet (165.245.141.141)
|
| 510 |
+
- ⏳ Image download running: ~8300/52500 images (needed for Phase 2)
|
| 511 |
+
- 💰 Budget: ~$91 on current account, more accounts available
|
| 512 |
+
- 📋 Plan: finish Phase 1 → Phase 2 → Phase 3, auto-push checkpoints to HF
|
| 513 |
+
|
| 514 |
---
|
| 515 |
|
| 516 |
## 12. KNOWN ISSUES & GOTCHAS
|
|
|
|
| 592 |
```
|
| 593 |
6. **Spin up fresh MI300X droplet** on DigitalOcean
|
| 594 |
7. **Follow Section 7.3** for setup procedure
|
| 595 |
+
8. **IMPORTANT:** Use `git clone` for data download (NOT `snapshot_download` — see Section 6.9)
|
| 596 |
+
9. **IMPORTANT:** `rm -rf data` before cloning dataset repo (see Section 6.11)
|
| 597 |
+
10. **Run dry run first** to verify all 3 phases work
|
| 598 |
+
11. **Then full training** — `nohup python3 scripts/train.py --no_wandb > /workspace/training.log 2>&1 &`
|
| 599 |
|
| 600 |
---
|
| 601 |
|
|
|
|
| 616 |
|
| 617 |
---
|
| 618 |
|
| 619 |
+
## 16. APRIL 16, 2026 — MAIN TRAINING COMMANDS
|
| 620 |
+
|
| 621 |
+
### Data Download (git clone — NOT snapshot_download)
|
| 622 |
+
|
| 623 |
+
```bash
|
| 624 |
+
# Inside Docker, after cloning code repo:
|
| 625 |
+
rm -rf data
|
| 626 |
+
git clone https://$HF_TOKEN:$HF_TOKEN@huggingface.co/datasets/Mindigenous/MINDI-1.5-training-data data
|
| 627 |
+
```
|
| 628 |
+
|
| 629 |
+
### Training — Background (Recommended, survives SSH disconnect)
|
| 630 |
+
|
| 631 |
+
```bash
|
| 632 |
+
# From inside Docker:
|
| 633 |
+
nohup python3 scripts/train.py --no_wandb > /workspace/training.log 2>&1 &
|
| 634 |
+
echo $! > /workspace/training.pid
|
| 635 |
+
```
|
| 636 |
+
|
| 637 |
+
Or from the **host** (also survives SSH disconnect):
|
| 638 |
+
|
| 639 |
+
```bash
|
| 640 |
+
docker exec -d rocm bash -lc 'cd /workspace/MINDI-1.5-Vision-Coder && export HF_TOKEN=<your-hf-token> && export PYTORCH_ROCM_ARCH=gfx942 && python3 scripts/train.py --no_wandb > /workspace/training.log 2>&1'
|
| 641 |
+
```
|
| 642 |
+
|
| 643 |
+
### Training — Interactive (Foreground)
|
| 644 |
+
|
| 645 |
+
```bash
|
| 646 |
+
python3 scripts/train.py --no_wandb 2>&1 | tee /workspace/training.log
|
| 647 |
+
```
|
| 648 |
+
|
| 649 |
+
### Monitoring
|
| 650 |
+
|
| 651 |
+
```bash
|
| 652 |
+
docker exec rocm tail -f /workspace/training.log # Live logs
|
| 653 |
+
docker exec rocm rocm-smi # GPU usage
|
| 654 |
+
docker exec rocm ps aux | grep train.py # Process check
|
| 655 |
+
```
|
| 656 |
+
|
| 657 |
+
Notes:
|
| 658 |
+
- Use the background command if you want the process detached from your SSH session.
|
| 659 |
+
- The `scripts/train.py` launcher does not accept a `--log_file` flag; redirect output into `/workspace/training.log` instead.
|
| 660 |
+
- Line-buffered stdout has been added to `src/training/mindi_trainer.py` so logs should appear in near real-time when using `tail -f`.
|
| 661 |
+
|
| 662 |
+
## 17. DROPLET HISTORY
|
| 663 |
+
|
| 664 |
+
| Session | Date | Droplet IP | Status | Notes |
|
| 665 |
+
|---------|------|-----------|--------|-------|
|
| 666 |
+
| 1 | April 15, 2026 | `134.199.194.245` | Deleted | Phase 1 dry run passed. GPU hung during heavy I/O. |
|
| 667 |
+
| 2 | April 16, 2026 | `134.199.197.198` | Deleted | Phase 1 steps 0→4250 completed. Credits exhausted. |
|
| 668 |
+
| 3 | April 19, 2026 | `165.245.141.141` | Active | Phase 1 resumed at step 4250. Resume bug fixed. |
|
| 669 |
+
|
| 670 |
+
---
|
| 671 |
+
|
| 672 |
*This context file was created on April 16, 2026 during Claude Opus 4.6 session to ensure project continuity.*
|
| 673 |
+
*Updated on April 16, 2026 — Session 2: snapshot_download 429 fix, bash escaping, fresh droplet setup.*
|
| 674 |
+
*Updated on April 28, 2026 — Training complete, frontend built, API deployed.*
|
| 675 |
+
*Updated on April 30, 2026 — Session 4: Fixed critical frontend bugs, Gradio 5.x API protocol, ZeroGPU quota handling.*
|
| 676 |
+
|
| 677 |
+
---
|
| 678 |
+
|
| 679 |
+
## 18. SESSION 4 — April 30, 2026
|
| 680 |
+
|
| 681 |
+
### Bugs Found & Fixed
|
| 682 |
+
|
| 683 |
+
**Bug 6.12: `handleSend` ReferenceError (app.js)**
|
| 684 |
+
- **Symptom:** Agent integration broken on page load — `const _originalSend = handleSend` throws ReferenceError because `handleSend` was never defined (the actual function is `send`)
|
| 685 |
+
- **Fix:** Changed to `let activeSend = send` pattern — init() overrides `activeSend = handleSendWithAgent` when MINDIAgent is available. Eliminated duplicate keydown event handlers.
|
| 686 |
+
- **File:** `frontend/app.js`
|
| 687 |
+
|
| 688 |
+
**Bug 6.13: Gradio 5.x API protocol mismatch**
|
| 689 |
+
- **Symptom:** `POST /api/predict` returns 404 — the frontend used old Gradio 3.x API format
|
| 690 |
+
- **Root cause:** HF Space runs Gradio 5.23.0 which uses SSE v3 protocol with `/gradio_api/call/{api_name}` (two-step: POST to submit → GET to stream result)
|
| 691 |
+
- **Fix:** Rewrote `callGenerate()` to use the Gradio 5.x two-step flow: POST `/gradio_api/call/chat_fn` → get event_id → GET `/gradio_api/call/chat_fn/{event_id}` → parse SSE response for `event: complete` data
|
| 692 |
+
- **File:** `frontend/app.js`
|
| 693 |
+
- **Config reference:** `GET /config` returns `{"api_prefix": "/gradio_api", "protocol": "sse_v3", "dependencies": [{"api_name": "chat_fn"}]}`
|
| 694 |
+
|
| 695 |
+
**Bug 6.14: Health check misdetects Gradio Space as offline**
|
| 696 |
+
- **Symptom:** Status shows "Demo Mode" even when Space is running
|
| 697 |
+
- **Root cause:** `pingHealth()` tried `/api/health` (doesn't exist on Gradio) then `/api/predict` (old format → 404)
|
| 698 |
+
- **Fix:** For HF Spaces, use `fetch(base, {mode:'no-cors'})` which succeeds if the Space is reachable
|
| 699 |
+
- **File:** `frontend/app.js`
|
| 700 |
+
|
| 701 |
+
**Improvement: ZeroGPU quota error handling**
|
| 702 |
+
- Reduced `@spaces.GPU(duration=120)` → `@spaces.GPU(duration=60)` (inference is fast after model cache)
|
| 703 |
+
- Added try-except in `chat_fn()` to return clean JSON error instead of crashing when GPU quota exceeded
|
| 704 |
+
- **File:** `hf_space/app.py`
|
| 705 |
+
|
| 706 |
+
### Session 4 Status
|
| 707 |
+
- ✅ Frontend bugs fixed (handleSend reference, duplicate handlers)
|
| 708 |
+
- ✅ Gradio 5.x API protocol implemented (SSE v3 two-step flow)
|
| 709 |
+
- ✅ Health check fixed — shows green "MINDI · HF Space" status
|
| 710 |
+
- ✅ Space updated on HF — `Mindigenous/mindi-chat`
|
| 711 |
+
- ⚠️ ZeroGPU daily quota limit can block visitors — PRO users get 8x more quota
|
| 712 |
+
- ✅ Agent system (agent.js + sandbox.js) scaffolded — Plan→Generate→Execute→Verify→Fix loop
|
| 713 |
+
- 📋 Next: Wait for quota reset, then test full end-to-end flow with real model inference
|
| 714 |
+
|
| 715 |
+
### Training Summary
|
| 716 |
+
All 3 phases of MINDI 1.5 Vision-Coder training are COMPLETE:
|
| 717 |
+
|
| 718 |
+
| Phase | Steps | Status | Platform |
|
| 719 |
+
|-------|-------|--------|----------|
|
| 720 |
+
| Phase 1 (LoRA) | 5,000 | ✅ Complete | DigitalOcean MI300X |
|
| 721 |
+
| Phase 2 (Vision Bridge) | 2,500 | ✅ Complete | DigitalOcean MI300X |
|
| 722 |
+
| Phase 3 (Joint) steps 0-1500 | 1,500 | ✅ Complete | DigitalOcean MI300X |
|
| 723 |
+
| Phase 3 (Joint) steps 1500-2500 | 1,000 | ✅ Complete | Modal A100-40GB |
|
| 724 |
+
|
| 725 |
+
### Modal Training Details
|
| 726 |
+
- Resumed from step 1500 checkpoint on Modal A100-40GB ($2.10/hr)
|
| 727 |
+
- Config patched at runtime: batch_size=2, max_length=2048 (from 6/4096)
|
| 728 |
+
- Total Modal cost: ~$28 ($30 credits)
|
| 729 |
+
- Final loss: 0.25–0.40 range
|
| 730 |
+
|
| 731 |
+
### HuggingFace Checkpoints (Mindigenous/MINDI-1.5-Vision-Coder)
|
| 732 |
+
All checkpoints uploaded to `checkpoints/` directory:
|
| 733 |
+
- Phase 1: 16 checkpoints (step250 → step5000)
|
| 734 |
+
- Phase 2: 10 checkpoints (step250 → step2500)
|
| 735 |
+
- Phase 3: `phase3_all_step500`, `step1000`, `step1500`, `step2000`, `phase3_all_step2500_final`, `phase3_final`
|
| 736 |
+
|
| 737 |
+
### Model Test Results (April 28, 2026)
|
| 738 |
+
- ✅ Code generation (text-only): Matrix exponentiation fibonacci
|
| 739 |
+
- ✅ HTML/CSS generation: Gradient + responsive design
|
| 740 |
+
- ✅ Vision (image input): Processed dummy image
|
| 741 |
+
- ✅ Agentic (bug fix): Identified subtraction→addition bug
|
| 742 |
+
- VRAM usage: 17.2 GB (A100-40GB)
|
| 743 |
+
|
| 744 |
+
---
|
| 745 |
+
|
| 746 |
+
## 19. FRONTEND
|
| 747 |
+
|
| 748 |
+
### Location: `frontend/`
|
| 749 |
+
- `index.html` — Three-panel layout (sidebar + chat + code preview)
|
| 750 |
+
- `styles.css` — Premium dark theme with purple/blue gradients
|
| 751 |
+
- `app.js` — Chat logic, image upload, code extraction, demo mode
|
| 752 |
+
|
| 753 |
+
### Features
|
| 754 |
+
- Chat interface with code block rendering (Prism.js)
|
| 755 |
+
- Image upload for vision-to-code
|
| 756 |
+
- Code preview panel with tabs (Code / Preview / Sections)
|
| 757 |
+
- Special token parsing (thinking, critique, fix, error)
|
| 758 |
+
- Demo mode (works without API — simulated responses)
|
| 759 |
+
- Settings modal (double-click MINDI logo) to configure API endpoint
|
| 760 |
+
- Responsive design (mobile + desktop)
|
| 761 |
+
|
| 762 |
+
### To Run Locally
|
| 763 |
+
```bash
|
| 764 |
+
cd frontend
|
| 765 |
+
python -m http.server 8080
|
| 766 |
+
# Open http://localhost:8080
|
| 767 |
+
```
|
| 768 |
+
|
| 769 |
+
---
|
| 770 |
+
|
| 771 |
+
## 20. MODAL API SERVER
|
| 772 |
+
|
| 773 |
+
### File: `modal_api.py`
|
| 774 |
+
FastAPI web endpoint that:
|
| 775 |
+
1. Loads MINDI 1.5 from volume checkpoint on container startup
|
| 776 |
+
2. Exposes `/api/generate` (POST) and `/api/health` (GET)
|
| 777 |
+
3. Accepts text + optional base64 image
|
| 778 |
+
4. Returns response + parsed special token sections
|
| 779 |
+
5. CORS enabled for frontend
|
| 780 |
+
|
| 781 |
+
### Deployment
|
| 782 |
+
```bash
|
| 783 |
+
modal deploy modal_api.py
|
| 784 |
+
# Returns a URL like: https://mindigenous-ai--mindi-api-api.modal.run
|
| 785 |
+
```
|
| 786 |
+
|
| 787 |
+
### Cost
|
| 788 |
+
- A100 @ $2.10/hr, scales to zero when idle
|
| 789 |
+
- ~$0.01-0.05 per request
|
| 790 |
+
- Container idle timeout: 5 minutes
|
| 791 |
+
|
| 792 |
+
### Connect Frontend to API
|
| 793 |
+
1. Open frontend at http://localhost:8080
|
| 794 |
+
2. Double-click the MINDI logo (top-left sidebar)
|
| 795 |
+
3. Enter the Modal API URL
|
| 796 |
+
4. Save settings
|
| 797 |
+
|
| 798 |
+
---
|
| 799 |
+
|
| 800 |
+
## 21. REMAINING BUDGET & NEXT STEPS
|
| 801 |
+
|
| 802 |
+
### Budget
|
| 803 |
+
- Modal: $2.21 remaining (~1 hour A100 time)
|
| 804 |
+
- DigitalOcean: exhausted
|
| 805 |
+
|
| 806 |
+
### Next Steps
|
| 807 |
+
1. Deploy API when more credits available
|
| 808 |
+
2. Host frontend on Vercel/GitHub Pages (free)
|
| 809 |
+
3. Consider HuggingFace Spaces (free T4) with 4-bit quantization as alternative
|
| 810 |
+
4. Push frontend to GitHub/HF repos
|
| 811 |
+
|
|
@@ -528,7 +528,10 @@ class MINDITrainer:
|
|
| 528 |
|
| 529 |
self.model.train()
|
| 530 |
phase_steps = phase.end_step - phase.start_step
|
| 531 |
-
step_in_phase = 0
|
|
|
|
|
|
|
|
|
|
| 532 |
accum_loss = 0.0
|
| 533 |
accum_count = 0
|
| 534 |
phase_start_time = time.time()
|
|
@@ -679,6 +682,15 @@ class MINDITrainer:
|
|
| 679 |
phase_summaries = []
|
| 680 |
|
| 681 |
for phase in self.config.phases:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
summary = self.train_phase(phase)
|
| 683 |
phase_summaries.append(summary)
|
| 684 |
|
|
|
|
| 528 |
|
| 529 |
self.model.train()
|
| 530 |
phase_steps = phase.end_step - phase.start_step
|
| 531 |
+
step_in_phase = getattr(self, '_resume_step_offset', 0)
|
| 532 |
+
if step_in_phase > 0:
|
| 533 |
+
print(f" [{phase.name}] Resuming from step {step_in_phase}/{phase_steps}")
|
| 534 |
+
self._resume_step_offset = 0 # Clear after use
|
| 535 |
accum_loss = 0.0
|
| 536 |
accum_count = 0
|
| 537 |
phase_start_time = time.time()
|
|
|
|
| 682 |
phase_summaries = []
|
| 683 |
|
| 684 |
for phase in self.config.phases:
|
| 685 |
+
# Skip completed phases on resume
|
| 686 |
+
if self.global_step >= phase.end_step:
|
| 687 |
+
print(f" Skipping {phase.name} (already completed, global_step={self.global_step})")
|
| 688 |
+
continue
|
| 689 |
+
# Resume mid-phase: calculate how many steps are already done
|
| 690 |
+
if self.global_step > phase.start_step:
|
| 691 |
+
done_in_phase = self.global_step - phase.start_step
|
| 692 |
+
self._resume_step_offset = done_in_phase
|
| 693 |
+
print(f" Resuming {phase.name} at step {done_in_phase}/{phase.end_step - phase.start_step}")
|
| 694 |
summary = self.train_phase(phase)
|
| 695 |
phase_summaries.append(summary)
|
| 696 |
|