diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..4a87953d8c5f484b012a04c086ba9e1004a47a28 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,10 @@ +*.zip filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +output/checkpoints/* filter=lfs diff=lfs merge=lfs -text +checkpoints/** filter=lfs diff=lfs merge=lfs -text +models/** filter=lfs diff=lfs merge=lfs -text +data/** filter=lfs diff=lfs merge=lfs -text +artifacts/** filter=lfs diff=lfs merge=lfs -text +logs/** filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f3d10e4b1ccace3ca659e5ba8b0720d9a3f886b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Ignore Python cache and compiled files. +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Ignore virtual environment. +.venv/ + +# Ignore logs and temporary outputs. +logs/ +artifacts/ +*.log + +# Ignore model weights and checkpoints by default. +checkpoints/ +models/base/ +models/lora/ +models/quantized/ + +# Ignore data files by default. +data/raw/ +data/interim/ +data/processed/ +data/external/ + +# Ignore notebook checkpoints. +.ipynb_checkpoints/ + diff --git a/CONTEXT_SUMMARY.md b/CONTEXT_SUMMARY.md new file mode 100644 index 0000000000000000000000000000000000000000..efc424dce25bb0ec672fdd64cfe5eefd3b8aa7d5 --- /dev/null +++ b/CONTEXT_SUMMARY.md @@ -0,0 +1,38 @@ +# Project Context Summary + +This file captures the current state of work from the active collaboration session. + +## Environment +- Original project path: `D:\Desktop 31st Jan 2026\MIND-AI-MODEL` +- Target copy path requested: `C:\AI 2` +- OS: Windows +- GPU: NVIDIA RTX 4060 Laptop (8GB VRAM) + +## Completed Components +1. Component 1 (Project setup): completed and verified. +2. Component 2 (Custom tokenizer): completed and verified. +3. 
Component 3 (Dataset pipeline): completed and verified. +4. Component 3 final-step reprocess fix: completed and verified, with JS rebalance. +5. Component 4 (420M transformer architecture): completed and verified. + +## Current Dataset Stats +- Total processed records: 139,531 +- Python: 115,572 +- JavaScript: 23,959 + +## Current Model Architecture +- Preset: `medium_420m` +- Parameters: 423,934,848 +- Verified forward pass on GPU successful. + +## Key Files +- `configs/component4_model_config.yaml` +- `src/model_architecture/code_transformer.py` +- `scripts/build_component4_model.py` +- `scripts/verify_component4_model.py` +- `data/processed/train_tokenized.jsonl` +- `data/processed/pipeline_stats.json` + +## Next Planned Component +- Component 5: Training pipeline with FP16, gradient checkpointing, gradient accumulation, checkpointing every 100 steps, resume support, early stopping, and live training metrics. + diff --git a/README_COMPONENT_1_SETUP.md b/README_COMPONENT_1_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..b68216aac55345f90d510408ae12fdb4d4057daf --- /dev/null +++ b/README_COMPONENT_1_SETUP.md @@ -0,0 +1,83 @@ +# Component 1: Project Setup (Windows + RTX 4060 8GB) + +## What This Component Does +- Creates a clean folder structure for the full coding-assistant project. +- Sets up a Python virtual environment. +- Installs all core dependencies needed across Components 2-10. +- Verifies that Python, PyTorch, CUDA visibility, and key libraries work. 
+ +## Folder Structure Created +- `data/raw` -> raw datasets you will provide later +- `data/interim` -> temporary cleaned data +- `data/processed` -> training-ready tokenized data +- `data/external` -> any third-party resources +- `src/tokenizer` -> Component 2 code tokenizer +- `src/dataset_pipeline` -> Component 3 preprocessing pipeline +- `src/model_architecture` -> Component 4 transformer code +- `src/training_pipeline` -> Component 5 training loop +- `src/evaluation_system` -> Component 6 evaluation code +- `src/inference_engine` -> Component 7 inference code +- `src/chat_interface` -> Component 8 Gradio interface +- `src/finetuning_system` -> Component 9 LoRA fine-tuning +- `src/export_optimization` -> Component 10 quantization/export tools +- `configs` -> config files for all components +- `scripts` -> setup, verification, and utility scripts +- `tests` -> quick checks for each component +- `checkpoints` -> model checkpoints saved during training +- `models/base` -> base trained model files +- `models/lora` -> LoRA adapters +- `models/quantized` -> optimized quantized models +- `artifacts` -> generated reports, metrics, and outputs +- `logs` -> training and runtime logs + +## Exact Commands To Run (in this order) +Run from: +`D:\Desktop 31st Jan 2026\MIND-AI-MODEL` + +0. Install Python 3.11 (required for package compatibility): +- Download page: https://www.python.org/downloads/release/python-3119/ +- Windows installer file: `python-3.11.9-amd64.exe` +- During install, check: `Add python.exe to PATH` + +1. Allow script execution for this terminal only: +```powershell +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass +``` + +2. If you already attempted setup once, remove old virtual environment first: +```powershell +if (Test-Path .\.venv) { Remove-Item -Recurse -Force .\.venv } +``` + +3. Create folders, virtual env, install dependencies: +```powershell +.\scripts\setup_windows_environment.ps1 +``` + +4. 
Activate virtual environment: +```powershell +.\.venv\Scripts\Activate.ps1 +``` + +5. Verify setup: +```powershell +python .\scripts\verify_component1_setup.py +``` + +## Expected Verification Result +- Prints Python version +- Prints PyTorch version +- Shows whether CUDA is available +- Shows GPU name if available +- Confirms critical libraries import correctly + +Note: +- `codebleu` is excluded from base install on Windows due to a `tree-sitter` dependency conflict on Python 3.11. +- Component 6 will use Windows-stable evaluation metrics and add code-quality checks without breaking setup. +- `bitsandbytes` is optional on native Windows because some CUDA/driver combinations fail to load its DLL. +- Base setup and all early components continue without it. +- For Component 5, we will: + - try `bitsandbytes` if available, and + - automatically fall back to a stable optimizer on your machine if it is not. + +If verification fails, copy the full terminal output and share it with me. diff --git a/README_COMPONENT_3_DATASET_PIPELINE.md b/README_COMPONENT_3_DATASET_PIPELINE.md new file mode 100644 index 0000000000000000000000000000000000000000..a729bca8b87db6fd5709704439e6461dff7b9f35 --- /dev/null +++ b/README_COMPONENT_3_DATASET_PIPELINE.md @@ -0,0 +1,46 @@ +# Component 3: Dataset Pipeline + +## What This Component Does (Simple English) +- Downloads the 3 datasets directly from Hugging Face (no manual download files). +- Reads them in streaming mode so your RAM usage stays low. +- Cleans prompt/code text. +- Removes low-quality and likely auto-generated data. +- Removes duplicate prompt+code pairs using a disk-backed SQLite index. +- Detects language (Python or JavaScript) when unclear. +- Tokenizes all cleaned records using the Component 2 tokenizer. +- Saves training-ready tokenized JSONL output. 
+ +## Files Created By This Component +- `configs/component3_dataset_pipeline.yaml` +- `src/dataset_pipeline/hf_dataset_pipeline.py` +- `scripts/run_component3_dataset_pipeline.py` +- `scripts/verify_component3_dataset_pipeline.py` + +## Required Before Running +- Component 2 tokenizer must exist at: + - `artifacts/tokenizer/code_tokenizer_v1/tokenizer.json` + - `artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json` + +## Quick Verification Run (small test) +Run from project root: +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\verify_component3_dataset_pipeline.py +``` + +This uses `200` records per dataset for a smoke test. + +## Full Pipeline Run +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\run_component3_dataset_pipeline.py --config .\configs\component3_dataset_pipeline.yaml +``` + +## Output Files +- Clean merged dataset: + - `data/interim/combined_clean.jsonl` +- Tokenized training dataset: + - `data/processed/train_tokenized.jsonl` +- Stats summary: + - `data/processed/pipeline_stats.json` + diff --git a/README_COMPONENT_4_MODEL_ARCHITECTURE.md b/README_COMPONENT_4_MODEL_ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..369e94f46af3dcb85d736d7fc3f83d18f8268f03 --- /dev/null +++ b/README_COMPONENT_4_MODEL_ARCHITECTURE.md @@ -0,0 +1,28 @@ +# Component 4: Model Architecture (420M Starter) + +## What This Component Builds +- A decoder-only transformer language model for code generation. +- Configurable size through YAML config. +- Presets for small, medium (420M target), and large. +- Attention + rotary positional encoding + feed-forward blocks. 
+ +## Main Files +- `src/model_architecture/code_transformer.py` +- `configs/component4_model_config.yaml` +- `scripts/build_component4_model.py` +- `scripts/verify_component4_model.py` + +## Commands (run from project root) +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\build_component4_model.py --config .\configs\component4_model_config.yaml +python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256 +``` + +## What Success Looks Like +- Build script prints parameter count near the 420M target. +- Verify script prints: + - VRAM usage at multiple stages + - output tensor shape + - `Component 4 verification passed.` + diff --git a/README_COMPONENT_5_TRAINING_PIPELINE.md b/README_COMPONENT_5_TRAINING_PIPELINE.md new file mode 100644 index 0000000000000000000000000000000000000000..a313c88dbfb239a43ebeb1ca7d0d68393c04a8ff --- /dev/null +++ b/README_COMPONENT_5_TRAINING_PIPELINE.md @@ -0,0 +1,42 @@ +# Component 5: Training Pipeline + +## What This Component Does +- Trains the 420M transformer on tokenized data. +- Uses FP16 mixed precision to reduce VRAM. +- Uses gradient checkpointing to save memory. +- Uses gradient accumulation for larger effective batch size. +- Attempts Adam8bit optimizer when available, otherwise safely falls back. +- Saves checkpoint every 100 steps by default. +- Supports resuming from latest checkpoint. +- Evaluates periodically and supports early stopping. +- Shows live loss, LR, ETA, and VRAM. 
+ +## Main Files +- `configs/component5_training_config.yaml` +- `src/training_pipeline/tokenized_dataset.py` +- `scripts/train_component5.py` +- `scripts/verify_component5_training_pipeline.py` + +## Commands +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\verify_component5_training_pipeline.py +python .\scripts\train_component5.py --config .\configs\component5_training_config.yaml +``` + +## VRAM and Runtime (RTX 4060 8GB) +- Expected VRAM during training with default config: about 5.8 to 6.9 GB. +- Safety stop is enabled at 7.0 GB. +- Approx training time for 1 epoch equivalent: ~30 to 65 hours. + +## Common Failures and Fixes +1. OOM or VRAM threshold hit: + - Reduce `max_seq_len` (e.g., 512 -> 384). + - Increase `grad_accum_steps`. +2. Training too slow: + - Lower `max_seq_len` for first run. + - Keep `micro_batch_size=1` and adjust accumulation. +3. Resume issues: + - Ensure `checkpoints/component5_420m/latest.pt` exists. +4. Validation not improving: + - Lower LR and increase warmup. diff --git a/README_COMPONENT_8_CHAT_INTERFACE.md b/README_COMPONENT_8_CHAT_INTERFACE.md new file mode 100644 index 0000000000000000000000000000000000000000..59eeb59917095ede4e15bb975209beb3450950e6 --- /dev/null +++ b/README_COMPONENT_8_CHAT_INTERFACE.md @@ -0,0 +1,20 @@ +# Component 8: Local Chat Interface + +## What it gives you +- Browser chat UI for your local coding model. +- Uses Component 7 inference engine automatically. +- Dark theme, prompt box, code cards, copy button per response. +- Syntax highlighting for Python and JavaScript. +- Shows generation time and generated token count. +- Keeps conversation history in the current session. +- Clear button to reset conversation. + +## Launch (single command) +```powershell +python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml +``` + +## URL to open +- `http://127.0.0.1:7860` + +No internet is needed for local usage. 
diff --git a/README_FINAL_PROJECT.md b/README_FINAL_PROJECT.md new file mode 100644 index 0000000000000000000000000000000000000000..df399f032d7b5bf7b65b3c4f15f502606636e5df --- /dev/null +++ b/README_FINAL_PROJECT.md @@ -0,0 +1,126 @@ +# Final Project README - MINDI 1.0 420M (Windows, RTX 4060 8GB) + +## What This Project Is +This is a fully local coding-assistant model system built step-by-step from scratch. +It supports: +- custom tokenizer for code +- dataset cleaning + tokenization pipeline +- 420M transformer model +- memory-optimized training +- evaluation + inference improvements +- local chat UI +- LoRA fine-tuning +- INT8 export + portable package + +Everything runs locally on your machine without internet after setup. + +--- + +## What You Built (High Level) +1. **Project setup** with reproducible environment and verification scripts. +2. **Custom code tokenizer** (Python + JavaScript aware). +3. **Dataset pipeline** with cleaning, dedupe, and tokenization. +4. **420M transformer architecture** (modular config). +5. **Training pipeline** (FP16, checkpointing, accumulation, resume, early stopping). +6. **Evaluation system** (val metrics + generation checks). +7. **Inference engine** (greedy mode, stop rules, syntax-aware retry). +8. **Local chat interface** with history, copy button, timing, and mode selector. +9. **LoRA fine-tuning pipeline** for your own examples. +10. **Export/quantization/packaging** with benchmark report and portable launcher. 
+ +--- + +## Most Important File Locations + +### Core model and data +- Base checkpoint: `checkpoints/component5_420m/step_3200.pt` +- Tokenized training data: `data/processed/train_tokenized.jsonl` +- Tokenizer: `artifacts/tokenizer/code_tokenizer_v1/` + +### LoRA +- Best LoRA adapter: `models/lora/custom_lora_v1/best.pt` +- LoRA metadata: `models/lora/custom_lora_v1/adapter_meta.json` + +### Quantized model +- INT8 model: `models/quantized/model_step3200_int8_state.pt` +- Benchmark report: `artifacts/export/component10_benchmark_report.json` + +### Chat interface +- Launcher: `scripts/launch_component8_chat.py` +- Chat config: `configs/component8_chat_config.yaml` + +### Portable package +- Folder: `release/MINDI_1.0_420M` +- Double-click launcher: `release/MINDI_1.0_420M/Start_MINDI.bat` + +--- + +## Launch the Main Chat UI +From project root (`C:\AI 2`): + +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml +``` + +Open in browser: +- `http://127.0.0.1:7860` + +### Live model selector in UI +You can switch without restart: +- `base` +- `lora` +- `int8` + +Status box shows: +- active mode +- mode load time +- live VRAM usage + +--- + +## How to Add More Training Data (Future Improvement) + +### A) Add more base-training pairs (full training path) +1. Put new JSONL/JSON files in `data/raw/`. +2. Run dataset processing scripts (Component 3 path). +3. Continue/refresh base training with Component 5. + +### B) Add targeted improvements quickly (LoRA recommended) +1. Edit `data/raw/custom_finetune_pairs.jsonl` with your new prompt/code pairs. + - Required fields per row: `prompt`, `code` + - Optional: `language` (`python` or `javascript`) +2. Run LoRA fine-tuning: + +```powershell +python .\scripts\run_component9_lora_finetune.py --config .\configs\component9_lora_config.yaml +``` + +3. Use updated adapter in chat by selecting `lora` mode. 
+ +--- + +## Recommended Next Habit +When quality is weak on specific tasks: +1. Add 20-200 clean examples of exactly that task style to `custom_finetune_pairs.jsonl`. +2. Re-run LoRA fine-tuning. +3. Test in chat `lora` mode. +4. Repeat in small cycles. + +This gives faster improvement than retraining the full base model each time. + +--- + +## One-File Health Check Commands + +```powershell +python .\scripts\verify_component1_setup.py +python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256 +python .\scripts\verify_component9_lora.py +``` + +--- + +## Current Status +Project is complete across Components 1-10 and verified on your hardware. + diff --git a/artifacts/evaluation/component6_eval_results.json b/artifacts/evaluation/component6_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..395b688ad5f18bbe939409cf371489b51bb99973 --- /dev/null +++ b/artifacts/evaluation/component6_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da6ee747d77b0c8cdca5d4fedb750549a9e5e7c42592e5e32e6103ff5617d8f +size 2379 diff --git a/artifacts/evaluation/component7_inference_results.json b/artifacts/evaluation/component7_inference_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d74c9b319277093414e7e4624e277606c5b242f7 --- /dev/null +++ b/artifacts/evaluation/component7_inference_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce08bfd6918f619fdcb1ef17ec1db79c2d32578d12a02aaaae7b7092f83384ae +size 5863 diff --git a/artifacts/export/component10_benchmark_report.json b/artifacts/export/component10_benchmark_report.json new file mode 100644 index 0000000000000000000000000000000000000000..48979bc22a1f42e5aad8e586c824bca5a099c519 --- /dev/null +++ b/artifacts/export/component10_benchmark_report.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d827ec736fbdc4ea2ed5bc196223f1bf02d11a9260acd451edd51f8f39bcda75 +size 545 diff --git a/artifacts/model/component4_model_summary.json b/artifacts/model/component4_model_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa452868b7bc7fd737e795c66a4f4e8ee3e5c6c --- /dev/null +++ b/artifacts/model/component4_model_summary.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5ebc8aa081f82bbcaee2c945b207b4db3251f63b845ed86055f4e5b7204010 +size 328 diff --git a/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json b/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..7e1a514ba7e4fe347a2bb3dd10481c45ede8a552 --- /dev/null +++ b/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe04cc37ac778637cb2cc02a6096412e5d8cada3e4ef3e4a7f2d141fccab8a0 +size 11475 diff --git a/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json b/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..475df77b084233f9af7d7d0d6117b26ddedb1003 --- /dev/null +++ b/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb0b7af679bac1c29fe7ac9f86c48f1fed5584ba72c9ef2c338f60b63e07bb46 +size 302 diff --git a/backup_step1000.tar.gz b/backup_step1000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..90487a54bc939f1f525e3a5bbcefe1e6b4c3abee --- /dev/null +++ b/backup_step1000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebe005c43dd59c9c49ad153d41af1bdaaad47c2a21ae231a4c5e90c8005560af +size 337623475 diff --git a/backup_step2000.tar.gz b/backup_step2000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e90111a43816db073a72f2aaf9a6d41d05bf4a7a --- /dev/null +++ 
b/backup_step2000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861329fb551b4c6406e92e06cfa1faae592f0fe0d0ce713189a57c62b33b0969 +size 337571785 diff --git a/backup_step3000.tar.gz b/backup_step3000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..002bb96b225f1a2d3550a188df8e76e9eac89bf0 --- /dev/null +++ b/backup_step3000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238c2859ebf4efc0195456a898d2fb8bce0397e39fdf59e9f940963232d628a8 +size 337762553 diff --git a/checkpoints/component5_420m/latest.pt b/checkpoints/component5_420m/latest.pt new file mode 100644 index 0000000000000000000000000000000000000000..e99a700ef6965af5b53fc2b5e4ae3cbd2bf30543 --- /dev/null +++ b/checkpoints/component5_420m/latest.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d26a7dd9e6e294c6657f6fb3a4d947cf52eb8e1c0b11032722fa50d15c4a21 +size 5087449970 diff --git a/checkpoints/component5_420m/step_3000.pt b/checkpoints/component5_420m/step_3000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6a459a4b1c19c8ad662c3d3f8443438d9df2bef --- /dev/null +++ b/checkpoints/component5_420m/step_3000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e11bded40789574ef316636c02c2fd1e8cd54c13441d8cd6a28980f2209ffaa9 +size 5087455158 diff --git a/checkpoints/component5_420m/step_3200.pt b/checkpoints/component5_420m/step_3200.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c3c0aa415e3eeaea13ecec2d036f3b8f6bdbd --- /dev/null +++ b/checkpoints/component5_420m/step_3200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d2ea9401f3b08b2528dbb8f993949794d0adb57642d0f4752d74da0e445238 +size 5087455158 diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..59ca8adcfede571d3684bc836b9e03cc8e59b0ae --- /dev/null +++ b/config.py @@ -0,0 +1,45 @@ +from 
dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class Paths: + project_root: Path = Path(".") + model_dir: Path = Path("./model") + data_dir: Path = Path("./data") + output_dir: Path = Path("./output") + logs_dir: Path = Path("./logs") + + train_jsonl: Path = Path("./data/train.jsonl") + dataset_cache_dir: Path = Path("./data/cache") + raw_dataset_dir: Path = Path("./data/cache/raw") + checkpoint_dir: Path = Path("./output/checkpoints") + lora_output_dir: Path = Path("./output/lora_adapters") + tokenizer_output_dir: Path = Path("./output/tokenizer") + + +@dataclass(frozen=True) +class DataConfig: + max_total_samples: int = 200000 + max_humaneval_samples: int = 20000 + max_mbpp_samples: int = 50000 + max_codesearchnet_samples: int = 180000 + min_output_chars: int = 40 + + +@dataclass(frozen=True) +class TrainingConfig: + num_train_epochs: int = 5 + per_device_train_batch_size: int = 1 + gradient_accumulation_steps: int = 8 + learning_rate: float = 1e-5 + max_length: int = 1024 + save_steps: int = 250 + logging_steps: int = 20 + eval_max_new_tokens: int = 220 + resume_training: bool = True + + +PATHS = Paths() +DATA_CONFIG = DataConfig() +TRAINING_CONFIG = TrainingConfig() diff --git a/configs/component10_export_config.yaml b/configs/component10_export_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24d5a702dfa6fe9df0e2943f7605f26be6f33523 --- /dev/null +++ b/configs/component10_export_config.yaml @@ -0,0 +1,21 @@ +# Component 10 export and optimization config + +model: + model_config_path: configs/component4_model_config.yaml + source_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +quantization: + quantized_output_path: models/quantized/model_step3200_int8_state.pt + +benchmark: + prompt: Write a Python function to compute factorial of n. 
+ max_new_tokens: 120 + +package: + output_dir: release/MINDI_1.0_420M + app_port: 7861 + +outputs: + benchmark_report_json: artifacts/export/component10_benchmark_report.json + diff --git a/configs/component3_dataset_pipeline.yaml b/configs/component3_dataset_pipeline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24077d99b74fbad01942d843b7781d803067385b --- /dev/null +++ b/configs/component3_dataset_pipeline.yaml @@ -0,0 +1,38 @@ +# Component 3 config: load, clean, deduplicate, tokenize. + +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +interim_output_dir: data/interim +processed_output_dir: data/processed +dedupe_db_path: data/interim/dedupe_hashes.sqlite + +# Set null for full run. +# Use a small number like 500 for fast smoke testing. +max_records_per_dataset: null + +min_prompt_chars: 8 +min_code_chars: 16 +max_code_chars: 40000 +progress_every: 1000 + +datasets: + - hf_dataset_id: iamtarun/python_code_instructions_18k_alpaca + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + + - hf_dataset_id: sahil2801/CodeAlpaca-20k + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + + - hf_dataset_id: TokenBender/code_instructions_122k_alpaca_style + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + diff --git a/configs/component3_incremental_js.yaml b/configs/component3_incremental_js.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8b09fbb818e398b6213598b11d90a592573d319 --- /dev/null +++ b/configs/component3_incremental_js.yaml @@ -0,0 +1,27 @@ +# Incremental JS augmentation config. +# This script appends new JavaScript samples into existing Component 3 outputs. 
+ +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +existing_clean_path: data/interim/combined_clean.jsonl +existing_tokenized_path: data/processed/train_tokenized.jsonl +existing_stats_path: data/processed/pipeline_stats.json +dedupe_db_path: data/interim/dedupe_hashes_incremental.sqlite + +# Chosen dataset for JS augmentation. +new_dataset: + hf_dataset_id: philschmid/code-alpaca-ruby-python-javascript + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: auto + +# Hard target requested by user. +target_new_javascript_examples: 20000 + +# Quality filters (same idea as Component 3). +min_prompt_chars: 8 +min_code_chars: 16 +max_code_chars: 40000 +progress_every: 500 + diff --git a/configs/component3_reprocess_from_clean.yaml b/configs/component3_reprocess_from_clean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08939cae2a7d9b74bed27a1d1369c91af1725fda --- /dev/null +++ b/configs/component3_reprocess_from_clean.yaml @@ -0,0 +1,19 @@ +# Reprocess config: no dataset download, no full pipeline rebuild. +# It reads existing cleaned data and regenerates tokenized output. + +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +input_clean_path: data/interim/combined_clean.jsonl +output_tokenized_path: data/processed/train_tokenized.jsonl +output_stats_path: data/processed/pipeline_stats.json + +# Safety backups before overwrite. +backup_existing_tokenized: true +backup_existing_stats: true + +# Existing language labels in clean file may be wrong from earlier runs. +# true = infer language from prompt+code content only. +ignore_existing_language_labels: true + +# Optional quick test mode. +# Set null for full reprocess. 
+max_records: null diff --git a/configs/component4_model_config.yaml b/configs/component4_model_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5667b78688d6890d294fc808c0110edd093f6a7 --- /dev/null +++ b/configs/component4_model_config.yaml @@ -0,0 +1,18 @@ +# Component 4 model config. +# You can switch the preset name or directly edit dimensions below. + +preset: medium_420m + +model: + vocab_size: 50000 + max_seq_len: 2048 + d_model: 1152 + n_layers: 23 + n_heads: 16 + d_ff: 4608 + dropout: 0.1 + tie_embeddings: true + gradient_checkpointing: false + init_std: 0.02 + rms_norm_eps: 0.00001 + diff --git a/configs/component5_training_config.verify.yaml b/configs/component5_training_config.verify.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0299d5b9fb19d08fbfe80a6594f0e24f3317e234 --- /dev/null +++ b/configs/component5_training_config.verify.yaml @@ -0,0 +1,32 @@ +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + num_workers: 0 +model: + model_config_path: configs/component4_model_config.yaml +training: + output_dir: checkpoints/component5_420m + log_every: 1 + eval_every: 5 + save_every: 5 + max_steps: 5 + micro_batch_size: 1 + grad_accum_steps: 16 + max_seq_len: 512 + learning_rate: 0.0002 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + grad_clip_norm: 1.0 + warmup_steps: 300 + min_lr_ratio: 0.1 + use_fp16: true + use_gradient_checkpointing: true + prefer_8bit_adam: true + early_stopping_patience_evals: 20 + early_stopping_min_delta: 0.0005 + max_vram_gb: 7.0 +resume: + resume_from: none diff --git a/configs/component5_training_config.yaml b/configs/component5_training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2e38d6ed0b29eaca71af3d82e67696bc6338309 --- /dev/null +++ b/configs/component5_training_config.yaml @@ -0,0 +1,37 @@ +# Component 5 training config for RTX 4060 8GB. 
+ +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + num_workers: 2 + +model: + model_config_path: configs/component4_model_config.yaml + +training: + output_dir: checkpoints/component5_420m + log_every: 10 + eval_every: 100 + save_every: 200 + max_steps: 8000 + micro_batch_size: 1 + grad_accum_steps: 16 + max_seq_len: 448 + learning_rate: 0.00022 + weight_decay: 0.1 + betas: [0.9, 0.95] + grad_clip_norm: 1.0 + warmup_steps: 300 + min_lr_ratio: 0.1 + use_fp16: true + use_gradient_checkpointing: true + prefer_8bit_adam: true + early_stopping_patience_evals: 5 + early_stopping_min_delta: 0.0005 + max_vram_gb: 7.0 + +resume: + resume_from: latest # latest | none | explicit checkpoint path + + diff --git a/configs/component6_evaluation_config.yaml b/configs/component6_evaluation_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6edf0a8dce1087b01d00e3a3b3568bc302a72c4c --- /dev/null +++ b/configs/component6_evaluation_config.yaml @@ -0,0 +1,21 @@ +# Component 6 evaluation config. 
+ +model: + model_config_path: configs/component4_model_config.yaml + checkpoint_paths: + - checkpoints/component5_420m/step_3200.pt + +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + +inference: + max_seq_len: 448 + max_new_tokens: 160 + temperature: 0.25 + top_p: 0.85 + +output: + results_json: artifacts/evaluation/component6_eval_results.json + diff --git a/configs/component7_inference_config.yaml b/configs/component7_inference_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3873f5bfc7da45d4a36cfce0e779f6a07621da7a --- /dev/null +++ b/configs/component7_inference_config.yaml @@ -0,0 +1,20 @@ +# Component 7 inference config + +model: + model_config_path: configs/component4_model_config.yaml + checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +inference: + language: python + max_new_tokens: 180 + greedy_temperature: 0.0 + retry2_temperature: 0.25 + retry2_top_p: 0.85 + retry3_temperature: 0.35 + retry3_top_p: 0.90 + max_retries: 3 + min_tokens_before_stop_check: 24 + +output: + results_json: artifacts/evaluation/component7_inference_results.json diff --git a/configs/component8_chat_config.yaml b/configs/component8_chat_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c768f7c4b4ec868c2574d2ff57a224c3abb91455 --- /dev/null +++ b/configs/component8_chat_config.yaml @@ -0,0 +1,30 @@ +# Component 8 chat interface config. 
+ +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + lora_adapter_path: models/lora/custom_lora_v1/best.pt + quantized_state_path: models/quantized/model_step3200_int8_state.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2] + +inference: + language_default: python + max_new_tokens: 300 + greedy_temperature: 0.0 + retry2_temperature: 0.25 + retry2_top_p: 0.85 + retry3_temperature: 0.35 + retry3_top_p: 0.90 + max_retries: 3 + min_tokens_before_stop_check: 64 + +server: + host: 127.0.0.1 + port: 7860 + share: false diff --git a/configs/component9_lora_config.verify.yaml b/configs/component9_lora_config.verify.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a5e43b65a380a4b4af5b948159792a7624a9427 --- /dev/null +++ b/configs/component9_lora_config.verify.yaml @@ -0,0 +1,32 @@ +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: + - q_proj + - k_proj + - v_proj + - o_proj + - fc1 + - fc2 +finetune: + custom_data_path: data/raw/custom_finetune_pairs.jsonl + output_dir: models/lora/custom_lora_v1 + max_seq_len: 512 + micro_batch_size: 1 + grad_accum_steps: 16 + learning_rate: 0.0003 + weight_decay: 0.0 + max_steps: 5 + save_every: 5 + eval_every: 5 + early_stopping_patience_evals: 6 + early_stopping_min_delta: 0.0005 + use_fp16: true + max_vram_gb: 7.0 +resume: + resume_from: none diff --git a/configs/component9_lora_config.yaml b/configs/component9_lora_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d64028034f3e2e32101bd29ba3ce0f483cbf0a47 --- /dev/null +++ b/configs/component9_lora_config.yaml @@ -0,0 +1,31 @@ +# Component 9 
LoRA fine-tuning config + +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2] + +finetune: + custom_data_path: data/raw/custom_finetune_pairs.jsonl + output_dir: models/lora/custom_lora_v1 + max_seq_len: 512 + micro_batch_size: 1 + grad_accum_steps: 16 + learning_rate: 0.0003 + weight_decay: 0.0 + max_steps: 1200 + save_every: 100 + eval_every: 100 + early_stopping_patience_evals: 6 + early_stopping_min_delta: 0.0005 + use_fp16: true + max_vram_gb: 7.0 + +resume: + resume_from: none # none | latest | explicit path diff --git a/data/cache/raw/code_search_net_python/dataset_dict.json b/data/cache/raw/code_search_net_python/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..00be0f6fd7375ce167ad242a64390d512feedd34 --- /dev/null +++ b/data/cache/raw/code_search_net_python/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf46fe547f16d795abe0d4c8a591bf031d98882d638931d27660455ee986273 +size 43 diff --git a/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow b/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2e8b0426396fc60a7813b0838e15268deacae9fe --- /dev/null +++ b/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079bce0f0e2513bae63c12f8699e4ea13ec545c5000844de28dc34a1a9fd19eb +size 84367104 diff --git a/data/cache/raw/code_search_net_python/test/dataset_info.json b/data/cache/raw/code_search_net_python/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/test/state.json b/data/cache/raw/code_search_net_python/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad85c7581bbbe247fc901886dfea8a1091324c4 --- /dev/null +++ b/data/cache/raw/code_search_net_python/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d5fecb65147f455bfc8249c3e26fc6a2bd01bfd8bd9f354e86eb7834453d1c +size 261 diff --git a/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ad3769233e0146349963abbe57b6e50d955b5e1c --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5984af399adbfdab06aca7da7638f6a5eb98411b15b88a1f045f346735fbc9c +size 377852224 diff --git a/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..73aa3abef75d171f5dadb3bcabd014185231a657 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a62df607497be1fd23f3e8aa50908bebff6732ccc8b5dacbfaa0efd336ad915 +size 411927504 diff --git a/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3c31fd25f5c405f5d4528764b59fb60a79c970d4 --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d519b4edb8ae27d8e1ab6474a8decc40f45c6a8e7c409039c865abbc9763f351 +size 370005344 diff --git a/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e4f6c1332e5b8f2213beec2546b5df7f9a21c39e --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42ae91a5e6e48dd32eac5940429d726f0dbc9440d0262a40a3bfe7a0e2e6214 +size 400292712 diff --git a/data/cache/raw/code_search_net_python/train/dataset_info.json b/data/cache/raw/code_search_net_python/train/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/train/state.json b/data/cache/raw/code_search_net_python/train/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1cf733d9564bf0386c9a83158705dec46f4343 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:180b84fce72622f4113ea103a1fbf79924e61881442db8728b055be042247bcf +size 448 diff --git a/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow b/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..72adad89455ac9e2a5b197693c563d607bf8a8df --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f848f9c1dfe1c2cfac25fd1b529e050e29291a5d8042ba1d4f904948142c64 +size 92180808 diff --git a/data/cache/raw/code_search_net_python/validation/dataset_info.json b/data/cache/raw/code_search_net_python/validation/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ b/data/cache/raw/code_search_net_python/validation/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/validation/state.json b/data/cache/raw/code_search_net_python/validation/state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f11e18b460cd718fad08107add9781cfc12a5c8 --- /dev/null +++ b/data/cache/raw/code_search_net_python/validation/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e5f3cf2d550a3fb9b3d3e43f23f25dfaae9ae3124e43dcf14072f5e3aee182 +size 267 diff --git a/data/cache/raw/mbpp/dataset_dict.json b/data/cache/raw/mbpp/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..4287d69d266da7d00c47b23dbcc550f890f936f3 --- /dev/null +++ b/data/cache/raw/mbpp/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb69d413c1138964f92bd3723baf871db8f40b4cec70586e770e060108a8c612 +size 53 diff --git a/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow b/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e34e148ba050ae18dce7c8812fcd8d2864da2dfd --- /dev/null +++ b/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e14c47c41a23d8003284ac9249a5c5e4da285300f1a56b63593fb2d6237556ff +size 6112 diff --git a/data/cache/raw/mbpp/prompt/dataset_info.json b/data/cache/raw/mbpp/prompt/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/prompt/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/prompt/state.json b/data/cache/raw/mbpp/prompt/state.json new file mode 100644 index 0000000000000000000000000000000000000000..991f78ccc81d3352cf78e4c116d4bec061effd43 --- /dev/null +++ b/data/cache/raw/mbpp/prompt/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186836c8bbd590862fcd91367aec27d3ec52ff571d1b2b35410003437819f419 +size 263 diff --git a/data/cache/raw/mbpp/test/data-00000-of-00001.arrow b/data/cache/raw/mbpp/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..578fc2ad3fe9224fec0f06bf7df5ca9271cbd146 --- /dev/null +++ b/data/cache/raw/mbpp/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7af94cfc4affeacd0bc887e741770b414764c868c8bf08485ea03c1a5f99b38 +size 245680 diff --git a/data/cache/raw/mbpp/test/dataset_info.json b/data/cache/raw/mbpp/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/test/state.json b/data/cache/raw/mbpp/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..00ec1ee66f100b3cf89c9f109a1cf961bd2e0929 --- /dev/null 
+++ b/data/cache/raw/mbpp/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b14c4aaed028a8972650cc53715f48bcdfb497befc97db690183aa0cd60b183 +size 261 diff --git a/data/cache/raw/mbpp/train/data-00000-of-00001.arrow b/data/cache/raw/mbpp/train/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..cc49abb7dad06fa070d2ea72f51aecf8f766aa04 --- /dev/null +++ b/data/cache/raw/mbpp/train/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd85255cf0fad7b11f3b39233045a0ab1799c4fe51846ec57946e0abe59ed70 +size 178448 diff --git a/data/cache/raw/mbpp/train/dataset_info.json b/data/cache/raw/mbpp/train/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/train/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/train/state.json b/data/cache/raw/mbpp/train/state.json new file mode 100644 index 0000000000000000000000000000000000000000..de58b395de36e5c9effc74f1828342cbb76b027d --- /dev/null +++ b/data/cache/raw/mbpp/train/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb788b2e33e4e0f5ccdfdd3f896a09c2c1080d44c73fc57ae2ac7fa7a1034403 +size 262 diff --git a/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow b/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e02be79a52d52de6bf2a2431a2dd312e39215159 --- /dev/null +++ b/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc2337ee96303f7e94580aaeffc92976fb82c04e2ca2d1203e99a22ce03e408 +size 43960 diff --git a/data/cache/raw/mbpp/validation/dataset_info.json 
b/data/cache/raw/mbpp/validation/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/validation/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/validation/state.json b/data/cache/raw/mbpp/validation/state.json new file mode 100644 index 0000000000000000000000000000000000000000..389acacc0ecb3982c7f8ff104fadd66630928823 --- /dev/null +++ b/data/cache/raw/mbpp/validation/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d935f09e6248f886ae2e2cbeb7791209c5599fb566f235da2067d112e1a712c9 +size 267 diff --git a/data/cache/raw/openai_humaneval/dataset_dict.json b/data/cache/raw/openai_humaneval/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb9ae1016128a0658b272ec4d8ca727cbcc2770 --- /dev/null +++ b/data/cache/raw/openai_humaneval/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24761d3c08510c22918dfca6bc94100b9e5cff3fa323b34f2cc11916fcf69064 +size 20 diff --git a/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow b/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2948fd38b4bc54bc55fe7e5a888a8b44b28eef97 --- /dev/null +++ b/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f52943a6aa89b8d973a477910be07b0d41e31f6c3df276a61db6164910cb223d +size 195528 diff --git a/data/cache/raw/openai_humaneval/test/dataset_info.json b/data/cache/raw/openai_humaneval/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..359b8034b44973e6ed14cecab6d32c1742ac19ad --- /dev/null +++ 
b/data/cache/raw/openai_humaneval/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafb7b223551699ced1aca81a07bc5c4232fe19f4a9893a91a00cb52f5390645 +size 1166 diff --git a/data/cache/raw/openai_humaneval/test/state.json b/data/cache/raw/openai_humaneval/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c2e0699ea81c16299bd3b13097b36f9c27013ba6 --- /dev/null +++ b/data/cache/raw/openai_humaneval/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef586a06deb0be67375e335d6eac139383c6a5bdda046e79e58e0b76ec68551 +size 261 diff --git a/data/external/component2_tokenizer_sample.jsonl b/data/external/component2_tokenizer_sample.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a29594c5079c772996317dad5be756c24334c2e --- /dev/null +++ b/data/external/component2_tokenizer_sample.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:438bfccb48cfc64cc0b96e40a5b773544edcdfc83807c8a59223c775206095ab +size 507 diff --git a/data/interim/combined_clean.jsonl b/data/interim/combined_clean.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cc72e1a47f512e8c9eaee8d6849205a2c079602 --- /dev/null +++ b/data/interim/combined_clean.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba34da6a4c4d42cba198caa9c252a62f42208ee1b6c9153ec945e6a3dc2f7572 +size 77049613 diff --git a/data/interim/dedupe_hashes.sqlite b/data/interim/dedupe_hashes.sqlite new file mode 100644 index 0000000000000000000000000000000000000000..7aedc2571bb34b945276d075b96c59c78993d465 --- /dev/null +++ b/data/interim/dedupe_hashes.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5af933cbb66657edd8b415ee9bf1c6a5cc8b8abe9857cf48e9d0ee1f5b8342e +size 21954560 diff --git a/data/interim/dedupe_hashes_incremental.sqlite b/data/interim/dedupe_hashes_incremental.sqlite new file mode 
100644 index 0000000000000000000000000000000000000000..cf315de2841b631aa4e434db527f71bffd2b1e7f --- /dev/null +++ b/data/interim/dedupe_hashes_incremental.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb7f733a85d4a447c274aaff3cc8c334c55f4fb7a866cd4ef6f61364afd3edb8 +size 21905408 diff --git a/data/processed/pipeline_stats.json b/data/processed/pipeline_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..25ee5a8ae4485a702e4f7b5d6d3c82080110b5fe --- /dev/null +++ b/data/processed/pipeline_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d426f14abb3e473c92616068dc74d77eb327c5a9e42987ddf342d3b1eaa76b30 +size 216 diff --git a/data/processed/pipeline_stats.json.bak b/data/processed/pipeline_stats.json.bak new file mode 100644 index 0000000000000000000000000000000000000000..f5276a2fadb080a896c9b387ec694bfc5bd29a78 --- /dev/null +++ b/data/processed/pipeline_stats.json.bak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42bda6bd9ad255ca525f02954c9a67d2c9a90d14050e1bb68db1eccba8b05766 +size 212 diff --git a/data/processed/train_tokenized.jsonl b/data/processed/train_tokenized.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b978067c2e8a15c696d52f33a99cc3595f7164e6 --- /dev/null +++ b/data/processed/train_tokenized.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7316e1650b4a9ba1a19de2d83d957bd4d0e089b3ab2de076293ecc4760310e45 +size 333825717 diff --git a/data/processed/train_tokenized.jsonl.bak b/data/processed/train_tokenized.jsonl.bak new file mode 100644 index 0000000000000000000000000000000000000000..e54eaf6c757f01d1bbcbdcd4cc86f4d519a36270 --- /dev/null +++ b/data/processed/train_tokenized.jsonl.bak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d77d82d56e0c041c05f7978c85d4921bed85ba92f3b75ec934d640e16dde7c5c +size 333610086 diff --git a/data/raw/custom_finetune_pairs.jsonl 
b/data/raw/custom_finetune_pairs.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..847c43db76f1378f39f79e1e5200922f9b8a9bff --- /dev/null +++ b/data/raw/custom_finetune_pairs.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab1ceab4d5a85de0c15a54f6420c483e78de5db4b5654dc5d34aa1d02893921 +size 451 diff --git a/data/train.jsonl b/data/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d17d1863b85c74ce75290bbb6b5ce2af1fd6020d --- /dev/null +++ b/data/train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b651ce4bdc47cb7f105a8f2dd110f14bae1bfb98507d055e875e8afb9879d7 +size 44122145 diff --git a/data_fetch.py b/data_fetch.py new file mode 100644 index 0000000000000000000000000000000000000000..e166c402748d6c644003866ff2dcf3ef6417f577 --- /dev/null +++ b/data_fetch.py @@ -0,0 +1,222 @@ +import argparse +import hashlib +from pathlib import Path +from typing import Dict, List, Optional + +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk + +from config import DATA_CONFIG, PATHS +from utils import ensure_dirs, setup_logger, write_jsonl + + +def _normalize_text(text: Optional[str]) -> str: + if not text: + return "" + return " ".join(str(text).strip().split()) + + +def _quality_ok(sample: Dict[str, str]) -> bool: + instruction = _normalize_text(sample.get("instruction")) + output = _normalize_text(sample.get("output")) + + if not instruction or not output: + return False + if len(output) < DATA_CONFIG.min_output_chars: + return False + + lowered = output.lower() + bad_tokens = ("todo", "fixme", "coming soon", "not implemented") + if any(tok in lowered for tok in bad_tokens): + return False + if output.strip() in {"pass", "...", "return ..."}: + return False + return True + + +def _to_record(instruction: str, input_text: str, output_text: str) -> Dict[str, str]: + return { + "instruction": instruction.strip(), + "input": input_text.strip(), 
+ "output": output_text.strip(), + } + + +def _save_dataset_for_offline(ds_obj, save_path: Path) -> None: + if save_path.exists(): + return + save_path.parent.mkdir(parents=True, exist_ok=True) + ds_obj.save_to_disk(str(save_path)) + + +def _load_or_download(dataset_name: str, cache_path: Path, **kwargs): + if cache_path.exists(): + return load_from_disk(str(cache_path)) + + dataset_obj = load_dataset(dataset_name, **kwargs) + _save_dataset_for_offline(dataset_obj, cache_path) + return dataset_obj + + +def _load_or_download_codesearchnet(cache_path: Path, subset: str = "python"): + if cache_path.exists(): + return load_from_disk(str(cache_path)) + + ds = load_dataset("code_search_net", subset) + _save_dataset_for_offline(ds, cache_path) + return ds + + +def _extract_humaneval(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + split = ds_obj["test"] if isinstance(ds_obj, DatasetDict) else ds_obj + + for item in split: + prompt = item.get("prompt", "") + solution = item.get("canonical_solution", "") + if "def " not in prompt: + continue + rows.append( + _to_record( + instruction="Complete the Python function so it satisfies the specification.", + input_text=prompt, + output_text=solution, + ) + ) + if len(rows) >= max_samples: + break + return rows + + +def _extract_mbpp(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + splits = [] + if isinstance(ds_obj, DatasetDict): + splits = [ds_obj[k] for k in ds_obj.keys()] + else: + splits = [ds_obj] + + for split in splits: + for item in split: + task = item.get("text", "") + code = item.get("code", "") + tests = item.get("test_list", []) + if not task or not code: + continue + test_blob = "\n".join(tests) if isinstance(tests, list) else str(tests) + input_text = f"Task:\n{task}\n\nTests:\n{test_blob}".strip() + rows.append( + _to_record( + instruction="Write Python code that solves the problem and passes the tests.", + input_text=input_text, + 
output_text=code, + ) + ) + if len(rows) >= max_samples: + return rows + return rows + + +def _extract_codesearchnet(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + splits = [] + if isinstance(ds_obj, DatasetDict): + for split_name in ("train", "validation"): + if split_name in ds_obj: + splits.append(ds_obj[split_name]) + else: + splits = [ds_obj] + + for split in splits: + for item in split: + language = str(item.get("language", "")).lower() + if language and language != "python": + continue + + docstring = item.get("docstring", "") or item.get("func_documentation_string", "") + code = item.get("whole_func_string", "") or item.get("code", "") + if not docstring or not code: + continue + if "def " not in code and "class " not in code: + continue + + rows.append( + _to_record( + instruction="Write Python code that matches the following docstring.", + input_text=docstring, + output_text=code, + ) + ) + if len(rows) >= max_samples: + return rows + return rows + + +def _dedupe_and_filter(rows: List[Dict[str, str]], max_total: int) -> List[Dict[str, str]]: + seen = set() + clean_rows: List[Dict[str, str]] = [] + for row in rows: + if not _quality_ok(row): + continue + digest = hashlib.sha256( + f"{row['instruction']}||{row['input']}||{row['output']}".encode("utf-8") + ).hexdigest() + if digest in seen: + continue + seen.add(digest) + clean_rows.append(row) + if len(clean_rows) >= max_total: + break + return clean_rows + + +def fetch_and_prepare_dataset(offline_only: bool = False) -> Path: + ensure_dirs([PATHS.data_dir, PATHS.dataset_cache_dir, PATHS.raw_dataset_dir, PATHS.logs_dir]) + logger = setup_logger("data_fetch", PATHS.logs_dir / "data_fetch.log") + + logger.info("Loading datasets (offline_only=%s).", offline_only) + + humaneval_cache = PATHS.raw_dataset_dir / "openai_humaneval" + mbpp_cache = PATHS.raw_dataset_dir / "mbpp" + csn_cache = PATHS.raw_dataset_dir / "code_search_net_python" + + if offline_only: + if not 
humaneval_cache.exists() or not mbpp_cache.exists() or not csn_cache.exists(): + raise FileNotFoundError( + "Offline mode requested but one or more cached datasets are missing. " + "Run without --offline first." + ) + humaneval_ds = load_from_disk(str(humaneval_cache)) + mbpp_ds = load_from_disk(str(mbpp_cache)) + csn_ds = load_from_disk(str(csn_cache)) + else: + humaneval_ds = _load_or_download("openai_humaneval", humaneval_cache) + mbpp_ds = _load_or_download("mbpp", mbpp_cache) + csn_ds = _load_or_download_codesearchnet(csn_cache, subset="python") + + rows = [] + rows.extend(_extract_humaneval(humaneval_ds, DATA_CONFIG.max_humaneval_samples)) + rows.extend(_extract_mbpp(mbpp_ds, DATA_CONFIG.max_mbpp_samples)) + rows.extend(_extract_codesearchnet(csn_ds, DATA_CONFIG.max_codesearchnet_samples)) + + clean_rows = _dedupe_and_filter(rows, DATA_CONFIG.max_total_samples) + write_jsonl(PATHS.train_jsonl, clean_rows) + + logger.info("Saved %d cleaned training rows to %s", len(clean_rows), PATHS.train_jsonl) + print(f"Saved dataset: {PATHS.train_jsonl.resolve()}") + return PATHS.train_jsonl + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Download and prepare Python fine-tuning data.") + parser.add_argument( + "--offline", + action="store_true", + help="Use only previously saved local dataset cache.", + ) + return parser + + +if __name__ == "__main__": + args = _build_arg_parser().parse_args() + fetch_and_prepare_dataset(offline_only=args.offline) + diff --git a/dataset.py b/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..eb38e4e476f2fa7eef14df6461f6d56228225914 --- /dev/null +++ b/dataset.py @@ -0,0 +1,56 @@ +from typing import Dict, List + +import torch +from torch.utils.data import Dataset + +from config import PATHS, TRAINING_CONFIG +from utils import read_jsonl + + +def format_prompt(instruction: str, input_text: str, output_text: str) -> str: + return ( + f"### 
Instruction:\n{instruction}\n" + f"### Input:\n{input_text}\n" + f"### Response:\n{output_text}" + ) + + +class LocalJsonlInstructionDataset(Dataset): + def __init__(self, tokenizer, max_length: int = TRAINING_CONFIG.max_length): + self.tokenizer = tokenizer + self.max_length = max_length + self.samples: List[Dict[str, str]] = read_jsonl(PATHS.train_jsonl) + + if not self.samples: + raise ValueError(f"No training samples found in {PATHS.train_jsonl}") + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + sample = self.samples[idx] + text = format_prompt( + instruction=sample["instruction"], + input_text=sample["input"], + output_text=sample["output"], + ) + encoded = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + padding="max_length", + return_tensors="pt", + ) + + input_ids = encoded["input_ids"].squeeze(0) + attention_mask = encoded["attention_mask"].squeeze(0) + + labels = input_ids.clone() + labels[attention_mask == 0] = -100 + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } + diff --git a/hf_release/MINDI-1.0-420M/LICENSE b/hf_release/MINDI-1.0-420M/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1ddde814d0a4676002fd6703363cc362d9d195af --- /dev/null +++ b/hf_release/MINDI-1.0-420M/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 MINDI 1.0 420M Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies 
or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/hf_release/MINDI-1.0-420M/README.md b/hf_release/MINDI-1.0-420M/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6bbdeb15a5e1d5dac7038a8aa8e776d94f5b0315 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/README.md @@ -0,0 +1,81 @@ +--- +license: mit +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- code +- python +- javascript +- local-llm +- offline +--- + +# MINDI 1.0 420M + +MINDI 1.0 420M is a 420M-parameter coding language model focused on Python first and JavaScript second. +It is built for local, offline code generation workflows. 
+ +## Capabilities + +- Code generation from natural language prompts +- Code completion +- Bug-fix suggestions +- Code explanation + +## Model Details + +- Parameters: 423,934,848 +- Architecture: Decoder-only Transformer +- Context length: 2048 tokens +- Focus languages: Python, JavaScript + +## Hardware Requirements + +Recommended: +- NVIDIA GPU with 8GB+ VRAM +- CUDA-enabled PyTorch + +Minimum: +- CPU inference works but is slower + +## Quick Start (GPU) + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +repo_id = "YOUR_USERNAME/MINDI-1.0-420M" + +tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + repo_id, + trust_remote_code=True, + torch_dtype=torch.float16, +).cuda() + +prompt = "Write a Python function to check if a string is a palindrome." +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") + +with torch.no_grad(): + output = model.generate( + **inputs, + max_new_tokens=220, + temperature=0.2, + top_p=0.9, + do_sample=True, + ) + +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Limitations + +- The model can still produce syntax or logic errors. +- Generated code should always be reviewed and tested. +- Not intended for safety-critical production use without validation. + +## Safety + +Always run tests and static checks before using generated code in production. diff --git a/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 b/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..36e1a9447eecdf4905c473b8a75e6784ba204737 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 @@ -0,0 +1,6 @@ +# Upload helper for MINDI 1.0 420M +# Run from PowerShell. + +huggingface-cli login +huggingface-cli repo create MINDI-1.0-420M --type model --public +huggingface-cli upload YOUR_USERNAME/MINDI-1.0-420M "C:\AI 2\hf_release\MINDI-1.0-420M" . 
--repo-type model diff --git a/hf_release/MINDI-1.0-420M/config.json b/hf_release/MINDI-1.0-420M/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e911be583da9cf92057d9e1719a8be97614a38 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/config.json @@ -0,0 +1,29 @@ +{ + "model_type": "mindi", + "architectures": [ + "MindiForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_mindi.MindiConfig", + "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM", + "AutoTokenizer": [ + null, + "tokenization_mindi.MindiTokenizer" + ] + }, + "vocab_size": 50000, + "max_seq_len": 2048, + "d_model": 1152, + "n_layers": 23, + "n_heads": 16, + "d_ff": 4608, + "dropout": 0.1, + "tie_embeddings": true, + "init_std": 0.02, + "rms_norm_eps": 1e-05, + "bos_token_id": 2, + "eos_token_id": 3, + "pad_token_id": 0, + "torch_dtype": "float16", + "transformers_version": "4.46.3" +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/configuration_mindi.py b/hf_release/MINDI-1.0-420M/configuration_mindi.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed08ab6765e3bd37e2d68ec577af3c269ef031 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/configuration_mindi.py @@ -0,0 +1,38 @@ +""" +Hugging Face config class for MINDI 1.0 420M. 
+""" + +from transformers import PretrainedConfig + + +class MindiConfig(PretrainedConfig): + model_type = "mindi" + + def __init__( + self, + vocab_size=50000, + max_seq_len=2048, + d_model=1152, + n_layers=23, + n_heads=16, + d_ff=4608, + dropout=0.1, + tie_embeddings=True, + init_std=0.02, + rms_norm_eps=1e-5, + bos_token_id=2, + eos_token_id=3, + pad_token_id=0, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.max_seq_len = max_seq_len + self.d_model = d_model + self.n_layers = n_layers + self.n_heads = n_heads + self.d_ff = d_ff + self.dropout = dropout + self.tie_embeddings = tie_embeddings + self.init_std = init_std + self.rms_norm_eps = rms_norm_eps diff --git a/hf_release/MINDI-1.0-420M/generation_config.json b/hf_release/MINDI-1.0-420M/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..75f05d0257daf3ce7641e5ee44eae81599e17bcf --- /dev/null +++ b/hf_release/MINDI-1.0-420M/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 2, + "eos_token_id": 3, + "pad_token_id": 0, + "max_new_tokens": 220, + "temperature": 0.2, + "top_p": 0.9, + "do_sample": true +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/model.safetensors b/hf_release/MINDI-1.0-420M/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aee736c39b988099c9d0f57143ee9fb1e346a63a --- /dev/null +++ b/hf_release/MINDI-1.0-420M/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d5df76ccfe5be47eaf94b1d58eec9b36276c4c1c2bb235766c766e1dd838a0 +size 1695758072 diff --git a/hf_release/MINDI-1.0-420M/modeling_mindi.py b/hf_release/MINDI-1.0-420M/modeling_mindi.py new file mode 100644 index 0000000000000000000000000000000000000000..434f0953f895c0724514c3d4f9d0ef0a102aaf5c --- /dev/null +++ b/hf_release/MINDI-1.0-420M/modeling_mindi.py @@ -0,0 +1,219 @@ 
+""" +Hugging Face model class for MINDI 1.0 420M. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .configuration_mindi import MindiConfig + + +@dataclass +class _Cfg: + vocab_size: int + max_seq_len: int + d_model: int + n_layers: int + n_heads: int + d_ff: int + dropout: float + tie_embeddings: bool + init_std: float + rms_norm_eps: float + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads") + return self.d_model // self.n_heads + + +class RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) + return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin) + + @staticmethod + def _apply_rotary(x: 
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position encoding."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.head_dim
        self.scale = self.head_dim ** -0.5
        # Separate, bias-free projections for query/key/value/output.
        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)
        self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, tokens, _ = x.shape

        def split_heads(proj: nn.Linear) -> torch.Tensor:
            # (B, T, D) -> (B, H, T, head_dim)
            return proj(x).view(batch, tokens, self.n_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj)
        k = split_heads(self.k_proj)
        v = split_heads(self.v_proj)
        # Rotary positions are applied to queries and keys only.
        q, k = self.rotary(q, k, seq_len=tokens)

        # Fused attention kernel: causal masking is handled internally,
        # and attention dropout is active only in training mode.
        attended = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=True,
            scale=self.scale,
        )
        merged = attended.transpose(1, 2).contiguous().view(batch, tokens, -1)
        return self.o_proj(merged)


class FeedForward(nn.Module):
    """Position-wise MLP: linear -> tanh-approximated GELU -> linear -> dropout."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.fc1(x), approximate="tanh")
        return self.dropout(self.fc2(hidden))
class MindiForCausalLM(PreTrainedModel):
    """Decoder-only causal language model for MINDI 1.0 (~420M params).

    Pipeline: token embedding -> dropout -> ``n_layers`` pre-norm
    transformer blocks -> final RMSNorm -> LM head (weight-tied to the
    embedding when ``tie_embeddings`` is set).

    NOTE(review): ``forward`` deliberately discards ``attention_mask``
    (see the ``del`` below), so padded positions are attended like real
    tokens — batched padded input will not be masked; confirm callers
    only send single sequences or accept this. No KV cache is kept:
    ``prepare_inputs_for_generation`` always returns the full
    ``input_ids``, so each generation step recomputes the whole prefix.
    """

    config_class = MindiConfig
    base_model_prefix = "mindi"
    supports_gradient_checkpointing = False

    def __init__(self, config: MindiConfig):
        super().__init__(config)
        # Mirror the HF config into the lightweight internal dataclass
        # consumed by the submodules.
        cfg = _Cfg(
            vocab_size=config.vocab_size,
            max_seq_len=config.max_seq_len,
            d_model=config.d_model,
            n_layers=config.n_layers,
            n_heads=config.n_heads,
            d_ff=config.d_ff,
            dropout=config.dropout,
            tie_embeddings=config.tie_embeddings,
            init_std=config.init_std,
            rms_norm_eps=config.rms_norm_eps,
        )

        self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        # Tie input and output embeddings (one shared weight matrix).
        # Done before post_init so re-initialization keeps them tied.
        if cfg.tie_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

        # Runs HF weight init (dispatches to _init_weights) and bookkeeping.
        self.post_init()

    def _init_weights(self, module: nn.Module) -> None:
        # Plain normal(0, init_std) init for linear and embedding weights;
        # other module types (RMSNorm, Dropout) keep their own defaults.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Run the decoder stack; returns logits and, if labels, the LM loss.

        ``attention_mask`` is accepted for HF API compatibility but
        ignored — only the causal mask inside the attention blocks
        applies. Labels use the standard shift-by-one next-token
        objective with ``ignore_index=-100``.
        """
        # attention_mask intentionally unused (see class docstring).
        del attention_mask, kwargs

        x = self.embed_tokens(input_ids)
        x = self.dropout(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm_final(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Shift so that position t predicts token t+1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        # past_key_values omitted: this implementation keeps no KV cache.
        return CausalLMOutputWithPast(loss=loss, logits=logits)

    @torch.no_grad()
    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs):
        # No cache: the full sequence is fed back in on every decode step.
        del kwargs
        return {"input_ids": input_ids}
+""" + +from pathlib import Path +from transformers import PreTrainedTokenizerFast + + +class MindiTokenizer(PreTrainedTokenizerFast): + vocab_files_names = {"tokenizer_file": "tokenizer.json"} + model_input_names = ["input_ids", "attention_mask"] + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + if kwargs.get("tokenizer_file") is None: + local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json" + if local_candidate.exists(): + kwargs["tokenizer_file"] = str(local_candidate) + return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + def __init__(self, tokenizer_file=None, **kwargs): + name_or_path = kwargs.pop("name_or_path", None) + if tokenizer_file is None and name_or_path is not None: + candidate = Path(name_or_path) / "tokenizer.json" + if candidate.exists(): + tokenizer_file = str(candidate) + if tokenizer_file is None: + tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json") + kwargs.setdefault("bos_token", "") + kwargs.setdefault("eos_token", "") + kwargs.setdefault("unk_token", "") + kwargs.setdefault("pad_token", "") + super().__init__(tokenizer_file=tokenizer_file, **kwargs) diff --git a/hf_release/MINDI-1.0-420M/tokenizer.json b/hf_release/MINDI-1.0-420M/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4798c7bfd1002d16664c9d8bec52763fdbc3fe48 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/tokenizer.json @@ -0,0 +1,799 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFKC" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Split", + "pattern": { + "Regex": "([()\\[\\]{}.,:;])" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Metaspace", + "replacement": "_", + "prepend_scheme": "always", + "split": true + } + ] + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "Sequence": { 
+ "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 3 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": { + "type": "BPEDecoder", + "suffix": "" + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 8, + "": 9, + "": 10, + "(": 11, + ")": 12, + "+": 13, + ",": 14, + ".": 15, + "0": 16, + "4": 17, + "5": 18, + ":": 19, + ";": 20, + "<": 21, + "=": 22, + ">": 23, + "A": 24, + "C": 25, + "D": 26, + "E": 27, + "F": 28, + "H": 29, + "I": 30, + "J": 31, + "L": 32, + "M": 33, + "N": 34, + "O": 35, + "P": 36, + "R": 37, + "S": 38, + "T": 39, + "V": 40, + "W": 41, + "Y": 42, + "_": 43, + "a": 44, + "b": 45, + "c": 46, + "d": 47, + "e": 48, + "f": 49, + "g": 50, + "h": 51, + "i": 52, + "l": 53, + "m": 54, + "n": 55, + "o": 56, + "p": 57, + "r": 58, + "s": 59, + "t": 60, + "u": 61, + "v": 62, + "w": 63, + "x": 64, + "y": 65, + "{": 66, + "}": 67, + "_<": 68, + "DE": 69, + "T>": 70, + "_a": 71, + "L>": 72, + "NL>": 73, + "_": 74, + "NT>": 75, + "_t": 76, + "DENT>": 77, + "_i": 78, + "PT>": 79, + "_(": 80, + "_)": 81, + "on": 82, + "_": 90, + "OMPT>": 91, + "ROMPT>": 92, + "_;": 93, + "_b": 94, + "at": 95, + "_": 99, + "_to": 100, + "_": 101, + "_lo": 102, + "_": 103, + "_": 104, + "_": 105, + "_+": 106, + "_0": 107, + "_re": 108, + "ct": 109, + "dd": 110, + "ion": 111, + "nct": 112, + "rn": 113, + "tu": 114, + "unct": 115, + "va": 116, + "_add": 117, + "_th": 118, + "_funct": 119, + "_retu": 120, + "_function": 121, + "_return": 122, + "AS": 123, + "AV": 124, + "CR": 125, + "Cre": 126, + "HO": 127, + "IPT>": 128, + "Ja": 129, + "JAV": 130, + "N>": 131, + "Py": 
132, + "Sc": 133, + "THO": 134, + "YTHO": 135, + "_,": 136, + "_4": 137, + "_5": 138, + "_:": 139, + "_p": 140, + "_{": 141, + "_}": 142, + "_Cre": 143, + "_Ja": 144, + "_Py": 145, + "hon": 146, + "nt": 147, + "op": 148, + "or": 149, + "pt": 150, + "thon": 151, + "_": 168, + "_JavaScript": 169, + "_": 170 + }, + "merges": [ + [ + "_", + "<" + ], + [ + "D", + "E" + ], + [ + "T", + ">" + ], + [ + "_", + "a" + ], + [ + "L", + ">" + ], + [ + "N", + "L>" + ], + [ + "_<", + "NL>" + ], + [ + "N", + "T>" + ], + [ + "_", + "t" + ], + [ + "DE", + "NT>" + ], + [ + "_", + "i" + ], + [ + "P", + "T>" + ], + [ + "_", + "(" + ], + [ + "_", + ")" + ], + [ + "o", + "n" + ], + [ + "_<", + "P" + ], + [ + "_", + "f" + ], + [ + "_", + "l" + ], + [ + "r", + "e" + ], + [ + "r", + "i" + ], + [ + "C", + "O" + ], + [ + "I", + "N" + ], + [ + "M", + "PT>" + ], + [ + "O", + "MPT>" + ], + [ + "R", + "OMPT>" + ], + [ + "_", + ";" + ], + [ + "_", + "b" + ], + [ + "a", + "t" + ], + [ + "_<", + "DE" + ], + [ + "_<", + "CO" + ], + [ + "_<", + "IN" + ], + [ + "DE", + ">" + ], + [ + "_t", + "o" + ], + [ + "_" + ], + [ + "_l", + "o" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_", + "+" + ], + [ + "_", + "0" + ], + [ + "_", + "re" + ], + [ + "c", + "t" + ], + [ + "d", + "d" + ], + [ + "i", + "on" + ], + [ + "n", + "ct" + ], + [ + "r", + "n" + ], + [ + "t", + "u" + ], + [ + "u", + "nct" + ], + [ + "v", + "a" + ], + [ + "_a", + "dd" + ], + [ + "_t", + "h" + ], + [ + "_f", + "unct" + ], + [ + "_re", + "tu" + ], + [ + "_funct", + "ion" + ], + [ + "_retu", + "rn" + ], + [ + "A", + "S" + ], + [ + "A", + "V" + ], + [ + "C", + "R" + ], + [ + "C", + "re" + ], + [ + "H", + "O" + ], + [ + "I", + "PT>" + ], + [ + "J", + "a" + ], + [ + "J", + "AV" + ], + [ + "N", + ">" + ], + [ + "P", + "y" + ], + [ + "S", + "c" + ], + [ + "T", + "HO" + ], + [ + "Y", + "THO" + ], + [ + "_", + "," + ], + [ + "_", + "4" + ], + [ + "_", + "5" + ], + [ + "_", + ":" + ], + [ + "_", + "p" + ], + [ + "_", + "{" + ], + [ + "_", 
+ "}" + ], + [ + "_", + "Cre" + ], + [ + "_", + "Ja" + ], + [ + "_", + "Py" + ], + [ + "h", + "on" + ], + [ + "n", + "t" + ], + [ + "o", + "p" + ], + [ + "o", + "r" + ], + [ + "p", + "t" + ], + [ + "t", + "hon" + ], + [ + "_<", + "JAV" + ], + [ + "_" + ], + [ + "_JavaSc", + "ript" + ], + [ + "_" + ] + ] + } +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/tokenizer_config.json b/hf_release/MINDI-1.0-420M/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ccb8bd39e3cc06f433c12ec89c39a6d9651162 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/tokenizer_config.json @@ -0,0 +1,17 @@ +{ + "tokenizer_class": "MindiTokenizer", + "model_max_length": 2048, + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + "tokenizer_file": "tokenizer.json", + "auto_map": { + "AutoTokenizer": [ + null, + "tokenization_mindi.MindiTokenizer" + ] + }, + "padding_side": "right", + "truncation_side": "right" +} \ No newline at end of file diff --git a/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 b/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..4e710be0a3e967fa1a8f6630e13ecbbc7168721c --- /dev/null +++ b/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 @@ -0,0 +1,97 @@ +param( + [string]$SpaceRepoId = "Mindigenous/MINDI-1.0-420M", + [string]$SpaceFolder = "C:\AI 2\hf_space\MINDI-1.0-420M", + [string]$Token = "" +) + +$ErrorActionPreference = "Stop" +$py = "C:\AI 2\.venv\Scripts\python.exe" + +function Run-Python { + param([string]$Code) + $Code | & $py - + if ($LASTEXITCODE -ne 0) { + throw "Python command failed." + } +} + +Write-Host "[1/5] Checking/installing huggingface_hub..." +& $py -m pip install "huggingface_hub<1.0,>=0.36.2" | Out-Host + +if (-not (Test-Path $SpaceFolder)) { + throw "Space folder not found: $SpaceFolder" +} + +Write-Host "[2/5] Checking login..." 
+$loginCheck = @' +from huggingface_hub import whoami +try: + info = whoami() + print("LOGGED_IN", info.get("name", "unknown")) +except Exception: + print("NEED_LOGIN") +'@ +$loginResult = $loginCheck | & $py - +if ($LASTEXITCODE -ne 0) { throw "Login check failed." } + +$needsLogin = ($loginResult -match "NEED_LOGIN") + +if ($needsLogin) { + if ([string]::IsNullOrWhiteSpace($Token)) { + Write-Host "NEED_TOKEN_LOGIN" + exit 42 + } + + Write-Host "Logging in with provided token..." + $loginCode = @" +from huggingface_hub import login +login(token="$Token", add_to_git_credential=False) +print("LOGIN_OK") +"@ + Run-Python -Code $loginCode +} + +Write-Host "[3/5] Creating/ensuring Space repo..." +$createCode = @" +from huggingface_hub import HfApi +api = HfApi() +api.create_repo( + repo_id="$SpaceRepoId", + repo_type="space", + private=False, + space_sdk="gradio", + exist_ok=True, +) +print("SPACE_READY") +"@ +Run-Python -Code $createCode + +Write-Host "[4/5] Uploading Space files..." +$uploadCode = @" +from huggingface_hub import HfApi +api = HfApi() +api.upload_folder( + folder_path=r"$SpaceFolder", + repo_id="$SpaceRepoId", + repo_type="space", + commit_message="Deploy MINDI 1.0 420M Space", +) +print("UPLOAD_OK") +"@ +Run-Python -Code $uploadCode + +Write-Host "[5/5] Verifying live files..." +$verifyCode = @" +from huggingface_hub import list_repo_files +repo_id = "$SpaceRepoId" +files = list_repo_files(repo_id, repo_type="space") +required = ["app.py", "requirements.txt", "README.md"] +missing = [f for f in required if f not in files] +print("FILES_COUNT", len(files)) +print("MISSING", missing) +print("SPACE_URL", f"https://huggingface.co/spaces/{repo_id}") +"@ +Run-Python -Code $verifyCode + +Write-Host "Deployment completed successfully." 
+ diff --git a/hf_space/MINDI-1.0-420M/README.md b/hf_space/MINDI-1.0-420M/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7efa7722013e11576a7738b9d0cf64f1c1bb6d2f --- /dev/null +++ b/hf_space/MINDI-1.0-420M/README.md @@ -0,0 +1,37 @@ +--- +title: MINDI 1.0 420M +emoji: "💻" +colorFrom: gray +colorTo: blue +sdk: gradio +sdk_version: 5.5.0 +app_file: app.py +pinned: false +license: mit +--- + +# MINDI 1.0 420M Space + +This Space serves **MINDI 1.0 420M** as a browser-based coding assistant. + +## Model + +- Model repo: `Mindigenous/MINDI-1.0-420M` +- Focus: Python first, JavaScript second +- Use cases: code generation, completion, bug-fix suggestions, explanation + +## Notes for Free CPU Tier + +- First load can be slow because the model is large. +- Inference latency is expected on CPU. +- For faster responses, upgrade Space hardware later. + +## Example Prompts + +- `Write a Python function to merge two sorted lists.` +- `Fix this JavaScript debounce function and explain the bug.` +- `Implement BFS on an adjacency list in Python.` + +## Safety + +Always review and test generated code before production use. diff --git a/hf_space/MINDI-1.0-420M/app.py b/hf_space/MINDI-1.0-420M/app.py new file mode 100644 index 0000000000000000000000000000000000000000..699e5b40bc155eac9c97a39680cc82cf72563f76 --- /dev/null +++ b/hf_space/MINDI-1.0-420M/app.py @@ -0,0 +1,246 @@ +""" +Hugging Face Space app for MINDI 1.0 420M. +This app loads the public model repo and serves a coding-focused chat UI. 
+""" + +from __future__ import annotations + +import re +import time +from functools import lru_cache +from typing import Any, Dict, List + +import gradio as gr +import torch +from huggingface_hub import hf_hub_download +from transformers import PreTrainedTokenizerFast +from transformers import AutoModelForCausalLM + +MODEL_ID = "Mindigenous/MINDI-1.0-420M" +MAX_CONTEXT_CHARS = 2400 +CPU_MAX_NEW_TOKENS = 96 +CPU_MAX_TIME_SECONDS = 20.0 + + +def _looks_like_coding_prompt(text: str) -> bool: + """Simple keyword gate so random chat gets a helpful coding-only response.""" + text_l = text.lower() + keywords = [ + "python", "javascript", "js", "function", "bug", "error", "traceback", + "class", "loop", "array", "dict", "api", "sql", "regex", "algorithm", + "code", "implement", "fix", "refactor", "optimize", + ] + return any(k in text_l for k in keywords) + + +def _language_token(language: str) -> str: + return "" if language.lower().startswith("java") else "" + + +def _cleanup_generated_text(text: str) -> str: + """Remove training markers and keep the code region only.""" + # Prefer output after the marker when present. + if "" in text: + text = text.split("", 1)[1] + + # Remove known special tokens but keep real code content. + text = re.sub(r"<(BOS|EOS|PROMPT|CODE|PYTHON|JAVASCRIPT|PAD|UNK|INDENT|DEDENT|NL)>", "", text) + + # Normalize spacing/newlines after marker cleanup. + text = text.replace("\\n", "\n") + text = re.sub(r"\n{3,}", "\n\n", text) + text = text.strip() + + return text + + +def _build_prompt(message: str, history: List[Dict[str, str]], language: str) -> str: + """Build prompt in the same style used during training.""" + # Keep short recent context for better continuity on free CPU. 
+ snippets: List[str] = [] + for item in history[-3:]: + role = item.get("role", "user") + content = (item.get("content") or "").strip() + if content: + prefix = "User" if role == "user" else "Assistant" + snippets.append(f"{prefix}: {content}") + + context_text = "\n".join(snippets) + if len(context_text) > MAX_CONTEXT_CHARS: + context_text = context_text[-MAX_CONTEXT_CHARS:] + + combined = message.strip() + if context_text: + combined = f"Conversation so far:\n{context_text}\n\nCurrent request:\n{message.strip()}" + + return f" {_language_token(language)} {combined} " + + +@lru_cache(maxsize=1) +def _load_runtime() -> tuple[Any, Any, torch.device, torch.dtype]: + """Load tokenizer + model once for the Space process.""" + # Load tokenizer directly from tokenizer.json to avoid custom tokenizer + # path resolution issues in dynamic module cache on Spaces. + tokenizer_file = hf_hub_download(repo_id=MODEL_ID, filename="tokenizer.json") + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=tokenizer_file, + bos_token="", + eos_token="", + unk_token="", + pad_token="", + ) + + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + dtype = torch.float16 if use_cuda else torch.float32 + + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + trust_remote_code=True, + torch_dtype=dtype, + ) + model.to(device) + model.eval() + # Bridge custom config names to standard Transformers generation fields. + if not hasattr(model.config, "num_hidden_layers") and hasattr(model.config, "n_layers"): + model.config.num_hidden_layers = int(model.config.n_layers) + if not hasattr(model.config, "num_attention_heads") and hasattr(model.config, "n_heads"): + model.config.num_attention_heads = int(model.config.n_heads) + if not hasattr(model.config, "hidden_size") and hasattr(model.config, "d_model"): + model.config.hidden_size = int(model.config.d_model) + + # Ensure pad token is defined for stable generation. 
+ if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model, device, dtype + + +def _generate(message: str, history: List[Dict[str, str]], language: str, temperature: float, top_p: float, max_new_tokens: int) -> str: + if not _looks_like_coding_prompt(message): + return "MINDI is coding-focused. Please ask a coding question (Python or JavaScript)." + + tokenizer, model, device, _ = _load_runtime() + + prompt = _build_prompt(message=message, history=history, language=language) + inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512) + # Custom MINDI model forward() does not consume token_type_ids. + if "token_type_ids" in inputs: + inputs.pop("token_type_ids") + inputs = {k: v.to(device) for k, v in inputs.items()} + + eos_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 3 + pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id + + effective_max_new_tokens = int(max_new_tokens) + do_sample = temperature > 0 + max_time = None + if device.type == "cpu": + effective_max_new_tokens = min(effective_max_new_tokens, CPU_MAX_NEW_TOKENS) + do_sample = False + max_time = CPU_MAX_TIME_SECONDS + + start = time.perf_counter() + with torch.no_grad(): + out = model.generate( + **inputs, + max_new_tokens=effective_max_new_tokens, + do_sample=do_sample, + temperature=max(temperature, 1e-5), + top_p=top_p, + eos_token_id=eos_id, + pad_token_id=pad_id, + use_cache=False, + max_time=max_time, + num_beams=1, + ) + elapsed = time.perf_counter() - start + + decoded = tokenizer.decode(out[0], skip_special_tokens=False) + code = _cleanup_generated_text(decoded) + + if not code: + code = "# I could not generate code for that prompt.\n# Try being more specific about input/output behavior." 
def build_demo() -> gr.Blocks:
    """Construct the Gradio Blocks UI for the MINDI chat Space.

    Wires a message-style chatbot, generation-parameter controls, and
    submit/clear handlers around ``_generate``. Returns the Blocks app
    without launching it (the module tail launches when run directly).
    """
    # Dark theme, injected via the Blocks-level `css` argument.
    css = """
    body, .gradio-container {
        background: #0b1220 !important;
        color: #e6edf3 !important;
    }
    .gradio-container {
        font-family: 'Segoe UI', sans-serif;
    }
    #title-block {
        border: 1px solid #1f2a44;
        border-radius: 14px;
        padding: 12px 16px;
        background: linear-gradient(135deg, #111a2c, #0b1220);
    }
    """

    with gr.Blocks(title="MINDI 1.0 420M", theme=gr.themes.Base(), css=css) as demo:
        gr.Markdown(
            "## MINDI 1.0 420M\n"
            "Your local coding intelligence — 420M parameters, fully offline model hosted on Hugging Face.",
            elem_id="title-block",
        )

        # Generation controls; defaults mirror generation_config.json.
        with gr.Row():
            language = gr.Dropdown(
                choices=["python", "javascript"],
                value="python",
                label="Language Focus",
            )
            temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
            max_new_tokens = gr.Slider(32, 192, value=96, step=16, label="Max New Tokens")

        chatbot = gr.Chatbot(type="messages", height=520, label="MINDI Chat")
        msg = gr.Textbox(
            label="Prompt",
            placeholder="Ask MINDI anything about code",
            lines=3,
        )
        clear_btn = gr.Button("Clear")

        def _user(user_message: str, chat_history: List[Dict[str, str]]):
            # Append the user turn and clear the textbox.
            chat_history = chat_history or []
            chat_history.append({"role": "user", "content": user_message})
            return "", chat_history

        def _bot(chat_history: List[Dict[str, str]], lang: str, temp: float, tp: float, mnt: int):
            # Answer the most recent user turn in the history.
            chat_history = chat_history or []
            last_user = ""
            for item in reversed(chat_history):
                if item.get("role") == "user":
                    last_user = item.get("content", "")
                    break
            response = _generate(last_user, chat_history, lang, temp, tp, mnt)
            chat_history.append({"role": "assistant", "content": response})
            return chat_history

        # Two-step submit: echo the user turn immediately (unqueued),
        # then run generation as a chained callback.
        msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False).then(
            _bot,
            [chatbot, language, temperature, top_p, max_new_tokens],
            [chatbot],
        )

        clear_btn.click(lambda: [], None, chatbot, queue=False)

    return demo


# Module-level app object expected by the Spaces runtime.
demo = build_demo()

if __name__ == "__main__":
    # Bounded queue keeps the free CPU tier from piling up requests.
    demo.queue(max_size=16).launch()
sha256:e24ebb82e60ecb4d05713ca1e413f896e6fa83998d317a9715463ec340c3ef29 +size 397 diff --git a/logs/start_mindi_test.out.log b/logs/start_mindi_test.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test10.err.log b/logs/start_mindi_test10.err.log new file mode 100644 index 0000000000000000000000000000000000000000..099d0abee3c088def36dfd3a64509ae07b2d2dbe --- /dev/null +++ b/logs/start_mindi_test10.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564f37db2cc0b0ca3d2960f47e8419114989f3de6a2596782332be13e3bd6a0e +size 2692 diff --git a/logs/start_mindi_test10.out.log b/logs/start_mindi_test10.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test11.err.log b/logs/start_mindi_test11.err.log new file mode 100644 index 0000000000000000000000000000000000000000..fc7d74df0fa83e32218c8d11fb670cd387b7cdf4 --- /dev/null +++ b/logs/start_mindi_test11.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec8dfc4a7f70745d002ddc05a751241f68e8461753773f97555fa1755738443 +size 1813 diff --git a/logs/start_mindi_test11.out.log b/logs/start_mindi_test11.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test2.err.log b/logs/start_mindi_test2.err.log new file mode 100644 index 0000000000000000000000000000000000000000..efe1684b8bd02ac0ed9a0785479baf5eae84da4a --- /dev/null +++ b/logs/start_mindi_test2.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f5126be1d6fb3b72cec93eebaa41b901b1dd4689fa0cf872a6f1b756d59006 +size 224 diff --git a/logs/start_mindi_test2.out.log b/logs/start_mindi_test2.out.log new file mode 100644 index 0000000000000000000000000000000000000000..338fd7760fa44e75fc78fea5e526c36870ebd718 --- 
/dev/null +++ b/logs/start_mindi_test2.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffef22ad744514bfd22ba830666fb356c148b735f961ff6ef1e8b76b63c86c12 +size 159 diff --git a/logs/start_mindi_test3.err.log b/logs/start_mindi_test3.err.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test3.out.log b/logs/start_mindi_test3.out.log new file mode 100644 index 0000000000000000000000000000000000000000..8dd8f06e5ebcde5b0f21f16a20d7932bf04de580 --- /dev/null +++ b/logs/start_mindi_test3.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f532f28bee6f0b093e6ea80782647da0f4f3f932def12e994fe5561a46c5ac +size 14701 diff --git a/logs/start_mindi_test4.err.log b/logs/start_mindi_test4.err.log new file mode 100644 index 0000000000000000000000000000000000000000..04d477b15dea7e472b365fafc8494a9c9bdd1f41 --- /dev/null +++ b/logs/start_mindi_test4.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8058180dd50e94d7b0de1757a962128b32c5571f94f477647a61c04aaa5cb6 +size 288 diff --git a/logs/start_mindi_test4.out.log b/logs/start_mindi_test4.out.log new file mode 100644 index 0000000000000000000000000000000000000000..faeb1bb34d6b70b423b462db10b81dade54dd32d --- /dev/null +++ b/logs/start_mindi_test4.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8e611bd3d3c2aa86b61a66ae8be8842412e003a93217e754dcad3bc968f1af +size 10200 diff --git a/logs/start_mindi_test5.err.log b/logs/start_mindi_test5.err.log new file mode 100644 index 0000000000000000000000000000000000000000..9c6571c99608ecf4621ba9810fce4ab33bdf9aa0 --- /dev/null +++ b/logs/start_mindi_test5.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e745e1abf0e2332620f064a611fec0df39d276b1f1578ac62a09b25f3b9a96f +size 350 diff --git a/logs/start_mindi_test5.out.log 
b/logs/start_mindi_test5.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test6.err.log b/logs/start_mindi_test6.err.log new file mode 100644 index 0000000000000000000000000000000000000000..03a699127bd4fc34657bb3753096e334607ee048 --- /dev/null +++ b/logs/start_mindi_test6.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9c9fcac3877a556b8ca642cf654cf9ca60cfbecb31baff5840fc0c2e94a525 +size 34 diff --git a/logs/start_mindi_test6.out.log b/logs/start_mindi_test6.out.log new file mode 100644 index 0000000000000000000000000000000000000000..f344968792eb382dc05b1be94b36977b54da1701 --- /dev/null +++ b/logs/start_mindi_test6.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62816b259003b9d18c287a836ad733f8cdc052e893c09f18e0d41b16b3b2436 +size 41 diff --git a/logs/start_mindi_test7.err.log b/logs/start_mindi_test7.err.log new file mode 100644 index 0000000000000000000000000000000000000000..59a5928d1c7b98b58c43b00665786748c942b9c7 --- /dev/null +++ b/logs/start_mindi_test7.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b2c3945751939e08a4fa5f05cb647f05552e2fa230de22859d262def416b0b +size 291 diff --git a/logs/start_mindi_test7.out.log b/logs/start_mindi_test7.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e75f37db2a9e9171a47d79c29540bff0fbe951c0 --- /dev/null +++ b/logs/start_mindi_test7.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4999c0b72f6da602c4e28ce40aa7f5da377bce02743a8c344bb32ecadd0c0a51 +size 15486 diff --git a/logs/start_mindi_test8.err.log b/logs/start_mindi_test8.err.log new file mode 100644 index 0000000000000000000000000000000000000000..9ca61d8c0a3f7f5d7ddd193f33e8d0dbee4793c6 --- /dev/null +++ b/logs/start_mindi_test8.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5f7ed6ff7728d6cf48413fd39bad9d370cdfb1ae99f5fe32854c5927d724cbab +size 44232 diff --git a/logs/start_mindi_test8.out.log b/logs/start_mindi_test8.out.log new file mode 100644 index 0000000000000000000000000000000000000000..6118aab8736b1f892a3bfb0baa76f08ed04fd533 --- /dev/null +++ b/logs/start_mindi_test8.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a96a9fdd72b29c0845362ca8ea92dae01835ab12bfcc94c4eadba1a0a76f494e +size 48 diff --git a/logs/start_mindi_test9.err.log b/logs/start_mindi_test9.err.log new file mode 100644 index 0000000000000000000000000000000000000000..fc7d74df0fa83e32218c8d11fb670cd387b7cdf4 --- /dev/null +++ b/logs/start_mindi_test9.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec8dfc4a7f70745d002ddc05a751241f68e8461753773f97555fa1755738443 +size 1813 diff --git a/logs/start_mindi_test9.out.log b/logs/start_mindi_test9.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/train.log b/logs/train.log new file mode 100644 index 0000000000000000000000000000000000000000..f1cfcd527d35505eb14ed9e39f81a20a9a4a68e5 --- /dev/null +++ b/logs/train.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9c40f4da1cd9ccfeebbb3cacf0874e3b14189ba286291f80cb3cb7b7ee25d7 +size 1389 diff --git a/logs/train_live.log b/logs/train_live.log new file mode 100644 index 0000000000000000000000000000000000000000..6cd4bb4e1c5e3757ebde3bf1bac71a2e6f2b090c --- /dev/null +++ b/logs/train_live.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab71adb1d0d786c87e87fcf4b16c6041eb9fce2e7d5c868ebbba39bed574b4b +size 696468 diff --git a/models/lora/custom_lora_v1/adapter_meta.json b/models/lora/custom_lora_v1/adapter_meta.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce0141e6f2ec90055aaa445c5a8e3e4ce2c5f9f --- /dev/null +++ 
b/models/lora/custom_lora_v1/adapter_meta.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29a1ee1234ec290e5d23343363e9733df70078dad5adac38fe007683a881894a +size 336 diff --git a/models/lora/custom_lora_v1/best.pt b/models/lora/custom_lora_v1/best.pt new file mode 100644 index 0000000000000000000000000000000000000000..255e5ea2fb16f55f20e2e03c11b65e8fc73cc98e --- /dev/null +++ b/models/lora/custom_lora_v1/best.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8115abb545aea63105017b908eadcee15e2a4285cedac5ca665c7a3c76ca512e +size 46044542 diff --git a/models/lora/custom_lora_v1/latest.pt b/models/lora/custom_lora_v1/latest.pt new file mode 100644 index 0000000000000000000000000000000000000000..467f0eb80aff06a0f0a06e6d04ef94eede7b2083 --- /dev/null +++ b/models/lora/custom_lora_v1/latest.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6242b27c6fdcb55e889d1d648e150e338848cbccf7bbf0a454eb6af3c969e6 +size 46095014 diff --git a/models/lora/custom_lora_v1/step_100.pt b/models/lora/custom_lora_v1/step_100.pt new file mode 100644 index 0000000000000000000000000000000000000000..373a591f6e4bc6669106476796b9faeaa38b5d38 --- /dev/null +++ b/models/lora/custom_lora_v1/step_100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af091329497df29e4b50253eb30226439351cba65c32d1fb660427e428d1a31d +size 46102030 diff --git a/models/lora/custom_lora_v1/step_200.pt b/models/lora/custom_lora_v1/step_200.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b02a7f35d37ed86ca65b2d49796d70351b7d906 --- /dev/null +++ b/models/lora/custom_lora_v1/step_200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4217cf2881b3839884cba7525bd96d2ef5e99bc0792048102d1d072293733936 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_300.pt b/models/lora/custom_lora_v1/step_300.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f6bf4411e8539735403c44b73885144bc7e03bde --- /dev/null +++ b/models/lora/custom_lora_v1/step_300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fdf20dbed62d8250906e82b90e4ffd1d6ef78408dd39800e42875f46f44c670 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_400.pt b/models/lora/custom_lora_v1/step_400.pt new file mode 100644 index 0000000000000000000000000000000000000000..97e0505061d86dd2aac4abdf1f5e19c3e037fa32 --- /dev/null +++ b/models/lora/custom_lora_v1/step_400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe3e5fe97e59ac822a50767cf222c98a548b491b4250fc94639f57c68778d7b +size 46102030 diff --git a/models/lora/custom_lora_v1/step_5.pt b/models/lora/custom_lora_v1/step_5.pt new file mode 100644 index 0000000000000000000000000000000000000000..d400660c74f4caa1c9db0c5569bc0c243cee3603 --- /dev/null +++ b/models/lora/custom_lora_v1/step_5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1112fbbc9909b39f2a9a669020be217fdf91e57ef6a661f7fc5933375637e63 +size 46095014 diff --git a/models/lora/custom_lora_v1/step_500.pt b/models/lora/custom_lora_v1/step_500.pt new file mode 100644 index 0000000000000000000000000000000000000000..a238c791f2a0f11346f07fea2b80f81c9c3feaba --- /dev/null +++ b/models/lora/custom_lora_v1/step_500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efc25c990106212b7653694c1625b0842191fba9bc772ddd06af2b2413b6ec9 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_600.pt b/models/lora/custom_lora_v1/step_600.pt new file mode 100644 index 0000000000000000000000000000000000000000..814ca607dbdf9c7f9ac8a7811abf99aad0be7623 --- /dev/null +++ b/models/lora/custom_lora_v1/step_600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:108c7c7c0f26592383b5f8fea392465d5d8efea07ba3ec3578aef4cb648c8c6c +size 46102030 diff --git a/models/lora/custom_lora_v1/step_700.pt 
b/models/lora/custom_lora_v1/step_700.pt new file mode 100644 index 0000000000000000000000000000000000000000..4134295c612862b133fda00662643d9d90b4e914 --- /dev/null +++ b/models/lora/custom_lora_v1/step_700.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94d96ae590f17f9d603d4cab2ef847b710e3f8b4ce1dd755a5fffeed60c44dd +size 46102030 diff --git a/models/quantized/model_step3200_int8_state.pt b/models/quantized/model_step3200_int8_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..088a51d19e14e6f270c8aea0040f23a1c25512ae --- /dev/null +++ b/models/quantized/model_step3200_int8_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba816d83052a2547af224d4fa1cb6ec9836c08df62c570958885acfab1817ef4 +size 654678026 diff --git a/release/MINDI_1.0_420M/.deps_installed b/release/MINDI_1.0_420M/.deps_installed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/release/MINDI_1.0_420M/Start_MINDI.bat b/release/MINDI_1.0_420M/Start_MINDI.bat new file mode 100644 index 0000000000000000000000000000000000000000..c4249ac6129941bba23cc65f01319893a44fa31d --- /dev/null +++ b/release/MINDI_1.0_420M/Start_MINDI.bat @@ -0,0 +1,38 @@ +@echo off +title MINDI 1.0 420M +setlocal +cd /d "%~dp0" + +set "BOOTSTRAP_PY=%~dp0..\..\.venv\Scripts\python.exe" +set "VENV_PY=%~dp0.venv\Scripts\python.exe" + +if exist "%VENV_PY%" goto after_venv +if exist "%~dp0.venv" rmdir /s /q "%~dp0.venv" +echo [setup] Creating virtual environment... +if exist "%BOOTSTRAP_PY%" ( + "%BOOTSTRAP_PY%" -m venv "%~dp0.venv" +) else ( + py -3.11 -m venv "%~dp0.venv" +) +if not exist "%VENV_PY%" ( + echo [error] Python 3.11 is required. Install Python 3.11 and re-run Start_MINDI.bat. + pause + exit /b 1 +) + +:after_venv +"%VENV_PY%" -m pip install --upgrade pip >nul + +if exist "%~dp0.deps_installed" goto run_app +echo [setup] Installing dependencies (first run only)... 
+"%VENV_PY%" -m pip install -r "%~dp0requirements_portable.txt" +if errorlevel 1 ( + echo [error] Dependency install failed. Check internet and Python version. + pause + exit /b 1 +) +type nul > "%~dp0.deps_installed" + +:run_app +"%VENV_PY%" "%~dp0app\launch_portable_chat.py" --config "%~dp0app\portable_chat_config.yaml" +endlocal diff --git a/release/MINDI_1.0_420M/app/launch_portable_chat.py b/release/MINDI_1.0_420M/app/launch_portable_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..5e2a3a007665d9f0fb400d20187278fe4c46e750 --- /dev/null +++ b/release/MINDI_1.0_420M/app/launch_portable_chat.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import gradio as gr +import torch +import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer +from src.inference_engine.inference_engine import InferenceEngine, DecodingConfig + + +def load_yaml(path: Path): + return yaml.safe_load(path.read_text(encoding="utf-8-sig")) + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--config", default="app/portable_chat_config.yaml") + ap.add_argument("--self_test", action="store_true") + args = ap.parse_args() + + cfg = load_yaml(PROJECT_ROOT / args.config) + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + model = 
CodeTransformerLM(mcfg).cpu().float() + model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) + state = torch.load(PROJECT_ROOT / cfg["model"]["quantized_state_path"], map_location="cpu") + model.load_state_dict(state) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=torch.device("cpu")) + dcfg = DecodingConfig(max_new_tokens=220, min_tokens_before_stop_check=64) + + if args.self_test: + out = engine.generate_with_retry("Write a Python function to add two numbers.", "python", dcfg) + code = out["final"]["code"] + print("portable_self_test_ok=", bool(code.strip())) + return + + def respond(prompt, history): + history = history or [] + p = (prompt or "").strip() + if not p: + return history, "" + out = engine.generate_with_retry(p, "python", dcfg) + history.append((p, out["final"]["code"])) + return history, "" + + with gr.Blocks(title="MINDI 1.0 420M") as demo: + gr.Markdown("## MINDI 1.0 420M (INT8 Portable)") + chat = gr.Chatbot(height=520) + box = gr.Textbox(label="Prompt", lines=4) + btn = gr.Button("Generate") + clear = gr.Button("Clear") + btn.click(respond, [box, chat], [chat, box]) + box.submit(respond, [box, chat], [chat, box]) + clear.click(lambda: ([], ""), None, [chat, box]) + + demo.launch(server_name=cfg["server"].get("host", "127.0.0.1"), server_port=int(cfg["server"].get("port", 7861)), share=False, inbrowser=False) + + +if __name__ == "__main__": + main() + diff --git a/release/MINDI_1.0_420M/app/model_config.yaml b/release/MINDI_1.0_420M/app/model_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5667b78688d6890d294fc808c0110edd093f6a7 --- /dev/null +++ b/release/MINDI_1.0_420M/app/model_config.yaml @@ -0,0 +1,18 @@ +# Component 4 model config. +# You can switch the preset name or directly edit dimensions below. 
+ +preset: medium_420m + +model: + vocab_size: 50000 + max_seq_len: 2048 + d_model: 1152 + n_layers: 23 + n_heads: 16 + d_ff: 4608 + dropout: 0.1 + tie_embeddings: true + gradient_checkpointing: false + init_std: 0.02 + rms_norm_eps: 0.00001 + diff --git a/release/MINDI_1.0_420M/app/portable_chat_config.yaml b/release/MINDI_1.0_420M/app/portable_chat_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7979a08a26dc0f7d96394998ece6ad94913577c --- /dev/null +++ b/release/MINDI_1.0_420M/app/portable_chat_config.yaml @@ -0,0 +1,8 @@ +model: + model_config_path: app/model_config.yaml + quantized_state_path: model/model_step3200_int8_state.pt + tokenizer_dir: model/tokenizer + +server: + host: 127.0.0.1 + port: 7861 diff --git a/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt b/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..088a51d19e14e6f270c8aea0040f23a1c25512ae --- /dev/null +++ b/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba816d83052a2547af224d4fa1cb6ec9836c08df62c570958885acfab1817ef4 +size 654678026 diff --git a/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json b/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4798c7bfd1002d16664c9d8bec52763fdbc3fe48 --- /dev/null +++ b/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json @@ -0,0 +1,799 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": 
false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFKC" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Split", + "pattern": { + "Regex": "([()\\[\\]{}.,:;])" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Metaspace", + "replacement": "_", + "prepend_scheme": "always", + "split": true + } + ] + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], 
+ "pair": [ + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 3 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": { + "type": "BPEDecoder", + "suffix": "" + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 8, + "": 9, + "": 10, + "(": 11, + ")": 12, + "+": 13, + ",": 14, + ".": 15, + "0": 16, + "4": 17, + "5": 18, + ":": 19, + ";": 20, + "<": 21, + "=": 22, + ">": 23, + "A": 24, + "C": 25, + "D": 26, + "E": 27, + "F": 28, + "H": 29, + "I": 30, + "J": 31, + "L": 32, + "M": 33, + "N": 34, + "O": 35, + "P": 36, + "R": 37, + "S": 38, + "T": 39, + "V": 40, + "W": 41, + "Y": 42, + "_": 43, + "a": 44, + "b": 45, + "c": 46, + "d": 47, + "e": 48, + "f": 49, + "g": 50, + "h": 51, + "i": 52, + "l": 53, + "m": 54, + "n": 55, + "o": 56, + "p": 57, + "r": 58, + "s": 59, + "t": 60, + "u": 61, + "v": 62, + "w": 63, + "x": 64, + "y": 65, + "{": 66, + "}": 67, + "_<": 68, + "DE": 69, + "T>": 70, + "_a": 71, + "L>": 72, + "NL>": 73, + "_": 74, + "NT>": 75, + "_t": 76, + "DENT>": 77, + "_i": 78, + "PT>": 79, + "_(": 80, + "_)": 81, + "on": 82, + "_": 90, + "OMPT>": 91, + "ROMPT>": 92, + "_;": 93, + "_b": 94, + "at": 95, + "_": 99, + "_to": 100, + "_": 101, + "_lo": 102, + "_": 103, + "_": 104, + "_": 105, + "_+": 106, + "_0": 107, + "_re": 108, + "ct": 109, + "dd": 110, + "ion": 111, + "nct": 112, + "rn": 113, + "tu": 114, + "unct": 115, + "va": 116, + "_add": 117, + "_th": 118, + "_funct": 119, + "_retu": 120, + "_function": 121, + "_return": 122, + "AS": 123, + "AV": 124, + "CR": 125, + "Cre": 126, + "HO": 127, + "IPT>": 128, + "Ja": 129, + 
"JAV": 130, + "N>": 131, + "Py": 132, + "Sc": 133, + "THO": 134, + "YTHO": 135, + "_,": 136, + "_4": 137, + "_5": 138, + "_:": 139, + "_p": 140, + "_{": 141, + "_}": 142, + "_Cre": 143, + "_Ja": 144, + "_Py": 145, + "hon": 146, + "nt": 147, + "op": 148, + "or": 149, + "pt": 150, + "thon": 151, + "_": 168, + "_JavaScript": 169, + "_": 170 + }, + "merges": [ + [ + "_", + "<" + ], + [ + "D", + "E" + ], + [ + "T", + ">" + ], + [ + "_", + "a" + ], + [ + "L", + ">" + ], + [ + "N", + "L>" + ], + [ + "_<", + "NL>" + ], + [ + "N", + "T>" + ], + [ + "_", + "t" + ], + [ + "DE", + "NT>" + ], + [ + "_", + "i" + ], + [ + "P", + "T>" + ], + [ + "_", + "(" + ], + [ + "_", + ")" + ], + [ + "o", + "n" + ], + [ + "_<", + "P" + ], + [ + "_", + "f" + ], + [ + "_", + "l" + ], + [ + "r", + "e" + ], + [ + "r", + "i" + ], + [ + "C", + "O" + ], + [ + "I", + "N" + ], + [ + "M", + "PT>" + ], + [ + "O", + "MPT>" + ], + [ + "R", + "OMPT>" + ], + [ + "_", + ";" + ], + [ + "_", + "b" + ], + [ + "a", + "t" + ], + [ + "_<", + "DE" + ], + [ + "_<", + "CO" + ], + [ + "_<", + "IN" + ], + [ + "DE", + ">" + ], + [ + "_t", + "o" + ], + [ + "_" + ], + [ + "_l", + "o" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_", + "+" + ], + [ + "_", + "0" + ], + [ + "_", + "re" + ], + [ + "c", + "t" + ], + [ + "d", + "d" + ], + [ + "i", + "on" + ], + [ + "n", + "ct" + ], + [ + "r", + "n" + ], + [ + "t", + "u" + ], + [ + "u", + "nct" + ], + [ + "v", + "a" + ], + [ + "_a", + "dd" + ], + [ + "_t", + "h" + ], + [ + "_f", + "unct" + ], + [ + "_re", + "tu" + ], + [ + "_funct", + "ion" + ], + [ + "_retu", + "rn" + ], + [ + "A", + "S" + ], + [ + "A", + "V" + ], + [ + "C", + "R" + ], + [ + "C", + "re" + ], + [ + "H", + "O" + ], + [ + "I", + "PT>" + ], + [ + "J", + "a" + ], + [ + "J", + "AV" + ], + [ + "N", + ">" + ], + [ + "P", + "y" + ], + [ + "S", + "c" + ], + [ + "T", + "HO" + ], + [ + "Y", + "THO" + ], + [ + "_", + "," + ], + [ + "_", + "4" + ], + [ + "_", + "5" + ], + [ + "_", + ":" + ], + [ + "_", + "p" + ], 
+ [ + "_", + "{" + ], + [ + "_", + "}" + ], + [ + "_", + "Cre" + ], + [ + "_", + "Ja" + ], + [ + "_", + "Py" + ], + [ + "h", + "on" + ], + [ + "n", + "t" + ], + [ + "o", + "p" + ], + [ + "o", + "r" + ], + [ + "p", + "t" + ], + [ + "t", + "hon" + ], + [ + "_<", + "JAV" + ], + [ + "_" + ], + [ + "_JavaSc", + "ript" + ], + [ + "_" + ] + ] + } +} \ No newline at end of file diff --git a/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json b/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..32329ced84f28ebf5ed232d510fb78763c616e15 --- /dev/null +++ b/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "vocab_size": 50000, + "min_frequency": 2, + "model_max_length": 2048, + "indent_width": 4, + "special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ] +} \ No newline at end of file diff --git a/release/MINDI_1.0_420M/requirements_portable.txt b/release/MINDI_1.0_420M/requirements_portable.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9b36acfefdafdfe7676130830521ce916c0fb38 --- /dev/null +++ b/release/MINDI_1.0_420M/requirements_portable.txt @@ -0,0 +1,11 @@ +torch==2.4.1 +tokenizers==0.20.1 +pyyaml==6.0.2 +gradio==5.5.0 +gradio-client==1.4.2 +fastapi==0.115.5 +starlette==0.41.3 +uvicorn==0.32.0 +httpx==0.27.2 +pydantic==2.9.2 +pygments==2.19.2 diff --git a/release/MINDI_1.0_420M/src/__init__.py b/release/MINDI_1.0_420M/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45aa8dbeca626dc18818fbfe98c473d01ebd3aff --- /dev/null +++ b/release/MINDI_1.0_420M/src/__init__.py @@ -0,0 +1,2 @@ +# This file marks src as a Python package. 
+ diff --git a/release/MINDI_1.0_420M/src/evaluation_system/__init__.py b/release/MINDI_1.0_420M/src/evaluation_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab838cd1aab93b97566844ce5c08d423d94dc41 --- /dev/null +++ b/release/MINDI_1.0_420M/src/evaluation_system/__init__.py @@ -0,0 +1 @@ +# This file marks evaluation_system as a Python package. diff --git a/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py b/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8ed48a751285ecba59b448786774f7920eed16 --- /dev/null +++ b/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py @@ -0,0 +1,186 @@ +""" +Component 6 evaluation helpers. +""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Dict, List + + +def python_syntax_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + +def save_json(path: str, payload: Dict) -> None: + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + +def _normalize_punctuation_spacing(text: str) -> str: + text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) + text = re.sub(r"([\(\[\{])\s+", r"\1", text) + text = re.sub(r"\s*=\s*", " = ", text) + text = re.sub(r"\s*\+\s*", " + ", text) + text = re.sub(r"\s*-\s*", " - ", text) + text = re.sub(r"\s*\*\s*", " * ", text) + text = re.sub(r"\s*/\s*", " / ", text) + text = re.sub(r"\s*%\s*", " % ", text) + return re.sub(r"[ \t]+", " ", text).strip() + + +def _remove_non_python_noise(line: str) -> str: + line = line.replace("", "1") + line = line.replace("\u0000", "") + line = line.replace("{", "") + line = line.replace("}", "") + line = line.replace(";", "") + return line + + +def _fix_identifier_spacing(line: str) -> str: + # def name with spaces -> def 
name_with_spaces + m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) + if m: + fn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{fn}{m.group(3)}" + + # class name with spaces -> class Name_With_Spaces + m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) + if m: + cn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{cn}{m.group(3)}" + + # assignment lhs spaces -> underscore. + if "=" in line and "==" not in line: + lhs, rhs = line.split("=", 1) + lhs_clean = lhs.strip() + if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): + lhs_clean = re.sub(r"\s+", "_", lhs_clean) + line = f"{lhs_clean} = {rhs.strip()}" + + return line + + +def _looks_like_python_line(line: str) -> bool: + if not line.strip(): + return False + starts = ( + "def ", + "class ", + "if ", + "for ", + "while ", + "try:", + "except", + "with ", + "return ", + "import ", + "from ", + "print(", + ) + s = line.strip() + if s.startswith(starts): + return True + if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): + return True + return False + + +def _trim_to_code(lines: List[str]) -> List[str]: + # Drop noisy preamble lines until first plausible Python line. + i = 0 + while i < len(lines) and not _looks_like_python_line(lines[i]): + i += 1 + lines = lines[i:] if i < len(lines) else [] + # Keep only plausible lines after start; allow blank lines. 
+ out = [] + for line in lines: + if not line.strip(): + out.append(line) + continue + if _looks_like_python_line(line) or line.startswith(" "): + out.append(line) + return out + + +def _best_effort_python_format(lines: List[str]) -> List[str]: + out: List[str] = [] + indent = 0 + for raw in lines: + line = raw.strip() + if not line: + out.append("") + continue + + if line in {"return", "pass", "break", "continue"}: + indent = max(0, indent - 1) + + out.append((" " * indent) + line) + + if line.endswith(":"): + indent += 1 + + return out + + +def restore_code_from_structured(decoded: str) -> str: + text = decoded + for tok in ["", "", "", "", ""]: + text = text.replace(tok, "") + + if "" in text: + text = text.split("", 1)[1] + + text = text.replace("_", " ") + tokens = text.strip().split() + + lines: List[str] = [] + current_tokens: List[str] = [] + indent = 0 + + for tok in tokens: + if tok == "": + indent += 1 + continue + if tok == "": + indent = max(0, indent - 1) + continue + if tok == "": + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + else: + lines.append("") + current_tokens = [] + continue + current_tokens.append(tok) + + if current_tokens: + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + + lines = _trim_to_code(lines) + lines = _best_effort_python_format(lines) + + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + return "\n".join(lines).strip() diff --git a/release/MINDI_1.0_420M/src/inference_engine/__init__.py b/release/MINDI_1.0_420M/src/inference_engine/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..245858df2c08ea0b4dfb0f65ef3c3a7115ca87fe --- /dev/null +++ b/release/MINDI_1.0_420M/src/inference_engine/__init__.py @@ -0,0 +1 @@ +# This file marks inference_engine as a Python package. diff --git a/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py b/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..27dfcdc9489ebf39e18a41a2314fb0b83cdb94c0 --- /dev/null +++ b/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py @@ -0,0 +1,211 @@ +""" +Component 7: Inference engine for local code generation. + +Features: +- Deterministic low-temperature greedy mode. +- Stop rules for clean function completion. +- Syntax-aware retry with up to 3 attempts. +""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from src.evaluation_system.code_eval import restore_code_from_structured +from src.model_architecture.code_transformer import CodeTransformerLM +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class DecodingConfig: + max_new_tokens: int = 300 + # Mode 1: deterministic output + greedy_temperature: float = 0.0 + # Retry mode 2 + retry2_temperature: float = 0.25 + retry2_top_p: float = 0.85 + # Retry mode 3 + retry3_temperature: float = 0.35 + retry3_top_p: float = 0.90 + max_retries: int = 3 + min_tokens_before_stop_check: int = 64 + # Stop only when function body is non-trivial. 
+ min_function_body_statements: int = 2 + + +class InferenceEngine: + def __init__(self, model: CodeTransformerLM, tokenizer: CodeTokenizer, device: torch.device) -> None: + self.model = model + self.tokenizer = tokenizer + self.device = device + self.model.eval() + + @staticmethod + def _syntax_ok_python(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + @staticmethod + def _function_completion_score(code: str) -> int: + # Higher score = more complete usable function. + try: + tree = ast.parse(code) + except Exception: + return 0 + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return 0 + fn = funcs[-1] + body_len = len(fn.body) + has_return = any(isinstance(n, ast.Return) for n in ast.walk(fn)) + return body_len + (2 if has_return else 0) + + def _looks_complete_function(self, code: str, min_body_statements: int) -> bool: + if "def " not in code: + return False + try: + tree = ast.parse(code) + except Exception: + return False + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return False + fn = funcs[-1] + if len(fn.body) < min_body_statements: + return False + return True + + def _sample_next( + self, + logits: torch.Tensor, + temperature: float, + top_p: float, + ) -> torch.Tensor: + if temperature <= 0: + return torch.argmax(logits, dim=-1, keepdim=True) + + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + denom = sorted_probs.sum(dim=-1, keepdim=True).clamp_min(1e-12) + sorted_probs = sorted_probs / denom + sampled = torch.multinomial(sorted_probs, num_samples=1) + return sorted_idx.gather(-1, sampled) + + @torch.no_grad() + def _generate_once( + self, + prompt: str, + language: 
str, + max_new_tokens: int, + temperature: float, + top_p: float, + min_tokens_before_stop_check: int, + min_function_body_statements: int, + ) -> Dict[str, object]: + prompt_text = self.tokenizer.format_training_sample(prompt=prompt, code="", language=language) + prompt_text = prompt_text.replace(" ", "").strip() + + ids = self.tokenizer.encode(prompt_text) + eos_id = self.tokenizer.special_token_ids.get("") + + # Remove trailing EOS so generation can continue. + if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + + input_ids = torch.tensor([ids], dtype=torch.long, device=self.device) + + generated_steps = 0 + for _ in range(max_new_tokens): + out = self.model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + next_id = self._sample_next(logits, temperature=temperature, top_p=top_p) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated_steps += 1 + + # Primary stop: EOS token. + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + # Secondary stop: complete parseable function with non-trivial body. 
+ if generated_steps >= min_tokens_before_stop_check and (generated_steps % 12 == 0): + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + if self._looks_complete_function(code, min_body_statements=min_function_body_statements): + break + + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + syntax_ok = self._syntax_ok_python(code) if language == "python" else True + completion_score = self._function_completion_score(code) if language == "python" else 0 + return { + "code": code, + "syntax_ok": syntax_ok, + "generated_tokens": generated_steps, + "temperature": temperature, + "top_p": top_p, + "completion_score": completion_score, + } + + @torch.no_grad() + def generate_with_retry( + self, + prompt: str, + language: str = "python", + cfg: Optional[DecodingConfig] = None, + ) -> Dict[str, object]: + cfg = cfg or DecodingConfig() + + attempts: List[Tuple[float, float]] = [ + (cfg.greedy_temperature, 1.0), + (cfg.retry2_temperature, cfg.retry2_top_p), + (cfg.retry3_temperature, cfg.retry3_top_p), + ] + + results = [] + for i in range(min(cfg.max_retries, len(attempts))): + temp, top_p = attempts[i] + res = self._generate_once( + prompt=prompt, + language=language, + max_new_tokens=cfg.max_new_tokens, + temperature=temp, + top_p=top_p, + min_tokens_before_stop_check=cfg.min_tokens_before_stop_check, + min_function_body_statements=cfg.min_function_body_statements, + ) + res["attempt"] = i + 1 + results.append(res) + + # Syntax-aware retry: stop retries as soon as syntax is valid. + if bool(res["syntax_ok"]): + return { + "final": res, + "attempts": results, + "used_retry": i > 0, + } + + # If all retries fail, choose best completion score then longest generation. 
+ best = sorted( + results, + key=lambda x: (int(x.get("completion_score", 0)), int(x.get("generated_tokens", 0))), + reverse=True, + )[0] + return { + "final": best, + "attempts": results, + "used_retry": True, + } diff --git a/release/MINDI_1.0_420M/src/model_architecture/__init__.py b/release/MINDI_1.0_420M/src/model_architecture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..342d765602b980261490c2ba31ee4798327a8947 --- /dev/null +++ b/release/MINDI_1.0_420M/src/model_architecture/__init__.py @@ -0,0 +1,2 @@ +# This file marks model_architecture as a Python package. + diff --git a/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py b/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..456fd69995ced3fd03b24a5badc9fc816d269e8f --- /dev/null +++ b/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py @@ -0,0 +1,264 @@ +""" +Component 4: Transformer model architecture for code generation. + +This module defines a decoder-only transformer built from scratch in PyTorch. +It is modular through configuration so model size can be scaled up/down. +""" + +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +@dataclass +class ModelConfig: + # Vocabulary size from tokenizer. + vocab_size: int = 50_000 + # Maximum context length in tokens. + max_seq_len: int = 2048 + # Core hidden size of transformer. + d_model: int = 1152 + # Number of transformer blocks. + n_layers: int = 23 + # Number of attention heads. + n_heads: int = 16 + # Feed-forward hidden size. + d_ff: int = 4608 + # Dropout for regularization. + dropout: float = 0.1 + # Whether to tie token embedding and LM head weights. 
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization.

    Rescales each feature vector by the reciprocal of its RMS value and
    applies a learned per-dimension gain. Unlike LayerNorm there is no
    mean subtraction and no bias term.
    """

    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        # eps keeps rsqrt finite for (near-)zero inputs.
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean_square = x.square().mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return self.weight * normalized
class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention for autoregressive code generation.

    Queries and keys receive rotary position embeddings before attention.
    All projections are bias-free.
    """

    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.n_heads = config.n_heads
        self.head_dim = config.head_dim
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.k_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout)
        self.rotary = RotaryEmbedding(head_dim=self.head_dim, max_seq_len=config.max_seq_len)

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            x: Hidden states of shape [batch, seq_len, d_model].
            attn_mask: Optional mask broadcastable to
                [batch, n_heads, seq_len, seq_len]. Boolean True means
                "may attend"; float masks are treated as additive biases.

        Returns:
            Attention output of shape [batch, seq_len, d_model].
        """
        bsz, seq_len, _ = x.shape
        q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        q, k = self.rotary(q, k, seq_len=seq_len)

        # Bug fix: F.scaled_dot_product_attention rejects attn_mask combined
        # with is_causal=True (it raises), so the original crashed whenever a
        # mask was supplied. Express causality via is_causal only when there
        # is no explicit mask; otherwise fold a lower-triangular bias into
        # the supplied mask so causal masking is never lost.
        if attn_mask is None:
            merged_mask = None
            use_causal_flag = True
        else:
            neg_inf = float("-inf")
            causal_bias = torch.zeros(seq_len, seq_len, dtype=q.dtype, device=q.device)
            causal_bias = causal_bias.masked_fill(
                torch.triu(
                    torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device),
                    diagonal=1,
                ),
                neg_inf,
            )
            if attn_mask.dtype == torch.bool:
                # Convert a boolean "keep" mask into an additive bias.
                pad_bias = torch.zeros(attn_mask.shape, dtype=q.dtype, device=q.device)
                pad_bias = pad_bias.masked_fill(~attn_mask, neg_inf)
            else:
                pad_bias = attn_mask
            merged_mask = pad_bias + causal_bias
            use_causal_flag = False

        out = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=merged_mask,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=use_causal_flag,
            scale=self.scale,
        )
        out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        return self.o_proj(out)
class TransformerBlock(nn.Module):
    """
    A single pre-norm decoder layer.

    Layout: RMSNorm -> causal self-attention -> residual add, followed by
    RMSNorm -> feed-forward -> residual add.
    """

    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.norm1 = RMSNorm(config.d_model, eps=config.rms_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.norm2 = RMSNorm(config.d_model, eps=config.rms_norm_eps)
        self.ffn = FeedForward(config)

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        attn_out = self.attn(self.norm1(x), attn_mask=attn_mask)
        x = x + attn_out
        ffn_out = self.ffn(self.norm2(x))
        return x + ffn_out
+ if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def enable_gradient_checkpointing(self, enabled: bool = True) -> None: + # Toggle gradient checkpointing mode. + self.config.gradient_checkpointing = enabled + + def forward( + self, + input_ids: torch.Tensor, + labels: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> Dict[str, torch.Tensor]: + if input_ids.dim() != 2: + raise ValueError("input_ids must be shape [batch, seq_len].") + + x = self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + if self.config.gradient_checkpointing and self.training: + x = torch.utils.checkpoint.checkpoint(block, x, attn_mask, use_reentrant=False) + else: + x = block(x, attn_mask=attn_mask) + + x = self.norm_final(x) + logits = self.lm_head(x) + + out: Dict[str, torch.Tensor] = {"logits": logits} + if labels is not None: + # Standard next-token cross entropy loss. + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + out["loss"] = loss + return out + + def estimate_num_parameters(self) -> int: + # Returns total trainable parameter count. + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def summary(self) -> Dict[str, object]: + # Returns a simple structured summary for logs/CLI. 
+ return { + "config": asdict(self.config), + "num_parameters": self.estimate_num_parameters(), + } + diff --git a/release/MINDI_1.0_420M/src/tokenizer/__init__.py b/release/MINDI_1.0_420M/src/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35cf334b1ed48cc5bd2702ff997daf5bf7fd0bf6 --- /dev/null +++ b/release/MINDI_1.0_420M/src/tokenizer/__init__.py @@ -0,0 +1,2 @@ +# This file marks tokenizer as a Python package. + diff --git a/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py b/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..b68ec2705394185a43465e345664e691245f18a5 --- /dev/null +++ b/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py @@ -0,0 +1,216 @@ +""" +Component 2: Custom code tokenizer for Python and JavaScript. + +This tokenizer is code-aware: +- It preserves indentation structure using explicit tokens. +- It keeps newline boundaries using a newline token. +- It treats code operators and brackets as separate units. +- It supports prompt+code style training samples. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional + +from tokenizers import Regex, Tokenizer +from tokenizers.decoders import BPEDecoder +from tokenizers.models import BPE +from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence +from tokenizers.pre_tokenizers import Metaspace, Sequence as PreTokenizerSequence, Split +from tokenizers.processors import TemplateProcessing +from tokenizers.trainers import BpeTrainer + + +@dataclass +class CodeTokenizerConfig: + # Vocabulary size controls how many distinct tokens the tokenizer learns. + vocab_size: int = 50_000 + # Minimum frequency filters very rare fragments. + min_frequency: int = 2 + # Sequence length is used later by training/inference components. 
+ model_max_length: int = 2048 + # Indent width is used to normalize tabs and format indentation markers. + indent_width: int = 4 + # These tokens are required for code generation workflows. + special_tokens: List[str] = None # type: ignore[assignment] + + def __post_init__(self) -> None: + if self.special_tokens is None: + self.special_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + +class CodeTokenizer: + # This wrapper owns one HF Tokenizers object plus code-specific helpers. + + def __init__(self, config: Optional[CodeTokenizerConfig] = None) -> None: + self.config = config or CodeTokenizerConfig() + self.tokenizer: Optional[Tokenizer] = None + self.special_token_ids: Dict[str, int] = {} + + def _build_base_tokenizer(self) -> Tokenizer: + """ + Creates a BPE tokenizer with code-oriented pre-tokenization rules. + """ + tokenizer = Tokenizer(BPE(unk_token="")) + tokenizer.normalizer = NormalizerSequence([NFKC()]) + + # Split multi-character operators first so they are not broken apart. + multi_op = Regex( + r"(==|!=|<=|>=|:=|->|=>|\+\+|--|\+=|-=|\*=|/=|//=|%=|\*\*|&&|\|\||<<|>>)" + ) + # Split common delimiters used heavily in code. + punct = Regex(r"([()\[\]{}.,:;])") + + tokenizer.pre_tokenizer = PreTokenizerSequence( + [ + Split(multi_op, behavior="isolated"), + Split(punct, behavior="isolated"), + Metaspace(replacement="_", prepend_scheme="always", split=True), + ] + ) + tokenizer.decoder = BPEDecoder() + return tokenizer + + def train(self, text_iterator: Iterable[str]) -> None: + """ + Trains the tokenizer from a stream of preformatted text samples. + """ + tokenizer = self._build_base_tokenizer() + trainer = BpeTrainer( + vocab_size=self.config.vocab_size, + min_frequency=self.config.min_frequency, + special_tokens=self.config.special_tokens, + show_progress=True, + ) + tokenizer.train_from_iterator(text_iterator, trainer=trainer, length=None) + + # Add BOS/EOS automatically around each single sequence. 
+ bos_id = tokenizer.token_to_id("") + eos_id = tokenizer.token_to_id("") + if bos_id is None or eos_id is None: + raise RuntimeError("Tokenizer training failed to register BOS/EOS tokens.") + tokenizer.post_processor = TemplateProcessing( + single=" $A ", + special_tokens=[("", bos_id), ("", eos_id)], + ) + + self.tokenizer = tokenizer + self.special_token_ids = { + token: tokenizer.token_to_id(token) for token in self.config.special_tokens + } + + def save(self, output_dir: str) -> None: + """ + Saves tokenizer JSON and config so all other components can reuse it. + """ + if self.tokenizer is None: + raise RuntimeError("Cannot save tokenizer before training or loading it.") + out = Path(output_dir) + out.mkdir(parents=True, exist_ok=True) + self.tokenizer.save(str(out / "tokenizer.json")) + with (out / "tokenizer_config.json").open("w", encoding="utf-8") as f: + json.dump(asdict(self.config), f, indent=2) + + @classmethod + def load(cls, tokenizer_dir: str) -> "CodeTokenizer": + """ + Loads tokenizer from disk. + """ + base = Path(tokenizer_dir) + cfg_path = base / "tokenizer_config.json" + tok_path = base / "tokenizer.json" + if not cfg_path.exists() or not tok_path.exists(): + raise FileNotFoundError( + f"Missing tokenizer files in {tokenizer_dir}. " + "Expected tokenizer.json and tokenizer_config.json." + ) + with cfg_path.open("r", encoding="utf-8") as f: + cfg_data = json.load(f) + config = CodeTokenizerConfig(**cfg_data) + obj = cls(config=config) + obj.tokenizer = Tokenizer.from_file(str(tok_path)) + obj.special_token_ids = { + token: obj.tokenizer.token_to_id(token) for token in obj.config.special_tokens + } + return obj + + def encode(self, text: str) -> List[int]: + """ + Encodes one preformatted text sample to token IDs. + """ + if self.tokenizer is None: + raise RuntimeError("Tokenizer is not ready. Train or load it first.") + return self.tokenizer.encode(text).ids + + def decode(self, token_ids: List[int]) -> str: + """ + Decodes token IDs to text. 
+ """ + if self.tokenizer is None: + raise RuntimeError("Tokenizer is not ready. Train or load it first.") + return self.tokenizer.decode(token_ids, skip_special_tokens=False) + + def format_training_sample(self, prompt: str, code: str, language: str) -> str: + """ + Converts prompt + code into one structured training text sequence. + """ + lang_token = "" if language.lower() == "python" else "" + prompt_text = self._normalize_text(prompt) + code_text = self._code_to_structure_tokens(code) + return f" {lang_token} {prompt_text} {code_text}" + + def _normalize_text(self, text: str) -> str: + """ + Normalizes regular text by cleaning newlines. + """ + return text.replace("\r\n", "\n").replace("\r", "\n").strip() + + def _code_to_structure_tokens(self, code: str) -> str: + """ + Converts raw code into a string with explicit indentation and newline markers. + """ + code = code.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " " * self.config.indent_width) + lines = code.split("\n") + indent_stack: List[int] = [0] + out_tokens: List[str] = [] + + for raw_line in lines: + # Keep blank lines as newline tokens so code structure is preserved. 
+ if raw_line.strip() == "": + out_tokens.append("") + continue + + current_indent = len(raw_line) - len(raw_line.lstrip(" ")) + line_content = raw_line.lstrip(" ") + + while current_indent < indent_stack[-1]: + indent_stack.pop() + out_tokens.append("") + + while current_indent > indent_stack[-1]: + indent_stack.append(current_indent) + out_tokens.append("") + + out_tokens.append(line_content) + out_tokens.append("") + + while len(indent_stack) > 1: + indent_stack.pop() + out_tokens.append("") + + return " ".join(out_tokens).strip() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..330901606c43d2eab90ce6be3ddc7f49fe5556ae --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +transformers +datasets +peft +accelerate +torch diff --git a/requirements_optional_windows_bitsandbytes.txt b/requirements_optional_windows_bitsandbytes.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c97f1695cec96944159878c0701c0131d82ce29 --- /dev/null +++ b/requirements_optional_windows_bitsandbytes.txt @@ -0,0 +1,5 @@ +# Optional package for Windows-only experiments. +# Important: this package frequently fails on some Windows CUDA setups. +# Keep this optional so base setup remains stable. +bitsandbytes-windows==0.37.5 + diff --git a/scripts/add_incremental_javascript_dataset.py b/scripts/add_incremental_javascript_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9bf952cc9ff2a344a4e24c7606e6d214c8d0cf21 --- /dev/null +++ b/scripts/add_incremental_javascript_dataset.py @@ -0,0 +1,250 @@ +""" +Incremental JS dataset augmentation for Component 3 outputs. + +Goal: +- Do NOT rebuild the full pipeline. +- Reuse existing cleaned/tokenized files. +- Add only new JavaScript samples from one additional HF dataset. 
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the incremental JS augmentation run."""
    cli = argparse.ArgumentParser(description="Add JS-focused dataset incrementally.")
    cli.add_argument(
        "--config",
        default="configs/component3_incremental_js.yaml",
        help="Path to YAML config.",
    )
    cli.add_argument(
        "--target_new_javascript_examples",
        type=int,
        default=None,
        help="Optional override for JS target.",
    )
    return cli.parse_args()
+ ) + + seeded = 0 + with existing_clean_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + except Exception: + continue + if not prompt or not code: + continue + # Keep unique adds hash to DB, False means already there. + pipeline._keep_unique(prompt, code) + seeded += 1 + if seeded % 5000 == 0: + pipeline.conn.commit() + pipeline.conn.commit() + return seeded + + +def main() -> None: + args = parse_args() + try: + cfg_data = load_yaml(Path(args.config)) + + existing_clean_path = Path(cfg_data["existing_clean_path"]) + existing_tokenized_path = Path(cfg_data["existing_tokenized_path"]) + existing_stats_path = Path(cfg_data["existing_stats_path"]) + tokenizer_dir = str(cfg_data["tokenizer_dir"]) + dedupe_db_path = str(cfg_data["dedupe_db_path"]) + progress_every = int(cfg_data.get("progress_every", 500)) + min_prompt_chars = int(cfg_data.get("min_prompt_chars", 8)) + min_code_chars = int(cfg_data.get("min_code_chars", 16)) + max_code_chars = int(cfg_data.get("max_code_chars", 40_000)) + target_js = ( + args.target_new_javascript_examples + if args.target_new_javascript_examples is not None + else int(cfg_data.get("target_new_javascript_examples", 20_000)) + ) + + if not existing_tokenized_path.exists(): + raise FileNotFoundError( + f"Existing tokenized dataset not found: {existing_tokenized_path}. " + "Run Component 3 full pipeline first." + ) + + new_ds = cfg_data["new_dataset"] + spec = SourceDatasetSpec( + hf_dataset_id=str(new_ds["hf_dataset_id"]), + split=str(new_ds.get("split", "train")), + prompt_field=str(new_ds["prompt_field"]), + code_field=str(new_ds["code_field"]), + language_field=new_ds.get("language_field"), + default_language=str(new_ds.get("default_language", "auto")), + ) + + # Build minimal pipeline object to reuse cleaning/dedupe/tokenization utilities. 
+ pipeline_cfg = PipelineConfig( + datasets=[spec], + tokenizer_dir=tokenizer_dir, + interim_output_dir=str(existing_clean_path.parent), + processed_output_dir=str(existing_tokenized_path.parent), + dedupe_db_path=dedupe_db_path, + min_prompt_chars=min_prompt_chars, + min_code_chars=min_code_chars, + max_code_chars=max_code_chars, + progress_every=progress_every, + ) + pipeline = HFDatasetPipeline(pipeline_cfg) + + try: + seeded = load_existing_into_dedupe_db(pipeline, existing_clean_path) + print(f"[info] Seeded dedupe DB with existing clean records: {seeded}") + + stream = load_dataset(spec.hf_dataset_id, split=spec.split, streaming=True) + added_js = 0 + seen_new = 0 + dropped_duplicate = 0 + dropped_filtered = 0 + + with existing_clean_path.open("a", encoding="utf-8") as clean_f, existing_tokenized_path.open( + "a", encoding="utf-8" + ) as tok_f: + for row in stream: + seen_new += 1 + std = pipeline._standardize_record(row=row, spec=spec) + if std is None: + dropped_filtered += 1 + continue + + prompt, code, lang = std + cleaned = pipeline._clean_and_filter(prompt=prompt, code=code, language=lang) + if cleaned is None: + dropped_filtered += 1 + continue + + c_prompt, c_code, c_lang = cleaned + if c_lang != "javascript": + dropped_filtered += 1 + continue + + if not pipeline._keep_unique(c_prompt, c_code): + dropped_duplicate += 1 + continue + + formatted_text = pipeline.tokenizer.format_training_sample( + prompt=c_prompt, code=c_code, language="javascript" + ) + token_ids = pipeline.tokenizer.encode(formatted_text) + + clean_record = {"prompt": c_prompt, "code": c_code, "language": "javascript"} + tok_record = { + "language": "javascript", + "text": formatted_text, + "input_ids": token_ids, + "length": len(token_ids), + } + clean_f.write(json.dumps(clean_record, ensure_ascii=False) + "\n") + tok_f.write(json.dumps(tok_record, ensure_ascii=False) + "\n") + + added_js += 1 + if added_js % progress_every == 0: + pipeline.conn.commit() + print( + f"[progress] 
seen_new={seen_new} added_js={added_js} " + f"dropped_duplicate={dropped_duplicate}" + ) + + if added_js >= target_js: + break + + pipeline.conn.commit() + finally: + pipeline.close() + + # Merge incremental stats into existing summary. + merged_stats: Dict[str, Any] = {} + if existing_stats_path.exists(): + with existing_stats_path.open("r", encoding="utf-8") as f: + try: + merged_stats = json.load(f) + except Exception: + merged_stats = {} + + merged_stats["incremental_js_dataset"] = spec.hf_dataset_id + merged_stats["incremental_js_target"] = target_js + merged_stats["incremental_js_added"] = added_js + merged_stats["incremental_new_seen"] = seen_new + merged_stats["incremental_new_dropped_duplicate"] = dropped_duplicate + merged_stats["incremental_new_dropped_filtered"] = dropped_filtered + merged_stats["final_clean_records_estimate"] = int(merged_stats.get("kept_total", 0)) + added_js + + with existing_stats_path.open("w", encoding="utf-8") as f: + json.dump(merged_stats, f, indent=2) + + print("Incremental JavaScript augmentation completed.") + print(f"Dataset used: {spec.hf_dataset_id}") + print(f"Target JS examples: {target_js}") + print(f"Added JS examples: {added_js}") + if added_js < target_js: + print( + "Warning: JS target not reached from this dataset after filtering/dedupe. " + "You may need one more JS dataset." 
def build_config(cfg_data: Dict[str, Any]) -> ModelConfig:
    """Build a ModelConfig from parsed YAML data.

    The YAML may name a size preset ("preset") and/or provide explicit
    overrides under the "model" key. Preset values form the base and any
    "model" entries override them field by field.

    Raises:
        ValueError: if "model" is not a mapping or the preset is unknown.
    """
    from dataclasses import asdict  # Local import: only needed for preset merging.

    preset = cfg_data.get("preset")
    model_cfg = cfg_data.get("model", {})
    if not isinstance(model_cfg, dict):
        raise ValueError("Config key 'model' must be an object.")

    if not preset:
        return ModelConfig(**model_cfg)

    presets = get_model_presets()
    if preset not in presets:
        raise ValueError(f"Unknown preset '{preset}'. Available: {list(presets.keys())}")

    # Fix: the original hand-maintained an 11-key merge dict, which would
    # silently drop any field later added to ModelConfig. asdict keeps the
    # merge in sync with the dataclass automatically.
    merged = asdict(presets[preset])
    merged.update(model_cfg)
    return ModelConfig(**merged)
1.0 420M to a Hugging Face-ready model folder. + +What this script does: +1) Loads your full-quality checkpoint (step_3200.pt by default). +2) Builds the model architecture with the exact Component 4 config. +3) Saves model weights as model.safetensors. +4) Copies tokenizer files. +5) Writes Hugging Face config files + custom model code. +6) Writes a professional model card README. +7) Writes a helper upload script with exact commands. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml +from safetensors.torch import save_file + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import ( # noqa: E402 + CodeTransformerLM, + ModelConfig, + get_model_presets, +) + + +# These IDs are fixed by CodeTokenizerConfig special token order. 
def extract_model_state(checkpoint_path: Path) -> Dict[str, torch.Tensor]:
    """Load a training checkpoint and return the bare model state dict.

    Accepts either a full Component 5 checkpoint (a dict with a ``model_state``
    key) or a raw state dict saved directly.

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
        ValueError: if the payload is not a dict-shaped state.
    """
    if not checkpoint_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Security hardening: prefer weights_only=True so untrusted pickles cannot
    # execute code on load. Older checkpoints that stored arbitrary Python
    # objects (e.g. argparse namespaces) fail that mode, so fall back to the
    # original unrestricted load for backward compatibility.
    try:
        payload = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    except Exception:
        # NOTE(review): fallback path runs the full unpickler — only load
        # checkpoints you produced yourself.
        payload = torch.load(checkpoint_path, map_location="cpu")

    if isinstance(payload, dict) and "model_state" in payload:
        state = payload["model_state"]
    elif isinstance(payload, dict):
        state = payload
    else:
        raise ValueError("Unsupported checkpoint format. Expected dict payload.")

    if not isinstance(state, dict):
        raise ValueError("Checkpoint model state is not a dictionary.")

    return state
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .configuration_mindi import MindiConfig + + +@dataclass +class _Cfg: + vocab_size: int + max_seq_len: int + d_model: int + n_layers: int + n_heads: int + d_ff: int + dropout: float + tie_embeddings: bool + init_std: float + rms_norm_eps: float + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads") + return self.d_model // self.n_heads + + +class RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) + return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin) + + @staticmethod + def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> 
torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + xe = x1 * cos - x2 * sin + xo = x1 * sin + x2 * cos + return torch.stack((xe, xo), dim=-1).flatten(-2) + + +class CausalSelfAttention(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.n_heads = cfg.n_heads + self.head_dim = cfg.head_dim + self.scale = self.head_dim ** -0.5 + self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.dropout = nn.Dropout(cfg.dropout) + self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bsz, seq_len, _ = x.shape + q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + q, k = self.rotary(q, k, seq_len=seq_len) + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, + dropout_p=self.dropout.p if self.training else 0.0, + is_causal=True, + scale=self.scale, + ) + out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1) + return self.o_proj(out) + + +class FeedForward(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False) + self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False) + self.dropout = nn.Dropout(cfg.dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = F.gelu(x, approximate="tanh") + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerBlock(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.norm1 = RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.attn = CausalSelfAttention(cfg) + self.norm2 = 
RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.ffn = FeedForward(cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x)) + x = x + self.ffn(self.norm2(x)) + return x + + +class MindiForCausalLM(PreTrainedModel): + config_class = MindiConfig + base_model_prefix = "mindi" + supports_gradient_checkpointing = False + + def __init__(self, config: MindiConfig): + super().__init__(config) + cfg = _Cfg( + vocab_size=config.vocab_size, + max_seq_len=config.max_seq_len, + d_model=config.d_model, + n_layers=config.n_layers, + n_heads=config.n_heads, + d_ff=config.d_ff, + dropout=config.dropout, + tie_embeddings=config.tie_embeddings, + init_std=config.init_std, + rms_norm_eps=config.rms_norm_eps, + ) + + self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model) + self.dropout = nn.Dropout(cfg.dropout) + self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)]) + self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False) + + if cfg.tie_embeddings: + self.lm_head.weight = self.embed_tokens.weight + + self.post_init() + + def _init_weights(self, module: nn.Module) -> None: + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def get_input_embeddings(self) -> nn.Module: + return self.embed_tokens + + def set_input_embeddings(self, value: nn.Module) -> None: + self.embed_tokens = value + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_output_embeddings(self, new_embeddings: nn.Module) -> None: + self.lm_head = new_embeddings + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> CausalLMOutputWithPast: + del attention_mask, kwargs + + x = 
self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + x = block(x) + + x = self.norm_final(x) + logits = self.lm_head(x) + + loss = None + if labels is not None: + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + + return CausalLMOutputWithPast(loss=loss, logits=logits) + + @torch.no_grad() + def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs): + del kwargs + return {"input_ids": input_ids} +''' + (output_dir / "modeling_mindi.py").write_text(content, encoding="utf-8") + + + +def write_tokenization_py(output_dir: Path) -> None: + content = '''""" +Hugging Face tokenizer class for MINDI 1.0 420M. +""" + +from pathlib import Path +from transformers import PreTrainedTokenizerFast + + +class MindiTokenizer(PreTrainedTokenizerFast): + vocab_files_names = {"tokenizer_file": "tokenizer.json"} + model_input_names = ["input_ids", "attention_mask"] + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + if kwargs.get("tokenizer_file") is None: + local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json" + if local_candidate.exists(): + kwargs["tokenizer_file"] = str(local_candidate) + return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + def __init__(self, tokenizer_file=None, **kwargs): + name_or_path = kwargs.pop("name_or_path", None) + if tokenizer_file is None and name_or_path is not None: + candidate = Path(name_or_path) / "tokenizer.json" + if candidate.exists(): + tokenizer_file = str(candidate) + if tokenizer_file is None: + tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json") + kwargs.setdefault("bos_token", "") + kwargs.setdefault("eos_token", "") + kwargs.setdefault("unk_token", "") + kwargs.setdefault("pad_token", "") + 
super().__init__(tokenizer_file=tokenizer_file, **kwargs) +''' + (output_dir / "tokenization_mindi.py").write_text(content, encoding="utf-8") +def write_model_card(output_dir: Path, repo_id: str, num_params: int) -> None: + text = f'''--- +license: mit +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- code +- python +- javascript +- local-llm +- offline +--- + +# MINDI 1.0 420M + +MINDI 1.0 420M is a 420M-parameter coding language model focused on Python first and JavaScript second. +It is built for local, offline code generation workflows. + +## Capabilities + +- Code generation from natural language prompts +- Code completion +- Bug-fix suggestions +- Code explanation + +## Model Details + +- Parameters: {num_params:,} +- Architecture: Decoder-only Transformer +- Context length: 2048 tokens +- Focus languages: Python, JavaScript + +## Hardware Requirements + +Recommended: +- NVIDIA GPU with 8GB+ VRAM +- CUDA-enabled PyTorch + +Minimum: +- CPU inference works but is slower + +## Quick Start (GPU) + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +repo_id = "{repo_id}" + +tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + repo_id, + trust_remote_code=True, + torch_dtype=torch.float16, +).cuda() + +prompt = "Write a Python function to check if a string is a palindrome." +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") + +with torch.no_grad(): + output = model.generate( + **inputs, + max_new_tokens=220, + temperature=0.2, + top_p=0.9, + do_sample=True, + ) + +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Limitations + +- The model can still produce syntax or logic errors. +- Generated code should always be reviewed and tested. +- Not intended for safety-critical production use without validation. 
def write_upload_helper(output_dir: Path, repo_id: str, private: bool) -> None:
    """Write UPLOAD_TO_HF.ps1 with the exact Hub upload commands.

    Bug fix: the previous version emitted ``--public`` for public repos, but
    ``huggingface-cli repo create`` has no such flag (repos are public by
    default) so the generated script failed. Emit ``--private`` only when
    a private repo is requested.
    """
    repo_name = repo_id.split("/")[-1]
    create_cmd = f"huggingface-cli repo create {repo_name} --type model"
    if private:
        create_cmd += " --private"
    script = f'''# Upload helper for MINDI 1.0 420M
# Run from PowerShell.

huggingface-cli login
{create_cmd}
huggingface-cli upload {repo_id} "{output_dir}" . --repo-type model
'''
    helper_path = output_dir / "UPLOAD_TO_HF.ps1"
    helper_path.write_text(script, encoding="utf-8")
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +''' + (output_dir / "LICENSE").write_text(mit, encoding="utf-8") + + +def main() -> None: + args = parse_args() + + ckpt_path = PROJECT_ROOT / args.checkpoint_path + model_cfg_path = PROJECT_ROOT / args.model_config_path + tokenizer_dir = PROJECT_ROOT / args.tokenizer_dir + output_dir = PROJECT_ROOT / args.output_dir + + if output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if not tokenizer_dir.exists(): + raise FileNotFoundError(f"Tokenizer directory not found: {tokenizer_dir}") + + model_cfg = build_model_config(model_cfg_path) + model = CodeTransformerLM(model_cfg) + + state = extract_model_state(ckpt_path) + model.load_state_dict(state, strict=True) + model.eval() + + # Save full-quality weights in safetensors format. + tensor_state = {k: v.detach().cpu().contiguous() for k, v in model.state_dict().items()} + if model_cfg.tie_embeddings and "lm_head.weight" in tensor_state: + tensor_state.pop("lm_head.weight") + save_file(tensor_state, str(output_dir / "model.safetensors"), metadata={"format": "pt"}) + + # Save Hugging Face config.json. 
+ hf_config = { + "model_type": "mindi", + "architectures": ["MindiForCausalLM"], + "auto_map": { + "AutoConfig": "configuration_mindi.MindiConfig", + "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM", + "AutoTokenizer": [None, "tokenization_mindi.MindiTokenizer"], + }, + "vocab_size": model_cfg.vocab_size, + "max_seq_len": model_cfg.max_seq_len, + "d_model": model_cfg.d_model, + "n_layers": model_cfg.n_layers, + "n_heads": model_cfg.n_heads, + "d_ff": model_cfg.d_ff, + "dropout": model_cfg.dropout, + "tie_embeddings": model_cfg.tie_embeddings, + "init_std": model_cfg.init_std, + "rms_norm_eps": model_cfg.rms_norm_eps, + "bos_token_id": BOS_ID, + "eos_token_id": EOS_ID, + "pad_token_id": PAD_ID, + "torch_dtype": "float16", + "transformers_version": "4.46.3", + } + (output_dir / "config.json").write_text(json.dumps(hf_config, indent=2), encoding="utf-8") + + generation_cfg = { + "bos_token_id": BOS_ID, + "eos_token_id": EOS_ID, + "pad_token_id": PAD_ID, + "max_new_tokens": 220, + "temperature": 0.2, + "top_p": 0.9, + "do_sample": True, + } + (output_dir / "generation_config.json").write_text(json.dumps(generation_cfg, indent=2), encoding="utf-8") + + # Copy tokenizer core file. + shutil.copy2(tokenizer_dir / "tokenizer.json", output_dir / "tokenizer.json") + + # Create HF tokenizer metadata files. 
+ tokenizer_cfg = { + "tokenizer_class": "MindiTokenizer", + "model_max_length": int(model_cfg.max_seq_len), + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + "tokenizer_file": "tokenizer.json", + "auto_map": {"AutoTokenizer": [None, "tokenization_mindi.MindiTokenizer"]}, + "padding_side": "right", + "truncation_side": "right", + } + (output_dir / "tokenizer_config.json").write_text(json.dumps(tokenizer_cfg, indent=2), encoding="utf-8") + + special_map = { + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + } + (output_dir / "special_tokens_map.json").write_text(json.dumps(special_map, indent=2), encoding="utf-8") + + # Custom model files for trust_remote_code=True loading. + write_configuration_py(output_dir) + write_modeling_py(output_dir) + write_tokenization_py(output_dir) + + # Project metadata and helper scripts. + num_params = sum(p.numel() for p in model.parameters()) + write_model_card(output_dir, args.repo_id, num_params) + write_upload_helper(output_dir, args.repo_id, args.private) + write_runtime_requirements(output_dir) + write_license(output_dir) + + print("Hugging Face package export completed.") + print(f"Output folder: {output_dir}") + print(f"Weights: {output_dir / 'model.safetensors'}") + print(f"Tokenizer: {output_dir / 'tokenizer.json'}") + print(f"Model card: {output_dir / 'README.md'}") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("HF export failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify checkpoint path, tokenizer path, and that safetensors/yaml are installed " + "in your active Python environment." 
def main() -> None:
    """Load the chat server config, build the Gradio demo, and serve it."""
    args = parse_args()
    config_file = Path(args.config)
    if not config_file.exists():
        raise FileNotFoundError(f"Config not found: {config_file}")

    raw_cfg = yaml.safe_load(config_file.read_text(encoding="utf-8"))
    server_cfg = raw_cfg.get("server", {})
    host = str(server_cfg.get("host", "127.0.0.1"))
    port = int(server_cfg.get("port", 7860))
    share = bool(server_cfg.get("share", False))

    # Single worker keeps VRAM usage predictable on an 8GB GPU.
    demo = create_demo(config_path=args.config)
    demo.queue(default_concurrency_limit=1)
    demo.launch(server_name=host, server_port=port, share=share, inbrowser=False)
+- Rebuild tokenized dataset with improved language detection. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path +from typing import Any, Dict, Optional + +import yaml + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + + +PY_HINTS = [ + "def ", + "import ", + "from ", + "print(", + "if __name__ ==", + "class ", + "lambda ", + "elif ", + "except ", +] + +JS_HINTS = [ + "function ", + "const ", + "let ", + "=>", + "console.log", + "export ", + "require(", + "document.", + "window.", + "=> {", + "var ", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Rebuild tokenized data from existing clean JSONL.") + parser.add_argument( + "--config", + default="configs/component3_reprocess_from_clean.yaml", + help="Path to YAML config.", + ) + parser.add_argument( + "--max_records", + type=int, + default=None, + help="Optional quick-test limit.", + ) + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Config format is invalid. 
def backup_file_if_needed(path: Path, enabled: bool) -> Optional[Path]:
    """Copy *path* to a sibling ``<name><suffix>.bak`` file.

    Returns the backup path, or None when backups are disabled or the
    source file does not exist.
    """
    if enabled and path.exists():
        backup_path = path.with_suffix(path.suffix + ".bak")
        shutil.copy2(path, backup_path)
        return backup_path
    return None
+ ) + + output_tokenized_path.parent.mkdir(parents=True, exist_ok=True) + output_stats_path.parent.mkdir(parents=True, exist_ok=True) + + token_backup = backup_file_if_needed( + output_tokenized_path, bool(cfg.get("backup_existing_tokenized", True)) + ) + stats_backup = backup_file_if_needed( + output_stats_path, bool(cfg.get("backup_existing_stats", True)) + ) + + tokenizer = CodeTokenizer.load(str(tokenizer_dir)) + + stats: Dict[str, int] = { + "reprocess_seen_total": 0, + "reprocess_kept_total": 0, + "reprocess_dropped_invalid_json": 0, + "reprocess_dropped_empty_fields": 0, + "language_python": 0, + "language_javascript": 0, + } + + with input_clean_path.open("r", encoding="utf-8") as in_f, output_tokenized_path.open( + "w", encoding="utf-8" + ) as out_f: + for line in in_f: + stats["reprocess_seen_total"] += 1 + if max_records is not None and stats["reprocess_seen_total"] > int(max_records): + break + + line = line.strip() + if not line: + stats["reprocess_dropped_empty_fields"] += 1 + continue + + try: + row = json.loads(line) + except json.JSONDecodeError: + stats["reprocess_dropped_invalid_json"] += 1 + continue + + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + raw_language = str(row.get("language", "")).strip() + if not prompt or not code: + stats["reprocess_dropped_empty_fields"] += 1 + continue + + language = infer_language( + prompt=prompt, + code=code, + raw_language=raw_language, + ignore_existing_labels=ignore_existing_labels, + ) + if language == "javascript": + stats["language_javascript"] += 1 + else: + stats["language_python"] += 1 + + formatted_text = tokenizer.format_training_sample( + prompt=prompt, code=code, language=language + ) + token_ids = tokenizer.encode(formatted_text) + out_row = { + "language": language, + "text": formatted_text, + "input_ids": token_ids, + "length": len(token_ids), + } + out_f.write(json.dumps(out_row, ensure_ascii=False) + "\n") + stats["reprocess_kept_total"] += 1 + + if 
stats["reprocess_kept_total"] % 5000 == 0: + print( + f"[progress] seen={stats['reprocess_seen_total']} " + f"kept={stats['reprocess_kept_total']} " + f"python={stats['language_python']} js={stats['language_javascript']}" + ) + + with output_stats_path.open("w", encoding="utf-8") as f: + json.dump(stats, f, indent=2) + + print("Reprocess completed successfully.") + print(f"Input clean file: {input_clean_path}") + print(f"Output tokenized file: {output_tokenized_path}") + print(f"Output stats file: {output_stats_path}") + if token_backup: + print(f"Tokenized backup: {token_backup}") + if stats_backup: + print(f"Stats backup: {stats_backup}") + print("Summary stats:") + print(json.dumps(stats, indent=2)) + except Exception as exc: + print("Reprocess failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify Component 2 tokenizer files and " + "Component 3 clean file paths." + ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_component10_export.py b/scripts/run_component10_export.py new file mode 100644 index 0000000000000000000000000000000000000000..efb50c33dd1fc42239fb05fad87b116ed29c3f3a --- /dev/null +++ b/scripts/run_component10_export.py @@ -0,0 +1,376 @@ +""" +Component 10: Export, quantization, benchmarking, and packaging. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +import time +from pathlib import Path +from typing import Any, Dict, Tuple + +import torch +import torch.nn as nn +import yaml + +# Ensure src imports work. 
def prepare_prompt_ids(tokenizer: CodeTokenizer, prompt: str) -> list[int]:
    """Encode *prompt* through the training template and return its token ids."""
    # Render the prompt with an empty code body so the model sees the same
    # structure it was trained on (Component 3 formatting).
    text = tokenizer.format_training_sample(prompt=prompt, code="", language="python")
    # NOTE(review): replace(" ", "") removes *every* space from the formatted
    # text, which would destroy natural-language prompts. This strongly looks
    # like a stripped special-token literal (an angle-bracket marker lost in
    # transit) — confirm against the original source before trusting it.
    text = text.replace(" ", "").strip()
    ids = tokenizer.encode(text)
    # NOTE(review): the "" key below is likewise suspicious — special-token
    # names appear stripped; presumably this looks up the EOS token id.
    eos = tokenizer.special_token_ids.get("")
    # Drop a trailing EOS so benchmark generation can continue past the prompt.
    if eos is not None and len(ids) > 1 and ids[-1] == int(eos):
        ids = ids[:-1]
    return ids
== "cuda": + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.synchronize() + + t0 = time.perf_counter() + generated = 0 + for _ in range(max_new_tokens): + out = model(input_ids=input_ids) + next_id = torch.argmax(out["logits"][:, -1, :], dim=-1, keepdim=True) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated += 1 + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + if device.type == "cuda": + torch.cuda.synchronize() + dt = max(1e-6, time.perf_counter() - t0) + + decoded = tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + + peak_vram = float(torch.cuda.max_memory_allocated() / (1024**3)) if device.type == "cuda" else 0.0 + + return { + "generated_tokens": float(generated), + "seconds": float(dt), + "tokens_per_second": float(generated / dt), + "peak_vram_gb": peak_vram, + "preview_code": code[:300], + } + + +def bytes_to_gb(n: int) -> float: + return float(n / (1024**3)) + + +def write_portable_launcher(portable_dir: Path) -> None: + bat = r"""@echo off`r`ntitle MINDI 1.0 420M +setlocal +cd /d "%~dp0" +if not exist .venv ( + echo [setup] Creating virtual environment... 
+ py -3 -m venv .venv +) +call .venv\Scripts\activate.bat +python -m pip install --upgrade pip >nul +python -m pip install -r requirements_portable.txt +python app\launch_portable_chat.py --config app\portable_chat_config.yaml +endlocal +""" + (portable_dir / "Start_MINDI.bat").write_text(bat, encoding="utf-8") + + +def write_portable_requirements(portable_dir: Path) -> None: + req = """torch==2.5.1 +tokenizers==0.20.1 +pyyaml==6.0.2 +gradio==5.5.0 +pygments==2.19.2 +""" + (portable_dir / "requirements_portable.txt").write_text(req, encoding="utf-8") + + +def write_portable_chat_files(portable_dir: Path, port: int) -> None: + app_dir = portable_dir / "app" + app_dir.mkdir(parents=True, exist_ok=True) + + cfg = f"""model: + model_config_path: app/model_config.yaml + quantized_state_path: model/model_step3200_int8_state.pt + tokenizer_dir: model/tokenizer + +server: + host: 127.0.0.1 + port: {port} +""" + (app_dir / "portable_chat_config.yaml").write_text(cfg, encoding="utf-8") + + launch = r'''from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import gradio as gr +import torch +import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer +from src.inference_engine.inference_engine import InferenceEngine, DecodingConfig + + +def load_yaml(path: Path): + return yaml.safe_load(path.read_text(encoding="utf-8-sig")) + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main(): + ap = argparse.ArgumentParser() + 
ap.add_argument("--config", default="app/portable_chat_config.yaml") + ap.add_argument("--self_test", action="store_true") + args = ap.parse_args() + + cfg = load_yaml(PROJECT_ROOT / args.config) + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + model = CodeTransformerLM(mcfg).cpu().float() + model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) + state = torch.load(PROJECT_ROOT / cfg["model"]["quantized_state_path"], map_location="cpu") + model.load_state_dict(state) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=torch.device("cpu")) + dcfg = DecodingConfig(max_new_tokens=220, min_tokens_before_stop_check=64) + + if args.self_test: + out = engine.generate_with_retry("Write a Python function to add two numbers.", "python", dcfg) + code = out["final"]["code"] + print("portable_self_test_ok=", bool(code.strip())) + return + + def respond(prompt, history): + history = history or [] + p = (prompt or "").strip() + if not p: + return history, "" + out = engine.generate_with_retry(p, "python", dcfg) + history.append((p, out["final"]["code"])) + return history, "" + + with gr.Blocks(title="MINDI 1.0 420M") as demo: + gr.Markdown("## MINDI 1.0 420M (INT8 Portable)") + chat = gr.Chatbot(height=520) + box = gr.Textbox(label="Prompt", lines=4) + btn = gr.Button("Generate") + clear = gr.Button("Clear") + btn.click(respond, [box, chat], [chat, box]) + box.submit(respond, [box, chat], [chat, box]) + clear.click(lambda: ([], ""), None, [chat, box]) + + demo.launch(server_name=cfg["server"].get("host", "127.0.0.1"), server_port=int(cfg["server"].get("port", 7861)), share=False, inbrowser=False) + + +if __name__ == "__main__": + main() +''' + (app_dir / "launch_portable_chat.py").write_text(launch, encoding="utf-8") + + +def copy_runtime_sources(portable_dir: Path) -> None: + src_root = PROJECT_ROOT / "src" + 
dst_root = portable_dir / "src" + needed = [ + "__init__.py", + "model_architecture/__init__.py", + "model_architecture/code_transformer.py", + "tokenizer/__init__.py", + "tokenizer/code_tokenizer.py", + "evaluation_system/__init__.py", + "evaluation_system/code_eval.py", + "inference_engine/__init__.py", + "inference_engine/inference_engine.py", + ] + for rel in needed: + src = src_root / rel + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + +def main() -> None: + args = parse_args() + cfg = load_yaml(PROJECT_ROOT / args.config) + + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + + source_ckpt = PROJECT_ROOT / cfg["model"]["source_checkpoint_path"] + if not source_ckpt.exists(): + raise FileNotFoundError(f"Source checkpoint not found: {source_ckpt}") + + # Baseline model (GPU if available). + baseline_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + baseline = CodeTransformerLM(mcfg).to(baseline_device) + payload = torch.load(source_ckpt, map_location=baseline_device) + baseline.load_state_dict(payload["model_state"]) + if baseline_device.type == "cuda": + baseline.half() + + bench_prompt = str(cfg["benchmark"].get("prompt", "Write a Python function to add two numbers.")) + max_new_tokens = int(cfg["benchmark"].get("max_new_tokens", 120)) + + baseline_metrics = benchmark_tokens_per_sec( + model=baseline, + tokenizer=tokenizer, + prompt=bench_prompt, + max_new_tokens=max_new_tokens, + device=baseline_device, + ) + + # Quantize to INT8 on CPU and save separate file. 
+ quant_model = CodeTransformerLM(mcfg).cpu().float() + payload_cpu = torch.load(source_ckpt, map_location="cpu") + quant_model.load_state_dict(payload_cpu["model_state"]) + quant_model = torch.quantization.quantize_dynamic(quant_model, {nn.Linear}, dtype=torch.qint8) + + q_path = PROJECT_ROOT / cfg["quantization"]["quantized_output_path"] + q_path.parent.mkdir(parents=True, exist_ok=True) + torch.save(quant_model.state_dict(), q_path) + + quant_metrics = benchmark_tokens_per_sec( + model=quant_model, + tokenizer=tokenizer, + prompt=bench_prompt, + max_new_tokens=max_new_tokens, + device=torch.device("cpu"), + ) + + before_size_gb = bytes_to_gb(source_ckpt.stat().st_size) + after_size_gb = bytes_to_gb(q_path.stat().st_size) + + report = { + "source_checkpoint": str(source_ckpt), + "quantized_checkpoint": str(q_path), + "size_before_gb": before_size_gb, + "size_after_gb": after_size_gb, + "baseline_device": str(baseline_device), + "baseline_tokens_per_second": baseline_metrics["tokens_per_second"], + "quantized_tokens_per_second": quant_metrics["tokens_per_second"], + "baseline_peak_vram_gb": baseline_metrics["peak_vram_gb"], + "quantized_peak_vram_gb": quant_metrics["peak_vram_gb"], + "baseline_generated_tokens": baseline_metrics["generated_tokens"], + "quantized_generated_tokens": quant_metrics["generated_tokens"], + } + + report_path = PROJECT_ROOT / cfg["outputs"]["benchmark_report_json"] + report_path.parent.mkdir(parents=True, exist_ok=True) + report_path.write_text(json.dumps(report, indent=2), encoding="utf-8") + + # Build portable package folder. + portable_dir = PROJECT_ROOT / cfg["package"]["output_dir"] + if portable_dir.exists(): + shutil.rmtree(portable_dir) + portable_dir.mkdir(parents=True, exist_ok=True) + + # Copy model artifacts. 
+ (portable_dir / "model").mkdir(parents=True, exist_ok=True) + shutil.copy2(q_path, portable_dir / "model" / q_path.name) + + tok_src = PROJECT_ROOT / cfg["model"]["tokenizer_dir"] + tok_dst = portable_dir / "model" / "tokenizer" + shutil.copytree(tok_src, tok_dst) + + shutil.copy2(PROJECT_ROOT / cfg["model"]["model_config_path"], portable_dir / "app" / "model_config.yaml") if (portable_dir / "app").exists() else None + write_portable_chat_files(portable_dir, port=int(cfg["package"].get("app_port", 7861))) + shutil.copy2(PROJECT_ROOT / cfg["model"]["model_config_path"], portable_dir / "app" / "model_config.yaml") + + copy_runtime_sources(portable_dir) + write_portable_requirements(portable_dir) + write_portable_launcher(portable_dir) + + # Verify packaged run independently (self-test). + py = PROJECT_ROOT / ".venv" / "Scripts" / "python.exe" + if py.exists(): + import subprocess + + cmd = [str(py), str(portable_dir / "app" / "launch_portable_chat.py"), "--config", "app/portable_chat_config.yaml", "--self_test"] + proc = subprocess.run(cmd, cwd=str(portable_dir), capture_output=True, text=True, timeout=120) + verify_ok = (proc.returncode == 0) and ("portable_self_test_ok= True" in (proc.stdout + proc.stderr)) + else: + verify_ok = False + + print("Component 10 export completed.") + print(f"INT8 model saved: {q_path}") + print(f"Benchmark report: {report_path}") + print(f"Portable package: {portable_dir}") + print(f"Portable self-test ok: {verify_ok}") + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component3_dataset_pipeline.py b/scripts/run_component3_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..1f3bc2657d45b667bcfa79811ccf30b3dbf4c0bc --- /dev/null +++ b/scripts/run_component3_dataset_pipeline.py @@ -0,0 +1,126 @@ +""" +Component 3 runner script. + +Reads YAML config and executes full Hugging Face dataset preprocessing. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +import yaml + +# This makes "src" imports work when script is run from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.dataset_pipeline.hf_dataset_pipeline import ( # noqa: E402 + HFDatasetPipeline, + PipelineConfig, + SourceDatasetSpec, +) + + +def parse_args() -> argparse.Namespace: + # Parse command-line arguments for config and optional overrides. + parser = argparse.ArgumentParser(description="Run Component 3 dataset preprocessing pipeline.") + parser.add_argument( + "--config", + default="configs/component3_dataset_pipeline.yaml", + help="Path to YAML config file.", + ) + parser.add_argument( + "--max_records_per_dataset", + type=int, + default=None, + help="Optional override for quick test runs.", + ) + return parser.parse_args() + + +def _read_yaml(path: Path) -> Dict[str, Any]: + # Reads YAML file with friendly errors. + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Config file is invalid. Expected a YAML object at top level.") + return data + + +def _build_config(data: Dict[str, Any], max_records_override: int | None) -> PipelineConfig: + # Converts generic dict into strongly typed config objects. 
+ dataset_specs: List[SourceDatasetSpec] = [] + datasets_data = data.get("datasets", []) + if not isinstance(datasets_data, list) or not datasets_data: + raise ValueError("Config must include a non-empty 'datasets' list.") + + for item in datasets_data: + dataset_specs.append( + SourceDatasetSpec( + hf_dataset_id=str(item["hf_dataset_id"]), + split=str(item.get("split", "train")), + prompt_field=str(item["prompt_field"]), + code_field=str(item["code_field"]), + language_field=item.get("language_field"), + default_language=str(item.get("default_language", "python")), + ) + ) + + cfg = PipelineConfig( + datasets=dataset_specs, + tokenizer_dir=str(data["tokenizer_dir"]), + interim_output_dir=str(data["interim_output_dir"]), + processed_output_dir=str(data["processed_output_dir"]), + dedupe_db_path=str(data["dedupe_db_path"]), + max_records_per_dataset=data.get("max_records_per_dataset"), + min_prompt_chars=int(data.get("min_prompt_chars", 8)), + min_code_chars=int(data.get("min_code_chars", 16)), + max_code_chars=int(data.get("max_code_chars", 40_000)), + progress_every=int(data.get("progress_every", 1_000)), + ) + + if max_records_override is not None: + cfg.max_records_per_dataset = max_records_override + return cfg + + +def main() -> None: + # Main entry with explicit plain-English error handling. 
+ args = parse_args() + try: + config_path = Path(args.config) + data = _read_yaml(config_path) + cfg = _build_config(data, args.max_records_per_dataset) + pipeline = HFDatasetPipeline(cfg) + try: + stats = pipeline.run() + finally: + pipeline.close() + + print("Component 3 pipeline completed successfully.") + print("Saved files:") + print(f"- {Path(cfg.interim_output_dir) / 'combined_clean.jsonl'}") + print(f"- {Path(cfg.processed_output_dir) / 'train_tokenized.jsonl'}") + print(f"- {Path(cfg.processed_output_dir) / 'pipeline_stats.json'}") + print("Summary stats:") + print(json.dumps(stats, indent=2)) + except Exception as exc: + print("Component 3 pipeline failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify internet access for Hugging Face, tokenizer path, " + "and config field names." + ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component6_evaluation.py b/scripts/run_component6_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..5377ecadab9915920b62c42895c6ba14a8d35607 --- /dev/null +++ b/scripts/run_component6_evaluation.py @@ -0,0 +1,241 @@ +""" +Component 6: Evaluation system. + +- Computes validation loss for selected checkpoints. +- Generates code for 5 simple Python prompts. +- Performs syntax validity checks. +- Saves results JSON. +""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from pathlib import Path +from typing import Any, Dict, List + +import torch +import yaml +from torch.utils.data import DataLoader + +# Ensure src imports work. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.evaluation_system.code_eval import python_syntax_ok, restore_code_from_structured, save_json # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator, TokenizedJsonlDataset # noqa: E402 + + +PROMPTS = [ + "Write a Python function to check if a number is prime.", + "Write Python code to reverse a string without using slicing.", + "Create a Python function that returns Fibonacci numbers up to n.", + "Write Python code to count word frequency in a sentence.", + "Write a Python function to sort a list of dictionaries by a key.", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 6 evaluation.") + parser.add_argument("--config", default="configs/component6_evaluation_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML config.") + return data + + +def build_model_config(model_cfg_path: Path) -> ModelConfig: + cfg = load_yaml(model_cfg_path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset: {preset}") + merged = presets[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +@torch.no_grad() +def eval_val_loss(model: CodeTransformerLM, val_loader: DataLoader, device: torch.device, max_batches: int = 50) -> float: + model.eval() + losses = [] + 
for i, (input_ids, labels) in enumerate(val_loader): + if i >= max_batches: + break + input_ids = input_ids.to(device) + labels = labels.to(device) + with torch.amp.autocast("cuda", enabled=(device.type == "cuda"), dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + losses.append(float(out["loss"].item())) + model.train() + if not losses: + return 1e9 + return sum(losses) / len(losses) + + +@torch.no_grad() +def generate_code( + model: CodeTransformerLM, + tokenizer: CodeTokenizer, + prompt: str, + device: torch.device, + max_new_tokens: int, + temperature: float, + top_p: float, +) -> str: + model.eval() + prompt_text = tokenizer.format_training_sample(prompt=prompt, code="", language="python") + # Remove trailing empty code marker noise. + prompt_text = prompt_text.replace(" ", "").strip() + + ids = tokenizer.encode(prompt_text) + eos_id = tokenizer.special_token_ids.get("", None) + # Remove trailing EOS from prompt so generation continues naturally. + if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + input_ids = torch.tensor([ids], dtype=torch.long, device=device) + + for _ in range(max_new_tokens): + out = model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + + if temperature <= 0: + next_id = torch.argmax(logits, dim=-1, keepdim=True) + else: + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + # Top-p (nucleus) sampling. 
+ sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True) + sampled = torch.multinomial(sorted_probs, num_samples=1) + next_id = sorted_idx.gather(-1, sampled) + + input_ids = torch.cat([input_ids, next_id], dim=1) + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + decoded = tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + return code + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(Path(args.config)) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA is required for this evaluation run.") + + model_cfg = build_model_config(Path(cfg["model"]["model_config_path"])) + model_cfg.max_seq_len = int(cfg["inference"]["max_seq_len"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / "artifacts" / "tokenizer" / "code_tokenizer_v1")) + + val_ds = TokenizedJsonlDataset( + path=str(PROJECT_ROOT / cfg["data"]["tokenized_jsonl_path"]), + split="val", + val_ratio=float(cfg["data"].get("val_ratio", 0.02)), + split_seed=int(cfg["data"].get("split_seed", 17)), + ) + val_loader = DataLoader( + val_ds, + batch_size=1, + shuffle=False, + collate_fn=CausalCollator(pad_token_id=0, max_seq_len=model_cfg.max_seq_len), + ) + + ckpt_results: List[Dict[str, Any]] = [] + for ckpt_rel in cfg["model"]["checkpoint_paths"]: + ckpt_path = PROJECT_ROOT / ckpt_rel + if not ckpt_path.exists(): + raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}") + + model = CodeTransformerLM(model_cfg).to(device) + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + model.half() + + val_loss = eval_val_loss(model, val_loader, device=device, 
max_batches=50) + + generations = [] + for p in PROMPTS: + code = generate_code( + model=model, + tokenizer=tokenizer, + prompt=p, + device=device, + max_new_tokens=int(cfg["inference"].get("max_new_tokens", 160)), + temperature=float(cfg["inference"].get("temperature", 0.8)), + top_p=float(cfg["inference"].get("top_p", 0.9)), + ) + generations.append( + { + "prompt": p, + "generated_code": code, + "python_syntax_ok": python_syntax_ok(code), + } + ) + + ckpt_results.append( + { + "checkpoint": str(ckpt_path), + "step": int(payload.get("step", -1)), + "best_val_in_checkpoint": float(payload.get("best_val", math.nan)), + "eval_val_loss_now": float(val_loss), + "generations": generations, + } + ) + + # Basic fit flags from checkpoint trend. + fit_flag = "healthy" + if ckpt_results and ckpt_results[-1]["eval_val_loss_now"] > 1.5: + fit_flag = "underfitting" + + out = { + "fit_flag": fit_flag, + "checkpoints": ckpt_results, + "recommended_prompts": PROMPTS, + } + + out_path = str(PROJECT_ROOT / cfg["output"]["results_json"]) + save_json(out_path, out) + + print("Component 6 evaluation completed.") + print(f"Saved results: {out_path}") + print(f"Fit flag: {fit_flag}") + for row in ckpt_results: + print(f"Checkpoint step={row['step']} val_loss={row['eval_val_loss_now']:.4f}") + ok_count = sum(1 for g in row["generations"] if g["python_syntax_ok"]) + print(f"Python syntax valid in generated samples: {ok_count}/5") + + except Exception as exc: + print("Component 6 evaluation failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify checkpoint path and tokenizer path.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component7_inference_benchmark.py b/scripts/run_component7_inference_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..4fe6b409d6c640f2eb1c9116b5aaa7cfe90f1a16 --- /dev/null +++ b/scripts/run_component7_inference_benchmark.py @@ -0,0 +1,145 @@ +""" +Run Component 7 
inference benchmark on the same 5 Python prompts. +Outputs before/after syntax-valid score. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml + +# Ensure imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.inference_engine.inference_engine import DecodingConfig, InferenceEngine # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + +PROMPTS = [ + "Write a Python function to check if a number is prime.", + "Write Python code to reverse a string without using slicing.", + "Create a Python function that returns Fibonacci numbers up to n.", + "Write Python code to count word frequency in a sentence.", + "Write a Python function to sort a list of dictionaries by a key.", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 7 inference benchmark.") + parser.add_argument("--config", default="configs/component7_inference_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML config.") + return data + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(Path(args.config)) + device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA is required for Component 7 benchmark.") + + model_cfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + model = CodeTransformerLM(model_cfg).to(device) + + ckpt_path = PROJECT_ROOT / cfg["model"]["checkpoint_path"] + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + model.half() + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + + dcfg = DecodingConfig( + max_new_tokens=int(cfg["inference"].get("max_new_tokens", 180)), + greedy_temperature=float(cfg["inference"].get("greedy_temperature", 0.0)), + retry2_temperature=float(cfg["inference"].get("retry2_temperature", 0.25)), + retry2_top_p=float(cfg["inference"].get("retry2_top_p", 0.85)), + retry3_temperature=float(cfg["inference"].get("retry3_temperature", 0.35)), + retry3_top_p=float(cfg["inference"].get("retry3_top_p", 0.90)), + max_retries=int(cfg["inference"].get("max_retries", 3)), + min_tokens_before_stop_check=int(cfg["inference"].get("min_tokens_before_stop_check", 24)), + ) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=device) + + rows = [] + syntax_ok_count = 0 + for p in PROMPTS: + res = engine.generate_with_retry(prompt=p, language=str(cfg["inference"].get("language", "python")), cfg=dcfg) + final = res["final"] + syntax_ok = bool(final["syntax_ok"]) + syntax_ok_count += 1 if syntax_ok else 0 + rows.append( + { + "prompt": p, + "final_code": final["code"], + "syntax_ok": syntax_ok, + "attempt_used": final["attempt"], + "generated_tokens": final["generated_tokens"], + "attempts": res["attempts"], + } + ) + + before_score = None + before_path = PROJECT_ROOT / "artifacts" / "evaluation" / "component6_eval_results.json" + if before_path.exists(): + d = json.loads(before_path.read_text(encoding="utf-8")) + try: + before_score = sum(1 for x in 
d["checkpoints"][0]["generations"] if x["python_syntax_ok"]) + except Exception: + before_score = None + + out = { + "checkpoint": str(ckpt_path), + "step": int(payload.get("step", -1)), + "before_component6_syntax_ok_out_of_5": before_score, + "after_component7_syntax_ok_out_of_5": syntax_ok_count, + "prompts": rows, + } + + out_path = PROJECT_ROOT / cfg["output"]["results_json"] + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8") + + print("Component 7 inference benchmark completed.") + if before_score is not None: + print(f"Before (Component 6): {before_score}/5 syntax-valid") + print(f"After (Component 7): {syntax_ok_count}/5 syntax-valid") + print(f"Saved results: {out_path}") + + except Exception as exc: + print("Component 7 benchmark failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify checkpoint and tokenizer paths.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_component9_lora_finetune.py b/scripts/run_component9_lora_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d49fe76e7eac64a33b521784fdbe12970b348d --- /dev/null +++ b/scripts/run_component9_lora_finetune.py @@ -0,0 +1,246 @@ +""" +Component 9: LoRA fine-tuning pipeline for custom prompt->code pairs. +""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +import time +from pathlib import Path +from typing import Any, Dict, Tuple + +import torch +import yaml +from torch.optim import AdamW +from torch.utils.data import DataLoader, random_split +from tqdm import tqdm + +# Ensure src imports work. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.finetuning_system.custom_pair_dataset import CustomPairDataset # noqa: E402 +from src.finetuning_system.lora_adapter import LoRAConfig, apply_lora, load_lora_state_dict, lora_state_dict # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 9 LoRA fine-tuning.") + parser.add_argument("--config", default="configs/component9_lora_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8-sig")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def get_vram_gb() -> float: + if not torch.cuda.is_available(): + return 0.0 + return torch.cuda.memory_allocated() / (1024**3) + + +def save_lora_ckpt(path: Path, step: int, lora_state: dict, optim_state: dict, best_val: float, no_improve: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "step": step, + "lora_state": lora_state, + "optimizer_state": optim_state, + "best_val": best_val, + "no_improve": no_improve, + } + torch.save(payload, path) + + +@torch.no_grad() +def eval_loss(model: CodeTransformerLM, loader: DataLoader, 
device: torch.device, use_fp16: bool) -> float: + model.eval() + vals = [] + for input_ids, labels in loader: + input_ids = input_ids.to(device) + labels = labels.to(device) + with torch.amp.autocast("cuda", enabled=(use_fp16 and device.type == "cuda"), dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + vals.append(float(out["loss"].item())) + model.train() + if not vals: + return 1e9 + return sum(vals) / len(vals) + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(PROJECT_ROOT / args.config) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA GPU is required for LoRA fine-tuning.") + + model_cfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + model = CodeTransformerLM(model_cfg).to(device) + + base_ckpt = torch.load(PROJECT_ROOT / cfg["model"]["base_checkpoint_path"], map_location=device) + model.load_state_dict(base_ckpt["model_state"]) + + lcfg = LoRAConfig( + r=int(cfg["lora"].get("r", 8)), + alpha=int(cfg["lora"].get("alpha", 16)), + dropout=float(cfg["lora"].get("dropout", 0.05)), + target_keywords=list(cfg["lora"].get("target_keywords", ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"])), + ) + replaced = apply_lora(model, lcfg) + if not replaced: + raise RuntimeError("No modules were LoRA-wrapped. Check target_keywords.") + # LoRA modules are created on CPU by default, so move full model back to GPU. 
+ model = model.to(device) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + ds = CustomPairDataset( + path=str(PROJECT_ROOT / cfg["finetune"]["custom_data_path"]), + tokenizer=tokenizer, + max_seq_len=int(cfg["finetune"].get("max_seq_len", 512)), + ) + + n_val = max(1, int(0.1 * len(ds))) + n_train = len(ds) - n_val + train_ds, val_ds = random_split(ds, [n_train, n_val], generator=torch.Generator().manual_seed(17)) + + collator = CausalCollator(pad_token_id=0, max_seq_len=int(cfg["finetune"].get("max_seq_len", 512))) + train_loader = DataLoader(train_ds, batch_size=int(cfg["finetune"].get("micro_batch_size", 1)), shuffle=True, collate_fn=collator) + val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, collate_fn=collator) + + trainable = [p for p in model.parameters() if p.requires_grad] + optimizer = AdamW(trainable, lr=float(cfg["finetune"].get("learning_rate", 3e-4)), weight_decay=float(cfg["finetune"].get("weight_decay", 0.0))) + + use_fp16 = bool(cfg["finetune"].get("use_fp16", True)) + scaler = torch.amp.GradScaler("cuda", enabled=use_fp16) + + out_dir = PROJECT_ROOT / cfg["finetune"]["output_dir"] + out_dir.mkdir(parents=True, exist_ok=True) + + max_steps = int(cfg["finetune"].get("max_steps", 1200)) + save_every = int(cfg["finetune"].get("save_every", 100)) + eval_every = int(cfg["finetune"].get("eval_every", 100)) + grad_accum = int(cfg["finetune"].get("grad_accum_steps", 16)) + max_vram = float(cfg["finetune"].get("max_vram_gb", 7.0)) + patience = int(cfg["finetune"].get("early_stopping_patience_evals", 6)) + min_delta = float(cfg["finetune"].get("early_stopping_min_delta", 5e-4)) + + step = 0 + best_val = 1e9 + no_improve = 0 + + resume_from = str(cfg.get("resume", {}).get("resume_from", "none")) + if resume_from != "none": + ckpt = out_dir / "latest.pt" if resume_from == "latest" else Path(resume_from) + if ckpt.exists(): + payload = torch.load(ckpt, map_location=device) + load_lora_state_dict(model, 
payload["lora_state"]) + optimizer.load_state_dict(payload["optimizer_state"]) + step = int(payload.get("step", 0)) + best_val = float(payload.get("best_val", 1e9)) + no_improve = int(payload.get("no_improve", 0)) + print(f"[resume] loaded {ckpt} at step {step}") + + model.train() + pbar = tqdm(total=max_steps, initial=step, desc="lora_finetune", dynamic_ncols=True) + running = 0 + + while step < max_steps: + for input_ids, labels in train_loader: + if step >= max_steps: + break + input_ids = input_ids.to(device) + labels = labels.to(device) + + with torch.amp.autocast("cuda", enabled=use_fp16, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + loss = out["loss"] / grad_accum + + scaler.scale(loss).backward() + running += 1 + + if running % grad_accum == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + step += 1 + pbar.update(1) + pbar.set_postfix({"loss": f"{float(loss.item())*grad_accum:.4f}", "vram": f"{get_vram_gb():.2f}GB"}) + + if get_vram_gb() > max_vram: + raise RuntimeError(f"VRAM threshold exceeded: {get_vram_gb():.2f}GB > {max_vram:.2f}GB") + + if step % save_every == 0: + ck = out_dir / f"step_{step}.pt" + save_lora_ckpt(ck, step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + save_lora_ckpt(out_dir / "latest.pt", step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + print(f"\n[checkpoint] saved {ck}") + + if step % eval_every == 0: + val = eval_loss(model, val_loader, device, use_fp16=use_fp16) + print(f"\n[eval] step={step} val_loss={val:.4f} best={best_val:.4f}") + if val < (best_val - min_delta): + best_val = val + no_improve = 0 + save_lora_ckpt(out_dir / "best.pt", step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + else: + no_improve += 1 + if no_improve >= patience: + print("\n[early_stop] no improvement, stopping.") + step = max_steps + break + + pbar.close() + save_lora_ckpt(out_dir / "latest.pt", step, 
lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + + # Save metadata for adapter loading. + meta = { + "step": step, + "best_val": best_val, + "lora_config": { + "r": lcfg.r, + "alpha": lcfg.alpha, + "dropout": lcfg.dropout, + "target_keywords": lcfg.target_keywords, + }, + "base_checkpoint_path": cfg["model"]["base_checkpoint_path"], + } + (out_dir / "adapter_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8-sig") + + print("Component 9 LoRA fine-tuning completed.") + print(f"LoRA adapters saved in: {out_dir}") + + except Exception as exc: + print("Component 9 LoRA fine-tuning failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify custom data file format and checkpoint paths.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + + diff --git a/scripts/setup_windows_environment.ps1 b/scripts/setup_windows_environment.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..dcd5185619d94a329e260149eb4adf6d67af163a --- /dev/null +++ b/scripts/setup_windows_environment.ps1 @@ -0,0 +1,114 @@ +# This script sets up the entire local project environment on Windows. +# It creates folders, creates a virtual environment, upgrades pip tools, +# and installs all dependencies from requirements.txt. + +$ErrorActionPreference = "Stop" + +function Invoke-StepCommand { + param( + [string]$Description, + [scriptblock]$CommandBlock + ) + Write-Host $Description -ForegroundColor Cyan + & $CommandBlock + if ($LASTEXITCODE -ne 0) { + throw "Command failed during: $Description" + } +} + +try { + # This finds the project root based on this script location. + $scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path + $projectRoot = Split-Path -Parent $scriptDir + Set-Location $projectRoot + + Write-Host "Step 1/6: Creating project folder structure..." -ForegroundColor Cyan + + # These are all folders needed for Components 1-10. 
+ $folders = @( + "data", + "data/raw", + "data/interim", + "data/processed", + "data/external", + "src", + "src/tokenizer", + "src/dataset_pipeline", + "src/model_architecture", + "src/training_pipeline", + "src/evaluation_system", + "src/inference_engine", + "src/chat_interface", + "src/finetuning_system", + "src/export_optimization", + "configs", + "scripts", + "tests", + "checkpoints", + "models", + "models/base", + "models/lora", + "models/quantized", + "artifacts", + "logs" + ) + + foreach ($folder in $folders) { + if (-not (Test-Path $folder)) { + New-Item -ItemType Directory -Path $folder | Out-Null + } + } + + Write-Host "Step 2/6: Creating Python virtual environment..." -ForegroundColor Cyan + + # This checks Python version before environment creation. + # We require Python 3.10 or 3.11 for best Windows compatibility with ML packages. + $pyVersionRaw = python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" + if ($LASTEXITCODE -ne 0) { + throw "Python is not available in PATH. Install Python 3.11 and try again." + } + if (($pyVersionRaw -ne "3.10") -and ($pyVersionRaw -ne "3.11")) { + throw "Detected Python $pyVersionRaw. Please install Python 3.11 (recommended) or 3.10, then rerun this script." + } + + # This creates .venv only if it does not already exist. + if (-not (Test-Path ".venv")) { + python -m venv .venv + } else { + Write-Host "Virtual environment already exists. Reusing .venv." -ForegroundColor Yellow + } + + # This points to the venv Python executable on Windows. + $venvPython = Join-Path $projectRoot ".venv\Scripts\python.exe" + if (-not (Test-Path $venvPython)) { + throw "Could not find .venv Python at $venvPython. Please check Python installation." + } + + Invoke-StepCommand "Step 3/6: Upgrading pip, setuptools, and wheel..." { + & $venvPython -m pip install --upgrade pip setuptools wheel + } + + Invoke-StepCommand "Step 4/6: Installing CUDA-enabled PyTorch (cu121)..." 
{ + & $venvPython -m pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 + } + + Invoke-StepCommand "Step 5/6: Installing project dependencies from requirements.txt..." { + & $venvPython -m pip install -r requirements.txt + } + + Invoke-StepCommand "Step 6/6: Validating pip environment health..." { + & $venvPython -m pip check + } + + Write-Host "Setup complete." -ForegroundColor Green + Write-Host "Next command: .\.venv\Scripts\Activate.ps1" -ForegroundColor Green + Write-Host "Then run: python .\scripts\verify_component1_setup.py" -ForegroundColor Green +} +catch { + # This prints a clear plain-English error if anything breaks. + Write-Host "" + Write-Host "Setup failed." -ForegroundColor Red + Write-Host "What went wrong: $($_.Exception.Message)" -ForegroundColor Red + Write-Host "Fix suggestion: Check Python is installed and available in PATH, then run this script again." -ForegroundColor Yellow + exit 1 +} diff --git a/scripts/train_code_tokenizer.py b/scripts/train_code_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2e8dc15e5b27b0b008a1635c7e1f74675c39677a --- /dev/null +++ b/scripts/train_code_tokenizer.py @@ -0,0 +1,134 @@ +""" +Component 2 training script. + +This script trains the custom code tokenizer and saves it for reuse. +Supported input formats: +- .jsonl with fields: prompt, code, language +- .txt where each line is one raw sample +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Iterable, Iterator, List + +# This makes "src" imports work when script is run from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig + + +def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]: + """ + Streams JSONL rows as training text without loading full file into RAM. + """ + with file_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError: + continue + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + language = str(row.get("language", "python")).strip().lower() + if not prompt or not code: + continue + if language not in {"python", "javascript"}: + language = "python" + yield tokenizer.format_training_sample(prompt=prompt, code=code, language=language) + + +def stream_txt_samples(file_path: Path) -> Iterator[str]: + """ + Streams plain text file line by line. + """ + with file_path.open("r", encoding="utf-8") as f: + for line in f: + text = line.strip() + if text: + yield text + + +def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]: + """ + Creates one merged iterator from many files. + """ + def _generator() -> Iterator[str]: + for path in input_files: + suffix = path.suffix.lower() + if suffix == ".jsonl": + yield from stream_jsonl_samples(path, tokenizer) + elif suffix == ".txt": + yield from stream_txt_samples(path) + else: + print(f"[warning] Skipping unsupported file type: {path}") + + return _generator() + + +def parse_args() -> argparse.Namespace: + """ + Reads command-line settings for tokenizer training. 
+ """ + parser = argparse.ArgumentParser(description="Train custom Python/JavaScript code tokenizer.") + parser.add_argument( + "--input", + nargs="+", + required=True, + help="One or more input files (.jsonl or .txt).", + ) + parser.add_argument( + "--output_dir", + default="artifacts/tokenizer/code_tokenizer_v1", + help="Folder where tokenizer files will be saved.", + ) + parser.add_argument("--vocab_size", type=int, default=50_000, help="Tokenizer vocabulary size.") + parser.add_argument("--min_frequency", type=int, default=2, help="Minimum token frequency.") + parser.add_argument("--model_max_length", type=int, default=2048, help="Max token length hint.") + return parser.parse_args() + + +def main() -> None: + """ + Main training entry point with clear error messages. + """ + args = parse_args() + + try: + input_files = [Path(p) for p in args.input] + missing = [str(p) for p in input_files if not p.exists()] + if missing: + raise FileNotFoundError( + "Some input files do not exist:\n- " + "\n- ".join(missing) + ) + + config = CodeTokenizerConfig( + vocab_size=args.vocab_size, + min_frequency=args.min_frequency, + model_max_length=args.model_max_length, + ) + tokenizer = CodeTokenizer(config=config) + text_stream = build_stream(input_files=input_files, tokenizer=tokenizer) + tokenizer.train(text_stream) + tokenizer.save(args.output_dir) + + print("Tokenizer training completed successfully.") + print(f"Saved tokenizer to: {args.output_dir}") + print("Saved files: tokenizer.json, tokenizer_config.json") + except Exception as exc: + print("Tokenizer training failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: check file paths and file format, then run again.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/train_component5.py b/scripts/train_component5.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c5c73d7719194e744e31c591dfb9eea9c28ae8 --- /dev/null +++ b/scripts/train_component5.py 
@@ -0,0 +1,396 @@ +""" +Component 5: Training pipeline for the 420M code model. + +Features: +- FP16 mixed precision +- Gradient checkpointing +- Gradient accumulation +- 8-bit optimizer attempt with safe fallback +- Checkpoint save every N steps +- Resume from checkpoint +- Early stopping +- Live progress with loss, LR, ETA, VRAM +""" + +from __future__ import annotations + +import argparse +import json +import math +import os +import sys +import time +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import torch +import yaml +from torch.optim import AdamW +from torch.utils.data import DataLoader +from tqdm import tqdm + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator, TokenizedJsonlDataset # noqa: E402 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 5 training.") + parser.add_argument("--config", default="configs/component5_training_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def load_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown model preset: {preset}") + base = presets[preset].__dict__.copy() + base.update(model_cfg) + return ModelConfig(**base) + return ModelConfig(**model_cfg) + + +def 
make_optimizer(model: torch.nn.Module, train_cfg: Dict[str, Any]) -> Tuple[torch.optim.Optimizer, str]: + lr = float(train_cfg["learning_rate"]) + wd = float(train_cfg["weight_decay"]) + betas = tuple(float(x) for x in train_cfg.get("betas", [0.9, 0.95])) + prefer_8bit = bool(train_cfg.get("prefer_8bit_adam", True)) + + if prefer_8bit: + try: + import bitsandbytes as bnb # type: ignore + + optimizer = bnb.optim.Adam8bit(model.parameters(), lr=lr, betas=betas, weight_decay=wd) + return optimizer, "Adam8bit" + except Exception: + pass + + optimizer = AdamW(model.parameters(), lr=lr, betas=betas, weight_decay=wd) + return optimizer, "AdamW" + + +def cosine_lr(base_lr: float, step: int, warmup_steps: int, max_steps: int, min_lr_ratio: float) -> float: + if step < warmup_steps: + return base_lr * (step / max(1, warmup_steps)) + progress = (step - warmup_steps) / max(1, max_steps - warmup_steps) + progress = min(1.0, max(0.0, progress)) + cosine = 0.5 * (1.0 + math.cos(math.pi * progress)) + min_lr = base_lr * min_lr_ratio + return min_lr + (base_lr - min_lr) * cosine + + +def set_optimizer_lr(optimizer: torch.optim.Optimizer, lr: float) -> None: + for pg in optimizer.param_groups: + pg["lr"] = lr + + +def get_vram_gb() -> float: + if not torch.cuda.is_available(): + return 0.0 + return torch.cuda.memory_allocated() / (1024**3) + + +def save_checkpoint( + ckpt_dir: Path, + step: int, + model: CodeTransformerLM, + optimizer: torch.optim.Optimizer, + scaler: Optional[torch.cuda.amp.GradScaler], + best_val: float, + no_improve_evals: int, + config: Dict[str, Any], +) -> Path: + ckpt_dir.mkdir(parents=True, exist_ok=True) + ckpt_path = ckpt_dir / f"step_{step}.pt" + payload = { + "step": step, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + "scaler_state": scaler.state_dict() if scaler is not None else None, + "best_val": best_val, + "no_improve_evals": no_improve_evals, + "config": config, + } + torch.save(payload, ckpt_path) + latest = 
ckpt_dir / "latest.pt" + torch.save(payload, latest) + return ckpt_path + + +def load_checkpoint( + ckpt_path: Path, + model: CodeTransformerLM, + optimizer: torch.optim.Optimizer, + scaler: Optional[torch.cuda.amp.GradScaler], + device: torch.device, +) -> Tuple[int, float, int]: + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + optimizer.load_state_dict(payload["optimizer_state"]) + if scaler is not None and payload.get("scaler_state") is not None: + scaler.load_state_dict(payload["scaler_state"]) + step = int(payload.get("step", 0)) + best_val = float(payload.get("best_val", 1e9)) + no_improve = int(payload.get("no_improve_evals", 0)) + return step, best_val, no_improve + + +@torch.no_grad() +def evaluate_loss( + model: CodeTransformerLM, + val_loader: DataLoader, + device: torch.device, + use_fp16: bool, + max_batches: int = 50, +) -> float: + model.eval() + losses = [] + amp_enabled = use_fp16 and device.type == "cuda" + for i, (input_ids, labels) in enumerate(val_loader): + if i >= max_batches: + break + input_ids = input_ids.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) + with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + losses.append(float(out["loss"].item())) + model.train() + if not losses: + return 1e9 + return sum(losses) / len(losses) + + +def train() -> None: + args = parse_args() + cfg = load_yaml(Path(args.config)) + train_cfg = cfg["training"] + data_cfg = cfg["data"] + resume_cfg = cfg.get("resume", {}) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA GPU is required for this training setup.") + + model_cfg = load_model_config(Path(cfg["model"]["model_config_path"])) + model_cfg.max_seq_len = int(train_cfg["max_seq_len"]) + model_cfg.gradient_checkpointing = bool(train_cfg.get("use_gradient_checkpointing", 
True)) + + model = CodeTransformerLM(model_cfg) + model.enable_gradient_checkpointing(model_cfg.gradient_checkpointing) + model = model.to(device) + + use_fp16 = bool(train_cfg.get("use_fp16", True)) + scaler = torch.amp.GradScaler("cuda", enabled=use_fp16) + + optimizer, optimizer_name = make_optimizer(model, train_cfg) + + tokenized_path = str(data_cfg["tokenized_jsonl_path"]) + train_ds = TokenizedJsonlDataset( + path=tokenized_path, + split="train", + val_ratio=float(data_cfg.get("val_ratio", 0.02)), + split_seed=int(data_cfg.get("split_seed", 17)), + ) + val_ds = TokenizedJsonlDataset( + path=tokenized_path, + split="val", + val_ratio=float(data_cfg.get("val_ratio", 0.02)), + split_seed=int(data_cfg.get("split_seed", 17)), + ) + + collator = CausalCollator(pad_token_id=0, max_seq_len=int(train_cfg["max_seq_len"])) + train_loader = DataLoader( + train_ds, + batch_size=int(train_cfg["micro_batch_size"]), + shuffle=True, + num_workers=int(data_cfg.get("num_workers", 0)), + pin_memory=True, + collate_fn=collator, + ) + val_loader = DataLoader( + val_ds, + batch_size=int(train_cfg["micro_batch_size"]), + shuffle=False, + num_workers=0, + pin_memory=True, + collate_fn=collator, + ) + + out_dir = Path(train_cfg["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + + global_step = 0 + best_val = 1e9 + no_improve = 0 + + resume_from = str(resume_cfg.get("resume_from", "none")).strip().lower() + if resume_from != "none": + if resume_from == "latest": + ckpt_path = out_dir / "latest.pt" + else: + ckpt_path = Path(resume_cfg["resume_from"]) + if ckpt_path.exists(): + global_step, best_val, no_improve = load_checkpoint( + ckpt_path=ckpt_path, + model=model, + optimizer=optimizer, + scaler=scaler, + device=device, + ) + print(f"[resume] loaded checkpoint {ckpt_path} at step {global_step}") + else: + print(f"[resume] checkpoint not found, starting fresh: {ckpt_path}") + + max_steps = int(train_cfg["max_steps"]) + grad_accum = int(train_cfg["grad_accum_steps"]) + 
log_every = int(train_cfg["log_every"]) + eval_every = int(train_cfg["eval_every"]) + save_every = int(train_cfg["save_every"]) + warmup_steps = int(train_cfg["warmup_steps"]) + min_lr_ratio = float(train_cfg["min_lr_ratio"]) + grad_clip = float(train_cfg["grad_clip_norm"]) + max_vram_gb = float(train_cfg.get("max_vram_gb", 7.0)) + patience = int(train_cfg.get("early_stopping_patience_evals", 20)) + min_delta = float(train_cfg.get("early_stopping_min_delta", 5e-4)) + base_lr = float(train_cfg["learning_rate"]) + + model.train() + start_time = time.time() + running_loss = 0.0 + running_count = 0 + + pbar = tqdm(total=max_steps, initial=global_step, desc="train", dynamic_ncols=True) + + while global_step < max_steps: + for input_ids, labels in train_loader: + if global_step >= max_steps: + break + + current_lr = cosine_lr(base_lr, global_step, warmup_steps, max_steps, min_lr_ratio) + set_optimizer_lr(optimizer, current_lr) + + input_ids = input_ids.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) + + amp_enabled = use_fp16 and device.type == "cuda" + with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + loss = out["loss"] / grad_accum + + scaler.scale(loss).backward() + + running_loss += float(loss.item()) * grad_accum + running_count += 1 + + if running_count % grad_accum == 0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + + global_step += 1 + pbar.update(1) + + elapsed = time.time() - start_time + steps_done = max(1, global_step) + steps_left = max(0, max_steps - global_step) + eta_sec = (elapsed / steps_done) * steps_left + avg_loss = running_loss / max(1, running_count) + vram = get_vram_gb() + + if vram > max_vram_gb: + raise RuntimeError( + f"VRAM safety threshold exceeded: {vram:.2f} GB > {max_vram_gb:.2f} GB. 
" + "Reduce max_seq_len or grad_accum/micro_batch settings." + ) + + if global_step % log_every == 0: + pbar.set_postfix( + { + "loss": f"{avg_loss:.4f}", + "lr": f"{current_lr:.2e}", + "vram_gb": f"{vram:.2f}", + "eta_min": f"{eta_sec/60.0:.1f}", + } + ) + + if global_step % save_every == 0: + ckpt_path = save_checkpoint( + ckpt_dir=out_dir, + step=global_step, + model=model, + optimizer=optimizer, + scaler=scaler, + best_val=best_val, + no_improve_evals=no_improve, + config=cfg, + ) + print(f"\n[checkpoint] saved {ckpt_path}") + + if global_step % eval_every == 0: + val_loss = evaluate_loss(model, val_loader, device, use_fp16=use_fp16) + print(f"\n[eval] step={global_step} val_loss={val_loss:.4f} best={best_val:.4f}") + if val_loss < (best_val - min_delta): + best_val = val_loss + no_improve = 0 + else: + no_improve += 1 + if no_improve >= patience: + print( + f"\n[early_stop] no improvement for {no_improve} evals " + f"(patience={patience}). Stopping training." + ) + global_step = max_steps + break + + pbar.close() + final_ckpt = save_checkpoint( + ckpt_dir=out_dir, + step=global_step, + model=model, + optimizer=optimizer, + scaler=scaler, + best_val=best_val, + no_improve_evals=no_improve, + config=cfg, + ) + print("Training completed.") + print(f"Optimizer used: {optimizer_name}") + print(f"Final checkpoint: {final_ckpt}") + + +def main() -> None: + try: + train() + except Exception as exc: + print("Component 5 training failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: lower max_seq_len, keep micro_batch_size=1, " + "increase grad_accum_steps, and verify checkpoint/output paths." 
+ ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/verify_component1_setup.py b/scripts/verify_component1_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..65dc23cf3ece795f9c9a4d764f9f7185fd8da706 --- /dev/null +++ b/scripts/verify_component1_setup.py @@ -0,0 +1,112 @@ +""" +This script verifies Component 1 setup in plain English. +It checks Python, key packages, and GPU visibility for PyTorch. +""" + +import importlib +import importlib.util +import platform +import sys +from typing import List + + +def check_imports(packages: List[str]) -> List[str]: + """ + Tries importing required packages. + Returns a list of package names that failed to import. + """ + failed = [] + for package in packages: + try: + importlib.import_module(package) + except Exception: + failed.append(package) + return failed + + +def check_optional_installed(packages: List[str]) -> List[str]: + """ + Checks whether optional packages exist without importing them. + Returns packages that are missing. + """ + missing = [] + for package in packages: + if importlib.util.find_spec(package) is None: + missing.append(package) + return missing + + +def main() -> None: + print("=== Component 1 Verification ===") + print(f"Python version: {sys.version.split()[0]}") + print(f"Operating system: {platform.system()} {platform.release()}") + + # Python 3.10/3.11 is the target for best compatibility on Windows. + if sys.version_info.major != 3 or sys.version_info.minor not in (10, 11): + print("") + print("Verification failed.") + print("This project currently requires Python 3.10 or 3.11 on Windows.") + print("Fix suggestion: install Python 3.11, recreate .venv, and reinstall requirements.") + raise SystemExit(1) + + # These are required for Component 1 success. 
+ required = [ + "torch", + "transformers", + "tokenizers", + "datasets", + "accelerate", + "gradio", + "tree_sitter", + ] + + failed = check_imports(required) + if failed: + print("") + print("Verification failed.") + print("The following packages could not be imported:") + for package in failed: + print(f"- {package}") + print("") + print("Fix suggestion: activate .venv and run 'pip install -r requirements.txt' again.") + raise SystemExit(1) + + # Optional imports should not fail Component 1. + optional = ["bitsandbytes"] + optional_failed = check_optional_installed(optional) + + import torch + + print(f"PyTorch version: {torch.__version__}") + cuda_available = torch.cuda.is_available() + print(f"CUDA available: {cuda_available}") + + if cuda_available: + gpu_name = torch.cuda.get_device_name(0) + total_vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3) + print(f"Detected GPU: {gpu_name}") + print(f"Total VRAM: {total_vram_gb:.2f} GB") + else: + print("No CUDA GPU was detected by PyTorch.") + print("You can still continue, but training speed will be much slower.") + + if optional_failed: + print("") + print("Optional package warning:") + print("- bitsandbytes is not available in this environment.") + print("This does not block Component 1.") + print("For Component 5 on native Windows, we will use an automatic fallback optimizer if needed.") + + print("") + print("Component 1 is verified successfully.") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("") + print("Verification script crashed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: ensure .venv is active and dependencies are installed.") + raise SystemExit(1) diff --git a/scripts/verify_component2_tokenizer.py b/scripts/verify_component2_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..281e981f7d3ed72be1c360a416beda92c99d240c --- /dev/null +++ b/scripts/verify_component2_tokenizer.py @@ -0,0 +1,67 @@ +""" 
+Component 2 verification script. + +This script: +1) Trains tokenizer on a tiny sample file. +2) Saves tokenizer. +3) Loads tokenizer back. +4) Encodes and decodes a sample. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# This makes "src" imports work when script is run from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +def main() -> None: + sample_file = Path("data/external/component2_tokenizer_sample.jsonl") + output_dir = Path("artifacts/tokenizer/code_tokenizer_v1") + + if not sample_file.exists(): + print("Verification failed.") + print(f"Missing sample file: {sample_file}") + print("Fix suggestion: ensure Component 2 sample file exists and run again.") + raise SystemExit(1) + + # Train tokenizer from sample file via script-like path. + from scripts.train_code_tokenizer import stream_jsonl_samples # local import on purpose + + tokenizer = CodeTokenizer() + tokenizer.train(stream_jsonl_samples(sample_file, tokenizer)) + tokenizer.save(str(output_dir)) + + loaded = CodeTokenizer.load(str(output_dir)) + sample = loaded.format_training_sample( + prompt="Write Python function that squares a number.", + code="def square(x):\n return x * x", + language="python", + ) + token_ids = loaded.encode(sample) + decoded = loaded.decode(token_ids) + + print("=== Component 2 Verification ===") + print(f"Tokenizer saved to: {output_dir}") + print(f"Encoded token count: {len(token_ids)}") + print("First 25 token IDs:", token_ids[:25]) + print("Decoded preview:") + print(decoded[:300]) + print("") + print("Component 2 tokenizer verification passed.") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("Verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: activate .venv and rerun this script.") + raise SystemExit(1) diff 
--git a/scripts/verify_component3_dataset_pipeline.py b/scripts/verify_component3_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..30bf56d3ac0fd4e2f0a7198175ec892990c437c9 --- /dev/null +++ b/scripts/verify_component3_dataset_pipeline.py @@ -0,0 +1,41 @@ +""" +Component 3 verification script. + +Runs a small pipeline pass to confirm: +- HF loading works. +- Cleaning + dedupe logic works. +- Tokenized output files are created. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# This makes script imports stable from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.run_component3_dataset_pipeline import main as run_pipeline_main # noqa: E402 + + +if __name__ == "__main__": + try: + # We call the main runner with a small override by mutating argv. + sys.argv = [ + "verify_component3_dataset_pipeline.py", + "--config", + "configs/component3_dataset_pipeline.yaml", + "--max_records_per_dataset", + "200", + ] + run_pipeline_main() + print("") + print("Component 3 verification passed.") + except Exception as exc: + print("Component 3 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify internet access and that Component 2 tokenizer exists.") + raise SystemExit(1) + diff --git a/scripts/verify_component4_model.py b/scripts/verify_component4_model.py new file mode 100644 index 0000000000000000000000000000000000000000..370b97a0dc16edfb8001fe4c39ce6bdebe0be366 --- /dev/null +++ b/scripts/verify_component4_model.py @@ -0,0 +1,138 @@ +""" +Component 4 verification script. + +This script: +- Builds model from config. +- Runs a small forward pass. +- Prints live VRAM usage at each stage. 
+""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import ( # noqa: E402 + CodeTransformerLM, + ModelConfig, + get_model_presets, +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Verify Component 4 model load and VRAM usage.") + parser.add_argument( + "--config", + default="configs/component4_model_config.yaml", + help="Path to model YAML config.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size for forward test.") + parser.add_argument("--seq_len", type=int, default=256, help="Sequence length for forward test.") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Model config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format in model config.") + return data + + +def build_config(cfg_data: Dict[str, Any]) -> ModelConfig: + preset = cfg_data.get("preset") + model_cfg = cfg_data.get("model", {}) + if not isinstance(model_cfg, dict): + raise ValueError("Config key 'model' must be an object.") + + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset '{preset}'.") + base = presets[preset] + merged = base.__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def gpu_memory_report(stage: str) -> None: + if not torch.cuda.is_available(): + print(f"[{stage}] CUDA not available") + return + allocated = torch.cuda.memory_allocated() / (1024**3) + reserved = 
torch.cuda.memory_reserved() / (1024**3) + max_alloc = torch.cuda.max_memory_allocated() / (1024**3) + print( + f"[{stage}] VRAM allocated={allocated:.2f} GB " + f"reserved={reserved:.2f} GB max_allocated={max_alloc:.2f} GB" + ) + + +def main() -> None: + args = parse_args() + try: + cfg_data = load_yaml(Path(args.config)) + model_cfg = build_config(cfg_data) + if args.seq_len > model_cfg.max_seq_len: + raise ValueError( + f"seq_len={args.seq_len} exceeds max_seq_len={model_cfg.max_seq_len} in config." + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + use_fp16 = device.type == "cuda" + if device.type == "cuda": + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + print(f"Detected GPU: {torch.cuda.get_device_name(0)}") + gpu_memory_report("start") + else: + print("CUDA not available. Running verification on CPU.") + + model = CodeTransformerLM(model_cfg) + print(f"Model parameters: {model.estimate_num_parameters():,}") + + if use_fp16: + model = model.half() + model.to(device) + model.eval() + gpu_memory_report("after_model_load") + + input_ids = torch.randint( + low=0, + high=model_cfg.vocab_size, + size=(args.batch_size, args.seq_len), + dtype=torch.long, + device=device, + ) + gpu_memory_report("after_input_alloc") + + with torch.no_grad(): + out = model(input_ids=input_ids) + logits = out["logits"] + gpu_memory_report("after_forward") + + print(f"Forward output shape: {tuple(logits.shape)}") + print("Component 4 verification passed.") + except Exception as exc: + print("Component 4 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: reduce seq_len or check CUDA/PyTorch installation.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/verify_component5_training_pipeline.py b/scripts/verify_component5_training_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..cc73f5536acff86b06de3b376bd0cda7c42d9278 --- /dev/null +++ 
b/scripts/verify_component5_training_pipeline.py @@ -0,0 +1,46 @@ +""" +Quick verification for Component 5 training pipeline. +Runs a tiny 5-step training smoke test. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure script imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.train_component5 import main as train_main # noqa: E402 + + +if __name__ == "__main__": + try: + # Override CLI args for a tiny smoke test. + sys.argv = ["verify_component5_training_pipeline.py", "--config", "configs/component5_training_config.yaml"] + + # Patch config on disk for very short run. + import yaml + + cfg_path = Path("configs/component5_training_config.yaml") + cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) + cfg["training"]["max_steps"] = 5 + cfg["training"]["save_every"] = 5 + cfg["training"]["eval_every"] = 5 + cfg["training"]["log_every"] = 1 + cfg["resume"]["resume_from"] = "none" + + tmp_cfg = Path("configs/component5_training_config.verify.yaml") + tmp_cfg.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8") + + sys.argv = ["verify_component5_training_pipeline.py", "--config", str(tmp_cfg)] + train_main() + print("") + print("Component 5 verification passed.") + except Exception as exc: + print("Component 5 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: ensure CUDA is available and tokenized dataset path is correct.") + raise SystemExit(1) diff --git a/scripts/verify_component9_lora.py b/scripts/verify_component9_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3fbf1266ff20bfaeb8de35f89338434f3ae74b --- /dev/null +++ b/scripts/verify_component9_lora.py @@ -0,0 +1,34 @@ +""" +Quick verification for Component 9 LoRA pipeline. +Runs a tiny 5-step smoke fine-tune. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import yaml + +# Ensure imports work. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.run_component9_lora_finetune import main as lora_main # noqa: E402 + + +if __name__ == "__main__": + cfg_path = PROJECT_ROOT / "configs" / "component9_lora_config.yaml" + cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8-sig")) + cfg["finetune"]["max_steps"] = 5 + cfg["finetune"]["save_every"] = 5 + cfg["finetune"]["eval_every"] = 5 + cfg["resume"]["resume_from"] = "none" + tmp = PROJECT_ROOT / "configs" / "component9_lora_config.verify.yaml" + tmp.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8-sig") + + sys.argv = ["verify_component9_lora.py", "--config", str(tmp)] + lora_main() + print("\nComponent 9 verification passed.") + diff --git a/scripts/verify_incremental_javascript_merge.py b/scripts/verify_incremental_javascript_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..e0f7bac25e67f97539afe0779b2106104cd8ef65 --- /dev/null +++ b/scripts/verify_incremental_javascript_merge.py @@ -0,0 +1,37 @@ +""" +Quick verification for incremental JavaScript merge script. + +This performs a small run with a low JS target so you can validate logic fast. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure imports work when executed from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.add_incremental_javascript_dataset import main as incremental_main # noqa: E402 + + +if __name__ == "__main__": + try: + sys.argv = [ + "verify_incremental_javascript_merge.py", + "--config", + "configs/component3_incremental_js.yaml", + "--target_new_javascript_examples", + "100", + ] + incremental_main() + print("") + print("Incremental JS merge verification passed.") + except Exception as exc: + print("Incremental JS merge verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify dataset accessibility and rerun.") + raise SystemExit(1) + diff --git a/scripts/verify_reprocess_tokenized_from_clean.py b/scripts/verify_reprocess_tokenized_from_clean.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd12ed1e029691e75f76e3f4d8efc5ff01f7369 --- /dev/null +++ b/scripts/verify_reprocess_tokenized_from_clean.py @@ -0,0 +1,35 @@ +""" +Quick verification for reprocess_tokenized_from_clean.py. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure imports work from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.reprocess_tokenized_from_clean import main as reprocess_main # noqa: E402 + + +if __name__ == "__main__": + try: + sys.argv = [ + "verify_reprocess_tokenized_from_clean.py", + "--config", + "configs/component3_reprocess_from_clean.yaml", + "--max_records", + "500", + ] + reprocess_main() + print("") + print("Reprocess verification passed.") + except Exception as exc: + print("Reprocess verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify input clean file and tokenizer path.") + raise SystemExit(1) + diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45aa8dbeca626dc18818fbfe98c473d01ebd3aff --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,2 @@ +# This file marks src as a Python package. + diff --git a/src/chat_interface/__init__.py b/src/chat_interface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccac5c513223e9eab9f38019d3010f92fbff3d8 --- /dev/null +++ b/src/chat_interface/__init__.py @@ -0,0 +1 @@ +# This file marks chat_interface as a Python package. diff --git a/src/chat_interface/gradio_chat_app.py b/src/chat_interface/gradio_chat_app.py new file mode 100644 index 0000000000000000000000000000000000000000..6e415e3d3d0e5fd22ae1fc32f6e88300efd89acb --- /dev/null +++ b/src/chat_interface/gradio_chat_app.py @@ -0,0 +1,370 @@ +""" +Component 8: Local chat interface using Gradio. + +- Clean dark-themed UI. +- Prompt input box. +- Syntax-highlighted code output (Python + JavaScript). +- Copy button for each code response. +- Generation time + token count. +- Conversation history in session. +- Clear button to reset history. +- Live model selector: Base / LoRA / INT8 (no restart). 
+""" + +from __future__ import annotations + +import html +import re +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import gradio as gr +import torch +import torch.nn as nn +import yaml +from pygments import highlight +from pygments.formatters import HtmlFormatter +from pygments.lexers import JavascriptLexer, PythonLexer, TextLexer + +from src.finetuning_system.lora_adapter import LoRAConfig, apply_lora, load_lora_state_dict +from src.inference_engine.inference_engine import DecodingConfig, InferenceEngine +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer + + +def _load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8-sig")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def _build_model_config(path: Path) -> ModelConfig: + cfg = _load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset: {preset}") + merged = presets[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def _guess_language(prompt: str, default_lang: str = "python") -> str: + p = prompt.lower() + if "javascript" in p or " js " in f" {p} " or "node" in p: + return "javascript" + if "python" in p: + return "python" + return default_lang + + +def _is_coding_prompt(prompt: str) -> bool: + p = prompt.lower().strip() + coding_keywords = [ + "code", + "python", + "javascript", + "function", + "bug", + "error", + "algorithm", + "sort", + "loop", + "class", + "api", + "sql", + "regex", + "debug", + "implement", + "write", + ] + if any(k in p for k in coding_keywords): + return True + if 
re.fullmatch(r"(hi|hello|hey|yo|hola)[!. ]*", p): + return False + return False + + +def _highlight_code(code: str, language: str) -> str: + code = code or "" + if language == "javascript": + lexer = JavascriptLexer() + elif language == "python": + lexer = PythonLexer() + else: + lexer = TextLexer() + formatter = HtmlFormatter(nowrap=True) + return highlight(code, lexer, formatter) + + +def _render_history(history: List[Dict[str, Any]]) -> str: + formatter = HtmlFormatter(style="monokai") + css = formatter.get_style_defs(".codehilite") + blocks = [ + "", + """ + + """, + '
', + ] + + if not history: + blocks.append('
No messages yet. Ask a coding question to begin.
') + + for i, item in enumerate(history, start=1): + lang = item.get("language", "python") + prompt = html.escape(str(item.get("prompt", ""))) + highlighted = _highlight_code(str(item.get("code", "")), lang) + code_id = f"code-{i}" + syntax_ok = "yes" if item.get("syntax_ok", False) else "n/a" + mode = item.get("mode", "base") + blocks.append('
') + blocks.append(f'
User: {prompt}
') + blocks.append(f'
Assistant ({lang})
') + blocks.append(f'') + blocks.append('
') + blocks.append('
') + blocks.append(f'
{highlighted}
') + blocks.append('
') + blocks.append( + f'
mode={mode} | time={item.get("time_sec", 0):.2f}s | ' + f'tokens={item.get("tokens", 0)} | syntax_ok={syntax_ok} | ' + f'attempt={item.get("attempt", 1)}
' + ) + blocks.append('
') + + blocks.append('
') + return "\n".join(blocks) + + +class ChatRuntime: + def __init__(self, config_path: str) -> None: + self.project_root = Path(__file__).resolve().parents[2] + self.cfg = _load_yaml(self.project_root / config_path) + + self.model_cfg = _build_model_config(self.project_root / self.cfg["model"]["model_config_path"]) + self.cuda_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if self.cuda_device.type != "cuda": + raise RuntimeError("CUDA GPU is required for this chat interface setup.") + + self.tokenizer = CodeTokenizer.load(str(self.project_root / self.cfg["model"]["tokenizer_dir"])) + + self.decode_cfg = DecodingConfig( + max_new_tokens=int(self.cfg["inference"].get("max_new_tokens", 300)), + greedy_temperature=float(self.cfg["inference"].get("greedy_temperature", 0.0)), + retry2_temperature=float(self.cfg["inference"].get("retry2_temperature", 0.25)), + retry2_top_p=float(self.cfg["inference"].get("retry2_top_p", 0.85)), + retry3_temperature=float(self.cfg["inference"].get("retry3_temperature", 0.35)), + retry3_top_p=float(self.cfg["inference"].get("retry3_top_p", 0.90)), + max_retries=int(self.cfg["inference"].get("max_retries", 3)), + min_tokens_before_stop_check=int(self.cfg["inference"].get("min_tokens_before_stop_check", 64)), + ) + + self.current_mode: Optional[str] = None + self.engine: Optional[InferenceEngine] = None + + def _release_current(self) -> None: + self.engine = None + self.current_mode = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def _current_vram_gb(self) -> float: + if not torch.cuda.is_available(): + return 0.0 + return float(torch.cuda.memory_allocated() / (1024**3)) + + def _status_text(self, mode: str, load_sec: float) -> str: + return f"MINDI 1.0 420M | mode={mode} | load={load_sec:.2f}s | vram={self._current_vram_gb():.2f}GB" + + def _load_base_model(self) -> InferenceEngine: + model = CodeTransformerLM(self.model_cfg).to(self.cuda_device) + payload = torch.load(self.project_root / 
self.cfg["model"]["base_checkpoint_path"], map_location=self.cuda_device) + model.load_state_dict(payload["model_state"]) + model.half() + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=self.cuda_device) + + def _load_lora_model(self) -> InferenceEngine: + model = CodeTransformerLM(self.model_cfg).to(self.cuda_device) + payload = torch.load(self.project_root / self.cfg["model"]["base_checkpoint_path"], map_location=self.cuda_device) + model.load_state_dict(payload["model_state"]) + + lora_cfg = LoRAConfig( + r=int(self.cfg.get("lora", {}).get("r", 8)), + alpha=int(self.cfg.get("lora", {}).get("alpha", 16)), + dropout=float(self.cfg.get("lora", {}).get("dropout", 0.05)), + target_keywords=list(self.cfg.get("lora", {}).get("target_keywords", ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"])), + ) + apply_lora(model, lora_cfg) + model = model.to(self.cuda_device) + + lora_payload = torch.load(self.project_root / self.cfg["model"]["lora_adapter_path"], map_location=self.cuda_device) + lora_state = lora_payload.get("lora_state", lora_payload) + load_lora_state_dict(model, lora_state) + model.half() + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=self.cuda_device) + + def _load_int8_model(self) -> InferenceEngine: + cpu = torch.device("cpu") + model = CodeTransformerLM(self.model_cfg).to(cpu).float() + model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8) + q_state = torch.load(self.project_root / self.cfg["model"]["quantized_state_path"], map_location=cpu) + model.load_state_dict(q_state) + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=cpu) + + def _ensure_mode(self, mode: str) -> str: + mode = (mode or "base").lower().strip() + if mode not in {"base", "lora", "int8"}: + mode = "base" + + if self.current_mode == mode and self.engine is not None: + return self._status_text(mode, load_sec=0.0) + + t0 = time.perf_counter() + self._release_current() + if mode == "base": + 
self.engine = self._load_base_model() + elif mode == "lora": + self.engine = self._load_lora_model() + else: + self.engine = self._load_int8_model() + + self.current_mode = mode + load_sec = time.perf_counter() - t0 + return self._status_text(mode, load_sec=load_sec) + + def switch_mode(self, mode: str) -> str: + return self._ensure_mode(mode) + + def respond(self, prompt: str, history: List[Dict[str, Any]], mode: str) -> Tuple[str, List[Dict[str, Any]], str, str]: + prompt = (prompt or "").strip() + if not prompt: + status = self._ensure_mode(mode) + return _render_history(history), history, "", status + + status = self._ensure_mode(mode) + + if not _is_coding_prompt(prompt): + fallback = "Please ask a coding question (for example: 'Write a Python function to ...' or 'Fix this JavaScript bug ...')." + history.append( + { + "prompt": prompt, + "code": fallback, + "language": "text", + "tokens": 0, + "time_sec": 0.0, + "syntax_ok": None, + "attempt": 0, + "mode": self.current_mode or "base", + } + ) + return _render_history(history), history, "", status + + lang_default = str(self.cfg["inference"].get("language_default", "python")) + language = _guess_language(prompt, default_lang=lang_default) + + start = time.perf_counter() + result = self.engine.generate_with_retry(prompt=prompt, language=language, cfg=self.decode_cfg) # type: ignore[union-attr] + elapsed = time.perf_counter() - start + + final = result["final"] + history.append( + { + "prompt": prompt, + "code": final["code"], + "language": language, + "tokens": int(final.get("generated_tokens", 0)), + "time_sec": float(elapsed), + "syntax_ok": bool(final.get("syntax_ok", False)) if language == "python" else None, + "attempt": int(final.get("attempt", 1)), + "mode": self.current_mode or "base", + } + ) + + return _render_history(history), history, "", status + + def clear(self, mode: str) -> Tuple[str, List[Dict[str, Any]], str, str]: + history: List[Dict[str, Any]] = [] + status = self._ensure_mode(mode) + 
return _render_history(history), history, "", status + + +def create_demo(config_path: str = "configs/component8_chat_config.yaml") -> gr.Blocks: + runtime = ChatRuntime(config_path=config_path) + + with gr.Blocks(title="MINDI 1.0 420M", theme=gr.themes.Base()) as demo: + gr.Markdown("## MINDI 1.0 420M\nYour local coding intelligence — 420M parameters, fully offline") + + history_state = gr.State([]) + chat_html = gr.HTML(value=_render_history([])) + + with gr.Row(): + mode_dropdown = gr.Dropdown( + label="Model Mode", + choices=["base", "lora", "int8"], + value="base", + interactive=True, + ) + status_box = gr.Textbox(label="Status", value="MINDI 1.0 420M | mode=base | load=0.00s | vram=0.00GB", interactive=False) + + prompt_box = gr.Textbox( + label="Your Prompt", + lines=4, + placeholder="Ask MINDI anything about code", + ) + + with gr.Row(): + send_btn = gr.Button("Generate", variant="primary") + clear_btn = gr.Button("Clear Conversation") + switch_btn = gr.Button("Apply Mode") + + switch_btn.click(fn=runtime.switch_mode, inputs=[mode_dropdown], outputs=[status_box]) + + send_btn.click( + fn=runtime.respond, + inputs=[prompt_box, history_state, mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + queue=True, + ) + prompt_box.submit( + fn=runtime.respond, + inputs=[prompt_box, history_state, mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + queue=True, + ) + clear_btn.click( + fn=runtime.clear, + inputs=[mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + ) + + return demo + + diff --git a/src/dataset_pipeline/__init__.py b/src/dataset_pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd53166ff987a457ccc6af438753e4d3902cc54 --- /dev/null +++ b/src/dataset_pipeline/__init__.py @@ -0,0 +1,2 @@ +# This file marks dataset_pipeline as a Python package. 
+ diff --git a/src/dataset_pipeline/hf_dataset_pipeline.py b/src/dataset_pipeline/hf_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..194e7bf461cd546b469df5b03cd46d91018ab095 --- /dev/null +++ b/src/dataset_pipeline/hf_dataset_pipeline.py @@ -0,0 +1,330 @@ +""" +Component 3: Hugging Face dataset pipeline for code model training. + +This module: +1) Streams multiple public datasets from Hugging Face. +2) Standardizes records into prompt/code/language. +3) Cleans and filters low-quality samples. +4) Deduplicates with a disk-backed SQLite hash index. +5) Tokenizes using Component 2 tokenizer. +6) Saves training-ready JSONL output and summary stats. +""" + +from __future__ import annotations + +import hashlib +import json +import re +import sqlite3 +import string +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, Iterator, List, Optional, Tuple + +from datasets import load_dataset + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class SourceDatasetSpec: + # Hugging Face dataset repo id. + hf_dataset_id: str + # Split name in HF datasets (usually train). + split: str + # Field that contains user prompt/instruction. + prompt_field: str + # Field that contains code answer/output. + code_field: str + # Optional language field name if dataset includes it. + language_field: Optional[str] + # Default language if not provided by the record. + default_language: str + + +@dataclass +class PipelineConfig: + # Dataset definitions to load. + datasets: List[SourceDatasetSpec] + # Path to saved tokenizer directory from Component 2. + tokenizer_dir: str + # Directory to write cleaned intermediate records. + interim_output_dir: str + # Directory to write tokenized final records. + processed_output_dir: str + # Path for SQLite file used to deduplicate efficiently. + dedupe_db_path: str + # Optional limit for quick tests. 
+ max_records_per_dataset: Optional[int] = None + # Filters to remove low quality data. + min_prompt_chars: int = 8 + min_code_chars: int = 16 + max_code_chars: int = 40_000 + # Write stats every N accepted samples. + progress_every: int = 1_000 + + +class HFDatasetPipeline: + # Pipeline object that executes full preprocessing. + + AUTOGEN_PATTERNS = [ + r"(?i)auto[\s-]?generated", + r"(?i)generated by", + r"(?i)do not edit", + r"(?i)machine generated", + r"(?i)this file was generated", + ] + + PY_HINTS = [ + "def ", + "import ", + "from ", + "print(", + "if __name__ ==", + "class ", + "lambda ", + "elif ", + "except ", + ] + JS_HINTS = [ + "function ", + "const ", + "let ", + "=>", + "console.log", + "export ", + "require(", + "document.", + "window.", + "=> {", + ] + + def __init__(self, config: PipelineConfig) -> None: + self.config = config + self.tokenizer = CodeTokenizer.load(config.tokenizer_dir) + self._ensure_dirs() + self.conn = self._init_dedupe_db(config.dedupe_db_path) + self.stats: Dict[str, int] = { + "seen_total": 0, + "kept_total": 0, + "dropped_empty": 0, + "dropped_length": 0, + "dropped_binary_like": 0, + "dropped_autogen_like": 0, + "dropped_duplicate": 0, + "language_python": 0, + "language_javascript": 0, + "language_other_defaulted_to_python": 0, + } + + def _ensure_dirs(self) -> None: + # Create output directories so writing does not fail later. + Path(self.config.interim_output_dir).mkdir(parents=True, exist_ok=True) + Path(self.config.processed_output_dir).mkdir(parents=True, exist_ok=True) + Path(self.config.dedupe_db_path).parent.mkdir(parents=True, exist_ok=True) + + def _init_dedupe_db(self, db_path: str) -> sqlite3.Connection: + # SQLite gives us memory-safe dedupe for large data. + conn = sqlite3.connect(db_path) + conn.execute( + """ + CREATE TABLE IF NOT EXISTS dedupe_hashes ( + sample_hash TEXT PRIMARY KEY + ) + """ + ) + conn.commit() + return conn + + def close(self) -> None: + # Always close DB cleanly. 
+ self.conn.close() + + def run(self) -> Dict[str, int]: + # Full pipeline entry point. + interim_path = Path(self.config.interim_output_dir) / "combined_clean.jsonl" + processed_path = Path(self.config.processed_output_dir) / "train_tokenized.jsonl" + stats_path = Path(self.config.processed_output_dir) / "pipeline_stats.json" + + with interim_path.open("w", encoding="utf-8") as interim_f, processed_path.open( + "w", encoding="utf-8" + ) as tokenized_f: + for spec in self.config.datasets: + self._process_one_dataset(spec, interim_f, tokenized_f) + + with stats_path.open("w", encoding="utf-8") as f: + json.dump(self.stats, f, indent=2) + return self.stats + + def _process_one_dataset(self, spec: SourceDatasetSpec, interim_f, tokenized_f) -> None: + # Stream one dataset and process records one by one. + print(f"[info] Loading dataset: {spec.hf_dataset_id} split={spec.split}") + stream = load_dataset(spec.hf_dataset_id, split=spec.split, streaming=True) + count = 0 + + for row in stream: + self.stats["seen_total"] += 1 + count += 1 + if self.config.max_records_per_dataset and count > self.config.max_records_per_dataset: + break + + sample = self._standardize_record(row=row, spec=spec) + if sample is None: + continue + + prompt, code, language = sample + cleaned = self._clean_and_filter(prompt=prompt, code=code, language=language) + if cleaned is None: + continue + + clean_prompt, clean_code, clean_language = cleaned + if not self._keep_unique(clean_prompt, clean_code): + self.stats["dropped_duplicate"] += 1 + continue + + formatted_text = self.tokenizer.format_training_sample( + prompt=clean_prompt, + code=clean_code, + language=clean_language, + ) + input_ids = self.tokenizer.encode(formatted_text) + + interim_record = { + "prompt": clean_prompt, + "code": clean_code, + "language": clean_language, + } + tokenized_record = { + "language": clean_language, + "text": formatted_text, + "input_ids": input_ids, + "length": len(input_ids), + } + + 
interim_f.write(json.dumps(interim_record, ensure_ascii=False) + "\n") + tokenized_f.write(json.dumps(tokenized_record, ensure_ascii=False) + "\n") + + self.stats["kept_total"] += 1 + if self.stats["kept_total"] % self.config.progress_every == 0: + print( + "[progress] " + f"seen={self.stats['seen_total']} kept={self.stats['kept_total']} " + f"duplicates={self.stats['dropped_duplicate']}" + ) + self.conn.commit() + + self.conn.commit() + + def _standardize_record( + self, row: Dict[str, object], spec: SourceDatasetSpec + ) -> Optional[Tuple[str, str, str]]: + # Converts source-specific row into a consistent tuple. + prompt_raw = row.get(spec.prompt_field) + code_raw = row.get(spec.code_field) + if prompt_raw is None or code_raw is None: + self.stats["dropped_empty"] += 1 + return None + + prompt = str(prompt_raw).strip() + code = str(code_raw).strip() + if not prompt or not code: + self.stats["dropped_empty"] += 1 + return None + + if spec.language_field and row.get(spec.language_field) is not None: + language = str(row.get(spec.language_field)).strip().lower() + else: + language = spec.default_language.strip().lower() + + return prompt, code, language + + def _clean_and_filter( + self, prompt: str, code: str, language: str + ) -> Optional[Tuple[str, str, str]]: + # Cleans text and applies quality filters. 
+ prompt = self._normalize_text(prompt) + code = self._normalize_code(code) + + if len(prompt) < self.config.min_prompt_chars or len(code) < self.config.min_code_chars: + self.stats["dropped_length"] += 1 + return None + if len(code) > self.config.max_code_chars: + self.stats["dropped_length"] += 1 + return None + + if self._looks_binary_like(prompt) or self._looks_binary_like(code): + self.stats["dropped_binary_like"] += 1 + return None + + combined = f"{prompt}\n{code}" + if self._looks_auto_generated(combined): + self.stats["dropped_autogen_like"] += 1 + return None + + normalized_lang = self._normalize_language(language, prompt, code) + return prompt, code, normalized_lang + + def _normalize_text(self, text: str) -> str: + # Basic whitespace cleanup. + return re.sub(r"\s+", " ", text.replace("\r\n", "\n").replace("\r", "\n")).strip() + + def _normalize_code(self, text: str) -> str: + # Preserve line breaks for code while cleaning trailing whitespace. + lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n") + clean_lines = [line.rstrip() for line in lines] + code = "\n".join(clean_lines).strip() + return code + + def _looks_binary_like(self, text: str) -> bool: + # Detect likely non-text content that should not be in code samples. + if "\x00" in text: + return True + if not text: + return True + printable = set(string.printable) | {"\n", "\t", "\r"} + non_printable_count = sum(1 for ch in text if ch not in printable) + ratio = non_printable_count / max(1, len(text)) + return ratio > 0.12 + + def _looks_auto_generated(self, text: str) -> bool: + # Remove obvious generated boilerplate using lightweight regex checks. + return any(re.search(pattern, text) for pattern in self.AUTOGEN_PATTERNS) + + def _normalize_language(self, language: str, prompt: str, code: str) -> str: + # Normalize to python/javascript for current training goals. 
+ lang = language.lower().strip() + if "python" in lang: + self.stats["language_python"] += 1 + return "python" + if "javascript" in lang or lang in {"js", "node", "nodejs"}: + self.stats["language_javascript"] += 1 + return "javascript" + if lang in {"auto", "unknown", "mixed", ""}: + pass + + prompt_lower = prompt.lower() + code_lower = code.lower() + py_score = sum(1 for hint in self.PY_HINTS if hint in code_lower) + js_score = sum(1 for hint in self.JS_HINTS if hint in code_lower) + if "javascript" in prompt_lower or "node.js" in prompt_lower or "js " in prompt_lower: + js_score += 2 + if "python" in prompt_lower: + py_score += 2 + if js_score > py_score: + self.stats["language_javascript"] += 1 + return "javascript" + + # Default to python to satisfy your "Python first" target. + self.stats["language_other_defaulted_to_python"] += 1 + self.stats["language_python"] += 1 + return "python" + + def _keep_unique(self, prompt: str, code: str) -> bool: + # Hash normalized prompt+code and store in SQLite for dedupe. + normalized_pair = f"{prompt}\n\n{code}".encode("utf-8") + digest = hashlib.sha256(normalized_pair).hexdigest() + try: + self.conn.execute("INSERT INTO dedupe_hashes (sample_hash) VALUES (?)", (digest,)) + return True + except sqlite3.IntegrityError: + return False diff --git a/src/evaluation_system/__init__.py b/src/evaluation_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab838cd1aab93b97566844ce5c08d423d94dc41 --- /dev/null +++ b/src/evaluation_system/__init__.py @@ -0,0 +1 @@ +# This file marks evaluation_system as a Python package. diff --git a/src/evaluation_system/code_eval.py b/src/evaluation_system/code_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8ed48a751285ecba59b448786774f7920eed16 --- /dev/null +++ b/src/evaluation_system/code_eval.py @@ -0,0 +1,186 @@ +""" +Component 6 evaluation helpers. 
+""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Dict, List + + +def python_syntax_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + +def save_json(path: str, payload: Dict) -> None: + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + +def _normalize_punctuation_spacing(text: str) -> str: + text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) + text = re.sub(r"([\(\[\{])\s+", r"\1", text) + text = re.sub(r"\s*=\s*", " = ", text) + text = re.sub(r"\s*\+\s*", " + ", text) + text = re.sub(r"\s*-\s*", " - ", text) + text = re.sub(r"\s*\*\s*", " * ", text) + text = re.sub(r"\s*/\s*", " / ", text) + text = re.sub(r"\s*%\s*", " % ", text) + return re.sub(r"[ \t]+", " ", text).strip() + + +def _remove_non_python_noise(line: str) -> str: + line = line.replace("", "1") + line = line.replace("\u0000", "") + line = line.replace("{", "") + line = line.replace("}", "") + line = line.replace(";", "") + return line + + +def _fix_identifier_spacing(line: str) -> str: + # def name with spaces -> def name_with_spaces + m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) + if m: + fn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{fn}{m.group(3)}" + + # class name with spaces -> class Name_With_Spaces + m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) + if m: + cn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{cn}{m.group(3)}" + + # assignment lhs spaces -> underscore. 
+ if "=" in line and "==" not in line: + lhs, rhs = line.split("=", 1) + lhs_clean = lhs.strip() + if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): + lhs_clean = re.sub(r"\s+", "_", lhs_clean) + line = f"{lhs_clean} = {rhs.strip()}" + + return line + + +def _looks_like_python_line(line: str) -> bool: + if not line.strip(): + return False + starts = ( + "def ", + "class ", + "if ", + "for ", + "while ", + "try:", + "except", + "with ", + "return ", + "import ", + "from ", + "print(", + ) + s = line.strip() + if s.startswith(starts): + return True + if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): + return True + return False + + +def _trim_to_code(lines: List[str]) -> List[str]: + # Drop noisy preamble lines until first plausible Python line. + i = 0 + while i < len(lines) and not _looks_like_python_line(lines[i]): + i += 1 + lines = lines[i:] if i < len(lines) else [] + # Keep only plausible lines after start; allow blank lines. + out = [] + for line in lines: + if not line.strip(): + out.append(line) + continue + if _looks_like_python_line(line) or line.startswith(" "): + out.append(line) + return out + + +def _best_effort_python_format(lines: List[str]) -> List[str]: + out: List[str] = [] + indent = 0 + for raw in lines: + line = raw.strip() + if not line: + out.append("") + continue + + if line in {"return", "pass", "break", "continue"}: + indent = max(0, indent - 1) + + out.append((" " * indent) + line) + + if line.endswith(":"): + indent += 1 + + return out + + +def restore_code_from_structured(decoded: str) -> str: + text = decoded + for tok in ["", "", "", "", ""]: + text = text.replace(tok, "") + + if "" in text: + text = text.split("", 1)[1] + + text = text.replace("_", " ") + tokens = text.strip().split() + + lines: List[str] = [] + current_tokens: List[str] = [] + indent = 0 + + for tok in tokens: + if tok == "": + indent += 1 + continue + if tok == "": + indent = max(0, indent - 1) + continue + if tok == "": + line = " 
".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + else: + lines.append("") + current_tokens = [] + continue + current_tokens.append(tok) + + if current_tokens: + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + + lines = _trim_to_code(lines) + lines = _best_effort_python_format(lines) + + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + return "\n".join(lines).strip() diff --git a/src/finetuning_system/__init__.py b/src/finetuning_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2c60c944c17467cfefa09c7a820c0b8351ec61 --- /dev/null +++ b/src/finetuning_system/__init__.py @@ -0,0 +1 @@ +# This file marks finetuning_system as a Python package. diff --git a/src/finetuning_system/custom_pair_dataset.py b/src/finetuning_system/custom_pair_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c4523951d234ff645a8a4e3b2545b9eb78526a14 --- /dev/null +++ b/src/finetuning_system/custom_pair_dataset.py @@ -0,0 +1,65 @@ +""" +Dataset for custom fine-tuning pairs (JSON or JSONL). +Expected fields: prompt, code, optional language. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, List + +from torch.utils.data import Dataset + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +class CustomPairDataset(Dataset): + def __init__(self, path: str, tokenizer: CodeTokenizer, max_seq_len: int = 512) -> None: + self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"Custom fine-tune data file not found: {self.path}") + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.rows: List[List[int]] = [] + self._load() + + def _load(self) -> None: + if self.path.suffix.lower() == ".jsonl": + data = [] + for line in self.path.read_text(encoding="utf-8-sig").splitlines(): + line = line.strip().lstrip("\ufeff") + if not line: + continue + data.append(json.loads(line)) + elif self.path.suffix.lower() == ".json": + raw = json.loads(self.path.read_text(encoding="utf-8-sig")) + if isinstance(raw, dict) and "data" in raw: + data = raw["data"] + elif isinstance(raw, list): + data = raw + else: + raise ValueError("JSON fine-tune file must be a list or {'data': [...]}.") + else: + raise ValueError("Custom fine-tune file must be .json or .jsonl") + + for row in data: + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + language = str(row.get("language", "python")).strip().lower() or "python" + if not prompt or not code: + continue + text = self.tokenizer.format_training_sample(prompt=prompt, code=code, language=language) + ids = self.tokenizer.encode(text)[: self.max_seq_len] + if len(ids) >= 8: + self.rows.append(ids) + + if not self.rows: + raise ValueError("No valid samples found in custom fine-tune data.") + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int) -> List[int]: + return self.rows[idx] + diff --git a/src/finetuning_system/lora_adapter.py b/src/finetuning_system/lora_adapter.py new file mode 100644 index 
0000000000000000000000000000000000000000..b6a0ee9f37ec74ad3ff6ca82f09b81edcf5b36ae --- /dev/null +++ b/src/finetuning_system/lora_adapter.py @@ -0,0 +1,92 @@ +""" +Simple LoRA implementation for custom PyTorch transformer modules. +""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from typing import Iterable, List + +import torch +import torch.nn as nn + + +@dataclass +class LoRAConfig: + r: int = 8 + alpha: int = 16 + dropout: float = 0.05 + target_keywords: List[str] = None # type: ignore[assignment] + + def __post_init__(self) -> None: + if self.target_keywords is None: + self.target_keywords = ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"] + + +class LoRALinear(nn.Module): + def __init__(self, base: nn.Linear, r: int, alpha: int, dropout: float) -> None: + super().__init__() + if base.bias is not None: + # Keep implementation simple and stable for current model (bias=False modules). + raise ValueError("LoRALinear expects base Linear with bias=None in this project.") + + self.base = base + self.base.weight.requires_grad = False + + self.in_features = base.in_features + self.out_features = base.out_features + self.r = r + self.scaling = alpha / max(1, r) + + self.lora_A = nn.Parameter(torch.zeros(r, self.in_features)) + self.lora_B = nn.Parameter(torch.zeros(self.out_features, r)) + self.dropout = nn.Dropout(dropout) + + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + base_out = self.base(x) + lora_out = self.dropout(x) @ self.lora_A.t() @ self.lora_B.t() + return base_out + (self.scaling * lora_out) + + +def _replace_module(root: nn.Module, dotted_name: str, new_module: nn.Module) -> None: + parts = dotted_name.split(".") + parent = root + for p in parts[:-1]: + parent = getattr(parent, p) + setattr(parent, parts[-1], new_module) + + +def apply_lora(model: nn.Module, cfg: LoRAConfig) -> List[str]: + replaced: List[str] 
= [] + for name, module in list(model.named_modules()): + if not isinstance(module, nn.Linear): + continue + if not any(k in name for k in cfg.target_keywords): + continue + lora_mod = LoRALinear(base=module, r=cfg.r, alpha=cfg.alpha, dropout=cfg.dropout) + _replace_module(model, name, lora_mod) + replaced.append(name) + + # Freeze everything except LoRA params. + for p in model.parameters(): + p.requires_grad = False + for n, p in model.named_parameters(): + if "lora_A" in n or "lora_B" in n: + p.requires_grad = True + + return replaced + + +def lora_state_dict(model: nn.Module) -> dict: + return {k: v.detach().cpu() for k, v in model.state_dict().items() if ("lora_A" in k or "lora_B" in k)} + + +def load_lora_state_dict(model: nn.Module, state: dict) -> None: + own = model.state_dict() + for k, v in state.items(): + if k in own: + own[k].copy_(v) diff --git a/src/inference_engine/__init__.py b/src/inference_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..245858df2c08ea0b4dfb0f65ef3c3a7115ca87fe --- /dev/null +++ b/src/inference_engine/__init__.py @@ -0,0 +1 @@ +# This file marks inference_engine as a Python package. diff --git a/src/inference_engine/inference_engine.py b/src/inference_engine/inference_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..27dfcdc9489ebf39e18a41a2314fb0b83cdb94c0 --- /dev/null +++ b/src/inference_engine/inference_engine.py @@ -0,0 +1,211 @@ +""" +Component 7: Inference engine for local code generation. + +Features: +- Deterministic low-temperature greedy mode. +- Stop rules for clean function completion. +- Syntax-aware retry with up to 3 attempts. 
+""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from src.evaluation_system.code_eval import restore_code_from_structured +from src.model_architecture.code_transformer import CodeTransformerLM +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class DecodingConfig: + max_new_tokens: int = 300 + # Mode 1: deterministic output + greedy_temperature: float = 0.0 + # Retry mode 2 + retry2_temperature: float = 0.25 + retry2_top_p: float = 0.85 + # Retry mode 3 + retry3_temperature: float = 0.35 + retry3_top_p: float = 0.90 + max_retries: int = 3 + min_tokens_before_stop_check: int = 64 + # Stop only when function body is non-trivial. + min_function_body_statements: int = 2 + + +class InferenceEngine: + def __init__(self, model: CodeTransformerLM, tokenizer: CodeTokenizer, device: torch.device) -> None: + self.model = model + self.tokenizer = tokenizer + self.device = device + self.model.eval() + + @staticmethod + def _syntax_ok_python(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + @staticmethod + def _function_completion_score(code: str) -> int: + # Higher score = more complete usable function. 
+ try: + tree = ast.parse(code) + except Exception: + return 0 + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return 0 + fn = funcs[-1] + body_len = len(fn.body) + has_return = any(isinstance(n, ast.Return) for n in ast.walk(fn)) + return body_len + (2 if has_return else 0) + + def _looks_complete_function(self, code: str, min_body_statements: int) -> bool: + if "def " not in code: + return False + try: + tree = ast.parse(code) + except Exception: + return False + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return False + fn = funcs[-1] + if len(fn.body) < min_body_statements: + return False + return True + + def _sample_next( + self, + logits: torch.Tensor, + temperature: float, + top_p: float, + ) -> torch.Tensor: + if temperature <= 0: + return torch.argmax(logits, dim=-1, keepdim=True) + + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + denom = sorted_probs.sum(dim=-1, keepdim=True).clamp_min(1e-12) + sorted_probs = sorted_probs / denom + sampled = torch.multinomial(sorted_probs, num_samples=1) + return sorted_idx.gather(-1, sampled) + + @torch.no_grad() + def _generate_once( + self, + prompt: str, + language: str, + max_new_tokens: int, + temperature: float, + top_p: float, + min_tokens_before_stop_check: int, + min_function_body_statements: int, + ) -> Dict[str, object]: + prompt_text = self.tokenizer.format_training_sample(prompt=prompt, code="", language=language) + prompt_text = prompt_text.replace(" ", "").strip() + + ids = self.tokenizer.encode(prompt_text) + eos_id = self.tokenizer.special_token_ids.get("") + + # Remove trailing EOS so generation can continue. 
+ if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + + input_ids = torch.tensor([ids], dtype=torch.long, device=self.device) + + generated_steps = 0 + for _ in range(max_new_tokens): + out = self.model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + next_id = self._sample_next(logits, temperature=temperature, top_p=top_p) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated_steps += 1 + + # Primary stop: EOS token. + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + # Secondary stop: complete parseable function with non-trivial body. + if generated_steps >= min_tokens_before_stop_check and (generated_steps % 12 == 0): + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + if self._looks_complete_function(code, min_body_statements=min_function_body_statements): + break + + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + syntax_ok = self._syntax_ok_python(code) if language == "python" else True + completion_score = self._function_completion_score(code) if language == "python" else 0 + return { + "code": code, + "syntax_ok": syntax_ok, + "generated_tokens": generated_steps, + "temperature": temperature, + "top_p": top_p, + "completion_score": completion_score, + } + + @torch.no_grad() + def generate_with_retry( + self, + prompt: str, + language: str = "python", + cfg: Optional[DecodingConfig] = None, + ) -> Dict[str, object]: + cfg = cfg or DecodingConfig() + + attempts: List[Tuple[float, float]] = [ + (cfg.greedy_temperature, 1.0), + (cfg.retry2_temperature, cfg.retry2_top_p), + (cfg.retry3_temperature, cfg.retry3_top_p), + ] + + results = [] + for i in range(min(cfg.max_retries, len(attempts))): + temp, top_p = attempts[i] + res = self._generate_once( + prompt=prompt, + language=language, + max_new_tokens=cfg.max_new_tokens, + temperature=temp, + top_p=top_p, + 
min_tokens_before_stop_check=cfg.min_tokens_before_stop_check, + min_function_body_statements=cfg.min_function_body_statements, + ) + res["attempt"] = i + 1 + results.append(res) + + # Syntax-aware retry: stop retries as soon as syntax is valid. + if bool(res["syntax_ok"]): + return { + "final": res, + "attempts": results, + "used_retry": i > 0, + } + + # If all retries fail, choose best completion score then longest generation. + best = sorted( + results, + key=lambda x: (int(x.get("completion_score", 0)), int(x.get("generated_tokens", 0))), + reverse=True, + )[0] + return { + "final": best, + "attempts": results, + "used_retry": True, + } diff --git a/src/model_architecture/__init__.py b/src/model_architecture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..342d765602b980261490c2ba31ee4798327a8947 --- /dev/null +++ b/src/model_architecture/__init__.py @@ -0,0 +1,2 @@ +# This file marks model_architecture as a Python package. + diff --git a/src/model_architecture/code_transformer.py b/src/model_architecture/code_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..456fd69995ced3fd03b24a5badc9fc816d269e8f --- /dev/null +++ b/src/model_architecture/code_transformer.py @@ -0,0 +1,264 @@ +""" +Component 4: Transformer model architecture for code generation. + +This module defines a decoder-only transformer built from scratch in PyTorch. +It is modular through configuration so model size can be scaled up/down. +""" + +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +@dataclass +class ModelConfig: + # Vocabulary size from tokenizer. + vocab_size: int = 50_000 + # Maximum context length in tokens. + max_seq_len: int = 2048 + # Core hidden size of transformer. + d_model: int = 1152 + # Number of transformer blocks. 
+ n_layers: int = 23 + # Number of attention heads. + n_heads: int = 16 + # Feed-forward hidden size. + d_ff: int = 4608 + # Dropout for regularization. + dropout: float = 0.1 + # Whether to tie token embedding and LM head weights. + tie_embeddings: bool = True + # Enable gradient checkpointing to reduce VRAM usage during training. + gradient_checkpointing: bool = False + # Initialization standard deviation. + init_std: float = 0.02 + # Epsilon for layer normalization stability. + rms_norm_eps: float = 1e-5 + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads.") + return self.d_model // self.n_heads + + +def get_model_presets() -> Dict[str, ModelConfig]: + """ + Returns standard size presets. + """ + return { + "small_180m": ModelConfig(d_model=896, n_layers=18, n_heads=14, d_ff=3584), + "medium_420m": ModelConfig(d_model=1152, n_layers=23, n_heads=16, d_ff=4608), + "large_800m": ModelConfig(d_model=1536, n_layers=24, n_heads=16, d_ff=6144), + } + + +class RMSNorm(nn.Module): + """ + RMSNorm is a lightweight normalization layer used in many modern LLMs. + """ + + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + """ + Rotary positional embedding. + This injects token order information directly into query/key vectors. 
+ """ + + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings.") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) # [1,1,S,H/2] + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) # [1,1,S,H/2] + q = self._apply_rotary(q, cos, sin) + k = self._apply_rotary(k, cos, sin) + return q, k + + @staticmethod + def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x_rot_even = x1 * cos - x2 * sin + x_rot_odd = x1 * sin + x2 * cos + out = torch.stack((x_rot_even, x_rot_odd), dim=-1).flatten(-2) + return out + + +class CausalSelfAttention(nn.Module): + """ + Multi-head causal self-attention for autoregressive code generation. 
+ """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.n_heads = config.n_heads + self.head_dim = config.head_dim + self.scale = self.head_dim ** -0.5 + + self.q_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.k_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.v_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout) + self.rotary = RotaryEmbedding(head_dim=self.head_dim, max_seq_len=config.max_seq_len) + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + bsz, seq_len, _ = x.shape + q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + + q, k = self.rotary(q, k, seq_len=seq_len) + + # Use PyTorch scaled dot-product attention with causal masking. + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=attn_mask, + dropout_p=self.dropout.p if self.training else 0.0, + is_causal=True, + scale=self.scale, + ) + out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1) + return self.o_proj(out) + + +class FeedForward(nn.Module): + """ + Two-layer feed-forward network with GELU activation. 
+ """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.fc1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.fc2 = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = F.gelu(x, approximate="tanh") + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerBlock(nn.Module): + """ + One transformer block: + norm -> attention -> residual + norm -> feed-forward -> residual + """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.norm1 = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.attn = CausalSelfAttention(config) + self.norm2 = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.ffn = FeedForward(config) + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + x = x + self.attn(self.norm1(x), attn_mask=attn_mask) + x = x + self.ffn(self.norm2(x)) + return x + + +class CodeTransformerLM(nn.Module): + """ + Full decoder-only language model for code generation. + """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + self.dropout = nn.Dropout(config.dropout) + self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)]) + self.norm_final = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + if config.tie_embeddings: + self.lm_head.weight = self.embed_tokens.weight + + self.apply(self._init_weights) + + def _init_weights(self, module: nn.Module) -> None: + # Keep initialization stable for deep networks. 
+ if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def enable_gradient_checkpointing(self, enabled: bool = True) -> None: + # Toggle gradient checkpointing mode. + self.config.gradient_checkpointing = enabled + + def forward( + self, + input_ids: torch.Tensor, + labels: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> Dict[str, torch.Tensor]: + if input_ids.dim() != 2: + raise ValueError("input_ids must be shape [batch, seq_len].") + + x = self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + if self.config.gradient_checkpointing and self.training: + x = torch.utils.checkpoint.checkpoint(block, x, attn_mask, use_reentrant=False) + else: + x = block(x, attn_mask=attn_mask) + + x = self.norm_final(x) + logits = self.lm_head(x) + + out: Dict[str, torch.Tensor] = {"logits": logits} + if labels is not None: + # Standard next-token cross entropy loss. + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + out["loss"] = loss + return out + + def estimate_num_parameters(self) -> int: + # Returns total trainable parameter count. + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def summary(self) -> Dict[str, object]: + # Returns a simple structured summary for logs/CLI. + return { + "config": asdict(self.config), + "num_parameters": self.estimate_num_parameters(), + } + diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35cf334b1ed48cc5bd2702ff997daf5bf7fd0bf6 --- /dev/null +++ b/src/tokenizer/__init__.py @@ -0,0 +1,2 @@ +# This file marks tokenizer as a Python package. 
diff --git a/src/tokenizer/code_tokenizer.py b/src/tokenizer/code_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b68ec2705394185a43465e345664e691245f18a5
--- /dev/null
+++ b/src/tokenizer/code_tokenizer.py
@@ -0,0 +1,216 @@
"""
Component 2: Custom code tokenizer for Python and JavaScript.

This tokenizer is code-aware:
- It preserves indentation structure using explicit tokens.
- It keeps newline boundaries using a newline token.
- It treats code operators and brackets as separate units.
- It supports prompt+code style training samples.
"""

from __future__ import annotations

import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional

from tokenizers import Regex, Tokenizer
from tokenizers.decoders import BPEDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence
from tokenizers.pre_tokenizers import Metaspace, Sequence as PreTokenizerSequence, Split
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer


@dataclass
class CodeTokenizerConfig:
    # Vocabulary size controls how many distinct tokens the tokenizer learns.
    vocab_size: int = 50_000
    # Minimum frequency filters very rare fragments.
    min_frequency: int = 2
    # Sequence length is used later by training/inference components.
    model_max_length: int = 2048
    # Indent width is used to normalize tabs and format indentation markers.
    indent_width: int = 4
    # These tokens are required for code generation workflows.
    # NOTE(review): the eleven literals below read as empty strings in this
    # copy of the source; they are almost certainly angle-bracket special
    # tokens (pad/bos/eos/unk plus language and structure markers) stripped
    # by an HTML-unsafe transfer. Restore the real names before retraining —
    # duplicate empty strings cannot act as distinct special tokens.
    special_tokens: List[str] = None  # type: ignore[assignment]

    def __post_init__(self) -> None:
        if self.special_tokens is None:
            self.special_tokens = [
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
            ]


class CodeTokenizer:
    # This wrapper owns one HF Tokenizers object plus code-specific helpers.

    def __init__(self, config: Optional[CodeTokenizerConfig] = None) -> None:
        self.config = config or CodeTokenizerConfig()
        # Populated by train() or load(); None until then.
        self.tokenizer: Optional[Tokenizer] = None
        # Maps each special token string to its trained vocabulary id.
        self.special_token_ids: Dict[str, int] = {}

    def _build_base_tokenizer(self) -> Tokenizer:
        """
        Creates a BPE tokenizer with code-oriented pre-tokenization rules.
        """
        # NOTE(review): the unk_token literal below also reads as stripped.
        tokenizer = Tokenizer(BPE(unk_token=""))
        tokenizer.normalizer = NormalizerSequence([NFKC()])

        # Split multi-character operators first so they are not broken apart.
        multi_op = Regex(
            r"(==|!=|<=|>=|:=|->|=>|\+\+|--|\+=|-=|\*=|/=|//=|%=|\*\*|&&|\|\||<<|>>)"
        )
        # Split common delimiters used heavily in code.
        punct = Regex(r"([()\[\]{}.,:;])")

        tokenizer.pre_tokenizer = PreTokenizerSequence(
            [
                Split(multi_op, behavior="isolated"),
                Split(punct, behavior="isolated"),
                # Metaspace marks word boundaries with "_" so decoding can
                # restore whitespace.
                Metaspace(replacement="_", prepend_scheme="always", split=True),
            ]
        )
        tokenizer.decoder = BPEDecoder()
        return tokenizer

    def train(self, text_iterator: Iterable[str]) -> None:
        """
        Trains the tokenizer from a stream of preformatted text samples.
        """
        tokenizer = self._build_base_tokenizer()
        trainer = BpeTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            special_tokens=self.config.special_tokens,
            show_progress=True,
        )
        tokenizer.train_from_iterator(text_iterator, trainer=trainer, length=None)

        # Add BOS/EOS automatically around each single sequence.
        # NOTE(review): the BOS/EOS token names below are stripped literals.
        bos_id = tokenizer.token_to_id("")
        eos_id = tokenizer.token_to_id("")
        if bos_id is None or eos_id is None:
            raise RuntimeError("Tokenizer training failed to register BOS/EOS tokens.")
        tokenizer.post_processor = TemplateProcessing(
            single=" $A ",
            special_tokens=[("", bos_id), ("", eos_id)],
        )

        self.tokenizer = tokenizer
        self.special_token_ids = {
            token: tokenizer.token_to_id(token) for token in self.config.special_tokens
        }

    def save(self, output_dir: str) -> None:
        """
        Saves tokenizer JSON and config so all other components can reuse it.
        """
        if self.tokenizer is None:
            raise RuntimeError("Cannot save tokenizer before training or loading it.")
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        self.tokenizer.save(str(out / "tokenizer.json"))
        with (out / "tokenizer_config.json").open("w", encoding="utf-8") as f:
            json.dump(asdict(self.config), f, indent=2)

    @classmethod
    def load(cls, tokenizer_dir: str) -> "CodeTokenizer":
        """
        Loads tokenizer from disk.
        """
        base = Path(tokenizer_dir)
        cfg_path = base / "tokenizer_config.json"
        tok_path = base / "tokenizer.json"
        if not cfg_path.exists() or not tok_path.exists():
            raise FileNotFoundError(
                f"Missing tokenizer files in {tokenizer_dir}. "
                "Expected tokenizer.json and tokenizer_config.json."
            )
        with cfg_path.open("r", encoding="utf-8") as f:
            cfg_data = json.load(f)
        config = CodeTokenizerConfig(**cfg_data)
        obj = cls(config=config)
        obj.tokenizer = Tokenizer.from_file(str(tok_path))
        obj.special_token_ids = {
            token: obj.tokenizer.token_to_id(token) for token in obj.config.special_tokens
        }
        return obj

    def encode(self, text: str) -> List[int]:
        """
        Encodes one preformatted text sample to token IDs.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.encode(text).ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes token IDs to text.
        Special tokens are kept so structure markers survive decoding.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

    def format_training_sample(self, prompt: str, code: str, language: str) -> str:
        """
        Converts prompt + code into one structured training text sequence.
        """
        # NOTE(review): both branches below are stripped literals; originally
        # these were the per-language marker tokens.
        lang_token = "" if language.lower() == "python" else ""
        prompt_text = self._normalize_text(prompt)
        code_text = self._code_to_structure_tokens(code)
        return f" {lang_token} {prompt_text} {code_text}"

    def _normalize_text(self, text: str) -> str:
        """
        Normalizes regular text by cleaning newlines.
        """
        return text.replace("\r\n", "\n").replace("\r", "\n").strip()

    def _code_to_structure_tokens(self, code: str) -> str:
        """
        Converts raw code into a string with explicit indentation and newline markers.
        Uses an indent stack so nested blocks emit balanced indent/dedent tokens.
        """
        code = code.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " " * self.config.indent_width)
        lines = code.split("\n")
        indent_stack: List[int] = [0]
        out_tokens: List[str] = []

        for raw_line in lines:
            # Keep blank lines as newline tokens so code structure is preserved.
            # NOTE(review): the appended marker literals below are stripped.
            if raw_line.strip() == "":
                out_tokens.append("")
                continue

            current_indent = len(raw_line) - len(raw_line.lstrip(" "))
            line_content = raw_line.lstrip(" ")

            # Emit dedent markers until the stack matches the current indent.
            while current_indent < indent_stack[-1]:
                indent_stack.pop()
                out_tokens.append("")

            # Emit indent markers for deeper nesting.
            while current_indent > indent_stack[-1]:
                indent_stack.append(current_indent)
                out_tokens.append("")

            out_tokens.append(line_content)
            out_tokens.append("")

        # Close any remaining open blocks at end of input.
        while len(indent_stack) > 1:
            indent_stack.pop()
            out_tokens.append("")

        return " ".join(out_tokens).strip()
diff --git a/src/training_pipeline/__init__.py b/src/training_pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..95609696a377a7a538c80904947085457d5945f6
--- /dev/null
+++ b/src/training_pipeline/__init__.py
@@ -0,0 +1 @@
+# This file marks training_pipeline as a Python package.
diff --git a/src/training_pipeline/tokenized_dataset.py b/src/training_pipeline/tokenized_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..beeb53db5c1499b622f9c025c41b93bd7f34f76e
--- /dev/null
+++ b/src/training_pipeline/tokenized_dataset.py
@@ -0,0 +1,88 @@
"""
Memory-efficient dataset utilities for tokenized JSONL training data.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Iterator, List, Tuple

import torch
from torch.utils.data import Dataset


class TokenizedJsonlDataset(Dataset):
    """
    Random-access dataset over tokenized JSONL using line byte offsets.
    This avoids loading all samples into RAM.
+ """ + + def __init__(self, path: str, split: str = "train", val_ratio: float = 0.02, split_seed: int = 17) -> None: + self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"Tokenized dataset not found: {self.path}") + self.split = split + self.val_ratio = val_ratio + self.split_seed = split_seed + self.offsets: List[int] = [] + self._build_offsets() + + def _hash_to_split(self, idx: int) -> bool: + # Deterministic split using index so train/val is stable across runs. + h = (idx * 1103515245 + self.split_seed) & 0x7FFFFFFF + p = (h % 10_000) / 10_000.0 + return p < self.val_ratio + + def _build_offsets(self) -> None: + with self.path.open("rb") as f: + idx = 0 + while True: + offset = f.tell() + line = f.readline() + if not line: + break + if self.split == "val": + keep = self._hash_to_split(idx) + else: + keep = not self._hash_to_split(idx) + if keep: + self.offsets.append(offset) + idx += 1 + + def __len__(self) -> int: + return len(self.offsets) + + def __getitem__(self, index: int) -> List[int]: + offset = self.offsets[index] + with self.path.open("rb") as f: + f.seek(offset) + line = f.readline().decode("utf-8").strip() + row = json.loads(line) + ids = row.get("input_ids") + if not isinstance(ids, list) or not ids: + raise ValueError(f"Invalid input_ids at index {index}") + return [int(x) for x in ids] + + +class CausalCollator: + """ + Pads/truncates sequences and produces labels for next-token training. 
+ """ + + def __init__(self, pad_token_id: int = 0, max_seq_len: int = 512) -> None: + self.pad_token_id = pad_token_id + self.max_seq_len = max_seq_len + + def __call__(self, batch: List[List[int]]) -> Tuple[torch.Tensor, torch.Tensor]: + clipped = [x[: self.max_seq_len] for x in batch] + max_len = max(len(x) for x in clipped) + input_ids = [] + labels = [] + for seq in clipped: + pad_len = max_len - len(seq) + padded = seq + [self.pad_token_id] * pad_len + label = seq + [-100] * pad_len + input_ids.append(padded) + labels.append(label) + return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long) diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c703e7c486ef91d25ec589405daf1d1aab8ea5 --- /dev/null +++ b/train.py @@ -0,0 +1,228 @@ +import argparse +from pathlib import Path +from typing import List + +import torch +from peft import LoraConfig, TaskType, get_peft_model +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + Trainer, + TrainingArguments, + set_seed, +) + +from config import PATHS, TRAINING_CONFIG +from dataset import LocalJsonlInstructionDataset, format_prompt +from utils import ensure_dirs, setup_logger + + +def _is_valid_hf_model_dir(path: Path) -> bool: + if not path.exists(): + return False + has_config = (path / "config.json").exists() + has_weights = (path / "model.safetensors").exists() or (path / "pytorch_model.bin").exists() + return has_config and has_weights + + +def _resolve_model_path(logger) -> Path: + primary = PATHS.model_dir + fallback = Path("./hf_release/MINDI-1.0-420M") + + if _is_valid_hf_model_dir(primary): + return primary + if _is_valid_hf_model_dir(fallback): + logger.warning( + "Primary model path %s is missing HF files. 
Falling back to %s", + primary.resolve(), + fallback.resolve(), + ) + return fallback + raise FileNotFoundError( + "No valid HuggingFace model directory found.\n" + f"Checked: {primary.resolve()} and {fallback.resolve()}.\n" + "Expected files: config.json + model.safetensors (or pytorch_model.bin)." + ) + + +def _build_model_and_tokenizer(model_path: Path, logger): + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + use_fast=True, + ) + except Exception as fast_exc: + logger.warning("Fast tokenizer load failed: %s. Retrying with slow tokenizer.", fast_exc) + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + use_fast=False, + ) + except Exception as slow_exc: + raise RuntimeError( + "Tokenizer loading failed for both fast and slow modes. " + "Ensure tokenizer files exist in the model folder and install " + "`sentencepiece` (and optionally `tiktoken`) if required." + ) from slow_exc + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + ) + + lora_cfg = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type=TaskType.CAUSAL_LM, + target_modules="all-linear", + ) + model = get_peft_model(model, lora_cfg) + return model, tokenizer + + +def _maybe_resume_train(trainer: Trainer, logger, resume_requested: bool) -> None: + if not resume_requested: + trainer.train() + return + + try: + trainer.train(resume_from_checkpoint=True) + except (ValueError, OSError) as exc: + logger.warning( + "Resume requested but no valid checkpoint found (%s). 
Starting fresh training.", + exc, + ) + trainer.train() + + +def _generate_predictions(model, tokenizer, prompts: List[str], logger) -> None: + model.eval() + device = model.device + logger.info("Running post-training evaluation prompts.") + + for prompt in prompts: + full_prompt = format_prompt( + instruction=prompt, + input_text="", + output_text="", + ) + inputs = tokenizer(full_prompt, return_tensors="pt").to(device) + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=TRAINING_CONFIG.eval_max_new_tokens, + do_sample=True, + temperature=0.2, + top_p=0.95, + pad_token_id=tokenizer.pad_token_id, + ) + decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) + print("\n" + "=" * 80) + print(f"PROMPT: {prompt}") + print("-" * 80) + print(decoded) + + +def train(resume: bool) -> Path: + ensure_dirs( + [ + PATHS.data_dir, + PATHS.output_dir, + PATHS.logs_dir, + PATHS.checkpoint_dir, + PATHS.lora_output_dir, + PATHS.tokenizer_output_dir, + ] + ) + logger = setup_logger("train", PATHS.logs_dir / "train.log") + set_seed(42) + if not torch.cuda.is_available(): + logger.warning( + "CUDA is not available. Training will run on CPU, which is very slow and can limit practical model quality." + ) + + if not PATHS.train_jsonl.exists(): + raise FileNotFoundError( + f"Training dataset not found: {PATHS.train_jsonl.resolve()}. " + "Run data_fetch.py first." 
+ ) + + model_path = _resolve_model_path(logger) + logger.info("Loading model and tokenizer from %s", model_path.resolve()) + model, tokenizer = _build_model_and_tokenizer(model_path, logger) + model.print_trainable_parameters() + + train_dataset = LocalJsonlInstructionDataset(tokenizer, max_length=TRAINING_CONFIG.max_length) + logger.info("Loaded %d samples from %s", len(train_dataset), PATHS.train_jsonl.resolve()) + + training_args = TrainingArguments( + output_dir=str(PATHS.checkpoint_dir), + num_train_epochs=TRAINING_CONFIG.num_train_epochs, + per_device_train_batch_size=TRAINING_CONFIG.per_device_train_batch_size, + gradient_accumulation_steps=TRAINING_CONFIG.gradient_accumulation_steps, + learning_rate=TRAINING_CONFIG.learning_rate, + fp16=torch.cuda.is_available(), + lr_scheduler_type="cosine", + warmup_ratio=0.03, + weight_decay=0.01, + max_grad_norm=1.0, + gradient_checkpointing=True, + group_by_length=True, + logging_steps=TRAINING_CONFIG.logging_steps, + save_steps=TRAINING_CONFIG.save_steps, + save_total_limit=4, + report_to="none", + remove_unused_columns=False, + dataloader_num_workers=2, + dataloader_pin_memory=torch.cuda.is_available(), + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + ) + + logger.info("Starting training. 
Resume mode: %s", resume) + _maybe_resume_train(trainer, logger, resume_requested=resume) + + logger.info("Saving LoRA adapters to %s", PATHS.lora_output_dir.resolve()) + trainer.model.save_pretrained(str(PATHS.lora_output_dir)) + tokenizer.save_pretrained(str(PATHS.tokenizer_output_dir)) + + prompts = [ + "Write a Python binary search function", + "Fix this Python bug: list index out of range", + "Create a FastAPI endpoint", + ] + _generate_predictions(model, tokenizer, prompts, logger) + + print(f"\nLoRA adapters saved to: {PATHS.lora_output_dir.resolve()}") + print(f"Tokenizer saved to: {PATHS.tokenizer_output_dir.resolve()}") + return PATHS.lora_output_dir + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="LoRA fine-tuning for MINDI Python coding tasks.") + parser.add_argument( + "--no-resume", + action="store_true", + help="Disable automatic resume_from_checkpoint=True behavior.", + ) + return parser + + +if __name__ == "__main__": + args = _build_arg_parser().parse_args() + train(resume=not args.no_resume and TRAINING_CONFIG.resume_training) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e5853a51f99f4b57ad9cff42f6738bb92fe24a84 --- /dev/null +++ b/utils.py @@ -0,0 +1,52 @@ +import json +import logging +from pathlib import Path +from typing import Dict, Iterable, List + + +def ensure_dirs(paths: Iterable[Path]) -> None: + for path in paths: + path.mkdir(parents=True, exist_ok=True) + + +def setup_logger(name: str, log_file: Path, level: int = logging.INFO) -> logging.Logger: + log_file.parent.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger(name) + logger.setLevel(level) + + if logger.handlers: + return logger + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + file_handler = logging.FileHandler(log_file, encoding="utf-8") + file_handler.setFormatter(formatter) + 
logger.addHandler(file_handler) + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + return logger + + +def write_jsonl(path: Path, rows: List[Dict[str, str]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + for row in rows: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def read_jsonl(path: Path) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + rows.append(json.loads(line)) + return rows +