diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..4a87953d8c5f484b012a04c086ba9e1004a47a28 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,10 @@ +*.zip filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +output/checkpoints/* filter=lfs diff=lfs merge=lfs -text +checkpoints/** filter=lfs diff=lfs merge=lfs -text +models/** filter=lfs diff=lfs merge=lfs -text +data/** filter=lfs diff=lfs merge=lfs -text +artifacts/** filter=lfs diff=lfs merge=lfs -text +logs/** filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f3d10e4b1ccace3ca659e5ba8b0720d9a3f886b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Ignore Python cache and compiled files. +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Ignore virtual environment. +.venv/ + +# Ignore logs and temporary outputs. +logs/ +artifacts/ +*.log + +# Ignore model weights and checkpoints by default. +checkpoints/ +models/base/ +models/lora/ +models/quantized/ + +# Ignore data files by default. +data/raw/ +data/interim/ +data/processed/ +data/external/ + +# Ignore notebook checkpoints. +.ipynb_checkpoints/ + diff --git a/CONTEXT_SUMMARY.md b/CONTEXT_SUMMARY.md new file mode 100644 index 0000000000000000000000000000000000000000..efc424dce25bb0ec672fdd64cfe5eefd3b8aa7d5 --- /dev/null +++ b/CONTEXT_SUMMARY.md @@ -0,0 +1,38 @@ +# Project Context Summary + +This file captures the current state of work from the active collaboration session. + +## Environment +- Original project path: `D:\Desktop 31st Jan 2026\MIND-AI-MODEL` +- Target copy path requested: `C:\AI 2` +- OS: Windows +- GPU: NVIDIA RTX 4060 Laptop (8GB VRAM) + +## Completed Components +1. Component 1 (Project setup): completed and verified. +2. Component 2 (Custom tokenizer): completed and verified. +3. 
Component 3 (Dataset pipeline): completed and verified. +4. Component 3 final-step reprocess fix: completed and verified, with JS rebalance. +5. Component 4 (420M transformer architecture): completed and verified. + +## Current Dataset Stats +- Total processed records: 139,531 +- Python: 115,572 +- JavaScript: 23,959 + +## Current Model Architecture +- Preset: `medium_420m` +- Parameters: 423,934,848 +- Verified forward pass on GPU successful. + +## Key Files +- `configs/component4_model_config.yaml` +- `src/model_architecture/code_transformer.py` +- `scripts/build_component4_model.py` +- `scripts/verify_component4_model.py` +- `data/processed/train_tokenized.jsonl` +- `data/processed/pipeline_stats.json` + +## Next Planned Component +- Component 5: Training pipeline with FP16, gradient checkpointing, gradient accumulation, checkpointing every 100 steps, resume support, early stopping, and live training metrics. + diff --git a/README_COMPONENT_1_SETUP.md b/README_COMPONENT_1_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..b68216aac55345f90d510408ae12fdb4d4057daf --- /dev/null +++ b/README_COMPONENT_1_SETUP.md @@ -0,0 +1,83 @@ +# Component 1: Project Setup (Windows + RTX 4060 8GB) + +## What This Component Does +- Creates a clean folder structure for the full coding-assistant project. +- Sets up a Python virtual environment. +- Installs all core dependencies needed across Components 2-10. +- Verifies that Python, PyTorch, CUDA visibility, and key libraries work. 
+ +## Folder Structure Created +- `data/raw` -> raw datasets you will provide later +- `data/interim` -> temporary cleaned data +- `data/processed` -> training-ready tokenized data +- `data/external` -> any third-party resources +- `src/tokenizer` -> Component 2 code tokenizer +- `src/dataset_pipeline` -> Component 3 preprocessing pipeline +- `src/model_architecture` -> Component 4 transformer code +- `src/training_pipeline` -> Component 5 training loop +- `src/evaluation_system` -> Component 6 evaluation code +- `src/inference_engine` -> Component 7 inference code +- `src/chat_interface` -> Component 8 Gradio interface +- `src/finetuning_system` -> Component 9 LoRA fine-tuning +- `src/export_optimization` -> Component 10 quantization/export tools +- `configs` -> config files for all components +- `scripts` -> setup, verification, and utility scripts +- `tests` -> quick checks for each component +- `checkpoints` -> model checkpoints saved during training +- `models/base` -> base trained model files +- `models/lora` -> LoRA adapters +- `models/quantized` -> optimized quantized models +- `artifacts` -> generated reports, metrics, and outputs +- `logs` -> training and runtime logs + +## Exact Commands To Run (in this order) +Run from: +`D:\Desktop 31st Jan 2026\MIND-AI-MODEL` + +0. Install Python 3.11 (required for package compatibility): +- Download page: https://www.python.org/downloads/release/python-3119/ +- Windows installer file: `python-3.11.9-amd64.exe` +- During install, check: `Add python.exe to PATH` + +1. Allow script execution for this terminal only: +```powershell +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass +``` + +2. If you already attempted setup once, remove old virtual environment first: +```powershell +if (Test-Path .\.venv) { Remove-Item -Recurse -Force .\.venv } +``` + +3. Create folders, virtual env, install dependencies: +```powershell +.\scripts\setup_windows_environment.ps1 +``` + +4. 
Activate virtual environment: +```powershell +.\.venv\Scripts\Activate.ps1 +``` + +5. Verify setup: +```powershell +python .\scripts\verify_component1_setup.py +``` + +## Expected Verification Result +- Prints Python version +- Prints PyTorch version +- Shows whether CUDA is available +- Shows GPU name if available +- Confirms critical libraries import correctly + +Note: +- `codebleu` is excluded from base install on Windows due to a `tree-sitter` dependency conflict on Python 3.11. +- Component 6 will use Windows-stable evaluation metrics and add code-quality checks without breaking setup. +- `bitsandbytes` is optional on native Windows because some CUDA/driver combinations fail to load its DLL. +- Base setup and all early components continue without it. +- For Component 5, we will: + - try `bitsandbytes` if available, and + - automatically fall back to a stable optimizer on your machine if it is not. + +If verification fails, copy the full terminal output and share it with me. diff --git a/README_COMPONENT_3_DATASET_PIPELINE.md b/README_COMPONENT_3_DATASET_PIPELINE.md new file mode 100644 index 0000000000000000000000000000000000000000..a729bca8b87db6fd5709704439e6461dff7b9f35 --- /dev/null +++ b/README_COMPONENT_3_DATASET_PIPELINE.md @@ -0,0 +1,46 @@ +# Component 3: Dataset Pipeline + +## What This Component Does (Simple English) +- Downloads the 3 datasets directly from Hugging Face (no manual download files). +- Reads them in streaming mode so your RAM usage stays low. +- Cleans prompt/code text. +- Removes low-quality and likely auto-generated data. +- Removes duplicate prompt+code pairs using a disk-backed SQLite index. +- Detects language (Python or JavaScript) when unclear. +- Tokenizes all cleaned records using the Component 2 tokenizer. +- Saves training-ready tokenized JSONL output. 
+ +## Files Created By This Component +- `configs/component3_dataset_pipeline.yaml` +- `src/dataset_pipeline/hf_dataset_pipeline.py` +- `scripts/run_component3_dataset_pipeline.py` +- `scripts/verify_component3_dataset_pipeline.py` + +## Required Before Running +- Component 2 tokenizer must exist at: + - `artifacts/tokenizer/code_tokenizer_v1/tokenizer.json` + - `artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json` + +## Quick Verification Run (small test) +Run from project root: +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\verify_component3_dataset_pipeline.py +``` + +This uses `200` records per dataset for a smoke test. + +## Full Pipeline Run +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\run_component3_dataset_pipeline.py --config .\configs\component3_dataset_pipeline.yaml +``` + +## Output Files +- Clean merged dataset: + - `data/interim/combined_clean.jsonl` +- Tokenized training dataset: + - `data/processed/train_tokenized.jsonl` +- Stats summary: + - `data/processed/pipeline_stats.json` + diff --git a/README_COMPONENT_4_MODEL_ARCHITECTURE.md b/README_COMPONENT_4_MODEL_ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..369e94f46af3dcb85d736d7fc3f83d18f8268f03 --- /dev/null +++ b/README_COMPONENT_4_MODEL_ARCHITECTURE.md @@ -0,0 +1,28 @@ +# Component 4: Model Architecture (420M Starter) + +## What This Component Builds +- A decoder-only transformer language model for code generation. +- Configurable size through YAML config. +- Presets for small, medium (420M target), and large. +- Attention + rotary positional encoding + feed-forward blocks. 
+ +## Main Files +- `src/model_architecture/code_transformer.py` +- `configs/component4_model_config.yaml` +- `scripts/build_component4_model.py` +- `scripts/verify_component4_model.py` + +## Commands (run from project root) +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\build_component4_model.py --config .\configs\component4_model_config.yaml +python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256 +``` + +## What Success Looks Like +- Build script prints parameter count near the 420M target. +- Verify script prints: + - VRAM usage at multiple stages + - output tensor shape + - `Component 4 verification passed.` + diff --git a/README_COMPONENT_5_TRAINING_PIPELINE.md b/README_COMPONENT_5_TRAINING_PIPELINE.md new file mode 100644 index 0000000000000000000000000000000000000000..a313c88dbfb239a43ebeb1ca7d0d68393c04a8ff --- /dev/null +++ b/README_COMPONENT_5_TRAINING_PIPELINE.md @@ -0,0 +1,42 @@ +# Component 5: Training Pipeline + +## What This Component Does +- Trains the 420M transformer on tokenized data. +- Uses FP16 mixed precision to reduce VRAM. +- Uses gradient checkpointing to save memory. +- Uses gradient accumulation for larger effective batch size. +- Attempts Adam8bit optimizer when available, otherwise safely falls back. +- Saves checkpoint every 100 steps by default. +- Supports resuming from latest checkpoint. +- Evaluates periodically and supports early stopping. +- Shows live loss, LR, ETA, and VRAM. 
+ +## Main Files +- `configs/component5_training_config.yaml` +- `src/training_pipeline/tokenized_dataset.py` +- `scripts/train_component5.py` +- `scripts/verify_component5_training_pipeline.py` + +## Commands +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\verify_component5_training_pipeline.py +python .\scripts\train_component5.py --config .\configs\component5_training_config.yaml +``` + +## VRAM and Runtime (RTX 4060 8GB) +- Expected VRAM during training with default config: about 5.8 to 6.9 GB. +- Safety stop is enabled at 7.0 GB. +- Approx training time for 1 epoch equivalent: ~30 to 65 hours. + +## Common Failures and Fixes +1. OOM or VRAM threshold hit: + - Reduce `max_seq_len` (e.g., 512 -> 384). + - Increase `grad_accum_steps`. +2. Training too slow: + - Lower `max_seq_len` for first run. + - Keep `micro_batch_size=1` and adjust accumulation. +3. Resume issues: + - Ensure `checkpoints/component5_420m/latest.pt` exists. +4. Validation not improving: + - Lower LR and increase warmup. diff --git a/README_COMPONENT_8_CHAT_INTERFACE.md b/README_COMPONENT_8_CHAT_INTERFACE.md new file mode 100644 index 0000000000000000000000000000000000000000..59eeb59917095ede4e15bb975209beb3450950e6 --- /dev/null +++ b/README_COMPONENT_8_CHAT_INTERFACE.md @@ -0,0 +1,20 @@ +# Component 8: Local Chat Interface + +## What it gives you +- Browser chat UI for your local coding model. +- Uses Component 7 inference engine automatically. +- Dark theme, prompt box, code cards, copy button per response. +- Syntax highlighting for Python and JavaScript. +- Shows generation time and generated token count. +- Keeps conversation history in the current session. +- Clear button to reset conversation. + +## Launch (single command) +```powershell +python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml +``` + +## URL to open +- `http://127.0.0.1:7860` + +No internet is needed for local usage. 
diff --git a/README_FINAL_PROJECT.md b/README_FINAL_PROJECT.md new file mode 100644 index 0000000000000000000000000000000000000000..df399f032d7b5bf7b65b3c4f15f502606636e5df --- /dev/null +++ b/README_FINAL_PROJECT.md @@ -0,0 +1,126 @@ +# Final Project README - MINDI 1.0 420M (Windows, RTX 4060 8GB) + +## What This Project Is +This is a fully local coding-assistant model system built step-by-step from scratch. +It supports: +- custom tokenizer for code +- dataset cleaning + tokenization pipeline +- 420M transformer model +- memory-optimized training +- evaluation + inference improvements +- local chat UI +- LoRA fine-tuning +- INT8 export + portable package + +Everything runs locally on your machine without internet after setup. + +--- + +## What You Built (High Level) +1. **Project setup** with reproducible environment and verification scripts. +2. **Custom code tokenizer** (Python + JavaScript aware). +3. **Dataset pipeline** with cleaning, dedupe, and tokenization. +4. **420M transformer architecture** (modular config). +5. **Training pipeline** (FP16, checkpointing, accumulation, resume, early stopping). +6. **Evaluation system** (val metrics + generation checks). +7. **Inference engine** (greedy mode, stop rules, syntax-aware retry). +8. **Local chat interface** with history, copy button, timing, and mode selector. +9. **LoRA fine-tuning pipeline** for your own examples. +10. **Export/quantization/packaging** with benchmark report and portable launcher. 
+ +--- + +## Most Important File Locations + +### Core model and data +- Base checkpoint: `checkpoints/component5_420m/step_3200.pt` +- Tokenized training data: `data/processed/train_tokenized.jsonl` +- Tokenizer: `artifacts/tokenizer/code_tokenizer_v1/` + +### LoRA +- Best LoRA adapter: `models/lora/custom_lora_v1/best.pt` +- LoRA metadata: `models/lora/custom_lora_v1/adapter_meta.json` + +### Quantized model +- INT8 model: `models/quantized/model_step3200_int8_state.pt` +- Benchmark report: `artifacts/export/component10_benchmark_report.json` + +### Chat interface +- Launcher: `scripts/launch_component8_chat.py` +- Chat config: `configs/component8_chat_config.yaml` + +### Portable package +- Folder: `release/MINDI_1.0_420M` +- Double-click launcher: `release/MINDI_1.0_420M/Start_MINDI.bat` + +--- + +## Launch the Main Chat UI +From project root (`C:\AI 2`): + +```powershell +.\.venv\Scripts\Activate.ps1 +python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml +``` + +Open in browser: +- `http://127.0.0.1:7860` + +### Live model selector in UI +You can switch without restart: +- `base` +- `lora` +- `int8` + +Status box shows: +- active mode +- mode load time +- live VRAM usage + +--- + +## How to Add More Training Data (Future Improvement) + +### A) Add more base-training pairs (full training path) +1. Put new JSONL/JSON files in `data/raw/`. +2. Run dataset processing scripts (Component 3 path). +3. Continue/refresh base training with Component 5. + +### B) Add targeted improvements quickly (LoRA recommended) +1. Edit `data/raw/custom_finetune_pairs.jsonl` with your new prompt/code pairs. + - Required fields per row: `prompt`, `code` + - Optional: `language` (`python` or `javascript`) +2. Run LoRA fine-tuning: + +```powershell +python .\scripts\run_component9_lora_finetune.py --config .\configs\component9_lora_config.yaml +``` + +3. Use updated adapter in chat by selecting `lora` mode. 
+ +--- + +## Recommended Next Habit +When quality is weak on specific tasks: +1. Add 20-200 clean examples of exactly that task style to `custom_finetune_pairs.jsonl`. +2. Re-run LoRA fine-tuning. +3. Test in chat `lora` mode. +4. Repeat in small cycles. + +This gives faster improvement than retraining the full base model each time. + +--- + +## One-File Health Check Commands + +```powershell +python .\scripts\verify_component1_setup.py +python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256 +python .\scripts\verify_component9_lora.py +``` + +--- + +## Current Status +Project is complete across Components 1-10 and verified on your hardware. + diff --git a/artifacts/evaluation/component6_eval_results.json b/artifacts/evaluation/component6_eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..395b688ad5f18bbe939409cf371489b51bb99973 --- /dev/null +++ b/artifacts/evaluation/component6_eval_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da6ee747d77b0c8cdca5d4fedb750549a9e5e7c42592e5e32e6103ff5617d8f +size 2379 diff --git a/artifacts/evaluation/component7_inference_results.json b/artifacts/evaluation/component7_inference_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d74c9b319277093414e7e4624e277606c5b242f7 --- /dev/null +++ b/artifacts/evaluation/component7_inference_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce08bfd6918f619fdcb1ef17ec1db79c2d32578d12a02aaaae7b7092f83384ae +size 5863 diff --git a/artifacts/export/component10_benchmark_report.json b/artifacts/export/component10_benchmark_report.json new file mode 100644 index 0000000000000000000000000000000000000000..48979bc22a1f42e5aad8e586c824bca5a099c519 --- /dev/null +++ b/artifacts/export/component10_benchmark_report.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d827ec736fbdc4ea2ed5bc196223f1bf02d11a9260acd451edd51f8f39bcda75 +size 545 diff --git a/artifacts/model/component4_model_summary.json b/artifacts/model/component4_model_summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa452868b7bc7fd737e795c66a4f4e8ee3e5c6c --- /dev/null +++ b/artifacts/model/component4_model_summary.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5ebc8aa081f82bbcaee2c945b207b4db3251f63b845ed86055f4e5b7204010 +size 328 diff --git a/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json b/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..7e1a514ba7e4fe347a2bb3dd10481c45ede8a552 --- /dev/null +++ b/artifacts/tokenizer/code_tokenizer_v1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe04cc37ac778637cb2cc02a6096412e5d8cada3e4ef3e4a7f2d141fccab8a0 +size 11475 diff --git a/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json b/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..475df77b084233f9af7d7d0d6117b26ddedb1003 --- /dev/null +++ b/artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb0b7af679bac1c29fe7ac9f86c48f1fed5584ba72c9ef2c338f60b63e07bb46 +size 302 diff --git a/backup_step1000.tar.gz b/backup_step1000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..90487a54bc939f1f525e3a5bbcefe1e6b4c3abee --- /dev/null +++ b/backup_step1000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebe005c43dd59c9c49ad153d41af1bdaaad47c2a21ae231a4c5e90c8005560af +size 337623475 diff --git a/backup_step2000.tar.gz b/backup_step2000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e90111a43816db073a72f2aaf9a6d41d05bf4a7a --- /dev/null +++ 
b/backup_step2000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861329fb551b4c6406e92e06cfa1faae592f0fe0d0ce713189a57c62b33b0969 +size 337571785 diff --git a/backup_step3000.tar.gz b/backup_step3000.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..002bb96b225f1a2d3550a188df8e76e9eac89bf0 --- /dev/null +++ b/backup_step3000.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238c2859ebf4efc0195456a898d2fb8bce0397e39fdf59e9f940963232d628a8 +size 337762553 diff --git a/checkpoints/component5_420m/latest.pt b/checkpoints/component5_420m/latest.pt new file mode 100644 index 0000000000000000000000000000000000000000..e99a700ef6965af5b53fc2b5e4ae3cbd2bf30543 --- /dev/null +++ b/checkpoints/component5_420m/latest.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d26a7dd9e6e294c6657f6fb3a4d947cf52eb8e1c0b11032722fa50d15c4a21 +size 5087449970 diff --git a/checkpoints/component5_420m/step_3000.pt b/checkpoints/component5_420m/step_3000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6a459a4b1c19c8ad662c3d3f8443438d9df2bef --- /dev/null +++ b/checkpoints/component5_420m/step_3000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e11bded40789574ef316636c02c2fd1e8cd54c13441d8cd6a28980f2209ffaa9 +size 5087455158 diff --git a/checkpoints/component5_420m/step_3200.pt b/checkpoints/component5_420m/step_3200.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c3c0aa415e3eeaea13ecec2d036f3b8f6bdbd --- /dev/null +++ b/checkpoints/component5_420m/step_3200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d2ea9401f3b08b2528dbb8f993949794d0adb57642d0f4752d74da0e445238 +size 5087455158 diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..59ca8adcfede571d3684bc836b9e03cc8e59b0ae --- /dev/null +++ b/config.py @@ -0,0 +1,45 @@ +from 
dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class Paths: + project_root: Path = Path(".") + model_dir: Path = Path("./model") + data_dir: Path = Path("./data") + output_dir: Path = Path("./output") + logs_dir: Path = Path("./logs") + + train_jsonl: Path = Path("./data/train.jsonl") + dataset_cache_dir: Path = Path("./data/cache") + raw_dataset_dir: Path = Path("./data/cache/raw") + checkpoint_dir: Path = Path("./output/checkpoints") + lora_output_dir: Path = Path("./output/lora_adapters") + tokenizer_output_dir: Path = Path("./output/tokenizer") + + +@dataclass(frozen=True) +class DataConfig: + max_total_samples: int = 200000 + max_humaneval_samples: int = 20000 + max_mbpp_samples: int = 50000 + max_codesearchnet_samples: int = 180000 + min_output_chars: int = 40 + + +@dataclass(frozen=True) +class TrainingConfig: + num_train_epochs: int = 5 + per_device_train_batch_size: int = 1 + gradient_accumulation_steps: int = 8 + learning_rate: float = 1e-5 + max_length: int = 1024 + save_steps: int = 250 + logging_steps: int = 20 + eval_max_new_tokens: int = 220 + resume_training: bool = True + + +PATHS = Paths() +DATA_CONFIG = DataConfig() +TRAINING_CONFIG = TrainingConfig() diff --git a/configs/component10_export_config.yaml b/configs/component10_export_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24d5a702dfa6fe9df0e2943f7605f26be6f33523 --- /dev/null +++ b/configs/component10_export_config.yaml @@ -0,0 +1,21 @@ +# Component 10 export and optimization config + +model: + model_config_path: configs/component4_model_config.yaml + source_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +quantization: + quantized_output_path: models/quantized/model_step3200_int8_state.pt + +benchmark: + prompt: Write a Python function to compute factorial of n. 
+ max_new_tokens: 120 + +package: + output_dir: release/MINDI_1.0_420M + app_port: 7861 + +outputs: + benchmark_report_json: artifacts/export/component10_benchmark_report.json + diff --git a/configs/component3_dataset_pipeline.yaml b/configs/component3_dataset_pipeline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24077d99b74fbad01942d843b7781d803067385b --- /dev/null +++ b/configs/component3_dataset_pipeline.yaml @@ -0,0 +1,38 @@ +# Component 3 config: load, clean, deduplicate, tokenize. + +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +interim_output_dir: data/interim +processed_output_dir: data/processed +dedupe_db_path: data/interim/dedupe_hashes.sqlite + +# Set null for full run. +# Use a small number like 500 for fast smoke testing. +max_records_per_dataset: null + +min_prompt_chars: 8 +min_code_chars: 16 +max_code_chars: 40000 +progress_every: 1000 + +datasets: + - hf_dataset_id: iamtarun/python_code_instructions_18k_alpaca + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + + - hf_dataset_id: sahil2801/CodeAlpaca-20k + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + + - hf_dataset_id: TokenBender/code_instructions_122k_alpaca_style + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: python + diff --git a/configs/component3_incremental_js.yaml b/configs/component3_incremental_js.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8b09fbb818e398b6213598b11d90a592573d319 --- /dev/null +++ b/configs/component3_incremental_js.yaml @@ -0,0 +1,27 @@ +# Incremental JS augmentation config. +# This script appends new JavaScript samples into existing Component 3 outputs. 
+ +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +existing_clean_path: data/interim/combined_clean.jsonl +existing_tokenized_path: data/processed/train_tokenized.jsonl +existing_stats_path: data/processed/pipeline_stats.json +dedupe_db_path: data/interim/dedupe_hashes_incremental.sqlite + +# Chosen dataset for JS augmentation. +new_dataset: + hf_dataset_id: philschmid/code-alpaca-ruby-python-javascript + split: train + prompt_field: instruction + code_field: output + language_field: null + default_language: auto + +# Hard target requested by user. +target_new_javascript_examples: 20000 + +# Quality filters (same idea as Component 3). +min_prompt_chars: 8 +min_code_chars: 16 +max_code_chars: 40000 +progress_every: 500 + diff --git a/configs/component3_reprocess_from_clean.yaml b/configs/component3_reprocess_from_clean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08939cae2a7d9b74bed27a1d1369c91af1725fda --- /dev/null +++ b/configs/component3_reprocess_from_clean.yaml @@ -0,0 +1,19 @@ +# Reprocess config: no dataset download, no full pipeline rebuild. +# It reads existing cleaned data and regenerates tokenized output. + +tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +input_clean_path: data/interim/combined_clean.jsonl +output_tokenized_path: data/processed/train_tokenized.jsonl +output_stats_path: data/processed/pipeline_stats.json + +# Safety backups before overwrite. +backup_existing_tokenized: true +backup_existing_stats: true + +# Existing language labels in clean file may be wrong from earlier runs. +# true = infer language from prompt+code content only. +ignore_existing_language_labels: true + +# Optional quick test mode. +# Set null for full reprocess. 
+max_records: null diff --git a/configs/component4_model_config.yaml b/configs/component4_model_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5667b78688d6890d294fc808c0110edd093f6a7 --- /dev/null +++ b/configs/component4_model_config.yaml @@ -0,0 +1,18 @@ +# Component 4 model config. +# You can switch the preset name or directly edit dimensions below. + +preset: medium_420m + +model: + vocab_size: 50000 + max_seq_len: 2048 + d_model: 1152 + n_layers: 23 + n_heads: 16 + d_ff: 4608 + dropout: 0.1 + tie_embeddings: true + gradient_checkpointing: false + init_std: 0.02 + rms_norm_eps: 0.00001 + diff --git a/configs/component5_training_config.verify.yaml b/configs/component5_training_config.verify.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0299d5b9fb19d08fbfe80a6594f0e24f3317e234 --- /dev/null +++ b/configs/component5_training_config.verify.yaml @@ -0,0 +1,32 @@ +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + num_workers: 0 +model: + model_config_path: configs/component4_model_config.yaml +training: + output_dir: checkpoints/component5_420m + log_every: 1 + eval_every: 5 + save_every: 5 + max_steps: 5 + micro_batch_size: 1 + grad_accum_steps: 16 + max_seq_len: 512 + learning_rate: 0.0002 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + grad_clip_norm: 1.0 + warmup_steps: 300 + min_lr_ratio: 0.1 + use_fp16: true + use_gradient_checkpointing: true + prefer_8bit_adam: true + early_stopping_patience_evals: 20 + early_stopping_min_delta: 0.0005 + max_vram_gb: 7.0 +resume: + resume_from: none diff --git a/configs/component5_training_config.yaml b/configs/component5_training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2e38d6ed0b29eaca71af3d82e67696bc6338309 --- /dev/null +++ b/configs/component5_training_config.yaml @@ -0,0 +1,37 @@ +# Component 5 training config for RTX 4060 8GB. 
+ +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + num_workers: 2 + +model: + model_config_path: configs/component4_model_config.yaml + +training: + output_dir: checkpoints/component5_420m + log_every: 10 + eval_every: 100 + save_every: 200 + max_steps: 8000 + micro_batch_size: 1 + grad_accum_steps: 16 + max_seq_len: 448 + learning_rate: 0.00022 + weight_decay: 0.1 + betas: [0.9, 0.95] + grad_clip_norm: 1.0 + warmup_steps: 300 + min_lr_ratio: 0.1 + use_fp16: true + use_gradient_checkpointing: true + prefer_8bit_adam: true + early_stopping_patience_evals: 5 + early_stopping_min_delta: 0.0005 + max_vram_gb: 7.0 + +resume: + resume_from: latest # latest | none | explicit checkpoint path + + diff --git a/configs/component6_evaluation_config.yaml b/configs/component6_evaluation_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6edf0a8dce1087b01d00e3a3b3568bc302a72c4c --- /dev/null +++ b/configs/component6_evaluation_config.yaml @@ -0,0 +1,21 @@ +# Component 6 evaluation config. 
+ +model: + model_config_path: configs/component4_model_config.yaml + checkpoint_paths: + - checkpoints/component5_420m/step_3200.pt + +data: + tokenized_jsonl_path: data/processed/train_tokenized.jsonl + val_ratio: 0.02 + split_seed: 17 + +inference: + max_seq_len: 448 + max_new_tokens: 160 + temperature: 0.25 + top_p: 0.85 + +output: + results_json: artifacts/evaluation/component6_eval_results.json + diff --git a/configs/component7_inference_config.yaml b/configs/component7_inference_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3873f5bfc7da45d4a36cfce0e779f6a07621da7a --- /dev/null +++ b/configs/component7_inference_config.yaml @@ -0,0 +1,20 @@ +# Component 7 inference config + +model: + model_config_path: configs/component4_model_config.yaml + checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +inference: + language: python + max_new_tokens: 180 + greedy_temperature: 0.0 + retry2_temperature: 0.25 + retry2_top_p: 0.85 + retry3_temperature: 0.35 + retry3_top_p: 0.90 + max_retries: 3 + min_tokens_before_stop_check: 24 + +output: + results_json: artifacts/evaluation/component7_inference_results.json diff --git a/configs/component8_chat_config.yaml b/configs/component8_chat_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c768f7c4b4ec868c2574d2ff57a224c3abb91455 --- /dev/null +++ b/configs/component8_chat_config.yaml @@ -0,0 +1,30 @@ +# Component 8 chat interface config. 
+ +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + lora_adapter_path: models/lora/custom_lora_v1/best.pt + quantized_state_path: models/quantized/model_step3200_int8_state.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2] + +inference: + language_default: python + max_new_tokens: 300 + greedy_temperature: 0.0 + retry2_temperature: 0.25 + retry2_top_p: 0.85 + retry3_temperature: 0.35 + retry3_top_p: 0.90 + max_retries: 3 + min_tokens_before_stop_check: 64 + +server: + host: 127.0.0.1 + port: 7860 + share: false diff --git a/configs/component9_lora_config.verify.yaml b/configs/component9_lora_config.verify.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a5e43b65a380a4b4af5b948159792a7624a9427 --- /dev/null +++ b/configs/component9_lora_config.verify.yaml @@ -0,0 +1,32 @@ +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: + - q_proj + - k_proj + - v_proj + - o_proj + - fc1 + - fc2 +finetune: + custom_data_path: data/raw/custom_finetune_pairs.jsonl + output_dir: models/lora/custom_lora_v1 + max_seq_len: 512 + micro_batch_size: 1 + grad_accum_steps: 16 + learning_rate: 0.0003 + weight_decay: 0.0 + max_steps: 5 + save_every: 5 + eval_every: 5 + early_stopping_patience_evals: 6 + early_stopping_min_delta: 0.0005 + use_fp16: true + max_vram_gb: 7.0 +resume: + resume_from: none diff --git a/configs/component9_lora_config.yaml b/configs/component9_lora_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d64028034f3e2e32101bd29ba3ce0f483cbf0a47 --- /dev/null +++ b/configs/component9_lora_config.yaml @@ -0,0 +1,31 @@ +# Component 9 
LoRA fine-tuning config + +model: + model_config_path: configs/component4_model_config.yaml + base_checkpoint_path: checkpoints/component5_420m/step_3200.pt + tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1 + +lora: + r: 8 + alpha: 16 + dropout: 0.05 + target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2] + +finetune: + custom_data_path: data/raw/custom_finetune_pairs.jsonl + output_dir: models/lora/custom_lora_v1 + max_seq_len: 512 + micro_batch_size: 1 + grad_accum_steps: 16 + learning_rate: 0.0003 + weight_decay: 0.0 + max_steps: 1200 + save_every: 100 + eval_every: 100 + early_stopping_patience_evals: 6 + early_stopping_min_delta: 0.0005 + use_fp16: true + max_vram_gb: 7.0 + +resume: + resume_from: none # none | latest | explicit path diff --git a/data/cache/raw/code_search_net_python/dataset_dict.json b/data/cache/raw/code_search_net_python/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..00be0f6fd7375ce167ad242a64390d512feedd34 --- /dev/null +++ b/data/cache/raw/code_search_net_python/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf46fe547f16d795abe0d4c8a591bf031d98882d638931d27660455ee986273 +size 43 diff --git a/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow b/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2e8b0426396fc60a7813b0838e15268deacae9fe --- /dev/null +++ b/data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079bce0f0e2513bae63c12f8699e4ea13ec545c5000844de28dc34a1a9fd19eb +size 84367104 diff --git a/data/cache/raw/code_search_net_python/test/dataset_info.json b/data/cache/raw/code_search_net_python/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/test/state.json b/data/cache/raw/code_search_net_python/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad85c7581bbbe247fc901886dfea8a1091324c4 --- /dev/null +++ b/data/cache/raw/code_search_net_python/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d5fecb65147f455bfc8249c3e26fc6a2bd01bfd8bd9f354e86eb7834453d1c +size 261 diff --git a/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ad3769233e0146349963abbe57b6e50d955b5e1c --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5984af399adbfdab06aca7da7638f6a5eb98411b15b88a1f045f346735fbc9c +size 377852224 diff --git a/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..73aa3abef75d171f5dadb3bcabd014185231a657 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a62df607497be1fd23f3e8aa50908bebff6732ccc8b5dacbfaa0efd336ad915 +size 411927504 diff --git a/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3c31fd25f5c405f5d4528764b59fb60a79c970d4 --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d519b4edb8ae27d8e1ab6474a8decc40f45c6a8e7c409039c865abbc9763f351 +size 370005344 diff --git a/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow b/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e4f6c1332e5b8f2213beec2546b5df7f9a21c39e --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42ae91a5e6e48dd32eac5940429d726f0dbc9440d0262a40a3bfe7a0e2e6214 +size 400292712 diff --git a/data/cache/raw/code_search_net_python/train/dataset_info.json b/data/cache/raw/code_search_net_python/train/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/train/state.json b/data/cache/raw/code_search_net_python/train/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1cf733d9564bf0386c9a83158705dec46f4343 --- /dev/null +++ b/data/cache/raw/code_search_net_python/train/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:180b84fce72622f4113ea103a1fbf79924e61881442db8728b055be042247bcf +size 448 diff --git a/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow b/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..72adad89455ac9e2a5b197693c563d607bf8a8df --- /dev/null +++ 
b/data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f848f9c1dfe1c2cfac25fd1b529e050e29291a5d8042ba1d4f904948142c64 +size 92180808 diff --git a/data/cache/raw/code_search_net_python/validation/dataset_info.json b/data/cache/raw/code_search_net_python/validation/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..5b583799d21ef6157999e31ada511bed9e84c578 --- /dev/null +++ b/data/cache/raw/code_search_net_python/validation/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c +size 2598 diff --git a/data/cache/raw/code_search_net_python/validation/state.json b/data/cache/raw/code_search_net_python/validation/state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f11e18b460cd718fad08107add9781cfc12a5c8 --- /dev/null +++ b/data/cache/raw/code_search_net_python/validation/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e5f3cf2d550a3fb9b3d3e43f23f25dfaae9ae3124e43dcf14072f5e3aee182 +size 267 diff --git a/data/cache/raw/mbpp/dataset_dict.json b/data/cache/raw/mbpp/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..4287d69d266da7d00c47b23dbcc550f890f936f3 --- /dev/null +++ b/data/cache/raw/mbpp/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb69d413c1138964f92bd3723baf871db8f40b4cec70586e770e060108a8c612 +size 53 diff --git a/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow b/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e34e148ba050ae18dce7c8812fcd8d2864da2dfd --- /dev/null +++ b/data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e14c47c41a23d8003284ac9249a5c5e4da285300f1a56b63593fb2d6237556ff +size 6112 diff --git a/data/cache/raw/mbpp/prompt/dataset_info.json b/data/cache/raw/mbpp/prompt/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/prompt/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/prompt/state.json b/data/cache/raw/mbpp/prompt/state.json new file mode 100644 index 0000000000000000000000000000000000000000..991f78ccc81d3352cf78e4c116d4bec061effd43 --- /dev/null +++ b/data/cache/raw/mbpp/prompt/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186836c8bbd590862fcd91367aec27d3ec52ff571d1b2b35410003437819f419 +size 263 diff --git a/data/cache/raw/mbpp/test/data-00000-of-00001.arrow b/data/cache/raw/mbpp/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..578fc2ad3fe9224fec0f06bf7df5ca9271cbd146 --- /dev/null +++ b/data/cache/raw/mbpp/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7af94cfc4affeacd0bc887e741770b414764c868c8bf08485ea03c1a5f99b38 +size 245680 diff --git a/data/cache/raw/mbpp/test/dataset_info.json b/data/cache/raw/mbpp/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/test/state.json b/data/cache/raw/mbpp/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..00ec1ee66f100b3cf89c9f109a1cf961bd2e0929 --- /dev/null 
+++ b/data/cache/raw/mbpp/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b14c4aaed028a8972650cc53715f48bcdfb497befc97db690183aa0cd60b183 +size 261 diff --git a/data/cache/raw/mbpp/train/data-00000-of-00001.arrow b/data/cache/raw/mbpp/train/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..cc49abb7dad06fa070d2ea72f51aecf8f766aa04 --- /dev/null +++ b/data/cache/raw/mbpp/train/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd85255cf0fad7b11f3b39233045a0ab1799c4fe51846ec57946e0abe59ed70 +size 178448 diff --git a/data/cache/raw/mbpp/train/dataset_info.json b/data/cache/raw/mbpp/train/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/train/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/train/state.json b/data/cache/raw/mbpp/train/state.json new file mode 100644 index 0000000000000000000000000000000000000000..de58b395de36e5c9effc74f1828342cbb76b027d --- /dev/null +++ b/data/cache/raw/mbpp/train/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb788b2e33e4e0f5ccdfdd3f896a09c2c1080d44c73fc57ae2ac7fa7a1034403 +size 262 diff --git a/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow b/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e02be79a52d52de6bf2a2431a2dd312e39215159 --- /dev/null +++ b/data/cache/raw/mbpp/validation/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc2337ee96303f7e94580aaeffc92976fb82c04e2ca2d1203e99a22ce03e408 +size 43960 diff --git a/data/cache/raw/mbpp/validation/dataset_info.json 
b/data/cache/raw/mbpp/validation/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c007eeaa7baea370cfd0b6b79ee443ee76179bda --- /dev/null +++ b/data/cache/raw/mbpp/validation/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e +size 2205 diff --git a/data/cache/raw/mbpp/validation/state.json b/data/cache/raw/mbpp/validation/state.json new file mode 100644 index 0000000000000000000000000000000000000000..389acacc0ecb3982c7f8ff104fadd66630928823 --- /dev/null +++ b/data/cache/raw/mbpp/validation/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d935f09e6248f886ae2e2cbeb7791209c5599fb566f235da2067d112e1a712c9 +size 267 diff --git a/data/cache/raw/openai_humaneval/dataset_dict.json b/data/cache/raw/openai_humaneval/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb9ae1016128a0658b272ec4d8ca727cbcc2770 --- /dev/null +++ b/data/cache/raw/openai_humaneval/dataset_dict.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24761d3c08510c22918dfca6bc94100b9e5cff3fa323b34f2cc11916fcf69064 +size 20 diff --git a/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow b/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2948fd38b4bc54bc55fe7e5a888a8b44b28eef97 --- /dev/null +++ b/data/cache/raw/openai_humaneval/test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f52943a6aa89b8d973a477910be07b0d41e31f6c3df276a61db6164910cb223d +size 195528 diff --git a/data/cache/raw/openai_humaneval/test/dataset_info.json b/data/cache/raw/openai_humaneval/test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..359b8034b44973e6ed14cecab6d32c1742ac19ad --- /dev/null +++ 
b/data/cache/raw/openai_humaneval/test/dataset_info.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafb7b223551699ced1aca81a07bc5c4232fe19f4a9893a91a00cb52f5390645 +size 1166 diff --git a/data/cache/raw/openai_humaneval/test/state.json b/data/cache/raw/openai_humaneval/test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c2e0699ea81c16299bd3b13097b36f9c27013ba6 --- /dev/null +++ b/data/cache/raw/openai_humaneval/test/state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef586a06deb0be67375e335d6eac139383c6a5bdda046e79e58e0b76ec68551 +size 261 diff --git a/data/external/component2_tokenizer_sample.jsonl b/data/external/component2_tokenizer_sample.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a29594c5079c772996317dad5be756c24334c2e --- /dev/null +++ b/data/external/component2_tokenizer_sample.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:438bfccb48cfc64cc0b96e40a5b773544edcdfc83807c8a59223c775206095ab +size 507 diff --git a/data/interim/combined_clean.jsonl b/data/interim/combined_clean.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cc72e1a47f512e8c9eaee8d6849205a2c079602 --- /dev/null +++ b/data/interim/combined_clean.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba34da6a4c4d42cba198caa9c252a62f42208ee1b6c9153ec945e6a3dc2f7572 +size 77049613 diff --git a/data/interim/dedupe_hashes.sqlite b/data/interim/dedupe_hashes.sqlite new file mode 100644 index 0000000000000000000000000000000000000000..7aedc2571bb34b945276d075b96c59c78993d465 --- /dev/null +++ b/data/interim/dedupe_hashes.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5af933cbb66657edd8b415ee9bf1c6a5cc8b8abe9857cf48e9d0ee1f5b8342e +size 21954560 diff --git a/data/interim/dedupe_hashes_incremental.sqlite b/data/interim/dedupe_hashes_incremental.sqlite new file mode 
100644 index 0000000000000000000000000000000000000000..cf315de2841b631aa4e434db527f71bffd2b1e7f --- /dev/null +++ b/data/interim/dedupe_hashes_incremental.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb7f733a85d4a447c274aaff3cc8c334c55f4fb7a866cd4ef6f61364afd3edb8 +size 21905408 diff --git a/data/processed/pipeline_stats.json b/data/processed/pipeline_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..25ee5a8ae4485a702e4f7b5d6d3c82080110b5fe --- /dev/null +++ b/data/processed/pipeline_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d426f14abb3e473c92616068dc74d77eb327c5a9e42987ddf342d3b1eaa76b30 +size 216 diff --git a/data/processed/pipeline_stats.json.bak b/data/processed/pipeline_stats.json.bak new file mode 100644 index 0000000000000000000000000000000000000000..f5276a2fadb080a896c9b387ec694bfc5bd29a78 --- /dev/null +++ b/data/processed/pipeline_stats.json.bak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42bda6bd9ad255ca525f02954c9a67d2c9a90d14050e1bb68db1eccba8b05766 +size 212 diff --git a/data/processed/train_tokenized.jsonl b/data/processed/train_tokenized.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b978067c2e8a15c696d52f33a99cc3595f7164e6 --- /dev/null +++ b/data/processed/train_tokenized.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7316e1650b4a9ba1a19de2d83d957bd4d0e089b3ab2de076293ecc4760310e45 +size 333825717 diff --git a/data/processed/train_tokenized.jsonl.bak b/data/processed/train_tokenized.jsonl.bak new file mode 100644 index 0000000000000000000000000000000000000000..e54eaf6c757f01d1bbcbdcd4cc86f4d519a36270 --- /dev/null +++ b/data/processed/train_tokenized.jsonl.bak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d77d82d56e0c041c05f7978c85d4921bed85ba92f3b75ec934d640e16dde7c5c +size 333610086 diff --git a/data/raw/custom_finetune_pairs.jsonl 
b/data/raw/custom_finetune_pairs.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..847c43db76f1378f39f79e1e5200922f9b8a9bff --- /dev/null +++ b/data/raw/custom_finetune_pairs.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab1ceab4d5a85de0c15a54f6420c483e78de5db4b5654dc5d34aa1d02893921 +size 451 diff --git a/data/train.jsonl b/data/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d17d1863b85c74ce75290bbb6b5ce2af1fd6020d --- /dev/null +++ b/data/train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b651ce4bdc47cb7f105a8f2dd110f14bae1bfb98507d055e875e8afb9879d7 +size 44122145 diff --git a/data_fetch.py b/data_fetch.py new file mode 100644 index 0000000000000000000000000000000000000000..e166c402748d6c644003866ff2dcf3ef6417f577 --- /dev/null +++ b/data_fetch.py @@ -0,0 +1,222 @@ +import argparse +import hashlib +from pathlib import Path +from typing import Dict, List, Optional + +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk + +from config import DATA_CONFIG, PATHS +from utils import ensure_dirs, setup_logger, write_jsonl + + +def _normalize_text(text: Optional[str]) -> str: + if not text: + return "" + return " ".join(str(text).strip().split()) + + +def _quality_ok(sample: Dict[str, str]) -> bool: + instruction = _normalize_text(sample.get("instruction")) + output = _normalize_text(sample.get("output")) + + if not instruction or not output: + return False + if len(output) < DATA_CONFIG.min_output_chars: + return False + + lowered = output.lower() + bad_tokens = ("todo", "fixme", "coming soon", "not implemented") + if any(tok in lowered for tok in bad_tokens): + return False + if output.strip() in {"pass", "...", "return ..."}: + return False + return True + + +def _to_record(instruction: str, input_text: str, output_text: str) -> Dict[str, str]: + return { + "instruction": instruction.strip(), + "input": input_text.strip(), 
+ "output": output_text.strip(), + } + + +def _save_dataset_for_offline(ds_obj, save_path: Path) -> None: + if save_path.exists(): + return + save_path.parent.mkdir(parents=True, exist_ok=True) + ds_obj.save_to_disk(str(save_path)) + + +def _load_or_download(dataset_name: str, cache_path: Path, **kwargs): + if cache_path.exists(): + return load_from_disk(str(cache_path)) + + dataset_obj = load_dataset(dataset_name, **kwargs) + _save_dataset_for_offline(dataset_obj, cache_path) + return dataset_obj + + +def _load_or_download_codesearchnet(cache_path: Path, subset: str = "python"): + if cache_path.exists(): + return load_from_disk(str(cache_path)) + + ds = load_dataset("code_search_net", subset) + _save_dataset_for_offline(ds, cache_path) + return ds + + +def _extract_humaneval(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + split = ds_obj["test"] if isinstance(ds_obj, DatasetDict) else ds_obj + + for item in split: + prompt = item.get("prompt", "") + solution = item.get("canonical_solution", "") + if "def " not in prompt: + continue + rows.append( + _to_record( + instruction="Complete the Python function so it satisfies the specification.", + input_text=prompt, + output_text=solution, + ) + ) + if len(rows) >= max_samples: + break + return rows + + +def _extract_mbpp(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + splits = [] + if isinstance(ds_obj, DatasetDict): + splits = [ds_obj[k] for k in ds_obj.keys()] + else: + splits = [ds_obj] + + for split in splits: + for item in split: + task = item.get("text", "") + code = item.get("code", "") + tests = item.get("test_list", []) + if not task or not code: + continue + test_blob = "\n".join(tests) if isinstance(tests, list) else str(tests) + input_text = f"Task:\n{task}\n\nTests:\n{test_blob}".strip() + rows.append( + _to_record( + instruction="Write Python code that solves the problem and passes the tests.", + input_text=input_text, + 
output_text=code, + ) + ) + if len(rows) >= max_samples: + return rows + return rows + + +def _extract_codesearchnet(ds_obj, max_samples: int) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + splits = [] + if isinstance(ds_obj, DatasetDict): + for split_name in ("train", "validation"): + if split_name in ds_obj: + splits.append(ds_obj[split_name]) + else: + splits = [ds_obj] + + for split in splits: + for item in split: + language = str(item.get("language", "")).lower() + if language and language != "python": + continue + + docstring = item.get("docstring", "") or item.get("func_documentation_string", "") + code = item.get("whole_func_string", "") or item.get("code", "") + if not docstring or not code: + continue + if "def " not in code and "class " not in code: + continue + + rows.append( + _to_record( + instruction="Write Python code that matches the following docstring.", + input_text=docstring, + output_text=code, + ) + ) + if len(rows) >= max_samples: + return rows + return rows + + +def _dedupe_and_filter(rows: List[Dict[str, str]], max_total: int) -> List[Dict[str, str]]: + seen = set() + clean_rows: List[Dict[str, str]] = [] + for row in rows: + if not _quality_ok(row): + continue + digest = hashlib.sha256( + f"{row['instruction']}||{row['input']}||{row['output']}".encode("utf-8") + ).hexdigest() + if digest in seen: + continue + seen.add(digest) + clean_rows.append(row) + if len(clean_rows) >= max_total: + break + return clean_rows + + +def fetch_and_prepare_dataset(offline_only: bool = False) -> Path: + ensure_dirs([PATHS.data_dir, PATHS.dataset_cache_dir, PATHS.raw_dataset_dir, PATHS.logs_dir]) + logger = setup_logger("data_fetch", PATHS.logs_dir / "data_fetch.log") + + logger.info("Loading datasets (offline_only=%s).", offline_only) + + humaneval_cache = PATHS.raw_dataset_dir / "openai_humaneval" + mbpp_cache = PATHS.raw_dataset_dir / "mbpp" + csn_cache = PATHS.raw_dataset_dir / "code_search_net_python" + + if offline_only: + if not 
humaneval_cache.exists() or not mbpp_cache.exists() or not csn_cache.exists(): + raise FileNotFoundError( + "Offline mode requested but one or more cached datasets are missing. " + "Run without --offline first." + ) + humaneval_ds = load_from_disk(str(humaneval_cache)) + mbpp_ds = load_from_disk(str(mbpp_cache)) + csn_ds = load_from_disk(str(csn_cache)) + else: + humaneval_ds = _load_or_download("openai_humaneval", humaneval_cache) + mbpp_ds = _load_or_download("mbpp", mbpp_cache) + csn_ds = _load_or_download_codesearchnet(csn_cache, subset="python") + + rows = [] + rows.extend(_extract_humaneval(humaneval_ds, DATA_CONFIG.max_humaneval_samples)) + rows.extend(_extract_mbpp(mbpp_ds, DATA_CONFIG.max_mbpp_samples)) + rows.extend(_extract_codesearchnet(csn_ds, DATA_CONFIG.max_codesearchnet_samples)) + + clean_rows = _dedupe_and_filter(rows, DATA_CONFIG.max_total_samples) + write_jsonl(PATHS.train_jsonl, clean_rows) + + logger.info("Saved %d cleaned training rows to %s", len(clean_rows), PATHS.train_jsonl) + print(f"Saved dataset: {PATHS.train_jsonl.resolve()}") + return PATHS.train_jsonl + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Download and prepare Python fine-tuning data.") + parser.add_argument( + "--offline", + action="store_true", + help="Use only previously saved local dataset cache.", + ) + return parser + + +if __name__ == "__main__": + args = _build_arg_parser().parse_args() + fetch_and_prepare_dataset(offline_only=args.offline) + diff --git a/dataset.py b/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..eb38e4e476f2fa7eef14df6461f6d56228225914 --- /dev/null +++ b/dataset.py @@ -0,0 +1,56 @@ +from typing import Dict, List + +import torch +from torch.utils.data import Dataset + +from config import PATHS, TRAINING_CONFIG +from utils import read_jsonl + + +def format_prompt(instruction: str, input_text: str, output_text: str) -> str: + return ( + f"### 
Instruction:\n{instruction}\n" + f"### Input:\n{input_text}\n" + f"### Response:\n{output_text}" + ) + + +class LocalJsonlInstructionDataset(Dataset): + def __init__(self, tokenizer, max_length: int = TRAINING_CONFIG.max_length): + self.tokenizer = tokenizer + self.max_length = max_length + self.samples: List[Dict[str, str]] = read_jsonl(PATHS.train_jsonl) + + if not self.samples: + raise ValueError(f"No training samples found in {PATHS.train_jsonl}") + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + sample = self.samples[idx] + text = format_prompt( + instruction=sample["instruction"], + input_text=sample["input"], + output_text=sample["output"], + ) + encoded = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + padding="max_length", + return_tensors="pt", + ) + + input_ids = encoded["input_ids"].squeeze(0) + attention_mask = encoded["attention_mask"].squeeze(0) + + labels = input_ids.clone() + labels[attention_mask == 0] = -100 + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } + diff --git a/hf_release/MINDI-1.0-420M/LICENSE b/hf_release/MINDI-1.0-420M/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1ddde814d0a4676002fd6703363cc362d9d195af --- /dev/null +++ b/hf_release/MINDI-1.0-420M/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 MINDI 1.0 420M Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies 
or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/hf_release/MINDI-1.0-420M/README.md b/hf_release/MINDI-1.0-420M/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6bbdeb15a5e1d5dac7038a8aa8e776d94f5b0315 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/README.md @@ -0,0 +1,81 @@ +--- +license: mit +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- code +- python +- javascript +- local-llm +- offline +--- + +# MINDI 1.0 420M + +MINDI 1.0 420M is a 420M-parameter coding language model focused on Python first and JavaScript second. +It is built for local, offline code generation workflows. 
+ +## Capabilities + +- Code generation from natural language prompts +- Code completion +- Bug-fix suggestions +- Code explanation + +## Model Details + +- Parameters: 423,934,848 +- Architecture: Decoder-only Transformer +- Context length: 2048 tokens +- Focus languages: Python, JavaScript + +## Hardware Requirements + +Recommended: +- NVIDIA GPU with 8GB+ VRAM +- CUDA-enabled PyTorch + +Minimum: +- CPU inference works but is slower + +## Quick Start (GPU) + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +repo_id = "YOUR_USERNAME/MINDI-1.0-420M" + +tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + repo_id, + trust_remote_code=True, + torch_dtype=torch.float16, +).cuda() + +prompt = "Write a Python function to check if a string is a palindrome." +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") + +with torch.no_grad(): + output = model.generate( + **inputs, + max_new_tokens=220, + temperature=0.2, + top_p=0.9, + do_sample=True, + ) + +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Limitations + +- The model can still produce syntax or logic errors. +- Generated code should always be reviewed and tested. +- Not intended for safety-critical production use without validation. + +## Safety + +Always run tests and static checks before using generated code in production. diff --git a/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 b/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..36e1a9447eecdf4905c473b8a75e6784ba204737 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/UPLOAD_TO_HF.ps1 @@ -0,0 +1,6 @@ +# Upload helper for MINDI 1.0 420M +# Run from PowerShell. + +huggingface-cli login +huggingface-cli repo create MINDI-1.0-420M --type model --public +huggingface-cli upload YOUR_USERNAME/MINDI-1.0-420M "C:\AI 2\hf_release\MINDI-1.0-420M" . 
--repo-type model diff --git a/hf_release/MINDI-1.0-420M/config.json b/hf_release/MINDI-1.0-420M/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e911be583da9cf92057d9e1719a8be97614a38 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/config.json @@ -0,0 +1,29 @@ +{ + "model_type": "mindi", + "architectures": [ + "MindiForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_mindi.MindiConfig", + "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM", + "AutoTokenizer": [ + null, + "tokenization_mindi.MindiTokenizer" + ] + }, + "vocab_size": 50000, + "max_seq_len": 2048, + "d_model": 1152, + "n_layers": 23, + "n_heads": 16, + "d_ff": 4608, + "dropout": 0.1, + "tie_embeddings": true, + "init_std": 0.02, + "rms_norm_eps": 1e-05, + "bos_token_id": 2, + "eos_token_id": 3, + "pad_token_id": 0, + "torch_dtype": "float16", + "transformers_version": "4.46.3" +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/configuration_mindi.py b/hf_release/MINDI-1.0-420M/configuration_mindi.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed08ab6765e3bd37e2d68ec577af3c269ef031 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/configuration_mindi.py @@ -0,0 +1,38 @@ +""" +Hugging Face config class for MINDI 1.0 420M. 
+""" + +from transformers import PretrainedConfig + + +class MindiConfig(PretrainedConfig): + model_type = "mindi" + + def __init__( + self, + vocab_size=50000, + max_seq_len=2048, + d_model=1152, + n_layers=23, + n_heads=16, + d_ff=4608, + dropout=0.1, + tie_embeddings=True, + init_std=0.02, + rms_norm_eps=1e-5, + bos_token_id=2, + eos_token_id=3, + pad_token_id=0, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.max_seq_len = max_seq_len + self.d_model = d_model + self.n_layers = n_layers + self.n_heads = n_heads + self.d_ff = d_ff + self.dropout = dropout + self.tie_embeddings = tie_embeddings + self.init_std = init_std + self.rms_norm_eps = rms_norm_eps diff --git a/hf_release/MINDI-1.0-420M/generation_config.json b/hf_release/MINDI-1.0-420M/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..75f05d0257daf3ce7641e5ee44eae81599e17bcf --- /dev/null +++ b/hf_release/MINDI-1.0-420M/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 2, + "eos_token_id": 3, + "pad_token_id": 0, + "max_new_tokens": 220, + "temperature": 0.2, + "top_p": 0.9, + "do_sample": true +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/model.safetensors b/hf_release/MINDI-1.0-420M/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aee736c39b988099c9d0f57143ee9fb1e346a63a --- /dev/null +++ b/hf_release/MINDI-1.0-420M/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d5df76ccfe5be47eaf94b1d58eec9b36276c4c1c2bb235766c766e1dd838a0 +size 1695758072 diff --git a/hf_release/MINDI-1.0-420M/modeling_mindi.py b/hf_release/MINDI-1.0-420M/modeling_mindi.py new file mode 100644 index 0000000000000000000000000000000000000000..434f0953f895c0724514c3d4f9d0ef0a102aaf5c --- /dev/null +++ b/hf_release/MINDI-1.0-420M/modeling_mindi.py @@ -0,0 +1,219 @@ 
+""" +Hugging Face model class for MINDI 1.0 420M. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .configuration_mindi import MindiConfig + + +@dataclass +class _Cfg: + vocab_size: int + max_seq_len: int + d_model: int + n_layers: int + n_heads: int + d_ff: int + dropout: float + tie_embeddings: bool + init_std: float + rms_norm_eps: float + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads") + return self.d_model // self.n_heads + + +class RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) + return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin) + + @staticmethod + def _apply_rotary(x: 
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position encoding."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.head_dim
        self.scale = self.head_dim ** -0.5
        # Separate, bias-free projections for query/key/value/output.
        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)
        self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, tokens, _ = x.shape

        def split_heads(proj: nn.Linear) -> torch.Tensor:
            # (B, T, D) -> (B, H, T, head_dim)
            return proj(x).view(batch, tokens, self.n_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj)
        k = split_heads(self.k_proj)
        v = split_heads(self.v_proj)
        # Rotary positions are applied to queries and keys only.
        q, k = self.rotary(q, k, seq_len=tokens)

        # Fused attention kernel: causal masking is handled internally,
        # and attention dropout is active only in training mode.
        attended = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=True,
            scale=self.scale,
        )
        merged = attended.transpose(1, 2).contiguous().view(batch, tokens, -1)
        return self.o_proj(merged)


class FeedForward(nn.Module):
    """Position-wise MLP: linear -> tanh-approximated GELU -> linear -> dropout."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.fc1(x), approximate="tanh")
        return self.dropout(self.fc2(hidden))
class MindiForCausalLM(PreTrainedModel):
    """Decoder-only causal language model for MINDI 1.0 (~420M params).

    Pipeline: token embedding -> dropout -> ``n_layers`` pre-norm
    transformer blocks -> final RMSNorm -> LM head (weight-tied to the
    embedding when ``tie_embeddings`` is set).

    NOTE(review): ``forward`` deliberately discards ``attention_mask``
    (see the ``del`` below), so padded positions are attended like real
    tokens — batched padded input will not be masked; confirm callers
    only send single sequences or accept this. No KV cache is kept:
    ``prepare_inputs_for_generation`` always returns the full
    ``input_ids``, so each generation step recomputes the whole prefix.
    """

    config_class = MindiConfig
    base_model_prefix = "mindi"
    supports_gradient_checkpointing = False

    def __init__(self, config: MindiConfig):
        super().__init__(config)
        # Mirror the HF config into the lightweight internal dataclass
        # consumed by the submodules.
        cfg = _Cfg(
            vocab_size=config.vocab_size,
            max_seq_len=config.max_seq_len,
            d_model=config.d_model,
            n_layers=config.n_layers,
            n_heads=config.n_heads,
            d_ff=config.d_ff,
            dropout=config.dropout,
            tie_embeddings=config.tie_embeddings,
            init_std=config.init_std,
            rms_norm_eps=config.rms_norm_eps,
        )

        self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        # Tie input and output embeddings (one shared weight matrix).
        # Done before post_init so re-initialization keeps them tied.
        if cfg.tie_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

        # Runs HF weight init (dispatches to _init_weights) and bookkeeping.
        self.post_init()

    def _init_weights(self, module: nn.Module) -> None:
        # Plain normal(0, init_std) init for linear and embedding weights;
        # other module types (RMSNorm, Dropout) keep their own defaults.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Run the decoder stack; returns logits and, if labels, the LM loss.

        ``attention_mask`` is accepted for HF API compatibility but
        ignored — only the causal mask inside the attention blocks
        applies. Labels use the standard shift-by-one next-token
        objective with ``ignore_index=-100``.
        """
        # attention_mask intentionally unused (see class docstring).
        del attention_mask, kwargs

        x = self.embed_tokens(input_ids)
        x = self.dropout(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm_final(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Shift so that position t predicts token t+1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        # past_key_values omitted: this implementation keeps no KV cache.
        return CausalLMOutputWithPast(loss=loss, logits=logits)

    @torch.no_grad()
    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs):
        # No cache: the full sequence is fed back in on every decode step.
        del kwargs
        return {"input_ids": input_ids}
+""" + +from pathlib import Path +from transformers import PreTrainedTokenizerFast + + +class MindiTokenizer(PreTrainedTokenizerFast): + vocab_files_names = {"tokenizer_file": "tokenizer.json"} + model_input_names = ["input_ids", "attention_mask"] + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + if kwargs.get("tokenizer_file") is None: + local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json" + if local_candidate.exists(): + kwargs["tokenizer_file"] = str(local_candidate) + return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + def __init__(self, tokenizer_file=None, **kwargs): + name_or_path = kwargs.pop("name_or_path", None) + if tokenizer_file is None and name_or_path is not None: + candidate = Path(name_or_path) / "tokenizer.json" + if candidate.exists(): + tokenizer_file = str(candidate) + if tokenizer_file is None: + tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json") + kwargs.setdefault("bos_token", "") + kwargs.setdefault("eos_token", "") + kwargs.setdefault("unk_token", "") + kwargs.setdefault("pad_token", "") + super().__init__(tokenizer_file=tokenizer_file, **kwargs) diff --git a/hf_release/MINDI-1.0-420M/tokenizer.json b/hf_release/MINDI-1.0-420M/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4798c7bfd1002d16664c9d8bec52763fdbc3fe48 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/tokenizer.json @@ -0,0 +1,799 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFKC" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Split", + "pattern": { + "Regex": "([()\\[\\]{}.,:;])" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Metaspace", + "replacement": "_", + "prepend_scheme": "always", + "split": true + } + ] + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "Sequence": { 
+ "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 3 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": { + "type": "BPEDecoder", + "suffix": "" + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 8, + "": 9, + "": 10, + "(": 11, + ")": 12, + "+": 13, + ",": 14, + ".": 15, + "0": 16, + "4": 17, + "5": 18, + ":": 19, + ";": 20, + "<": 21, + "=": 22, + ">": 23, + "A": 24, + "C": 25, + "D": 26, + "E": 27, + "F": 28, + "H": 29, + "I": 30, + "J": 31, + "L": 32, + "M": 33, + "N": 34, + "O": 35, + "P": 36, + "R": 37, + "S": 38, + "T": 39, + "V": 40, + "W": 41, + "Y": 42, + "_": 43, + "a": 44, + "b": 45, + "c": 46, + "d": 47, + "e": 48, + "f": 49, + "g": 50, + "h": 51, + "i": 52, + "l": 53, + "m": 54, + "n": 55, + "o": 56, + "p": 57, + "r": 58, + "s": 59, + "t": 60, + "u": 61, + "v": 62, + "w": 63, + "x": 64, + "y": 65, + "{": 66, + "}": 67, + "_<": 68, + "DE": 69, + "T>": 70, + "_a": 71, + "L>": 72, + "NL>": 73, + "_": 74, + "NT>": 75, + "_t": 76, + "DENT>": 77, + "_i": 78, + "PT>": 79, + "_(": 80, + "_)": 81, + "on": 82, + "_": 90, + "OMPT>": 91, + "ROMPT>": 92, + "_;": 93, + "_b": 94, + "at": 95, + "_": 99, + "_to": 100, + "_": 101, + "_lo": 102, + "_": 103, + "_": 104, + "_": 105, + "_+": 106, + "_0": 107, + "_re": 108, + "ct": 109, + "dd": 110, + "ion": 111, + "nct": 112, + "rn": 113, + "tu": 114, + "unct": 115, + "va": 116, + "_add": 117, + "_th": 118, + "_funct": 119, + "_retu": 120, + "_function": 121, + "_return": 122, + "AS": 123, + "AV": 124, + "CR": 125, + "Cre": 126, + "HO": 127, + "IPT>": 128, + "Ja": 129, + "JAV": 130, + "N>": 131, + "Py": 
132, + "Sc": 133, + "THO": 134, + "YTHO": 135, + "_,": 136, + "_4": 137, + "_5": 138, + "_:": 139, + "_p": 140, + "_{": 141, + "_}": 142, + "_Cre": 143, + "_Ja": 144, + "_Py": 145, + "hon": 146, + "nt": 147, + "op": 148, + "or": 149, + "pt": 150, + "thon": 151, + "_": 168, + "_JavaScript": 169, + "_": 170 + }, + "merges": [ + [ + "_", + "<" + ], + [ + "D", + "E" + ], + [ + "T", + ">" + ], + [ + "_", + "a" + ], + [ + "L", + ">" + ], + [ + "N", + "L>" + ], + [ + "_<", + "NL>" + ], + [ + "N", + "T>" + ], + [ + "_", + "t" + ], + [ + "DE", + "NT>" + ], + [ + "_", + "i" + ], + [ + "P", + "T>" + ], + [ + "_", + "(" + ], + [ + "_", + ")" + ], + [ + "o", + "n" + ], + [ + "_<", + "P" + ], + [ + "_", + "f" + ], + [ + "_", + "l" + ], + [ + "r", + "e" + ], + [ + "r", + "i" + ], + [ + "C", + "O" + ], + [ + "I", + "N" + ], + [ + "M", + "PT>" + ], + [ + "O", + "MPT>" + ], + [ + "R", + "OMPT>" + ], + [ + "_", + ";" + ], + [ + "_", + "b" + ], + [ + "a", + "t" + ], + [ + "_<", + "DE" + ], + [ + "_<", + "CO" + ], + [ + "_<", + "IN" + ], + [ + "DE", + ">" + ], + [ + "_t", + "o" + ], + [ + "_" + ], + [ + "_l", + "o" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_", + "+" + ], + [ + "_", + "0" + ], + [ + "_", + "re" + ], + [ + "c", + "t" + ], + [ + "d", + "d" + ], + [ + "i", + "on" + ], + [ + "n", + "ct" + ], + [ + "r", + "n" + ], + [ + "t", + "u" + ], + [ + "u", + "nct" + ], + [ + "v", + "a" + ], + [ + "_a", + "dd" + ], + [ + "_t", + "h" + ], + [ + "_f", + "unct" + ], + [ + "_re", + "tu" + ], + [ + "_funct", + "ion" + ], + [ + "_retu", + "rn" + ], + [ + "A", + "S" + ], + [ + "A", + "V" + ], + [ + "C", + "R" + ], + [ + "C", + "re" + ], + [ + "H", + "O" + ], + [ + "I", + "PT>" + ], + [ + "J", + "a" + ], + [ + "J", + "AV" + ], + [ + "N", + ">" + ], + [ + "P", + "y" + ], + [ + "S", + "c" + ], + [ + "T", + "HO" + ], + [ + "Y", + "THO" + ], + [ + "_", + "," + ], + [ + "_", + "4" + ], + [ + "_", + "5" + ], + [ + "_", + ":" + ], + [ + "_", + "p" + ], + [ + "_", + "{" + ], + [ + "_", 
+ "}" + ], + [ + "_", + "Cre" + ], + [ + "_", + "Ja" + ], + [ + "_", + "Py" + ], + [ + "h", + "on" + ], + [ + "n", + "t" + ], + [ + "o", + "p" + ], + [ + "o", + "r" + ], + [ + "p", + "t" + ], + [ + "t", + "hon" + ], + [ + "_<", + "JAV" + ], + [ + "_" + ], + [ + "_JavaSc", + "ript" + ], + [ + "_" + ] + ] + } +} \ No newline at end of file diff --git a/hf_release/MINDI-1.0-420M/tokenizer_config.json b/hf_release/MINDI-1.0-420M/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ccb8bd39e3cc06f433c12ec89c39a6d9651162 --- /dev/null +++ b/hf_release/MINDI-1.0-420M/tokenizer_config.json @@ -0,0 +1,17 @@ +{ + "tokenizer_class": "MindiTokenizer", + "model_max_length": 2048, + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + "tokenizer_file": "tokenizer.json", + "auto_map": { + "AutoTokenizer": [ + null, + "tokenization_mindi.MindiTokenizer" + ] + }, + "padding_side": "right", + "truncation_side": "right" +} \ No newline at end of file diff --git a/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 b/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..4e710be0a3e967fa1a8f6630e13ecbbc7168721c --- /dev/null +++ b/hf_space/MINDI-1.0-420M/DEPLOY_SPACE.ps1 @@ -0,0 +1,97 @@ +param( + [string]$SpaceRepoId = "Mindigenous/MINDI-1.0-420M", + [string]$SpaceFolder = "C:\AI 2\hf_space\MINDI-1.0-420M", + [string]$Token = "" +) + +$ErrorActionPreference = "Stop" +$py = "C:\AI 2\.venv\Scripts\python.exe" + +function Run-Python { + param([string]$Code) + $Code | & $py - + if ($LASTEXITCODE -ne 0) { + throw "Python command failed." + } +} + +Write-Host "[1/5] Checking/installing huggingface_hub..." +& $py -m pip install "huggingface_hub<1.0,>=0.36.2" | Out-Host + +if (-not (Test-Path $SpaceFolder)) { + throw "Space folder not found: $SpaceFolder" +} + +Write-Host "[2/5] Checking login..." 
+$loginCheck = @' +from huggingface_hub import whoami +try: + info = whoami() + print("LOGGED_IN", info.get("name", "unknown")) +except Exception: + print("NEED_LOGIN") +'@ +$loginResult = $loginCheck | & $py - +if ($LASTEXITCODE -ne 0) { throw "Login check failed." } + +$needsLogin = ($loginResult -match "NEED_LOGIN") + +if ($needsLogin) { + if ([string]::IsNullOrWhiteSpace($Token)) { + Write-Host "NEED_TOKEN_LOGIN" + exit 42 + } + + Write-Host "Logging in with provided token..." + $loginCode = @" +from huggingface_hub import login +login(token="$Token", add_to_git_credential=False) +print("LOGIN_OK") +"@ + Run-Python -Code $loginCode +} + +Write-Host "[3/5] Creating/ensuring Space repo..." +$createCode = @" +from huggingface_hub import HfApi +api = HfApi() +api.create_repo( + repo_id="$SpaceRepoId", + repo_type="space", + private=False, + space_sdk="gradio", + exist_ok=True, +) +print("SPACE_READY") +"@ +Run-Python -Code $createCode + +Write-Host "[4/5] Uploading Space files..." +$uploadCode = @" +from huggingface_hub import HfApi +api = HfApi() +api.upload_folder( + folder_path=r"$SpaceFolder", + repo_id="$SpaceRepoId", + repo_type="space", + commit_message="Deploy MINDI 1.0 420M Space", +) +print("UPLOAD_OK") +"@ +Run-Python -Code $uploadCode + +Write-Host "[5/5] Verifying live files..." +$verifyCode = @" +from huggingface_hub import list_repo_files +repo_id = "$SpaceRepoId" +files = list_repo_files(repo_id, repo_type="space") +required = ["app.py", "requirements.txt", "README.md"] +missing = [f for f in required if f not in files] +print("FILES_COUNT", len(files)) +print("MISSING", missing) +print("SPACE_URL", f"https://huggingface.co/spaces/{repo_id}") +"@ +Run-Python -Code $verifyCode + +Write-Host "Deployment completed successfully." 
+ diff --git a/hf_space/MINDI-1.0-420M/README.md b/hf_space/MINDI-1.0-420M/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7efa7722013e11576a7738b9d0cf64f1c1bb6d2f --- /dev/null +++ b/hf_space/MINDI-1.0-420M/README.md @@ -0,0 +1,37 @@ +--- +title: MINDI 1.0 420M +emoji: "💻" +colorFrom: gray +colorTo: blue +sdk: gradio +sdk_version: 5.5.0 +app_file: app.py +pinned: false +license: mit +--- + +# MINDI 1.0 420M Space + +This Space serves **MINDI 1.0 420M** as a browser-based coding assistant. + +## Model + +- Model repo: `Mindigenous/MINDI-1.0-420M` +- Focus: Python first, JavaScript second +- Use cases: code generation, completion, bug-fix suggestions, explanation + +## Notes for Free CPU Tier + +- First load can be slow because the model is large. +- Inference latency is expected on CPU. +- For faster responses, upgrade Space hardware later. + +## Example Prompts + +- `Write a Python function to merge two sorted lists.` +- `Fix this JavaScript debounce function and explain the bug.` +- `Implement BFS on an adjacency list in Python.` + +## Safety + +Always review and test generated code before production use. diff --git a/hf_space/MINDI-1.0-420M/app.py b/hf_space/MINDI-1.0-420M/app.py new file mode 100644 index 0000000000000000000000000000000000000000..699e5b40bc155eac9c97a39680cc82cf72563f76 --- /dev/null +++ b/hf_space/MINDI-1.0-420M/app.py @@ -0,0 +1,246 @@ +""" +Hugging Face Space app for MINDI 1.0 420M. +This app loads the public model repo and serves a coding-focused chat UI. 
+""" + +from __future__ import annotations + +import re +import time +from functools import lru_cache +from typing import Any, Dict, List + +import gradio as gr +import torch +from huggingface_hub import hf_hub_download +from transformers import PreTrainedTokenizerFast +from transformers import AutoModelForCausalLM + +MODEL_ID = "Mindigenous/MINDI-1.0-420M" +MAX_CONTEXT_CHARS = 2400 +CPU_MAX_NEW_TOKENS = 96 +CPU_MAX_TIME_SECONDS = 20.0 + + +def _looks_like_coding_prompt(text: str) -> bool: + """Simple keyword gate so random chat gets a helpful coding-only response.""" + text_l = text.lower() + keywords = [ + "python", "javascript", "js", "function", "bug", "error", "traceback", + "class", "loop", "array", "dict", "api", "sql", "regex", "algorithm", + "code", "implement", "fix", "refactor", "optimize", + ] + return any(k in text_l for k in keywords) + + +def _language_token(language: str) -> str: + return "" if language.lower().startswith("java") else "" + + +def _cleanup_generated_text(text: str) -> str: + """Remove training markers and keep the code region only.""" + # Prefer output after the marker when present. + if "" in text: + text = text.split("", 1)[1] + + # Remove known special tokens but keep real code content. + text = re.sub(r"<(BOS|EOS|PROMPT|CODE|PYTHON|JAVASCRIPT|PAD|UNK|INDENT|DEDENT|NL)>", "", text) + + # Normalize spacing/newlines after marker cleanup. + text = text.replace("\\n", "\n") + text = re.sub(r"\n{3,}", "\n\n", text) + text = text.strip() + + return text + + +def _build_prompt(message: str, history: List[Dict[str, str]], language: str) -> str: + """Build prompt in the same style used during training.""" + # Keep short recent context for better continuity on free CPU. 
+ snippets: List[str] = [] + for item in history[-3:]: + role = item.get("role", "user") + content = (item.get("content") or "").strip() + if content: + prefix = "User" if role == "user" else "Assistant" + snippets.append(f"{prefix}: {content}") + + context_text = "\n".join(snippets) + if len(context_text) > MAX_CONTEXT_CHARS: + context_text = context_text[-MAX_CONTEXT_CHARS:] + + combined = message.strip() + if context_text: + combined = f"Conversation so far:\n{context_text}\n\nCurrent request:\n{message.strip()}" + + return f" {_language_token(language)} {combined} " + + +@lru_cache(maxsize=1) +def _load_runtime() -> tuple[Any, Any, torch.device, torch.dtype]: + """Load tokenizer + model once for the Space process.""" + # Load tokenizer directly from tokenizer.json to avoid custom tokenizer + # path resolution issues in dynamic module cache on Spaces. + tokenizer_file = hf_hub_download(repo_id=MODEL_ID, filename="tokenizer.json") + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=tokenizer_file, + bos_token="", + eos_token="", + unk_token="", + pad_token="", + ) + + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + dtype = torch.float16 if use_cuda else torch.float32 + + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + trust_remote_code=True, + torch_dtype=dtype, + ) + model.to(device) + model.eval() + # Bridge custom config names to standard Transformers generation fields. + if not hasattr(model.config, "num_hidden_layers") and hasattr(model.config, "n_layers"): + model.config.num_hidden_layers = int(model.config.n_layers) + if not hasattr(model.config, "num_attention_heads") and hasattr(model.config, "n_heads"): + model.config.num_attention_heads = int(model.config.n_heads) + if not hasattr(model.config, "hidden_size") and hasattr(model.config, "d_model"): + model.config.hidden_size = int(model.config.d_model) + + # Ensure pad token is defined for stable generation. 
+ if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model, device, dtype + + +def _generate(message: str, history: List[Dict[str, str]], language: str, temperature: float, top_p: float, max_new_tokens: int) -> str: + if not _looks_like_coding_prompt(message): + return "MINDI is coding-focused. Please ask a coding question (Python or JavaScript)." + + tokenizer, model, device, _ = _load_runtime() + + prompt = _build_prompt(message=message, history=history, language=language) + inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512) + # Custom MINDI model forward() does not consume token_type_ids. + if "token_type_ids" in inputs: + inputs.pop("token_type_ids") + inputs = {k: v.to(device) for k, v in inputs.items()} + + eos_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 3 + pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id + + effective_max_new_tokens = int(max_new_tokens) + do_sample = temperature > 0 + max_time = None + if device.type == "cpu": + effective_max_new_tokens = min(effective_max_new_tokens, CPU_MAX_NEW_TOKENS) + do_sample = False + max_time = CPU_MAX_TIME_SECONDS + + start = time.perf_counter() + with torch.no_grad(): + out = model.generate( + **inputs, + max_new_tokens=effective_max_new_tokens, + do_sample=do_sample, + temperature=max(temperature, 1e-5), + top_p=top_p, + eos_token_id=eos_id, + pad_token_id=pad_id, + use_cache=False, + max_time=max_time, + num_beams=1, + ) + elapsed = time.perf_counter() - start + + decoded = tokenizer.decode(out[0], skip_special_tokens=False) + code = _cleanup_generated_text(decoded) + + if not code: + code = "# I could not generate code for that prompt.\n# Try being more specific about input/output behavior." 
def build_demo() -> gr.Blocks:
    """Construct the Gradio Blocks UI for the MINDI chat Space.

    Wires a message-style chatbot, generation-parameter controls, and
    submit/clear handlers around ``_generate``. Returns the Blocks app
    without launching it (the module tail launches when run directly).
    """
    # Dark theme, injected via the Blocks-level `css` argument.
    css = """
    body, .gradio-container {
        background: #0b1220 !important;
        color: #e6edf3 !important;
    }
    .gradio-container {
        font-family: 'Segoe UI', sans-serif;
    }
    #title-block {
        border: 1px solid #1f2a44;
        border-radius: 14px;
        padding: 12px 16px;
        background: linear-gradient(135deg, #111a2c, #0b1220);
    }
    """

    with gr.Blocks(title="MINDI 1.0 420M", theme=gr.themes.Base(), css=css) as demo:
        gr.Markdown(
            "## MINDI 1.0 420M\n"
            "Your local coding intelligence — 420M parameters, fully offline model hosted on Hugging Face.",
            elem_id="title-block",
        )

        # Generation controls; defaults mirror generation_config.json.
        with gr.Row():
            language = gr.Dropdown(
                choices=["python", "javascript"],
                value="python",
                label="Language Focus",
            )
            temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
            max_new_tokens = gr.Slider(32, 192, value=96, step=16, label="Max New Tokens")

        chatbot = gr.Chatbot(type="messages", height=520, label="MINDI Chat")
        msg = gr.Textbox(
            label="Prompt",
            placeholder="Ask MINDI anything about code",
            lines=3,
        )
        clear_btn = gr.Button("Clear")

        def _user(user_message: str, chat_history: List[Dict[str, str]]):
            # Append the user turn and clear the textbox.
            chat_history = chat_history or []
            chat_history.append({"role": "user", "content": user_message})
            return "", chat_history

        def _bot(chat_history: List[Dict[str, str]], lang: str, temp: float, tp: float, mnt: int):
            # Answer the most recent user turn in the history.
            chat_history = chat_history or []
            last_user = ""
            for item in reversed(chat_history):
                if item.get("role") == "user":
                    last_user = item.get("content", "")
                    break
            response = _generate(last_user, chat_history, lang, temp, tp, mnt)
            chat_history.append({"role": "assistant", "content": response})
            return chat_history

        # Two-step submit: echo the user turn immediately (unqueued),
        # then run generation as a chained callback.
        msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False).then(
            _bot,
            [chatbot, language, temperature, top_p, max_new_tokens],
            [chatbot],
        )

        clear_btn.click(lambda: [], None, chatbot, queue=False)

    return demo


# Module-level app object expected by the Spaces runtime.
demo = build_demo()

if __name__ == "__main__":
    # Bounded queue keeps the free CPU tier from piling up requests.
    demo.queue(max_size=16).launch()
sha256:e24ebb82e60ecb4d05713ca1e413f896e6fa83998d317a9715463ec340c3ef29 +size 397 diff --git a/logs/start_mindi_test.out.log b/logs/start_mindi_test.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test10.err.log b/logs/start_mindi_test10.err.log new file mode 100644 index 0000000000000000000000000000000000000000..099d0abee3c088def36dfd3a64509ae07b2d2dbe --- /dev/null +++ b/logs/start_mindi_test10.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564f37db2cc0b0ca3d2960f47e8419114989f3de6a2596782332be13e3bd6a0e +size 2692 diff --git a/logs/start_mindi_test10.out.log b/logs/start_mindi_test10.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test11.err.log b/logs/start_mindi_test11.err.log new file mode 100644 index 0000000000000000000000000000000000000000..fc7d74df0fa83e32218c8d11fb670cd387b7cdf4 --- /dev/null +++ b/logs/start_mindi_test11.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec8dfc4a7f70745d002ddc05a751241f68e8461753773f97555fa1755738443 +size 1813 diff --git a/logs/start_mindi_test11.out.log b/logs/start_mindi_test11.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test2.err.log b/logs/start_mindi_test2.err.log new file mode 100644 index 0000000000000000000000000000000000000000..efe1684b8bd02ac0ed9a0785479baf5eae84da4a --- /dev/null +++ b/logs/start_mindi_test2.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f5126be1d6fb3b72cec93eebaa41b901b1dd4689fa0cf872a6f1b756d59006 +size 224 diff --git a/logs/start_mindi_test2.out.log b/logs/start_mindi_test2.out.log new file mode 100644 index 0000000000000000000000000000000000000000..338fd7760fa44e75fc78fea5e526c36870ebd718 --- 
/dev/null +++ b/logs/start_mindi_test2.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffef22ad744514bfd22ba830666fb356c148b735f961ff6ef1e8b76b63c86c12 +size 159 diff --git a/logs/start_mindi_test3.err.log b/logs/start_mindi_test3.err.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test3.out.log b/logs/start_mindi_test3.out.log new file mode 100644 index 0000000000000000000000000000000000000000..8dd8f06e5ebcde5b0f21f16a20d7932bf04de580 --- /dev/null +++ b/logs/start_mindi_test3.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f532f28bee6f0b093e6ea80782647da0f4f3f932def12e994fe5561a46c5ac +size 14701 diff --git a/logs/start_mindi_test4.err.log b/logs/start_mindi_test4.err.log new file mode 100644 index 0000000000000000000000000000000000000000..04d477b15dea7e472b365fafc8494a9c9bdd1f41 --- /dev/null +++ b/logs/start_mindi_test4.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8058180dd50e94d7b0de1757a962128b32c5571f94f477647a61c04aaa5cb6 +size 288 diff --git a/logs/start_mindi_test4.out.log b/logs/start_mindi_test4.out.log new file mode 100644 index 0000000000000000000000000000000000000000..faeb1bb34d6b70b423b462db10b81dade54dd32d --- /dev/null +++ b/logs/start_mindi_test4.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8e611bd3d3c2aa86b61a66ae8be8842412e003a93217e754dcad3bc968f1af +size 10200 diff --git a/logs/start_mindi_test5.err.log b/logs/start_mindi_test5.err.log new file mode 100644 index 0000000000000000000000000000000000000000..9c6571c99608ecf4621ba9810fce4ab33bdf9aa0 --- /dev/null +++ b/logs/start_mindi_test5.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e745e1abf0e2332620f064a611fec0df39d276b1f1578ac62a09b25f3b9a96f +size 350 diff --git a/logs/start_mindi_test5.out.log 
b/logs/start_mindi_test5.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/start_mindi_test6.err.log b/logs/start_mindi_test6.err.log new file mode 100644 index 0000000000000000000000000000000000000000..03a699127bd4fc34657bb3753096e334607ee048 --- /dev/null +++ b/logs/start_mindi_test6.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9c9fcac3877a556b8ca642cf654cf9ca60cfbecb31baff5840fc0c2e94a525 +size 34 diff --git a/logs/start_mindi_test6.out.log b/logs/start_mindi_test6.out.log new file mode 100644 index 0000000000000000000000000000000000000000..f344968792eb382dc05b1be94b36977b54da1701 --- /dev/null +++ b/logs/start_mindi_test6.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62816b259003b9d18c287a836ad733f8cdc052e893c09f18e0d41b16b3b2436 +size 41 diff --git a/logs/start_mindi_test7.err.log b/logs/start_mindi_test7.err.log new file mode 100644 index 0000000000000000000000000000000000000000..59a5928d1c7b98b58c43b00665786748c942b9c7 --- /dev/null +++ b/logs/start_mindi_test7.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b2c3945751939e08a4fa5f05cb647f05552e2fa230de22859d262def416b0b +size 291 diff --git a/logs/start_mindi_test7.out.log b/logs/start_mindi_test7.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e75f37db2a9e9171a47d79c29540bff0fbe951c0 --- /dev/null +++ b/logs/start_mindi_test7.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4999c0b72f6da602c4e28ce40aa7f5da377bce02743a8c344bb32ecadd0c0a51 +size 15486 diff --git a/logs/start_mindi_test8.err.log b/logs/start_mindi_test8.err.log new file mode 100644 index 0000000000000000000000000000000000000000..9ca61d8c0a3f7f5d7ddd193f33e8d0dbee4793c6 --- /dev/null +++ b/logs/start_mindi_test8.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5f7ed6ff7728d6cf48413fd39bad9d370cdfb1ae99f5fe32854c5927d724cbab +size 44232 diff --git a/logs/start_mindi_test8.out.log b/logs/start_mindi_test8.out.log new file mode 100644 index 0000000000000000000000000000000000000000..6118aab8736b1f892a3bfb0baa76f08ed04fd533 --- /dev/null +++ b/logs/start_mindi_test8.out.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a96a9fdd72b29c0845362ca8ea92dae01835ab12bfcc94c4eadba1a0a76f494e +size 48 diff --git a/logs/start_mindi_test9.err.log b/logs/start_mindi_test9.err.log new file mode 100644 index 0000000000000000000000000000000000000000..fc7d74df0fa83e32218c8d11fb670cd387b7cdf4 --- /dev/null +++ b/logs/start_mindi_test9.err.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec8dfc4a7f70745d002ddc05a751241f68e8461753773f97555fa1755738443 +size 1813 diff --git a/logs/start_mindi_test9.out.log b/logs/start_mindi_test9.out.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/logs/train.log b/logs/train.log new file mode 100644 index 0000000000000000000000000000000000000000..f1cfcd527d35505eb14ed9e39f81a20a9a4a68e5 --- /dev/null +++ b/logs/train.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9c40f4da1cd9ccfeebbb3cacf0874e3b14189ba286291f80cb3cb7b7ee25d7 +size 1389 diff --git a/logs/train_live.log b/logs/train_live.log new file mode 100644 index 0000000000000000000000000000000000000000..6cd4bb4e1c5e3757ebde3bf1bac71a2e6f2b090c --- /dev/null +++ b/logs/train_live.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab71adb1d0d786c87e87fcf4b16c6041eb9fce2e7d5c868ebbba39bed574b4b +size 696468 diff --git a/models/lora/custom_lora_v1/adapter_meta.json b/models/lora/custom_lora_v1/adapter_meta.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce0141e6f2ec90055aaa445c5a8e3e4ce2c5f9f --- /dev/null +++ 
b/models/lora/custom_lora_v1/adapter_meta.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29a1ee1234ec290e5d23343363e9733df70078dad5adac38fe007683a881894a +size 336 diff --git a/models/lora/custom_lora_v1/best.pt b/models/lora/custom_lora_v1/best.pt new file mode 100644 index 0000000000000000000000000000000000000000..255e5ea2fb16f55f20e2e03c11b65e8fc73cc98e --- /dev/null +++ b/models/lora/custom_lora_v1/best.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8115abb545aea63105017b908eadcee15e2a4285cedac5ca665c7a3c76ca512e +size 46044542 diff --git a/models/lora/custom_lora_v1/latest.pt b/models/lora/custom_lora_v1/latest.pt new file mode 100644 index 0000000000000000000000000000000000000000..467f0eb80aff06a0f0a06e6d04ef94eede7b2083 --- /dev/null +++ b/models/lora/custom_lora_v1/latest.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6242b27c6fdcb55e889d1d648e150e338848cbccf7bbf0a454eb6af3c969e6 +size 46095014 diff --git a/models/lora/custom_lora_v1/step_100.pt b/models/lora/custom_lora_v1/step_100.pt new file mode 100644 index 0000000000000000000000000000000000000000..373a591f6e4bc6669106476796b9faeaa38b5d38 --- /dev/null +++ b/models/lora/custom_lora_v1/step_100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af091329497df29e4b50253eb30226439351cba65c32d1fb660427e428d1a31d +size 46102030 diff --git a/models/lora/custom_lora_v1/step_200.pt b/models/lora/custom_lora_v1/step_200.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b02a7f35d37ed86ca65b2d49796d70351b7d906 --- /dev/null +++ b/models/lora/custom_lora_v1/step_200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4217cf2881b3839884cba7525bd96d2ef5e99bc0792048102d1d072293733936 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_300.pt b/models/lora/custom_lora_v1/step_300.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f6bf4411e8539735403c44b73885144bc7e03bde --- /dev/null +++ b/models/lora/custom_lora_v1/step_300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fdf20dbed62d8250906e82b90e4ffd1d6ef78408dd39800e42875f46f44c670 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_400.pt b/models/lora/custom_lora_v1/step_400.pt new file mode 100644 index 0000000000000000000000000000000000000000..97e0505061d86dd2aac4abdf1f5e19c3e037fa32 --- /dev/null +++ b/models/lora/custom_lora_v1/step_400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe3e5fe97e59ac822a50767cf222c98a548b491b4250fc94639f57c68778d7b +size 46102030 diff --git a/models/lora/custom_lora_v1/step_5.pt b/models/lora/custom_lora_v1/step_5.pt new file mode 100644 index 0000000000000000000000000000000000000000..d400660c74f4caa1c9db0c5569bc0c243cee3603 --- /dev/null +++ b/models/lora/custom_lora_v1/step_5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1112fbbc9909b39f2a9a669020be217fdf91e57ef6a661f7fc5933375637e63 +size 46095014 diff --git a/models/lora/custom_lora_v1/step_500.pt b/models/lora/custom_lora_v1/step_500.pt new file mode 100644 index 0000000000000000000000000000000000000000..a238c791f2a0f11346f07fea2b80f81c9c3feaba --- /dev/null +++ b/models/lora/custom_lora_v1/step_500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efc25c990106212b7653694c1625b0842191fba9bc772ddd06af2b2413b6ec9 +size 46102030 diff --git a/models/lora/custom_lora_v1/step_600.pt b/models/lora/custom_lora_v1/step_600.pt new file mode 100644 index 0000000000000000000000000000000000000000..814ca607dbdf9c7f9ac8a7811abf99aad0be7623 --- /dev/null +++ b/models/lora/custom_lora_v1/step_600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:108c7c7c0f26592383b5f8fea392465d5d8efea07ba3ec3578aef4cb648c8c6c +size 46102030 diff --git a/models/lora/custom_lora_v1/step_700.pt 
b/models/lora/custom_lora_v1/step_700.pt new file mode 100644 index 0000000000000000000000000000000000000000..4134295c612862b133fda00662643d9d90b4e914 --- /dev/null +++ b/models/lora/custom_lora_v1/step_700.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94d96ae590f17f9d603d4cab2ef847b710e3f8b4ce1dd755a5fffeed60c44dd +size 46102030 diff --git a/models/quantized/model_step3200_int8_state.pt b/models/quantized/model_step3200_int8_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..088a51d19e14e6f270c8aea0040f23a1c25512ae --- /dev/null +++ b/models/quantized/model_step3200_int8_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba816d83052a2547af224d4fa1cb6ec9836c08df62c570958885acfab1817ef4 +size 654678026 diff --git a/release/MINDI_1.0_420M/.deps_installed b/release/MINDI_1.0_420M/.deps_installed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/release/MINDI_1.0_420M/Start_MINDI.bat b/release/MINDI_1.0_420M/Start_MINDI.bat new file mode 100644 index 0000000000000000000000000000000000000000..c4249ac6129941bba23cc65f01319893a44fa31d --- /dev/null +++ b/release/MINDI_1.0_420M/Start_MINDI.bat @@ -0,0 +1,38 @@ +@echo off +title MINDI 1.0 420M +setlocal +cd /d "%~dp0" + +set "BOOTSTRAP_PY=%~dp0..\..\.venv\Scripts\python.exe" +set "VENV_PY=%~dp0.venv\Scripts\python.exe" + +if exist "%VENV_PY%" goto after_venv +if exist "%~dp0.venv" rmdir /s /q "%~dp0.venv" +echo [setup] Creating virtual environment... +if exist "%BOOTSTRAP_PY%" ( + "%BOOTSTRAP_PY%" -m venv "%~dp0.venv" +) else ( + py -3.11 -m venv "%~dp0.venv" +) +if not exist "%VENV_PY%" ( + echo [error] Python 3.11 is required. Install Python 3.11 and re-run Start_MINDI.bat. + pause + exit /b 1 +) + +:after_venv +"%VENV_PY%" -m pip install --upgrade pip >nul + +if exist "%~dp0.deps_installed" goto run_app +echo [setup] Installing dependencies (first run only)... 
+"%VENV_PY%" -m pip install -r "%~dp0requirements_portable.txt" +if errorlevel 1 ( + echo [error] Dependency install failed. Check internet and Python version. + pause + exit /b 1 +) +type nul > "%~dp0.deps_installed" + +:run_app +"%VENV_PY%" "%~dp0app\launch_portable_chat.py" --config "%~dp0app\portable_chat_config.yaml" +endlocal diff --git a/release/MINDI_1.0_420M/app/launch_portable_chat.py b/release/MINDI_1.0_420M/app/launch_portable_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..5e2a3a007665d9f0fb400d20187278fe4c46e750 --- /dev/null +++ b/release/MINDI_1.0_420M/app/launch_portable_chat.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import gradio as gr +import torch +import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer +from src.inference_engine.inference_engine import InferenceEngine, DecodingConfig + + +def load_yaml(path: Path): + return yaml.safe_load(path.read_text(encoding="utf-8-sig")) + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--config", default="app/portable_chat_config.yaml") + ap.add_argument("--self_test", action="store_true") + args = ap.parse_args() + + cfg = load_yaml(PROJECT_ROOT / args.config) + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + model = 
CodeTransformerLM(mcfg).cpu().float() + model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) + state = torch.load(PROJECT_ROOT / cfg["model"]["quantized_state_path"], map_location="cpu") + model.load_state_dict(state) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=torch.device("cpu")) + dcfg = DecodingConfig(max_new_tokens=220, min_tokens_before_stop_check=64) + + if args.self_test: + out = engine.generate_with_retry("Write a Python function to add two numbers.", "python", dcfg) + code = out["final"]["code"] + print("portable_self_test_ok=", bool(code.strip())) + return + + def respond(prompt, history): + history = history or [] + p = (prompt or "").strip() + if not p: + return history, "" + out = engine.generate_with_retry(p, "python", dcfg) + history.append((p, out["final"]["code"])) + return history, "" + + with gr.Blocks(title="MINDI 1.0 420M") as demo: + gr.Markdown("## MINDI 1.0 420M (INT8 Portable)") + chat = gr.Chatbot(height=520) + box = gr.Textbox(label="Prompt", lines=4) + btn = gr.Button("Generate") + clear = gr.Button("Clear") + btn.click(respond, [box, chat], [chat, box]) + box.submit(respond, [box, chat], [chat, box]) + clear.click(lambda: ([], ""), None, [chat, box]) + + demo.launch(server_name=cfg["server"].get("host", "127.0.0.1"), server_port=int(cfg["server"].get("port", 7861)), share=False, inbrowser=False) + + +if __name__ == "__main__": + main() + diff --git a/release/MINDI_1.0_420M/app/model_config.yaml b/release/MINDI_1.0_420M/app/model_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5667b78688d6890d294fc808c0110edd093f6a7 --- /dev/null +++ b/release/MINDI_1.0_420M/app/model_config.yaml @@ -0,0 +1,18 @@ +# Component 4 model config. +# You can switch the preset name or directly edit dimensions below. 
+ +preset: medium_420m + +model: + vocab_size: 50000 + max_seq_len: 2048 + d_model: 1152 + n_layers: 23 + n_heads: 16 + d_ff: 4608 + dropout: 0.1 + tie_embeddings: true + gradient_checkpointing: false + init_std: 0.02 + rms_norm_eps: 0.00001 + diff --git a/release/MINDI_1.0_420M/app/portable_chat_config.yaml b/release/MINDI_1.0_420M/app/portable_chat_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7979a08a26dc0f7d96394998ece6ad94913577c --- /dev/null +++ b/release/MINDI_1.0_420M/app/portable_chat_config.yaml @@ -0,0 +1,8 @@ +model: + model_config_path: app/model_config.yaml + quantized_state_path: model/model_step3200_int8_state.pt + tokenizer_dir: model/tokenizer + +server: + host: 127.0.0.1 + port: 7861 diff --git a/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt b/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..088a51d19e14e6f270c8aea0040f23a1c25512ae --- /dev/null +++ b/release/MINDI_1.0_420M/model/model_step3200_int8_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba816d83052a2547af224d4fa1cb6ec9836c08df62c570958885acfab1817ef4 +size 654678026 diff --git a/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json b/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4798c7bfd1002d16664c9d8bec52763fdbc3fe48 --- /dev/null +++ b/release/MINDI_1.0_420M/model/tokenizer/tokenizer.json @@ -0,0 +1,799 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": 
false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFKC" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Split", + "pattern": { + "Regex": "([()\\[\\]{}.,:;])" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "Metaspace", + "replacement": "_", + "prepend_scheme": "always", + "split": true + } + ] + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], 
+ "pair": [ + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 3 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": { + "type": "BPEDecoder", + "suffix": "" + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 8, + "": 9, + "": 10, + "(": 11, + ")": 12, + "+": 13, + ",": 14, + ".": 15, + "0": 16, + "4": 17, + "5": 18, + ":": 19, + ";": 20, + "<": 21, + "=": 22, + ">": 23, + "A": 24, + "C": 25, + "D": 26, + "E": 27, + "F": 28, + "H": 29, + "I": 30, + "J": 31, + "L": 32, + "M": 33, + "N": 34, + "O": 35, + "P": 36, + "R": 37, + "S": 38, + "T": 39, + "V": 40, + "W": 41, + "Y": 42, + "_": 43, + "a": 44, + "b": 45, + "c": 46, + "d": 47, + "e": 48, + "f": 49, + "g": 50, + "h": 51, + "i": 52, + "l": 53, + "m": 54, + "n": 55, + "o": 56, + "p": 57, + "r": 58, + "s": 59, + "t": 60, + "u": 61, + "v": 62, + "w": 63, + "x": 64, + "y": 65, + "{": 66, + "}": 67, + "_<": 68, + "DE": 69, + "T>": 70, + "_a": 71, + "L>": 72, + "NL>": 73, + "_": 74, + "NT>": 75, + "_t": 76, + "DENT>": 77, + "_i": 78, + "PT>": 79, + "_(": 80, + "_)": 81, + "on": 82, + "_": 90, + "OMPT>": 91, + "ROMPT>": 92, + "_;": 93, + "_b": 94, + "at": 95, + "_": 99, + "_to": 100, + "_": 101, + "_lo": 102, + "_": 103, + "_": 104, + "_": 105, + "_+": 106, + "_0": 107, + "_re": 108, + "ct": 109, + "dd": 110, + "ion": 111, + "nct": 112, + "rn": 113, + "tu": 114, + "unct": 115, + "va": 116, + "_add": 117, + "_th": 118, + "_funct": 119, + "_retu": 120, + "_function": 121, + "_return": 122, + "AS": 123, + "AV": 124, + "CR": 125, + "Cre": 126, + "HO": 127, + "IPT>": 128, + "Ja": 129, + 
"JAV": 130, + "N>": 131, + "Py": 132, + "Sc": 133, + "THO": 134, + "YTHO": 135, + "_,": 136, + "_4": 137, + "_5": 138, + "_:": 139, + "_p": 140, + "_{": 141, + "_}": 142, + "_Cre": 143, + "_Ja": 144, + "_Py": 145, + "hon": 146, + "nt": 147, + "op": 148, + "or": 149, + "pt": 150, + "thon": 151, + "_": 168, + "_JavaScript": 169, + "_": 170 + }, + "merges": [ + [ + "_", + "<" + ], + [ + "D", + "E" + ], + [ + "T", + ">" + ], + [ + "_", + "a" + ], + [ + "L", + ">" + ], + [ + "N", + "L>" + ], + [ + "_<", + "NL>" + ], + [ + "N", + "T>" + ], + [ + "_", + "t" + ], + [ + "DE", + "NT>" + ], + [ + "_", + "i" + ], + [ + "P", + "T>" + ], + [ + "_", + "(" + ], + [ + "_", + ")" + ], + [ + "o", + "n" + ], + [ + "_<", + "P" + ], + [ + "_", + "f" + ], + [ + "_", + "l" + ], + [ + "r", + "e" + ], + [ + "r", + "i" + ], + [ + "C", + "O" + ], + [ + "I", + "N" + ], + [ + "M", + "PT>" + ], + [ + "O", + "MPT>" + ], + [ + "R", + "OMPT>" + ], + [ + "_", + ";" + ], + [ + "_", + "b" + ], + [ + "a", + "t" + ], + [ + "_<", + "DE" + ], + [ + "_<", + "CO" + ], + [ + "_<", + "IN" + ], + [ + "DE", + ">" + ], + [ + "_t", + "o" + ], + [ + "_" + ], + [ + "_l", + "o" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_" + ], + [ + "_", + "+" + ], + [ + "_", + "0" + ], + [ + "_", + "re" + ], + [ + "c", + "t" + ], + [ + "d", + "d" + ], + [ + "i", + "on" + ], + [ + "n", + "ct" + ], + [ + "r", + "n" + ], + [ + "t", + "u" + ], + [ + "u", + "nct" + ], + [ + "v", + "a" + ], + [ + "_a", + "dd" + ], + [ + "_t", + "h" + ], + [ + "_f", + "unct" + ], + [ + "_re", + "tu" + ], + [ + "_funct", + "ion" + ], + [ + "_retu", + "rn" + ], + [ + "A", + "S" + ], + [ + "A", + "V" + ], + [ + "C", + "R" + ], + [ + "C", + "re" + ], + [ + "H", + "O" + ], + [ + "I", + "PT>" + ], + [ + "J", + "a" + ], + [ + "J", + "AV" + ], + [ + "N", + ">" + ], + [ + "P", + "y" + ], + [ + "S", + "c" + ], + [ + "T", + "HO" + ], + [ + "Y", + "THO" + ], + [ + "_", + "," + ], + [ + "_", + "4" + ], + [ + "_", + "5" + ], + [ + "_", + ":" + ], + [ + "_", + "p" + ], 
+ [ + "_", + "{" + ], + [ + "_", + "}" + ], + [ + "_", + "Cre" + ], + [ + "_", + "Ja" + ], + [ + "_", + "Py" + ], + [ + "h", + "on" + ], + [ + "n", + "t" + ], + [ + "o", + "p" + ], + [ + "o", + "r" + ], + [ + "p", + "t" + ], + [ + "t", + "hon" + ], + [ + "_<", + "JAV" + ], + [ + "_" + ], + [ + "_JavaSc", + "ript" + ], + [ + "_" + ] + ] + } +} \ No newline at end of file diff --git a/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json b/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..32329ced84f28ebf5ed232d510fb78763c616e15 --- /dev/null +++ b/release/MINDI_1.0_420M/model/tokenizer/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "vocab_size": 50000, + "min_frequency": 2, + "model_max_length": 2048, + "indent_width": 4, + "special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ] +} \ No newline at end of file diff --git a/release/MINDI_1.0_420M/requirements_portable.txt b/release/MINDI_1.0_420M/requirements_portable.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9b36acfefdafdfe7676130830521ce916c0fb38 --- /dev/null +++ b/release/MINDI_1.0_420M/requirements_portable.txt @@ -0,0 +1,11 @@ +torch==2.4.1 +tokenizers==0.20.1 +pyyaml==6.0.2 +gradio==5.5.0 +gradio-client==1.4.2 +fastapi==0.115.5 +starlette==0.41.3 +uvicorn==0.32.0 +httpx==0.27.2 +pydantic==2.9.2 +pygments==2.19.2 diff --git a/release/MINDI_1.0_420M/src/__init__.py b/release/MINDI_1.0_420M/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45aa8dbeca626dc18818fbfe98c473d01ebd3aff --- /dev/null +++ b/release/MINDI_1.0_420M/src/__init__.py @@ -0,0 +1,2 @@ +# This file marks src as a Python package. 
+ diff --git a/release/MINDI_1.0_420M/src/evaluation_system/__init__.py b/release/MINDI_1.0_420M/src/evaluation_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab838cd1aab93b97566844ce5c08d423d94dc41 --- /dev/null +++ b/release/MINDI_1.0_420M/src/evaluation_system/__init__.py @@ -0,0 +1 @@ +# This file marks evaluation_system as a Python package. diff --git a/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py b/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8ed48a751285ecba59b448786774f7920eed16 --- /dev/null +++ b/release/MINDI_1.0_420M/src/evaluation_system/code_eval.py @@ -0,0 +1,186 @@ +""" +Component 6 evaluation helpers. +""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Dict, List + + +def python_syntax_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + +def save_json(path: str, payload: Dict) -> None: + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + +def _normalize_punctuation_spacing(text: str) -> str: + text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) + text = re.sub(r"([\(\[\{])\s+", r"\1", text) + text = re.sub(r"\s*=\s*", " = ", text) + text = re.sub(r"\s*\+\s*", " + ", text) + text = re.sub(r"\s*-\s*", " - ", text) + text = re.sub(r"\s*\*\s*", " * ", text) + text = re.sub(r"\s*/\s*", " / ", text) + text = re.sub(r"\s*%\s*", " % ", text) + return re.sub(r"[ \t]+", " ", text).strip() + + +def _remove_non_python_noise(line: str) -> str: + line = line.replace("", "1") + line = line.replace("\u0000", "") + line = line.replace("{", "") + line = line.replace("}", "") + line = line.replace(";", "") + return line + + +def _fix_identifier_spacing(line: str) -> str: + # def name with spaces -> def 
name_with_spaces + m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) + if m: + fn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{fn}{m.group(3)}" + + # class name with spaces -> class Name_With_Spaces + m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) + if m: + cn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{cn}{m.group(3)}" + + # assignment lhs spaces -> underscore. + if "=" in line and "==" not in line: + lhs, rhs = line.split("=", 1) + lhs_clean = lhs.strip() + if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): + lhs_clean = re.sub(r"\s+", "_", lhs_clean) + line = f"{lhs_clean} = {rhs.strip()}" + + return line + + +def _looks_like_python_line(line: str) -> bool: + if not line.strip(): + return False + starts = ( + "def ", + "class ", + "if ", + "for ", + "while ", + "try:", + "except", + "with ", + "return ", + "import ", + "from ", + "print(", + ) + s = line.strip() + if s.startswith(starts): + return True + if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): + return True + return False + + +def _trim_to_code(lines: List[str]) -> List[str]: + # Drop noisy preamble lines until first plausible Python line. + i = 0 + while i < len(lines) and not _looks_like_python_line(lines[i]): + i += 1 + lines = lines[i:] if i < len(lines) else [] + # Keep only plausible lines after start; allow blank lines. 
+ out = [] + for line in lines: + if not line.strip(): + out.append(line) + continue + if _looks_like_python_line(line) or line.startswith(" "): + out.append(line) + return out + + +def _best_effort_python_format(lines: List[str]) -> List[str]: + out: List[str] = [] + indent = 0 + for raw in lines: + line = raw.strip() + if not line: + out.append("") + continue + + if line in {"return", "pass", "break", "continue"}: + indent = max(0, indent - 1) + + out.append((" " * indent) + line) + + if line.endswith(":"): + indent += 1 + + return out + + +def restore_code_from_structured(decoded: str) -> str: + text = decoded + for tok in ["", "", "", "", ""]: + text = text.replace(tok, "") + + if "" in text: + text = text.split("", 1)[1] + + text = text.replace("_", " ") + tokens = text.strip().split() + + lines: List[str] = [] + current_tokens: List[str] = [] + indent = 0 + + for tok in tokens: + if tok == "": + indent += 1 + continue + if tok == "": + indent = max(0, indent - 1) + continue + if tok == "": + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + else: + lines.append("") + current_tokens = [] + continue + current_tokens.append(tok) + + if current_tokens: + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + + lines = _trim_to_code(lines) + lines = _best_effort_python_format(lines) + + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + return "\n".join(lines).strip() diff --git a/release/MINDI_1.0_420M/src/inference_engine/__init__.py b/release/MINDI_1.0_420M/src/inference_engine/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..245858df2c08ea0b4dfb0f65ef3c3a7115ca87fe --- /dev/null +++ b/release/MINDI_1.0_420M/src/inference_engine/__init__.py @@ -0,0 +1 @@ +# This file marks inference_engine as a Python package. diff --git a/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py b/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..27dfcdc9489ebf39e18a41a2314fb0b83cdb94c0 --- /dev/null +++ b/release/MINDI_1.0_420M/src/inference_engine/inference_engine.py @@ -0,0 +1,211 @@ +""" +Component 7: Inference engine for local code generation. + +Features: +- Deterministic low-temperature greedy mode. +- Stop rules for clean function completion. +- Syntax-aware retry with up to 3 attempts. +""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from src.evaluation_system.code_eval import restore_code_from_structured +from src.model_architecture.code_transformer import CodeTransformerLM +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class DecodingConfig: + max_new_tokens: int = 300 + # Mode 1: deterministic output + greedy_temperature: float = 0.0 + # Retry mode 2 + retry2_temperature: float = 0.25 + retry2_top_p: float = 0.85 + # Retry mode 3 + retry3_temperature: float = 0.35 + retry3_top_p: float = 0.90 + max_retries: int = 3 + min_tokens_before_stop_check: int = 64 + # Stop only when function body is non-trivial. 
+ min_function_body_statements: int = 2 + + +class InferenceEngine: + def __init__(self, model: CodeTransformerLM, tokenizer: CodeTokenizer, device: torch.device) -> None: + self.model = model + self.tokenizer = tokenizer + self.device = device + self.model.eval() + + @staticmethod + def _syntax_ok_python(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + @staticmethod + def _function_completion_score(code: str) -> int: + # Higher score = more complete usable function. + try: + tree = ast.parse(code) + except Exception: + return 0 + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return 0 + fn = funcs[-1] + body_len = len(fn.body) + has_return = any(isinstance(n, ast.Return) for n in ast.walk(fn)) + return body_len + (2 if has_return else 0) + + def _looks_complete_function(self, code: str, min_body_statements: int) -> bool: + if "def " not in code: + return False + try: + tree = ast.parse(code) + except Exception: + return False + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return False + fn = funcs[-1] + if len(fn.body) < min_body_statements: + return False + return True + + def _sample_next( + self, + logits: torch.Tensor, + temperature: float, + top_p: float, + ) -> torch.Tensor: + if temperature <= 0: + return torch.argmax(logits, dim=-1, keepdim=True) + + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + denom = sorted_probs.sum(dim=-1, keepdim=True).clamp_min(1e-12) + sorted_probs = sorted_probs / denom + sampled = torch.multinomial(sorted_probs, num_samples=1) + return sorted_idx.gather(-1, sampled) + + @torch.no_grad() + def _generate_once( + self, + prompt: str, + language: 
str, + max_new_tokens: int, + temperature: float, + top_p: float, + min_tokens_before_stop_check: int, + min_function_body_statements: int, + ) -> Dict[str, object]: + prompt_text = self.tokenizer.format_training_sample(prompt=prompt, code="", language=language) + prompt_text = prompt_text.replace(" ", "").strip() + + ids = self.tokenizer.encode(prompt_text) + eos_id = self.tokenizer.special_token_ids.get("") + + # Remove trailing EOS so generation can continue. + if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + + input_ids = torch.tensor([ids], dtype=torch.long, device=self.device) + + generated_steps = 0 + for _ in range(max_new_tokens): + out = self.model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + next_id = self._sample_next(logits, temperature=temperature, top_p=top_p) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated_steps += 1 + + # Primary stop: EOS token. + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + # Secondary stop: complete parseable function with non-trivial body. 
+ if generated_steps >= min_tokens_before_stop_check and (generated_steps % 12 == 0): + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + if self._looks_complete_function(code, min_body_statements=min_function_body_statements): + break + + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + syntax_ok = self._syntax_ok_python(code) if language == "python" else True + completion_score = self._function_completion_score(code) if language == "python" else 0 + return { + "code": code, + "syntax_ok": syntax_ok, + "generated_tokens": generated_steps, + "temperature": temperature, + "top_p": top_p, + "completion_score": completion_score, + } + + @torch.no_grad() + def generate_with_retry( + self, + prompt: str, + language: str = "python", + cfg: Optional[DecodingConfig] = None, + ) -> Dict[str, object]: + cfg = cfg or DecodingConfig() + + attempts: List[Tuple[float, float]] = [ + (cfg.greedy_temperature, 1.0), + (cfg.retry2_temperature, cfg.retry2_top_p), + (cfg.retry3_temperature, cfg.retry3_top_p), + ] + + results = [] + for i in range(min(cfg.max_retries, len(attempts))): + temp, top_p = attempts[i] + res = self._generate_once( + prompt=prompt, + language=language, + max_new_tokens=cfg.max_new_tokens, + temperature=temp, + top_p=top_p, + min_tokens_before_stop_check=cfg.min_tokens_before_stop_check, + min_function_body_statements=cfg.min_function_body_statements, + ) + res["attempt"] = i + 1 + results.append(res) + + # Syntax-aware retry: stop retries as soon as syntax is valid. + if bool(res["syntax_ok"]): + return { + "final": res, + "attempts": results, + "used_retry": i > 0, + } + + # If all retries fail, choose best completion score then longest generation. 
+ best = sorted( + results, + key=lambda x: (int(x.get("completion_score", 0)), int(x.get("generated_tokens", 0))), + reverse=True, + )[0] + return { + "final": best, + "attempts": results, + "used_retry": True, + } diff --git a/release/MINDI_1.0_420M/src/model_architecture/__init__.py b/release/MINDI_1.0_420M/src/model_architecture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..342d765602b980261490c2ba31ee4798327a8947 --- /dev/null +++ b/release/MINDI_1.0_420M/src/model_architecture/__init__.py @@ -0,0 +1,2 @@ +# This file marks model_architecture as a Python package. + diff --git a/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py b/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..456fd69995ced3fd03b24a5badc9fc816d269e8f --- /dev/null +++ b/release/MINDI_1.0_420M/src/model_architecture/code_transformer.py @@ -0,0 +1,264 @@ +""" +Component 4: Transformer model architecture for code generation. + +This module defines a decoder-only transformer built from scratch in PyTorch. +It is modular through configuration so model size can be scaled up/down. +""" + +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +@dataclass +class ModelConfig: + # Vocabulary size from tokenizer. + vocab_size: int = 50_000 + # Maximum context length in tokens. + max_seq_len: int = 2048 + # Core hidden size of transformer. + d_model: int = 1152 + # Number of transformer blocks. + n_layers: int = 23 + # Number of attention heads. + n_heads: int = 16 + # Feed-forward hidden size. + d_ff: int = 4608 + # Dropout for regularization. + dropout: float = 0.1 + # Whether to tie token embedding and LM head weights. 
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization.

    Rescales each feature vector by the reciprocal of its RMS value and
    applies a learned per-dimension gain. Unlike LayerNorm there is no
    mean subtraction and no bias term.
    """

    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        # eps keeps rsqrt finite for (near-)zero inputs.
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean_square = x.square().mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return self.weight * normalized
class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention for autoregressive code generation.

    Queries and keys receive rotary position embeddings before attention.
    All projections are bias-free.
    """

    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.n_heads = config.n_heads
        self.head_dim = config.head_dim
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.k_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout)
        self.rotary = RotaryEmbedding(head_dim=self.head_dim, max_seq_len=config.max_seq_len)

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            x: Hidden states of shape [batch, seq_len, d_model].
            attn_mask: Optional mask broadcastable to
                [batch, n_heads, seq_len, seq_len]. Boolean True means
                "may attend"; float masks are treated as additive biases.

        Returns:
            Attention output of shape [batch, seq_len, d_model].
        """
        bsz, seq_len, _ = x.shape
        q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        q, k = self.rotary(q, k, seq_len=seq_len)

        # Bug fix: F.scaled_dot_product_attention rejects attn_mask combined
        # with is_causal=True (it raises), so the original crashed whenever a
        # mask was supplied. Express causality via is_causal only when there
        # is no explicit mask; otherwise fold a lower-triangular bias into
        # the supplied mask so causal masking is never lost.
        if attn_mask is None:
            merged_mask = None
            use_causal_flag = True
        else:
            neg_inf = float("-inf")
            causal_bias = torch.zeros(seq_len, seq_len, dtype=q.dtype, device=q.device)
            causal_bias = causal_bias.masked_fill(
                torch.triu(
                    torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device),
                    diagonal=1,
                ),
                neg_inf,
            )
            if attn_mask.dtype == torch.bool:
                # Convert a boolean "keep" mask into an additive bias.
                pad_bias = torch.zeros(attn_mask.shape, dtype=q.dtype, device=q.device)
                pad_bias = pad_bias.masked_fill(~attn_mask, neg_inf)
            else:
                pad_bias = attn_mask
            merged_mask = pad_bias + causal_bias
            use_causal_flag = False

        out = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=merged_mask,
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=use_causal_flag,
            scale=self.scale,
        )
        out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        return self.o_proj(out)
class TransformerBlock(nn.Module):
    """
    A single pre-norm decoder layer.

    Layout: RMSNorm -> causal self-attention -> residual add, followed by
    RMSNorm -> feed-forward -> residual add.
    """

    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.norm1 = RMSNorm(config.d_model, eps=config.rms_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.norm2 = RMSNorm(config.d_model, eps=config.rms_norm_eps)
        self.ffn = FeedForward(config)

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        attn_out = self.attn(self.norm1(x), attn_mask=attn_mask)
        x = x + attn_out
        ffn_out = self.ffn(self.norm2(x))
        return x + ffn_out
+ if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def enable_gradient_checkpointing(self, enabled: bool = True) -> None: + # Toggle gradient checkpointing mode. + self.config.gradient_checkpointing = enabled + + def forward( + self, + input_ids: torch.Tensor, + labels: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> Dict[str, torch.Tensor]: + if input_ids.dim() != 2: + raise ValueError("input_ids must be shape [batch, seq_len].") + + x = self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + if self.config.gradient_checkpointing and self.training: + x = torch.utils.checkpoint.checkpoint(block, x, attn_mask, use_reentrant=False) + else: + x = block(x, attn_mask=attn_mask) + + x = self.norm_final(x) + logits = self.lm_head(x) + + out: Dict[str, torch.Tensor] = {"logits": logits} + if labels is not None: + # Standard next-token cross entropy loss. + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + out["loss"] = loss + return out + + def estimate_num_parameters(self) -> int: + # Returns total trainable parameter count. + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def summary(self) -> Dict[str, object]: + # Returns a simple structured summary for logs/CLI. 
+ return { + "config": asdict(self.config), + "num_parameters": self.estimate_num_parameters(), + } + diff --git a/release/MINDI_1.0_420M/src/tokenizer/__init__.py b/release/MINDI_1.0_420M/src/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35cf334b1ed48cc5bd2702ff997daf5bf7fd0bf6 --- /dev/null +++ b/release/MINDI_1.0_420M/src/tokenizer/__init__.py @@ -0,0 +1,2 @@ +# This file marks tokenizer as a Python package. + diff --git a/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py b/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..b68ec2705394185a43465e345664e691245f18a5 --- /dev/null +++ b/release/MINDI_1.0_420M/src/tokenizer/code_tokenizer.py @@ -0,0 +1,216 @@ +""" +Component 2: Custom code tokenizer for Python and JavaScript. + +This tokenizer is code-aware: +- It preserves indentation structure using explicit tokens. +- It keeps newline boundaries using a newline token. +- It treats code operators and brackets as separate units. +- It supports prompt+code style training samples. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional + +from tokenizers import Regex, Tokenizer +from tokenizers.decoders import BPEDecoder +from tokenizers.models import BPE +from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence +from tokenizers.pre_tokenizers import Metaspace, Sequence as PreTokenizerSequence, Split +from tokenizers.processors import TemplateProcessing +from tokenizers.trainers import BpeTrainer + + +@dataclass +class CodeTokenizerConfig: + # Vocabulary size controls how many distinct tokens the tokenizer learns. + vocab_size: int = 50_000 + # Minimum frequency filters very rare fragments. + min_frequency: int = 2 + # Sequence length is used later by training/inference components. 
+ model_max_length: int = 2048 + # Indent width is used to normalize tabs and format indentation markers. + indent_width: int = 4 + # These tokens are required for code generation workflows. + special_tokens: List[str] = None # type: ignore[assignment] + + def __post_init__(self) -> None: + if self.special_tokens is None: + self.special_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + +class CodeTokenizer: + # This wrapper owns one HF Tokenizers object plus code-specific helpers. + + def __init__(self, config: Optional[CodeTokenizerConfig] = None) -> None: + self.config = config or CodeTokenizerConfig() + self.tokenizer: Optional[Tokenizer] = None + self.special_token_ids: Dict[str, int] = {} + + def _build_base_tokenizer(self) -> Tokenizer: + """ + Creates a BPE tokenizer with code-oriented pre-tokenization rules. + """ + tokenizer = Tokenizer(BPE(unk_token="")) + tokenizer.normalizer = NormalizerSequence([NFKC()]) + + # Split multi-character operators first so they are not broken apart. + multi_op = Regex( + r"(==|!=|<=|>=|:=|->|=>|\+\+|--|\+=|-=|\*=|/=|//=|%=|\*\*|&&|\|\||<<|>>)" + ) + # Split common delimiters used heavily in code. + punct = Regex(r"([()\[\]{}.,:;])") + + tokenizer.pre_tokenizer = PreTokenizerSequence( + [ + Split(multi_op, behavior="isolated"), + Split(punct, behavior="isolated"), + Metaspace(replacement="_", prepend_scheme="always", split=True), + ] + ) + tokenizer.decoder = BPEDecoder() + return tokenizer + + def train(self, text_iterator: Iterable[str]) -> None: + """ + Trains the tokenizer from a stream of preformatted text samples. + """ + tokenizer = self._build_base_tokenizer() + trainer = BpeTrainer( + vocab_size=self.config.vocab_size, + min_frequency=self.config.min_frequency, + special_tokens=self.config.special_tokens, + show_progress=True, + ) + tokenizer.train_from_iterator(text_iterator, trainer=trainer, length=None) + + # Add BOS/EOS automatically around each single sequence. 
+ bos_id = tokenizer.token_to_id("") + eos_id = tokenizer.token_to_id("") + if bos_id is None or eos_id is None: + raise RuntimeError("Tokenizer training failed to register BOS/EOS tokens.") + tokenizer.post_processor = TemplateProcessing( + single=" $A ", + special_tokens=[("", bos_id), ("", eos_id)], + ) + + self.tokenizer = tokenizer + self.special_token_ids = { + token: tokenizer.token_to_id(token) for token in self.config.special_tokens + } + + def save(self, output_dir: str) -> None: + """ + Saves tokenizer JSON and config so all other components can reuse it. + """ + if self.tokenizer is None: + raise RuntimeError("Cannot save tokenizer before training or loading it.") + out = Path(output_dir) + out.mkdir(parents=True, exist_ok=True) + self.tokenizer.save(str(out / "tokenizer.json")) + with (out / "tokenizer_config.json").open("w", encoding="utf-8") as f: + json.dump(asdict(self.config), f, indent=2) + + @classmethod + def load(cls, tokenizer_dir: str) -> "CodeTokenizer": + """ + Loads tokenizer from disk. + """ + base = Path(tokenizer_dir) + cfg_path = base / "tokenizer_config.json" + tok_path = base / "tokenizer.json" + if not cfg_path.exists() or not tok_path.exists(): + raise FileNotFoundError( + f"Missing tokenizer files in {tokenizer_dir}. " + "Expected tokenizer.json and tokenizer_config.json." + ) + with cfg_path.open("r", encoding="utf-8") as f: + cfg_data = json.load(f) + config = CodeTokenizerConfig(**cfg_data) + obj = cls(config=config) + obj.tokenizer = Tokenizer.from_file(str(tok_path)) + obj.special_token_ids = { + token: obj.tokenizer.token_to_id(token) for token in obj.config.special_tokens + } + return obj + + def encode(self, text: str) -> List[int]: + """ + Encodes one preformatted text sample to token IDs. + """ + if self.tokenizer is None: + raise RuntimeError("Tokenizer is not ready. Train or load it first.") + return self.tokenizer.encode(text).ids + + def decode(self, token_ids: List[int]) -> str: + """ + Decodes token IDs to text. 
+ """ + if self.tokenizer is None: + raise RuntimeError("Tokenizer is not ready. Train or load it first.") + return self.tokenizer.decode(token_ids, skip_special_tokens=False) + + def format_training_sample(self, prompt: str, code: str, language: str) -> str: + """ + Converts prompt + code into one structured training text sequence. + """ + lang_token = "" if language.lower() == "python" else "" + prompt_text = self._normalize_text(prompt) + code_text = self._code_to_structure_tokens(code) + return f" {lang_token} {prompt_text} {code_text}" + + def _normalize_text(self, text: str) -> str: + """ + Normalizes regular text by cleaning newlines. + """ + return text.replace("\r\n", "\n").replace("\r", "\n").strip() + + def _code_to_structure_tokens(self, code: str) -> str: + """ + Converts raw code into a string with explicit indentation and newline markers. + """ + code = code.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " " * self.config.indent_width) + lines = code.split("\n") + indent_stack: List[int] = [0] + out_tokens: List[str] = [] + + for raw_line in lines: + # Keep blank lines as newline tokens so code structure is preserved. 
+ if raw_line.strip() == "": + out_tokens.append("") + continue + + current_indent = len(raw_line) - len(raw_line.lstrip(" ")) + line_content = raw_line.lstrip(" ") + + while current_indent < indent_stack[-1]: + indent_stack.pop() + out_tokens.append("") + + while current_indent > indent_stack[-1]: + indent_stack.append(current_indent) + out_tokens.append("") + + out_tokens.append(line_content) + out_tokens.append("") + + while len(indent_stack) > 1: + indent_stack.pop() + out_tokens.append("") + + return " ".join(out_tokens).strip() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..330901606c43d2eab90ce6be3ddc7f49fe5556ae --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +transformers +datasets +peft +accelerate +torch diff --git a/requirements_optional_windows_bitsandbytes.txt b/requirements_optional_windows_bitsandbytes.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c97f1695cec96944159878c0701c0131d82ce29 --- /dev/null +++ b/requirements_optional_windows_bitsandbytes.txt @@ -0,0 +1,5 @@ +# Optional package for Windows-only experiments. +# Important: this package frequently fails on some Windows CUDA setups. +# Keep this optional so base setup remains stable. +bitsandbytes-windows==0.37.5 + diff --git a/scripts/add_incremental_javascript_dataset.py b/scripts/add_incremental_javascript_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9bf952cc9ff2a344a4e24c7606e6d214c8d0cf21 --- /dev/null +++ b/scripts/add_incremental_javascript_dataset.py @@ -0,0 +1,250 @@ +""" +Incremental JS dataset augmentation for Component 3 outputs. + +Goal: +- Do NOT rebuild the full pipeline. +- Reuse existing cleaned/tokenized files. +- Add only new JavaScript samples from one additional HF dataset. 
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the incremental JS augmentation run."""
    cli = argparse.ArgumentParser(description="Add JS-focused dataset incrementally.")
    cli.add_argument(
        "--config",
        default="configs/component3_incremental_js.yaml",
        help="Path to YAML config.",
    )
    cli.add_argument(
        "--target_new_javascript_examples",
        type=int,
        default=None,
        help="Optional override for JS target.",
    )
    return cli.parse_args()
+ ) + + seeded = 0 + with existing_clean_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + except Exception: + continue + if not prompt or not code: + continue + # Keep unique adds hash to DB, False means already there. + pipeline._keep_unique(prompt, code) + seeded += 1 + if seeded % 5000 == 0: + pipeline.conn.commit() + pipeline.conn.commit() + return seeded + + +def main() -> None: + args = parse_args() + try: + cfg_data = load_yaml(Path(args.config)) + + existing_clean_path = Path(cfg_data["existing_clean_path"]) + existing_tokenized_path = Path(cfg_data["existing_tokenized_path"]) + existing_stats_path = Path(cfg_data["existing_stats_path"]) + tokenizer_dir = str(cfg_data["tokenizer_dir"]) + dedupe_db_path = str(cfg_data["dedupe_db_path"]) + progress_every = int(cfg_data.get("progress_every", 500)) + min_prompt_chars = int(cfg_data.get("min_prompt_chars", 8)) + min_code_chars = int(cfg_data.get("min_code_chars", 16)) + max_code_chars = int(cfg_data.get("max_code_chars", 40_000)) + target_js = ( + args.target_new_javascript_examples + if args.target_new_javascript_examples is not None + else int(cfg_data.get("target_new_javascript_examples", 20_000)) + ) + + if not existing_tokenized_path.exists(): + raise FileNotFoundError( + f"Existing tokenized dataset not found: {existing_tokenized_path}. " + "Run Component 3 full pipeline first." + ) + + new_ds = cfg_data["new_dataset"] + spec = SourceDatasetSpec( + hf_dataset_id=str(new_ds["hf_dataset_id"]), + split=str(new_ds.get("split", "train")), + prompt_field=str(new_ds["prompt_field"]), + code_field=str(new_ds["code_field"]), + language_field=new_ds.get("language_field"), + default_language=str(new_ds.get("default_language", "auto")), + ) + + # Build minimal pipeline object to reuse cleaning/dedupe/tokenization utilities. 
+ pipeline_cfg = PipelineConfig( + datasets=[spec], + tokenizer_dir=tokenizer_dir, + interim_output_dir=str(existing_clean_path.parent), + processed_output_dir=str(existing_tokenized_path.parent), + dedupe_db_path=dedupe_db_path, + min_prompt_chars=min_prompt_chars, + min_code_chars=min_code_chars, + max_code_chars=max_code_chars, + progress_every=progress_every, + ) + pipeline = HFDatasetPipeline(pipeline_cfg) + + try: + seeded = load_existing_into_dedupe_db(pipeline, existing_clean_path) + print(f"[info] Seeded dedupe DB with existing clean records: {seeded}") + + stream = load_dataset(spec.hf_dataset_id, split=spec.split, streaming=True) + added_js = 0 + seen_new = 0 + dropped_duplicate = 0 + dropped_filtered = 0 + + with existing_clean_path.open("a", encoding="utf-8") as clean_f, existing_tokenized_path.open( + "a", encoding="utf-8" + ) as tok_f: + for row in stream: + seen_new += 1 + std = pipeline._standardize_record(row=row, spec=spec) + if std is None: + dropped_filtered += 1 + continue + + prompt, code, lang = std + cleaned = pipeline._clean_and_filter(prompt=prompt, code=code, language=lang) + if cleaned is None: + dropped_filtered += 1 + continue + + c_prompt, c_code, c_lang = cleaned + if c_lang != "javascript": + dropped_filtered += 1 + continue + + if not pipeline._keep_unique(c_prompt, c_code): + dropped_duplicate += 1 + continue + + formatted_text = pipeline.tokenizer.format_training_sample( + prompt=c_prompt, code=c_code, language="javascript" + ) + token_ids = pipeline.tokenizer.encode(formatted_text) + + clean_record = {"prompt": c_prompt, "code": c_code, "language": "javascript"} + tok_record = { + "language": "javascript", + "text": formatted_text, + "input_ids": token_ids, + "length": len(token_ids), + } + clean_f.write(json.dumps(clean_record, ensure_ascii=False) + "\n") + tok_f.write(json.dumps(tok_record, ensure_ascii=False) + "\n") + + added_js += 1 + if added_js % progress_every == 0: + pipeline.conn.commit() + print( + f"[progress] 
seen_new={seen_new} added_js={added_js} " + f"dropped_duplicate={dropped_duplicate}" + ) + + if added_js >= target_js: + break + + pipeline.conn.commit() + finally: + pipeline.close() + + # Merge incremental stats into existing summary. + merged_stats: Dict[str, Any] = {} + if existing_stats_path.exists(): + with existing_stats_path.open("r", encoding="utf-8") as f: + try: + merged_stats = json.load(f) + except Exception: + merged_stats = {} + + merged_stats["incremental_js_dataset"] = spec.hf_dataset_id + merged_stats["incremental_js_target"] = target_js + merged_stats["incremental_js_added"] = added_js + merged_stats["incremental_new_seen"] = seen_new + merged_stats["incremental_new_dropped_duplicate"] = dropped_duplicate + merged_stats["incremental_new_dropped_filtered"] = dropped_filtered + merged_stats["final_clean_records_estimate"] = int(merged_stats.get("kept_total", 0)) + added_js + + with existing_stats_path.open("w", encoding="utf-8") as f: + json.dump(merged_stats, f, indent=2) + + print("Incremental JavaScript augmentation completed.") + print(f"Dataset used: {spec.hf_dataset_id}") + print(f"Target JS examples: {target_js}") + print(f"Added JS examples: {added_js}") + if added_js < target_js: + print( + "Warning: JS target not reached from this dataset after filtering/dedupe. " + "You may need one more JS dataset." 
def build_config(cfg_data: Dict[str, Any]) -> ModelConfig:
    """Build a ModelConfig from parsed YAML data.

    The YAML may name a size preset ("preset") and/or provide explicit
    overrides under the "model" key. Preset values form the base and any
    "model" entries override them field by field.

    Raises:
        ValueError: if "model" is not a mapping or the preset is unknown.
    """
    from dataclasses import asdict  # Local import: only needed for preset merging.

    preset = cfg_data.get("preset")
    model_cfg = cfg_data.get("model", {})
    if not isinstance(model_cfg, dict):
        raise ValueError("Config key 'model' must be an object.")

    if not preset:
        return ModelConfig(**model_cfg)

    presets = get_model_presets()
    if preset not in presets:
        raise ValueError(f"Unknown preset '{preset}'. Available: {list(presets.keys())}")

    # Fix: the original hand-maintained an 11-key merge dict, which would
    # silently drop any field later added to ModelConfig. asdict keeps the
    # merge in sync with the dataclass automatically.
    merged = asdict(presets[preset])
    merged.update(model_cfg)
    return ModelConfig(**merged)
1.0 420M to a Hugging Face-ready model folder. + +What this script does: +1) Loads your full-quality checkpoint (step_3200.pt by default). +2) Builds the model architecture with the exact Component 4 config. +3) Saves model weights as model.safetensors. +4) Copies tokenizer files. +5) Writes Hugging Face config files + custom model code. +6) Writes a professional model card README. +7) Writes a helper upload script with exact commands. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml +from safetensors.torch import save_file + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import ( # noqa: E402 + CodeTransformerLM, + ModelConfig, + get_model_presets, +) + + +# These IDs are fixed by CodeTokenizerConfig special token order. 
def extract_model_state(checkpoint_path: Path) -> Dict[str, torch.Tensor]:
    """Load a training checkpoint and return the bare model state dict.

    Accepts either a full Component 5 checkpoint (a dict with a ``model_state``
    key) or a raw state dict saved directly.

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
        ValueError: if the payload is not a dict-shaped state.
    """
    if not checkpoint_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Security hardening: prefer weights_only=True so untrusted pickles cannot
    # execute code on load. Older checkpoints that stored arbitrary Python
    # objects (e.g. argparse namespaces) fail that mode, so fall back to the
    # original unrestricted load for backward compatibility.
    try:
        payload = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    except Exception:
        # NOTE(review): fallback path runs the full unpickler — only load
        # checkpoints you produced yourself.
        payload = torch.load(checkpoint_path, map_location="cpu")

    if isinstance(payload, dict) and "model_state" in payload:
        state = payload["model_state"]
    elif isinstance(payload, dict):
        state = payload
    else:
        raise ValueError("Unsupported checkpoint format. Expected dict payload.")

    if not isinstance(state, dict):
        raise ValueError("Checkpoint model state is not a dictionary.")

    return state
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .configuration_mindi import MindiConfig + + +@dataclass +class _Cfg: + vocab_size: int + max_seq_len: int + d_model: int + n_layers: int + n_heads: int + d_ff: int + dropout: float + tie_embeddings: bool + init_std: float + rms_norm_eps: float + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads") + return self.d_model // self.n_heads + + +class RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) + return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin) + + @staticmethod + def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> 
torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + xe = x1 * cos - x2 * sin + xo = x1 * sin + x2 * cos + return torch.stack((xe, xo), dim=-1).flatten(-2) + + +class CausalSelfAttention(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.n_heads = cfg.n_heads + self.head_dim = cfg.head_dim + self.scale = self.head_dim ** -0.5 + self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False) + self.dropout = nn.Dropout(cfg.dropout) + self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bsz, seq_len, _ = x.shape + q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + q, k = self.rotary(q, k, seq_len=seq_len) + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, + dropout_p=self.dropout.p if self.training else 0.0, + is_causal=True, + scale=self.scale, + ) + out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1) + return self.o_proj(out) + + +class FeedForward(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False) + self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False) + self.dropout = nn.Dropout(cfg.dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = F.gelu(x, approximate="tanh") + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerBlock(nn.Module): + def __init__(self, cfg: _Cfg) -> None: + super().__init__() + self.norm1 = RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.attn = CausalSelfAttention(cfg) + self.norm2 = 
RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.ffn = FeedForward(cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x)) + x = x + self.ffn(self.norm2(x)) + return x + + +class MindiForCausalLM(PreTrainedModel): + config_class = MindiConfig + base_model_prefix = "mindi" + supports_gradient_checkpointing = False + + def __init__(self, config: MindiConfig): + super().__init__(config) + cfg = _Cfg( + vocab_size=config.vocab_size, + max_seq_len=config.max_seq_len, + d_model=config.d_model, + n_layers=config.n_layers, + n_heads=config.n_heads, + d_ff=config.d_ff, + dropout=config.dropout, + tie_embeddings=config.tie_embeddings, + init_std=config.init_std, + rms_norm_eps=config.rms_norm_eps, + ) + + self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model) + self.dropout = nn.Dropout(cfg.dropout) + self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)]) + self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps) + self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False) + + if cfg.tie_embeddings: + self.lm_head.weight = self.embed_tokens.weight + + self.post_init() + + def _init_weights(self, module: nn.Module) -> None: + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def get_input_embeddings(self) -> nn.Module: + return self.embed_tokens + + def set_input_embeddings(self, value: nn.Module) -> None: + self.embed_tokens = value + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_output_embeddings(self, new_embeddings: nn.Module) -> None: + self.lm_head = new_embeddings + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> CausalLMOutputWithPast: + del attention_mask, kwargs + + x = 
self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + x = block(x) + + x = self.norm_final(x) + logits = self.lm_head(x) + + loss = None + if labels is not None: + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + + return CausalLMOutputWithPast(loss=loss, logits=logits) + + @torch.no_grad() + def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs): + del kwargs + return {"input_ids": input_ids} +''' + (output_dir / "modeling_mindi.py").write_text(content, encoding="utf-8") + + + +def write_tokenization_py(output_dir: Path) -> None: + content = '''""" +Hugging Face tokenizer class for MINDI 1.0 420M. +""" + +from pathlib import Path +from transformers import PreTrainedTokenizerFast + + +class MindiTokenizer(PreTrainedTokenizerFast): + vocab_files_names = {"tokenizer_file": "tokenizer.json"} + model_input_names = ["input_ids", "attention_mask"] + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + if kwargs.get("tokenizer_file") is None: + local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json" + if local_candidate.exists(): + kwargs["tokenizer_file"] = str(local_candidate) + return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + def __init__(self, tokenizer_file=None, **kwargs): + name_or_path = kwargs.pop("name_or_path", None) + if tokenizer_file is None and name_or_path is not None: + candidate = Path(name_or_path) / "tokenizer.json" + if candidate.exists(): + tokenizer_file = str(candidate) + if tokenizer_file is None: + tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json") + kwargs.setdefault("bos_token", "") + kwargs.setdefault("eos_token", "") + kwargs.setdefault("unk_token", "") + kwargs.setdefault("pad_token", "") + 
super().__init__(tokenizer_file=tokenizer_file, **kwargs) +''' + (output_dir / "tokenization_mindi.py").write_text(content, encoding="utf-8") +def write_model_card(output_dir: Path, repo_id: str, num_params: int) -> None: + text = f'''--- +license: mit +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- code +- python +- javascript +- local-llm +- offline +--- + +# MINDI 1.0 420M + +MINDI 1.0 420M is a 420M-parameter coding language model focused on Python first and JavaScript second. +It is built for local, offline code generation workflows. + +## Capabilities + +- Code generation from natural language prompts +- Code completion +- Bug-fix suggestions +- Code explanation + +## Model Details + +- Parameters: {num_params:,} +- Architecture: Decoder-only Transformer +- Context length: 2048 tokens +- Focus languages: Python, JavaScript + +## Hardware Requirements + +Recommended: +- NVIDIA GPU with 8GB+ VRAM +- CUDA-enabled PyTorch + +Minimum: +- CPU inference works but is slower + +## Quick Start (GPU) + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +repo_id = "{repo_id}" + +tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + repo_id, + trust_remote_code=True, + torch_dtype=torch.float16, +).cuda() + +prompt = "Write a Python function to check if a string is a palindrome." +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") + +with torch.no_grad(): + output = model.generate( + **inputs, + max_new_tokens=220, + temperature=0.2, + top_p=0.9, + do_sample=True, + ) + +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Limitations + +- The model can still produce syntax or logic errors. +- Generated code should always be reviewed and tested. +- Not intended for safety-critical production use without validation. 
def write_upload_helper(output_dir: Path, repo_id: str, private: bool) -> None:
    """Write UPLOAD_TO_HF.ps1 with the exact Hub upload commands.

    Bug fix: the previous version emitted ``--public`` for public repos, but
    ``huggingface-cli repo create`` has no such flag (repos are public by
    default) so the generated script failed. Emit ``--private`` only when
    a private repo is requested.
    """
    repo_name = repo_id.split("/")[-1]
    create_cmd = f"huggingface-cli repo create {repo_name} --type model"
    if private:
        create_cmd += " --private"
    script = f'''# Upload helper for MINDI 1.0 420M
# Run from PowerShell.

huggingface-cli login
{create_cmd}
huggingface-cli upload {repo_id} "{output_dir}" . --repo-type model
'''
    helper_path = output_dir / "UPLOAD_TO_HF.ps1"
    helper_path.write_text(script, encoding="utf-8")
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +''' + (output_dir / "LICENSE").write_text(mit, encoding="utf-8") + + +def main() -> None: + args = parse_args() + + ckpt_path = PROJECT_ROOT / args.checkpoint_path + model_cfg_path = PROJECT_ROOT / args.model_config_path + tokenizer_dir = PROJECT_ROOT / args.tokenizer_dir + output_dir = PROJECT_ROOT / args.output_dir + + if output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if not tokenizer_dir.exists(): + raise FileNotFoundError(f"Tokenizer directory not found: {tokenizer_dir}") + + model_cfg = build_model_config(model_cfg_path) + model = CodeTransformerLM(model_cfg) + + state = extract_model_state(ckpt_path) + model.load_state_dict(state, strict=True) + model.eval() + + # Save full-quality weights in safetensors format. + tensor_state = {k: v.detach().cpu().contiguous() for k, v in model.state_dict().items()} + if model_cfg.tie_embeddings and "lm_head.weight" in tensor_state: + tensor_state.pop("lm_head.weight") + save_file(tensor_state, str(output_dir / "model.safetensors"), metadata={"format": "pt"}) + + # Save Hugging Face config.json. 
+ hf_config = { + "model_type": "mindi", + "architectures": ["MindiForCausalLM"], + "auto_map": { + "AutoConfig": "configuration_mindi.MindiConfig", + "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM", + "AutoTokenizer": [None, "tokenization_mindi.MindiTokenizer"], + }, + "vocab_size": model_cfg.vocab_size, + "max_seq_len": model_cfg.max_seq_len, + "d_model": model_cfg.d_model, + "n_layers": model_cfg.n_layers, + "n_heads": model_cfg.n_heads, + "d_ff": model_cfg.d_ff, + "dropout": model_cfg.dropout, + "tie_embeddings": model_cfg.tie_embeddings, + "init_std": model_cfg.init_std, + "rms_norm_eps": model_cfg.rms_norm_eps, + "bos_token_id": BOS_ID, + "eos_token_id": EOS_ID, + "pad_token_id": PAD_ID, + "torch_dtype": "float16", + "transformers_version": "4.46.3", + } + (output_dir / "config.json").write_text(json.dumps(hf_config, indent=2), encoding="utf-8") + + generation_cfg = { + "bos_token_id": BOS_ID, + "eos_token_id": EOS_ID, + "pad_token_id": PAD_ID, + "max_new_tokens": 220, + "temperature": 0.2, + "top_p": 0.9, + "do_sample": True, + } + (output_dir / "generation_config.json").write_text(json.dumps(generation_cfg, indent=2), encoding="utf-8") + + # Copy tokenizer core file. + shutil.copy2(tokenizer_dir / "tokenizer.json", output_dir / "tokenizer.json") + + # Create HF tokenizer metadata files. 
+ tokenizer_cfg = { + "tokenizer_class": "MindiTokenizer", + "model_max_length": int(model_cfg.max_seq_len), + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + "tokenizer_file": "tokenizer.json", + "auto_map": {"AutoTokenizer": [None, "tokenization_mindi.MindiTokenizer"]}, + "padding_side": "right", + "truncation_side": "right", + } + (output_dir / "tokenizer_config.json").write_text(json.dumps(tokenizer_cfg, indent=2), encoding="utf-8") + + special_map = { + "bos_token": "", + "eos_token": "", + "unk_token": "", + "pad_token": "", + } + (output_dir / "special_tokens_map.json").write_text(json.dumps(special_map, indent=2), encoding="utf-8") + + # Custom model files for trust_remote_code=True loading. + write_configuration_py(output_dir) + write_modeling_py(output_dir) + write_tokenization_py(output_dir) + + # Project metadata and helper scripts. + num_params = sum(p.numel() for p in model.parameters()) + write_model_card(output_dir, args.repo_id, num_params) + write_upload_helper(output_dir, args.repo_id, args.private) + write_runtime_requirements(output_dir) + write_license(output_dir) + + print("Hugging Face package export completed.") + print(f"Output folder: {output_dir}") + print(f"Weights: {output_dir / 'model.safetensors'}") + print(f"Tokenizer: {output_dir / 'tokenizer.json'}") + print(f"Model card: {output_dir / 'README.md'}") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("HF export failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify checkpoint path, tokenizer path, and that safetensors/yaml are installed " + "in your active Python environment." 
def main() -> None:
    """Load the chat server config, build the Gradio demo, and serve it."""
    args = parse_args()
    config_file = Path(args.config)
    if not config_file.exists():
        raise FileNotFoundError(f"Config not found: {config_file}")

    raw_cfg = yaml.safe_load(config_file.read_text(encoding="utf-8"))
    server_cfg = raw_cfg.get("server", {})
    host = str(server_cfg.get("host", "127.0.0.1"))
    port = int(server_cfg.get("port", 7860))
    share = bool(server_cfg.get("share", False))

    # Single worker keeps VRAM usage predictable on an 8GB GPU.
    demo = create_demo(config_path=args.config)
    demo.queue(default_concurrency_limit=1)
    demo.launch(server_name=host, server_port=port, share=share, inbrowser=False)
+- Rebuild tokenized dataset with improved language detection. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path +from typing import Any, Dict, Optional + +import yaml + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + + +PY_HINTS = [ + "def ", + "import ", + "from ", + "print(", + "if __name__ ==", + "class ", + "lambda ", + "elif ", + "except ", +] + +JS_HINTS = [ + "function ", + "const ", + "let ", + "=>", + "console.log", + "export ", + "require(", + "document.", + "window.", + "=> {", + "var ", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Rebuild tokenized data from existing clean JSONL.") + parser.add_argument( + "--config", + default="configs/component3_reprocess_from_clean.yaml", + help="Path to YAML config.", + ) + parser.add_argument( + "--max_records", + type=int, + default=None, + help="Optional quick-test limit.", + ) + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Config format is invalid. 
def backup_file_if_needed(path: Path, enabled: bool) -> Optional[Path]:
    """Copy *path* to a sibling ``<name><suffix>.bak`` file.

    Returns the backup path, or None when backups are disabled or the
    source file does not exist.
    """
    if enabled and path.exists():
        backup_path = path.with_suffix(path.suffix + ".bak")
        shutil.copy2(path, backup_path)
        return backup_path
    return None
+ ) + + output_tokenized_path.parent.mkdir(parents=True, exist_ok=True) + output_stats_path.parent.mkdir(parents=True, exist_ok=True) + + token_backup = backup_file_if_needed( + output_tokenized_path, bool(cfg.get("backup_existing_tokenized", True)) + ) + stats_backup = backup_file_if_needed( + output_stats_path, bool(cfg.get("backup_existing_stats", True)) + ) + + tokenizer = CodeTokenizer.load(str(tokenizer_dir)) + + stats: Dict[str, int] = { + "reprocess_seen_total": 0, + "reprocess_kept_total": 0, + "reprocess_dropped_invalid_json": 0, + "reprocess_dropped_empty_fields": 0, + "language_python": 0, + "language_javascript": 0, + } + + with input_clean_path.open("r", encoding="utf-8") as in_f, output_tokenized_path.open( + "w", encoding="utf-8" + ) as out_f: + for line in in_f: + stats["reprocess_seen_total"] += 1 + if max_records is not None and stats["reprocess_seen_total"] > int(max_records): + break + + line = line.strip() + if not line: + stats["reprocess_dropped_empty_fields"] += 1 + continue + + try: + row = json.loads(line) + except json.JSONDecodeError: + stats["reprocess_dropped_invalid_json"] += 1 + continue + + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + raw_language = str(row.get("language", "")).strip() + if not prompt or not code: + stats["reprocess_dropped_empty_fields"] += 1 + continue + + language = infer_language( + prompt=prompt, + code=code, + raw_language=raw_language, + ignore_existing_labels=ignore_existing_labels, + ) + if language == "javascript": + stats["language_javascript"] += 1 + else: + stats["language_python"] += 1 + + formatted_text = tokenizer.format_training_sample( + prompt=prompt, code=code, language=language + ) + token_ids = tokenizer.encode(formatted_text) + out_row = { + "language": language, + "text": formatted_text, + "input_ids": token_ids, + "length": len(token_ids), + } + out_f.write(json.dumps(out_row, ensure_ascii=False) + "\n") + stats["reprocess_kept_total"] += 1 + + if 
stats["reprocess_kept_total"] % 5000 == 0: + print( + f"[progress] seen={stats['reprocess_seen_total']} " + f"kept={stats['reprocess_kept_total']} " + f"python={stats['language_python']} js={stats['language_javascript']}" + ) + + with output_stats_path.open("w", encoding="utf-8") as f: + json.dump(stats, f, indent=2) + + print("Reprocess completed successfully.") + print(f"Input clean file: {input_clean_path}") + print(f"Output tokenized file: {output_tokenized_path}") + print(f"Output stats file: {output_stats_path}") + if token_backup: + print(f"Tokenized backup: {token_backup}") + if stats_backup: + print(f"Stats backup: {stats_backup}") + print("Summary stats:") + print(json.dumps(stats, indent=2)) + except Exception as exc: + print("Reprocess failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify Component 2 tokenizer files and " + "Component 3 clean file paths." + ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_component10_export.py b/scripts/run_component10_export.py new file mode 100644 index 0000000000000000000000000000000000000000..efb50c33dd1fc42239fb05fad87b116ed29c3f3a --- /dev/null +++ b/scripts/run_component10_export.py @@ -0,0 +1,376 @@ +""" +Component 10: Export, quantization, benchmarking, and packaging. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +import time +from pathlib import Path +from typing import Any, Dict, Tuple + +import torch +import torch.nn as nn +import yaml + +# Ensure src imports work. 
def prepare_prompt_ids(tokenizer: CodeTokenizer, prompt: str) -> list[int]:
    """Encode *prompt* through the training template and return its token ids."""
    # Render the prompt with an empty code body so the model sees the same
    # structure it was trained on (Component 3 formatting).
    text = tokenizer.format_training_sample(prompt=prompt, code="", language="python")
    # NOTE(review): replace(" ", "") removes *every* space from the formatted
    # text, which would destroy natural-language prompts. This strongly looks
    # like a stripped special-token literal (an angle-bracket marker lost in
    # transit) — confirm against the original source before trusting it.
    text = text.replace(" ", "").strip()
    ids = tokenizer.encode(text)
    # NOTE(review): the "" key below is likewise suspicious — special-token
    # names appear stripped; presumably this looks up the EOS token id.
    eos = tokenizer.special_token_ids.get("")
    # Drop a trailing EOS so benchmark generation can continue past the prompt.
    if eos is not None and len(ids) > 1 and ids[-1] == int(eos):
        ids = ids[:-1]
    return ids
== "cuda": + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.synchronize() + + t0 = time.perf_counter() + generated = 0 + for _ in range(max_new_tokens): + out = model(input_ids=input_ids) + next_id = torch.argmax(out["logits"][:, -1, :], dim=-1, keepdim=True) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated += 1 + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + if device.type == "cuda": + torch.cuda.synchronize() + dt = max(1e-6, time.perf_counter() - t0) + + decoded = tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + + peak_vram = float(torch.cuda.max_memory_allocated() / (1024**3)) if device.type == "cuda" else 0.0 + + return { + "generated_tokens": float(generated), + "seconds": float(dt), + "tokens_per_second": float(generated / dt), + "peak_vram_gb": peak_vram, + "preview_code": code[:300], + } + + +def bytes_to_gb(n: int) -> float: + return float(n / (1024**3)) + + +def write_portable_launcher(portable_dir: Path) -> None: + bat = r"""@echo off`r`ntitle MINDI 1.0 420M +setlocal +cd /d "%~dp0" +if not exist .venv ( + echo [setup] Creating virtual environment... 
+ py -3 -m venv .venv +) +call .venv\Scripts\activate.bat +python -m pip install --upgrade pip >nul +python -m pip install -r requirements_portable.txt +python app\launch_portable_chat.py --config app\portable_chat_config.yaml +endlocal +""" + (portable_dir / "Start_MINDI.bat").write_text(bat, encoding="utf-8") + + +def write_portable_requirements(portable_dir: Path) -> None: + req = """torch==2.5.1 +tokenizers==0.20.1 +pyyaml==6.0.2 +gradio==5.5.0 +pygments==2.19.2 +""" + (portable_dir / "requirements_portable.txt").write_text(req, encoding="utf-8") + + +def write_portable_chat_files(portable_dir: Path, port: int) -> None: + app_dir = portable_dir / "app" + app_dir.mkdir(parents=True, exist_ok=True) + + cfg = f"""model: + model_config_path: app/model_config.yaml + quantized_state_path: model/model_step3200_int8_state.pt + tokenizer_dir: model/tokenizer + +server: + host: 127.0.0.1 + port: {port} +""" + (app_dir / "portable_chat_config.yaml").write_text(cfg, encoding="utf-8") + + launch = r'''from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import gradio as gr +import torch +import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer +from src.inference_engine.inference_engine import InferenceEngine, DecodingConfig + + +def load_yaml(path: Path): + return yaml.safe_load(path.read_text(encoding="utf-8-sig")) + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main(): + ap = argparse.ArgumentParser() + 
ap.add_argument("--config", default="app/portable_chat_config.yaml") + ap.add_argument("--self_test", action="store_true") + args = ap.parse_args() + + cfg = load_yaml(PROJECT_ROOT / args.config) + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + model = CodeTransformerLM(mcfg).cpu().float() + model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) + state = torch.load(PROJECT_ROOT / cfg["model"]["quantized_state_path"], map_location="cpu") + model.load_state_dict(state) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=torch.device("cpu")) + dcfg = DecodingConfig(max_new_tokens=220, min_tokens_before_stop_check=64) + + if args.self_test: + out = engine.generate_with_retry("Write a Python function to add two numbers.", "python", dcfg) + code = out["final"]["code"] + print("portable_self_test_ok=", bool(code.strip())) + return + + def respond(prompt, history): + history = history or [] + p = (prompt or "").strip() + if not p: + return history, "" + out = engine.generate_with_retry(p, "python", dcfg) + history.append((p, out["final"]["code"])) + return history, "" + + with gr.Blocks(title="MINDI 1.0 420M") as demo: + gr.Markdown("## MINDI 1.0 420M (INT8 Portable)") + chat = gr.Chatbot(height=520) + box = gr.Textbox(label="Prompt", lines=4) + btn = gr.Button("Generate") + clear = gr.Button("Clear") + btn.click(respond, [box, chat], [chat, box]) + box.submit(respond, [box, chat], [chat, box]) + clear.click(lambda: ([], ""), None, [chat, box]) + + demo.launch(server_name=cfg["server"].get("host", "127.0.0.1"), server_port=int(cfg["server"].get("port", 7861)), share=False, inbrowser=False) + + +if __name__ == "__main__": + main() +''' + (app_dir / "launch_portable_chat.py").write_text(launch, encoding="utf-8") + + +def copy_runtime_sources(portable_dir: Path) -> None: + src_root = PROJECT_ROOT / "src" + 
dst_root = portable_dir / "src" + needed = [ + "__init__.py", + "model_architecture/__init__.py", + "model_architecture/code_transformer.py", + "tokenizer/__init__.py", + "tokenizer/code_tokenizer.py", + "evaluation_system/__init__.py", + "evaluation_system/code_eval.py", + "inference_engine/__init__.py", + "inference_engine/inference_engine.py", + ] + for rel in needed: + src = src_root / rel + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + +def main() -> None: + args = parse_args() + cfg = load_yaml(PROJECT_ROOT / args.config) + + mcfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + + source_ckpt = PROJECT_ROOT / cfg["model"]["source_checkpoint_path"] + if not source_ckpt.exists(): + raise FileNotFoundError(f"Source checkpoint not found: {source_ckpt}") + + # Baseline model (GPU if available). + baseline_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + baseline = CodeTransformerLM(mcfg).to(baseline_device) + payload = torch.load(source_ckpt, map_location=baseline_device) + baseline.load_state_dict(payload["model_state"]) + if baseline_device.type == "cuda": + baseline.half() + + bench_prompt = str(cfg["benchmark"].get("prompt", "Write a Python function to add two numbers.")) + max_new_tokens = int(cfg["benchmark"].get("max_new_tokens", 120)) + + baseline_metrics = benchmark_tokens_per_sec( + model=baseline, + tokenizer=tokenizer, + prompt=bench_prompt, + max_new_tokens=max_new_tokens, + device=baseline_device, + ) + + # Quantize to INT8 on CPU and save separate file. 
+ quant_model = CodeTransformerLM(mcfg).cpu().float() + payload_cpu = torch.load(source_ckpt, map_location="cpu") + quant_model.load_state_dict(payload_cpu["model_state"]) + quant_model = torch.quantization.quantize_dynamic(quant_model, {nn.Linear}, dtype=torch.qint8) + + q_path = PROJECT_ROOT / cfg["quantization"]["quantized_output_path"] + q_path.parent.mkdir(parents=True, exist_ok=True) + torch.save(quant_model.state_dict(), q_path) + + quant_metrics = benchmark_tokens_per_sec( + model=quant_model, + tokenizer=tokenizer, + prompt=bench_prompt, + max_new_tokens=max_new_tokens, + device=torch.device("cpu"), + ) + + before_size_gb = bytes_to_gb(source_ckpt.stat().st_size) + after_size_gb = bytes_to_gb(q_path.stat().st_size) + + report = { + "source_checkpoint": str(source_ckpt), + "quantized_checkpoint": str(q_path), + "size_before_gb": before_size_gb, + "size_after_gb": after_size_gb, + "baseline_device": str(baseline_device), + "baseline_tokens_per_second": baseline_metrics["tokens_per_second"], + "quantized_tokens_per_second": quant_metrics["tokens_per_second"], + "baseline_peak_vram_gb": baseline_metrics["peak_vram_gb"], + "quantized_peak_vram_gb": quant_metrics["peak_vram_gb"], + "baseline_generated_tokens": baseline_metrics["generated_tokens"], + "quantized_generated_tokens": quant_metrics["generated_tokens"], + } + + report_path = PROJECT_ROOT / cfg["outputs"]["benchmark_report_json"] + report_path.parent.mkdir(parents=True, exist_ok=True) + report_path.write_text(json.dumps(report, indent=2), encoding="utf-8") + + # Build portable package folder. + portable_dir = PROJECT_ROOT / cfg["package"]["output_dir"] + if portable_dir.exists(): + shutil.rmtree(portable_dir) + portable_dir.mkdir(parents=True, exist_ok=True) + + # Copy model artifacts. 
+ (portable_dir / "model").mkdir(parents=True, exist_ok=True) + shutil.copy2(q_path, portable_dir / "model" / q_path.name) + + tok_src = PROJECT_ROOT / cfg["model"]["tokenizer_dir"] + tok_dst = portable_dir / "model" / "tokenizer" + shutil.copytree(tok_src, tok_dst) + + shutil.copy2(PROJECT_ROOT / cfg["model"]["model_config_path"], portable_dir / "app" / "model_config.yaml") if (portable_dir / "app").exists() else None + write_portable_chat_files(portable_dir, port=int(cfg["package"].get("app_port", 7861))) + shutil.copy2(PROJECT_ROOT / cfg["model"]["model_config_path"], portable_dir / "app" / "model_config.yaml") + + copy_runtime_sources(portable_dir) + write_portable_requirements(portable_dir) + write_portable_launcher(portable_dir) + + # Verify packaged run independently (self-test). + py = PROJECT_ROOT / ".venv" / "Scripts" / "python.exe" + if py.exists(): + import subprocess + + cmd = [str(py), str(portable_dir / "app" / "launch_portable_chat.py"), "--config", "app/portable_chat_config.yaml", "--self_test"] + proc = subprocess.run(cmd, cwd=str(portable_dir), capture_output=True, text=True, timeout=120) + verify_ok = (proc.returncode == 0) and ("portable_self_test_ok= True" in (proc.stdout + proc.stderr)) + else: + verify_ok = False + + print("Component 10 export completed.") + print(f"INT8 model saved: {q_path}") + print(f"Benchmark report: {report_path}") + print(f"Portable package: {portable_dir}") + print(f"Portable self-test ok: {verify_ok}") + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component3_dataset_pipeline.py b/scripts/run_component3_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..1f3bc2657d45b667bcfa79811ccf30b3dbf4c0bc --- /dev/null +++ b/scripts/run_component3_dataset_pipeline.py @@ -0,0 +1,126 @@ +""" +Component 3 runner script. + +Reads YAML config and executes full Hugging Face dataset preprocessing. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +import yaml + +# This makes "src" imports work when script is run from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.dataset_pipeline.hf_dataset_pipeline import ( # noqa: E402 + HFDatasetPipeline, + PipelineConfig, + SourceDatasetSpec, +) + + +def parse_args() -> argparse.Namespace: + # Parse command-line arguments for config and optional overrides. + parser = argparse.ArgumentParser(description="Run Component 3 dataset preprocessing pipeline.") + parser.add_argument( + "--config", + default="configs/component3_dataset_pipeline.yaml", + help="Path to YAML config file.", + ) + parser.add_argument( + "--max_records_per_dataset", + type=int, + default=None, + help="Optional override for quick test runs.", + ) + return parser.parse_args() + + +def _read_yaml(path: Path) -> Dict[str, Any]: + # Reads YAML file with friendly errors. + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Config file is invalid. Expected a YAML object at top level.") + return data + + +def _build_config(data: Dict[str, Any], max_records_override: int | None) -> PipelineConfig: + # Converts generic dict into strongly typed config objects. 
+ dataset_specs: List[SourceDatasetSpec] = [] + datasets_data = data.get("datasets", []) + if not isinstance(datasets_data, list) or not datasets_data: + raise ValueError("Config must include a non-empty 'datasets' list.") + + for item in datasets_data: + dataset_specs.append( + SourceDatasetSpec( + hf_dataset_id=str(item["hf_dataset_id"]), + split=str(item.get("split", "train")), + prompt_field=str(item["prompt_field"]), + code_field=str(item["code_field"]), + language_field=item.get("language_field"), + default_language=str(item.get("default_language", "python")), + ) + ) + + cfg = PipelineConfig( + datasets=dataset_specs, + tokenizer_dir=str(data["tokenizer_dir"]), + interim_output_dir=str(data["interim_output_dir"]), + processed_output_dir=str(data["processed_output_dir"]), + dedupe_db_path=str(data["dedupe_db_path"]), + max_records_per_dataset=data.get("max_records_per_dataset"), + min_prompt_chars=int(data.get("min_prompt_chars", 8)), + min_code_chars=int(data.get("min_code_chars", 16)), + max_code_chars=int(data.get("max_code_chars", 40_000)), + progress_every=int(data.get("progress_every", 1_000)), + ) + + if max_records_override is not None: + cfg.max_records_per_dataset = max_records_override + return cfg + + +def main() -> None: + # Main entry with explicit plain-English error handling. 
+ args = parse_args() + try: + config_path = Path(args.config) + data = _read_yaml(config_path) + cfg = _build_config(data, args.max_records_per_dataset) + pipeline = HFDatasetPipeline(cfg) + try: + stats = pipeline.run() + finally: + pipeline.close() + + print("Component 3 pipeline completed successfully.") + print("Saved files:") + print(f"- {Path(cfg.interim_output_dir) / 'combined_clean.jsonl'}") + print(f"- {Path(cfg.processed_output_dir) / 'train_tokenized.jsonl'}") + print(f"- {Path(cfg.processed_output_dir) / 'pipeline_stats.json'}") + print("Summary stats:") + print(json.dumps(stats, indent=2)) + except Exception as exc: + print("Component 3 pipeline failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: verify internet access for Hugging Face, tokenizer path, " + "and config field names." + ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component6_evaluation.py b/scripts/run_component6_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..5377ecadab9915920b62c42895c6ba14a8d35607 --- /dev/null +++ b/scripts/run_component6_evaluation.py @@ -0,0 +1,241 @@ +""" +Component 6: Evaluation system. + +- Computes validation loss for selected checkpoints. +- Generates code for 5 simple Python prompts. +- Performs syntax validity checks. +- Saves results JSON. +""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from pathlib import Path +from typing import Any, Dict, List + +import torch +import yaml +from torch.utils.data import DataLoader + +# Ensure src imports work. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.evaluation_system.code_eval import python_syntax_ok, restore_code_from_structured, save_json # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator, TokenizedJsonlDataset # noqa: E402 + + +PROMPTS = [ + "Write a Python function to check if a number is prime.", + "Write Python code to reverse a string without using slicing.", + "Create a Python function that returns Fibonacci numbers up to n.", + "Write Python code to count word frequency in a sentence.", + "Write a Python function to sort a list of dictionaries by a key.", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 6 evaluation.") + parser.add_argument("--config", default="configs/component6_evaluation_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML config.") + return data + + +def build_model_config(model_cfg_path: Path) -> ModelConfig: + cfg = load_yaml(model_cfg_path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset: {preset}") + merged = presets[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +@torch.no_grad() +def eval_val_loss(model: CodeTransformerLM, val_loader: DataLoader, device: torch.device, max_batches: int = 50) -> float: + model.eval() + losses = [] + 
for i, (input_ids, labels) in enumerate(val_loader): + if i >= max_batches: + break + input_ids = input_ids.to(device) + labels = labels.to(device) + with torch.amp.autocast("cuda", enabled=(device.type == "cuda"), dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + losses.append(float(out["loss"].item())) + model.train() + if not losses: + return 1e9 + return sum(losses) / len(losses) + + +@torch.no_grad() +def generate_code( + model: CodeTransformerLM, + tokenizer: CodeTokenizer, + prompt: str, + device: torch.device, + max_new_tokens: int, + temperature: float, + top_p: float, +) -> str: + model.eval() + prompt_text = tokenizer.format_training_sample(prompt=prompt, code="", language="python") + # Remove trailing empty code marker noise. + prompt_text = prompt_text.replace(" ", "").strip() + + ids = tokenizer.encode(prompt_text) + eos_id = tokenizer.special_token_ids.get("", None) + # Remove trailing EOS from prompt so generation continues naturally. + if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + input_ids = torch.tensor([ids], dtype=torch.long, device=device) + + for _ in range(max_new_tokens): + out = model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + + if temperature <= 0: + next_id = torch.argmax(logits, dim=-1, keepdim=True) + else: + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + # Top-p (nucleus) sampling. 
+ sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True) + sampled = torch.multinomial(sorted_probs, num_samples=1) + next_id = sorted_idx.gather(-1, sampled) + + input_ids = torch.cat([input_ids, next_id], dim=1) + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + decoded = tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + return code + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(Path(args.config)) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA is required for this evaluation run.") + + model_cfg = build_model_config(Path(cfg["model"]["model_config_path"])) + model_cfg.max_seq_len = int(cfg["inference"]["max_seq_len"]) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / "artifacts" / "tokenizer" / "code_tokenizer_v1")) + + val_ds = TokenizedJsonlDataset( + path=str(PROJECT_ROOT / cfg["data"]["tokenized_jsonl_path"]), + split="val", + val_ratio=float(cfg["data"].get("val_ratio", 0.02)), + split_seed=int(cfg["data"].get("split_seed", 17)), + ) + val_loader = DataLoader( + val_ds, + batch_size=1, + shuffle=False, + collate_fn=CausalCollator(pad_token_id=0, max_seq_len=model_cfg.max_seq_len), + ) + + ckpt_results: List[Dict[str, Any]] = [] + for ckpt_rel in cfg["model"]["checkpoint_paths"]: + ckpt_path = PROJECT_ROOT / ckpt_rel + if not ckpt_path.exists(): + raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}") + + model = CodeTransformerLM(model_cfg).to(device) + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + model.half() + + val_loss = eval_val_loss(model, val_loader, device=device, 
max_batches=50) + + generations = [] + for p in PROMPTS: + code = generate_code( + model=model, + tokenizer=tokenizer, + prompt=p, + device=device, + max_new_tokens=int(cfg["inference"].get("max_new_tokens", 160)), + temperature=float(cfg["inference"].get("temperature", 0.8)), + top_p=float(cfg["inference"].get("top_p", 0.9)), + ) + generations.append( + { + "prompt": p, + "generated_code": code, + "python_syntax_ok": python_syntax_ok(code), + } + ) + + ckpt_results.append( + { + "checkpoint": str(ckpt_path), + "step": int(payload.get("step", -1)), + "best_val_in_checkpoint": float(payload.get("best_val", math.nan)), + "eval_val_loss_now": float(val_loss), + "generations": generations, + } + ) + + # Basic fit flags from checkpoint trend. + fit_flag = "healthy" + if ckpt_results and ckpt_results[-1]["eval_val_loss_now"] > 1.5: + fit_flag = "underfitting" + + out = { + "fit_flag": fit_flag, + "checkpoints": ckpt_results, + "recommended_prompts": PROMPTS, + } + + out_path = str(PROJECT_ROOT / cfg["output"]["results_json"]) + save_json(out_path, out) + + print("Component 6 evaluation completed.") + print(f"Saved results: {out_path}") + print(f"Fit flag: {fit_flag}") + for row in ckpt_results: + print(f"Checkpoint step={row['step']} val_loss={row['eval_val_loss_now']:.4f}") + ok_count = sum(1 for g in row["generations"] if g["python_syntax_ok"]) + print(f"Python syntax valid in generated samples: {ok_count}/5") + + except Exception as exc: + print("Component 6 evaluation failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify checkpoint path and tokenizer path.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/run_component7_inference_benchmark.py b/scripts/run_component7_inference_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..4fe6b409d6c640f2eb1c9116b5aaa7cfe90f1a16 --- /dev/null +++ b/scripts/run_component7_inference_benchmark.py @@ -0,0 +1,145 @@ +""" +Run Component 7 
inference benchmark on the same 5 Python prompts. +Outputs before/after syntax-valid score. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml + +# Ensure imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.inference_engine.inference_engine import DecodingConfig, InferenceEngine # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + +PROMPTS = [ + "Write a Python function to check if a number is prime.", + "Write Python code to reverse a string without using slicing.", + "Create a Python function that returns Fibonacci numbers up to n.", + "Write Python code to count word frequency in a sentence.", + "Write a Python function to sort a list of dictionaries by a key.", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 7 inference benchmark.") + parser.add_argument("--config", default="configs/component7_inference_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML config.") + return data + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(Path(args.config)) + device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA is required for Component 7 benchmark.") + + model_cfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + model = CodeTransformerLM(model_cfg).to(device) + + ckpt_path = PROJECT_ROOT / cfg["model"]["checkpoint_path"] + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + model.half() + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + + dcfg = DecodingConfig( + max_new_tokens=int(cfg["inference"].get("max_new_tokens", 180)), + greedy_temperature=float(cfg["inference"].get("greedy_temperature", 0.0)), + retry2_temperature=float(cfg["inference"].get("retry2_temperature", 0.25)), + retry2_top_p=float(cfg["inference"].get("retry2_top_p", 0.85)), + retry3_temperature=float(cfg["inference"].get("retry3_temperature", 0.35)), + retry3_top_p=float(cfg["inference"].get("retry3_top_p", 0.90)), + max_retries=int(cfg["inference"].get("max_retries", 3)), + min_tokens_before_stop_check=int(cfg["inference"].get("min_tokens_before_stop_check", 24)), + ) + + engine = InferenceEngine(model=model, tokenizer=tokenizer, device=device) + + rows = [] + syntax_ok_count = 0 + for p in PROMPTS: + res = engine.generate_with_retry(prompt=p, language=str(cfg["inference"].get("language", "python")), cfg=dcfg) + final = res["final"] + syntax_ok = bool(final["syntax_ok"]) + syntax_ok_count += 1 if syntax_ok else 0 + rows.append( + { + "prompt": p, + "final_code": final["code"], + "syntax_ok": syntax_ok, + "attempt_used": final["attempt"], + "generated_tokens": final["generated_tokens"], + "attempts": res["attempts"], + } + ) + + before_score = None + before_path = PROJECT_ROOT / "artifacts" / "evaluation" / "component6_eval_results.json" + if before_path.exists(): + d = json.loads(before_path.read_text(encoding="utf-8")) + try: + before_score = sum(1 for x in 
d["checkpoints"][0]["generations"] if x["python_syntax_ok"]) + except Exception: + before_score = None + + out = { + "checkpoint": str(ckpt_path), + "step": int(payload.get("step", -1)), + "before_component6_syntax_ok_out_of_5": before_score, + "after_component7_syntax_ok_out_of_5": syntax_ok_count, + "prompts": rows, + } + + out_path = PROJECT_ROOT / cfg["output"]["results_json"] + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8") + + print("Component 7 inference benchmark completed.") + if before_score is not None: + print(f"Before (Component 6): {before_score}/5 syntax-valid") + print(f"After (Component 7): {syntax_ok_count}/5 syntax-valid") + print(f"Saved results: {out_path}") + + except Exception as exc: + print("Component 7 benchmark failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify checkpoint and tokenizer paths.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_component9_lora_finetune.py b/scripts/run_component9_lora_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d49fe76e7eac64a33b521784fdbe12970b348d --- /dev/null +++ b/scripts/run_component9_lora_finetune.py @@ -0,0 +1,246 @@ +""" +Component 9: LoRA fine-tuning pipeline for custom prompt->code pairs. +""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +import time +from pathlib import Path +from typing import Any, Dict, Tuple + +import torch +import yaml +from torch.optim import AdamW +from torch.utils.data import DataLoader, random_split +from tqdm import tqdm + +# Ensure src imports work. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.finetuning_system.custom_pair_dataset import CustomPairDataset # noqa: E402 +from src.finetuning_system.lora_adapter import LoRAConfig, apply_lora, load_lora_state_dict, lora_state_dict # noqa: E402 +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator # noqa: E402 +from src.tokenizer.code_tokenizer import CodeTokenizer # noqa: E402 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 9 LoRA fine-tuning.") + parser.add_argument("--config", default="configs/component9_lora_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8-sig")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def build_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + merged = get_model_presets()[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def get_vram_gb() -> float: + if not torch.cuda.is_available(): + return 0.0 + return torch.cuda.memory_allocated() / (1024**3) + + +def save_lora_ckpt(path: Path, step: int, lora_state: dict, optim_state: dict, best_val: float, no_improve: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "step": step, + "lora_state": lora_state, + "optimizer_state": optim_state, + "best_val": best_val, + "no_improve": no_improve, + } + torch.save(payload, path) + + +@torch.no_grad() +def eval_loss(model: CodeTransformerLM, loader: DataLoader, 
device: torch.device, use_fp16: bool) -> float: + model.eval() + vals = [] + for input_ids, labels in loader: + input_ids = input_ids.to(device) + labels = labels.to(device) + with torch.amp.autocast("cuda", enabled=(use_fp16 and device.type == "cuda"), dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + vals.append(float(out["loss"].item())) + model.train() + if not vals: + return 1e9 + return sum(vals) / len(vals) + + +def main() -> None: + args = parse_args() + try: + cfg = load_yaml(PROJECT_ROOT / args.config) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA GPU is required for LoRA fine-tuning.") + + model_cfg = build_model_config(PROJECT_ROOT / cfg["model"]["model_config_path"]) + model = CodeTransformerLM(model_cfg).to(device) + + base_ckpt = torch.load(PROJECT_ROOT / cfg["model"]["base_checkpoint_path"], map_location=device) + model.load_state_dict(base_ckpt["model_state"]) + + lcfg = LoRAConfig( + r=int(cfg["lora"].get("r", 8)), + alpha=int(cfg["lora"].get("alpha", 16)), + dropout=float(cfg["lora"].get("dropout", 0.05)), + target_keywords=list(cfg["lora"].get("target_keywords", ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"])), + ) + replaced = apply_lora(model, lcfg) + if not replaced: + raise RuntimeError("No modules were LoRA-wrapped. Check target_keywords.") + # LoRA modules are created on CPU by default, so move full model back to GPU. 
+ model = model.to(device) + + tokenizer = CodeTokenizer.load(str(PROJECT_ROOT / cfg["model"]["tokenizer_dir"])) + ds = CustomPairDataset( + path=str(PROJECT_ROOT / cfg["finetune"]["custom_data_path"]), + tokenizer=tokenizer, + max_seq_len=int(cfg["finetune"].get("max_seq_len", 512)), + ) + + n_val = max(1, int(0.1 * len(ds))) + n_train = len(ds) - n_val + train_ds, val_ds = random_split(ds, [n_train, n_val], generator=torch.Generator().manual_seed(17)) + + collator = CausalCollator(pad_token_id=0, max_seq_len=int(cfg["finetune"].get("max_seq_len", 512))) + train_loader = DataLoader(train_ds, batch_size=int(cfg["finetune"].get("micro_batch_size", 1)), shuffle=True, collate_fn=collator) + val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, collate_fn=collator) + + trainable = [p for p in model.parameters() if p.requires_grad] + optimizer = AdamW(trainable, lr=float(cfg["finetune"].get("learning_rate", 3e-4)), weight_decay=float(cfg["finetune"].get("weight_decay", 0.0))) + + use_fp16 = bool(cfg["finetune"].get("use_fp16", True)) + scaler = torch.amp.GradScaler("cuda", enabled=use_fp16) + + out_dir = PROJECT_ROOT / cfg["finetune"]["output_dir"] + out_dir.mkdir(parents=True, exist_ok=True) + + max_steps = int(cfg["finetune"].get("max_steps", 1200)) + save_every = int(cfg["finetune"].get("save_every", 100)) + eval_every = int(cfg["finetune"].get("eval_every", 100)) + grad_accum = int(cfg["finetune"].get("grad_accum_steps", 16)) + max_vram = float(cfg["finetune"].get("max_vram_gb", 7.0)) + patience = int(cfg["finetune"].get("early_stopping_patience_evals", 6)) + min_delta = float(cfg["finetune"].get("early_stopping_min_delta", 5e-4)) + + step = 0 + best_val = 1e9 + no_improve = 0 + + resume_from = str(cfg.get("resume", {}).get("resume_from", "none")) + if resume_from != "none": + ckpt = out_dir / "latest.pt" if resume_from == "latest" else Path(resume_from) + if ckpt.exists(): + payload = torch.load(ckpt, map_location=device) + load_lora_state_dict(model, 
payload["lora_state"]) + optimizer.load_state_dict(payload["optimizer_state"]) + step = int(payload.get("step", 0)) + best_val = float(payload.get("best_val", 1e9)) + no_improve = int(payload.get("no_improve", 0)) + print(f"[resume] loaded {ckpt} at step {step}") + + model.train() + pbar = tqdm(total=max_steps, initial=step, desc="lora_finetune", dynamic_ncols=True) + running = 0 + + while step < max_steps: + for input_ids, labels in train_loader: + if step >= max_steps: + break + input_ids = input_ids.to(device) + labels = labels.to(device) + + with torch.amp.autocast("cuda", enabled=use_fp16, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + loss = out["loss"] / grad_accum + + scaler.scale(loss).backward() + running += 1 + + if running % grad_accum == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + step += 1 + pbar.update(1) + pbar.set_postfix({"loss": f"{float(loss.item())*grad_accum:.4f}", "vram": f"{get_vram_gb():.2f}GB"}) + + if get_vram_gb() > max_vram: + raise RuntimeError(f"VRAM threshold exceeded: {get_vram_gb():.2f}GB > {max_vram:.2f}GB") + + if step % save_every == 0: + ck = out_dir / f"step_{step}.pt" + save_lora_ckpt(ck, step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + save_lora_ckpt(out_dir / "latest.pt", step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + print(f"\n[checkpoint] saved {ck}") + + if step % eval_every == 0: + val = eval_loss(model, val_loader, device, use_fp16=use_fp16) + print(f"\n[eval] step={step} val_loss={val:.4f} best={best_val:.4f}") + if val < (best_val - min_delta): + best_val = val + no_improve = 0 + save_lora_ckpt(out_dir / "best.pt", step, lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + else: + no_improve += 1 + if no_improve >= patience: + print("\n[early_stop] no improvement, stopping.") + step = max_steps + break + + pbar.close() + save_lora_ckpt(out_dir / "latest.pt", step, 
lora_state_dict(model), optimizer.state_dict(), best_val, no_improve) + + # Save metadata for adapter loading. + meta = { + "step": step, + "best_val": best_val, + "lora_config": { + "r": lcfg.r, + "alpha": lcfg.alpha, + "dropout": lcfg.dropout, + "target_keywords": lcfg.target_keywords, + }, + "base_checkpoint_path": cfg["model"]["base_checkpoint_path"], + } + (out_dir / "adapter_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8-sig") + + print("Component 9 LoRA fine-tuning completed.") + print(f"LoRA adapters saved in: {out_dir}") + + except Exception as exc: + print("Component 9 LoRA fine-tuning failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify custom data file format and checkpoint paths.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + + diff --git a/scripts/setup_windows_environment.ps1 b/scripts/setup_windows_environment.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..dcd5185619d94a329e260149eb4adf6d67af163a --- /dev/null +++ b/scripts/setup_windows_environment.ps1 @@ -0,0 +1,114 @@ +# This script sets up the entire local project environment on Windows. +# It creates folders, creates a virtual environment, upgrades pip tools, +# and installs all dependencies from requirements.txt. + +$ErrorActionPreference = "Stop" + +function Invoke-StepCommand { + param( + [string]$Description, + [scriptblock]$CommandBlock + ) + Write-Host $Description -ForegroundColor Cyan + & $CommandBlock + if ($LASTEXITCODE -ne 0) { + throw "Command failed during: $Description" + } +} + +try { + # This finds the project root based on this script location. + $scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path + $projectRoot = Split-Path -Parent $scriptDir + Set-Location $projectRoot + + Write-Host "Step 1/6: Creating project folder structure..." -ForegroundColor Cyan + + # These are all folders needed for Components 1-10. 
+ $folders = @( + "data", + "data/raw", + "data/interim", + "data/processed", + "data/external", + "src", + "src/tokenizer", + "src/dataset_pipeline", + "src/model_architecture", + "src/training_pipeline", + "src/evaluation_system", + "src/inference_engine", + "src/chat_interface", + "src/finetuning_system", + "src/export_optimization", + "configs", + "scripts", + "tests", + "checkpoints", + "models", + "models/base", + "models/lora", + "models/quantized", + "artifacts", + "logs" + ) + + foreach ($folder in $folders) { + if (-not (Test-Path $folder)) { + New-Item -ItemType Directory -Path $folder | Out-Null + } + } + + Write-Host "Step 2/6: Creating Python virtual environment..." -ForegroundColor Cyan + + # This checks Python version before environment creation. + # We require Python 3.10 or 3.11 for best Windows compatibility with ML packages. + $pyVersionRaw = python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" + if ($LASTEXITCODE -ne 0) { + throw "Python is not available in PATH. Install Python 3.11 and try again." + } + if (($pyVersionRaw -ne "3.10") -and ($pyVersionRaw -ne "3.11")) { + throw "Detected Python $pyVersionRaw. Please install Python 3.11 (recommended) or 3.10, then rerun this script." + } + + # This creates .venv only if it does not already exist. + if (-not (Test-Path ".venv")) { + python -m venv .venv + } else { + Write-Host "Virtual environment already exists. Reusing .venv." -ForegroundColor Yellow + } + + # This points to the venv Python executable on Windows. + $venvPython = Join-Path $projectRoot ".venv\Scripts\python.exe" + if (-not (Test-Path $venvPython)) { + throw "Could not find .venv Python at $venvPython. Please check Python installation." + } + + Invoke-StepCommand "Step 3/6: Upgrading pip, setuptools, and wheel..." { + & $venvPython -m pip install --upgrade pip setuptools wheel + } + + Invoke-StepCommand "Step 4/6: Installing CUDA-enabled PyTorch (cu121)..." 
{ + & $venvPython -m pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 + } + + Invoke-StepCommand "Step 5/6: Installing project dependencies from requirements.txt..." { + & $venvPython -m pip install -r requirements.txt + } + + Invoke-StepCommand "Step 6/6: Validating pip environment health..." { + & $venvPython -m pip check + } + + Write-Host "Setup complete." -ForegroundColor Green + Write-Host "Next command: .\.venv\Scripts\Activate.ps1" -ForegroundColor Green + Write-Host "Then run: python .\scripts\verify_component1_setup.py" -ForegroundColor Green +} +catch { + # This prints a clear plain-English error if anything breaks. + Write-Host "" + Write-Host "Setup failed." -ForegroundColor Red + Write-Host "What went wrong: $($_.Exception.Message)" -ForegroundColor Red + Write-Host "Fix suggestion: Check Python is installed and available in PATH, then run this script again." -ForegroundColor Yellow + exit 1 +} diff --git a/scripts/train_code_tokenizer.py b/scripts/train_code_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2e8dc15e5b27b0b008a1635c7e1f74675c39677a --- /dev/null +++ b/scripts/train_code_tokenizer.py @@ -0,0 +1,134 @@ +""" +Component 2 training script. + +This script trains the custom code tokenizer and saves it for reuse. +Supported input formats: +- .jsonl with fields: prompt, code, language +- .txt where each line is one raw sample +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Iterable, Iterator, List + +# This makes "src" imports work when script is run from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig + + +def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]: + """ + Streams JSONL rows as training text without loading full file into RAM. + """ + with file_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError: + continue + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + language = str(row.get("language", "python")).strip().lower() + if not prompt or not code: + continue + if language not in {"python", "javascript"}: + language = "python" + yield tokenizer.format_training_sample(prompt=prompt, code=code, language=language) + + +def stream_txt_samples(file_path: Path) -> Iterator[str]: + """ + Streams plain text file line by line. + """ + with file_path.open("r", encoding="utf-8") as f: + for line in f: + text = line.strip() + if text: + yield text + + +def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]: + """ + Creates one merged iterator from many files. + """ + def _generator() -> Iterator[str]: + for path in input_files: + suffix = path.suffix.lower() + if suffix == ".jsonl": + yield from stream_jsonl_samples(path, tokenizer) + elif suffix == ".txt": + yield from stream_txt_samples(path) + else: + print(f"[warning] Skipping unsupported file type: {path}") + + return _generator() + + +def parse_args() -> argparse.Namespace: + """ + Reads command-line settings for tokenizer training. 
+ """ + parser = argparse.ArgumentParser(description="Train custom Python/JavaScript code tokenizer.") + parser.add_argument( + "--input", + nargs="+", + required=True, + help="One or more input files (.jsonl or .txt).", + ) + parser.add_argument( + "--output_dir", + default="artifacts/tokenizer/code_tokenizer_v1", + help="Folder where tokenizer files will be saved.", + ) + parser.add_argument("--vocab_size", type=int, default=50_000, help="Tokenizer vocabulary size.") + parser.add_argument("--min_frequency", type=int, default=2, help="Minimum token frequency.") + parser.add_argument("--model_max_length", type=int, default=2048, help="Max token length hint.") + return parser.parse_args() + + +def main() -> None: + """ + Main training entry point with clear error messages. + """ + args = parse_args() + + try: + input_files = [Path(p) for p in args.input] + missing = [str(p) for p in input_files if not p.exists()] + if missing: + raise FileNotFoundError( + "Some input files do not exist:\n- " + "\n- ".join(missing) + ) + + config = CodeTokenizerConfig( + vocab_size=args.vocab_size, + min_frequency=args.min_frequency, + model_max_length=args.model_max_length, + ) + tokenizer = CodeTokenizer(config=config) + text_stream = build_stream(input_files=input_files, tokenizer=tokenizer) + tokenizer.train(text_stream) + tokenizer.save(args.output_dir) + + print("Tokenizer training completed successfully.") + print(f"Saved tokenizer to: {args.output_dir}") + print("Saved files: tokenizer.json, tokenizer_config.json") + except Exception as exc: + print("Tokenizer training failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: check file paths and file format, then run again.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/train_component5.py b/scripts/train_component5.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c5c73d7719194e744e31c591dfb9eea9c28ae8 --- /dev/null +++ b/scripts/train_component5.py 
@@ -0,0 +1,396 @@ +""" +Component 5: Training pipeline for the 420M code model. + +Features: +- FP16 mixed precision +- Gradient checkpointing +- Gradient accumulation +- 8-bit optimizer attempt with safe fallback +- Checkpoint save every N steps +- Resume from checkpoint +- Early stopping +- Live progress with loss, LR, ETA, VRAM +""" + +from __future__ import annotations + +import argparse +import json +import math +import os +import sys +import time +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import torch +import yaml +from torch.optim import AdamW +from torch.utils.data import DataLoader +from tqdm import tqdm + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets # noqa: E402 +from src.training_pipeline.tokenized_dataset import CausalCollator, TokenizedJsonlDataset # noqa: E402 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run Component 5 training.") + parser.add_argument("--config", default="configs/component5_training_config.yaml") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def load_model_config(path: Path) -> ModelConfig: + cfg = load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown model preset: {preset}") + base = presets[preset].__dict__.copy() + base.update(model_cfg) + return ModelConfig(**base) + return ModelConfig(**model_cfg) + + +def 
make_optimizer(model: torch.nn.Module, train_cfg: Dict[str, Any]) -> Tuple[torch.optim.Optimizer, str]: + lr = float(train_cfg["learning_rate"]) + wd = float(train_cfg["weight_decay"]) + betas = tuple(float(x) for x in train_cfg.get("betas", [0.9, 0.95])) + prefer_8bit = bool(train_cfg.get("prefer_8bit_adam", True)) + + if prefer_8bit: + try: + import bitsandbytes as bnb # type: ignore + + optimizer = bnb.optim.Adam8bit(model.parameters(), lr=lr, betas=betas, weight_decay=wd) + return optimizer, "Adam8bit" + except Exception: + pass + + optimizer = AdamW(model.parameters(), lr=lr, betas=betas, weight_decay=wd) + return optimizer, "AdamW" + + +def cosine_lr(base_lr: float, step: int, warmup_steps: int, max_steps: int, min_lr_ratio: float) -> float: + if step < warmup_steps: + return base_lr * (step / max(1, warmup_steps)) + progress = (step - warmup_steps) / max(1, max_steps - warmup_steps) + progress = min(1.0, max(0.0, progress)) + cosine = 0.5 * (1.0 + math.cos(math.pi * progress)) + min_lr = base_lr * min_lr_ratio + return min_lr + (base_lr - min_lr) * cosine + + +def set_optimizer_lr(optimizer: torch.optim.Optimizer, lr: float) -> None: + for pg in optimizer.param_groups: + pg["lr"] = lr + + +def get_vram_gb() -> float: + if not torch.cuda.is_available(): + return 0.0 + return torch.cuda.memory_allocated() / (1024**3) + + +def save_checkpoint( + ckpt_dir: Path, + step: int, + model: CodeTransformerLM, + optimizer: torch.optim.Optimizer, + scaler: Optional[torch.cuda.amp.GradScaler], + best_val: float, + no_improve_evals: int, + config: Dict[str, Any], +) -> Path: + ckpt_dir.mkdir(parents=True, exist_ok=True) + ckpt_path = ckpt_dir / f"step_{step}.pt" + payload = { + "step": step, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + "scaler_state": scaler.state_dict() if scaler is not None else None, + "best_val": best_val, + "no_improve_evals": no_improve_evals, + "config": config, + } + torch.save(payload, ckpt_path) + latest = 
ckpt_dir / "latest.pt" + torch.save(payload, latest) + return ckpt_path + + +def load_checkpoint( + ckpt_path: Path, + model: CodeTransformerLM, + optimizer: torch.optim.Optimizer, + scaler: Optional[torch.cuda.amp.GradScaler], + device: torch.device, +) -> Tuple[int, float, int]: + payload = torch.load(ckpt_path, map_location=device) + model.load_state_dict(payload["model_state"]) + optimizer.load_state_dict(payload["optimizer_state"]) + if scaler is not None and payload.get("scaler_state") is not None: + scaler.load_state_dict(payload["scaler_state"]) + step = int(payload.get("step", 0)) + best_val = float(payload.get("best_val", 1e9)) + no_improve = int(payload.get("no_improve_evals", 0)) + return step, best_val, no_improve + + +@torch.no_grad() +def evaluate_loss( + model: CodeTransformerLM, + val_loader: DataLoader, + device: torch.device, + use_fp16: bool, + max_batches: int = 50, +) -> float: + model.eval() + losses = [] + amp_enabled = use_fp16 and device.type == "cuda" + for i, (input_ids, labels) in enumerate(val_loader): + if i >= max_batches: + break + input_ids = input_ids.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) + with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + losses.append(float(out["loss"].item())) + model.train() + if not losses: + return 1e9 + return sum(losses) / len(losses) + + +def train() -> None: + args = parse_args() + cfg = load_yaml(Path(args.config)) + train_cfg = cfg["training"] + data_cfg = cfg["data"] + resume_cfg = cfg.get("resume", {}) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + raise RuntimeError("CUDA GPU is required for this training setup.") + + model_cfg = load_model_config(Path(cfg["model"]["model_config_path"])) + model_cfg.max_seq_len = int(train_cfg["max_seq_len"]) + model_cfg.gradient_checkpointing = bool(train_cfg.get("use_gradient_checkpointing", 
True)) + + model = CodeTransformerLM(model_cfg) + model.enable_gradient_checkpointing(model_cfg.gradient_checkpointing) + model = model.to(device) + + use_fp16 = bool(train_cfg.get("use_fp16", True)) + scaler = torch.amp.GradScaler("cuda", enabled=use_fp16) + + optimizer, optimizer_name = make_optimizer(model, train_cfg) + + tokenized_path = str(data_cfg["tokenized_jsonl_path"]) + train_ds = TokenizedJsonlDataset( + path=tokenized_path, + split="train", + val_ratio=float(data_cfg.get("val_ratio", 0.02)), + split_seed=int(data_cfg.get("split_seed", 17)), + ) + val_ds = TokenizedJsonlDataset( + path=tokenized_path, + split="val", + val_ratio=float(data_cfg.get("val_ratio", 0.02)), + split_seed=int(data_cfg.get("split_seed", 17)), + ) + + collator = CausalCollator(pad_token_id=0, max_seq_len=int(train_cfg["max_seq_len"])) + train_loader = DataLoader( + train_ds, + batch_size=int(train_cfg["micro_batch_size"]), + shuffle=True, + num_workers=int(data_cfg.get("num_workers", 0)), + pin_memory=True, + collate_fn=collator, + ) + val_loader = DataLoader( + val_ds, + batch_size=int(train_cfg["micro_batch_size"]), + shuffle=False, + num_workers=0, + pin_memory=True, + collate_fn=collator, + ) + + out_dir = Path(train_cfg["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + + global_step = 0 + best_val = 1e9 + no_improve = 0 + + resume_from = str(resume_cfg.get("resume_from", "none")).strip().lower() + if resume_from != "none": + if resume_from == "latest": + ckpt_path = out_dir / "latest.pt" + else: + ckpt_path = Path(resume_cfg["resume_from"]) + if ckpt_path.exists(): + global_step, best_val, no_improve = load_checkpoint( + ckpt_path=ckpt_path, + model=model, + optimizer=optimizer, + scaler=scaler, + device=device, + ) + print(f"[resume] loaded checkpoint {ckpt_path} at step {global_step}") + else: + print(f"[resume] checkpoint not found, starting fresh: {ckpt_path}") + + max_steps = int(train_cfg["max_steps"]) + grad_accum = int(train_cfg["grad_accum_steps"]) + 
log_every = int(train_cfg["log_every"]) + eval_every = int(train_cfg["eval_every"]) + save_every = int(train_cfg["save_every"]) + warmup_steps = int(train_cfg["warmup_steps"]) + min_lr_ratio = float(train_cfg["min_lr_ratio"]) + grad_clip = float(train_cfg["grad_clip_norm"]) + max_vram_gb = float(train_cfg.get("max_vram_gb", 7.0)) + patience = int(train_cfg.get("early_stopping_patience_evals", 20)) + min_delta = float(train_cfg.get("early_stopping_min_delta", 5e-4)) + base_lr = float(train_cfg["learning_rate"]) + + model.train() + start_time = time.time() + running_loss = 0.0 + running_count = 0 + + pbar = tqdm(total=max_steps, initial=global_step, desc="train", dynamic_ncols=True) + + while global_step < max_steps: + for input_ids, labels in train_loader: + if global_step >= max_steps: + break + + current_lr = cosine_lr(base_lr, global_step, warmup_steps, max_steps, min_lr_ratio) + set_optimizer_lr(optimizer, current_lr) + + input_ids = input_ids.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) + + amp_enabled = use_fp16 and device.type == "cuda" + with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=torch.float16): + out = model(input_ids=input_ids, labels=labels) + loss = out["loss"] / grad_accum + + scaler.scale(loss).backward() + + running_loss += float(loss.item()) * grad_accum + running_count += 1 + + if running_count % grad_accum == 0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + + global_step += 1 + pbar.update(1) + + elapsed = time.time() - start_time + steps_done = max(1, global_step) + steps_left = max(0, max_steps - global_step) + eta_sec = (elapsed / steps_done) * steps_left + avg_loss = running_loss / max(1, running_count) + vram = get_vram_gb() + + if vram > max_vram_gb: + raise RuntimeError( + f"VRAM safety threshold exceeded: {vram:.2f} GB > {max_vram_gb:.2f} GB. 
" + "Reduce max_seq_len or grad_accum/micro_batch settings." + ) + + if global_step % log_every == 0: + pbar.set_postfix( + { + "loss": f"{avg_loss:.4f}", + "lr": f"{current_lr:.2e}", + "vram_gb": f"{vram:.2f}", + "eta_min": f"{eta_sec/60.0:.1f}", + } + ) + + if global_step % save_every == 0: + ckpt_path = save_checkpoint( + ckpt_dir=out_dir, + step=global_step, + model=model, + optimizer=optimizer, + scaler=scaler, + best_val=best_val, + no_improve_evals=no_improve, + config=cfg, + ) + print(f"\n[checkpoint] saved {ckpt_path}") + + if global_step % eval_every == 0: + val_loss = evaluate_loss(model, val_loader, device, use_fp16=use_fp16) + print(f"\n[eval] step={global_step} val_loss={val_loss:.4f} best={best_val:.4f}") + if val_loss < (best_val - min_delta): + best_val = val_loss + no_improve = 0 + else: + no_improve += 1 + if no_improve >= patience: + print( + f"\n[early_stop] no improvement for {no_improve} evals " + f"(patience={patience}). Stopping training." + ) + global_step = max_steps + break + + pbar.close() + final_ckpt = save_checkpoint( + ckpt_dir=out_dir, + step=global_step, + model=model, + optimizer=optimizer, + scaler=scaler, + best_val=best_val, + no_improve_evals=no_improve, + config=cfg, + ) + print("Training completed.") + print(f"Optimizer used: {optimizer_name}") + print(f"Final checkpoint: {final_ckpt}") + + +def main() -> None: + try: + train() + except Exception as exc: + print("Component 5 training failed.") + print(f"What went wrong: {exc}") + print( + "Fix suggestion: lower max_seq_len, keep micro_batch_size=1, " + "increase grad_accum_steps, and verify checkpoint/output paths." 
+ ) + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/verify_component1_setup.py b/scripts/verify_component1_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..65dc23cf3ece795f9c9a4d764f9f7185fd8da706 --- /dev/null +++ b/scripts/verify_component1_setup.py @@ -0,0 +1,112 @@ +""" +This script verifies Component 1 setup in plain English. +It checks Python, key packages, and GPU visibility for PyTorch. +""" + +import importlib +import importlib.util +import platform +import sys +from typing import List + + +def check_imports(packages: List[str]) -> List[str]: + """ + Tries importing required packages. + Returns a list of package names that failed to import. + """ + failed = [] + for package in packages: + try: + importlib.import_module(package) + except Exception: + failed.append(package) + return failed + + +def check_optional_installed(packages: List[str]) -> List[str]: + """ + Checks whether optional packages exist without importing them. + Returns packages that are missing. + """ + missing = [] + for package in packages: + if importlib.util.find_spec(package) is None: + missing.append(package) + return missing + + +def main() -> None: + print("=== Component 1 Verification ===") + print(f"Python version: {sys.version.split()[0]}") + print(f"Operating system: {platform.system()} {platform.release()}") + + # Python 3.10/3.11 is the target for best compatibility on Windows. + if sys.version_info.major != 3 or sys.version_info.minor not in (10, 11): + print("") + print("Verification failed.") + print("This project currently requires Python 3.10 or 3.11 on Windows.") + print("Fix suggestion: install Python 3.11, recreate .venv, and reinstall requirements.") + raise SystemExit(1) + + # These are required for Component 1 success. 
+ required = [ + "torch", + "transformers", + "tokenizers", + "datasets", + "accelerate", + "gradio", + "tree_sitter", + ] + + failed = check_imports(required) + if failed: + print("") + print("Verification failed.") + print("The following packages could not be imported:") + for package in failed: + print(f"- {package}") + print("") + print("Fix suggestion: activate .venv and run 'pip install -r requirements.txt' again.") + raise SystemExit(1) + + # Optional imports should not fail Component 1. + optional = ["bitsandbytes"] + optional_failed = check_optional_installed(optional) + + import torch + + print(f"PyTorch version: {torch.__version__}") + cuda_available = torch.cuda.is_available() + print(f"CUDA available: {cuda_available}") + + if cuda_available: + gpu_name = torch.cuda.get_device_name(0) + total_vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3) + print(f"Detected GPU: {gpu_name}") + print(f"Total VRAM: {total_vram_gb:.2f} GB") + else: + print("No CUDA GPU was detected by PyTorch.") + print("You can still continue, but training speed will be much slower.") + + if optional_failed: + print("") + print("Optional package warning:") + print("- bitsandbytes is not available in this environment.") + print("This does not block Component 1.") + print("For Component 5 on native Windows, we will use an automatic fallback optimizer if needed.") + + print("") + print("Component 1 is verified successfully.") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("") + print("Verification script crashed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: ensure .venv is active and dependencies are installed.") + raise SystemExit(1) diff --git a/scripts/verify_component2_tokenizer.py b/scripts/verify_component2_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..281e981f7d3ed72be1c360a416beda92c99d240c --- /dev/null +++ b/scripts/verify_component2_tokenizer.py @@ -0,0 +1,67 @@ +""" 
+Component 2 verification script. + +This script: +1) Trains tokenizer on a tiny sample file. +2) Saves tokenizer. +3) Loads tokenizer back. +4) Encodes and decodes a sample. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# This makes "src" imports work when script is run from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +def main() -> None: + sample_file = Path("data/external/component2_tokenizer_sample.jsonl") + output_dir = Path("artifacts/tokenizer/code_tokenizer_v1") + + if not sample_file.exists(): + print("Verification failed.") + print(f"Missing sample file: {sample_file}") + print("Fix suggestion: ensure Component 2 sample file exists and run again.") + raise SystemExit(1) + + # Train tokenizer from sample file via script-like path. + from scripts.train_code_tokenizer import stream_jsonl_samples # local import on purpose + + tokenizer = CodeTokenizer() + tokenizer.train(stream_jsonl_samples(sample_file, tokenizer)) + tokenizer.save(str(output_dir)) + + loaded = CodeTokenizer.load(str(output_dir)) + sample = loaded.format_training_sample( + prompt="Write Python function that squares a number.", + code="def square(x):\n return x * x", + language="python", + ) + token_ids = loaded.encode(sample) + decoded = loaded.decode(token_ids) + + print("=== Component 2 Verification ===") + print(f"Tokenizer saved to: {output_dir}") + print(f"Encoded token count: {len(token_ids)}") + print("First 25 token IDs:", token_ids[:25]) + print("Decoded preview:") + print(decoded[:300]) + print("") + print("Component 2 tokenizer verification passed.") + + +if __name__ == "__main__": + try: + main() + except Exception as exc: + print("Verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: activate .venv and rerun this script.") + raise SystemExit(1) diff 
--git a/scripts/verify_component3_dataset_pipeline.py b/scripts/verify_component3_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..30bf56d3ac0fd4e2f0a7198175ec892990c437c9 --- /dev/null +++ b/scripts/verify_component3_dataset_pipeline.py @@ -0,0 +1,41 @@ +""" +Component 3 verification script. + +Runs a small pipeline pass to confirm: +- HF loading works. +- Cleaning + dedupe logic works. +- Tokenized output files are created. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# This makes script imports stable from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.run_component3_dataset_pipeline import main as run_pipeline_main # noqa: E402 + + +if __name__ == "__main__": + try: + # We call the main runner with a small override by mutating argv. + sys.argv = [ + "verify_component3_dataset_pipeline.py", + "--config", + "configs/component3_dataset_pipeline.yaml", + "--max_records_per_dataset", + "200", + ] + run_pipeline_main() + print("") + print("Component 3 verification passed.") + except Exception as exc: + print("Component 3 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify internet access and that Component 2 tokenizer exists.") + raise SystemExit(1) + diff --git a/scripts/verify_component4_model.py b/scripts/verify_component4_model.py new file mode 100644 index 0000000000000000000000000000000000000000..370b97a0dc16edfb8001fe4c39ce6bdebe0be366 --- /dev/null +++ b/scripts/verify_component4_model.py @@ -0,0 +1,138 @@ +""" +Component 4 verification script. + +This script: +- Builds model from config. +- Runs a small forward pass. +- Prints live VRAM usage at each stage. 
+""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Any, Dict + +import torch +import yaml + +# Ensure src imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from src.model_architecture.code_transformer import ( # noqa: E402 + CodeTransformerLM, + ModelConfig, + get_model_presets, +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Verify Component 4 model load and VRAM usage.") + parser.add_argument( + "--config", + default="configs/component4_model_config.yaml", + help="Path to model YAML config.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size for forward test.") + parser.add_argument("--seq_len", type=int, default=256, help="Sequence length for forward test.") + return parser.parse_args() + + +def load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Model config not found: {path}") + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format in model config.") + return data + + +def build_config(cfg_data: Dict[str, Any]) -> ModelConfig: + preset = cfg_data.get("preset") + model_cfg = cfg_data.get("model", {}) + if not isinstance(model_cfg, dict): + raise ValueError("Config key 'model' must be an object.") + + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset '{preset}'.") + base = presets[preset] + merged = base.__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def gpu_memory_report(stage: str) -> None: + if not torch.cuda.is_available(): + print(f"[{stage}] CUDA not available") + return + allocated = torch.cuda.memory_allocated() / (1024**3) + reserved = 
torch.cuda.memory_reserved() / (1024**3) + max_alloc = torch.cuda.max_memory_allocated() / (1024**3) + print( + f"[{stage}] VRAM allocated={allocated:.2f} GB " + f"reserved={reserved:.2f} GB max_allocated={max_alloc:.2f} GB" + ) + + +def main() -> None: + args = parse_args() + try: + cfg_data = load_yaml(Path(args.config)) + model_cfg = build_config(cfg_data) + if args.seq_len > model_cfg.max_seq_len: + raise ValueError( + f"seq_len={args.seq_len} exceeds max_seq_len={model_cfg.max_seq_len} in config." + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + use_fp16 = device.type == "cuda" + if device.type == "cuda": + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + print(f"Detected GPU: {torch.cuda.get_device_name(0)}") + gpu_memory_report("start") + else: + print("CUDA not available. Running verification on CPU.") + + model = CodeTransformerLM(model_cfg) + print(f"Model parameters: {model.estimate_num_parameters():,}") + + if use_fp16: + model = model.half() + model.to(device) + model.eval() + gpu_memory_report("after_model_load") + + input_ids = torch.randint( + low=0, + high=model_cfg.vocab_size, + size=(args.batch_size, args.seq_len), + dtype=torch.long, + device=device, + ) + gpu_memory_report("after_input_alloc") + + with torch.no_grad(): + out = model(input_ids=input_ids) + logits = out["logits"] + gpu_memory_report("after_forward") + + print(f"Forward output shape: {tuple(logits.shape)}") + print("Component 4 verification passed.") + except Exception as exc: + print("Component 4 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: reduce seq_len or check CUDA/PyTorch installation.") + raise SystemExit(1) + + +if __name__ == "__main__": + main() + diff --git a/scripts/verify_component5_training_pipeline.py b/scripts/verify_component5_training_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..cc73f5536acff86b06de3b376bd0cda7c42d9278 --- /dev/null +++ 
b/scripts/verify_component5_training_pipeline.py @@ -0,0 +1,46 @@ +""" +Quick verification for Component 5 training pipeline. +Runs a tiny 5-step training smoke test. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure script imports work from project root. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.train_component5 import main as train_main # noqa: E402 + + +if __name__ == "__main__": + try: + # Override CLI args for a tiny smoke test. + sys.argv = ["verify_component5_training_pipeline.py", "--config", "configs/component5_training_config.yaml"] + + # Patch config on disk for very short run. + import yaml + + cfg_path = Path("configs/component5_training_config.yaml") + cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) + cfg["training"]["max_steps"] = 5 + cfg["training"]["save_every"] = 5 + cfg["training"]["eval_every"] = 5 + cfg["training"]["log_every"] = 1 + cfg["resume"]["resume_from"] = "none" + + tmp_cfg = Path("configs/component5_training_config.verify.yaml") + tmp_cfg.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8") + + sys.argv = ["verify_component5_training_pipeline.py", "--config", str(tmp_cfg)] + train_main() + print("") + print("Component 5 verification passed.") + except Exception as exc: + print("Component 5 verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: ensure CUDA is available and tokenized dataset path is correct.") + raise SystemExit(1) diff --git a/scripts/verify_component9_lora.py b/scripts/verify_component9_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3fbf1266ff20bfaeb8de35f89338434f3ae74b --- /dev/null +++ b/scripts/verify_component9_lora.py @@ -0,0 +1,34 @@ +""" +Quick verification for Component 9 LoRA pipeline. +Runs a tiny 5-step smoke fine-tune. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import yaml + +# Ensure imports work. +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.run_component9_lora_finetune import main as lora_main # noqa: E402 + + +if __name__ == "__main__": + cfg_path = PROJECT_ROOT / "configs" / "component9_lora_config.yaml" + cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8-sig")) + cfg["finetune"]["max_steps"] = 5 + cfg["finetune"]["save_every"] = 5 + cfg["finetune"]["eval_every"] = 5 + cfg["resume"]["resume_from"] = "none" + tmp = PROJECT_ROOT / "configs" / "component9_lora_config.verify.yaml" + tmp.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8-sig") + + sys.argv = ["verify_component9_lora.py", "--config", str(tmp)] + lora_main() + print("\nComponent 9 verification passed.") + diff --git a/scripts/verify_incremental_javascript_merge.py b/scripts/verify_incremental_javascript_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..e0f7bac25e67f97539afe0779b2106104cd8ef65 --- /dev/null +++ b/scripts/verify_incremental_javascript_merge.py @@ -0,0 +1,37 @@ +""" +Quick verification for incremental JavaScript merge script. + +This performs a small run with a low JS target so you can validate logic fast. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure imports work when executed from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.add_incremental_javascript_dataset import main as incremental_main # noqa: E402 + + +if __name__ == "__main__": + try: + sys.argv = [ + "verify_incremental_javascript_merge.py", + "--config", + "configs/component3_incremental_js.yaml", + "--target_new_javascript_examples", + "100", + ] + incremental_main() + print("") + print("Incremental JS merge verification passed.") + except Exception as exc: + print("Incremental JS merge verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify dataset accessibility and rerun.") + raise SystemExit(1) + diff --git a/scripts/verify_reprocess_tokenized_from_clean.py b/scripts/verify_reprocess_tokenized_from_clean.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd12ed1e029691e75f76e3f4d8efc5ff01f7369 --- /dev/null +++ b/scripts/verify_reprocess_tokenized_from_clean.py @@ -0,0 +1,35 @@ +""" +Quick verification for reprocess_tokenized_from_clean.py. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure imports work from project root. 
+PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.reprocess_tokenized_from_clean import main as reprocess_main # noqa: E402 + + +if __name__ == "__main__": + try: + sys.argv = [ + "verify_reprocess_tokenized_from_clean.py", + "--config", + "configs/component3_reprocess_from_clean.yaml", + "--max_records", + "500", + ] + reprocess_main() + print("") + print("Reprocess verification passed.") + except Exception as exc: + print("Reprocess verification failed.") + print(f"What went wrong: {exc}") + print("Fix suggestion: verify input clean file and tokenizer path.") + raise SystemExit(1) + diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45aa8dbeca626dc18818fbfe98c473d01ebd3aff --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,2 @@ +# This file marks src as a Python package. + diff --git a/src/chat_interface/__init__.py b/src/chat_interface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccac5c513223e9eab9f38019d3010f92fbff3d8 --- /dev/null +++ b/src/chat_interface/__init__.py @@ -0,0 +1 @@ +# This file marks chat_interface as a Python package. diff --git a/src/chat_interface/gradio_chat_app.py b/src/chat_interface/gradio_chat_app.py new file mode 100644 index 0000000000000000000000000000000000000000..6e415e3d3d0e5fd22ae1fc32f6e88300efd89acb --- /dev/null +++ b/src/chat_interface/gradio_chat_app.py @@ -0,0 +1,370 @@ +""" +Component 8: Local chat interface using Gradio. + +- Clean dark-themed UI. +- Prompt input box. +- Syntax-highlighted code output (Python + JavaScript). +- Copy button for each code response. +- Generation time + token count. +- Conversation history in session. +- Clear button to reset history. +- Live model selector: Base / LoRA / INT8 (no restart). 
+""" + +from __future__ import annotations + +import html +import re +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import gradio as gr +import torch +import torch.nn as nn +import yaml +from pygments import highlight +from pygments.formatters import HtmlFormatter +from pygments.lexers import JavascriptLexer, PythonLexer, TextLexer + +from src.finetuning_system.lora_adapter import LoRAConfig, apply_lora, load_lora_state_dict +from src.inference_engine.inference_engine import DecodingConfig, InferenceEngine +from src.model_architecture.code_transformer import CodeTransformerLM, ModelConfig, get_model_presets +from src.tokenizer.code_tokenizer import CodeTokenizer + + +def _load_yaml(path: Path) -> Dict[str, Any]: + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + data = yaml.safe_load(path.read_text(encoding="utf-8-sig")) + if not isinstance(data, dict): + raise ValueError("Invalid YAML format.") + return data + + +def _build_model_config(path: Path) -> ModelConfig: + cfg = _load_yaml(path) + preset = cfg.get("preset") + model_cfg = cfg.get("model", {}) + if preset: + presets = get_model_presets() + if preset not in presets: + raise ValueError(f"Unknown preset: {preset}") + merged = presets[preset].__dict__.copy() + merged.update(model_cfg) + return ModelConfig(**merged) + return ModelConfig(**model_cfg) + + +def _guess_language(prompt: str, default_lang: str = "python") -> str: + p = prompt.lower() + if "javascript" in p or " js " in f" {p} " or "node" in p: + return "javascript" + if "python" in p: + return "python" + return default_lang + + +def _is_coding_prompt(prompt: str) -> bool: + p = prompt.lower().strip() + coding_keywords = [ + "code", + "python", + "javascript", + "function", + "bug", + "error", + "algorithm", + "sort", + "loop", + "class", + "api", + "sql", + "regex", + "debug", + "implement", + "write", + ] + if any(k in p for k in coding_keywords): + return True + if 
re.fullmatch(r"(hi|hello|hey|yo|hola)[!. ]*", p): + return False + return False + + +def _highlight_code(code: str, language: str) -> str: + code = code or "" + if language == "javascript": + lexer = JavascriptLexer() + elif language == "python": + lexer = PythonLexer() + else: + lexer = TextLexer() + formatter = HtmlFormatter(nowrap=True) + return highlight(code, lexer, formatter) + + +def _render_history(history: List[Dict[str, Any]]) -> str: + formatter = HtmlFormatter(style="monokai") + css = formatter.get_style_defs(".codehilite") + blocks = [ + "", + """ + + """, + '
', + ] + + if not history: + blocks.append('
No messages yet. Ask a coding question to begin.
') + + for i, item in enumerate(history, start=1): + lang = item.get("language", "python") + prompt = html.escape(str(item.get("prompt", ""))) + highlighted = _highlight_code(str(item.get("code", "")), lang) + code_id = f"code-{i}" + syntax_ok = "yes" if item.get("syntax_ok", False) else "n/a" + mode = item.get("mode", "base") + blocks.append('
') + blocks.append(f'
User: {prompt}
') + blocks.append(f'
Assistant ({lang})
') + blocks.append(f'') + blocks.append('
') + blocks.append('
') + blocks.append(f'
{highlighted}
') + blocks.append('
') + blocks.append( + f'
mode={mode} | time={item.get("time_sec", 0):.2f}s | ' + f'tokens={item.get("tokens", 0)} | syntax_ok={syntax_ok} | ' + f'attempt={item.get("attempt", 1)}
' + ) + blocks.append('
') + + blocks.append('
') + return "\n".join(blocks) + + +class ChatRuntime: + def __init__(self, config_path: str) -> None: + self.project_root = Path(__file__).resolve().parents[2] + self.cfg = _load_yaml(self.project_root / config_path) + + self.model_cfg = _build_model_config(self.project_root / self.cfg["model"]["model_config_path"]) + self.cuda_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if self.cuda_device.type != "cuda": + raise RuntimeError("CUDA GPU is required for this chat interface setup.") + + self.tokenizer = CodeTokenizer.load(str(self.project_root / self.cfg["model"]["tokenizer_dir"])) + + self.decode_cfg = DecodingConfig( + max_new_tokens=int(self.cfg["inference"].get("max_new_tokens", 300)), + greedy_temperature=float(self.cfg["inference"].get("greedy_temperature", 0.0)), + retry2_temperature=float(self.cfg["inference"].get("retry2_temperature", 0.25)), + retry2_top_p=float(self.cfg["inference"].get("retry2_top_p", 0.85)), + retry3_temperature=float(self.cfg["inference"].get("retry3_temperature", 0.35)), + retry3_top_p=float(self.cfg["inference"].get("retry3_top_p", 0.90)), + max_retries=int(self.cfg["inference"].get("max_retries", 3)), + min_tokens_before_stop_check=int(self.cfg["inference"].get("min_tokens_before_stop_check", 64)), + ) + + self.current_mode: Optional[str] = None + self.engine: Optional[InferenceEngine] = None + + def _release_current(self) -> None: + self.engine = None + self.current_mode = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def _current_vram_gb(self) -> float: + if not torch.cuda.is_available(): + return 0.0 + return float(torch.cuda.memory_allocated() / (1024**3)) + + def _status_text(self, mode: str, load_sec: float) -> str: + return f"MINDI 1.0 420M | mode={mode} | load={load_sec:.2f}s | vram={self._current_vram_gb():.2f}GB" + + def _load_base_model(self) -> InferenceEngine: + model = CodeTransformerLM(self.model_cfg).to(self.cuda_device) + payload = torch.load(self.project_root / 
self.cfg["model"]["base_checkpoint_path"], map_location=self.cuda_device) + model.load_state_dict(payload["model_state"]) + model.half() + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=self.cuda_device) + + def _load_lora_model(self) -> InferenceEngine: + model = CodeTransformerLM(self.model_cfg).to(self.cuda_device) + payload = torch.load(self.project_root / self.cfg["model"]["base_checkpoint_path"], map_location=self.cuda_device) + model.load_state_dict(payload["model_state"]) + + lora_cfg = LoRAConfig( + r=int(self.cfg.get("lora", {}).get("r", 8)), + alpha=int(self.cfg.get("lora", {}).get("alpha", 16)), + dropout=float(self.cfg.get("lora", {}).get("dropout", 0.05)), + target_keywords=list(self.cfg.get("lora", {}).get("target_keywords", ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"])), + ) + apply_lora(model, lora_cfg) + model = model.to(self.cuda_device) + + lora_payload = torch.load(self.project_root / self.cfg["model"]["lora_adapter_path"], map_location=self.cuda_device) + lora_state = lora_payload.get("lora_state", lora_payload) + load_lora_state_dict(model, lora_state) + model.half() + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=self.cuda_device) + + def _load_int8_model(self) -> InferenceEngine: + cpu = torch.device("cpu") + model = CodeTransformerLM(self.model_cfg).to(cpu).float() + model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8) + q_state = torch.load(self.project_root / self.cfg["model"]["quantized_state_path"], map_location=cpu) + model.load_state_dict(q_state) + return InferenceEngine(model=model, tokenizer=self.tokenizer, device=cpu) + + def _ensure_mode(self, mode: str) -> str: + mode = (mode or "base").lower().strip() + if mode not in {"base", "lora", "int8"}: + mode = "base" + + if self.current_mode == mode and self.engine is not None: + return self._status_text(mode, load_sec=0.0) + + t0 = time.perf_counter() + self._release_current() + if mode == "base": + 
self.engine = self._load_base_model() + elif mode == "lora": + self.engine = self._load_lora_model() + else: + self.engine = self._load_int8_model() + + self.current_mode = mode + load_sec = time.perf_counter() - t0 + return self._status_text(mode, load_sec=load_sec) + + def switch_mode(self, mode: str) -> str: + return self._ensure_mode(mode) + + def respond(self, prompt: str, history: List[Dict[str, Any]], mode: str) -> Tuple[str, List[Dict[str, Any]], str, str]: + prompt = (prompt or "").strip() + if not prompt: + status = self._ensure_mode(mode) + return _render_history(history), history, "", status + + status = self._ensure_mode(mode) + + if not _is_coding_prompt(prompt): + fallback = "Please ask a coding question (for example: 'Write a Python function to ...' or 'Fix this JavaScript bug ...')." + history.append( + { + "prompt": prompt, + "code": fallback, + "language": "text", + "tokens": 0, + "time_sec": 0.0, + "syntax_ok": None, + "attempt": 0, + "mode": self.current_mode or "base", + } + ) + return _render_history(history), history, "", status + + lang_default = str(self.cfg["inference"].get("language_default", "python")) + language = _guess_language(prompt, default_lang=lang_default) + + start = time.perf_counter() + result = self.engine.generate_with_retry(prompt=prompt, language=language, cfg=self.decode_cfg) # type: ignore[union-attr] + elapsed = time.perf_counter() - start + + final = result["final"] + history.append( + { + "prompt": prompt, + "code": final["code"], + "language": language, + "tokens": int(final.get("generated_tokens", 0)), + "time_sec": float(elapsed), + "syntax_ok": bool(final.get("syntax_ok", False)) if language == "python" else None, + "attempt": int(final.get("attempt", 1)), + "mode": self.current_mode or "base", + } + ) + + return _render_history(history), history, "", status + + def clear(self, mode: str) -> Tuple[str, List[Dict[str, Any]], str, str]: + history: List[Dict[str, Any]] = [] + status = self._ensure_mode(mode) + 
return _render_history(history), history, "", status + + +def create_demo(config_path: str = "configs/component8_chat_config.yaml") -> gr.Blocks: + runtime = ChatRuntime(config_path=config_path) + + with gr.Blocks(title="MINDI 1.0 420M", theme=gr.themes.Base()) as demo: + gr.Markdown("## MINDI 1.0 420M\nYour local coding intelligence — 420M parameters, fully offline") + + history_state = gr.State([]) + chat_html = gr.HTML(value=_render_history([])) + + with gr.Row(): + mode_dropdown = gr.Dropdown( + label="Model Mode", + choices=["base", "lora", "int8"], + value="base", + interactive=True, + ) + status_box = gr.Textbox(label="Status", value="MINDI 1.0 420M | mode=base | load=0.00s | vram=0.00GB", interactive=False) + + prompt_box = gr.Textbox( + label="Your Prompt", + lines=4, + placeholder="Ask MINDI anything about code", + ) + + with gr.Row(): + send_btn = gr.Button("Generate", variant="primary") + clear_btn = gr.Button("Clear Conversation") + switch_btn = gr.Button("Apply Mode") + + switch_btn.click(fn=runtime.switch_mode, inputs=[mode_dropdown], outputs=[status_box]) + + send_btn.click( + fn=runtime.respond, + inputs=[prompt_box, history_state, mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + queue=True, + ) + prompt_box.submit( + fn=runtime.respond, + inputs=[prompt_box, history_state, mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + queue=True, + ) + clear_btn.click( + fn=runtime.clear, + inputs=[mode_dropdown], + outputs=[chat_html, history_state, prompt_box, status_box], + ) + + return demo + + diff --git a/src/dataset_pipeline/__init__.py b/src/dataset_pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd53166ff987a457ccc6af438753e4d3902cc54 --- /dev/null +++ b/src/dataset_pipeline/__init__.py @@ -0,0 +1,2 @@ +# This file marks dataset_pipeline as a Python package. 
+ diff --git a/src/dataset_pipeline/hf_dataset_pipeline.py b/src/dataset_pipeline/hf_dataset_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..194e7bf461cd546b469df5b03cd46d91018ab095 --- /dev/null +++ b/src/dataset_pipeline/hf_dataset_pipeline.py @@ -0,0 +1,330 @@ +""" +Component 3: Hugging Face dataset pipeline for code model training. + +This module: +1) Streams multiple public datasets from Hugging Face. +2) Standardizes records into prompt/code/language. +3) Cleans and filters low-quality samples. +4) Deduplicates with a disk-backed SQLite hash index. +5) Tokenizes using Component 2 tokenizer. +6) Saves training-ready JSONL output and summary stats. +""" + +from __future__ import annotations + +import hashlib +import json +import re +import sqlite3 +import string +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, Iterator, List, Optional, Tuple + +from datasets import load_dataset + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class SourceDatasetSpec: + # Hugging Face dataset repo id. + hf_dataset_id: str + # Split name in HF datasets (usually train). + split: str + # Field that contains user prompt/instruction. + prompt_field: str + # Field that contains code answer/output. + code_field: str + # Optional language field name if dataset includes it. + language_field: Optional[str] + # Default language if not provided by the record. + default_language: str + + +@dataclass +class PipelineConfig: + # Dataset definitions to load. + datasets: List[SourceDatasetSpec] + # Path to saved tokenizer directory from Component 2. + tokenizer_dir: str + # Directory to write cleaned intermediate records. + interim_output_dir: str + # Directory to write tokenized final records. + processed_output_dir: str + # Path for SQLite file used to deduplicate efficiently. + dedupe_db_path: str + # Optional limit for quick tests. 
+ max_records_per_dataset: Optional[int] = None + # Filters to remove low quality data. + min_prompt_chars: int = 8 + min_code_chars: int = 16 + max_code_chars: int = 40_000 + # Write stats every N accepted samples. + progress_every: int = 1_000 + + +class HFDatasetPipeline: + # Pipeline object that executes full preprocessing. + + AUTOGEN_PATTERNS = [ + r"(?i)auto[\s-]?generated", + r"(?i)generated by", + r"(?i)do not edit", + r"(?i)machine generated", + r"(?i)this file was generated", + ] + + PY_HINTS = [ + "def ", + "import ", + "from ", + "print(", + "if __name__ ==", + "class ", + "lambda ", + "elif ", + "except ", + ] + JS_HINTS = [ + "function ", + "const ", + "let ", + "=>", + "console.log", + "export ", + "require(", + "document.", + "window.", + "=> {", + ] + + def __init__(self, config: PipelineConfig) -> None: + self.config = config + self.tokenizer = CodeTokenizer.load(config.tokenizer_dir) + self._ensure_dirs() + self.conn = self._init_dedupe_db(config.dedupe_db_path) + self.stats: Dict[str, int] = { + "seen_total": 0, + "kept_total": 0, + "dropped_empty": 0, + "dropped_length": 0, + "dropped_binary_like": 0, + "dropped_autogen_like": 0, + "dropped_duplicate": 0, + "language_python": 0, + "language_javascript": 0, + "language_other_defaulted_to_python": 0, + } + + def _ensure_dirs(self) -> None: + # Create output directories so writing does not fail later. + Path(self.config.interim_output_dir).mkdir(parents=True, exist_ok=True) + Path(self.config.processed_output_dir).mkdir(parents=True, exist_ok=True) + Path(self.config.dedupe_db_path).parent.mkdir(parents=True, exist_ok=True) + + def _init_dedupe_db(self, db_path: str) -> sqlite3.Connection: + # SQLite gives us memory-safe dedupe for large data. + conn = sqlite3.connect(db_path) + conn.execute( + """ + CREATE TABLE IF NOT EXISTS dedupe_hashes ( + sample_hash TEXT PRIMARY KEY + ) + """ + ) + conn.commit() + return conn + + def close(self) -> None: + # Always close DB cleanly. 
+ self.conn.close() + + def run(self) -> Dict[str, int]: + # Full pipeline entry point. + interim_path = Path(self.config.interim_output_dir) / "combined_clean.jsonl" + processed_path = Path(self.config.processed_output_dir) / "train_tokenized.jsonl" + stats_path = Path(self.config.processed_output_dir) / "pipeline_stats.json" + + with interim_path.open("w", encoding="utf-8") as interim_f, processed_path.open( + "w", encoding="utf-8" + ) as tokenized_f: + for spec in self.config.datasets: + self._process_one_dataset(spec, interim_f, tokenized_f) + + with stats_path.open("w", encoding="utf-8") as f: + json.dump(self.stats, f, indent=2) + return self.stats + + def _process_one_dataset(self, spec: SourceDatasetSpec, interim_f, tokenized_f) -> None: + # Stream one dataset and process records one by one. + print(f"[info] Loading dataset: {spec.hf_dataset_id} split={spec.split}") + stream = load_dataset(spec.hf_dataset_id, split=spec.split, streaming=True) + count = 0 + + for row in stream: + self.stats["seen_total"] += 1 + count += 1 + if self.config.max_records_per_dataset and count > self.config.max_records_per_dataset: + break + + sample = self._standardize_record(row=row, spec=spec) + if sample is None: + continue + + prompt, code, language = sample + cleaned = self._clean_and_filter(prompt=prompt, code=code, language=language) + if cleaned is None: + continue + + clean_prompt, clean_code, clean_language = cleaned + if not self._keep_unique(clean_prompt, clean_code): + self.stats["dropped_duplicate"] += 1 + continue + + formatted_text = self.tokenizer.format_training_sample( + prompt=clean_prompt, + code=clean_code, + language=clean_language, + ) + input_ids = self.tokenizer.encode(formatted_text) + + interim_record = { + "prompt": clean_prompt, + "code": clean_code, + "language": clean_language, + } + tokenized_record = { + "language": clean_language, + "text": formatted_text, + "input_ids": input_ids, + "length": len(input_ids), + } + + 
interim_f.write(json.dumps(interim_record, ensure_ascii=False) + "\n") + tokenized_f.write(json.dumps(tokenized_record, ensure_ascii=False) + "\n") + + self.stats["kept_total"] += 1 + if self.stats["kept_total"] % self.config.progress_every == 0: + print( + "[progress] " + f"seen={self.stats['seen_total']} kept={self.stats['kept_total']} " + f"duplicates={self.stats['dropped_duplicate']}" + ) + self.conn.commit() + + self.conn.commit() + + def _standardize_record( + self, row: Dict[str, object], spec: SourceDatasetSpec + ) -> Optional[Tuple[str, str, str]]: + # Converts source-specific row into a consistent tuple. + prompt_raw = row.get(spec.prompt_field) + code_raw = row.get(spec.code_field) + if prompt_raw is None or code_raw is None: + self.stats["dropped_empty"] += 1 + return None + + prompt = str(prompt_raw).strip() + code = str(code_raw).strip() + if not prompt or not code: + self.stats["dropped_empty"] += 1 + return None + + if spec.language_field and row.get(spec.language_field) is not None: + language = str(row.get(spec.language_field)).strip().lower() + else: + language = spec.default_language.strip().lower() + + return prompt, code, language + + def _clean_and_filter( + self, prompt: str, code: str, language: str + ) -> Optional[Tuple[str, str, str]]: + # Cleans text and applies quality filters. 
+ prompt = self._normalize_text(prompt) + code = self._normalize_code(code) + + if len(prompt) < self.config.min_prompt_chars or len(code) < self.config.min_code_chars: + self.stats["dropped_length"] += 1 + return None + if len(code) > self.config.max_code_chars: + self.stats["dropped_length"] += 1 + return None + + if self._looks_binary_like(prompt) or self._looks_binary_like(code): + self.stats["dropped_binary_like"] += 1 + return None + + combined = f"{prompt}\n{code}" + if self._looks_auto_generated(combined): + self.stats["dropped_autogen_like"] += 1 + return None + + normalized_lang = self._normalize_language(language, prompt, code) + return prompt, code, normalized_lang + + def _normalize_text(self, text: str) -> str: + # Basic whitespace cleanup. + return re.sub(r"\s+", " ", text.replace("\r\n", "\n").replace("\r", "\n")).strip() + + def _normalize_code(self, text: str) -> str: + # Preserve line breaks for code while cleaning trailing whitespace. + lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n") + clean_lines = [line.rstrip() for line in lines] + code = "\n".join(clean_lines).strip() + return code + + def _looks_binary_like(self, text: str) -> bool: + # Detect likely non-text content that should not be in code samples. + if "\x00" in text: + return True + if not text: + return True + printable = set(string.printable) | {"\n", "\t", "\r"} + non_printable_count = sum(1 for ch in text if ch not in printable) + ratio = non_printable_count / max(1, len(text)) + return ratio > 0.12 + + def _looks_auto_generated(self, text: str) -> bool: + # Remove obvious generated boilerplate using lightweight regex checks. + return any(re.search(pattern, text) for pattern in self.AUTOGEN_PATTERNS) + + def _normalize_language(self, language: str, prompt: str, code: str) -> str: + # Normalize to python/javascript for current training goals. 
+ lang = language.lower().strip() + if "python" in lang: + self.stats["language_python"] += 1 + return "python" + if "javascript" in lang or lang in {"js", "node", "nodejs"}: + self.stats["language_javascript"] += 1 + return "javascript" + if lang in {"auto", "unknown", "mixed", ""}: + pass + + prompt_lower = prompt.lower() + code_lower = code.lower() + py_score = sum(1 for hint in self.PY_HINTS if hint in code_lower) + js_score = sum(1 for hint in self.JS_HINTS if hint in code_lower) + if "javascript" in prompt_lower or "node.js" in prompt_lower or "js " in prompt_lower: + js_score += 2 + if "python" in prompt_lower: + py_score += 2 + if js_score > py_score: + self.stats["language_javascript"] += 1 + return "javascript" + + # Default to python to satisfy your "Python first" target. + self.stats["language_other_defaulted_to_python"] += 1 + self.stats["language_python"] += 1 + return "python" + + def _keep_unique(self, prompt: str, code: str) -> bool: + # Hash normalized prompt+code and store in SQLite for dedupe. + normalized_pair = f"{prompt}\n\n{code}".encode("utf-8") + digest = hashlib.sha256(normalized_pair).hexdigest() + try: + self.conn.execute("INSERT INTO dedupe_hashes (sample_hash) VALUES (?)", (digest,)) + return True + except sqlite3.IntegrityError: + return False diff --git a/src/evaluation_system/__init__.py b/src/evaluation_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab838cd1aab93b97566844ce5c08d423d94dc41 --- /dev/null +++ b/src/evaluation_system/__init__.py @@ -0,0 +1 @@ +# This file marks evaluation_system as a Python package. diff --git a/src/evaluation_system/code_eval.py b/src/evaluation_system/code_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8ed48a751285ecba59b448786774f7920eed16 --- /dev/null +++ b/src/evaluation_system/code_eval.py @@ -0,0 +1,186 @@ +""" +Component 6 evaluation helpers. 
+""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Dict, List + + +def python_syntax_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + +def save_json(path: str, payload: Dict) -> None: + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + +def _normalize_punctuation_spacing(text: str) -> str: + text = re.sub(r"\s+([,.:;\)\]\}])", r"\1", text) + text = re.sub(r"([\(\[\{])\s+", r"\1", text) + text = re.sub(r"\s*=\s*", " = ", text) + text = re.sub(r"\s*\+\s*", " + ", text) + text = re.sub(r"\s*-\s*", " - ", text) + text = re.sub(r"\s*\*\s*", " * ", text) + text = re.sub(r"\s*/\s*", " / ", text) + text = re.sub(r"\s*%\s*", " % ", text) + return re.sub(r"[ \t]+", " ", text).strip() + + +def _remove_non_python_noise(line: str) -> str: + line = line.replace("", "1") + line = line.replace("\u0000", "") + line = line.replace("{", "") + line = line.replace("}", "") + line = line.replace(";", "") + return line + + +def _fix_identifier_spacing(line: str) -> str: + # def name with spaces -> def name_with_spaces + m = re.match(r"^(\s*def\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*\(.*)$", line) + if m: + fn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{fn}{m.group(3)}" + + # class name with spaces -> class Name_With_Spaces + m = re.match(r"^(\s*class\s+)([A-Za-z_][A-Za-z0-9_\s]*)(\s*:.*)$", line) + if m: + cn = re.sub(r"\s+", "_", m.group(2).strip()) + line = f"{m.group(1)}{cn}{m.group(3)}" + + # assignment lhs spaces -> underscore. 
+ if "=" in line and "==" not in line: + lhs, rhs = line.split("=", 1) + lhs_clean = lhs.strip() + if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_\s]*", lhs_clean): + lhs_clean = re.sub(r"\s+", "_", lhs_clean) + line = f"{lhs_clean} = {rhs.strip()}" + + return line + + +def _looks_like_python_line(line: str) -> bool: + if not line.strip(): + return False + starts = ( + "def ", + "class ", + "if ", + "for ", + "while ", + "try:", + "except", + "with ", + "return ", + "import ", + "from ", + "print(", + ) + s = line.strip() + if s.startswith(starts): + return True + if re.match(r"^[A-Za-z_][A-Za-z0-9_]*\s*=", s): + return True + return False + + +def _trim_to_code(lines: List[str]) -> List[str]: + # Drop noisy preamble lines until first plausible Python line. + i = 0 + while i < len(lines) and not _looks_like_python_line(lines[i]): + i += 1 + lines = lines[i:] if i < len(lines) else [] + # Keep only plausible lines after start; allow blank lines. + out = [] + for line in lines: + if not line.strip(): + out.append(line) + continue + if _looks_like_python_line(line) or line.startswith(" "): + out.append(line) + return out + + +def _best_effort_python_format(lines: List[str]) -> List[str]: + out: List[str] = [] + indent = 0 + for raw in lines: + line = raw.strip() + if not line: + out.append("") + continue + + if line in {"return", "pass", "break", "continue"}: + indent = max(0, indent - 1) + + out.append((" " * indent) + line) + + if line.endswith(":"): + indent += 1 + + return out + + +def restore_code_from_structured(decoded: str) -> str: + text = decoded + for tok in ["", "", "", "", ""]: + text = text.replace(tok, "") + + if "" in text: + text = text.split("", 1)[1] + + text = text.replace("_", " ") + tokens = text.strip().split() + + lines: List[str] = [] + current_tokens: List[str] = [] + indent = 0 + + for tok in tokens: + if tok == "": + indent += 1 + continue + if tok == "": + indent = max(0, indent - 1) + continue + if tok == "": + line = " 
".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + else: + lines.append("") + current_tokens = [] + continue + current_tokens.append(tok) + + if current_tokens: + line = " ".join(current_tokens).strip() + line = _remove_non_python_noise(line) + line = _normalize_punctuation_spacing(line) + line = _fix_identifier_spacing(line) + if line: + lines.append((" " * indent) + line) + + lines = _trim_to_code(lines) + lines = _best_effort_python_format(lines) + + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + return "\n".join(lines).strip() diff --git a/src/finetuning_system/__init__.py b/src/finetuning_system/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2c60c944c17467cfefa09c7a820c0b8351ec61 --- /dev/null +++ b/src/finetuning_system/__init__.py @@ -0,0 +1 @@ +# This file marks finetuning_system as a Python package. diff --git a/src/finetuning_system/custom_pair_dataset.py b/src/finetuning_system/custom_pair_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c4523951d234ff645a8a4e3b2545b9eb78526a14 --- /dev/null +++ b/src/finetuning_system/custom_pair_dataset.py @@ -0,0 +1,65 @@ +""" +Dataset for custom fine-tuning pairs (JSON or JSONL). +Expected fields: prompt, code, optional language. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, List + +from torch.utils.data import Dataset + +from src.tokenizer.code_tokenizer import CodeTokenizer + + +class CustomPairDataset(Dataset): + def __init__(self, path: str, tokenizer: CodeTokenizer, max_seq_len: int = 512) -> None: + self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"Custom fine-tune data file not found: {self.path}") + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.rows: List[List[int]] = [] + self._load() + + def _load(self) -> None: + if self.path.suffix.lower() == ".jsonl": + data = [] + for line in self.path.read_text(encoding="utf-8-sig").splitlines(): + line = line.strip().lstrip("\ufeff") + if not line: + continue + data.append(json.loads(line)) + elif self.path.suffix.lower() == ".json": + raw = json.loads(self.path.read_text(encoding="utf-8-sig")) + if isinstance(raw, dict) and "data" in raw: + data = raw["data"] + elif isinstance(raw, list): + data = raw + else: + raise ValueError("JSON fine-tune file must be a list or {'data': [...]}.") + else: + raise ValueError("Custom fine-tune file must be .json or .jsonl") + + for row in data: + prompt = str(row.get("prompt", "")).strip() + code = str(row.get("code", "")).strip() + language = str(row.get("language", "python")).strip().lower() or "python" + if not prompt or not code: + continue + text = self.tokenizer.format_training_sample(prompt=prompt, code=code, language=language) + ids = self.tokenizer.encode(text)[: self.max_seq_len] + if len(ids) >= 8: + self.rows.append(ids) + + if not self.rows: + raise ValueError("No valid samples found in custom fine-tune data.") + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int) -> List[int]: + return self.rows[idx] + diff --git a/src/finetuning_system/lora_adapter.py b/src/finetuning_system/lora_adapter.py new file mode 100644 index 
0000000000000000000000000000000000000000..b6a0ee9f37ec74ad3ff6ca82f09b81edcf5b36ae --- /dev/null +++ b/src/finetuning_system/lora_adapter.py @@ -0,0 +1,92 @@ +""" +Simple LoRA implementation for custom PyTorch transformer modules. +""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from typing import Iterable, List + +import torch +import torch.nn as nn + + +@dataclass +class LoRAConfig: + r: int = 8 + alpha: int = 16 + dropout: float = 0.05 + target_keywords: List[str] = None # type: ignore[assignment] + + def __post_init__(self) -> None: + if self.target_keywords is None: + self.target_keywords = ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"] + + +class LoRALinear(nn.Module): + def __init__(self, base: nn.Linear, r: int, alpha: int, dropout: float) -> None: + super().__init__() + if base.bias is not None: + # Keep implementation simple and stable for current model (bias=False modules). + raise ValueError("LoRALinear expects base Linear with bias=None in this project.") + + self.base = base + self.base.weight.requires_grad = False + + self.in_features = base.in_features + self.out_features = base.out_features + self.r = r + self.scaling = alpha / max(1, r) + + self.lora_A = nn.Parameter(torch.zeros(r, self.in_features)) + self.lora_B = nn.Parameter(torch.zeros(self.out_features, r)) + self.dropout = nn.Dropout(dropout) + + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + base_out = self.base(x) + lora_out = self.dropout(x) @ self.lora_A.t() @ self.lora_B.t() + return base_out + (self.scaling * lora_out) + + +def _replace_module(root: nn.Module, dotted_name: str, new_module: nn.Module) -> None: + parts = dotted_name.split(".") + parent = root + for p in parts[:-1]: + parent = getattr(parent, p) + setattr(parent, parts[-1], new_module) + + +def apply_lora(model: nn.Module, cfg: LoRAConfig) -> List[str]: + replaced: List[str] 
= [] + for name, module in list(model.named_modules()): + if not isinstance(module, nn.Linear): + continue + if not any(k in name for k in cfg.target_keywords): + continue + lora_mod = LoRALinear(base=module, r=cfg.r, alpha=cfg.alpha, dropout=cfg.dropout) + _replace_module(model, name, lora_mod) + replaced.append(name) + + # Freeze everything except LoRA params. + for p in model.parameters(): + p.requires_grad = False + for n, p in model.named_parameters(): + if "lora_A" in n or "lora_B" in n: + p.requires_grad = True + + return replaced + + +def lora_state_dict(model: nn.Module) -> dict: + return {k: v.detach().cpu() for k, v in model.state_dict().items() if ("lora_A" in k or "lora_B" in k)} + + +def load_lora_state_dict(model: nn.Module, state: dict) -> None: + own = model.state_dict() + for k, v in state.items(): + if k in own: + own[k].copy_(v) diff --git a/src/inference_engine/__init__.py b/src/inference_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..245858df2c08ea0b4dfb0f65ef3c3a7115ca87fe --- /dev/null +++ b/src/inference_engine/__init__.py @@ -0,0 +1 @@ +# This file marks inference_engine as a Python package. diff --git a/src/inference_engine/inference_engine.py b/src/inference_engine/inference_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..27dfcdc9489ebf39e18a41a2314fb0b83cdb94c0 --- /dev/null +++ b/src/inference_engine/inference_engine.py @@ -0,0 +1,211 @@ +""" +Component 7: Inference engine for local code generation. + +Features: +- Deterministic low-temperature greedy mode. +- Stop rules for clean function completion. +- Syntax-aware retry with up to 3 attempts. 
+""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from src.evaluation_system.code_eval import restore_code_from_structured +from src.model_architecture.code_transformer import CodeTransformerLM +from src.tokenizer.code_tokenizer import CodeTokenizer + + +@dataclass +class DecodingConfig: + max_new_tokens: int = 300 + # Mode 1: deterministic output + greedy_temperature: float = 0.0 + # Retry mode 2 + retry2_temperature: float = 0.25 + retry2_top_p: float = 0.85 + # Retry mode 3 + retry3_temperature: float = 0.35 + retry3_top_p: float = 0.90 + max_retries: int = 3 + min_tokens_before_stop_check: int = 64 + # Stop only when function body is non-trivial. + min_function_body_statements: int = 2 + + +class InferenceEngine: + def __init__(self, model: CodeTransformerLM, tokenizer: CodeTokenizer, device: torch.device) -> None: + self.model = model + self.tokenizer = tokenizer + self.device = device + self.model.eval() + + @staticmethod + def _syntax_ok_python(code: str) -> bool: + try: + ast.parse(code) + return True + except Exception: + return False + + @staticmethod + def _function_completion_score(code: str) -> int: + # Higher score = more complete usable function. 
+ try: + tree = ast.parse(code) + except Exception: + return 0 + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return 0 + fn = funcs[-1] + body_len = len(fn.body) + has_return = any(isinstance(n, ast.Return) for n in ast.walk(fn)) + return body_len + (2 if has_return else 0) + + def _looks_complete_function(self, code: str, min_body_statements: int) -> bool: + if "def " not in code: + return False + try: + tree = ast.parse(code) + except Exception: + return False + funcs = [n for n in tree.body if isinstance(n, ast.FunctionDef)] + if not funcs: + return False + fn = funcs[-1] + if len(fn.body) < min_body_statements: + return False + return True + + def _sample_next( + self, + logits: torch.Tensor, + temperature: float, + top_p: float, + ) -> torch.Tensor: + if temperature <= 0: + return torch.argmax(logits, dim=-1, keepdim=True) + + logits = logits / temperature + probs = torch.softmax(logits, dim=-1) + + sorted_probs, sorted_idx = torch.sort(probs, descending=True) + cumulative = torch.cumsum(sorted_probs, dim=-1) + cutoff = cumulative > top_p + cutoff[..., 1:] = cutoff[..., :-1].clone() + cutoff[..., 0] = False + sorted_probs[cutoff] = 0.0 + denom = sorted_probs.sum(dim=-1, keepdim=True).clamp_min(1e-12) + sorted_probs = sorted_probs / denom + sampled = torch.multinomial(sorted_probs, num_samples=1) + return sorted_idx.gather(-1, sampled) + + @torch.no_grad() + def _generate_once( + self, + prompt: str, + language: str, + max_new_tokens: int, + temperature: float, + top_p: float, + min_tokens_before_stop_check: int, + min_function_body_statements: int, + ) -> Dict[str, object]: + prompt_text = self.tokenizer.format_training_sample(prompt=prompt, code="", language=language) + prompt_text = prompt_text.replace(" ", "").strip() + + ids = self.tokenizer.encode(prompt_text) + eos_id = self.tokenizer.special_token_ids.get("") + + # Remove trailing EOS so generation can continue. 
+ if eos_id is not None and len(ids) > 1 and ids[-1] == int(eos_id): + ids = ids[:-1] + + input_ids = torch.tensor([ids], dtype=torch.long, device=self.device) + + generated_steps = 0 + for _ in range(max_new_tokens): + out = self.model(input_ids=input_ids) + logits = out["logits"][:, -1, :] + next_id = self._sample_next(logits, temperature=temperature, top_p=top_p) + input_ids = torch.cat([input_ids, next_id], dim=1) + generated_steps += 1 + + # Primary stop: EOS token. + if eos_id is not None and int(next_id.item()) == int(eos_id): + break + + # Secondary stop: complete parseable function with non-trivial body. + if generated_steps >= min_tokens_before_stop_check and (generated_steps % 12 == 0): + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + if self._looks_complete_function(code, min_body_statements=min_function_body_statements): + break + + decoded = self.tokenizer.decode(input_ids[0].tolist()) + code = restore_code_from_structured(decoded) + syntax_ok = self._syntax_ok_python(code) if language == "python" else True + completion_score = self._function_completion_score(code) if language == "python" else 0 + return { + "code": code, + "syntax_ok": syntax_ok, + "generated_tokens": generated_steps, + "temperature": temperature, + "top_p": top_p, + "completion_score": completion_score, + } + + @torch.no_grad() + def generate_with_retry( + self, + prompt: str, + language: str = "python", + cfg: Optional[DecodingConfig] = None, + ) -> Dict[str, object]: + cfg = cfg or DecodingConfig() + + attempts: List[Tuple[float, float]] = [ + (cfg.greedy_temperature, 1.0), + (cfg.retry2_temperature, cfg.retry2_top_p), + (cfg.retry3_temperature, cfg.retry3_top_p), + ] + + results = [] + for i in range(min(cfg.max_retries, len(attempts))): + temp, top_p = attempts[i] + res = self._generate_once( + prompt=prompt, + language=language, + max_new_tokens=cfg.max_new_tokens, + temperature=temp, + top_p=top_p, + 
min_tokens_before_stop_check=cfg.min_tokens_before_stop_check, + min_function_body_statements=cfg.min_function_body_statements, + ) + res["attempt"] = i + 1 + results.append(res) + + # Syntax-aware retry: stop retries as soon as syntax is valid. + if bool(res["syntax_ok"]): + return { + "final": res, + "attempts": results, + "used_retry": i > 0, + } + + # If all retries fail, choose best completion score then longest generation. + best = sorted( + results, + key=lambda x: (int(x.get("completion_score", 0)), int(x.get("generated_tokens", 0))), + reverse=True, + )[0] + return { + "final": best, + "attempts": results, + "used_retry": True, + } diff --git a/src/model_architecture/__init__.py b/src/model_architecture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..342d765602b980261490c2ba31ee4798327a8947 --- /dev/null +++ b/src/model_architecture/__init__.py @@ -0,0 +1,2 @@ +# This file marks model_architecture as a Python package. + diff --git a/src/model_architecture/code_transformer.py b/src/model_architecture/code_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..456fd69995ced3fd03b24a5badc9fc816d269e8f --- /dev/null +++ b/src/model_architecture/code_transformer.py @@ -0,0 +1,264 @@ +""" +Component 4: Transformer model architecture for code generation. + +This module defines a decoder-only transformer built from scratch in PyTorch. +It is modular through configuration so model size can be scaled up/down. +""" + +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +@dataclass +class ModelConfig: + # Vocabulary size from tokenizer. + vocab_size: int = 50_000 + # Maximum context length in tokens. + max_seq_len: int = 2048 + # Core hidden size of transformer. + d_model: int = 1152 + # Number of transformer blocks. 
+ n_layers: int = 23 + # Number of attention heads. + n_heads: int = 16 + # Feed-forward hidden size. + d_ff: int = 4608 + # Dropout for regularization. + dropout: float = 0.1 + # Whether to tie token embedding and LM head weights. + tie_embeddings: bool = True + # Enable gradient checkpointing to reduce VRAM usage during training. + gradient_checkpointing: bool = False + # Initialization standard deviation. + init_std: float = 0.02 + # Epsilon for layer normalization stability. + rms_norm_eps: float = 1e-5 + + @property + def head_dim(self) -> int: + if self.d_model % self.n_heads != 0: + raise ValueError("d_model must be divisible by n_heads.") + return self.d_model // self.n_heads + + +def get_model_presets() -> Dict[str, ModelConfig]: + """ + Returns standard size presets. + """ + return { + "small_180m": ModelConfig(d_model=896, n_layers=18, n_heads=14, d_ff=3584), + "medium_420m": ModelConfig(d_model=1152, n_layers=23, n_heads=16, d_ff=4608), + "large_800m": ModelConfig(d_model=1536, n_layers=24, n_heads=16, d_ff=6144), + } + + +class RMSNorm(nn.Module): + """ + RMSNorm is a lightweight normalization layer used in many modern LLMs. + """ + + def __init__(self, dim: int, eps: float = 1e-5) -> None: + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + norm = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(norm + self.eps) + return self.weight * x + + +class RotaryEmbedding(nn.Module): + """ + Rotary positional embedding. + This injects token order information directly into query/key vectors. 
+ """ + + def __init__(self, head_dim: int, max_seq_len: int) -> None: + super().__init__() + if head_dim % 2 != 0: + raise ValueError("head_dim must be even for rotary embeddings.") + inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) + t = torch.arange(max_seq_len, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cached", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs), persistent=False) + + def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: + cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0) # [1,1,S,H/2] + sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0) # [1,1,S,H/2] + q = self._apply_rotary(q, cos, sin) + k = self._apply_rotary(k, cos, sin) + return q, k + + @staticmethod + def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x_rot_even = x1 * cos - x2 * sin + x_rot_odd = x1 * sin + x2 * cos + out = torch.stack((x_rot_even, x_rot_odd), dim=-1).flatten(-2) + return out + + +class CausalSelfAttention(nn.Module): + """ + Multi-head causal self-attention for autoregressive code generation. 
+ """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.n_heads = config.n_heads + self.head_dim = config.head_dim + self.scale = self.head_dim ** -0.5 + + self.q_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.k_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.v_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout) + self.rotary = RotaryEmbedding(head_dim=self.head_dim, max_seq_len=config.max_seq_len) + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + bsz, seq_len, _ = x.shape + q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2) + + q, k = self.rotary(q, k, seq_len=seq_len) + + # Use PyTorch scaled dot-product attention with causal masking. + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=attn_mask, + dropout_p=self.dropout.p if self.training else 0.0, + is_causal=True, + scale=self.scale, + ) + out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1) + return self.o_proj(out) + + +class FeedForward(nn.Module): + """ + Two-layer feed-forward network with GELU activation. 
+ """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.fc1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.fc2 = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = F.gelu(x, approximate="tanh") + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerBlock(nn.Module): + """ + One transformer block: + norm -> attention -> residual + norm -> feed-forward -> residual + """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.norm1 = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.attn = CausalSelfAttention(config) + self.norm2 = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.ffn = FeedForward(config) + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + x = x + self.attn(self.norm1(x), attn_mask=attn_mask) + x = x + self.ffn(self.norm2(x)) + return x + + +class CodeTransformerLM(nn.Module): + """ + Full decoder-only language model for code generation. + """ + + def __init__(self, config: ModelConfig) -> None: + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + self.dropout = nn.Dropout(config.dropout) + self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)]) + self.norm_final = RMSNorm(config.d_model, eps=config.rms_norm_eps) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + if config.tie_embeddings: + self.lm_head.weight = self.embed_tokens.weight + + self.apply(self._init_weights) + + def _init_weights(self, module: nn.Module) -> None: + # Keep initialization stable for deep networks. 
+ if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std) + + def enable_gradient_checkpointing(self, enabled: bool = True) -> None: + # Toggle gradient checkpointing mode. + self.config.gradient_checkpointing = enabled + + def forward( + self, + input_ids: torch.Tensor, + labels: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> Dict[str, torch.Tensor]: + if input_ids.dim() != 2: + raise ValueError("input_ids must be shape [batch, seq_len].") + + x = self.embed_tokens(input_ids) + x = self.dropout(x) + + for block in self.blocks: + if self.config.gradient_checkpointing and self.training: + x = torch.utils.checkpoint.checkpoint(block, x, attn_mask, use_reentrant=False) + else: + x = block(x, attn_mask=attn_mask) + + x = self.norm_final(x) + logits = self.lm_head(x) + + out: Dict[str, torch.Tensor] = {"logits": logits} + if labels is not None: + # Standard next-token cross entropy loss. + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = labels[:, 1:].contiguous() + loss = F.cross_entropy( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ignore_index=-100, + ) + out["loss"] = loss + return out + + def estimate_num_parameters(self) -> int: + # Returns total trainable parameter count. + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def summary(self) -> Dict[str, object]: + # Returns a simple structured summary for logs/CLI. + return { + "config": asdict(self.config), + "num_parameters": self.estimate_num_parameters(), + } + diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35cf334b1ed48cc5bd2702ff997daf5bf7fd0bf6 --- /dev/null +++ b/src/tokenizer/__init__.py @@ -0,0 +1,2 @@ +# This file marks tokenizer as a Python package. 
diff --git a/src/tokenizer/code_tokenizer.py b/src/tokenizer/code_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b68ec2705394185a43465e345664e691245f18a5
--- /dev/null
+++ b/src/tokenizer/code_tokenizer.py
@@ -0,0 +1,216 @@
"""
Component 2: Custom code tokenizer for Python and JavaScript.

This tokenizer is code-aware:
- It preserves indentation structure using explicit tokens.
- It keeps newline boundaries using a newline token.
- It treats code operators and brackets as separate units.
- It supports prompt+code style training samples.
"""

from __future__ import annotations

import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional

from tokenizers import Regex, Tokenizer
from tokenizers.decoders import BPEDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence
from tokenizers.pre_tokenizers import Metaspace, Sequence as PreTokenizerSequence, Split
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer


@dataclass
class CodeTokenizerConfig:
    # Vocabulary size controls how many distinct tokens the tokenizer learns.
    vocab_size: int = 50_000
    # Minimum frequency filters very rare fragments.
    min_frequency: int = 2
    # Sequence length is used later by training/inference components.
    model_max_length: int = 2048
    # Indent width is used to normalize tabs and format indentation markers.
    indent_width: int = 4
    # These tokens are required for code generation workflows.
    # NOTE(review): the eleven literals below read as empty strings in this
    # copy of the source; they are almost certainly angle-bracket special
    # tokens (pad/bos/eos/unk plus language and structure markers) stripped
    # by an HTML-unsafe transfer. Restore the real names before retraining —
    # duplicate empty strings cannot act as distinct special tokens.
    special_tokens: List[str] = None  # type: ignore[assignment]

    def __post_init__(self) -> None:
        if self.special_tokens is None:
            self.special_tokens = [
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
                "",
            ]


class CodeTokenizer:
    # This wrapper owns one HF Tokenizers object plus code-specific helpers.

    def __init__(self, config: Optional[CodeTokenizerConfig] = None) -> None:
        self.config = config or CodeTokenizerConfig()
        # Populated by train() or load(); None until then.
        self.tokenizer: Optional[Tokenizer] = None
        # Maps each special token string to its trained vocabulary id.
        self.special_token_ids: Dict[str, int] = {}

    def _build_base_tokenizer(self) -> Tokenizer:
        """
        Creates a BPE tokenizer with code-oriented pre-tokenization rules.
        """
        # NOTE(review): the unk_token literal below also reads as stripped.
        tokenizer = Tokenizer(BPE(unk_token=""))
        tokenizer.normalizer = NormalizerSequence([NFKC()])

        # Split multi-character operators first so they are not broken apart.
        multi_op = Regex(
            r"(==|!=|<=|>=|:=|->|=>|\+\+|--|\+=|-=|\*=|/=|//=|%=|\*\*|&&|\|\||<<|>>)"
        )
        # Split common delimiters used heavily in code.
        punct = Regex(r"([()\[\]{}.,:;])")

        tokenizer.pre_tokenizer = PreTokenizerSequence(
            [
                Split(multi_op, behavior="isolated"),
                Split(punct, behavior="isolated"),
                # Metaspace marks word boundaries with "_" so decoding can
                # restore whitespace.
                Metaspace(replacement="_", prepend_scheme="always", split=True),
            ]
        )
        tokenizer.decoder = BPEDecoder()
        return tokenizer

    def train(self, text_iterator: Iterable[str]) -> None:
        """
        Trains the tokenizer from a stream of preformatted text samples.
        """
        tokenizer = self._build_base_tokenizer()
        trainer = BpeTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            special_tokens=self.config.special_tokens,
            show_progress=True,
        )
        tokenizer.train_from_iterator(text_iterator, trainer=trainer, length=None)

        # Add BOS/EOS automatically around each single sequence.
        # NOTE(review): the BOS/EOS token names below are stripped literals.
        bos_id = tokenizer.token_to_id("")
        eos_id = tokenizer.token_to_id("")
        if bos_id is None or eos_id is None:
            raise RuntimeError("Tokenizer training failed to register BOS/EOS tokens.")
        tokenizer.post_processor = TemplateProcessing(
            single=" $A ",
            special_tokens=[("", bos_id), ("", eos_id)],
        )

        self.tokenizer = tokenizer
        self.special_token_ids = {
            token: tokenizer.token_to_id(token) for token in self.config.special_tokens
        }

    def save(self, output_dir: str) -> None:
        """
        Saves tokenizer JSON and config so all other components can reuse it.
        """
        if self.tokenizer is None:
            raise RuntimeError("Cannot save tokenizer before training or loading it.")
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        self.tokenizer.save(str(out / "tokenizer.json"))
        with (out / "tokenizer_config.json").open("w", encoding="utf-8") as f:
            json.dump(asdict(self.config), f, indent=2)

    @classmethod
    def load(cls, tokenizer_dir: str) -> "CodeTokenizer":
        """
        Loads tokenizer from disk.
        """
        base = Path(tokenizer_dir)
        cfg_path = base / "tokenizer_config.json"
        tok_path = base / "tokenizer.json"
        if not cfg_path.exists() or not tok_path.exists():
            raise FileNotFoundError(
                f"Missing tokenizer files in {tokenizer_dir}. "
                "Expected tokenizer.json and tokenizer_config.json."
            )
        with cfg_path.open("r", encoding="utf-8") as f:
            cfg_data = json.load(f)
        config = CodeTokenizerConfig(**cfg_data)
        obj = cls(config=config)
        obj.tokenizer = Tokenizer.from_file(str(tok_path))
        obj.special_token_ids = {
            token: obj.tokenizer.token_to_id(token) for token in obj.config.special_tokens
        }
        return obj

    def encode(self, text: str) -> List[int]:
        """
        Encodes one preformatted text sample to token IDs.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.encode(text).ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes token IDs to text.
        Special tokens are kept so structure markers survive decoding.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

    def format_training_sample(self, prompt: str, code: str, language: str) -> str:
        """
        Converts prompt + code into one structured training text sequence.
        """
        # NOTE(review): both branches below are stripped literals; originally
        # these were the per-language marker tokens.
        lang_token = "" if language.lower() == "python" else ""
        prompt_text = self._normalize_text(prompt)
        code_text = self._code_to_structure_tokens(code)
        return f" {lang_token} {prompt_text} {code_text}"

    def _normalize_text(self, text: str) -> str:
        """
        Normalizes regular text by cleaning newlines.
        """
        return text.replace("\r\n", "\n").replace("\r", "\n").strip()

    def _code_to_structure_tokens(self, code: str) -> str:
        """
        Converts raw code into a string with explicit indentation and newline markers.
        Uses an indent stack so nested blocks emit balanced indent/dedent tokens.
        """
        code = code.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " " * self.config.indent_width)
        lines = code.split("\n")
        indent_stack: List[int] = [0]
        out_tokens: List[str] = []

        for raw_line in lines:
            # Keep blank lines as newline tokens so code structure is preserved.
            # NOTE(review): the appended marker literals below are stripped.
            if raw_line.strip() == "":
                out_tokens.append("")
                continue

            current_indent = len(raw_line) - len(raw_line.lstrip(" "))
            line_content = raw_line.lstrip(" ")

            # Emit dedent markers until the stack matches the current indent.
            while current_indent < indent_stack[-1]:
                indent_stack.pop()
                out_tokens.append("")

            # Emit indent markers for deeper nesting.
            while current_indent > indent_stack[-1]:
                indent_stack.append(current_indent)
                out_tokens.append("")

            out_tokens.append(line_content)
            out_tokens.append("")

        # Close any remaining open blocks at end of input.
        while len(indent_stack) > 1:
            indent_stack.pop()
            out_tokens.append("")

        return " ".join(out_tokens).strip()
diff --git a/src/training_pipeline/__init__.py b/src/training_pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..95609696a377a7a538c80904947085457d5945f6
--- /dev/null
+++ b/src/training_pipeline/__init__.py
@@ -0,0 +1 @@
+# This file marks training_pipeline as a Python package.
diff --git a/src/training_pipeline/tokenized_dataset.py b/src/training_pipeline/tokenized_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..beeb53db5c1499b622f9c025c41b93bd7f34f76e
--- /dev/null
+++ b/src/training_pipeline/tokenized_dataset.py
@@ -0,0 +1,88 @@
"""
Memory-efficient dataset utilities for tokenized JSONL training data.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Iterator, List, Tuple

import torch
from torch.utils.data import Dataset


class TokenizedJsonlDataset(Dataset):
    """
    Random-access dataset over tokenized JSONL using line byte offsets.
    This avoids loading all samples into RAM.
+ """ + + def __init__(self, path: str, split: str = "train", val_ratio: float = 0.02, split_seed: int = 17) -> None: + self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"Tokenized dataset not found: {self.path}") + self.split = split + self.val_ratio = val_ratio + self.split_seed = split_seed + self.offsets: List[int] = [] + self._build_offsets() + + def _hash_to_split(self, idx: int) -> bool: + # Deterministic split using index so train/val is stable across runs. + h = (idx * 1103515245 + self.split_seed) & 0x7FFFFFFF + p = (h % 10_000) / 10_000.0 + return p < self.val_ratio + + def _build_offsets(self) -> None: + with self.path.open("rb") as f: + idx = 0 + while True: + offset = f.tell() + line = f.readline() + if not line: + break + if self.split == "val": + keep = self._hash_to_split(idx) + else: + keep = not self._hash_to_split(idx) + if keep: + self.offsets.append(offset) + idx += 1 + + def __len__(self) -> int: + return len(self.offsets) + + def __getitem__(self, index: int) -> List[int]: + offset = self.offsets[index] + with self.path.open("rb") as f: + f.seek(offset) + line = f.readline().decode("utf-8").strip() + row = json.loads(line) + ids = row.get("input_ids") + if not isinstance(ids, list) or not ids: + raise ValueError(f"Invalid input_ids at index {index}") + return [int(x) for x in ids] + + +class CausalCollator: + """ + Pads/truncates sequences and produces labels for next-token training. 
+ """ + + def __init__(self, pad_token_id: int = 0, max_seq_len: int = 512) -> None: + self.pad_token_id = pad_token_id + self.max_seq_len = max_seq_len + + def __call__(self, batch: List[List[int]]) -> Tuple[torch.Tensor, torch.Tensor]: + clipped = [x[: self.max_seq_len] for x in batch] + max_len = max(len(x) for x in clipped) + input_ids = [] + labels = [] + for seq in clipped: + pad_len = max_len - len(seq) + padded = seq + [self.pad_token_id] * pad_len + label = seq + [-100] * pad_len + input_ids.append(padded) + labels.append(label) + return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long) diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c703e7c486ef91d25ec589405daf1d1aab8ea5 --- /dev/null +++ b/train.py @@ -0,0 +1,228 @@ +import argparse +from pathlib import Path +from typing import List + +import torch +from peft import LoraConfig, TaskType, get_peft_model +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + Trainer, + TrainingArguments, + set_seed, +) + +from config import PATHS, TRAINING_CONFIG +from dataset import LocalJsonlInstructionDataset, format_prompt +from utils import ensure_dirs, setup_logger + + +def _is_valid_hf_model_dir(path: Path) -> bool: + if not path.exists(): + return False + has_config = (path / "config.json").exists() + has_weights = (path / "model.safetensors").exists() or (path / "pytorch_model.bin").exists() + return has_config and has_weights + + +def _resolve_model_path(logger) -> Path: + primary = PATHS.model_dir + fallback = Path("./hf_release/MINDI-1.0-420M") + + if _is_valid_hf_model_dir(primary): + return primary + if _is_valid_hf_model_dir(fallback): + logger.warning( + "Primary model path %s is missing HF files. 
Falling back to %s", + primary.resolve(), + fallback.resolve(), + ) + return fallback + raise FileNotFoundError( + "No valid HuggingFace model directory found.\n" + f"Checked: {primary.resolve()} and {fallback.resolve()}.\n" + "Expected files: config.json + model.safetensors (or pytorch_model.bin)." + ) + + +def _build_model_and_tokenizer(model_path: Path, logger): + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + use_fast=True, + ) + except Exception as fast_exc: + logger.warning("Fast tokenizer load failed: %s. Retrying with slow tokenizer.", fast_exc) + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + use_fast=False, + ) + except Exception as slow_exc: + raise RuntimeError( + "Tokenizer loading failed for both fast and slow modes. " + "Ensure tokenizer files exist in the model folder and install " + "`sentencepiece` (and optionally `tiktoken`) if required." + ) from slow_exc + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=True, + dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + ) + + lora_cfg = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type=TaskType.CAUSAL_LM, + target_modules="all-linear", + ) + model = get_peft_model(model, lora_cfg) + return model, tokenizer + + +def _maybe_resume_train(trainer: Trainer, logger, resume_requested: bool) -> None: + if not resume_requested: + trainer.train() + return + + try: + trainer.train(resume_from_checkpoint=True) + except (ValueError, OSError) as exc: + logger.warning( + "Resume requested but no valid checkpoint found (%s). 
Starting fresh training.", + exc, + ) + trainer.train() + + +def _generate_predictions(model, tokenizer, prompts: List[str], logger) -> None: + model.eval() + device = model.device + logger.info("Running post-training evaluation prompts.") + + for prompt in prompts: + full_prompt = format_prompt( + instruction=prompt, + input_text="", + output_text="", + ) + inputs = tokenizer(full_prompt, return_tensors="pt").to(device) + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=TRAINING_CONFIG.eval_max_new_tokens, + do_sample=True, + temperature=0.2, + top_p=0.95, + pad_token_id=tokenizer.pad_token_id, + ) + decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) + print("\n" + "=" * 80) + print(f"PROMPT: {prompt}") + print("-" * 80) + print(decoded) + + +def train(resume: bool) -> Path: + ensure_dirs( + [ + PATHS.data_dir, + PATHS.output_dir, + PATHS.logs_dir, + PATHS.checkpoint_dir, + PATHS.lora_output_dir, + PATHS.tokenizer_output_dir, + ] + ) + logger = setup_logger("train", PATHS.logs_dir / "train.log") + set_seed(42) + if not torch.cuda.is_available(): + logger.warning( + "CUDA is not available. Training will run on CPU, which is very slow and can limit practical model quality." + ) + + if not PATHS.train_jsonl.exists(): + raise FileNotFoundError( + f"Training dataset not found: {PATHS.train_jsonl.resolve()}. " + "Run data_fetch.py first." 
+ ) + + model_path = _resolve_model_path(logger) + logger.info("Loading model and tokenizer from %s", model_path.resolve()) + model, tokenizer = _build_model_and_tokenizer(model_path, logger) + model.print_trainable_parameters() + + train_dataset = LocalJsonlInstructionDataset(tokenizer, max_length=TRAINING_CONFIG.max_length) + logger.info("Loaded %d samples from %s", len(train_dataset), PATHS.train_jsonl.resolve()) + + training_args = TrainingArguments( + output_dir=str(PATHS.checkpoint_dir), + num_train_epochs=TRAINING_CONFIG.num_train_epochs, + per_device_train_batch_size=TRAINING_CONFIG.per_device_train_batch_size, + gradient_accumulation_steps=TRAINING_CONFIG.gradient_accumulation_steps, + learning_rate=TRAINING_CONFIG.learning_rate, + fp16=torch.cuda.is_available(), + lr_scheduler_type="cosine", + warmup_ratio=0.03, + weight_decay=0.01, + max_grad_norm=1.0, + gradient_checkpointing=True, + group_by_length=True, + logging_steps=TRAINING_CONFIG.logging_steps, + save_steps=TRAINING_CONFIG.save_steps, + save_total_limit=4, + report_to="none", + remove_unused_columns=False, + dataloader_num_workers=2, + dataloader_pin_memory=torch.cuda.is_available(), + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + ) + + logger.info("Starting training. 
Resume mode: %s", resume) + _maybe_resume_train(trainer, logger, resume_requested=resume) + + logger.info("Saving LoRA adapters to %s", PATHS.lora_output_dir.resolve()) + trainer.model.save_pretrained(str(PATHS.lora_output_dir)) + tokenizer.save_pretrained(str(PATHS.tokenizer_output_dir)) + + prompts = [ + "Write a Python binary search function", + "Fix this Python bug: list index out of range", + "Create a FastAPI endpoint", + ] + _generate_predictions(model, tokenizer, prompts, logger) + + print(f"\nLoRA adapters saved to: {PATHS.lora_output_dir.resolve()}") + print(f"Tokenizer saved to: {PATHS.tokenizer_output_dir.resolve()}") + return PATHS.lora_output_dir + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="LoRA fine-tuning for MINDI Python coding tasks.") + parser.add_argument( + "--no-resume", + action="store_true", + help="Disable automatic resume_from_checkpoint=True behavior.", + ) + return parser + + +if __name__ == "__main__": + args = _build_arg_parser().parse_args() + train(resume=not args.no_resume and TRAINING_CONFIG.resume_training) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e5853a51f99f4b57ad9cff42f6738bb92fe24a84 --- /dev/null +++ b/utils.py @@ -0,0 +1,52 @@ +import json +import logging +from pathlib import Path +from typing import Dict, Iterable, List + + +def ensure_dirs(paths: Iterable[Path]) -> None: + for path in paths: + path.mkdir(parents=True, exist_ok=True) + + +def setup_logger(name: str, log_file: Path, level: int = logging.INFO) -> logging.Logger: + log_file.parent.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger(name) + logger.setLevel(level) + + if logger.handlers: + return logger + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + file_handler = logging.FileHandler(log_file, encoding="utf-8") + file_handler.setFormatter(formatter) + 
logger.addHandler(file_handler) + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + return logger + + +def write_jsonl(path: Path, rows: List[Dict[str, str]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + for row in rows: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def read_jsonl(path: Path) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + rows.append(json.loads(line)) + return rows +