Mindigenous committed
Commit 53f0cc2 · 0 Parent(s):

Initial full project backup with Git LFS

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +10 -0
  2. .gitignore +29 -0
  3. CONTEXT_SUMMARY.md +38 -0
  4. README_COMPONENT_1_SETUP.md +83 -0
  5. README_COMPONENT_3_DATASET_PIPELINE.md +46 -0
  6. README_COMPONENT_4_MODEL_ARCHITECTURE.md +28 -0
  7. README_COMPONENT_5_TRAINING_PIPELINE.md +42 -0
  8. README_COMPONENT_8_CHAT_INTERFACE.md +20 -0
  9. README_FINAL_PROJECT.md +126 -0
  10. artifacts/evaluation/component6_eval_results.json +3 -0
  11. artifacts/evaluation/component7_inference_results.json +3 -0
  12. artifacts/export/component10_benchmark_report.json +3 -0
  13. artifacts/model/component4_model_summary.json +3 -0
  14. artifacts/tokenizer/code_tokenizer_v1/tokenizer.json +3 -0
  15. artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json +3 -0
  16. backup_step1000.tar.gz +3 -0
  17. backup_step2000.tar.gz +3 -0
  18. backup_step3000.tar.gz +3 -0
  19. checkpoints/component5_420m/latest.pt +3 -0
  20. checkpoints/component5_420m/step_3000.pt +3 -0
  21. checkpoints/component5_420m/step_3200.pt +3 -0
  22. config.py +45 -0
  23. configs/component10_export_config.yaml +21 -0
  24. configs/component3_dataset_pipeline.yaml +38 -0
  25. configs/component3_incremental_js.yaml +27 -0
  26. configs/component3_reprocess_from_clean.yaml +19 -0
  27. configs/component4_model_config.yaml +18 -0
  28. configs/component5_training_config.verify.yaml +32 -0
  29. configs/component5_training_config.yaml +37 -0
  30. configs/component6_evaluation_config.yaml +21 -0
  31. configs/component7_inference_config.yaml +20 -0
  32. configs/component8_chat_config.yaml +30 -0
  33. configs/component9_lora_config.verify.yaml +32 -0
  34. configs/component9_lora_config.yaml +31 -0
  35. data/cache/raw/code_search_net_python/dataset_dict.json +3 -0
  36. data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow +3 -0
  37. data/cache/raw/code_search_net_python/test/dataset_info.json +3 -0
  38. data/cache/raw/code_search_net_python/test/state.json +3 -0
  39. data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow +3 -0
  40. data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow +3 -0
  41. data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow +3 -0
  42. data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow +3 -0
  43. data/cache/raw/code_search_net_python/train/dataset_info.json +3 -0
  44. data/cache/raw/code_search_net_python/train/state.json +3 -0
  45. data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow +3 -0
  46. data/cache/raw/code_search_net_python/validation/dataset_info.json +3 -0
  47. data/cache/raw/code_search_net_python/validation/state.json +3 -0
  48. data/cache/raw/mbpp/dataset_dict.json +3 -0
  49. data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow +3 -0
  50. data/cache/raw/mbpp/prompt/dataset_info.json +3 -0
.gitattributes ADDED
@@ -0,0 +1,10 @@
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ output/checkpoints/* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/** filter=lfs diff=lfs merge=lfs -text
+ models/** filter=lfs diff=lfs merge=lfs -text
+ data/** filter=lfs diff=lfs merge=lfs -text
+ artifacts/** filter=lfs diff=lfs merge=lfs -text
+ logs/** filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
+ # Ignore Python cache and compiled files.
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Ignore virtual environment.
+ .venv/
+
+ # Ignore logs and temporary outputs.
+ logs/
+ artifacts/
+ *.log
+
+ # Ignore model weights and checkpoints by default.
+ checkpoints/
+ models/base/
+ models/lora/
+ models/quantized/
+
+ # Ignore data files by default.
+ data/raw/
+ data/interim/
+ data/processed/
+ data/external/
+
+ # Ignore notebook checkpoints.
+ .ipynb_checkpoints/
+
CONTEXT_SUMMARY.md ADDED
@@ -0,0 +1,38 @@
+ # Project Context Summary
+
+ This file captures the current state of work from the active collaboration session.
+
+ ## Environment
+ - Original project path: `D:\Desktop 31st Jan 2026\MIND-AI-MODEL`
+ - Target copy path requested: `C:\AI 2`
+ - OS: Windows
+ - GPU: NVIDIA RTX 4060 Laptop (8GB VRAM)
+
+ ## Completed Components
+ 1. Component 1 (Project setup): completed and verified.
+ 2. Component 2 (Custom tokenizer): completed and verified.
+ 3. Component 3 (Dataset pipeline): completed and verified.
+ 4. Component 3 final-step reprocess fix: completed and verified, with JS rebalance.
+ 5. Component 4 (420M transformer architecture): completed and verified.
+
+ ## Current Dataset Stats
+ - Total processed records: 139,531
+ - Python: 115,572
+ - JavaScript: 23,959
+
+ ## Current Model Architecture
+ - Preset: `medium_420m`
+ - Parameters: 423,934,848
+ - Verified forward pass on GPU successful.
+
+ ## Key Files
+ - `configs/component4_model_config.yaml`
+ - `src/model_architecture/code_transformer.py`
+ - `scripts/build_component4_model.py`
+ - `scripts/verify_component4_model.py`
+ - `data/processed/train_tokenized.jsonl`
+ - `data/processed/pipeline_stats.json`
+
+ ## Next Planned Component
+ - Component 5: Training pipeline with FP16, gradient checkpointing, gradient accumulation, checkpointing every 100 steps, resume support, early stopping, and live training metrics.
+
README_COMPONENT_1_SETUP.md ADDED
@@ -0,0 +1,83 @@
+ # Component 1: Project Setup (Windows + RTX 4060 8GB)
+
+ ## What This Component Does
+ - Creates a clean folder structure for the full coding-assistant project.
+ - Sets up a Python virtual environment.
+ - Installs all core dependencies needed across Components 2-10.
+ - Verifies that Python, PyTorch, CUDA visibility, and key libraries work.
+
+ ## Folder Structure Created
+ - `data/raw` -> raw datasets you will provide later
+ - `data/interim` -> temporary cleaned data
+ - `data/processed` -> training-ready tokenized data
+ - `data/external` -> any third-party resources
+ - `src/tokenizer` -> Component 2 code tokenizer
+ - `src/dataset_pipeline` -> Component 3 preprocessing pipeline
+ - `src/model_architecture` -> Component 4 transformer code
+ - `src/training_pipeline` -> Component 5 training loop
+ - `src/evaluation_system` -> Component 6 evaluation code
+ - `src/inference_engine` -> Component 7 inference code
+ - `src/chat_interface` -> Component 8 Gradio interface
+ - `src/finetuning_system` -> Component 9 LoRA fine-tuning
+ - `src/export_optimization` -> Component 10 quantization/export tools
+ - `configs` -> config files for all components
+ - `scripts` -> setup, verification, and utility scripts
+ - `tests` -> quick checks for each component
+ - `checkpoints` -> model checkpoints saved during training
+ - `models/base` -> base trained model files
+ - `models/lora` -> LoRA adapters
+ - `models/quantized` -> optimized quantized models
+ - `artifacts` -> generated reports, metrics, and outputs
+ - `logs` -> training and runtime logs
+
+ ## Exact Commands To Run (in this order)
+ Run from:
+ `D:\Desktop 31st Jan 2026\MIND-AI-MODEL`
+
+ 0. Install Python 3.11 (required for package compatibility):
+    - Download page: https://www.python.org/downloads/release/python-3119/
+    - Windows installer file: `python-3.11.9-amd64.exe`
+    - During install, check: `Add python.exe to PATH`
+
+ 1. Allow script execution for this terminal only:
+ ```powershell
+ Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
+ ```
+
+ 2. If you already attempted setup once, remove old virtual environment first:
+ ```powershell
+ if (Test-Path .\.venv) { Remove-Item -Recurse -Force .\.venv }
+ ```
+
+ 3. Create folders, virtual env, install dependencies:
+ ```powershell
+ .\scripts\setup_windows_environment.ps1
+ ```
+
+ 4. Activate virtual environment:
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ ```
+
+ 5. Verify setup:
+ ```powershell
+ python .\scripts\verify_component1_setup.py
+ ```
+
+ ## Expected Verification Result
+ - Prints Python version
+ - Prints PyTorch version
+ - Shows whether CUDA is available
+ - Shows GPU name if available
+ - Confirms critical libraries import correctly
+
+ Note:
+ - `codebleu` is excluded from base install on Windows due to a `tree-sitter` dependency conflict on Python 3.11.
+ - Component 6 will use Windows-stable evaluation metrics and add code-quality checks without breaking setup.
+ - `bitsandbytes` is optional on native Windows because some CUDA/driver combinations fail to load its DLL.
+ - Base setup and all early components continue without it.
+ - For Component 5, we will:
+   - try `bitsandbytes` if available, and
+   - automatically fall back to a stable optimizer on your machine if it is not.
+
+ If verification fails, copy the full terminal output and share it with me.
README_COMPONENT_3_DATASET_PIPELINE.md ADDED
@@ -0,0 +1,46 @@
+ # Component 3: Dataset Pipeline
+
+ ## What This Component Does (Simple English)
+ - Downloads the 3 datasets directly from Hugging Face (no manual download files).
+ - Reads them in streaming mode so your RAM usage stays low.
+ - Cleans prompt/code text.
+ - Removes low-quality and likely auto-generated data.
+ - Removes duplicate prompt+code pairs using a disk-backed SQLite index.
+ - Detects language (Python or JavaScript) when unclear.
+ - Tokenizes all cleaned records using the Component 2 tokenizer.
+ - Saves training-ready tokenized JSONL output.
+
+ ## Files Created By This Component
+ - `configs/component3_dataset_pipeline.yaml`
+ - `src/dataset_pipeline/hf_dataset_pipeline.py`
+ - `scripts/run_component3_dataset_pipeline.py`
+ - `scripts/verify_component3_dataset_pipeline.py`
+
+ ## Required Before Running
+ - Component 2 tokenizer must exist at:
+   - `artifacts/tokenizer/code_tokenizer_v1/tokenizer.json`
+   - `artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json`
+
+ ## Quick Verification Run (small test)
+ Run from project root:
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ python .\scripts\verify_component3_dataset_pipeline.py
+ ```
+
+ This uses `200` records per dataset for a smoke test.
+
+ ## Full Pipeline Run
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ python .\scripts\run_component3_dataset_pipeline.py --config .\configs\component3_dataset_pipeline.yaml
+ ```
+
+ ## Output Files
+ - Clean merged dataset:
+   - `data/interim/combined_clean.jsonl`
+ - Tokenized training dataset:
+   - `data/processed/train_tokenized.jsonl`
+ - Stats summary:
+   - `data/processed/pipeline_stats.json`
+
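The dedupe step above (a disk-backed SQLite index over prompt+code pairs) can be sketched roughly as follows. This is an illustrative assumption of how such an index works, not the pipeline's actual code; the table name, hashing choice, and in-memory database are demonstration details:

```python
import hashlib
import sqlite3


def dedupe_records(records, db_path=":memory:"):
    """Yield only the first occurrence of each prompt+code pair.

    A SQLite table holds SHA-256 hashes, so the seen-set can live on
    disk instead of in RAM (':memory:' is used here for demonstration).
    """
    con = sqlite3.connect(db_path)
    con.execute("CREATE TABLE IF NOT EXISTS seen (h TEXT PRIMARY KEY)")
    for rec in records:
        key = hashlib.sha256(
            (rec["prompt"] + "\x00" + rec["code"]).encode("utf-8")
        ).hexdigest()
        try:
            con.execute("INSERT INTO seen (h) VALUES (?)", (key,))
        except sqlite3.IntegrityError:
            continue  # duplicate pair: skip it
        yield rec
    con.commit()


rows = [
    {"prompt": "add two numbers", "code": "def add(a, b):\n    return a + b"},
    {"prompt": "add two numbers", "code": "def add(a, b):\n    return a + b"},
    {"prompt": "reverse a string", "code": "def rev(s):\n    return s[::-1]"},
]
unique = list(dedupe_records(rows))
print(len(unique))  # 2
```

Because the PRIMARY KEY constraint rejects repeated hashes, memory stays flat no matter how many records stream through.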
README_COMPONENT_4_MODEL_ARCHITECTURE.md ADDED
@@ -0,0 +1,28 @@
+ # Component 4: Model Architecture (420M Starter)
+
+ ## What This Component Builds
+ - A decoder-only transformer language model for code generation.
+ - Configurable size through YAML config.
+ - Presets for small, medium (420M target), and large.
+ - Attention + rotary positional encoding + feed-forward blocks.
+
+ ## Main Files
+ - `src/model_architecture/code_transformer.py`
+ - `configs/component4_model_config.yaml`
+ - `scripts/build_component4_model.py`
+ - `scripts/verify_component4_model.py`
+
+ ## Commands (run from project root)
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ python .\scripts\build_component4_model.py --config .\configs\component4_model_config.yaml
+ python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256
+ ```
+
+ ## What Success Looks Like
+ - Build script prints parameter count near the 420M target.
+ - Verify script prints:
+   - VRAM usage at multiple stages
+   - output tensor shape
+   - `Component 4 verification passed.`
+
README_COMPONENT_5_TRAINING_PIPELINE.md ADDED
@@ -0,0 +1,42 @@
+ # Component 5: Training Pipeline
+
+ ## What This Component Does
+ - Trains the 420M transformer on tokenized data.
+ - Uses FP16 mixed precision to reduce VRAM.
+ - Uses gradient checkpointing to save memory.
+ - Uses gradient accumulation for larger effective batch size.
+ - Attempts Adam8bit optimizer when available, otherwise safely falls back.
+ - Saves checkpoint every 100 steps by default.
+ - Supports resuming from latest checkpoint.
+ - Evaluates periodically and supports early stopping.
+ - Shows live loss, LR, ETA, and VRAM.
+
+ ## Main Files
+ - `configs/component5_training_config.yaml`
+ - `src/training_pipeline/tokenized_dataset.py`
+ - `scripts/train_component5.py`
+ - `scripts/verify_component5_training_pipeline.py`
+
+ ## Commands
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ python .\scripts\verify_component5_training_pipeline.py
+ python .\scripts\train_component5.py --config .\configs\component5_training_config.yaml
+ ```
+
+ ## VRAM and Runtime (RTX 4060 8GB)
+ - Expected VRAM during training with default config: about 5.8 to 6.9 GB.
+ - Safety stop is enabled at 7.0 GB.
+ - Approx training time for 1 epoch equivalent: ~30 to 65 hours.
+
+ ## Common Failures and Fixes
+ 1. OOM or VRAM threshold hit:
+    - Reduce `max_seq_len` (e.g., 512 -> 384).
+    - Increase `grad_accum_steps`.
+ 2. Training too slow:
+    - Lower `max_seq_len` for first run.
+    - Keep `micro_batch_size=1` and adjust accumulation.
+ 3. Resume issues:
+    - Ensure `checkpoints/component5_420m/latest.pt` exists.
+ 4. Validation not improving:
+    - Lower LR and increase warmup.
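The early-stopping rule described above can be sketched in a few lines; the class name and exact bookkeeping are illustrative assumptions matching the config keys `early_stopping_patience_evals` and `early_stopping_min_delta`, not the project's actual implementation:

```python
class EarlyStopping:
    """Stop when validation loss has not improved by at least
    `min_delta` for `patience` consecutive evaluations."""

    def __init__(self, patience=5, min_delta=0.0005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("inf")
        self.bad_evals = 0

    def update(self, val_loss):
        """Record one evaluation; return True when training should stop."""
        if val_loss < self.best - self.min_delta:
            self.best = val_loss
            self.bad_evals = 0
        else:
            self.bad_evals += 1
        return self.bad_evals >= self.patience


stopper = EarlyStopping(patience=3, min_delta=0.01)
losses = [2.0, 1.8, 1.79, 1.795, 1.791, 1.792]
stops = [stopper.update(loss) for loss in losses]
print(stops)  # the plateau after 1.8 eventually trips the patience counter
```

Counting patience in evaluations (not steps) means `eval_every` directly controls how long a plateau is tolerated.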
README_COMPONENT_8_CHAT_INTERFACE.md ADDED
@@ -0,0 +1,20 @@
+ # Component 8: Local Chat Interface
+
+ ## What it gives you
+ - Browser chat UI for your local coding model.
+ - Uses Component 7 inference engine automatically.
+ - Dark theme, prompt box, code cards, copy button per response.
+ - Syntax highlighting for Python and JavaScript.
+ - Shows generation time and generated token count.
+ - Keeps conversation history in the current session.
+ - Clear button to reset conversation.
+
+ ## Launch (single command)
+ ```powershell
+ python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml
+ ```
+
+ ## URL to open
+ - `http://127.0.0.1:7860`
+
+ No internet is needed for local usage.
README_FINAL_PROJECT.md ADDED
@@ -0,0 +1,126 @@
+ # Final Project README - MINDI 1.0 420M (Windows, RTX 4060 8GB)
+
+ ## What This Project Is
+ This is a fully local coding-assistant model system built step-by-step from scratch.
+ It supports:
+ - custom tokenizer for code
+ - dataset cleaning + tokenization pipeline
+ - 420M transformer model
+ - memory-optimized training
+ - evaluation + inference improvements
+ - local chat UI
+ - LoRA fine-tuning
+ - INT8 export + portable package
+
+ Everything runs locally on your machine without internet after setup.
+
+ ---
+
+ ## What You Built (High Level)
+ 1. **Project setup** with reproducible environment and verification scripts.
+ 2. **Custom code tokenizer** (Python + JavaScript aware).
+ 3. **Dataset pipeline** with cleaning, dedupe, and tokenization.
+ 4. **420M transformer architecture** (modular config).
+ 5. **Training pipeline** (FP16, checkpointing, accumulation, resume, early stopping).
+ 6. **Evaluation system** (val metrics + generation checks).
+ 7. **Inference engine** (greedy mode, stop rules, syntax-aware retry).
+ 8. **Local chat interface** with history, copy button, timing, and mode selector.
+ 9. **LoRA fine-tuning pipeline** for your own examples.
+ 10. **Export/quantization/packaging** with benchmark report and portable launcher.
+
+ ---
+
+ ## Most Important File Locations
+
+ ### Core model and data
+ - Base checkpoint: `checkpoints/component5_420m/step_3200.pt`
+ - Tokenized training data: `data/processed/train_tokenized.jsonl`
+ - Tokenizer: `artifacts/tokenizer/code_tokenizer_v1/`
+
+ ### LoRA
+ - Best LoRA adapter: `models/lora/custom_lora_v1/best.pt`
+ - LoRA metadata: `models/lora/custom_lora_v1/adapter_meta.json`
+
+ ### Quantized model
+ - INT8 model: `models/quantized/model_step3200_int8_state.pt`
+ - Benchmark report: `artifacts/export/component10_benchmark_report.json`
+
+ ### Chat interface
+ - Launcher: `scripts/launch_component8_chat.py`
+ - Chat config: `configs/component8_chat_config.yaml`
+
+ ### Portable package
+ - Folder: `release/MINDI_1.0_420M`
+ - Double-click launcher: `release/MINDI_1.0_420M/Start_MINDI.bat`
+
+ ---
+
+ ## Launch the Main Chat UI
+ From project root (`C:\AI 2`):
+
+ ```powershell
+ .\.venv\Scripts\Activate.ps1
+ python .\scripts\launch_component8_chat.py --config .\configs\component8_chat_config.yaml
+ ```
+
+ Open in browser:
+ - `http://127.0.0.1:7860`
+
+ ### Live model selector in UI
+ You can switch without restart:
+ - `base`
+ - `lora`
+ - `int8`
+
+ Status box shows:
+ - active mode
+ - mode load time
+ - live VRAM usage
+
+ ---
+
+ ## How to Add More Training Data (Future Improvement)
+
+ ### A) Add more base-training pairs (full training path)
+ 1. Put new JSONL/JSON files in `data/raw/`.
+ 2. Run dataset processing scripts (Component 3 path).
+ 3. Continue/refresh base training with Component 5.
+
+ ### B) Add targeted improvements quickly (LoRA recommended)
+ 1. Edit `data/raw/custom_finetune_pairs.jsonl` with your new prompt/code pairs.
+    - Required fields per row: `prompt`, `code`
+    - Optional: `language` (`python` or `javascript`)
+ 2. Run LoRA fine-tuning:
+
+ ```powershell
+ python .\scripts\run_component9_lora_finetune.py --config .\configs\component9_lora_config.yaml
+ ```
+
+ 3. Use updated adapter in chat by selecting `lora` mode.
+
+ ---
+
+ ## Recommended Next Habit
+ When quality is weak on specific tasks:
+ 1. Add 20-200 clean examples of exactly that task style to `custom_finetune_pairs.jsonl`.
+ 2. Re-run LoRA fine-tuning.
+ 3. Test in chat `lora` mode.
+ 4. Repeat in small cycles.
+
+ This gives faster improvement than retraining the full base model each time.
+
+ ---
+
+ ## One-File Health Check Commands
+
+ ```powershell
+ python .\scripts\verify_component1_setup.py
+ python .\scripts\verify_component4_model.py --config .\configs\component4_model_config.yaml --batch_size 1 --seq_len 256
+ python .\scripts\verify_component9_lora.py
+ ```
+
+ ---
+
+ ## Current Status
+ Project is complete across Components 1-10 and verified on your hardware.
+
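The fine-tuning rows described in section B of the README above are plain JSONL: one JSON object per line with `prompt` and `code`, plus an optional `language`. Appending new pairs needs only the standard library; the temp-file path here is a demo stand-in for the project's `data/raw/custom_finetune_pairs.jsonl`:

```python
import json
import tempfile
from pathlib import Path

rows = [
    {
        "prompt": "Write a function that returns the nth Fibonacci number.",
        "code": "def fib(n):\n    a, b = 0, 1\n    for _ in range(n):\n        a, b = b, a + b\n    return a",
        "language": "python",  # optional field
    },
]

# Demo location; the project file lives at data/raw/custom_finetune_pairs.jsonl.
path = Path(tempfile.mkdtemp()) / "custom_finetune_pairs.jsonl"
with path.open("a", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

# Read back and confirm the required fields are present.
lines = path.read_text(encoding="utf-8").strip().splitlines()
parsed = json.loads(lines[-1])
print(sorted(parsed))  # ['code', 'language', 'prompt']
```

Opening in append mode means new cycles of examples accumulate without disturbing earlier rows.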
artifacts/evaluation/component6_eval_results.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3da6ee747d77b0c8cdca5d4fedb750549a9e5e7c42592e5e32e6103ff5617d8f
+ size 2379
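Each LFS-tracked file in this commit is stored in Git as a small pointer with exactly these three key-value lines; parsing one takes a few lines of Python (the helper name is illustrative):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer file into a dict of its key/value lines."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:3da6ee747d77b0c8cdca5d4fedb750549a9e5e7c42592e5e32e6103ff5617d8f
size 2379
"""
info = parse_lfs_pointer(pointer)
print(info["size"])  # 2379
```

The `oid` is the SHA-256 of the real content, and `size` is its byte length, which is why multi-gigabyte checkpoints show up as `+3 -0` diffs here.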
artifacts/evaluation/component7_inference_results.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce08bfd6918f619fdcb1ef17ec1db79c2d32578d12a02aaaae7b7092f83384ae
+ size 5863
artifacts/export/component10_benchmark_report.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d827ec736fbdc4ea2ed5bc196223f1bf02d11a9260acd451edd51f8f39bcda75
+ size 545
artifacts/model/component4_model_summary.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab5ebc8aa081f82bbcaee2c945b207b4db3251f63b845ed86055f4e5b7204010
+ size 328
artifacts/tokenizer/code_tokenizer_v1/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fe04cc37ac778637cb2cc02a6096412e5d8cada3e4ef3e4a7f2d141fccab8a0
+ size 11475
artifacts/tokenizer/code_tokenizer_v1/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb0b7af679bac1c29fe7ac9f86c48f1fed5584ba72c9ef2c338f60b63e07bb46
+ size 302
backup_step1000.tar.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebe005c43dd59c9c49ad153d41af1bdaaad47c2a21ae231a4c5e90c8005560af
+ size 337623475
backup_step2000.tar.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:861329fb551b4c6406e92e06cfa1faae592f0fe0d0ce713189a57c62b33b0969
+ size 337571785
backup_step3000.tar.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:238c2859ebf4efc0195456a898d2fb8bce0397e39fdf59e9f940963232d628a8
+ size 337762553
checkpoints/component5_420m/latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32d26a7dd9e6e294c6657f6fb3a4d947cf52eb8e1c0b11032722fa50d15c4a21
+ size 5087449970
checkpoints/component5_420m/step_3000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e11bded40789574ef316636c02c2fd1e8cd54c13441d8cd6a28980f2209ffaa9
+ size 5087455158
checkpoints/component5_420m/step_3200.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71d2ea9401f3b08b2528dbb8f993949794d0adb57642d0f4752d74da0e445238
+ size 5087455158
config.py ADDED
@@ -0,0 +1,45 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass(frozen=True)
+ class Paths:
+     project_root: Path = Path(".")
+     model_dir: Path = Path("./model")
+     data_dir: Path = Path("./data")
+     output_dir: Path = Path("./output")
+     logs_dir: Path = Path("./logs")
+
+     train_jsonl: Path = Path("./data/train.jsonl")
+     dataset_cache_dir: Path = Path("./data/cache")
+     raw_dataset_dir: Path = Path("./data/cache/raw")
+     checkpoint_dir: Path = Path("./output/checkpoints")
+     lora_output_dir: Path = Path("./output/lora_adapters")
+     tokenizer_output_dir: Path = Path("./output/tokenizer")
+
+
+ @dataclass(frozen=True)
+ class DataConfig:
+     max_total_samples: int = 200000
+     max_humaneval_samples: int = 20000
+     max_mbpp_samples: int = 50000
+     max_codesearchnet_samples: int = 180000
+     min_output_chars: int = 40
+
+
+ @dataclass(frozen=True)
+ class TrainingConfig:
+     num_train_epochs: int = 5
+     per_device_train_batch_size: int = 1
+     gradient_accumulation_steps: int = 8
+     learning_rate: float = 1e-5
+     max_length: int = 1024
+     save_steps: int = 250
+     logging_steps: int = 20
+     eval_max_new_tokens: int = 220
+     resume_training: bool = True
+
+
+ PATHS = Paths()
+ DATA_CONFIG = DataConfig()
+ TRAINING_CONFIG = TrainingConfig()
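`config.py` above relies on `@dataclass(frozen=True)` so config values cannot be mutated after import. A minimal self-contained demonstration of that pattern (mirroring, not importing, the real module):

```python
import dataclasses
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Paths:
    # One field from the real config, reproduced for illustration.
    checkpoint_dir: Path = Path("./output/checkpoints")


PATHS = Paths()
print(PATHS.checkpoint_dir)

try:
    PATHS.checkpoint_dir = Path("/tmp")  # frozen instances reject assignment
except dataclasses.FrozenInstanceError as exc:
    error = type(exc).__name__
print(error)  # FrozenInstanceError
```

The failure is loud and immediate, so a stray `PATHS.checkpoint_dir = ...` in training code cannot silently redirect checkpoints.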
configs/component10_export_config.yaml ADDED
@@ -0,0 +1,21 @@
+ # Component 10 export and optimization config
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   source_checkpoint_path: checkpoints/component5_420m/step_3200.pt
+   tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+
+ quantization:
+   quantized_output_path: models/quantized/model_step3200_int8_state.pt
+
+ benchmark:
+   prompt: Write a Python function to compute factorial of n.
+   max_new_tokens: 120
+
+ package:
+   output_dir: release/MINDI_1.0_420M
+   app_port: 7861
+
+ outputs:
+   benchmark_report_json: artifacts/export/component10_benchmark_report.json
+
configs/component3_dataset_pipeline.yaml ADDED
@@ -0,0 +1,38 @@
+ # Component 3 config: load, clean, deduplicate, tokenize.
+
+ tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+ interim_output_dir: data/interim
+ processed_output_dir: data/processed
+ dedupe_db_path: data/interim/dedupe_hashes.sqlite
+
+ # Set null for full run.
+ # Use a small number like 500 for fast smoke testing.
+ max_records_per_dataset: null
+
+ min_prompt_chars: 8
+ min_code_chars: 16
+ max_code_chars: 40000
+ progress_every: 1000
+
+ datasets:
+   - hf_dataset_id: iamtarun/python_code_instructions_18k_alpaca
+     split: train
+     prompt_field: instruction
+     code_field: output
+     language_field: null
+     default_language: python
+
+   - hf_dataset_id: sahil2801/CodeAlpaca-20k
+     split: train
+     prompt_field: instruction
+     code_field: output
+     language_field: null
+     default_language: python
+
+   - hf_dataset_id: TokenBender/code_instructions_122k_alpaca_style
+     split: train
+     prompt_field: instruction
+     code_field: output
+     language_field: null
+     default_language: python
+
configs/component3_incremental_js.yaml ADDED
@@ -0,0 +1,27 @@
+ # Incremental JS augmentation config.
+ # This script appends new JavaScript samples into existing Component 3 outputs.
+
+ tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+ existing_clean_path: data/interim/combined_clean.jsonl
+ existing_tokenized_path: data/processed/train_tokenized.jsonl
+ existing_stats_path: data/processed/pipeline_stats.json
+ dedupe_db_path: data/interim/dedupe_hashes_incremental.sqlite
+
+ # Chosen dataset for JS augmentation.
+ new_dataset:
+   hf_dataset_id: philschmid/code-alpaca-ruby-python-javascript
+   split: train
+   prompt_field: instruction
+   code_field: output
+   language_field: null
+   default_language: auto
+
+ # Hard target requested by user.
+ target_new_javascript_examples: 20000
+
+ # Quality filters (same idea as Component 3).
+ min_prompt_chars: 8
+ min_code_chars: 16
+ max_code_chars: 40000
+ progress_every: 500
+
configs/component3_reprocess_from_clean.yaml ADDED
@@ -0,0 +1,19 @@
+ # Reprocess config: no dataset download, no full pipeline rebuild.
+ # It reads existing cleaned data and regenerates tokenized output.
+
+ tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+ input_clean_path: data/interim/combined_clean.jsonl
+ output_tokenized_path: data/processed/train_tokenized.jsonl
+ output_stats_path: data/processed/pipeline_stats.json
+
+ # Safety backups before overwrite.
+ backup_existing_tokenized: true
+ backup_existing_stats: true
+
+ # Existing language labels in clean file may be wrong from earlier runs.
+ # true = infer language from prompt+code content only.
+ ignore_existing_language_labels: true
+
+ # Optional quick test mode.
+ # Set null for full reprocess.
+ max_records: null
configs/component4_model_config.yaml ADDED
@@ -0,0 +1,18 @@
+ # Component 4 model config.
+ # You can switch the preset name or directly edit dimensions below.
+
+ preset: medium_420m
+
+ model:
+   vocab_size: 50000
+   max_seq_len: 2048
+   d_model: 1152
+   n_layers: 23
+   n_heads: 16
+   d_ff: 4608
+   dropout: 0.1
+   tie_embeddings: true
+   gradient_checkpointing: false
+   init_std: 0.02
+   rms_norm_eps: 0.00001
+
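Under standard assumptions for this kind of decoder (bias-free attention and feed-forward projections, two RMSNorm weight vectors per layer plus a final norm, and tied input/output embeddings per `tie_embeddings: true`), the dimensions above reproduce the parameter count reported in CONTEXT_SUMMARY.md:

```python
# Dimensions from configs/component4_model_config.yaml.
vocab_size, d_model, n_layers, d_ff = 50000, 1152, 23, 4608

embedding = vocab_size * d_model    # tied with the output head, counted once
attention = 4 * d_model * d_model   # q/k/v/o projections, assumed bias-free
feed_forward = 2 * d_model * d_ff   # up and down projections
norms_per_layer = 2 * d_model       # two RMSNorm weight vectors per block

per_layer = attention + feed_forward + norms_per_layer
total = embedding + n_layers * per_layer + d_model  # + final RMSNorm

print(total)  # 423934848
```

That this lands exactly on 423,934,848 is a useful sanity check that the config, the build script's printed count, and the summary all agree.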
configs/component5_training_config.verify.yaml ADDED
@@ -0,0 +1,32 @@
+ data:
+   tokenized_jsonl_path: data/processed/train_tokenized.jsonl
+   val_ratio: 0.02
+   split_seed: 17
+   num_workers: 0
+ model:
+   model_config_path: configs/component4_model_config.yaml
+ training:
+   output_dir: checkpoints/component5_420m
+   log_every: 1
+   eval_every: 5
+   save_every: 5
+   max_steps: 5
+   micro_batch_size: 1
+   grad_accum_steps: 16
+   max_seq_len: 512
+   learning_rate: 0.0002
+   weight_decay: 0.1
+   betas:
+     - 0.9
+     - 0.95
+   grad_clip_norm: 1.0
+   warmup_steps: 300
+   min_lr_ratio: 0.1
+   use_fp16: true
+   use_gradient_checkpointing: true
+   prefer_8bit_adam: true
+   early_stopping_patience_evals: 20
+   early_stopping_min_delta: 0.0005
+   max_vram_gb: 7.0
+ resume:
+   resume_from: none
configs/component5_training_config.yaml ADDED
@@ -0,0 +1,37 @@
+ # Component 5 training config for RTX 4060 8GB.
+
+ data:
+   tokenized_jsonl_path: data/processed/train_tokenized.jsonl
+   val_ratio: 0.02
+   split_seed: 17
+   num_workers: 2
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+
+ training:
+   output_dir: checkpoints/component5_420m
+   log_every: 10
+   eval_every: 100
+   save_every: 200
+   max_steps: 8000
+   micro_batch_size: 1
+   grad_accum_steps: 16
+   max_seq_len: 448
+   learning_rate: 0.00022
+   weight_decay: 0.1
+   betas: [0.9, 0.95]
+   grad_clip_norm: 1.0
+   warmup_steps: 300
+   min_lr_ratio: 0.1
+   use_fp16: true
+   use_gradient_checkpointing: true
+   prefer_8bit_adam: true
+   early_stopping_patience_evals: 5
+   early_stopping_min_delta: 0.0005
+   max_vram_gb: 7.0
+
+ resume:
+   resume_from: latest # latest | none | explicit checkpoint path
+
+
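With `micro_batch_size: 1` and `grad_accum_steps: 16`, the optimizer sees an effective batch of 16 sequences per step; at `max_seq_len: 448` that fixes the token budget per optimizer step. Simple arithmetic on the values above:

```python
# Values from the training section of this config.
micro_batch_size = 1
grad_accum_steps = 16
max_seq_len = 448

# Sequences contributing to each optimizer update.
effective_batch = micro_batch_size * grad_accum_steps
# Upper bound on tokens per optimizer step (sequences may be shorter).
tokens_per_step = effective_batch * max_seq_len

print(effective_batch, tokens_per_step)  # 16 7168
```

Keeping the micro batch at 1 and scaling via accumulation is what lets an 8 GB card train a 420M model at all: VRAM cost follows the micro batch, while gradient quality follows the effective batch.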
configs/component6_evaluation_config.yaml ADDED
@@ -0,0 +1,21 @@
+ # Component 6 evaluation config.
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   checkpoint_paths:
+     - checkpoints/component5_420m/step_3200.pt
+
+ data:
+   tokenized_jsonl_path: data/processed/train_tokenized.jsonl
+   val_ratio: 0.02
+   split_seed: 17
+
+ inference:
+   max_seq_len: 448
+   max_new_tokens: 160
+   temperature: 0.25
+   top_p: 0.85
+
+ output:
+   results_json: artifacts/evaluation/component6_eval_results.json
+
configs/component7_inference_config.yaml ADDED
@@ -0,0 +1,20 @@
+ # Component 7 inference config
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   checkpoint_path: checkpoints/component5_420m/step_3200.pt
+   tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+
+ inference:
+   language: python
+   max_new_tokens: 180
+   greedy_temperature: 0.0
+   retry2_temperature: 0.25
+   retry2_top_p: 0.85
+   retry3_temperature: 0.35
+   retry3_top_p: 0.90
+   max_retries: 3
+   min_tokens_before_stop_check: 24
+
+ output:
+   results_json: artifacts/evaluation/component7_inference_results.json
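The retry ladder configured above (greedy first, then two progressively warmer sampled attempts, keeping the first candidate that passes a syntax check) can be sketched as follows. `generate` is a stand-in for the real engine, and using `ast.parse` as the syntax gate is an assumption that matches the "syntax-aware retry" described in the project README:

```python
import ast

# (temperature, top_p) ladder mirroring the config above; None = no top-p.
RETRY_LADDER = [(0.0, None), (0.25, 0.85), (0.35, 0.90)]


def generate_with_retries(generate, prompt):
    """Return the first candidate that parses as valid Python,
    or the last attempt if none do."""
    candidate = ""
    for temperature, top_p in RETRY_LADDER:
        candidate = generate(prompt, temperature=temperature, top_p=top_p)
        try:
            ast.parse(candidate)  # cheap syntax gate, no execution
            return candidate
        except SyntaxError:
            continue
    return candidate


# Fake engine: the greedy output is broken, the first sampled retry is valid.
attempts = iter(["def f(:", "def f(n):\n    return n * 2"])
result = generate_with_retries(lambda p, **kw: next(attempts), "double n")
print(result.splitlines()[0])  # def f(n):
```

Escalating temperature only after a failed parse keeps output deterministic in the common case while giving malformed generations a second chance.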
configs/component8_chat_config.yaml ADDED
@@ -0,0 +1,30 @@
+ # Component 8 chat interface config.
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   base_checkpoint_path: checkpoints/component5_420m/step_3200.pt
+   lora_adapter_path: models/lora/custom_lora_v1/best.pt
+   quantized_state_path: models/quantized/model_step3200_int8_state.pt
+   tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+
+ lora:
+   r: 8
+   alpha: 16
+   dropout: 0.05
+   target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2]
+
+ inference:
+   language_default: python
+   max_new_tokens: 300
+   greedy_temperature: 0.0
+   retry2_temperature: 0.25
+   retry2_top_p: 0.85
+   retry3_temperature: 0.35
+   retry3_top_p: 0.90
+   max_retries: 3
+   min_tokens_before_stop_check: 64
+
+ server:
+   host: 127.0.0.1
+   port: 7860
+   share: false
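The `inference` section encodes an escalating sampling ladder: attempt 1 is greedy, attempts 2 and 3 sample with progressively higher temperature and wider top-p. A sketch of how a loader might turn those keys into a schedule (the function name and dict shape are ours, not part of the repo):

```python
def retry_schedule(inference_cfg):
    """Build the per-attempt sampling ladder implied by the config:
    attempt 1 greedy, later attempts increasingly exploratory."""
    ladder = [
        {"temperature": inference_cfg["greedy_temperature"], "top_p": 1.0},
        {"temperature": inference_cfg["retry2_temperature"],
         "top_p": inference_cfg["retry2_top_p"]},
        {"temperature": inference_cfg["retry3_temperature"],
         "top_p": inference_cfg["retry3_top_p"]},
    ]
    return ladder[: inference_cfg["max_retries"]]
```

Note the chat interface raises `min_tokens_before_stop_check` to 64 (versus 24 for batch inference), presumably to avoid cutting conversational replies off too early.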
configs/component9_lora_config.verify.yaml ADDED
@@ -0,0 +1,32 @@
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   base_checkpoint_path: checkpoints/component5_420m/step_3200.pt
+   tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+ lora:
+   r: 8
+   alpha: 16
+   dropout: 0.05
+   target_keywords:
+     - q_proj
+     - k_proj
+     - v_proj
+     - o_proj
+     - fc1
+     - fc2
+ finetune:
+   custom_data_path: data/raw/custom_finetune_pairs.jsonl
+   output_dir: models/lora/custom_lora_v1
+   max_seq_len: 512
+   micro_batch_size: 1
+   grad_accum_steps: 16
+   learning_rate: 0.0003
+   weight_decay: 0.0
+   max_steps: 5
+   save_every: 5
+   eval_every: 5
+   early_stopping_patience_evals: 6
+   early_stopping_min_delta: 0.0005
+   use_fp16: true
+   max_vram_gb: 7.0
+ resume:
+   resume_from: none
configs/component9_lora_config.yaml ADDED
@@ -0,0 +1,31 @@
+ # Component 9 LoRA fine-tuning config
+
+ model:
+   model_config_path: configs/component4_model_config.yaml
+   base_checkpoint_path: checkpoints/component5_420m/step_3200.pt
+   tokenizer_dir: artifacts/tokenizer/code_tokenizer_v1
+
+ lora:
+   r: 8
+   alpha: 16
+   dropout: 0.05
+   target_keywords: [q_proj, k_proj, v_proj, o_proj, fc1, fc2]
+
+ finetune:
+   custom_data_path: data/raw/custom_finetune_pairs.jsonl
+   output_dir: models/lora/custom_lora_v1
+   max_seq_len: 512
+   micro_batch_size: 1
+   grad_accum_steps: 16
+   learning_rate: 0.0003
+   weight_decay: 0.0
+   max_steps: 1200
+   save_every: 100
+   eval_every: 100
+   early_stopping_patience_evals: 6
+   early_stopping_min_delta: 0.0005
+   use_fp16: true
+   max_vram_gb: 7.0
+
+ resume:
+   resume_from: none  # none | latest | explicit path
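The `r` and `alpha` fields control the LoRA update: each targeted linear layer keeps its frozen weight `W` and learns a low-rank pair `A` (in_dim x r) and `B` (r x out_dim), so the effective weight is `W + (alpha / r) * A @ B`. A dependency-free sketch of the forward math with toy matrices (the real implementation wraps the model's PyTorch linears and also applies dropout on the low-rank path):

```python
def matmul(X, Y):
    """Plain-Python matrix multiply: X (m x k) times Y (k x n)."""
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*Y)]
            for row in X]

def lora_forward(x, W, A, B, r, alpha):
    """Base path plus scaled low-rank update:
    x @ W + (alpha / r) * (x @ A) @ B, without materializing the merged weight."""
    scale = alpha / r
    base = matmul(x, W)
    delta = matmul(matmul(x, A), B)  # cheap: rank-r bottleneck
    return [[b + scale * d for b, d in zip(brow, drow)]
            for brow, drow in zip(base, delta)]
```

With `r: 8` and `alpha: 16` the scale is 2.0, and only `A` and `B` receive gradients, which is why a 420M-parameter base can be fine-tuned within the 7 GB VRAM budget above.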
data/cache/raw/code_search_net_python/dataset_dict.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bf46fe547f16d795abe0d4c8a591bf031d98882d638931d27660455ee986273
+ size 43
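The data files in this commit are Git LFS pointer stubs rather than the Arrow shards themselves: three `key value` lines giving the spec version, the content hash, and the size in bytes of the real object. A minimal sketch of parsing one such pointer (the helper name is ours, not part of the repo):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer file body into a dict of its key/value lines."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # format: "<key> <value>"
        fields[key] = value
    return fields
```

The `size` field is what makes the large shards below (hundreds of MB each) cheap to track in git: the repo stores only these ~130-byte pointers.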
data/cache/raw/code_search_net_python/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:079bce0f0e2513bae63c12f8699e4ea13ec545c5000844de28dc34a1a9fd19eb
+ size 84367104
data/cache/raw/code_search_net_python/test/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c
+ size 2598
data/cache/raw/code_search_net_python/test/state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55d5fecb65147f455bfc8249c3e26fc6a2bd01bfd8bd9f354e86eb7834453d1c
+ size 261
data/cache/raw/code_search_net_python/train/data-00000-of-00004.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5984af399adbfdab06aca7da7638f6a5eb98411b15b88a1f045f346735fbc9c
+ size 377852224
data/cache/raw/code_search_net_python/train/data-00001-of-00004.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a62df607497be1fd23f3e8aa50908bebff6732ccc8b5dacbfaa0efd336ad915
+ size 411927504
data/cache/raw/code_search_net_python/train/data-00002-of-00004.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d519b4edb8ae27d8e1ab6474a8decc40f45c6a8e7c409039c865abbc9763f351
+ size 370005344
data/cache/raw/code_search_net_python/train/data-00003-of-00004.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b42ae91a5e6e48dd32eac5940429d726f0dbc9440d0262a40a3bfe7a0e2e6214
+ size 400292712
data/cache/raw/code_search_net_python/train/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c
+ size 2598
data/cache/raw/code_search_net_python/train/state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:180b84fce72622f4113ea103a1fbf79924e61881442db8728b055be042247bcf
+ size 448
data/cache/raw/code_search_net_python/validation/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9f848f9c1dfe1c2cfac25fd1b529e050e29291a5d8042ba1d4f904948142c64
+ size 92180808
data/cache/raw/code_search_net_python/validation/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8ba7e0c98d4303660c791c0af8da617dce739fcf2be906ee269c6bf572bad9c
+ size 2598
data/cache/raw/code_search_net_python/validation/state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20e5f3cf2d550a3fb9b3d3e43f23f25dfaae9ae3124e43dcf14072f5e3aee182
+ size 267
data/cache/raw/mbpp/dataset_dict.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb69d413c1138964f92bd3723baf871db8f40b4cec70586e770e060108a8c612
+ size 53
data/cache/raw/mbpp/prompt/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e14c47c41a23d8003284ac9249a5c5e4da285300f1a56b63593fb2d6237556ff
+ size 6112
data/cache/raw/mbpp/prompt/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb63c6a97c4cbbd8e28f0e478687c69ea593cd0d4a3a1f2b4e85c6b5378b776e
+ size 2205