MINDI-1.5-Vision-Coder / configs /data_config.yaml
Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89
# ==========================================
# MINDI 1.5 Vision-Coder — Data Configuration
# ==========================================
# Identity of the training dataset this configuration describes.
dataset:
  name: "mindi-1.5-training-data"
  # NOTE(review): presumably a count of training examples — confirm the unit.
  target_size: 500000
  # Format identifier for the dataset files.
  format: "jsonl"
# Data sources for fine-tuning.
# Each entry gives a source name, a description of its example pairs, the
# directory holding its raw data, and a weight. The five weights sum to 1.00.
# NOTE(review): weights are presumably mixing proportions — confirm with the
# loader. Indentation here was reconstructed from a flattened copy; confirm
# whether the loader expects `sources` at top level or nested under `dataset`.
sources:
  - name: "code_generation"
    description: "Prompt → Next.js + Tailwind + TypeScript code pairs"
    path: "./data/raw/code_generation/"
    weight: 0.40
  - name: "ui_critique"
    description: "Screenshot + code → critique + improved code pairs"
    path: "./data/raw/ui_critique/"
    weight: 0.20
  - name: "error_correction"
    description: "Broken code → fixed code pairs with explanations"
    path: "./data/raw/error_correction/"
    weight: 0.15
  - name: "documentation_qa"
    description: "Documentation context → code answer pairs"
    path: "./data/raw/documentation_qa/"
    weight: 0.10
  - name: "multi_turn"
    description: "Multi-turn conversation with iterative refinement"
    path: "./data/raw/multi_turn/"
    weight: 0.15
# Processing: tokenizer, length bounds, deduplication, filtering, and the
# directory that receives processed output.
# NOTE(review): indentation was reconstructed from a flattened copy — confirm
# whether the loader expects `processing` at top level or under `dataset`.
processing:
  tokenizer: "Qwen/Qwen2.5-Coder-7B-Instruct"
  # NOTE(review): length bounds are presumably measured in tokens — confirm.
  max_length: 8192
  min_length: 64
  dedup_strategy: "minhash"
  quality_filter: true
  output_dir: "./data/processed/"
# Train / validation split. The two fractions sum to 1.00.
# NOTE(review): indentation was reconstructed from a flattened copy — confirm
# whether the loader expects `splits` at top level or under `dataset`.
splits:
  train: 0.95
  validation: 0.05
# Knowledge base for RAG: storage directory, the documentation sets it draws
# on, the embedding model, and chunking parameters.
knowledge_base:
  path: "./data/knowledge_base/"
  # Kept under `knowledge_base` so this list does not collide with the
  # fine-tuning `sources` key defined earlier in the file.
  sources:
    - "nextjs-14-docs"
    - "tailwindcss-docs"
    - "typescript-docs"
    - "react-docs"
    - "shadcn-ui-docs"
  embedding_model: "BAAI/bge-small-en-v1.5"
  # NOTE(review): chunk units (tokens vs. characters) are not stated — confirm.
  chunk_size: 512
  chunk_overlap: 64