Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89 | # ========================================== | |
# MINDI 1.5 Vision-Coder — Data Configuration
# ==========================================
#
# NOTE(review): this file was recovered from a copy whose indentation and
# line wrappers were damaged. Per-item keys under each list entry and the
# children of `dataset` / `knowledge_base` follow from the syntax, but
# whether `sources`, `processing` and `splits` are top-level keys (as
# written here) or children of `dataset` could not be determined.
# Confirm against the code that loads this file.

dataset:
  name: "mindi-1.5-training-data"
  target_size: 500000
  format: "jsonl"

# Data sources for fine-tuning.
# The five `weight` values below sum to 1.00; keep that invariant when
# adding or re-weighting a source.
sources:
  - name: "code_generation"
    description: "Prompt → Next.js + Tailwind + TypeScript code pairs"
    path: "./data/raw/code_generation/"
    weight: 0.40
  - name: "ui_critique"
    description: "Screenshot + code → critique + improved code pairs"
    path: "./data/raw/ui_critique/"
    weight: 0.20
  - name: "error_correction"
    description: "Broken code → fixed code pairs with explanations"
    path: "./data/raw/error_correction/"
    weight: 0.15
  - name: "documentation_qa"
    description: "Documentation context → code answer pairs"
    path: "./data/raw/documentation_qa/"
    weight: 0.10
  - name: "multi_turn"
    description: "Multi-turn conversation with iterative refinement"
    path: "./data/raw/multi_turn/"
    weight: 0.15

# Processing
processing:
  # NOTE(review): this is a stock Hub tokenizer id. If length filtering
  # must account for project-specific special tokens, point this at the
  # extended tokenizer instead — confirm with the tokenizer owner.
  tokenizer: "Qwen/Qwen2.5-Coder-7B-Instruct"
  # Presumably measured in tokens of the tokenizer above — TODO confirm.
  max_length: 8192
  min_length: 64
  dedup_strategy: "minhash"
  quality_filter: true
  output_dir: "./data/processed/"

# Train / validation split (fractions sum to 1.0)
splits:
  train: 0.95
  validation: 0.05

# Knowledge base for RAG
knowledge_base:
  path: "./data/knowledge_base/"
  sources:
    - "nextjs-14-docs"
    - "tailwindcss-docs"
    - "typescript-docs"
    - "react-docs"
    - "shadcn-ui-docs"
  embedding_model: "BAAI/bge-small-en-v1.5"
  # Presumably in tokens of the embedding model — TODO confirm the unit.
  chunk_size: 512
  chunk_overlap: 64