# MINDI-1.5-Vision-Coder/src/utils/config_loader.py
"""
MINDI 1.5 Vision-Coder - Configuration Loader
Typed dataclasses for all YAML configuration files.
Provides validated, type-safe access to model, training, data, and search configs.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional, get_args, get_origin, get_type_hints
import yaml
# ── Model Config Dataclasses ──
@dataclass
class BaseModelConfig:
name: str = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
parameters: str = "16B"
license: str = "Apache-2.0"
context_length: int = 8192
dtype: str = "bfloat16"
@dataclass
class VisionConfig:
name: str = "openai/clip-vit-large-patch14"
image_size: int = 224
patch_size: int = 14
hidden_size: int = 1024
projection_dim: int = 768
freeze_backbone: bool = True
trainable_projection: bool = True
@dataclass
class LoraConfig:
rank: int = 64
alpha: int = 128
dropout: float = 0.05
target_modules: list[str] = field(default_factory=lambda: [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",
])
bias: str = "none"
task_type: str = "CAUSAL_LM"
@dataclass
class OutputConfig:
framework: str = "nextjs-14"
styling: str = "tailwindcss"
language: str = "typescript"
template_format: str = "markdown-codeblock"
@dataclass
class HuggingFaceConfig:
repo_id: str = "Mindigenous/MINDI-1.5-Vision-Coder"
private: bool = False
license: str = "apache-2.0"
@dataclass
class ModelConfig:
name: str = "MINDI-1.5-Vision-Coder"
version: str = "1.5.0"
base: BaseModelConfig = field(default_factory=BaseModelConfig)
vision: VisionConfig = field(default_factory=VisionConfig)
lora: LoraConfig = field(default_factory=LoraConfig)
output: OutputConfig = field(default_factory=OutputConfig)
huggingface: HuggingFaceConfig = field(default_factory=HuggingFaceConfig)
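
# Illustrative shape of configs/model_config.yaml, inferred from the dataclass
# defaults above and from ConfigLoader.model (which reads the "model" key plus a
# root-level "huggingface" section). This is a sketch, not the repo's actual file;
# unknown keys in the YAML are simply ignored by the loader.
#
#     model:
#       name: MINDI-1.5-Vision-Coder
#       version: "1.5.0"
#       base:
#         name: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct
#         context_length: 8192
#       vision:
#         name: openai/clip-vit-large-patch14
#         freeze_backbone: true
#       lora:
#         rank: 64
#         alpha: 128
#     huggingface:
#       repo_id: Mindigenous/MINDI-1.5-Vision-Coder
#       private: false
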
# ── Training Config Dataclasses ──
@dataclass
class LocalOverrides:
batch_size: int = 1
gradient_accumulation_steps: int = 16
max_seq_length: int = 2048
gradient_checkpointing: bool = True
optim: str = "adamw_8bit"
@dataclass
class WandbConfig:
project: str = "mindi-1.5-vision-coder"
entity: str = "mindigenous"
tags: list[str] = field(default_factory=lambda: ["mindi-1.5", "lora", "vision-coder"])
@dataclass
class TrainingConfig:
local_device: str = "cuda"
cloud_device: str = "cuda"
precision: str = "bf16"
epochs: int = 3
batch_size: int = 4
gradient_accumulation_steps: int = 8
effective_batch_size: int = 32
learning_rate: float = 2.0e-4
weight_decay: float = 0.01
warmup_ratio: float = 0.03
lr_scheduler: str = "cosine"
max_grad_norm: float = 1.0
max_seq_length: int = 8192
packing: bool = True
save_strategy: str = "steps"
save_steps: int = 500
save_total_limit: int = 5
checkpoint_dir: str = "./checkpoints"
resume_from_checkpoint: Optional[str] = None
logging_steps: int = 10
log_dir: str = "./logs/training"
report_to: str = "wandb"
eval_strategy: str = "steps"
eval_steps: int = 250
eval_samples: int = 1000
local_overrides: LocalOverrides = field(default_factory=LocalOverrides)
wandb: WandbConfig = field(default_factory=WandbConfig)
# ── Data Config Dataclasses ──
@dataclass
class DataSource:
name: str = ""
description: str = ""
path: str = ""
weight: float = 0.0
@dataclass
class DataProcessing:
tokenizer: str = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
max_length: int = 8192
min_length: int = 64
dedup_strategy: str = "minhash"
quality_filter: bool = True
output_dir: str = "./data/processed/"
@dataclass
class DataSplits:
train: float = 0.95
validation: float = 0.05
@dataclass
class KnowledgeBase:
path: str = "./data/knowledge_base/"
sources: list[str] = field(default_factory=lambda: [
"nextjs-14-docs", "tailwindcss-docs", "typescript-docs",
"react-docs", "shadcn-ui-docs",
])
embedding_model: str = "BAAI/bge-small-en-v1.5"
chunk_size: int = 512
chunk_overlap: int = 64
@dataclass
class DataConfig:
name: str = "mindi-1.5-training-data"
target_size: int = 500000
format: str = "jsonl"
sources: list[DataSource] = field(default_factory=list)
processing: DataProcessing = field(default_factory=DataProcessing)
splits: DataSplits = field(default_factory=DataSplits)
knowledge_base: KnowledgeBase = field(default_factory=KnowledgeBase)
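
# Illustrative shape of configs/data_config.yaml, inferred from DataConfig and
# ConfigLoader.data (which reads the root-level "dataset" key). Entries under
# "sources" map onto DataSource; the names, paths, and weights below are
# placeholders, not the project's real data sources.
#
#     dataset:
#       name: mindi-1.5-training-data
#       target_size: 500000
#       sources:
#         - name: example-source          # placeholder
#           path: ./data/raw/example.jsonl
#           weight: 1.0
#       processing:
#         max_length: 8192
#         dedup_strategy: minhash
#       splits:
#         train: 0.95
#         validation: 0.05
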
# ── Search Config Dataclasses ──
@dataclass
class RateLimit:
requests_per_minute: int = 30
retry_attempts: int = 3
retry_delay_seconds: int = 2
@dataclass
class SearchCache:
enabled: bool = True
ttl_hours: int = 24
max_entries: int = 10000
storage_path: str = "./data/knowledge_base/search_cache.db"
@dataclass
class DocsScraper:
enabled: bool = True
output_dir: str = "./docs/"
max_pages_per_site: int = 100
respect_robots_txt: bool = True
request_delay_seconds: int = 1
@dataclass
class SearchConfig:
provider: str = "tavily"
api_key_env: str = "TAVILY_API_KEY"
max_results: int = 5
search_depth: str = "advanced"
include_domains: list[str] = field(default_factory=list)
exclude_domains: list[str] = field(default_factory=list)
rate_limit: RateLimit = field(default_factory=RateLimit)
cache: SearchCache = field(default_factory=SearchCache)
docs_scraper: DocsScraper = field(default_factory=DocsScraper)
# ── Config Loader ──
def _dict_to_dataclass(cls: type, data: dict[str, Any]) -> Any:
    """Recursively convert a dict to a dataclass, handling nested dataclasses and lists."""
    if not isinstance(data, dict):
        return data
    # Resolve annotations to real types: with `from __future__ import annotations`,
    # the raw `Field.type` values are strings, so get_type_hints() is required here.
    field_types = get_type_hints(cls)
    kwargs: dict[str, Any] = {}
    for key, value in data.items():
        if key not in field_types:
            # Ignore unknown keys so extra YAML entries do not break loading.
            continue
        field_type = field_types[key]
        if isinstance(value, list) and get_origin(field_type) is list:
            # Handle lists of dataclasses (e.g. list[DataSource]).
            inner = (get_args(field_type) or (None,))[0]
            if inner is not None and hasattr(inner, "__dataclass_fields__"):
                kwargs[key] = [_dict_to_dataclass(inner, item) for item in value]
            else:
                kwargs[key] = value
        elif isinstance(value, dict) and hasattr(field_type, "__dataclass_fields__"):
            # Recurse into nested dataclass fields (e.g. VisionConfig inside ModelConfig).
            kwargs[key] = _dict_to_dataclass(field_type, value)
        else:
            kwargs[key] = value
    return cls(**kwargs)
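
# Worked example (illustrative only, with placeholder values): a nested dict with
# a list of source mappings converts into a fully typed DataConfig, covering both
# the list[DataSource] field and the nested DataSplits dataclass.
#
#     cfg = _dict_to_dataclass(DataConfig, {
#         "name": "demo",
#         "sources": [{"name": "ui-pairs", "weight": 0.6}],
#         "splits": {"train": 0.9, "validation": 0.1},
#     })
#     assert isinstance(cfg.sources[0], DataSource)
#     assert cfg.splits.train == 0.9
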
class ConfigLoader:
"""
Loads and provides typed access to all YAML configuration files.
Usage:
loader = ConfigLoader()
model_cfg = loader.model
training_cfg = loader.training
data_cfg = loader.data
search_cfg = loader.search
"""
def __init__(self, config_dir: Optional[Path] = None) -> None:
self.config_dir = config_dir or Path(__file__).resolve().parents[2] / "configs"
self._model: Optional[ModelConfig] = None
self._training: Optional[TrainingConfig] = None
self._data: Optional[DataConfig] = None
self._search: Optional[SearchConfig] = None
def _load_yaml(self, filename: str) -> dict[str, Any]:
"""Load a YAML file from the config directory."""
path = self.config_dir / filename
if not path.exists():
raise FileNotFoundError(f"Config file not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            # yaml.safe_load returns None for an empty file; normalize to an empty dict.
            return yaml.safe_load(f) or {}
@property
def model(self) -> ModelConfig:
"""Load and return typed model configuration."""
if self._model is None:
raw = self._load_yaml("model_config.yaml")
model_data = raw.get("model", {})
self._model = _dict_to_dataclass(ModelConfig, model_data)
# HuggingFace config is at root level
hf_data = raw.get("huggingface", {})
if hf_data:
self._model.huggingface = _dict_to_dataclass(HuggingFaceConfig, hf_data)
return self._model
@property
def training(self) -> TrainingConfig:
"""Load and return typed training configuration."""
if self._training is None:
raw = self._load_yaml("training_config.yaml")
training_data = raw.get("training", {})
self._training = _dict_to_dataclass(TrainingConfig, training_data)
# WandB config is at root level
wandb_data = raw.get("wandb", {})
if wandb_data:
self._training.wandb = _dict_to_dataclass(WandbConfig, wandb_data)
return self._training
@property
def data(self) -> DataConfig:
"""Load and return typed data configuration."""
if self._data is None:
raw = self._load_yaml("data_config.yaml")
dataset_data = raw.get("dataset", {})
self._data = _dict_to_dataclass(DataConfig, dataset_data)
return self._data
@property
def search(self) -> SearchConfig:
"""Load and return typed search configuration."""
if self._search is None:
raw = self._load_yaml("search_config.yaml")
search_data = raw.get("search", {})
self._search = _dict_to_dataclass(SearchConfig, search_data)
return self._search
def reload(self) -> None:
"""Force reload all configurations from disk."""
self._model = None
self._training = None
self._data = None
self._search = None
def print_summary(self) -> None:
"""Print a summary of all loaded configurations."""
print("\n╔══════════════════════════════════════════╗")
print("β•‘ MINDI 1.5 β€” Configuration Summary β•‘")
print("╠══════════════════════════════════════════╣")
m = self.model
print(f" Model: {m.name} v{m.version}")
print(f" Base: {m.base.name} ({m.base.parameters})")
print(f" Vision: {m.vision.name}")
print(f" LoRA: r={m.lora.rank}, alpha={m.lora.alpha}")
print(f" Output: {m.output.framework} + {m.output.styling} + {m.output.language}")
print("╠──────────────────────────────────────────╣")
t = self.training
print(f" Epochs: {t.epochs}")
print(f" Batch: {t.batch_size} (effective: {t.effective_batch_size})")
print(f" LR: {t.learning_rate}")
print(f" Precision: {t.precision}")
print(f" Seq length: {t.max_seq_length}")
print("╠──────────────────────────────────────────╣")
d = self.data
print(f" Dataset: {d.name} ({d.target_size:,} target)")
print(f" Sources: {len(d.sources)}")
print(f" Format: {d.format}")
print("╠──────────────────────────────────────────╣")
s = self.search
print(f" Provider: {s.provider}")
print(f" Max results: {s.max_results}")
print(f" Domains: {len(s.include_domains)} included, {len(s.exclude_domains)} excluded")
print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n")