| """ |
| MINDI 1.5 Vision-Coder β Configuration Loader |
| |
| Typed dataclasses for all YAML configuration files. |
| Provides validated, type-safe access to model, training, data, and search configs. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Any, Optional |
|
|
| import yaml |
|
|
|
|
| |
|
|
| @dataclass |
| class BaseModelConfig: |
| name: str = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" |
| parameters: str = "16B" |
| license: str = "Apache-2.0" |
| context_length: int = 8192 |
| dtype: str = "bfloat16" |
|
|
|
|
| @dataclass |
| class VisionConfig: |
| name: str = "openai/clip-vit-large-patch14" |
| image_size: int = 224 |
| patch_size: int = 14 |
| hidden_size: int = 1024 |
| projection_dim: int = 768 |
| freeze_backbone: bool = True |
| trainable_projection: bool = True |
|
|
|
|
| @dataclass |
| class LoraConfig: |
| rank: int = 64 |
| alpha: int = 128 |
| dropout: float = 0.05 |
| target_modules: list[str] = field(default_factory=lambda: [ |
| "q_proj", "k_proj", "v_proj", "o_proj", |
| "gate_proj", "up_proj", "down_proj", |
| ]) |
| bias: str = "none" |
| task_type: str = "CAUSAL_LM" |
|
|
|
|
| @dataclass |
| class OutputConfig: |
| framework: str = "nextjs-14" |
| styling: str = "tailwindcss" |
| language: str = "typescript" |
| template_format: str = "markdown-codeblock" |
|
|
|
|
| @dataclass |
| class HuggingFaceConfig: |
| repo_id: str = "Mindigenous/MINDI-1.5-Vision-Coder" |
| private: bool = False |
| license: str = "apache-2.0" |
|
|
|
|
| @dataclass |
| class ModelConfig: |
| name: str = "MINDI-1.5-Vision-Coder" |
| version: str = "1.5.0" |
| base: BaseModelConfig = field(default_factory=BaseModelConfig) |
| vision: VisionConfig = field(default_factory=VisionConfig) |
| lora: LoraConfig = field(default_factory=LoraConfig) |
| output: OutputConfig = field(default_factory=OutputConfig) |
| huggingface: HuggingFaceConfig = field(default_factory=HuggingFaceConfig) |
|
|
|
|
| |
|
|
| @dataclass |
| class LocalOverrides: |
| batch_size: int = 1 |
| gradient_accumulation_steps: int = 16 |
| max_seq_length: int = 2048 |
| gradient_checkpointing: bool = True |
| optim: str = "adamw_8bit" |
|
|
|
|
| @dataclass |
| class WandbConfig: |
| project: str = "mindi-1.5-vision-coder" |
| entity: str = "mindigenous" |
| tags: list[str] = field(default_factory=lambda: ["mindi-1.5", "lora", "vision-coder"]) |
|
|
|
|
| @dataclass |
| class TrainingConfig: |
| local_device: str = "cuda" |
| cloud_device: str = "cuda" |
| precision: str = "bf16" |
| epochs: int = 3 |
| batch_size: int = 4 |
| gradient_accumulation_steps: int = 8 |
| effective_batch_size: int = 32 |
| learning_rate: float = 2.0e-4 |
| weight_decay: float = 0.01 |
| warmup_ratio: float = 0.03 |
| lr_scheduler: str = "cosine" |
| max_grad_norm: float = 1.0 |
| max_seq_length: int = 8192 |
| packing: bool = True |
| save_strategy: str = "steps" |
| save_steps: int = 500 |
| save_total_limit: int = 5 |
| checkpoint_dir: str = "./checkpoints" |
| resume_from_checkpoint: Optional[str] = None |
| logging_steps: int = 10 |
| log_dir: str = "./logs/training" |
| report_to: str = "wandb" |
| eval_strategy: str = "steps" |
| eval_steps: int = 250 |
| eval_samples: int = 1000 |
| local_overrides: LocalOverrides = field(default_factory=LocalOverrides) |
| wandb: WandbConfig = field(default_factory=WandbConfig) |
|
|
|
|
| |
|
|
| @dataclass |
| class DataSource: |
| name: str = "" |
| description: str = "" |
| path: str = "" |
| weight: float = 0.0 |
|
|
|
|
| @dataclass |
| class DataProcessing: |
| tokenizer: str = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" |
| max_length: int = 8192 |
| min_length: int = 64 |
| dedup_strategy: str = "minhash" |
| quality_filter: bool = True |
| output_dir: str = "./data/processed/" |
|
|
|
|
| @dataclass |
| class DataSplits: |
| train: float = 0.95 |
| validation: float = 0.05 |
|
|
|
|
| @dataclass |
| class KnowledgeBase: |
| path: str = "./data/knowledge_base/" |
| sources: list[str] = field(default_factory=lambda: [ |
| "nextjs-14-docs", "tailwindcss-docs", "typescript-docs", |
| "react-docs", "shadcn-ui-docs", |
| ]) |
| embedding_model: str = "BAAI/bge-small-en-v1.5" |
| chunk_size: int = 512 |
| chunk_overlap: int = 64 |
|
|
|
|
| @dataclass |
| class DataConfig: |
| name: str = "mindi-1.5-training-data" |
| target_size: int = 500000 |
| format: str = "jsonl" |
| sources: list[DataSource] = field(default_factory=list) |
| processing: DataProcessing = field(default_factory=DataProcessing) |
| splits: DataSplits = field(default_factory=DataSplits) |
| knowledge_base: KnowledgeBase = field(default_factory=KnowledgeBase) |
|
|
|
|
| |
|
|
| @dataclass |
| class RateLimit: |
| requests_per_minute: int = 30 |
| retry_attempts: int = 3 |
| retry_delay_seconds: int = 2 |
|
|
|
|
| @dataclass |
| class SearchCache: |
| enabled: bool = True |
| ttl_hours: int = 24 |
| max_entries: int = 10000 |
| storage_path: str = "./data/knowledge_base/search_cache.db" |
|
|
|
|
| @dataclass |
| class DocsScraper: |
| enabled: bool = True |
| output_dir: str = "./docs/" |
| max_pages_per_site: int = 100 |
| respect_robots_txt: bool = True |
| request_delay_seconds: int = 1 |
|
|
|
|
| @dataclass |
| class SearchConfig: |
| provider: str = "tavily" |
| api_key_env: str = "TAVILY_API_KEY" |
| max_results: int = 5 |
| search_depth: str = "advanced" |
| include_domains: list[str] = field(default_factory=list) |
| exclude_domains: list[str] = field(default_factory=list) |
| rate_limit: RateLimit = field(default_factory=RateLimit) |
| cache: SearchCache = field(default_factory=SearchCache) |
| docs_scraper: DocsScraper = field(default_factory=DocsScraper) |
|
|
|
|
| |
|
|
| def _dict_to_dataclass(cls: type, data: dict[str, Any]) -> Any: |
| """Recursively convert a dict to a dataclass, handling nested dataclasses and lists.""" |
| if not isinstance(data, dict): |
| return data |
|
|
| field_types = {f.name: f.type for f in cls.__dataclass_fields__.values()} |
| kwargs: dict[str, Any] = {} |
|
|
| for key, value in data.items(): |
| if key not in field_types: |
| continue |
|
|
| field_type = field_types[key] |
|
|
| |
| if isinstance(value, list) and hasattr(field_type, "__origin__"): |
| |
| inner = getattr(field_type, "__args__", [None])[0] |
| if inner and hasattr(inner, "__dataclass_fields__"): |
| kwargs[key] = [_dict_to_dataclass(inner, item) for item in value] |
| else: |
| kwargs[key] = value |
| elif isinstance(value, dict): |
| |
| field_cls = cls.__dataclass_fields__[key].default_factory if hasattr(cls.__dataclass_fields__[key], "default_factory") else None |
| |
| import typing |
| actual_type = typing.get_type_hints(cls).get(key) |
| if actual_type and hasattr(actual_type, "__dataclass_fields__"): |
| kwargs[key] = _dict_to_dataclass(actual_type, value) |
| else: |
| kwargs[key] = value |
| else: |
| kwargs[key] = value |
|
|
| return cls(**kwargs) |
|
|
|
|
| class ConfigLoader: |
| """ |
| Loads and provides typed access to all YAML configuration files. |
| |
| Usage: |
| loader = ConfigLoader() |
| model_cfg = loader.model |
| training_cfg = loader.training |
| data_cfg = loader.data |
| search_cfg = loader.search |
| """ |
|
|
| def __init__(self, config_dir: Optional[Path] = None) -> None: |
| self.config_dir = config_dir or Path(__file__).resolve().parents[2] / "configs" |
| self._model: Optional[ModelConfig] = None |
| self._training: Optional[TrainingConfig] = None |
| self._data: Optional[DataConfig] = None |
| self._search: Optional[SearchConfig] = None |
|
|
| def _load_yaml(self, filename: str) -> dict[str, Any]: |
| """Load a YAML file from the config directory.""" |
| path = self.config_dir / filename |
| if not path.exists(): |
| raise FileNotFoundError(f"Config file not found: {path}") |
| with open(path, "r", encoding="utf-8") as f: |
| return yaml.safe_load(f) |
|
|
| @property |
| def model(self) -> ModelConfig: |
| """Load and return typed model configuration.""" |
| if self._model is None: |
| raw = self._load_yaml("model_config.yaml") |
| model_data = raw.get("model", {}) |
| self._model = _dict_to_dataclass(ModelConfig, model_data) |
| |
| hf_data = raw.get("huggingface", {}) |
| if hf_data: |
| self._model.huggingface = _dict_to_dataclass(HuggingFaceConfig, hf_data) |
| return self._model |
|
|
| @property |
| def training(self) -> TrainingConfig: |
| """Load and return typed training configuration.""" |
| if self._training is None: |
| raw = self._load_yaml("training_config.yaml") |
| training_data = raw.get("training", {}) |
| self._training = _dict_to_dataclass(TrainingConfig, training_data) |
| |
| wandb_data = raw.get("wandb", {}) |
| if wandb_data: |
| self._training.wandb = _dict_to_dataclass(WandbConfig, wandb_data) |
| return self._training |
|
|
| @property |
| def data(self) -> DataConfig: |
| """Load and return typed data configuration.""" |
| if self._data is None: |
| raw = self._load_yaml("data_config.yaml") |
| dataset_data = raw.get("dataset", {}) |
| self._data = _dict_to_dataclass(DataConfig, dataset_data) |
| return self._data |
|
|
| @property |
| def search(self) -> SearchConfig: |
| """Load and return typed search configuration.""" |
| if self._search is None: |
| raw = self._load_yaml("search_config.yaml") |
| search_data = raw.get("search", {}) |
| self._search = _dict_to_dataclass(SearchConfig, search_data) |
| return self._search |
|
|
| def reload(self) -> None: |
| """Force reload all configurations from disk.""" |
| self._model = None |
| self._training = None |
| self._data = None |
| self._search = None |
|
|
| def print_summary(self) -> None: |
| """Print a summary of all loaded configurations.""" |
| print("\nββββββββββββββββββββββββββββββββββββββββββββ") |
| print("β MINDI 1.5 β Configuration Summary β") |
| print("β βββββββββββββββββββββββββββββββββββββββββββ£") |
|
|
| m = self.model |
| print(f" Model: {m.name} v{m.version}") |
| print(f" Base: {m.base.name} ({m.base.parameters})") |
| print(f" Vision: {m.vision.name}") |
| print(f" LoRA: r={m.lora.rank}, alpha={m.lora.alpha}") |
| print(f" Output: {m.output.framework} + {m.output.styling} + {m.output.language}") |
|
|
| print("β βββββββββββββββββββββββββββββββββββββββββββ£") |
|
|
| t = self.training |
| print(f" Epochs: {t.epochs}") |
| print(f" Batch: {t.batch_size} (effective: {t.effective_batch_size})") |
| print(f" LR: {t.learning_rate}") |
| print(f" Precision: {t.precision}") |
| print(f" Seq length: {t.max_seq_length}") |
|
|
| print("β βββββββββββββββββββββββββββββββββββββββββββ£") |
|
|
| d = self.data |
| print(f" Dataset: {d.name} ({d.target_size:,} target)") |
| print(f" Sources: {len(d.sources)}") |
| print(f" Format: {d.format}") |
|
|
| print("β βββββββββββββββββββββββββββββββββββββββββββ£") |
|
|
| s = self.search |
| print(f" Provider: {s.provider}") |
| print(f" Max results: {s.max_results}") |
| print(f" Domains: {len(s.include_domains)} included, {len(s.exclude_domains)} excluded") |
|
|
| print("ββββββββββββββββββββββββββββββββββββββββββββ\n") |
|
|