| """ |
| MINDI 1.5 Vision-Coder — Dataset Loader |
| |
| Loads and preprocesses training data from JSONL files into |
| tokenized format for LoRA fine-tuning. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
| from typing import Any, Optional |
|
|
| import yaml |
| from torch.utils.data import Dataset |
|
|
|
|
class MindiDataset(Dataset):
    """Dataset for MINDI 1.5 fine-tuning data (JSONL format).

    Every non-blank line of ``<data_dir>/<split>.jsonl`` is parsed as one JSON
    object and kept in memory. Tokenization happens lazily in
    ``__getitem__``, which joins the ``"prompt"`` and ``"completion"`` fields
    of an example with a newline.
    """

    # Label value for positions that must not contribute to the loss. -100 is
    # the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(
        self,
        data_dir: Path,
        tokenizer: Any,
        max_length: int = 8192,
        split: str = "train",
    ) -> None:
        """Store the configuration and eagerly load the split's examples.

        Args:
            data_dir: Directory that contains ``<split>.jsonl``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                truncation=True, padding="max_length", return_tensors="pt")``
                that returns a mapping with ``"input_ids"`` and
                ``"attention_mask"`` tensors.
            max_length: Length every example is truncated/padded to.
            split: Name of the split; selects the file ``<split>.jsonl``.
        """
        self.data_dir = Path(data_dir)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split
        self.examples: list[dict[str, Any]] = []
        self._load_data()

    def _load_data(self) -> None:
        """Load every example from ``<data_dir>/<split>.jsonl``.

        A missing file is not an error: a notice is printed and the dataset
        stays empty. Blank lines are skipped.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON. The
                message names the file and the 1-based line number.
        """
        data_path = self.data_dir / f"{self.split}.jsonl"
        if not data_path.exists():
            print(f"[MindiDataset] No data file at {data_path} — dataset is empty")
            return

        with open(data_path, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as err:
                    # Same exception type, but say where the bad record is so
                    # a large data file can actually be repaired.
                    raise json.JSONDecodeError(
                        f"{data_path}, line {line_no}: {err.msg}",
                        err.doc,
                        err.pos,
                    ) from err
                self.examples.append(record)

        print(f"[MindiDataset] Loaded {len(self.examples)} examples ({self.split})")

    def __len__(self) -> int:
        """Return the number of loaded examples."""
        return len(self.examples)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Tokenize and return a single training example.

        Args:
            idx: Index into the loaded examples.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
            each with the leading batch dimension of the tokenizer output
            removed. ``"labels"`` is an independent copy of ``"input_ids"``
            in which every position whose attention mask is 0 (padding) is
            set to ``IGNORE_INDEX``.
        """
        example = self.examples[idx]

        prompt = example.get("prompt", "")
        completion = example.get("completion", "")
        full_text = f"{prompt}\n{completion}"

        encoded = self.tokenizer(
            full_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Clone so labels never alias input_ids, then blank out the padding:
        # without this the loss is also computed on pad tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
|
|
|
|
def load_data_config(config_path: Optional[Path] = None) -> dict:
    """Load the ``dataset`` section of the data configuration YAML.

    Args:
        config_path: Location of the YAML file (a ``Path`` or a string).
            Defaults to ``./configs/data_config.yaml`` when omitted.

    Returns:
        The mapping stored under the top-level ``dataset`` key, or an empty
        dict when the file is empty, the key is absent, or its value is null.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If the top level of the YAML document is not a mapping.
    """
    # Path() also accepts a plain string, so callers are not forced to wrap it.
    path = Path(config_path or "./configs/data_config.yaml")
    if not path.exists():
        raise FileNotFoundError(f"Data config not found: {path}")

    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if loaded is None:
        return {}
    if not isinstance(loaded, dict):
        raise ValueError(
            f"Data config must be a YAML mapping at the top level: {path}"
        )
    # `dataset:` with no value parses to None; normalise that to an empty dict.
    return loaded.get("dataset") or {}
|
|