"""
MINDI 1.5 Vision-Coder — Dataset Loader
Loads and preprocesses training data from JSONL files into
tokenized format for LoRA fine-tuning.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Optional
import yaml
from torch.utils.data import Dataset
class MindiDataset(Dataset):
    """Dataset for MINDI 1.5 fine-tuning data (JSONL format).

    Each line of ``{split}.jsonl`` is a JSON object of the expected form
    ``{"prompt": "...", "completion": "..."}``. Examples are tokenized
    lazily in ``__getitem__`` as ``prompt + "\\n" + completion``, padded /
    truncated to ``max_length``.
    """

    def __init__(
        self,
        data_dir: Path,
        tokenizer: Any,
        max_length: int = 8192,
        split: str = "train",
    ) -> None:
        """Load all examples for *split* from *data_dir*.

        Args:
            data_dir: Directory containing ``{split}.jsonl``.
            tokenizer: HuggingFace-style tokenizer callable — assumed to
                accept ``max_length``/``truncation``/``padding``/
                ``return_tensors`` and return tensors; TODO confirm.
            max_length: Maximum token length per example.
            split: Dataset split name (file stem of the JSONL file).
        """
        self.data_dir = Path(data_dir)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split
        self.examples: list[dict[str, Any]] = []
        self._load_data()

    def _load_data(self) -> None:
        """Load all JSONL files from the data directory.

        Missing file is not an error: the dataset is simply left empty
        (useful for optional validation splits).
        """
        data_path = self.data_dir / f"{self.split}.jsonl"
        if not data_path.exists():
            print(f"[MindiDataset] No data file at {data_path} — dataset is empty")
            return
        with open(data_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip blank lines
                    self.examples.append(json.loads(line))
        print(f"[MindiDataset] Loaded {len(self.examples)} examples ({self.split})")

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Tokenize and return a single training example.

        Returns:
            Dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``. Label positions that correspond to padding are set
            to -100 so the cross-entropy loss ignores them.
        """
        example = self.examples[idx]
        # Expected format: {"prompt": "...", "completion": "..."}
        prompt = example.get("prompt", "")
        completion = example.get("completion", "")
        full_text = f"{prompt}\n{completion}"
        encoded = self.tokenizer(
            full_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # BUG FIX: previously labels were a raw copy of input_ids, so the
        # loss was also computed over pad tokens. Mask padding with -100
        # (PyTorch CrossEntropyLoss default ignore_index).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
def load_data_config(config_path: Optional[Path] = None) -> dict:
    """Load the ``dataset`` section of the data-config YAML.

    Args:
        config_path: Path to the YAML config; defaults to
            ``./configs/data_config.yaml``.

    Returns:
        The ``dataset`` mapping, or ``{}`` when the section is absent.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    path = config_path or Path("./configs/data_config.yaml")
    if not path.exists():
        raise FileNotFoundError(f"Data config not found: {path}")
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # BUG FIX: safe_load returns None for an empty file, which previously
    # crashed with AttributeError on .get(); treat it as an empty config.
    return (config or {}).get("dataset", {})