 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
MINDI 1.5 Vision-Coder — Dataset Loader

Loads and preprocesses training data from JSONL files into
tokenized format for LoRA fine-tuning.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Optional

import yaml
from torch.utils.data import Dataset


class MindiDataset(Dataset):
    """Dataset for MINDI 1.5 fine-tuning data (JSONL format).

    Each line of ``<data_dir>/<split>.jsonl`` is a JSON object expected to
    contain ``prompt`` and ``completion`` string fields.  Examples are read
    eagerly at construction time and tokenized lazily in ``__getitem__``.
    """

    def __init__(
        self,
        data_dir: Path,
        tokenizer: Any,
        max_length: int = 8192,
        split: str = "train",
    ) -> None:
        """Create the dataset and immediately load ``{split}.jsonl``.

        Args:
            data_dir: Directory containing ``{split}.jsonl`` files.
            tokenizer: HF-style callable tokenizer (returns a mapping with
                ``input_ids`` and ``attention_mask`` tensors).
            max_length: Token budget per example; longer texts are truncated,
                shorter ones padded to this length.
            split: Which JSONL file to read (``train``, ``val``, ...).
        """
        self.data_dir = Path(data_dir)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split
        self.examples: list[dict[str, Any]] = []
        self._load_data()

    def _load_data(self) -> None:
        """Load all JSONL records for the configured split.

        A missing file is not an error: the dataset is simply left empty so
        callers can probe for optional splits (e.g. ``val``).
        """
        data_path = self.data_dir / f"{self.split}.jsonl"
        if not data_path.exists():
            print(f"[MindiDataset] No data file at {data_path} — dataset is empty")
            return

        with open(data_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines (trailing newline, accidental gaps).
                if line:
                    self.examples.append(json.loads(line))

        print(f"[MindiDataset] Loaded {len(self.examples)} examples ({self.split})")

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Tokenize and return a single training example.

        Returns a dict of 1-D tensors (``input_ids``, ``attention_mask``,
        ``labels``), each of length ``max_length``.  Label positions that
        correspond to padding are set to ``-100`` so the loss function
        (CrossEntropyLoss ``ignore_index``) skips them.
        """
        example = self.examples[idx]

        # Expected format: {"prompt": "...", "completion": "..."}
        prompt = example.get("prompt", "")
        completion = example.get("completion", "")
        full_text = f"{prompt}\n{completion}"

        encoded = self.tokenizer(
            full_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # BUG FIX: previously labels were the raw input_ids, so the loss was
        # also computed over padding tokens.  Mask them with -100, the
        # conventional ignore index for PyTorch/HF cross-entropy.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


def load_data_config(config_path: Optional[Path] = None) -> dict:
    """Read the ``dataset`` section of the data-config YAML file.

    Args:
        config_path: Explicit path to the YAML file; when ``None``, the
            default ``./configs/data_config.yaml`` is used.

    Returns:
        The ``dataset`` mapping from the YAML document, or ``{}`` when the
        document has no such key.

    Raises:
        FileNotFoundError: If the resolved config file does not exist.
    """
    cfg_file = Path("./configs/data_config.yaml") if config_path is None else config_path

    if not cfg_file.exists():
        raise FileNotFoundError(f"Data config not found: {cfg_file}")

    with open(cfg_file, "r", encoding="utf-8") as fh:
        document = yaml.safe_load(fh)
    return document.get("dataset", {})