File size: 2,055 Bytes
53f0cc2
 
 
 
 
 
 
 
 
3132f2e
53f0cc2
3132f2e
 
53f0cc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3132f2e
 
 
 
 
 
 
 
 
53f0cc2
3132f2e
53f0cc2
 
 
 
 
 
 
 
 
 
3132f2e
 
 
 
 
 
 
 
 
 
 
 
 
53f0cc2
 
 
 
 
 
3132f2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from typing import Dict, List

import torch
from torch.utils.data import Dataset

from config import PATHS, TRAINING_CONFIG
from utils import read_jsonl


def format_prompt(instruction: str, response: str) -> str:
    """Render an instruction/response pair in the prompt template.

    Args:
        instruction: Text placed under the ``### Instruction:`` header.
        response: Text placed under the ``### Response:`` header.

    Returns:
        The instruction section and the response section, separated by a
        blank line.
    """
    template = "### Instruction:\n{}\n\n### Response:\n{}"
    return template.format(instruction, response)


class LocalJsonlInstructionDataset(Dataset):
    """Instruction-tuning dataset read from the JSONL file ``PATHS.train_jsonl``.

    Each record must provide ``"instruction"`` and ``"response"`` entries.
    Items are tokenized to exactly ``max_length`` positions, and the labels
    are set to ``-100`` on the prompt and on padding so that only response
    tokens carry a training target.
    """

    def __init__(self, tokenizer, max_length: int = TRAINING_CONFIG.max_length):
        """Load all samples eagerly and remember the tokenization settings.

        Args:
            tokenizer: Callable used to encode text. NOTE(review): assumed to
                follow the Hugging Face tokenizer call convention (accepts
                ``truncation``/``max_length``/``padding``/``return_tensors``
                and returns ``input_ids`` and ``attention_mask`` with a
                leading batch dimension of 1) -- confirm against the caller.
            max_length: Sequence length every item is truncated/padded to.

        Raises:
            ValueError: If the training file yields no samples.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples: List[Dict[str, str]] = read_jsonl(PATHS.train_jsonl)

        if not self.samples:
            raise ValueError(f"No training samples found in {PATHS.train_jsonl}")

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenize one sample and build response-only labels.

        Args:
            idx: Index into the loaded samples.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is a copy of ``input_ids`` with ``-100`` written over
            the prompt tokens and over every padded position. If the prompt
            alone fills ``max_length``, every label ends up as ``-100``.

        Raises:
            KeyError: If the sample lacks ``"instruction"`` or ``"response"``.
        """
        sample = self.samples[idx]

        instruction = sample["instruction"]
        response = sample["response"]

        # The prompt is the template up to (and including) the response
        # header; the response text is appended to form the training text.
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
        full_text = prompt + response

        encoded = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the batch dimension added by return_tensors="pt".
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        labels = input_ids.clone()

        # Tokenize the prompt on its own (no padding) to learn how many
        # leading tokens of the full text belong to the prompt.
        # NOTE(review): if the tokenizer appends special tokens (e.g. EOS/SEP)
        # at the end, this count may be off by that many tokens -- confirm
        # for the tokenizer in use.
        prompt_ids = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        prompt_len = min(len(prompt_ids), self.max_length)

        # Mask the first `prompt_len` *real* tokens rather than the first
        # `prompt_len` positions: with a left-padding tokenizer the sequence
        # starts with padding, and a plain `labels[:prompt_len]` slice would
        # leave part of the prompt unmasked. With right padding the real
        # tokens start at position 0, so the result is unchanged.
        token_positions = torch.nonzero(attention_mask, as_tuple=True)[0]
        labels[token_positions[:prompt_len]] = -100

        # Padding never contributes a training target.
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }