File size: 2,241 Bytes
4f2b2f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from datasets import load_dataset   

def preprocess_opc_coder(tokenizer, max_length: int):
    """Tokenize the OpenCoder ``educational_instruct`` train split to a fixed length.

    Loads the ``train`` split of ``OpenCoder-LLM/opc-sft-stage2`` (config
    ``educational_instruct``). For every sample the ``instruction`` and
    ``output`` fields are tokenized separately without special tokens,
    concatenated, and then truncated or right-padded with
    ``tokenizer.pad_token_id`` to ``max_length`` entries.

    Args:
        tokenizer: Object called as ``tokenizer(text, add_special_tokens=False)``
            whose result is indexed with ``'input_ids'``; it must also expose
            a ``pad_token_id`` attribute.
        max_length: Target length of every ``input_ids`` list.

    Returns:
        The mapped dataset with the original columns removed and two columns
        per row: ``input_ids`` and ``prefix_cutoff``. ``prefix_cutoff`` is the
        number of instruction tokens present in ``input_ids``, so it never
        exceeds the length of ``input_ids``.

    Raises:
        ValueError: If a sample needs padding but ``tokenizer.pad_token_id``
            is ``None``.
    """
    ds = load_dataset("OpenCoder-LLM/opc-sft-stage2", "educational_instruct")['train']

    def process_sample(sample):
        # Tokenize separately so the instruction/output boundary is known.
        instruction_tokens = tokenizer(sample['instruction'], add_special_tokens=False)['input_ids']
        output_tokens = tokenizer(sample['output'], add_special_tokens=False)['input_ids']

        # Truncate first; a slice is a no-op when the sequence is already short enough.
        input_ids = (instruction_tokens + output_tokens)[:max_length]

        missing = max_length - len(input_ids)
        if missing > 0:
            pad_id = tokenizer.pad_token_id
            if pad_id is None:
                # Padding with None would silently corrupt the id sequence.
                raise ValueError(
                    "tokenizer.pad_token_id is None; set a pad token before preprocessing"
                )
            input_ids = input_ids + [pad_id] * missing

        return {
            'input_ids': input_ids,
            # Clamp: a truncated instruction must not yield a cutoff past the
            # end of input_ids.
            'prefix_cutoff': min(len(instruction_tokens), len(input_ids)),
        }

    return ds.map(process_sample, remove_columns=ds.column_names)


def preprocess_human_eval(tokenizer, max_length: int):
    """Tokenize the HumanEval test split to a fixed length.

    Loads the ``test`` split of ``openai/openai_humaneval``. For every sample
    the ``prompt`` and ``canonical_solution`` fields are tokenized separately
    without special tokens, concatenated, and then truncated or right-padded
    with ``tokenizer.pad_token_id`` to ``max_length`` entries.

    Args:
        tokenizer: Object called as ``tokenizer(text, add_special_tokens=False)``
            whose result is indexed with ``'input_ids'``; it must also expose
            a ``pad_token_id`` attribute.
        max_length: Target length of every ``input_ids`` list.

    Returns:
        The mapped dataset with the original columns removed and two columns
        per row: ``input_ids`` and ``prefix_cutoff``. ``prefix_cutoff`` is the
        number of prompt tokens present in ``input_ids``, so it never exceeds
        the length of ``input_ids``.

    Raises:
        ValueError: If a sample needs padding but ``tokenizer.pad_token_id``
            is ``None``.
    """
    ds = load_dataset("openai/openai_humaneval")['test']

    def process_sample(sample):
        # Tokenize separately so the prompt/solution boundary is known.
        prompt_tokens = tokenizer(sample['prompt'], add_special_tokens=False)['input_ids']
        solution_tokens = tokenizer(sample['canonical_solution'], add_special_tokens=False)['input_ids']

        # Truncate first; a slice is a no-op when the sequence is already short enough.
        input_ids = (prompt_tokens + solution_tokens)[:max_length]

        missing = max_length - len(input_ids)
        if missing > 0:
            pad_id = tokenizer.pad_token_id
            if pad_id is None:
                # Padding with None would silently corrupt the id sequence.
                raise ValueError(
                    "tokenizer.pad_token_id is None; set a pad token before preprocessing"
                )
            input_ids = input_ids + [pad_id] * missing

        return {
            'input_ids': input_ids,
            # Clamp: a truncated prompt must not yield a cutoff past the end
            # of input_ids.
            'prefix_cutoff': min(len(prompt_tokens), len(input_ids)),
        }

    return ds.map(process_sample, remove_columns=ds.column_names)