Charlie81 committed on
Commit
1e0b293
·
1 Parent(s): 6d21fca

tokenize function

Browse files
Files changed (1) hide show
  1. scripts/train.py +22 -10
scripts/train.py CHANGED
@@ -35,18 +35,30 @@ def main():
35
 
36
  # Load dataset
37
  dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
38
- for i in range(10):
39
- print("looking")
40
- print(dataset.column_names)
41
 
42
  def tokenize_function(examples):
43
- text_key = "content" if "content" in examples else "text"
44
- return tokenizer(
45
- examples[text_key],
46
- truncation=True,
47
- max_length=4096,
48
- padding="max_length"
49
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  tokenized_dataset = dataset.map(
52
  tokenize_function,
 
35
 
36
  # Load dataset
37
  dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
 
 
 
38
 
39
  def tokenize_function(examples):
40
+ texts = []
41
+ for message_list in examples["messages"]:
42
+ # Format the conversation history into a single string
43
+ formatted = ""
44
+ for msg in message_list:
45
+ role = msg["role"]
46
+ content = msg["content"]
47
+ if role == "user":
48
+ formatted += f"User: {content}\n"
49
+ elif role == "assistant":
50
+ formatted += f"Assistant: {content}\n"
51
+ else:
52
+ formatted += f"{role.capitalize()}: {content}\n"
53
+ texts.append(formatted)
54
+
55
+ return tokenizer(
56
+ texts,
57
+ truncation=True,
58
+ max_length=4096,
59
+ padding="max_length"
60
+ )
61
+
62
 
63
  tokenized_dataset = dataset.map(
64
  tokenize_function,