Charlie81
/

LoRE

Charlie81 committed on Jul 6, 2025

Commit

1e0b293

1 Parent(s): 6d21fca

tokenize function

Files changed (1) hide show

scripts/train.py CHANGED Viewed

@@ -35,18 +35,30 @@ def main():
     # Load dataset
     dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
-    for i in range(10):
-        print("looking")
-    print(dataset.column_names)
     def tokenize_function(examples):
-        text_key = "content" if "content" in examples else "text"
-        return tokenizer(
-            examples[text_key],
-            truncation=True,
-            max_length=4096,
-            padding="max_length"
-        )
     tokenized_dataset = dataset.map(
         tokenize_function,

     # Load dataset
     dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
     def tokenize_function(examples):
+        """Flatten each conversation into one string and tokenize the batch.
+
+        Reads ``examples["messages"]`` and iterates it as a sequence of
+        message lists; every message must provide ``"role"`` and
+        ``"content"``. Each message is rendered as ``"<Role>: <content>"``
+        followed by a newline, with the role capitalized, and the rendered
+        messages of one conversation are concatenated in order.
+
+        Returns whatever ``tokenizer`` returns for the list of rendered
+        conversations, called with ``truncation=True``, ``max_length=4096``
+        and ``padding="max_length"``.
+
+        NOTE(review): presumably invoked by ``dataset.map`` in batched mode,
+        since ``examples["messages"]`` is treated as a list of conversations
+        -- confirm against the ``map`` call.
+        """
+        texts = [
+            "".join(
+                # capitalize() gives "User" / "Assistant" for the usual roles
+                # and the same "<Role>: " prefix for any other role.
+                f"{msg['role'].capitalize()}: {msg['content']}\n"
+                for msg in message_list
+            )
+            for message_list in examples["messages"]
+        ]
+        # This return must stay inside tokenize_function: one level shallower
+        # it would belong to the enclosing function, where `texts` is not
+        # defined, and this function would return None.
+        return tokenizer(
+            texts,
+            truncation=True,
+            max_length=4096,
+            padding="max_length",
+        )
     tokenized_dataset = dataset.map(
         tokenize_function,