GPT-2 Base trained on prefix dataset (682K)
Browse files
scripts/train_with_json.py
CHANGED
|
@@ -28,7 +28,7 @@ from peft import LoraConfig, get_peft_model
|
|
| 28 |
|
| 29 |
def convert_to_json_format(example):
|
| 30 |
"""Convert dataset format to JSON format."""
|
| 31 |
-
text = example['
|
| 32 |
|
| 33 |
# Parse the text format
|
| 34 |
lines = text.strip().split('\n')
|
|
@@ -115,12 +115,12 @@ def main():
|
|
| 115 |
|
| 116 |
# Check original format
|
| 117 |
print("Original format sample:")
|
| 118 |
-
print(dataset["train"][0]['
|
| 119 |
print()
|
| 120 |
|
| 121 |
# Convert to JSON format
|
| 122 |
print("Converting to JSON format...")
|
| 123 |
-
train_dataset = dataset["train"].map(convert_to_json_format, remove_columns=['
|
| 124 |
|
| 125 |
# Split for validation (10%)
|
| 126 |
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
|
|
|
|
| 28 |
|
| 29 |
def convert_to_json_format(example):
|
| 30 |
"""Convert dataset format to JSON format."""
|
| 31 |
+
text = example['p_prompt_n_converted']
|
| 32 |
|
| 33 |
# Parse the text format
|
| 34 |
lines = text.strip().split('\n')
|
|
|
|
| 115 |
|
| 116 |
# Check original format
|
| 117 |
print("Original format sample:")
|
| 118 |
+
print(dataset["train"][0]['p_prompt_n_converted'][:150])
|
| 119 |
print()
|
| 120 |
|
| 121 |
# Convert to JSON format
|
| 122 |
print("Converting to JSON format...")
|
| 123 |
+
train_dataset = dataset["train"].map(convert_to_json_format, remove_columns=['p_prompt_n_converted'])
|
| 124 |
|
| 125 |
# Split for validation (10%)
|
| 126 |
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
|