Organize data in the data/ folder and use a dedicated test set in the finetune notebook.
Browse files
- test.jsonl → data/test.jsonl +0 -0
- train.jsonl → data/train.jsonl +0 -0
- val.jsonl → data/val.jsonl +0 -0
- qwen3_finetune.ipynb +15 -13
test.jsonl → data/test.jsonl
RENAMED
|
File without changes
|
train.jsonl → data/train.jsonl
RENAMED
|
File without changes
|
val.jsonl → data/val.jsonl
RENAMED
|
File without changes
|
qwen3_finetune.ipynb
CHANGED
|
@@ -61,8 +61,9 @@
|
|
| 61 |
"outputs": [],
|
| 62 |
"source": [
|
| 63 |
"\n",
|
| 64 |
-
"TRAIN_FILE = \"train.jsonl\"\n",
|
| 65 |
-
"VAL_FILE = \"val.jsonl\"\n",
|
|
|
|
| 66 |
"MODEL_NAME = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
|
| 67 |
"HF_TOKEN = \"...\""
|
| 68 |
]
|
|
@@ -321,6 +322,16 @@
|
|
| 321 |
"val_dataset = val_dataset.map(formatting_prompts_func, batched = True,)"
|
| 322 |
]
|
| 323 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
{
|
| 325 |
"cell_type": "markdown",
|
| 326 |
"metadata": {
|
|
@@ -824,16 +835,7 @@
|
|
| 824 |
},
|
| 825 |
{
|
| 826 |
"cell_type": "code",
|
| 827 |
-
"execution_count":
|
| 828 |
-
"metadata": {},
|
| 829 |
-
"outputs": [],
|
| 830 |
-
"source": [
|
| 831 |
-
"val_dataset = load_data('./val.jsonl')"
|
| 832 |
-
]
|
| 833 |
-
},
|
| 834 |
-
{
|
| 835 |
-
"cell_type": "code",
|
| 836 |
-
"execution_count": 19,
|
| 837 |
"metadata": {},
|
| 838 |
"outputs": [
|
| 839 |
{
|
|
@@ -851,7 +853,7 @@
|
|
| 851 |
}
|
| 852 |
],
|
| 853 |
"source": [
|
| 854 |
-
"messages =
|
| 855 |
"messages"
|
| 856 |
]
|
| 857 |
},
|
|
|
|
| 61 |
"outputs": [],
|
| 62 |
"source": [
|
| 63 |
"\n",
|
| 64 |
+
"TRAIN_FILE = \"data/train.jsonl\"\n",
|
| 65 |
+
"VAL_FILE = \"data/val.jsonl\"\n",
|
| 66 |
+
"TEST_FILE = \"data/test.jsonl\"\n",
|
| 67 |
"MODEL_NAME = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
|
| 68 |
"HF_TOKEN = \"...\""
|
| 69 |
]
|
|
|
|
| 322 |
"val_dataset = val_dataset.map(formatting_prompts_func, batched = True,)"
|
| 323 |
]
|
| 324 |
},
|
| 325 |
+
{
|
| 326 |
+
"cell_type": "code",
|
| 327 |
+
"execution_count": null,
|
| 328 |
+
"metadata": {},
|
| 329 |
+
"outputs": [],
|
| 330 |
+
"source": [
|
| 331 |
+
"test_dataset = load_data(TEST_FILE)\n",
|
| 332 |
+
"test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)"
|
| 333 |
+
]
|
| 334 |
+
},
|
| 335 |
{
|
| 336 |
"cell_type": "markdown",
|
| 337 |
"metadata": {
|
|
|
|
| 835 |
},
|
| 836 |
{
|
| 837 |
"cell_type": "code",
|
| 838 |
+
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
"metadata": {},
|
| 840 |
"outputs": [
|
| 841 |
{
|
|
|
|
| 853 |
}
|
| 854 |
],
|
| 855 |
"source": [
|
| 856 |
+
"messages = test_dataset[1][\"messages\"][:2]\n",
|
| 857 |
"messages"
|
| 858 |
]
|
| 859 |
},
|