nuriyev committed on
Commit
791e77b
·
1 Parent(s): 9ce26a7

Organize data in the data/ folder. Use a dedicated test set in the finetune notebook.

Browse files
test.jsonl → data/test.jsonl RENAMED
File without changes
train.jsonl → data/train.jsonl RENAMED
File without changes
val.jsonl → data/val.jsonl RENAMED
File without changes
qwen3_finetune.ipynb CHANGED
@@ -61,8 +61,9 @@
61
  "outputs": [],
62
  "source": [
63
  "\n",
64
- "TRAIN_FILE = \"train.jsonl\"\n",
65
- "VAL_FILE = \"val.jsonl\"\n",
 
66
  "MODEL_NAME = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
67
  "HF_TOKEN = \"...\""
68
  ]
@@ -321,6 +322,16 @@
321
  "val_dataset = val_dataset.map(formatting_prompts_func, batched = True,)"
322
  ]
323
  },
 
 
 
 
 
 
 
 
 
 
324
  {
325
  "cell_type": "markdown",
326
  "metadata": {
@@ -824,16 +835,7 @@
824
  },
825
  {
826
  "cell_type": "code",
827
- "execution_count": 18,
828
- "metadata": {},
829
- "outputs": [],
830
- "source": [
831
- "val_dataset = load_data('./val.jsonl')"
832
- ]
833
- },
834
- {
835
- "cell_type": "code",
836
- "execution_count": 19,
837
  "metadata": {},
838
  "outputs": [
839
  {
@@ -851,7 +853,7 @@
851
  }
852
  ],
853
  "source": [
854
- "messages = val_dataset[1][\"messages\"][:2]\n",
855
  "messages"
856
  ]
857
  },
 
61
  "outputs": [],
62
  "source": [
63
  "\n",
64
+ "TRAIN_FILE = \"data/train.jsonl\"\n",
65
+ "VAL_FILE = \"data/val.jsonl\"\n",
66
+ "TEST_FILE = \"data/test.jsonl\"\n",
67
  "MODEL_NAME = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
68
  "HF_TOKEN = \"...\""
69
  ]
 
322
  "val_dataset = val_dataset.map(formatting_prompts_func, batched = True,)"
323
  ]
324
  },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": null,
328
+ "metadata": {},
329
+ "outputs": [],
330
+ "source": [
331
+ "test_dataset = load_data(TEST_FILE)\n",
332
+ "test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)"
333
+ ]
334
+ },
335
  {
336
  "cell_type": "markdown",
337
  "metadata": {
 
835
  },
836
  {
837
  "cell_type": "code",
838
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
839
  "metadata": {},
840
  "outputs": [
841
  {
 
853
  }
854
  ],
855
  "source": [
856
+ "messages = test_dataset[1][\"messages\"][:2]\n",
857
  "messages"
858
  ]
859
  },