rrayy committed on
Commit
9d1f85a
·
1 Parent(s): fdbf091

Changes to be committed: 데이터 전처리 완료

Browse files

new file: DIVA_dataset.pt
modified: preprocessing.ipynb

Files changed (2) hide show
  1. DIVA_dataset.pt +3 -0
  2. preprocessing.ipynb +47 -3
DIVA_dataset.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aad626b3e4e2ebfcc83c88623dbdee4d6e2ac90dcf6683a66c1b735d8fa51cf
3
+ size 327629
preprocessing.ipynb CHANGED
@@ -1284,7 +1284,7 @@
1284
  },
1285
  {
1286
  "cell_type": "code",
1287
- "execution_count": 9,
1288
  "id": "dea532d4",
1289
  "metadata": {},
1290
  "outputs": [],
@@ -1378,13 +1378,57 @@
1378
  },
1379
  {
1380
  "cell_type": "code",
1381
- "execution_count": 1,
1382
  "id": "f7b77c0c",
1383
  "metadata": {},
1384
  "outputs": [],
1385
  "source": [
 
 
1386
  "import torch\n",
1387
- "from sklearn.preprocessing import OneHotEncoder, MinMaxScaler"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1388
  ]
1389
  }
1390
  ],
 
1284
  },
1285
  {
1286
  "cell_type": "code",
1287
+ "execution_count": 1,
1288
  "id": "dea532d4",
1289
  "metadata": {},
1290
  "outputs": [],
 
1378
  },
1379
  {
1380
  "cell_type": "code",
1381
+ "execution_count": 4,
1382
  "id": "f7b77c0c",
1383
  "metadata": {},
1384
  "outputs": [],
1385
  "source": [
1386
+ "from sklearn.preprocessing import OneHotEncoder, MinMaxScaler\n",
1387
+ "from sklearn.compose import ColumnTransformer\n",
1388
  "import torch\n",
1389
+ "import pandas as pd\n",
1390
+ "\n",
1391
+ "vector_df = pd.DataFrame([item['vector'] for item in tokenized_data])\n",
1392
+ "tokens = [item['token'] for item in tokenized_data]\n",
1393
+ "\n",
1394
+ "# 전처리 파이프라인\n",
1395
+ "preprocessor = ColumnTransformer([\n",
1396
+ " (\"cat\", OneHotEncoder(sparse_output=False), [\"mode\", \"mood\", \"key\"]),\n",
1397
+ " (\"num\", MinMaxScaler(), [\"bpm\", \"chord_complexity\", \"melody_density\", \"syncopation\", \"pitch_range\"])\n",
1398
+ "])\n",
1399
+ "\n",
1400
+ "X = preprocessor.fit_transform(vector_df)\n",
1401
+ "\n",
1402
+ "# 토큰 시퀀스 패딩 (0으로 뒤에 채우기)\n",
1403
+ "max_len = max(len(seq) for seq in tokens)\n",
1404
+ "padded_tokens = [seq + [0]*(max_len - len(seq)) for seq in tokens]\n",
1405
+ "\n",
1406
+ "# Tensor 변환\n",
1407
+ "X_tensor = torch.tensor(X, dtype=torch.float32)\n",
1408
+ "Y_tensor = torch.tensor(padded_tokens, dtype=torch.long)"
1409
+ ]
1410
+ },
1411
+ {
1412
+ "cell_type": "code",
1413
+ "execution_count": 5,
1414
+ "id": "4f5f5dc1",
1415
+ "metadata": {},
1416
+ "outputs": [],
1417
+ "source": [
1418
+ "import torch\n",
1419
+ "\n",
1420
+ "torch.save({\n",
1421
+ " \"X\": X_tensor,\n",
1422
+ " \"Y\": Y_tensor\n",
1423
+ "}, \"DIVA_dataset.pt\")"
1424
+ ]
1425
+ },
1426
+ {
1427
+ "cell_type": "markdown",
1428
+ "id": "224e349f",
1429
+ "metadata": {},
1430
+ "source": [
1431
+ "## 전처리 끝!"
1432
  ]
1433
  }
1434
  ],