rrayy
committed on
Commit
·
9d1f85a
1
Parent(s):
fdbf091
Changes to be committed: 데이터 전처리 완료
Browse files
new file: DIVA_dataset.pt
modified: preprocessing.ipynb
- DIVA_dataset.pt +3 -0
- preprocessing.ipynb +47 -3
DIVA_dataset.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2aad626b3e4e2ebfcc83c88623dbdee4d6e2ac90dcf6683a66c1b735d8fa51cf
|
| 3 |
+
size 327629
|
preprocessing.ipynb
CHANGED
|
@@ -1284,7 +1284,7 @@
|
|
| 1284 |
},
|
| 1285 |
{
|
| 1286 |
"cell_type": "code",
|
| 1287 |
-
"execution_count":
|
| 1288 |
"id": "dea532d4",
|
| 1289 |
"metadata": {},
|
| 1290 |
"outputs": [],
|
|
@@ -1378,13 +1378,57 @@
|
|
| 1378 |
},
|
| 1379 |
{
|
| 1380 |
"cell_type": "code",
|
| 1381 |
-
"execution_count":
|
| 1382 |
"id": "f7b77c0c",
|
| 1383 |
"metadata": {},
|
| 1384 |
"outputs": [],
|
| 1385 |
"source": [
|
|
|
|
|
|
|
| 1386 |
"import torch\n",
|
| 1387 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1388 |
]
|
| 1389 |
}
|
| 1390 |
],
|
|
|
|
| 1284 |
},
|
| 1285 |
{
|
| 1286 |
"cell_type": "code",
|
| 1287 |
+
"execution_count": 1,
|
| 1288 |
"id": "dea532d4",
|
| 1289 |
"metadata": {},
|
| 1290 |
"outputs": [],
|
|
|
|
| 1378 |
},
|
| 1379 |
{
|
| 1380 |
"cell_type": "code",
|
| 1381 |
+
"execution_count": 4,
|
| 1382 |
"id": "f7b77c0c",
|
| 1383 |
"metadata": {},
|
| 1384 |
"outputs": [],
|
| 1385 |
"source": [
|
| 1386 |
+
"from sklearn.preprocessing import OneHotEncoder, MinMaxScaler\n",
|
| 1387 |
+
"from sklearn.compose import ColumnTransformer\n",
|
| 1388 |
"import torch\n",
|
| 1389 |
+
"import pandas as pd\n",
|
| 1390 |
+
"\n",
|
| 1391 |
+
"vector_df = pd.DataFrame([item['vector'] for item in tokenized_data])\n",
|
| 1392 |
+
"tokens = [item['token'] for item in tokenized_data]\n",
|
| 1393 |
+
"\n",
|
| 1394 |
+
"# 전처리 파이프라인\n",
|
| 1395 |
+
"preprocessor = ColumnTransformer([\n",
|
| 1396 |
+
" (\"cat\", OneHotEncoder(sparse_output=False), [\"mode\", \"mood\", \"key\"]),\n",
|
| 1397 |
+
" (\"num\", MinMaxScaler(), [\"bpm\", \"chord_complexity\", \"melody_density\", \"syncopation\", \"pitch_range\"])\n",
|
| 1398 |
+
"])\n",
|
| 1399 |
+
"\n",
|
| 1400 |
+
"X = preprocessor.fit_transform(vector_df)\n",
|
| 1401 |
+
"\n",
|
| 1402 |
+
"# 토큰 시퀀스 패딩 (0으로 뒤에 채우기)\n",
|
| 1403 |
+
"max_len = max(len(seq) for seq in tokens)\n",
|
| 1404 |
+
"padded_tokens = [seq + [0]*(max_len - len(seq)) for seq in tokens]\n",
|
| 1405 |
+
"\n",
|
| 1406 |
+
"# Tensor 변환\n",
|
| 1407 |
+
"X_tensor = torch.tensor(X, dtype=torch.float32)\n",
|
| 1408 |
+
"Y_tensor = torch.tensor(padded_tokens, dtype=torch.long)"
|
| 1409 |
+
]
|
| 1410 |
+
},
|
| 1411 |
+
{
|
| 1412 |
+
"cell_type": "code",
|
| 1413 |
+
"execution_count": 5,
|
| 1414 |
+
"id": "4f5f5dc1",
|
| 1415 |
+
"metadata": {},
|
| 1416 |
+
"outputs": [],
|
| 1417 |
+
"source": [
|
| 1418 |
+
"import torch\n",
|
| 1419 |
+
"\n",
|
| 1420 |
+
"torch.save({\n",
|
| 1421 |
+
" \"X\": X_tensor,\n",
|
| 1422 |
+
" \"Y\": Y_tensor\n",
|
| 1423 |
+
"}, \"DIVA_dataset.pt\")"
|
| 1424 |
+
]
|
| 1425 |
+
},
|
| 1426 |
+
{
|
| 1427 |
+
"cell_type": "markdown",
|
| 1428 |
+
"id": "224e349f",
|
| 1429 |
+
"metadata": {},
|
| 1430 |
+
"source": [
|
| 1431 |
+
"## 전처리 끝!"
|
| 1432 |
]
|
| 1433 |
}
|
| 1434 |
],
|