vicky4s4s committed on
Commit
4be3cda
·
verified ·
1 Parent(s): 81657e3

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -105,3 +105,5 @@ genre_finder/assets/model_ready_dataset_final1.csv filter=lfs diff=lfs merge=lfs
105
  genre_finder/assets/model_ready_dataset1.csv filter=lfs diff=lfs merge=lfs -text
106
  genre_finder/find_genre_different_apporoch/features_dataset.csv filter=lfs diff=lfs merge=lfs -text
107
  genre_finder/find_genre_different_apporoch/all_genre.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
105
  genre_finder/assets/model_ready_dataset1.csv filter=lfs diff=lfs merge=lfs -text
106
  genre_finder/find_genre_different_apporoch/features_dataset.csv filter=lfs diff=lfs merge=lfs -text
107
  genre_finder/find_genre_different_apporoch/all_genre.csv filter=lfs diff=lfs merge=lfs -text
108
+ rocketship-ml-model-train/assets/all_genres_clean.csv filter=lfs diff=lfs merge=lfs -text
109
+ rocketship-ml-model-train/assets/features_dataset.csv filter=lfs diff=lfs merge=lfs -text
rocketship-ml-model-train/app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, json
2
+ from get_db_info import extract_the_db_data2
3
+ from prediction_ import predict_genre
4
+
5
# Interactive prompt: read a document id, fetch its stored audio features and
# print the predicted genre. Typing "exit" (any case) ends the session.
while True:
    try:
        input_id = input('Enter input id: ').strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or Ctrl-C: leave the loop instead of dumping a traceback.
        print()
        break
    if input_id.lower() == 'exit':
        break
    if not input_id:
        # Blank line: nothing to look up, prompt again.
        continue
    try:
        dict_data = extract_the_db_data2(input_id)
        answer = predict_genre(dict_data)
    except Exception as exc:
        # Top-level boundary: a malformed or unknown id should not end the
        # whole session, so report the problem and keep prompting.
        print(f"Could not predict genre for {input_id!r}: {exc}")
        continue
    print(answer)
rocketship-ml-model-train/assets/all_genres_clean.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d0daf801b311511ab134fe677cadbf75c30f55953a3ade5ee16c0dfbef9987b
3
+ size 71663849
rocketship-ml-model-train/assets/features_dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac64a40014104b2c4f9f822fe956e876db8eeeb0c7c8e1caaf8e4a0df0cc515
3
+ size 73116979
rocketship-ml-model-train/genre_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e0209eacb5b5cf7cfe67eac22786ae34306fc53170a0ae8838f62842718870
3
+ size 129741572
rocketship-ml-model-train/genre_pipeline_v2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:615bebec486a76b3e34ba0986a4921c031b437ff8b53398f4705db94b7984472
3
+ size 128260374
rocketship-ml-model-train/get_db_info.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from pymongo import MongoClient
4
+ from bson.objectid import ObjectId
5
+
6
# Load environment variables from a .env file (python-dotenv) before reading them.
load_dotenv()

# Module-level connection objects used by the extract_* functions in this file.
# NOTE(review): os.environ.get returns None for an unset variable; confirm that
# MONGO_URI, DB_NAME and COLLECTION_NAME are always provided.
mongo_db_client = MongoClient(os.environ.get("MONGO_URI"))
db = mongo_db_client[os.environ.get("DB_NAME")]
# Despite the plural name this is a single collection handle.
collections = db[os.environ.get("COLLECTION_NAME")]

# Sample document id; the functions in this file take their own ``oid``
# parameter, which shadows this module-level value.
oid = "69cb6b4bd86a47b0a1581017"
13
+
14
# Feature names returned by extract_the_db_data1, in output order.
_FEATURE_KEYS_V1 = (
    'spectral_contrast_mean (mix)',
    'chroma_mean (mix)',
    'melody_variability (vocals)',
    'rhythm_onset_rate (mix)',
    'spectral_centroid_mean custom (mix)',
    'repetition_score custom (mix)',
    'pitch_std (mix)',
    'tempo_bpm_corrected (mix)',
    'tempo_bpm_original (mix)',
    'mfcc_mean_1 (mix)',
    'mfcc_mean_2 (mix)',
    'loudness_range_lu custom (mix)',
    'zero_crossing_rate (mix)',
    'loudness_integrated_lufs custom (mix)',
    'energy_essentia (mix)',
    'energy_librosa (mix)',
    'rms_energy_mean (mix)',
    'melody_complexity (vocals)',
)

# Feature names returned by extract_the_db_data2, in output order.
_FEATURE_KEYS_V2 = (
    'melody_complexity (vocals)',
    'melody_range (vocals)',
    'melody_variability (vocals)',
    'tempo_bpm_original (mix)',
    'danceability custom (mix)',
    'loudness_integrated_lufs custom (mix)',
    'loudness_range_lu custom (mix)',
    'energy_librosa (mix)',
    'energy_librosa_std (mix)',
    'energy_essentia (mix)',
    'energy_essentia_std (mix)',
    'energy_combined (mix)',
    'spectral_centroid_mean custom (mix)',
    'mfcc_mean_1 (mix)',
    'mfcc_mean_2 (mix)',
    'chroma_mean (mix)',
    'spectral_contrast_mean (mix)',
    'repetition_score custom (mix)',
    'pitch_mean (mix)',
    'pitch_std (mix)',
    'rms_energy_mean (mix)',
    'rms_energy_std (mix)',
    'zero_crossing_rate (mix)',
)


def _load_features(oid):
    """Return the ``features`` mapping of the document whose ``_id`` is *oid*.

    Raises:
        LookupError: if no document with that id exists in the collection.
    """
    document = collections.find_one({"_id": ObjectId(oid)})
    if document is None:
        # find_one yields None for an unknown id; fail with a clear message
        # instead of an AttributeError on the .get() below.
        raise LookupError(f"No document found for _id {oid!r}")
    # A missing or empty ``features`` field becomes an empty mapping, so every
    # requested key falls back to its default.
    return document.get("features") or {}


def _select_features(features, keys):
    """Build ``{key: value}`` for *keys*, using 0.0 for keys absent from *features*."""
    return {key: features.get(key, 0.0) for key in keys}


def extract_the_db_data1(oid):
    """Fetch the document with id *oid* and return its first feature set.

    Args:
        oid: String form of the document's ObjectId.

    Returns:
        dict mapping each name in ``_FEATURE_KEYS_V1`` to the stored value,
        or 0.0 when the document does not carry that feature.

    Raises:
        LookupError: if the document does not exist.
    """
    return _select_features(_load_features(oid), _FEATURE_KEYS_V1)


def extract_the_db_data2(oid):
    """Fetch the document with id *oid* and return its second feature set.

    Same contract as ``extract_the_db_data1`` but for the names in
    ``_FEATURE_KEYS_V2``. The resulting dict is also printed to stdout.

    Args:
        oid: String form of the document's ObjectId.

    Returns:
        dict mapping each name in ``_FEATURE_KEYS_V2`` to the stored value,
        or 0.0 when the document does not carry that feature.

    Raises:
        LookupError: if the document does not exist.
    """
    dict_data = _select_features(_load_features(oid), _FEATURE_KEYS_V2)
    print(dict_data)
    return dict_data
70
+
71
+
72
+ # answer = extract_the_db_data2("69cb624e4801e0963cda8568")
73
+ # print(answer)
rocketship-ml-model-train/model_train.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import ast
4
+ import joblib
5
+
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.preprocessing import LabelEncoder
8
+ from sklearn.multioutput import MultiOutputClassifier
9
+ from sklearn.metrics import classification_report
10
+ from collections import Counter
11
+
12
+ from xgboost import XGBClassifier
13
+ from sklearn.ensemble import RandomForestClassifier
14
+ from imblearn.over_sampling import SMOTE
15
pd.set_option("display.max_columns", None)

# Load the cleaned dataset. Nothing is written back to the CSV here: there is
# no DataFrame to export before this read.
df = pd.read_csv("all_genres_clean.csv", low_memory=False)
drop_cols = ['track_url', 'name']
df = df.drop(columns=drop_cols)

# Everything except the two label columns is used as a model input.
X = df.drop(columns=["genre", "sub_genres"])
y_genre = df["genre"]
genre_encoder = LabelEncoder()
y_genre_encoded = genre_encoder.fit_transform(y_genre)

# Split BEFORE oversampling so the held-out set contains only real rows.
# Running SMOTE first would put synthetic neighbours of training rows into the
# test set and inflate the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_genre_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_genre_encoded,  # keep every genre represented in both splits
)

# Balance the classes on the training portion only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
counter = Counter(y_resampled)

genre_model = XGBClassifier(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_lambda=1,
    tree_method="hist",
    eval_metric="mlogloss"
)

genre_model.fit(X_resampled, y_resampled)
y_pred = genre_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Bundle everything the prediction code needs into a single artifact:
# the model, the label encoder, the feature column order and the
# (non-resampled) DataFrame used for the nearest-row sub-genre lookup.
pipeline_data = {
    "model": genre_model,
    "label_encoder": genre_encoder,
    "features": X.columns.tolist(),
    "train_data": df
}
joblib.dump(pipeline_data, "genre_pipeline_v2.pkl")
rocketship-ml-model-train/prediction_.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os,joblib
3
+ import numpy as np
4
+
5
# Load the serialized pipeline once at import time.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# pipeline files that come from a trusted source.
path = r"genre_pipeline_v2.pkl"
pipeline = joblib.load(path)

# Unpack the entries predict_genre relies on.
model, le, features, df_full = (
    pipeline[key]
    for key in ("model", "label_encoder", "features", "train_data")
)
# Feature-only view of the stored DataFrame.
X_full = df_full[features]
13
+
14
def _find_best_match(input_row, df_subset, feature_names):
    """Return the row of *df_subset* nearest to *input_row*.

    Distance is Euclidean, computed over the *feature_names* columns.
    """
    X_subset = df_subset[feature_names]
    distances = np.linalg.norm(X_subset.values - input_row.values, axis=1)
    return df_subset.iloc[np.argmin(distances)]


def predict_genre(sample):
    """Predict the genre for one feature mapping and look up matching sub-genres.

    Args:
        sample: dict of feature name -> value. Features the model expects but
            that are absent from *sample* are filled with 0; extra keys are
            ignored.

    Returns:
        A one-row DataFrame with columns ``predict_genre`` (the decoded label)
        and ``genre_subgenre_list`` (the ``sub_genres`` value of the stored
        row of that genre closest to *sample*, or None when the stored data
        holds no row of the predicted genre).
    """
    # Align the input to the model's column order, filling gaps with 0.
    input_df = pd.DataFrame([sample]).reindex(columns=features, fill_value=0)

    pred_encoded = model.predict(input_df)
    prediction = le.inverse_transform(pred_encoded)[0]

    filtered_df = df_full[df_full["genre"] == prediction]
    if filtered_df.empty:
        # No stored row for this genre: argmin over an empty set would raise,
        # so report the genre without sub-genres instead.
        sub_genres = None
    else:
        best_row = _find_best_match(input_df.iloc[0], filtered_df, features)
        sub_genres = best_row["sub_genres"]

    final_output = pd.DataFrame([{
        "predict_genre": prediction,
        "genre_subgenre_list": sub_genres,
    }])

    return final_output
rocketship-ml-model-train/train_ml_model.ipynb ADDED
@@ -0,0 +1,1028 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {
5
+ "ExecuteTime": {
6
+ "end_time": "2026-03-31T12:26:36.441430700Z",
7
+ "start_time": "2026-03-31T12:26:36.428176800Z"
8
+ }
9
+ },
10
+ "cell_type": "code",
11
+ "source": "#!pip install imbalanced-learn",
12
+ "id": "6061b6e1a4964764",
13
+ "outputs": [],
14
+ "execution_count": 50
15
+ },
16
+ {
17
+ "metadata": {},
18
+ "cell_type": "markdown",
19
+ "source": "## 1. IMPORTS",
20
+ "id": "80a50f1f4cac28af"
21
+ },
22
+ {
23
+ "metadata": {
24
+ "ExecuteTime": {
25
+ "end_time": "2026-03-31T15:50:53.238168500Z",
26
+ "start_time": "2026-03-31T15:50:53.191532300Z"
27
+ }
28
+ },
29
+ "cell_type": "code",
30
+ "source": [
31
+ "import pandas as pd\n",
32
+ "import numpy as np\n",
33
+ "import ast\n",
34
+ "import joblib\n",
35
+ "\n",
36
+ "from sklearn.model_selection import train_test_split\n",
37
+ "from sklearn.preprocessing import LabelEncoder\n",
38
+ "from sklearn.multioutput import MultiOutputClassifier\n",
39
+ "from sklearn.metrics import classification_report\n",
40
+ "\n",
41
+ "from xgboost import XGBClassifier\n",
42
+ "from sklearn.ensemble import RandomForestClassifier\n",
43
+ "from imblearn.over_sampling import SMOTE"
44
+ ],
45
+ "id": "a1a47a2f55d6d805",
46
+ "outputs": [],
47
+ "execution_count": 138
48
+ },
49
+ {
50
+ "metadata": {},
51
+ "cell_type": "markdown",
52
+ "source": "## 2. LOAD DATA",
53
+ "id": "eaf2ccbd5817b489"
54
+ },
55
+ {
56
+ "metadata": {
57
+ "ExecuteTime": {
58
+ "end_time": "2026-03-31T15:50:55.726659200Z",
59
+ "start_time": "2026-03-31T15:50:54.991904900Z"
60
+ }
61
+ },
62
+ "cell_type": "code",
63
+ "source": [
64
+ "df = pd.read_csv(\"assets/all_genre.csv\")\n",
65
+ "df.shape"
66
+ ],
67
+ "id": "cd568b9bd5b06a09",
68
+ "outputs": [
69
+ {
70
+ "data": {
71
+ "text/plain": [
72
+ "(125169, 28)"
73
+ ]
74
+ },
75
+ "execution_count": 139,
76
+ "metadata": {},
77
+ "output_type": "execute_result"
78
+ }
79
+ ],
80
+ "execution_count": 139
81
+ },
82
+ {
83
+ "metadata": {
84
+ "ExecuteTime": {
85
+ "end_time": "2026-03-31T15:51:05.654993800Z",
86
+ "start_time": "2026-03-31T15:51:05.582282300Z"
87
+ }
88
+ },
89
+ "cell_type": "code",
90
+ "source": "df.tail(2)",
91
+ "id": "a1d0c8b7cde52c38",
92
+ "outputs": [
93
+ {
94
+ "data": {
95
+ "text/plain": [
96
+ " _id genre \\\n",
97
+ "125167 698ebc778a361f5ae9bc3b9b RnB Soul \n",
98
+ "125168 6996dc6afd7fa1d0c64b9096 RnB Soul \n",
99
+ "\n",
100
+ " genre_subgenre \\\n",
101
+ "125167 RnB Soul --- ['Contemporary RnB', 'Hip Hop Rap... \n",
102
+ "125168 RnB Soul --- [] \n",
103
+ "\n",
104
+ " trimmed_audio_duration_sec syllable_count word_count \\\n",
105
+ "125167 190.132381 534.0 431.0 \n",
106
+ "125168 100.680000 148.0 122.0 \n",
107
+ "\n",
108
+ " spectral_contrast_mean (mix) chroma_mean (mix) \\\n",
109
+ "125167 19.721407 0.529660 \n",
110
+ "125168 20.478757 0.498923 \n",
111
+ "\n",
112
+ " melody_variability (vocals) rhythm_onset_rate (mix) ... \\\n",
113
+ "125167 0.640377 3.994444 ... \n",
114
+ "125168 0.570758 3.436631 ... \n",
115
+ "\n",
116
+ " zero_crossing_rate (mix) vocab_richness \\\n",
117
+ "125167 0.087692 0.411 \n",
118
+ "125168 0.075311 0.541 \n",
119
+ "\n",
120
+ " loudness_integrated_lufs custom (mix) readability_score \\\n",
121
+ "125167 -15.104306 1.5 \n",
122
+ "125168 -15.867487 21.2 \n",
123
+ "\n",
124
+ " energy_essentia (mix) energy_librosa (mix) rms_energy_mean (mix) \\\n",
125
+ "125167 0.185114 0.360564 0.403058 \n",
126
+ "125168 0.280897 0.475058 0.475100 \n",
127
+ "\n",
128
+ " sentiment_score melody_complexity (vocals) avg_word_length \n",
129
+ "125167 0.002 3.416667 3.77 \n",
130
+ "125168 0.036 2.666667 3.52 \n",
131
+ "\n",
132
+ "[2 rows x 28 columns]"
133
+ ],
134
+ "text/html": [
135
+ "<div>\n",
136
+ "<style scoped>\n",
137
+ " .dataframe tbody tr th:only-of-type {\n",
138
+ " vertical-align: middle;\n",
139
+ " }\n",
140
+ "\n",
141
+ " .dataframe tbody tr th {\n",
142
+ " vertical-align: top;\n",
143
+ " }\n",
144
+ "\n",
145
+ " .dataframe thead th {\n",
146
+ " text-align: right;\n",
147
+ " }\n",
148
+ "</style>\n",
149
+ "<table border=\"1\" class=\"dataframe\">\n",
150
+ " <thead>\n",
151
+ " <tr style=\"text-align: right;\">\n",
152
+ " <th></th>\n",
153
+ " <th>_id</th>\n",
154
+ " <th>genre</th>\n",
155
+ " <th>genre_subgenre</th>\n",
156
+ " <th>trimmed_audio_duration_sec</th>\n",
157
+ " <th>syllable_count</th>\n",
158
+ " <th>word_count</th>\n",
159
+ " <th>spectral_contrast_mean (mix)</th>\n",
160
+ " <th>chroma_mean (mix)</th>\n",
161
+ " <th>melody_variability (vocals)</th>\n",
162
+ " <th>rhythm_onset_rate (mix)</th>\n",
163
+ " <th>...</th>\n",
164
+ " <th>zero_crossing_rate (mix)</th>\n",
165
+ " <th>vocab_richness</th>\n",
166
+ " <th>loudness_integrated_lufs custom (mix)</th>\n",
167
+ " <th>readability_score</th>\n",
168
+ " <th>energy_essentia (mix)</th>\n",
169
+ " <th>energy_librosa (mix)</th>\n",
170
+ " <th>rms_energy_mean (mix)</th>\n",
171
+ " <th>sentiment_score</th>\n",
172
+ " <th>melody_complexity (vocals)</th>\n",
173
+ " <th>avg_word_length</th>\n",
174
+ " </tr>\n",
175
+ " </thead>\n",
176
+ " <tbody>\n",
177
+ " <tr>\n",
178
+ " <th>125167</th>\n",
179
+ " <td>698ebc778a361f5ae9bc3b9b</td>\n",
180
+ " <td>RnB Soul</td>\n",
181
+ " <td>RnB Soul --- ['Contemporary RnB', 'Hip Hop Rap...</td>\n",
182
+ " <td>190.132381</td>\n",
183
+ " <td>534.0</td>\n",
184
+ " <td>431.0</td>\n",
185
+ " <td>19.721407</td>\n",
186
+ " <td>0.529660</td>\n",
187
+ " <td>0.640377</td>\n",
188
+ " <td>3.994444</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>0.087692</td>\n",
191
+ " <td>0.411</td>\n",
192
+ " <td>-15.104306</td>\n",
193
+ " <td>1.5</td>\n",
194
+ " <td>0.185114</td>\n",
195
+ " <td>0.360564</td>\n",
196
+ " <td>0.403058</td>\n",
197
+ " <td>0.002</td>\n",
198
+ " <td>3.416667</td>\n",
199
+ " <td>3.77</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>125168</th>\n",
203
+ " <td>6996dc6afd7fa1d0c64b9096</td>\n",
204
+ " <td>RnB Soul</td>\n",
205
+ " <td>RnB Soul --- []</td>\n",
206
+ " <td>100.680000</td>\n",
207
+ " <td>148.0</td>\n",
208
+ " <td>122.0</td>\n",
209
+ " <td>20.478757</td>\n",
210
+ " <td>0.498923</td>\n",
211
+ " <td>0.570758</td>\n",
212
+ " <td>3.436631</td>\n",
213
+ " <td>...</td>\n",
214
+ " <td>0.075311</td>\n",
215
+ " <td>0.541</td>\n",
216
+ " <td>-15.867487</td>\n",
217
+ " <td>21.2</td>\n",
218
+ " <td>0.280897</td>\n",
219
+ " <td>0.475058</td>\n",
220
+ " <td>0.475100</td>\n",
221
+ " <td>0.036</td>\n",
222
+ " <td>2.666667</td>\n",
223
+ " <td>3.52</td>\n",
224
+ " </tr>\n",
225
+ " </tbody>\n",
226
+ "</table>\n",
227
+ "<p>2 rows × 28 columns</p>\n",
228
+ "</div>"
229
+ ]
230
+ },
231
+ "execution_count": 140,
232
+ "metadata": {},
233
+ "output_type": "execute_result"
234
+ }
235
+ ],
236
+ "execution_count": 140
237
+ },
238
+ {
239
+ "metadata": {},
240
+ "cell_type": "markdown",
241
+ "source": "## 3. EXTRACT SUBGENRE LIST",
242
+ "id": "25e6116f88f4e7b5"
243
+ },
244
+ {
245
+ "metadata": {
246
+ "ExecuteTime": {
247
+ "end_time": "2026-03-31T15:32:35.481030700Z",
248
+ "start_time": "2026-03-31T15:32:33.709931Z"
249
+ }
250
+ },
251
+ "cell_type": "code",
252
+ "source": [
253
+ "def extract_list(value):\n",
254
+ " if pd.isna(value):\n",
255
+ " return []\n",
256
+ " try:\n",
257
+ " parts = value.split('---')\n",
258
+ " if len(parts) < 2:\n",
259
+ " return []\n",
260
+ " return ast.literal_eval(parts[1].strip())\n",
261
+ " except:\n",
262
+ " return []\n",
263
+ "\n",
264
+ "df[\"genre_subgenre_list\"] = df[\"genre_subgenre\"].apply(extract_list)\n",
265
+ "df.drop(columns=[\"genre_subgenre\"], inplace=True)"
266
+ ],
267
+ "id": "ceb7a07f3a0ef9c",
268
+ "outputs": [],
269
+ "execution_count": 137
270
+ },
271
+ {
272
+ "metadata": {
273
+ "ExecuteTime": {
274
+ "end_time": "2026-03-31T14:48:22.250988500Z",
275
+ "start_time": "2026-03-31T14:48:22.187035700Z"
276
+ }
277
+ },
278
+ "cell_type": "code",
279
+ "source": "df.head(2)",
280
+ "id": "64c82db63a7d7cda",
281
+ "outputs": [
282
+ {
283
+ "data": {
284
+ "text/plain": [
285
+ " _id genre trimmed_audio_duration_sec \\\n",
286
+ "0 69143037d64595f86b812d77 Hip Hop Rap 183.843991 \n",
287
+ "1 691448a64bef1dcbb1d3da1b Hip Hop Rap 160.786576 \n",
288
+ "\n",
289
+ " syllable_count word_count spectral_contrast_mean (mix) \\\n",
290
+ "0 355.0 298.0 20.440557 \n",
291
+ "1 285.0 236.0 19.657228 \n",
292
+ "\n",
293
+ " chroma_mean (mix) melody_variability (vocals) rhythm_onset_rate (mix) \\\n",
294
+ "0 0.464204 0.434021 3.250000 \n",
295
+ "1 0.424237 0.351001 2.425576 \n",
296
+ "\n",
297
+ " spectral_centroid_mean custom (mix) ... vocab_richness \\\n",
298
+ "0 0.132002 ... 0.305 \n",
299
+ "1 0.139757 ... 0.352 \n",
300
+ "\n",
301
+ " loudness_integrated_lufs custom (mix) readability_score \\\n",
302
+ "0 -12.104649 20.9 \n",
303
+ "1 -11.814197 89.4 \n",
304
+ "\n",
305
+ " energy_essentia (mix) energy_librosa (mix) rms_energy_mean (mix) \\\n",
306
+ "0 0.349783 0.576250 0.578454 \n",
307
+ "1 0.429792 0.619931 0.632507 \n",
308
+ "\n",
309
+ " sentiment_score melody_complexity (vocals) avg_word_length \\\n",
310
+ "0 0.116 2.250000 3.76 \n",
311
+ "1 0.173 1.916667 3.50 \n",
312
+ "\n",
313
+ " genre_subgenre_list \n",
314
+ "0 [Alternative Hip Hop, Boom Bap, Contemporary H... \n",
315
+ "1 [Americana, Contemporary Country, Bro Country] \n",
316
+ "\n",
317
+ "[2 rows x 28 columns]"
318
+ ],
319
+ "text/html": [
320
+ "<div>\n",
321
+ "<style scoped>\n",
322
+ " .dataframe tbody tr th:only-of-type {\n",
323
+ " vertical-align: middle;\n",
324
+ " }\n",
325
+ "\n",
326
+ " .dataframe tbody tr th {\n",
327
+ " vertical-align: top;\n",
328
+ " }\n",
329
+ "\n",
330
+ " .dataframe thead th {\n",
331
+ " text-align: right;\n",
332
+ " }\n",
333
+ "</style>\n",
334
+ "<table border=\"1\" class=\"dataframe\">\n",
335
+ " <thead>\n",
336
+ " <tr style=\"text-align: right;\">\n",
337
+ " <th></th>\n",
338
+ " <th>_id</th>\n",
339
+ " <th>genre</th>\n",
340
+ " <th>trimmed_audio_duration_sec</th>\n",
341
+ " <th>syllable_count</th>\n",
342
+ " <th>word_count</th>\n",
343
+ " <th>spectral_contrast_mean (mix)</th>\n",
344
+ " <th>chroma_mean (mix)</th>\n",
345
+ " <th>melody_variability (vocals)</th>\n",
346
+ " <th>rhythm_onset_rate (mix)</th>\n",
347
+ " <th>spectral_centroid_mean custom (mix)</th>\n",
348
+ " <th>...</th>\n",
349
+ " <th>vocab_richness</th>\n",
350
+ " <th>loudness_integrated_lufs custom (mix)</th>\n",
351
+ " <th>readability_score</th>\n",
352
+ " <th>energy_essentia (mix)</th>\n",
353
+ " <th>energy_librosa (mix)</th>\n",
354
+ " <th>rms_energy_mean (mix)</th>\n",
355
+ " <th>sentiment_score</th>\n",
356
+ " <th>melody_complexity (vocals)</th>\n",
357
+ " <th>avg_word_length</th>\n",
358
+ " <th>genre_subgenre_list</th>\n",
359
+ " </tr>\n",
360
+ " </thead>\n",
361
+ " <tbody>\n",
362
+ " <tr>\n",
363
+ " <th>0</th>\n",
364
+ " <td>69143037d64595f86b812d77</td>\n",
365
+ " <td>Hip Hop Rap</td>\n",
366
+ " <td>183.843991</td>\n",
367
+ " <td>355.0</td>\n",
368
+ " <td>298.0</td>\n",
369
+ " <td>20.440557</td>\n",
370
+ " <td>0.464204</td>\n",
371
+ " <td>0.434021</td>\n",
372
+ " <td>3.250000</td>\n",
373
+ " <td>0.132002</td>\n",
374
+ " <td>...</td>\n",
375
+ " <td>0.305</td>\n",
376
+ " <td>-12.104649</td>\n",
377
+ " <td>20.9</td>\n",
378
+ " <td>0.349783</td>\n",
379
+ " <td>0.576250</td>\n",
380
+ " <td>0.578454</td>\n",
381
+ " <td>0.116</td>\n",
382
+ " <td>2.250000</td>\n",
383
+ " <td>3.76</td>\n",
384
+ " <td>[Alternative Hip Hop, Boom Bap, Contemporary H...</td>\n",
385
+ " </tr>\n",
386
+ " <tr>\n",
387
+ " <th>1</th>\n",
388
+ " <td>691448a64bef1dcbb1d3da1b</td>\n",
389
+ " <td>Hip Hop Rap</td>\n",
390
+ " <td>160.786576</td>\n",
391
+ " <td>285.0</td>\n",
392
+ " <td>236.0</td>\n",
393
+ " <td>19.657228</td>\n",
394
+ " <td>0.424237</td>\n",
395
+ " <td>0.351001</td>\n",
396
+ " <td>2.425576</td>\n",
397
+ " <td>0.139757</td>\n",
398
+ " <td>...</td>\n",
399
+ " <td>0.352</td>\n",
400
+ " <td>-11.814197</td>\n",
401
+ " <td>89.4</td>\n",
402
+ " <td>0.429792</td>\n",
403
+ " <td>0.619931</td>\n",
404
+ " <td>0.632507</td>\n",
405
+ " <td>0.173</td>\n",
406
+ " <td>1.916667</td>\n",
407
+ " <td>3.50</td>\n",
408
+ " <td>[Americana, Contemporary Country, Bro Country]</td>\n",
409
+ " </tr>\n",
410
+ " </tbody>\n",
411
+ "</table>\n",
412
+ "<p>2 rows × 28 columns</p>\n",
413
+ "</div>"
414
+ ]
415
+ },
416
+ "execution_count": 100,
417
+ "metadata": {},
418
+ "output_type": "execute_result"
419
+ }
420
+ ],
421
+ "execution_count": 100
422
+ },
423
+ {
424
+ "metadata": {},
425
+ "cell_type": "markdown",
426
+ "source": "## 4. CLEAN FEATURES",
427
+ "id": "7a0ce7b0f4ea696a"
428
+ },
429
+ {
430
+ "metadata": {
431
+ "ExecuteTime": {
432
+ "end_time": "2026-03-31T14:29:37.586057300Z",
433
+ "start_time": "2026-03-31T14:29:37.559184400Z"
434
+ }
435
+ },
436
+ "cell_type": "code",
437
+ "source": [
438
+ "drop_cols = ['_id','trimmed_audio_duration_sec', 'syllable_count', 'word_count',\n",
439
+ " 'sentiment_score', 'avg_word_length', 'avg_word_length', 'readability_score','vocab_richness']\n",
440
+ "\n",
441
+ "\n",
442
+ "df = df.drop(columns=drop_cols)"
443
+ ],
444
+ "id": "10232ce5c7e02ebe",
445
+ "outputs": [],
446
+ "execution_count": 74
447
+ },
448
+ {
449
+ "metadata": {},
450
+ "cell_type": "markdown",
451
+ "source": "## 5. SPLIT FEATURES",
452
+ "id": "e501c73d9a21d15b"
453
+ },
454
+ {
455
+ "metadata": {
456
+ "ExecuteTime": {
457
+ "end_time": "2026-03-31T14:29:39.435021600Z",
458
+ "start_time": "2026-03-31T14:29:39.364438700Z"
459
+ }
460
+ },
461
+ "cell_type": "code",
462
+ "source": [
463
+ "X = df.drop(columns=[\"genre\", \"genre_subgenre_list\"])\n",
464
+ "y_genre = df[\"genre\"]"
465
+ ],
466
+ "id": "46943cd9570c5ef1",
467
+ "outputs": [],
468
+ "execution_count": 75
469
+ },
470
+ {
471
+ "metadata": {},
472
+ "cell_type": "markdown",
473
+ "source": "## 6. ENCODE GENRE",
474
+ "id": "9fe48aab3a37b04a"
475
+ },
476
+ {
477
+ "metadata": {
478
+ "ExecuteTime": {
479
+ "end_time": "2026-03-31T14:29:40.867349200Z",
480
+ "start_time": "2026-03-31T14:29:40.820224600Z"
481
+ }
482
+ },
483
+ "cell_type": "code",
484
+ "source": [
485
+ "genre_encoder = LabelEncoder()\n",
486
+ "y_genre_encoded = genre_encoder.fit_transform(y_genre)"
487
+ ],
488
+ "id": "9cfcd4b98c05576b",
489
+ "outputs": [],
490
+ "execution_count": 76
491
+ },
492
+ {
493
+ "metadata": {},
494
+ "cell_type": "markdown",
495
+ "source": "## 7. BALANCE DATA (SMOTE)",
496
+ "id": "c37f54cc88a20873"
497
+ },
498
+ {
499
+ "metadata": {
500
+ "ExecuteTime": {
501
+ "end_time": "2026-03-31T14:29:43.867990400Z",
502
+ "start_time": "2026-03-31T14:29:42.314962900Z"
503
+ }
504
+ },
505
+ "cell_type": "code",
506
+ "source": [
507
+ "smote = SMOTE(random_state=42)\n",
508
+ "X_resampled, y_resampled = smote.fit_resample(X, y_genre_encoded)"
509
+ ],
510
+ "id": "4058a4ecea99d77",
511
+ "outputs": [],
512
+ "execution_count": 77
513
+ },
514
+ {
515
+ "metadata": {},
516
+ "cell_type": "markdown",
517
+ "source": "## 8. TRAIN GENRE MODEL",
518
+ "id": "383f3a8e280c7d78"
519
+ },
520
+ {
521
+ "metadata": {
522
+ "ExecuteTime": {
523
+ "end_time": "2026-03-31T14:33:26.436070600Z",
524
+ "start_time": "2026-03-31T14:29:48.404312300Z"
525
+ }
526
+ },
527
+ "cell_type": "code",
528
+ "source": [
529
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
530
+ " X_resampled, y_resampled,\n",
531
+ " test_size=0.2,\n",
532
+ " random_state=42\n",
533
+ ")\n",
534
+ "\n",
535
+ "genre_model = XGBClassifier(\n",
536
+ " n_estimators=1000,\n",
537
+ " max_depth=8,\n",
538
+ " learning_rate=0.05,\n",
539
+ " subsample=0.8,\n",
540
+ " colsample_bytree=0.8,\n",
541
+ " min_child_weight=5,\n",
542
+ " gamma=0.1,\n",
543
+ " reg_lambda=1,\n",
544
+ " tree_method=\"hist\",\n",
545
+ " eval_metric=\"mlogloss\"\n",
546
+ ")\n",
547
+ "\n",
548
+ "genre_model.fit(X_train, y_train)\n",
549
+ "y_pred = genre_model.predict(X_test)\n",
550
+ "print(classification_report(y_test, y_pred))"
551
+ ],
552
+ "id": "6669f4cb87c22b0d",
553
+ "outputs": [
554
+ {
555
+ "name": "stdout",
556
+ "output_type": "stream",
557
+ "text": [
558
+ " precision recall f1-score support\n",
559
+ "\n",
560
+ " 0 0.72 0.66 0.69 5785\n",
561
+ " 1 0.85 0.84 0.85 5860\n",
562
+ " 2 0.71 0.71 0.71 5766\n",
563
+ " 3 0.69 0.73 0.71 5956\n",
564
+ " 4 0.93 0.98 0.95 5752\n",
565
+ " 5 0.68 0.75 0.71 5824\n",
566
+ " 6 1.00 1.00 1.00 5865\n",
567
+ " 7 0.51 0.41 0.45 5788\n",
568
+ " 8 0.98 1.00 0.99 5804\n",
569
+ " 9 0.56 0.56 0.56 5856\n",
570
+ " 10 0.56 0.60 0.58 5775\n",
571
+ "\n",
572
+ " accuracy 0.75 64031\n",
573
+ " macro avg 0.74 0.75 0.75 64031\n",
574
+ "weighted avg 0.74 0.75 0.75 64031\n",
575
+ "\n"
576
+ ]
577
+ }
578
+ ],
579
+ "execution_count": 78
580
+ },
581
+ {
582
+ "metadata": {
583
+ "ExecuteTime": {
584
+ "end_time": "2026-03-31T14:35:17.020787300Z",
585
+ "start_time": "2026-03-31T14:35:08.071408100Z"
586
+ }
587
+ },
588
+ "cell_type": "code",
589
+ "source": "genre_model.score(X_train, y_train)",
590
+ "id": "118a434daae8c565",
591
+ "outputs": [
592
+ {
593
+ "data": {
594
+ "text/plain": [
595
+ "0.9226663647295841"
596
+ ]
597
+ },
598
+ "execution_count": 80,
599
+ "metadata": {},
600
+ "output_type": "execute_result"
601
+ }
602
+ ],
603
+ "execution_count": 80
604
+ },
605
+ {
606
+ "metadata": {
607
+ "ExecuteTime": {
608
+ "end_time": "2026-03-31T14:35:22.584004600Z",
609
+ "start_time": "2026-03-31T14:35:20.164379Z"
610
+ }
611
+ },
612
+ "cell_type": "code",
613
+ "source": "genre_model.score(X_test, y_test)",
614
+ "id": "d0cc6a38f09dfaf3",
615
+ "outputs": [
616
+ {
617
+ "data": {
618
+ "text/plain": [
619
+ "0.7481063859692961"
620
+ ]
621
+ },
622
+ "execution_count": 81,
623
+ "metadata": {},
624
+ "output_type": "execute_result"
625
+ }
626
+ ],
627
+ "execution_count": 81
628
+ },
629
+ {
630
+ "metadata": {},
631
+ "cell_type": "markdown",
632
+ "source": "## 9. SAVE MODEL",
633
+ "id": "44d62a34d7c97a4d"
634
+ },
635
+ {
636
+ "metadata": {
637
+ "ExecuteTime": {
638
+ "end_time": "2026-03-31T15:25:11.541174Z",
639
+ "start_time": "2026-03-31T15:25:10.667061500Z"
640
+ }
641
+ },
642
+ "cell_type": "code",
643
+ "source": [
644
+ "pipeline_data = {\n",
645
+ " \"model\": genre_model,\n",
646
+ " \"label_encoder\": genre_encoder,\n",
647
+ " \"features\": X.columns.tolist(),\n",
648
+ " \"train_data\": df\n",
649
+ "}\n",
650
+ "joblib.dump(pipeline_data, \"genre_pipeline.pkl\")"
651
+ ],
652
+ "id": "99a419eddad21a44",
653
+ "outputs": [
654
+ {
655
+ "data": {
656
+ "text/plain": [
657
+ "['genre_pipeline.pkl']"
658
+ ]
659
+ },
660
+ "execution_count": 126,
661
+ "metadata": {},
662
+ "output_type": "execute_result"
663
+ }
664
+ ],
665
+ "execution_count": 126
666
+ },
667
+ {
668
+ "metadata": {},
669
+ "cell_type": "markdown",
670
+ "source": "## 10. LOAD + PREDICT (Using same .pkl)",
671
+ "id": "41cb3fb66288af9e"
672
+ },
673
+ {
674
+ "metadata": {
675
+ "ExecuteTime": {
676
+ "end_time": "2026-03-31T15:28:15.462539200Z",
677
+ "start_time": "2026-03-31T15:28:14.847279900Z"
678
+ }
679
+ },
680
+ "cell_type": "code",
681
+ "source": [
682
+ "import numpy as np\n",
683
+ "\n",
684
+ "pipeline = joblib.load(\"genre_pipeline.pkl\")\n",
685
+ "\n",
686
+ "model = pipeline[\"model\"]\n",
687
+ "le = pipeline[\"label_encoder\"]\n",
688
+ "features = pipeline[\"features\"]\n",
689
+ "df_full = pipeline[\"train_data\"]\n",
690
+ "\n",
691
+ "# Input sample\n",
692
+ "sample = X.iloc[92229].to_dict()\n",
693
+ "input_df = pd.DataFrame([sample])\n",
694
+ "\n",
695
+ "for col in features:\n",
696
+ " if col not in input_df.columns:\n",
697
+ " input_df[col] = 0\n",
698
+ "\n",
699
+ "input_df = input_df[features]\n",
700
+ "pred_encoded = model.predict(input_df)\n",
701
+ "prediction = le.inverse_transform(pred_encoded)[0]\n",
702
+ "filtered_df = df_full[df_full[\"genre\"] == prediction].copy()\n",
703
+ "def find_best_match(input_row, df_subset):\n",
704
+ " X_subset = df_subset[features]\n",
705
+ " distances = np.linalg.norm(X_subset.values - input_row.values, axis=1)\n",
706
+ " best_idx = np.argmin(distances)\n",
707
+ " return df_subset.iloc[best_idx]\n",
708
+ "best_row = find_best_match(input_df.iloc[0], filtered_df)\n",
709
+ "final_output = pd.DataFrame([{\n",
710
+ " \"predict_genre\": prediction,\n",
711
+ " \"genre_subgenre_list\": best_row[\"genre_subgenre_list\"]\n",
712
+ "}])\n",
713
+ "\n",
714
+ "print(final_output)"
715
+ ],
716
+ "id": "c4fc96bfd6a8636e",
717
+ "outputs": [
718
+ {
719
+ "name": "stdout",
720
+ "output_type": "stream",
721
+ "text": [
722
+ " predict_genre genre_subgenre_list\n",
723
+ "0 EDM []\n"
724
+ ]
725
+ }
726
+ ],
727
+ "execution_count": 133
728
+ },
729
+ {
730
+ "metadata": {
731
+ "ExecuteTime": {
732
+ "end_time": "2026-03-31T15:28:18.801970900Z",
733
+ "start_time": "2026-03-31T15:28:18.721374900Z"
734
+ }
735
+ },
736
+ "cell_type": "code",
737
+ "source": "print(best_row[\"genre_subgenre_list\"])",
738
+ "id": "800aba339a69aa9c",
739
+ "outputs": [
740
+ {
741
+ "name": "stdout",
742
+ "output_type": "stream",
743
+ "text": [
744
+ "[]\n"
745
+ ]
746
+ }
747
+ ],
748
+ "execution_count": 134
749
+ },
750
+ {
751
+ "metadata": {
752
+ "ExecuteTime": {
753
+ "end_time": "2026-03-31T15:27:37.723854800Z",
754
+ "start_time": "2026-03-31T15:27:37.661366500Z"
755
+ }
756
+ },
757
+ "cell_type": "code",
758
+ "source": "df.sample(5)",
759
+ "id": "1a304b17152705c9",
760
+ "outputs": [
761
+ {
762
+ "data": {
763
+ "text/plain": [
764
+ " _id genre trimmed_audio_duration_sec \\\n",
765
+ "86885 6996acb01af4c88ae2d95a39 EDM 190.320000 \n",
766
+ "92229 699c338e0b2a57db2a93724f EDM 190.792290 \n",
767
+ "123297 697c327515cf3a48da484dff RnB Soul 138.646349 \n",
768
+ "44658 68fa1e2f4677fa714c26e8ae Rock 212.214240 \n",
769
+ "86793 6996a38b7c4d735613b230a3 EDM 239.879977 \n",
770
+ "\n",
771
+ " syllable_count word_count spectral_contrast_mean (mix) \\\n",
772
+ "86885 372.0 334.0 20.215855 \n",
773
+ "92229 194.0 161.0 19.167682 \n",
774
+ "123297 264.0 219.0 18.698643 \n",
775
+ "44658 109.0 85.0 20.150875 \n",
776
+ "86793 300.0 234.0 18.801814 \n",
777
+ "\n",
778
+ " chroma_mean (mix) melody_variability (vocals) \\\n",
779
+ "86885 0.411808 0.584400 \n",
780
+ "92229 0.527524 0.401779 \n",
781
+ "123297 0.573993 0.438452 \n",
782
+ "44658 0.455458 0.420922 \n",
783
+ "86793 0.507734 0.410575 \n",
784
+ "\n",
785
+ " rhythm_onset_rate (mix) spectral_centroid_mean custom (mix) ... \\\n",
786
+ "86885 3.905556 0.139566 ... \n",
787
+ "92229 4.311111 0.129095 ... \n",
788
+ "123297 5.907116 0.111474 ... \n",
789
+ "44658 4.944444 0.160074 ... \n",
790
+ "86793 1.938889 0.169642 ... \n",
791
+ "\n",
792
+ " vocab_richness loudness_integrated_lufs custom (mix) \\\n",
793
+ "86885 0.210 -13.503495 \n",
794
+ "92229 0.410 -15.650211 \n",
795
+ "123297 0.363 -12.241549 \n",
796
+ "44658 0.612 -12.745431 \n",
797
+ "86793 0.303 -11.677942 \n",
798
+ "\n",
799
+ " readability_score energy_essentia (mix) energy_librosa (mix) \\\n",
800
+ "86885 5.7 0.298788 0.496576 \n",
801
+ "92229 61.4 0.197921 0.394782 \n",
802
+ "123297 23.1 0.342908 0.552086 \n",
803
+ "44658 6.9 0.433083 0.633059 \n",
804
+ "86793 29.0 0.440966 0.612347 \n",
805
+ "\n",
806
+ " rms_energy_mean (mix) sentiment_score melody_complexity (vocals) \\\n",
807
+ "86885 0.513011 0.039 2.916667 \n",
808
+ "92229 0.395509 -0.052 2.333333 \n",
809
+ "123297 0.559722 0.083 0.416667 \n",
810
+ "44658 0.636330 -0.197 2.666667 \n",
811
+ "86793 0.661424 -0.040 2.333333 \n",
812
+ "\n",
813
+ " avg_word_length genre_subgenre_list \n",
814
+ "86885 3.08 [Indie Electronic, Indie Pop] \n",
815
+ "92229 3.83 [] \n",
816
+ "123297 3.58 [Funk, Soul, Neo Soul, Motown] \n",
817
+ "44658 3.92 [Classic Alternative Rock, Modern Alternative ... \n",
818
+ "86793 3.64 [] \n",
819
+ "\n",
820
+ "[5 rows x 28 columns]"
821
+ ],
822
+ "text/html": [
823
+ "<div>\n",
824
+ "<style scoped>\n",
825
+ " .dataframe tbody tr th:only-of-type {\n",
826
+ " vertical-align: middle;\n",
827
+ " }\n",
828
+ "\n",
829
+ " .dataframe tbody tr th {\n",
830
+ " vertical-align: top;\n",
831
+ " }\n",
832
+ "\n",
833
+ " .dataframe thead th {\n",
834
+ " text-align: right;\n",
835
+ " }\n",
836
+ "</style>\n",
837
+ "<table border=\"1\" class=\"dataframe\">\n",
838
+ " <thead>\n",
839
+ " <tr style=\"text-align: right;\">\n",
840
+ " <th></th>\n",
841
+ " <th>_id</th>\n",
842
+ " <th>genre</th>\n",
843
+ " <th>trimmed_audio_duration_sec</th>\n",
844
+ " <th>syllable_count</th>\n",
845
+ " <th>word_count</th>\n",
846
+ " <th>spectral_contrast_mean (mix)</th>\n",
847
+ " <th>chroma_mean (mix)</th>\n",
848
+ " <th>melody_variability (vocals)</th>\n",
849
+ " <th>rhythm_onset_rate (mix)</th>\n",
850
+ " <th>spectral_centroid_mean custom (mix)</th>\n",
851
+ " <th>...</th>\n",
852
+ " <th>vocab_richness</th>\n",
853
+ " <th>loudness_integrated_lufs custom (mix)</th>\n",
854
+ " <th>readability_score</th>\n",
855
+ " <th>energy_essentia (mix)</th>\n",
856
+ " <th>energy_librosa (mix)</th>\n",
857
+ " <th>rms_energy_mean (mix)</th>\n",
858
+ " <th>sentiment_score</th>\n",
859
+ " <th>melody_complexity (vocals)</th>\n",
860
+ " <th>avg_word_length</th>\n",
861
+ " <th>genre_subgenre_list</th>\n",
862
+ " </tr>\n",
863
+ " </thead>\n",
864
+ " <tbody>\n",
865
+ " <tr>\n",
866
+ " <th>86885</th>\n",
867
+ " <td>6996acb01af4c88ae2d95a39</td>\n",
868
+ " <td>EDM</td>\n",
869
+ " <td>190.320000</td>\n",
870
+ " <td>372.0</td>\n",
871
+ " <td>334.0</td>\n",
872
+ " <td>20.215855</td>\n",
873
+ " <td>0.411808</td>\n",
874
+ " <td>0.584400</td>\n",
875
+ " <td>3.905556</td>\n",
876
+ " <td>0.139566</td>\n",
877
+ " <td>...</td>\n",
878
+ " <td>0.210</td>\n",
879
+ " <td>-13.503495</td>\n",
880
+ " <td>5.7</td>\n",
881
+ " <td>0.298788</td>\n",
882
+ " <td>0.496576</td>\n",
883
+ " <td>0.513011</td>\n",
884
+ " <td>0.039</td>\n",
885
+ " <td>2.916667</td>\n",
886
+ " <td>3.08</td>\n",
887
+ " <td>[Indie Electronic, Indie Pop]</td>\n",
888
+ " </tr>\n",
889
+ " <tr>\n",
890
+ " <th>92229</th>\n",
891
+ " <td>699c338e0b2a57db2a93724f</td>\n",
892
+ " <td>EDM</td>\n",
893
+ " <td>190.792290</td>\n",
894
+ " <td>194.0</td>\n",
895
+ " <td>161.0</td>\n",
896
+ " <td>19.167682</td>\n",
897
+ " <td>0.527524</td>\n",
898
+ " <td>0.401779</td>\n",
899
+ " <td>4.311111</td>\n",
900
+ " <td>0.129095</td>\n",
901
+ " <td>...</td>\n",
902
+ " <td>0.410</td>\n",
903
+ " <td>-15.650211</td>\n",
904
+ " <td>61.4</td>\n",
905
+ " <td>0.197921</td>\n",
906
+ " <td>0.394782</td>\n",
907
+ " <td>0.395509</td>\n",
908
+ " <td>-0.052</td>\n",
909
+ " <td>2.333333</td>\n",
910
+ " <td>3.83</td>\n",
911
+ " <td>[]</td>\n",
912
+ " </tr>\n",
913
+ " <tr>\n",
914
+ " <th>123297</th>\n",
915
+ " <td>697c327515cf3a48da484dff</td>\n",
916
+ " <td>RnB Soul</td>\n",
917
+ " <td>138.646349</td>\n",
918
+ " <td>264.0</td>\n",
919
+ " <td>219.0</td>\n",
920
+ " <td>18.698643</td>\n",
921
+ " <td>0.573993</td>\n",
922
+ " <td>0.438452</td>\n",
923
+ " <td>5.907116</td>\n",
924
+ " <td>0.111474</td>\n",
925
+ " <td>...</td>\n",
926
+ " <td>0.363</td>\n",
927
+ " <td>-12.241549</td>\n",
928
+ " <td>23.1</td>\n",
929
+ " <td>0.342908</td>\n",
930
+ " <td>0.552086</td>\n",
931
+ " <td>0.559722</td>\n",
932
+ " <td>0.083</td>\n",
933
+ " <td>0.416667</td>\n",
934
+ " <td>3.58</td>\n",
935
+ " <td>[Funk, Soul, Neo Soul, Motown]</td>\n",
936
+ " </tr>\n",
937
+ " <tr>\n",
938
+ " <th>44658</th>\n",
939
+ " <td>68fa1e2f4677fa714c26e8ae</td>\n",
940
+ " <td>Rock</td>\n",
941
+ " <td>212.214240</td>\n",
942
+ " <td>109.0</td>\n",
943
+ " <td>85.0</td>\n",
944
+ " <td>20.150875</td>\n",
945
+ " <td>0.455458</td>\n",
946
+ " <td>0.420922</td>\n",
947
+ " <td>4.944444</td>\n",
948
+ " <td>0.160074</td>\n",
949
+ " <td>...</td>\n",
950
+ " <td>0.612</td>\n",
951
+ " <td>-12.745431</td>\n",
952
+ " <td>6.9</td>\n",
953
+ " <td>0.433083</td>\n",
954
+ " <td>0.633059</td>\n",
955
+ " <td>0.636330</td>\n",
956
+ " <td>-0.197</td>\n",
957
+ " <td>2.666667</td>\n",
958
+ " <td>3.92</td>\n",
959
+ " <td>[Classic Alternative Rock, Modern Alternative ...</td>\n",
960
+ " </tr>\n",
961
+ " <tr>\n",
962
+ " <th>86793</th>\n",
963
+ " <td>6996a38b7c4d735613b230a3</td>\n",
964
+ " <td>EDM</td>\n",
965
+ " <td>239.879977</td>\n",
966
+ " <td>300.0</td>\n",
967
+ " <td>234.0</td>\n",
968
+ " <td>18.801814</td>\n",
969
+ " <td>0.507734</td>\n",
970
+ " <td>0.410575</td>\n",
971
+ " <td>1.938889</td>\n",
972
+ " <td>0.169642</td>\n",
973
+ " <td>...</td>\n",
974
+ " <td>0.303</td>\n",
975
+ " <td>-11.677942</td>\n",
976
+ " <td>29.0</td>\n",
977
+ " <td>0.440966</td>\n",
978
+ " <td>0.612347</td>\n",
979
+ " <td>0.661424</td>\n",
980
+ " <td>-0.040</td>\n",
981
+ " <td>2.333333</td>\n",
982
+ " <td>3.64</td>\n",
983
+ " <td>[]</td>\n",
984
+ " </tr>\n",
985
+ " </tbody>\n",
986
+ "</table>\n",
987
+ "<p>5 rows × 28 columns</p>\n",
988
+ "</div>"
989
+ ]
990
+ },
991
+ "execution_count": 132,
992
+ "metadata": {},
993
+ "output_type": "execute_result"
994
+ }
995
+ ],
996
+ "execution_count": 132
997
+ },
998
+ {
999
+ "metadata": {},
1000
+ "cell_type": "code",
1001
+ "outputs": [],
1002
+ "execution_count": null,
1003
+ "source": "",
1004
+ "id": "d4e8ce5000dcff26"
1005
+ }
1006
+ ],
1007
+ "metadata": {
1008
+ "kernelspec": {
1009
+ "display_name": "Python 3",
1010
+ "language": "python",
1011
+ "name": "python3"
1012
+ },
1013
+ "language_info": {
1014
+ "codemirror_mode": {
1015
+ "name": "ipython",
1016
+ "version": 2
1017
+ },
1018
+ "file_extension": ".py",
1019
+ "mimetype": "text/x-python",
1020
+ "name": "python",
1021
+ "nbconvert_exporter": "python",
1022
+ "pygments_lexer": "ipython2",
1023
+ "version": "2.7.6"
1024
+ }
1025
+ },
1026
+ "nbformat": 4,
1027
+ "nbformat_minor": 5
1028
+ }
rocketship-ml-model-train/train_ml_model_1.ipynb ADDED
The diff for this file is too large to render. See raw diff