"""Train and cross-validate an SVM genre classifier on song audio features.

The script reads ``Data_With_Phonks_and_Not_Phonks.csv``, holds out 20% of
the rows as a test set, preprocesses numeric and categorical feature columns
with a ``ColumnTransformer``, fits an ``SVC`` to predict the ``genre``
column, predicts on the held-out rows, and reports the mean 10-fold
cross-validation score on the training split.
"""

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
# Importing enable_iterative_imputer is required before IterativeImputer
# can be imported; it has no other use in this script.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

# ---------------------------------------------------------------------------
# Load the data and split it into training and test sets.
# ---------------------------------------------------------------------------
df = pd.read_csv("Data_With_Phonks_and_Not_Phonks.csv")
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Stand-alone imputation of every numeric training column.
# The results (X, train_data_tr) are not consumed by the model pipeline
# below; they are kept as an imputed copy of the numeric training data.
# ---------------------------------------------------------------------------
imputer = IterativeImputer(initial_strategy="median", random_state=42)
training_data_num = train_data.select_dtypes(include=[np.number])
imputer.fit(training_data_num)
X = imputer.transform(training_data_num)
train_data_tr = pd.DataFrame(
    X,
    columns=training_data_num.columns,
    index=training_data_num.index,
)

# ---------------------------------------------------------------------------
# Preprocessing pipelines.
# ---------------------------------------------------------------------------
# Numeric features: model-based imputation, then standardisation.
# random_state is fixed so repeated runs give the same imputed values,
# matching the seeding used everywhere else in this script.
num_pipeline = Pipeline([
    ("imputer", IterativeImputer(initial_strategy="median", random_state=42)),
    ("scaler", StandardScaler()),
])

# Categorical features: encode to ordinal codes, fill missing entries with
# the most frequent code, then one-hot encode.
# SimpleImputer is used here on purpose: IterativeImputer predicts missing
# entries with a regressor and returns continuous values, which would turn
# into spurious extra categories in the one-hot step.
cat_pipeline = Pipeline([
    ("ordinal_encoder", OrdinalEncoder()),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse_output=False)),
])

num_attribs = [
    "danceability_%",
    "energy_%",
    "bpm",
    "speechiness_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "valence_%",
]
cat_attribs = ["key", "mode"]

# Columns not listed in num_attribs / cat_attribs (including the "genre"
# target) are dropped by the ColumnTransformer's default remainder policy.
preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# ---------------------------------------------------------------------------
# Fit the classifier on the training split and predict the test split.
# ---------------------------------------------------------------------------
X_train = preprocess_pipeline.fit_transform(train_data)
y_train = train_data["genre"]

svm_clf = SVC(random_state=42)
svm_clf.fit(X_train, y_train)

# The test split is only transformed (never fitted on) so no test-set
# statistics influence the preprocessing.
X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

# ---------------------------------------------------------------------------
# Cross-validation.
# ---------------------------------------------------------------------------
# Preprocessing and classifier are bundled into one estimator and evaluated
# on the raw training frame, so that for every fold the imputers, scaler and
# encoders are re-fitted on that fold's training part only. Scoring the
# already-transformed X_train would let statistics computed from the
# validation rows leak into each fold. cross_val_score clones the estimator,
# so the fitted preprocess_pipeline above is left untouched.
svm_cv_model = Pipeline([
    ("preprocess", preprocess_pipeline),
    ("svm", SVC(random_state=42)),
])
svm_scores = cross_val_score(svm_cv_model, train_data, y_train, cv=10)

# Report the result; a bare expression would be silently discarded when this
# file is run as a script.
print(f"Mean 10-fold CV accuracy: {svm_scores.mean():.4f}")