| | import numpy as np
|
| | import os
|
| | import urllib.request
|
| | import zipfile
|
| | import json
|
| | import pandas as pd
|
| | import time
|
| | import torch
|
| | import numpy as np
|
| | import pandas as pd
|
| | import torch.nn as nn
|
| | import torch.nn.functional as F
|
| | import torch.optim as optim
|
| | from torch.utils.data import DataLoader, TensorDataset
|
| | from sklearn.model_selection import train_test_split
|
| | import matplotlib.pyplot as plt
|
| | from sklearn.preprocessing import LabelEncoder
|
| | import shutil
|
| | import os
|
| | import pyarrow.parquet as pq
|
| |
|
def make_dir(directory):
    """Create *directory*, wiping any existing contents first.

    Args:
        directory: path of the directory to (re)create.

    Note: the original duplicated the ``os.makedirs`` call in both
    branches of the ``if``; removing an existing tree up front lets a
    single ``makedirs`` cover both cases with identical behavior.
    """
    if os.path.exists(directory):
        # Start from a clean slate: drop the old tree and its contents.
        shutil.rmtree(directory)
    os.makedirs(directory)
|
| |
|
| |
|
def read_parquet_folder(folder_path):
    """Read every ``*.parquet`` file in *folder_path* into one DataFrame.

    Files are read in sorted name order so the row order of the
    concatenated result is deterministic across filesystems
    (``os.listdir`` order is otherwise arbitrary).

    Args:
        folder_path: directory containing the parquet shards.

    Returns:
        A single DataFrame with a fresh ``RangeIndex``.

    Raises:
        ValueError: if the folder contains no ``.parquet`` files
            (same exception type ``pd.concat`` raised on an empty
            list before, but with an actionable message).
    """
    frames = [
        pd.read_parquet(os.path.join(folder_path, fname))
        for fname in sorted(os.listdir(folder_path))
        if fname.endswith('.parquet')
    ]
    if not frames:
        raise ValueError(f'no .parquet files found in {folder_path!r}')
    return pd.concat(frames, ignore_index=True)
|
| |
|
| |
|
def create_ids(df, col, name, out_dir=None):
    """Add an integer id column ``{name}_id`` for the values of *col*.

    Ids are assigned in first-appearance order of the unique values.
    A ``{name}.csv`` lookup table (id, original value) is also written
    out so the mapping can be recovered later.

    Args:
        df: input DataFrame; the id column is added in place and the
            same frame is returned.
        col: column whose unique values are assigned ids.
        name: prefix for the new id column and the csv file name.
        out_dir: directory for the lookup csv. Defaults to
            ``<cwd>/data/processed`` — the path the original version
            hard-coded — so existing callers are unaffected.

    Returns:
        *df* with the new ``{name}_id`` column.
    """
    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), 'data', 'processed')
    # Stable first-seen ordering: enumerate the uniques as they appear.
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[f'{name}_id'] = df[col].map(value_to_id)
    df[[f'{name}_id', col]].drop_duplicates().to_csv(
        os.path.join(out_dir, f'{name}.csv'))
    return df
|
| |
|
if __name__ == '__main__':
    # Load the raw playlist rows from every parquet shard.
    raw_dir = os.getcwd() + '/data/raw/data'
    df = read_parquet_folder(raw_dir)

    # Fresh output directory for the processed lookup tables.
    processed_dir = os.getcwd() + '/data/processed'
    make_dir(processed_dir)

    # Integer ids (plus csv lookup tables) for the main entities.
    for source_col, prefix in (('artist_name', 'artist'),
                               ('pid', 'playlist'),
                               ('album_name', 'album')):
        df = create_ids(df, source_col, prefix)

    # Distinct tracks per (playlist, artist, album) triple, broadcast
    # back onto every row of that triple.
    df['song_count'] = (df.groupby(['pid', 'artist_name', 'album_name'])
                          ['track_name'].transform('nunique'))
    # Playlist length: 'pos' looks 0-based, hence max(pos) + 1.
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Composite artist::album key and its integer id
    # (first-appearance order, same scheme as create_ids).
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    pair_ids = {pair: idx for idx, pair in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(pair_ids)

    pair_cols = ['artist_album_id', 'artist_album', 'artist_name',
                 'album_name', 'track_name']
    df[pair_cols].drop_duplicates().to_csv(
        os.getcwd() + '/data/processed/artist_album.csv')

    # Aggregate the per-row counts over each (playlist, artist_album)
    # pair. NOTE(review): this sums the broadcast per-row values, so a
    # pair spanning k rows contributes k * song_count — presumably
    # intentional weighting; confirm with the downstream consumer.
    df['song_count'] = (df.groupby(['playlist_id', 'artist_album_id'])
                          ['song_count'].transform('sum'))

    # Integer track ids via sklearn's LabelEncoder.
    track_encoder = LabelEncoder()
    df['track_id'] = track_encoder.fit_transform(df['track_name'])

    # Pair's share of the playlist, squashed through a sigmoid.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Persist the distinct (playlist, pair) interactions.
    playlist_pairs = df[['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    playlist_pairs[['playlist_id', 'artist_album_id']].to_csv(
        os.getcwd() + '/data/processed/playlists.csv')
|
| |
|