| | import os
|
| | import urllib.request
|
| | import zipfile
|
| | import json
|
| | import pandas as pd
|
| | import time
|
| | import torch
|
| | import numpy as np
|
| | import pandas as pd
|
| | import torch.nn as nn
|
| | import torch.nn.functional as F
|
| | import torch.optim as optim
|
| | from torch.utils.data import DataLoader, TensorDataset
|
| | from sklearn.model_selection import train_test_split
|
| | import matplotlib.pyplot as plt
|
| | from sklearn.preprocessing import LabelEncoder
|
| | import shutil
|
| | import os
|
| | import pyarrow.parquet as pq
|
| |
|
| |
|
# Column subset retained by make_dataset() after flattening each playlist's
# 'tracks' list. The first three come from the playlist records, the last
# four from the normalized track dicts — presumably matching the Spotify
# Million Playlist Dataset JSON schema (TODO confirm against the raw files).
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]
|
| |
|
| |
|
def copy_file(src, dst):
    """Copy *src* to *dst* (metadata preserved via copy2), creating
    dst's parent directory first if it does not exist.

    Args:
        src: path of the file to copy.
        dst: destination file path; intermediate directories are created.
    """
    dst_dir = os.path.dirname(dst)
    # exist_ok=True avoids the check-then-create race of the original
    # `if not exists: makedirs`; the truthiness guard avoids calling
    # makedirs('') when dst is a bare filename with no directory part.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(src, dst)
|
| |
|
def unzip_archive(filepath, dir_path):
    """Extract the zip archive at *filepath* into directory *dir_path*.

    Args:
        filepath: path to an existing .zip file.
        dir_path: extraction target; created by extractall if missing.
    """
    # The original wrapped filepath in a no-op f-string; pass it directly.
    # ZipFile as a context manager guarantees the handle is closed.
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)
|
| |
|
| |
|
def make_dir(directory):
    """(Re)create *directory* as an empty directory.

    Any existing directory tree at that path is deleted first, so callers
    always start from an empty directory.
    """
    # The original duplicated os.makedirs in both branches of the if/else;
    # wipe first, then create exactly once.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
|
| |
|
| |
|
def make_dataset():
    """Flatten the raw MPD playlist JSON slices into parquet shards.

    Walks data/raw/playlists/data, explodes each file's playlists into
    one row per track, keeps the `cols` subset, and writes an accumulated
    parquet shard every 50 files to data/raw/data/.
    """
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()  # rolling accumulator, flushed every 50 files
    index = 0            # count of .json files processed so far

    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            if filename.find('.json') != -1 :
                index += 1

                # Progress line; '(unknown)' appears to be a placeholder
                # where the filename was once interpolated — TODO confirm.
                # The /1000 assumes the full 1000-slice MPD dump.
                print(f'\r(unknown)\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

                full_path = os.path.join(directory, filename)

                with open(full_path, 'r') as file:
                    json_data = json.load(file)

                # One row per playlist, then one row per (playlist, track).
                temp = pd.DataFrame(json_data['playlists'])
                expanded_df = temp.explode('tracks').reset_index(drop=True)

                # Turn each track dict into flat columns.
                json_normalized = pd.json_normalize(expanded_df['tracks'])

                # Side-by-side concat relies on the reset index aligning
                # expanded_df rows with their normalized track rows.
                result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)

                result = result[cols]

                df = pd.concat([df, result], axis=0, ignore_index=True)

                # Flush a shard every 50 files and reset the accumulator.
                # NOTE(review): `index % 1000` equals `index` for the first
                # 999 files — likely meant to be plain `index`; at 1000 it
                # would write playlists_0.parquet. TODO confirm intent.
                if index % 50 == 0:
                    df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
                    del df
                    df = pd.DataFrame()
                # NOTE(review): stops after 200 files — presumably a
                # debugging cap on the full 1000-file run; verify before
                # relying on a complete dataset.
                if index % 200 == 0:
                    break
|
| |
|
| |
|
if __name__ == '__main__':
    # Pipeline entry point: extract the raw archive, reset both output
    # directories, then build the flattened parquet dataset.
    cwd = os.getcwd()
    unzip_archive(cwd + '/data/raw/spotify_million_playlist_dataset.zip',
                  cwd + '/data/raw/playlists')
    for subdir in ('/data/raw/data', '/data/processed'):
        make_dir(cwd + subdir)
    make_dataset()
|
| |
|
| |
|