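# NOTE: the lines below are page-header residue from the web file viewer this
# source was copied from; they are not part of the program.
# keesephillips's picture
# Added Naive model and comments
# 6ce6b56 verified
# raw
# history blame
# 3.05 kB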
import os
import zipfile
import json
import pandas as pd
import pandas as pd
import shutil
import os
# Columns that make_dataset selects from each flattened playlist frame
# before the rows are written to parquet.
cols = [
    'name', 'pid', 'num_followers',
    'pos', 'artist_name', 'track_name', 'album_name',
]
def copy_file(src, dst):
    '''
    Copies a file from one dir to another, creating the destination
    directory first when it does not exist yet

    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to

    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)
    # dirname is '' when dst is a bare filename (i.e. the current directory);
    # os.makedirs('') raises, so only create a directory when one is named.
    if dst_dir:
        # exist_ok=True replaces the separate exists() check, which could
        # fail if the directory appeared between the check and the call.
        os.makedirs(dst_dir, exist_ok=True)
    # copy2 copies the file contents and attempts to preserve metadata.
    shutil.copy2(src, dst)
def unzip_archive(filepath, dir_path):
    '''
    Extracts every member of a zip archive into a target directory

    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to

    Returns:
        None
    '''
    # The archive location is passed to ZipFile in its string form.
    archive_path = str(filepath)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=dir_path)
def make_dir(directory):
    '''
    Creates a new blank directory, removing whatever directory tree
    already exists at that path

    Inputs:
        directory: path to create a new directory at

    Returns:
        None
    '''
    # Remove any existing tree first so the result always starts out empty.
    already_there = os.path.exists(directory)
    if already_there:
        shutil.rmtree(directory)
    # In both cases the directory is (re)created here.
    os.makedirs(directory)
def _write_batch(frames, index):
    '''
    Concatenates the buffered frames and writes them to one parquet file

    Inputs:
        frames: non-empty list of DataFrames to combine row-wise
        index: number of JSON files processed so far, used in the filename

    Returns:
        None
    '''
    batch = pd.concat(frames, axis=0, ignore_index=True)
    batch.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')


def make_dataset():
    '''
    Creates the directory of parquet files to create the
    dataset with, used parquet to reduce memory load

    Reads the .json files found in <cwd>/data/raw/playlists/data, explodes
    each playlist's 'tracks' list into one row per entry, keeps only the
    columns named in cols, and writes the rows to
    <cwd>/data/raw/data/playlists_<n>.parquet every 50 source files.
    Processing stops after 200 source files; rows still buffered when the
    loop ends are written to a final parquet file.

    Inputs:

    Returns:
        None
    '''
    directory = os.getcwd() + '/data/raw/playlists/data'
    # Frames are buffered in a list and concatenated once per batch instead
    # of growing a DataFrame with pd.concat on every file.
    frames = []
    index = 0

    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        # Only regular files whose name ends in .json are source files.
        if not os.path.isfile(full_path) or not filename.endswith('.json'):
            continue

        index += 1
        print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

        # JSON is UTF-8 encoded; do not depend on the platform default.
        with open(full_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        temp = pd.DataFrame(json_data['playlists'])
        # One row per track entry, then flatten each track dict into columns.
        expanded_df = temp.explode('tracks').reset_index(drop=True)
        json_normalized = pd.json_normalize(expanded_df['tracks'])
        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
        frames.append(result[cols])

        if index % 50 == 0:
            _write_batch(frames, index)
            # Drop the references so the written rows can be freed.
            frames = []

        # Hard cap on how many source files are converted.
        if index % 200 == 0:
            break

    # Flush whatever is left when the number of files processed is not a
    # multiple of 50; previously these rows were silently discarded.
    if frames:
        _write_batch(frames, index)
if __name__ == '__main__':
    # Step 1: unpack the downloaded archive under data/raw/playlists.
    unzip_archive(
        os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip',
        os.getcwd() + '/data/raw/playlists',
    )

    # Step 2: (re)create empty output directories, in the original order.
    for directory in (
        os.getcwd() + '/data/raw/data',
        os.getcwd() + '/data/processed',
    ):
        make_dir(directory)

    # Step 3: convert the extracted playlist JSON files to parquet.
    make_dataset()