| | import pandas as pd |
| | import numpy as np |
| | from sklearn.preprocessing import MultiLabelBinarizer |
| | from load_data import loadFileIntoDict |
| | import nltk |
| | nltk.download('stopwords') |
| | from nltk.corpus import stopwords |
| | from string import punctuation |
| | |
# Dictionary loaded from ID2Genre.txt when this module is imported.
# NOTE(review): presumably maps genre ids to genre names (judging by the file
# name) -- confirm against loadFileIntoDict in load_data.
D= loadFileIntoDict("/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/ID2Genre.txt")
# Absolute path of a .pth file; judging by the name it holds saved model
# weights. It is not referenced by any function visible in this part of the file.
PATH = '/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/model_weights_multi.pth'


# Rebind the imported string.punctuation (a str) to a list of its single
# characters; cleanDesc tests each whitespace-separated token against it.
punctuation = list(punctuation)
def cleanDesc(df):
    """Normalise the free-text 'Description' column of ``df``.

    Each description is split on whitespace, every token is lower-cased, and
    tokens that are English stop words (per NLTK) or a single punctuation
    character are dropped. The remaining tokens are re-joined with single
    spaces.

    Tokens are lower-cased *before* the stop-word test so that capitalised
    stop words such as "The" are removed as well; NLTK's list is lower-case.

    Args:
        df: DataFrame with a 'Description' column of strings.

    Returns:
        The same DataFrame object, with 'Description' replaced in place.
    """
    # Sets give O(1) membership tests; set(punctuation) works whether the
    # module-level name is the original str or a list of characters.
    stop_words = set(stopwords.words('english'))
    punct_tokens = set(punctuation)

    def _clean(text):
        lowered = (word.lower() for word in text.split())
        kept = [
            token for token in lowered
            if token not in stop_words and token not in punct_tokens
        ]
        return " ".join(kept)

    # Assign the whole column at once: this avoids chained assignment
    # (df[col][i] = ...), which depends on a 0..n-1 index and does not write
    # through to df under pandas copy-on-write.
    df['Description'] = [_clean(text) for text in df['Description']]
    return df
| |
|
def cleanGenres(df):
    """Turn the stringified genre lists in 'Genres' into real lists of names.

    Each cell is expected to look like ``"['Fantasy', 'Fiction']"``: the first
    and last character (the brackets) are removed, the remainder is split on
    commas, and each piece has surrounding whitespace and its first and last
    character (the quotes) removed. Empty names (e.g. from a cell of ``"[]"``)
    are discarded.

    Note that a genre name that itself contains a comma is split into
    several pieces by this parsing.

    Args:
        df: DataFrame with a 'Genres' column of strings in the format above.

    Returns:
        The same DataFrame object, with 'Genres' replaced by lists of str.
    """
    parsed_rows = []
    for raw in df['Genres']:
        pieces = raw[1:-1].strip().split(",")
        names = [piece.strip()[1:-1] for piece in pieces]
        parsed_rows.append([name for name in names if name])

    # Reuse df's own index so the assignment is positional; a bare
    # pd.Series(...) gets a fresh 0..n-1 index and is aligned by label, which
    # yields NaN / misplaced rows whenever df's index is not 0..n-1.
    df['Genres'] = pd.Series(parsed_rows, index=df.index, dtype=object)
    return df
| |
|
| | |
def getMostFreqGenres(df, min_count=600):
    """Return the genres that occur often enough and are not on the drop list.

    Counts every genre across all rows of ``df['Genres']`` and keeps those
    whose count is strictly greater than ``min_count`` and whose lower-cased
    name is not in the hard-coded drop list below.

    Args:
        df: Object whose ``['Genres']`` item is an iterable of lists of genre
            names (e.g. a DataFrame column).
        min_count: A genre must appear more than this many times to be kept.
            Defaults to 600.

    Returns:
        A dict keys view of the retained genre names, in first-seen order.
    """
    from collections import Counter

    # All entries must be lower-case because they are compared against
    # genre.lower(); the former 'young Adult' entry could never match.
    keys_to_drop = {
        'personal development', 'biography', 'dystopia',
        'science fiction fantasy', 'fiction', 'memoir', 'spirituality',
        'classics', 'biography memoir', 'new adult', 'thriller', 'suspense',
        'literary fiction', 'christian', 'british literature', 'paranormal',
        'short stories', 'literature', 'young adult', 'audiobook', 'novels',
        'history', 'mystery thriller', 'adult', 'chick lit',
        'contemporary romance', 'contemporary', 'adult fiction',
        'urban fantasy', 'middle grade', 'historical', 'american',
    }

    counts = Counter(g for genres in df['Genres'] for g in genres)

    frequent = {
        genre: count
        for genre, count in counts.items()
        if count > min_count and genre.lower() not in keys_to_drop
    }
    return frequent.keys()
| |
|
def store_most_frequent_genres(df, most_frequent):
    """Restrict every row's genre list to the genres in ``most_frequent``.

    Args:
        df: DataFrame whose 'Genres' column holds lists of genre names.
        most_frequent: Iterable of hashable genre names to keep (for example
            the keys view returned by getMostFreqGenres, a list, or a
            generator).

    Returns:
        The same DataFrame object, with 'Genres' replaced by the filtered
        lists. The order of genres within each row is preserved; rows with no
        retained genre end up with an empty list.
    """
    # Materialise once: lookups are O(1) for any input, and a one-shot
    # iterator is not exhausted after the first row.
    allowed = frozenset(most_frequent)

    df['Genres'] = df['Genres'].apply(
        lambda genres: [genre for genre in genres if genre in allowed]
    )
    return df
| |
|
def one_hot(df, col_name):
    """Add a multi-hot 'genre_id' column encoding the label lists in ``col_name``.

    A MultiLabelBinarizer is fitted on the column, and each row receives a
    list of floats (0.0 / 1.0) with one entry per class found by the
    binarizer, in the order of its ``classes_`` attribute.

    Args:
        df: DataFrame to extend.
        col_name: Name of the column holding an iterable of labels per row.

    Returns:
        The same DataFrame object, with a new (or overwritten) 'genre_id'
        column of lists of float.
    """
    label_lists = df[col_name].tolist()

    # fit_transform already returns the encodings; a second transform() pass
    # over the same data is redundant.
    encodings = MultiLabelBinarizer().fit_transform(label_lists)

    # Build the column in one assignment. This avoids both an n-by-n
    # placeholder of zeros and row-by-row chained assignment, which depends
    # on a 0..n-1 index and does not write through under copy-on-write.
    df["genre_id"] = [[float(flag) for flag in row] for row in encodings]
    return df
| |
|
| |
|
def getDF(data_path=None):
    """Load the Goodreads CSV and run the full preprocessing pipeline.

    Steps: read the CSV, drop the columns that are not used, drop rows with
    missing values, drop rows whose 'Genres' cell is the literal string '[]',
    reset the index, then apply cleanGenres, cleanDesc, the frequent-genre
    filter (getMostFreqGenres + store_most_frequent_genres) and one_hot on
    'Genres'.

    Args:
        data_path: Path of the CSV file to read. When None (the default) the
            original hard-coded location is used, so existing callers are
            unaffected.

    Returns:
        The DataFrame returned by the last pipeline step, ``one_hot``.

    Raises:
        KeyError: from ``DataFrame.drop`` if any of the dropped columns is
            absent from the CSV.
    """
    if data_path is None:
        data_path = "/Users/rishikasrinivas/Documents/Rishika/UCSC/Projects/BERt/Backend/Data/goodreads_data.csv"

    df = pd.read_csv(data_path)

    df = df.drop(columns=['Unnamed: 0', 'Book', 'Author', 'Avg_Rating', 'Num_Ratings', 'URL'])
    df = df.dropna()
    df = df[df['Genres'] != '[]']

    # The row filters above leave gaps in the index; restore 0..n-1 before
    # the cleaning steps run.
    df = df.reset_index(drop=True)

    df = cleanGenres(df)
    df = cleanDesc(df)
    most_freq_genres = getMostFreqGenres(df)
    df = store_most_frequent_genres(df, most_freq_genres)
    df = one_hot(df, 'Genres')
    return df