VJyzCELERY committed on
Commit ·
21cbaed
1
Parent(s): ac04036
Added some optimization
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ from GameRecommender import *
|
|
| 10 |
import gc
|
| 11 |
from sklearn.model_selection import train_test_split
|
| 12 |
from huggingface_hub import snapshot_download
|
|
|
|
| 13 |
|
| 14 |
DATASETS = {
|
| 15 |
"converted": "converted.csv",
|
|
@@ -75,7 +76,9 @@ df_review_raw = REVIEWS_DS['train'].to_pandas()
|
|
| 75 |
df_review_trimmed = TRIMMED_REVIEWS_DS['train'].to_pandas()
|
| 76 |
df_user_pref = USER_PREF_DS['train'].to_pandas()
|
| 77 |
available_names = df_games[df_games['app_id'].astype(str).isin(selectable_app_ids)]['Name'].tolist()
|
| 78 |
-
|
|
|
|
|
|
|
| 79 |
def extract_year(date_str):
|
| 80 |
if isinstance(date_str, str) and len(date_str) >= 4:
|
| 81 |
year_str = date_str[-4:]
|
|
@@ -338,7 +341,7 @@ df_games_raw['Publishers'] = df_games_raw['Publishers'].fillna('')
|
|
| 338 |
df_games_raw.to_csv('Cleaned_games.csv',index=False)
|
| 339 |
""")
|
| 340 |
h2('Games Data Cleaned')
|
| 341 |
-
gr.
|
| 342 |
|
| 343 |
h2('2.2. Review Preprocessing')
|
| 344 |
Dataset(df_review_raw,'Review Data Raw',REVIEWS_DATAPATH)
|
|
@@ -413,8 +416,6 @@ df = df[['steamid','app_id','voted_up','cleaned_review']]
|
|
| 413 |
df.to_csv('Cleaned_Dataframe.csv',index=False)
|
| 414 |
""")
|
| 415 |
Dataset(df_review_trimmed,'Cleaned Review',source=TRIMMED_REVIEW_DATAPATH,key='trimmed_review')
|
| 416 |
-
min_word=20
|
| 417 |
-
df_review_trimmed_filtered = df_review_trimmed[df_review_trimmed['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
|
| 418 |
code_cell("""
|
| 419 |
min_word = 20
|
| 420 |
df = df[df['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
|
|
@@ -528,20 +529,23 @@ df_liked = df_liked.drop_duplicates(subset=['steamid', 'app_id'])
|
|
| 528 |
code_cell("""
|
| 529 |
vectorizer = TfidfVectorizer(max_df=0.7,min_df=3,stop_words=None,ngram_range=(1,2))
|
| 530 |
review_app_id_encoder = LabelEncoder()""")
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
|
| 538 |
-
""")
|
| 539 |
-
p(f"""
|
| 540 |
Training : {train_df.shape}
|
| 541 |
Testing : {test_df.shape}
|
| 542 |
Validation : {val_df.shape}
|
| 543 |
""")
|
| 544 |
code_cell("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
X_train = vectorizer.fit_transform(train_df['cleaned_review'])
|
| 546 |
y_train = review_app_id_encoder.fit_transform(train_df['app_id'])
|
| 547 |
X_test = vectorizer.transform(test_df['cleaned_review'])
|
|
@@ -868,16 +872,6 @@ df = col_to_list(df,'Categories')
|
|
| 868 |
df = apply_price_range_labels(df,price_labels,price_bins)
|
| 869 |
""")
|
| 870 |
Dataset(df_games,"The game dataset",GAMES_DATAPATH)
|
| 871 |
-
df_games_temp = df_games
|
| 872 |
-
df_games_temp = col_to_list(df_games_temp,'Genres')
|
| 873 |
-
df_games_temp = col_to_list(df_games_temp,'Categories')
|
| 874 |
-
df_games_temp = apply_price_range_labels(df_games_temp,price_ranges_labels,price_bins)
|
| 875 |
-
df_games_temp['Year_Release'] = df_games_temp['Release date'].apply(extract_year)
|
| 876 |
-
df_games_temp['Game score'] = np.where(
|
| 877 |
-
(df_games_temp['Positive'] + df_games_temp['Negative']) == 0,
|
| 878 |
-
0,
|
| 879 |
-
(df_games_temp['Positive'] / (df_games_temp['Positive'] + df_games_temp['Negative'])) * 100
|
| 880 |
-
)
|
| 881 |
|
| 882 |
code_cell("""
|
| 883 |
def extract_year(date_str):
|
|
@@ -893,25 +887,36 @@ df['Game score'] = np.where(
|
|
| 893 |
0,
|
| 894 |
(df['Positive'] / (df['Positive'] + df['Negative'])) * 100
|
| 895 |
)""")
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 915 |
code_cell("""
|
| 916 |
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
|
| 917 |
genre_mlb = MultiLabelBinarizer()
|
|
@@ -932,7 +937,10 @@ genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df.index
|
|
| 932 |
categories_matrix = categories_mlb.transform(df['Categories'])
|
| 933 |
categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df.index)
|
| 934 |
game_df = pd.concat([df[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)""")
|
| 935 |
-
|
|
|
|
|
|
|
|
|
|
| 936 |
code_cell("""
|
| 937 |
from sklearn.neighbors import KNeighborsClassifier
|
| 938 |
X = game_df.loc[:,['Year_Release','Average playtime forever','Game score','DLC count','Price_range']+ list(genre_mlb.classes_) + list(categories_mlb.classes_)]
|
|
|
|
| 10 |
import gc
|
| 11 |
from sklearn.model_selection import train_test_split
|
| 12 |
from huggingface_hub import snapshot_download
|
| 13 |
+
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
|
| 14 |
|
| 15 |
DATASETS = {
|
| 16 |
"converted": "converted.csv",
|
|
|
|
| 76 |
df_review_trimmed = TRIMMED_REVIEWS_DS['train'].to_pandas()
|
| 77 |
df_user_pref = USER_PREF_DS['train'].to_pandas()
|
| 78 |
available_names = df_games[df_games['app_id'].astype(str).isin(selectable_app_ids)]['Name'].tolist()
|
| 79 |
+
min_word=20
|
| 80 |
+
df_review_trimmed_filtered = df_review_trimmed[df_review_trimmed['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
|
| 81 |
+
|
| 82 |
def extract_year(date_str):
|
| 83 |
if isinstance(date_str, str) and len(date_str) >= 4:
|
| 84 |
year_str = date_str[-4:]
|
|
|
|
| 341 |
df_games_raw.to_csv('Cleaned_games.csv',index=False)
|
| 342 |
""")
|
| 343 |
h2('Games Data Cleaned')
|
| 344 |
+
gr.Dataframe(df_games.head(20))
|
| 345 |
|
| 346 |
h2('2.2. Review Preprocessing')
|
| 347 |
Dataset(df_review_raw,'Review Data Raw',REVIEWS_DATAPATH)
|
|
|
|
| 416 |
df.to_csv('Cleaned_Dataframe.csv',index=False)
|
| 417 |
""")
|
| 418 |
Dataset(df_review_trimmed,'Cleaned Review',source=TRIMMED_REVIEW_DATAPATH,key='trimmed_review')
|
|
|
|
|
|
|
| 419 |
code_cell("""
|
| 420 |
min_word = 20
|
| 421 |
df = df[df['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
|
|
|
|
| 529 |
code_cell("""
|
| 530 |
vectorizer = TfidfVectorizer(max_df=0.7,min_df=3,stop_words=None,ngram_range=(1,2))
|
| 531 |
review_app_id_encoder = LabelEncoder()""")
|
| 532 |
+
def get_data_split():
    """Compute the train/test/validation split and describe each part's shape.

    The module-level ``sampled`` frame is split 80/20 and the 20% remainder
    is halved into test and validation sets; both splits are stratified on
    ``app_id`` and seeded with ``SEED``.

    Returns:
        str: Text listing the shapes of the training, testing and validation
        frames, meant to be shown in an output component.
    """
    train_df, df_temp = train_test_split(
        sampled, test_size=0.2, random_state=SEED, stratify=sampled['app_id'])
    test_df, val_df = train_test_split(
        df_temp, test_size=0.5, random_state=SEED, stratify=df_temp['app_id'])
    summary = f"""
Training : {train_df.shape}
Testing : {test_df.shape}
Validation : {val_df.shape}
"""
    # Only the shapes are needed; drop the frames so the memory is released
    # as soon as the callback finishes.
    del train_df, df_temp, test_df, val_df
    gc.collect()
    return summary


code_cell("""
train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
""")
btn = gr.Button("View data split size :")
split_size_output = gr.Markdown()
# Pass the function itself (not its result) so the split is computed lazily,
# on click, and its summary is rendered into the output component.
btn.click(fn=get_data_split, inputs=None, outputs=split_size_output)
|
| 548 |
+
code_cell("""
|
| 549 |
X_train = vectorizer.fit_transform(train_df['cleaned_review'])
|
| 550 |
y_train = review_app_id_encoder.fit_transform(train_df['app_id'])
|
| 551 |
X_test = vectorizer.transform(test_df['cleaned_review'])
|
|
|
|
| 872 |
df = apply_price_range_labels(df,price_labels,price_bins)
|
| 873 |
""")
|
| 874 |
Dataset(df_games,"The game dataset",GAMES_DATAPATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
code_cell("""
|
| 877 |
def extract_year(date_str):
|
|
|
|
| 887 |
0,
|
| 888 |
(df['Positive'] / (df['Positive'] + df['Negative'])) * 100
|
| 889 |
)""")
|
| 890 |
+
def game_df_create():
    """Build the preprocessed game feature table and return a preview of it.

    Starting from the module-level ``df_games`` frame, this converts the
    ``Genres`` and ``Categories`` columns with ``col_to_list``, applies the
    price-range labels, derives ``Year_Release`` and ``Game score``, one-hot
    encodes genres and categories, encodes ``Price_range`` with the
    recommender's existing encoder and min-max scales the numerical columns.

    Returns:
        pandas.DataFrame: The first 10 rows of the assembled feature table.
    """
    numerical_col = ['Year_Release', 'Average playtime forever', 'Game score', 'DLC count']

    # Work on a copy: the steps below add and rewrite columns, and this
    # callback can run once per button click. Mutating the shared df_games
    # would leak those changes into the rest of the app and feed already
    # converted columns back into the conversion on the next click.
    df_games_temp = df_games.copy()
    df_games_temp = col_to_list(df_games_temp, 'Genres')
    df_games_temp = col_to_list(df_games_temp, 'Categories')
    df_games_temp = apply_price_range_labels(df_games_temp, price_ranges_labels, price_bins)
    df_games_temp['Year_Release'] = df_games_temp['Release date'].apply(extract_year)

    # Share of positive votes as a percentage; games without votes score 0.
    total_votes = df_games_temp['Positive'] + df_games_temp['Negative']
    df_games_temp['Game score'] = np.where(
        total_votes == 0,
        0,
        (df_games_temp['Positive'] / total_votes) * 100
    )

    # One indicator column per genre / category label.
    genre_mlb = MultiLabelBinarizer()
    genre_matrix = genre_mlb.fit_transform(df_games_temp['Genres'])
    genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df_games_temp.index)

    categories_mlb = MultiLabelBinarizer()
    categories_matrix = categories_mlb.fit_transform(df_games_temp['Categories'])
    categories_df = pd.DataFrame(categories_matrix, columns=categories_mlb.classes_, index=df_games_temp.index)

    game_df = pd.concat([df_games_temp[['app_id', 'Price_range'] + numerical_col], genre_df, categories_df], axis=1)

    # Reuse the encoder held by the recommender instead of fitting a new one.
    price_range_le = model.game_content_recommeder.price_range_encoder
    game_df['Price_range'] = price_range_le.transform(game_df['Price_range'])
    game_df[numerical_col] = MinMaxScaler().fit_transform(game_df[numerical_col].values)
    return game_df.head(10)
|
| 920 |
code_cell("""
|
| 921 |
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
|
| 922 |
genre_mlb = MultiLabelBinarizer()
|
|
|
|
| 937 |
categories_matrix = categories_mlb.transform(df['Categories'])
|
| 938 |
categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df.index)
|
| 939 |
game_df = pd.concat([df[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)""")
|
| 940 |
+
|
| 941 |
+
btn = gr.Button("Run game_df preprocess")
|
| 942 |
+
output_game_df = gr.Dataframe()
|
| 943 |
+
btn.click(fn=game_df_create, inputs=None, outputs=output_game_df)
|
| 944 |
code_cell("""
|
| 945 |
from sklearn.neighbors import KNeighborsClassifier
|
| 946 |
X = game_df.loc[:,['Year_Release','Average playtime forever','Game score','DLC count','Price_range']+ list(genre_mlb.classes_) + list(categories_mlb.classes_)]
|