VJyzCELERY committed on
Commit
21cbaed
·
1 Parent(s): ac04036

Added some optimization

Browse files
Files changed (1) hide show
  1. app.py +51 -43
app.py CHANGED
@@ -10,6 +10,7 @@ from GameRecommender import *
10
  import gc
11
  from sklearn.model_selection import train_test_split
12
  from huggingface_hub import snapshot_download
 
13
 
14
  DATASETS = {
15
  "converted": "converted.csv",
@@ -75,7 +76,9 @@ df_review_raw = REVIEWS_DS['train'].to_pandas()
75
  df_review_trimmed = TRIMMED_REVIEWS_DS['train'].to_pandas()
76
  df_user_pref = USER_PREF_DS['train'].to_pandas()
77
  available_names = df_games[df_games['app_id'].astype(str).isin(selectable_app_ids)]['Name'].tolist()
78
-
 
 
79
  def extract_year(date_str):
80
  if isinstance(date_str, str) and len(date_str) >= 4:
81
  year_str = date_str[-4:]
@@ -338,7 +341,7 @@ df_games_raw['Publishers'] = df_games_raw['Publishers'].fillna('')
338
  df_games_raw.to_csv('Cleaned_games.csv',index=False)
339
  """)
340
  h2('Games Data Cleaned')
341
- gr.DataFrame(df_games.head(20))
342
 
343
  h2('2.2. Review Preprocessing')
344
  Dataset(df_review_raw,'Review Data Raw',REVIEWS_DATAPATH)
@@ -413,8 +416,6 @@ df = df[['steamid','app_id','voted_up','cleaned_review']]
413
  df.to_csv('Cleaned_Dataframe.csv',index=False)
414
  """)
415
  Dataset(df_review_trimmed,'Cleaned Review',source=TRIMMED_REVIEW_DATAPATH,key='trimmed_review')
416
- min_word=20
417
- df_review_trimmed_filtered = df_review_trimmed[df_review_trimmed['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
418
  code_cell("""
419
  min_word = 20
420
  df = df[df['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
@@ -528,20 +529,23 @@ df_liked = df_liked.drop_duplicates(subset=['steamid', 'app_id'])
528
  code_cell("""
529
  vectorizer = TfidfVectorizer(max_df=0.7,min_df=3,stop_words=None,ngram_range=(1,2))
530
  review_app_id_encoder = LabelEncoder()""")
531
- train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
532
- test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
533
- del df_temp
534
- gc.collect()
535
- code_cell("""
536
- train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
537
- test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
538
- """)
539
- p(f"""
540
  Training : {train_df.shape}
541
  Testing : {test_df.shape}
542
  Validation : {val_df.shape}
543
  """)
544
  code_cell("""
 
 
 
 
 
 
545
  X_train = vectorizer.fit_transform(train_df['cleaned_review'])
546
  y_train = review_app_id_encoder.fit_transform(train_df['app_id'])
547
  X_test = vectorizer.transform(test_df['cleaned_review'])
@@ -868,16 +872,6 @@ df = col_to_list(df,'Categories')
868
  df = apply_price_range_labels(df,price_labels,price_bins)
869
  """)
870
  Dataset(df_games,"The game dataset",GAMES_DATAPATH)
871
- df_games_temp = df_games
872
- df_games_temp = col_to_list(df_games_temp,'Genres')
873
- df_games_temp = col_to_list(df_games_temp,'Categories')
874
- df_games_temp = apply_price_range_labels(df_games_temp,price_ranges_labels,price_bins)
875
- df_games_temp['Year_Release'] = df_games_temp['Release date'].apply(extract_year)
876
- df_games_temp['Game score'] = np.where(
877
- (df_games_temp['Positive'] + df_games_temp['Negative']) == 0,
878
- 0,
879
- (df_games_temp['Positive'] / (df_games_temp['Positive'] + df_games_temp['Negative'])) * 100
880
- )
881
 
882
  code_cell("""
883
  def extract_year(date_str):
@@ -893,25 +887,36 @@ df['Game score'] = np.where(
893
  0,
894
  (df['Positive'] / (df['Positive'] + df['Negative'])) * 100
895
  )""")
896
- from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
897
- genre_mlb = MultiLabelBinarizer()
898
- genre_mlb = genre_mlb.fit(df_games_temp['Genres'])
899
- categories_mlb = MultiLabelBinarizer()
900
- categories_mlb = categories_mlb.fit(df_games_temp['Categories'])
901
- price_range_le = model.game_content_recommeder.price_range_encoder
902
- scaler = MinMaxScaler()
903
- scaler = scaler.fit(df_games_temp[['Year_Release','Average playtime forever','Game score','DLC count']].values)
904
- app_id_le = LabelEncoder()
905
- app_id_le = app_id_le.fit(df_games_temp['app_id'])
906
- numerical_col =['Year_Release','Average playtime forever','Game score','DLC count']
907
-
908
- genre_matrix = genre_mlb.transform(df_games_temp['Genres'])
909
- genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df_games_temp.index)
910
- categories_matrix = categories_mlb.transform(df_games_temp['Categories'])
911
- categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df_games_temp.index)
912
- game_df = pd.concat([df_games_temp[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)
913
- game_df['Price_range'] = price_range_le.transform(game_df['Price_range'])
914
- game_df[numerical_col] = scaler.transform(game_df[numerical_col].values)
 
 
 
 
 
 
 
 
 
 
 
915
  code_cell("""
916
  from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
917
  genre_mlb = MultiLabelBinarizer()
@@ -932,7 +937,10 @@ genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df.index
932
  categories_matrix = categories_mlb.transform(df['Categories'])
933
  categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df.index)
934
  game_df = pd.concat([df[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)""")
935
- gr.Dataframe(game_df.head(10))
 
 
 
936
  code_cell("""
937
  from sklearn.neighbors import KNeighborsClassifier
938
  X = game_df.loc[:,['Year_Release','Average playtime forever','Game score','DLC count','Price_range']+ list(genre_mlb.classes_) + list(categories_mlb.classes_)]
 
10
  import gc
11
  from sklearn.model_selection import train_test_split
12
  from huggingface_hub import snapshot_download
13
+ from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
14
 
15
  DATASETS = {
16
  "converted": "converted.csv",
 
76
  df_review_trimmed = TRIMMED_REVIEWS_DS['train'].to_pandas()
77
  df_user_pref = USER_PREF_DS['train'].to_pandas()
78
  available_names = df_games[df_games['app_id'].astype(str).isin(selectable_app_ids)]['Name'].tolist()
79
+ min_word=20
80
+ df_review_trimmed_filtered = df_review_trimmed[df_review_trimmed['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
81
+
82
  def extract_year(date_str):
83
  if isinstance(date_str, str) and len(date_str) >= 4:
84
  year_str = date_str[-4:]
 
341
  df_games_raw.to_csv('Cleaned_games.csv',index=False)
342
  """)
343
  h2('Games Data Cleaned')
344
+ gr.Dataframe(df_games.head(20))
345
 
346
  h2('2.2. Review Preprocessing')
347
  Dataset(df_review_raw,'Review Data Raw',REVIEWS_DATAPATH)
 
416
  df.to_csv('Cleaned_Dataframe.csv',index=False)
417
  """)
418
  Dataset(df_review_trimmed,'Cleaned Review',source=TRIMMED_REVIEW_DATAPATH,key='trimmed_review')
 
 
419
  code_cell("""
420
  min_word = 20
421
  df = df[df['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
 
529
  code_cell("""
530
  vectorizer = TfidfVectorizer(max_df=0.7,min_df=3,stop_words=None,ngram_range=(1,2))
531
  review_app_id_encoder = LabelEncoder()""")
532
+ def get_data_split():
533
+ train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
534
+ test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
535
+ del df_temp
536
+ gc.collect()
537
+ p(f"""
 
 
 
538
  Training : {train_df.shape}
539
  Testing : {test_df.shape}
540
  Validation : {val_df.shape}
541
  """)
542
  code_cell("""
543
+ train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
544
+ test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
545
+ """)
546
+ btn = gr.Button("View data split size :")
547
+ btn.click(fn=get_data_split())
548
+ code_cell("""
549
  X_train = vectorizer.fit_transform(train_df['cleaned_review'])
550
  y_train = review_app_id_encoder.fit_transform(train_df['app_id'])
551
  X_test = vectorizer.transform(test_df['cleaned_review'])
 
872
  df = apply_price_range_labels(df,price_labels,price_bins)
873
  """)
874
  Dataset(df_games,"The game dataset",GAMES_DATAPATH)
 
 
 
 
 
 
 
 
 
 
875
 
876
  code_cell("""
877
  def extract_year(date_str):
 
887
  0,
888
  (df['Positive'] / (df['Positive'] + df['Negative'])) * 100
889
  )""")
890
+ def game_df_create():
891
+ df_games_temp = df_games
892
+ df_games_temp = col_to_list(df_games_temp,'Genres')
893
+ df_games_temp = col_to_list(df_games_temp,'Categories')
894
+ df_games_temp = apply_price_range_labels(df_games_temp,price_ranges_labels,price_bins)
895
+ df_games_temp['Year_Release'] = df_games_temp['Release date'].apply(extract_year)
896
+ df_games_temp['Game score'] = np.where(
897
+ (df_games_temp['Positive'] + df_games_temp['Negative']) == 0,
898
+ 0,
899
+ (df_games_temp['Positive'] / (df_games_temp['Positive'] + df_games_temp['Negative'])) * 100
900
+ )
901
+ genre_mlb = MultiLabelBinarizer()
902
+ genre_mlb = genre_mlb.fit(df_games_temp['Genres'])
903
+ categories_mlb = MultiLabelBinarizer()
904
+ categories_mlb = categories_mlb.fit(df_games_temp['Categories'])
905
+ price_range_le = model.game_content_recommeder.price_range_encoder
906
+ scaler = MinMaxScaler()
907
+ scaler = scaler.fit(df_games_temp[['Year_Release','Average playtime forever','Game score','DLC count']].values)
908
+ app_id_le = LabelEncoder()
909
+ app_id_le = app_id_le.fit(df_games_temp['app_id'])
910
+ numerical_col =['Year_Release','Average playtime forever','Game score','DLC count']
911
+
912
+ genre_matrix = genre_mlb.transform(df_games_temp['Genres'])
913
+ genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df_games_temp.index)
914
+ categories_matrix = categories_mlb.transform(df_games_temp['Categories'])
915
+ categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df_games_temp.index)
916
+ game_df = pd.concat([df_games_temp[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)
917
+ game_df['Price_range'] = price_range_le.transform(game_df['Price_range'])
918
+ game_df[numerical_col] = scaler.transform(game_df[numerical_col].values)
919
+ return game_df.head(10)
920
  code_cell("""
921
  from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
922
  genre_mlb = MultiLabelBinarizer()
 
937
  categories_matrix = categories_mlb.transform(df['Categories'])
938
  categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df.index)
939
  game_df = pd.concat([df[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)""")
940
+
941
+ btn = gr.Button("Run game_df preprocess")
942
+ output_game_df = gr.Dataframe()
943
+ btn.click(fn=game_df_create, inputs=None, outputs=output_game_df)
944
  code_cell("""
945
  from sklearn.neighbors import KNeighborsClassifier
946
  X = game_df.loc[:,['Year_Release','Average playtime forever','Game score','DLC count','Price_range']+ list(genre_mlb.classes_) + list(categories_mlb.classes_)]