VJyzCELERY commited on
Commit
213e089
·
1 Parent(s): 73ce121

First Commit

Browse files
Files changed (5) hide show
  1. GameRecommender.py +334 -0
  2. app.py +1466 -0
  3. component.py +301 -0
  4. requirements.txt +193 -0
  5. style.css +208 -0
GameRecommender.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import string
import re
import os

# NLTK corpora required at runtime by TextBasedRecommendation.preprocess().
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
# Bug fix: preprocess() calls stopwords.words('english'); without the
# 'stopwords' corpus the first preprocess() call raises LookupError on a
# fresh environment. It was never downloaded before.
nltk.download('stopwords')
21
+
22
class CollaborativeRecommender:
    """Item-based collaborative recommender over a latent (SVD-reduced) item matrix.

    Recommends games similar to a set of input games by cosine similarity
    between rows of a precomputed items-x-latent-features matrix.
    """

    def __init__(self, svd_matrix, item_to_index, index_to_item):
        """
        svd_matrix: 2D numpy array (items x latent features)
        item_to_index: dict mapping app_id to row index in svd_matrix
        index_to_item: dict mapping row index to app_id
        """
        # Fixed annotation: this attribute holds the reduced matrix itself
        # (as the docstring says), not the fitted TruncatedSVD transformer.
        self.svd_matrix: np.ndarray = svd_matrix
        self.item_to_index = item_to_index
        self.index_to_item = index_to_item

    def save(self, path: str):
        """Save the entire model as a single file using joblib."""
        joblib.dump(self, path)

    @staticmethod
    def load(path: str):
        """Load the entire model from a joblib file."""
        return joblib.load(path)

    def _get_item_vector(self, app_id):
        """Return the latent vector for app_id; raise ValueError if unknown."""
        idx = self.item_to_index.get(app_id)
        if idx is None:
            raise ValueError(f"app_id {app_id} not found in the model.")
        return self.svd_matrix[idx]

    def _cosine_similarity(self, vec, matrix):
        """Cosine similarity between vec and every row of matrix."""
        vec_norm = np.linalg.norm(vec)
        matrix_norms = np.linalg.norm(matrix, axis=1)
        # Small epsilon guards against division by zero for all-zero rows.
        return (matrix @ vec) / (matrix_norms * vec_norm + 1e-10)

    def get_similarities(self, app_ids, top_n=None):
        """
        Input: app_ids - single app_id or list of app_ids
        Output: DataFrame ['app_id', 'collaborative_similarity'] sorted by
        similarity descending, with the input ids excluded. A truthy top_n
        limits the number of returned rows.
        Raises ValueError when none of the inputs are known to the model.
        """
        if isinstance(app_ids, (str, int)):
            app_ids = [app_ids]
        elif not isinstance(app_ids, (list, tuple, np.ndarray)):
            raise TypeError("app_ids must be a string/int or a list of such")

        valid_vectors = []
        missing_ids = []
        for app_id in app_ids:
            try:
                valid_vectors.append(self._get_item_vector(app_id))
            except ValueError:
                missing_ids.append(app_id)

        if not valid_vectors:
            raise ValueError("None of the input app_ids were found in the model.")

        # A multi-game query is represented by the mean of its item vectors.
        aggregated_vec = np.mean(valid_vectors, axis=0)

        # Compute similarity with all items in the catalogue.
        similarities = self._cosine_similarity(aggregated_vec, self.svd_matrix)

        result_df = pd.DataFrame({
            'app_id': [self.index_to_item[i] for i in range(len(similarities))],
            'collaborative_similarity': similarities
        })

        # Exclude the input app_ids themselves from results.
        result_df = result_df[~result_df['app_id'].isin(app_ids)]
        result_df = result_df.sort_values('collaborative_similarity', ascending=False).reset_index(drop=True)

        # If any input app_ids were missing, notify user (best-effort warning).
        if missing_ids:
            print(f"Warning: These app_ids were not found in the model and ignored: {missing_ids}")
        if top_n:
            return result_df.head(top_n)
        return result_df
102
+
103
class GameContentRecommender:
    """Content-based recommender: nearest-neighbour search over game feature vectors."""

    def __init__(self, model, genre_encoder, category_encoder, price_range_encoder, scaler, app_id_encoder):
        # Annotations are quoted: documentation only, never evaluated.
        self.model: "KNeighborsClassifier" = model
        self.genre_encoder: "MultiLabelBinarizer" = genre_encoder
        self.category_encoder: "MultiLabelBinarizer" = category_encoder
        self.price_range_encoder: "LabelEncoder" = price_range_encoder
        self.scaler: "MinMaxScaler" = scaler
        self.app_id_encoder: "LabelEncoder" = app_id_encoder

    def save(self, path: str):
        """Save the entire model as a single file using joblib."""
        joblib.dump(self, path)

    @staticmethod
    def load(path: str):
        """Load the entire model from a joblib file."""
        return joblib.load(path)

    def predict(self, price_range, year_release, average_playtime, game_score, dlc_count, genres, categories, top_n=None):
        """Return DataFrame ['app_id', 'content_probability'] of the nearest games.

        Scores are distances mapped into (0, 1] via 1 / (1 + distance).
        A truthy top_n limits the number of returned rows.
        """
        # One-hot flags against the fitted encoder classes; 'Unknown' and
        # unseen labels are silently ignored.
        genre_flags = {cls: 0 for cls in self.genre_encoder.classes_}
        for g in genres:
            if g != 'Unknown' and g in genre_flags:
                genre_flags[g] = 1

        category_flags = {cls: 0 for cls in self.category_encoder.classes_}
        for c in categories:
            if c != 'Unknown' and c in category_flags:
                category_flags[c] = 1

        encoded_price = self.price_range_encoder.transform(np.array(price_range).reshape(-1, 1))
        numeric = self.scaler.transform(np.array([[year_release, average_playtime, game_score, dlc_count]]))[0]

        # Feature order must match training: scaled numerics, price, genres, categories.
        feature_row = [*numeric, *encoded_price, *genre_flags.values(), *category_flags.values()]
        query = pd.DataFrame([feature_row])

        distances, indices = self.model.kneighbors(query)
        distances, indices = distances.flatten(), indices.flatten()

        result = pd.DataFrame({
            'app_id': self.app_id_encoder.inverse_transform(indices),
            'content_probability': 1 / (1 + distances),
        })
        return result.head(top_n) if top_n else result
157
+
158
+
159
+
160
class TextBasedRecommendation():
    """Text recommender: TF-IDF features + XGBoost classifier over review text."""

    def __init__(self, classifier, vectorizer, app_id_encoder, history):
        # Annotations quoted: documentation only, never evaluated.
        self.classifier: "XGBClassifier" = classifier
        self.vectorizer: "TfidfVectorizer" = vectorizer
        self.app_id_encoder: "LabelEncoder" = app_id_encoder
        self.history = history

    def save(self, path_prefix: str):
        """Persist the XGBoost model natively and the rest of the object via joblib.

        The classifier is saved with XGBoost's own JSON format, then temporarily
        detached so joblib does not pickle it a second time.
        """
        self.classifier.save_model(f"{path_prefix}_xgb.json")

        classifier_backup = self.classifier
        self.classifier = None
        try:
            joblib.dump(self, f"{path_prefix}_preprocessor.joblib")
        finally:
            # Bug fix: restore the classifier even if joblib.dump raises;
            # previously an exception left the instance with classifier=None.
            self.classifier = classifier_backup

    @staticmethod
    def load(path_prefix: str):
        """Rebuild an object saved by save(): joblib part + native XGBoost model."""
        obj = joblib.load(f"{path_prefix}_preprocessor.joblib")
        xgb = XGBClassifier()
        xgb.load_model(f"{path_prefix}_xgb.json")
        obj.classifier = xgb

        return obj

    def preprocess(self, text: str):
        """Lowercase, tokenize, drop stopwords/punctuation/non-alpha, lemmatize.

        Requires the NLTK 'punkt', tagger, 'wordnet' and 'stopwords' resources.
        Returns the cleaned tokens rejoined into a single space-separated string.
        """
        stopword = stopwords.words('english')
        lemmatizer = WordNetLemmatizer()

        def convert_postag(postag: str):
            # Map Penn Treebank tags to WordNet POS codes (default: noun).
            if postag.startswith('V'):
                return 'v'
            elif postag.startswith('R'):
                return 'r'
            elif postag.startswith('J'):
                return 'a'
            return 'n'

        def clean_space(text: str):
            # Collapse all whitespace (incl. newlines) to single spaces.
            if not isinstance(text, str):
                return ''
            cleaned = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()
            return cleaned

        def tokenize(text: str):
            text = text.lower()
            text = clean_space(text)
            token = word_tokenize(text)
            # Keep only alphabetic, non-stopword, non-punctuation tokens.
            token = [word for word in token if word not in
                     string.punctuation and word not in stopword and word.isalpha()]
            return token

        # POS-aware lemmatization gives better lemmas than the noun default.
        def lemmatizing(token):
            postag = pos_tag(token)
            lemmatized = [lemmatizer.lemmatize(word, convert_postag(tag)) for word, tag in postag]
            return lemmatized

        token = tokenize(text)
        token = lemmatizing(token)
        return " ".join(token)

    def get_accuracy(self, X_test, y_test):
        """Print a classification report for raw texts X_test against app_id labels y_test."""
        y_pred = self.classifier.predict(self.vectorizer.transform(X_test))
        y_test = self.app_id_encoder.transform(y_test)
        print(classification_report(y_test, y_pred))

    def predict(self, text, top_n=None):
        """Return DataFrame ['app_id', 'text_probability'] sorted by probability desc.

        top_n (when not None) limits the number of returned classes.
        """
        cleaned_text = self.preprocess(text)
        vectorized_text = self.vectorizer.transform([cleaned_text])
        proba = self.classifier.predict_proba(vectorized_text)[0]
        class_indices = np.argsort(proba)[::-1]
        if top_n is not None:
            class_indices = class_indices[:top_n]
        class_labels = self.app_id_encoder.inverse_transform(class_indices)
        class_probs = proba[class_indices]
        return pd.DataFrame({
            'app_id': class_labels,
            'text_probability': class_probs
        })
240
+
241
class GameRecommendationEnsemble:
    """Weighted ensemble of the three recommenders (text, collaborative, content).

    Each sub-recommender produces an app_id-scored DataFrame; the frames are
    outer-merged on app_id and their (min-max scaled) scores combined with
    normalized user-supplied weights into a 'final_score'.
    """

    def __init__(self,game_content_recommeder,collaborative_recommender,text_based_recommender):
        # NOTE(review): 'recommeder' misspelling kept as-is — code elsewhere
        # in the app accesses this attribute by that exact name.
        self.game_content_recommeder : GameContentRecommender=game_content_recommeder
        self.collaborative_recommender : CollaborativeRecommender=collaborative_recommender
        self.text_based_recommender : TextBasedRecommendation = text_based_recommender

    def save(self, dir_path: str):
        # Each sub-model manages its own serialization format under dir_path.
        os.makedirs(dir_path, exist_ok=True)
        self.game_content_recommeder.save(os.path.join(dir_path, "game_content_recommender.joblib"))
        self.collaborative_recommender.save(os.path.join(dir_path, "collaborative_recommender.joblib"))
        self.text_based_recommender.save(os.path.join(dir_path, "text_based_recommender"))

    @staticmethod
    def load(dir_path: str):
        # Counterpart of save(): rebuild every sub-model from dir_path.
        game_content_recommender = GameContentRecommender.load(os.path.join(dir_path, "game_content_recommender.joblib"))
        collaborative_recommender = CollaborativeRecommender.load(os.path.join(dir_path, "collaborative_recommender.joblib"))
        text_based_recommender = TextBasedRecommendation.load(os.path.join(dir_path, "text_based_recommender"))

        return GameRecommendationEnsemble(
            game_content_recommender,
            collaborative_recommender,
            text_based_recommender
        )

    def scale_proba(self,series):
        """Min-max scale a score Series into [0, 1]; 0/1-element Series map to 1.0."""
        if len(series)<=1:
            return pd.Series([1.0] * len(series), index=series.index)
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(series.values.reshape(-1, 1)).flatten()
        return pd.Series(scaled, index=series.index)

    def predict(self, description=None, app_ids=None, price_range=None, year_release=None,
                average_playtime=None, game_score=None, dlc_count=None,
                genres=None, categories=None, top_n=None,
                weight_text=1.0, weight_collab=1.0, weight_content=1.0):
        """Combine whichever sub-recommenders have enough inputs to run.

        A recommender whose inputs are missing is skipped and its weight is
        forced to 0 before normalization. Returns a DataFrame sorted by
        'final_score' (head(top_n) when top_n is truthy), or None when no
        recommender could run. Raises ValueError if all effective weights
        end up zero.
        """
        merge_dfs = []
        if description is not None:
            text_proba = self.text_based_recommender.predict(description)
            # app_id is cast to str in every frame so the outer merges align.
            text_proba['app_id'] = text_proba['app_id'].astype(str)
            text_proba['text_probability'] = self.scale_proba(text_proba['text_probability'])
            merge_dfs.append(text_proba)
        else:
            weight_text=0

        # Collaborative similarity (only if app_ids is provided)
        if app_ids is not None:
            similar_app = self.collaborative_recommender.get_similarities(app_ids)
            similar_app['app_id'] = similar_app['app_id'].astype(str)
            similar_app['collaborative_similarity'] = self.scale_proba(similar_app['collaborative_similarity'])
            merge_dfs.append(similar_app)
        else:
            weight_collab = 0 # No weight if not used

        # The content model needs every content feature; otherwise skip it.
        if None in (price_range, year_release,average_playtime,game_score,dlc_count, genres, categories):
            weight_content=0
        else:
            similar_content = self.game_content_recommeder.predict(price_range, year_release,average_playtime,game_score,dlc_count, genres, categories)
            similar_content['app_id'] = similar_content['app_id'].astype(str)
            similar_content['content_probability'] = self.scale_proba(similar_content['content_probability'])
            merge_dfs.append(similar_content)

        if not merge_dfs:
            return None

        from functools import reduce
        # Outer merge keeps games recommended by any subset of the models.
        merged = reduce(lambda left, right: pd.merge(left, right, on='app_id', how='outer'), merge_dfs)

        # Fill missing values
        merged = merged.fillna(0)

        # Final score calculation
        def compute_aggregated_score(df, w_text, w_collab, w_content):
            # Normalize weights (prevent divide-by-zero if one or more weights are 0)
            total_weight = w_text + w_collab + w_content
            if total_weight == 0:
                raise ValueError("All weights are zero. At least one weight must be positive.")

            w_text /= total_weight
            w_collab /= total_weight
            w_content /= total_weight

            # df.get falls back to scalar 0 when a model's column is absent.
            df['final_score'] = (
                df.get('text_probability', 0) * w_text +
                df.get('collaborative_similarity', 0) * w_collab +
                df.get('content_probability', 0) * w_content
            )

            return df.sort_values(by='final_score', ascending=False).reset_index(drop=True)
        final_df = compute_aggregated_score(merged, weight_text, weight_collab, weight_content)
        if top_n:
            return final_df.head(top_n)
        else:
            return final_df
app.py ADDED
@@ -0,0 +1,1466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ from component import *
5
+ from GameRecommender import *
6
+ import gc
7
+ from sklearn.model_selection import train_test_split
8
+ from huggingface_hub import snapshot_download
9
+
10
# Dataset repo name -> CSV filename, all hosted under the VJyzCELERY
# HuggingFace namespace (see load_hf_csv_dataset below).
DATASETS = {
    "converted": "converted.csv",
    "Cleaned_games": "Cleaned_games.csv",
    "MergedFragmentData_SAMPLE": "MergedFragmentData_SAMPLE.csv",
    "Trimmed_Dataset": "Trimmed_Dataset.csv",
    "UserPreferenceDF": "UserPreferenceDF.csv",
}
17
+
18
def load_hf_csv_dataset(repo_name, filename):
    """Fetch a CSV from the data/ folder of the VJyzCELERY/<repo_name> HF dataset repo."""
    # Only the 'data' folder of the repo snapshot is downloaded.
    snapshot_dir = snapshot_download(
        repo_id=f"VJyzCELERY/{repo_name}",
        repo_type="dataset",
        allow_patterns=["data/*"],
    )
    target = os.path.join(snapshot_dir, "data", filename)
    print(f"Loading {target} ...")
    return pd.read_csv(target, index_col=False)
28
+
29
# Local data folder (fallback CSV paths) and the remote model snapshot.
DATA_BASE_PATH = 'data'
MODEL_BASE_PATH = snapshot_download(
    repo_id="VJyzCELERY/SteamGameRecommender",
    repo_type="model",
    allow_patterns=["GameRecommender/*"]
)
SEED = 42
# Local CSV paths; used as labels in the Dataset UI components, the actual
# frames below are loaded from the HF datasets instead.
RAW_GAMES_DATAPATH = os.path.join(DATA_BASE_PATH,'converted.csv')
GAMES_DATAPATH = os.path.join(DATA_BASE_PATH,'Cleaned_games.csv')
REVIEWS_DATAPATH = os.path.join(DATA_BASE_PATH,'MergedFragmentData_SAMPLE.csv')
TRIMMED_REVIEW_DATAPATH = os.path.join(DATA_BASE_PATH,'Trimmed_Dataset.csv')
USER_PREFERENCE_DATAPATH = os.path.join(DATA_BASE_PATH,'UserPreferenceDF.csv')
MODEL_PATH = os.path.join(MODEL_BASE_PATH,'GameRecommender')
from datasets import load_dataset

# Download all source datasets from the HF hub.
RAW_GAMES_DS = load_dataset("VJyzCELERY/converted")
GAMES_DS = load_dataset("VJyzCELERY/Cleaned_games")
REVIEWS_DS = load_dataset("VJyzCELERY/MergedFragmentData_SAMPLE")
TRIMMED_REVIEWS_DS = load_dataset("VJyzCELERY/Trimmed_Dataset")
USER_PREF_DS = load_dataset("VJyzCELERY/UserPreferenceDF")


# load dataset

# Load the ensemble and expose the encoder vocabularies the UI needs
# (dropdown choices for genres, categories, price ranges, known app_ids).
model = GameRecommendationEnsemble.load(MODEL_PATH)
vectorizer=model.text_based_recommender.vectorizer
review_app_id_encoder=model.text_based_recommender.app_id_encoder
genres = model.game_content_recommeder.genre_encoder.classes_.tolist()
genres = [genre for genre in genres if genre != 'Unknown']
categories = model.game_content_recommeder.category_encoder.classes_.tolist()
categories = [cat for cat in categories if cat != 'Unknown']
price_ranges = model.game_content_recommeder.price_range_encoder.classes_.tolist()
selectable_app_ids = list(model.collaborative_recommender.item_to_index.keys())
# Local-CSV alternative kept for offline runs:
# df_games = pd.read_csv(GAMES_DATAPATH,index_col=False)
# df_games_raw = pd.read_csv(RAW_GAMES_DATAPATH,index_col=False)
# df_review_raw = pd.read_csv(REVIEWS_DATAPATH,index_col=False)
# df_review_trimmed = pd.read_csv(TRIMMED_REVIEW_DATAPATH,index_col=False)
# df_user_pref = pd.read_csv(USER_PREFERENCE_DATAPATH,index_col=False)

# Materialize the HF datasets as pandas DataFrames for the UI.
df_games = GAMES_DS['train'].to_pandas()
df_games_raw = RAW_GAMES_DS['train'].to_pandas()
df_review_raw = REVIEWS_DS['train'].to_pandas()
df_review_trimmed = TRIMMED_REVIEWS_DS['train'].to_pandas()
df_user_pref = USER_PREF_DS['train'].to_pandas()
# Game names selectable in the collaborative-filter dropdown (must be known
# to the collaborative model's item index).
available_names = df_games[df_games['app_id'].astype(str).isin(selectable_app_ids)]['Name'].tolist()
74
+
75
def extract_year(date_str):
    """Parse the trailing 4-digit year out of a release-date string, else None."""
    if not isinstance(date_str, str) or len(date_str) < 4:
        return None
    tail = date_str[-4:]
    return int(tail) if tail.isdigit() else None
81
def col_to_list(df, col='Genres'):
    """Normalize df[col] in place to lists of stripped strings.

    String cells are parsed as Python literals (e.g. "['Action', 'RPG']");
    any cell that does not end up a list becomes ['Unknown'].
    """
    import ast

    def _parse(cell):
        # Only strings need literal parsing; other values pass through.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    def _normalize(cell):
        if isinstance(cell, list):
            return [item.strip() for item in cell]
        return ['Unknown']

    df[col] = df[col].apply(_parse)
    df[col] = df[col].apply(_normalize)
    return df
90
+
91
def apply_price_range_labels(df, labels, bins, price_col='Price', range_col='Price_range'):
    """Annotate df in place with a categorical price-band column.

    bins and labels follow pandas.cut semantics; intervals are right-inclusive.
    Returns the same (mutated) DataFrame.
    """
    binned = pd.cut(df[price_col], bins=bins, labels=labels, right=True)
    df[range_col] = binned
    return df
95
+
96
# Bin edges for pd.cut (right-inclusive), so (-0.01, 0] captures exactly
# a price of 0 — i.e. free games.
price_bins = [-0.01, 0, 5, 10, 20, 30, 40, 50, float('inf')]
# Human-readable label for each interval above (len(bins) - 1 labels).
price_ranges_labels = [
    "Free",
    "Less than $5",
    "$5 - $9.99",
    "$10 - $19.99",
    "$20 - $29.99",
    "$30 - $39.99",
    "$40 - $49.99",
    "$50+"
]
107
+
108
def recommend_game(description=None, app_name=None, price_range=None, year_release=None,
                   excpected_playtime=None, game_score=None, dlc_count=None,
                   genres=None, categories=None, top_n=5, weight_text=1.0, weight_collab=1.0, weight_content=1.0):
    """Run the ensemble for the UI and return matching games as a gr.DataFrame.

    app_name (a name or list of names) is resolved to app_ids via df_games.
    NOTE: the 'excpected_playtime' spelling is part of the existing signature
    and is kept for compatibility with callers.
    """
    selected_ids = None
    if app_name:
        names = [app_name] if isinstance(app_name, str) else app_name
        selected_ids = df_games[df_games['Name'].isin(names)]['app_id'].astype(str).tolist()

    prediction = model.predict(
        description=description, app_ids=selected_ids, price_range=price_range,
        year_release=year_release, average_playtime=excpected_playtime,
        game_score=game_score, dlc_count=dlc_count, genres=genres,
        categories=categories, top_n=top_n, weight_text=weight_text,
        weight_collab=weight_collab, weight_content=weight_content,
    )

    recommended_ids = prediction['app_id'].tolist()
    output = df_games.loc[df_games['app_id'].astype(str).isin(recommended_ids)].reset_index()
    return gr.DataFrame(value=output)
122
+
123
# Load external CSS file
with open('style.css', 'r') as f:
    custom_css = f.read()  # injected into gr.Blocks(css=...) below
126
+ # for nav
127
def set_active_section(btn_id):
    """
    button active function and handle visibility section

    Shows only the section matching btn_id and marks the matching nav button
    active. Returns visibility updates for every section followed by class
    updates for every nav button (order matches sections then nav_buttons).
    """
    # Bug fix: `sections` is defined as a list of (label, section_id) tuples,
    # so the original `sections.keys()` raised AttributeError whenever a
    # button id matched. Derive the id list in a way that supports both a
    # dict and the tuple list actually used in this app.
    if isinstance(sections, dict):
        section_ids = list(sections.keys())
    else:
        section_ids = [section_id for _, section_id in sections]

    # First set all sections to invisible
    updates = [gr.update(visible=False) for _ in section_ids]

    # Then set the selected section to visible
    if btn_id in section_ids:
        updates[section_ids.index(btn_id)] = gr.update(visible=True)

    # Also update button active states
    button_states = []
    for btn in nav_buttons:
        state = ("active" if btn.elem_id == btn_id else "")
        button_states.append(gr.update(elem_classes=f"nav-btn {state}"))

    return updates + button_states
146
+
147
+ """
148
+ MAIN DEMO
149
+ """
150
+ with gr.Blocks(css = custom_css) as demo:
151
+ # container
152
+ with gr.Row(elem_classes="container"):
153
+ # navbar
154
+ with gr.Sidebar(elem_classes="navbar"):
155
+
156
+ # nav header
157
+ with gr.Column(elem_classes="nav-header"):
158
+ gr.Markdown("# Game Recommendation by Your Preference")
159
+
160
+ # nav button container
161
+ with gr.Column(elem_classes="nav-buttons"):
162
+ # nav button list
163
+ nav_buttons = []
164
+ sections = [
165
+ ('Home', 'home'),
166
+ ("Dataset", "dataset"),
167
+ ("Exploratory Data Analysis", "eda"),
168
+ ("Preprocessing Data", "preprocess"),
169
+ ("Training Result", "training"),
170
+ ("Our System", "system")
171
+ ]
172
+ # create button
173
+ for label, section_id in sections:
174
+ button = gr.Button(label, elem_classes="nav-btn", elem_id=f"btn-{section_id}")
175
+ nav_buttons.append(button)
176
+
177
+ # main content
178
+ with gr.Column(elem_classes="main-content"):
179
+
180
+ # Home Section
181
+ """
182
+ Introduction section. Using header, h2, p for text formating
183
+ """
184
+ with gr.Column(elem_id="home", elem_classes="content-section", visible=True) as home_section:
185
+ header('About This System')
186
+ with gr.Column(elem_classes='content'):
187
+ h2("Background and Problem")
188
+ p('''
189
+ One of the problem when we are looking for something that we want usually we use an abstract description of what we wanted.
190
+ This issue is also prevalent when it comes to finding games. When we ask our friend for a game we usually describe them then later on narrow them down by Genres if possible and Price.
191
+ However, most system only supports the ability to search games by their category and tags such as genres or prices.
192
+ With that, we wanted to try and make a game recommendation based on description where user can describe the game they are looking for with text and later narrow it down with classification based on their content like genres and price ranges.
193
+ ''')
194
+ h2("The Model")
195
+ p("""The system consists of three model :\nThe first one is the Language Model that will learn users review for a game and use that as a way to describe a game.
196
+ The Language Model will be a classifier based on a Gradient Boosting model called XGBClassifier.\n
197
+ The second model and third model will be the filter model.\n
198
+ The second model is a collaborative filter model where it will recommend the user a game based on a game that they have liked in the past or a game that they specify similar to the game they are looking for.
199
+ This model will learn based on other user who have reviewed a game and a similar game is the game that said user liked other than the input game. This model will use utility matrix and cosine similarity.\n
200
+ The third model is a content based model where it will recommend user a game based on their content such as Genres, Categories, Price range, Year Release, etc.\n
201
+ This third model will be a KNeighborsClassifier.""")
202
+ with gr.Column(elem_id="dataset", elem_classes="content-section", visible=False) as dataset_section:
203
+ """
204
+ Dataset Display section. use Dataset()
205
+ will displaying dataframe.
206
+ key attribute is optional
207
+ """
208
+ header('DATASET')
209
+ with gr.Column(elem_classes='datasets-container'):
210
+ Dataset(
211
+ df=df_games_raw,
212
+ title="1. Games Dataset",
213
+ source=GAMES_DATAPATH,
214
+ key="game_data"
215
+ )
216
+ Dataset(
217
+ df=df_review_raw,
218
+ title="2. Steam Review Dataset",
219
+ source=REVIEWS_DATAPATH,
220
+ key="reviews"
221
+ )
222
+
223
+ # eda section
224
+ with gr.Column(elem_id="eda", elem_classes="content-section", visible=False) as eda_section:
225
+ header('EDA System')
226
+
227
+ h2('1. Game Dataset')
228
+ code_cell('df.head(5)')
229
+ gr.Dataframe(df_games_raw.head(5))
230
+ p(f'Dataset shape : {df_games_raw.shape}')
231
+
232
+ h2('2. Description of data')
233
+ code_cell('df.describe()')
234
+ gr.Dataframe(df_games_raw.describe())
235
+
236
+ h2('3. Distribution of data')
237
+ dropdown = gr.Dropdown(choices=list(df_games_raw.columns), label="Select Column for Distribution",value=list(df_games_raw.columns)[0] if len(df_games_raw.columns) > 0 else None,allow_custom_value=True)
238
+ plot_output = gr.Plot(format='png')
239
+ dropdown.change(plot_distribution, inputs=[gr.State(df_games_raw), dropdown], outputs=plot_output)
240
+
241
+ h2('1. Review Dataset')
242
+ code_cell('df.head(5)')
243
+ gr.Dataframe(df_review_raw.head(5))
244
+ p(f'Dataset shape : {df_review_raw.shape}')
245
+
246
+ h2('2. Description of data')
247
+ code_cell('df.describe()')
248
+ gr.Dataframe(df_review_raw.describe())
249
+
250
+ h2('3. Distribution of data')
251
+ dropdown = gr.Dropdown(choices=list(df_review_raw.columns), label="Select Column for Distribution",value=list(df_review_raw.columns)[0] if len(df_review_raw.columns) > 0 else None,allow_custom_value=True)
252
+ plot_output = gr.Plot(format='png')
253
+ dropdown.change(plot_distribution, inputs=[gr.State(df_review_raw), dropdown], outputs=plot_output)
254
+
255
+ # preprocess section
256
+ with gr.Column(elem_id="preprocess", elem_classes="content-section", visible=False) as preprocess_section:
257
+ header('Preprocess System')
258
+ h2("1. Review Dataset initial merging")
259
+ code_cell("""
260
+ import pandas as pd
261
+ import glob
262
+ import os
263
+ from langdetect import detect
264
+ from joblib import Parallel, delayed
265
+ from tqdm import tqdm
266
+ folder_path = 'Fragmented_Dataset'
267
+
268
+ csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
269
+
270
+ df_list = [pd.read_csv(file) for file in csv_files]
271
+ df = pd.concat(df_list, ignore_index=True)
272
+
273
+ min_word = 20
274
+ print(f'shape before filtering : {df.shape}')
275
+ df = df[df['review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
276
+ print(f'shape after filtering : {df.shape}')
277
+
278
+ def detect_lang(text):
279
+ try:
280
+ return detect(str(text))
281
+ except:
282
+ return 'error'
283
+
284
+ results = Parallel(n_jobs=6)(
285
+ delayed(detect_lang)(text) for text in tqdm(df['review'], desc='Detecting Language')
286
+ )
287
+
288
+ df['lang'] = results
289
+
290
+ # Filter English reviews only
291
+ df_english = df[df['lang'] == 'en'].drop(columns=['lang'])
292
+
293
+ df_english.to_csv('english_reviews.csv', index=False)
294
+
295
+ print("Finished filtering English reviews!")
296
+ """)
297
+ h2("Output : ")
298
+ code_cell("""
299
+ >> shape before filtering : (15437471, 13)
300
+ >> shape after filtering : (6531410, 13)
301
+ >> Finished filtering English reviews!
302
+ """)
303
+
304
+
305
+ h2("2. Data Preprocessing")
306
+ h2("2.1. Games Data Cleaning")
307
+ code_cell("""
308
+ game_datapath = 'converted.csv'
309
+ df_games_raw = pd.read_csv('converted.csv',index_col=False)
310
+ df_games_raw.rename(columns={"AppID": "app_id"}, inplace=True)
311
+ df_games_raw["Genres"] = df_games_raw["Genres"].apply(lambda x: x.split(",") if isinstance(x, str) else ['NONE'])
312
+ df_games_raw["Tags"] = df_games_raw["Tags"].apply(lambda x: x.split(",") if isinstance(x, str) else ['NONE'])
313
+ df_games_raw['Genres'] = df_games_raw['Genres']+df_games_raw['Tags']
314
+ def make_set(row):
315
+ data = [d for d in row if d != 'NONE']
316
+ return set(data)
317
+ df_games_raw['Genres'] = df_games_raw['Genres'].apply(make_set)
318
+ genres_to_keep = [
319
+ 'Action', 'Adventure', 'RPG', 'Strategy', 'Simulation',
320
+ 'Casual', 'Indie', 'Sports', 'Racing', 'Fighting',
321
+ 'Puzzle', 'Shooter', 'Platformer', 'MMO', 'Horror',
322
+ 'Survival', 'Open World', 'Visual Novel', 'Point & Click',
323
+ 'Sandbox', 'Metroidvania', 'Tactical', 'Rhythm',
324
+ 'Stealth', 'Rogue-like', 'Rogue-lite'
325
+ ]
326
+ df_games_raw['Genres'] = df_games_raw['Genres'].apply(lambda genre_list: [g for g in genre_list if g in genres_to_keep])
327
+ df_games_raw = df_games_raw[['app_id','Name','Release date','DLC count','Positive','Negative','Average playtime forever','Price','Developers','Publishers','Detailed description','About the game','Short description','Categories','Genres','Achievements','Windows','Mac','Linux']]
328
+ df_games_raw["Categories"] = df_games_raw["Categories"].apply(lambda x: x.split(",") if isinstance(x, str) else ['Unknown'])
329
+ df_games_raw['Detailed description'] = df_games_raw['Detailed description'].fillna('')
330
+ df_games_raw['About the game'] = df_games_raw['About the game'].fillna('')
331
+ df_games_raw['Short description'] = df_games_raw['About the game'].fillna('')
332
+ df_games_raw['Developers'] = df_games_raw['Developers'].fillna('')
333
+ df_games_raw['Publishers'] = df_games_raw['Publishers'].fillna('')
334
+ df_games_raw.to_csv('Cleaned_games.csv',index=False)
335
+ """)
336
+ h2('Games Data Cleaned')
337
+ gr.DataFrame(df_games.head(20))
338
+
339
+ h2('2.2. Review Preprocessing')
340
+ Dataset(df_review_raw,'Review Data Raw',REVIEWS_DATAPATH)
341
+ code_cell("""
342
+ from nltk.tokenize import word_tokenize
343
+ from nltk.corpus import stopwords
344
+ from nltk.stem import WordNetLemmatizer
345
+ from nltk.tag import pos_tag
346
+ import string
347
+ from joblib import Parallel, delayed
348
+ import multiprocessing
349
+ from tqdm import tqdm
350
+ import re
351
+ import nltk
352
+ nltk.download('punkt')
353
+ nltk.download('averaged_perceptron_tagger_eng')
354
+ nltk.download('wordnet')
355
+
356
+ datapath = 'english_reviews.csv'
357
+ df = pd.read_csv(datapath)
358
+
359
+
360
+ stopword = stopwords.words('english')
361
+
362
+ lemmatizer = WordNetLemmatizer()
363
+
364
+ def convert_postag(postag:str):
365
+ if postag.startswith('V'):
366
+ return 'v'
367
+ elif postag.startswith('R'):
368
+ return 'r'
369
+ elif postag.startswith('J'):
370
+ return 'a'
371
+ return 'n'
372
+
373
+ def clean_space(text : str):
374
+ if not isinstance(text, str):
375
+ return ''
376
+ # Replace newlines with space, collapse multiple spaces, strip
377
+ cleaned = re.sub(r'\s+', ' ', text.replace('\\n', ' ')).strip()
378
+ return cleaned
379
+
380
+ def tokenize(text : str):
381
+ text = text.lower() # lowercase the sentence
382
+ text = clean_space(text)
383
+ token = word_tokenize(text) # tokenize
384
+ # remove stopword punctuation and numeric
385
+ token = [word for word in token if word not in
386
+ string.punctuation and word not in stopword and word.isalpha()]
387
+ return token
388
+
389
+
390
+ def lemmatizing(token : str):
391
+ postag = pos_tag(token)
392
+ lemmatized = [lemmatizer.lemmatize(word,convert_postag(tag)) for word,tag in postag]
393
+ return lemmatized
394
+
395
+
396
+ def preprocess(text : str):
397
+ token = tokenize(text)
398
+ token = lemmatizing(token)
399
+ return " ".join(token)
400
+
401
+ num_cores = int(multiprocessing.cpu_count()*0.75)
402
+ print("Cleaning Data . . .")
403
+
404
+ df["cleaned_review"] = Parallel(n_jobs=num_cores)(
405
+ delayed(preprocess)(text) for text in tqdm(df["review"], desc="Processing reviews")
406
+ )
407
+ gc.collect()
408
+ df = df[['steamid','app_id','voted_up','cleaned_review']]
409
+ df.to_csv('Cleaned_Dataframe.csv',index=False)
410
+ """)
411
+ Dataset(df_review_trimmed,'Cleaned Review',source=TRIMMED_REVIEW_DATAPATH,key='trimmed_review')
412
+ min_word=20
413
+ df_review_trimmed_filtered = df_review_trimmed[df_review_trimmed['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
414
+ code_cell("""
415
+ min_word = 20
416
+ df = df[df['cleaned_review'].apply(lambda x: len(str(x).split()) >=min_word)].reset_index(drop=True)
417
+ """)
418
+ code_cell(f"""
419
+ >>> shape before filtering : {df_review_trimmed.shape}
420
+ >>> shape after filtering : {df_review_trimmed_filtered.shape}
421
+ >>> number of unique app_ids : {len(set(df_review_trimmed_filtered['app_id']))}
422
+ """)
423
+ fig, ax = plt.subplots()
424
+ df_review_trimmed_filtered['app_id'].value_counts().plot(kind='bar',ax=ax)
425
+ ax.set_xlabel('app_id')
426
+ ax.set_ylabel('Count')
427
+ ax.set_title('Value Counts of app_id')
428
+ gr.Plot(fig,format='png')
429
+ class_counts = df_review_trimmed_filtered['app_id'].value_counts()
430
+ gr.Dataframe(describe_value_counts(class_counts))
431
+ code_cell("""
432
+ min_row = 4500
433
+ max_row = 5000
434
+
435
+ def sample_group(g):
436
+ if len(g) > max_row:
437
+ return g.sample(n=max_row, random_state=SEED)
438
+ else:
439
+ return g
440
+
441
+ # Filter categories with at least min_row rows
442
+ filtered = df.groupby('app_id').filter(lambda x: len(x) >= min_row)
443
+
444
+ # For each app_id, keep only max_row rows (if more, trim to max_row)
445
+ df = filtered.groupby('app_id', group_keys=False).apply(sample_group).reset_index(drop=True)""")
446
# Per-class row caps used to balance the review dataset before training.
min_row = 4500
max_row = 5000

def sample_group(g):
    """Cap a per-app_id group at ``max_row`` rows.

    Groups with at most ``max_row`` rows pass through unchanged; larger
    groups are randomly down-sampled to ``max_row`` rows, seeded with the
    module-level ``SEED`` so the sampling is reproducible.
    """
    if len(g) <= max_row:
        return g
    return g.sample(n=max_row, random_state=SEED)
454
+ sampled=df_review_trimmed_filtered.groupby('app_id').filter(lambda x:len(x)>= min_row)
455
+ sampled = sampled.groupby('app_id',group_keys=False).apply(sample_group).reset_index(drop=True)
456
+ code_cell(f"""
457
+ Num of class after sampling : {len(set(sampled['app_id']))}
458
+ Shape of the sampled df : {sampled.shape}
459
+ """)
460
+ fig,ax = plt.subplots()
461
+ sampled_class_dist = sampled['app_id'].value_counts()
462
+ sampled_class_dist.plot(kind='bar',ax=ax)
463
+ ax.set_xlabel('app_id')
464
+ ax.set_ylabel('Count')
465
+ ax.set_title('Value Counts of app_id')
466
+ code_cell("""
467
+ df['app_id'].value_counts().plot(kind='bar')
468
+ plt.xlabel('app_id')
469
+ plt.ylabel('Count')
470
+ plt.title('Value Counts of app_id')
471
+ plt.show()
472
+ df.to_csv('Cleaned_Trimmed_Dataset.csv',index=False)""")
473
+ gr.Plot(fig,format='png')
474
+ code_cell("""class_counts = df['app_id'].value_counts()""")
475
+ gr.DataFrame(describe_value_counts(sampled_class_dist))
476
+ h2('Review Preprocessed!')
477
+
478
+ h2('2.3. User Preference Data')
479
+ Dataset(df_review_raw,'User Review Dataset',REVIEWS_DATAPATH)
480
+ code_cell("""
481
+ df_review = df_review[['steamid','appid','voted_up']]
482
+ df_review.to_csv('UserPreferenceDF.csv',index=False)
483
+ """)
484
+ Dataset(df_user_pref,'User Preference Dataset',USER_PREFERENCE_DATAPATH)
485
+ p(f"Dataset Shape : {df_user_pref.shape}")
486
# Keep only positively-voted reviews. Copy the boolean-mask slice so the
# mutations below act on an independent frame rather than a view of
# df_user_pref (avoids pandas SettingWithCopyWarning and silent no-op
# assignments on a chained slice).
df_liked = df_user_pref[df_user_pref['voted_up'] == 1].copy()
# rename(inplace=True) on a slice is the classic chained-assignment trap;
# assign the renamed frame back instead.
df_liked = df_liked.rename(columns={'appid': 'app_id'})
# Normalise dtypes: voted_up as int flag, ids as strings for stable joins.
df_liked['voted_up'] = df_liked['voted_up'].astype(int)
df_liked['steamid'] = df_liked['steamid'].astype(str)
df_liked['app_id'] = df_liked['app_id'].astype(str)
# One row per (user, game) pair.
df_liked = df_liked.drop_duplicates(subset=['steamid', 'app_id'])
492
+ code_cell("""
493
+ df_liked=df_users[df_users['voted_up']==1]
494
+ df_liked.rename(columns={'appid':'app_id'},inplace=True)
495
+ df_liked['voted_up'] = df_liked['voted_up'].astype(int)
496
+ df_liked['steamid'] = df_liked['steamid'].astype(str)
497
+ df_liked['app_id'] = df_liked['app_id'].astype(str)
498
+ df_liked = df_liked.drop_duplicates(subset=['steamid', 'app_id'])
499
+ """)
500
+ h2(f"Dataset Shape : {df_liked.shape}")
501
+ code_cell("""
502
+ # Keep users who liked at least 5 games
503
+ user_counts = df_liked['steamid'].value_counts()
504
+ df_liked = df_liked[df_liked['steamid'].isin(user_counts[user_counts >= 5].index)]
505
+
506
+ # Keep games liked by at least 10 users
507
+ game_counts = df_liked['app_id'].value_counts()
508
+ df_liked = df_liked[df_liked['app_id'].isin(game_counts[game_counts >= 10].index)]
509
+ df_liked = df_liked.drop_duplicates(subset=['steamid', 'app_id'])
510
+ """)
511
+ p(f"Unique steamids: {df_liked['steamid'].nunique()}")
512
+ p(f"Unique app_ids: {df_liked['app_id'].nunique()}")
513
+ p(f"Total rows: {len(df_liked)}")
514
+ p(f"Unique (steamid, app_id) pairs: {df_liked.drop_duplicates(subset=['steamid', 'app_id']).shape[0]}")
515
+ h2("We're done here, next stop is Training!")
516
+
517
+
518
+ # training section
519
+ with gr.Column(elem_id="training", elem_classes="content-section", visible=False) as training_section:
520
+ header('Training Result')
521
+ h2("Language Model Training")
522
+ h2('Dataset')
523
+ gr.Dataframe(sampled.head(15))
524
+ code_cell("""
525
+ vectorizer = TfidfVectorizer(max_df=0.7,min_df=3,stop_words=None,ngram_range=(1,2))
526
+ review_app_id_encoder = LabelEncoder()""")
527
+ train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
528
+ test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
529
+ del df_temp
530
+ gc.collect()
531
+ code_cell("""
532
+ train_df,df_temp = train_test_split(sampled,test_size=0.2,random_state=SEED,stratify=sampled['app_id'])
533
+ test_df,val_df = train_test_split(df_temp,test_size=0.5,random_state=SEED,stratify=df_temp['app_id'])
534
+ """)
535
+ p(f"""
536
+ Training : {train_df.shape}
537
+ Testing : {test_df.shape}
538
+ Validation : {val_df.shape}
539
+ """)
540
+ code_cell("""
541
+ X_train = vectorizer.fit_transform(train_df['cleaned_review'])
542
+ y_train = review_app_id_encoder.fit_transform(train_df['app_id'])
543
+ X_test = vectorizer.transform(test_df['cleaned_review'])
544
+ y_test = review_app_id_encoder.transform(test_df['app_id'])
545
+ X_val = vectorizer.transform(val_df['cleaned_review'])
546
+ y_val = review_app_id_encoder.transform(val_df['app_id'])""")
547
+ p("""The shape of X_train : (656396, 1795889)""")
548
+ code_cell("""
549
+ classifier = XGBClassifier(
550
+ objective='multi:softprob',
551
+ max_depth=4,
552
+ learning_rate=0.2,
553
+ n_estimators=18,
554
+ subsample=0.7,
555
+ colsample_bytree=0.7,
556
+ reg_alpha=1.0,
557
+ reg_lambda=1.0,
558
+ tree_method='hist',
559
+ eval_metric=['mlogloss', 'merror'],
560
+ early_stopping_rounds=10
561
+ )""")
562
+ code_cell("""
563
+ classifier.fit(
564
+ X_train,y_train,
565
+ eval_set=[(X_train, y_train), (X_val, y_val)],
566
+ verbose=True
567
+ )
568
+ """)
569
+ history = model.text_based_recommender.history
570
+
571
+
572
+ h2('Result Training Loss and Error')
573
+ results = {
574
+ "merror": history['validation_0']['merror'],
575
+ "mlogloss": history['validation_0']['mlogloss']
576
+ }
577
+ plot_output = gr.Plot(format='png')
578
+ btn = gr.Button("Generate Plot")
579
+ btn.click(fn=lambda:plot_training_results(results), inputs=[], outputs=plot_output, preprocess=False)
580
+
581
+ h2('Result Validation Loss and Error')
582
+
583
+ resultsval = {
584
+ "merror": history['validation_1']['merror'],
585
+ "mlogloss": history['validation_1']['mlogloss']
586
+ }
587
+ plot_outputval = gr.Plot(format='png')
588
+ btnval = gr.Button("Generate Plot")
589
+ btnval.click(fn=lambda:plot_training_results(resultsval), inputs=[], outputs=plot_outputval, preprocess=False)
590
+ y_pred = model.text_based_recommender.classifier.predict(vectorizer.transform(test_df['cleaned_review']))
591
+ y_test = model.text_based_recommender.app_id_encoder.transform(test_df['app_id'])
592
+ class_report = classification_report(y_test,y_pred)
593
+ h2("Classification Report")
594
+ code_cell(f"""
595
+ {class_report}
596
+ """)
597
+ h2("Language Model Class")
598
+ code_cell("""
599
+ import nltk
600
+ from nltk.tokenize import word_tokenize
601
+ from nltk.corpus import stopwords
602
+ from nltk.stem import WordNetLemmatizer
603
+ from nltk.tag import pos_tag
604
+ import string
605
+ import re
606
+ import os
607
+ nltk.download('punkt')
608
+ nltk.download('averaged_perceptron_tagger_eng')
609
+ nltk.download('wordnet')
610
+
611
+ class TextBasedRecommendation():
612
+ def __init__(self,classifier,vectorizer,app_id_encoder,history):
613
+ self.classifier : XGBClassifier = classifier
614
+ self.vectorizer : TfidfVectorizer = vectorizer
615
+ self.app_id_encoder : LabelEncoder = app_id_encoder
616
+ self.history = history
617
+
618
+ def updateModel(self):
619
+ self.classifier.save_model('xgb_model.json')
620
+ self.classifier.load_model('xgb_model.json')
621
+
622
+ def save(self, path_prefix: str):
623
+ self.classifier.save_model(f"{path_prefix}_xgb.json")
624
+
625
+ classifier_backup = self.classifier
626
+ self.classifier = None
627
+
628
+ joblib.dump(self, f"{path_prefix}_preprocessor.joblib")
629
+
630
+ self.classifier = classifier_backup
631
+
632
+ @staticmethod
633
+ def load(path_prefix: str):
634
+ obj = joblib.load(f"{path_prefix}_preprocessor.joblib")
635
+ xgb = XGBClassifier()
636
+ xgb.load_model(f"{path_prefix}_xgb.json")
637
+ obj.classifier = xgb
638
+
639
+ return obj
640
+
641
+ def preprocess(self,text : str):
642
+ stopword = stopwords.words('english')
643
+ lemmatizer = WordNetLemmatizer()
644
+ def convert_postag(postag:str):
645
+ if postag.startswith('V'):
646
+ return 'v'
647
+ elif postag.startswith('R'):
648
+ return 'r'
649
+ elif postag.startswith('J'):
650
+ return 'a'
651
+ return 'n'
652
+
653
+ def clean_space(text : str):
654
+ if not isinstance(text, str):
655
+ return ''
656
+ cleaned = re.sub(r'\s+', ' ', text.replace('\\n', ' ')).strip()
657
+ return cleaned
658
+
659
+ def tokenize(text : str):
660
+ text = text.lower()
661
+ text = clean_space(text)
662
+ token = word_tokenize(text)
663
+ token = [word for word in token if word not in
664
+ string.punctuation and word not in stopword and word.isalpha()]
665
+ return token
666
+
667
+ # lemmatize
668
+ def lemmatizing(token : str):
669
+ postag = pos_tag(token)
670
+ lemmatized = [lemmatizer.lemmatize(word,convert_postag(tag)) for word,tag in postag]
671
+ return lemmatized
672
+
673
+ token = tokenize(text)
674
+ token = lemmatizing(token)
675
+ return " ".join(token)
676
+
677
+ def get_accuracy(self,X_test,y_test):
678
+ y_pred = self.classifier.predict(self.vectorizer.transform(X_test))
679
+ y_test = self.app_id_encoder.transform(y_test)
680
+ print(classification_report(y_test,y_pred))
681
+
682
+ def predict(self,text,top_n=None):
683
+ cleaned_text = self.preprocess(text)
684
+ vectorized_text = self.vectorizer.transform([cleaned_text])
685
+ proba = self.classifier.predict_proba(vectorized_text)[0]
686
+ class_indices = np.argsort(proba)[::-1]
687
+ if top_n is not None:
688
+ class_indices = class_indices[:top_n]
689
+ class_labels = self.app_id_encoder.inverse_transform(class_indices)
690
+ class_probs = proba[class_indices]
691
+ return pd.DataFrame({
692
+ 'app_id': class_labels,
693
+ 'text_probability': class_probs
694
+ })""")
695
+ h2("Collaborative Filter Training")
696
+ h2("Dataset of User Preference")
697
+ gr.DataFrame(df_liked.head(10))
698
+ p(f"Unique steamids: {df_liked['steamid'].nunique()}")
699
+ p(f"Unique app_ids: {df_liked['app_id'].nunique()}")
700
+ p(f"Total rows: {len(df_liked)}")
701
+ p(f"Unique (steamid, app_id) pairs: {df_liked.drop_duplicates(subset=['steamid', 'app_id']).shape[0]}")
702
+ top_n=3001
703
+ # Top n users with most reviews
704
+ top_users = df_liked['steamid'].value_counts().head(top_n).index
705
+ # Top n games with most reviews
706
+ top_games = df_liked['app_id'].value_counts().head(top_n).index
707
+
708
+ df_liked = df_liked[df_liked['steamid'].isin(top_users) & df_liked['app_id'].isin(top_games)]
709
+
710
+ user_item_matrix = df_liked.pivot_table(
711
+ index='steamid',
712
+ columns='app_id',
713
+ values='voted_up',
714
+ aggfunc='max',
715
+ fill_value=0
716
+ )
717
+ code_cell("""
718
+ top_n=3001
719
+ # Top n users with most reviews
720
+ top_users = df_liked['steamid'].value_counts().head(top_n).index
721
+ # Top n games with most reviews
722
+ top_games = df_liked['app_id'].value_counts().head(top_n).index
723
+
724
+ df_liked = df_liked[df_liked['steamid'].isin(top_users) & df_liked['app_id'].isin(top_games)]
725
+
726
+ user_item_matrix = df_liked.pivot_table(
727
+ index='steamid',
728
+ columns='app_id',
729
+ values='voted_up',
730
+ aggfunc='max',
731
+ fill_value=0
732
+ )
733
+ """)
734
+ gr.Dataframe(user_item_matrix.reset_index().head(10))
735
+ code_cell("""
736
+ from sklearn.decomposition import TruncatedSVD
737
+ X = user_item_matrix.T
738
+
739
+ n_components = 100
740
+
741
+ svd = TruncatedSVD(n_components=n_components, random_state=42)
742
+ item_embeddings = svd.fit_transform(X)
743
+ item_list = list(user_item_matrix.columns)
744
+ unique_items =df_liked['app_id'].unique()
745
+ item_to_index = {item: idx for idx, item in enumerate(unique_items)}
746
+ """)
747
+ h2("Model")
748
+ code_cell("""
749
+
750
+ import numpy as np
751
+ import joblib
752
+ class CollaborativeRecommender:
753
+ def __init__(self, svd_matrix, item_to_index, index_to_item):
754
+ \"""
755
+ svd_matrix: 2D numpy array (items x latent features)
756
+ item_to_index: dict mapping app_id to row index in svd_matrix
757
+ index_to_item: dict mapping row index to app_id
758
+ \"""
759
+ self.svd_matrix : TruncatedSVD = svd_matrix
760
+ self.item_to_index = item_to_index
761
+ self.index_to_item = index_to_item
762
+
763
+ def save(self, path: str):
764
+ \"""Save the entire model as a single file using joblib.\"""
765
+ joblib.dump(self, path)
766
+
767
+ @staticmethod
768
+ def load(path: str):
769
+ \"""Load the entire model from a joblib file.\"""
770
+ return joblib.load(path)
771
+
772
+ def _get_item_vector(self, app_id):
773
+ idx = self.item_to_index.get(app_id)
774
+ if idx is None:
775
+ raise ValueError(f"app_id {app_id} not found in the model.")
776
+ return self.svd_matrix[idx]
777
+
778
+ def _cosine_similarity(self, vec, matrix):
779
+ # Cosine similarity between vec and all rows in matrix
780
+ vec_norm = np.linalg.norm(vec)
781
+ matrix_norms = np.linalg.norm(matrix, axis=1)
782
+ similarity = (matrix @ vec) / (matrix_norms * vec_norm + 1e-10)
783
+ return similarity
784
+
785
+ def get_similarities(self, app_ids,top_n=None):
786
+ \"""
787
+ Input: app_ids - single app_id or list of app_ids
788
+ Output: DataFrame with columns ['app_id', 'similarity'] sorted by similarity descending
789
+ \"""
790
+ if isinstance(app_ids, (str, int)):
791
+ app_ids = [app_ids]
792
+ elif not isinstance(app_ids, (list, tuple, np.ndarray)):
793
+ raise TypeError("app_ids must be a string/int or a list of such")
794
+
795
+ valid_vectors = []
796
+ missing_ids = []
797
+ for app_id in app_ids:
798
+ try:
799
+ vec = self._get_item_vector(app_id)
800
+ valid_vectors.append(vec)
801
+ except ValueError:
802
+ missing_ids.append(app_id)
803
+
804
+ if len(valid_vectors) == 0:
805
+ raise ValueError("None of the input app_ids were found in the model.")
806
+
807
+ # Aggregate vectors by averaging if multiple inputs
808
+ aggregated_vec = np.mean(valid_vectors, axis=0)
809
+
810
+ # Compute similarity with all items
811
+ similarities = self._cosine_similarity(aggregated_vec, self.svd_matrix)
812
+
813
+ # Build DataFrame of results
814
+ result_df = pd.DataFrame({
815
+ 'app_id': [self.index_to_item[i] for i in range(len(similarities))],
816
+ 'collaborative_similarity': similarities
817
+ })
818
+
819
+ # Exclude the input app_ids themselves from results
820
+ result_df = result_df[~result_df['app_id'].isin(app_ids)]
821
+
822
+ # Sort descending by similarity
823
+ result_df = result_df.sort_values('collaborative_similarity', ascending=False).reset_index(drop=True)
824
+
825
+ # If any input app_ids were missing, notify user (optional)
826
+ if missing_ids:
827
+ print(f"Warning: These app_ids were not found in the model and ignored: {missing_ids}")
828
+ if top_n:
829
+ return result_df.head(top_n)
830
+ else:
831
+ return result_df""")
832
+ h2("Content Based Model")
833
+ code_cell("""
834
+ def col_to_list(df,col='Genres'):
835
+ import ast
836
+ df[col]=df[col].apply(
837
+ lambda x: ast.literal_eval(x) if isinstance(x, str) else x
838
+ )
839
+ df[col]=df[col].apply(
840
+ lambda genres: [g.strip() for g in genres] if isinstance(genres, list) else ['Unknown']
841
+ )
842
+ return df
843
+
844
+ def apply_price_range_labels(df,labels,bins, price_col='Price', range_col='Price_range'):
845
+ df[range_col] = pd.cut(df[price_col], bins=bins, labels=labels, right=True)
846
+
847
+ return df
848
+
849
+ price_bins = [-0.01, 0, 5, 10, 20, 30, 40, 50, float('inf')]
850
+ price_labels = [
851
+ "Free",
852
+ "Less than $5",
853
+ "$5 - $9.99",
854
+ "$10 - $19.99",
855
+ "$20 - $29.99",
856
+ "$30 - $39.99",
857
+ "$40 - $49.99",
858
+ "$50+"
859
+ ]
860
+
861
+ df = pd.read_csv("Cleaned_games.csv",index_col=False)
862
+ df = col_to_list(df,'Genres')
863
+ df = col_to_list(df,'Categories')
864
+ df = apply_price_range_labels(df,price_labels,price_bins)
865
+ """)
866
+ Dataset(df_games,"The game dataset",GAMES_DATAPATH)
867
+ df_games_temp = df_games
868
+ df_games_temp = col_to_list(df_games_temp,'Genres')
869
+ df_games_temp = col_to_list(df_games_temp,'Categories')
870
+ df_games_temp = apply_price_range_labels(df_games_temp,price_ranges_labels,price_bins)
871
+ df_games_temp['Year_Release'] = df_games_temp['Release date'].apply(extract_year)
872
+ df_games_temp['Game score'] = np.where(
873
+ (df_games_temp['Positive'] + df_games_temp['Negative']) == 0,
874
+ 0,
875
+ (df_games_temp['Positive'] / (df_games_temp['Positive'] + df_games_temp['Negative'])) * 100
876
+ )
877
+
878
+ code_cell("""
879
+ def extract_year(date_str):
880
+ if isinstance(date_str, str) and len(date_str) >= 4:
881
+ year_str = date_str[-4:]
882
+ if year_str.isdigit():
883
+ return int(year_str)
884
+ return None
885
+
886
+ df['Year_Release'] = df['Release date'].apply(extract_year)
887
+ df['Game score'] = np.where(
888
+ (df['Positive'] + df['Negative']) == 0,
889
+ 0,
890
+ (df['Positive'] / (df['Positive'] + df['Negative'])) * 100
891
+ )""")
892
+ from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
893
+ genre_mlb = MultiLabelBinarizer()
894
+ genre_mlb = genre_mlb.fit(df_games_temp['Genres'])
895
+ categories_mlb = MultiLabelBinarizer()
896
+ categories_mlb = categories_mlb.fit(df_games_temp['Categories'])
897
+ price_range_le = model.game_content_recommeder.price_range_encoder
898
+ scaler = MinMaxScaler()
899
+ scaler = scaler.fit(df_games_temp[['Year_Release','Average playtime forever','Game score','DLC count']].values)
900
+ app_id_le = LabelEncoder()
901
+ app_id_le = app_id_le.fit(df_games_temp['app_id'])
902
+ numerical_col =['Year_Release','Average playtime forever','Game score','DLC count']
903
+
904
+ genre_matrix = genre_mlb.transform(df_games_temp['Genres'])
905
+ genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df_games_temp.index)
906
+ categories_matrix = categories_mlb.transform(df_games_temp['Categories'])
907
+ categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df_games_temp.index)
908
+ game_df = pd.concat([df_games_temp[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)
909
+ game_df['Price_range'] = price_range_le.transform(game_df['Price_range'])
910
+ game_df[numerical_col] = scaler.transform(game_df[numerical_col].values)
911
+ code_cell("""
912
+ from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
913
+ genre_mlb = MultiLabelBinarizer()
914
+ genre_mlb = genre_mlb.fit(df['Genres'])
915
+ categories_mlb = MultiLabelBinarizer()
916
+ categories_mlb = categories_mlb.fit(df['Categories'])
917
+ price_range_le = LabelEncoder()
918
+ price_range_le = price_range_le.fit(price_labels)
919
+ scaler = MinMaxScaler()
920
+ scaler = scaler.fit(df[['Year_Release','Average playtime forever','Game score','DLC count']].values)
921
+ app_id_le = LabelEncoder()
922
+ app_id_le = app_id_le.fit(df['app_id'])
923
+ numerical_col =['Year_Release','Average playtime forever','Game score','DLC count']""")
924
+
925
+ code_cell("""
926
+ genre_matrix = genre_mlb.transform(df['Genres'])
927
+ genre_df = pd.DataFrame(genre_matrix, columns=genre_mlb.classes_, index=df.index)
928
+ categories_matrix = categories_mlb.transform(df['Categories'])
929
+ categories_df = pd.DataFrame(categories_matrix,columns=categories_mlb.classes_,index=df.index)
930
+ game_df = pd.concat([df[['app_id','Price_range']+numerical_col],genre_df,categories_df],axis=1)""")
931
+ gr.Dataframe(game_df.head(10))
932
+ code_cell("""
933
+ from sklearn.neighbors import KNeighborsClassifier
934
+ X = game_df.loc[:,['Year_Release','Average playtime forever','Game score','DLC count','Price_range']+ list(genre_mlb.classes_) + list(categories_mlb.classes_)]
935
+ y = app_id_le.transform(game_df['app_id'])
936
+
937
+ model = KNeighborsClassifier(n_neighbors=len(y), metric='cosine')
938
+ model.fit(X.values,y)
939
+ """)
940
+ h2("Content Based Recommender Class")
941
+ code_cell("""
942
+ class GameContentRecommender:
943
+ def __init__(self,model,genre_encoder,category_encoder,price_range_encoder,scaler,app_id_encoder):
944
+ self.model : KNeighborsClassifier = model
945
+ self.genre_encoder : MultiLabelBinarizer = genre_encoder
946
+ self.category_encoder : MultiLabelBinarizer = category_encoder
947
+ self.price_range_encoder : LabelEncoder = price_range_encoder
948
+ self.scaler : MinMaxScaler = scaler
949
+ self.app_id_encoder : LabelEncoder = app_id_encoder
950
+
951
+ def save(self, path: str):
952
+ \"""Save the entire model as a single file using joblib.\"""
953
+ joblib.dump(self, path)
954
+
955
+ @staticmethod
956
+ def load(path: str):
957
+ \"""Load the entire model from a joblib file.\"""
958
+ return joblib.load(path)
959
+
960
+ def predict(self, price_range, year_release, average_playtime, game_score, dlc_count, genres, categories, top_n=None):
961
+ # Create one-hot encoded genre and category dicts
962
+ genre_dict = {g: 0 for g in self.genre_encoder.classes_}
963
+ categories_dict = {c: 0 for c in self.category_encoder.classes_}
964
+
965
+ for genre in genres:
966
+ if genre != 'Unknown' and genre in genre_dict:
967
+ genre_dict[genre] = 1
968
+
969
+ for category in categories:
970
+ if category != 'Unknown' and category in categories_dict:
971
+ categories_dict[category] = 1
972
+
973
+ # Encode and normalize numeric features
974
+ price_range = self.price_range_encoder.transform(np.array(price_range).reshape(-1, 1))
975
+ scaled_features = self.scaler.transform(np.array([[year_release, average_playtime, game_score, dlc_count]]))[0]
976
+
977
+ user_vector = list(scaled_features) + list(price_range) + list(genre_dict.values()) + list(categories_dict.values())
978
+
979
+ # Prepare DataFrame for KNN
980
+ user_df = pd.DataFrame([user_vector])
981
+
982
+ # Get KNN results
983
+ distances, indices = self.model.kneighbors(user_df)
984
+ distances = distances.flatten()
985
+ indices = indices.flatten()
986
+
987
+ # Convert distances to similarity scores
988
+ similarity = 1 / (1 + distances)
989
+
990
+ # Decode app_ids
991
+ app_ids = self.app_id_encoder.inverse_transform(indices)
992
+
993
+ prediction = pd.DataFrame({
994
+ 'app_id': app_ids,
995
+ 'content_probability': similarity
996
+ })
997
+
998
+ if top_n:
999
+ prediction = prediction.head(top_n)
1000
+
1001
+ return prediction
1002
+
1003
+ """)
1004
+ h2("After finishing with individual model we finally ensemble them together")
1005
+ code_cell("""
1006
+ import numpy as np
1007
+ import pandas as pd
1008
+ from sklearn.neighbors import KNeighborsClassifier
1009
+ from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder,MinMaxScaler
1010
+ from sklearn.feature_extraction.text import TfidfVectorizer
1011
+ import joblib
1012
+ from sklearn.decomposition import TruncatedSVD
1013
+ from sklearn.metrics import classification_report
1014
+ from xgboost import XGBClassifier
1015
+ import nltk
1016
+ from nltk.tokenize import word_tokenize
1017
+ from nltk.corpus import stopwords
1018
+ from nltk.stem import WordNetLemmatizer
1019
+ from nltk.tag import pos_tag
1020
+ import string
1021
+ import re
1022
+ import os
1023
+ nltk.download('punkt')
1024
+ nltk.download('averaged_perceptron_tagger_eng')
1025
+ nltk.download('wordnet')
1026
+
1027
+ class CollaborativeRecommender:
1028
+ def __init__(self, svd_matrix, item_to_index, index_to_item):
1029
+ \"""
1030
+ svd_matrix: 2D numpy array (items x latent features)
1031
+ item_to_index: dict mapping app_id to row index in svd_matrix
1032
+ index_to_item: dict mapping row index to app_id
1033
+ \"""
1034
+ self.svd_matrix : TruncatedSVD = svd_matrix
1035
+ self.item_to_index = item_to_index
1036
+ self.index_to_item = index_to_item
1037
+
1038
+ def save(self, path: str):
1039
+ \"""Save the entire model as a single file using joblib.\"""
1040
+ joblib.dump(self, path)
1041
+
1042
+ @staticmethod
1043
+ def load(path: str):
1044
+ \"""Load the entire model from a joblib file.\"""
1045
+ return joblib.load(path)
1046
+
1047
+ def _get_item_vector(self, app_id):
1048
+ idx = self.item_to_index.get(app_id)
1049
+ if idx is None:
1050
+ raise ValueError(f"app_id {app_id} not found in the model.")
1051
+ return self.svd_matrix[idx]
1052
+
1053
+ def _cosine_similarity(self, vec, matrix):
1054
+ # Cosine similarity between vec and all rows in matrix
1055
+ vec_norm = np.linalg.norm(vec)
1056
+ matrix_norms = np.linalg.norm(matrix, axis=1)
1057
+ similarity = (matrix @ vec) / (matrix_norms * vec_norm + 1e-10)
1058
+ return similarity
1059
+
1060
+ def get_similarities(self, app_ids,top_n=None):
1061
+ \"""
1062
+ Input: app_ids - single app_id or list of app_ids
1063
+ Output: DataFrame with columns ['app_id', 'similarity'] sorted by similarity descending
1064
+ \"""
1065
+ if isinstance(app_ids, (str, int)):
1066
+ app_ids = [app_ids]
1067
+ elif not isinstance(app_ids, (list, tuple, np.ndarray)):
1068
+ raise TypeError("app_ids must be a string/int or a list of such")
1069
+
1070
+ valid_vectors = []
1071
+ missing_ids = []
1072
+ for app_id in app_ids:
1073
+ try:
1074
+ vec = self._get_item_vector(app_id)
1075
+ valid_vectors.append(vec)
1076
+ except ValueError:
1077
+ missing_ids.append(app_id)
1078
+
1079
+ if len(valid_vectors) == 0:
1080
+ raise ValueError("None of the input app_ids were found in the model.")
1081
+
1082
+ # Aggregate vectors by averaging if multiple inputs
1083
+ aggregated_vec = np.mean(valid_vectors, axis=0)
1084
+
1085
+ # Compute similarity with all items
1086
+ similarities = self._cosine_similarity(aggregated_vec, self.svd_matrix)
1087
+
1088
+ # Build DataFrame of results
1089
+ result_df = pd.DataFrame({
1090
+ 'app_id': [self.index_to_item[i] for i in range(len(similarities))],
1091
+ 'collaborative_similarity': similarities
1092
+ })
1093
+
1094
+ # Exclude the input app_ids themselves from results
1095
+ result_df = result_df[~result_df['app_id'].isin(app_ids)]
1096
+
1097
+ # Sort descending by similarity
1098
+ result_df = result_df.sort_values('collaborative_similarity', ascending=False).reset_index(drop=True)
1099
+
1100
+ # If any input app_ids were missing, notify user (optional)
1101
+ if missing_ids:
1102
+ print(f"Warning: These app_ids were not found in the model and ignored: {missing_ids}")
1103
+ if top_n:
1104
+ return result_df.head(top_n)
1105
+ else:
1106
+ return result_df
1107
+
1108
+ class GameContentRecommender:
1109
+ def __init__(self,model,genre_encoder,category_encoder,price_range_encoder,scaler,app_id_encoder):
1110
+ self.model : KNeighborsClassifier = model
1111
+ self.genre_encoder : MultiLabelBinarizer = genre_encoder
1112
+ self.category_encoder : MultiLabelBinarizer = category_encoder
1113
+ self.price_range_encoder : LabelEncoder = price_range_encoder
1114
+ self.scaler : MinMaxScaler = scaler
1115
+ self.app_id_encoder : LabelEncoder = app_id_encoder
1116
+
1117
+ def save(self, path: str):
1118
+ \"""Save the entire model as a single file using joblib.\"""
1119
+ joblib.dump(self, path)
1120
+
1121
+ @staticmethod
1122
+ def load(path: str):
1123
+ \"""Load the entire model from a joblib file.\"""
1124
+ return joblib.load(path)
1125
+
1126
+ def predict(self, price_range, year_release, average_playtime, game_score, dlc_count, genres, categories, top_n=None):
1127
+ genre_dict = {g: 0 for g in self.genre_encoder.classes_}
1128
+ categories_dict = {c: 0 for c in self.category_encoder.classes_}
1129
+
1130
+ for genre in genres:
1131
+ if genre != 'Unknown' and genre in genre_dict:
1132
+ genre_dict[genre] = 1
1133
+
1134
+ for category in categories:
1135
+ if category != 'Unknown' and category in categories_dict:
1136
+ categories_dict[category] = 1
1137
+
1138
+ price_range = self.price_range_encoder.transform(np.array(price_range).reshape(-1, 1))
1139
+ scaled_features = self.scaler.transform(np.array([[year_release, average_playtime, game_score, dlc_count]]))[0]
1140
+
1141
+ user_vector = list(scaled_features) + list(price_range) + list(genre_dict.values()) + list(categories_dict.values())
1142
+
1143
+ user_df = pd.DataFrame([user_vector])
1144
+
1145
+ distances, indices = self.model.kneighbors(user_df)
1146
+ distances = distances.flatten()
1147
+ indices = indices.flatten()
1148
+
1149
+ similarity = 1 / (1 + distances)
1150
+
1151
+ app_ids = self.app_id_encoder.inverse_transform(indices)
1152
+
1153
+ prediction = pd.DataFrame({
1154
+ 'app_id': app_ids,
1155
+ 'content_probability': similarity
1156
+ })
1157
+
1158
+ if top_n:
1159
+ prediction = prediction.head(top_n)
1160
+
1161
+ return prediction
1162
+
1163
+
1164
+
1165
+ class TextBasedRecommendation():
1166
+ def __init__(self,classifier,vectorizer,app_id_encoder,history):
1167
+ self.classifier : XGBClassifier = classifier
1168
+ self.vectorizer : TfidfVectorizer = vectorizer
1169
+ self.app_id_encoder : LabelEncoder = app_id_encoder
1170
+ self.history = history
1171
+
1172
+ def save(self, path_prefix: str):
1173
+ self.classifier.save_model(f"{path_prefix}_xgb.json")
1174
+
1175
+ classifier_backup = self.classifier
1176
+ self.classifier = None
1177
+
1178
+ joblib.dump(self, f"{path_prefix}_preprocessor.joblib")
1179
+
1180
+ self.classifier = classifier_backup
1181
+
1182
+ @staticmethod
1183
+ def load(path_prefix: str):
1184
+ obj = joblib.load(f"{path_prefix}_preprocessor.joblib")
1185
+ xgb = XGBClassifier()
1186
+ xgb.load_model(f"{path_prefix}_xgb.json")
1187
+ obj.classifier = xgb
1188
+
1189
+ return obj
1190
+
1191
+ def preprocess(self,text : str):
1192
+ stopword = stopwords.words('english')
1193
+ lemmatizer = WordNetLemmatizer()
1194
+ def convert_postag(postag:str):
1195
+ if postag.startswith('V'):
1196
+ return 'v'
1197
+ elif postag.startswith('R'):
1198
+ return 'r'
1199
+ elif postag.startswith('J'):
1200
+ return 'a'
1201
+ return 'n'
1202
+
1203
+ def clean_space(text : str):
1204
+ if not isinstance(text, str):
1205
+ return ''
1206
+ cleaned = re.sub(r'\s+', ' ', text.replace('\\n', ' ')).strip()
1207
+ return cleaned
1208
+
1209
+ def tokenize(text : str):
1210
+ text = text.lower()
1211
+ text = clean_space(text)
1212
+ token = word_tokenize(text)
1213
+ token = [word for word in token if word not in
1214
+ string.punctuation and word not in stopword and word.isalpha()]
1215
+ return token
1216
+
1217
+ # lemmatize
1218
+ def lemmatizing(token : str):
1219
+ postag = pos_tag(token)
1220
+ lemmatized = [lemmatizer.lemmatize(word,convert_postag(tag)) for word,tag in postag]
1221
+ return lemmatized
1222
+
1223
+ token = tokenize(text)
1224
+ token = lemmatizing(token)
1225
+ return " ".join(token)
1226
+
1227
+ def get_accuracy(self,X_test,y_test):
1228
+ y_pred = self.classifier.predict(self.vectorizer.transform(X_test))
1229
+ y_test = self.app_id_encoder.transform(y_test)
1230
+ print(classification_report(y_test,y_pred))
1231
+
1232
+ def predict(self,text,top_n=None):
1233
+ cleaned_text = self.preprocess(text)
1234
+ vectorized_text = self.vectorizer.transform([cleaned_text])
1235
+ proba = self.classifier.predict_proba(vectorized_text)[0]
1236
+ class_indices = np.argsort(proba)[::-1]
1237
+ if top_n is not None:
1238
+ class_indices = class_indices[:top_n]
1239
+ class_labels = self.app_id_encoder.inverse_transform(class_indices)
1240
+ class_probs = proba[class_indices]
1241
+ return pd.DataFrame({
1242
+ 'app_id': class_labels,
1243
+ 'text_probability': class_probs
1244
+ })
1245
+
1246
+ class GameRecommendationEnsemble:
1247
+ def __init__(self,game_content_recommeder,collaborative_recommender,text_based_recommender):
1248
+ self.game_content_recommeder : GameContentRecommender=game_content_recommeder
1249
+ self.collaborative_recommender : CollaborativeRecommender=collaborative_recommender
1250
+ self.text_based_recommender : TextBasedRecommendation = text_based_recommender
1251
+
1252
+ def save(self, dir_path: str):
1253
+ os.makedirs(dir_path, exist_ok=True)
1254
+ self.game_content_recommeder.save(os.path.join(dir_path, "game_content_recommender.joblib"))
1255
+ self.collaborative_recommender.save(os.path.join(dir_path, "collaborative_recommender.joblib"))
1256
+ self.text_based_recommender.save(os.path.join(dir_path, "text_based_recommender"))
1257
+
1258
+ @staticmethod
1259
+ def load(dir_path: str):
1260
+ game_content_recommender = GameContentRecommender.load(os.path.join(dir_path, "game_content_recommender.joblib"))
1261
+ collaborative_recommender = CollaborativeRecommender.load(os.path.join(dir_path, "collaborative_recommender.joblib"))
1262
+ text_based_recommender = TextBasedRecommendation.load(os.path.join(dir_path, "text_based_recommender"))
1263
+
1264
+ return GameRecommendationEnsemble(
1265
+ game_content_recommender,
1266
+ collaborative_recommender,
1267
+ text_based_recommender
1268
+ )
1269
+
1270
+ def scale_proba(self,series):
1271
+ if len(series)<=1:
1272
+ return pd.Series([1.0] * len(series), index=series.index)
1273
+ scaler = MinMaxScaler()
1274
+ scaled = scaler.fit_transform(series.values.reshape(-1, 1)).flatten()
1275
+ return pd.Series(scaled, index=series.index)
1276
+
1277
+ def predict(self, description=None, app_ids=None, price_range=None, year_release=None,
1278
+ average_playtime=None, game_score=None, dlc_count=None,
1279
+ genres=None, categories=None, top_n=None,
1280
+ weight_text=1.0, weight_collab=1.0, weight_content=1.0):
1281
+
1282
+ merge_dfs = []
1283
+ if description is not None:
1284
+ text_proba = self.text_based_recommender.predict(description)
1285
+ text_proba['app_id'] = text_proba['app_id'].astype(str)
1286
+ text_proba['text_probability'] = self.scale_proba(text_proba['text_probability'])
1287
+ merge_dfs.append(text_proba)
1288
+ else:
1289
+ weight_text=0
1290
+
1291
+ # Collaborative similarity (only if app_ids is provided)
1292
+ if app_ids is not None:
1293
+ similar_app = self.collaborative_recommender.get_similarities(app_ids)
1294
+ similar_app['app_id'] = similar_app['app_id'].astype(str)
1295
+ similar_app['collaborative_similarity'] = self.scale_proba(similar_app['collaborative_similarity'])
1296
+ merge_dfs.append(similar_app)
1297
+ else:
1298
+ weight_collab = 0 # No weight if not used
1299
+
1300
+ if None in (price_range, year_release,average_playtime,game_score,dlc_count, genres, categories):
1301
+ weight_content=0
1302
+ else:
1303
+ similar_content = self.game_content_recommeder.predict(price_range, year_release,average_playtime,game_score,dlc_count, genres, categories)
1304
+ similar_content['app_id'] = similar_content['app_id'].astype(str)
1305
+ similar_content['content_probability'] = self.scale_proba(similar_content['content_probability'])
1306
+ merge_dfs.append(similar_content)
1307
+
1308
+ if not merge_dfs:
1309
+ return None
1310
+
1311
+ from functools import reduce
1312
+ merged = reduce(lambda left, right: pd.merge(left, right, on='app_id', how='outer'), merge_dfs)
1313
+
1314
+ # Fill missing values
1315
+ merged = merged.fillna(0)
1316
+
1317
+ # Final score calculation
1318
+ def compute_aggregated_score(df, w_text, w_collab, w_content):
1319
+ # Normalize weights (prevent divide-by-zero if one or more weights are 0)
1320
+ total_weight = w_text + w_collab + w_content
1321
+ if total_weight == 0:
1322
+ raise ValueError("All weights are zero. At least one weight must be positive.")
1323
+
1324
+ w_text /= total_weight
1325
+ w_collab /= total_weight
1326
+ w_content /= total_weight
1327
+
1328
+ df['final_score'] = (
1329
+ df.get('text_probability', 0) * w_text +
1330
+ df.get('collaborative_similarity', 0) * w_collab +
1331
+ df.get('content_probability', 0) * w_content
1332
+ )
1333
+
1334
+ return df.sort_values(by='final_score', ascending=False).reset_index(drop=True)
1335
+ final_df = compute_aggregated_score(merged, weight_text, weight_collab, weight_content)
1336
+ if top_n:
1337
+ return final_df.head(top_n)
1338
+ else:
1339
+ return final_df
1340
+ """)
1341
+
1342
+
1343
+
1344
+
1345
+
1346
+
1347
+ # Recommendation system
1348
+ with gr.Column(elem_id="system", elem_classes='content-section', visible=False) as system_section:
1349
+ # special for this section
1350
+ gr.HTML('<h1 class="header-title">Game Recommendation System</h1>', elem_id='system')
1351
+ with gr.Row():
1352
+ with gr.Column(min_width=500, elem_classes='input-column'):
1353
+
1354
+ app_name = input_choice(
1355
+ Label='Select games that you liked',
1356
+ Choices=available_names,
1357
+ Multiselect=True
1358
+ )
1359
+
1360
+ year = input_number(
1361
+ Label='Year Release',
1362
+ Precision=0,
1363
+ minimum=0
1364
+ )
1365
+
1366
+ expected_playtime = input_number(
1367
+ Label='Expected Playtime (Hours)',
1368
+ Precision=2,
1369
+ minimum=0
1370
+ )
1371
+
1372
+ expected_score = input_number(
1373
+ Label='Expected Score (% Positive)',
1374
+ Precision=2,
1375
+ minimum=0
1376
+ )
1377
+
1378
+ dlc_count = input_number(
1379
+ Label='DLC Count',
1380
+ Precision=0,
1381
+ minimum=0
1382
+ )
1383
+
1384
+ description = input_paragaph_textbox('Description', 'Describe the game (max 1200 characters)...')
1385
+
1386
+ genre = input_choice(
1387
+ Label="Select Your Genre (Multiple Choice)",
1388
+ Choices=genres,
1389
+ Multiselect=True
1390
+ )
1391
+
1392
+ categories = input_choice(
1393
+ Label="Select Your Categories (Multiple Choice)",
1394
+ Choices=categories,
1395
+ Multiselect=True
1396
+ )
1397
+
1398
+ # single selection (multiselect=False)
1399
+ price_range = input_choice(
1400
+ Label="Select Your Price Range (Only Single Choice)",
1401
+ Choices=price_ranges,
1402
+ Multiselect=False
1403
+ )
1404
+
1405
+ top_n= input_number(
1406
+ Label='Output amount',
1407
+ Precision=0,
1408
+ minimum=0,
1409
+ value=10
1410
+ )
1411
+ weight_text = input_number(
1412
+ Label='Weight Text',
1413
+ Precision=2,
1414
+ minimum=0,
1415
+ maximum=1,
1416
+ value=1
1417
+ )
1418
+ weight_collab = input_number(
1419
+ Label='Weight Of Collaborative Model',
1420
+ Precision=2,
1421
+ minimum=0,
1422
+ maximum=1,
1423
+ value=0.03
1424
+ )
1425
+ weight_content = input_number(
1426
+ Label='Weight Of Content Based Model',
1427
+ Precision=2,
1428
+ minimum=0,
1429
+ maximum=1,
1430
+ value=0.03
1431
+ )
1432
+ submit_btn = gr.Button("Get Recommendations", variant="primary", elem_id="submit-btn")
1433
+
1434
+ # Results column
1435
+ with gr.Column(min_width=500, elem_classes='results-column'):
1436
+ h2('Result')
1437
+ with gr.Column(elem_id='Output'):
1438
+ # Results column using the modular component
1439
+ h2('Recommended Game')
1440
+ recommended_game = gr.DataFrame()
1441
+ # click button logic
1442
+ submit_btn.click(
1443
+ fn=recommend_game,
1444
+ inputs=[description,app_name,price_range,year,expected_playtime,expected_score,dlc_count, genre, categories,top_n,weight_text,weight_collab,weight_content],
1445
+ outputs=recommended_game
1446
+ )
1447
+
1448
+ # Navigation logic
1449
+ sections = {
1450
+ "btn-home": home_section,
1451
+ "btn-dataset": dataset_section,
1452
+ "btn-eda": eda_section,
1453
+ "btn-preprocess": preprocess_section,
1454
+ "btn-training": training_section,
1455
+ "btn-system": system_section
1456
+ }
1457
+
1458
+ # Set click events for navigation buttons
1459
+ for btn in nav_buttons:
1460
+ btn.click(
1461
+ set_active_section,
1462
+ inputs=gr.State(btn.elem_id),
1463
+ outputs=list(sections.values()) + nav_buttons
1464
+ )
1465
+
1466
+ demo.launch()
component.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import matplotlib
4
+ matplotlib.use("Agg")
5
+ import matplotlib.pyplot as plt
6
+ import inspect
7
+ import io
8
+
9
+
10
+ # style formating for Header
11
def header(input:str):
    """
    Render a container-level title as a bold <h1>.

    Usage:
        header('your text')
    Output:
        gr.Markdown rendering "# {input}" with CSS class "header"
        (styled by `.content-section .header h1` in style.css).

    NOTE(review): the parameter name `input` shadows the builtin; kept
    because callers may pass it by keyword.

    Args:
        input (str): title text for the section header.
    """
    gr.Markdown(f"# {input}", elem_classes='header')
22
+
23
+ # style formating for Header2
24
def h2(input:str):
    """
    Render a sub-section title as a bold <h2> with the "subheader" CSS class.

    Usage:
        h2('your text')
    Output:
        <h2 class="subheader" style="color:black">{input}</h2>
    Args:
        input (str): subheader title text.
    """
    # Fix: the original emitted style="black", which is not a valid CSS
    # declaration (a property name is required) and was silently ignored
    # by browsers. "color:black" is the evident intent.
    gr.Markdown(f'<h2 class="subheader" style="color:black">{input}</h2>')
35
+
36
+ # style formating for Text
37
def p(input:str):
    """
    Render body text as one or more <p class="desc"> blocks.

    The input is split on literal "<br>" markers; each non-empty piece
    becomes its own paragraph.

    Usage:
        p('text')                      # single paragraph
        p('''first <br> second''')     # two paragraphs
    Returns:
        gr.Markdown: the rendered markdown component.
    """
    html_parts = []
    for chunk in input.strip().split("<br>"):
        stripped = chunk.strip()
        if stripped:
            html_parts.append(f'<p class="desc">{stripped}</p>')
    return gr.Markdown(''.join(html_parts))
54
+
55
+ # this for displaying dataframe and also provied downlaod csv
56
def Dataset(df,title, source, key=None):
    """
    Create a reusable dataset display: a title, a read-only preview of the
    first 100 rows, and a CSV download button.

    Args:
        df (pd.DataFrame): dataset to display (only df.head(100) is shown).
        title (str): title rendered above the table.
        source (str): path of the CSV file served by the download button.
        key (str, optional): unique suffix for the Gradio element ids so the
            component can appear multiple times on one page.

    Returns:
        gr.Dataframe: the dataframe component (usable as an event output).
    """
    # DownloadButton accepts a callable; resolving the path lazily lets the
    # file be (re)generated between clicks.
    def get_file():
        return source

    with gr.Column(elem_classes='dataframe-layout', elem_id=f"dataset-{key}" if key else None):
        # Title and download button in a row
        with gr.Row():
            gr.Markdown(f'<h1 class="subtitle">{title}</h1>') # title formating
            download_btn = gr.DownloadButton(
                label="Download CSV",
                value=get_file,
                elem_id=f"download-{key}" if key else None
            )

        # Dataframe display — capped at 100 rows to keep the page responsive
        df_display=gr.Dataframe(
            value=df.head(100),
            headers=list(df.columns),
            elem_id=f"table-{key}" if key else None,
            interactive=False, # read only
            # disable the wrap to reduce the height of the table
            # wrap=True
        )
    return df_display
90
+
91
def describe_value_counts(series):
    """Return series.describe() as a two-column DataFrame (Statistic, Value)."""
    stats = series.describe()
    # Name the index before resetting it so the stat labels become a column.
    table = stats.rename_axis('Statistic').reset_index(name='Value')
    return table
96
+
97
+ # this is for EDA, preprocess
98
def plot_distribution(df, column):
    """
    Generate a matplotlib plot (bar chart or histogram) showing the
    distribution of values in a selected column from the dataframe.

    Columns of object dtype, or with fewer than 20 unique values, are drawn
    as a bar chart of the 20 most frequent values; everything else gets a
    100-bin histogram (NaNs dropped).

    Parameters:
    -----------
    df : pd.DataFrame
        The dataframe to plot from.
    column : str
        The column name to visualize.

    Returns:
    --------
    matplotlib.figure.Figure
        A figure object representing the distribution plot.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    if df[column].dtype == 'object' or df[column].nunique() < 20:
        # Bar plot for categorical / low-cardinality columns (top 20 values)
        value_counts = df[column].value_counts().head(20)
        ax.bar(value_counts.index, value_counts.values)
        # Fix: rotate the existing tick labels in place. The previous
        # ax.set_xticklabels(...) without a matching ax.set_xticks(...) is
        # unreliable (FixedLocator warning / mislabeled ticks) on
        # matplotlib >= 3.5.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_ylabel('Count')
        ax.set_title(f'Distribution of {column}')
    else:
        # Histogram for numerical columns
        ax.hist(df[column].dropna(), bins=100, edgecolor='black')
        ax.set_title(f'Distribution of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')

    fig.tight_layout()
    return fig
133
+
134
+ ## this is for eda, preprocess, and training
135
def code_cell(code):
    """
    Thin wrapper around gr.Code for displaying a python snippet.

    Common leading indentation is stripped with inspect.cleandoc so
    triple-quoted, indented snippets render flush-left.

    Usage:
        code_cell('df = pd.read_csv(path)')
        # or pass a triple-quoted string for a multi-line snippet

    Fix: the previous docstring contained an unescaped triple quote
    (code_cell(\"\"\"\"\"\")), which terminated the docstring early and left
    its tail as an implicitly concatenated string literal.
    """
    gr.Code(inspect.cleandoc(code), language='python')
145
+
146
+ ## This for EDA, Preprocess, and training
147
def plot_training_results(results: dict):
    """
    Plot the training metrics merror and mlogloss from the result dictionary.

    Visualizes the model's training performance over time (epochs/folds)
    using merror (training error) and mlogloss (log loss).

    Args:
        results (dict): A dictionary with two keys:
            - 'merror': list of training error values.
            - 'mlogloss': list of log loss values.
            Example:
                {"merror": [0.12, 0.10, 0.08], "mlogloss": [0.35, 0.32, 0.30]}

    Returns:
        matplotlib.figure.Figure: figure with both metrics as line plots.

    Example:
        plot_output = gr.Plot()
        btn = gr.Button("Generate Plot")
        btn.click(fn=lambda: plot_training_results(results), inputs=[], outputs=plot_output)
    """
    epochs = list(range(1, len(results["merror"]) + 1))

    # Fix: draw on an explicit Figure/Axes instead of the pyplot state
    # machine (plt.figure ... plt.gcf()). In a multi-request Gradio app
    # another handler can create a figure in between, making plt.gcf()
    # return the wrong figure.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(epochs, results["merror"], marker='o', label='Training Error (merror)', color='blue')
    ax.plot(epochs, results["mlogloss"], marker='s', label='Log Loss (mlogloss)', color='orange')

    ax.set_title('Training Metrics Over Time')
    ax.set_xlabel('Epoch / Fold')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    return fig
192
+
193
+ # for Recommendation section
194
def input_name_textbox(Label:str, Placeholder:str):
    """
    Single-line text input with the shared "text-input" styling.

    Usage:
        app_name = input_name_textbox('Input Your App', 'Enter game title...')
    Args:
        Label (str): label shown above the textbox.
        Placeholder (str): placeholder text shown while the box is empty.

    Returns:
        gr.Textbox: the textbox component (its value is a str at event time).
    """

    inputbox = gr.Textbox(
        label=Label,
        placeholder=Placeholder,
        elem_classes="text-input"
    )
    return inputbox
212
+
213
def input_number(Label:str,Precision = 0,**kwargs):
    """
    Numeric input with the shared "text-input" styling.

    Usage:
        year = input_number('Year Release', Precision=0, minimum=0)
    Args:
        Label (str): label shown above the field.
        Precision (int): number of decimal places (0 = integer input).
        **kwargs: forwarded to gr.Number (e.g. minimum, maximum, value).

    Returns:
        gr.Number: the number component (its value is numeric at event time).
    """

    inputbox = gr.Number(
        label=Label,
        elem_classes="text-input",
        precision=Precision,
        **kwargs
    )
    return inputbox
232
+
233
def input_paragaph_textbox(Label:str, Placeholder:str):
    """
    Multi-line textbox (5–8 visible lines, input capped at 1200 characters).

    NOTE(review): the function name misspells "paragraph"; kept as-is for
    compatibility with existing callers.

    Usage:
        paragraph = input_paragaph_textbox('Your Story', 'Type your text...')
    Args:
        Label (str): label shown above the textbox.
        Placeholder (str): placeholder text shown while the box is empty.

    Returns:
        gr.Textbox: the textbox component (its value is a str at event time).
    """
    paragraph = gr.Textbox(
        label=Label,
        placeholder=Placeholder,
        lines=5,
        max_lines=8,
        max_length=1200,
        elem_classes="text-input"
    )
    return paragraph
253
+
254
def input_choice(Label:str, Choices:list, Multiselect:bool):
    """
    Dropdown selector with the shared "dropdown" styling.

    Multiselect=True  -> user may pick several options (value is a list,
                         initially empty).
    Multiselect=False -> user picks one option (value is a single item,
                         initially None).

    Usage:
        genre = input_choice(
            Label="Select Your Genre (Multiple Choice)",
            Choices=['Action', 'Adventure', 'RPG', 'Strategy'],
            Multiselect=True,
        )

        price_range_input = input_choice(
            Label="Select Your Price Range (Only Single Choice)",
            Choices=['Free', '5$ - 10$', '10$ - 50$'],
            Multiselect=False,
        )

    Args:
        Label (str): label shown above the dropdown.
        Choices (list): selectable options.
        Multiselect (bool): whether multiple selections are allowed.

    Returns:
        gr.Dropdown: the dropdown component.
    """
    multiple_choice = gr.Dropdown(
        label=Label,
        choices=Choices,
        multiselect=Multiselect, # True allows multi select
        value=[] if Multiselect else None, # start empty; selection is read at event time
        elem_classes="dropdown"
    )
    return multiple_choice
requirements.txt ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE: this file is `conda list --export` output (name=version=build syntax),
# NOT pip requirements format — `pip install -r requirements.txt` will fail on
# these lines. Regenerate with `pip list --format=freeze` for a pip-compatible
# file (e.g. for Hugging Face Spaces).
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: win-64
# created-by: conda 25.1.1
5
+ _openmp_mutex=4.5=2_gnu
6
+ aiofiles=24.1.0=pypi_0
7
+ aiohappyeyeballs=2.6.1=pypi_0
8
+ aiohttp=3.12.9=pypi_0
9
+ aiosignal=1.3.2=pypi_0
10
+ annotated-types=0.7.0=pypi_0
11
+ anyio=4.9.0=pypi_0
12
+ asttokens=3.0.0=pyhd8ed1ab_1
13
+ async-timeout=5.0.1=pypi_0
14
+ attrs=25.3.0=pypi_0
15
+ blis=0.7.11=pypi_0
16
+ bzip2=1.0.8=h2bbff1b_6
17
+ ca-certificates=2025.4.26=h4c7d964_0
18
+ catalogue=2.0.10=pypi_0
19
+ certifi=2025.4.26=pypi_0
20
+ charset-normalizer=3.4.2=pypi_0
21
+ click=8.2.0=pypi_0
22
+ cloudpickle=3.1.1=pypi_0
23
+ colorama=0.4.6=pyhd8ed1ab_1
24
+ comm=0.2.2=pyhd8ed1ab_1
25
+ confection=0.1.5=pypi_0
26
+ cpython=3.10.17=py310hd8ed1ab_0
27
+ cuda-version=12.9=h4f385c5_3
28
+ cycler=0.12.1=pypi_0
29
+ cymem=2.0.11=pypi_0
30
+ cython=0.29.32=pypi_0
31
+ dask=2025.5.1=pypi_0
32
+ datasets=3.6.0=pypi_0
33
+ debugpy=1.8.14=py310h9e98ed7_0
34
+ decorator=5.2.1=pyhd8ed1ab_0
35
+ dill=0.3.8=pypi_0
36
+ en-core-web-sm=3.5.0=pypi_0
37
+ exceptiongroup=1.3.0=pyhd8ed1ab_0
38
+ executing=2.2.0=pyhd8ed1ab_0
39
+ fastapi=0.115.12=pypi_0
40
+ ffmpy=0.6.0=pypi_0
41
+ filelock=3.18.0=pypi_0
42
+ fonttools=4.58.0=pypi_0
43
+ frozenlist=1.6.2=pypi_0
44
+ fsspec=2025.3.0=pypi_0
45
+ fst-pso=1.8.1=pypi_0
46
+ fuzzytm=2.0.9=pypi_0
47
+ gensim=4.3.0=pypi_0
48
+ gradio=5.32.1=pypi_0
49
+ gradio-client=1.10.2=pypi_0
50
+ groovy=0.1.2=pypi_0
51
+ h11=0.16.0=pypi_0
52
+ httpcore=1.0.9=pypi_0
53
+ httpx=0.28.1=pypi_0
54
+ huggingface-hub=0.32.4=pypi_0
55
+ idna=3.10=pypi_0
56
+ importlib-metadata=8.6.1=pyha770c72_0
57
+ inquirerpy=0.3.4=pypi_0
58
+ intel-openmp=2024.2.1=h57928b3_1083
59
+ ipykernel=6.29.5=pyh4bbf305_0
60
+ ipython=8.36.0=pyh9ab4c32_0
61
+ jedi=0.19.2=pyhd8ed1ab_1
62
+ jinja2=3.1.6=pypi_0
63
+ joblib=1.5.0=pyhd8ed1ab_0
64
+ jupyter_client=8.6.3=pyhd8ed1ab_1
65
+ jupyter_core=5.7.2=pyh5737063_1
66
+ kiwisolver=1.4.8=pypi_0
67
+ krb5=1.21.3=hdf4eb48_0
68
+ langcodes=3.5.0=pypi_0
69
+ langdetect=1.0.9=pypi_0
70
+ language-data=1.3.0=pypi_0
71
+ libblas=3.9.0=31_h641d27c_mkl
72
+ libcblas=3.9.0=31_h5e41251_mkl
73
+ libffi=3.4.4=hd77b12b_1
74
+ libgomp=15.1.0=h1383e82_2
75
+ libhwloc=2.11.2=default_ha69328c_1001
76
+ libiconv=1.18=h135ad9c_1
77
+ liblapack=3.9.0=31_h1aa476e_mkl
78
+ libsodium=1.0.20=hc70643c_0
79
+ libwinpthread=12.0.0.r4.gg4f2fc60ca=h57928b3_9
80
+ libxgboost=3.0.1=cuda128_hace5437_0
81
+ libxml2=2.13.8=h866ff63_0
82
+ locket=1.0.0=pypi_0
83
+ marisa-trie=1.2.1=pypi_0
84
+ markdown-it-py=3.0.0=pypi_0
85
+ markupsafe=3.0.2=pypi_0
86
+ matplotlib=3.5.3=pypi_0
87
+ matplotlib-inline=0.1.7=pyhd8ed1ab_1
88
+ mdurl=0.1.2=pypi_0
89
+ miniful=0.0.6=pypi_0
90
+ mkl=2024.2.2=h66d3029_15
91
+ mpmath=1.3.0=pypi_0
92
+ multidict=6.4.4=pypi_0
93
+ multiprocess=0.70.16=pypi_0
94
+ murmurhash=1.0.12=pypi_0
95
+ nest-asyncio=1.6.0=pyhd8ed1ab_1
96
+ networkx=3.4.2=pypi_0
97
+ nltk=3.8.1=pypi_0
98
+ numpy=1.25.2=py310hd02465a_0
99
+ openssl=3.5.0=ha4e3fda_1
100
+ orjson=3.10.18=pypi_0
101
+ packaging=25.0=pyh29332c3_1
102
+ pandas=2.1.4=pypi_0
103
+ parso=0.8.4=pyhd8ed1ab_1
104
+ partd=1.4.2=pypi_0
105
+ pathlib-abc=0.1.1=pypi_0
106
+ pathy=0.11.0=pypi_0
107
+ pfzy=0.3.4=pypi_0
108
+ pickleshare=0.7.5=pyhd8ed1ab_1004
109
+ pillow=9.5.0=pypi_0
110
+ pip=25.1=pyhc872135_2
111
+ platformdirs=4.3.8=pyhe01879c_0
112
+ preshed=3.0.9=pypi_0
113
+ prompt-toolkit=3.0.51=pyha770c72_0
114
+ propcache=0.3.1=pypi_0
115
+ psutil=7.0.0=py310ha8f682b_0
116
+ pure_eval=0.2.3=pyhd8ed1ab_1
117
+ py-xgboost=3.0.1=cuda128_pyhee1328b_0
118
+ pyarrow=20.0.0=pypi_0
119
+ pycountry=24.6.1=pypi_0
120
+ pydantic=2.11.5=pypi_0
121
+ pydantic-core=2.33.2=pypi_0
122
+ pydub=0.25.1=pypi_0
123
+ pyfume=0.3.1=pypi_0
124
+ pygments=2.19.1=pyhd8ed1ab_0
125
+ pyparsing=3.2.3=pypi_0
126
+ python=3.10.16=h4607a30_1
127
+ python-dateutil=2.9.0.post0=pyhff2d567_1
128
+ python-multipart=0.0.20=pypi_0
129
+ python-tzdata=2025.2=pyhd8ed1ab_0
130
+ python_abi=3.10=2_cp310
131
+ pytz=2025.2=pyhd8ed1ab_0
132
+ pywin32=307=py310h9e98ed7_3
133
+ pyyaml=6.0.2=pypi_0
134
+ pyzmq=26.4.0=py310h656833d_0
135
+ regex=2024.11.6=pypi_0
136
+ requests=2.32.3=pypi_0
137
+ rich=14.0.0=pypi_0
138
+ ruff=0.11.12=pypi_0
139
+ safehttpx=0.1.6=pypi_0
140
+ safetensors=0.5.3=pypi_0
141
+ scikit-learn=1.3.0=pypi_0
142
+ scipy=1.11.4=pypi_0
143
+ seaborn=0.13.2=pypi_0
144
+ semantic-version=2.10.0=pypi_0
145
+ sentence-transformers=4.1.0=pypi_0
146
+ setuptools=78.1.1=py310haa95532_0
147
+ shellingham=1.5.4=pypi_0
148
+ simpful=2.12.0=pypi_0
149
+ six=1.17.0=pyhd8ed1ab_0
150
+ smart-open=6.4.0=pypi_0
151
+ sniffio=1.3.1=pypi_0
152
+ spacy=3.5.3=pypi_0
153
+ spacy-legacy=3.0.12=pypi_0
154
+ spacy-loggers=1.0.5=pypi_0
155
+ sqlite=3.45.3=h2bbff1b_0
156
+ srsly=2.5.1=pypi_0
157
+ stack_data=0.6.3=pyhd8ed1ab_1
158
+ starlette=0.46.2=pypi_0
159
+ swifter=1.4.0=pypi_0
160
+ sympy=1.14.0=pypi_0
161
+ tbb=2021.13.0=h62715c5_1
162
+ thinc=8.1.12=pypi_0
163
+ threadpoolctl=3.6.0=pyhecae5ae_0
164
+ tk=8.6.14=h0416ee5_0
165
+ tokenizers=0.21.1=pypi_0
166
+ tomlkit=0.13.2=pypi_0
167
+ toolz=1.0.0=pypi_0
168
+ torch=2.7.0=pypi_0
169
+ tornado=6.4.2=py310ha8f682b_0
170
+ tqdm=4.67.1=pypi_0
171
+ traitlets=5.14.3=pyhd8ed1ab_1
172
+ transformers=4.51.3=pypi_0
173
+ typer=0.16.0=pypi_0
174
+ typing-inspection=0.4.1=pypi_0
175
+ typing_extensions=4.13.2=pyh29332c3_0
176
+ tzdata=2025b=h04d1e81_0
177
+ ucrt=10.0.22621.0=h57928b3_1
178
+ urllib3=2.4.0=pypi_0
179
+ uvicorn=0.34.3=pypi_0
180
+ vc=14.42=haa95532_5
181
+ vc14_runtime=14.42.34438=hfd919c2_26
182
+ vs2015_runtime=14.42.34438=h7142326_26
183
+ wasabi=1.1.3=pypi_0
184
+ wcwidth=0.2.13=pyhd8ed1ab_1
185
+ websockets=15.0.1=pypi_0
186
+ wheel=0.45.1=py310haa95532_0
187
+ xgboost=3.0.1=cuda128_pyh68bd8d9_0
188
+ xxhash=3.5.0=pypi_0
189
+ xz=5.6.4=h4754444_1
190
+ yarl=1.20.0=pypi_0
191
+ zeromq=4.3.5=ha9f60a1_7
192
+ zipp=3.21.0=pyhd8ed1ab_1
193
+ zlib=1.2.13=h8cc25b3_1
style.css ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* App-wide stylesheet for the Gradio game-recommendation UI.
   Sections: layout shell, left navbar, content sections, dataset tables,
   EDA headers, and the recommendation-system form. */

.container {
    /* display: flex; */
    width: 100%;
    /* min-height: 90vh; */
    font-family: 'Arial', 'sans-serif';
}

/* Left navigation sidebar */
.navbar {
    width: 200px;
    height: 100%;
    border-right: 5px solid #34495e;

    display: flex;
    flex-direction: column;
    padding: 0 10px;
    /* justify-content: center; */
    justify-content: flex-start;
    background-color: #2c3e50;
}

.nav-header {
    margin-top: 1rem;
    margin-bottom: 2rem;
}

.nav-header h1 {
    color: #fcdf1e;
}

.nav-buttons {
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
    padding: 0 5px;
}

.nav-btn {
    text-align: left;
    padding: 10px 15px;
    width: 100%;
    background-color: #34495e;
    color: #ecf0f1;
    border: none;
    border-radius: 4px;
    cursor: pointer;
    transition: all 0.3s ease;
    font-weight: bold;
}

.nav-btn:hover {
    background-color: #3d566e;
    color: #fcdf1e;
}

/* Highlight for the currently selected section's button */
.nav-btn.active {
    background-color: #f39c12;
    color: #2c3e50;
}

.main-content {
    flex-grow: 1;
    padding: 1rem;
    display: flex;
    flex-direction: column;
}

/* Section layout styling */
.content-section {
    border: 2px solid #ccc;
    padding: 1rem !important;
    margin-bottom: 1rem;
    background-color: #f9f9f9;
    border-radius: 8px;

    height: auto !important;
    min-height: 80vh;
    overflow: visible !important;
    /* padding: 20px !important; */
}

.content-section .header h1,
.content-section .header * h1 {
    color: #3d3d3c !important;
    font-size: 1.5rem;
    font-weight: bold;
    border-bottom: 2px solid #ccc;
    padding-bottom: 0.5rem;
    margin-bottom: 1rem;
}

.content {
    border: 2px solid #ccc;
    padding: 0.5rem;
    height: 80vh; /* Fixed height */
    margin-bottom: 1rem;
    background-color: #f9f9f9;
    border-radius: 8px;
    overflow-y: auto;
}

/* Body text rendered by the p() helper in component.py */
p.desc {
    color: #3d3d3c !important;
    /* color: white; */
}

/* dataset display */
/* Dataset Container */
.datasets-container {
    display: flex;
    flex-direction: column;
    gap: 30px;
    width: 100%;
}

/* Dataset Layout */
.dataframe-layout {
    border: 1px solid #e0e0e0;
    border-radius: 8px;
    padding: 20px;
    background-color: #fff;
    box-shadow: 0 2px 10px rgba(0,0,0,0.05);
}

/* Title Styling */
.subtitle {
    font-size: 1.2rem !important;
    font-weight: 600;
    color: #2c3e50;
    margin: 0 !important;
    padding: 0 !important;
}

/* Download Button */
.download-button {
    background-color: #3498db !important;
    color: white !important;
    border: none !important;
    padding: 8px 16px !important;
    border-radius: 4px !important;
    font-size: 0.9rem !important;
}

.download-button:hover {
    background-color: #2980b9 !important;
}

/* Table Styling */
.dataframe-layout table {
    width: 100%;
    border-collapse: collapse;
    margin-top: 15px;
}

.dataframe-layout th {
    background-color: #34495e;
    color: white;
    padding: 10px;
    text-align: left;
}

.dataframe-layout td {
    padding: 8px 10px;
    border-bottom: 1px solid #dddddd;
}

/* Alternating green row stripes */
.dataframe-layout tr:nth-child(even) {
    background-color: #85a285;
}

.dataframe-layout tr:nth-child(odd) {
    background-color: #466c45;
}

/* EDA */
.subheader{
    font-weight: bold;
    font-size: 24px;
    color: #3d3d3c;
    margin-bottom: 10px;
}

/* Recommendation system */
#system .header-title {
    color: white;
    font-size: 2rem;
}

#system {
    background-color: #3d3d3c;
}

.dropdown, .text-input{
    height: 100%;
    flex: 1 1 auto;
    /* background-color: #dddddd; */
    border: none;
}

.text-input label.gr-label,
.dropdown label.gr-label {
    color: #3d3d3c !important;
}

/* .results-column h2{
    color: black;
} */
+ } */
207
+
208
+