Update main.py
Browse files
main.py
CHANGED
|
@@ -78,55 +78,73 @@ def generate_recommendations_for_session(session_id):
|
|
| 78 |
# Convert session data to a DataFrame
|
| 79 |
raw_df = pd.DataFrame(session_data)
|
| 80 |
|
| 81 |
-
#
|
| 82 |
-
aggregated_data = raw_df.groupby(['id', 'action']).agg(
|
| 83 |
-
presence=('action', 'size'),
|
| 84 |
-
total_duration=('duration', 'sum')
|
| 85 |
-
).reset_index()
|
| 86 |
|
| 87 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
pivot_df = aggregated_data.pivot_table(
|
| 89 |
index=['id'],
|
| 90 |
columns='action',
|
| 91 |
-
values=['presence', 'total_duration'],
|
| 92 |
fill_value=0
|
| 93 |
)
|
| 94 |
-
|
| 95 |
# Flatten column names
|
| 96 |
pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]
|
| 97 |
-
|
| 98 |
# Ensure all expected columns exist in the pivot table
|
| 99 |
for col in ALL_COLUMNS:
|
| 100 |
if f'presence_{col}' not in pivot_df.columns and col != 'time_spent':
|
| 101 |
pivot_df[f'presence_{col}'] = 0
|
| 102 |
-
elif col == 'time_spent' and 'total_duration_time_spent' not in pivot_df.columns:
|
| 103 |
pivot_df['total_duration_time_spent'] = 0
|
| 104 |
-
|
| 105 |
# Calculate interaction score for each row
|
| 106 |
pivot_df['interaction_score'] = pivot_df.apply(calculate_interaction_score, axis=1)
|
| 107 |
-
|
| 108 |
# Create a user vector based on the interaction scores
|
| 109 |
user_vector = pd.Series(index=user_item_matrix_columns, dtype=float).fillna(0)
|
| 110 |
for property_id, score in pivot_df['interaction_score'].items():
|
| 111 |
if property_id in user_vector.index:
|
| 112 |
user_vector[property_id] = score
|
| 113 |
-
|
| 114 |
# Transform the user vector using the SVD model
|
| 115 |
user_vector_array = user_vector.values.reshape(1, -1)
|
| 116 |
user_latent = svd.transform(user_vector_array)
|
| 117 |
-
|
| 118 |
# Calculate similarity scores between the user vector and item factors
|
| 119 |
similarity_scores = cosine_similarity(user_latent, item_factors)
|
| 120 |
-
|
| 121 |
# Get the indices of the top 10 most similar items
|
| 122 |
top_indices = similarity_scores.argsort()[0][-10:][::-1]
|
| 123 |
-
|
| 124 |
# Get the corresponding property IDs for the top indices
|
| 125 |
recommendations = user_item_matrix_columns[top_indices].tolist()
|
| 126 |
-
|
| 127 |
return recommendations
|
|
|
|
| 128 |
except Exception as e:
|
| 129 |
-
logger.error(f"Error in generate_recommendations_for_session: {e}")
|
|
|
|
| 130 |
return None
|
| 131 |
|
| 132 |
|
|
|
|
| 78 |
# Convert session data to a DataFrame
|
| 79 |
raw_df = pd.DataFrame(session_data)
|
| 80 |
|
| 81 |
+
# Debug: Print column names
|
| 82 |
+
logger.debug(f"Columns in raw_df: {raw_df.columns.tolist()}")
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
+
# Check if required columns exist
|
| 85 |
+
required_columns = ['id', 'action']
|
| 86 |
+
missing_columns = [col for col in required_columns if col not in raw_df.columns]
|
| 87 |
+
if missing_columns:
|
| 88 |
+
logger.error(f"Missing required columns: {missing_columns}")
|
| 89 |
+
return None
|
| 90 |
+
|
| 91 |
+
# Determine aggregation based on presence of 'duration' column
|
| 92 |
+
if 'duration' in raw_df.columns:
|
| 93 |
+
aggregated_data = raw_df.groupby(['id', 'action']).agg(
|
| 94 |
+
presence=('action', 'size'),
|
| 95 |
+
total_duration=('duration', 'sum')
|
| 96 |
+
).reset_index()
|
| 97 |
+
else:
|
| 98 |
+
aggregated_data = raw_df.groupby(['id', 'action']).agg(
|
| 99 |
+
presence=('action', 'size')
|
| 100 |
+
).reset_index()
|
| 101 |
+
|
| 102 |
+
# Create pivot table
|
| 103 |
+
pivot_columns = ['presence', 'total_duration'] if 'duration' in raw_df.columns else ['presence']
|
| 104 |
pivot_df = aggregated_data.pivot_table(
|
| 105 |
index=['id'],
|
| 106 |
columns='action',
|
| 107 |
+
values=pivot_columns,
|
| 108 |
fill_value=0
|
| 109 |
)
|
| 110 |
+
|
| 111 |
# Flatten column names
|
| 112 |
pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]
|
| 113 |
+
|
| 114 |
# Ensure all expected columns exist in the pivot table
|
| 115 |
for col in ALL_COLUMNS:
|
| 116 |
if f'presence_{col}' not in pivot_df.columns and col != 'time_spent':
|
| 117 |
pivot_df[f'presence_{col}'] = 0
|
| 118 |
+
elif col == 'time_spent' and 'duration' in raw_df.columns and 'total_duration_time_spent' not in pivot_df.columns:
|
| 119 |
pivot_df['total_duration_time_spent'] = 0
|
| 120 |
+
|
| 121 |
# Calculate interaction score for each row
|
| 122 |
pivot_df['interaction_score'] = pivot_df.apply(calculate_interaction_score, axis=1)
|
| 123 |
+
|
| 124 |
# Create a user vector based on the interaction scores
|
| 125 |
user_vector = pd.Series(index=user_item_matrix_columns, dtype=float).fillna(0)
|
| 126 |
for property_id, score in pivot_df['interaction_score'].items():
|
| 127 |
if property_id in user_vector.index:
|
| 128 |
user_vector[property_id] = score
|
| 129 |
+
|
| 130 |
# Transform the user vector using the SVD model
|
| 131 |
user_vector_array = user_vector.values.reshape(1, -1)
|
| 132 |
user_latent = svd.transform(user_vector_array)
|
| 133 |
+
|
| 134 |
# Calculate similarity scores between the user vector and item factors
|
| 135 |
similarity_scores = cosine_similarity(user_latent, item_factors)
|
| 136 |
+
|
| 137 |
# Get the indices of the top 10 most similar items
|
| 138 |
top_indices = similarity_scores.argsort()[0][-10:][::-1]
|
| 139 |
+
|
| 140 |
# Get the corresponding property IDs for the top indices
|
| 141 |
recommendations = user_item_matrix_columns[top_indices].tolist()
|
| 142 |
+
|
| 143 |
return recommendations
|
| 144 |
+
|
| 145 |
except Exception as e:
|
| 146 |
+
logger.error(f"Error in generate_recommendations_for_session: {str(e)}")
|
| 147 |
+
logger.debug(f"Raw dataframe info: {raw_df.info()}")
|
| 148 |
return None
|
| 149 |
|
| 150 |
|