Update app.py
app.py CHANGED
@@ -49,6 +49,14 @@ except FileNotFoundError:
     logging.error("solved.json not found. Please ensure the file exists.")
     exit()
 
+# Load competitor data
+logging.info("Loading competitor data...")
+try:
+    competitor_df = pd.read_csv('competitors_data.csv')
+except FileNotFoundError:
+    logging.error("competitors_data.csv not found. Please ensure the file exists.")
+    exit()
+
 # Check for required columns in engagement data
 required_columns = ['posting_time', 'likes', 'comments', 'shares']
 missing_columns = [col for col in required_columns if col not in engagement_df.columns]
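The new loader mirrors the existing solved.json guard, but only the engagement frame gets the required-column check that follows. A minimal sketch of applying the same validation to the competitor frame; load_csv_or_exit is a hypothetical helper, and the column list is an assumption based on the fields this commit later reads from competitor_df:

import logging
import sys
import pandas as pd

def load_csv_or_exit(path, required):
    # Hypothetical helper, not part of app.py: load a CSV and
    # fail fast on a missing file or a missing column.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        logging.error(f"{path} not found. Please ensure the file exists.")
        sys.exit(1)
    missing = [col for col in required if col not in df.columns]
    if missing:
        logging.error(f"{path} is missing required columns: {missing}")
        sys.exit(1)
    return df

# Assumed schema: the columns the rest of this commit reads from competitor_df.
competitor_df = load_csv_or_exit(
    'competitors_data.csv',
    ['posting_time', 'caption', 'hashtags', 'likes', 'comments', 'shares', 'content_type'],
)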
@@ -118,37 +126,57 @@ engagement_summary = engagement_df.groupby('posting_time').agg({
 
 # Plot engagement rate over time
 plt.figure(figsize=(10, 6))
-plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
+plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'], label='Your Data')
 plt.title('Engagement Rate Over Time')
 plt.xlabel('Time')
 plt.ylabel('Engagement Rate')
+plt.legend()
 plt.show()
 
 # Time-Series Model: Optimal Posting Times (using Prophet)
 logging.info("Training time-series model for optimal posting times using Prophet...")
 time_series_data = engagement_summary[['posting_time', 'engagement_rate']].rename(columns={'posting_time': 'ds', 'engagement_rate': 'y'})
 
-#
-
-prophet_model.fit(time_series_data)
-
-#
-
-
-
-#
-
-
-
-
-
-
-#
-
-
-
-
-
+# Drop rows with NaN values in the target column
+time_series_data = time_series_data.dropna(subset=['y'])
+
+# Check if there is enough data to train the Prophet model
+if len(time_series_data) < 2:
+    logging.warning("Not enough data to train the Prophet model. Skipping time-series analysis.")
+else:
+    # Resample the data to a fixed frequency (e.g., daily)
+    time_series_data = time_series_data.set_index('ds').resample('D').mean().reset_index()
+
+    # Train Prophet model with hyperparameters
+    prophet_model = Prophet(
+        changepoint_prior_scale=0.05,  # Adjust sensitivity to trend changes
+        seasonality_prior_scale=10.0,  # Adjust seasonality strength
+        yearly_seasonality=True,  # Enable yearly seasonality
+        daily_seasonality=True  # Enable daily seasonality
+    )
+    prophet_model.fit(time_series_data)
+
+    # Make future predictions
+    future = prophet_model.make_future_dataframe(periods=30)  # Predict for the next 30 days
+    forecast = prophet_model.predict(future)
+
+    # Plot the forecast
+    fig = prophet_model.plot(forecast)
+    plt.title('Engagement Rate Forecast (Prophet)')
+    plt.xlabel('Date')
+    plt.ylabel('Engagement Rate')
+    plt.show()
+
+    # Evaluate the model
+    y_true = time_series_data['y']
+    y_pred = forecast.loc[:len(y_true)-1, 'yhat']  # Align predictions with true values
+
+    # Check for NaN values in y_true or y_pred
+    if y_true.isnull().any() or y_pred.isnull().any():
+        logging.warning("NaN values detected in true values or predictions. Skipping MAE calculation.")
+    else:
+        mae = mean_absolute_error(y_true, y_pred)
+        logging.info(f"Prophet Model - MAE: {mae:.4f}")
 
 # Ensure 'hashtags' column is properly formatted
 solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
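The new MAE check scores in-sample predictions and aligns them to the training rows by integer position, which can drift silently once the series has been resampled. Prophet ships a diagnostics module for rolling-origin evaluation on held-out windows; a sketch, assuming the series spans enough daily history for the chosen windows (the window sizes here are illustrative):

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

# Fit on the prepared ds/y frame, then score rolling holdout windows
# instead of in-sample rows. Tune the windows to the real data span.
model = Prophet(changepoint_prior_scale=0.05, seasonality_prior_scale=10.0)
model.fit(time_series_data)
df_cv = cross_validation(model, initial='60 days', period='7 days', horizon='14 days')
print(performance_metrics(df_cv)[['horizon', 'mae']])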
@@ -266,4 +294,157 @@ content_type_impact = solved_df.groupby('content_type')['promote'].mean().sort_v
 logging.info("Content Type Impact on Promotion:")
 print(content_type_impact)
 
-
+# Competitor Data Analysis
+logging.info("Analyzing competitor data...")
+
+# Preprocess competitor data
+competitor_df['caption'] = competitor_df['caption'].astype(str)
+competitor_df['hashtags'] = competitor_df['hashtags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
+competitor_df['posting_time'] = pd.to_datetime(competitor_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
+competitor_df = competitor_df[competitor_df['posting_time'].notna()]
+
+# Calculate engagement_rate for competitor data
+competitor_df['engagement_rate'] = competitor_df['likes'] + competitor_df['comments'] + competitor_df['shares']
+
+# Perform sentiment analysis on competitor captions
+logging.info("Performing sentiment analysis on competitor captions...")
+competitor_df['caption_sentiment'] = competitor_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
+competitor_df['sentiment'] = competitor_df['caption_sentiment']
+
+# Feature engineering for competitor data
+logging.info("Performing feature engineering for competitor data...")
+competitor_df['caption_length'] = competitor_df['caption'].apply(len)
+competitor_df['hashtag_count'] = competitor_df['hashtags'].apply(len)
+
+# Analyze competitor engagement data
+logging.info("Analyzing competitor engagement data...")
+competitor_summary = competitor_df.groupby('posting_time').agg({
+    'likes': 'sum',
+    'comments': 'sum',
+    'shares': 'sum',
+    'engagement_rate': 'mean'
+}).reset_index()
+
+# Plot competitor engagement rate over time
+plt.figure(figsize=(10, 6))
+plt.plot(competitor_summary['posting_time'], competitor_summary['engagement_rate'], label='Competitor')
+plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'], label='Your Data')
+plt.title('Engagement Rate Over Time (Your Data vs Competitor)')
+plt.xlabel('Time')
+plt.ylabel('Engagement Rate')
+plt.legend()
+plt.show()
+
+# Analyze competitor niche trends
+logging.info("Analyzing competitor niche trends...")
+competitor_niche_trends = competitor_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
+logging.info("Top Performing Content Types by Sentiment (Competitor):")
+print(competitor_niche_trends)
+
+# Combine your data with competitor data for enhanced analysis
+logging.info("Combining your data with competitor data...")
+combined_df = pd.concat([solved_df, competitor_df], ignore_index=True)
+
+# Enhanced recommendation system using combined data
+logging.info("Training enhanced recommendation system using combined data...")
+combined_hashtags = combined_df['hashtags'].apply(lambda x: ' '.join(x))
+
+if combined_hashtags.str.strip().eq('').all():
+    logging.warning("The 'hashtags' column is empty or contains only stop words. Skipping recommendation system.")
+else:
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(combined_hashtags)
+    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
+
+    def recommend_hashtags(post_index, top_n=5):
+        sim_scores = list(enumerate(cosine_sim[post_index]))
+        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+        top_indices = [i[0] for i in sim_scores[1:top_n+1]]
+        return combined_df.iloc[top_indices]['hashtags']
+
+    # Example: Recommend hashtags for the first post
+    logging.info("Example Hashtag Recommendations (Combined Data):")
+    print(recommend_hashtags(0))
+
+# Enhanced niche trend analysis using combined data
+logging.info("Analyzing enhanced niche trends using combined data...")
+combined_niche_trends = combined_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
+logging.info("Top Performing Content Types by Sentiment (Combined Data):")
+print(combined_niche_trends)
+
+# Enhanced viral potential prediction using combined data
+logging.info("Training enhanced viral potential prediction model using combined data...")
+combined_viral_threshold = combined_df['engagement_rate'].quantile(0.9)
+combined_df['viral'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= combined_viral_threshold else 0)
+
+# Features for viral potential prediction
+features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
+X = combined_df[features]
+y = combined_df['viral']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Random Forest Classifier
+viral_model = RandomForestClassifier(random_state=42)
+viral_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = viral_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Enhanced Viral Potential Model Accuracy: {accuracy:.4f}")
+
+# Feature importance
+importance = viral_model.feature_importances_
+for feature, score in zip(features, importance):
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Enhanced engagement rate predictions using combined data
+logging.info("Training enhanced engagement rate prediction model using combined data...")
+X = combined_df[features]
+y = combined_df['engagement_rate']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train an XGBoost Regressor
+engagement_model = XGBRegressor(random_state=42)
+engagement_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = engagement_model.predict(X_test)
+mae = mean_absolute_error(y_test, y_pred)
+logging.info(f"Enhanced Engagement Rate Prediction Model - MAE: {mae:.4f}")
+
+# Feature importance
+importance = engagement_model.feature_importances_
+for feature, score in zip(features, importance):
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Enhanced promotion strategy using combined data
+logging.info("Training enhanced promotion prediction model using combined data...")
+promotion_threshold = combined_df['engagement_rate'].quantile(0.8)
+combined_df['promote'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
+
+# Features for promotion prediction
+X = combined_df[features]
+y = combined_df['promote']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Logistic Regression Model
+promotion_model = LogisticRegression(random_state=42)
+promotion_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = promotion_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Enhanced Promotion Prediction Model Accuracy: {accuracy:.4f}")
+
+# Analyze content type impact
+content_type_impact = combined_df.groupby('content_type')['promote'].mean().sort_values(ascending=False)
+logging.info("Enhanced Content Type Impact on Promotion:")
+print(content_type_impact)
+
+logging.info("Enhanced analysis complete!")
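The recommender in this hunk joins each post's hashtag list into one document, vectorizes the documents with TF-IDF, and ranks the other posts by cosine similarity, skipping the post itself. The same mechanics on toy data (the hashtags below are hypothetical, not drawn from either dataset):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy hashtag documents, one per post.
docs = ['fitness gym health', 'travel beach sunset', 'gym workout health']
tfidf = TfidfVectorizer().fit_transform(docs)
sim = cosine_similarity(tfidf, tfidf)

# Rank the neighbours of post 0, excluding the post itself,
# mirroring the logic inside recommend_hashtags.
scores = sorted(enumerate(sim[0]), key=lambda s: s[1], reverse=True)
print([i for i, _ in scores[1:3]])  # [2, 1]: post 2 shares 'gym' and 'health'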
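One caveat on the combined models: the feature list includes content_type_encoded and media_type_encoded, but the competitor preprocessing above only derives caption_length, hashtag_count, and sentiment, so competitor rows in combined_df[features] would carry NaNs that the classifiers reject. A minimal sketch of encoding on the combined frame instead, assuming both sources carry raw content_type and media_type columns:

from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column on the combined frame so solved and
# competitor rows share the same integer codes (illustrative, not from app.py).
for col in ['content_type', 'media_type']:
    combined_df[f'{col}_encoded'] = LabelEncoder().fit_transform(
        combined_df[col].astype(str)
    )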