Fred808 committed
Commit 1ef5971 · verified · 1 Parent(s): 5748634

Update app.py

Files changed (1)
  1. app.py +203 -22
app.py CHANGED
@@ -49,6 +49,14 @@ except FileNotFoundError:
     logging.error("solved.json not found. Please ensure the file exists.")
     exit()
 
+# Load competitor data
+logging.info("Loading competitor data...")
+try:
+    competitor_df = pd.read_csv('competitors_data.csv')
+except FileNotFoundError:
+    logging.error("competitors_data.csv not found. Please ensure the file exists.")
+    exit()
+
 # Check for required columns in engagement data
 required_columns = ['posting_time', 'likes', 'comments', 'shares']
 missing_columns = [col for col in required_columns if col not in engagement_df.columns]
@@ -118,37 +126,57 @@ engagement_summary = engagement_df.groupby('posting_time').agg({
 
 # Plot engagement rate over time
 plt.figure(figsize=(10, 6))
-plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
+plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'], label='Your Data')
 plt.title('Engagement Rate Over Time')
 plt.xlabel('Time')
 plt.ylabel('Engagement Rate')
+plt.legend()
 plt.show()
 
 # Time-Series Model: Optimal Posting Times (using Prophet)
 logging.info("Training time-series model for optimal posting times using Prophet...")
 time_series_data = engagement_summary[['posting_time', 'engagement_rate']].rename(columns={'posting_time': 'ds', 'engagement_rate': 'y'})
 
-# Train Prophet model
-prophet_model = Prophet()
-prophet_model.fit(time_series_data)
+# Drop rows with NaN values in the target column
+time_series_data = time_series_data.dropna(subset=['y'])
 
-# Make future predictions
-future = prophet_model.make_future_dataframe(periods=30) # Predict for the next 30 days
-forecast = prophet_model.predict(future)
-
-# Plot the forecast
-fig = prophet_model.plot(forecast)
-plt.title('Engagement Rate Forecast (Prophet)')
-plt.xlabel('Date')
-plt.ylabel('Engagement Rate')
-plt.show()
-
-# Evaluate the model
-from sklearn.metrics import mean_absolute_error
-y_true = time_series_data['y']
-y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true values
-mae = mean_absolute_error(y_true, y_pred)
-logging.info(f"Prophet Model - MAE: {mae:.4f}")
+# Check if there is enough data to train the Prophet model
+if len(time_series_data) < 2:
+    logging.warning("Not enough data to train the Prophet model. Skipping time-series analysis.")
+else:
+    # Resample the data to a fixed frequency (e.g., daily)
+    time_series_data = time_series_data.set_index('ds').resample('D').mean().reset_index()
+
+    # Train Prophet model with hyperparameters
+    prophet_model = Prophet(
+        changepoint_prior_scale=0.05, # Adjust sensitivity to trend changes
+        seasonality_prior_scale=10.0, # Adjust seasonality strength
+        yearly_seasonality=True, # Enable yearly seasonality
+        daily_seasonality=True # Enable daily seasonality
+    )
+    prophet_model.fit(time_series_data)
+
+    # Make future predictions
+    future = prophet_model.make_future_dataframe(periods=30) # Predict for the next 30 days
+    forecast = prophet_model.predict(future)
+
+    # Plot the forecast
+    fig = prophet_model.plot(forecast)
+    plt.title('Engagement Rate Forecast (Prophet)')
+    plt.xlabel('Date')
+    plt.ylabel('Engagement Rate')
+    plt.show()
+
+    # Evaluate the model
+    y_true = time_series_data['y']
+    y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true values
+
+    # Check for NaN values in y_true or y_pred
+    if y_true.isnull().any() or y_pred.isnull().any():
+        logging.warning("NaN values detected in true values or predictions. Skipping MAE calculation.")
+    else:
+        mae = mean_absolute_error(y_true, y_pred)
+        logging.info(f"Prophet Model - MAE: {mae:.4f}")
 
 # Ensure 'hashtags' column is properly formatted
 solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
@@ -266,4 +294,157 @@ content_type_impact = solved_df.groupby('content_type')['promote'].mean().sort_v
 logging.info("Content Type Impact on Promotion:")
 print(content_type_impact)
 
-logging.info("Analysis complete!")
+# Competitor Data Analysis
+logging.info("Analyzing competitor data...")
+
+# Preprocess competitor data
+competitor_df['caption'] = competitor_df['caption'].astype(str)
+competitor_df['hashtags'] = competitor_df['hashtags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
+competitor_df['posting_time'] = pd.to_datetime(competitor_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
+competitor_df = competitor_df[competitor_df['posting_time'].notna()]
+
+# Calculate engagement_rate for competitor data
+competitor_df['engagement_rate'] = competitor_df['likes'] + competitor_df['comments'] + competitor_df['shares']
+
+# Perform sentiment analysis on competitor captions
+logging.info("Performing sentiment analysis on competitor captions...")
+competitor_df['caption_sentiment'] = competitor_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
+competitor_df['sentiment'] = competitor_df['caption_sentiment']
+
+# Feature engineering for competitor data
+logging.info("Performing feature engineering for competitor data...")
+competitor_df['caption_length'] = competitor_df['caption'].apply(len)
+competitor_df['hashtag_count'] = competitor_df['hashtags'].apply(len)
+
+# Analyze competitor engagement data
+logging.info("Analyzing competitor engagement data...")
+competitor_summary = competitor_df.groupby('posting_time').agg({
+    'likes': 'sum',
+    'comments': 'sum',
+    'shares': 'sum',
+    'engagement_rate': 'mean'
+}).reset_index()
+
+# Plot competitor engagement rate over time
+plt.figure(figsize=(10, 6))
+plt.plot(competitor_summary['posting_time'], competitor_summary['engagement_rate'], label='Competitor')
+plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'], label='Your Data')
+plt.title('Engagement Rate Over Time (Your Data vs Competitor)')
+plt.xlabel('Time')
+plt.ylabel('Engagement Rate')
+plt.legend()
+plt.show()
+
+# Analyze competitor niche trends
+logging.info("Analyzing competitor niche trends...")
+competitor_niche_trends = competitor_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
+logging.info("Top Performing Content Types by Sentiment (Competitor):")
+print(competitor_niche_trends)
+
+# Combine your data with competitor data for enhanced analysis
+logging.info("Combining your data with competitor data...")
+combined_df = pd.concat([solved_df, competitor_df], ignore_index=True)
+
+# Enhanced recommendation system using combined data
+logging.info("Training enhanced recommendation system using combined data...")
+combined_hashtags = combined_df['hashtags'].apply(lambda x: ' '.join(x))
+
+if combined_hashtags.str.strip().eq('').all():
+    logging.warning("The 'hashtags' column is empty or contains only stop words. Skipping recommendation system.")
+else:
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(combined_hashtags)
+    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
+
+    def recommend_hashtags(post_index, top_n=5):
+        sim_scores = list(enumerate(cosine_sim[post_index]))
+        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+        top_indices = [i[0] for i in sim_scores[1:top_n+1]]
+        return combined_df.iloc[top_indices]['hashtags']
+
+    # Example: Recommend hashtags for the first post
+    logging.info("Example Hashtag Recommendations (Combined Data):")
+    print(recommend_hashtags(0))
+
+# Enhanced niche trend analysis using combined data
+logging.info("Analyzing enhanced niche trends using combined data...")
+combined_niche_trends = combined_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
+logging.info("Top Performing Content Types by Sentiment (Combined Data):")
+print(combined_niche_trends)
+
+# Enhanced viral potential prediction using combined data
+logging.info("Training enhanced viral potential prediction model using combined data...")
+combined_viral_threshold = combined_df['engagement_rate'].quantile(0.9)
+combined_df['viral'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= combined_viral_threshold else 0)
+
+# Features for viral potential prediction
+features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
+X = combined_df[features]
+y = combined_df['viral']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Random Forest Classifier
+viral_model = RandomForestClassifier(random_state=42)
+viral_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = viral_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Enhanced Viral Potential Model Accuracy: {accuracy:.4f}")
+
+# Feature importance
+importance = viral_model.feature_importances_
+for feature, score in zip(features, importance):
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Enhanced engagement rate predictions using combined data
+logging.info("Training enhanced engagement rate prediction model using combined data...")
+X = combined_df[features]
+y = combined_df['engagement_rate']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train an XGBoost Regressor
+engagement_model = XGBRegressor(random_state=42)
+engagement_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = engagement_model.predict(X_test)
+mae = mean_absolute_error(y_test, y_pred)
+logging.info(f"Enhanced Engagement Rate Prediction Model - MAE: {mae:.4f}")
+
+# Feature importance
+importance = engagement_model.feature_importances_
+for feature, score in zip(features, importance):
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Enhanced promotion strategy using combined data
+logging.info("Training enhanced promotion prediction model using combined data...")
+promotion_threshold = combined_df['engagement_rate'].quantile(0.8)
+combined_df['promote'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
+
+# Features for promotion prediction
+X = combined_df[features]
+y = combined_df['promote']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Logistic Regression Model
+promotion_model = LogisticRegression(random_state=42)
+promotion_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = promotion_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Enhanced Promotion Prediction Model Accuracy: {accuracy:.4f}")
+
+# Analyze content type impact
+content_type_impact = combined_df.groupby('content_type')['promote'].mean().sort_values(ascending=False)
+logging.info("Enhanced Content Type Impact on Promotion:")
+print(content_type_impact)
+
+logging.info("Enhanced analysis complete!")