Fred808 committed on
Commit
3e6e6e2
·
verified ·
1 Parent(s): af70a14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -9
app.py CHANGED
@@ -1,12 +1,128 @@
1
- # Install Prophet if not already installed
2
- !pip install prophet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- # Import Prophet
5
- from prophet import Prophet
6
 
7
- # Remove posting_time_encoded if it's not available
8
- if 'posting_time_encoded' in solved_df.columns:
9
- solved_df.drop(columns=['posting_time_encoded'], inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  # Time-Series Model: Optimal Posting Times (using Prophet)
12
  logging.info("Training time-series model for optimal posting times using Prophet...")
@@ -34,7 +150,72 @@ y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true val
34
  mae = mean_absolute_error(y_true, y_pred)
35
  logging.info(f"Prophet Model - MAE: {mae:.4f}")
36
 
37
- # Engagement Rate Predictions (without posting_time_encoded)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  logging.info("Training model for engagement rate prediction...")
39
  features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
40
  X = solved_df[features]
@@ -55,4 +236,34 @@ logging.info(f"Engagement Rate Prediction Model - MAE: {mae:.4f}")
55
  # Feature importance
56
  importance = engagement_model.feature_importances_
57
  for feature, score in zip(features, importance):
58
- logging.info(f"Feature Importance - {feature}: {score:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import json
4
+ import ast
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.linear_model import LinearRegression, LogisticRegression
7
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
8
+ from xgboost import XGBRegressor, XGBClassifier
9
+ from sklearn.svm import SVC
10
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
11
+ from prophet import Prophet # For time-series forecasting
12
+ from tensorflow.keras.models import Sequential
13
+ from tensorflow.keras.layers import LSTM, Dense
14
+ from tensorflow.keras.callbacks import EarlyStopping
15
+ from sklearn.preprocessing import MinMaxScaler, LabelEncoder
16
+ from sklearn.feature_extraction.text import TfidfVectorizer
17
+ from sklearn.metrics.pairwise import cosine_similarity
18
+ from textblob import TextBlob # For sentiment analysis
19
+ from imblearn.over_sampling import SMOTE # For handling imbalanced data
20
+ import logging
21
+ import matplotlib.pyplot as plt
22
+ from statsmodels.tsa.stattools import adfuller # For stationarity check
23
 
24
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define mean_absolute_percentage_error function
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    BUGFIX: the original divided by y_true unconditionally, so any zero
    target injected inf/NaN into the mean.  Zero targets are now excluded
    from the average; if every target is zero, NaN is returned.

    Parameters:
        y_true: array-like of true values.
        y_pred: array-like of predicted values (same length).
    """
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')  # MAPE is undefined when all targets are zero
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
31
+
32
# Load engagement_metrics.json
logging.info("Loading engagement metrics...")
try:
    with open('engagement_metrics.json', 'r') as fh:
        engagement_metrics = json.load(fh)
except FileNotFoundError:
    logging.error("engagement_metrics.json not found. Please ensure the file exists.")
    exit()
# Flatten the nested JSON records into a tabular frame.
engagement_df = pd.json_normalize(engagement_metrics)

# Load solved.json (hashtags and captions)
logging.info("Loading solved.json...")
try:
    with open('solved.json', 'r') as fh:
        solved_data = json.load(fh)
except FileNotFoundError:
    logging.error("solved.json not found. Please ensure the file exists.")
    exit()
solved_df = pd.json_normalize(solved_data)
51
+
52
# Check for required columns in engagement data
required_columns = ['posting_time', 'likes', 'comments', 'shares']
missing_columns = [name for name in required_columns if name not in engagement_df.columns]

if missing_columns:
    logging.warning(f"Missing required columns in engagement_metrics.json: {missing_columns}")
    for absent in missing_columns:
        engagement_df[absent] = 0  # Fill with default value
    logging.info("Default values added for missing columns.")

# Handle missing values in engagement data
engagement_df.fillna({'likes': 0, 'comments': 0, 'shares': 0}, inplace=True)

# Calculate engagement_rate (total interactions per post)
engagement_df['engagement_rate'] = (
    engagement_df['likes'] + engagement_df['comments'] + engagement_df['shares']
)

# Convert posting_time to datetime in engagement data
# errors='coerce' turns unparseable timestamps into NaT; those rows are
# filtered out further down the script.
logging.info("Converting posting_time to datetime...")
engagement_df['posting_time'] = pd.to_datetime(
    engagement_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)
75
+
76
# Ensure 'caption' is treated as a string column in solved data
solved_df['caption'] = solved_df['caption'].astype(str)

# Extract hashtags from the solved data (already provided as a list)
logging.info("Extracting hashtags from solved data...")
solved_df['hashtags'] = solved_df['hashtags'].apply(
    lambda tags: tags if isinstance(tags, list) else []
)

# Filter out rows with invalid posting_time in engagement data
engagement_df = engagement_df[engagement_df['posting_time'].notna()]

# Ensure required columns exist in the solved dataset
if 'content_type' not in solved_df.columns:
    solved_df['content_type'] = 'photo'  # Default value (adjust based on your data)

if 'media_type' not in solved_df.columns:
    solved_df['media_type'] = 'image'  # Default value (adjust based on your data)

# Encode categorical columns in the solved dataset
# (the same encoder instance is re-fit per column; fit_transform resets it)
label_encoder = LabelEncoder()
solved_df['content_type_encoded'] = label_encoder.fit_transform(solved_df['content_type'])
solved_df['media_type_encoded'] = label_encoder.fit_transform(solved_df['media_type'])

# Calculate sentiment for captions in the solved dataset
logging.info("Performing sentiment analysis on captions...")
solved_df['caption_sentiment'] = solved_df['caption'].apply(
    lambda text: TextBlob(text).sentiment.polarity
)

# Use caption sentiment as the overall sentiment
solved_df['sentiment'] = solved_df['caption_sentiment']

# Feature Engineering in the solved dataset
logging.info("Performing feature engineering...")
solved_df['caption_length'] = solved_df['caption'].str.len()
solved_df['hashtag_count'] = solved_df['hashtags'].map(len)
109
+
110
# Analyze engagement data separately
logging.info("Analyzing engagement data separately...")
engagement_summary = (
    engagement_df
    .groupby('posting_time')
    .agg({'likes': 'sum',
          'comments': 'sum',
          'shares': 'sum',
          'engagement_rate': 'mean'})
    .reset_index()
)

# Plot engagement rate over time
plt.figure(figsize=(10, 6))
plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
plt.title('Engagement Rate Over Time')
plt.xlabel('Time')
plt.ylabel('Engagement Rate')
plt.show()
126
 
127
  # Time-Series Model: Optimal Posting Times (using Prophet)
128
  logging.info("Training time-series model for optimal posting times using Prophet...")
 
150
  mae = mean_absolute_error(y_true, y_pred)
151
  logging.info(f"Prophet Model - MAE: {mae:.4f}")
152
 
153
# Ensure 'hashtags' column is properly formatted
solved_df['hashtags'] = solved_df['hashtags'].apply(
    lambda tags: tags if isinstance(tags, list) and len(tags) > 0 else ['no_hashtag']
)

# Recommendation System: Hashtag and Keyword Recommendations (using solved dataset)
logging.info("Training recommendation system for hashtags...")
hashtags = solved_df['hashtags'].apply(' '.join)  # Convert list of hashtags to a single string

# Check if hashtags are empty
if hashtags.str.strip().eq('').all():
    logging.warning("The 'hashtags' column is empty or contains only stop words. Skipping recommendation system.")
else:
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(hashtags)
    # Pairwise similarity of every post's hashtag string against every other.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    def recommend_hashtags(post_index, top_n=5):
        # Rank all posts by similarity to post_index (descending) and return
        # the hashtags of the next top_n entries, skipping slot 0 (the post
        # itself, which always scores highest).
        scores = list(enumerate(cosine_sim[post_index]))
        scores.sort(key=lambda pair: pair[1], reverse=True)
        top_indices = [idx for idx, _ in scores[1:top_n + 1]]
        return solved_df.iloc[top_indices]['hashtags']

    # Example: Recommend hashtags for the first post
    logging.info("Example Hashtag Recommendations:")
    print(recommend_hashtags(0))
177
+
178
# Sentiment Analysis: Audience Reactions (using solved dataset)
logging.info("Performing sentiment analysis on captions...")

def _label_sentiment(score):
    # Map a polarity score onto a coarse three-way category.
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'

solved_df['sentiment_category'] = solved_df['sentiment'].apply(_label_sentiment)
logging.info("Sentiment Analysis Results:")
print(solved_df['sentiment_category'].value_counts())

# Niche Trend Analysis (using solved dataset)
logging.info("Analyzing niche trends...")
niche_trends = (
    solved_df.groupby('content_type')['sentiment']
    .mean()
    .sort_values(ascending=False)
)
logging.info("Top Performing Content Types by Sentiment:")
print(niche_trends)
189
+
190
# Viral Potential of Posts
logging.info("Training model for viral potential prediction...")
viral_threshold = engagement_df['engagement_rate'].quantile(0.9)
engagement_df['viral'] = (engagement_df['engagement_rate'] >= viral_threshold).astype(int)

# BUGFIX: engagement_df and solved_df are two different tables; the original
# plain column assignment aligned on index, so any solved_df row whose index
# is missing from engagement_df (rows were dropped earlier by the
# posting_time filter) received a NaN label, breaking the classifier fit.
# Reindex explicitly and treat missing rows as not viral.
# NOTE(review): this still assumes the two frames share row indices 1:1 for
# the rows that do match — confirm there is a proper join key upstream.
solved_df['viral'] = (
    engagement_df['viral'].reindex(solved_df.index).fillna(0).astype(int)
)

# Features for viral potential prediction
features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
X = solved_df[features]
y = solved_df['viral']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
viral_model = RandomForestClassifier(random_state=42)
viral_model.fit(X_train, y_train)

# Evaluate the model
y_pred = viral_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
logging.info(f"Viral Potential Model Accuracy: {accuracy:.4f}")

# Feature importance
importance = viral_model.feature_importances_
for feature, score in zip(features, importance):
    logging.info(f"Feature Importance - {feature}: {score:.4f}")
217
+
218
+ # Engagement Rate Predictions
219
  logging.info("Training model for engagement rate prediction...")
220
  features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
221
  X = solved_df[features]
 
236
  # Feature importance
237
  importance = engagement_model.feature_importances_
238
  for feature, score in zip(features, importance):
239
+ logging.info(f"Feature Importance - {feature}: {score:.4f}")
240
+
241
# Which Type of Posts Yield Greater Results When Promoted
logging.info("Training model for promotion prediction...")
promotion_threshold = engagement_df['engagement_rate'].quantile(0.8)
engagement_df['promote'] = (engagement_df['engagement_rate'] >= promotion_threshold).astype(int)

# BUGFIX: same cross-frame defect as the viral section — assigning a column
# from engagement_df into solved_df aligns on index and yields NaN labels for
# rows filtered out of engagement_df earlier.  Reindex and default missing
# rows to 0 (do not promote) so LogisticRegression gets a clean target.
solved_df['promote'] = (
    engagement_df['promote'].reindex(solved_df.index).fillna(0).astype(int)
)

# Features for promotion prediction
features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
X = solved_df[features]
y = solved_df['promote']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression Model
promotion_model = LogisticRegression(random_state=42)
promotion_model.fit(X_train, y_train)

# Evaluate the model
y_pred = promotion_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
logging.info(f"Promotion Prediction Model Accuracy: {accuracy:.4f}")

# Analyze content type impact
content_type_impact = solved_df.groupby('content_type')['promote'].mean().sort_values(ascending=False)
logging.info("Content Type Impact on Promotion:")
print(content_type_impact)

logging.info("Analysis complete!")