Fred808 committed on
Commit
c8d040b
·
verified ·
1 Parent(s): 78be209

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -266
app.py CHANGED
@@ -1,250 +1,42 @@
1
- import pandas as pd
2
- import numpy as np
3
- import json
4
- import ast
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.linear_model import LinearRegression, LogisticRegression
7
- from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
8
- from xgboost import XGBRegressor, XGBClassifier
9
- from sklearn.svm import SVC
10
- from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
11
- from statsmodels.tsa.arima.model import ARIMA
12
- from tensorflow.keras.models import Sequential
13
- from tensorflow.keras.layers import LSTM, Dense
14
- from tensorflow.keras.callbacks import EarlyStopping
15
- from sklearn.preprocessing import MinMaxScaler, LabelEncoder
16
- from sklearn.feature_extraction.text import TfidfVectorizer
17
- from sklearn.metrics.pairwise import cosine_similarity
18
- from textblob import TextBlob # For sentiment analysis
19
- from imblearn.over_sampling import SMOTE # For handling imbalanced data
20
- import logging
21
- import matplotlib.pyplot as plt
22
- from statsmodels.tsa.stattools import adfuller # For stationarity check
23
 
24
- # Set up logging
25
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
26
 
27
- # Define mean_absolute_percentage_error function
28
- def mean_absolute_percentage_error(y_true, y_pred):
29
- y_true, y_pred = np.array(y_true), np.array(y_pred)
30
- return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
31
 
32
- # Load engagement_metrics.json
33
- logging.info("Loading engagement metrics...")
34
- try:
35
- with open('engagement_metrics.json', 'r') as f:
36
- engagement_metrics = json.load(f)
37
- engagement_df = pd.json_normalize(engagement_metrics)
38
- except FileNotFoundError:
39
- logging.error("engagement_metrics.json not found. Please ensure the file exists.")
40
- exit()
41
 
42
- # Load solved.json (hashtags and captions)
43
- logging.info("Loading solved.json...")
44
- try:
45
- with open('solved.json', 'r') as f:
46
- solved_data = json.load(f)
47
- solved_df = pd.json_normalize(solved_data)
48
- except FileNotFoundError:
49
- logging.error("solved.json not found. Please ensure the file exists.")
50
- exit()
51
 
52
- # Check for required columns in engagement data
53
- required_columns = ['posting_time', 'likes', 'comments', 'shares']
54
- missing_columns = [col for col in required_columns if col not in engagement_df.columns]
55
 
56
- if missing_columns:
57
- logging.warning(f"Missing required columns in engagement_metrics.json: {missing_columns}")
58
- for col in missing_columns:
59
- engagement_df[col] = 0 # Fill with default value
60
- logging.info("Default values added for missing columns.")
61
-
62
- # Handle missing values in engagement data
63
- engagement_df.fillna({
64
- 'likes': 0,
65
- 'comments': 0,
66
- 'shares': 0
67
- }, inplace=True)
68
-
69
- # Calculate engagement_rate
70
- engagement_df['engagement_rate'] = engagement_df['likes'] + engagement_df['comments'] + engagement_df['shares']
71
-
72
- # Convert posting_time to datetime in engagement data
73
- logging.info("Converting posting_time to datetime...")
74
- engagement_df['posting_time'] = pd.to_datetime(engagement_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
75
-
76
- # Ensure 'caption' is treated as a string column in solved data
77
- solved_df['caption'] = solved_df['caption'].astype(str)
78
-
79
- # Extract hashtags from the solved data (already provided as a list)
80
- logging.info("Extracting hashtags from solved data...")
81
- solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) else [])
82
-
83
- # Filter out rows with invalid posting_time in engagement data
84
- engagement_df = engagement_df[engagement_df['posting_time'].notna()]
85
-
86
- # Convert posting_time to Unix timestamp in engagement data (for time-based operations)
87
- logging.info("Converting posting_time to Unix timestamp...")
88
- engagement_df['posting_time_encoded'] = engagement_df['posting_time'].astype(int) / 10**9
89
-
90
- # Ensure required columns exist in the solved dataset
91
- if 'content_type' not in solved_df.columns:
92
- solved_df['content_type'] = 'photo' # Default value (adjust based on your data)
93
-
94
- if 'media_type' not in solved_df.columns:
95
- solved_df['media_type'] = 'image' # Default value (adjust based on your data)
96
-
97
- # Encode categorical columns in the solved dataset
98
- label_encoder = LabelEncoder()
99
- solved_df['content_type_encoded'] = label_encoder.fit_transform(solved_df['content_type'])
100
- solved_df['media_type_encoded'] = label_encoder.fit_transform(solved_df['media_type'])
101
-
102
- # Calculate sentiment for captions in the solved dataset
103
- logging.info("Performing sentiment analysis on captions...")
104
- solved_df['caption_sentiment'] = solved_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
105
-
106
- # Use caption sentiment as the overall sentiment
107
- solved_df['sentiment'] = solved_df['caption_sentiment']
108
-
109
- # Feature Engineering in the solved dataset
110
- logging.info("Performing feature engineering...")
111
- solved_df['caption_length'] = solved_df['caption'].apply(len)
112
- solved_df['hashtag_count'] = solved_df['hashtags'].apply(len)
113
-
114
- # Analyze engagement data separately
115
- logging.info("Analyzing engagement data separately...")
116
- engagement_summary = engagement_df.groupby('posting_time').agg({
117
- 'likes': 'sum',
118
- 'comments': 'sum',
119
- 'shares': 'sum',
120
- 'engagement_rate': 'mean'
121
- }).reset_index()
122
-
123
- # Plot engagement rate over time
124
- plt.figure(figsize=(10, 6))
125
- plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
126
- plt.title('Engagement Rate Over Time')
127
- plt.xlabel('Time')
128
  plt.ylabel('Engagement Rate')
129
  plt.show()
130
 
131
- # Time-Series Model: Optimal Posting Times (using engagement data)
132
- logging.info("Training time-series model for optimal posting times...")
133
- time_series_data = engagement_summary.set_index('posting_time')
134
-
135
- # Check for NaN values
136
- print("NaN values in time-series data:")
137
- print(time_series_data.isnull().sum())
138
-
139
- # Check if the 'engagement_rate' column is empty or entirely NaN
140
- if time_series_data['engagement_rate'].isnull().all() or len(time_series_data) == 0:
141
- logging.warning("The 'engagement_rate' column is empty or entirely NaN. Skipping stationarity check and ARIMA modeling.")
142
- else:
143
- # Check stationarity using ADF test
144
- engagement_rate_cleaned = time_series_data['engagement_rate'].dropna()
145
- if len(engagement_rate_cleaned) == 0:
146
- logging.warning("The 'engagement_rate' column is empty after dropping NaN values. Skipping stationarity check.")
147
- else:
148
- result = adfuller(engagement_rate_cleaned)
149
- print('ADF Statistic:', result[0])
150
- print('p-value:', result[1])
151
- print('Critical Values:', result[4])
152
-
153
- # If the data is not stationary, apply differencing
154
- if result[1] > 0.05:
155
- print("Data is not stationary. Applying differencing...")
156
- time_series_data['engagement_rate_diff'] = time_series_data['engagement_rate'].diff().dropna()
157
- time_series_data = time_series_data.dropna() # Drop NaN values after differencing
158
- if len(time_series_data) == 0:
159
- logging.warning("The differenced 'engagement_rate' column is empty. Skipping ARIMA modeling.")
160
- else:
161
- print("ADF test after differencing:")
162
- result_diff = adfuller(time_series_data['engagement_rate_diff'])
163
- print('ADF Statistic:', result_diff[0])
164
- print('p-value:', result_diff[1])
165
- print('Critical Values:', result_diff[4])
166
-
167
- # Train ARIMA model on stationary data
168
- if len(time_series_data) > 0:
169
- train_size = int(len(time_series_data) * 0.8)
170
- train, test = time_series_data[:train_size], time_series_data[train_size:]
171
- if 'engagement_rate_diff' in time_series_data.columns:
172
- arima_model = ARIMA(train['engagement_rate_diff'], order=(5, 1, 0))
173
- else:
174
- arima_model = ARIMA(train['engagement_rate'], order=(5, 1, 0))
175
- arima_fit = arima_model.fit()
176
- predictions = arima_fit.forecast(steps=len(test))
177
- mape = mean_absolute_percentage_error(test['engagement_rate'], predictions)
178
- logging.info(f"ARIMA Model: MAPE: {mape:.4f}")
179
-
180
- # Ensure 'hashtags' column is properly formatted
181
- solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
182
-
183
- # Recommendation System: Hashtag and Keyword Recommendations (using solved dataset)
184
- logging.info("Training recommendation system for hashtags...")
185
- hashtags = solved_df['hashtags'].apply(lambda x: ' '.join(x)) # Convert list of hashtags to a single string
186
-
187
- # Check if hashtags are empty
188
- if hashtags.str.strip().eq('').all():
189
- logging.warning("The 'hashtags' column is empty or contains only stop words. Skipping recommendation system.")
190
- else:
191
- vectorizer = TfidfVectorizer()
192
- tfidf_matrix = vectorizer.fit_transform(hashtags)
193
- cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
194
-
195
- def recommend_hashtags(post_index, top_n=5):
196
- sim_scores = list(enumerate(cosine_sim[post_index]))
197
- sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
198
- top_indices = [i[0] for i in sim_scores[1:top_n+1]]
199
- return solved_df.iloc[top_indices]['hashtags']
200
-
201
- # Example: Recommend hashtags for the first post
202
- logging.info("Example Hashtag Recommendations:")
203
- print(recommend_hashtags(0))
204
-
205
- # Sentiment Analysis: Audience Reactions (using solved dataset)
206
- logging.info("Performing sentiment analysis on captions...")
207
- solved_df['sentiment_category'] = solved_df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
208
- logging.info("Sentiment Analysis Results:")
209
- print(solved_df['sentiment_category'].value_counts())
210
-
211
- # Niche Trend Analysis (using solved dataset)
212
- logging.info("Analyzing niche trends...")
213
- niche_trends = solved_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
214
- logging.info("Top Performing Content Types by Sentiment:")
215
- print(niche_trends)
216
-
217
- # Viral Potential of Posts
218
- logging.info("Training model for viral potential prediction...")
219
- viral_threshold = engagement_df['engagement_rate'].quantile(0.9)
220
- engagement_df['viral'] = engagement_df['engagement_rate'].apply(lambda x: 1 if x >= viral_threshold else 0)
221
- solved_df['viral'] = engagement_df['viral']
222
-
223
- # Features for viral potential prediction
224
- features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
225
- X = solved_df[features]
226
- y = solved_df['viral']
227
-
228
- # Split data into training and testing sets
229
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
230
-
231
- # Train a Random Forest Classifier
232
- viral_model = RandomForestClassifier(random_state=42)
233
- viral_model.fit(X_train, y_train)
234
-
235
  # Evaluate the model
236
- y_pred = viral_model.predict(X_test)
237
- accuracy = accuracy_score(y_test, y_pred)
238
- logging.info(f"Viral Potential Model Accuracy: {accuracy:.4f}")
239
-
240
- # Feature importance
241
- importance = viral_model.feature_importances_
242
- for feature, score in zip(features, importance):
243
- logging.info(f"Feature Importance - {feature}: {score:.4f}")
244
 
245
- # Engagement Rate Predictions
246
  logging.info("Training model for engagement rate prediction...")
247
- features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded', 'posting_time_encoded']
248
  X = solved_df[features]
249
  y = engagement_df['engagement_rate']
250
 
@@ -263,34 +55,4 @@ logging.info(f"Engagement Rate Prediction Model - MAE: {mae:.4f}")
263
  # Feature importance
264
  importance = engagement_model.feature_importances_
265
  for feature, score in zip(features, importance):
266
- logging.info(f"Feature Importance - {feature}: {score:.4f}")
267
-
268
- # Which Type of Posts Yield Greater Results When Promoted
269
- logging.info("Training model for promotion prediction...")
270
- promotion_threshold = engagement_df['engagement_rate'].quantile(0.8)
271
- engagement_df['promote'] = engagement_df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
272
- solved_df['promote'] = engagement_df['promote']
273
-
274
- # Features for promotion prediction
275
- features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
276
- X = solved_df[features]
277
- y = solved_df['promote']
278
-
279
- # Split data into training and testing sets
280
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
281
-
282
- # Train a Logistic Regression Model
283
- promotion_model = LogisticRegression(random_state=42)
284
- promotion_model.fit(X_train, y_train)
285
-
286
- # Evaluate the model
287
- y_pred = promotion_model.predict(X_test)
288
- accuracy = accuracy_score(y_test, y_pred)
289
- logging.info(f"Promotion Prediction Model Accuracy: {accuracy:.4f}")
290
-
291
- # Analyze content type impact
292
- content_type_impact = solved_df.groupby('content_type')['promote'].mean().sort_values(ascending=False)
293
- logging.info("Content Type Impact on Promotion:")
294
- print(content_type_impact)
295
-
296
- logging.info("Analysis complete!")
 
1
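+ # Install Prophet if not already installed (NOTE: the "!pip" line below is a notebook shell escape and is not valid in a plain .py file; prefer listing prophet in requirements.txt)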
2
+ !pip install prophet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ # Import Prophet
5
+ from prophet import Prophet
6
 
7
+ # Drop posting_time_encoded from solved_df if it is present (it is no longer used as a feature)
8
+ if 'posting_time_encoded' in solved_df.columns:
9
+ solved_df.drop(columns=['posting_time_encoded'], inplace=True)
 
10
 
11
+ # Time-Series Model: Optimal Posting Times (using Prophet)
12
+ logging.info("Training time-series model for optimal posting times using Prophet...")
13
+ time_series_data = engagement_summary[['posting_time', 'engagement_rate']].rename(columns={'posting_time': 'ds', 'engagement_rate': 'y'})
 
 
 
 
 
 
14
 
15
+ # Train Prophet model
16
+ prophet_model = Prophet()
17
+ prophet_model.fit(time_series_data)
 
 
 
 
 
 
18
 
19
+ # Make future predictions
20
+ future = prophet_model.make_future_dataframe(periods=30) # Predict for the next 30 days
21
+ forecast = prophet_model.predict(future)
22
 
23
+ # Plot the forecast
24
+ fig = prophet_model.plot(forecast)
25
+ plt.title('Engagement Rate Forecast (Prophet)')
26
+ plt.xlabel('Date')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  plt.ylabel('Engagement Rate')
28
  plt.show()
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # Evaluate the model
31
+ from sklearn.metrics import mean_absolute_error
32
+ y_true = time_series_data['y']
33
+ y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true values
34
+ mae = mean_absolute_error(y_true, y_pred)
35
+ logging.info(f"Prophet Model - MAE: {mae:.4f}")
 
 
 
36
 
37
+ # Engagement Rate Predictions (without posting_time_encoded)
38
  logging.info("Training model for engagement rate prediction...")
39
+ features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
40
  X = solved_df[features]
41
  y = engagement_df['engagement_rate']
42
 
 
55
  # Feature importance
56
  importance = engagement_model.feature_importances_
57
  for feature, score in zip(features, importance):
58
+ logging.info(f"Feature Importance - {feature}: {score:.4f}")