Update app.py
app.py CHANGED

@@ -1,12 +1,128 @@
-
-
-#
-
-#
-
-
+import pandas as pd
+import numpy as np
+import json
+import ast
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from xgboost import XGBRegressor, XGBClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from prophet import Prophet  # For time-series forecasting
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense
+from tensorflow.keras.callbacks import EarlyStopping
+from sklearn.preprocessing import MinMaxScaler, LabelEncoder
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from textblob import TextBlob  # For sentiment analysis
+from imblearn.over_sampling import SMOTE  # For handling imbalanced data
+import logging
+import matplotlib.pyplot as plt
+from statsmodels.tsa.stattools import adfuller  # For stationarity check
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
+# Define mean_absolute_percentage_error function
+def mean_absolute_percentage_error(y_true, y_pred):
+    y_true, y_pred = np.array(y_true), np.array(y_pred)
+    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
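+# NOTE: MAPE divides by y_true, so it is undefined whenever an observation is
+# zero (numpy yields inf/nan); filter or clip zero targets before relying on it.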
+
+# Load engagement_metrics.json
+logging.info("Loading engagement metrics...")
+try:
+    with open('engagement_metrics.json', 'r') as f:
+        engagement_metrics = json.load(f)
+    engagement_df = pd.json_normalize(engagement_metrics)
+except FileNotFoundError:
+    logging.error("engagement_metrics.json not found. Please ensure the file exists.")
+    exit()
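+# NOTE: exit() is injected by the interactive 'site' module; sys.exit() (with an
+# 'import sys') is the conventional way to stop a script.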
+
+# Load solved.json (hashtags and captions)
+logging.info("Loading solved.json...")
+try:
+    with open('solved.json', 'r') as f:
+        solved_data = json.load(f)
+    solved_df = pd.json_normalize(solved_data)
+except FileNotFoundError:
+    logging.error("solved.json not found. Please ensure the file exists.")
+    exit()
+
+# Check for required columns in engagement data
+required_columns = ['posting_time', 'likes', 'comments', 'shares']
+missing_columns = [col for col in required_columns if col not in engagement_df.columns]
+
+if missing_columns:
+    logging.warning(f"Missing required columns in engagement_metrics.json: {missing_columns}")
+    for col in missing_columns:
+        engagement_df[col] = 0  # Fill with default value
+    logging.info("Default values added for missing columns.")
+
+# Handle missing values in engagement data
+engagement_df.fillna({
+    'likes': 0,
+    'comments': 0,
+    'shares': 0
+}, inplace=True)
+
+# Calculate engagement_rate
+engagement_df['engagement_rate'] = engagement_df['likes'] + engagement_df['comments'] + engagement_df['shares']
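+# NOTE: despite the name, this is a raw interaction count; a true rate would
+# divide by followers or reach, if those fields were available in the data.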
+
+# Convert posting_time to datetime in engagement data
+logging.info("Converting posting_time to datetime...")
+engagement_df['posting_time'] = pd.to_datetime(engagement_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
+
+# Ensure 'caption' is treated as a string column in solved data
+solved_df['caption'] = solved_df['caption'].astype(str)
+
+# Extract hashtags from the solved data (already provided as a list)
+logging.info("Extracting hashtags from solved data...")
+solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) else [])
+
+# Filter out rows with invalid posting_time in engagement data
+engagement_df = engagement_df[engagement_df['posting_time'].notna()]
+
+# Ensure required columns exist in the solved dataset
+if 'content_type' not in solved_df.columns:
+    solved_df['content_type'] = 'photo'  # Default value (adjust based on your data)
+
+if 'media_type' not in solved_df.columns:
+    solved_df['media_type'] = 'image'  # Default value (adjust based on your data)
+
+# Encode categorical columns in the solved dataset
+label_encoder = LabelEncoder()
+solved_df['content_type_encoded'] = label_encoder.fit_transform(solved_df['content_type'])
+solved_df['media_type_encoded'] = label_encoder.fit_transform(solved_df['media_type'])
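+# NOTE: the single LabelEncoder is refit on each column, so after this point it
+# only remembers 'media_type'; use one encoder per column if inverse_transform is needed.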
+
+# Calculate sentiment for captions in the solved dataset
+logging.info("Performing sentiment analysis on captions...")
+solved_df['caption_sentiment'] = solved_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
+
+# Use caption sentiment as the overall sentiment
+solved_df['sentiment'] = solved_df['caption_sentiment']
+
+# Feature Engineering in the solved dataset
+logging.info("Performing feature engineering...")
+solved_df['caption_length'] = solved_df['caption'].apply(len)
+solved_df['hashtag_count'] = solved_df['hashtags'].apply(len)
+
+# Analyze engagement data separately
+logging.info("Analyzing engagement data separately...")
+engagement_summary = engagement_df.groupby('posting_time').agg({
+    'likes': 'sum',
+    'comments': 'sum',
+    'shares': 'sum',
+    'engagement_rate': 'mean'
+}).reset_index()
+
+# Plot engagement rate over time
+plt.figure(figsize=(10, 6))
+plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
+plt.title('Engagement Rate Over Time')
+plt.xlabel('Time')
+plt.ylabel('Engagement Rate')
+plt.show()
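+# NOTE: plt.show() blocks until the window is closed and needs a display; in a
+# headless run, plt.savefig(...) would be the safer choice.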
 
 # Time-Series Model: Optimal Posting Times (using Prophet)
 logging.info("Training time-series model for optimal posting times using Prophet...")
@@ -34,7 +150,72 @@ y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true values
 mae = mean_absolute_error(y_true, y_pred)
 logging.info(f"Prophet Model - MAE: {mae:.4f}")
 
-#
+# Ensure 'hashtags' column is properly formatted
+solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
+
+# Recommendation System: Hashtag and Keyword Recommendations (using solved dataset)
+logging.info("Training recommendation system for hashtags...")
+hashtags = solved_df['hashtags'].apply(lambda x: ' '.join(x))  # Convert list of hashtags to a single string
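+# NOTE: if the hashtag strings carry a leading '#', TfidfVectorizer's default
+# token_pattern strips it (and drops single-character tags) during tokenization.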
+
+# Check if hashtags are empty
+if hashtags.str.strip().eq('').all():
+    logging.warning("The 'hashtags' column is empty or contains only stop words. Skipping recommendation system.")
+else:
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(hashtags)
+    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
+
+    def recommend_hashtags(post_index, top_n=5):
+        sim_scores = list(enumerate(cosine_sim[post_index]))
+        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+        top_indices = [i[0] for i in sim_scores[1:top_n+1]]
+        return solved_df.iloc[top_indices]['hashtags']
+
+    # Example: Recommend hashtags for the first post
+    logging.info("Example Hashtag Recommendations:")
+    print(recommend_hashtags(0))
+
+# Sentiment Analysis: Audience Reactions (using solved dataset)
+logging.info("Performing sentiment analysis on captions...")
+solved_df['sentiment_category'] = solved_df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
+logging.info("Sentiment Analysis Results:")
+print(solved_df['sentiment_category'].value_counts())
+
+# Niche Trend Analysis (using solved dataset)
+logging.info("Analyzing niche trends...")
+niche_trends = solved_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
+logging.info("Top Performing Content Types by Sentiment:")
+print(niche_trends)
+
+# Viral Potential of Posts
+logging.info("Training model for viral potential prediction...")
+viral_threshold = engagement_df['engagement_rate'].quantile(0.9)
+engagement_df['viral'] = engagement_df['engagement_rate'].apply(lambda x: 1 if x >= viral_threshold else 0)
+solved_df['viral'] = engagement_df['viral']
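+# NOTE: this column assignment aligns on the dataframes' indices and assumes
+# solved_df and engagement_df describe the same posts; unmatched rows become NaN.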
+
+# Features for viral potential prediction
+features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
+X = solved_df[features]
+y = solved_df['viral']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Random Forest Classifier
+viral_model = RandomForestClassifier(random_state=42)
+viral_model.fit(X_train, y_train)
+
+# Evaluate the model
+y_pred = viral_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Viral Potential Model Accuracy: {accuracy:.4f}")
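+# NOTE: 'viral' marks the top decile, so classes are roughly 9:1 imbalanced and
+# accuracy alone is optimistic; the unused SMOTE import above suggests resampling
+# (or precision/recall/F1) was intended here.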
+
+# Feature importance
+importance = viral_model.feature_importances_
+for feature, score in zip(features, importance):
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Engagement Rate Predictions
 logging.info("Training model for engagement rate prediction...")
 features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
 X = solved_df[features]
@@ -55,4 +236,34 @@ logging.info(f"Engagement Rate Prediction Model - MAE: {mae:.4f}")
 # Feature importance
 importance = engagement_model.feature_importances_
 for feature, score in zip(features, importance):
-logging.info(f"Feature Importance - {feature}: {score:.4f}")
+    logging.info(f"Feature Importance - {feature}: {score:.4f}")
+
+# Which Type of Posts Yield Greater Results When Promoted
+logging.info("Training model for promotion prediction...")
+promotion_threshold = engagement_df['engagement_rate'].quantile(0.8)
+engagement_df['promote'] = engagement_df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
+solved_df['promote'] = engagement_df['promote']
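+# NOTE: same index-alignment caveat as the 'viral' column above.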
+
+# Features for promotion prediction
+features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
+X = solved_df[features]
+y = solved_df['promote']
+
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train a Logistic Regression Model
+promotion_model = LogisticRegression(random_state=42)
+promotion_model.fit(X_train, y_train)
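+# NOTE: the features are unscaled, so the default lbfgs solver may hit its
+# iteration cap; raising max_iter or standardizing X avoids a convergence warning.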
+
+# Evaluate the model
+y_pred = promotion_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+logging.info(f"Promotion Prediction Model Accuracy: {accuracy:.4f}")
+
+# Analyze content type impact
+content_type_impact = solved_df.groupby('content_type')['promote'].mean().sort_values(ascending=False)
+logging.info("Content Type Impact on Promotion:")
+print(content_type_impact)
+
+logging.info("Analysis complete!")