Sam Fred committed on
Commit
40fb94f
·
1 Parent(s): 555c6af
app.py CHANGED
@@ -1,375 +1,26 @@
1
- import os
2
- import pandas as pd
3
- import numpy as np
4
- import json
5
- import logging
6
- import re
7
- import requests
8
- from io import BytesIO
9
- from PIL import Image
10
- import pytesseract
11
- from textblob import TextBlob
12
- from sklearn.model_selection import train_test_split
13
- from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
14
- from xgboost import XGBRegressor
15
- from sklearn.linear_model import LogisticRegression
16
- from sklearn.metrics import mean_absolute_error, accuracy_score
17
- from sklearn.preprocessing import LabelEncoder
18
- import torch
19
- from torchvision import transforms
20
- import matplotlib.pyplot as plt
21
- import seaborn as sns
22
- from collections import Counter
23
- import pickle
24
- from transformers import ResNetForImageClassification
25
- from prophet import Prophet
26
 
27
  # Set up logging
28
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
 
30
- # Set the working directory to a writable location
31
- WORKING_DIR = "/app" # Use /app for temporary files
32
- os.makedirs(WORKING_DIR, exist_ok=True)
33
- os.chdir(WORKING_DIR)
34
-
35
- # Verify the current directory
36
- logging.info(f"Current working directory: {os.getcwd()}")
37
-
38
- # Cache file to store extracted text
39
- CACHE_FILE = os.path.join(WORKING_DIR, "image_text_cache.pkl")
40
-
41
- # Load cache if it exists
42
- if os.path.exists(CACHE_FILE):
43
- with open(CACHE_FILE, "rb") as f:
44
- cache = pickle.load(f)
45
- else:
46
- cache = {}
47
-
48
- # Define mean_absolute_percentage_error function
49
- def mean_absolute_percentage_error(y_true, y_pred):
50
- y_true, y_pred = np.array(y_true), np.array(y_pred)
51
- return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
52
-
53
- # Load engagement_metrics.json (your company's data)
54
- logging.info("Loading your company's engagement metrics...")
55
- try:
56
- with open('engagement_metrics.json', 'r') as f:
57
- engagement_metrics = json.load(f)
58
- your_df = pd.json_normalize(engagement_metrics)
59
- except FileNotFoundError:
60
- logging.error("engagement_metrics.json not found. Please ensure the file exists.")
61
- exit()
62
-
63
-
64
-
65
- # Load solved.json (your company's hashtags and captions)
66
- logging.info("Loading your company's solved data...")
67
- try:
68
- with open('solved.json', 'r') as f:
69
- solved_data = json.load(f)
70
- solved_df = pd.json_normalize(solved_data)
71
- except FileNotFoundError:
72
- logging.error("solved.json not found. Please ensure the file exists.")
73
- exit()
74
-
75
- # Load competitor data from JSON
76
- logging.info("Loading competitor data from JSON...")
77
- try:
78
- with open('competitors_data.json', 'r') as f:
79
- competitor_data = json.load(f)
80
- competitor_df = pd.json_normalize(competitor_data['eazylancer_posts'])
81
- except FileNotFoundError:
82
- logging.error("competitors_data.json not found. Please ensure the file exists.")
83
- exit()
84
-
85
- # Ensure required columns exist in your company's data
86
- required_columns = ['likes', 'comments', 'shares', 'posting_time', 'caption', 'hashtags']
87
- missing_columns = [col for col in required_columns if col not in your_df.columns]
88
-
89
- if missing_columns:
90
- logging.warning(f"Missing required columns in your company's data: {missing_columns}")
91
- for col in missing_columns:
92
- if col in ['likes', 'comments', 'shares']:
93
- your_df[col] = 0 # Fill with default value (integer)
94
- elif col == 'caption':
95
- your_df[col] = '' # Fill with default value (empty string)
96
- elif col == 'hashtags':
97
- your_df[col] = [[] for _ in range(len(your_df))] # Fill with default value (list of empty lists)
98
- logging.info("Default values added for missing columns.")
99
-
100
- # Ensure required columns exist in competitor data
101
- required_columns = ['caption', 'hashtags', 'likes', 'comments', 'date']
102
- missing_columns = [col for col in required_columns if col not in competitor_df.columns]
103
-
104
- if missing_columns:
105
- logging.warning(f"Missing required columns in competitor data: {missing_columns}")
106
- for col in missing_columns:
107
- if col == 'caption':
108
- competitor_df[col] = '' # Fill with default value (empty string)
109
- elif col == 'hashtags':
110
- competitor_df[col] = [[] for _ in range(len(competitor_df))] # Fill with default value (list of empty lists)
111
- else:
112
- competitor_df[col] = 0 # Fill with default value (integer)
113
- logging.info("Default values added for missing columns.")
114
-
115
- # Process your company's data
116
- logging.info("Processing your company's data...")
117
- your_df['posting_time'] = pd.to_datetime(your_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
118
- your_df = your_df[your_df['posting_time'].notna()]
119
- your_df['engagement_rate'] = your_df['likes'] + your_df['comments'] + your_df['shares']
120
- your_df['caption_length'] = your_df['caption'].apply(len)
121
- your_df['hashtag_count'] = your_df['hashtags'].apply(len)
122
- your_df['caption_sentiment'] = your_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
123
- your_df['sentiment'] = your_df['caption_sentiment']
124
-
125
- # Process competitor data
126
- logging.info("Processing competitor data...")
127
- competitor_df['posting_time'] = pd.to_datetime(competitor_df['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
128
- competitor_df = competitor_df[competitor_df['posting_time'].notna()]
129
- competitor_df['engagement_rate'] = competitor_df['likes'] + competitor_df['comments']
130
- competitor_df['caption_length'] = competitor_df['caption'].apply(len)
131
- competitor_df['hashtag_count'] = competitor_df['hashtags'].apply(len)
132
- competitor_df['caption_sentiment'] = competitor_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
133
- competitor_df['sentiment'] = competitor_df['caption_sentiment']
134
-
135
- # Combine your company's data and competitor data for model training
136
- logging.info("Combining your company's data and competitor data for model training...")
137
- combined_df = pd.concat([your_df, competitor_df], ignore_index=True)
138
-
139
- # Encode categorical columns if they exist
140
- if 'content_type' in combined_df.columns and 'media_type' in combined_df.columns:
141
- logging.info("Encoding categorical columns...")
142
- label_encoder = LabelEncoder()
143
- combined_df['content_type_encoded'] = label_encoder.fit_transform(combined_df['content_type'])
144
- combined_df['media_type_encoded'] = label_encoder.fit_transform(combined_df['media_type'])
145
- features = ['caption_length', 'hashtag_count', 'sentiment', 'content_type_encoded', 'media_type_encoded']
146
- else:
147
- logging.warning("'content_type' or 'media_type' columns not found. Skipping encoding.")
148
- features = ['caption_length', 'hashtag_count', 'sentiment']
149
-
150
- # Log the features being used
151
- logging.info(f"Features for model training: {features}")
152
-
153
- # Viral Potential Prediction
154
- logging.info("Training viral potential prediction model...")
155
- combined_viral_threshold = combined_df['engagement_rate'].quantile(0.9)
156
- combined_df['viral'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= combined_viral_threshold else 0)
157
-
158
- X = combined_df[features]
159
- y = combined_df['viral']
160
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
161
-
162
- viral_model = RandomForestClassifier(random_state=42)
163
- viral_model.fit(X_train, y_train)
164
- y_pred = viral_model.predict(X_test)
165
- accuracy = accuracy_score(y_test, y_pred)
166
- logging.info(f"Viral Potential Model Accuracy: {accuracy:.4f}")
167
-
168
- # Engagement Rate Prediction
169
- logging.info("Training engagement rate prediction model...")
170
- X = combined_df[features]
171
- y = combined_df['engagement_rate']
172
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
173
-
174
- engagement_model = XGBRegressor(random_state=42)
175
- engagement_model.fit(X_train, y_train)
176
- y_pred = engagement_model.predict(X_test)
177
- mae = mean_absolute_error(y_test, y_pred)
178
- logging.info(f"Engagement Rate Prediction Model - MAE: {mae:.4f}")
179
-
180
- # Promotion Strategy
181
- logging.info("Training promotion prediction model...")
182
- promotion_threshold = combined_df['engagement_rate'].quantile(0.8)
183
- combined_df['promote'] = combined_df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
184
-
185
- X = combined_df[features]
186
- y = combined_df['promote']
187
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
188
-
189
- promotion_model = LogisticRegression(random_state=42)
190
- promotion_model.fit(X_train, y_train)
191
- y_pred = promotion_model.predict(X_test)
192
- accuracy = accuracy_score(y_test, y_pred)
193
- logging.info(f"Promotion Prediction Model Accuracy: {accuracy:.4f}")
194
-
195
- # Sentiment Analysis
196
- logging.info("Performing sentiment analysis on captions...")
197
- combined_df['sentiment_category'] = combined_df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
198
- logging.info("Sentiment Analysis Results:")
199
- print(combined_df['sentiment_category'].value_counts())
200
-
201
- # Niche Trend Analysis
202
- logging.info("Analyzing niche trends...")
203
- if 'content_type' in combined_df.columns:
204
- niche_trends = combined_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
205
- logging.info("Top Performing Content Types by Sentiment:")
206
- print(niche_trends)
207
- else:
208
- logging.warning("'content_type' column not found. Skipping niche trend analysis.")
209
-
210
- # Trending Hashtags
211
- logging.info("Analyzing trending hashtags...")
212
- trending_hashtags = combined_df['hashtags'].explode().value_counts().head(10)
213
- logging.info("Top 10 Trending Hashtags:")
214
- print(trending_hashtags)
215
-
216
- # Trending Keywords
217
- logging.info("Analyzing trending keywords in captions...")
218
- words = combined_df['caption'].apply(lambda x: re.findall(r'\b\w+\b', x.lower())).explode()
219
- trending_keywords = Counter(words).most_common(10)
220
- logging.info("Top 10 Trending Keywords in Captions:")
221
- print(trending_keywords)
222
-
223
- # Engagement Heatmap by Time of Day (using combined data)
224
- logging.info("Creating engagement heatmap by time of day...")
225
- combined_df['hour'] = combined_df['posting_time'].dt.hour
226
- engagement_by_hour = combined_df.groupby('hour')['engagement_rate'].mean().reset_index()
227
-
228
- plt.figure(figsize=(10, 6))
229
- sns.heatmap(engagement_by_hour.pivot_table(index='hour', values='engagement_rate'), annot=True, cmap='YlGnBu')
230
- plt.title('Engagement Heatmap by Time of Day')
231
- plt.xlabel('Engagement Rate')
232
- plt.ylabel('Hour of Day')
233
- plt.show()
234
-
235
- def resize_image(image, max_size=(800, 600)):
236
- """Resize an image to the specified maximum size."""
237
- image.thumbnail(max_size)
238
- return image
239
-
240
- # Function to extract text from an image
241
- def extract_text_from_image(image):
242
- """Extract text from an image using OCR."""
243
- try:
244
- # Resize the image
245
- image = resize_image(image)
246
- # Extract text using pytesseract
247
- text = pytesseract.image_to_string(image)
248
- return text
249
- except Exception as e:
250
- logging.error(f"Error extracting text from image: {e}")
251
- return ""
252
-
253
- # Function to analyze image content using a pre-trained model
254
- def analyze_image(image):
255
- """Analyze image content using a pre-trained model."""
256
- try:
257
- # Preprocess the image
258
- preprocess = transforms.Compose([
259
- transforms.Resize(256),
260
- transforms.CenterCrop(224),
261
- transforms.ToTensor(),
262
- transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
263
- ])
264
- image_tensor = preprocess(image).unsqueeze(0)
265
-
266
- # Use the pre-trained ResNet model
267
- with torch.no_grad():
268
- output = model(image_tensor)
269
- return output
270
- except Exception as e:
271
- logging.error(f"Error analyzing image: {e}")
272
- return None
273
-
274
- # Function to rate an image based on visual appeal and text quality
275
- def rate_image(image, caption):
276
- """Rate an image based on visual appeal and text quality."""
277
- # Analyze the image
278
- image_analysis = analyze_image(image)
279
- if image_analysis is None:
280
- return 0.0 # Return a default score if analysis fails
281
-
282
- # Visual appeal (placeholder for image analysis score)
283
- visual_appeal = 0.5 # Replace with actual image analysis logic
284
-
285
- # Text quality (based on caption sentiment and length)
286
- text_quality = 0.3 * TextBlob(caption).sentiment.polarity + 0.2 * len(caption)
287
-
288
- # Combine factors into a weighted score
289
- score = 0.6 * visual_appeal + 0.4 * text_quality
290
- return score
291
-
292
- # Example usage
293
  if __name__ == "__main__":
294
- # Example user-uploaded image and caption
295
- image_url = "https://instagram.flos5-1.fna.fbcdn.net/v/t51.2885-15/468436961_18431508154072851_4306676786324401005_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=instagram.flos5-1.fna.fbcdn.net&_nc_cat=103&_nc_ohc=-9uxYx-M4WYQ7kNvgEWXUp_&_nc_gid=cfa5b625792446db8db41e38348d0aeb&edm=AEhyXUkBAAAA&ccb=7-5&ig_cache_key=MzUxMDU2MTc3OTc5NDcxOTY4NQ%3D%3D.3-ccb7-5&oh=00_AYC25Wc1tsFzU0DKvzQ5kLYcYx4KOIKAdOLSFki4xmvFUQ&oe=678FA376&_nc_sid=8f1549" # Replace with actual image URL
296
- caption = "This is a beautiful sunset!" # Replace with actual caption
297
-
298
- try:
299
- # Download the image
300
- response = requests.get(image_url)
301
- response.raise_for_status() # Raise an error for bad responses (4xx, 5xx)
302
- image = Image.open(BytesIO(response.content))
303
-
304
- # Extract text from the image
305
- extracted_text = extract_text_from_image(image)
306
- logging.info(f"Extracted text: {extracted_text}")
307
-
308
- # Rate the image
309
- score = rate_image(image, caption)
310
- logging.info(f"Image Rating: {score:.2f}")
311
- except Exception as e:
312
- logging.error(f"Error processing image: {e}")
313
-
314
- # Analyze engagement data separately
315
- logging.info("Analyzing engagement data separately...")
316
- engagement_summary = your_df.groupby('posting_time').agg({
317
- 'likes': 'sum',
318
- 'comments': 'sum',
319
- 'shares': 'sum',
320
- 'engagement_rate': 'mean'
321
- }).reset_index()
322
-
323
- # Convert posting_time to datetime in engagement data
324
- logging.info("Converting posting_time to datetime...")
325
- your_df['posting_time'] = pd.to_datetime(your_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
326
-
327
- # Plot engagement rate over time
328
- plt.figure(figsize=(10, 6))
329
- plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
330
- plt.title('Engagement Rate Over Time')
331
- plt.xlabel('Time')
332
- plt.ylabel('Engagement Rate')
333
- plt.show()
334
-
335
- # Handle missing values in engagement data
336
- your_df.fillna({
337
- 'likes': 0,
338
- 'comments': 0,
339
- 'shares': 0
340
- }, inplace=True)
341
-
342
- # Calculate engagement_rate
343
- your_df['engagement_rate'] = your_df['likes'] + your_df['comments'] + your_df['shares']
344
-
345
-
346
- # Time-Series Model: Optimal Posting Times (using Prophet)
347
- logging.info("Training time-series model for optimal posting times using Prophet...")
348
- time_series_data = engagement_summary[['posting_time', 'engagement_rate']].rename(columns={'posting_time': 'ds', 'engagement_rate': 'y'})
349
-
350
- # Train Prophet model
351
- prophet_model = Prophet()
352
- prophet_model.fit(time_series_data)
353
-
354
- # Make future predictions
355
- future = prophet_model.make_future_dataframe(periods=30) # Predict for the next 30 days
356
- forecast = prophet_model.predict(future)
357
-
358
- # Plot the forecast
359
- fig = prophet_model.plot(forecast)
360
- plt.title('Engagement Rate Forecast (Prophet)')
361
- plt.xlabel('Date')
362
- plt.ylabel('Engagement Rate')
363
- plt.show()
364
-
365
- # Evaluate the model
366
- from sklearn.metrics import mean_absolute_error
367
- y_true = time_series_data['y']
368
- y_pred = forecast.loc[:len(y_true)-1, 'yhat'] # Align predictions with true values
369
- mae = mean_absolute_error(y_true, y_pred)
370
- logging.info(f"Prophet Model - MAE: {mae:.4f}")
371
-
372
-
373
-
374
-
375
- logging.info("Analysis complete!")
 
1
+ from utils.logging_utils import setup_logging
2
+ from scripts.train_viral_potential import train_viral_potential
3
+ from scripts.train_engagement_rate import train_engagement_rate
4
+ from scripts.train_promotion_strategy import train_promotion_strategy
5
+ from scripts.train_time_series import train_time_series
6
+ from scripts.analyze_image import analyze_image_url
7
+ from scripts.analyze_engagement import analyze_engagement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Set up logging
10
+ setup_logging()
11
 
12
+ # Main application logic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  if __name__ == "__main__":
14
+ # Train models
15
+ train_viral_potential()
16
+ train_engagement_rate()
17
+ train_promotion_strategy()
18
+ train_time_series()
19
+
20
+ # Analyze engagement data
21
+ analyze_engagement()
22
+
23
+ # Analyze an example image
24
+ image_url = "https://example.com/path/to/image.jpg"
25
+ caption = "This is a beautiful sunset!"
26
+ analyze_image_url(image_url, caption)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
competitors_data.csv → data/processed/competitors_data.csv RENAMED
File without changes
competitors_data.json → data/raw/competitors_data.json RENAMED
The diff for this file is too large to render. See raw diff
 
engagement_metrics.json → data/raw/engagement_metrics.json RENAMED
The diff for this file is too large to render. See raw diff
 
solved.json → data/raw/solved.json RENAMED
The diff for this file is too large to render. See raw diff
 
scripts/analyze_engagement.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from utils.visualization import plot_engagement_heatmap, plot_engagement_over_time
3
+ from utils.preprocessing import preprocess_data
4
+
5
+ def analyze_engagement():
6
+ """Analyze engagement data."""
7
+ # Load data
8
+ df = pd.read_json("data/raw/engagement_metrics.json")
9
+ df = preprocess_data(df)
10
+
11
+ # Group by hour for heatmap
12
+ df['hour'] = df['posting_time'].dt.hour
13
+ engagement_by_hour = df.groupby('hour')['engagement_rate'].mean().reset_index()
14
+ plot_engagement_heatmap(engagement_by_hour)
15
+
16
+ # Plot engagement over time
17
+ engagement_summary = df.groupby('posting_time').agg({'engagement_rate': 'mean'}).reset_index()
18
+ plot_engagement_over_time(engagement_summary)
scripts/analyze_image.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from PIL import Image
3
+ from io import BytesIO
4
+ from utils.image_processing import extract_text_from_image, analyze_image
5
+ from utils.logging_utils import setup_logging
6
+
7
+ def analyze_image_url(image_url, caption):
8
+ """Analyze an image from a URL."""
9
+ setup_logging()
10
+ try:
11
+ # Download the image
12
+ response = requests.get(image_url)
13
+ response.raise_for_status()
14
+ image = Image.open(BytesIO(response.content))
15
+
16
+ # Extract text from the image
17
+ extracted_text = extract_text_from_image(image)
18
+ logging.info(f"Extracted text: {extracted_text}")
19
+
20
+ # Analyze the image
21
+ image_analysis = analyze_image(image)
22
+ if image_analysis is not None:
23
+ logging.info("Image analysis completed successfully.")
24
+ except Exception as e:
25
+ logging.error(f"Error processing image: {e}")
scripts/train_engagement_rate.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from xgboost import XGBRegressor
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import mean_absolute_error
5
+ import joblib
6
+ from utils.preprocessing import preprocess_data
7
+
8
+ def train_engagement_rate():
9
+ """Train the engagement rate prediction model."""
10
+ # Load data
11
+ df = pd.read_json("data/raw/engagement_metrics.json")
12
+ df = preprocess_data(df)
13
+
14
+ # Train engagement rate model
15
+ X = df[['caption_length', 'hashtag_count', 'sentiment']]
16
+ y = df['engagement_rate']
17
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
18
+
19
+ engagement_model = XGBRegressor(random_state=42)
20
+ engagement_model.fit(X_train, y_train)
21
+ y_pred = engagement_model.predict(X_test)
22
+ mae = mean_absolute_error(y_test, y_pred)
23
+ print(f"Engagement Rate Prediction Model - MAE: {mae:.4f}")
24
+
25
+ # Save the model
26
+ joblib.dump(engagement_model, "models/engagement_rate_model.pkl")
scripts/train_promotion_strategy.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.linear_model import LogisticRegression
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+ import joblib
6
+ from utils.preprocessing import preprocess_data
7
+
8
+ def train_promotion_strategy():
9
+ """Train the promotion strategy model."""
10
+ # Load data
11
+ df = pd.read_json("data/raw/engagement_metrics.json")
12
+ df = preprocess_data(df)
13
+
14
+ # Train promotion strategy model
15
+ promotion_threshold = df['engagement_rate'].quantile(0.8)
16
+ df['promote'] = df['engagement_rate'].apply(lambda x: 1 if x >= promotion_threshold else 0)
17
+
18
+ X = df[['caption_length', 'hashtag_count', 'sentiment']]
19
+ y = df['promote']
20
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
21
+
22
+ promotion_model = LogisticRegression(random_state=42)
23
+ promotion_model.fit(X_train, y_train)
24
+ y_pred = promotion_model.predict(X_test)
25
+ accuracy = accuracy_score(y_test, y_pred)
26
+ print(f"Promotion Prediction Model Accuracy: {accuracy:.4f}")
27
+
28
+ # Save the model
29
+ joblib.dump(promotion_model, "models/promotion_strategy_model.pkl")
scripts/train_time_series.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from prophet import Prophet
3
+ from sklearn.metrics import mean_absolute_error
4
+ import joblib
5
+ from utils.preprocessing import preprocess_data
6
+
7
+ def train_time_series():
8
+ """Train the time-series model for optimal posting times."""
9
+ # Load data
10
+ df = pd.read_json("data/raw/engagement_metrics.json")
11
+ df = preprocess_data(df)
12
+
13
+ # Prepare time-series data
14
+ time_series_data = df.groupby('posting_time').agg({'engagement_rate': 'mean'}).reset_index()
15
+ time_series_data = time_series_data.rename(columns={'posting_time': 'ds', 'engagement_rate': 'y'})
16
+
17
+ # Train Prophet model
18
+ prophet_model = Prophet()
19
+ prophet_model.fit(time_series_data)
20
+
21
+ # Save the model
22
+ joblib.dump(prophet_model, "models/prophet_model.pkl")
scripts/train_viral_potential.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+ import joblib
6
+ from utils.preprocessing import preprocess_data
7
+
8
+ def train_viral_potential():
9
+ """Train the viral potential prediction model."""
10
+ # Load data
11
+ df = pd.read_json("data/raw/engagement_metrics.json")
12
+ df = preprocess_data(df)
13
+
14
+ # Train viral potential model
15
+ viral_threshold = df['engagement_rate'].quantile(0.9)
16
+ df['viral'] = df['engagement_rate'].apply(lambda x: 1 if x >= viral_threshold else 0)
17
+
18
+ X = df[['caption_length', 'hashtag_count', 'sentiment']]
19
+ y = df['viral']
20
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
21
+
22
+ viral_model = RandomForestClassifier(random_state=42)
23
+ viral_model.fit(X_train, y_train)
24
+ y_pred = viral_model.predict(X_test)
25
+ accuracy = accuracy_score(y_test, y_pred)
26
+ print(f"Viral Potential Model Accuracy: {accuracy:.4f}")
27
+
28
+ # Save the model
29
+ joblib.dump(viral_model, "models/viral_potential_model.pkl")
utils/image_processing.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import pytesseract
3
+ import torch
4
+ from torchvision import transforms
5
+ from transformers import ResNetForImageClassification
6
+ import logging
7
+
8
+ def resize_image(image, max_size=(800, 600)):
9
+ """Resize an image to the specified maximum size."""
10
+ image.thumbnail(max_size)
11
+ return image
12
+
13
+ def extract_text_from_image(image):
14
+ """Extract text from an image using OCR."""
15
+ try:
16
+ image = resize_image(image)
17
+ text = pytesseract.image_to_string(image)
18
+ return text
19
+ except Exception as e:
20
+ logging.error(f"Error extracting text from image: {e}")
21
+ return ""
22
+
23
+ def analyze_image(image):
24
+ """Analyze image content using a pre-trained model."""
25
+ try:
26
+ preprocess = transforms.Compose([
27
+ transforms.Resize(256),
28
+ transforms.CenterCrop(224),
29
+ transforms.ToTensor(),
30
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
31
+ ])
32
+ image_tensor = preprocess(image).unsqueeze(0)
33
+
34
+ # Load ResNet model
35
+ model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")
36
+ model.eval()
37
+
38
+ with torch.no_grad():
39
+ output = model(image_tensor)
40
+ return output
41
+ except Exception as e:
42
+ logging.error(f"Error analyzing image: {e}")
43
+ return None
utils/logging_utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import logging
2
+
3
+ def setup_logging():
4
+ """Set up logging configuration."""
5
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
utils/preprocessing.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from textblob import TextBlob
3
+ from sklearn.preprocessing import LabelEncoder
4
+ import logging
5
+
6
+ def preprocess_data(df):
7
+ """Preprocess the input DataFrame."""
8
+ # Ensure required columns exist
9
+ required_columns = ['likes', 'comments', 'shares', 'posting_time', 'caption', 'hashtags']
10
+ missing_columns = [col for col in required_columns if col not in df.columns]
11
+
12
+ if missing_columns:
13
+ logging.warning(f"Missing required columns: {missing_columns}")
14
+ for col in missing_columns:
15
+ if col in ['likes', 'comments', 'shares']:
16
+ df[col] = 0 # Fill with default value (integer)
17
+ elif col == 'caption':
18
+ df[col] = '' # Fill with default value (empty string)
19
+ elif col == 'hashtags':
20
+ df[col] = [[] for _ in range(len(df))] # Fill with default value (list of empty lists)
21
+
22
+ # Convert posting_time to datetime
23
+ df['posting_time'] = pd.to_datetime(df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
24
+ df = df[df['posting_time'].notna()]
25
+
26
+ # Calculate engagement rate
27
+ df['engagement_rate'] = df['likes'] + df['comments'] + df['shares']
28
+
29
+ # Calculate caption length and hashtag count
30
+ df['caption_length'] = df['caption'].apply(len)
31
+ df['hashtag_count'] = df['hashtags'].apply(len)
32
+
33
+ # Calculate sentiment
34
+ df['caption_sentiment'] = df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
35
+ df['sentiment'] = df['caption_sentiment']
36
+
37
+ # Encode categorical columns
38
+ if 'content_type' in df.columns and 'media_type' in df.columns:
39
+ label_encoder = LabelEncoder()
40
+ df['content_type_encoded'] = label_encoder.fit_transform(df['content_type'])
41
+ df['media_type_encoded'] = label_encoder.fit_transform(df['media_type'])
42
+
43
+ return df
utils/visualization.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+
4
+ def plot_engagement_heatmap(engagement_by_hour):
5
+ """Plot engagement heatmap by time of day."""
6
+ plt.figure(figsize=(10, 6))
7
+ sns.heatmap(engagement_by_hour.pivot_table(index='hour', values='engagement_rate'), annot=True, cmap='YlGnBu')
8
+ plt.title('Engagement Heatmap by Time of Day')
9
+ plt.xlabel('Engagement Rate')
10
+ plt.ylabel('Hour of Day')
11
+ plt.show()
12
+
13
+ def plot_engagement_over_time(engagement_summary):
14
+ """Plot engagement rate over time."""
15
+ plt.figure(figsize=(10, 6))
16
+ plt.plot(engagement_summary['posting_time'], engagement_summary['engagement_rate'])
17
+ plt.title('Engagement Rate Over Time')
18
+ plt.xlabel('Time')
19
+ plt.ylabel('Engagement Rate')
20
+ plt.show()