Fred808 committed on
Commit
001f6ce
·
verified ·
1 Parent(s): 8d6e351

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -47
app.py CHANGED
@@ -29,25 +29,25 @@ def mean_absolute_percentage_error(y_true, y_pred):
29
  y_true, y_pred = np.array(y_true), np.array(y_pred)
30
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
31
 
32
- # Load main dataset
33
- logging.info("Loading main dataset...")
34
- data = pd.read_csv('train_data.csv')
35
-
36
  # Load engagement_metrics.json
37
  logging.info("Loading engagement metrics...")
38
- with open('engagement_metrics.json', 'r') as f:
39
- engagement_metrics = json.load(f)
40
-
41
- # Convert engagement metrics to DataFrame
42
- engagement_df = pd.json_normalize(engagement_metrics)
 
 
43
 
44
  # Load solved.json (hashtags and captions)
45
  logging.info("Loading solved.json...")
46
- with open('solved.json', 'r') as f:
47
- solved_data = json.load(f)
48
-
49
- # Convert solved.json to DataFrame
50
- solved_df = pd.json_normalize(solved_data)
 
 
51
 
52
  # Check for required columns in engagement data
53
  required_columns = ['posting_time', 'likes', 'comments', 'shares']
@@ -69,15 +69,14 @@ engagement_df.fillna({
69
  # Calculate engagement_rate
70
  engagement_df['engagement_rate'] = engagement_df['likes'] + engagement_df['comments'] + engagement_df['shares']
71
 
72
- # Convert posting_time to datetime in both datasets
73
  logging.info("Converting posting_time to datetime...")
74
- data['posting_time'] = pd.to_datetime(data['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
75
  engagement_df['posting_time'] = pd.to_datetime(engagement_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
76
 
77
- # Ensure 'caption' is treated as a string column in the main dataset
78
- data['caption'] = data['caption'].astype(str)
79
 
80
- # Extract hashtags from the caption column in the main dataset
81
  def extract_hashtags(caption):
82
  try:
83
  # Convert the caption string to a dictionary
@@ -89,38 +88,38 @@ def extract_hashtags(caption):
89
  return []
90
 
91
  # Apply the function to the caption column
92
- data['hashtags'] = data['caption'].apply(extract_hashtags)
93
 
94
- # Filter out rows with invalid posting_time in the main dataset
95
- data = data[data['posting_time'].notna()]
96
 
97
- # Convert to Unix timestamp in the main dataset (for time-based operations)
98
  logging.info("Converting posting_time to Unix timestamp...")
99
- data['posting_time_encoded'] = data['posting_time'].astype(int) / 10**9
100
 
101
- # Ensure required columns exist in the main dataset
102
- if 'content_type' not in data.columns:
103
- data['content_type'] = 'photo' # Default value (adjust based on your data)
104
 
105
- if 'media_type' not in data.columns:
106
- data['media_type'] = 'image' # Default value (adjust based on your data)
107
 
108
- # Encode categorical columns in the main dataset
109
  label_encoder = LabelEncoder()
110
- data['content_type_encoded'] = label_encoder.fit_transform(data['content_type'])
111
- data['media_type_encoded'] = label_encoder.fit_transform(data['media_type'])
112
 
113
- # Calculate sentiment for captions in the main dataset
114
  logging.info("Performing sentiment analysis on captions...")
115
- data['caption_sentiment'] = data['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
116
 
117
  # Use caption sentiment as the overall sentiment
118
- data['sentiment'] = data['caption_sentiment']
119
 
120
- # Feature Engineering in the main dataset
121
  logging.info("Performing feature engineering...")
122
- data['caption_length'] = data['caption'].apply(len)
123
- data['hashtag_count'] = data['hashtags'].apply(len)
124
 
125
  # Analyze engagement data separately
126
  logging.info("Analyzing engagement data separately...")
@@ -189,11 +188,11 @@ else:
189
  logging.info(f"ARIMA Model: MAPE: {mape:.4f}")
190
 
191
  # Ensure 'hashtags' column is properly formatted
192
- data['hashtags'] = data['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
193
 
194
- # Recommendation System: Hashtag and Keyword Recommendations (using main dataset)
195
  logging.info("Training recommendation system for hashtags...")
196
- hashtags = data['hashtags'].apply(lambda x: ' '.join(x)) # Convert list of hashtags to a single string
197
 
198
  # Check if hashtags are empty
199
  if hashtags.str.strip().eq('').all():
@@ -207,21 +206,21 @@ else:
207
  sim_scores = list(enumerate(cosine_sim[post_index]))
208
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
209
  top_indices = [i[0] for i in sim_scores[1:top_n+1]]
210
- return data.iloc[top_indices]['hashtags']
211
 
212
  # Example: Recommend hashtags for the first post
213
  logging.info("Example Hashtag Recommendations:")
214
  print(recommend_hashtags(0))
215
 
216
- # Sentiment Analysis: Audience Reactions (using main dataset)
217
  logging.info("Performing sentiment analysis on captions...")
218
- data['sentiment_category'] = data['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
219
  logging.info("Sentiment Analysis Results:")
220
- print(data['sentiment_category'].value_counts())
221
 
222
- # Niche Trend Analysis (using main dataset)
223
  logging.info("Analyzing niche trends...")
224
- niche_trends = data.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
225
  logging.info("Top Performing Content Types by Sentiment:")
226
  print(niche_trends)
227
 
 
29
  y_true, y_pred = np.array(y_true), np.array(y_pred)
30
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
31
 
 
 
 
 
32
  # Load engagement_metrics.json
33
  logging.info("Loading engagement metrics...")
34
+ try:
35
+ with open('engagement_metrics.json', 'r') as f:
36
+ engagement_metrics = json.load(f)
37
+ engagement_df = pd.json_normalize(engagement_metrics)
38
+ except FileNotFoundError:
39
+ logging.error("engagement_metrics.json not found. Please ensure the file exists.")
40
+ exit()
41
 
42
  # Load solved.json (hashtags and captions)
43
  logging.info("Loading solved.json...")
44
+ try:
45
+ with open('solved.json', 'r') as f:
46
+ solved_data = json.load(f)
47
+ solved_df = pd.json_normalize(solved_data)
48
+ except FileNotFoundError:
49
+ logging.error("solved.json not found. Please ensure the file exists.")
50
+ exit()
51
 
52
  # Check for required columns in engagement data
53
  required_columns = ['posting_time', 'likes', 'comments', 'shares']
 
69
  # Calculate engagement_rate
70
  engagement_df['engagement_rate'] = engagement_df['likes'] + engagement_df['comments'] + engagement_df['shares']
71
 
72
+ # Convert posting_time to datetime in engagement data
73
  logging.info("Converting posting_time to datetime...")
 
74
  engagement_df['posting_time'] = pd.to_datetime(engagement_df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
75
 
76
+ # Ensure 'caption' is treated as a string column in solved data
77
+ solved_df['caption'] = solved_df['caption'].astype(str)
78
 
79
+ # Extract hashtags from the caption column in solved data
80
  def extract_hashtags(caption):
81
  try:
82
  # Convert the caption string to a dictionary
 
88
  return []
89
 
90
  # Apply the function to the caption column
91
+ solved_df['hashtags'] = solved_df['caption'].apply(extract_hashtags)
92
 
93
+ # Filter out rows with invalid posting_time in engagement data
94
+ engagement_df = engagement_df[engagement_df['posting_time'].notna()]
95
 
96
+ # Convert posting_time to Unix timestamp in engagement data (for time-based operations)
97
  logging.info("Converting posting_time to Unix timestamp...")
98
+ engagement_df['posting_time_encoded'] = engagement_df['posting_time'].astype(int) / 10**9
99
 
100
+ # Ensure required columns exist in the solved dataset
101
+ if 'content_type' not in solved_df.columns:
102
+ solved_df['content_type'] = 'photo' # Default value (adjust based on your data)
103
 
104
+ if 'media_type' not in solved_df.columns:
105
+ solved_df['media_type'] = 'image' # Default value (adjust based on your data)
106
 
107
+ # Encode categorical columns in the solved dataset
108
  label_encoder = LabelEncoder()
109
+ solved_df['content_type_encoded'] = label_encoder.fit_transform(solved_df['content_type'])
110
+ solved_df['media_type_encoded'] = label_encoder.fit_transform(solved_df['media_type'])
111
 
112
+ # Calculate sentiment for captions in the solved dataset
113
  logging.info("Performing sentiment analysis on captions...")
114
+ solved_df['caption_sentiment'] = solved_df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity)
115
 
116
  # Use caption sentiment as the overall sentiment
117
+ solved_df['sentiment'] = solved_df['caption_sentiment']
118
 
119
+ # Feature Engineering in the solved dataset
120
  logging.info("Performing feature engineering...")
121
+ solved_df['caption_length'] = solved_df['caption'].apply(len)
122
+ solved_df['hashtag_count'] = solved_df['hashtags'].apply(len)
123
 
124
  # Analyze engagement data separately
125
  logging.info("Analyzing engagement data separately...")
 
188
  logging.info(f"ARIMA Model: MAPE: {mape:.4f}")
189
 
190
  # Ensure 'hashtags' column is properly formatted
191
+ solved_df['hashtags'] = solved_df['hashtags'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else ['no_hashtag'])
192
 
193
+ # Recommendation System: Hashtag and Keyword Recommendations (using solved dataset)
194
  logging.info("Training recommendation system for hashtags...")
195
+ hashtags = solved_df['hashtags'].apply(lambda x: ' '.join(x)) # Convert list of hashtags to a single string
196
 
197
  # Check if hashtags are empty
198
  if hashtags.str.strip().eq('').all():
 
206
  sim_scores = list(enumerate(cosine_sim[post_index]))
207
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
208
  top_indices = [i[0] for i in sim_scores[1:top_n+1]]
209
+ return solved_df.iloc[top_indices]['hashtags']
210
 
211
  # Example: Recommend hashtags for the first post
212
  logging.info("Example Hashtag Recommendations:")
213
  print(recommend_hashtags(0))
214
 
215
+ # Sentiment Analysis: Audience Reactions (using solved dataset)
216
  logging.info("Performing sentiment analysis on captions...")
217
+ solved_df['sentiment_category'] = solved_df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')
218
  logging.info("Sentiment Analysis Results:")
219
+ print(solved_df['sentiment_category'].value_counts())
220
 
221
+ # Niche Trend Analysis (using solved dataset)
222
  logging.info("Analyzing niche trends...")
223
+ niche_trends = solved_df.groupby('content_type')['sentiment'].mean().sort_values(ascending=False)
224
  logging.info("Top Performing Content Types by Sentiment:")
225
  print(niche_trends)
226