KeegBarb commited on
Commit
c02102a
·
verified ·
1 Parent(s): f17133d

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ OnlineNewsPopularity.csv filter=lfs diff=lfs merge=lfs -text
OnlineNewsPopularity.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b66d9088632308cc27fa35af847650d174a5a50503987c4e511de94a99d1c218
3
+ size 24311769
model_features.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f1ad2cc757d88882b181ad900addcc61c0634bacf2fce7ac7c6af32e3b32aa4
3
+ size 353
observatory_app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a Ridge regression model on the Online News Popularity dataset.

Reads OnlineNewsPopularity.csv, log-transforms the share count (the
target is heavily right-skewed), fits a Ridge model on a hand-picked
feature subset, and persists both the model and its feature order so
the Streamlit app below can reproduce it exactly.
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import joblib

df = pd.read_csv("OnlineNewsPopularity.csv")
# The UCI CSV ships with leading spaces in every column name; strip them
# so the feature names below match.
df.columns = df.columns.str.strip()

# log1p tames the heavy right skew of raw share counts; predictions are
# inverted with expm1 at inference time.
df['log_shares'] = np.log1p(df['shares'])

# Feature subset used by the app (17 features, matching the inference
# code's expectation). FIX: the original list contained 'num_imgs' twice,
# which duplicated that column in the design matrix; the duplicate entry
# has been removed.
feature_cols = [
    'n_tokens_content', 'num_imgs', 'global_sentiment_polarity',
    'global_subjectivity', 'title_sentiment_polarity',
    'data_channel_is_tech',
    'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
    'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
    'weekday_is_sunday',
    'n_tokens_title', 'num_videos', 'num_keywords', 'num_hrefs',
]
x = df[feature_cols]
y = df['log_shares']

# Hold out 20% for evaluation; a fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Persist the model and the exact feature order the app must reproduce.
joblib.dump(model, 'popularity_model.pkl')
joblib.dump(x.columns.tolist(), 'model_features.pkl')
from textblob import TextBlob
import numpy as np
import joblib

# Load the trained model and its feature order (assumes the training
# section above has been run at least once, so both .pkl files exist).
# NOTE(review): these loads run at import time — a missing artifact will
# fail the whole app on startup.
MODEL = joblib.load('popularity_model.pkl')
FEATURE_COLUMNS = joblib.load('model_features.pkl')  # column order the model was fit with (17 features)
def analyze_and_predict(headline_text, content_text, num_images, channel_tech, publish_day, num_videos, num_keywords, num_hrefs):
    """Score one article and return display-ready strings.

    Derives sentiment and length features from the headline and body
    text via TextBlob, merges them with the user-selected structural
    inputs, orders everything to match FEATURE_COLUMNS, and predicts
    shares with the module-level MODEL (inverting the log1p transform
    applied at training time).

    Returns a 5-tuple of formatted strings:
    (predicted shares, content polarity, content subjectivity,
     content word count, title word count).
    """
    # --- Text-derived features ---
    headline_blob = TextBlob(headline_text)
    body_blob = TextBlob(content_text)

    title_tokens = len(headline_text.split())
    body_tokens = len(content_text.split())
    body_polarity = body_blob.sentiment.polarity
    body_subjectivity = body_blob.sentiment.subjectivity
    headline_polarity = headline_blob.sentiment.polarity

    # --- One-hot weekday flags ---
    # Exactly one flag becomes 1 when publish_day is a recognized day
    # name; unrecognized or empty values leave all flags at 0.
    day_names = ('monday', 'tuesday', 'wednesday', 'thursday',
                 'friday', 'saturday', 'sunday')
    weekday_flags = {f'weekday_is_{day}': 0 for day in day_names}
    if publish_day:
        selected_key = f'weekday_is_{publish_day.lower()}'
        if selected_key in weekday_flags:
            weekday_flags[selected_key] = 1

    # --- Assemble model inputs under the training-time column names ---
    input_data = {
        'n_tokens_title': title_tokens,
        'n_tokens_content': body_tokens,
        'global_sentiment_polarity': body_polarity,
        'global_subjectivity': body_subjectivity,
        'title_sentiment_polarity': headline_polarity,
        'num_imgs': num_images,
        'data_channel_is_tech': 1 if channel_tech else 0,
        'num_videos': num_videos,
        'num_keywords': num_keywords,
        'num_hrefs': num_hrefs,
        **weekday_flags,
    }

    # Order features exactly as the model saw them during training.
    X_pred = np.array([input_data[col] for col in FEATURE_COLUMNS]).reshape(1, -1)

    # Predict in log-space, then invert the log1p transform.
    log_pred_shares = MODEL.predict(X_pred)[0]
    predicted_shares = np.expm1(log_pred_shares)

    return (
        f"~{int(predicted_shares):,}",
        f"{body_polarity:.3f}",
        f"{body_subjectivity:.3f}",
        f"{body_tokens}",
        f"{title_tokens}",
    )
# ----------------------------------------------------
# Streamlit front-end. Relies on analyze_and_predict() and the model
# artifacts loaded earlier in this module; widgets render in call order.
# ----------------------------------------------------
import streamlit as st

st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

# ----------------------------------------------------
# --- SIDEBAR FOR CONSTANT FEATURES (Non-Text Inputs) ---
# ----------------------------------------------------
with st.sidebar:
    st.header("Structural & Temporal Inputs")

    # Temporal feature: converted to one-hot weekday flags inside
    # analyze_and_predict.
    publish_day = st.selectbox(
        "Day of Publication",
        ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
        index=0
    )

    # Channel feature: becomes the binary data_channel_is_tech input.
    channel_tech = st.checkbox("Is it a **Tech** Channel Article?", value=False)

    st.subheader("Multimedia & Linking")
    # Multimedia features
    num_images = st.slider("Number of Images (num_imgs)",
                           min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)",
                           min_value=0, max_value=10, value=1, step=1)

    st.subheader("Article Structure")
    # Linking/keyword features
    num_hrefs = st.slider("Number of Links (num_hrefs)",
                          min_value=0, max_value=30, value=5, step=1)
    num_keywords = st.slider("Number of Keywords (num_keywords)",
                             min_value=1, max_value=10, value=5, step=1)

# ----------------------------------------------------
# --- MAIN AREA FOR TEXT INPUTS ---
# ----------------------------------------------------

st.header("Article Content")

# Headline text: drives n_tokens_title and title_sentiment_polarity.
headline_text = st.text_input("Headline Text",
                              placeholder="E.g., Revolutionary AI Tool Boosts Productivity")

# Body snippet: drives n_tokens_content and the global sentiment features.
content_text = st.text_area("Article Snippet (for Sentiment Analysis)",
                            placeholder="Paste a few paragraphs of the article content here.")

# ----------------------------------------------------
# --- PREDICTION LOGIC ---
# ----------------------------------------------------

if st.button("Analyze & Predict Shares"):
    # Both text fields are required: the sentiment/length features are
    # meaningless without them.
    if headline_text and content_text:
        # Weekday one-hot handling happens inside analyze_and_predict.
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text,
            content_text,
            num_images,
            channel_tech,
            publish_day,
            num_videos,
            num_keywords,
            num_hrefs
        )

        # Display results
        st.success(f"### Predicted Shares: {predicted_shares}")

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)
    else:
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")
popularity_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e47044bad89f8072048d5173868e8423c68f15f62ea842c11484874ba0158e
3
+ size 1400
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
1
+ streamlit
2
+ pandas
3
+ scikit-learn
4
+ numpy
5
+ textblob