Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,11 @@ from googleapiclient.discovery import build
|
|
| 15 |
import os
|
| 16 |
import time
|
| 17 |
from functools import lru_cache
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# --------------------------
|
| 20 |
# Initial Setup
|
|
@@ -166,6 +171,85 @@ def fetch_youtube_data(keyword, limit=30):
|
|
| 166 |
st.error(f"YouTube fetch error: {str(e)}")
|
| 167 |
return pd.DataFrame()
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
# --------------------------
|
| 170 |
# Visualization Functions
|
| 171 |
# --------------------------
|
|
@@ -191,25 +275,59 @@ def generate_wordcloud(text):
|
|
| 191 |
def plot_sentiment(data, keyword):
|
| 192 |
"""Optimized plotting function"""
|
| 193 |
try:
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
fig.update_layout(
|
| 208 |
-
|
| 209 |
xaxis_title="Date",
|
| 210 |
yaxis_title="Sentiment Score",
|
| 211 |
-
|
|
|
|
| 212 |
)
|
|
|
|
| 213 |
return fig
|
| 214 |
except Exception as e:
|
| 215 |
st.error(f"Plotting error: {str(e)}")
|
|
@@ -248,6 +366,7 @@ def main():
|
|
| 248 |
st.markdown("---")
|
| 249 |
st.markdown("### Options")
|
| 250 |
show_details = st.checkbox("Show detailed results", value=False)
|
|
|
|
| 251 |
st.markdown("---")
|
| 252 |
|
| 253 |
# Main content
|
|
@@ -335,21 +454,48 @@ def main():
|
|
| 335 |
|
| 336 |
# Filter recent data
|
| 337 |
combined_data['date'] = pd.to_datetime(combined_data['date'])
|
| 338 |
-
recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=
|
| 339 |
|
| 340 |
if not recent_data.empty:
|
| 341 |
# Sentiment trends
|
| 342 |
st.subheader("π
Sentiment Over Time")
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
if fig:
|
| 345 |
st.plotly_chart(fig, use_container_width=True)
|
| 346 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
# Show details if enabled
|
| 348 |
if show_details:
|
| 349 |
st.subheader("π Detailed Results")
|
| 350 |
st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
|
| 351 |
else:
|
| 352 |
-
st.info("No recent data found (within last
|
| 353 |
|
| 354 |
if __name__ == "__main__":
|
| 355 |
# Initialize NLTK data
|
|
|
|
| 15 |
import os
|
| 16 |
import time
|
| 17 |
from functools import lru_cache
|
| 18 |
+
import numpy as np
|
| 19 |
+
from sklearn.linear_model import Ridge
|
| 20 |
+
from sklearn.preprocessing import PolynomialFeatures
|
| 21 |
+
from sklearn.pipeline import make_pipeline
|
| 22 |
+
from sklearn.model_selection import train_test_split
|
| 23 |
|
| 24 |
# --------------------------
|
| 25 |
# Initial Setup
|
|
|
|
| 171 |
st.error(f"YouTube fetch error: {str(e)}")
|
| 172 |
return pd.DataFrame()
|
| 173 |
|
| 174 |
+
# --------------------------
|
| 175 |
+
# Prediction Functions
|
| 176 |
+
# --------------------------
|
| 177 |
+
|
| 178 |
+
def prepare_data_for_prediction(data):
    """Aggregate raw sentiment rows into a daily time series for model training.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a datetime-like 'date' column and a numeric 'average'
        (sentiment score) column.

    Returns
    -------
    pd.DataFrame or None
        One row per observed calendar day with columns 'date', 'average', and
        'days' (integer offset from the earliest date); None on failure.
    """
    try:
        # Ensure data is sorted by date so the daily grouping is monotonic
        data = data.sort_values('date')

        # Create daily aggregates (mean sentiment per calendar day)
        daily_data = data.groupby(pd.Grouper(key='date', freq='D'))['average'].mean().reset_index()

        # Fix: pd.Grouper emits a row for every day in the span, so days with
        # no observations come back with NaN 'average'. Those rows would make
        # the downstream Ridge fit fail (sklearn rejects NaN targets), so drop
        # them before feature construction.
        daily_data = daily_data.dropna(subset=['average']).reset_index(drop=True)

        # Create numerical features (days since first date)
        daily_data['days'] = (daily_data['date'] - daily_data['date'].min()).dt.days

        return daily_data
    except Exception as e:
        st.error(f"Data preparation error: {str(e)}")
        return None
|
| 194 |
+
|
| 195 |
+
def train_sentiment_model(data):
    """Fit a degree-2 polynomial Ridge regression to the daily sentiment series.

    Parameters
    ----------
    data : pd.DataFrame or None
        Output of prepare_data_for_prediction: needs numeric 'days' and
        'average' columns. None is tolerated (upstream preparation failure).

    Returns
    -------
    tuple
        (fitted sklearn Pipeline, training DataFrame), or (None, None) when
        training is skipped or fails.
    """
    try:
        # Fix: prepare_data_for_prediction returns None on failure; the
        # original `len(data)` then raised TypeError and surfaced a
        # misleading "Model training error". Treat None the same as
        # not-enough-data.
        if data is None or len(data) < 5:
            st.warning("Not enough data points for reliable prediction (minimum 5 days required)")
            return None, None

        # Split data into features (days) and target (sentiment)
        X = data['days'].values.reshape(-1, 1)
        y = data['average'].values

        # Polynomial features (degree=2) allow simple curved trends while the
        # Ridge penalty (alpha=1.0) keeps the small-sample fit from oscillating.
        model = make_pipeline(
            PolynomialFeatures(degree=2),
            Ridge(alpha=1.0)
        )

        model.fit(X, y)

        return model, data
    except Exception as e:
        st.error(f"Model training error: {str(e)}")
        return None, None
|
| 218 |
+
|
| 219 |
+
def predict_future_sentiment(model, training_data, days_to_predict=15):
    """Extend the sentiment series `days_to_predict` days past the last
    observed date using a fitted regression model.

    Parameters
    ----------
    model : fitted estimator or None
        Pipeline from train_sentiment_model; None short-circuits to None.
    training_data : pd.DataFrame
        Frame with 'date' and 'average' columns the model was trained on.
    days_to_predict : int
        Forecast horizon in days (default 15).

    Returns
    -------
    pd.DataFrame or None
        Historical rows (type='actual') followed by forecast rows
        (type='prediction'), or None on failure.
    """
    try:
        if model is None:
            return None

        # One forecast row per day immediately following the last observation.
        horizon_start = training_data['date'].max()
        horizon = [horizon_start + timedelta(days=offset)
                   for offset in range(1, days_to_predict + 1)]

        # Features must use the same origin the model was trained with:
        # integer day offsets from the earliest training date.
        origin = training_data['date'].min()
        offsets = [(day - origin).days for day in horizon]
        features = np.array(offsets).reshape(-1, 1)

        forecast = model.predict(features)

        # Tag forecast rows so plotting can tell them apart from history.
        forecast_df = pd.DataFrame({
            'date': horizon,
            'average': forecast,
            'type': 'prediction'
        })

        history_df = training_data.copy()
        history_df['type'] = 'actual'

        return pd.concat([history_df, forecast_df], ignore_index=True)
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return None
|
| 252 |
+
|
| 253 |
# --------------------------
|
| 254 |
# Visualization Functions
|
| 255 |
# --------------------------
|
|
|
|
| 275 |
def plot_sentiment(data, keyword):
    """Plot actual vs. predicted sentiment over time.

    Parameters
    ----------
    data : pd.DataFrame
        Combined frame with 'date', 'average', and 'type' columns, where
        'type' is 'actual' or 'prediction'.
    keyword : str
        Search keyword, used only in the chart title.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The assembled figure, or None (implicitly) on error.
    """
    try:
        # Separate actual and predicted data
        actual_data = data[data['type'] == 'actual']
        pred_data = data[data['type'] == 'prediction']

        fig = go.Figure()

        # Observed sentiment line
        fig.add_trace(go.Scatter(
            x=actual_data['date'],
            y=actual_data['average'],
            name='Actual Sentiment',
            mode='lines+markers',
            line=dict(color='#636EFA')
        ))

        # Forecast line and band only when a forecast exists. Fix: the band
        # traces are guarded by the same check so an empty forecast doesn't
        # add empty traces and a stray 'Prediction Range' legend entry.
        if not pred_data.empty:
            fig.add_trace(go.Scatter(
                x=pred_data['date'],
                y=pred_data['average'],
                name='Predicted Sentiment',
                mode='lines+markers',
                line=dict(color='#EF553B', dash='dot')
            ))

            # Simple fixed-width (+/- 0.1) band around the forecast. The
            # invisible upper edge must be added first so the lower edge's
            # fill='tonexty' shades the region between the two traces.
            fig.add_trace(go.Scatter(
                x=pred_data['date'],
                y=pred_data['average'] + 0.1,
                mode='lines',
                line=dict(width=0),
                showlegend=False,
                hoverinfo='skip'
            ))

            fig.add_trace(go.Scatter(
                x=pred_data['date'],
                y=pred_data['average'] - 0.1,
                mode='lines',
                fill='tonexty',
                line=dict(width=0),
                fillcolor='rgba(239, 85, 59, 0.2)',
                name='Prediction Range'
            ))

        fig.update_layout(
            title=f'Sentiment Analysis and Prediction for "{keyword}"',
            xaxis_title="Date",
            yaxis_title="Sentiment Score",
            hovermode="x unified",
            legend_title="Data Type"
        )

        return fig
    except Exception as e:
        st.error(f"Plotting error: {str(e)}")
|
|
|
|
| 366 |
st.markdown("---")
|
| 367 |
st.markdown("### Options")
|
| 368 |
show_details = st.checkbox("Show detailed results", value=False)
|
| 369 |
+
enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
|
| 370 |
st.markdown("---")
|
| 371 |
|
| 372 |
# Main content
|
|
|
|
| 454 |
|
| 455 |
# Filter recent data
|
| 456 |
combined_data['date'] = pd.to_datetime(combined_data['date'])
|
| 457 |
+
recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))] # Increased to 60 days for better prediction
|
| 458 |
|
| 459 |
if not recent_data.empty:
|
| 460 |
# Sentiment trends
|
| 461 |
st.subheader("π
Sentiment Over Time")
|
| 462 |
+
|
| 463 |
+
# Prepare data for prediction if enabled
|
| 464 |
+
if enable_prediction and len(recent_data) >= 5:
|
| 465 |
+
with st.spinner("Training prediction model..."):
|
| 466 |
+
daily_data = prepare_data_for_prediction(recent_data)
|
| 467 |
+
model, training_data = train_sentiment_model(daily_data)
|
| 468 |
+
|
| 469 |
+
if model is not None:
|
| 470 |
+
full_data = predict_future_sentiment(model, training_data)
|
| 471 |
+
fig = plot_sentiment(full_data, keyword)
|
| 472 |
+
else:
|
| 473 |
+
fig = plot_sentiment(training_data, keyword)
|
| 474 |
+
else:
|
| 475 |
+
daily_data = prepare_data_for_prediction(recent_data)
|
| 476 |
+
fig = plot_sentiment(daily_data.assign(type='actual'), keyword)
|
| 477 |
+
|
| 478 |
if fig:
|
| 479 |
st.plotly_chart(fig, use_container_width=True)
|
| 480 |
|
| 481 |
+
# Show prediction insights
|
| 482 |
+
if enable_prediction and 'full_data' in locals() and full_data is not None:
|
| 483 |
+
last_actual = full_data[full_data['type'] == 'actual']['average'].iloc[-1]
|
| 484 |
+
last_pred = full_data[full_data['type'] == 'prediction']['average'].iloc[-1]
|
| 485 |
+
|
| 486 |
+
if last_pred > last_actual + 0.1:
|
| 487 |
+
st.success("π Prediction: Sentiment is expected to improve in the next 15 days")
|
| 488 |
+
elif last_pred < last_actual - 0.1:
|
| 489 |
+
st.warning("π Prediction: Sentiment is expected to decline in the next 15 days")
|
| 490 |
+
else:
|
| 491 |
+
st.info("π Prediction: Sentiment is expected to remain stable in the next 15 days")
|
| 492 |
+
|
| 493 |
# Show details if enabled
|
| 494 |
if show_details:
|
| 495 |
st.subheader("π Detailed Results")
|
| 496 |
st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
|
| 497 |
else:
|
| 498 |
+
st.info("No recent data found (within last 60 days).")
|
| 499 |
|
| 500 |
if __name__ == "__main__":
|
| 501 |
# Initialize NLTK data
|