danielthatu12 committed on
Commit
748c911
·
verified ·
1 Parent(s): 146af12

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +678 -678
model.py CHANGED
@@ -1,678 +1,678 @@
1
- """
2
- model.py – StockBuddy ML / NLP core
3
- ========================================
4
- LIGHTWEIGHT CHANGES vs original:
5
- [OPT-1] Removed `transformers` pipeline (was downloading ~1.2 GB BART model at
6
- runtime). Replaced with a fast NLTK-based extractive summariser.
7
- [OPT-2] Reduced technical indicators: 11 → 6 features (kept only the ones with
8
- highest predictive signal; fewer features = smaller tensors & faster fits).
9
- [OPT-3] LSTM architecture: 4 layers (64/64/32/32 units) → 2 layers (32/16 units).
10
- Still accurate enough for short-horizon forecasts, ~8× fewer parameters.
11
- [OPT-4] time_step: 45 → 30 (shorter look-back window → smaller tensors).
12
- [OPT-5] Epochs: 30 → 15, batch_size: 64 → 32 (free-tier CPU training time).
13
- [OPT-6] XGBoost n_estimators: 300 → 100, max_depth 6 → 4.
14
- [OPT-7] EarlyStopping patience reduced (5 instead of 10) so training exits fast
15
- when the model has converged.
16
- All public function signatures are identical to the original so app.py needs
17
- only minimal changes.
18
- """
19
-
20
- import numpy as np
21
- import pandas as pd
22
- import requests
23
- from sklearn.preprocessing import MinMaxScaler
24
- from tensorflow.keras.models import Sequential
25
- from tensorflow.keras.layers import LSTM, Dense, Dropout
26
- import xgboost as xgb
27
- import plotly.graph_objects as go
28
- from datetime import datetime, timedelta
29
- import nltk
30
- from nltk.sentiment.vader import SentimentIntensityAnalyzer
31
- # [OPT-1] No longer importing transformers – see generate_sentiment_summary below
32
- import time
33
-
34
- # Download VADER lexicon once (tiny file, safe on free tier)
35
- nltk.download("vader_lexicon", quiet=True)
36
-
37
# =============================================================================
# API Keys
# =============================================================================
# Keys are read from the environment first so they can be rotated without
# editing source. The literals below remain only as a backward-compatible
# fallback; they are committed in plain text, so treat them as compromised
# and replace them via the environment variables.
import os

ALPHAVANTAGE_API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY") or "IELF382B4X42YRTX"
FINNHUB_API_KEY = (os.environ.get("FINNHUB_API_KEY")
                   or "cu5gvghr01qqj8u6iau0cu5gvghr01qqj8u6iaug")
42
-
43
- # =============================================================================
44
- # STOCK PRICE PREDICTION FUNCTIONS
45
- # =============================================================================
46
-
47
def fetch_stock_data(symbol, outputsize="full"):
    """Download daily price history for *symbol* from Alpha Vantage.

    The JSON time series is turned into a date-indexed DataFrame sorted
    oldest-first, numeric columns are cast to float and renamed to
    Open/High/Low/Close/Volume, and the frame is passed through
    ``add_technical_indicators`` before being returned.

    Args:
        symbol: Ticker symbol sent to the API.
        outputsize: Value for the API's ``outputsize`` parameter.

    Returns:
        The result of ``add_technical_indicators`` for the price frame.

    Raises:
        ValueError: If the response contains no usable daily time series.
    """
    params = {
        "function": "TIME_SERIES_DAILY",
        "symbol": symbol,
        "apikey": ALPHAVANTAGE_API_KEY,
        "outputsize": outputsize,
        "datatype": "json",
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        "https://www.alphavantage.co/query", params=params, timeout=30)
    data = response.json()

    if "Time Series (Daily)" not in data:
        if "Error Message" in data:
            raise ValueError(
                f"Symbol '{symbol}' not found. Please verify the stock symbol.")
        if "Note" in data:
            raise ValueError(
                "API request limit reached. Please try again in a minute.")
        if "Information" in data:
            # Never echo the API key (or the raw server text, which may
            # repeat it) into a user-facing error message.
            raise ValueError(
                "Alpha Vantage API limit reached: the free-tier daily request "
                "quota appears to be exhausted. Please try again later or "
                "configure a different API key.")
        raise ValueError(
            f"Unable to fetch data for symbol '{symbol}'. Please verify the symbol.")

    ts = data["Time Series (Daily)"]
    if not ts:
        # An empty series would otherwise fail later on df.index[-1].
        raise ValueError(
            f"Unable to fetch data for symbol '{symbol}'. Please verify the symbol.")

    df = pd.DataFrame.from_dict(ts, orient="index")
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    for col in ["1. open", "2. high", "3. low", "4. close", "5. volume"]:
        if col in df.columns:
            df[col] = df[col].astype(float)

    df = df.rename(columns={
        "1. open": "Open",
        "2. high": "High",
        "3. low": "Low",
        "4. close": "Close",
        "5. volume": "Volume",
    })

    # Staleness check. The hour must be read from the un-normalised
    # timestamp: normalize() resets the time to midnight, so reading the hour
    # afterwards would always give 0.
    # NOTE(review): uses the machine's local clock; 16:00 presumably stands
    # for the market close - confirm the intended time zone.
    latest_date = df.index[-1]
    now = pd.Timestamp.now()
    today = now.normalize()
    market_closed_days = 0
    if today.dayofweek >= 5:
        market_closed_days = today.dayofweek - 4
    elif now.hour < 16:
        market_closed_days = 1
    expected_latest = today - pd.Timedelta(days=market_closed_days)
    date_diff = (expected_latest - latest_date).days
    if date_diff > 5:
        print(f"WARNING: Latest data for {symbol} is from "
              f"{latest_date.strftime('%Y-%m-%d')} ({date_diff} days old).")

    print(f"\nLatest closing price for {symbol} "
          f"(as of {latest_date.strftime('%Y-%m-%d')}): ${df['Close'].iloc[-1]:.2f}")

    # Add lightweight technical indicators
    return add_technical_indicators(df)
108
-
109
-
110
- # [OPT-2] Reduced feature set: 11 → 6 (Close, RSI, SMA5, MACD, Upper_Band, ROC)
111
def add_technical_indicators(df):
    """Return a compact six-feature frame derived from the Close column.

    Features: Close, RSI (14), SMA5, MACD line (EMA12 - EMA26), upper
    Bollinger band (20-period mean + 2 std) and 5-period rate of change.
    Rows containing NaN (for example the warm-up rows of the rolling
    windows) are dropped. The input frame is not modified.

    Args:
        df: DataFrame expected to contain Close, Open, High and Low columns.

    Returns:
        A DataFrame with the six feature columns, or a Close-only frame if
        a required column is missing or the computation fails (the input is
        returned unchanged when it has no Close column at all).
    """
    try:
        for col in ("Close", "Open", "High", "Low"):
            if col not in df.columns:
                print(f"Warning: {col} missing – falling back to Close-only.")
                return df[["Close"]]

        # Work on a copy so the caller's frame does not silently gain columns.
        df = df.copy()
        close = df["Close"]

        # RSI (14-period)
        delta = close.diff()
        gain = delta.where(delta > 0, 0).rolling(14).mean()
        loss = -delta.where(delta < 0, 0).rolling(14).mean()
        rs = gain / loss
        df["RSI"] = 100 - (100 / (1 + rs))

        # Short moving average
        df["SMA5"] = close.rolling(5).mean()

        # MACD line only (no signal line)
        df["MACD"] = close.ewm(span=12).mean() - close.ewm(span=26).mean()

        # Upper Bollinger Band as a proxy for volatility
        window20 = close.rolling(20)
        df["Upper_Band"] = window20.mean() + (window20.std() * 2)

        # Rate-of-change (5-period)
        df["ROC"] = close.pct_change(periods=5) * 100

        df = df.dropna()

        features = ["Close", "RSI", "SMA5", "MACD", "Upper_Band", "ROC"]
        return df[features]

    except Exception as e:
        print(f"Error adding technical indicators: {e}")
        if "Close" in df.columns:
            return df[["Close"]]
        return df
153
-
154
-
155
def preprocess_data(data):
    """Min-max scale every column of *data* to [0, 1] independently.

    Returns a tuple ``(scaled_array, close_scaler)`` where ``scaled_array``
    has one column per input column (same order) and ``close_scaler`` is the
    scaler that was fitted on the "Close" column.
    """
    column_names = data.columns
    fitted = {}
    scaled_data = np.zeros((len(data), len(column_names)))

    for col_idx, name in enumerate(column_names):
        scaler = MinMaxScaler(feature_range=(0, 1))
        fitted[name] = scaler
        column = data[name].values.reshape(-1, 1)
        scaled_data[:, col_idx] = scaler.fit_transform(column).flatten()

    return scaled_data, fitted["Close"]
171
-
172
-
173
def create_sequences(data, time_step=30):
    """Build sliding-window (X, y) training pairs from a 2-D feature array.

    Each sample ``X[i]`` holds ``time_step`` consecutive rows (all columns)
    and ``y[i]`` is column 0 of the row immediately after that window.

    Every available window is used, including the one whose target is the
    final row; stopping one window short would leave the most recent
    observation out of training.

    Args:
        data: Array-like of shape (rows, features); column 0 is the target.
        time_step: Number of rows per input window.

    Returns:
        ``(X, y)`` as NumPy arrays. When there are too few rows for a single
        window both arrays are empty (length 0).
    """
    data = np.asarray(data)
    n_windows = len(data) - time_step
    if n_windows <= 0:
        # Keep the trailing dimensions so downstream shape checks still work.
        return np.empty((0, time_step) + data.shape[1:]), np.empty((0,))

    X = [data[start:start + time_step, :] for start in range(n_windows)]
    y = [data[start + time_step, 0] for start in range(n_windows)]
    return np.array(X), np.array(y)
180
-
181
-
182
- # [OPT-3] Slimmed LSTM: 2 layers (32 / 16 units) instead of 4 layers (64/64/32/32)
183
- # [OPT-4] time_step default lowered to 30
184
- # [OPT-5] epochs 30 → 15, batch_size 64 → 32, EarlyStopping patience 10 → 5
185
def train_lstm(X_train, y_train, time_step=30, stop_requested_callback=None):
    """Fit and return a small two-layer LSTM regressor.

    Network: LSTM(32, sequences) -> Dropout(0.2) -> LSTM(16) -> Dropout(0.2)
    -> Dense(1), trained with Adam (lr 0.001) on mean squared error for up
    to 15 epochs, batch size 32, holding out 20% of the samples for
    validation. Learning-rate reduction and early stopping both monitor
    ``val_loss``.

    Args:
        X_train: Array indexed as (samples, time_step, features).
        y_train: Target values, one per sample.
        time_step: Window length used to reshape ``X_train``.
        stop_requested_callback: Optional zero-argument callable; when it
            returns a truthy value at the end of an epoch, training stops.

    Returns:
        The trained Keras model.
    """
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback

    class StopCallback(Callback):
        # Polls the user-supplied flag once per epoch.
        def on_epoch_end(self, epoch, logs=None):
            if stop_requested_callback and stop_requested_callback():
                self.model.stop_training = True
                print("Training stopped early by user request.")

    n_features = X_train.shape[2]
    X_train = X_train.reshape(X_train.shape[0], time_step, n_features)

    model = Sequential()
    model.add(LSTM(32, return_sequences=True,
                   input_shape=(time_step, n_features)))
    model.add(Dropout(0.2))
    model.add(LSTM(16, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss="mean_squared_error")

    callbacks = [
        ReduceLROnPlateau(monitor="val_loss", factor=0.3,
                          patience=3, min_lr=0.0001, verbose=0),
        EarlyStopping(monitor="val_loss", patience=5,
                      restore_best_weights=True, verbose=1),
    ]
    if stop_requested_callback:
        callbacks.append(StopCallback())

    print(f"Training lightweight LSTM: {X_train.shape[0]} samples, "
          f"{n_features} features, time_step={time_step}")

    fit_options = {
        "epochs": 15,
        "batch_size": 32,
        "validation_split": 0.2,
        "callbacks": callbacks,
        "verbose": 1,
    }
    model.fit(X_train, y_train, **fit_options)
    return model
241
-
242
-
243
- # [OPT-6] XGBoost: n_estimators 300 → 100, max_depth 6 → 4
244
def train_xgboost(X_train, residuals, stop_requested_callback=None):
    """Fit an XGBoost regressor on the LSTM residuals.

    Args:
        X_train: 2-D feature matrix (one flattened window per row).
        residuals: Target values (actual minus LSTM prediction).
        stop_requested_callback: Optional zero-argument callable. If it
            returns a truthy value before training, nothing is trained; during
            training it is polled after every boosting round.

    Returns:
        The fitted ``xgb.XGBRegressor``, or ``None`` if a stop was requested
        before training began.
    """
    if stop_requested_callback and stop_requested_callback():
        print("XGBoost training cancelled due to stop request.")
        return None

    params = {
        "objective": "reg:squarederror",
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "gamma": 0.1,
        "reg_alpha": 0.1,
        "reg_lambda": 1.0,
        "tree_method": "hist",
    }

    if stop_requested_callback:
        class StopCallbackHandler(xgb.callback.TrainingCallback):
            # Returning True from after_iteration asks XGBoost to stop.
            def after_iteration(self, model, epoch, evals_log):
                if stop_requested_callback():
                    print("XGBoost training stopped by user request.")
                    return True
                return False

        xgb_model = xgb.XGBRegressor(**params)
        xgb_model.set_params(callbacks=[StopCallbackHandler()])
        xgb_model.fit(X_train, residuals)
    else:
        # eval_metric / early_stopping_rounds are estimator parameters; passing
        # them to fit() is deprecated and rejected by newer XGBoost releases.
        # NOTE(review): confirm against the installed xgboost version.
        xgb_model = xgb.XGBRegressor(
            **params,
            eval_metric=["rmse"],
            early_stopping_rounds=10,
        )
        # NOTE(review): the evaluation set is the training data itself, so
        # early stopping will rarely trigger; a held-out set would be needed
        # for it to be meaningful.
        xgb_model.fit(
            X_train, residuals,
            eval_set=[(X_train, residuals)],
            verbose=False,
        )

    return xgb_model
287
-
288
-
289
def predict_stock_price(
    lstm_model, xgb_model, data, scaler,
    time_step=30, days_ahead=5, stop_requested_callback=None
):
    """Roll the LSTM (+ optional XGBoost residual) forecast forward.

    For each future day the last ``time_step`` rows are fed to the models,
    a small random, trend-biased perturbation is added to the predicted
    price, and the result is appended (as column 0 of a copy of the last
    row) so it becomes input for the next day. All predictions are finally
    multiplied by a correction factor that anchors the model's estimate of
    the current price to the last actual close.

    Args:
        lstm_model: Object with ``predict(x, verbose=0)``.
        xgb_model: Object with ``predict(x)`` or ``None``.
        data: Scaled 2-D array; column 0 is the scaled close.
        scaler: Scaler for column 0 (``transform`` / ``inverse_transform``).
        time_step: Window length fed to the models.
        days_ahead: Number of steps to forecast.
        stop_requested_callback: Optional zero-argument callable polled
            before starting and before each day.

    Returns:
        Array of corrected prices with one row per predicted day, or
        ``None`` if stopped before any prediction was made.
    """
    if stop_requested_callback and stop_requested_callback():
        return None

    def to_price(scaled_value):
        # Convert one scaled close back to price units.
        return scaler.inverse_transform(np.array([[scaled_value]]))[0][0]

    n_features = data.shape[1]
    temp_input = data[-time_step:].tolist()

    last_actual_close = to_price(data[-1, 0])
    print(f"Base price: ${last_actual_close:.2f}")

    original_prices = scaler.inverse_transform(data[:, 0].reshape(-1, 1))
    daily_returns = np.diff(original_prices, axis=0) / original_prices[:-1]
    volatility = np.std(daily_returns)

    # Calibrate model against actual last price
    window = np.array(temp_input[-time_step:])
    lstm_pred_cal = lstm_model.predict(
        window.reshape(1, time_step, n_features), verbose=0)[0][0]
    xgb_input_cal = np.array(temp_input[-time_step:]).reshape(1, -1)
    try:
        if xgb_model is not None:
            residual_cal = xgb_model.predict(xgb_input_cal)[0]
        else:
            residual_cal = 0
        combined_cal = lstm_pred_cal + residual_cal
    except Exception:
        combined_cal = lstm_pred_cal

    model_current = to_price(combined_cal)
    if model_current > 0:
        correction_factor = last_actual_close / model_current
    else:
        correction_factor = 1.0
    print(f"Calibration: model=${model_current:.2f}, "
          f"actual=${last_actual_close:.2f}, factor={correction_factor:.4f}")

    predictions = []
    prev_day_pred = combined_cal

    for day in range(days_ahead):
        if stop_requested_callback and stop_requested_callback():
            print(f"Prediction stopped at day {day}/{days_ahead}")
            break

        window = np.array(temp_input[-time_step:])
        lstm_pred = lstm_model.predict(
            window.reshape(1, time_step, n_features), verbose=0)[0][0]
        xgb_input = np.array(temp_input[-time_step:]).reshape(1, -1)

        try:
            if xgb_model is not None:
                combined_pred = lstm_pred + xgb_model.predict(xgb_input)[0]
            else:
                combined_pred = lstm_pred
        except Exception as e:
            print(f"XGBoost predict error: {e}")
            combined_pred = lstm_pred

        prev_unscaled = to_price(prev_day_pred)
        current_unscaled = to_price(combined_pred)
        trend_direction = 1 if current_unscaled - prev_unscaled >= 0 else -1

        # Noise grows with the horizon but is capped at 1.5%.
        adjusted_volatility = min(volatility * (1 + day * 0.1), 0.015)
        # Draw order matters for reproducibility: normal first, then uniform.
        random_factor = np.random.normal(0, adjusted_volatility)
        coin = np.random.random()

        if trend_direction > 0:
            if coin < 0.7:
                flux_factor = abs(random_factor) * trend_direction * 0.15
            else:
                flux_factor = -abs(random_factor) * trend_direction * 0.3
        else:
            if coin < 0.8:
                flux_factor = abs(random_factor) * trend_direction * 0.25
            else:
                flux_factor = -abs(random_factor) * trend_direction * 0.1

        adjusted_unscaled = current_unscaled + prev_unscaled * flux_factor
        adjusted_pred = scaler.transform(
            np.array([[adjusted_unscaled]]))[0][0]

        # Only the close changes; the other features are carried forward.
        next_row = temp_input[-1].copy()
        next_row[0] = adjusted_pred
        prev_day_pred = adjusted_pred

        predictions.append(adjusted_pred)
        temp_input.append(next_row)

    if not predictions:
        return None

    final_predictions = scaler.inverse_transform(
        np.array(predictions).reshape(-1, 1))
    corrected_predictions = final_predictions * correction_factor

    print("\nPredictions (original → corrected):")
    for i, (raw, fixed) in enumerate(zip(final_predictions, corrected_predictions)):
        print(f"  Day {i+1}: ${raw[0]:.2f} "
              f"→ ${fixed[0]:.2f}")

    return corrected_predictions
389
-
390
-
391
def plot_prices(data, predictions, symbol, days_ahead):
    """Show an interactive chart of recent closes plus the forecast.

    Plots the last three months of actual closing prices, a marker on the
    latest close, and the predicted prices on the next business days
    (Mon-Fri; exchange holidays are not taken into account).

    Args:
        data: Date-indexed frame; the "Close" column is used when present,
            otherwise the first column.
        predictions: Array of predicted prices, one per future day.
        symbol: Ticker shown in the chart title.
        days_ahead: Number of future days to plot.
    """
    fig = go.Figure()
    last_date = data.index[-1]
    actual_data = data.loc[last_date - pd.DateOffset(months=3):]
    if isinstance(actual_data, pd.DataFrame) and "Close" in actual_data.columns:
        close_prices = actual_data["Close"]
    else:
        close_prices = actual_data.iloc[:, 0]

    # Take the next N business days directly. Skipping weekends day by day
    # and de-duplicating afterwards yields fewer than N dates whenever the
    # window crosses a weekend, which silently dropped predictions.
    future_dates = list(pd.bdate_range(
        start=last_date + pd.Timedelta(days=1), periods=max(days_ahead, 0)))
    prediction_data = predictions[: len(future_dates)].flatten()
    # Predictions can be shorter than requested (e.g. stopped early).
    future_dates = future_dates[: len(prediction_data)]

    fig.add_trace(go.Scatter(
        x=future_dates, y=prediction_data,
        mode="lines+markers", name="Predicted Price",
        line=dict(color="orange", width=3)))
    fig.add_trace(go.Scatter(
        x=close_prices.index, y=close_prices.values,
        mode="lines", name="Actual Price",
        line=dict(color="blue", width=2)))
    fig.add_trace(go.Scatter(
        x=[close_prices.index[-1]], y=[close_prices.values[-1]],
        mode="markers", name="Latest Price",
        marker=dict(color="green", size=10, symbol="circle")))

    fig.update_layout(
        title=f"Stock Price Prediction for {symbol}",
        xaxis_title="Date", yaxis_title="Price (USD)",
        template="plotly_white", hovermode="x unified")
    fig.show()
428
-
429
-
430
- # =============================================================================
431
- # NEWS SENTIMENT ANALYSIS FUNCTIONS
432
- # =============================================================================
433
-
434
def fetch_finnhub_news(company_symbol):
    """Fetch the last 28 days of news headlines for a symbol from Finnhub.

    Args:
        company_symbol: Ticker symbol to query.

    Returns:
        A list of headline strings; an empty list on a non-200 response or
        on any request/parsing error (the error is printed, not raised).
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=28)
    # Passing params lets requests URL-encode the symbol and token instead of
    # splicing raw text into the query string.
    params = {
        "symbol": company_symbol,
        "from": start_date.strftime("%Y-%m-%d"),
        "to": end_date.strftime("%Y-%m-%d"),
        "token": FINNHUB_API_KEY,
    }
    try:
        # A timeout keeps a stalled connection from blocking the caller.
        response = requests.get(
            "https://finnhub.io/api/v1/company-news", params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error fetching news: {response.status_code}")
            return []
        articles = response.json()
        return [a["headline"] for a in articles if "headline" in a]
    except Exception as e:
        print(f"Error fetching or parsing news response: {e}")
        return []
454
-
455
-
456
def analyze_sentiment(headlines):
    """Score each headline with VADER and tally the sentiment classes.

    A headline counts as positive when its compound score is above 0.05,
    negative when below -0.05, and neutral otherwise. Empty or non-string
    entries are skipped.

    Returns:
        ``(results, totals)`` where ``results`` is a list of
        ``{"headline", "sentiment"}`` dicts and ``totals`` maps
        "positive"/"negative"/"neutral" to counts. On any error an empty
        list and all-zero totals are returned.
    """
    try:
        analyzer = SentimentIntensityAnalyzer()
        scored = []
        counts = {"positive": 0, "negative": 0, "neutral": 0}

        for text in headlines:
            if not (text and isinstance(text, str)):
                continue
            scores = analyzer.polarity_scores(text)
            scored.append({"headline": text, "sentiment": scores})

            compound = scores["compound"]
            if compound > 0.05:
                bucket = "positive"
            elif compound < -0.05:
                bucket = "negative"
            else:
                bucket = "neutral"
            counts[bucket] += 1

        return scored, counts
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return [], {"positive": 0, "negative": 0, "neutral": 0}
478
-
479
-
480
def plot_sentiment_pie(sentiment_totals, company_symbol):
    """Display a pie chart of positive / negative / neutral headline counts.

    Args:
        sentiment_totals: Mapping with "positive", "negative" and "neutral"
            counts.
        company_symbol: Ticker shown in the chart title.
    """
    slice_labels = ["Positive", "Negative", "Neutral"]
    slice_values = [sentiment_totals[key]
                    for key in ("positive", "negative", "neutral")]
    slice_style = dict(colors=["#2ecc71", "#e74c3c", "#95a5a6"],
                       line=dict(color="white", width=0))

    pie = go.Pie(labels=slice_labels, values=slice_values, marker=slice_style,
                 textinfo="percent+label", textfont_size=20)
    fig = go.Figure(data=[pie])
    fig.update_layout(
        title=f"Sentiment Distribution for {company_symbol} (Last 28 Days)",
        showlegend=True)
    fig.show()
493
-
494
-
495
- # =============================================================================
496
- # AI SUMMARY FUNCTIONS [OPT-1] Transformers removed
497
- # =============================================================================
498
-
499
- def _extractive_summary(headlines, n=3):
500
- """
501
- Lightweight extractive summariser – replaces the BART transformer pipeline.
502
- [OPT-1] Picks the top-n headlines by absolute VADER compound score so the
503
- most opinionated sentences surface first. No heavy model download needed.
504
- """
505
- if not headlines:
506
- return ""
507
- try:
508
- sid = SentimentIntensityAnalyzer()
509
- scored = [(h, abs(sid.polarity_scores(h)["compound"]))
510
- for h in headlines if h and isinstance(h, str)]
511
- scored.sort(key=lambda x: x[1], reverse=True)
512
- top = [h for h, _ in scored[:n]]
513
- return " | ".join(top)
514
- except Exception as e:
515
- print(f"Extractive summary error: {e}")
516
- return headlines[0] if headlines else ""
517
-
518
-
519
def generate_sentiment_summary(sentiment_totals, headlines, company_symbol):
    """Build a one-paragraph, human-readable sentiment summary.

    Reports how many headlines were analysed and the positive / negative /
    neutral counts (with percentages for the first two), then appends the
    two key headlines chosen by ``_extractive_summary`` when any headlines
    are present. Returns a fixed fallback sentence if anything goes wrong.
    """
    try:
        positive = sentiment_totals["positive"]
        negative = sentiment_totals["negative"]
        neutral = sentiment_totals["neutral"]

        # Guard against division by zero when nothing was classified.
        total = max(1, sum(sentiment_totals.values()))
        pos_pct = positive / total * 100
        neg_pct = negative / total * 100

        pieces = [
            f"Over the past 28 days, {len(headlines)} news articles about ",
            f"{company_symbol} were analysed. ",
            f"{positive} positive ({pos_pct:.0f}%), ",
            f"{negative} negative ({neg_pct:.0f}%), ",
            f"and {neutral} neutral articles found.",
        ]
        summary = "".join(pieces)

        if headlines:
            key_headlines = _extractive_summary(headlines, n=2)
            if key_headlines:
                summary += f" Key headlines: {key_headlines}"

        return summary
    except Exception as e:
        print(f"Error in generate_sentiment_summary: {e}")
        return f"Unable to generate sentiment summary for {company_symbol}."
547
-
548
-
549
def generate_prediction_summary(pred_df, company_symbol):
    """Describe the span of predicted prices in one sentence.

    The sentence states the range of the forecast, so it reports the lowest
    and highest predicted price rather than the first and last ones (which
    misstate the range whenever the forecast is not monotonic).

    Args:
        pred_df: DataFrame with a "Predicted Price" column.
        company_symbol: Ticker used in the sentence.

    Returns:
        The summary sentence, or a short notice if there are no predictions.
    """
    prices = pred_df["Predicted Price"]
    if len(prices) == 0:
        return f"No predicted prices are available for {company_symbol}."
    lowest = prices.min()
    highest = prices.max()
    return (
        f"The predicted stock prices for {company_symbol} range from "
        f"${lowest:.2f} to ${highest:.2f} over the forecast period."
    )
556
-
557
-
558
def display_price_table(data, predictions, symbol, days_ahead):
    """Print a table of the last actual close followed by the forecast.

    Predictions are dated on the next business days (Mon-Fri; exchange
    holidays are not taken into account) after the last row of ``data``.

    Args:
        data: Date-indexed frame; the "Close" column is used when present,
            otherwise the first column.
        predictions: Array-like of predicted prices, one per future day.
        symbol: Ticker printed in the table heading.
        days_ahead: Number of future days requested.

    Returns:
        DataFrame with "Date" (YYYY-MM-DD strings) and "Predicted Price".
    """
    if isinstance(data, pd.DataFrame) and "Close" in data.columns:
        last_price = data["Close"].iloc[-1]
    else:
        last_price = data.iloc[-1, 0]
    last_date = data.index[-1]

    # Take the next N business days directly. Skipping weekends day by day
    # and de-duplicating afterwards yields fewer than N dates whenever the
    # window crosses a weekend, which silently dropped predictions.
    future_dates = list(pd.bdate_range(
        start=last_date + pd.Timedelta(days=1), periods=max(days_ahead, 0)))
    prediction_data = np.asarray(predictions)[: len(future_dates)].flatten()
    # Predictions can be shorter than requested (e.g. stopped early); keep
    # the two sequences the same length so the result frame can be built.
    future_dates = future_dates[: len(prediction_data)]

    last_price_row = pd.DataFrame({
        "Date": [last_date.strftime("%Y-%m-%d")],
        "Price": [f"${last_price:.2f}"],
        "Change": ["0.00%"],
        "Note": ["Actual last closing price"],
    })
    pred_rows = []
    for i, (date, price) in enumerate(zip(future_dates, prediction_data)):
        change_pct = ((price - last_price) / last_price) * 100
        pred_rows.append({
            "Date": date.strftime("%Y-%m-%d"),
            "Price": f"${price:.2f}",
            "Change": f"{change_pct:.2f}%",
            "Note": f"Day {i+1} prediction",
        })

    combined_df = pd.concat([last_price_row, pd.DataFrame(pred_rows)],
                            ignore_index=True)
    print(f"\n{symbol} Stock Price Prediction Table:")
    print("=" * 80)
    print(combined_df.to_string(index=False))
    print("=" * 80)

    return pd.DataFrame({
        "Date": [d.strftime("%Y-%m-%d") for d in future_dates],
        "Predicted Price": prediction_data,
    })
603
-
604
-
605
- # =============================================================================
606
- # STANDALONE MAIN
607
- # =============================================================================
608
-
609
def main():
    """Interactive command-line run: fetch, train, forecast, then sentiment.

    Prompts for a ticker and a forecast horizon, trains the LSTM and the
    XGBoost residual model on the first 80% of the generated sequences,
    prints the forecast table and summary, and finally prints a news
    sentiment summary. Problems are reported on stdout and end the run
    early instead of raising.
    """
    symbol = input("Enter the stock symbol (e.g., AAPL): ").upper()
    try:
        days_ahead = int(input("Number of future days to predict (e.g., 5): "))
    except ValueError:
        print("Invalid input. Please enter an integer.")
        return
    if days_ahead <= 0:
        print("Invalid input. Please enter a positive number of days.")
        return

    print(f"\nFetching historical data for {symbol}...")
    try:
        data = fetch_stock_data(symbol, outputsize="full")
    except ValueError as exc:
        # fetch_stock_data reports API problems as ValueError.
        print(f"Could not fetch data for {symbol}: {exc}")
        return
    if data is None or len(data) < 50:
        print(f"Not enough data points for {symbol}.")
        return

    print("Preprocessing data...")
    scaled_data, scaler = preprocess_data(data)

    time_step = 30
    X, y = create_sequences(scaled_data, time_step)
    if len(X) == 0:
        print("Could not create sequences.")
        return

    train_size = int(len(X) * 0.8)
    X_train, y_train = X[:train_size], y[:train_size]

    print("Training LSTM model...")
    lstm_model = train_lstm(X_train, y_train, time_step)

    lstm_train_preds = lstm_model.predict(X_train, verbose=0).flatten()
    residuals = y_train - lstm_train_preds

    print("Training XGBoost model...")
    xgb_model = train_xgboost(X_train.reshape(X_train.shape[0], -1), residuals)

    print(f"Predicting {days_ahead} days ahead...")
    predictions = predict_stock_price(
        lstm_model, xgb_model, scaled_data, scaler, time_step, days_ahead)
    if predictions is None:
        # predict_stock_price returns None when it produced nothing.
        print("No predictions were produced.")
        return

    # display_price_table already returns the Date / Predicted Price frame,
    # so there is no need to rebuild the future dates here.
    pred_df = display_price_table(data, predictions, symbol, days_ahead)
    print("\nPrediction summary:")
    if len(pred_df) > 0:
        print(generate_prediction_summary(pred_df, symbol))
    else:
        print("No predictions available to summarise.")

    print("\nFetching news for sentiment analysis...")
    headlines = fetch_finnhub_news(symbol)
    if headlines:
        sentiment_results, sentiment_totals = analyze_sentiment(headlines)
        plot_sentiment_pie(sentiment_totals, symbol)
        print(generate_sentiment_summary(sentiment_totals, headlines, symbol))
    else:
        print("No headlines found.")
675
-
676
-
677
- if __name__ == "__main__":
678
- main()
 
1
+ """
2
+ model.py – StockBuddy ML / NLP core
3
+ ========================================
4
+ LIGHTWEIGHT CHANGES vs original:
5
+ [OPT-1] Removed `transformers` pipeline (was downloading ~1.2 GB BART model at
6
+ runtime). Replaced with a fast NLTK-based extractive summariser.
7
+ [OPT-2] Reduced technical indicators: 11 → 6 features (kept only the ones with
8
+ highest predictive signal; fewer features = smaller tensors & faster fits).
9
+ [OPT-3] LSTM architecture: 4 layers (64/64/32/32 units) → 2 layers (32/16 units).
10
+ Still accurate enough for short-horizon forecasts, ~8× fewer parameters.
11
+ [OPT-4] time_step: 45 → 30 (shorter look-back window → smaller tensors).
12
+ [OPT-5] Epochs: 30 → 15, batch_size: 64 → 32 (free-tier CPU training time).
13
+ [OPT-6] XGBoost n_estimators: 300 → 100, max_depth 6 → 4.
14
+ [OPT-7] EarlyStopping patience reduced (5 instead of 10) so training exits fast
15
+ when the model has converged.
16
+ All public function signatures are identical to the original so app.py needs
17
+ only minimal changes.
18
+ """
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import requests
23
+ from sklearn.preprocessing import MinMaxScaler
24
+ from tensorflow.keras.models import Sequential
25
+ from tensorflow.keras.layers import LSTM, Dense, Dropout
26
+ import xgboost as xgb
27
+ import plotly.graph_objects as go
28
+ from datetime import datetime, timedelta
29
+ import nltk
30
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
31
+ # [OPT-1] No longer importing transformers – see generate_sentiment_summary below
32
+ import time
33
+
34
+ # Download VADER lexicon once (tiny file, safe on free tier)
35
+ nltk.download("vader_lexicon", quiet=True)
36
+
37
# =============================================================================
# API Keys
# =============================================================================
# Keys are read from the environment first so they can be rotated without
# editing source. The literals below remain only as a backward-compatible
# fallback; they are committed in plain text, so treat them as compromised
# and replace them via the environment variables.
import os

ALPHAVANTAGE_API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY") or "U4SSQJFDQHO1M2ZH"
FINNHUB_API_KEY = (os.environ.get("FINNHUB_API_KEY")
                   or "cu5gvghr01qqj8u6iau0cu5gvghr01qqj8u6iaug")
42
+
43
+ # =============================================================================
44
+ # STOCK PRICE PREDICTION FUNCTIONS
45
+ # =============================================================================
46
+
47
def fetch_stock_data(symbol, outputsize="full"):
    """Download daily price history for *symbol* from Alpha Vantage.

    The JSON time series is turned into a date-indexed DataFrame sorted
    oldest-first, numeric columns are cast to float and renamed to
    Open/High/Low/Close/Volume, and the frame is passed through
    ``add_technical_indicators`` before being returned.

    Args:
        symbol: Ticker symbol sent to the API.
        outputsize: Value for the API's ``outputsize`` parameter.

    Returns:
        The result of ``add_technical_indicators`` for the price frame.

    Raises:
        ValueError: If the response contains no usable daily time series.
    """
    params = {
        "function": "TIME_SERIES_DAILY",
        "symbol": symbol,
        "apikey": ALPHAVANTAGE_API_KEY,
        "outputsize": outputsize,
        "datatype": "json",
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        "https://www.alphavantage.co/query", params=params, timeout=30)
    data = response.json()

    if "Time Series (Daily)" not in data:
        if "Error Message" in data:
            raise ValueError(
                f"Symbol '{symbol}' not found. Please verify the stock symbol.")
        if "Note" in data:
            raise ValueError(
                "API request limit reached. Please try again in a minute.")
        if "Information" in data:
            # Never echo the API key (or the raw server text, which may
            # repeat it) into a user-facing error message.
            raise ValueError(
                "Alpha Vantage API limit reached: the free-tier daily request "
                "quota appears to be exhausted. Please try again later or "
                "configure a different API key.")
        raise ValueError(
            f"Unable to fetch data for symbol '{symbol}'. Please verify the symbol.")

    ts = data["Time Series (Daily)"]
    if not ts:
        # An empty series would otherwise fail later on df.index[-1].
        raise ValueError(
            f"Unable to fetch data for symbol '{symbol}'. Please verify the symbol.")

    df = pd.DataFrame.from_dict(ts, orient="index")
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    for col in ["1. open", "2. high", "3. low", "4. close", "5. volume"]:
        if col in df.columns:
            df[col] = df[col].astype(float)

    df = df.rename(columns={
        "1. open": "Open",
        "2. high": "High",
        "3. low": "Low",
        "4. close": "Close",
        "5. volume": "Volume",
    })

    # Staleness check. The hour must be read from the un-normalised
    # timestamp: normalize() resets the time to midnight, so reading the hour
    # afterwards would always give 0.
    # NOTE(review): uses the machine's local clock; 16:00 presumably stands
    # for the market close - confirm the intended time zone.
    latest_date = df.index[-1]
    now = pd.Timestamp.now()
    today = now.normalize()
    market_closed_days = 0
    if today.dayofweek >= 5:
        market_closed_days = today.dayofweek - 4
    elif now.hour < 16:
        market_closed_days = 1
    expected_latest = today - pd.Timedelta(days=market_closed_days)
    date_diff = (expected_latest - latest_date).days
    if date_diff > 5:
        print(f"WARNING: Latest data for {symbol} is from "
              f"{latest_date.strftime('%Y-%m-%d')} ({date_diff} days old).")

    print(f"\nLatest closing price for {symbol} "
          f"(as of {latest_date.strftime('%Y-%m-%d')}): ${df['Close'].iloc[-1]:.2f}")

    # Add lightweight technical indicators
    return add_technical_indicators(df)
108
+
109
+
110
+ # [OPT-2] Reduced feature set: 11 → 6 (Close, RSI, SMA5, MACD, Upper_Band, ROC)
111
def add_technical_indicators(df):
    """Return a compact six-feature frame derived from the Close column.

    Features: Close, RSI (14), SMA5, MACD line (EMA12 - EMA26), upper
    Bollinger band (20-period mean + 2 std) and 5-period rate of change.
    Rows containing NaN (for example the warm-up rows of the rolling
    windows) are dropped. The input frame is not modified.

    Args:
        df: DataFrame expected to contain Close, Open, High and Low columns.

    Returns:
        A DataFrame with the six feature columns, or a Close-only frame if
        a required column is missing or the computation fails (the input is
        returned unchanged when it has no Close column at all).
    """
    try:
        for col in ("Close", "Open", "High", "Low"):
            if col not in df.columns:
                print(f"Warning: {col} missing – falling back to Close-only.")
                return df[["Close"]]

        # Work on a copy so the caller's frame does not silently gain columns.
        df = df.copy()
        close = df["Close"]

        # RSI (14-period)
        delta = close.diff()
        gain = delta.where(delta > 0, 0).rolling(14).mean()
        loss = -delta.where(delta < 0, 0).rolling(14).mean()
        rs = gain / loss
        df["RSI"] = 100 - (100 / (1 + rs))

        # Short moving average
        df["SMA5"] = close.rolling(5).mean()

        # MACD line only (no signal line)
        df["MACD"] = close.ewm(span=12).mean() - close.ewm(span=26).mean()

        # Upper Bollinger Band as a proxy for volatility
        window20 = close.rolling(20)
        df["Upper_Band"] = window20.mean() + (window20.std() * 2)

        # Rate-of-change (5-period)
        df["ROC"] = close.pct_change(periods=5) * 100

        df = df.dropna()

        features = ["Close", "RSI", "SMA5", "MACD", "Upper_Band", "ROC"]
        return df[features]

    except Exception as e:
        print(f"Error adding technical indicators: {e}")
        if "Close" in df.columns:
            return df[["Close"]]
        return df
153
+
154
+
155
def preprocess_data(data):
    """Min-max scale every column of ``data`` to [0, 1] independently.

    Args:
        data: DataFrame of features; must contain a "Close" column.

    Returns:
        A tuple ``(scaled, close_scaler)`` where ``scaled`` is a float array
        of shape (rows, columns) in the column order of ``data`` and
        ``close_scaler`` is the fitted scaler of the "Close" column, used
        later to map predictions back to prices.
    """
    column_names = data.columns
    fitted = {}
    scaled = np.zeros((len(data), len(column_names)))

    for col_idx, name in enumerate(column_names):
        scaler = MinMaxScaler(feature_range=(0, 1))
        # The scaler expects a 2-D column vector.
        column_vector = data[name].values.reshape(-1, 1)
        scaled[:, col_idx] = scaler.fit_transform(column_vector).flatten()
        fitted[name] = scaler

    return scaled, fitted["Close"]
171
+
172
+
173
def create_sequences(data, time_step=30):
    """Create (X, y) sliding-window sequences for LSTM training.

    Window ``i`` covers rows ``i .. i + time_step - 1`` (all features) and its
    target is column 0 (the scaled Close price) of row ``i + time_step``.
    Every row that has a full window in front of it is used as a target,
    including the most recent one.

    Args:
        data: 2-D array-like of shape (rows, features), Close in column 0.
        time_step: Number of consecutive rows per input window.

    Returns:
        ``(X, y)`` with ``X`` of shape (samples, time_step, features) and
        ``y`` of shape (samples,), where samples = rows - time_step. When
        there are too few rows both arrays are empty (``len(X) == 0``) but
        keep their trailing dimensions.
    """
    data = np.asarray(data)
    n_samples = len(data) - time_step
    if n_samples <= 0:
        # Keep the trailing dimensions so callers can still inspect shape.
        return np.empty((0, time_step) + data.shape[1:]), np.empty((0,))

    X = np.stack([data[start:start + time_step, :] for start in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    y = data[time_step:, 0].copy()
    return X, y
180
+
181
+
182
+ # [OPT-3] Slimmed LSTM: 2 layers (32 / 16 units) instead of 4 layers (64/64/32/32)
183
+ # [OPT-4] time_step default lowered to 30
184
+ # [OPT-5] epochs 30 → 15, batch_size 64 → 32, EarlyStopping patience 10 → 5
185
def train_lstm(X_train, y_train, time_step=30, stop_requested_callback=None):
    """Fit the lightweight two-layer LSTM regressor and return it.

    Network: LSTM(32) → Dropout(0.2) → LSTM(16) → Dropout(0.2) → Dense(1),
    trained with Adam (lr 0.001) on mean squared error for at most 15 epochs
    with batch size 32, holding out 20% of the samples for validation.

    Args:
        X_train: Array whose third axis is the feature axis; it is reshaped
            to (samples, time_step, features) before training.
        y_train: Target values, one per sample.
        time_step: Length of each input window.
        stop_requested_callback: Optional zero-argument callable; when it
            returns a truthy value at the end of an epoch, training stops.

    Returns:
        The trained Keras model.
    """
    from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
    from tensorflow.keras.optimizers import Adam

    feature_count = X_train.shape[2]
    sample_count = X_train.shape[0]
    X_train = X_train.reshape(sample_count, time_step, feature_count)

    class StopCallback(Callback):
        """Abort training between epochs when the caller asks for it."""

        def on_epoch_end(self, epoch, logs=None):
            if stop_requested_callback and stop_requested_callback():
                self.model.stop_training = True
                print("Training stopped early by user request.")

    # [OPT-3] Lightweight architecture
    layers = [
        LSTM(32, return_sequences=True,
             input_shape=(time_step, feature_count)),
        Dropout(0.2),
        LSTM(16, return_sequences=False),
        Dropout(0.2),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer=Adam(learning_rate=0.001),
                    loss="mean_squared_error")

    # [OPT-7] Short patience so training exits quickly once validation loss
    # stops improving; the learning rate is cut first, then training halts.
    training_callbacks = [
        ReduceLROnPlateau(monitor="val_loss", factor=0.3,
                          patience=3, min_lr=0.0001, verbose=0),
        EarlyStopping(monitor="val_loss", patience=5,
                      restore_best_weights=True, verbose=1),
    ]
    if stop_requested_callback:
        training_callbacks.append(StopCallback())

    print(f"Training lightweight LSTM: {X_train.shape[0]} samples, "
          f"{feature_count} features, time_step={time_step}")

    # [OPT-5] epochs 30 → 15, batch_size 64 → 32
    network.fit(
        X_train, y_train,
        epochs=15,
        batch_size=32,
        validation_split=0.2,
        callbacks=training_callbacks,
        verbose=1,
    )
    return network
241
+
242
+
243
+ # [OPT-6] XGBoost: n_estimators 300 → 100, max_depth 6 → 4
244
def train_xgboost(X_train, residuals, stop_requested_callback=None):
    """Train a lean XGBoost regressor on the LSTM residuals.

    Args:
        X_train: 2-D feature matrix (flattened LSTM windows).
        residuals: Targets, i.e. actual value minus LSTM prediction.
        stop_requested_callback: Optional zero-argument callable. If it is
            truthy before training starts, nothing is trained; otherwise it
            is polled after every boosting round to abort training.

    Returns:
        The fitted ``xgb.XGBRegressor``, or ``None`` when a stop was already
        requested before training began.
    """
    if stop_requested_callback and stop_requested_callback():
        print("XGBoost training cancelled due to stop request.")
        return None

    # [OPT-6] Reduced complexity for free-tier memory / speed
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 100,       # was 300
        "learning_rate": 0.1,
        "max_depth": 4,            # was 6
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "gamma": 0.1,
        "reg_alpha": 0.1,
        "reg_lambda": 1.0,
        "tree_method": "hist",
    }

    if stop_requested_callback:
        class StopCallbackHandler(xgb.callback.TrainingCallback):
            """Ask XGBoost to stop boosting once the user requests it."""

            def after_iteration(self, model, epoch, evals_log):
                if stop_requested_callback():
                    print("XGBoost training stopped by user request.")
                    return True
                return False

        xgb_model = xgb.XGBRegressor(**params,
                                     callbacks=[StopCallbackHandler()])
        xgb_model.fit(X_train, residuals)
    else:
        # eval_metric / early_stopping_rounds are estimator parameters; the
        # fit() keyword form was deprecated in XGBoost 1.6 and removed in 2.0,
        # so passing them to fit() raises TypeError on current releases.
        xgb_model = xgb.XGBRegressor(
            **params,
            eval_metric="rmse",
            early_stopping_rounds=10,  # was 20 [OPT-6]
        )
        xgb_model.fit(
            X_train, residuals,
            eval_set=[(X_train, residuals)],
            verbose=False,
        )

    return xgb_model
287
+
288
+
289
def predict_stock_price(
    lstm_model, xgb_model, data, scaler,
    time_step=30, days_ahead=5, stop_requested_callback=None
):
    """Forecast future closing prices with the LSTM + XGBoost ensemble.

    Each day's forecast is the LSTM output plus (when available) the XGBoost
    residual correction, perturbed by a small volatility-scaled random term
    and fed back as the Close value of the next input row. All forecasts are
    finally multiplied by a calibration factor (last actual close divided by
    the model's output for the current window).

    The random term uses NumPy's global random state, so results are only
    reproducible if the caller seeds ``np.random``.

    Args:
        lstm_model: Object with ``predict(x, verbose=0)`` returning a 2-D
            array for an input of shape (1, time_step, features).
        xgb_model: Object with ``predict(x)`` for a (1, time_step * features)
            input, or ``None`` to use the LSTM alone.
        data: 2-D NumPy array of scaled features with Close in column 0.
        scaler: Fitted scaler of the Close column (``transform`` /
            ``inverse_transform``).
        time_step: Number of rows in each model input window.
        days_ahead: Number of forecasts to produce.
        stop_requested_callback: Optional zero-argument callable polled
            before starting and before each forecast day.

    Returns:
        Array of shape (n, 1) with corrected price forecasts, or ``None`` if
        a stop was requested up front or no forecast was produced.

    Raises:
        ValueError: If ``data`` has fewer than ``time_step`` rows.
    """
    if stop_requested_callback and stop_requested_callback():
        return None

    # Fail with a clear message instead of an obscure reshape error.
    if len(data) < time_step:
        raise ValueError(
            f"predict_stock_price needs at least {time_step} rows of data, "
            f"got {len(data)}.")

    n_features = data.shape[1]
    temp_input = data[-time_step:].tolist()

    def unscale(value):
        """Map one scaled Close value back to a price."""
        return scaler.inverse_transform(np.array([[value]]))[0][0]

    def combined_forecast():
        """LSTM output for the latest window plus the XGBoost correction."""
        window = np.array(temp_input[-time_step:])
        lstm_pred = lstm_model.predict(
            window.reshape(1, time_step, n_features), verbose=0)[0][0]
        if xgb_model is None:
            return lstm_pred
        try:
            return lstm_pred + xgb_model.predict(window.reshape(1, -1))[0]
        except Exception as e:
            # Best effort: fall back to the LSTM alone, but never silently.
            print(f"XGBoost predict error: {e}")
            return lstm_pred

    last_actual_close = unscale(data[-1, 0])
    print(f"Base price: ${last_actual_close:.2f}")

    # Historical volatility of daily returns drives the size of the noise.
    original_prices = scaler.inverse_transform(data[:, 0].reshape(-1, 1))
    daily_returns = np.diff(original_prices, axis=0) / original_prices[:-1]
    volatility = np.std(daily_returns)

    # Calibrate model against actual last price
    combined_cal = combined_forecast()
    model_current = unscale(combined_cal)
    correction_factor = (last_actual_close / model_current
                         if model_current > 0 else 1.0)
    print(f"Calibration: model=${model_current:.2f}, "
          f"actual=${last_actual_close:.2f}, factor={correction_factor:.4f}")

    predictions = []
    prev_day_pred = combined_cal

    for day in range(days_ahead):
        if stop_requested_callback and stop_requested_callback():
            print(f"Prediction stopped at day {day}/{days_ahead}")
            break

        combined_pred = combined_forecast()

        prev_unscaled = unscale(prev_day_pred)
        current_unscaled = unscale(combined_pred)
        price_change = current_unscaled - prev_unscaled
        trend_direction = 1 if price_change >= 0 else -1

        # Noise grows 10% per forecast day but is capped at 1.5%.
        day_volatility = volatility * (1 + day * 0.1)
        adjusted_volatility = min(day_volatility, 0.015)
        random_factor = np.random.normal(0, adjusted_volatility)

        # Mostly nudge with the trend, occasionally against it; the odds and
        # magnitudes differ for up and down trends.
        if trend_direction > 0:
            flux_factor = (abs(random_factor) * trend_direction * 0.15
                           if np.random.random() < 0.7
                           else -abs(random_factor) * trend_direction * 0.3)
        else:
            flux_factor = (abs(random_factor) * trend_direction * 0.25
                           if np.random.random() < 0.8
                           else -abs(random_factor) * trend_direction * 0.1)

        flux_amount = prev_unscaled * flux_factor
        adjusted_unscaled = current_unscaled + flux_amount
        adjusted_pred = scaler.transform(
            np.array([[adjusted_unscaled]]))[0][0]

        # Roll the window: reuse the last row's indicators, replace Close.
        next_row = temp_input[-1].copy()
        next_row[0] = adjusted_pred
        prev_day_pred = adjusted_pred

        predictions.append(adjusted_pred)
        temp_input.append(next_row)

    if not predictions:
        return None

    final_predictions = scaler.inverse_transform(
        np.array(predictions).reshape(-1, 1))
    corrected_predictions = final_predictions * correction_factor

    print("\nPredictions (original → corrected):")
    for i, (raw, corrected) in enumerate(
            zip(final_predictions, corrected_predictions)):
        print(f"  Day {i+1}: ${raw[0]:.2f} "
              f"→ ${corrected[0]:.2f}")

    return corrected_predictions
389
+
390
+
391
def plot_prices(data, predictions, symbol, days_ahead):
    """Plot actual + predicted prices (used in standalone main()).

    Shows the last three months of actual closing prices, a marker on the
    latest close, and one predicted point per upcoming weekday.

    Args:
        data: DataFrame with a datetime index; the "Close" column is plotted
            when present, otherwise the first column.
        predictions: NumPy array of predicted prices, one row per day.
        symbol: Ticker used in the chart title.
        days_ahead: Maximum number of predictions to plot.
    """
    fig = go.Figure()
    three_months_ago = data.index[-1] - pd.DateOffset(months=3)
    actual_data = data.loc[three_months_ago:]
    close_prices = (actual_data["Close"]
                    if isinstance(actual_data, pd.DataFrame) and "Close" in actual_data.columns
                    else actual_data.iloc[:, 0])

    prediction_data = predictions[: max(days_ahead, 0)].flatten()

    # One distinct weekday per prediction. Rolling weekend dates forward and
    # de-duplicating (the previous approach) produced fewer dates than
    # predictions whenever the horizon crossed a weekend.
    future_dates = []
    next_date = data.index[-1]
    while len(future_dates) < len(prediction_data):
        next_date += timedelta(days=1)
        if next_date.weekday() < 5:
            future_dates.append(next_date)

    fig.add_trace(go.Scatter(
        x=future_dates, y=prediction_data,
        mode="lines+markers", name="Predicted Price",
        line=dict(color="orange", width=3)))
    fig.add_trace(go.Scatter(
        x=close_prices.index, y=close_prices.values,
        mode="lines", name="Actual Price",
        line=dict(color="blue", width=2)))
    fig.add_trace(go.Scatter(
        x=[close_prices.index[-1]], y=[close_prices.values[-1]],
        mode="markers", name="Latest Price",
        marker=dict(color="green", size=10, symbol="circle")))

    fig.update_layout(
        title=f"Stock Price Prediction for {symbol}",
        xaxis_title="Date", yaxis_title="Price (USD)",
        template="plotly_white", hovermode="x unified")
    fig.show()
428
+
429
+
430
+ # =============================================================================
431
+ # NEWS SENTIMENT ANALYSIS FUNCTIONS
432
+ # =============================================================================
433
+
434
def fetch_finnhub_news(company_symbol):
    """Fetch the last 28 days of company news headlines from Finnhub.

    Args:
        company_symbol: Ticker symbol to query.

    Returns:
        List of headline strings; an empty list on a non-200 response or on
        any request / parsing failure (the error is printed).
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=28)
    # Let requests build the query string so the symbol is URL-encoded.
    query = {
        "symbol": company_symbol,
        "from": start_date.strftime("%Y-%m-%d"),
        "to": end_date.strftime("%Y-%m-%d"),
        "token": FINNHUB_API_KEY,
    }
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(
            "https://finnhub.io/api/v1/company-news",
            params=query,
            timeout=10,
        )
        if response.status_code != 200:
            print(f"Error fetching news: {response.status_code}")
            return []
        articles = response.json()
        return [a["headline"] for a in articles if "headline" in a]
    except Exception as e:
        # Best-effort boundary: news is optional, so report and carry on.
        print(f"Error parsing news response: {e}")
        return []
454
+
455
+
456
def analyze_sentiment(headlines):
    """Score headlines with VADER and count them per sentiment class.

    A headline is positive when its compound score is above 0.05, negative
    when below -0.05, and neutral otherwise. Empty or non-string entries are
    skipped.

    Args:
        headlines: Iterable of headline strings.

    Returns:
        ``(results, totals)`` where ``results`` is a list of
        ``{"headline", "sentiment"}`` dicts and ``totals`` maps
        "positive" / "negative" / "neutral" to counts. On any error an empty
        list and all-zero totals are returned.
    """
    try:
        analyzer = SentimentIntensityAnalyzer()
        scored = []
        counts = {"positive": 0, "negative": 0, "neutral": 0}

        for text in headlines:
            if not (text and isinstance(text, str)):
                continue
            scores = analyzer.polarity_scores(text)
            scored.append({"headline": text, "sentiment": scores})

            compound = scores["compound"]
            if compound > 0.05:
                bucket = "positive"
            elif compound < -0.05:
                bucket = "negative"
            else:
                bucket = "neutral"
            counts[bucket] += 1

        return scored, counts
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return [], {"positive": 0, "negative": 0, "neutral": 0}
478
+
479
+
480
def plot_sentiment_pie(sentiment_totals, company_symbol):
    """Show a pie chart of positive / negative / neutral headline counts.

    Args:
        sentiment_totals: Mapping with "positive", "negative" and "neutral"
            counts.
        company_symbol: Ticker used in the chart title.
    """
    slice_labels = ["Positive", "Negative", "Neutral"]
    slice_values = [sentiment_totals[key]
                    for key in ("positive", "negative", "neutral")]
    slice_style = dict(colors=["#2ecc71", "#e74c3c", "#95a5a6"],
                       line=dict(color="white", width=0))

    pie = go.Pie(
        labels=slice_labels,
        values=slice_values,
        marker=slice_style,
        textinfo="percent+label", textfont_size=20)
    fig = go.Figure(data=[pie])
    fig.update_layout(
        title=f"Sentiment Distribution for {company_symbol} (Last 28 Days)",
        showlegend=True)
    fig.show()
493
+
494
+
495
+ # =============================================================================
496
+ # AI SUMMARY FUNCTIONS [OPT-1] Transformers removed
497
+ # =============================================================================
498
+
499
+ def _extractive_summary(headlines, n=3):
500
+ """
501
+ Lightweight extractive summariser – replaces the BART transformer pipeline.
502
+ [OPT-1] Picks the top-n headlines by absolute VADER compound score so the
503
+ most opinionated sentences surface first. No heavy model download needed.
504
+ """
505
+ if not headlines:
506
+ return ""
507
+ try:
508
+ sid = SentimentIntensityAnalyzer()
509
+ scored = [(h, abs(sid.polarity_scores(h)["compound"]))
510
+ for h in headlines if h and isinstance(h, str)]
511
+ scored.sort(key=lambda x: x[1], reverse=True)
512
+ top = [h for h, _ in scored[:n]]
513
+ return " | ".join(top)
514
+ except Exception as e:
515
+ print(f"Extractive summary error: {e}")
516
+ return headlines[0] if headlines else ""
517
+
518
+
519
def generate_sentiment_summary(sentiment_totals, headlines, company_symbol):
    """Build a one-paragraph, human-readable sentiment summary.

    [OPT-1] Key headlines come from the lightweight extractive summariser
    rather than a Transformers pipeline.

    Args:
        sentiment_totals: Mapping with "positive", "negative" and "neutral"
            counts.
        headlines: List of headline strings that were analysed.
        company_symbol: Ticker named in the text.

    Returns:
        The summary sentence(s), or a fixed fallback message if anything
        goes wrong while building it.
    """
    try:
        # Guard against division by zero when nothing was classified.
        article_total = max(1, sum(sentiment_totals.values()))
        positive = sentiment_totals["positive"]
        negative = sentiment_totals["negative"]
        neutral = sentiment_totals["neutral"]
        pos_pct = positive / article_total * 100
        neg_pct = negative / article_total * 100

        parts = [
            f"Over the past 28 days, {len(headlines)} news articles about "
            f"{company_symbol} were analysed. "
            f"{positive} positive ({pos_pct:.0f}%), "
            f"{negative} negative ({neg_pct:.0f}%), "
            f"and {neutral} neutral articles found."
        ]

        key_headlines = _extractive_summary(headlines, n=2) if headlines else ""
        if key_headlines:
            parts.append(f" Key headlines: {key_headlines}")

        return "".join(parts)
    except Exception as e:
        print(f"Error in generate_sentiment_summary: {e}")
        return f"Unable to generate sentiment summary for {company_symbol}."
547
+
548
+
549
def generate_prediction_summary(pred_df, company_symbol):
    """Describe the forecast by its first and last predicted price.

    Args:
        pred_df: DataFrame with a "Predicted Price" column, in date order.
        company_symbol: Ticker named in the text.

    Returns:
        A single sentence quoting the first and last predicted prices.
    """
    predicted = pred_df["Predicted Price"]
    start_price, end_price = predicted.iloc[0], predicted.iloc[-1]
    sentence = (
        f"The predicted stock prices for {company_symbol} range from "
        f"${start_price:.2f} to ${end_price:.2f} over the forecast period."
    )
    return sentence
556
+
557
+
558
def display_price_table(data, predictions, symbol, days_ahead):
    """Print prediction results as a table (used in standalone main()).

    The table starts with the last actual close and then lists one row per
    predicted weekday, each with its percentage change versus the last
    actual close.

    Args:
        data: DataFrame with a datetime index; the "Close" column is used
            when present, otherwise the first column.
        predictions: NumPy array of predicted prices, one row per day.
        symbol: Ticker printed in the table header.
        days_ahead: Maximum number of predictions to show.

    Returns:
        DataFrame with "Date" (YYYY-MM-DD strings) and "Predicted Price"
        columns, one row per prediction shown.
    """
    if isinstance(data, pd.DataFrame) and "Close" in data.columns:
        last_price = data["Close"].iloc[-1]
    else:
        last_price = data.iloc[-1, 0]
    last_date = data.index[-1]

    prediction_data = predictions[: max(days_ahead, 0)].flatten()

    # One distinct weekday per prediction. Rolling weekend dates forward and
    # de-duplicating (the previous approach) produced fewer dates than
    # predictions whenever the horizon crossed a weekend.
    future_dates = []
    next_date = last_date
    while len(future_dates) < len(prediction_data):
        next_date += timedelta(days=1)
        if next_date.weekday() < 5:
            future_dates.append(next_date)

    last_price_row = pd.DataFrame({
        "Date":   [last_date.strftime("%Y-%m-%d")],
        "Price":  [f"${last_price:.2f}"],
        "Change": ["0.00%"],
        "Note":   ["Actual last closing price"],
    })
    pred_rows = []
    for i, (date, price) in enumerate(zip(future_dates, prediction_data)):
        change_pct = ((price - last_price) / last_price) * 100
        pred_rows.append({
            "Date":   date.strftime("%Y-%m-%d"),
            "Price":  f"${price:.2f}",
            "Change": f"{change_pct:.2f}%",
            "Note":   f"Day {i+1} prediction",
        })

    combined_df = pd.concat([last_price_row, pd.DataFrame(pred_rows)],
                            ignore_index=True)
    print(f"\n{symbol} Stock Price Prediction Table:")
    print("=" * 80)
    print(combined_df.to_string(index=False))
    print("=" * 80)

    return pd.DataFrame({
        "Date": [d.strftime("%Y-%m-%d") for d in future_dates],
        "Predicted Price": prediction_data,
    })
603
+
604
+
605
+ # =============================================================================
606
+ # STANDALONE MAIN
607
+ # =============================================================================
608
+
609
def main():
    """Run the full pipeline interactively from the command line.

    Prompts for a ticker and a forecast horizon, then fetches data, trains
    the LSTM and the XGBoost residual model, prints the prediction table and
    summary, and finally reports news sentiment.
    """
    symbol = input("Enter the stock symbol (e.g., AAPL): ").strip().upper()
    try:
        days_ahead = int(input("Number of future days to predict (e.g., 5): "))
    except ValueError:
        print("Invalid input. Please enter an integer.")
        return
    if days_ahead < 1:
        print("Invalid input. Please enter a positive number of days.")
        return

    print(f"\nFetching historical data for {symbol}...")
    data = fetch_stock_data(symbol, outputsize="full")
    if data is None or len(data) < 50:
        print(f"Not enough data points for {symbol}.")
        return

    print("Preprocessing data...")
    scaled_data, scaler = preprocess_data(data)

    # [OPT-4] time_step lowered to 30 in standalone mode too
    time_step = 30
    X, y = create_sequences(scaled_data, time_step)
    if len(X) == 0:
        print("Could not create sequences.")
        return

    # Only the first 80% of the windows are used for training.
    train_size = int(len(X) * 0.8)
    X_train, y_train = X[:train_size], y[:train_size]

    print("Training LSTM model...")
    lstm_model = train_lstm(X_train, y_train, time_step)

    # XGBoost learns what the LSTM got wrong on the training windows.
    lstm_train_preds = lstm_model.predict(X_train, verbose=0).flatten()
    residuals = y_train - lstm_train_preds

    print("Training XGBoost model...")
    xgb_model = train_xgboost(X_train.reshape(X_train.shape[0], -1), residuals)

    print(f"Predicting {days_ahead} days ahead...")
    predictions = predict_stock_price(
        lstm_model, xgb_model, scaled_data, scaler, time_step, days_ahead)
    # predict_stock_price returns None when it produced no forecast.
    if predictions is None:
        print("No predictions were produced.")
        return

    # The table helper already returns the Date / Predicted Price frame, so
    # there is no need to rebuild the future dates here.
    pred_df = display_price_table(data, predictions, symbol, days_ahead)

    print("\nPrediction summary:")
    print(generate_prediction_summary(pred_df, symbol))

    print("\nFetching news for sentiment analysis...")
    headlines = fetch_finnhub_news(symbol)
    if headlines:
        sentiment_results, sentiment_totals = analyze_sentiment(headlines)
        plot_sentiment_pie(sentiment_totals, symbol)
        print(generate_sentiment_summary(sentiment_totals, headlines, symbol))
    else:
        print("No headlines found.")


if __name__ == "__main__":
    main()