| |
| import re |
| from pathlib import Path |
| import numpy as np |
| import pandas as pd |
| import yfinance as yf |
| from datetime import datetime, timedelta |
| from textblob import TextBlob |
| from scraper import NewsScraper |
| from extractor import ContentExtractor |
|
|
class Features:
    """Turn a frame of news articles into per-article numeric features.

    The class holds the keyword vocabularies and lookup tables used by
    :meth:`build`, plus the alias list of the ticker the articles are about.
    """

    # Keyword vocabularies (lower-case; matched against lower-cased text).
    BULL = [
        'upgrade','buy','outperform','beat','surge','soar',
        'rally','breakout','strong','growth','profit','record',
        'bullish','positive','raise','upside','optimistic',
        'boom','gain','higher','best','partnership','deal',
        'approval','dividend','buyback','expansion','launch',
        'breakthrough','recovery','momentum','confidence',
    ]
    BEAR = [
        'downgrade','sell','underperform','miss','plunge',
        'crash','tumble','breakdown','weak','decline','loss',
        'bearish','negative','cut','lower','worst','risk',
        'warning','concern','fear','recession','lawsuit',
        'investigation','fraud','layoff','restructuring',
        'debt','default','bankruptcy','slump','drop','falling',
        'disappointing','headwind','pressure','downturn',
        'uncertainty','volatile','overvalued',
    ]
    URGENT = [
        'breaking','alert','urgent','exclusive','flash',
        'developing','critical','emergency',
    ]
    # Publisher name fragment -> tier used for ``source_tier`` (0 if no match).
    SOURCES = {
        'reuters':3,'bloomberg':3,'wsj':3,
        'wall street journal':3,'financial times':3,
        'cnbc':2,'marketwatch':2,'seeking alpha':2,
        'barrons':2,'yahoo finance':2,
        'benzinga':1,'motley fool':1,'zacks':1,'tipranks':1,
    }
    # Lower-cased ticker/index name -> spellings to look for in article text.
    TICKER_ALIASES = {
        "^nsei": ["nifty 50", "nifty50", "nifty", "nsei"],
        "nifty 50": ["nifty 50", "nifty50", "nifty", "nsei"],
        "^bsesn": ["sensex", "bse sensex", "bsesn"],
        "sensex": ["sensex", "bse sensex", "bsesn"],
        "^nsebank": ["nifty bank", "bank nifty", "nsebank"],
        "nifty bank": ["nifty bank", "bank nifty", "nsebank"],
        "nifty it": ["nifty it", "it index", "niftyit"],
        "nifty fin service": ["nifty fin service", "financial services", "nifty financial services"],
    }
    # Lower-cased ticker/index name -> symbol returned by ``_price_symbol``.
    PRICE_SYMBOL_MAP = {
        "^nsei": "NIFTY 50",
        "nifty 50": "NIFTY 50",
        "^bsesn": "SENSEX",
        "sensex": "SENSEX",
        "^nsebank": "NIFTY BANK",
        "nifty bank": "NIFTY BANK",
        "nifty it": "NIFTY IT",
        "nifty fin service": "NIFTY FIN SERVICE",
    }
    # Feature column name -> keywords counted into that column.
    EVENT_KEYWORDS = {
        "event_earnings": [
            "earnings", "eps", "revenue", "quarterly", "results", "guidance", "outlook",
            "beat estimates", "miss estimates", "margin", "profit"
        ],
        "event_analyst": [
            "analyst", "rating", "price target", "upgrade", "downgrade", "initiate",
            "coverage", "overweight", "underweight", "outperform", "underperform"
        ],
        "event_macro": [
            "inflation", "rates", "interest rate", "fed", "cpi", "ppi", "jobs",
            "treasury", "yield", "macro", "recession", "growth outlook"
        ],
        "event_legal": [
            "lawsuit", "investigation", "sec", "regulator", "fraud", "antitrust",
            "probe", "settlement", "penalty", "fine"
        ],
        "event_mna": [
            "acquisition", "merger", "deal", "buyout", "stake", "partner",
            "partnership", "joint venture", "acquires", "acquired"
        ],
        "event_capital": [
            "buyback", "repurchase", "dividend", "offering", "fundraising", "ipo",
            "secondary", "split", "rights issue"
        ],
        "event_product": [
            "launch", "product", "service", "subscription", "expansion", "pipeline",
            "launches", "rollout"
        ],
        "event_supply": [
            "supply chain", "inventory", "shipping", "logistics", "factory",
            "production", "shipment", "capacity"
        ],
        "event_policy": [
            "policy", "regulation", "tariff", "ban", "export", "import",
            "government", "tax", "subsidy"
        ],
        "event_urgency": [
            "breaking", "alert", "urgent", "exclusive", "developing", "critical"
        ],
    }

    def __init__(self, ticker):
        """Store the lower-cased ticker and resolve its text aliases."""
        self.ticker = ticker.lower()
        self.ticker_aliases = self._resolve_ticker_aliases(self.ticker)

    def _resolve_ticker_aliases(self, ticker):
        """Return the ordered, de-duplicated lower-case aliases for ``ticker``.

        Candidates are, in order: the ``TICKER_ALIASES`` entry (if any), the
        ticker with every non-alphanumeric run collapsed to one space, and the
        ticker with ``^`` removed and ``.`` turned into a space. If nothing
        usable remains, a single-element list holding the stripped ticker is
        returned so the result is never empty.
        """
        key = ticker.lower().strip()
        compact = re.sub(r"[^a-z0-9]+", " ", key).strip()
        base = key.replace("^", "").replace(".", " ").strip()

        candidates = [*self.TICKER_ALIASES.get(key, []), compact, base]
        # dict.fromkeys keeps first-seen order while dropping duplicates.
        ordered = list(dict.fromkeys(a.lower() for a in candidates if a))
        return ordered or [base or key]

    def _count_keywords(self, text_series, keywords):
        """Count keyword hits in every element of ``text_series``.

        A keyword only matches at the start of a word (it must not be preceded
        by a word character), so "cut" no longer fires inside "executive" nor
        "rates" inside "generates". Suffixes are still allowed, which keeps
        inflected forms such as "gains" or "upgraded" counting.

        Args:
            text_series: Series of lower-cased strings.
            keywords: iterable of keyword strings (lower-cased before use).

        Returns:
            Series of match counts on ``text_series.index``; all zeros when no
            non-empty keyword is given.
        """
        escaped = [re.escape(k.lower()) for k in keywords if k]
        if not escaped:
            return pd.Series(0, index=text_series.index)
        pattern = re.compile(r"(?<!\w)(?:" + "|".join(escaped) + ")")
        return text_series.apply(lambda x: len(pattern.findall(x)))

    def _count_alias_mentions(self, text_series):
        """Count whole-token mentions of any ticker alias in each string.

        Aliases must be delimited by non-word characters on both sides;
        otherwise a one- or two-letter ticker would match inside ordinary
        words. Empty aliases are ignored.

        Returns:
            Series of counts on ``text_series.index`` (zeros if no alias).
        """
        aliases = [a for a in self.ticker_aliases if a]
        if not aliases:
            return pd.Series(0, index=text_series.index)
        pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(a) for a in aliases) + r")(?!\w)"
        )
        return text_series.apply(lambda x: len(pattern.findall(x)))

    def _price_symbol(self):
        """Return the mapped price symbol, else the ticker upper-cased without ``^``."""
        return self.PRICE_SYMBOL_MAP.get(self.ticker, self.ticker.replace("^", "").upper())

    def build(self, df):
        """Return a copy of ``df`` with all article-level feature columns added.

        Args:
            df: article frame. ``timestamp`` is required (a missing column
                raises ``KeyError``). ``title``, ``content`` and ``source``
                are created with defaults when absent. An existing ``date``
                column is used for same-day grouping; otherwise the date is
                taken from the parsed timestamp.

        Returns:
            A new DataFrame on the same index holding the input columns plus
            the sentiment, keyword, text-shape, ticker, source, time, event,
            interaction, same-day and rolling features computed below.
        """
        df = df.copy()
        for col, default in (('content', ''), ('source', 'Unknown'), ('title', '')):
            if col not in df.columns:
                df[col] = default
        # Coerce to str so stray non-string cells cannot break the .str
        # accessors or the string concatenation below.
        df['content'] = df['content'].fillna('').astype(str)
        df['title'] = df['title'].fillna('').astype(str)
        df['source'] = df['source'].fillna('Unknown')
        text = (df['title'] + ' ' + df['content']).str.lower()
        title_low = df['title'].str.lower()
        source_low = df['source'].astype(str).str.lower()

        # --- Sentiment: run TextBlob once per text and read both scores. ---
        for field in ('title', 'content'):
            scores = [TextBlob(value).sentiment for value in df[field]]
            df[f'sent_{field}'] = pd.Series(
                [s.polarity for s in scores], index=df.index, dtype=float
            )
            df[f'subj_{field}'] = pd.Series(
                [s.subjectivity for s in scores], index=df.index, dtype=float
            )
        df['sent_combined'] = df['sent_title'] * 0.6 + df['sent_content'] * 0.4
        df['sent_abs'] = df['sent_combined'].abs()
        df['sent_agreement'] = df['sent_title'] * df['sent_content']
        df['sent_disagreement'] = (df['sent_title'] - df['sent_content']).abs()
        df['sent_strength'] = df['sent_title'].abs() + df['sent_content'].abs()

        # --- Bull / bear / urgency keyword counts. ---
        df['n_bull'] = self._count_keywords(text, self.BULL)
        df['n_bear'] = self._count_keywords(text, self.BEAR)
        df['n_urgent'] = self._count_keywords(text, self.URGENT)
        df['bull_bear_net'] = df['n_bull'] - df['n_bear']
        # clip(lower=1) avoids division by zero when there are no bear hits.
        df['bull_bear_ratio'] = df['n_bull'] / df['n_bear'].clip(lower=1)
        df['sent_kw_align'] = df['sent_combined'] * df['bull_bear_net']

        # --- Text shape. ---
        words_title = df['title'].str.split().str.len().fillna(0)
        words_content = df['content'].str.split().str.len().fillna(0)
        title_and_content = df['title'] + df['content']
        df['len_title'] = df['title'].str.len()
        df['len_content'] = df['content'].str.len()
        df['title_content_ratio'] = df['len_title'] / df['len_content'].clip(lower=1)
        df['word_content_ratio'] = words_title / words_content.clip(lower=1)
        df['words_title'] = words_title
        df['words_content'] = words_content
        df['n_excl'] = title_and_content.str.count('!')
        df['n_quest'] = title_and_content.str.count(r'\?')
        df['caps_ratio'] = df['title'].apply(
            lambda x: sum(c.isupper() for c in x) / max(len(x), 1)
        )
        df['upper_word_ratio'] = df['title'].apply(
            lambda x: sum(w.isupper() and len(w) > 1 for w in x.split()) / max(len(x.split()), 1)
        )
        df['punct_density'] = (df['n_excl'] + df['n_quest']) / df['len_title'].clip(lower=1)
        df['n_numbers'] = text.str.count(r'\d+\.?\d*')
        df['n_dollar'] = text.str.count(r'\$[\d,.]+')
        df['n_percent'] = text.str.count(r'[\d.]+\s*%')
        df['number_density'] = df['n_numbers'] / (
            df['words_title'].clip(lower=1) + df['words_content'].clip(lower=1)
        )
        df['digits_in_title'] = df['title'].apply(lambda x: sum(c.isdigit() for c in x))
        df['digits_ratio'] = df['digits_in_title'] / df['len_title'].clip(lower=1)

        # --- Ticker mentions (the flag is derived from the same count). ---
        mentions_title = self._count_alias_mentions(title_low)
        df['ticker_in_title'] = (mentions_title > 0).astype(int)
        df['ticker_mentions'] = self._count_alias_mentions(text)
        df['ticker_mentions_title'] = mentions_title

        # --- Source quality: best tier whose name occurs in source or title. ---
        source_lookup = source_low.str.cat(title_low, sep=' ')
        df['source_tier'] = source_lookup.apply(
            lambda x: max((tier for name, tier in self.SOURCES.items() if name in x), default=0)
        )
        df['source_generic'] = source_low.str.contains(r'google|yahoo|news|feed|rss|unknown').astype(int)

        # --- Time features. Timestamps are parsed as UTC; unparseable ones
        # fall back to hour 12 / day-of-week 0.
        # NOTE(review): the 9-16 and 4-9 windows are applied to UTC hours;
        # confirm that matches the exchange's local session.
        ts = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
        df['hour'] = ts.dt.hour.fillna(12).astype(int)
        df['dow'] = ts.dt.dayofweek.fillna(0).astype(int)
        df['is_weekend'] = (df['dow'] >= 5).astype(int)
        df['is_market_hrs'] = ((df['hour'] >= 9) & (df['hour'] <= 16)).astype(int)
        df['is_premarket'] = ((df['hour'] >= 4) & (df['hour'] < 9)).astype(int)
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)
        df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7.0)
        df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7.0)

        # --- Event-type keyword counts and derived scores. ---
        for feat_name, keywords in self.EVENT_KEYWORDS.items():
            df[feat_name] = self._count_keywords(text, keywords)
        df['is_earnings'] = df['event_earnings']
        df['is_analyst'] = df['event_analyst']
        df['event_total'] = df[list(self.EVENT_KEYWORDS)].sum(axis=1)
        df['event_balance'] = (
            df['event_earnings'] + df['event_analyst'] + df['event_macro'] + df['event_mna']
            - df['event_legal'] - df['event_policy']
        )
        df['topic_strength'] = df['event_total'] / (df['words_title'] + df['words_content'] + 1)
        df['event_x_source'] = df['event_total'] * df['source_tier']
        df['event_x_urgent'] = df['event_total'] * df['n_urgent']
        df['event_x_sent'] = df['event_balance'] * df['sent_combined']

        # --- Sentiment interactions and the blended intensity score. ---
        df['sent_x_urgent'] = df['sent_combined'] * df['n_urgent']
        df['sent_x_source'] = df['sent_combined'] * df['source_tier']
        df['sent_x_ticker'] = df['sent_combined'] * df['ticker_mentions']
        df['sent_x_event'] = df['sent_combined'] * df['event_total']
        df['signal_intensity'] = (
            df['sent_abs'] * 0.35
            + df['bull_bear_net'].abs() * 0.15
            + df['event_total'] * 0.20
            + df['source_tier'] * 0.10
            + df['ticker_mentions'] * 0.10
            + df['n_urgent'] * 0.10
        )

        # --- Chronological view: same-day aggregates and news-flow timing. ---
        chron = df.assign(ts=ts).sort_values(["ts", "title"], kind="stable").copy()
        if "date" not in chron.columns:
            chron["date"] = chron["ts"].dt.date
        chron["news_seq"] = np.arange(len(chron), dtype=float)
        # The first article (no predecessor) is treated as 24h after "previous".
        chron["time_since_prev_hours"] = chron["ts"].diff().dt.total_seconds().div(3600).fillna(24.0)
        chron["same_day_order"] = chron.groupby("date").cumcount().astype(float)
        chron["same_day_count"] = chron.groupby("date")["title"].transform("count").astype(float)
        chron["news_per_hour"] = chron["same_day_count"] / chron["time_since_prev_hours"].replace(0, np.nan).fillna(24.0)
        day_groups = chron.groupby("date", sort=False)
        chron["day_sent_mean"] = day_groups["sent_combined"].transform("mean")
        chron["day_sent_std"] = day_groups["sent_combined"].transform("std").fillna(0.0)
        chron["day_event_sum"] = day_groups["event_total"].transform("sum")
        chron["day_signal_sum"] = day_groups["signal_intensity"].transform("sum")
        chron["day_urgent_sum"] = day_groups["n_urgent"].transform("sum")
        chron["day_sent_rank"] = day_groups["sent_combined"].rank(method="average", ascending=False)
        chron["day_impact_rank"] = day_groups["signal_intensity"].rank(method="average", ascending=False)
        chron["news_share_day"] = 1.0 / chron["same_day_count"].replace(0, np.nan)
        chron["sent_vs_day_mean"] = chron["sent_combined"] - chron["day_sent_mean"]
        chron["impact_vs_day_mean"] = chron["signal_intensity"] - chron["day_signal_sum"] / chron["same_day_count"].replace(0, np.nan)

        def add_rolling_stats(series, prefix, windows):
            """Return mean/std/min/max over each trailing window of ``series``."""
            out = {}
            for window in windows:
                roll = series.rolling(window, min_periods=1)
                out[f"{prefix}_{window}"] = roll.mean()
                out[f"{prefix}_std_{window}"] = roll.std().fillna(0.0)
                out[f"{prefix}_min_{window}"] = roll.min()
                out[f"{prefix}_max_{window}"] = roll.max()
            return out

        rolling_sources = {
            "sent": chron["sent_combined"],
            "abs_sent": chron["sent_abs"],
            "bull": chron["bull_bear_net"],
            "event": chron["event_total"],
            "urgent": chron["n_urgent"],
            "source": chron["source_tier"],
            "impact": chron["signal_intensity"],
        }
        roll_frames = [
            chron[["news_seq", "time_since_prev_hours", "same_day_order", "same_day_count", "news_per_hour"]],
            chron[[
                "day_sent_mean",
                "day_sent_std",
                "day_event_sum",
                "day_signal_sum",
                "day_urgent_sum",
                "day_sent_rank",
                "day_impact_rank",
                "news_share_day",
                "sent_vs_day_mean",
                "impact_vs_day_mean",
            ]],
        ]
        for prefix, series in rolling_sources.items():
            roll_frames.append(
                pd.DataFrame(add_rolling_stats(series, prefix, [3, 5, 10]), index=chron.index)
            )

        rolling_df = pd.concat(roll_frames, axis=1)
        rolling_df["sent_momentum_3"] = rolling_df["sent_3"] - rolling_df["sent_5"]
        rolling_df["sent_momentum_5"] = rolling_df["sent_5"] - rolling_df["sent_10"]
        rolling_df["bull_momentum_5"] = rolling_df["bull_5"] - rolling_df["bull_10"]
        rolling_df["event_momentum_5"] = rolling_df["event_5"] - rolling_df["event_10"]
        rolling_df["source_momentum_5"] = rolling_df["source_5"] - rolling_df["source_10"]
        rolling_df["impact_momentum_5"] = rolling_df["impact_5"] - rolling_df["impact_10"]
        rolling_df["sent_volatility_5"] = rolling_df["sent_std_5"]
        rolling_df["event_volatility_5"] = rolling_df["event_std_5"]
        rolling_df["recency_decay"] = np.exp(-rolling_df["time_since_prev_hours"].clip(lower=0) / 18.0)
        rolling_df["news_density"] = rolling_df["same_day_count"] / rolling_df["news_per_hour"].replace(0, np.nan).fillna(1.0)

        # Re-align the chronologically ordered features to the caller's row order.
        df = pd.concat([df, rolling_df.reindex(df.index)], axis=1)

        return df
|
|
class DataPipeline:
    """Assemble the train/test feature matrices and labels for one ticker.

    The pipeline scrapes news, extracts article content, builds article
    features, joins daily price indicators and labels each training article
    with a next-trading-day return.
    """

    # Columns offered to the model; only those present in the training frame
    # are actually used (see ``build_dataset``).
    FEATURE_COLS = [
        'sent_title','subj_title','sent_content','subj_content',
        'sent_combined','sent_abs','sent_agreement','sent_disagreement','sent_strength',
        'n_bull','n_bear','n_urgent','bull_bear_net',
        'bull_bear_ratio','sent_kw_align',
        'len_title','len_content','title_content_ratio','word_content_ratio','words_title','words_content',
        'n_excl','n_quest','caps_ratio','upper_word_ratio','punct_density','n_numbers',
        'n_dollar','n_percent','number_density','digits_in_title','digits_ratio',
        'ticker_in_title','ticker_mentions','ticker_mentions_title','source_tier','source_generic',
        'hour','dow','is_weekend','is_market_hrs','is_premarket','hour_sin','hour_cos','dow_sin','dow_cos',
        'is_earnings','is_analyst','event_earnings','event_analyst','event_macro','event_legal','event_mna',
        'event_capital','event_product','event_supply','event_policy','event_urgency','event_total',
        'event_balance','topic_strength','event_x_source','event_x_urgent','event_x_sent',
        'sent_x_urgent','sent_x_source','sent_x_ticker','sent_x_event','signal_intensity',
        'news_seq','time_since_prev_hours','same_day_order','same_day_count','news_per_hour',
        'day_sent_mean','day_sent_std','day_event_sum','day_signal_sum','day_urgent_sum',
        'day_sent_rank','day_impact_rank','news_share_day','sent_vs_day_mean','impact_vs_day_mean',
        'sent_3','sent_std_3','sent_min_3','sent_max_3','sent_5','sent_std_5','sent_min_5','sent_max_5',
        'sent_10','sent_std_10','sent_min_10','sent_max_10','abs_sent_3','abs_sent_std_3','abs_sent_min_3',
        'abs_sent_max_3','abs_sent_5','abs_sent_std_5','abs_sent_min_5','abs_sent_max_5','abs_sent_10',
        'abs_sent_std_10','abs_sent_min_10','abs_sent_max_10','bull_3','bull_std_3','bull_min_3','bull_max_3',
        'bull_5','bull_std_5','bull_min_5','bull_max_5','bull_10','bull_std_10','bull_min_10','bull_max_10',
        'event_3','event_std_3','event_min_3','event_max_3','event_5','event_std_5','event_min_5','event_max_5',
        'event_10','event_std_10','event_min_10','event_max_10','urgent_3','urgent_std_3','urgent_min_3',
        'urgent_max_3','urgent_5','urgent_std_5','urgent_min_5','urgent_max_5','urgent_10','urgent_std_10',
        'urgent_min_10','urgent_max_10','source_3','source_std_3','source_min_3','source_max_3','source_5',
        'source_std_5','source_min_5','source_max_5','source_10','source_std_10','source_min_10',
        'source_max_10','impact_3','impact_std_3','impact_min_3','impact_max_3','impact_5','impact_std_5',
        'impact_min_5','impact_max_5','impact_10','impact_std_10','impact_min_10','impact_max_10',
        'sent_momentum_3','sent_momentum_5','bull_momentum_5','event_momentum_5','source_momentum_5',
        'impact_momentum_5','sent_volatility_5','event_volatility_5','recency_decay','news_density',
        'vol_1d','vol_2d','vol_3d','vol_5d','vol_10d','vol_20d','vol_change',
        'price_sma5','price_sma20','price_ema5','price_ema13','price_ema21','price_ema50',
        'ema5_gap','ema13_gap','ema21_gap','ema50_gap','trend_5_20','trend_10_20','trend_20_50',
        'ret_1d','ret_2d','ret_3d','ret_5d','ret_10d','ret_20d',
        'gap_pct','range_pct','body_pct','upper_wick_pct','lower_wick_pct',
        'atr14','atr14_ratio','rsi14','stoch_k14','breakout_20','drawdown_20',
        'ret_z20','vol_z20','close_vs_low20','close_vs_high20',
        'news_x_ret_1d','news_x_trend_5_20','news_x_trend_20_50','news_x_vol_20',
        'event_x_trend','event_x_vol','source_x_trend','sent_x_rsi','sent_x_atr',
        'news_burst_x_vol','urgency_x_gap','sentiment_regime','impact_regime',
    ]

    def __init__(self, ticker, train_days=120, test_days=14):
        """Create the scraper, extractor and feature builder for ``ticker``.

        Args:
            ticker: symbol passed to the scraper, ``Features`` and yfinance.
            train_days: length of the training window in days.
            test_days: length of the trailing test window in days.
        """
        self.ticker = ticker
        self.train_days = train_days
        self.test_days = test_days
        self.scraper = NewsScraper(limit=600)
        self.extractor = ContentExtractor()
        self.features = Features(ticker)

        # base_dir is the nearest ancestor of this file that contains a
        # "backend" directory; local price files are looked up below it.
        current_path = Path(__file__).resolve()
        project_root = next(
            (p for p in (current_path, *current_path.parents) if (p / "backend").is_dir()),
            None,
        )
        if project_root is None:
            # No "backend" ancestor: fall back to the grandparent directory
            # when there is one, else the file's own directory.
            ancestors = current_path.parents
            project_root = ancestors[1] if len(ancestors) >= 2 else current_path.parent
        self.base_dir = project_root

        # Label thresholds; recomputed from the training ATR in build_dataset.
        self.dir_move_threshold = 0.0
        self.hh_move_threshold = 0.015

    def _augment_price_features(self, df):
        """Return a copy of a daily OHLCV frame with price indicators added.

        Missing ``Open``/``High``/``Low``/``Close``/``Volume`` columns are
        created as NaN. Rows are assumed to be in ascending date order, since
        every indicator is a shift/rolling/ewm over row position.

        ``rsi14`` is 100 when the 14-row window contains gains and no losses;
        previously that case produced NaN, which downstream zero-filling
        turned into an RSI of 0.
        """
        df = df.copy()
        for col in ["Open", "High", "Low", "Close", "Volume"]:
            if col not in df.columns:
                df[col] = np.nan

        close = df["Close"]
        safe_close = close.replace(0, np.nan)  # avoid division by zero
        prev_close = close.shift(1)

        # Returns and realised volatility.
        df["ret"] = close.pct_change()
        for lag in [1, 2, 3, 5, 10, 20]:
            df[f"ret_{lag}d"] = close.pct_change(lag)

        df["vol_1d"] = df["ret"].abs()
        for window in [2, 3, 5, 10, 20]:
            df[f"vol_{window}d"] = df["ret"].rolling(window).std()

        # Candle geometry.
        body_top = df[["Open", "Close"]].max(axis=1)
        body_bottom = df[["Open", "Close"]].min(axis=1)
        df["vol_change"] = df["Volume"].pct_change()
        df["gap_pct"] = df["Open"] / prev_close - 1
        df["range_pct"] = (df["High"] - df["Low"]) / safe_close
        df["body_pct"] = (close - df["Open"]) / df["Open"].replace(0, np.nan)
        df["upper_wick_pct"] = (df["High"] - body_top) / safe_close
        df["lower_wick_pct"] = (body_bottom - df["Low"]) / safe_close

        # Moving averages and trend ratios.
        sma10 = close.rolling(10).mean()
        sma20 = close.rolling(20).mean()
        sma50 = close.rolling(50).mean()
        df["price_sma5"] = close / close.rolling(5).mean() - 1
        df["price_sma20"] = close / sma20 - 1
        for span in (5, 13, 21, 50):
            df[f"price_ema{span}"] = close.ewm(span=span, adjust=False).mean()
        for span in (5, 13, 21, 50):
            df[f"ema{span}_gap"] = close / df[f"price_ema{span}"] - 1
        df["trend_5_20"] = df["price_ema5"] / df["price_ema21"] - 1
        df["trend_10_20"] = sma10 / sma20 - 1
        df["trend_20_50"] = sma20 / sma50 - 1

        # Average true range (EWM of the true range).
        true_range = pd.concat(
            [
                df["High"] - df["Low"],
                (df["High"] - prev_close).abs(),
                (df["Low"] - prev_close).abs(),
            ],
            axis=1,
        ).max(axis=1)
        df["atr14"] = true_range.ewm(span=14, adjust=False).mean()
        df["atr14_ratio"] = df["atr14"] / safe_close

        # RSI from simple 14-row averages of gains and losses.
        delta = close.diff()
        gain = delta.clip(lower=0).rolling(14).mean()
        loss = (-delta.clip(upper=0)).rolling(14).mean()
        rs = gain / loss.replace(0, np.nan)
        rsi = 100 - (100 / (1 + rs))
        # No losses but some gains: RS is unbounded, so RSI is 100.
        df["rsi14"] = rsi.mask((loss == 0) & (gain > 0), 100.0)

        # Position inside the 20-row high/low channel.
        low20 = df["Low"].rolling(20).min()
        high20 = df["High"].rolling(20).max()
        df["stoch_k14"] = ((close - low20) / (high20 - low20).replace(0, np.nan)) * 100
        df["breakout_20"] = close / high20 - 1
        df["drawdown_20"] = close / low20 - 1
        df["close_vs_low20"] = close / low20 - 1
        df["close_vs_high20"] = close / high20 - 1

        # 20-row z-scores of return and volume.
        df["ret_z20"] = (df["ret"] - df["ret"].rolling(20).mean()) / df["ret"].rolling(20).std()
        df["vol_z20"] = (df["Volume"] - df["Volume"].rolling(20).mean()) / df["Volume"].rolling(20).std()
        return df

    def _load_local_prices(self, start_date):
        """Load daily prices from the local parquet store, if available.

        Returns an empty DataFrame when the file does not exist, has no date
        column, or has no rows between 45 days before ``start_date`` and two
        days from now; otherwise the filtered rows with indicators added.
        """
        symbol = self.features._price_symbol()
        local_path = (
            self.base_dir
            / "backend"
            / "data and ML handling"
            / "MAIN DATA SOURCE"
            / "daily"
            / symbol
            / "1_day"
            / f"{symbol}_1_day.parquet"
        )
        if not local_path.exists():
            return pd.DataFrame()

        df = pd.read_parquet(local_path).copy()
        df = df.rename(columns={
            "date": "Date",
            "open": "Open",
            "high": "High",
            "low": "Low",
            "close": "Close",
            "volume": "Volume",
        })
        if "Date" not in df.columns:
            return pd.DataFrame()
        df["Date"] = pd.to_datetime(df["Date"]).dt.date
        # Keep 45 extra days of history so rolling indicators are warmed up.
        start = (pd.Timestamp(start_date) - pd.Timedelta(days=45)).date()
        end = (pd.Timestamp.now() + pd.Timedelta(days=2)).date()
        df = df[(df["Date"] >= start) & (df["Date"] <= end)]
        if df.empty:
            return df.reset_index(drop=True)
        # The indicators are positional, so rows must be in date order.
        df = df.sort_values("Date", kind="stable").reset_index(drop=True)
        return self._augment_price_features(df)

    async def build_dataset(self):
        """Scrape, featurise and label articles for training and prediction.

        Articles dated before ``now - test_days`` form the training set; the
        rest form the test set. Training rows without a label are dropped.

        Returns:
            ``(X_train, y_dir, y_hh, y_ret, X_test, test_df, price_df)`` where
            ``y_dir`` flags ``next_ret`` above ``dir_move_threshold``,
            ``y_hh`` flags ``|next_ret|`` above ``hh_move_threshold`` and
            ``y_ret`` is the raw ``next_ret``.

        Raises:
            ValueError: no price data, no timestamped articles, or fewer than
                20 training articles.
        """
        now = datetime.now()
        train_start = now - timedelta(days=self.train_days + self.test_days)
        cutoff_date = now - timedelta(days=self.test_days)
        test_start = cutoff_date

        price_df = self.get_prices(train_start)
        if price_df.empty:
            raise ValueError(f"No price data for {self.ticker}")

        train_articles = await self.scraper.scrape(self.ticker, train_start)
        test_scraper = NewsScraper(limit=600)
        test_articles = await test_scraper.scrape(self.ticker, test_start)

        all_articles = train_articles + test_articles
        all_articles = await self.extractor.extract_all(all_articles)

        df = pd.DataFrame(all_articles)
        if df.empty or 'timestamp' not in df.columns:
            raise ValueError(f"No timestamped articles for {self.ticker}")
        if "content" not in df.columns:
            df["content"] = df.get("description", "")
        else:
            df["content"] = df["content"].fillna(df.get("description", "")).fillna("")
        df['ts'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
        df = df.dropna(subset=['ts'])
        df['date'] = df['ts'].dt.date

        # The two scrape windows overlap, so the same article can arrive
        # twice. Keep the first copy of every non-null link.
        has_links = 'link' in df.columns
        if has_links:
            repeated = df['link'].notna() & df.duplicated(subset='link', keep='first')
            df = df[~repeated]

        cutoff = cutoff_date.date()
        is_train = df['date'] < cutoff
        train_df = df[is_train].copy().reset_index(drop=True)
        test_df = df[~is_train].copy().reset_index(drop=True)

        if has_links:
            # Guard against one link carrying different timestamps.
            train_links = set(train_df['link'])
            test_df = test_df[~test_df['link'].isin(train_links)].reset_index(drop=True)

        if len(train_df) < 20:
            raise ValueError(f"Only {len(train_df)} training articles — need more data.")

        train_df = self.features.build(train_df)
        test_df = self.features.build(test_df)

        train_df = self._add_market_context(train_df, price_df)
        test_df = self._add_market_context(test_df, price_df)

        train_df = self._add_labels(train_df, price_df)

        # Scale the label thresholds to the median ATR ratio seen in training.
        atr_series = train_df["atr14_ratio"] if "atr14_ratio" in train_df.columns else pd.Series(dtype=float)
        atr_ref = float(pd.Series(atr_series).replace([np.inf, -np.inf], np.nan).median())
        if not np.isfinite(atr_ref):
            atr_ref = 0.01
        self.dir_move_threshold = max(0.0015, atr_ref * 0.20)
        self.hh_move_threshold = max(0.012, atr_ref * 1.10)

        train_df = train_df.sort_values(['date', 'ts']).reset_index(drop=True)
        test_df = test_df.sort_values(['date', 'ts']).reset_index(drop=True)
        train_df = train_df.dropna(subset=['next_ret']).reset_index(drop=True)

        avail_feats = [c for c in self.FEATURE_COLS if c in train_df.columns]

        X_train = train_df[avail_feats].fillna(0).replace([np.inf, -np.inf], 0)
        # reindex keeps the column layout identical to X_train even if the
        # test frame is missing a column (it is then zero-filled).
        X_test = test_df.reindex(columns=avail_feats).fillna(0).replace([np.inf, -np.inf], 0)

        y_dir = (train_df['next_ret'] > self.dir_move_threshold).astype(int)
        y_hh = (train_df['next_ret'].abs() > self.hh_move_threshold).astype(int)
        y_ret = train_df['next_ret'].astype(float)

        return X_train, y_dir, y_hh, y_ret, X_test, test_df, price_df

    def get_prices(self, start_date):
        """Return daily prices with indicators, preferring the local store.

        Falls back to ``yfinance`` (45 days before ``start_date`` up to two
        days from now, auto-adjusted) when no local data is found. Returns the
        empty frame from yfinance unchanged when it has no rows.
        """
        local_df = self._load_local_prices(start_date)
        if not local_df.empty:
            return local_df

        stock = yf.Ticker(self.ticker)
        df = stock.history(
            start=start_date - timedelta(days=45),
            end=datetime.now() + timedelta(days=2),
            auto_adjust=True
        )
        if df.empty:
            return df
        df = df.reset_index()
        # Drop the timezone before truncating to a plain date.
        if getattr(df['Date'].dtype, 'tz', None) is not None:
            df['Date'] = df['Date'].dt.tz_localize(None)
        df['Date'] = pd.to_datetime(df['Date']).dt.date
        return self._augment_price_features(df)

    def _add_market_context(self, articles_df, price_df):
        """Join price indicators onto articles and add news-x-market features.

        Each article takes the indicators of the latest trading date on or
        before its ``date`` (the earliest trading date if it predates all of
        them, ``None`` if there are none). The input frame is not modified.

        Args:
            articles_df: article frame with a ``date`` column of date objects.
            price_df: output of ``_augment_price_features`` with a ``Date``
                column of date objects.

        Returns:
            New DataFrame with ``prior_td``, the mapped indicator columns and
            the interaction columns.
        """
        from bisect import bisect_right

        articles_df = articles_df.copy()
        trading_dates = sorted(price_df['Date'].unique())

        def get_prior_trading_date(pub_date):
            if not trading_dates:
                return None
            pos = bisect_right(trading_dates, pub_date)
            return trading_dates[max(pos - 1, 0)]

        articles_df['prior_td'] = articles_df['date'].apply(get_prior_trading_date)

        price_map = price_df.set_index('Date')
        excluded = {'Open', 'High', 'Low', 'Close', 'Volume', 'Date'}
        # Skip names the article frame already has so the concat below cannot
        # create duplicate column labels.
        mapped_cols = {
            col: articles_df['prior_td'].map(price_map[col].to_dict())
            for col in price_map.columns
            if col not in excluded and col not in articles_df.columns
        }
        if mapped_cols:
            articles_df = pd.concat(
                [articles_df, pd.DataFrame(mapped_cols, index=articles_df.index)], axis=1
            )

        def _col(name):
            """Return column ``name`` with NaN as 0, or zeros if it is absent."""
            if name in articles_df.columns:
                return articles_df[name].fillna(0)
            return pd.Series(0.0, index=articles_df.index)

        sent = _col('sent_combined')
        abs_sent = _col('sent_abs')
        event_total = _col('event_total')
        source_tier = _col('source_tier')
        signal = _col('signal_intensity')
        urgency = _col('n_urgent')
        vol_20 = _col('vol_20d')
        rsi14 = _col('rsi14')
        atr14 = _col('atr14_ratio')
        trend_5_20 = _col('trend_5_20')
        trend_20_50 = _col('trend_20_50')
        gap_pct = _col('gap_pct')
        ret_1d = _col('ret_1d')

        articles_df['news_x_ret_1d'] = sent * ret_1d
        articles_df['news_x_trend_5_20'] = sent * trend_5_20
        articles_df['news_x_trend_20_50'] = sent * trend_20_50
        articles_df['news_x_vol_20'] = abs_sent * vol_20
        articles_df['event_x_trend'] = event_total * trend_5_20
        articles_df['event_x_vol'] = event_total * vol_20
        articles_df['source_x_trend'] = source_tier * trend_5_20
        articles_df['sent_x_rsi'] = sent * rsi14
        articles_df['sent_x_atr'] = sent * atr14
        articles_df['news_burst_x_vol'] = _col('same_day_count') * vol_20
        articles_df['urgency_x_gap'] = urgency * gap_pct
        articles_df['sentiment_regime'] = sent * np.sign(trend_5_20)
        articles_df['impact_regime'] = signal * np.sign(ret_1d)
        return articles_df

    def _add_labels(self, articles_df, price_df):
        """Add ``next_ret`` to ``articles_df`` in place and return it.

        For each article the anchor is the first trading date on or after its
        ``date``; ``next_ret`` is the close-to-close return from the anchor to
        the following trading date. It is NaN when either date is missing or
        either close is missing, zero or non-positive.

        NOTE(review): an article published on a non-trading day is therefore
        labelled with the return of the session *after* the next one; confirm
        that is the intended target.
        """
        from bisect import bisect_left

        trading_dates = sorted(price_df['Date'].unique())
        price_close = price_df.set_index('Date')['Close'].to_dict()
        last_pos = len(trading_dates) - 1

        def next_return(pub_date):
            pos = bisect_left(trading_dates, pub_date)
            if pos >= last_pos:
                # No anchor at all, or the anchor is the final trading date.
                return np.nan
            c0 = price_close.get(trading_dates[pos])
            c1 = price_close.get(trading_dates[pos + 1])
            if c0 and c1 and c0 > 0:
                return c1 / c0 - 1
            return np.nan

        articles_df['next_ret'] = [next_return(pub) for pub in articles_df['date']]
        return articles_df
|
|