Spaces:
Sleeping
Sleeping
| # src/build_features.py | |
| import pandas as pd | |
| import os | |
# ------------------------------------------------------------------
# Setup
# ------------------------------------------------------------------
# Create the output directory up front; a no-op when it already exists.
os.makedirs("data/processed", exist_ok=True)

# Keyword lexicons consulted by the rule-based sentiment scorer.
POS_WORDS = {
    "good",
    "buy",
    "up",
    "rise",
    "gain",
    "bull",
    "profit",
    "growth",
}
NEG_WORDS = {
    "bad",
    "sell",
    "down",
    "fall",
    "loss",
    "bear",
    "risk",
    "crash",
}
# ------------------------------------------------------------------
# Simple rule-based sentiment
# ------------------------------------------------------------------
# Characters trimmed from both ends of each whitespace-separated token so
# that "fall," or "(profit)" still match the keyword lexicons.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>-"


def simple_sentiment(text):
    """Score *text* in [-1.0, 1.0] from positive/negative keyword counts.

    The text is lower-cased and split on whitespace; punctuation at the
    edges of each token is stripped before it is looked up in ``POS_WORDS``
    and ``NEG_WORDS``.

    Args:
        text: The text to score. Any non-string value (e.g. NaN from a
            missing CSV cell) is scored as neutral.

    Returns:
        ``(pos - neg) / (pos + neg)`` as a float, or ``0.0`` when the input
        is not a string or contains no keyword from either lexicon.
    """
    if not isinstance(text, str):
        return 0.0

    words = [w.strip(_EDGE_PUNCTUATION) for w in text.lower().split()]
    pos = sum(w in POS_WORDS for w in words)
    neg = sum(w in NEG_WORDS for w in words)
    total = pos + neg
    # Avoid dividing by zero when no keyword is present.
    return (pos - neg) / total if total > 0 else 0.0
# ------------------------------------------------------------------
# Load & normalize news data
# ------------------------------------------------------------------
def load_news():
    """Load the raw news CSVs and return the mean sentiment per day.

    Reads whichever of ``news_articles.csv``, ``gnews_data.csv`` and
    ``reddit_data.csv`` exist under ``data/raw/``, scores every row with
    ``simple_sentiment`` and averages the scores per calendar date.

    Returns:
        A DataFrame with columns ``date`` and ``sentiment``. It is empty
        (but has both columns) when none of the input files exist.

    Raises:
        ValueError: If the combined data has neither a ``content`` nor a
            ``text`` column, or has no ``publishedAt`` column.
    """
    frames = []
    for fname in ["news_articles.csv", "gnews_data.csv", "reddit_data.csv"]:
        path = f"data/raw/{fname}"
        if os.path.exists(path):
            frames.append(pd.read_csv(path))

    if not frames:
        print("⚠ No news files found — sentiment will be zero")
        return pd.DataFrame(columns=["date", "sentiment"])

    news = pd.concat(frames, ignore_index=True)

    # Normalize text column. After concatenation a row coming from a file
    # without "content" has NaN there, so fall back to that row's "text"
    # instead of overwriting it.
    if "content" in news.columns:
        if "text" in news.columns:
            news["text"] = news["content"].fillna(news["text"])
        else:
            news["text"] = news["content"]
    elif "text" not in news.columns:
        raise ValueError("No text/content column found in news data")

    # Normalize datetime column
    if "publishedAt" not in news.columns:
        raise ValueError("No publishedAt column found in news data")

    # Parse in UTC so the calendar date matches the UTC-based "date" key of
    # the price data, and so mixed UTC offsets still yield a datetime column.
    news["publishedAt"] = pd.to_datetime(
        news["publishedAt"], errors="coerce", utc=True
    )
    news = news.dropna(subset=["publishedAt"])
    news["date"] = news["publishedAt"].dt.date
    news["sentiment"] = news["text"].apply(simple_sentiment)

    # Daily aggregated sentiment
    daily_sent = news.groupby("date")["sentiment"].mean().reset_index()
    return daily_sent
# ------------------------------------------------------------------
# Main feature pipeline
# ------------------------------------------------------------------
def _load_prices(path="data/raw/stock_prices.csv"):
    """Read the raw price CSV and return it cleaned, with a UTC ``date`` key.

    Raises:
        ValueError: If a column the pipeline needs later is absent.
    """
    prices = pd.read_csv(path)

    # Fail early with a readable message instead of a KeyError further down.
    required = ("Ticker", "Date", "Return", "Volume")
    missing = [col for col in required if col not in prices.columns]
    if missing:
        raise ValueError(
            f"{path} is missing required column(s): {', '.join(missing)}"
        )

    prices = prices.dropna(subset=["Ticker"])
    prices["Date"] = pd.to_datetime(prices["Date"], utc=True)
    prices["date"] = prices["Date"].dt.date

    # Ensure numeric columns; unparseable values become NaN.
    for col in ["Close", "Volume", "Return"]:
        if col in prices.columns:
            prices[col] = pd.to_numeric(prices[col], errors="coerce")
    return prices


def _add_lag_features(merged):
    """Return *merged* sorted by ticker/date with one-step lagged features."""
    merged = merged.sort_values(["Ticker", "Date"])

    # Previous-row value within each ticker (row order fixed by the sort).
    merged["return_lag1"] = merged.groupby("Ticker")["Return"].shift(1)
    merged["volume_lag1"] = merged.groupby("Ticker")["Volume"].shift(1)
    merged["sentiment_lag1"] = merged.groupby("Ticker")["sentiment"].shift(1)

    merged["return_lag1"] = pd.to_numeric(
        merged["return_lag1"], errors="coerce"
    ).fillna(0)
    merged["volume_lag1"] = pd.to_numeric(
        merged["volume_lag1"], errors="coerce"
    )

    # Fill missing lagged volume with the ticker's own median, then with 0
    # when the ticker has no usable volume at all.
    ticker_median = merged.groupby("Ticker")["volume_lag1"].transform("median")
    merged["volume_lag1"] = (
        merged["volume_lag1"].fillna(ticker_median).fillna(0)
    )
    merged["sentiment_lag1"] = merged["sentiment_lag1"].fillna(0)
    return merged


def main():
    """Build ``data/processed/merged_features.csv`` from prices and news.

    Loads ``data/raw/stock_prices.csv``, left-joins the daily sentiment
    from ``load_news`` on the calendar date, adds per-ticker lag features,
    writes the result as CSV and prints a short summary.

    Raises:
        ValueError: If the price file lacks ``Ticker``, ``Date``,
            ``Return`` or ``Volume``.
    """
    prices = _load_prices()
    daily_sent = load_news()

    # Days without news get neutral sentiment. Coerce first: with no news
    # files the joined column is an all-NaN object column.
    merged = prices.merge(daily_sent, on="date", how="left")
    merged["sentiment"] = pd.to_numeric(
        merged["sentiment"], errors="coerce"
    ).fillna(0.0)

    merged = _add_lag_features(merged)

    # Do not depend on import-time setup for the output directory.
    os.makedirs("data/processed", exist_ok=True)
    merged.to_csv("data/processed/merged_features.csv", index=False)
    print("Saved data/processed/merged_features.csv")
    print("Rows:", len(merged))
    print("Tickers:", merged["Ticker"].unique())
# ------------------------------------------------------------------
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()