Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| from pathlib import Path | |
# CSV of raw news articles that main() reads.
INPUT_FILE = "news_articles_1.csv"

# CSV that main() writes, holding only the date, ticker and text columns.
OUTPUT_FILE = "data/streaming/news_stream.csv"
# Keywords that mark an article as being about a ticker. Matching is
# case-insensitive. Dict order matters: infer_ticker returns the first
# ticker with a hit.
TICKER_KEYWORDS = {
    "AAPL": ["apple", "iphone", "ipad", "tim cook"],
    "GOOGL": ["google", "alphabet", "youtube"],
    "TSLA": ["tesla", "elon", "musk"],
}


def _mentions_keyword(text, keyword):
    """Return True if *keyword* occurs in *text* as a whole word or phrase."""
    # Lookarounds rather than plain substring search, so "elon" does not hit
    # "watermelon" and "apple" does not hit "pineapple". The optional "s"
    # keeps simple plurals such as "iphones" matching.
    pattern = rf"(?<!\w){re.escape(keyword)}s?(?!\w)"
    return re.search(pattern, text, re.IGNORECASE) is not None


def infer_ticker(text):
    """Return the first ticker whose keywords appear in *text*.

    Tickers are tried in the order they appear in ``TICKER_KEYWORDS``; the
    first one with any keyword present as a whole word wins.

    Args:
        text: Article text. Non-string values (for example the NaN that
            pandas uses for a missing cell, or ``None``) are treated as
            "no match" instead of raising.

    Returns:
        The ticker symbol, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    for ticker, keywords in TICKER_KEYWORDS.items():
        if any(_mentions_keyword(text, keyword) for keyword in keywords):
            return ticker
    return None
def main():
    """Convert the raw news CSV into the date/ticker/text streaming CSV.

    Reads ``INPUT_FILE``, picks one text column, derives a calendar date and
    a ticker for each row, drops rows missing any of the three, and writes
    the result to ``OUTPUT_FILE``, creating its parent directory if needed.
    The saved path and the first rows are printed.

    Raises:
        ValueError: If the input has none of the ``content``,
            ``description`` or ``title`` columns.
    """
    df = pd.read_csv(INPUT_FILE)

    # ---- unify text field ----
    # The first column present wins, in this order of preference.
    for column in ("content", "description", "title"):
        if column in df.columns:
            df["text"] = df[column]
            break
    else:
        raise ValueError("No usable text column found")

    # ---- date ----
    # Without a publishedAt column, fall back to the file's first column.
    # errors="coerce" turns unparseable values into NaT, dropped below.
    if "publishedAt" in df.columns:
        date_source = df["publishedAt"]
    else:
        date_source = df.iloc[:, 0]
    df["date"] = pd.to_datetime(date_source, errors="coerce").dt.date

    # ---- ticker inference ----
    # na_action="ignore" skips missing text instead of passing NaN (a float)
    # to infer_ticker; those rows are removed by the cleanup step anyway.
    df["ticker"] = df["text"].map(infer_ticker, na_action="ignore")

    # ---- cleanup ----
    df = df.dropna(subset=["date", "ticker", "text"])
    df = df[["date", "ticker", "text"]]

    # Derive the directory from OUTPUT_FILE so the two cannot drift apart.
    Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_FILE, index=False)

    print(f"Streaming-ready file saved → {OUTPUT_FILE}")
    print(df.head())
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()