Spaces:
Sleeping
Sleeping
File size: 1,496 Bytes
e2b2661 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import pandas as pd
import re
from pathlib import Path
# Raw news export that main() reads.
INPUT_FILE = "news_articles_1.csv"

# Destination of the cleaned date/ticker/text CSV that main() writes.
OUTPUT_FILE = "data/streaming/news_stream.csv"

# Ticker symbol -> substrings that mark a piece of text as being about that
# ticker. infer_ticker() lowercases the text before the substring test, so
# every keyword here is written in lowercase. The mapping is scanned in
# insertion order and the first ticker with a matching keyword wins.
TICKER_KEYWORDS = {
    "AAPL": ["apple", "iphone", "ipad", "tim cook"],
    "GOOGL": ["google", "alphabet", "youtube"],
    "TSLA": ["tesla", "elon", "musk"],
}
def infer_ticker(text, ticker_keywords=None):
    """Return the first ticker whose keywords appear in *text*, else ``None``.

    Matching is a case-insensitive substring test: *text* is lowercased and
    each keyword is looked up with ``in``, so keywords are expected to be
    lowercase. Tickers are tried in the mapping's iteration order and the
    first one with any matching keyword is returned.

    Args:
        text: The article text to classify. Anything that is not a ``str``
            (for example ``None`` or the float NaN pandas uses for an empty
            CSV cell) is treated as "no match" instead of raising.
        ticker_keywords: Optional mapping of ticker symbol to an iterable of
            keyword substrings. Defaults to the module-level
            ``TICKER_KEYWORDS``.

    Returns:
        The matching ticker symbol, or ``None`` when nothing matches.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower(); bail out
    # early so a single blank row cannot abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return None

    # Resolve the default lazily so the module constant is looked up at call
    # time rather than frozen into the signature.
    if ticker_keywords is None:
        ticker_keywords = TICKER_KEYWORDS

    lowered = text.lower()
    for ticker, keywords in ticker_keywords.items():
        if any(keyword in lowered for keyword in keywords):
            return ticker
    return None
def main():
    """Convert the raw news CSV into a ``date,ticker,text`` CSV.

    Steps:
      1. Read ``INPUT_FILE``.
      2. Pick the text column: ``content`` if present, otherwise
         ``description``, otherwise ``title``.
      3. Parse a calendar date from ``publishedAt`` or, when that column is
         absent, from the first column of the file. Unparseable values
         become missing.
      4. Tag every row with a ticker via ``infer_ticker``.
      5. Drop rows missing a date, ticker, or text, and write the remaining
         rows to ``OUTPUT_FILE`` (creating its parent directory if needed).

    Raises:
        ValueError: If none of the candidate text columns exists.
    """
    df = pd.read_csv(INPUT_FILE)

    # ---- unify text field ----
    for column in ("content", "description", "title"):
        if column in df.columns:
            text = df[column]
            break
    else:
        raise ValueError("No usable text column found")

    # ---- date ----
    if "publishedAt" in df.columns:
        date_source = df["publishedAt"]
    else:
        # No explicit timestamp column: fall back to the first column.
        date_source = df.iloc[:, 0]
    dates = pd.to_datetime(date_source, errors="coerce").dt.date

    # ---- ticker inference ----
    # Empty CSV cells are read as float NaN. Only hand real strings to
    # infer_ticker so one blank cell cannot crash the run; such rows get no
    # ticker and are removed by the dropna below.
    tickers = text.map(
        lambda value: infer_ticker(value) if isinstance(value, str) else None
    )

    # ---- cleanup ----
    result = pd.DataFrame({"date": dates, "ticker": tickers, "text": text})
    result = result.dropna(subset=["date", "ticker", "text"])

    # Create the directory OUTPUT_FILE actually points into, instead of a
    # separately hard-coded path that could drift from the constant.
    output_path = Path(OUTPUT_FILE)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_path, index=False)

    print(f"Streaming-ready file saved → {OUTPUT_FILE}")
    print(result.head())
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger any file I/O.
if __name__ == "__main__":
    main()
|