File size: 1,496 Bytes
e2b2661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
import re
from pathlib import Path

# Relative path of the raw news CSV that main() reads.
INPUT_FILE = "news_articles_1.csv"
# Relative path of the normalized CSV that main() writes.
OUTPUT_FILE = "data/streaming/news_stream.csv"

# Lowercase keywords that map an article to a ticker symbol. infer_ticker()
# walks this mapping in insertion order, so earlier tickers win on ties.
TICKER_KEYWORDS = {
    "AAPL": [
        "apple",
        "iphone",
        "ipad",
        "tim cook",
    ],
    "GOOGL": [
        "google",
        "alphabet",
        "youtube",
    ],
    "TSLA": [
        "tesla",
        "elon",
        "musk",
    ],
}

def infer_ticker(text, keywords_by_ticker=None):
    """Return the first ticker whose keyword occurs in *text*, else ``None``.

    Matching is a case-insensitive substring test (so a keyword also matches
    inside a longer word). Tickers are tried in the mapping's iteration order
    and the first one with any matching keyword wins.

    Args:
        text: Article text to scan. Non-string values, such as the float NaN
            that pandas yields for an empty CSV cell, never match.
        keywords_by_ticker: Optional mapping of ticker symbol to an iterable
            of lowercase keywords. Defaults to the module-level
            ``TICKER_KEYWORDS``.

    Returns:
        The matching ticker symbol, or ``None`` when nothing matches.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them
    # would raise AttributeError and abort the whole run.
    if not isinstance(text, str):
        return None
    if keywords_by_ticker is None:
        keywords_by_ticker = TICKER_KEYWORDS

    lowered = text.lower()
    for ticker, keywords in keywords_by_ticker.items():
        if any(keyword in lowered for keyword in keywords):
            return ticker
    return None


def main():
    """Convert the raw news CSV into the streaming-ready CSV.

    Reads ``INPUT_FILE``, derives ``text``, ``date`` and ``ticker`` columns,
    drops every row that is missing any of the three, and writes just those
    columns to ``OUTPUT_FILE``, creating its parent directory if needed.

    Raises:
        ValueError: If the input has none of the ``content``,
            ``description`` or ``title`` columns.
    """
    df = pd.read_csv(INPUT_FILE)

    # ---- unify text field ----
    # Preference order: full content, then description, then title.
    text_column = next(
        (name for name in ("content", "description", "title") if name in df.columns),
        None,
    )
    if text_column is None:
        raise ValueError("No usable text column found")
    df["text"] = df[text_column]

    # ---- date ----
    # Without a publishedAt column, fall back to the first column of the file.
    date_source = df["publishedAt"] if "publishedAt" in df.columns else df.iloc[:, 0]
    # Unparseable values become NaT and are removed by the dropna below.
    df["date"] = pd.to_datetime(date_source, errors="coerce").dt.date

    # ---- ticker inference ----
    # na_action="ignore" passes missing text through as NaN instead of handing
    # it to infer_ticker; those rows are removed by the dropna below.
    df["ticker"] = df["text"].map(infer_ticker, na_action="ignore")

    # ---- cleanup ----
    df = df.dropna(subset=["date", "ticker", "text"])
    df = df[["date", "ticker", "text"]]

    # Derive the directory from OUTPUT_FILE so the two cannot drift apart.
    output_path = Path(OUTPUT_FILE)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    print(f"Streaming-ready file saved → {OUTPUT_FILE}")
    print(df.head())


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()