AutoDataLab2.0 / ceo_brief_env /stooq_scrape.py
uchihamadara1816's picture
Upload 172 files
d02bacd verified
"""
Lightweight Stooq daily OHLCV fetch (HTTP GET of CSV) for RAG-advanced mode.
Not financial advice. For simulation / grounding only. Falls back to bundled fixtures
when offline or on error so CI and judges stay reproducible.
"""
from __future__ import annotations

import csv
import io
from http.client import HTTPException
from pathlib import Path
from typing import List, Tuple
from urllib.error import HTTPError, URLError
from urllib.parse import quote
from urllib.request import Request, urlopen
# Symbols scraped when a caller does not pass an explicit watchlist.
# Must match grader `STOOQ_CITATION_SUFFIXES`.
DEFAULT_WATCHLIST: tuple[str, ...] = ("nvda.us", "aapl.us", "jpm.us")
# URL template for the Stooq CSV download; ``{symbol}`` is filled in by
# ``fetch_stooq_daily_csv``.
_STOOQ_DAILY = "https://stooq.com/q/d/l/?s={symbol}&i=d"
# Sent as the ``User-Agent`` header on every Stooq request.
_USER_AGENT = "AutoDataLab-Plus/0.1 (research; +https://github.com/)"
# Timeout, in seconds, passed to ``urlopen`` for each request.
_TIMEOUT_SEC = 8.0
# Directory of per-symbol fallback CSVs read by ``_read_fixture``; files are
# looked up as ``<symbol with '.' replaced by '_'>.csv``.
_FIXTURES = Path(__file__).resolve().parent / "fixtures" / "stooq"
# Bundled multi-hundred-row daily history (Stooq-shaped) for Strategy when RAG is off
# (no network; used as “enterprise tape” context).
_LONG_FIXTURES = Path(__file__).resolve().parent / "fixtures" / "stooq_long"
def _parse_csv_tail(text: str, symbol: str, last_n: int = 3) -> str:
    """Summarise the last ``last_n`` daily rows of a Stooq-shaped CSV on one line.

    Args:
        text: Raw CSV text. The first row is treated as the header and
            column 4 of each data row as the close.
        symbol: Ticker used to label the snippet (e.g. ``nvda.us``).
        last_n: Maximum number of trailing data rows to report. Zero or a
            negative value selects no rows.

    Returns:
        ``"Stooq <symbol> daily:"`` followed by one `` <date> close≈<close>``
        entry per reported row, joined with single spaces.
        ``"<symbol>: (no data)"`` is returned when there is nothing to
        report, and ``"<symbol>: (unparseable)"`` when the text cannot be
        parsed as CSV or none of its data rows is usable.
    """
    no_data = f"{symbol}: (no data)"
    unparseable = f"{symbol}: (unparseable)"

    text = text.strip()
    if not text or "Date" not in text:
        return no_data
    # A header plus at least one data line is required.
    if sum(1 for ln in text.splitlines() if ln.strip()) < 2:
        return no_data
    # ``data[-0:]`` would select *every* row, so reject non-positive counts
    # explicitly instead of relying on the slice.
    if last_n <= 0:
        return no_data

    try:
        rows = list(csv.reader(io.StringIO(text)))
    except csv.Error:
        # Malformed input (e.g. a bare carriage return inside a field) is
        # reported with the marker rather than crashing the caller.
        return unparseable
    if not rows:
        return no_data
    _header, *data = rows
    if not data:
        return no_data

    # Filter blank rows and Stooq's "No data" marker *before* slicing so that
    # they do not use up slots in the tail.
    usable = [row for row in data if row and row[0] != "No data"]
    if not usable:
        return unparseable

    parts = [f"Stooq {symbol} daily:"]
    for row in usable[-last_n:]:
        if len(row) > 4:
            close = row[4]
        elif len(row) > 1:
            # Short row: fall back to its last column.
            close = row[-1]
        else:
            # Only a date is present; never echo the date as the close.
            close = "—"
        parts.append(f" {row[0]} close≈{close}")
    return " ".join(parts)
def _read_fixture(symbol: str) -> str | None:
    """Return the bundled fixture CSV text for ``symbol``, if one can be read.

    The fixture is looked up in ``_FIXTURES`` under the symbol with every
    ``.`` replaced by ``_`` and a ``.csv`` suffix. ``None`` is returned when
    no such file exists or reading it fails with ``OSError``.
    """
    candidate = _FIXTURES / (symbol.replace(".", "_") + ".csv")
    if candidate.is_file():
        try:
            return candidate.read_text(encoding="utf-8", errors="replace")
        except OSError:
            pass
    return None
def _looks_like_stooq_csv(text: str) -> bool:
    """Heuristically decide whether ``text`` is a Stooq CSV payload.

    After removing a leading byte-order mark and surrounding whitespace, the
    text must be non-empty, mention ``Date`` on its first line, and contain
    neither ``<html`` nor ``<!doctype`` (case-insensitive) anywhere.
    """
    body = text.lstrip("\ufeff").strip()
    if not body:
        return False
    first_line = body.splitlines()[0]
    if "Date" not in first_line:
        return False
    lowered = body.lower()
    # An HTML page is rejected even if it happens to mention "Date".
    return not any(marker in lowered for marker in ("<html", "<!doctype"))
def fetch_stooq_daily_csv(symbol: str) -> str:
    """Return raw daily CSV text for ``symbol`` from the network or a local fixture.

    The lower-cased symbol is percent-encoded into the ``_STOOQ_DAILY`` URL
    and fetched with the module's user agent and timeout. If the request
    fails, or the response does not pass ``_looks_like_stooq_csv``, the
    bundled fixture is returned instead. With no fixture either, a
    header-only CSV is returned so callers always receive parseable text.

    Args:
        symbol: Stooq ticker such as ``nvda.us``.

    Returns:
        CSV text: the network response, the fixture contents, or the bare
        header line ``Date,Open,High,Low,Close,Volume``.
    """
    ticker = symbol.lower()
    # Percent-encode the ticker so characters such as spaces, ``&`` or ``#``
    # cannot break or extend the query string.
    url = _STOOQ_DAILY.format(symbol=quote(ticker, safe=""))
    req = Request(url, headers={"User-Agent": _USER_AGENT})
    try:
        with urlopen(req, timeout=_TIMEOUT_SEC) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
    except (HTTPError, URLError, OSError, TimeoutError, HTTPException):
        # HTTPException covers http.client failures that are not OSError
        # subclasses (InvalidURL, IncompleteRead, BadStatusLine), so those
        # also fall back to the fixture instead of propagating.
        raw = ""
    if _looks_like_stooq_csv(raw):
        return raw
    # Try the caller's spelling first, then the lower-cased ticker the URL
    # used, so a mixed-case symbol can still find a lower-case fixture file.
    # NOTE(review): fixture file names are presumably lower-case, as in
    # DEFAULT_WATCHLIST; confirm against the fixtures directory.
    candidates = (symbol,) if ticker == symbol else (symbol, ticker)
    for name in candidates:
        fix = _read_fixture(name)
        if fix is not None:
            return fix
    return "Date,Open,High,Low,Close,Volume\n"
def read_long_fixture_csv(symbol: str) -> str:
    """Return the bundled long daily CSV text for ``symbol`` (e.g. ``nvda.us``).

    The file is looked up in ``_LONG_FIXTURES`` under the symbol with every
    ``.`` replaced by ``_`` and a ``.csv`` suffix. When the file is missing
    or reading it fails with ``OSError``, a header-only CSV is returned.
    """
    header_only = "Date,Open,High,Low,Close,Volume\n"
    csv_path = _LONG_FIXTURES / (symbol.replace(".", "_") + ".csv")
    if csv_path.is_file():
        try:
            return csv_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            pass
    return header_only
def scrape_watchlist_from_long_csv(
    symbols: tuple[str, ...] = DEFAULT_WATCHLIST,
    last_n: int = 5,
) -> List[Tuple[str, str, str, int]]:
    """
    Like :func:`scrape_watchlist` but only reads local long CSVs (no HTTP).

    Args:
        symbols: Stooq symbols to read, e.g. ``("nvda.us",)``.
        last_n: Number of trailing rows passed to ``_parse_csv_tail`` for
            the snippet.

    Returns:
        One ``(stooq_symbol, citation, snippet, row_count_excl_header)``
        tuple per symbol, in input order. Citations use the same ``stooq:``
        prefix so graders stay consistent if RAG is on elsewhere. The row
        count ignores blank lines.
    """
    out: list[tuple[str, str, str, int]] = []
    for sym in symbols:
        raw = read_long_fixture_csv(sym)
        # csv.reader yields an empty list for every blank line; skip those so
        # a stray blank line in the fixture is not counted as a data row.
        n_rows = sum(1 for row in csv.reader(io.StringIO(raw)) if row)
        n_data = max(0, n_rows - 1)  # the first non-blank row is the header
        snip = _parse_csv_tail(raw, sym, last_n=last_n)
        out.append((sym, f"stooq:{sym}", snip, n_data))
    return out
def scrape_watchlist(
    symbols: tuple[str, ...] = DEFAULT_WATCHLIST,
) -> List[Tuple[str, str, str]]:
    """
    Fetch Stooq daily history for every symbol and summarise it for RAG + grounding.

    Returns one ``(stooq_symbol, citation, snippet)`` tuple per symbol, in
    input order. The citation has the form ``stooq:nvda.us`` (used by
    ``graders.grounding_score``); the snippet is the ``_parse_csv_tail``
    summary of the fetched CSV.
    """
    return [
        (
            ticker,
            f"stooq:{ticker}",
            _parse_csv_tail(fetch_stooq_daily_csv(ticker), ticker),
        )
        for ticker in symbols
    ]