mathidot committed on
Commit
8f1601b
·
1 Parent(s): 4a8fc49

build option trading agent modules

Browse files
.gitignore CHANGED
@@ -1,2 +1,44 @@
1
- ./knowledge_base
2
- knowledge_base/raw/pdf/*.pdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Secrets and local environment
2
+ .env
3
+ .env.*
4
+ !.env.example
5
+ .venv/
6
+ .uv-cache/
7
+ .python_history
8
+
9
+ # Python generated files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+ .pytest_cache/
14
+ .ruff_cache/
15
+ .mypy_cache/
16
+ .pyright/
17
+
18
+ # App/runtime artifacts
19
+ .gradio/
20
+ *.log
21
+ .DS_Store
22
+
23
+ # Local vector databases and RAG inputs
24
+ alfred_chroma_db/
25
+ knowledge_base/
26
+ tools/knowledge_base/
27
+ *.sqlite3
28
+ *.sqlite
29
+
30
+ # Local model caches
31
+ hf_cache/
32
+ tools/hf_cache/
33
+
34
+ # Evaluation datasets, indexes, and generated reports
35
+ eval/data/
36
+ eval/indexes/
37
+ eval/reports/
38
+ eval/local_options_eval.jsonl
39
+
40
+ # Local market/backtest data exports
41
+ data/
42
+ backtest/data/
43
+ *.parquet
44
+ *.feather
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
app.py CHANGED
@@ -1,91 +1,27 @@
1
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool, LiteLLMModel
2
  import os
3
  import datetime
4
- import requests
5
  import pytz
6
  import yaml
7
- import json
8
  from dotenv import load_dotenv
9
  from tools.final_answer import FinalAnswerTool
10
- import yfinance as yf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from Gradio_UI import GradioUI
12
 
13
- @tool
14
- def query_market_asset(symbol: str) -> str:
15
- """A universal market data tool to query the current price or level of ANY asset.
16
-
17
- Supported asset classes include major indices, stocks, ETFs, crypto, and forex.
18
-
19
- Args:
20
- symbol: The specific ticker symbol used by Yahoo Finance. Examples:
21
- - Indices (requires '^'): '^GSPC' (S&P 500), '^VIX' (Volatility Index), '^DJI' (Dow Jones), '^IXIC' (Nasdaq)
22
- - Equities / ETFs: 'AAPL' (Apple), 'SPY' (SPDR S&P 500 ETF), 'TLT' (20+ Yr Treasury Bond)
23
- - Crypto: 'BTC-USD' (Bitcoin), 'ETH-USD' (Ethereum)
24
- - Forex: 'EURUSD=X' (EUR/USD rate), 'USDCNH=X' (USD/Offshore RMB)
25
-
26
- Returns:
27
- A JSON-formatted string containing the current price, high/low, timestamp, and asset info.
28
- """
29
- symbol = symbol.strip().upper()
30
-
31
- try:
32
- ticker = yf.Ticker(symbol)
33
-
34
- data = ticker.history(period="1d", interval="1m")
35
-
36
- if not data.empty:
37
- latest_row = data.iloc[-1]
38
- current_price = float(latest_row['Close'])
39
- open_price = float(latest_row['Open'])
40
- high_price = float(latest_row['High'])
41
- low_price = float(latest_row['Low'])
42
- volume = int(latest_row['Volume'])
43
- timestamp = str(data.index[-1])
44
-
45
- result = {
46
- "status": "success",
47
- "symbol": symbol,
48
- "current_price": round(current_price, 4),
49
- "open": round(open_price, 4),
50
- "high": round(high_price, 4),
51
- "low": round(low_price, 4),
52
- "volume": volume,
53
- "timestamp": timestamp,
54
- "data_type": "intraday_1m"
55
- }
56
- else:
57
- info = ticker.info
58
- current_price = info.get("regularMarketPrice") or info.get("previousClose") or info.get("ask") or info.get("bid")
59
-
60
- if current_price:
61
- result = {
62
- "status": "success",
63
- "symbol": symbol,
64
- "current_price": round(float(current_price), 4),
65
- "open": info.get("regularMarketOpen") or info.get("open"),
66
- "high": info.get("regularMarketDayHigh") or info.get("dayHigh"),
67
- "low": info.get("regularMarketDayLow") or info.get("dayLow"),
68
- "volume": info.get("regularMarketVolume") or info.get("volume", 0),
69
- "short_name": info.get("shortName", ""),
70
- "data_type": "cached_info"
71
- }
72
- else:
73
- result = {
74
- "status": "error",
75
- "symbol": symbol,
76
- "message": "No price data could be resolved for this asset."
77
- }
78
-
79
- except Exception as e:
80
- result = {
81
- "status": "error",
82
- "symbol": symbol,
83
- "message": f"Exception occurred while querying: {str(e)}"
84
- }
85
-
86
- return json.dumps(result, ensure_ascii=False, indent=2)
87
-
88
-
89
  @tool
90
  def get_current_time_in_timezone(timezone: str) -> str:
91
  """A tool that fetches the current local time in a specified timezone.
@@ -104,6 +40,9 @@ def get_current_time_in_timezone(timezone: str) -> str:
104
 
105
  if __name__ == "__main__":
106
  final_answer = FinalAnswerTool()
 
 
 
107
  load_dotenv()
108
  hf_token = os.getenv("HF_TOKEN")
109
  gemini_api_key = os.getenv("GEMINI_API_KEY");
@@ -118,7 +57,25 @@ if __name__ == "__main__":
118
 
119
  agent = CodeAgent(
120
  model=model,
121
- tools=[query_market_asset, get_current_time_in_timezone, final_answer],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  max_steps=6,
123
  verbosity_level=1,
124
  grammar=None,
 
1
+ from smolagents import CodeAgent, HfApiModel, load_tool, tool, LiteLLMModel
2
  import os
3
  import datetime
 
4
  import pytz
5
  import yaml
 
6
  from dotenv import load_dotenv
7
  from tools.final_answer import FinalAnswerTool
8
+ from tools.query_knowledge import QueryKnowledgeTool
9
+ from tools.web_search import DuckDuckGoSearchTool
10
+ from tools.visit_webpage import VisitWebpageTool
11
+ from market_data.tools import (
12
+ calculate_option_greeks,
13
+ query_market_asset,
14
+ query_option_chain,
15
+ query_option_expirations,
16
+ query_price_history,
17
+ query_realized_volatility,
18
+ query_volatility_snapshot,
19
+ )
20
+ from strategy.tools import build_volatility_strategy
21
+ from backtest.tools import analyze_strategy_payoff, backtest_long_straddle_csv, backtest_volatility_signal
22
+ from optimizer.tools import optimize_volatility_signal_parameters
23
  from Gradio_UI import GradioUI
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  @tool
26
  def get_current_time_in_timezone(timezone: str) -> str:
27
  """A tool that fetches the current local time in a specified timezone.
 
40
 
41
  if __name__ == "__main__":
42
  final_answer = FinalAnswerTool()
43
+ query_knowledge = QueryKnowledgeTool()
44
+ web_search = DuckDuckGoSearchTool(max_results=6)
45
+ visit_webpage = VisitWebpageTool()
46
  load_dotenv()
47
  hf_token = os.getenv("HF_TOKEN")
48
  gemini_api_key = os.getenv("GEMINI_API_KEY");
 
57
 
58
  agent = CodeAgent(
59
  model=model,
60
+ tools=[
61
+ query_market_asset,
62
+ query_price_history,
63
+ query_realized_volatility,
64
+ query_option_expirations,
65
+ query_option_chain,
66
+ query_volatility_snapshot,
67
+ calculate_option_greeks,
68
+ build_volatility_strategy,
69
+ analyze_strategy_payoff,
70
+ backtest_long_straddle_csv,
71
+ backtest_volatility_signal,
72
+ optimize_volatility_signal_parameters,
73
+ get_current_time_in_timezone,
74
+ query_knowledge,
75
+ web_search,
76
+ visit_webpage,
77
+ final_answer,
78
+ ],
79
  max_steps=6,
80
  verbosity_level=1,
81
  grammar=None,
backtest/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from .vol_backtest import backtest_realized_vol_signal
2
+ from .option_backtest import backtest_long_straddle_from_quotes, load_option_quotes_csv
3
+
4
+ __all__ = [
5
+ "backtest_long_straddle_from_quotes",
6
+ "backtest_realized_vol_signal",
7
+ "load_option_quotes_csv",
8
+ ]
backtest/option_backtest.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import asdict, dataclass
4
+ from datetime import timedelta
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ from .vol_backtest import max_drawdown
10
+
11
+
12
# Columns a historical option-quote frame must contain; validate_quote_frame
# rejects any frame that lacks one of them.
REQUIRED_QUOTE_COLUMNS = {
    "date", "underlying_symbol", "underlying_price",
    "contract_symbol", "option_type", "expiration", "strike",
    "bid", "ask",
}
23
+
24
+
25
@dataclass
class OptionBacktestTrade:
    """One option leg of a backtested position, from entry to exit."""

    # Dates are stored as plain strings so the record serializes directly.
    entry_date: str
    exit_date: str
    # Contract identification.
    contract_symbol: str
    option_type: str
    strike: float
    expiration: str
    # Fill details and result for this leg.
    quantity: int
    entry_price: float
    exit_price: float
    fees: float
    pnl: float

    def to_dict(self) -> dict:
        """Return the trade as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
41
+
42
+
43
def validate_quote_frame(quotes: pd.DataFrame) -> None:
    """Check that *quotes* has every column listed in REQUIRED_QUOTE_COLUMNS.

    Raises:
        ValueError: If one or more required columns are absent. The message
            lists the missing column names in sorted order.
    """
    absent = sorted(REQUIRED_QUOTE_COLUMNS.difference(quotes.columns))
    if absent:
        raise ValueError(f"Historical option quotes missing required columns: {absent}")
47
+
48
+
49
def prepare_quotes(quotes: pd.DataFrame) -> pd.DataFrame:
    """Return a validated, normalized, and sorted copy of raw option quotes.

    The input frame is not modified. The copy has:
      * ``date`` and ``expiration`` parsed to midnight-normalized timestamps,
      * ``option_type`` lower-cased,
      * ``mid`` created from (bid + ask) / 2, or back-filled with that value
        wherever an existing ``mid`` column is missing,
      * ``dte`` = whole days from ``date`` to ``expiration``,
      * rows dropped unless bid >= 0, ask > 0 and dte >= 0.

    Raises:
        ValueError: Propagated from validate_quote_frame when required
            columns are missing.
    """
    validate_quote_frame(quotes)
    prepared = quotes.copy()

    for column in ("date", "expiration"):
        prepared[column] = pd.to_datetime(prepared[column]).dt.normalize()
    prepared["option_type"] = prepared["option_type"].str.lower()

    bid_ask_mid = (prepared["bid"] + prepared["ask"]) / 2
    if "mid" in prepared.columns:
        # Keep supplied mids; only fill the gaps with the bid/ask midpoint.
        prepared["mid"] = prepared["mid"].where(prepared["mid"].notna(), bid_ask_mid)
    else:
        prepared["mid"] = bid_ask_mid

    prepared["dte"] = (prepared["expiration"] - prepared["date"]).dt.days

    usable = (prepared["bid"] >= 0) & (prepared["ask"] > 0) & (prepared["dte"] >= 0)
    ordered = prepared[usable].sort_values(["date", "expiration", "strike", "option_type"])
    return ordered.reset_index(drop=True)
63
+
64
+
65
def load_option_quotes_csv(path: str | Path) -> pd.DataFrame:
    """Read an option-quote CSV from *path* and run it through prepare_quotes."""
    raw_quotes = pd.read_csv(path)
    return prepare_quotes(raw_quotes)
67
+
68
+
69
+ def available_exit_date(
70
+ quotes: pd.DataFrame,
71
+ entry_date: pd.Timestamp,
72
+ target_exit_date: pd.Timestamp,
73
+ contract_symbol: str,
74
+ ) -> pd.Timestamp | None:
75
+ contract_quotes = quotes[
76
+ (quotes["contract_symbol"] == contract_symbol)
77
+ & (quotes["date"] >= target_exit_date)
78
+ ]
79
+ if contract_quotes.empty:
80
+ contract_quotes = quotes[quotes["contract_symbol"] == contract_symbol]
81
+ contract_quotes = contract_quotes[
82
+ (contract_quotes["date"] > entry_date)
83
+ & (contract_quotes["date"] < target_exit_date)
84
+ ]
85
+ if contract_quotes.empty:
86
+ return None
87
+ return contract_quotes["date"].max()
88
+ if contract_quotes.empty:
89
+ return None
90
+ return contract_quotes["date"].min()
91
+
92
+
93
def quote_price(row: pd.Series, side: str, price_field: str) -> float:
    """Return the fill price for one quote row.

    Args:
        row: Quote row providing ``bid``, ``ask`` and ``mid``.
        side: ``"buy"`` fills at the ask; any other value fills at the bid.
            Ignored when ``price_field`` is ``"mid"``.
        price_field: ``"mid"`` to use the mid price, ``"trade"`` to use the
            side-dependent bid/ask.

    Raises:
        ValueError: If ``price_field`` is neither ``"trade"`` nor ``"mid"``.
    """
    if price_field == "mid":
        return float(row["mid"])
    if price_field == "trade":
        column = "ask" if side == "buy" else "bid"
        return float(row[column])
    raise ValueError("price_field must be 'trade' or 'mid'.")
101
+
102
+
103
def select_expiration_slice(day_quotes: pd.DataFrame, target_dte: int) -> pd.DataFrame:
    """Return the quotes of the expiration whose DTE is closest to the target.

    Only contracts with ``dte > 0`` are considered. When two expirations are
    equally far from ``target_dte`` the one appearing first in ``day_quotes``
    wins, so the selection is reproducible from run to run.

    Args:
        day_quotes: Quotes for a single date, with ``dte`` and ``expiration``.
        target_dte: Desired days to expiration.

    Returns:
        The rows of the chosen expiration, or an empty frame when no contract
        has positive DTE.
    """
    live = day_quotes[day_quotes["dte"] > 0]
    if live.empty:
        return live

    # A stable sort keeps input order among ties; the default quicksort does
    # not guarantee which of several equally-close expirations comes first.
    ranked = live.assign(dte_error=(live["dte"] - target_dte).abs()).sort_values(
        "dte_error", kind="stable"
    )
    chosen_expiration = ranked.iloc[0]["expiration"]
    return live[live["expiration"] == chosen_expiration]
109
+
110
+
111
+ def select_atm_contract(expiration_slice: pd.DataFrame, option_type: str) -> pd.Series | None:
112
+ contracts = expiration_slice[expiration_slice["option_type"] == option_type]
113
+ if contracts.empty:
114
+ return None
115
+ spot = float(expiration_slice["underlying_price"].iloc[0])
116
+ return contracts.assign(strike_error=(contracts["strike"] - spot).abs()).sort_values("strike_error").iloc[0]
117
+
118
+
119
def backtest_long_straddle_from_quotes(
    quotes: pd.DataFrame,
    symbol: str,
    target_dte: int = 30,
    holding_days: int = 5,
    entry_every_days: int = 5,
    contract_multiplier: int = 100,
    fee_per_contract: float = 0.65,
    price_field: str = "trade",
    initial_capital: float = 100000.0,
) -> dict:
    """Backtest repeatedly buying an at-the-money straddle from option quotes.

    On each eligible entry date the expiration nearest ``target_dte`` is
    chosen, the nearest-to-spot call and put are bought, and each leg is sold
    on the date returned by available_exit_date. A straddle is recorded only
    when both legs could be priced at exit; the entry cadence is advanced only
    after a straddle is recorded.

    Args:
        quotes: Historical option quotes (see REQUIRED_QUOTE_COLUMNS).
        symbol: Underlying symbol, matched case-insensitively.
        target_dte: Desired days to expiration at entry.
        holding_days: Calendar days between entry and the target exit date.
        entry_every_days: Calendar days to wait after an entry before the next.
        contract_multiplier: Shares per contract used to scale price changes.
        fee_per_contract: Fee charged per contract on entry and again on exit.
        price_field: ``"trade"`` (buy at ask, sell at bid) or ``"mid"``.
        initial_capital: Account base added to cumulative PnL before the
            drawdown ratio is computed. Must be positive.

    Returns:
        A dict with the run parameters, aggregate statistics (trade counts,
        total PnL, max drawdown, win rate, average win/loss), up to 200
        straddle records under ``"trades"``, and static data-requirement and
        limitation notes.

    Raises:
        ValueError: If ``initial_capital`` is not positive, if no quotes exist
            for ``symbol``, or (from quote_price) if ``price_field`` is invalid.
    """
    if initial_capital <= 0:
        raise ValueError("initial_capital must be positive.")

    frame = prepare_quotes(quotes)
    frame = frame[frame["underlying_symbol"].str.upper() == symbol.upper()]
    if frame.empty:
        raise ValueError(f"No historical option quotes found for {symbol}.")

    trades: list[OptionBacktestTrade] = []
    trade_groups = []
    equity = [0.0]  # cumulative PnL, one point per completed straddle
    dates = sorted(frame["date"].unique())
    next_entry_date = dates[0]
    # Round trip: the per-contract fee is paid once to open and once to close.
    fees = fee_per_contract * 2

    for entry_date in dates:
        entry_date = pd.Timestamp(entry_date)
        if entry_date < next_entry_date:
            continue

        day_quotes = frame[frame["date"] == entry_date]
        expiration_slice = select_expiration_slice(day_quotes, target_dte)
        if expiration_slice.empty:
            continue

        call = select_atm_contract(expiration_slice, "call")
        put = select_atm_contract(expiration_slice, "put")
        if call is None or put is None:
            continue

        target_exit_date = entry_date + timedelta(days=holding_days)
        pending_group_trades = []
        group_pnl = 0.0
        for leg in (call, put):
            contract_symbol = str(leg["contract_symbol"])
            exit_date = available_exit_date(frame, entry_date, target_exit_date, contract_symbol)
            if exit_date is None:
                # The straddle is dropped unless both legs exit, so there is
                # no point pricing the remaining leg.
                break
            exit_quote = frame[
                (frame["date"] == exit_date)
                & (frame["contract_symbol"] == leg["contract_symbol"])
            ].iloc[0]

            entry_price = quote_price(leg, "buy", price_field)
            exit_price = quote_price(exit_quote, "sell", price_field)
            pnl = (exit_price - entry_price) * contract_multiplier - fees
            pending_group_trades.append(
                OptionBacktestTrade(
                    entry_date=str(entry_date.date()),
                    exit_date=str(pd.Timestamp(exit_date).date()),
                    contract_symbol=contract_symbol,
                    option_type=str(leg["option_type"]),
                    strike=float(leg["strike"]),
                    expiration=str(pd.Timestamp(leg["expiration"]).date()),
                    quantity=1,
                    entry_price=round(entry_price, 4),
                    exit_price=round(exit_price, 4),
                    fees=round(fees, 2),
                    pnl=round(pnl, 2),
                )
            )
            group_pnl += pnl

        if len(pending_group_trades) == 2:
            trades.extend(pending_group_trades)
            equity.append(equity[-1] + group_pnl)
            trade_groups.append(
                {
                    "entry_date": str(entry_date.date()),
                    # NOTE(review): legs can exit on different dates; the
                    # group reports the first (call) leg's exit date.
                    "exit_date": pending_group_trades[0].exit_date,
                    "strategy": "long_straddle",
                    "pnl": round(group_pnl, 2),
                    "legs": [trade.to_dict() for trade in pending_group_trades],
                }
            )
            next_entry_date = entry_date + timedelta(days=entry_every_days)

    equity_series = pd.Series(equity)
    group_pnls = [group["pnl"] for group in trade_groups]
    wins = [pnl for pnl in group_pnls if pnl > 0]
    losses = [pnl for pnl in group_pnls if pnl <= 0]

    return {
        "strategy": "long_straddle",
        "symbol": symbol.upper(),
        "target_dte": target_dte,
        "holding_days": holding_days,
        "entry_every_days": entry_every_days,
        "contract_multiplier": contract_multiplier,
        "fee_per_contract": fee_per_contract,
        "price_field": price_field,
        "trade_count": len(trade_groups),
        "leg_trade_count": len(trades),
        # equity always holds at least the starting 0.0 point.
        "total_pnl": round(float(equity[-1]), 2),
        # Drawdown is a ratio, so cumulative PnL is shifted onto an account base.
        "max_drawdown": round(max_drawdown(equity_series + initial_capital), 6),
        "win_rate": len(wins) / len(group_pnls) if group_pnls else 0.0,
        "avg_win": round(sum(wins) / len(wins), 2) if wins else 0.0,
        "avg_loss": round(sum(losses) / len(losses), 2) if losses else 0.0,
        "trades": trade_groups[:200],
        "data_requirements": [
            "Historical option quotes with date, expiration, strike, bid, ask, and underlying_price.",
            "For production-grade backtests, include deltas, IV, volume, open interest, and corporate action adjusted symbols.",
        ],
        "limitations": [
            "No early assignment model yet.",
            "No margin model yet.",
            "No intraday fills; entry and exit use the daily quote row.",
            "Results are only as good as the historical option quote data supplied.",
        ],
    }
backtest/tools.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from smolagents import tool
6
+
7
+ from market_data.providers import get_price_history
8
+ from strategy.payoff import expiration_payoff, strategy_summary
9
+ from strategy.schemas import OptionLeg, OptionStrategy
10
+
11
+ from .option_backtest import backtest_long_straddle_from_quotes, load_option_quotes_csv
12
+ from .vol_backtest import backtest_realized_vol_signal
13
+
14
+
15
def parse_legs(legs_json: str) -> list[OptionLeg]:
    """Decode a JSON description of option legs into OptionLeg objects.

    Accepts either a JSON list of leg mappings or a JSON object that holds
    such a list under the ``"legs"`` key. Each mapping is passed to OptionLeg
    as keyword arguments.
    """
    decoded = json.loads(legs_json)
    is_wrapped = isinstance(decoded, dict) and "legs" in decoded
    leg_specs = decoded["legs"] if is_wrapped else decoded
    return [OptionLeg(**spec) for spec in leg_specs]
20
+
21
+
22
@tool
def analyze_strategy_payoff(legs_json: str, min_price: float, max_price: float, steps: int = 25) -> str:
    """Analyze expiration payoff for an option strategy.

    Args:
        legs_json: JSON list of option legs from build_volatility_strategy.
        min_price: Minimum underlying price scenario.
        max_price: Maximum underlying price scenario.
        steps: Number of scenario steps.
    """
    try:
        legs = parse_legs(legs_json)

        # Always use at least one interval so the grid contains both endpoints.
        intervals = max(steps, 1)
        scenario_prices = [
            min_price + (max_price - min_price) * position / intervals
            for position in range(intervals + 1)
        ]
        payoff_rows = []
        for scenario_price in scenario_prices:
            payoff_rows.append(
                {
                    "underlying_price": round(scenario_price, 2),
                    "pnl": round(expiration_payoff(legs, scenario_price), 2),
                }
            )

        # Premium times signed quantity, scaled by 100 per contract.
        net_premium = round(sum(leg.premium * leg.signed_quantity() * 100 for leg in legs), 2)
        # Placeholder strategy wrapper so strategy_summary can be reused on
        # an arbitrary list of legs.
        placeholder_strategy = OptionStrategy(
            name="custom_strategy",
            volatility_view="unknown",
            directional_view="unknown",
            legs=legs,
            rationale="custom payoff analysis",
            risks=[],
            max_profit=None,
            max_loss=None,
            breakevens=[],
            net_debit_or_credit=net_premium,
            score=0.0,
        )
        response = {
            "status": "success",
            "payoff_rows": payoff_rows,
            "payoff_summary": strategy_summary(placeholder_strategy),
        }
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        # Tool boundary: failures are reported to the agent as JSON.
        failure = {"status": "error", "message": str(exc)}
        return json.dumps(failure, ensure_ascii=False, indent=2)
66
+
67
+
68
@tool
def backtest_volatility_signal(
    symbol: str,
    signal: str = "long_vol",
    period: str = "2y",
    short_window: int = 10,
    long_window: int = 30,
    holding_days: int = 5,
) -> str:
    """Backtest a simple realized-volatility expansion/compression signal on the underlying.

    Args:
        symbol: Yahoo Finance ticker.
        signal: long_vol or short_vol.
        period: Yahoo Finance history period.
        short_window: Short realized volatility lookback.
        long_window: Long realized volatility lookback.
        holding_days: Holding period after entry.
    """
    try:
        # Daily bars; only the closing prices feed the signal backtest.
        closes = get_price_history(symbol, period=period, interval="1d")["Close"]
        stats = backtest_realized_vol_signal(
            closes,
            short_window=short_window,
            long_window=long_window,
            holding_days=holding_days,
            signal=signal,
        )
        response = {"status": "success", "symbol": symbol.upper(), **stats}
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        # Tool boundary: failures are reported to the agent as JSON.
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json.dumps(failure, ensure_ascii=False, indent=2)
99
+
100
+
101
@tool
def backtest_long_straddle_csv(
    csv_path: str,
    symbol: str,
    target_dte: int = 30,
    holding_days: int = 5,
    entry_every_days: int = 5,
    price_field: str = "trade",
) -> str:
    """Run a real option-quote backtest for repeated ATM long straddles.

    This is a true option PnL backtest when supplied with historical option quotes.
    Required CSV columns: date, underlying_symbol, underlying_price, contract_symbol,
    option_type, expiration, strike, bid, ask. Optional columns include mid, delta,
    gamma, theta, vega, implied_volatility, volume, open_interest.

    Args:
        csv_path: Path to historical option quotes CSV.
        symbol: Underlying ticker.
        target_dte: Target days to expiration at entry.
        holding_days: Number of calendar days to hold each straddle.
        entry_every_days: Minimum days between new entries.
        price_field: trade for buy-at-ask/sell-at-bid, or mid for mid-price marks.
    """
    try:
        historical_quotes = load_option_quotes_csv(csv_path)
        stats = backtest_long_straddle_from_quotes(
            quotes=historical_quotes,
            symbol=symbol,
            target_dte=target_dte,
            holding_days=holding_days,
            entry_every_days=entry_every_days,
            price_field=price_field,
        )
        response = {"status": "success", **stats}
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        # Tool boundary: failures are reported to the agent as JSON, with a
        # reminder about the data this backtest depends on.
        failure = {
            "status": "error",
            "symbol": symbol,
            "message": str(exc),
            "note": "A real option backtest requires historical option quote data. yfinance does not provide reliable historical option chains.",
        }
        return json.dumps(failure, ensure_ascii=False, indent=2)
backtest/vol_backtest.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+
5
+ import pandas as pd
6
+
7
+
8
def max_drawdown(equity: pd.Series) -> float:
    """Return the deepest peak-to-trough decline of an equity curve.

    The result is the minimum of ``equity / running_peak - 1``, so it is zero
    for a curve that never falls below its running peak and negative
    otherwise. An empty series yields 0.0.
    """
    if equity.empty:
        return 0.0
    peaks = equity.cummax()
    relative_to_peak = equity.div(peaks).sub(1)
    return float(relative_to_peak.min())
14
+
15
+
16
def backtest_realized_vol_signal(
    prices: pd.Series,
    short_window: int = 10,
    long_window: int = 30,
    holding_days: int = 5,
    signal: str = "long_vol",
) -> dict:
    """Backtest a realized-volatility expansion/compression signal on prices.

    Annualized rolling volatilities over ``short_window`` and ``long_window``
    returns are compared at each step. A ``"long_vol"`` signal enters when the
    short volatility exceeds the long one; any other ``signal`` value enters
    in the opposite case. A trade's PnL proxy is the sum of absolute returns
    over the next ``holding_days`` returns (negated unless the signal is
    ``"long_vol"``), compounded into an equity curve starting at 1.0.

    Args:
        prices: Price series; NaNs are dropped before returns are computed.
        short_window: Lookback (in returns) of the short volatility.
        long_window: Lookback (in returns) of the long volatility.
        holding_days: Number of returns each trade is held. Must be >= 1.
        signal: ``"long_vol"`` or, for the inverse rule, e.g. ``"short_vol"``.

    Returns:
        A dict with the parameters, trade count, win rate, total-return and
        drawdown proxies, average trade PnL proxy, up to 100 trade records,
        and notes on the method's limitations.

    Raises:
        ValueError: If ``holding_days`` is less than 1.
    """
    # A non-positive holding period would stop the scan index from advancing
    # after an entry, so the loop below would never terminate.
    if holding_days < 1:
        raise ValueError("holding_days must be at least 1.")

    close = prices.dropna().astype(float)
    returns = close.pct_change().dropna()
    annualizer = math.sqrt(252)
    short_rv = returns.rolling(short_window).std() * annualizer
    long_rv = returns.rolling(long_window).std() * annualizer
    is_long_vol = signal == "long_vol"

    trades = []
    equity = [1.0]
    index = 0
    dates = list(returns.index)
    while index + holding_days < len(returns):
        short_value = short_rv.iloc[index]
        long_value = long_rv.iloc[index]

        # Skip warm-up rows where either rolling window is not yet filled.
        if pd.isna(short_value) or pd.isna(long_value):
            index += 1
            equity.append(equity[-1])
            continue

        vol_expanding = short_value > long_value
        enter = vol_expanding if is_long_vol else not vol_expanding
        if not enter:
            index += 1
            equity.append(equity[-1])
            continue

        # The holding window starts on the return after the signal row.
        period_returns = returns.iloc[index + 1:index + 1 + holding_days]
        realized_move = float(period_returns.abs().sum())
        signed_pnl = realized_move if is_long_vol else -realized_move
        equity.append(equity[-1] * (1 + signed_pnl))
        trades.append(
            {
                "entry_date": str(dates[index]),
                "exit_date": str(dates[index + holding_days]),
                "short_rv": float(short_value),
                "long_rv": float(long_value),
                "realized_abs_move": realized_move,
                "pnl_proxy": signed_pnl,
            }
        )
        # Trades never overlap: resume scanning at the exit row.
        index += holding_days

    equity_series = pd.Series(equity)
    wins = [trade for trade in trades if trade["pnl_proxy"] > 0]
    return {
        "signal": signal,
        "short_window": short_window,
        "long_window": long_window,
        "holding_days": holding_days,
        "trade_count": len(trades),
        "win_rate": len(wins) / len(trades) if trades else 0.0,
        # equity always holds at least the starting 1.0 point.
        "total_return_proxy": float(equity_series.iloc[-1] - 1),
        "max_drawdown_proxy": max_drawdown(equity_series),
        "avg_trade_pnl_proxy": (
            sum(trade["pnl_proxy"] for trade in trades) / len(trades)
            if trades
            else 0.0
        ),
        "trades": trades[:100],
        "limitations": [
            "This is an underlying-price realized-volatility signal backtest, not a true option PnL backtest.",
            "It does not use historical option-chain prices, bid/ask spreads, margin, assignment, or delta hedging costs.",
        ],
    }
eval/README.md CHANGED
@@ -58,6 +58,23 @@ The suite writes per-dataset reports and one aggregate report under `eval/report
58
 
59
  ## Common Commands
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  Run the fastest local check while developing PDF parsing or chunking:
62
 
63
  ```bash
@@ -114,6 +131,36 @@ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
114
  --rebuild
115
  ```
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  Compare different chunk settings:
118
 
119
  ```bash
@@ -169,6 +216,7 @@ uv --cache-dir .uv-cache run python -m eval.rag_eval \
169
  2. After changing PDF extraction, chunking, embeddings, or retrieval code, add `--rebuild`.
170
  3. Before comparing two versions, use the same `--datasets`, `--max-queries`, `--max-corpus-docs`, `--top-k`, `--chunk-size`, and `--chunk-overlap`.
171
  4. Use `--output-name` to save stable report names for before/after comparison.
 
172
 
173
  ## Metrics
174
 
 
58
 
59
  ## Common Commands
60
 
61
+ Run with the default multilingual embedding model:
62
+
63
+ ```bash
64
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite --rebuild
65
+ ```
66
+
67
+ Use a custom embedding model for experiments:
68
+
69
+ ```bash
70
+ RAG_EMBED_MODEL=intfloat/multilingual-e5-base \
71
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
72
+ --datasets local-options \
73
+ --top-k 5 \
74
+ --output-name local_options_e5_base \
75
+ --rebuild
76
+ ```
77
+
78
  Run the fastest local check while developing PDF parsing or chunking:
79
 
80
  ```bash
 
131
  --rebuild
132
  ```
133
 
134
+ Compare retrieval with and without the reranker:
135
+
136
+ ```bash
137
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
138
+ --datasets local-options \
139
+ --top-k 5 \
140
+ --output-name local_options_no_reranker \
141
+ --rebuild
142
+
143
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
144
+ --datasets local-options \
145
+ --top-k 5 \
146
+ --use-reranker \
147
+ --reranker-candidates 25 \
148
+ --output-name local_options_with_reranker \
149
+ --rebuild
150
+ ```
151
+
152
+ Use a custom reranker model:
153
+
154
+ ```bash
155
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
156
+ --datasets beir/fiqa \
157
+ --use-reranker \
158
+ --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2 \
159
+ --reranker-candidates 50 \
160
+ --top-k 5 \
161
+ --rebuild
162
+ ```
163
+
164
  Compare different chunk settings:
165
 
166
  ```bash
 
216
  2. After changing PDF extraction, chunking, embeddings, or retrieval code, add `--rebuild`.
217
  3. Before comparing two versions, use the same `--datasets`, `--max-queries`, `--max-corpus-docs`, `--top-k`, `--chunk-size`, and `--chunk-overlap`.
218
  4. Use `--output-name` to save stable report names for before/after comparison.
219
+ 5. When testing the reranker, compare the same dataset once without `--use-reranker` and once with `--use-reranker`.
220
 
221
  ## Metrics
222
 
eval/generate_local_options_eval.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import random
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from tools.query_knowledge import RAW_DIR, iter_source_files, load_source_file
11
+
12
+
13
# Options vocabulary that extract_keywords looks for (case-insensitively) in
# a document's text. Order matters: matches are reported in this order.
KEY_TERMS = [
    "volatility smile", "implied volatility", "local volatility", "stochastic volatility",
    "Black-Scholes",
    "delta", "gamma", "vega", "theta", "rho",
    "skew", "straddle", "correlation",
    "at-the-money", "forward", "risk-neutral",
]
31
+
32
+
33
# Directory one level above the one containing this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Default destination for the generated JSONL evaluation cases.
OUTPUT_PATH = PROJECT_ROOT.joinpath("eval", "local_options_eval.jsonl")
35
+
36
+
37
def normalize_space(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    # split() with no argument drops leading/trailing whitespace and splits
    # on runs of whitespace, so re-joining with a single space normalizes it.
    return " ".join(text.split())
39
+
40
+
41
def extract_keywords(text: str, max_keywords: int = 4) -> list[str]:
    """Pick up to ``max_keywords`` distinctive keywords from *text*.

    Candidates are gathered in this order: KEY_TERMS found in the text
    (case-insensitive), the first two parenthesized equation numbers such as
    ``(3.14)``, and the first two ``name =`` formula heads. Only when none of
    those exist are plain words of five or more characters used instead.
    Duplicates and a few known noise tokens are removed.
    """
    haystack = text.lower()
    found = [term for term in KEY_TERMS if term.lower() in haystack]
    found += re.findall(r"\(\d+\.\d+[a-z]?\)", text)[:2]
    formula_heads = re.findall(r"[A-Za-z𝜎𝜇𝜌𝜃𝛴][A-Za-z0-9𝜎𝜇𝜌𝜃𝛴_{}^]*\s*=", text)
    found += [head.strip() for head in formula_heads[:2]]

    if not found:
        # Fallback: generic long words, minus a few filler words.
        filler = {"there", "where", "which", "would", "could", "should", "chapter"}
        long_words = [
            word
            for word in re.findall(r"[A-Za-z][A-Za-z-]{4,}", text)
            if word.lower() not in filler
        ]
        found += long_words[:max_keywords]

    noise = {"id=", "FORMULA", "value ="}
    unique: list[str] = []
    for keyword in found:
        if not keyword or keyword in noise or keyword in unique:
            continue
        unique.append(keyword)
    return unique[:max_keywords]
63
+
64
+
65
+ def is_sane_section(section: str | None) -> bool:
66
+ if not section:
67
+ return False
68
+ section = section.strip()
69
+ if not 6 <= len(section) <= 90:
70
+ return False
71
+ if section.count(",") >= 2:
72
+ return False
73
+ digit_count = sum(char.isdigit() for char in section)
74
+ letter_count = sum(char.isalpha() for char in section)
75
+ if digit_count > max(2, letter_count // 3):
76
+ return False
77
+ if re.search(r"\b(figure|table|printed|united states|amount unit price|call price|under)$", section, re.I):
78
+ return False
79
+ if "figure" in section.lower() or "table" in section.lower():
80
+ return False
81
+ if re.search(r"\b(figure|table|printed|united states|amount unit price|call price)\b", section, re.I):
82
+ return False
83
+ words = section.split()
84
+ if len(words) > 12:
85
+ return False
86
+ return True
87
+
88
+
89
def make_case(document: Any, index: int) -> dict[str, Any] | None:
    """Build one evaluation case from a loaded document, or None to skip it.

    A document is skipped when its normalized text is shorter than 80
    characters, when its integer page number is below 25 or above 500, when
    it yields neither keywords nor a section label, or when no usable
    question can be formed for it.

    Three question kinds are produced, in priority order:
      * ``formula`` - for formula content; requires a page number because the
        question text names the page;
      * ``section`` - when the section label passes is_sane_section;
      * ``concept`` - built from the first extracted keyword.

    Args:
        document: Object exposing ``metadata`` (a mapping) and ``text``.
        index: 1-based sequence number used in the case id.

    Returns:
        A dict with ``id``, ``question``, ``expected_pages``,
        ``expected_keywords`` (at most five) and ``answer_type``, or None.
    """
    metadata = document.metadata
    text = normalize_space(document.text)
    if len(text) < 80:
        return None

    page = metadata.get("page_number")
    # NOTE(review): presumably excludes front/back matter - confirm the
    # 25..500 window against the source PDFs.
    if isinstance(page, int) and (page < 25 or page > 500):
        return None

    section = metadata.get("section_path") or metadata.get("section_title")
    content_type = metadata.get("content_type", "text")
    formula_id = metadata.get("formula_id")
    keywords = extract_keywords(text)
    if not keywords and not section:
        return None

    if content_type == "formula" or formula_id:
        # The question embeds the page number; without one it would read
        # "... on page None?" and carry no expected page to score against.
        if page is None:
            return None
        question = f"What formula or equation is described on page {page}?"
        answer_type = "formula"
    elif is_sane_section(section):
        question = f"What does the section {section} discuss?"
        answer_type = "section"
        # The innermost component of a "A > B > C" section path.
        keywords.append(section.split(">")[-1].strip())
    else:
        if not keywords:
            return None
        term = keywords[0]
        if term.lower() in {"formula", "id=", "value ="}:
            return None
        question = f"Where does the options reference discuss {term}?"
        answer_type = "concept"

    expected_pages = [page] if page is not None else []
    return {
        "id": f"auto_options_{index:03d}",
        "question": question,
        "expected_pages": expected_pages,
        "expected_keywords": keywords[:5],
        "answer_type": answer_type,
    }
129
+
130
+
131
def generate_cases(count: int, seed: int) -> list[dict[str, Any]]:
    """Generate ``count`` evaluation cases with distinct questions.

    Every document loaded from the files under RAW_DIR is shuffled with a
    generator seeded by ``seed`` and turned into a case via make_case until
    enough cases with unique question text have been collected.

    Raises:
        RuntimeError: If fewer than ``count`` cases could be generated.
    """
    documents = [
        document
        for source_file in iter_source_files(RAW_DIR)
        for document in load_source_file(source_file)
    ]
    # A dedicated, seeded generator keeps the selection reproducible.
    random.Random(seed).shuffle(documents)

    cases: list[dict[str, Any]] = []
    asked: set[str] = set()
    for document in documents:
        # Ids follow the number of accepted cases, so they stay contiguous.
        candidate = make_case(document, len(cases) + 1)
        if not candidate or candidate["question"] in asked:
            continue
        asked.add(candidate["question"])
        cases.append(candidate)
        if len(cases) >= count:
            break

    if len(cases) < count:
        raise RuntimeError(f"Only generated {len(cases)} cases; requested {count}.")
    return cases
153
+
154
+
155
def main() -> None:
    """Parse CLI options, generate the cases, and write them as JSON Lines."""
    parser = argparse.ArgumentParser(description="Generate local options RAG eval cases.")
    parser.add_argument("--count", type=int, default=40)
    parser.add_argument("--seed", type=int, default=20260525)
    parser.add_argument("--output", type=Path, default=OUTPUT_PATH)
    options = parser.parse_args()

    cases = generate_cases(options.count, options.seed)

    destination = options.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    # One JSON object per line, with a trailing newline after the last one.
    serialized = "\n".join(json.dumps(case, ensure_ascii=False) for case in cases)
    destination.write_text(serialized + "\n", encoding="utf-8")
    print(f"Wrote {len(cases)} cases to {destination}")
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
eval/rag_eval.py CHANGED
@@ -4,6 +4,7 @@ import argparse
4
  import csv
5
  import json
6
  import math
 
7
  import shutil
8
  import zipfile
9
  from dataclasses import dataclass
@@ -15,9 +16,17 @@ import requests
15
  from llama_index.core import StorageContext, VectorStoreIndex
16
  from llama_index.core.node_parser import SentenceSplitter
17
  from llama_index.core.schema import Document
 
18
  from llama_index.vector_stores.chroma import ChromaVectorStore
19
 
20
- from tools.query_knowledge import configure_model_cache, resolve_embed_model_name
 
 
 
 
 
 
 
21
 
22
 
23
  PROJECT_ROOT = Path(__file__).resolve().parents[1]
@@ -387,7 +396,9 @@ def load_local_options_eval(max_queries: int | None) -> EvalCorpus:
387
 
388
  from tools.query_knowledge import load_pdf_file
389
 
390
- pdf_files = sorted((PROJECT_ROOT / "tools" / "knowledge_base" / "raw").rglob("*.pdf"))
 
 
391
  documents = []
392
  for pdf_file in pdf_files:
393
  for doc_index, document in enumerate(load_pdf_file(pdf_file)):
@@ -443,6 +454,11 @@ def load_eval_corpus(args: argparse.Namespace) -> EvalCorpus:
443
  raise ValueError(f"Unknown dataset: {args.dataset}")
444
 
445
 
 
 
 
 
 
446
  def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild: bool) -> VectorStoreIndex:
447
  configure_model_cache()
448
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
@@ -453,7 +469,8 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
453
  index_path.mkdir(parents=True, exist_ok=True)
454
 
455
  db = chromadb.PersistentClient(path=str(index_path))
456
- collection_name = f"{corpus.name}_eval"
 
457
  if rebuild:
458
  try:
459
  db.delete_collection(collection_name)
@@ -464,7 +481,7 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
464
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
465
  embed_model = HuggingFaceEmbedding(
466
  model_name=resolve_embed_model_name(),
467
- cache_folder=str(PROJECT_ROOT / "tools" / "hf_cache" / "sentence_transformers"),
468
  )
469
 
470
  if collection.count() == 0:
@@ -491,8 +508,70 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
491
  return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
492
 
493
 
494
- def evaluate_retrieval(corpus: EvalCorpus, index: VectorStoreIndex, top_k: int) -> dict[str, Any]:
495
- retriever = index.as_retriever(similarity_top_k=max(top_k * 5, top_k))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  cases = []
497
  hit_counts = {1: 0, 3: 0, 5: 0, top_k: 0}
498
  reciprocal_ranks = []
@@ -500,7 +579,17 @@ def evaluate_retrieval(corpus: EvalCorpus, index: VectorStoreIndex, top_k: int)
500
 
501
  for query in corpus.queries:
502
  relevant_doc_ids = corpus.qrels.get(query["query_id"], set())
503
- results = retriever.retrieve(query["question"])
 
 
 
 
 
 
 
 
 
 
504
  retrieved = []
505
  seen_doc_ids = set()
506
  first_hit_rank = None
@@ -557,6 +646,8 @@ def evaluate_retrieval(corpus: EvalCorpus, index: VectorStoreIndex, top_k: int)
557
  "top_k": top_k,
558
  "mrr": sum(reciprocal_ranks) / total if total else 0.0,
559
  "ndcg_at_k": sum(ndcg_scores) / total if total else 0.0,
 
 
560
  }
561
  for k, count in sorted(hit_counts.items()):
562
  metrics[f"hit_at_{k}"] = count / total if total else 0.0
@@ -612,6 +703,10 @@ def parse_args() -> argparse.Namespace:
612
  parser.add_argument("--max-corpus-docs", type=int, default=None)
613
  parser.add_argument("--max-queries", type=int, default=None)
614
  parser.add_argument("--rebuild", action="store_true")
 
 
 
 
615
  return parser.parse_args()
616
 
617
 
@@ -619,7 +714,17 @@ def main() -> None:
619
  args = parse_args()
620
  corpus = load_eval_corpus(args)
621
  index = build_index(corpus, args.chunk_size, args.chunk_overlap, args.rebuild)
622
- report = evaluate_retrieval(corpus, index, args.top_k)
 
 
 
 
 
 
 
 
 
 
623
  json_path, md_path = write_reports(report)
624
  print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
625
  print(f"JSON report: {json_path}")
 
4
  import csv
5
  import json
6
  import math
7
+ import re
8
  import shutil
9
  import zipfile
10
  from dataclasses import dataclass
 
16
  from llama_index.core import StorageContext, VectorStoreIndex
17
  from llama_index.core.node_parser import SentenceSplitter
18
  from llama_index.core.schema import Document
19
+ from llama_index.core.schema import NodeWithScore, TextNode
20
  from llama_index.vector_stores.chroma import ChromaVectorStore
21
 
22
+ from tools.query_knowledge import (
23
+ BM25Retriever,
24
+ EMBED_MODEL_NAME,
25
+ RERANKER_MODEL_NAME,
26
+ CrossEncoderReranker,
27
+ configure_model_cache,
28
+ resolve_embed_model_name,
29
+ )
30
 
31
 
32
  PROJECT_ROOT = Path(__file__).resolve().parents[1]
 
396
 
397
  from tools.query_knowledge import load_pdf_file
398
 
399
+ pdf_files = sorted((PROJECT_ROOT / "knowledge_base" / "raw").rglob("*.pdf"))
400
+ if not pdf_files:
401
+ pdf_files = sorted((PROJECT_ROOT / "tools" / "knowledge_base" / "raw").rglob("*.pdf"))
402
  documents = []
403
  for pdf_file in pdf_files:
404
  for doc_index, document in enumerate(load_pdf_file(pdf_file)):
 
454
  raise ValueError(f"Unknown dataset: {args.dataset}")
455
 
456
 
457
def collection_safe_name(value: str) -> str:
    """Turn an arbitrary string into a slug usable in a collection name.

    Each run of characters outside ``[A-Za-z0-9_-]`` collapses to a single
    underscore, and leading/trailing underscores are removed. Returns
    ``"default"`` when nothing is left.
    """
    slug = re.sub(r"[^A-Za-z0-9_-]+", "_", value).strip("_")
    if not slug:
        return "default"
    return slug
460
+
461
+
462
  def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild: bool) -> VectorStoreIndex:
463
  configure_model_cache()
464
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
469
  index_path.mkdir(parents=True, exist_ok=True)
470
 
471
  db = chromadb.PersistentClient(path=str(index_path))
472
+ embed_slug = collection_safe_name(EMBED_MODEL_NAME)
473
+ collection_name = f"{corpus.name}_{embed_slug}_eval"
474
  if rebuild:
475
  try:
476
  db.delete_collection(collection_name)
 
481
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
482
  embed_model = HuggingFaceEmbedding(
483
  model_name=resolve_embed_model_name(),
484
+ cache_folder=str(PROJECT_ROOT / "hf_cache" / "sentence_transformers"),
485
  )
486
 
487
  if collection.count() == 0:
 
508
  return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
509
 
510
 
511
def build_bm25_retriever(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int) -> BM25Retriever:
    """Chunk the corpus with a ``SentenceSplitter`` and wrap the chunks in a BM25 retriever.

    Each corpus entry becomes a ``Document`` whose metadata carries
    ``doc_id`` and ``title``; keys in the entry's own ``metadata`` mapping
    are merged last and therefore win on conflict.
    """
    documents = []
    for entry in corpus.documents:
        metadata = {
            "doc_id": entry["doc_id"],
            "title": entry.get("title", ""),
        }
        metadata.update(entry.get("metadata") or {})
        documents.append(Document(text=entry["text"], metadata=metadata))

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    text_nodes = []
    for chunk in splitter.get_nodes_from_documents(documents):
        # Re-wrap each chunk as a plain TextNode, keeping its id, content and metadata.
        text_nodes.append(
            TextNode(id_=chunk.node_id, text=chunk.get_content(), metadata=chunk.metadata)
        )
    return BM25Retriever(text_nodes)
530
+
531
+
532
def merge_eval_results(
    vector_results: list[NodeWithScore],
    bm25_results: list[NodeWithScore],
    top_k: int,
) -> list[NodeWithScore]:
    """Fuse vector and BM25 hits by summing reciprocal ranks per node id.

    A hit at 0-based position ``rank`` contributes ``1 / (rank + 1)``.
    Nodes found by both retrievers get the sum of both contributions. The
    fused list is sorted by score, highest first, and cut to ``top_k``.
    Fresh ``NodeWithScore`` objects are returned; the inputs are not mutated.
    """
    fused_scores = {}
    nodes_by_id = {}

    for position, hit in enumerate(vector_results, start=1):
        key = hit.node.node_id
        nodes_by_id[key] = hit.node
        fused_scores[key] = 1.0 / position

    for position, hit in enumerate(bm25_results, start=1):
        key = hit.node.node_id
        contribution = 1.0 / position
        if key in fused_scores:
            fused_scores[key] = (fused_scores[key] or 0.0) + contribution
        else:
            nodes_by_id[key] = hit.node
            fused_scores[key] = contribution

    fused = [
        NodeWithScore(node=nodes_by_id[key], score=score)
        for key, score in fused_scores.items()
    ]
    ranked = sorted(fused, key=lambda item: item.score or float("-inf"), reverse=True)
    return ranked[:top_k]
554
+
555
+
556
+ def evaluate_retrieval(
557
+ corpus: EvalCorpus,
558
+ index: VectorStoreIndex,
559
+ top_k: int,
560
+ use_reranker: bool = False,
561
+ use_hybrid: bool = False,
562
+ chunk_size: int = 512,
563
+ chunk_overlap: int = 64,
564
+ reranker_model_name: str = RERANKER_MODEL_NAME,
565
+ reranker_candidates: int = 25,
566
+ ) -> dict[str, Any]:
567
+ retrieve_top_k = max(reranker_candidates, top_k) if use_reranker else max(top_k * 5, top_k)
568
+ retriever = index.as_retriever(similarity_top_k=retrieve_top_k)
569
+ bm25_retriever = (
570
+ build_bm25_retriever(corpus, chunk_size, chunk_overlap)
571
+ if use_hybrid
572
+ else None
573
+ )
574
+ reranker = CrossEncoderReranker(reranker_model_name) if use_reranker else None
575
  cases = []
576
  hit_counts = {1: 0, 3: 0, 5: 0, top_k: 0}
577
  reciprocal_ranks = []
 
579
 
580
  for query in corpus.queries:
581
  relevant_doc_ids = corpus.qrels.get(query["query_id"], set())
582
+ vector_results = retriever.retrieve(query["question"])
583
+ results = vector_results
584
+ if bm25_retriever:
585
+ bm25_results = bm25_retriever.retrieve(query["question"], retrieve_top_k)
586
+ results = merge_eval_results(vector_results, bm25_results, retrieve_top_k)
587
+ if reranker:
588
+ results = reranker.rerank(
589
+ query["question"],
590
+ results,
591
+ top_n=max(top_k * 5, top_k),
592
+ )
593
  retrieved = []
594
  seen_doc_ids = set()
595
  first_hit_rank = None
 
646
  "top_k": top_k,
647
  "mrr": sum(reciprocal_ranks) / total if total else 0.0,
648
  "ndcg_at_k": sum(ndcg_scores) / total if total else 0.0,
649
+ "reranker_enabled": use_reranker,
650
+ "hybrid_enabled": use_hybrid,
651
  }
652
  for k, count in sorted(hit_counts.items()):
653
  metrics[f"hit_at_{k}"] = count / total if total else 0.0
 
703
  parser.add_argument("--max-corpus-docs", type=int, default=None)
704
  parser.add_argument("--max-queries", type=int, default=None)
705
  parser.add_argument("--rebuild", action="store_true")
706
+ parser.add_argument("--use-hybrid", action="store_true")
707
+ parser.add_argument("--use-reranker", action="store_true")
708
+ parser.add_argument("--reranker-model", default=RERANKER_MODEL_NAME)
709
+ parser.add_argument("--reranker-candidates", type=int, default=25)
710
  return parser.parse_args()
711
 
712
 
 
714
  args = parse_args()
715
  corpus = load_eval_corpus(args)
716
  index = build_index(corpus, args.chunk_size, args.chunk_overlap, args.rebuild)
717
+ report = evaluate_retrieval(
718
+ corpus,
719
+ index,
720
+ args.top_k,
721
+ use_reranker=args.use_reranker,
722
+ use_hybrid=args.use_hybrid,
723
+ chunk_size=args.chunk_size,
724
+ chunk_overlap=args.chunk_overlap,
725
+ reranker_model_name=args.reranker_model,
726
+ reranker_candidates=args.reranker_candidates,
727
+ )
728
  json_path, md_path = write_reports(report)
729
  print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
730
  print(f"JSON report: {json_path}")
eval/run_eval_suite.py CHANGED
@@ -57,6 +57,10 @@ def build_dataset_args(args: argparse.Namespace, dataset: str) -> SimpleNamespac
57
  else defaults["max_corpus_docs"],
58
  max_queries=args.max_queries if args.max_queries is not None else defaults["max_queries"],
59
  rebuild=args.rebuild,
 
 
 
 
60
  )
61
 
62
 
@@ -65,7 +69,9 @@ def run_one(dataset: str, args: argparse.Namespace) -> DatasetRun:
65
  print(
66
  f"\n=== Running {dataset} "
67
  f"(top_k={dataset_args.top_k}, max_corpus_docs={dataset_args.max_corpus_docs}, "
68
- f"max_queries={dataset_args.max_queries}, rebuild={dataset_args.rebuild}) ==="
 
 
69
  )
70
 
71
  corpus = load_eval_corpus(dataset_args)
@@ -75,7 +81,17 @@ def run_one(dataset: str, args: argparse.Namespace) -> DatasetRun:
75
  chunk_overlap=dataset_args.chunk_overlap,
76
  rebuild=dataset_args.rebuild,
77
  )
78
- report = evaluate_retrieval(corpus, index, dataset_args.top_k)
 
 
 
 
 
 
 
 
 
 
79
  json_path, md_path = write_reports(report)
80
  print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
81
 
@@ -132,6 +148,10 @@ def parse_args() -> argparse.Namespace:
132
  parser.add_argument("--max-corpus-docs", type=int, default=None)
133
  parser.add_argument("--max-queries", type=int, default=None)
134
  parser.add_argument("--rebuild", action="store_true")
 
 
 
 
135
  parser.add_argument("--fail-fast", action="store_true")
136
  parser.add_argument("--output-name", default=None, help="Suite report filename stem under eval/reports.")
137
  return parser.parse_args()
 
57
  else defaults["max_corpus_docs"],
58
  max_queries=args.max_queries if args.max_queries is not None else defaults["max_queries"],
59
  rebuild=args.rebuild,
60
+ use_hybrid=args.use_hybrid,
61
+ use_reranker=args.use_reranker,
62
+ reranker_model=args.reranker_model,
63
+ reranker_candidates=args.reranker_candidates,
64
  )
65
 
66
 
 
69
  print(
70
  f"\n=== Running {dataset} "
71
  f"(top_k={dataset_args.top_k}, max_corpus_docs={dataset_args.max_corpus_docs}, "
72
+ f"max_queries={dataset_args.max_queries}, rebuild={dataset_args.rebuild}, "
73
+ f"use_hybrid={dataset_args.use_hybrid}, "
74
+ f"use_reranker={dataset_args.use_reranker}) ==="
75
  )
76
 
77
  corpus = load_eval_corpus(dataset_args)
 
81
  chunk_overlap=dataset_args.chunk_overlap,
82
  rebuild=dataset_args.rebuild,
83
  )
84
+ report = evaluate_retrieval(
85
+ corpus,
86
+ index,
87
+ dataset_args.top_k,
88
+ use_hybrid=dataset_args.use_hybrid,
89
+ chunk_size=dataset_args.chunk_size,
90
+ chunk_overlap=dataset_args.chunk_overlap,
91
+ use_reranker=dataset_args.use_reranker,
92
+ reranker_model_name=dataset_args.reranker_model,
93
+ reranker_candidates=dataset_args.reranker_candidates,
94
+ )
95
  json_path, md_path = write_reports(report)
96
  print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
97
 
 
148
  parser.add_argument("--max-corpus-docs", type=int, default=None)
149
  parser.add_argument("--max-queries", type=int, default=None)
150
  parser.add_argument("--rebuild", action="store_true")
151
+ parser.add_argument("--use-hybrid", action="store_true")
152
+ parser.add_argument("--use-reranker", action="store_true")
153
+ parser.add_argument("--reranker-model", default="cross-encoder/ms-marco-MiniLM-L-6-v2")
154
+ parser.add_argument("--reranker-candidates", type=int, default=25)
155
  parser.add_argument("--fail-fast", action="store_true")
156
  parser.add_argument("--output-name", default=None, help="Suite report filename stem under eval/reports.")
157
  return parser.parse_args()
market_data/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .analytics import (
2
+ black_scholes_greeks,
3
+ classify_volatility_regime,
4
+ rank_current_iv_against_rv,
5
+ realized_volatility,
6
+ summarize_option_chain,
7
+ )
8
+ from .providers import (
9
+ get_current_quote,
10
+ get_option_chain,
11
+ get_price_history,
12
+ list_option_expirations,
13
+ )
14
+ from .schemas import OptionChain, OptionContract, UnderlyingQuote, VolSnapshot
15
+
16
+ __all__ = [
17
+ "black_scholes_greeks",
18
+ "classify_volatility_regime",
19
+ "get_current_quote",
20
+ "get_option_chain",
21
+ "get_price_history",
22
+ "list_option_expirations",
23
+ "OptionChain",
24
+ "OptionContract",
25
+ "realized_volatility",
26
+ "rank_current_iv_against_rv",
27
+ "summarize_option_chain",
28
+ "UnderlyingQuote",
29
+ "VolSnapshot",
30
+ ]
market_data/analytics.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from statistics import NormalDist
5
+
6
+ import pandas as pd
7
+
8
+ from .schemas import OptionChain
9
+
10
+
11
+ NORMAL = NormalDist()
12
+
13
+
14
def realized_volatility(
    prices: pd.Series,
    windows: tuple[int, ...] = (5, 10, 20, 30, 60),
    trading_days: int = 252,
) -> dict[str, float | None]:
    """Compute annualized realized volatility over trailing windows.

    Simple returns (``pct_change``) are taken from the non-null prices. For
    each window the sample standard deviation (``ddof=1``) of the last
    ``window`` returns is scaled by ``sqrt(trading_days)``.

    Args:
        prices: Price series; missing values are dropped before use.
        windows: Trailing window lengths, in number of returns.
        trading_days: Annualization factor.

    Returns:
        Mapping from ``"<window>d"`` to the annualized volatility, or ``None``
        when the value cannot be computed: fewer returns than the window, a
        window shorter than 2, or a non-finite result (for example returns
        off a zero price).
    """
    close = prices.dropna().astype(float)
    returns = close.pct_change().dropna()
    annualization = math.sqrt(trading_days)
    output: dict[str, float | None] = {}

    for window in windows:
        key = f"{window}d"
        # A sample standard deviation needs at least two observations;
        # without this guard a 1-return window yields NaN, not None.
        if window < 2 or len(returns) < window:
            output[key] = None
            continue
        value = float(returns.tail(window).std(ddof=1) * annualization)
        # NaN/inf would also break strict JSON serialization downstream.
        output[key] = value if math.isfinite(value) else None

    return output
31
+
32
+
33
def _norm_pdf(value: float) -> float:
    """Return the standard normal probability density at ``value``."""
    exponent = -0.5 * value * value
    normalizer = math.sqrt(2 * math.pi)
    return math.exp(exponent) / normalizer
35
+
36
+
37
def black_scholes_greeks(
    spot: float,
    strike: float,
    time_to_expiry: float,
    volatility: float,
    risk_free_rate: float = 0.0,
    dividend_yield: float = 0.0,
    option_type: str = "call",
) -> dict[str, float | None]:
    """Compute Black-Scholes Greeks for a European option with a dividend yield.

    Args:
        spot: Underlying price; must be positive.
        strike: Strike price; must be positive.
        time_to_expiry: Time to expiry; must be positive. The theta scaling
            below assumes this is expressed in years.
        volatility: Volatility used in d1/d2; must be positive.
        risk_free_rate: Continuously compounded rate.
        dividend_yield: Continuous dividend yield.
        option_type: ``"put"`` (case-insensitive) selects put formulas; any
            other value is treated as a call.

    Returns:
        Dict with ``delta``, ``gamma``, ``vega``, ``theta`` and ``rho``.
        Theta is divided by 365; vega and rho are divided by 100 (i.e. per
        one-point change in volatility / rate). All values are ``None`` when
        any of spot, strike, time_to_expiry or volatility is not positive.
    """
    if spot <= 0 or strike <= 0 or time_to_expiry <= 0 or volatility <= 0:
        return {
            "delta": None,
            "gamma": None,
            "vega": None,
            "theta": None,
            "rho": None,
        }

    sqrt_t = math.sqrt(time_to_expiry)
    d1 = (
        math.log(spot / strike)
        + (risk_free_rate - dividend_yield + 0.5 * volatility * volatility) * time_to_expiry
    ) / (volatility * sqrt_t)
    d2 = d1 - volatility * sqrt_t
    discount_dividend = math.exp(-dividend_yield * time_to_expiry)
    discount_rate = math.exp(-risk_free_rate * time_to_expiry)
    pdf_d1 = _norm_pdf(d1)
    option_type = option_type.lower()

    # Volatility-driven time decay, common to calls and puts.
    decay = -spot * discount_dividend * pdf_d1 * volatility / (2 * sqrt_t)

    if option_type == "put":
        delta = discount_dividend * (NORMAL.cdf(d1) - 1)
        # Put theta: decay + r*K*e^{-rT}*N(-d2) - q*S*e^{-qT}*N(-d1).
        # The carry terms previously had the opposite signs.
        theta = (
            decay
            + risk_free_rate * strike * discount_rate * NORMAL.cdf(-d2)
            - dividend_yield * spot * discount_dividend * NORMAL.cdf(-d1)
        ) / 365
        rho = -strike * time_to_expiry * discount_rate * NORMAL.cdf(-d2) / 100
    else:
        delta = discount_dividend * NORMAL.cdf(d1)
        # Call theta: decay - r*K*e^{-rT}*N(d2) + q*S*e^{-qT}*N(d1).
        theta = (
            decay
            - risk_free_rate * strike * discount_rate * NORMAL.cdf(d2)
            + dividend_yield * spot * discount_dividend * NORMAL.cdf(d1)
        ) / 365
        rho = strike * time_to_expiry * discount_rate * NORMAL.cdf(d2) / 100

    gamma = discount_dividend * pdf_d1 / (spot * volatility * sqrt_t)
    vega = spot * discount_dividend * pdf_d1 * sqrt_t / 100

    return {
        "delta": float(delta),
        "gamma": float(gamma),
        "vega": float(vega),
        "theta": float(theta),
        "rho": float(rho),
    }
92
+
93
+
94
def nearest_atm_iv(chain: OptionChain) -> float | None:
    """Return the implied volatility of the contract struck closest to spot.

    Calls are scanned before puts, and only contracts with a positive
    implied volatility are considered. On equal distance the first contract
    encountered wins. Returns ``None`` when the chain has no underlying
    price or no usable contract.
    """
    spot = chain.underlying_price
    if spot is None:
        return None

    best_iv = None
    best_distance = None
    for contract in chain.calls + chain.puts:
        iv = contract.implied_volatility
        if iv is None or not iv > 0:
            continue
        distance = abs(contract.strike - spot)
        # Strict comparison keeps the earliest contract on ties.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            best_iv = iv
    return best_iv
107
+
108
+
109
def simple_skew(chain: OptionChain) -> float | None:
    """Return put IV minus call IV for the nearest out-of-the-money pair.

    Uses the highest-strike put below spot and the lowest-strike call above
    spot, ignoring contracts whose implied volatility is missing or zero.
    Returns ``None`` when there is no underlying price or either side has no
    such contract.
    """
    spot = chain.underlying_price
    if spot is None:
        return None

    nearest_put = max(
        (leg for leg in chain.puts if leg.strike < spot and leg.implied_volatility),
        key=lambda leg: leg.strike,
        default=None,
    )
    nearest_call = min(
        (leg for leg in chain.calls if leg.strike > spot and leg.implied_volatility),
        key=lambda leg: leg.strike,
        default=None,
    )
    if nearest_put is None or nearest_call is None:
        return None
    return float(nearest_put.implied_volatility - nearest_call.implied_volatility)
127
+
128
+
129
def summarize_option_chain(chain: OptionChain, realized_vol_20d: float | None = None) -> dict:
    """Summarize one expiration: ATM IV, IV-RV spread, skew and contract counts.

    ``iv_rv_spread_20d`` is ATM IV minus ``realized_vol_20d`` and is ``None``
    when either input is unavailable.
    """
    atm_iv = nearest_atm_iv(chain)

    spread = None
    if atm_iv is not None and realized_vol_20d is not None:
        spread = float(atm_iv - realized_vol_20d)

    summary = {
        "symbol": chain.symbol,
        "expiration": chain.expiration,
        "underlying_price": chain.underlying_price,
        "atm_iv": atm_iv,
        "iv_rv_spread_20d": spread,
        "skew_put_minus_call": simple_skew(chain),
        "call_count": len(chain.calls),
        "put_count": len(chain.puts),
    }
    return summary
145
+
146
+
147
def rank_current_iv_against_rv(
    current_iv: float | None,
    realized_vols: dict[str, float | None],
) -> float | None:
    """Locate ``current_iv`` within the range of realized volatilities.

    Returns the position of ``current_iv`` between the lowest and highest
    non-null realized volatility, clamped to ``[0.0, 1.0]``. Returns ``None``
    when ``current_iv`` is missing, fewer than two realized values are
    available, or the realized values span no range.
    """
    if current_iv is None:
        return None

    observed = [value for value in realized_vols.values() if value is not None]
    if len(observed) < 2:
        return None

    floor, ceiling = min(observed), max(observed)
    if ceiling <= floor:
        return None

    position = (current_iv - floor) / (ceiling - floor)
    capped = position if position < 1.0 else 1.0
    return capped if capped > 0.0 else 0.0
161
+
162
+
163
def classify_volatility_regime(
    current_iv: float | None,
    realized_vol_20d: float | None,
    term_structure_slope: float | None,
    skew: float | None,
) -> dict:
    """Label the volatility regime from the IV-minus-RV spread.

    A spread above 0.08 is a high implied premium, below -0.04 an implied
    discount, otherwise balanced. Term-structure slope (beyond +/-0.04) and
    skew (absolute value above 0.05) only add explanatory notes. Confidence
    is ``"medium"`` once at least two notes were collected, else ``"low"``.
    When either volatility input is missing, an ``"unknown"`` regime is
    returned.
    """
    if current_iv is None or realized_vol_20d is None:
        return {
            "regime": "unknown",
            "vol_signal": "insufficient_iv_or_rv",
            "confidence": "low",
            "notes": ["Need both option implied volatility and realized volatility."],
        }

    premium = current_iv - realized_vol_20d
    if premium > 0.08:
        regime, vol_signal, headline = (
            "high_implied_vol_premium",
            "short_vol_candidate",
            "Current ATM IV is materially above 20D realized volatility.",
        )
    elif premium < -0.04:
        regime, vol_signal, headline = (
            "low_implied_vol_discount",
            "long_vol_candidate",
            "Current ATM IV is below 20D realized volatility.",
        )
    else:
        regime, vol_signal, headline = (
            "balanced_iv_vs_rv",
            "neutral_vol",
            "Current ATM IV is close to 20D realized volatility.",
        )
    notes = [headline]

    if term_structure_slope is not None:
        if term_structure_slope > 0.04:
            notes.append("Term structure is upward sloping.")
        elif term_structure_slope < -0.04:
            notes.append("Term structure is inverted or front-loaded.")

    if skew is not None and abs(skew) > 0.05:
        notes.append("Put-call skew is elevated in the sampled expiration.")

    return {
        "regime": regime,
        "vol_signal": vol_signal,
        "confidence": "medium" if len(notes) >= 2 else "low",
        "notes": notes,
    }
market_data/providers.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, datetime
4
+ from typing import Any
5
+
6
+ import pandas as pd
7
+ import yfinance as yf
8
+
9
+ from .schemas import OptionChain, OptionContract, UnderlyingQuote
10
+
11
+
12
def none_if_nan(value: Any) -> Any:
    """Return ``None`` for values pandas treats as missing, else the value unchanged."""
    return None if pd.isna(value) else value
16
+
17
+
18
def to_float(value: Any) -> float | None:
    """Convert ``value`` to ``float``, mapping missing values to ``None``."""
    cleaned = none_if_nan(value)
    if cleaned is None:
        return None
    return float(cleaned)
21
+
22
+
23
def to_int(value: Any) -> int | None:
    """Convert ``value`` to ``int``, mapping missing values to ``None``."""
    cleaned = none_if_nan(value)
    if cleaned is None:
        return None
    return int(cleaned)
26
+
27
+
28
def get_price_history(
    symbol: str,
    period: str = "1y",
    interval: str = "1d",
    start: str | None = None,
    end: str | None = None,
) -> pd.DataFrame:
    """Fetch price history for ``symbol`` from Yahoo Finance.

    The symbol is stripped and upper-cased; the remaining arguments are
    forwarded unchanged to ``yfinance.Ticker.history``.
    """
    normalized = symbol.strip().upper()
    query = {"period": period, "interval": interval, "start": start, "end": end}
    return yf.Ticker(normalized).history(**query)
37
+
38
+
39
def get_current_quote(symbol: str) -> UnderlyingQuote:
    """Return the latest quote for ``symbol``.

    The most recent 1-minute bar of today's history is preferred
    (``data_type="intraday_1m"``). When no intraday data is returned, the
    quote is assembled from ``Ticker.info`` instead
    (``data_type="cached_info"``), using the first truthy value among
    regularMarketPrice, previousClose, ask and bid as the price.

    Args:
        symbol: Ticker symbol; stripped and upper-cased before use.
    """
    # Local import: the module-level import only brings in ``date``/``datetime``.
    from datetime import timezone

    symbol = symbol.strip().upper()
    ticker = yf.Ticker(symbol)
    data = ticker.history(period="1d", interval="1m")

    if not data.empty:
        latest_row = data.iloc[-1]
        # Use the NaN-aware helpers: a bar with missing values would otherwise
        # raise on int(NaN) for volume or leak NaN into the price fields.
        return UnderlyingQuote(
            symbol=symbol,
            current_price=to_float(latest_row["Close"]),
            open=to_float(latest_row["Open"]),
            high=to_float(latest_row["High"]),
            low=to_float(latest_row["Low"]),
            volume=to_int(latest_row["Volume"]),
            timestamp=str(data.index[-1]),
            data_type="intraday_1m",
        )

    info = ticker.info
    current_price = (
        info.get("regularMarketPrice")
        or info.get("previousClose")
        or info.get("ask")
        or info.get("bid")
    )
    # datetime.utcnow() is deprecated since Python 3.12; strip tzinfo so the
    # emitted string keeps the original naive-UTC format.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    return UnderlyingQuote(
        symbol=symbol,
        current_price=float(current_price) if current_price else None,
        open=to_float(info.get("regularMarketOpen") or info.get("open")),
        high=to_float(info.get("regularMarketDayHigh") or info.get("dayHigh")),
        low=to_float(info.get("regularMarketDayLow") or info.get("dayLow")),
        volume=to_int(info.get("regularMarketVolume") or info.get("volume")),
        timestamp=utc_now.isoformat(timespec="seconds"),
        data_type="cached_info",
        short_name=info.get("shortName", ""),
    )
75
+
76
+
77
def list_option_expirations(symbol: str) -> list[str]:
    """Return the option expirations Yahoo Finance lists for ``symbol`` (empty list if none)."""
    expirations = yf.Ticker(symbol.strip().upper()).options
    if not expirations:
        return []
    return list(expirations)
80
+
81
+
82
def liquidity_warnings(row: pd.Series) -> list[str]:
    """Collect liquidity warning flags for one option-chain row.

    Flags, in order: ``missing_or_zero_bid_ask`` (either side missing or not
    positive), otherwise ``wide_bid_ask_spread`` (spread above 25% of the
    ask); then ``zero_volume`` and ``zero_open_interest``. Missing volume or
    open interest counts as zero.
    """
    flags = []
    bid = to_float(row.get("bid"))
    ask = to_float(row.get("ask"))
    volume = to_int(row.get("volume")) or 0
    open_interest = to_int(row.get("openInterest")) or 0

    two_sided = not (bid is None or ask is None or bid <= 0 or ask <= 0)
    if not two_sided:
        flags.append("missing_or_zero_bid_ask")
    elif ask > 0 and (ask - bid) / ask > 0.25:
        flags.append("wide_bid_ask_spread")

    for amount, label in ((volume, "zero_volume"), (open_interest, "zero_open_interest")):
        if amount <= 0:
            flags.append(label)
    return flags
98
+
99
+
100
def row_to_contract(row: pd.Series, option_type: str, expiration: str) -> OptionContract:
    """Convert one option-chain row into an ``OptionContract``.

    ``mid`` is the bid/ask average and is only set when both sides are
    present and positive. ``days_to_expiration`` is counted from today's
    local date and never goes below zero. ``expiration`` must be an ISO
    date string (it is parsed with ``date.fromisoformat``).
    """
    bid = to_float(row.get("bid"))
    ask = to_float(row.get("ask"))
    has_two_sided_quote = bid is not None and ask is not None and bid > 0 and ask > 0
    mid = (bid + ask) / 2 if has_two_sided_quote else None

    remaining = date.fromisoformat(expiration) - date.today()
    days_to_expiration = max(remaining.days, 0)

    fields = {
        "contract_symbol": str(row.get("contractSymbol", "")),
        "option_type": option_type,
        "expiration": expiration,
        "strike": float(row.get("strike")),
        "bid": bid,
        "ask": ask,
        "mid": mid,
        "last_price": to_float(row.get("lastPrice")),
        "volume": to_int(row.get("volume")),
        "open_interest": to_int(row.get("openInterest")),
        "implied_volatility": to_float(row.get("impliedVolatility")),
        "in_the_money": bool(row.get("inTheMoney", False)),
        "days_to_expiration": days_to_expiration,
        "liquidity_warnings": liquidity_warnings(row),
    }
    return OptionContract(**fields)
122
+
123
+
124
def get_option_chain(symbol: str, expiration: str | None = None) -> OptionChain:
    """Load the option chain for ``symbol`` at one expiration.

    Args:
        symbol: Ticker symbol; stripped and upper-cased before use.
        expiration: Expiration to load; when empty, the first expiration
            listed for the ticker is used.

    Raises:
        ValueError: if the ticker lists no expirations, or the requested
            expiration is not among them.
    """
    symbol = symbol.strip().upper()
    ticker = yf.Ticker(symbol)

    available = list(ticker.options or [])
    if not available:
        raise ValueError(f"No option expirations found for {symbol}.")

    expiration = expiration if expiration else available[0]
    if expiration not in available:
        raise ValueError(f"Expiration {expiration} is not available for {symbol}.")

    raw_chain = ticker.option_chain(expiration)
    quote = get_current_quote(symbol)

    def contracts(frame, kind):
        # One OptionContract per row of the chain table.
        return [row_to_contract(row, kind, expiration) for _, row in frame.iterrows()]

    calls = contracts(raw_chain.calls, "call")
    puts = contracts(raw_chain.puts, "put")
    return OptionChain(
        symbol=symbol,
        expiration=expiration,
        underlying_price=quote.current_price,
        calls=calls,
        puts=puts,
    )
market_data/schemas.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import asdict, dataclass
4
+ from typing import Any
5
+
6
+
7
@dataclass
class UnderlyingQuote:
    """Quote snapshot for an underlying asset."""

    symbol: str
    current_price: float | None
    open: float | None
    high: float | None
    low: float | None
    volume: int | None
    timestamp: str
    data_type: str
    # Optional display name; empty when not provided.
    short_name: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the quote as a plain dict keyed by field name."""
        snapshot = asdict(self)
        return snapshot
21
+
22
+
23
@dataclass
class OptionContract:
    """A single option contract row with quote and liquidity fields."""

    contract_symbol: str
    option_type: str
    expiration: str
    strike: float
    bid: float | None
    ask: float | None
    mid: float | None
    last_price: float | None
    volume: int | None
    open_interest: int | None
    implied_volatility: float | None
    in_the_money: bool
    days_to_expiration: int
    liquidity_warnings: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the contract as a plain dict (nested containers are copied)."""
        record = asdict(self)
        return record
42
+
43
+
44
@dataclass
class OptionChain:
    """Calls and puts for one symbol at one expiration."""

    symbol: str
    expiration: str
    underlying_price: float | None
    calls: list[OptionContract]
    puts: list[OptionContract]

    def to_dict(self) -> dict[str, Any]:
        """Return the chain as a plain dict, serializing each contract via its ``to_dict``."""
        payload: dict[str, Any] = {
            "symbol": self.symbol,
            "expiration": self.expiration,
            "underlying_price": self.underlying_price,
        }
        payload["calls"] = [leg.to_dict() for leg in self.calls]
        payload["puts"] = [leg.to_dict() for leg in self.puts]
        return payload
60
+
61
+
62
@dataclass
class VolSnapshot:
    """Volatility summary for one symbol, with per-expiration mappings."""

    symbol: str
    current_price: float | None
    realized_volatility: dict[str, float | None]
    atm_iv_by_expiration: dict[str, float | None]
    iv_rv_spread_by_expiration: dict[str, float | None]
    term_structure_slope: float | None
    skew_by_expiration: dict[str, float | None]

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a plain dict (nested mappings are copied)."""
        snapshot = asdict(self)
        return snapshot
market_data/tools.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from smolagents import tool
6
+
7
+ from .analytics import (
8
+ black_scholes_greeks,
9
+ classify_volatility_regime,
10
+ rank_current_iv_against_rv,
11
+ realized_volatility,
12
+ summarize_option_chain,
13
+ )
14
+ from .providers import get_current_quote, get_option_chain, get_price_history, list_option_expirations
15
+ from .schemas import VolSnapshot
16
+
17
+
18
def json_dumps(payload: object) -> str:
    """Serialize *payload* to pretty-printed JSON for tool output.

    Non-ASCII text is emitted as-is, output is indented by two spaces, and
    any value the encoder cannot handle natively is converted with ``str``.
    """
    return json.dumps(payload, ensure_ascii=False, indent=2, default=str)
20
+
21
+
22
@tool
def query_market_asset(symbol: str) -> str:
    """Query the current price and intraday quote data for an asset.

    Args:
        symbol: Yahoo Finance ticker, e.g. AAPL, SPY, ^VIX, BTC-USD, EURUSD=X.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        return json_dumps({"status": "success", **get_current_quote(symbol).to_dict()})
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
33
+
34
+
35
@tool
def query_price_history(symbol: str, period: str = "1y", interval: str = "1d") -> str:
    """Query historical OHLCV prices for an asset.

    Args:
        symbol: Yahoo Finance ticker.
        period: Yahoo Finance period such as 1mo, 6mo, 1y, 5y.
        interval: Yahoo Finance interval such as 1d, 1h, 15m.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        history = get_price_history(symbol, period=period, interval=interval)
        # Only the most recent 20 rows are serialized, whatever period was requested.
        records = history.tail(20).reset_index().to_dict(orient="records")
        return json_dumps(
            {
                "status": "success",
                "symbol": symbol.upper(),
                "period": period,
                "interval": interval,
                "rows_returned": len(records),
                "latest_rows": records,
            }
        )
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
59
+
60
+
61
@tool
def query_realized_volatility(symbol: str, period: str = "1y") -> str:
    """Calculate realized volatility windows from historical close prices.

    Args:
        symbol: Yahoo Finance ticker.
        period: Yahoo Finance history period.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        # Daily bars are always used; only the look-back period is configurable.
        history = get_price_history(symbol, period=period, interval="1d")
        rv = realized_volatility(history["Close"])
        return json_dumps({"status": "success", "symbol": symbol.upper(), "realized_volatility": rv})
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
75
+
76
+
77
@tool
def query_option_expirations(symbol: str) -> str:
    """List available option expiration dates for an underlying.

    Args:
        symbol: Yahoo Finance ticker.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        expirations = list_option_expirations(symbol)
        return json_dumps({"status": "success", "symbol": symbol.upper(), "expirations": expirations})
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
89
+
90
+
91
@tool
def query_option_chain(symbol: str, expiration: str = "") -> str:
    """Query an option chain with liquidity warnings and implied volatility.

    Args:
        symbol: Yahoo Finance ticker.
        expiration: Expiration date in YYYY-MM-DD. Leave empty to use the nearest expiration.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        # An empty string is passed on as None so the provider picks the expiration.
        chain = get_option_chain(symbol, expiration or None)
        # The summary is computed on the full chain, before truncation.
        summary = summarize_option_chain(chain)
        payload = chain.to_dict()
        payload["summary"] = summary
        # Cap each side at the first 80 contracts to bound the output size.
        payload["calls"] = payload["calls"][:80]
        payload["puts"] = payload["puts"][:80]
        return json_dumps({"status": "success", **payload})
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
109
+
110
+
111
@tool
def query_volatility_snapshot(symbol: str, max_expirations: int = 4, history_period: str = "1y") -> str:
    """Summarize realized volatility, ATM IV, IV-RV spread, skew, and term structure.

    Args:
        symbol: Yahoo Finance ticker.
        max_expirations: Number of expirations to sample from the option chain.
        history_period: Yahoo Finance history period for realized volatility.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        symbol = symbol.strip().upper()
        quote = get_current_quote(symbol)
        history = get_price_history(symbol, period=history_period, interval="1d")
        rv = realized_volatility(history["Close"])
        rv_20d = rv.get("20d")
        # Clamp and coerce the count: a negative value would make the slice mean
        # "all but the last N", and a float would raise inside the slice.
        expiration_limit = max(0, int(max_expirations))
        expirations = list_option_expirations(symbol)[:expiration_limit]

        atm_iv_by_expiration = {}
        iv_rv_spread_by_expiration = {}
        skew_by_expiration = {}
        for expiration in expirations:
            chain = get_option_chain(symbol, expiration)
            summary = summarize_option_chain(chain, realized_vol_20d=rv_20d)
            atm_iv_by_expiration[expiration] = summary["atm_iv"]
            iv_rv_spread_by_expiration[expiration] = summary["iv_rv_spread_20d"]
            skew_by_expiration[expiration] = summary["skew_put_minus_call"]

        # Dicts keep insertion order, so these lists follow the sampled expirations.
        valid_term_ivs = [
            value
            for value in atm_iv_by_expiration.values()
            if value is not None
        ]
        # "Front" means the first sampled expiration that has a value; IV and skew
        # are filtered independently, so they can come from different expirations.
        current_atm_iv = valid_term_ivs[0] if valid_term_ivs else None
        sampled_skews = [value for value in skew_by_expiration.values() if value is not None]
        front_skew = sampled_skews[0] if sampled_skews else None
        # Slope is last-minus-first of the available ATM IVs; needs at least two points.
        term_structure_slope = (
            float(valid_term_ivs[-1] - valid_term_ivs[0])
            if len(valid_term_ivs) >= 2
            else None
        )
        regime = classify_volatility_regime(
            current_iv=current_atm_iv,
            realized_vol_20d=rv_20d,
            term_structure_slope=term_structure_slope,
            skew=front_skew,
        )
        snapshot = VolSnapshot(
            symbol=symbol,
            current_price=quote.current_price,
            realized_volatility=rv,
            atm_iv_by_expiration=atm_iv_by_expiration,
            iv_rv_spread_by_expiration=iv_rv_spread_by_expiration,
            term_structure_slope=term_structure_slope,
            skew_by_expiration=skew_by_expiration,
        )
        return json_dumps(
            {
                "status": "success",
                **snapshot.to_dict(),
                "front_atm_iv": current_atm_iv,
                "front_skew": front_skew,
                "iv_vs_rv_rank_proxy": rank_current_iv_against_rv(current_atm_iv, rv),
                "volatility_regime": regime,
                "limitations": [
                    "IV rank/percentile is a proxy based on current ATM IV versus realized-volatility windows.",
                    "True historical IV rank requires historical option-chain data from a richer provider.",
                ],
            }
        )
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
182
+
183
+
184
@tool
def calculate_option_greeks(
    spot: float,
    strike: float,
    time_to_expiry: float,
    volatility: float,
    option_type: str = "call",
    risk_free_rate: float = 0.0,
    dividend_yield: float = 0.0,
) -> str:
    """Calculate Black-Scholes-Merton Greeks for a single option.

    Args:
        spot: Current underlying price.
        strike: Option strike.
        time_to_expiry: Time to expiration in years.
        volatility: Annualized implied volatility as a decimal.
        option_type: call or put.
        risk_free_rate: Annualized risk-free rate as a decimal.
        dividend_yield: Annualized dividend yield as a decimal.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        greeks = black_scholes_greeks(
            spot=spot,
            strike=strike,
            time_to_expiry=time_to_expiry,
            volatility=volatility,
            risk_free_rate=risk_free_rate,
            dividend_yield=dividend_yield,
            option_type=option_type,
        )
        return json_dumps({"status": "success", "greeks": greeks})
    except Exception as exc:
        # Same error envelope as the other market-data tools, so invalid inputs
        # come back as JSON instead of an exception.
        return json_dumps({"status": "error", "message": str(exc)})
optimizer/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .vol_optimizer import optimize_volatility_signal
2
+
3
+ __all__ = ["optimize_volatility_signal"]
optimizer/tools.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from smolagents import tool
6
+
7
+ from market_data.providers import get_price_history
8
+
9
+ from .vol_optimizer import optimize_volatility_signal
10
+
11
+
12
@tool
def optimize_volatility_signal_parameters(
    symbol: str,
    signal: str = "long_vol",
    period: str = "3y",
) -> str:
    """Scan simple realized-volatility signal parameters and compare best vs baseline.

    Args:
        symbol: Yahoo Finance ticker.
        signal: long_vol or short_vol.
        period: Yahoo Finance history period.
    """
    # The docstring doubles as the tool description, so keep its wording stable.
    try:
        history = get_price_history(symbol, period=period, interval="1d")
        result = optimize_volatility_signal(history["Close"], signal=signal)
        # default=str matches the market-data tools: a value the encoder cannot
        # handle natively is stringified instead of failing the whole call.
        return json.dumps(
            {"status": "success", "symbol": symbol.upper(), **result},
            ensure_ascii=False,
            indent=2,
            default=str,
        )
    except Exception as exc:
        # Tool boundary: report the failure as JSON instead of raising.
        return json.dumps(
            {"status": "error", "symbol": symbol, "message": str(exc)},
            ensure_ascii=False,
            indent=2,
        )
optimizer/vol_optimizer.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from backtest.vol_backtest import backtest_realized_vol_signal
6
+
7
+
8
+ def optimize_volatility_signal(
9
+ prices: pd.Series,
10
+ signal: str = "long_vol",
11
+ short_windows: tuple[int, ...] = (5, 10, 15),
12
+ long_windows: tuple[int, ...] = (20, 30, 60),
13
+ holding_days_options: tuple[int, ...] = (3, 5, 10),
14
+ ) -> dict:
15
+ runs = []
16
+ for short_window in short_windows:
17
+ for long_window in long_windows:
18
+ if short_window >= long_window:
19
+ continue
20
+ for holding_days in holding_days_options:
21
+ result = backtest_realized_vol_signal(
22
+ prices=prices,
23
+ short_window=short_window,
24
+ long_window=long_window,
25
+ holding_days=holding_days,
26
+ signal=signal,
27
+ )
28
+ runs.append(
29
+ {
30
+ "short_window": short_window,
31
+ "long_window": long_window,
32
+ "holding_days": holding_days,
33
+ "trade_count": result["trade_count"],
34
+ "win_rate": result["win_rate"],
35
+ "total_return_proxy": result["total_return_proxy"],
36
+ "max_drawdown_proxy": result["max_drawdown_proxy"],
37
+ "avg_trade_pnl_proxy": result["avg_trade_pnl_proxy"],
38
+ }
39
+ )
40
+
41
+ runs.sort(
42
+ key=lambda run: (
43
+ run["total_return_proxy"],
44
+ -abs(run["max_drawdown_proxy"]),
45
+ run["win_rate"],
46
+ ),
47
+ reverse=True,
48
+ )
49
+ best = runs[0] if runs else None
50
+ baseline = next(
51
+ (
52
+ run
53
+ for run in runs
54
+ if run["short_window"] == 10 and run["long_window"] == 30 and run["holding_days"] == 5
55
+ ),
56
+ runs[0] if runs else None,
57
+ )
58
+ return {
59
+ "signal": signal,
60
+ "best": best,
61
+ "baseline": baseline,
62
+ "top_runs": runs[:10],
63
+ "metrics_delta": (
64
+ {
65
+ "total_return_proxy_delta": best["total_return_proxy"] - baseline["total_return_proxy"],
66
+ "win_rate_delta": best["win_rate"] - baseline["win_rate"],
67
+ "max_drawdown_proxy_delta": best["max_drawdown_proxy"] - baseline["max_drawdown_proxy"],
68
+ }
69
+ if best and baseline
70
+ else None
71
+ ),
72
+ "anti_overfit_note": (
73
+ "This is an in-sample parameter scan. Use walk-forward or out-of-sample validation "
74
+ "before trusting optimized parameters."
75
+ ),
76
+ }
prompts.yaml CHANGED
@@ -9,6 +9,18 @@
9
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
10
  In the end you have to return a final answer using the `final_answer` tool.
11
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  Here are a few examples using notional tools:
13
  ---
14
  Task: "Generate an image of the oldest person in this document."
 
9
  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
10
  In the end you have to return a final answer using the `final_answer` tool.
11
 
12
+ You are also an options research agent focused on volatility trading. When the task concerns options, volatility, market data, strategy construction, or backtesting, follow these rules:
13
+ - Treat all outputs as research and education, not guaranteed investment advice.
14
+ - Prefer `query_knowledge` for stable options concepts, formulas, Greeks, volatility trading theory, and citations from local reference books.
15
+ - Use `web_search` and `visit_webpage` for recent market events, earnings dates, company announcements, macro events, exchange rules, and source verification.
16
+ - Use market data tools for current price, option chains, realized volatility, IV/RV spread, skew, term structure, and Greeks before proposing a strategy.
17
+ - For volatility strategies, state whether the idea is long vol, short vol, term-structure, skew, or event-vol driven.
18
+ - Every strategy discussion must include legs, expiration, strikes, net debit/credit, max loss, breakevens, major Greeks exposure, liquidity warnings, and event/IV-crush risk when relevant.
19
+ - Before presenting a final strategy, use payoff/backtest/optimization tools when sufficient data is available, and clearly label any proxy backtest limitations.
20
+ - Never present short premium strategies as low-risk. Explicitly mention tail risk, margin, assignment, liquidity, slippage, and gap risk.
21
+ - If required inputs are missing, ask for the missing symbol, outlook, time horizon, risk budget, or whether naked option selling is allowed.
22
+ - Final answers for options tasks should use this structure when applicable: market_context, volatility_view, strategy_candidates, selected_strategy, backtest_summary, risk_warnings, sources, limitations.
23
+
24
  Here are a few examples using notional tools:
25
  ---
26
  Task: "Generate an image of the oldest person in this document."
pyproject.toml CHANGED
@@ -18,6 +18,8 @@ dependencies = [
18
  "tokenizers>=0.22.0,<=0.23.0",
19
  "transformers<5",
20
  "pymupdf>=1.27.2.3",
 
 
21
  ]
22
 
23
  [build-system]
 
18
  "tokenizers>=0.22.0,<=0.23.0",
19
  "transformers<5",
20
  "pymupdf>=1.27.2.3",
21
+ "pandas>=2.0.0",
22
+ "yfinance>=0.2.0",
23
  ]
24
 
25
  [build-system]
quantconnect/README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QuantConnect / LEAN Option Backtest Template
2
+
3
+ 这个目录放真正期权历史回测的 QuantConnect/LEAN 模板。它和本地
4
+ `backtest_long_straddle_csv` 的定位不同:
5
+
6
+ - 本地 CSV 回测:适合用供应商导出的历史期权 bid/ask quote 快速验证策略。
7
+ - QuantConnect/LEAN:适合用 LEAN 数据源、撮合模型、手续费模型、组合持仓和保证金模型做更完整的回测。
8
+
9
+ ## 当前模板
10
+
11
+ `VolatilityStraddleAlgorithm.py` 实现一个 ATM long straddle 示例:
12
+
13
+ - 每隔 `entry_every_days` 天寻找目标 DTE 附近的期权到期日。
14
+ - 选择最接近 ATM 的 call 和 put。
15
+ - 用市价单买入 1 组 straddle。
16
+ - 持有 `holding_days` 后平仓。
17
+ - 使用 LEAN 的期权链、组合持仓、手续费/滑点/撮合模型能力。
18
+
19
+ ## 使用方式
20
+
21
+ 1. 在 QuantConnect 新建 Python algorithm。
22
+ 2. 将 `VolatilityStraddleAlgorithm.py` 内容复制到 `main.py`。
23
+ 3. 根据标的、日期、DTE、holding period 和资金规模修改参数。
24
+ 4. 运行回测,导出 orders/trades/equity 后可再交给 agent 分析。
25
+
26
+ ## 注意
27
+
28
+ 真实期权回测必须有历史期权链或历史期权报价。`yfinance` 只能查当前/近期期权链,不能可靠提供历史期权链,所以不能单独支撑严肃的历史期权策略回测。
quantconnect/VolatilityStraddleAlgorithm.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from AlgorithmImports import *
2
+
3
+
4
class VolatilityStraddleAlgorithm(QCAlgorithm):
    """ATM long straddle template for real option backtests in QuantConnect/LEAN."""

    def Initialize(self):
        """Set the backtest window, cash, strategy parameters and subscriptions."""
        self.SetStartDate(2022, 1, 1)
        self.SetEndDate(2024, 1, 1)
        self.SetCash(100000)

        self.ticker = "SPY"
        self.target_dte = 30
        self.holding_days = 5
        self.entry_every_days = 5
        self.contract_quantity = 1

        equity = self.AddEquity(self.ticker, Resolution.Minute)
        option = self.AddOption(self.ticker, Resolution.Minute)
        option.SetFilter(self.OptionFilter)

        self.underlying = equity.Symbol
        self.option_symbol = option.Symbol
        self.next_entry_time = self.StartDate
        # One dict per open straddle: entry/exit times, leg symbols, quantity.
        self.open_groups = []

    def OptionFilter(self, universe):
        """Keep weeklys, strikes -10..+10 and expirations within 10 days of the target DTE."""
        min_dte = max(1, self.target_dte - 10)
        max_dte = self.target_dte + 10
        return universe.IncludeWeeklys().Strikes(-10, 10).Expiration(min_dte, max_dte)

    def OnData(self, slice):
        """Close due groups, then open a new straddle once the entry time is reached."""
        self.CloseExpiredHoldingGroups()

        if self.Time < self.next_entry_time:
            return

        chain = slice.OptionChains.get(self.option_symbol)
        if chain is None:
            return

        today = self.Time.date()
        contracts = [contract for contract in chain if contract.Expiry.date() > today]
        if not contracts:
            return

        spot = self.Securities[self.underlying].Price
        # A non-positive price would make "nearest strike" pick the lowest
        # strike in the chain, so skip this bar.
        if spot <= 0:
            return

        # Pick the expiry whose days-to-expiration is closest to the target.
        expiry = min(
            contracts,
            key=lambda contract: abs((contract.Expiry.date() - today).days - self.target_dte),
        ).Expiry
        expiry_contracts = [contract for contract in contracts if contract.Expiry == expiry]

        calls_by_strike = {
            contract.Strike: contract
            for contract in expiry_contracts
            if contract.Right == OptionRight.Call
        }
        puts_by_strike = {
            contract.Strike: contract
            for contract in expiry_contracts
            if contract.Right == OptionRight.Put
        }
        # A straddle needs the call and the put at the SAME strike; choosing each
        # leg independently can produce a strangle when one side is missing.
        common_strikes = sorted(calls_by_strike.keys() & puts_by_strike.keys())
        if not common_strikes:
            return

        strike = min(common_strikes, key=lambda candidate: abs(candidate - spot))
        call = calls_by_strike[strike]
        put = puts_by_strike[strike]

        self.MarketOrder(call.Symbol, self.contract_quantity)
        self.MarketOrder(put.Symbol, self.contract_quantity)

        self.open_groups.append(
            {
                "entry_time": self.Time,
                "exit_time": self.Time + timedelta(days=self.holding_days),
                "symbols": [call.Symbol, put.Symbol],
                "quantity": self.contract_quantity,
            }
        )
        self.next_entry_time = self.Time + timedelta(days=self.entry_every_days)

        self.Debug(
            f"Opened ATM straddle {call.Symbol.Value}, {put.Symbol.Value}; "
            f"spot={spot:.2f}; expiry={expiry.date()}"
        )

    def CloseExpiredHoldingGroups(self):
        """Close every group whose holding period has elapsed."""
        remaining_groups = []
        for group in self.open_groups:
            if self.Time < group["exit_time"]:
                remaining_groups.append(group)
                continue

            for symbol in group["symbols"]:
                holding = self.Portfolio[symbol]
                if holding.Invested:
                    # Close only this group's size, so an overlapping group that
                    # holds the same contract keeps its position.
                    close_quantity = min(group.get("quantity", holding.Quantity), holding.Quantity)
                    self.MarketOrder(symbol, -close_quantity)
            self.Debug(f"Closed straddle group from {group['entry_time']}")

        self.open_groups = remaining_groups

    def OnEndOfAlgorithm(self):
        """Log the final portfolio value."""
        self.Debug(f"Final portfolio value: {self.Portfolio.TotalPortfolioValue:.2f}")
rag_eval_interview_notes.md ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG 评测模块构建总结
2
+
3
+ 本文档用于面试时说明:为什么需要 RAG 评测、如何设计 retrieval eval、如何接入公开数据集和自建 PDF 测试集,以及如何判断 RAG 优化是否真的有效。
4
+
5
+ ## 背景问题
6
+
7
+ 在优化 RAG 系统时,仅靠主观查看回答效果不稳定,也很难判断 PDF 解析、chunk 切分、embedding、reranker 或检索参数的改动是否真的带来提升。
8
+
9
+ 因此我先搭建了一个独立的 RAG retrieval evaluation 模块,用固定测试集和固定指标来做 before/after 对比。
10
+
11
+ 目标是:
12
+
13
+ - 能快速验证检索链路是否跑通。
14
+ - 能用公开 benchmark 做横向参考。
15
+ - 能用金融相关数据集贴近业务场景。
16
+ - 能用自己的期权 PDF 测试集验证 PDF 解析、公式抽取和章节切分是否有效。
17
+ - 每次改动后可以一条命令自动跑评测并生成报告。
18
+
19
+ ## 数据集接入顺序
20
+
21
+ 我按照由易到难、由通用到业务的顺序接入了 4 类测试集。
22
+
23
+ ### 1. BEIR/scifact
24
+
25
+ `scifact` 是 BEIR 中比较小的科学事实检索数据集,适合快速跑通 retrieval eval。
26
+
27
+ 接入它的目的不是追求业务贴合,而是验证:
28
+
29
+ - 数据下载和解析是否正常。
30
+ - corpus、query、qrels 能否正确对齐。
31
+ - 向量索引是否能构建。
32
+ - 检索指标是否能稳定输出。
33
+
34
+ ### 2. BEIR/fiqa
35
+
36
+ `fiqa` 是金融问答相关数据集,比 `scifact` 更贴近金融场景。
37
+
38
+ 接入它的目的:
39
+
40
+ - 验证金融语义检索能力。
41
+ - 检查 embedding 对金融术语、问答表达的适配情况。
42
+ - 作为后续期权 PDF 场景前的公开金融 benchmark。
43
+
44
+ ### 3. Open RAGBench
45
+
46
+ Open RAGBench 更接近长文档、PDF、报告类 RAG 场景。
47
+
48
+ 我选择了其中的 `pdf/arxiv` 子集,用来验证:
49
+
50
+ - 长文档解析后的检索效果。
51
+ - 多章节、多段落文档下的 chunk 检索表现。
52
+ - RAG 系统在 PDF-like 文档上的泛化能力。
53
+
54
+ ### 4. 自建期权 PDF 测试集
55
+
56
+ 最后补充自己的期权 PDF 测试集,因为公开 benchmark 无法完全覆盖当前项目中的业务难点。
57
+
58
+ 自建测试集重点覆盖:
59
+
60
+ - 期权定价概念。
61
+ - PDF 中的公式内容。
62
+ - 章节标题和上下文定位。
63
+ - 公式编号、页码、章节等 metadata 是否能帮助检索。
64
+
65
+ ## 模块设计
66
+
67
+ 评测模块放在 `eval/` 目录下,核心文件包括:
68
+
69
+ - `eval/rag_eval.py`:单数据集 retrieval eval 入口。
70
+ - `eval/run_eval_suite.py`:批量评测多个数据集的 suite runner。
71
+ - `eval/local_options_eval.jsonl`:自建期权 PDF 测试集。
72
+ - `eval/README.md`:调用示例和使用说明。
73
+
74
+ 整体流程如下:
75
+
76
+ ```text
77
+ 加载数据集
78
+ -> 构造 documents / queries / qrels
79
+ -> 构建 Chroma 向量索引
80
+ -> 执行 top-k retrieval
81
+ -> 按 doc_id 去重
82
+ -> 计算 hit@k / MRR / NDCG@K
83
+ -> 生成 JSON 和 Markdown 报告
84
+ ```
85
+
86
+ ## 为什么只先做 retrieval eval
87
+
88
+ RAG 的最终效果由两部分组成:
89
+
90
+ ```text
91
+ RAG = Retrieval + Generation
92
+ ```
93
+
94
+ 如果检索阶段没有找到正确上下文,后面的 LLM 生成很容易幻觉。因此我先评估 retrieval:
95
+
96
+ - 问题对应的正确文档有没有被找回来。
97
+ - 正确文档排在第几名。
98
+ - top-k 结果排序是否合理。
99
+
100
+ 这样可以先把问题定位在“检索是否正确”,再进一步评估生成答案。
101
+
102
+ ## 指标设计
103
+
104
+ ### Hit@K
105
+
106
+ `Hit@K` 表示前 K 个结果里是否包含正确文档。
107
+
108
+ 例如 `Hit@5 = 1`,表示正确文档出现在前 5 个检索结果中。
109
+
110
+ 它适合判断:
111
+
112
+ - 正确上下文有没有被召回。
113
+ - top-k 设大以后召回是否提升。
114
+
115
+ ### MRR
116
+
117
+ `MRR` 是 Mean Reciprocal Rank,关注第一个正确结果出现的位置。
118
+
119
+ 如果正确结果排第 1,得分是 `1`。
120
+ 如果正确结果排第 2,得分是 `1/2`。
121
+ 如果正确结果排第 5,得分是 `1/5`。
122
+
123
+ 它适合判断:
124
+
125
+ - 正确文档是否排得足够靠前。
126
+ - 检索排序质量是否提升。
127
+
128
+ ### NDCG@K
129
+
130
+ `NDCG@K` 衡量前 K 个结果的排序质量。
131
+
132
+ 计算方式是:
133
+
134
+ ```text
135
+ DCG@K = rel_1 / log2(2) + rel_2 / log2(3) + ... + rel_K / log2(K + 1)
136
+ NDCG@K = DCG@K / IDCG@K
137
+ ```
138
+
139
+ 其中 `rel_i = 1` 表示第 i 个结果相关,`rel_i = 0` 表示不相关。
140
+
141
+ NDCG 越接近 1,说明相关结果越靠前。
142
+
143
+ ## 关键实现细节
144
+
145
+ ### 1. 统一数据格式
146
+
147
+ 不同数据集格式不同,因此我统一抽象成:
148
+
149
+ ```python
150
+ documents = [
151
+ {
152
+ "doc_id": "...",
153
+ "title": "...",
154
+ "text": "...",
155
+ "metadata": {...}
156
+ }
157
+ ]
158
+
159
+ queries = [
160
+ {
161
+ "query_id": "...",
162
+ "question": "...",
163
+ "relevant_doc_ids": [...]
164
+ }
165
+ ]
166
+
167
+ qrels = {
168
+ "query_id": {"doc_id"}
169
+ }
170
+ ```
171
+
172
+ 这样后续索引构建和指标计算可以复用同一套逻辑。
173
+
174
+ ### 2. 小样本评测必须包含 gold 文档
175
+
176
+ 在做 smoke test 时,如果只取 corpus 前 N 篇文档,可能会出现 query 的正确文档不在测试 corpus 里,导致评测不公平。
177
+
178
+ 所以我在加载 BEIR 和 Open RAGBench 时,会先读取 qrels,确定当前 query 需要哪些 gold documents,再优先把这些文档纳入 corpus。
179
+
180
+ 这样小样本测试可以稳定评估检索能力,而不是被采样问题干扰。
181
+
182
+ ### 3. 检索结果按 doc_id 去重
183
+
184
+ 一个文档会被切成多个 chunk,检索时可能同一篇文档的多个 chunk 同时出现在 top-k 中。
185
+
186
+ 如果不去重,会导致:
187
+
188
+ - 指标被重复 chunk 影响。
189
+ - NDCG 可能异常偏高。
190
+ - top-k 实际上不是 top-k documents,而是 top-k chunks。
191
+
192
+ 因此评测时内部会多取一些 chunk,然后按 `doc_id` 去重,再计算 top-k 文档级指标。
193
+
194
+ ### 4. 支持 rebuild
195
+
196
+ 如果修改了:
197
+
198
+ - PDF 解析逻辑
199
+ - chunk 切分方式
200
+ - embedding 模型
201
+ - metadata 构造
202
+ - reranker 或检索参数
203
+
204
+ 必须使用 `--rebuild` 重建索引,否则会复用旧索引,评测结果不能代表最新代码。
205
+
206
+ ## 自动化评测脚本
207
+
208
+ 单数据集评测:
209
+
210
+ ```bash
211
+ uv --cache-dir .uv-cache run python -m eval.rag_eval \
212
+ --dataset local-options \
213
+ --max-queries 3 \
214
+ --top-k 5 \
215
+ --rebuild
216
+ ```
217
+
218
+ 批量评测:
219
+
220
+ ```bash
221
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite --rebuild
222
+ ```
223
+
224
+ 只跑指定数据集:
225
+
226
+ ```bash
227
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
228
+ --datasets local-options,beir/fiqa \
229
+ --top-k 5 \
230
+ --max-queries 20 \
231
+ --rebuild
232
+ ```
233
+
234
+ 对比不同 chunk 设置:
235
+
236
+ ```bash
237
+ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
238
+ --datasets local-options \
239
+ --chunk-size 384 \
240
+ --chunk-overlap 64 \
241
+ --output-name local_options_chunk384 \
242
+ --rebuild
243
+ ```
244
+
245
+ 报告会输出到:
246
+
247
+ ```text
248
+ eval/reports/
249
+ ```
250
+
251
+ 包括:
252
+
253
+ - 每个数据集的 JSON 报告。
254
+ - 每个数据集的 Markdown 报告。
255
+ - suite 级别的汇总报告。
256
+
257
+ ## 遇到的问题和解决方案
258
+
259
+ ### 问题 1:公开数据集需要联网下载
260
+
261
+ BEIR 和 Open RAGBench 都需要从公网下载数据。
262
+
263
+ 解决方法:
264
+
265
+ - 第一次运行时下载并缓存到 `eval/data/`。
266
+ - 后续运行直接复用本地数据。
267
+ - 数据和索引分开存放,便于排查问题。
268
+
269
+ ### 问题 2:Open RAGBench 实际目录结构和预期不一致
270
+
271
+ 最开始预设路径是 `official/pdf/arxiv`,但实际下载后路径是 `pdf/arxiv`。
272
+
273
+ 解决方法:
274
+
275
+ - loader 中兼容两种路径。
276
+ - 优先尝试 `pdf/arxiv`,不存在时再回退到 `official/pdf/arxiv`。
277
+
278
+ ### 问题 3:小样本采样会漏掉 gold document
279
+
280
+ 如果 `max_corpus_docs` 很小,直接截取 corpus 前 N 条可能不包含 qrels 中的正确文档。
281
+
282
+ 解决方法:
283
+
284
+ - 先根据 qrels 选择 query。
285
+ - 再把对应 gold documents 强制纳入 corpus。
286
+ - 最后补充其他文档作为干扰项。
287
+
288
+ ### 问题 4:chunk 重复导致指标异常
289
+
290
+ 同一篇文档的多个 chunk 可能同时命中,导致 NDCG 等指标不合理。
291
+
292
+ 解决方法:
293
+
294
+ - 检索时多取一些 chunk。
295
+ - 评估时按 `doc_id` 去重。
296
+ - 最终以 document-level top-k 计算指标。
297
+
298
+ ### 问题 5:不重建索引可能复用旧结果
299
+
300
+ 如果代码改了但没有 `--rebuild`,Chroma 可能复用旧索引。
301
+
302
+ 解决方法:
303
+
304
+ - 文档中明确说明改动后必须加 `--rebuild`。
305
+ - suite runner 支持统一传入 `--rebuild`。
306
+ - 用 `--output-name` 固定报告名,方便 before/after 对比。
307
+
308
+ ### 问题 6:RAG 只是独立模块,没有真正接入 Agent
309
+
310
+ 最开始 RAG 已经能单独查询知识库,但主 `CodeAgent` 的 tools 里没有注册知识库工具。这样在真实对话里,agent 实际只能查行情和时间,不能主动调用本地期权知识库。
311
+
312
+ 解决方法:
313
+
314
+ - 将 `QueryKnowledgeTool` 注册进主 agent。
315
+ - 优化 tool description,让模型知道它应该在期权概念、波动率、Greeks、策略、公式编号和书籍引用问题上调用该工具。
316
+ - 控制 tool 输出长度,只返回来源、页码、section、分数和截断后的片段,避免检索结果占满上下文。
317
+
318
+ 面试可以强调:
319
+
320
+ > RAG 不是只要能单独跑 query 就算完成,必须作为 agent 的一个可调用工具接入主工作流。否则用户问期权概念时,agent 不一定会查知识库,仍然可能凭模型参数记忆回答。
321
+
322
+ ### 问题 7:知识库目录和代码目录耦合
323
+
324
+ 早期知识库放在 `tools/knowledge_base` 下,代码、原始资料和 Chroma 数据库混在一起。随着知识库变大,这种结构不利于维护,也不利于后续把工具代码、数据和缓存分开管理。
325
+
326
+ 解决方法:
327
+
328
+ - 将知识库统一到项目根目录:
329
+
330
+ ```text
331
+ OptionAgent/knowledge_base/
332
+ raw/
333
+ chroma_db/
334
+ ```
335
+
336
+ - 工具代码中使用 `PROJECT_ROOT / "knowledge_base"` 作为主路径。
337
+ - 保留旧路径 fallback,避免迁移时旧数据立刻失效。
338
+
339
+ 面试可以强调:
340
+
341
+ > 我把知识库从工具目录迁到项目根目录,并保留 legacy fallback。这样既完成了结构治理,也避免了迁移时破坏已有索引和原始文档。
342
+
343
+ ### 问题 8:全量 rebuild 成本高
344
+
345
+ 早期的做法是:只要文档、解析方法或 embedding 模型发生变化,就全量重建索引。书籍变多后,这会浪费大量时间,也不方便频繁更新笔记。
346
+
347
+ 解决方法:
348
+
349
+ - 每个 chunk metadata 中保留:
350
+
351
+ ```text
352
+ source_file
353
+ file_hash
354
+ embedding_model
355
+ extraction_method
356
+ ```
357
+
358
+ - 启动时扫描当前 raw 文件,和 Chroma 中已有 metadata 对比:
359
+
360
+ ```text
361
+ 新增文件 -> 只入库新增文件
362
+ 修改文件 -> 删除该文件旧 chunks,再重新入库
363
+ 删除文件 -> 删除该文件对应 chunks
364
+ embedding/extraction 版本变化 -> 触发对应文件更新
365
+ ```
366
+
367
+ 面试可以强调:
368
+
369
+ > 我没有只依赖 collection 是否为空,而是基于 source_file、file_hash、embedding_model 和 extraction_method 做增量更新。这样文档更新后索引不会脏,也不用每次全量 rebuild。
370
+
371
+ ### 问题 9:纯向量检索对公式编号和专有名词不稳定
372
+
373
+ 期权书里有很多精确查询,例如:
374
+
375
+ ```text
376
+ Equation 21.23
377
+ WITH ZERO CORRELATION
378
+ Black-Scholes-Merton
379
+ vega
380
+ gamma
381
+ ```
382
+
383
+ 这类问题不只是语义相似,还需要字面命中。纯 dense embedding 对概念解释很强,但对公式编号、章节标题、专有名词有时不如关键词检索稳定。
384
+
385
+ 解决方法:
386
+
387
+ - 增加轻量 BM25 检索。
388
+ - 查询时同时跑:
389
+
390
+ ```text
391
+ dense vector retrieval
392
+ BM25 keyword retrieval
393
+ ```
394
+
395
+ - 使用 reciprocal-rank merge 合并结果。
396
+ - 再交给 cross-encoder reranker 做最终排序。
397
+
398
+ 最终链路:
399
+
400
+ ```text
401
+ query
402
+ -> dense top-k
403
+ -> BM25 top-k
404
+ -> merge / deduplicate
405
+ -> reranker
406
+ -> top results with citations
407
+ ```
408
+
409
+ 面试可以强调:
410
+
411
+ > 我做 hybrid search 是因为金融和期权文档里存在大量公式编号、章节名、ticker-like token 和专有名词。Dense retrieval 负责语义召回,BM25 负责精确词命中,reranker 负责最终排序。
412
+
413
+ ### 问题 10:本地评测集太小
414
+
415
+ 最初 `local-options` 只有 3 条 case,容易出现指标过高但不可泛化的问题。比如小样本里 Hit@5 为 1,并不代表系统在真实问题上稳定。
416
+
417
+ 解决方法:
418
+
419
+ - 新增 `eval/generate_local_options_eval.py`。
420
+ - 从已解析的 PDF/MD 文档中随机抽样 chunk。
421
+ - 优先覆盖:
422
+ - 公式问题。
423
+ - 章节定位问题。
424
+ - 期权关键词问题。
425
+ - 波动率、Greeks、风险中性、策略等业务术语。
426
+ - 过滤前言、索引页、表格/图注噪声,避免生成低质量 query。
427
+ - 将本地 eval 扩充到 40 条。
428
+
429
+ 面试可以强调:
430
+
431
+ > 我没有只手写少量 happy path case,而是做了一个本地 eval case generator,从真实 chunk 中抽样生成问题,并对噪声标题做过滤。这样可以更稳定地评估 PDF 解析和检索策略的变化。
432
+
433
+ ## Hybrid Search 和 Reranker 对比实验
434
+
435
+ 扩充到 40 条 local-options case 后,我做了三组对比:
436
+
437
+ ```text
438
+ dense-only:
439
+ MRR 0.4708
440
+ NDCG@5 0.3468
441
+ Hit@1 0.4250
442
+ Hit@3 0.5250
443
+ Hit@5 0.5250
444
+
445
+ hybrid:
446
+ MRR 0.4833
447
+ NDCG@5 0.3190
448
+ Hit@1 0.4250
449
+ Hit@3 0.5250
450
+ Hit@5 0.5750
451
+
452
+ hybrid + reranker:
453
+ MRR 0.7125
454
+ NDCG@5 0.4717
455
+ Hit@1 0.7000
456
+ Hit@3 0.7250
457
+ Hit@5 0.7250
458
+ ```
459
+
460
+ 结果解释:
461
+
462
+ - Hybrid search 单独提升了 Hit@5,说明 BM25 补充了召回,尤其对精确术语和公式编号有帮助。
463
+ - Hybrid 的 NDCG 略降,说明召回增加后排序还不够好。
464
+ - 加上 reranker 后,MRR、NDCG、Hit@1、Hit@5 都明显提升,说明 reranker 有效改善了排序质量。
465
+
466
+ 面试可以这样总结:
467
+
468
+ > 单独加 BM25 后,召回有提升但排序不一定更好;这符合预期,因为 BM25 会把更多字面相关结果拉进候选集。最终效果最好的是 dense + BM25 扩召回,再用 cross-encoder reranker 排序。这个实验也说明我不是凭感觉加组件,而是用 Hit@K、MRR 和 NDCG 验证每一步是否真的有效。
469
+
470
+ ## 当前评测结果示例
471
+
472
+ 早期小规模 smoke test 的结果示例:
473
+
474
+ ```text
475
+ BEIR/scifact:
476
+ MRR = 0.9000
477
+ NDCG@5 = 0.9262
478
+ Hit@1 = 0.8000
479
+ Hit@5 = 1.0000
480
+
481
+ BEIR/fiqa:
482
+ MRR = 0.8000
483
+ NDCG@5 = 0.6582
484
+ Hit@1 = 0.8000
485
+ Hit@5 = 0.8000
486
+
487
+ local-options:
488
+ MRR = 1.0000
489
+ NDCG@5 = 0.7162
490
+ Hit@1 = 1.0000
491
+ Hit@5 = 1.0000
492
+ ```
493
+
494
+ 这些结果主要用于验证评测流程和小样本趋势,不能直接代表完整 benchmark 成绩。正式对比时需要扩大 `max_queries` 和 `max_corpus_docs`。
495
+
496
+ ## 面试回答话术
497
+
498
+ 可以这样回答:
499
+
500
+ > 我在优化 RAG 系统时发现,单纯看回答效果很难判断改动是否真的有效,所以先搭了一个 retrieval evaluation 模块。我的思路是先用 BEIR/scifact 快速跑通标准检索评测,再接 BEIR/fiqa 贴近金融场景,然后接 Open RAGBench 验证长文档和 PDF-like 场景,最后补自己的期权 PDF 测试集,用来覆盖项目里公式、章节和金融术语这些业务难点。
501
+
502
+ 如果面试官问为什么先评估 retrieval:
503
+
504
+ > 因为 RAG 的生成质量高度依赖检索质量。如果检索阶段没有召回正确上下文,后面 LLM 很容易幻觉。所以我先用 Hit@K、MRR、NDCG@K 衡量正确文档是否被召回以及排序是否靠前,把 retrieval 问题和 generation 问题分开定位。
505
+
506
+ 如果面试官问如何保证评测可靠:
507
+
508
+ > 我做了几个处理。第一,所有数据集统一成 documents、queries、qrels 三类结构。第二,小样本 smoke test 会优先把 qrels 需要的 gold document 放进 corpus,避免因为采样漏掉正确文档导致评测不公平。第三,检索结果按 doc_id 去重,避免同一篇文档多个 chunk 重复命中导致指标虚高。第四,修改解析、chunk、embedding 或检索逻辑后必须 rebuild 索引,保证评测对应的是最新系统。
509
+
510
+ 如果面试官问这个模块怎么用:
511
+
512
+ > 我提供了单数据集入口和 suite 入口。单数据集可以用 `python -m eval.rag_eval --dataset local-options --rebuild`,批量评测可以用 `python -m eval.run_eval_suite --rebuild`。它会自动跑多个数据集,输出 JSON 和 Markdown 报告,便于做 before/after 对比。
513
+
514
+ 如果面试官问为什么要做 hybrid search:
515
+
516
+ > 因为期权和金融文档里有两类查询。一类是语义型,比如“为什么临近到期 gamma 风险变大”,dense embedding 很适合;另一类是精确匹配型,比如 `Equation 21.23`、`WITH ZERO CORRELATION`、`Black-Scholes-Merton`,这些 BM25 更稳定。所以我用 dense retrieval 负责语义召回,BM25 负责关键词召回,然后合并候选,再用 cross-encoder reranker 排序。
517
+
518
+ 如果面试官问 hybrid 是否真的提升了:
519
+
520
+ > 我用扩充后的 40 条 local-options eval 做了对比。Dense-only 的 Hit@5 是 0.525,MRR 是 0.471;加入 hybrid 后 Hit@5 提升到 0.575,说明召回变好,但 NDCG 有一点下降,说明排序还不够好;再加 reranker 后 Hit@5 到 0.725,MRR 到 0.713,Hit@1 到 0.700,说明 dense + BM25 + reranker 的组合最稳。
521
+
522
+ 如果面试官问为什么不能每次全量 rebuild:
523
+
524
+ > 全量 rebuild 在文档少的时候可以,但参考书和笔记变多后成本会越来越高。我在 metadata 里记录 source_file、file_hash、embedding_model 和 extraction_method,启动时对比当前文件状态和 Chroma 中已有 metadata。新增文件只入库新增部分,修改文件只删除并重建该文件对应 chunks,删除文件同步清理旧 chunks。这样既保证索引新鲜,也避免无意义的全量重建。
525
+
526
+ 如果面试官问 RAG 和 agent 怎么结合:
527
+
528
+ > 我把 RAG 封装成 `QueryKnowledgeTool` 注册到主 `CodeAgent`,而不是只做一个独立脚本。tool description 明确告诉模型在期权概念、波动率、Greeks、策略和公式编号问题上调用它。返回结果包含 source、page、section、content_type、score 和 excerpt,方便 agent 带引用地回答,而不是凭模型记忆回答。
529
+
530
+ 如果面试官问如何避免本地 eval 过拟合:
531
+
532
+ > 早期我只有几条手写 case,很容易高估效果。后来我写了 local eval generator,从真实 PDF chunks 中抽样生成问题,同时过滤前言、索引、表格和图注噪声。这样测试集覆盖公式、章节、概念和金融术语,能更真实地暴露 retrieval 的召回和排序问题。
533
+
534
+ ## 后续可扩展方向
535
+
536
+ 后续还可以继续扩展:
537
+
538
+ - 增加 reranker 前后的对比实验。
539
+ - 增加 answer-level evaluation,评估最终回答是否正确。
540
+ - 增加 citation accuracy,判断引用来源是否准确。
541
+ - 增加公式检索专门测试集。
542
+ - 增加表格类 query 测试集。
543
+ - 对不同 chunk 策略、embedding 模型、top-k 参数做批量实验。
544
+ - 将报告接入 CI 或定期任务,防止 RAG 效果回退。
requirements.txt CHANGED
@@ -3,6 +3,7 @@ smolagents==1.13.0
3
  requests
4
  duckduckgo_search
5
  pandas
 
6
  pypdf
7
  PyMuPDF
8
  chromadb
 
3
  requests
4
  duckduckgo_search
5
  pandas
6
+ yfinance
7
  pypdf
8
  PyMuPDF
9
  chromadb
strategy/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .builder import generate_volatility_strategies
2
+ from .payoff import expiration_payoff, strategy_summary
3
+ from .schemas import OptionLeg, OptionStrategy
4
+
5
+ __all__ = [
6
+ "expiration_payoff",
7
+ "generate_volatility_strategies",
8
+ "OptionLeg",
9
+ "OptionStrategy",
10
+ "strategy_summary",
11
+ ]
strategy/builder.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from market_data.schemas import OptionChain, OptionContract
4
+
5
+ from .payoff import estimate_breakevens
6
+ from .schemas import OptionLeg, OptionStrategy
7
+
8
+
9
def usable_contracts(contracts: list[OptionContract]) -> list[OptionContract]:
    """Return the contracts that are priced and liquid enough to trade.

    A contract is kept when its ``mid`` price is present and strictly positive
    and its ``liquidity_warnings`` contain neither ``"missing_or_zero_bid_ask"``
    nor ``"zero_open_interest"``.

    Args:
        contracts: Candidate contracts, typically one side of an option chain.

    Returns:
        The surviving contracts in their original order.
    """
    blocking_warnings = {"missing_or_zero_bid_ask", "zero_open_interest"}
    return [
        contract
        for contract in contracts
        if contract.mid is not None
        and contract.mid > 0
        # A missing warnings list (None) means "no warnings"; passing None to
        # a set operation would raise TypeError.
        and blocking_warnings.isdisjoint(contract.liquidity_warnings or ())
    ]
17
+
18
+
19
def nearest_contract(contracts: list[OptionContract], target_strike: float) -> OptionContract | None:
    """Pick the usable contract whose strike is closest to ``target_strike``.

    Args:
        contracts: Candidate contracts; they are filtered through
            ``usable_contracts`` before the search.
        target_strike: Strike level to approximate.

    Returns:
        The closest usable contract, or ``None`` when no contract is usable.
        On equal distance the earliest contract in the input order wins.
    """
    candidates = usable_contracts(contracts)
    if candidates:
        return min(candidates, key=lambda candidate: abs(candidate.strike - target_strike))
    return None
24
+
25
+
26
def contract_to_leg(contract: OptionContract, action: str, quantity: int = 1) -> OptionLeg:
    """Convert a quoted contract into a strategy leg.

    Args:
        contract: Source contract supplying type, strike, expiration and quote.
        action: Leg side, passed through unchanged (callers use ``"buy"`` / ``"sell"``).
        quantity: Number of contracts for the leg.

    Returns:
        An ``OptionLeg`` priced at the contract's mid, falling back to the last
        traded price and finally to ``0.0`` when neither is available (or truthy).
    """
    # Same fallback chain as `mid or last_price or 0.0`: a None or zero quote
    # falls through to the next source.
    premium = contract.mid
    if not premium:
        premium = contract.last_price
    if not premium:
        premium = 0.0

    return OptionLeg(
        action=action,
        option_type=contract.option_type,
        strike=contract.strike,
        expiration=contract.expiration,
        quantity=quantity,
        premium=premium,
        implied_volatility=contract.implied_volatility,
        liquidity_warnings=contract.liquidity_warnings,
    )
37
+
38
+
39
+ def make_strategy(
40
+ name: str,
41
+ volatility_view: str,
42
+ directional_view: str,
43
+ legs: list[OptionLeg],
44
+ rationale: str,
45
+ risks: list[str],
46
+ score: float,
47
+ ) -> OptionStrategy:
48
+ net_cash_flow = sum(leg.cash_flow() for leg in legs)
49
+ net_debit_or_credit = -net_cash_flow
50
+ breakevens = estimate_breakevens(legs)
51
+
52
+ max_profit: float | str | None = None
53
+ max_loss: float | str | None = None
54
+ if name in {"long_straddle", "long_strangle"}:
55
+ max_loss = round(max(net_debit_or_credit, 0.0), 2)
56
+ max_profit = "unlimited"
57
+ elif name == "short_straddle":
58
+ max_profit = round(abs(min(net_debit_or_credit, 0.0)), 2)
59
+ max_loss = "unlimited"
60
+ elif name == "iron_condor":
61
+ call_strikes = sorted(leg.strike for leg in legs if leg.option_type == "call")
62
+ put_strikes = sorted(leg.strike for leg in legs if leg.option_type == "put")
63
+ width = max(call_strikes[-1] - call_strikes[0], put_strikes[-1] - put_strikes[0])
64
+ credit = abs(min(net_debit_or_credit, 0.0))
65
+ max_profit = round(credit, 2)
66
+ max_loss = round(width * 100 - credit, 2)
67
+ elif name == "calendar_spread":
68
+ max_loss = round(max(net_debit_or_credit, 0.0), 2)
69
+ max_profit = "path_dependent"
70
+
71
+ return OptionStrategy(
72
+ name=name,
73
+ volatility_view=volatility_view,
74
+ directional_view=directional_view,
75
+ legs=legs,
76
+ rationale=rationale,
77
+ risks=risks,
78
+ max_profit=max_profit,
79
+ max_loss=max_loss,
80
+ breakevens=breakevens,
81
+ net_debit_or_credit=round(net_debit_or_credit, 2),
82
+ score=score,
83
+ )
84
+
85
+
86
def generate_volatility_strategies(
    near_chain: OptionChain,
    volatility_view: str = "neutral",
    directional_view: str = "neutral",
    far_chain: OptionChain | None = None,
) -> list[OptionStrategy]:
    """Build candidate volatility strategies from one (optionally two) option chains.

    Strikes are chosen relative to the underlying price: at-the-money for
    straddles and the calendar, roughly 5% out-of-the-money for the strangle
    and the condor's short strikes, and a further ~3% out for the condor wings.

    Args:
        near_chain: Chain for the near expiration; supplies the spot price.
        volatility_view: Selects which strategies are produced. Long-volatility
            structures are built for ``long_vol``, ``vol_expansion`` and
            ``neutral``; short-volatility structures for ``short_vol``,
            ``vol_compression`` and ``neutral``; the calendar spread for
            ``long_vol``, ``term_structure`` and ``neutral``.
        directional_view: Accepted for interface compatibility; this function
            does not read it and every strategy is tagged ``"neutral"``.
        far_chain: Chain for a later expiration, required for the calendar spread.

    Returns:
        Strategies sorted by descending score; empty when the chain has no
        underlying price.
    """
    if near_chain.underlying_price is None:
        return []

    long_vol_views = {"long_vol", "neutral", "vol_expansion"}
    short_vol_views = {"short_vol", "neutral", "vol_compression"}
    calendar_views = {"long_vol", "neutral", "term_structure"}

    spot = near_chain.underlying_price
    atm_call = nearest_contract(near_chain.calls, spot)
    atm_put = nearest_contract(near_chain.puts, spot)
    otm_call = nearest_contract(near_chain.calls, spot * 1.05)
    otm_put = nearest_contract(near_chain.puts, spot * 0.95)
    strategies: list[OptionStrategy] = []

    if atm_call and atm_put:
        if volatility_view in long_vol_views:
            strategies.append(
                make_strategy(
                    name="long_straddle",
                    volatility_view="long_vol",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "buy"), contract_to_leg(atm_put, "buy")],
                    rationale="Benefits from a large realized move or IV expansion; risk is premium paid.",
                    risks=["theta_decay", "iv_crush", "requires_large_move"],
                    score=0.75,
                )
            )
        if volatility_view in short_vol_views:
            strategies.append(
                make_strategy(
                    name="short_straddle",
                    volatility_view="short_vol",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "sell"), contract_to_leg(atm_put, "sell")],
                    rationale="Benefits from realized volatility staying below implied volatility.",
                    risks=["unlimited_tail_risk", "gap_risk", "margin_requirement"],
                    score=0.45,
                )
            )

    if otm_call and otm_put and volatility_view in long_vol_views:
        strategies.append(
            make_strategy(
                name="long_strangle",
                volatility_view="long_vol",
                directional_view="neutral",
                legs=[contract_to_leg(otm_call, "buy"), contract_to_leg(otm_put, "buy")],
                rationale="Lower-cost long volatility expression than a straddle, but needs a larger move.",
                risks=["theta_decay", "wide_breakevens", "iv_crush"],
                score=0.65,
            )
        )

    if far_chain and atm_call and volatility_view in calendar_views:
        far_call = nearest_contract(far_chain.calls, atm_call.strike)
        # A calendar needs two different expirations; if the far chain is the
        # same expiry the legs would offset each other, so skip it.
        if far_call and far_call.expiration != atm_call.expiration:
            strategies.append(
                make_strategy(
                    name="calendar_spread",
                    volatility_view="term_structure",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "sell"), contract_to_leg(far_call, "buy")],
                    rationale="Expresses a term-structure view and benefits if longer-dated IV holds up.",
                    risks=["path_dependency", "front_expiry_gamma", "term_structure_shift"],
                    score=0.60,
                )
            )

    if otm_call and otm_put and volatility_view in short_vol_views:
        long_call = nearest_contract(near_chain.calls, otm_call.strike * 1.03)
        long_put = nearest_contract(near_chain.puts, otm_put.strike * 0.97)
        # nearest_contract may hand back the short strike itself when the chain
        # has no usable strike further out. Such a "condor" has zero-width
        # wings (every leg cancels), so require strictly wider protection.
        has_wings = (
            long_call is not None
            and long_put is not None
            and long_call.strike > otm_call.strike
            and long_put.strike < otm_put.strike
        )
        if has_wings:
            strategies.append(
                make_strategy(
                    name="iron_condor",
                    volatility_view="short_vol",
                    directional_view="neutral",
                    legs=[
                        contract_to_leg(otm_put, "sell"),
                        contract_to_leg(long_put, "buy"),
                        contract_to_leg(otm_call, "sell"),
                        contract_to_leg(long_call, "buy"),
                    ],
                    rationale="Defined-risk short volatility strategy for range-bound markets.",
                    risks=["short_gamma", "tail_loss_to_width", "assignment_risk"],
                    score=0.70,
                )
            )

    return sorted(strategies, key=lambda strategy: strategy.score, reverse=True)
strategy/payoff.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from .schemas import OptionLeg, OptionStrategy
4
+
5
+
6
def leg_expiration_payoff(leg: OptionLeg, underlying_price: float) -> float:
    """Return one leg's profit or loss at expiration for a given underlying price.

    The result is the intrinsic value (scaled by signed quantity and the 100
    multiplier) plus the leg's opening cash flow. Any ``option_type`` other
    than ``"call"`` is valued as a put.

    Args:
        leg: The leg to value.
        underlying_price: Underlying price at expiration.

    Returns:
        The leg's P&L at expiration.
    """
    is_call = leg.option_type == "call"
    moneyness = underlying_price - leg.strike if is_call else leg.strike - underlying_price
    intrinsic = max(moneyness, 0.0)
    return intrinsic * leg.signed_quantity() * 100 + leg.cash_flow()
12
+
13
+
14
def expiration_payoff(legs: list[OptionLeg], underlying_price: float) -> float:
    """Return the combined expiration P&L of all legs at ``underlying_price``.

    Args:
        legs: Legs of the position.
        underlying_price: Underlying price at expiration.

    Returns:
        The sum of each leg's expiration payoff (``0`` for no legs).
    """
    total = 0
    for leg in legs:
        total = total + leg_expiration_payoff(leg, underlying_price)
    return total
16
+
17
+
18
def estimate_breakevens(legs: list[OptionLeg]) -> list[float]:
    """Estimate the underlying prices where the expiration payoff crosses zero.

    The payoff is sampled on a uniform 401-point grid from half the lowest
    strike (floored at 0.01) to 1.5x the highest strike. A grid point whose
    payoff is exactly zero is reported as-is; a sign change between two
    neighbouring points is located by linear interpolation. Breakevens outside
    the grid are not found, and the final grid point is never reported.

    Args:
        legs: Legs of the position.

    Returns:
        Breakeven prices rounded to two decimals, in ascending order. Empty
        when ``legs`` is empty.
    """
    # Without legs there are no strikes to anchor the grid; min()/max() below
    # would raise on the empty list.
    if not legs:
        return []

    strikes = [leg.strike for leg in legs]
    low = max(min(strikes) * 0.5, 0.01)
    high = max(strikes) * 1.5
    steps = 400
    points = [low + (high - low) * index / steps for index in range(steps + 1)]
    payoffs = [expiration_payoff(legs, point) for point in points]

    breakevens = []
    for index in range(1, len(points)):
        previous = payoffs[index - 1]
        current = payoffs[index]
        if previous == 0:
            breakevens.append(points[index - 1])
        if previous * current < 0:
            # Linear interpolation: split the interval in proportion to the
            # payoff magnitudes on either side of the zero crossing.
            ratio = abs(previous) / (abs(previous) + abs(current))
            breakevens.append(points[index - 1] + (points[index] - points[index - 1]) * ratio)
    return [round(value, 2) for value in breakevens]
35
+
36
+
37
def strategy_summary(strategy: OptionStrategy) -> dict:
    """Summarise a strategy's expiration payoff over a coarse price grid.

    The grid has 81 evenly spaced prices from 0.6x the lowest strike (floored
    at 0.01) to 1.4x the highest strike.

    Args:
        strategy: Strategy whose legs are evaluated.

    Returns:
        A dict with the minimum and maximum payoff seen on the grid, the payoff
        at the mean of the leg strikes, and every tenth grid point as a
        ``{"underlying_price", "pnl"}`` sample. All numbers are rounded to two
        decimals.
    """
    legs = strategy.legs
    strikes = [leg.strike for leg in legs]
    low = max(min(strikes) * 0.6, 0.01)
    high = max(strikes) * 1.4

    grid = []
    for index in range(81):
        grid.append(low + (high - low) * index / 80)
    payoffs = [expiration_payoff(legs, price) for price in grid]

    mean_strike = sum(strikes) / len(strikes)
    samples = []
    for position in range(0, len(grid), 10):
        samples.append(
            {"underlying_price": round(grid[position], 2), "pnl": round(payoffs[position], 2)}
        )

    return {
        "min_grid_payoff": round(min(payoffs), 2),
        "max_grid_payoff": round(max(payoffs), 2),
        "payoff_at_middle_strike": round(expiration_payoff(legs, mean_strike), 2),
        "sample_points": samples,
    }
strategy/schemas.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import asdict, dataclass
4
+ from typing import Any
5
+
6
+
7
@dataclass
class OptionLeg:
    """A single option position inside a strategy."""

    action: str  # "buy" counts as long; every other value is treated as short
    option_type: str
    strike: float
    expiration: str
    quantity: int
    premium: float  # multiplied by 100 in cash_flow()
    implied_volatility: float | None = None
    delta: float | None = None
    liquidity_warnings: list[str] | None = None

    def signed_quantity(self) -> int:
        """Return the quantity with sign: positive when bought, negative otherwise."""
        if self.action == "buy":
            return self.quantity
        return -self.quantity

    def cash_flow(self) -> float:
        """Return the opening cash flow: negative when paying premium, positive when collecting."""
        per_unit_flow = -self.premium * self.signed_quantity()
        return per_unit_flow * 100

    def to_dict(self) -> dict[str, Any]:
        """Return the leg's fields as a plain dict."""
        return asdict(self)
27
+
28
+
29
@dataclass
class OptionStrategy:
    """A named multi-leg option strategy together with its derived risk figures."""

    name: str
    volatility_view: str
    directional_view: str
    legs: list[OptionLeg]
    rationale: str
    risks: list[str]
    max_profit: float | str | None  # a number, a label such as "unlimited", or None
    max_loss: float | str | None  # a number, a label such as "unlimited", or None
    breakevens: list[float]
    net_debit_or_credit: float
    score: float

    def to_dict(self) -> dict[str, Any]:
        """Return the strategy as a plain dict, with each leg serialised via its own ``to_dict``."""
        serialised_legs = [leg.to_dict() for leg in self.legs]
        return {**asdict(self), "legs": serialised_legs}
strategy/tools.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from smolagents import tool
6
+
7
+ from market_data.providers import get_option_chain, list_option_expirations
8
+
9
+ from .builder import generate_volatility_strategies
10
+ from .payoff import strategy_summary
11
+
12
+
13
@tool
def build_volatility_strategy(
    symbol: str,
    volatility_view: str = "neutral",
    directional_view: str = "neutral",
    near_expiration: str = "",
    far_expiration: str = "",
) -> str:
    """Build candidate volatility option strategies from the current option chain.

    Args:
        symbol: Yahoo Finance ticker.
        volatility_view: long_vol, short_vol, vol_expansion, vol_compression, term_structure, or neutral.
        directional_view: bullish, bearish, neutral, or range_bound.
        near_expiration: Near option expiration in YYYY-MM-DD. Empty uses nearest expiration.
        far_expiration: Far option expiration for calendar spreads. Empty uses a later available expiration.
    """
    # Every failure is reported as a JSON error payload instead of raising, so
    # the calling agent always receives a string result.
    try:
        symbol = symbol.strip().upper()
        expirations = list_option_expirations(symbol)
        if not expirations:
            raise ValueError(f"No option expirations found for {symbol}.")
        near = near_expiration or expirations[0]
        # Default the far leg to the first expiration strictly after `near`.
        # Taking expirations[1] unconditionally could select a date earlier
        # than (or equal to) a caller-supplied near expiration. ISO
        # YYYY-MM-DD strings order chronologically under string comparison.
        far = far_expiration or next(
            (expiration for expiration in expirations if expiration > near), ""
        )
        near_chain = get_option_chain(symbol, near)
        far_chain = get_option_chain(symbol, far) if far else None
        strategies = generate_volatility_strategies(
            near_chain=near_chain,
            volatility_view=volatility_view,
            directional_view=directional_view,
            far_chain=far_chain,
        )
        return json.dumps(
            {
                "status": "success",
                "symbol": symbol,
                "near_expiration": near,
                "far_expiration": far or None,
                "strategies": [
                    {
                        **strategy.to_dict(),
                        "payoff_summary": strategy_summary(strategy),
                    }
                    for strategy in strategies
                ],
                "risk_note": (
                    "This is research output, not a trade recommendation. "
                    "Validate quotes, liquidity, margin, assignment risk, and event risk before trading."
                ),
            },
            ensure_ascii=False,
            indent=2,
            default=str,
        )
    except Exception as exc:
        return json.dumps(
            {"status": "error", "symbol": symbol, "message": str(exc)},
            ensure_ascii=False,
            indent=2,
        )
tools/query_knowledge.py CHANGED
@@ -3,6 +3,7 @@ import asyncio
3
  from collections import Counter
4
  import hashlib
5
  import logging
 
6
  import os
7
  from pathlib import Path
8
  from typing import Iterable, List, Optional
@@ -13,19 +14,29 @@ from chromadb.errors import NotFoundError
13
  from pypdf import PdfReader
14
 
15
  from llama_index.core import StorageContext, VectorStoreIndex
16
- from llama_index.core.schema import Document, BaseNode
17
  from llama_index.core.node_parser import SentenceSplitter
18
  from llama_index.vector_stores.chroma import ChromaVectorStore
19
 
20
 
 
21
  BASE_DIR = Path(__file__).resolve().parent
 
22
  KNOWLEDGE_BASE_DIR = BASE_DIR / "knowledge_base"
 
 
23
  RAW_DIR = KNOWLEDGE_BASE_DIR / "raw"
24
  CHROMA_DB_DIR = KNOWLEDGE_BASE_DIR / "chroma_db"
25
- HF_CACHE_DIR = BASE_DIR / "hf_cache"
26
  COLLECTION_NAME = "options_knowledge"
27
 
28
- EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
 
 
 
 
 
 
29
  CHUNK_SIZE = 1000
30
  CHUNK_OVERLAP = 150
31
  PDF_REPEATED_LINE_MIN_PAGES = 3
@@ -68,31 +79,177 @@ def configure_model_cache() -> None:
68
  os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(
69
  HF_CACHE_DIR / "sentence_transformers"))
70
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
71
- cached_model_dir = (
72
- HF_CACHE_DIR
73
- / "sentence_transformers"
74
- / f"models--{EMBED_MODEL_NAME.replace('/', '--')}"
75
- )
76
- if cached_model_dir.exists():
77
  os.environ.setdefault("HF_HUB_OFFLINE", "1")
78
  os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
79
 
80
 
81
- def resolve_embed_model_name() -> str:
82
  cached_model_dir = (
83
  HF_CACHE_DIR
84
  / "sentence_transformers"
85
- / f"models--{EMBED_MODEL_NAME.replace('/', '--')}"
86
  )
87
  snapshots_dir = cached_model_dir / "snapshots"
88
  if snapshots_dir.exists():
89
  snapshots = sorted(path for path in snapshots_dir.iterdir() if path.is_dir())
90
- if snapshots:
91
- return str(snapshots[-1])
 
 
 
 
 
 
 
 
 
92
 
93
  return EMBED_MODEL_NAME
94
 
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def file_sha256(path: Path) -> str:
97
  digest = hashlib.sha256()
98
  with path.open("rb") as file:
@@ -116,6 +273,49 @@ def load_md_file(path: Path) -> Document:
116
  )
117
 
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  def append_visual_fragment(line_parts: List[str], text: str, baseline_y: float, item: dict) -> None:
120
  if not text:
121
  return
@@ -1008,13 +1208,28 @@ def load_pdf_file(path: Path) -> List[Document]:
1008
 
1009
 
1010
  def load_txt_file(path: Path) -> List[Document]:
1011
- # TODO: load text file
1012
- pass
1013
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1014
 
1015
 
1016
  def iter_source_files(raw_dir: Path) -> Iterable[Path]:
1017
- supported_suffixes = {".md", ".markdown", ".pdf"}
1018
  for path in sorted(raw_dir.rglob("*")):
1019
  if path.is_file() and path.suffix.lower() in supported_suffixes:
1020
  yield path
@@ -1022,12 +1237,13 @@ def iter_source_files(raw_dir: Path) -> Iterable[Path]:
1022
 
1023
  def load_docs(raw_dir: Path = RAW_DIR) -> List[Document]:
1024
  documents: List[Document] = []
 
1025
 
1026
  for path in iter_source_files(raw_dir):
1027
  suffix = path.suffix.lower()
1028
 
1029
  if suffix in {".md", ".markdown"}:
1030
- documents.append(load_md_file(path))
1031
  elif suffix == ".pdf":
1032
  documents.extend(load_pdf_file(path))
1033
  elif suffix == ".txt":
@@ -1053,6 +1269,7 @@ def add_chunk_metadata(nodes: List[BaseNode]) -> List[BaseNode]:
1053
 
1054
  node.metadata["chunk_id"] = chunk_id
1055
  node.metadata["chunk_index"] = chunk_index
 
1056
  node.id_ = chunk_id
1057
 
1058
  return nodes
@@ -1073,8 +1290,7 @@ def validate_nodes(nodes: List[BaseNode]) -> None:
1073
  f"PDF node {node.node_id} is missing page_number metadata.")
1074
 
1075
 
1076
- def build_nodes(raw_dir: Path = RAW_DIR) -> List[BaseNode]:
1077
- documents = load_docs(raw_dir)
1078
  splitter = SentenceSplitter(
1079
  chunk_size=CHUNK_SIZE,
1080
  chunk_overlap=CHUNK_OVERLAP,
@@ -1085,7 +1301,123 @@ def build_nodes(raw_dir: Path = RAW_DIR) -> List[BaseNode]:
1085
  return nodes
1086
 
1087
 
1088
- def collection_needs_pdf_rebuild(chroma_collection) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1089
  if chroma_collection.count() == 0:
1090
  return True
1091
 
@@ -1095,6 +1427,8 @@ def collection_needs_pdf_rebuild(chroma_collection) -> bool:
1095
  return False
1096
 
1097
  for metadata in sample.get("metadatas") or []:
 
 
1098
  if metadata.get("file_type") == "pdf":
1099
  return metadata.get("extraction_method") != PDF_EXTRACTION_METHOD
1100
 
@@ -1107,6 +1441,7 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
1107
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
1108
 
1109
  load_dotenv()
 
1110
  CHROMA_DB_DIR.mkdir(parents=True, exist_ok=True)
1111
 
1112
  db = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
@@ -1118,9 +1453,6 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
1118
  pass
1119
 
1120
  chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
1121
- if not rebuild and collection_needs_pdf_rebuild(chroma_collection):
1122
- db.delete_collection(COLLECTION_NAME)
1123
- chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
1124
 
1125
  vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
1126
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
@@ -1141,6 +1473,13 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
1141
  f"Indexed {len(nodes)} chunks into collection '{COLLECTION_NAME}'")
1142
  return index
1143
 
 
 
 
 
 
 
 
1144
  print(
1145
  f"Loaded existing collection '{COLLECTION_NAME}' with {chroma_collection.count()} chunks.")
1146
  return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
@@ -1148,13 +1487,17 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
1148
 
1149
  class QueryKnowledgeTool(Tool):
1150
  name = "query_knowledge"
1151
- description = "Performs a search of related information based on your query"
 
 
 
 
1152
  inputs = {'query': {'type': 'string',
1153
  'description': 'The search query to perform.'}}
1154
  output_type = "string"
1155
 
1156
  @staticmethod
1157
- def format_results(results):
1158
  output = []
1159
 
1160
  for result in results:
@@ -1166,6 +1509,8 @@ class QueryKnowledgeTool(Tool):
1166
  formula_id = metadata.get("formula_id", "")
1167
  score = result.score
1168
  text = result.node.get_content()
 
 
1169
 
1170
  output.append(
1171
  f"source:{source}\n"
@@ -1174,20 +1519,139 @@ class QueryKnowledgeTool(Tool):
1174
  f"content_type:{content_type}\n"
1175
  f"formula_id:{formula_id or 'n/a'}\n"
1176
  f"score:{score:.4f}\n"
 
 
1177
  f"content:{text}"
1178
  )
1179
 
1180
  return "\n\n---\n\n".join(output)
1181
 
1182
- def __init__(self, max_results=10, top_k=5, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1183
  super().__init__()
1184
  self.max_results = max_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1185
  index = asyncio.run(build_index(rebuild=False))
1186
- self.retriever = index.as_retriever(similarity_top_k=top_k)
 
 
 
 
 
 
1187
 
1188
  def forward(self, query: str) -> str:
1189
- results = self.retriever.retrieve(query)
1190
- return QueryKnowledgeTool.format_results(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191
 
1192
 
1193
  if __name__ == "__main__":
 
3
  from collections import Counter
4
  import hashlib
5
  import logging
6
+ import math
7
  import os
8
  from pathlib import Path
9
  from typing import Iterable, List, Optional
 
14
  from pypdf import PdfReader
15
 
16
  from llama_index.core import StorageContext, VectorStoreIndex
17
+ from llama_index.core.schema import Document, BaseNode, NodeWithScore, TextNode
18
  from llama_index.core.node_parser import SentenceSplitter
19
  from llama_index.vector_stores.chroma import ChromaVectorStore
20
 
21
 
22
# Load .env before the os.getenv() lookups below so RAG_* overrides take effect.
load_dotenv()
BASE_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = BASE_DIR.parent
# Former knowledge-base location next to this module; see effective_raw_dir().
LEGACY_KNOWLEDGE_BASE_DIR = BASE_DIR / "knowledge_base"
# Current knowledge-base location at the project root. The earlier assignment
# of this name to the legacy path was dead (overwritten before any use) and
# has been removed.
KNOWLEDGE_BASE_DIR = PROJECT_ROOT / "knowledge_base"
RAW_DIR = KNOWLEDGE_BASE_DIR / "raw"
CHROMA_DB_DIR = KNOWLEDGE_BASE_DIR / "chroma_db"
HF_CACHE_DIR = PROJECT_ROOT / "hf_cache"
COLLECTION_NAME = "options_knowledge"

EMBED_MODEL_NAME = os.getenv("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
RERANKER_MODEL_NAME = os.getenv(
    "RAG_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
RERANKER_BATCH_SIZE = int(os.getenv("RAG_RERANKER_BATCH_SIZE", "16"))
# Metadata keys written onto nodes.
EMBED_MODEL_METADATA_KEY = "embedding_model"
BM25_METADATA_KEY = "bm25_score"
VECTOR_METADATA_KEY = "vector_score"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
PDF_REPEATED_LINE_MIN_PAGES = 3
 
79
  os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(
80
  HF_CACHE_DIR / "sentence_transformers"))
81
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
82
+ if local_model_snapshot(EMBED_MODEL_NAME):
 
 
 
 
 
83
  os.environ.setdefault("HF_HUB_OFFLINE", "1")
84
  os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
85
 
86
 
87
def local_model_snapshot(model_name: str) -> Optional[Path]:
    """Locate a locally cached snapshot directory for ``model_name``.

    Looks under ``HF_CACHE_DIR/sentence_transformers/models--<org>--<name>/snapshots``
    and returns the last snapshot directory (in sorted order) that contains a
    ``config.json``.

    Args:
        model_name: Hub-style model id such as ``"org/name"``.

    Returns:
        The snapshot path, or ``None`` when no usable snapshot is cached.
    """
    cache_entry = f"models--{model_name.replace('/', '--')}"
    snapshots_dir = HF_CACHE_DIR / "sentence_transformers" / cache_entry / "snapshots"
    if not snapshots_dir.exists():
        return None

    newest_first = sorted(
        (entry for entry in snapshots_dir.iterdir() if entry.is_dir()),
        reverse=True,
    )
    # A snapshot without config.json is skipped.
    return next(
        (candidate for candidate in newest_first if (candidate / "config.json").exists()),
        None,
    )
101
+
102
+
103
def resolve_embed_model_name() -> str:
    """Return the embedding model to load: a cached snapshot path if present, else the model id."""
    cached = local_model_snapshot(EMBED_MODEL_NAME)
    return str(cached) if cached else EMBED_MODEL_NAME
109
 
110
 
111
def resolve_reranker_model_name(model_name: str = RERANKER_MODEL_NAME) -> str:
    """Return the reranker model to load: a cached snapshot path if present, else ``model_name``.

    Args:
        model_name: Hub-style model id; defaults to the configured reranker.
    """
    cached = local_model_snapshot(model_name)
    return str(cached) if cached else model_name
117
+
118
+
119
def env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable name.
        default: Value returned when the variable is not set.

    Returns:
        ``True`` when the variable is set to ``1``, ``true``, ``yes`` or ``on``
        (case-insensitive, surrounding whitespace ignored); ``False`` for any
        other set value; ``default`` when unset.
    """
    raw = os.getenv(name)
    if raw is not None:
        truthy_values = {"1", "true", "yes", "on"}
        return raw.strip().lower() in truthy_values
    return default
124
+
125
+
126
def effective_raw_dir(raw_dir: Path = RAW_DIR) -> Path:
    """Choose the directory to read source documents from.

    Args:
        raw_dir: Preferred raw-document directory.

    Returns:
        ``raw_dir`` when it contains at least one supported source file;
        otherwise the legacy ``raw`` directory if that one has files (a warning
        is logged); otherwise ``raw_dir`` again.
    """
    if any(iter_source_files(raw_dir)):
        return raw_dir

    legacy_raw_dir = LEGACY_KNOWLEDGE_BASE_DIR / "raw"
    if not any(iter_source_files(legacy_raw_dir)):
        return raw_dir

    logging.warning(
        "Using legacy knowledge base path %s. Move files to %s when convenient.",
        legacy_raw_dir,
        raw_dir,
    )
    return legacy_raw_dir
140
+
141
+
142
class CrossEncoderReranker:
    """Re-score retrieved nodes against the query with a cross-encoder model.

    The model is imported and loaded lazily on the first ``rerank`` call, so
    constructing the reranker is cheap.
    """

    def __init__(
        self,
        model_name: str = RERANKER_MODEL_NAME,
        batch_size: int = RERANKER_BATCH_SIZE,
    ):
        """Store the configuration; no model is loaded here.

        Args:
            model_name: Cross-encoder model id (resolved to a local snapshot if cached).
            batch_size: Batch size passed to the model's ``predict``.
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self._model = None

    def _load_model(self):
        """Return the cross-encoder, creating it on first use."""
        if self._model is not None:
            return self._model

        # Imported here so the dependency is only needed when reranking runs.
        from sentence_transformers import CrossEncoder

        self._model = CrossEncoder(
            resolve_reranker_model_name(self.model_name),
            max_length=512,
            cache_folder=str(HF_CACHE_DIR / "sentence_transformers"),
        )
        return self._model

    def rerank(
        self,
        query: str,
        results: list[NodeWithScore],
        top_n: Optional[int] = None,
    ) -> list[NodeWithScore]:
        """Return ``results`` re-scored by the cross-encoder, best first.

        Args:
            query: The user query.
            results: Candidate nodes; their incoming scores are discarded.
            top_n: Keep only this many results; ``None`` or ``0`` keeps all.

        Returns:
            New ``NodeWithScore`` objects wrapping the same nodes, sorted by
            descending cross-encoder score. Empty input returns an empty list
            without loading the model.
        """
        if not results:
            return []

        pairs = [
            (query, result.node.get_content())
            for result in results
        ]
        model = self._load_model()
        scores = model.predict(
            pairs,
            batch_size=self.batch_size,
            show_progress_bar=False,
        )

        reranked = [
            NodeWithScore(node=result.node, score=float(score))
            for result, score in zip(results, scores)
        ]
        # Only a missing score sorts last. The former `score or -inf` key also
        # demoted a legitimate 0.0 score below negative scores.
        reranked.sort(
            key=lambda item: item.score if item.score is not None else float("-inf"),
            reverse=True,
        )
        return reranked[:top_n] if top_n else reranked
191
+
192
+
193
class BM25Retriever:
    """In-memory BM25 keyword retriever over a fixed list of nodes."""

    def __init__(self, nodes: list[TextNode]):
        """Tokenise every node once and collect corpus statistics.

        Args:
            nodes: Nodes to search; each must provide ``get_content()``.
        """
        self.nodes = nodes
        self.tokenized_docs = [self.tokenize(node.get_content()) for node in nodes]

        # Document frequency: number of documents containing each token.
        self.doc_freqs: Counter[str] = Counter()
        for doc_tokens in self.tokenized_docs:
            self.doc_freqs.update(set(doc_tokens))

        if self.tokenized_docs:
            total_tokens = sum(len(doc_tokens) for doc_tokens in self.tokenized_docs)
            self.avg_doc_len = total_tokens / len(self.tokenized_docs)
        else:
            self.avg_doc_len = 0.0

    @staticmethod
    def tokenize(text: str) -> list[str]:
        """Split text into lower-cased tokens.

        Tokens are ASCII words (hyphen/apostrophe joins allowed), dotted
        numbers such as ``21.23``, and any other single non-space character.
        """
        pattern = r"[A-Za-z]+(?:[-'][A-Za-z]+)*|\d+(?:\.\d+)*|[^\sA-Za-z0-9]"
        tokens = []
        for raw_token in re.findall(pattern, text):
            if raw_token.strip():
                tokens.append(raw_token.lower())
        return tokens

    def score(self, query_tokens: list[str], doc_tokens: list[str]) -> float:
        """Return the BM25 score of one document for the query (k1=1.5, b=0.75).

        Returns ``0.0`` when either token list is empty or no query token
        occurs in the document.
        """
        if not query_tokens or not doc_tokens:
            return 0.0

        k1 = 1.5
        b = 0.75
        term_counts = Counter(doc_tokens)
        corpus_size = len(self.tokenized_docs)
        # Length normalisation depends only on the document, so compute it once.
        length_norm = 1 - b + b * len(doc_tokens) / max(self.avg_doc_len, 1.0)

        total = 0.0
        for token in query_tokens:
            term_freq = term_counts.get(token, 0)
            if term_freq == 0:
                continue
            doc_freq = self.doc_freqs.get(token, 0)
            idf = math.log(1 + (corpus_size - doc_freq + 0.5) / (doc_freq + 0.5))
            denominator = term_freq + k1 * length_norm
            total += idf * (term_freq * (k1 + 1)) / denominator
        return total

    def retrieve(self, query: str, top_k: int) -> list[NodeWithScore]:
        """Return up to ``top_k`` nodes with a positive BM25 score, best first.

        Side effect: each matching node's score is also written to
        ``node.metadata[BM25_METADATA_KEY]``.
        """
        query_tokens = self.tokenize(query)
        hits: list[NodeWithScore] = []

        for node, doc_tokens in zip(self.nodes, self.tokenized_docs):
            value = self.score(query_tokens, doc_tokens)
            if value > 0:
                node.metadata[BM25_METADATA_KEY] = value
                hits.append(NodeWithScore(node=node, score=value))

        ranked = sorted(hits, key=lambda hit: hit.score or float("-inf"), reverse=True)
        return ranked[:top_k]
251
+
252
+
253
  def file_sha256(path: Path) -> str:
254
  digest = hashlib.sha256()
255
  with path.open("rb") as file:
 
273
  )
274
 
275
 
276
def load_md_documents(path: Path) -> List[Document]:
    """Load a Markdown file as one ``Document`` per heading-delimited section.

    A new section starts at every ATX heading (``#`` to ``######``); the heading
    line is kept as the first line of its section. Text before the first
    heading becomes a section with an empty title. Lines inside fenced code
    blocks (three or more backticks or tildes) never start a section, so a
    ``# comment`` in a code sample does not split the surrounding text.

    Args:
        path: Markdown file to read (UTF-8).

    Returns:
        The non-empty sections in file order. When the file yields no section
        at all, falls back to the single document from ``load_md_file``.
    """
    text = path.read_text(encoding="utf-8")
    file_hash = file_sha256(path)
    documents: List[Document] = []
    current_heading = ""
    current_lines: List[str] = []
    # Marker that opened the fenced code block we are inside; "" when outside.
    open_fence = ""

    def flush() -> None:
        """Emit the accumulated lines as a section document, then reset the buffer."""
        nonlocal current_lines
        section_text = "\n".join(current_lines).strip()
        if not section_text:
            current_lines = []
            return
        documents.append(
            Document(
                text=section_text,
                metadata={
                    "source_file": str(path.resolve()),
                    "file_name": path.name,
                    "file_type": path.suffix.lower().lstrip("."),
                    "document_title": path.stem,
                    "file_hash": file_hash,
                    "content_type": "markdown_section",
                    "chapter_title": "",
                    "section_title": current_heading,
                    "section_path": current_heading,
                    "char_count": len(section_text),
                },
            )
        )
        current_lines = []

    for line in text.splitlines():
        if open_fence:
            # A closing fence uses the same character, is at least as long as
            # the opener, and carries nothing else on the line.
            closer = line.strip()
            if closer and set(closer) == {open_fence[0]} and len(closer) >= len(open_fence):
                open_fence = ""
            current_lines.append(line)
            continue

        # Opening fence: up to three spaces of indent, then 3+ backticks (with
        # no further backtick on the line) or 3+ tildes.
        fence_match = re.match(r"^ {0,3}(`{3,}(?=[^`]*$)|~{3,})", line)
        if fence_match:
            open_fence = fence_match.group(1)
            current_lines.append(line)
            continue

        heading_match = re.match(r"^(#{1,6})\s+(.+?)\s*$", line)
        if heading_match:
            flush()
            current_heading = heading_match.group(2).strip()
        current_lines.append(line)

    flush()
    return documents or [load_md_file(path)]
317
+
318
+
319
  def append_visual_fragment(line_parts: List[str], text: str, baseline_y: float, item: dict) -> None:
320
  if not text:
321
  return
 
1208
 
1209
 
1210
def load_txt_file(path: Path) -> List[Document]:
    """Load a plain-text file (UTF-8) as a single ``Document``.

    Args:
        path: Text file to read.

    Returns:
        A one-element list holding the whole file with source, hash and
        (empty) section metadata attached.
    """
    text = path.read_text(encoding="utf-8")
    metadata = {
        "source_file": str(path.resolve()),
        "file_name": path.name,
        "file_type": "txt",
        "document_title": path.stem,
        "file_hash": file_sha256(path),
        "content_type": "text",
        "chapter_title": "",
        "section_title": "",
        "section_path": "",
        "char_count": len(text),
    }
    return [Document(text=text, metadata=metadata)]
1229
 
1230
 
1231
def iter_source_files(raw_dir: Path) -> Iterable[Path]:
    """Yield every supported source file under ``raw_dir``, recursively, in sorted order.

    Supported suffixes (case-insensitive) are ``.md``, ``.markdown``, ``.pdf``
    and ``.txt``; directories are skipped even if their name matches.
    """
    supported_suffixes = {".md", ".markdown", ".pdf", ".txt"}
    yield from (
        candidate
        for candidate in sorted(raw_dir.rglob("*"))
        if candidate.is_file() and candidate.suffix.lower() in supported_suffixes
    )
 
1237
 
1238
  def load_docs(raw_dir: Path = RAW_DIR) -> List[Document]:
1239
  documents: List[Document] = []
1240
+ raw_dir = effective_raw_dir(raw_dir)
1241
 
1242
  for path in iter_source_files(raw_dir):
1243
  suffix = path.suffix.lower()
1244
 
1245
  if suffix in {".md", ".markdown"}:
1246
+ documents.extend(load_md_documents(path))
1247
  elif suffix == ".pdf":
1248
  documents.extend(load_pdf_file(path))
1249
  elif suffix == ".txt":
 
1269
 
1270
  node.metadata["chunk_id"] = chunk_id
1271
  node.metadata["chunk_index"] = chunk_index
1272
+ node.metadata[EMBED_MODEL_METADATA_KEY] = EMBED_MODEL_NAME
1273
  node.id_ = chunk_id
1274
 
1275
  return nodes
 
1290
  f"PDF node {node.node_id} is missing page_number metadata.")
1291
 
1292
 
1293
+ def split_documents(documents: List[Document]) -> List[BaseNode]:
 
1294
  splitter = SentenceSplitter(
1295
  chunk_size=CHUNK_SIZE,
1296
  chunk_overlap=CHUNK_OVERLAP,
 
1301
  return nodes
1302
 
1303
 
1304
def build_nodes(raw_dir: Path = RAW_DIR) -> List[BaseNode]:
    """Load every source document under ``raw_dir`` and split it into nodes."""
    return split_documents(load_docs(raw_dir))
1307
+
1308
+
1309
def load_source_file(path: Path) -> List[Document]:
    """Load one source file with the loader matching its suffix.

    Args:
        path: File to load; the suffix is compared case-insensitively.

    Returns:
        The loaded documents, or an empty list for an unsupported suffix.
    """
    loaders = {
        ".md": load_md_documents,
        ".markdown": load_md_documents,
        ".pdf": load_pdf_file,
        ".txt": load_txt_file,
    }
    loader = loaders.get(path.suffix.lower())
    if loader is None:
        return []
    return loader(path)
1318
+
1319
+
1320
def list_current_sources(raw_dir: Path = RAW_DIR) -> dict[str, dict[str, str]]:
    """Describe every supported source file currently on disk.

    Args:
        raw_dir: Directory to scan; it is first passed through
            ``effective_raw_dir``. Defaults to ``RAW_DIR``.

    Returns:
        A mapping from each file's resolved path (as a string) to a dict
        with its ``file_hash`` and its ``file_type`` (the lower-cased
        suffix without the leading dot).
    """
    raw_dir = effective_raw_dir(raw_dir)
    return {
        str(source.resolve()): {
            "file_hash": file_sha256(source),
            "file_type": source.suffix.lower().lstrip("."),
        }
        for source in iter_source_files(raw_dir)
    }
1330
+
1331
+
1332
def existing_source_metadata(chroma_collection) -> dict[str, dict[str, str]]:
    """Summarise, per source file, the metadata already stored in Chroma.

    The collection is read in pages of 500 records. For every record that
    names a ``source_file`` the file hash, file type, embedding model and
    extraction method are recorded; when several records share a source
    file, the values from the last record read are kept.

    Args:
        chroma_collection: Object exposing ``count()`` and
            ``get(limit=..., offset=..., include=[...])`` whose result is a
            mapping with a ``"metadatas"`` entry.

    Returns:
        Mapping from source file path to its stored metadata summary. Empty
        when the collection holds no records.
    """
    existing: dict[str, dict[str, str]] = {}
    if chroma_collection.count() == 0:
        return existing

    limit = 500
    offset = 0
    while True:
        batch = chroma_collection.get(
            limit=limit,
            offset=offset,
            include=["metadatas"],
        )
        metadatas = batch.get("metadatas") or []
        if not metadatas:
            break
        for metadata in metadatas:
            # A record stored without metadata can come back as None (or an
            # empty mapping); skip it instead of failing on ``None.get``.
            if not metadata:
                continue
            source_file = metadata.get("source_file")
            if not source_file:
                continue
            existing[source_file] = {
                "file_hash": metadata.get("file_hash", ""),
                "file_type": metadata.get("file_type", ""),
                "embedding_model": metadata.get(EMBED_MODEL_METADATA_KEY, ""),
                "extraction_method": metadata.get("extraction_method", ""),
            }
        # A short page means the end of the collection has been reached.
        if len(metadatas) < limit:
            break
        offset += limit
    return existing
1362
+
1363
+
1364
+ def source_needs_update(current: dict[str, str], existing: dict[str, str] | None) -> bool:
1365
+ if not existing:
1366
+ return True
1367
+ if existing.get("file_hash") != current["file_hash"]:
1368
+ return True
1369
+ if existing.get("embedding_model") != EMBED_MODEL_NAME:
1370
+ return True
1371
+ if current["file_type"] == "pdf" and existing.get("extraction_method") != PDF_EXTRACTION_METHOD:
1372
+ return True
1373
+ return False
1374
+
1375
+
1376
def incremental_update_index(
    raw_dir: Path,
    chroma_collection,
    storage_context: StorageContext,
    embed_model,
) -> bool:
    """Bring the vector collection in line with the files under *raw_dir*.

    Chunks belonging to files that disappeared from disk, or whose stored
    metadata no longer matches (see ``source_needs_update``), are deleted
    from the collection; the changed files are then reloaded, split and
    indexed again.

    Args:
        raw_dir: Directory holding the source files.
        chroma_collection: Collection that stores the existing chunks.
        storage_context: Storage context handed to ``VectorStoreIndex``.
        embed_model: Embedding model handed to ``VectorStoreIndex``.

    Returns:
        ``True`` when anything was removed or re-indexed, ``False`` when the
        collection was already up to date.
    """
    on_disk = list_current_sources(raw_dir)
    indexed = existing_source_metadata(chroma_collection)

    removed = sorted(set(indexed) - set(on_disk))
    stale = sorted(
        source
        for source, info in on_disk.items()
        if source_needs_update(info, indexed.get(source))
    )

    # Old chunks are dropped for removed and changed files alike; a failed
    # delete is only logged so the remaining files are still processed.
    for source in removed + stale:
        try:
            chroma_collection.delete(where={"source_file": source})
        except Exception as exc:
            logging.warning("Could not delete stale chunks for %s: %s", source, exc)

    if not stale:
        if removed:
            print(f"Removed {len(removed)} stale source(s) from collection '{COLLECTION_NAME}'.")
        return bool(removed)

    documents: List[Document] = [
        document
        for source in stale
        for document in load_source_file(Path(source))
    ]
    nodes = split_documents(documents)

    # Constructing the index is what writes the new nodes through the
    # storage context; the index object itself is not kept.
    VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        show_progress=True,
    )
    print(
        f"Incrementally indexed {len(nodes)} chunk(s) from {len(stale)} source file(s)."
    )
    return True
1418
+
1419
+
1420
+ def collection_needs_rebuild(chroma_collection) -> bool:
1421
  if chroma_collection.count() == 0:
1422
  return True
1423
 
 
1427
  return False
1428
 
1429
  for metadata in sample.get("metadatas") or []:
1430
+ if metadata.get(EMBED_MODEL_METADATA_KEY) != EMBED_MODEL_NAME:
1431
+ return True
1432
  if metadata.get("file_type") == "pdf":
1433
  return metadata.get("extraction_method") != PDF_EXTRACTION_METHOD
1434
 
 
1441
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
1442
 
1443
  load_dotenv()
1444
+ raw_dir = effective_raw_dir(raw_dir)
1445
  CHROMA_DB_DIR.mkdir(parents=True, exist_ok=True)
1446
 
1447
  db = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
 
1453
  pass
1454
 
1455
  chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
 
 
 
1456
 
1457
  vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
1458
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
 
1473
  f"Indexed {len(nodes)} chunks into collection '{COLLECTION_NAME}'")
1474
  return index
1475
 
1476
+ incremental_update_index(
1477
+ raw_dir=raw_dir,
1478
+ chroma_collection=chroma_collection,
1479
+ storage_context=storage_context,
1480
+ embed_model=embed_model,
1481
+ )
1482
+
1483
  print(
1484
  f"Loaded existing collection '{COLLECTION_NAME}' with {chroma_collection.count()} chunks.")
1485
  return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
 
1487
 
1488
  class QueryKnowledgeTool(Tool):
1489
  name = "query_knowledge"
1490
+ description = (
1491
+ "Searches the local options trading knowledge base. Use this for option "
1492
+ "concepts, volatility, Greeks, strategies, formulas, equation numbers, "
1493
+ "and citations from reference books."
1494
+ )
1495
  inputs = {'query': {'type': 'string',
1496
  'description': 'The search query to perform.'}}
1497
  output_type = "string"
1498
 
1499
  @staticmethod
1500
+ def format_results(results, max_chars: int = 800):
1501
  output = []
1502
 
1503
  for result in results:
 
1509
  formula_id = metadata.get("formula_id", "")
1510
  score = result.score
1511
  text = result.node.get_content()
1512
+ if len(text) > max_chars:
1513
+ text = f"{text[:max_chars].rstrip()}..."
1514
 
1515
  output.append(
1516
  f"source:{source}\n"
 
1519
  f"content_type:{content_type}\n"
1520
  f"formula_id:{formula_id or 'n/a'}\n"
1521
  f"score:{score:.4f}\n"
1522
+ f"vector_score:{metadata.get(VECTOR_METADATA_KEY, 'n/a')}\n"
1523
+ f"bm25_score:{metadata.get(BM25_METADATA_KEY, 'n/a')}\n"
1524
  f"content:{text}"
1525
  )
1526
 
1527
  return "\n\n---\n\n".join(output)
1528
 
1529
@staticmethod
def load_bm25_nodes(collection_name: str = COLLECTION_NAME) -> list[TextNode]:
    """Read every stored chunk of a collection back as ``TextNode`` objects.

    Args:
        collection_name: Name of the Chroma collection under
            ``CHROMA_DB_DIR``; defaults to ``COLLECTION_NAME``.

    Returns:
        One ``TextNode`` per stored document, in the order Chroma returns
        them. An empty list when the collection cannot be opened.
    """
    client = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
    try:
        collection = client.get_collection(collection_name)
    except Exception:
        # Best effort: any failure to open the collection yields no nodes.
        return []

    page_size = 500
    start = 0
    nodes: list[TextNode] = []
    while True:
        page = collection.get(
            limit=page_size,
            offset=start,
            include=["documents", "metadatas"],
        )
        texts = page.get("documents") or []
        metas = page.get("metadatas") or []
        ids = page.get("ids") or []
        if not texts:
            break

        for position, text in enumerate(texts):
            meta = dict(metas[position] or {})
            # Prefer the stored record id; fall back to the chunk id kept
            # in the metadata when no id is available at this position.
            if position < len(ids):
                node_id = ids[position]
            else:
                node_id = meta.get("chunk_id", "")
            nodes.append(TextNode(id_=node_id, text=text or "", metadata=meta))

        # A short page means the collection has been read completely.
        if len(texts) < page_size:
            break
        start += page_size

    return nodes
1562
+
1563
@staticmethod
def merge_results(
    vector_results: list[NodeWithScore],
    bm25_results: list[NodeWithScore],
    top_k: int,
) -> list[NodeWithScore]:
    """Fuse vector and BM25 rankings by summing reciprocal ranks.

    Each hit contributes ``1 / position`` (positions start at 1) for the
    list it appears in; a node found by both retrievers gets the sum. The
    retrievers' original scores are written into the node metadata under
    ``VECTOR_METADATA_KEY`` and ``BM25_METADATA_KEY``.

    Args:
        vector_results: Hits from the vector retriever, best first.
        bm25_results: Hits from the BM25 retriever, best first.
        top_k: Maximum number of fused results to return.

    Returns:
        Up to *top_k* results ordered by fused score, highest first.
    """
    fused: dict[str, NodeWithScore] = {}

    for position, hit in enumerate(vector_results, start=1):
        hit.node.metadata[VECTOR_METADATA_KEY] = hit.score
        fused[hit.node.node_id] = NodeWithScore(node=hit.node, score=1.0 / position)

    for position, hit in enumerate(bm25_results, start=1):
        key = hit.node.node_id
        hit.node.metadata[BM25_METADATA_KEY] = hit.score
        contribution = 1.0 / position
        if key not in fused:
            fused[key] = NodeWithScore(node=hit.node, score=contribution)
            continue
        entry = fused[key]
        entry.score = (entry.score or 0.0) + contribution
        # The node kept from the vector pass may be a different object, so
        # the BM25 score is recorded on it as well.
        entry.node.metadata[BM25_METADATA_KEY] = hit.score

    ranked = sorted(
        fused.values(),
        key=lambda item: item.score or float("-inf"),
        reverse=True,
    )
    return ranked[:top_k]
1595
+
1596
def __init__(
    self,
    max_results=20,
    top_k=5,
    use_reranker: Optional[bool] = None,
    use_hybrid: Optional[bool] = None,
    reranker_top_n: Optional[int] = None,
    reranker_model_name: Optional[str] = None,
    **kwargs,
):
    """Set up the retrievers (and optional reranker) behind the tool.

    Args:
        max_results: Size of the candidate pool used for BM25 retrieval and
            for vector retrieval when reranking is enabled.
        top_k: Number of results finally returned.
        use_reranker: Enable the cross-encoder reranker; when ``None`` the
            ``RAG_USE_RERANKER`` environment flag decides (default on).
        use_hybrid: Enable BM25 alongside vector search; when ``None`` the
            ``RAG_USE_HYBRID`` environment flag decides (default on).
        reranker_top_n: Number of results the reranker keeps; falls back to
            *top_k* when not given.
        reranker_model_name: Reranker model; falls back to
            ``RERANKER_MODEL_NAME`` when not given.
        **kwargs: Accepted but not used by this method.
    """
    super().__init__()
    self.max_results = max_results
    self.top_k = top_k

    # Explicit arguments win; the environment flags only fill in ``None``.
    if use_reranker is None:
        use_reranker = env_flag("RAG_USE_RERANKER", True)
    self.use_reranker = use_reranker
    if use_hybrid is None:
        use_hybrid = env_flag("RAG_USE_HYBRID", True)
    self.use_hybrid = use_hybrid

    self.reranker_top_n = reranker_top_n or top_k
    if self.use_reranker:
        self.reranker = CrossEncoderReranker(reranker_model_name or RERANKER_MODEL_NAME)
    else:
        self.reranker = None

    index = asyncio.run(build_index(rebuild=False))
    # With a reranker the vector retriever fetches the larger candidate
    # pool; without one it fetches only what will be returned.
    if self.use_reranker:
        retrieve_top_k = max(max_results, top_k)
    else:
        retrieve_top_k = top_k
    self.retriever = index.as_retriever(similarity_top_k=retrieve_top_k)

    if self.use_hybrid:
        self.bm25_retriever = BM25Retriever(self.load_bm25_nodes())
    else:
        self.bm25_retriever = None
1633
 
1634
def forward(self, query: str) -> str:
    """Search the knowledge base and return the formatted top results.

    Vector hits are retrieved first. When a BM25 retriever is configured
    its hits are fused in via ``merge_results``; when a reranker is
    configured the candidates are reranked. A reranker failure is logged
    and the un-reranked candidates are used instead.

    Args:
        query: The search query.

    Returns:
        The text produced by ``format_results`` for at most ``top_k`` hits.
    """
    dense_hits = self.retriever.retrieve(query)
    candidates = dense_hits

    if self.bm25_retriever:
        sparse_hits = self.bm25_retriever.retrieve(query, self.max_results)
        pool_size = max(self.max_results, self.top_k)
        candidates = self.merge_results(
            vector_results=dense_hits,
            bm25_results=sparse_hits,
            top_k=pool_size,
        )

    if self.reranker:
        try:
            candidates = self.reranker.rerank(
                query,
                candidates,
                top_n=self.reranker_top_n,
            )
        except Exception as exc:
            logging.warning("Reranker failed; falling back to vector ranking: %s", exc)
            candidates = candidates[:self.top_k]

    top_hits = candidates[:self.top_k]
    return QueryKnowledgeTool.format_results(top_hits)
1655
 
1656
 
1657
  if __name__ == "__main__":
tools/todo.md CHANGED
@@ -1,5 +1,437 @@
1
- 1. 添加reranker
2
- 2. 修改embedding模型
3
- 3. chunk策略粗糙,建议按照章节标题等行划分
4
- 4. 提升pdf提取能力
5
- 5. 完成load_txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OptionAgent 构建 TODO
2
+
3
+ 目标:构建一个能辅助期权交易研究的 agent。后续重点偏向波动率交易,而不是单纯方向性期权交易。它需要能查资料、查市场数据、分析 IV/RV/skew/term structure,生成波动率策略、回测策略,并根据回测结果迭代改。所有输出必须带假设、风险、数据来源和限制说明,不直接承诺收益。
4
+
5
+ ## 0. 当前已有能力
6
+
7
+ - [x] 本地 RAG:`QueryKnowledgeTool` 已接入主 agent,可查询期权书籍知识库。
8
+ - [x] PDF RAG 优化:支持 PyMuPDF 提取、公式块识别、章节 metadata、页码引用。
9
+ - [x] Hybrid search:dense retrieval + BM25 + reranker。
10
+ - [x] 本地 RAG eval:支持 local-options eval、BEIR/fiqa、Open RAGBench。
11
+ - [x] 基础市场价格工具:`query_market_asset` 可查询股票、指数、ETF、crypto、forex 的当前价格。
12
+ - [x] Web search/visit webpage 已接入主 agent,并通过工具初始化与网页解析 mock 测试。
13
+ - [x] 期权链、IV、Greeks、期限结构、偏斜等基础数据模块已完成 MVP,并通过 mock 测试。
14
+ - [x] 策略构建模块 MVP 已完成:支持 5 类波动率策略候选。
15
+ - [x] 回测模块 MVP 已完成:支持 payoff 情景分析、RV signal 历史回测 proxy、历史期权 bid/ask quote CSV 真实腿级回测。
16
+ - [x] QuantConnect/LEAN 期权回测模板已加入:可用于接入真实历史期权链、撮合、组合持仓和保证金模型。
17
+ - [x] 策略改进/优化闭环 MVP 已完成:支持参数扫描和 best vs baseline 对比。
18
+
19
+ ## 1. Research 模块:资料检索
20
+
21
+ ### 1.1 本地知识库 RAG
22
+
23
+ - [x] 查询本地书籍、PDF、Markdown 知识库。
24
+ - [x] 返回 source、page、section、content_type、score、excerpt。
25
+ - [x] 支持公式 chunk 和正文 chunk。
26
+ - [ ] 为期权策略类问题增加 query rewrite:
27
+ - 中文问题转英文检索词。
28
+ - 生成多个 query variants。
29
+ - 对公式/章节/策略/风险问题采用不同检索策略。
30
+ - [ ] 增加 citation policy:
31
+ - agent 最终回答必须引用 RAG 来源。
32
+ - 没查到资料时明确说“不确定/资料不足”。
33
+
34
+ ### 1.2 Web Search
35
+
36
+ - [x] 将 `DuckDuckGoSearchTool` 和 `VisitWebpageTool` 接入 `app.py` 的 tools。
37
+ - [x] 修复 `VisitWebpageTool` 中缺失的 `re` import。
38
+ - [x] 给 web search 加使用边界:
39
+ - 用于查最新市场事件、宏观事件、财报日期、公司公告、交易所规则。
40
+ - 本地书籍知识优先用 RAG,实时信息优先用 web。
41
+ - [x] Web 结果返回标题、URL 和摘要;发布时间后续按数据源能力增强。
42
+ - [ ] 对高风险市场信息做多源交叉验证。
43
+
44
+ ## 2. Market Data 模块:市场数据与期权数据
45
+
46
+ ### 2.1 标的行情
47
+
48
+ - [x] 当前价格、日内 OHLC、成交量。
49
+ - [x] 增加历史价格接口:
50
+ - 日线、小时线、分钟线。
51
+ - 支持 start/end/period/interval 参数。
52
+ - 输出用于回测的标准 DataFrame/JSON。
53
+ - [x] 增加 realized volatility 计算:
54
+ - 10D/20D/30D/60D realized vol。
55
+ - Parkinson/Garman-Klass 可选。
56
+
57
+ ### 2.2 期权链
58
+
59
+ - [x] 新增 `query_option_chain(symbol, expiration)` tool。
60
+ - [x] 返回 calls/puts:
61
+ - strike
62
+ - bid/ask/mid/last
63
+ - volume/open_interest
64
+ - implied_volatility
65
+ - in_the_money
66
+ - expiration
67
+ - days_to_expiration
68
+ - [x] 支持列出全部 expiration dates。
69
+ - [x] 对无流动性合约做标记:
70
+ - bid/ask 缺失
71
+ - spread 过宽
72
+ - volume/OI 过低
73
+
74
+ ### 2.3 Greeks 与波动率结构
75
+
76
+ - [x] 新增 Greeks 计算模块:
77
+ - delta/gamma/vega/theta/rho。
78
+ - 支持 Black-Scholes-Merton。
79
+ - 支持 dividend yield / risk-free rate 参数。
80
+ - [x] 新增 IV surface / skew 分析 MVP:
81
+ - ATM IV。
82
+ - 近似 put-call skew。
83
+ - ATM IV term structure slope。
84
+ - IV percentile / rank 后续在 Milestone 2 完成。
85
+ - [x] 新增 volatility trading 专用指标 MVP:
86
+ - realized volatility: 5D/10D/20D/30D/60D。
87
+ - implied vs realized spread。
88
+ - volatility risk premium: IV - RV。
89
+ - IV term structure slope。
90
+ - skew slope / put-call skew。
91
+ - vol-of-vol proxy 后续增强。
92
+ - event IV premium 后续增强。
93
+ - [ ] 对 yfinance IV 字段做 sanity check:
94
+ - IV 为 0、缺失、异常值时标记。
95
+ - bid/ask/mid 不合理时不参与策略构建。
96
+
97
+ ### 2.4 数据源抽象
98
+
99
+ - [x] 建立 `market_data/` 模块,避免所有行情逻辑堆在 `app.py`。
100
+ - [x] 设计统一 schema:
101
+ - `UnderlyingQuote`
102
+ - `OptionContract`
103
+ - `OptionChain`
104
+ - `VolSnapshot`
105
+ - [x] 第一阶段可用 yfinance,后续可接 Polygon/Tradier/IBKR。
106
+
107
+ ## 3. Strategy Builder 模块:策略构建
108
+
109
+ 后续策略构建以波动率观点为核心,方向观点为辅助变量。
110
+
111
+ ### 3.1 用户意图解析
112
+
113
+ - [ ] 解析用户输入:
114
+ - 标的 symbol。
115
+ - 波动率观点:long vol / short vol / vol mean reversion / event vol / skew trade。
116
+ - 方向观点:bullish/bearish/neutral/range-bound。
117
+ - 时间周期。
118
+ - 风险承受。
119
+ - 账户约束/最大亏损。
120
+ - 是否允许裸卖。
121
+ - [ ] 如果关键信息缺失,agent 需要追问,而不是直接生成交易。
122
+
123
+ ### 3.2 策略候选生成
124
+
125
+ - [ ] 支持基础策略模板:
126
+ - long call / long put
127
+ - covered call
128
+ - cash-secured put
129
+ - vertical spread
130
+ - calendar spread
131
+ - straddle / strangle
132
+ - iron condor
133
+ - collar
134
+ - [x] 支持波动率交易策略模板 MVP:
135
+ - long straddle / long strangle
136
+ - short straddle / short strangle
137
+ - delta-hedged straddle
138
+ - calendar spread
139
+ - diagonal spread
140
+ - variance-style option basket approximation
141
+ - skew trade: risk reversal / put spread vs call spread
142
+ - term structure trade: near-term short vol + longer-term long vol
143
+ - [x] 每个策略输出:
144
+ - legs
145
+ - expiration
146
+ - strike
147
+ - net debit/credit
148
+ - max profit
149
+ - max loss
150
+ - breakeven
151
+ - margin estimate
152
+ - Greeks exposure
153
+ - liquidity warnings
154
+
155
+ ### 3.3 策略筛选规则
156
+
157
+ - [ ] 根据市场状态筛选策略:
158
+ - 高 IV:偏向 credit spread / iron condor / covered call。
159
+ - 低 IV:偏向 long options / calendar / debit spread。
160
+ - 趋势观点强:vertical spread / directional options。
161
+ - 震荡观点:short premium / condor。
162
+ - [ ] 根据波动率状态筛选策略:
163
+ - IV 明显高于 RV:考虑 short vol,但必须检查事件风险和尾部风险。
164
+ - IV 明显低于 RV:考虑 long vol,但必须检查 theta bleed。
165
+ - 近月 IV 异常高:考虑 calendar/diagonal 或 event vol 策略。
166
+ - skew 极端:考虑 risk reversal、put spread、skew mean reversion。
167
+ - term structure 陡峭:考虑跨期限 vol trade。
168
+ - [ ] 加入风险约束:
169
+ - max loss 不超过用户预算。
170
+ - spread 不能过宽。
171
+ - OI/volume 低的合约排除。
172
+ - 禁止默认裸卖期权。
173
+ - [ ] 输出多个候选策略并排序,而不是只给一个。
174
+
175
+ ### 3.4 策略解释
176
+
177
+ - [ ] 每个策略必须解释:
178
+ - 为什么适合当前市场。
179
+ - 主要盈利条件。
180
+ - 主要亏损场景。
181
+ - Greeks 风险。
182
+ - IV crush / event risk。
183
+ - Vega / gamma / theta trade-off。
184
+ - Long vol 或 short vol 的核心假设。
185
+ - 流动性和滑点风险。
186
+ - [ ] 必须引用 RAG/web/market data 来源。
187
+
188
+ ## 3.5 Volatility Research 模块:波动率交易研究
189
+
190
+ - [x] 构建 volatility dashboard MVP:
191
+ - current IV vs historical IV range。
192
+ - IV percentile / rank。
193
+ - realized volatility windows。
194
+ - IV-RV spread。
195
+ - term structure chart。
196
+ - skew chart。
197
+ - [x] 识别波动率 regime MVP:
198
+ - low vol regime。
199
+ - high vol regime。
200
+ - vol expansion。
201
+ - vol compression。
202
+ - event-driven vol。
203
+ - [ ] 事件模块:
204
+ - earnings date。
205
+ - CPI/FOMC/NFP 等宏观事件。
206
+ - event implied move。
207
+ - post-event IV crush risk。
208
+ - [x] 输出波动率观点 MVP:
209
+ - long vol / short vol / neutral。
210
+ - confidence。
211
+ - key assumptions。
212
+ - invalidation conditions。
213
+
214
+ ## 4. Backtesting 模块:回测与情景分析
215
+
216
+ ### 4.1 第一阶段:Payoff 与情景分析
217
+
218
+ - [x] 新增 `backtest/` 模块。
219
+ - [x] 实现到期 payoff 情景表:
220
+ - 不同标的价格下 PnL。
221
+ - breakeven。
222
+ - max loss/max profit。
223
+ - [x] 实现情景分析 MVP:
224
+ - underlying price shock。
225
+ - IV up/down。
226
+ - days passed / theta decay。
227
+ - Greeks approximation。
228
+ - [x] 增加波动率情景 MVP:
229
+ - IV crush。
230
+ - IV expansion。
231
+ - realized move vs implied move。
232
+ - gamma scalp breakeven move。
233
+ - delta-hedging frequency sensitivity。
234
+ - [x] 输出表格和 JSON,方便 agent 总结。
235
+
236
+ ### 4.2 第二阶段:历史回测
237
+
238
+ - [x] 获取历史 underlying price。
239
+ - [ ] 获取或近似历史 IV:
240
+ - 优先真实历史 option chain。
241
+ - 没有数据时用 realized vol 或当前 IV 做近似,并明确标注限制。
242
+ - [x] 支持真实历史期权 quote CSV 输入:
243
+ - 必需字段:date、underlying_symbol、underlying_price、contract_symbol、option_type、expiration、strike、bid、ask。
244
+ - 可选字段:mid、delta、gamma、theta、vega、implied_volatility、volume、open_interest。
245
+ - 当前实现可做 ATM long straddle 的真实开仓/平仓腿级 PnL。
246
+ - 注意:yfinance 不能可靠提供历史 option chain,严肃回测需要 Polygon/ORATS/OptionMetrics/QuantConnect 等数据源。
247
+ - [x] 设计 entry/exit rules MVP:
248
+ - 入场条件。
249
+ - 出场条件。
250
+ - DTE 管理。
251
+ - 固定 holding period。
252
+ - 固定 entry frequency。
253
+ - [ ] 设计高级 entry/exit rules:
254
+ - 止盈止损。
255
+ - rolling 规则。
256
+ - [x] 为波动率策略增加专门规则 MVP:
257
+ - IV percentile 入场阈值。
258
+ - IV-RV spread 入场阈值。
259
+ - earnings 前后入场/退出。
260
+ - DTE bucket。
261
+ - delta hedge 频率。
262
+ - gamma scalp rule。
263
+ - [x] 计算指标 MVP:
264
+ - total PnL
265
+ - max drawdown
266
+ - win rate
267
+ - avg win/loss
268
+ - [ ] 计算高级指标:
269
+ - total return
270
+ - CAGR
271
+ - Sharpe/Sortino
272
+ - exposure time
273
+ - tail loss
274
+ - realized vs implied PnL attribution
275
+ - theta PnL
276
+ - vega PnL
277
+ - gamma scalping PnL
278
+
279
+ ### 4.3 第三阶段:组合级回测
280
+
281
+ - [x] 支持单策略多笔交易 MVP。
282
+ - [ ] 支持多策略/多标的组合交易。
283
+ - [ ] 支持现金、保证金、仓位占用。
284
+ - [x] 支持交易成本、bid/ask slippage MVP。
285
+ - [ ] 支持 assignment / early exercise 风险近似。
286
+ - [x] 生成交易日志 MVP。
287
+ - [ ] 生成风险归因。
288
+
289
+ ## 5. Strategy Optimizer 模块:回测后改进
290
+
291
+ - [x] 根据回测结果自动提出改进 MVP:
292
+ - 调整 expiration。
293
+ - 调整 strike/delta。
294
+ - 调整止盈止损。
295
+ - 限制入场市场环境。
296
+ - 避开财报/宏观事件。
297
+ - [ ] 对波动率策略提出专门改进:
298
+ - 调整 long/short vol 入场 IV percentile。
299
+ - 调整 straddle/strangle delta。
300
+ - 调整 delta hedge 频率。
301
+ - 调整 DTE bucket。
302
+ - 避开或利用 event vol。
303
+ - 加入 tail hedge。
304
+ - [x] 支持参数扫描 MVP:
305
+ - DTE range。
306
+ - delta target。
307
+ - width。
308
+ - profit target。
309
+ - stop loss。
310
+ - IV percentile threshold。
311
+ - IV-RV spread threshold。
312
+ - hedge frequency。
313
+ - [x] 输出对比表:
314
+ - baseline strategy
315
+ - improved strategy
316
+ - metrics delta
317
+ - trade-off
318
+ - [ ] 防止过拟合:
319
+ - train/test split。
320
+ - walk-forward analysis。
321
+ - out-of-sample period。
322
+
323
+ ## 6. Agent Orchestrator 模块:完整工作流
324
+
325
+ - [ ] 定义标准工作流:
326
+
327
+ ```text
328
+ 用户提出目标
329
+ -> 解析意图和约束
330
+ -> 查询 RAG/web 背景资料
331
+ -> 查询标的行情和期权链
332
+ -> 分析 IV/Greeks/流动性
333
+ -> 生成多个策略候选
334
+ -> 初步风险筛选
335
+ -> 回测/情景分析
336
+ -> 改进策略
337
+ -> 输出最终报告
338
+ ```
339
+
340
+ - [x] 增加 agent prompt 约束:
341
+ - 不承诺收益。
342
+ - 不给无风险建议。
343
+ - 必须说明假设和数据限制。
344
+ - 必须输出最大亏损。
345
+ - 必须说明流动性、滑点、IV、事件风险。
346
+ - [x] 增加结构化输出格式:
347
+ - `market_context`
348
+ - `strategy_candidates`
349
+ - `selected_strategy`
350
+ - `backtest_summary`
351
+ - `risk_warnings`
352
+ - `sources`
353
+
354
+ ## 7. UI / Report 模块
355
+
356
+ - [ ] Gradio UI 支持输入:
357
+ - symbol
358
+ - outlook
359
+ - time horizon
360
+ - risk budget
361
+ - strategy preference
362
+ - [ ] 展示:
363
+ - 策略 legs 表格。
364
+ - payoff 图。
365
+ - Greeks 表格。
366
+ - 回测指标。
367
+ - 引用来源。
368
+ - [ ] 支持导出 Markdown/HTML report。
369
+
370
+ ## 8. Evaluation 模块
371
+
372
+ - [x] RAG retrieval eval。
373
+ - [x] Market data tool 单元测试:已覆盖 RV、Greeks、历史价格 tool、期权链 tool、volatility snapshot mock。
374
+ - [x] Strategy builder 单元测试:
375
+ - payoff 计算正确。
376
+ - max loss/max profit 正确。
377
+ - breakeven 正确。
378
+ - [x] Backtest engine 测试:
379
+ - 单腿/多腿 payoff。
380
+ - 交易成本。
381
+ - rolling/exit rule。
382
+ - [ ] Agent end-to-end 测试:
383
+ - 给定 symbol + outlook,能完整输出策略、风险和来源。
384
+
385
+ ## 9. 推荐实现顺序
386
+
387
+ ### Milestone 1:Research + Market Data 可用
388
+
389
+ - [x] 接入 web search 和 visit webpage 到主 agent。
390
+ - [x] 修复 `VisitWebpageTool`。
391
+ - [x] 新增 option chain 查询工具。
392
+ - [x] 新增 Greeks/IV 基础计算。
393
+ - [x] 新增 IV/RV/skew/term structure 基础分析。
394
+ - [x] 将行情代码从 `app.py` 拆到独立模块。
395
+
396
+ ### Milestone 2:Volatility Dashboard MVP
397
+
398
+ - [x] 计算 realized volatility windows。
399
+ - [x] 计算 ATM IV、IV rank/percentile proxy。
400
+ - [x] 计算 IV-RV spread。
401
+ - [x] 计算 skew 和 term structure。
402
+ - [x] 输出 volatility regime 判断。
403
+
404
+ ### Milestone 3:波动率策略生成 MVP
405
+
406
+ - [x] 定义策略 leg schema。
407
+ - [x] 实现 5 个优先策略模板:
408
+ - long straddle
409
+ - long strangle
410
+ - short straddle
411
+ - calendar spread
412
+ - iron condor
413
+ - [x] 实现 payoff/max loss/breakeven 计算。
414
+ - [x] 根据 volatility regime 和 IV/RV 状态生成候选策略 MVP。
415
+
416
+ ### Milestone 4:回测 MVP
417
+
418
+ - [x] 实现到期 payoff 和情景分析。
419
+ - [x] 实现历史 underlying 回测 MVP。
420
+ - [x] 实现 IV/RV 条件入场回测 MVP。
421
+ - [x] 实现历史期权 quote CSV 的真实 long straddle 回测 MVP。
422
+ - [x] 添加 QuantConnect/LEAN ATM long straddle 回测模板。
423
+ - [x] 实现 straddle/strangle 的 delta hedge 情景分析 proxy。
424
+ - [x] 输出核心指标和交易日志。
425
+
426
+ ### Milestone 5:优化闭环
427
+
428
+ - [x] 参数扫描。
429
+ - [x] 策略改进建议 MVP。
430
+ - [x] 对比报告。
431
+ - [ ] 防过拟合验证。
432
+
433
+ ### Milestone 6:完整 Agent 工作流
434
+
435
+ - [x] 统一 prompt 和输出格式。
436
+ - [ ] Gradio UI 展示策略、图表和回测。
437
+ - [ ] 端到端测试。
tools/visit_webpage.py CHANGED
@@ -1,15 +1,20 @@
1
- from typing import Any, Optional
 
2
  from smolagents.tools import Tool
3
- import requests
4
- import markdownify
5
- import smolagents
6
 
7
  class VisitWebpageTool(Tool):
8
  name = "visit_webpage"
9
- description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
 
 
 
 
10
  inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
11
  output_type = "string"
12
 
 
 
 
13
  def forward(self, url: str) -> str:
14
  try:
15
  import requests
@@ -40,6 +45,3 @@ class VisitWebpageTool(Tool):
40
  return f"Error fetching the webpage: {str(e)}"
41
  except Exception as e:
42
  return f"An unexpected error occurred: {str(e)}"
43
-
44
- def __init__(self, *args, **kwargs):
45
- self.is_initialized = False
 
1
+ import re
2
+
3
  from smolagents.tools import Tool
 
 
 
4
 
5
  class VisitWebpageTool(Tool):
6
  name = "visit_webpage"
7
+ description = (
8
+ "Visits a webpage at the given URL and returns its readable Markdown content. "
9
+ "Use this after web_search when current market news, company events, "
10
+ "earnings information, exchange rules, or source verification is needed."
11
+ )
12
  inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
13
  output_type = "string"
14
 
15
+ def __init__(self, *args, **kwargs):
16
+ super().__init__(*args, **kwargs)
17
+
18
  def forward(self, url: str) -> str:
19
  try:
20
  import requests
 
45
  return f"Error fetching the webpage: {str(e)}"
46
  except Exception as e:
47
  return f"An unexpected error occurred: {str(e)}"
 
 
 
tools/web_search.py CHANGED
@@ -1,10 +1,12 @@
1
- from typing import Any, Optional
2
  from smolagents.tools import Tool
3
- import duckduckgo_search
4
 
5
  class DuckDuckGoSearchTool(Tool):
6
  name = "web_search"
7
- description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
 
 
 
 
8
  inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
9
  output_type = "string"
10
 
 
 
1
  from smolagents.tools import Tool
 
2
 
3
  class DuckDuckGoSearchTool(Tool):
4
  name = "web_search"
5
+ description = (
6
+ "Searches the web for current information. Use this for recent market events, "
7
+ "earnings dates, company announcements, macro events, current rules, or "
8
+ "source discovery. Prefer the local knowledge base for stable options concepts."
9
+ )
10
  inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
11
  output_type = "string"
12
 
uv.lock CHANGED
@@ -2,9 +2,15 @@ version = 1
2
  revision = 3
3
  requires-python = ">=3.12"
4
  resolution-markers = [
5
- "python_full_version >= '3.14'",
6
- "python_full_version == '3.13.*'",
7
- "python_full_version < '3.13'",
 
 
 
 
 
 
8
  ]
9
 
10
  [[package]]
@@ -246,6 +252,19 @@ wheels = [
246
  { url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
247
  ]
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  [[package]]
250
  name = "build"
251
  version = "1.5.0"
@@ -520,7 +539,7 @@ name = "cuda-bindings"
520
  version = "12.9.4"
521
  source = { registry = "https://pypi.org/simple" }
522
  dependencies = [
523
- { name = "cuda-pathfinder" },
524
  ]
525
  wheels = [
526
  { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
@@ -538,6 +557,39 @@ wheels = [
538
  { url = "https://files.pythonhosted.org/packages/11/d0/c177e29701cf1d3008d7d2b16b5fc626592ce13bd535f8795c5f57187e0e/cuda_pathfinder-1.5.4-py3-none-any.whl", hash = "sha256:9563d3175ce1828531acf4b94e1c1c7d67208c347ca002493e2654878b26f4b7", size = 51657, upload-time = "2026-04-27T22:42:07.712Z" },
539
  ]
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  [[package]]
542
  name = "dataclasses-json"
543
  version = "0.6.7"
@@ -660,10 +712,12 @@ dependencies = [
660
  { name = "llama-index-core" },
661
  { name = "llama-index-embeddings-huggingface" },
662
  { name = "llama-index-vector-stores-chroma" },
 
663
  { name = "pymupdf" },
664
  { name = "pypdf" },
665
  { name = "tokenizers" },
666
  { name = "transformers" },
 
667
  ]
668
 
669
  [package.metadata]
@@ -674,10 +728,12 @@ requires-dist = [
674
  { name = "llama-index-core", specifier = ">=0.14.0" },
675
  { name = "llama-index-embeddings-huggingface", specifier = ">=0.6.0" },
676
  { name = "llama-index-vector-stores-chroma", specifier = ">=0.5.0" },
 
677
  { name = "pymupdf", specifier = ">=1.27.2.3" },
678
  { name = "pypdf", specifier = ">=6.0.0" },
679
  { name = "tokenizers", specifier = ">=0.22.0,<=0.23.0" },
680
  { name = "transformers", specifier = "<5" },
 
681
  ]
682
 
683
  [[package]]
@@ -1675,6 +1731,15 @@ wheels = [
1675
  { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
1676
  ]
1677
 
 
 
 
 
 
 
 
 
 
1678
  [[package]]
1679
  name = "mypy-extensions"
1680
  version = "1.1.0"
@@ -1815,7 +1880,7 @@ name = "nvidia-cudnn-cu12"
1815
  version = "9.10.2.21"
1816
  source = { registry = "https://pypi.org/simple" }
1817
  dependencies = [
1818
- { name = "nvidia-cublas-cu12" },
1819
  ]
1820
  wheels = [
1821
  { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
@@ -1826,7 +1891,7 @@ name = "nvidia-cufft-cu12"
1826
  version = "11.3.3.83"
1827
  source = { registry = "https://pypi.org/simple" }
1828
  dependencies = [
1829
- { name = "nvidia-nvjitlink-cu12" },
1830
  ]
1831
  wheels = [
1832
  { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
@@ -1853,9 +1918,9 @@ name = "nvidia-cusolver-cu12"
1853
  version = "11.7.3.90"
1854
  source = { registry = "https://pypi.org/simple" }
1855
  dependencies = [
1856
- { name = "nvidia-cublas-cu12" },
1857
- { name = "nvidia-cusparse-cu12" },
1858
- { name = "nvidia-nvjitlink-cu12" },
1859
  ]
1860
  wheels = [
1861
  { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
@@ -1866,7 +1931,7 @@ name = "nvidia-cusparse-cu12"
1866
  version = "12.5.8.93"
1867
  source = { registry = "https://pypi.org/simple" }
1868
  dependencies = [
1869
- { name = "nvidia-nvjitlink-cu12" },
1870
  ]
1871
  wheels = [
1872
  { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
@@ -2124,6 +2189,67 @@ wheels = [
2124
  { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
2125
  ]
2126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2127
  [[package]]
2128
  name = "pillow"
2129
  version = "12.2.0"
@@ -2636,6 +2762,15 @@ wheels = [
2636
  { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
2637
  ]
2638
 
 
 
 
 
 
 
 
 
 
2639
  [[package]]
2640
  name = "pyyaml"
2641
  version = "6.0.3"
@@ -3088,6 +3223,15 @@ wheels = [
3088
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
3089
  ]
3090
 
 
 
 
 
 
 
 
 
 
3091
  [[package]]
3092
  name = "sqlalchemy"
3093
  version = "2.0.49"
@@ -3405,6 +3549,15 @@ wheels = [
3405
  { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
3406
  ]
3407
 
 
 
 
 
 
 
 
 
 
3408
  [[package]]
3409
  name = "urllib3"
3410
  version = "2.7.0"
@@ -3778,6 +3931,28 @@ wheels = [
3778
  { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" },
3779
  ]
3780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3781
  [[package]]
3782
  name = "zipp"
3783
  version = "3.23.1"
 
2
  revision = 3
3
  requires-python = ">=3.12"
4
  resolution-markers = [
5
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
6
+ "python_full_version >= '3.14' and sys_platform == 'emscripten'",
7
+ "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
8
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
9
+ "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
10
+ "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
11
+ "python_full_version < '3.13' and sys_platform == 'win32'",
12
+ "python_full_version < '3.13' and sys_platform == 'emscripten'",
13
+ "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
14
  ]
15
 
16
  [[package]]
 
252
  { url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
253
  ]
254
 
255
+ [[package]]
256
+ name = "beautifulsoup4"
257
+ version = "4.14.3"
258
+ source = { registry = "https://pypi.org/simple" }
259
+ dependencies = [
260
+ { name = "soupsieve" },
261
+ { name = "typing-extensions" },
262
+ ]
263
+ sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
264
+ wheels = [
265
+ { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
266
+ ]
267
+
268
  [[package]]
269
  name = "build"
270
  version = "1.5.0"
 
539
  version = "12.9.4"
540
  source = { registry = "https://pypi.org/simple" }
541
  dependencies = [
542
+ { name = "cuda-pathfinder", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
543
  ]
544
  wheels = [
545
  { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
 
557
  { url = "https://files.pythonhosted.org/packages/11/d0/c177e29701cf1d3008d7d2b16b5fc626592ce13bd535f8795c5f57187e0e/cuda_pathfinder-1.5.4-py3-none-any.whl", hash = "sha256:9563d3175ce1828531acf4b94e1c1c7d67208c347ca002493e2654878b26f4b7", size = 51657, upload-time = "2026-04-27T22:42:07.712Z" },
558
  ]
559
 
560
+ [[package]]
561
+ name = "curl-cffi"
562
+ version = "0.15.0"
563
+ source = { registry = "https://pypi.org/simple" }
564
+ dependencies = [
565
+ { name = "certifi" },
566
+ { name = "cffi" },
567
+ { name = "rich" },
568
+ ]
569
+ sdist = { url = "https://files.pythonhosted.org/packages/48/5b/89fcfebd3e5e85134147ac99e9f2b2271165fd4d71984fc65da5f17819b7/curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded", size = 196437, upload-time = "2026-04-03T11:12:31.525Z" }
570
+ wheels = [
571
+ { url = "https://files.pythonhosted.org/packages/5e/42/54ddd442c795f30ce5dd4e49f87ce77505958d3777cd96a91567a3975d2a/curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28", size = 2795267, upload-time = "2026-04-03T11:11:46.48Z" },
572
+ { url = "https://files.pythonhosted.org/packages/83/2d/3915e238579b3c5a92cead5c79130c3b8d20caaba7616cc4d894650e1d6b/curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b", size = 2573544, upload-time = "2026-04-03T11:11:47.951Z" },
573
+ { url = "https://files.pythonhosted.org/packages/2a/b3/9d2f1057749a1b07ba1989db3c1503ce8bed998310bae9aea2c43aa64f20/curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527", size = 10515369, upload-time = "2026-04-03T11:11:50.126Z" },
574
+ { url = "https://files.pythonhosted.org/packages/b5/1d/6d10dded5ce3fd8157e558ebd97d09e551b77a62cdc1c31e93d0a633cee5/curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3", size = 10160045, upload-time = "2026-04-03T11:11:52.664Z" },
575
+ { url = "https://files.pythonhosted.org/packages/5c/12/c70b835487ace3b9ba1502631912e3440082b8ae3a162f60b59cb0b6444d/curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc", size = 11090433, upload-time = "2026-04-03T11:11:55.049Z" },
576
+ { url = "https://files.pythonhosted.org/packages/ea/0d/78edcc4f71934225db99df68197a107386d59080742fc7bf6bb4d007924f/curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a", size = 10479178, upload-time = "2026-04-03T11:11:57.685Z" },
577
+ { url = "https://files.pythonhosted.org/packages/5b/84/1e101c1acb1ea2f0b4992f5c3024f596d8e21db0d53540b9d583f673c4e7/curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b", size = 10317051, upload-time = "2026-04-03T11:12:00.295Z" },
578
+ { url = "https://files.pythonhosted.org/packages/28/42/8ef236b22a6c23d096c85a1dc507efe37bfdfc7a2f8a4b34efb590197369/curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13", size = 11299660, upload-time = "2026-04-03T11:12:02.791Z" },
579
+ { url = "https://files.pythonhosted.org/packages/1d/01/56aeb055d962da87a1be0d74c6c644e251c7e88129b5471dc44ac724e678/curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61", size = 11945049, upload-time = "2026-04-03T11:12:05.912Z" },
580
+ { url = "https://files.pythonhosted.org/packages/d8/8c/2abf99a38d6340d66cf0557e0c750ef3f8883dfc5d450087e01c85861343/curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572", size = 1661649, upload-time = "2026-04-03T11:12:07.948Z" },
581
+ { url = "https://files.pythonhosted.org/packages/3d/39/dfd54f2240d3a9b96d77bacc62b97813b35e2aa8ecf5cd5013c683f1ba96/curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d", size = 1410741, upload-time = "2026-04-03T11:12:10.073Z" },
582
+ { url = "https://files.pythonhosted.org/packages/19/6a/c24df8a4fc22fa84070dcd94abeba43c15e08cc09e35869565c0bad196fd/curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92", size = 7190427, upload-time = "2026-04-03T11:12:12.142Z" },
583
+ { url = "https://files.pythonhosted.org/packages/11/56/132225cb3491d07cc6adcce5fe395e059bde87c68cff1ef87a31c88c7819/curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d", size = 2795723, upload-time = "2026-04-03T11:12:13.668Z" },
584
+ { url = "https://files.pythonhosted.org/packages/07/8f/f4f83cd303bef7e8f1749512e5dd157e7e5d08b0a36c8211f9640a2757bf/curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3", size = 2573739, upload-time = "2026-04-03T11:12:15.08Z" },
585
+ { url = "https://files.pythonhosted.org/packages/e8/5c/643d65c7fc9acd742876aa55c2d7823c438cb7665810acd2e66c9976c4d9/curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5", size = 10521046, upload-time = "2026-04-03T11:12:17.034Z" },
586
+ { url = "https://files.pythonhosted.org/packages/7f/0b/9b8037113c93f4c5323096163471fa7c35c7676c3f608eeaf1287cd99d58/curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805", size = 11096115, upload-time = "2026-04-03T11:12:19.694Z" },
587
+ { url = "https://files.pythonhosted.org/packages/5f/96/fff2fcbd924ef4042e0d67379f751a8a4e3186a91e75e35a4cf218b306ee/curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce", size = 11305346, upload-time = "2026-04-03T11:12:22.151Z" },
588
+ { url = "https://files.pythonhosted.org/packages/53/1b/304b253a45ab28691c8c5e8cca1e6cbb9cf8e46dfceae4648dd536f75e73/curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f", size = 11949834, upload-time = "2026-04-03T11:12:24.986Z" },
589
+ { url = "https://files.pythonhosted.org/packages/5a/ff/4723d92f08259c707a974aba27a08d0a822b9555e35ca581bf18d055a364/curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa", size = 1702771, upload-time = "2026-04-03T11:12:28.201Z" },
590
+ { url = "https://files.pythonhosted.org/packages/59/8c/36bbe06d66fa2b765e4a07199f643a59a9cd1a754207a96335402a9520f4/curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3", size = 1466312, upload-time = "2026-04-03T11:12:30.054Z" },
591
+ ]
592
+
593
  [[package]]
594
  name = "dataclasses-json"
595
  version = "0.6.7"
 
712
  { name = "llama-index-core" },
713
  { name = "llama-index-embeddings-huggingface" },
714
  { name = "llama-index-vector-stores-chroma" },
715
+ { name = "pandas" },
716
  { name = "pymupdf" },
717
  { name = "pypdf" },
718
  { name = "tokenizers" },
719
  { name = "transformers" },
720
+ { name = "yfinance" },
721
  ]
722
 
723
  [package.metadata]
 
728
  { name = "llama-index-core", specifier = ">=0.14.0" },
729
  { name = "llama-index-embeddings-huggingface", specifier = ">=0.6.0" },
730
  { name = "llama-index-vector-stores-chroma", specifier = ">=0.5.0" },
731
+ { name = "pandas", specifier = ">=2.0.0" },
732
  { name = "pymupdf", specifier = ">=1.27.2.3" },
733
  { name = "pypdf", specifier = ">=6.0.0" },
734
  { name = "tokenizers", specifier = ">=0.22.0,<=0.23.0" },
735
  { name = "transformers", specifier = "<5" },
736
+ { name = "yfinance", specifier = ">=0.2.0" },
737
  ]
738
 
739
  [[package]]
 
1731
  { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
1732
  ]
1733
 
1734
+ [[package]]
1735
+ name = "multitasking"
1736
+ version = "0.0.13"
1737
+ source = { registry = "https://pypi.org/simple" }
1738
+ sdist = { url = "https://files.pythonhosted.org/packages/be/c3/ac2cc9307fb15cc28ed6d4a9266b216c83ee7fe64299f0264047982bce88/multitasking-0.0.13.tar.gz", hash = "sha256:d896b5df877c9ca5eeddbf0e5994124694d6cb535aba698fb23344c7025155a1", size = 20585, upload-time = "2026-04-23T12:14:15.049Z" }
1739
+ wheels = [
1740
+ { url = "https://files.pythonhosted.org/packages/d3/1c/24dbf69b247f287401c904a396233a43c89fd4fb9b7cd2e50e430e9cd57c/multitasking-0.0.13-py3-none-any.whl", hash = "sha256:ec9243af140c67bfe52dc98d7173c294512735a88e8425c458b250db99dc2b48", size = 16380, upload-time = "2026-04-23T12:14:13.776Z" },
1741
+ ]
1742
+
1743
  [[package]]
1744
  name = "mypy-extensions"
1745
  version = "1.1.0"
 
1880
  version = "9.10.2.21"
1881
  source = { registry = "https://pypi.org/simple" }
1882
  dependencies = [
1883
+ { name = "nvidia-cublas-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1884
  ]
1885
  wheels = [
1886
  { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
 
1891
  version = "11.3.3.83"
1892
  source = { registry = "https://pypi.org/simple" }
1893
  dependencies = [
1894
+ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1895
  ]
1896
  wheels = [
1897
  { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
 
1918
  version = "11.7.3.90"
1919
  source = { registry = "https://pypi.org/simple" }
1920
  dependencies = [
1921
+ { name = "nvidia-cublas-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1922
+ { name = "nvidia-cusparse-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1923
+ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1924
  ]
1925
  wheels = [
1926
  { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
 
1931
  version = "12.5.8.93"
1932
  source = { registry = "https://pypi.org/simple" }
1933
  dependencies = [
1934
+ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
1935
  ]
1936
  wheels = [
1937
  { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
 
2189
  { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
2190
  ]
2191
 
2192
+ [[package]]
2193
+ name = "pandas"
2194
+ version = "3.0.3"
2195
+ source = { registry = "https://pypi.org/simple" }
2196
+ dependencies = [
2197
+ { name = "numpy" },
2198
+ { name = "python-dateutil" },
2199
+ { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
2200
+ ]
2201
+ sdist = { url = "https://files.pythonhosted.org/packages/f8/87/4341c6252d1c47b08768c3d25ac487362bf403f0313ddae4a2a26c9b1b4c/pandas-3.0.3.tar.gz", hash = "sha256:696a4a00a2a2a35d4e5deb3fc946641b96c944f02230e4f76137fe35d806c4fc", size = 4651414, upload-time = "2026-05-11T18:54:29.21Z" }
2202
+ wheels = [
2203
+ { url = "https://files.pythonhosted.org/packages/24/f1/392f8c5bfc16f66a0d2d41561c01627c228fe7ed2a0d056ef11315042570/pandas-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fed2ff7fd9779120e388e285fc029bd5cf9490cdd2e4166a9ee22c0e49a9ab09", size = 10357846, upload-time = "2026-05-11T18:52:36.143Z" },
2204
+ { url = "https://files.pythonhosted.org/packages/cf/3d/b16412745651e855f357e5e66930248688378853a6e2698a214e331fba1f/pandas-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b168fc218fd80a6cbdbdbc1a97ddc7889ed057d7eb45f50d866ceab5f39904c4", size = 9899550, upload-time = "2026-05-11T18:52:38.976Z" },
2205
+ { url = "https://files.pythonhosted.org/packages/31/a8/fa2535168fffcedf67f4f6de28d2dd903a747ca7c8ea6989451aaeb3a92f/pandas-3.0.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0383c72c75cdcca61a9e116e611143902dbfd08bff356829c2f6d1cf40a9ca8c", size = 10412965, upload-time = "2026-05-11T18:52:41.915Z" },
2206
+ { url = "https://files.pythonhosted.org/packages/65/b6/09b01cdbc15224e2850365192d17b7bdebb8bdbd8780ed221fcdf0d9a515/pandas-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6dc0b3fd2169c9157deed50b4d519553a3655c8c6a96027136d654592be973a9", size = 10894600, upload-time = "2026-05-11T18:52:45.02Z" },
2207
+ { url = "https://files.pythonhosted.org/packages/c9/a4/2eb28f2fccb4ced4a2c79ab2a5dee9ade1ebf44922ebad6fea158c9f95d4/pandas-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e65d5407dc0b394f509699650e4a2ec01c0514f21850f453fa60f3be79a5dbf", size = 11422824, upload-time = "2026-05-11T18:52:48.058Z" },
2208
+ { url = "https://files.pythonhosted.org/packages/f8/45/830bb57f533a4604b355e07edcb8ea18cf88b5f94e5fca92f27052d7c597/pandas-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8894dc474d648fe7b6ff0ca9b0bd73950d19952bc1a6534540762c5d79d305c", size = 11950889, upload-time = "2026-05-11T18:52:50.905Z" },
2209
+ { url = "https://files.pythonhosted.org/packages/b9/c5/fc1b368f303087d20e8c9bf3d6ceb186263cfac0ade735cd938538bea839/pandas-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:c7be265b62cef88e253a941e4698604973736dcfe242fdb5198f0f7bc473cdcc", size = 9755463, upload-time = "2026-05-11T18:52:53.386Z" },
2210
+ { url = "https://files.pythonhosted.org/packages/86/bd/fda8f9705b1b09c6ebe14bfc0fa0e4ec8584d54ea673628f157ff55131af/pandas-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:557409bc4178e70ee8d9ddb494798e51ebf6ea59330f6be22c51bab2a7db6c49", size = 9066158, upload-time = "2026-05-11T18:52:56.038Z" },
2211
+ { url = "https://files.pythonhosted.org/packages/c5/90/62d8302883c44308c477e222c3daf7c813a34c8e96985882fbd53d964352/pandas-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:67b3b64c11910cfa29f4e94a14d3bff9ee693b6fc76055e7cad549cee0aec5fa", size = 10331071, upload-time = "2026-05-11T18:52:58.838Z" },
2212
+ { url = "https://files.pythonhosted.org/packages/7f/ae/6a6493c783a101f165e4356953ba3c74d6f77f0042fa7d753da9dfbb640c/pandas-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39436b377d56d2a2e52d0395bdbee171f01068e99af5250509aceeb929f765c7", size = 9875690, upload-time = "2026-05-11T18:53:01.431Z" },
2213
+ { url = "https://files.pythonhosted.org/packages/62/7c/5df8e9f56c69a2769fbe9382a5ef8f2658c007e376434e1e2cbb57ad895f/pandas-3.0.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4be06d68f9ddcfc645b87534911da79a8fbffc7573c80e0edcf42a5020624d8", size = 10381634, upload-time = "2026-05-11T18:53:04.393Z" },
2214
+ { url = "https://files.pythonhosted.org/packages/99/68/1237369725aa617bb358263d535803e3053fdbc593513ec5ed9c9896b5b6/pandas-3.0.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a4eeb6830daf35a71cc09649bd823e2b542dac246cdee9614c6e4bd65028cd6a", size = 10891243, upload-time = "2026-05-11T18:53:07.643Z" },
2215
+ { url = "https://files.pythonhosted.org/packages/25/93/77d108e8af7222b4a503ebde0e30215b1c2e4f8e53a526431890f22d5586/pandas-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1928e07221f82db493cd4af1e23c1bfca524a19a4699887975bff68f49a72bfb", size = 11388659, upload-time = "2026-05-11T18:53:10.634Z" },
2216
+ { url = "https://files.pythonhosted.org/packages/d0/bd/eff5b4399f332ac386c853f6cd2bd3fa2ca0061b9f36ecd9c4d7c4265649/pandas-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51b1fe551acb77dac643c6fda86084d8d446c10fe64b06a9cc29c4cc8540e7f2", size = 11942880, upload-time = "2026-05-11T18:53:13.536Z" },
2217
+ { url = "https://files.pythonhosted.org/packages/2c/20/559ace4200982c3887d0b86bfd0d856a2143ef8ddab63cc07934951a964c/pandas-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:a82d532a3351d435432cd913edbccaf8b8e01d4dd0e5ced5a8d2e8ecd94c7e44", size = 9757091, upload-time = "2026-05-11T18:53:16.306Z" },
2218
+ { url = "https://files.pythonhosted.org/packages/3a/66/69055a09fe200f29f922a3eeec4804611900b95f52d932ece3393c3c0c19/pandas-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:275c14e0fce14a2ec20eee474aecd305478ea3c1e6f6a9d8fe219a165542717e", size = 9057282, upload-time = "2026-05-11T18:53:18.768Z" },
2219
+ { url = "https://files.pythonhosted.org/packages/57/0e/efe801b0e6811e8e650cd21b7f2608e30f08a7067e2bf6e8752b0d56ee3c/pandas-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:46997386d528eb40376ecd6b033cf4a8a1e5282580f68f43de875b78cba2199d", size = 10767016, upload-time = "2026-05-11T18:53:21.227Z" },
2220
+ { url = "https://files.pythonhosted.org/packages/ea/dc/eb55135a1d5f0f0519f28da1f609a206d2cad1f9c35c32d51e38dd7261ae/pandas-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:261e308dfb22448384b7580cf719d2f998fe2966c92893c3e77d14008af1f066", size = 10420210, upload-time = "2026-05-11T18:53:23.982Z" },
2221
+ { url = "https://files.pythonhosted.org/packages/c6/3e/b1d5d955ce33ffecb407465a60bc32769d74fcf68224b7ae67ae11d4dea4/pandas-3.0.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd1a5d1def6a46002e964510bdc67c368aa0951df5d1d9f8365336f5a1f490cd", size = 10336126, upload-time = "2026-05-11T18:53:26.731Z" },
2222
+ { url = "https://files.pythonhosted.org/packages/f5/76/a01261711ab60a22d71b862f0de20e4c504bf80457270ad8cb42110f6abc/pandas-3.0.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d72828c20c6d6e83e1e22a6a3b47b326b71664112fa9705dcbccfd7a39b62085", size = 10728051, upload-time = "2026-05-11T18:53:29.125Z" },
2223
+ { url = "https://files.pythonhosted.org/packages/e9/21/ea191195e587b18cf682e97f433f81b2d0fbe341380e80a3e0d6e4403c8e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d26cbe1fcfc12e8fd900e2454163e466b2d3af84f7c75481df7683ffc073d870", size = 11350796, upload-time = "2026-05-11T18:53:32.056Z" },
2224
+ { url = "https://files.pythonhosted.org/packages/64/69/f0eaaf54939f0e8c6768fd06be9af2cef9b36048b96dfb9e1b2c685a807e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e91cec1879ada0624fc3dc9953c5cbd60208e59c0db28f540c5d6d47502422f", size = 11799741, upload-time = "2026-05-11T18:53:34.985Z" },
2225
+ { url = "https://files.pythonhosted.org/packages/45/a4/865e0e510cae5fc2194de4db28be638952de942571ba9125934fd9c01d47/pandas-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:08d789b41f87e0905880e293cedf6197ce71fe67cc081358b1e148a491b9bd13", size = 10499958, upload-time = "2026-05-11T18:53:37.857Z" },
2226
+ { url = "https://files.pythonhosted.org/packages/86/54/effdcc3c0ff7a08037889200e148ebe94c16c4f653be078c7b3675955df1/pandas-3.0.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3650109c0f22879df8bd6179ab9ee3d7f1d1d4e7e0094a3f0032d9f51e2e64ac", size = 10336065, upload-time = "2026-05-11T18:53:41.099Z" },
2227
+ { url = "https://files.pythonhosted.org/packages/68/10/bf2d6738d72748b961a3751ab89522d58c54efc36a8e1a12161216cd45cf/pandas-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bab900348131a7db1f69a7309ef141fd5680f1487094193bcbbb61791573bf8f", size = 9926101, upload-time = "2026-05-11T18:53:43.515Z" },
2228
+ { url = "https://files.pythonhosted.org/packages/ae/e9/e35cf11c8a136e757b956f5f0efdcaa50aecde85ea055f1898dfc68262f3/pandas-3.0.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba7e08b9ac1d54569cd1e256e3668975ed624d6826f7b68df0342b012007bddb", size = 10457553, upload-time = "2026-05-11T18:53:46.394Z" },
2229
+ { url = "https://files.pythonhosted.org/packages/58/3b/1cdec6772bdbaf7b25dab360c59f03cadf05492dd724c6540af905389b07/pandas-3.0.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d71c63ae4ebdbf70209742096f1fc46a83a0613c99d4b23766cced9ff8cd62a", size = 10914065, upload-time = "2026-05-11T18:53:49.134Z" },
2230
+ { url = "https://files.pythonhosted.org/packages/c4/c2/1ef644445fcd72e3627bceec77e3560636f87ddce4ed841afe76b83b5bf9/pandas-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e3a2ec42c98ffa2565a67e08e218d06d72576d758d90facb7c00805194d8f360", size = 11459188, upload-time = "2026-05-11T18:53:52.527Z" },
2231
+ { url = "https://files.pythonhosted.org/packages/7e/49/4d8d4f42cbc9c4adc7a1870f269c02cbd6cd40d059622c06fb298addcbad/pandas-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:335f62418ed562cfc3c49e9e196375c28b729dcef8543abf4f9438e381bf3c76", size = 11982966, upload-time = "2026-05-11T18:53:55.043Z" },
2232
+ { url = "https://files.pythonhosted.org/packages/38/55/792619469bab9882d8bbd5865d45a72f6478762d04a9af4bf0d08c503e95/pandas-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:3c20a521bbb85902f79f7270c80a59e1b5452d96d170c034f207181870f97ac5", size = 9876755, upload-time = "2026-05-11T18:53:58.067Z" },
2233
+ { url = "https://files.pythonhosted.org/packages/2a/af/33c469653b0ba03b50c3a98192d4c07f0c75c66b263ceb097fce0ee97d31/pandas-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:a2d2dff8a04f3917b55ab3910c32990f8ddf7eceba114947838cefa976a68977", size = 9198658, upload-time = "2026-05-11T18:54:00.733Z" },
2234
+ { url = "https://files.pythonhosted.org/packages/a2/fa/b8c257bd76b8bd060c3a9151c1fca05e9b9c5e3af5d0f549c0356f6d143d/pandas-3.0.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:0d589105b3c14645af1738ff279b2995102d8f7a03b0a66dc8d95550eb513e04", size = 10787242, upload-time = "2026-05-11T18:54:03.564Z" },
2235
+ { url = "https://files.pythonhosted.org/packages/54/eb/f19206ffb0bf1919002969aa448b4702c6594845156a6f8050674855aac3/pandas-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:13fc1e853d9e04743d11ba75a985ccbc2a317fe07d8af61e445a6fd24dacd6a6", size = 10436369, upload-time = "2026-05-11T18:54:06.311Z" },
2236
+ { url = "https://files.pythonhosted.org/packages/fd/24/c7c39fb4fe22b71a0c2d78bf0c585c600092d85f94f086d2b3b2f6ca27e2/pandas-3.0.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:819959dab7bbd0049c15623fbac4e29a191b9528160a61fb1032242d8ced2d9c", size = 10358306, upload-time = "2026-05-11T18:54:09.085Z" },
2237
+ { url = "https://files.pythonhosted.org/packages/16/ec/dd2a9eb7fa1204df88c0864164e35b228ac581062ac612ba0a67fd812e4c/pandas-3.0.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:60ae316d3fd75d1858d450d0db0103ea2be3e7d4a95ec2f064f7e2ae63f7b028", size = 10758394, upload-time = "2026-05-11T18:54:11.956Z" },
2238
+ { url = "https://files.pythonhosted.org/packages/95/6e/00c61ea8e85b4f6d8d35e11852a1a4998fc7fafc91c6a602d1cc9c972d64/pandas-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd3a518890b400d32f9023722dc9a9a5c969f00b415419a3c06c043f09bb5d7d", size = 11375717, upload-time = "2026-05-11T18:54:14.539Z" },
2239
+ { url = "https://files.pythonhosted.org/packages/31/89/8fc1c268969fac43688d65fd92e67df24bd128d53cb4d2eee534cd307399/pandas-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c39be2d709d01fa972a0cabc522389fceca4f3969332ba25a7d6c5802cf976a", size = 11828897, upload-time = "2026-05-11T18:54:17.146Z" },
2240
+ { url = "https://files.pythonhosted.org/packages/56/3b/e7d20dea247a3e6dc0bd8a6953854afbedc03951def4e7371e05e7263e25/pandas-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4db8c527972a821cf5286b40ccc57642a39bc62e62022b42f99f8a67fca8c3a1", size = 10900855, upload-time = "2026-05-11T18:54:19.72Z" },
2241
+ { url = "https://files.pythonhosted.org/packages/0f/54/68a0978d1ef8502b8492099beaa6e7a0c1b32e3b5d4f677f5810cb08711c/pandas-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b2c95f8bfc1ee412bf482605d7bfd30c12d1d26bd59fdd91efeef1d4718decb1", size = 9466464, upload-time = "2026-05-11T18:54:22.754Z" },
2242
+ ]
2243
+
2244
+ [[package]]
2245
+ name = "peewee"
2246
+ version = "4.0.6"
2247
+ source = { registry = "https://pypi.org/simple" }
2248
+ sdist = { url = "https://files.pythonhosted.org/packages/9f/09/a3b2a32ce498f405dce4320267e99b1b076c1ea39ad01151a353bc7f81d7/peewee-4.0.6.tar.gz", hash = "sha256:ea2f78f24ff9e3660281dc5b0be8bc00d9a9514bdc40c98e416fcd042b66ac6a", size = 724591, upload-time = "2026-05-20T13:18:17.26Z" }
2249
+ wheels = [
2250
+ { url = "https://files.pythonhosted.org/packages/69/6a/e1455b94ee48f5666f2e7831b6247098794bfe9747da457111be4d0bea10/peewee-4.0.6-py3-none-any.whl", hash = "sha256:5fa665913c410f0b5faef1469ed0aa9eceb9fef262665ebbb6f29408f826eeeb", size = 146222, upload-time = "2026-05-20T13:18:15.694Z" },
2251
+ ]
2252
+
2253
  [[package]]
2254
  name = "pillow"
2255
  version = "12.2.0"
 
2762
  { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
2763
  ]
2764
 
2765
+ [[package]]
2766
+ name = "pytz"
2767
+ version = "2026.2"
2768
+ source = { registry = "https://pypi.org/simple" }
2769
+ sdist = { url = "https://files.pythonhosted.org/packages/ff/46/dd499ec9038423421951e4fad73051febaa13d2df82b4064f87af8b8c0c3/pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a", size = 320861, upload-time = "2026-05-04T01:35:29.667Z" }
2770
+ wheels = [
2771
+ { url = "https://files.pythonhosted.org/packages/ec/dd/96da98f892250475bdf2328112d7468abdd4acc7b902b6af23f4ed958ea0/pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126", size = 510141, upload-time = "2026-05-04T01:35:27.408Z" },
2772
+ ]
2773
+
2774
  [[package]]
2775
  name = "pyyaml"
2776
  version = "6.0.3"
 
3223
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
3224
  ]
3225
 
3226
+ [[package]]
3227
+ name = "soupsieve"
3228
+ version = "2.8.4"
3229
+ source = { registry = "https://pypi.org/simple" }
3230
+ sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" }
3231
+ wheels = [
3232
+ { url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" },
3233
+ ]
3234
+
3235
  [[package]]
3236
  name = "sqlalchemy"
3237
  version = "2.0.49"
 
3549
  { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
3550
  ]
3551
 
3552
+ [[package]]
3553
+ name = "tzdata"
3554
+ version = "2026.2"
3555
+ source = { registry = "https://pypi.org/simple" }
3556
+ sdist = { url = "https://files.pythonhosted.org/packages/ba/19/1b9b0e29f30c6d35cb345486df41110984ea67ae69dddbc0e8a100999493/tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10", size = 198254, upload-time = "2026-04-24T15:22:08.651Z" }
3557
+ wheels = [
3558
+ { url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321, upload-time = "2026-04-24T15:22:05.876Z" },
3559
+ ]
3560
+
3561
  [[package]]
3562
  name = "urllib3"
3563
  version = "2.7.0"
 
3931
  { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" },
3932
  ]
3933
 
3934
+ [[package]]
3935
+ name = "yfinance"
3936
+ version = "1.4.0"
3937
+ source = { registry = "https://pypi.org/simple" }
3938
+ dependencies = [
3939
+ { name = "beautifulsoup4" },
3940
+ { name = "curl-cffi" },
3941
+ { name = "multitasking" },
3942
+ { name = "numpy" },
3943
+ { name = "pandas" },
3944
+ { name = "peewee" },
3945
+ { name = "platformdirs" },
3946
+ { name = "protobuf" },
3947
+ { name = "pytz" },
3948
+ { name = "requests" },
3949
+ { name = "websockets" },
3950
+ ]
3951
+ sdist = { url = "https://files.pythonhosted.org/packages/21/e2/b81f9cac78f1c23e444164f2135e19f849a66774474f8b156fc3702280c3/yfinance-1.4.0.tar.gz", hash = "sha256:6b049c3f28b0d66be54c32d84838ffd60c429277ba378afb0202c4792013c911", size = 153715, upload-time = "2026-05-23T16:28:08.961Z" }
3952
+ wheels = [
3953
+ { url = "https://files.pythonhosted.org/packages/95/58/31561402a60d317f9c36288223be99eabedc25b61f18d0b69f0889726545/yfinance-1.4.0-py2.py3-none-any.whl", hash = "sha256:6513654be21bd80a4e9e4e24193255fb4b1921618443113826494bf6efcedcb0", size = 137749, upload-time = "2026-05-23T16:28:07.656Z" },
3954
+ ]
3955
+
3956
  [[package]]
3957
  name = "zipp"
3958
  version = "3.23.1"