Spaces:
Runtime error
Runtime error
build option trading agent modules
Browse files- .gitignore +44 -2
- .python-version +1 -0
- app.py +38 -81
- backtest/__init__.py +8 -0
- backtest/option_backtest.py +233 -0
- backtest/tools.py +146 -0
- backtest/vol_backtest.py +83 -0
- eval/README.md +48 -0
- eval/generate_local_options_eval.py +172 -0
- eval/rag_eval.py +113 -8
- eval/run_eval_suite.py +22 -2
- market_data/__init__.py +30 -0
- market_data/analytics.py +206 -0
- market_data/providers.py +144 -0
- market_data/schemas.py +73 -0
- market_data/tools.py +214 -0
- optimizer/__init__.py +3 -0
- optimizer/tools.py +30 -0
- optimizer/vol_optimizer.py +76 -0
- prompts.yaml +12 -0
- pyproject.toml +2 -0
- quantconnect/README.md +28 -0
- quantconnect/VolatilityStraddleAlgorithm.py +91 -0
- rag_eval_interview_notes.md +544 -0
- requirements.txt +1 -0
- strategy/__init__.py +11 -0
- strategy/builder.py +177 -0
- strategy/payoff.py +51 -0
- strategy/schemas.py +46 -0
- strategy/tools.py +72 -0
- tools/query_knowledge.py +494 -30
- tools/todo.md +437 -5
- tools/visit_webpage.py +10 -8
- tools/web_search.py +5 -3
- uv.lock +185 -10
.gitignore
CHANGED
|
@@ -1,2 +1,44 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Secrets and local environment
|
| 2 |
+
.env
|
| 3 |
+
.env.*
|
| 4 |
+
!.env.example
|
| 5 |
+
.venv/
|
| 6 |
+
.uv-cache/
|
| 7 |
+
.python_history
|
| 8 |
+
|
| 9 |
+
# Python generated files
|
| 10 |
+
__pycache__/
|
| 11 |
+
*.py[cod]
|
| 12 |
+
*$py.class
|
| 13 |
+
.pytest_cache/
|
| 14 |
+
.ruff_cache/
|
| 15 |
+
.mypy_cache/
|
| 16 |
+
.pyright/
|
| 17 |
+
|
| 18 |
+
# App/runtime artifacts
|
| 19 |
+
.gradio/
|
| 20 |
+
*.log
|
| 21 |
+
.DS_Store
|
| 22 |
+
|
| 23 |
+
# Local vector databases and RAG inputs
|
| 24 |
+
alfred_chroma_db/
|
| 25 |
+
knowledge_base/
|
| 26 |
+
tools/knowledge_base/
|
| 27 |
+
*.sqlite3
|
| 28 |
+
*.sqlite
|
| 29 |
+
|
| 30 |
+
# Local model caches
|
| 31 |
+
hf_cache/
|
| 32 |
+
tools/hf_cache/
|
| 33 |
+
|
| 34 |
+
# Evaluation datasets, indexes, and generated reports
|
| 35 |
+
eval/data/
|
| 36 |
+
eval/indexes/
|
| 37 |
+
eval/reports/
|
| 38 |
+
eval/local_options_eval.jsonl
|
| 39 |
+
|
| 40 |
+
# Local market/backtest data exports
|
| 41 |
+
data/
|
| 42 |
+
backtest/data/
|
| 43 |
+
*.parquet
|
| 44 |
+
*.feather
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
app.py
CHANGED
|
@@ -1,91 +1,27 @@
|
|
| 1 |
-
from smolagents import CodeAgent,
|
| 2 |
import os
|
| 3 |
import datetime
|
| 4 |
-
import requests
|
| 5 |
import pytz
|
| 6 |
import yaml
|
| 7 |
-
import json
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
from tools.final_answer import FinalAnswerTool
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from Gradio_UI import GradioUI
|
| 12 |
|
| 13 |
-
@tool
|
| 14 |
-
def query_market_asset(symbol: str) -> str:
|
| 15 |
-
"""A universal market data tool to query the current price or level of ANY asset.
|
| 16 |
-
|
| 17 |
-
Supported asset classes include major indices, stocks, ETFs, crypto, and forex.
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
symbol: The specific ticker symbol used by Yahoo Finance. Examples:
|
| 21 |
-
- Indices (requires '^'): '^GSPC' (S&P 500), '^VIX' (Volatility Index), '^DJI' (Dow Jones), '^IXIC' (Nasdaq)
|
| 22 |
-
- Equities / ETFs: 'AAPL' (Apple), 'SPY' (SPDR S&P 500 ETF), 'TLT' (20+ Yr Treasury Bond)
|
| 23 |
-
- Crypto: 'BTC-USD' (Bitcoin), 'ETH-USD' (Ethereum)
|
| 24 |
-
- Forex: 'EURUSD=X' (EUR/USD rate), 'USDCNH=X' (USD/Offshore RMB)
|
| 25 |
-
|
| 26 |
-
Returns:
|
| 27 |
-
A JSON-formatted string containing the current price, high/low, timestamp, and asset info.
|
| 28 |
-
"""
|
| 29 |
-
symbol = symbol.strip().upper()
|
| 30 |
-
|
| 31 |
-
try:
|
| 32 |
-
ticker = yf.Ticker(symbol)
|
| 33 |
-
|
| 34 |
-
data = ticker.history(period="1d", interval="1m")
|
| 35 |
-
|
| 36 |
-
if not data.empty:
|
| 37 |
-
latest_row = data.iloc[-1]
|
| 38 |
-
current_price = float(latest_row['Close'])
|
| 39 |
-
open_price = float(latest_row['Open'])
|
| 40 |
-
high_price = float(latest_row['High'])
|
| 41 |
-
low_price = float(latest_row['Low'])
|
| 42 |
-
volume = int(latest_row['Volume'])
|
| 43 |
-
timestamp = str(data.index[-1])
|
| 44 |
-
|
| 45 |
-
result = {
|
| 46 |
-
"status": "success",
|
| 47 |
-
"symbol": symbol,
|
| 48 |
-
"current_price": round(current_price, 4),
|
| 49 |
-
"open": round(open_price, 4),
|
| 50 |
-
"high": round(high_price, 4),
|
| 51 |
-
"low": round(low_price, 4),
|
| 52 |
-
"volume": volume,
|
| 53 |
-
"timestamp": timestamp,
|
| 54 |
-
"data_type": "intraday_1m"
|
| 55 |
-
}
|
| 56 |
-
else:
|
| 57 |
-
info = ticker.info
|
| 58 |
-
current_price = info.get("regularMarketPrice") or info.get("previousClose") or info.get("ask") or info.get("bid")
|
| 59 |
-
|
| 60 |
-
if current_price:
|
| 61 |
-
result = {
|
| 62 |
-
"status": "success",
|
| 63 |
-
"symbol": symbol,
|
| 64 |
-
"current_price": round(float(current_price), 4),
|
| 65 |
-
"open": info.get("regularMarketOpen") or info.get("open"),
|
| 66 |
-
"high": info.get("regularMarketDayHigh") or info.get("dayHigh"),
|
| 67 |
-
"low": info.get("regularMarketDayLow") or info.get("dayLow"),
|
| 68 |
-
"volume": info.get("regularMarketVolume") or info.get("volume", 0),
|
| 69 |
-
"short_name": info.get("shortName", ""),
|
| 70 |
-
"data_type": "cached_info"
|
| 71 |
-
}
|
| 72 |
-
else:
|
| 73 |
-
result = {
|
| 74 |
-
"status": "error",
|
| 75 |
-
"symbol": symbol,
|
| 76 |
-
"message": "No price data could be resolved for this asset."
|
| 77 |
-
}
|
| 78 |
-
|
| 79 |
-
except Exception as e:
|
| 80 |
-
result = {
|
| 81 |
-
"status": "error",
|
| 82 |
-
"symbol": symbol,
|
| 83 |
-
"message": f"Exception occurred while querying: {str(e)}"
|
| 84 |
-
}
|
| 85 |
-
|
| 86 |
-
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 87 |
-
|
| 88 |
-
|
| 89 |
@tool
|
| 90 |
def get_current_time_in_timezone(timezone: str) -> str:
|
| 91 |
"""A tool that fetches the current local time in a specified timezone.
|
|
@@ -104,6 +40,9 @@ def get_current_time_in_timezone(timezone: str) -> str:
|
|
| 104 |
|
| 105 |
if __name__ == "__main__":
|
| 106 |
final_answer = FinalAnswerTool()
|
|
|
|
|
|
|
|
|
|
| 107 |
load_dotenv()
|
| 108 |
hf_token = os.getenv("HF_TOKEN")
|
| 109 |
gemini_api_key = os.getenv("GEMINI_API_KEY");
|
|
@@ -118,7 +57,25 @@ if __name__ == "__main__":
|
|
| 118 |
|
| 119 |
agent = CodeAgent(
|
| 120 |
model=model,
|
| 121 |
-
tools=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
max_steps=6,
|
| 123 |
verbosity_level=1,
|
| 124 |
grammar=None,
|
|
|
|
| 1 |
+
from smolagents import CodeAgent, HfApiModel, load_tool, tool, LiteLLMModel
|
| 2 |
import os
|
| 3 |
import datetime
|
|
|
|
| 4 |
import pytz
|
| 5 |
import yaml
|
|
|
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
from tools.final_answer import FinalAnswerTool
|
| 8 |
+
from tools.query_knowledge import QueryKnowledgeTool
|
| 9 |
+
from tools.web_search import DuckDuckGoSearchTool
|
| 10 |
+
from tools.visit_webpage import VisitWebpageTool
|
| 11 |
+
from market_data.tools import (
|
| 12 |
+
calculate_option_greeks,
|
| 13 |
+
query_market_asset,
|
| 14 |
+
query_option_chain,
|
| 15 |
+
query_option_expirations,
|
| 16 |
+
query_price_history,
|
| 17 |
+
query_realized_volatility,
|
| 18 |
+
query_volatility_snapshot,
|
| 19 |
+
)
|
| 20 |
+
from strategy.tools import build_volatility_strategy
|
| 21 |
+
from backtest.tools import analyze_strategy_payoff, backtest_long_straddle_csv, backtest_volatility_signal
|
| 22 |
+
from optimizer.tools import optimize_volatility_signal_parameters
|
| 23 |
from Gradio_UI import GradioUI
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
@tool
|
| 26 |
def get_current_time_in_timezone(timezone: str) -> str:
|
| 27 |
"""A tool that fetches the current local time in a specified timezone.
|
|
|
|
| 40 |
|
| 41 |
if __name__ == "__main__":
|
| 42 |
final_answer = FinalAnswerTool()
|
| 43 |
+
query_knowledge = QueryKnowledgeTool()
|
| 44 |
+
web_search = DuckDuckGoSearchTool(max_results=6)
|
| 45 |
+
visit_webpage = VisitWebpageTool()
|
| 46 |
load_dotenv()
|
| 47 |
hf_token = os.getenv("HF_TOKEN")
|
| 48 |
gemini_api_key = os.getenv("GEMINI_API_KEY");
|
|
|
|
| 57 |
|
| 58 |
agent = CodeAgent(
|
| 59 |
model=model,
|
| 60 |
+
tools=[
|
| 61 |
+
query_market_asset,
|
| 62 |
+
query_price_history,
|
| 63 |
+
query_realized_volatility,
|
| 64 |
+
query_option_expirations,
|
| 65 |
+
query_option_chain,
|
| 66 |
+
query_volatility_snapshot,
|
| 67 |
+
calculate_option_greeks,
|
| 68 |
+
build_volatility_strategy,
|
| 69 |
+
analyze_strategy_payoff,
|
| 70 |
+
backtest_long_straddle_csv,
|
| 71 |
+
backtest_volatility_signal,
|
| 72 |
+
optimize_volatility_signal_parameters,
|
| 73 |
+
get_current_time_in_timezone,
|
| 74 |
+
query_knowledge,
|
| 75 |
+
web_search,
|
| 76 |
+
visit_webpage,
|
| 77 |
+
final_answer,
|
| 78 |
+
],
|
| 79 |
max_steps=6,
|
| 80 |
verbosity_level=1,
|
| 81 |
grammar=None,
|
backtest/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .vol_backtest import backtest_realized_vol_signal
|
| 2 |
+
from .option_backtest import backtest_long_straddle_from_quotes, load_option_quotes_csv
|
| 3 |
+
|
| 4 |
+
__all__ = [
|
| 5 |
+
"backtest_long_straddle_from_quotes",
|
| 6 |
+
"backtest_realized_vol_signal",
|
| 7 |
+
"load_option_quotes_csv",
|
| 8 |
+
]
|
backtest/option_backtest.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import asdict, dataclass
|
| 4 |
+
from datetime import timedelta
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from .vol_backtest import max_drawdown
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Columns that every historical option-quote frame must contain.
# validate_quote_frame raises ValueError when any of them is absent.
REQUIRED_QUOTE_COLUMNS = {
    "date",
    "underlying_symbol",
    "underlying_price",
    "contract_symbol",
    "option_type",
    "expiration",
    "strike",
    "bid",
    "ask",
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class OptionBacktestTrade:
    """One option leg opened and closed by the quote-based backtest."""

    # Entry/exit dates and expiration are stored as ISO date strings
    # (the backtest passes str(timestamp.date())).
    entry_date: str
    exit_date: str
    contract_symbol: str
    option_type: str
    strike: float
    expiration: str
    quantity: int
    # Per-share option prices at entry and exit.
    entry_price: float
    exit_price: float
    # Total fees charged for this leg.
    fees: float
    # Leg profit and loss after fees.
    pnl: float

    def to_dict(self) -> dict:
        """Return the trade as a plain dict keyed by field name."""
        return asdict(self)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def validate_quote_frame(quotes: pd.DataFrame) -> None:
    """Ensure *quotes* carries every column listed in REQUIRED_QUOTE_COLUMNS.

    Args:
        quotes: Historical option quotes to check.

    Raises:
        ValueError: If one or more required columns are absent. The message
            lists the missing column names in sorted order.
    """
    absent = sorted(REQUIRED_QUOTE_COLUMNS.difference(quotes.columns))
    if not absent:
        return
    raise ValueError(f"Historical option quotes missing required columns: {absent}")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def prepare_quotes(quotes: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, sorted copy of *quotes* ready for backtesting.

    The input frame is not modified. The copy has normalized ``date`` and
    ``expiration`` timestamps, a lower-cased ``option_type``, a ``mid`` column
    (missing values filled with the bid/ask midpoint), and a ``dte`` column
    holding whole days from quote date to expiration. Rows with a negative
    bid, a non-positive ask, or a negative ``dte`` are dropped.

    Args:
        quotes: Raw historical option quotes.

    Returns:
        The prepared frame sorted by date, expiration, strike and option type,
        with a fresh integer index.

    Raises:
        ValueError: If a required column is missing.
    """
    validate_quote_frame(quotes)
    prepared = quotes.copy()

    # Drop any intraday time component so quote dates and expirations compare by day.
    for column in ("date", "expiration"):
        prepared[column] = pd.to_datetime(prepared[column]).dt.normalize()
    prepared["option_type"] = prepared["option_type"].str.lower()

    midpoint = (prepared["bid"] + prepared["ask"]) / 2
    if "mid" in prepared.columns:
        # Keep supplied mids; only fill the gaps with the computed midpoint.
        prepared["mid"] = prepared["mid"].where(prepared["mid"].notna(), midpoint)
    else:
        prepared["mid"] = midpoint

    prepared["dte"] = (prepared["expiration"] - prepared["date"]).dt.days

    usable = (prepared["bid"] >= 0) & (prepared["ask"] > 0) & (prepared["dte"] >= 0)
    ordered = prepared[usable].sort_values(["date", "expiration", "strike", "option_type"])
    return ordered.reset_index(drop=True)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def load_option_quotes_csv(path: str | Path) -> pd.DataFrame:
    """Read historical option quotes from a CSV file and prepare them.

    Args:
        path: Location of the CSV file.

    Returns:
        The frame produced by ``prepare_quotes`` for the file's contents.
    """
    raw_quotes = pd.read_csv(path)
    return prepare_quotes(raw_quotes)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def available_exit_date(
    quotes: pd.DataFrame,
    entry_date: pd.Timestamp,
    target_exit_date: pd.Timestamp,
    contract_symbol: str,
) -> pd.Timestamp | None:
    """Pick the quote date on which a contract can be closed.

    Preference order:
      1. The earliest quote on or after ``target_exit_date``.
      2. Otherwise the latest quote strictly between ``entry_date`` and
         ``target_exit_date``.

    Args:
        quotes: Quote frame with ``contract_symbol`` and ``date`` columns.
        entry_date: Date the position was opened.
        target_exit_date: Desired exit date.
        contract_symbol: Contract whose quotes are searched.

    Returns:
        The chosen exit date, or ``None`` when the contract has no usable
        quote after entry.
    """
    # Filter to this contract once; both lookups below only need its dates.
    contract_dates = quotes.loc[quotes["contract_symbol"] == contract_symbol, "date"]

    on_or_after_target = contract_dates[contract_dates >= target_exit_date]
    if not on_or_after_target.empty:
        return on_or_after_target.min()

    # No quote at or beyond the target: fall back to the last quote seen
    # after entry but before the target date.
    before_target = contract_dates[
        (contract_dates > entry_date) & (contract_dates < target_exit_date)
    ]
    if before_target.empty:
        return None
    return before_target.max()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def quote_price(row: pd.Series, side: str, price_field: str) -> float:
    """Return the fill price for one quote row.

    Args:
        row: Quote row providing ``bid``, ``ask`` and ``mid``.
        side: ``"buy"`` (filled at the ask) or ``"sell"`` (filled at the bid).
            Only consulted when ``price_field`` is ``"trade"``.
        price_field: ``"mid"`` to mark at the mid price regardless of side, or
            ``"trade"`` to cross the spread.

    Returns:
        The selected price as a float.

    Raises:
        ValueError: If ``price_field`` is not ``"trade"`` or ``"mid"``, or if
            ``side`` is not ``"buy"`` or ``"sell"`` when crossing the spread.
    """
    if price_field == "mid":
        return float(row["mid"])
    if price_field != "trade":
        raise ValueError("price_field must be 'trade' or 'mid'.")
    if side == "buy":
        return float(row["ask"])
    if side == "sell":
        return float(row["bid"])
    # Reject unknown sides instead of silently pricing them as a sale.
    raise ValueError("side must be 'buy' or 'sell'.")
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def select_expiration_slice(day_quotes: pd.DataFrame, target_dte: int) -> pd.DataFrame:
    """Return one day's quotes for the expiration closest to ``target_dte``.

    Only contracts with at least one day left (``dte > 0``) are considered.
    When two expirations are equally far from the target, the earlier one is
    chosen, so the result does not depend on row order.

    Args:
        day_quotes: Quotes for a single date, with ``dte`` and ``expiration``
            columns.
        target_dte: Desired days to expiration.

    Returns:
        The rows of the chosen expiration, or an empty frame when no contract
        has ``dte > 0``.
    """
    candidates = day_quotes[day_quotes["dte"] > 0]
    if candidates.empty:
        return candidates

    dte_error = (candidates["dte"] - target_dte).abs()
    closest = candidates[dte_error == dte_error.min()]
    # Deterministic tie-break: the nearest-dated expiration wins.
    expiration = closest["expiration"].min()
    return candidates[candidates["expiration"] == expiration]
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def select_atm_contract(expiration_slice: pd.DataFrame, option_type: str) -> pd.Series | None:
    """Return the contract of ``option_type`` whose strike is nearest to spot.

    Spot is read from the first ``underlying_price`` value in the slice. When
    two strikes are equally far from spot, the lower strike is chosen. This
    keeps the call and put selections aligned on the same strike when both
    are quoted there.

    Args:
        expiration_slice: Quotes for a single date and expiration.
        option_type: Option type to select, matched against the
            ``option_type`` column (e.g. ``"call"`` or ``"put"``).

    Returns:
        The selected quote row (with an added ``strike_error`` entry), or
        ``None`` when the slice has no contract of that type.
    """
    contracts = expiration_slice[expiration_slice["option_type"] == option_type]
    if contracts.empty:
        return None
    spot = float(expiration_slice["underlying_price"].iloc[0])
    ranked = contracts.assign(strike_error=(contracts["strike"] - spot).abs())
    # Secondary sort key makes equidistant strikes resolve deterministically.
    return ranked.sort_values(["strike_error", "strike"]).iloc[0]
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def backtest_long_straddle_from_quotes(
    quotes: pd.DataFrame,
    symbol: str,
    target_dte: int = 30,
    holding_days: int = 5,
    entry_every_days: int = 5,
    contract_multiplier: int = 100,
    fee_per_contract: float = 0.65,
    price_field: str = "trade",
) -> dict:
    """Backtest repeated one-lot ATM long straddles on historical option quotes.

    On each eligible quote date the expiration nearest ``target_dte`` is
    chosen, the nearest-to-spot call and put are bought, and each leg is
    closed on the quote date returned by ``available_exit_date``. A straddle
    is recorded only when both legs can be closed.

    Args:
        quotes: Historical option quotes (see ``REQUIRED_QUOTE_COLUMNS``).
        symbol: Underlying ticker, matched case-insensitively.
        target_dte: Desired days to expiration at entry.
        holding_days: Calendar days to hold each straddle; must be >= 0.
        entry_every_days: Minimum calendar days between recorded entries.
        contract_multiplier: Shares per contract used to scale price changes.
        fee_per_contract: Fee charged per contract per side.
        price_field: ``"trade"`` to buy at the ask and sell at the bid, or
            ``"mid"`` to use mid prices.

    Returns:
        A dict with the run parameters, summary statistics (trade counts,
        total PnL, max drawdown, win rate, average win/loss), up to 200 trade
        groups, and static ``data_requirements`` / ``limitations`` notes.

    Raises:
        ValueError: If ``price_field`` or ``holding_days`` is invalid, a
            required column is missing, or no quotes exist for ``symbol``.
    """
    # Validate up front so bad arguments fail even when no trade is priced.
    if price_field not in ("trade", "mid"):
        raise ValueError("price_field must be 'trade' or 'mid'.")
    if holding_days < 0:
        # A negative holding period would pick exit quotes dated before entry.
        raise ValueError("holding_days must be non-negative.")

    frame = prepare_quotes(quotes)
    frame = frame[frame["underlying_symbol"].str.upper() == symbol.upper()]
    if frame.empty:
        raise ValueError(f"No historical option quotes found for {symbol}.")

    trades: list[OptionBacktestTrade] = []
    trade_groups = []
    equity = [0.0]  # cumulative PnL, seeded at zero
    dates = sorted(frame["date"].unique())
    next_entry_date = dates[0]

    for entry_date in dates:
        entry_date = pd.Timestamp(entry_date)
        if entry_date < next_entry_date:
            continue

        day_quotes = frame[frame["date"] == entry_date]
        expiration_slice = select_expiration_slice(day_quotes, target_dte)
        if expiration_slice.empty:
            continue

        call = select_atm_contract(expiration_slice, "call")
        put = select_atm_contract(expiration_slice, "put")
        if call is None or put is None:
            continue

        target_exit_date = entry_date + timedelta(days=holding_days)
        pending_group_trades = []
        group_pnl = 0.0
        for leg in [call, put]:
            exit_date = available_exit_date(frame, entry_date, target_exit_date, str(leg["contract_symbol"]))
            if exit_date is None:
                # The straddle needs both legs; stop once one cannot be closed.
                break
            exit_quote = frame[
                (frame["date"] == exit_date)
                & (frame["contract_symbol"] == leg["contract_symbol"])
            ].iloc[0]

            entry_price = quote_price(leg, "buy", price_field)
            exit_price = quote_price(exit_quote, "sell", price_field)
            fees = fee_per_contract * 2  # one contract, charged at entry and exit
            pnl = (exit_price - entry_price) * contract_multiplier - fees
            trade = OptionBacktestTrade(
                entry_date=str(entry_date.date()),
                exit_date=str(pd.Timestamp(exit_date).date()),
                contract_symbol=str(leg["contract_symbol"]),
                option_type=str(leg["option_type"]),
                strike=float(leg["strike"]),
                expiration=str(pd.Timestamp(leg["expiration"]).date()),
                quantity=1,
                entry_price=round(entry_price, 4),
                exit_price=round(exit_price, 4),
                fees=round(fees, 2),
                pnl=round(pnl, 2),
            )
            pending_group_trades.append(trade)
            group_pnl += pnl

        if len(pending_group_trades) == 2:
            trades.extend(pending_group_trades)
            equity.append(equity[-1] + group_pnl)
            trade_groups.append(
                {
                    "entry_date": str(entry_date.date()),
                    # Legs are closed independently; the group reports the
                    # call leg's exit date (the first leg processed).
                    "exit_date": pending_group_trades[0].exit_date,
                    "strategy": "long_straddle",
                    "pnl": round(group_pnl, 2),
                    "legs": [trade.to_dict() for trade in pending_group_trades],
                }
            )
            # Only a recorded straddle pushes back the next eligible entry.
            next_entry_date = entry_date + timedelta(days=entry_every_days)

    equity_series = pd.Series(equity)
    group_pnls = [group["pnl"] for group in trade_groups]
    wins = [pnl for pnl in group_pnls if pnl > 0]
    losses = [pnl for pnl in group_pnls if pnl <= 0]

    return {
        "strategy": "long_straddle",
        "symbol": symbol.upper(),
        "target_dte": target_dte,
        "holding_days": holding_days,
        "entry_every_days": entry_every_days,
        "contract_multiplier": contract_multiplier,
        "fee_per_contract": fee_per_contract,
        "price_field": price_field,
        "trade_count": len(trade_groups),
        "leg_trade_count": len(trades),
        # equity always holds its 0.0 seed, so the last element always exists.
        "total_pnl": round(float(equity_series.iloc[-1]), 2),
        # max_drawdown is ratio-based; cumulative PnL starts at zero, so it is
        # offset by a fixed 100000 base before the ratio is taken.
        "max_drawdown": round(max_drawdown(equity_series + 100000), 6),
        "win_rate": len(wins) / len(group_pnls) if group_pnls else 0.0,
        "avg_win": round(sum(wins) / len(wins), 2) if wins else 0.0,
        "avg_loss": round(sum(losses) / len(losses), 2) if losses else 0.0,
        "trades": trade_groups[:200],
        "data_requirements": [
            "Historical option quotes with date, expiration, strike, bid, ask, and underlying_price.",
            "For production-grade backtests, include deltas, IV, volume, open interest, and corporate action adjusted symbols.",
        ],
        "limitations": [
            "No early assignment model yet.",
            "No margin model yet.",
            "No intraday fills; entry and exit use the daily quote row.",
            "Results are only as good as the historical option quote data supplied.",
        ],
    }
|
backtest/tools.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from smolagents import tool
|
| 6 |
+
|
| 7 |
+
from market_data.providers import get_price_history
|
| 8 |
+
from strategy.payoff import expiration_payoff, strategy_summary
|
| 9 |
+
from strategy.schemas import OptionLeg, OptionStrategy
|
| 10 |
+
|
| 11 |
+
from .option_backtest import backtest_long_straddle_from_quotes, load_option_quotes_csv
|
| 12 |
+
from .vol_backtest import backtest_realized_vol_signal
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def parse_legs(legs_json: str) -> list[OptionLeg]:
    """Decode a JSON description of option legs into ``OptionLeg`` objects.

    Args:
        legs_json: JSON text holding either a list of leg mappings or an
            object whose ``"legs"`` key holds that list.

    Returns:
        One ``OptionLeg`` per entry, built by passing each entry's keys as
        keyword arguments.
    """
    decoded = json.loads(legs_json)
    wrapped = isinstance(decoded, dict) and "legs" in decoded
    raw_legs = decoded["legs"] if wrapped else decoded

    parsed = []
    for raw_leg in raw_legs:
        parsed.append(OptionLeg(**raw_leg))
    return parsed
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@tool
def analyze_strategy_payoff(legs_json: str, min_price: float, max_price: float, steps: int = 25) -> str:
    """Analyze expiration payoff for an option strategy.

    Args:
        legs_json: JSON list of option legs from build_volatility_strategy.
        min_price: Minimum underlying price scenario.
        max_price: Maximum underlying price scenario.
        steps: Number of scenario steps.
    """
    # Any failure (bad JSON, unknown leg fields, payoff errors) is reported
    # as an error payload rather than raised.
    try:
        legs = parse_legs(legs_json)

        # At least one interval, so both endpoints are always evaluated.
        intervals = max(steps, 1)
        price_span = max_price - min_price
        scenario_prices = []
        for step_index in range(intervals + 1):
            scenario_prices.append(min_price + price_span * step_index / intervals)

        payoff_rows = []
        for scenario_price in scenario_prices:
            payoff_rows.append(
                {
                    "underlying_price": round(scenario_price, 2),
                    "pnl": round(expiration_payoff(legs, scenario_price), 2),
                }
            )

        # Signed premium total across legs, scaled by 100 per unit of quantity.
        net_premium = sum(leg.premium * leg.signed_quantity() * 100 for leg in legs)

        # Placeholder strategy wrapper so strategy_summary can be reused.
        temp_strategy = OptionStrategy(
            name="custom_strategy",
            volatility_view="unknown",
            directional_view="unknown",
            legs=legs,
            rationale="custom payoff analysis",
            risks=[],
            max_profit=None,
            max_loss=None,
            breakevens=[],
            net_debit_or_credit=round(net_premium, 2),
            score=0.0,
        )
        response = {
            "status": "success",
            "payoff_rows": payoff_rows,
            "payoff_summary": strategy_summary(temp_strategy),
        }
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        failure = {"status": "error", "message": str(exc)}
        return json.dumps(failure, ensure_ascii=False, indent=2)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@tool
def backtest_volatility_signal(
    symbol: str,
    signal: str = "long_vol",
    period: str = "2y",
    short_window: int = 10,
    long_window: int = 30,
    holding_days: int = 5,
) -> str:
    """Backtest a simple realized-volatility expansion/compression signal on the underlying.

    Args:
        symbol: Yahoo Finance ticker.
        signal: long_vol or short_vol.
        period: Yahoo Finance history period.
        short_window: Short realized volatility lookback.
        long_window: Long realized volatility lookback.
        holding_days: Holding period after entry.
    """
    # Failures while fetching history or running the backtest are returned
    # as an error payload instead of propagating to the agent.
    try:
        daily_history = get_price_history(symbol, period=period, interval="1d")
        closes = daily_history["Close"]
        summary = backtest_realized_vol_signal(
            closes,
            signal=signal,
            short_window=short_window,
            long_window=long_window,
            holding_days=holding_days,
        )
        response = {"status": "success", "symbol": symbol.upper(), **summary}
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json.dumps(failure, ensure_ascii=False, indent=2)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@tool
def backtest_long_straddle_csv(
    csv_path: str,
    symbol: str,
    target_dte: int = 30,
    holding_days: int = 5,
    entry_every_days: int = 5,
    price_field: str = "trade",
) -> str:
    """Run a real option-quote backtest for repeated ATM long straddles.

    This is a true option PnL backtest when supplied with historical option quotes.
    Required CSV columns: date, underlying_symbol, underlying_price, contract_symbol,
    option_type, expiration, strike, bid, ask. Optional columns include mid, delta,
    gamma, theta, vega, implied_volatility, volume, open_interest.

    Args:
        csv_path: Path to historical option quotes CSV.
        symbol: Underlying ticker.
        target_dte: Target days to expiration at entry.
        holding_days: Number of calendar days to hold each straddle.
        entry_every_days: Minimum days between new entries.
        price_field: trade for buy-at-ask/sell-at-bid, or mid for mid-price marks.
    """
    # Loading or backtest failures are converted into an error payload that
    # also reminds the caller what data this tool needs.
    try:
        backtest_settings = {
            "symbol": symbol,
            "target_dte": target_dte,
            "holding_days": holding_days,
            "entry_every_days": entry_every_days,
            "price_field": price_field,
        }
        summary = backtest_long_straddle_from_quotes(
            quotes=load_option_quotes_csv(csv_path),
            **backtest_settings,
        )
        response = {"status": "success", **summary}
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        failure = {
            "status": "error",
            "symbol": symbol,
            "message": str(exc),
            "note": "A real option backtest requires historical option quote data. yfinance does not provide reliable historical option chains.",
        }
        return json.dumps(failure, ensure_ascii=False, indent=2)
|
backtest/vol_backtest.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def max_drawdown(equity: pd.Series) -> float:
    """Return the deepest peak-to-trough decline of an equity curve.

    Args:
        equity: Equity values in chronological order.

    Returns:
        The minimum of ``equity / running_peak - 1`` (zero or negative for a
        positive curve), or ``0.0`` for an empty series.
    """
    if equity.empty:
        return 0.0
    # Each point is compared with the highest value seen up to that point.
    relative_to_peak = equity.div(equity.cummax()).sub(1)
    return float(relative_to_peak.min())
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def backtest_realized_vol_signal(
    prices: pd.Series,
    short_window: int = 10,
    long_window: int = 30,
    holding_days: int = 5,
    signal: str = "long_vol",
) -> dict:
    """Backtest a realized-volatility expansion/compression signal on prices.

    Annualized rolling standard deviations of returns are computed over
    ``short_window`` and ``long_window``. With ``signal="long_vol"`` a trade
    is entered when short-window volatility exceeds long-window volatility
    and earns the sum of absolute returns over the next ``holding_days``
    returns. Any other ``signal`` value enters when volatility is not
    expanding and earns the negative of that sum.

    Args:
        prices: Price series in chronological order.
        short_window: Lookback for the short realized-volatility estimate.
        long_window: Lookback for the long realized-volatility estimate.
        holding_days: Number of returns each trade spans; must be >= 1.
        signal: ``"long_vol"`` or ``"short_vol"``.

    Returns:
        A dict with the parameters, trade count, win rate, compounded return
        proxy, max drawdown proxy, average trade PnL proxy, up to 100 trades,
        and a ``limitations`` list.

    Raises:
        ValueError: If ``holding_days`` is less than 1.
    """
    if holding_days < 1:
        # A non-positive holding period would stop the loop index from
        # advancing after a trade, so the loop would never terminate.
        raise ValueError("holding_days must be at least 1.")

    close = prices.dropna().astype(float)
    returns = close.pct_change().dropna()
    short_rv = returns.rolling(short_window).std() * math.sqrt(252)
    long_rv = returns.rolling(long_window).std() * math.sqrt(252)

    trades = []
    equity = [1.0]
    index = 0
    dates = list(returns.index)
    while index + holding_days < len(returns):
        current_date = dates[index]
        # Until both rolling windows are filled there is no signal; carry equity.
        if pd.isna(short_rv.iloc[index]) or pd.isna(long_rv.iloc[index]):
            index += 1
            equity.append(equity[-1])
            continue

        vol_expanding = short_rv.iloc[index] > long_rv.iloc[index]
        enter = vol_expanding if signal == "long_vol" else not vol_expanding
        if not enter:
            index += 1
            equity.append(equity[-1])
            continue

        # Score only returns after the signal bar, so the signal never sees them.
        period_returns = returns.iloc[index + 1:index + 1 + holding_days]
        realized_move = float(period_returns.abs().sum())
        signed_pnl = realized_move if signal == "long_vol" else -realized_move
        equity.append(equity[-1] * (1 + signed_pnl))
        trades.append(
            {
                "entry_date": str(current_date),
                "exit_date": str(dates[index + holding_days]),
                "short_rv": float(short_rv.iloc[index]),
                "long_rv": float(long_rv.iloc[index]),
                "realized_abs_move": realized_move,
                "pnl_proxy": signed_pnl,
            }
        )
        # Trades do not overlap: resume evaluating signals at the exit bar.
        index += holding_days

    equity_series = pd.Series(equity)
    wins = [trade for trade in trades if trade["pnl_proxy"] > 0]
    return {
        "signal": signal,
        "short_window": short_window,
        "long_window": long_window,
        "holding_days": holding_days,
        "trade_count": len(trades),
        "win_rate": len(wins) / len(trades) if trades else 0.0,
        # equity always contains its 1.0 seed, so the last element exists.
        "total_return_proxy": float(equity_series.iloc[-1] - 1),
        "max_drawdown_proxy": max_drawdown(equity_series),
        "avg_trade_pnl_proxy": (
            sum(trade["pnl_proxy"] for trade in trades) / len(trades)
            if trades
            else 0.0
        ),
        "trades": trades[:100],
        "limitations": [
            "This is an underlying-price realized-volatility signal backtest, not a true option PnL backtest.",
            "It does not use historical option-chain prices, bid/ask spreads, margin, assignment, or delta hedging costs.",
        ],
    }
|
eval/README.md
CHANGED
|
@@ -58,6 +58,23 @@ The suite writes per-dataset reports and one aggregate report under `eval/report
|
|
| 58 |
|
| 59 |
## Common Commands
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
Run the fastest local check while developing PDF parsing or chunking:
|
| 62 |
|
| 63 |
```bash
|
|
@@ -114,6 +131,36 @@ uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
|
| 114 |
--rebuild
|
| 115 |
```
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
Compare different chunk settings:
|
| 118 |
|
| 119 |
```bash
|
|
@@ -169,6 +216,7 @@ uv --cache-dir .uv-cache run python -m eval.rag_eval \
|
|
| 169 |
2. After changing PDF extraction, chunking, embeddings, or retrieval code, add `--rebuild`.
|
| 170 |
3. Before comparing two versions, use the same `--datasets`, `--max-queries`, `--max-corpus-docs`, `--top-k`, `--chunk-size`, and `--chunk-overlap`.
|
| 171 |
4. Use `--output-name` to save stable report names for before/after comparison.
|
|
|
|
| 172 |
|
| 173 |
## Metrics
|
| 174 |
|
|
|
|
| 58 |
|
| 59 |
## Common Commands
|
| 60 |
|
| 61 |
+
Run with the default multilingual embedding model:
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite --rebuild
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
Use a custom embedding model for experiments:
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
RAG_EMBED_MODEL=intfloat/multilingual-e5-base \
|
| 71 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 72 |
+
--datasets local-options \
|
| 73 |
+
--top-k 5 \
|
| 74 |
+
--output-name local_options_e5_base \
|
| 75 |
+
--rebuild
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
Run the fastest local check while developing PDF parsing or chunking:
|
| 79 |
|
| 80 |
```bash
|
|
|
|
| 131 |
--rebuild
|
| 132 |
```
|
| 133 |
|
| 134 |
+
Compare retrieval with and without the reranker:
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 138 |
+
--datasets local-options \
|
| 139 |
+
--top-k 5 \
|
| 140 |
+
--output-name local_options_no_reranker \
|
| 141 |
+
--rebuild
|
| 142 |
+
|
| 143 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 144 |
+
--datasets local-options \
|
| 145 |
+
--top-k 5 \
|
| 146 |
+
--use-reranker \
|
| 147 |
+
--reranker-candidates 25 \
|
| 148 |
+
--output-name local_options_with_reranker \
|
| 149 |
+
--rebuild
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
Use a custom reranker model:
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 156 |
+
--datasets beir/fiqa \
|
| 157 |
+
--use-reranker \
|
| 158 |
+
--reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2 \
|
| 159 |
+
--reranker-candidates 50 \
|
| 160 |
+
--top-k 5 \
|
| 161 |
+
--rebuild
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
Compare different chunk settings:
|
| 165 |
|
| 166 |
```bash
|
|
|
|
| 216 |
2. After changing PDF extraction, chunking, embeddings, or retrieval code, add `--rebuild`.
|
| 217 |
3. Before comparing two versions, use the same `--datasets`, `--max-queries`, `--max-corpus-docs`, `--top-k`, `--chunk-size`, and `--chunk-overlap`.
|
| 218 |
4. Use `--output-name` to save stable report names for before/after comparison.
|
| 219 |
+
5. When testing the reranker, compare the same dataset once without `--use-reranker` and once with `--use-reranker`.
|
| 220 |
|
| 221 |
## Metrics
|
| 222 |
|
eval/generate_local_options_eval.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import random
|
| 6 |
+
import re
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from tools.query_knowledge import RAW_DIR, iter_source_files, load_source_file
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Options vocabulary searched for (case-insensitively) when picking expected
# keywords. Order is significant: matches are reported in this order.
KEY_TERMS = [
    "volatility smile",
    "implied volatility",
    "local volatility",
    "stochastic volatility",
    "Black-Scholes",
    "delta",
    "gamma",
    "vega",
    "theta",
    "rho",
    "skew",
    "straddle",
    "correlation",
    "at-the-money",
    "forward",
    "risk-neutral",
]


# Directory two levels above this file, and the default JSONL output location.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
OUTPUT_PATH = PROJECT_ROOT.joinpath("eval", "local_options_eval.jsonl")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def normalize_space(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    # Splitting on whitespace discards leading/trailing runs and empty pieces,
    # so re-joining with a single space yields the normalized string.
    return " ".join(text.split())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def extract_keywords(text: str, max_keywords: int = 4) -> list[str]:
    """Pick up to *max_keywords* distinctive terms from *text*.

    Candidates are gathered in this order: entries of ``KEY_TERMS`` found in
    the text (case-insensitive), up to two equation ids such as ``(3.12a)``,
    and up to two formula left-hand sides (an identifier followed by ``=``).
    Only when none of those produce anything are plain words of five or more
    characters used instead. Duplicates, empty strings and loader marker
    tokens are dropped before truncating.
    """
    haystack = text.lower()
    found = [term for term in KEY_TERMS if term.lower() in haystack]

    found += re.findall(r"\(\d+\.\d+[a-z]?\)", text)[:2]
    formula_heads = re.findall(r"[A-Za-z𝜎𝜇𝜌𝜃𝛴][A-Za-z0-9𝜎𝜇𝜌𝜃𝛴_{}^]*\s*=", text)
    found += [head.strip() for head in formula_heads[:2]]

    if not found:
        # Fallback: generic long words, minus a few filler words.
        filler = {"there", "where", "which", "would", "could", "should", "chapter"}
        long_words = re.findall(r"[A-Za-z][A-Za-z-]{4,}", text)
        usable = [word for word in long_words if word.lower() not in filler]
        found += usable[:max_keywords]

    banned = {"id=", "FORMULA", "value ="}
    # dict.fromkeys keeps first-occurrence order while removing duplicates.
    unique = [item for item in dict.fromkeys(found) if item and item not in banned]
    return unique[:max_keywords]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def is_sane_section(section: str | None) -> bool:
    """Return True when *section* looks like a usable heading for a question.

    A heading is rejected when it is missing, shorter than 6 or longer than 90
    characters after stripping, contains two or more commas, is digit-heavy,
    mentions figure/table/print-artifact phrases, ends with "under", or has
    more than 12 whitespace-separated words.
    """
    if not section:
        return False

    title = section.strip()
    if len(title) < 6 or len(title) > 90:
        return False
    if title.count(",") >= 2:
        return False

    digits = sum(1 for char in title if char.isdigit())
    letters = sum(1 for char in title if char.isalpha())
    if digits > max(2, letters // 3):
        return False

    lowered = title.lower()
    looks_like_artifact = (
        re.search(r"\b(figure|table|printed|united states|amount unit price|call price|under)$", title, re.I)
        is not None
        or "figure" in lowered
        or "table" in lowered
        or re.search(r"\b(figure|table|printed|united states|amount unit price|call price)\b", title, re.I)
        is not None
    )
    if looks_like_artifact:
        return False

    return len(title.split()) <= 12
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def make_case(
    document: Any,
    index: int,
    *,
    min_page: int = 25,
    max_page: int = 500,
) -> dict[str, Any] | None:
    """Build one retrieval eval case from a loaded document, or return None.

    Args:
        document: Object exposing ``text`` and a ``metadata`` mapping that is
            read with ``.get`` (keys used: ``page_number``, ``section_path``,
            ``section_title``, ``content_type``, ``formula_id``).
        index: Sequence number embedded, zero-padded to three digits, in the
            case id.
        min_page: Lowest integer page number accepted (inclusive).
        max_page: Highest integer page number accepted (inclusive).

    Returns:
        A dict with ``id``, ``question``, ``expected_pages``,
        ``expected_keywords`` and ``answer_type``; or None when the document
        is too short, outside the page window, or yields no usable question.
    """
    metadata = document.metadata
    text = normalize_space(document.text)
    if len(text) < 80:
        return None

    page = metadata.get("page_number")
    # Only integer page numbers are range-checked; other values pass through.
    if isinstance(page, int) and not min_page <= page <= max_page:
        return None

    section = metadata.get("section_path") or metadata.get("section_title")
    content_type = metadata.get("content_type", "text")
    formula_id = metadata.get("formula_id")
    keywords = extract_keywords(text)
    if not keywords and not section:
        return None

    if content_type == "formula" or formula_id:
        # The formula question is phrased around the page number; without one
        # it would read "page None" and have no expected page to score against.
        if page is None:
            return None
        question = f"What formula or equation is described on page {page}?"
        answer_type = "formula"
    elif is_sane_section(section):
        question = f"What does the section {section} discuss?"
        answer_type = "section"
        # Use the last path component as an extra keyword, but never add an
        # empty string or a term that is already present.
        leaf = section.split(">")[-1].strip()
        if leaf and leaf not in keywords:
            keywords.append(leaf)
    else:
        if not keywords:
            return None
        term = keywords[0]
        if term.lower() in {"formula", "id=", "value ="}:
            return None
        question = f"Where does the options reference discuss {term}?"
        answer_type = "concept"

    expected_pages = [page] if page is not None else []
    return {
        "id": f"auto_options_{index:03d}",
        "question": question,
        "expected_pages": expected_pages,
        "expected_keywords": keywords[:5],
        "answer_type": answer_type,
    }
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def generate_cases(count: int, seed: int) -> list[dict[str, Any]]:
    """Generate *count* unique eval cases from the raw knowledge-base files.

    Documents from every source file under ``RAW_DIR`` are shuffled with a
    ``random.Random(seed)`` instance, then converted with ``make_case`` until
    *count* cases with distinct questions have been collected.

    Args:
        count: Number of cases wanted; zero or negative yields an empty list.
        seed: Seed for the shuffle.

    Returns:
        A list of exactly *count* case dicts.

    Raises:
        RuntimeError: If fewer than *count* cases could be produced.
    """
    # The size check in the loop runs only after an append, so without this
    # guard a request for zero cases would still return one.
    if count <= 0:
        return []

    documents = []
    for source_file in iter_source_files(RAW_DIR):
        documents.extend(load_source_file(source_file))

    random.Random(seed).shuffle(documents)
    cases: list[dict[str, Any]] = []
    seen_questions: set[str] = set()
    for document in documents:
        case = make_case(document, len(cases) + 1)
        if not case:
            continue
        if case["question"] in seen_questions:
            continue
        seen_questions.add(case["question"])
        cases.append(case)
        if len(cases) >= count:
            break

    if len(cases) < count:
        raise RuntimeError(f"Only generated {len(cases)} cases; requested {count}.")
    return cases
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def main() -> None:
    """Parse CLI options, generate the eval cases and write them as JSON Lines."""
    parser = argparse.ArgumentParser(description="Generate local options RAG eval cases.")
    parser.add_argument("--count", type=int, default=40)
    parser.add_argument("--seed", type=int, default=20260525)
    parser.add_argument("--output", type=Path, default=OUTPUT_PATH)
    options = parser.parse_args()

    cases = generate_cases(options.count, options.seed)

    destination = options.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    # One JSON object per line, with a trailing newline after the last record.
    serialized = [json.dumps(case, ensure_ascii=False) for case in cases]
    destination.write_text("\n".join(serialized) + "\n", encoding="utf-8")
    print(f"Wrote {len(cases)} cases to {destination}")
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
main()
|
eval/rag_eval.py
CHANGED
|
@@ -4,6 +4,7 @@ import argparse
|
|
| 4 |
import csv
|
| 5 |
import json
|
| 6 |
import math
|
|
|
|
| 7 |
import shutil
|
| 8 |
import zipfile
|
| 9 |
from dataclasses import dataclass
|
|
@@ -15,9 +16,17 @@ import requests
|
|
| 15 |
from llama_index.core import StorageContext, VectorStoreIndex
|
| 16 |
from llama_index.core.node_parser import SentenceSplitter
|
| 17 |
from llama_index.core.schema import Document
|
|
|
|
| 18 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
| 19 |
|
| 20 |
-
from tools.query_knowledge import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|
@@ -387,7 +396,9 @@ def load_local_options_eval(max_queries: int | None) -> EvalCorpus:
|
|
| 387 |
|
| 388 |
from tools.query_knowledge import load_pdf_file
|
| 389 |
|
| 390 |
-
pdf_files = sorted((PROJECT_ROOT / "
|
|
|
|
|
|
|
| 391 |
documents = []
|
| 392 |
for pdf_file in pdf_files:
|
| 393 |
for doc_index, document in enumerate(load_pdf_file(pdf_file)):
|
|
@@ -443,6 +454,11 @@ def load_eval_corpus(args: argparse.Namespace) -> EvalCorpus:
|
|
| 443 |
raise ValueError(f"Unknown dataset: {args.dataset}")
|
| 444 |
|
| 445 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild: bool) -> VectorStoreIndex:
|
| 447 |
configure_model_cache()
|
| 448 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
@@ -453,7 +469,8 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
|
|
| 453 |
index_path.mkdir(parents=True, exist_ok=True)
|
| 454 |
|
| 455 |
db = chromadb.PersistentClient(path=str(index_path))
|
| 456 |
-
|
|
|
|
| 457 |
if rebuild:
|
| 458 |
try:
|
| 459 |
db.delete_collection(collection_name)
|
|
@@ -464,7 +481,7 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
|
|
| 464 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
| 465 |
embed_model = HuggingFaceEmbedding(
|
| 466 |
model_name=resolve_embed_model_name(),
|
| 467 |
-
cache_folder=str(PROJECT_ROOT / "
|
| 468 |
)
|
| 469 |
|
| 470 |
if collection.count() == 0:
|
|
@@ -491,8 +508,70 @@ def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild
|
|
| 491 |
return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
|
| 492 |
|
| 493 |
|
| 494 |
-
def
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
cases = []
|
| 497 |
hit_counts = {1: 0, 3: 0, 5: 0, top_k: 0}
|
| 498 |
reciprocal_ranks = []
|
|
@@ -500,7 +579,17 @@ def evaluate_retrieval(corpus: EvalCorpus, index: VectorStoreIndex, top_k: int)
|
|
| 500 |
|
| 501 |
for query in corpus.queries:
|
| 502 |
relevant_doc_ids = corpus.qrels.get(query["query_id"], set())
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
retrieved = []
|
| 505 |
seen_doc_ids = set()
|
| 506 |
first_hit_rank = None
|
|
@@ -557,6 +646,8 @@ def evaluate_retrieval(corpus: EvalCorpus, index: VectorStoreIndex, top_k: int)
|
|
| 557 |
"top_k": top_k,
|
| 558 |
"mrr": sum(reciprocal_ranks) / total if total else 0.0,
|
| 559 |
"ndcg_at_k": sum(ndcg_scores) / total if total else 0.0,
|
|
|
|
|
|
|
| 560 |
}
|
| 561 |
for k, count in sorted(hit_counts.items()):
|
| 562 |
metrics[f"hit_at_{k}"] = count / total if total else 0.0
|
|
@@ -612,6 +703,10 @@ def parse_args() -> argparse.Namespace:
|
|
| 612 |
parser.add_argument("--max-corpus-docs", type=int, default=None)
|
| 613 |
parser.add_argument("--max-queries", type=int, default=None)
|
| 614 |
parser.add_argument("--rebuild", action="store_true")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
return parser.parse_args()
|
| 616 |
|
| 617 |
|
|
@@ -619,7 +714,17 @@ def main() -> None:
|
|
| 619 |
args = parse_args()
|
| 620 |
corpus = load_eval_corpus(args)
|
| 621 |
index = build_index(corpus, args.chunk_size, args.chunk_overlap, args.rebuild)
|
| 622 |
-
report = evaluate_retrieval(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
json_path, md_path = write_reports(report)
|
| 624 |
print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
|
| 625 |
print(f"JSON report: {json_path}")
|
|
|
|
| 4 |
import csv
|
| 5 |
import json
|
| 6 |
import math
|
| 7 |
+
import re
|
| 8 |
import shutil
|
| 9 |
import zipfile
|
| 10 |
from dataclasses import dataclass
|
|
|
|
| 16 |
from llama_index.core import StorageContext, VectorStoreIndex
|
| 17 |
from llama_index.core.node_parser import SentenceSplitter
|
| 18 |
from llama_index.core.schema import Document
|
| 19 |
+
from llama_index.core.schema import NodeWithScore, TextNode
|
| 20 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
| 21 |
|
| 22 |
+
from tools.query_knowledge import (
|
| 23 |
+
BM25Retriever,
|
| 24 |
+
EMBED_MODEL_NAME,
|
| 25 |
+
RERANKER_MODEL_NAME,
|
| 26 |
+
CrossEncoderReranker,
|
| 27 |
+
configure_model_cache,
|
| 28 |
+
resolve_embed_model_name,
|
| 29 |
+
)
|
| 30 |
|
| 31 |
|
| 32 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|
|
|
| 396 |
|
| 397 |
from tools.query_knowledge import load_pdf_file
|
| 398 |
|
| 399 |
+
pdf_files = sorted((PROJECT_ROOT / "knowledge_base" / "raw").rglob("*.pdf"))
|
| 400 |
+
if not pdf_files:
|
| 401 |
+
pdf_files = sorted((PROJECT_ROOT / "tools" / "knowledge_base" / "raw").rglob("*.pdf"))
|
| 402 |
documents = []
|
| 403 |
for pdf_file in pdf_files:
|
| 404 |
for doc_index, document in enumerate(load_pdf_file(pdf_file)):
|
|
|
|
| 454 |
raise ValueError(f"Unknown dataset: {args.dataset}")
|
| 455 |
|
| 456 |
|
| 457 |
+
def collection_safe_name(value: str) -> str:
    """Turn *value* into a slug made of letters, digits, ``_`` and ``-``.

    Each run of other characters becomes a single underscore; leading and
    trailing underscores are removed. An empty result is replaced by
    ``"default"``.
    """
    sanitized = re.sub(r"[^A-Za-z0-9_-]+", "_", value)
    trimmed = sanitized.strip("_")
    if not trimmed:
        return "default"
    return trimmed
|
| 460 |
+
|
| 461 |
+
|
| 462 |
def build_index(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int, rebuild: bool) -> VectorStoreIndex:
|
| 463 |
configure_model_cache()
|
| 464 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
|
|
| 469 |
index_path.mkdir(parents=True, exist_ok=True)
|
| 470 |
|
| 471 |
db = chromadb.PersistentClient(path=str(index_path))
|
| 472 |
+
embed_slug = collection_safe_name(EMBED_MODEL_NAME)
|
| 473 |
+
collection_name = f"{corpus.name}_{embed_slug}_eval"
|
| 474 |
if rebuild:
|
| 475 |
try:
|
| 476 |
db.delete_collection(collection_name)
|
|
|
|
| 481 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
| 482 |
embed_model = HuggingFaceEmbedding(
|
| 483 |
model_name=resolve_embed_model_name(),
|
| 484 |
+
cache_folder=str(PROJECT_ROOT / "hf_cache" / "sentence_transformers"),
|
| 485 |
)
|
| 486 |
|
| 487 |
if collection.count() == 0:
|
|
|
|
| 508 |
return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
|
| 509 |
|
| 510 |
|
| 511 |
+
def build_bm25_retriever(corpus: EvalCorpus, chunk_size: int, chunk_overlap: int) -> BM25Retriever:
    """Chunk the corpus documents and wrap the chunks in a ``BM25Retriever``.

    Each corpus entry becomes a ``Document`` whose metadata carries ``doc_id``
    and ``title`` plus any entry-level ``metadata`` (which takes precedence on
    key clashes). The documents are split with a ``SentenceSplitter`` using
    the given chunk settings, and every resulting node is copied into a
    ``TextNode`` that keeps the node id, content and metadata.
    """
    documents = []
    for entry in corpus.documents:
        body = entry["text"]
        extra = entry.get("metadata") or {}
        metadata = {
            "doc_id": entry["doc_id"],
            "title": entry.get("title", ""),
            **extra,
        }
        documents.append(Document(text=body, metadata=metadata))

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    text_nodes = [
        TextNode(id_=chunk.node_id, text=chunk.get_content(), metadata=chunk.metadata)
        for chunk in splitter.get_nodes_from_documents(documents)
    ]
    return BM25Retriever(text_nodes)
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
def merge_eval_results(
    vector_results: list[NodeWithScore],
    bm25_results: list[NodeWithScore],
    top_k: int,
) -> list[NodeWithScore]:
    """Fuse vector and BM25 hits by summing reciprocal ranks per node id.

    A hit at 1-based position ``p`` in either list contributes ``1 / p``.
    Nodes found by both retrievers get the sum of their contributions. The
    fused hits are returned in descending score order, truncated to *top_k*.
    New ``NodeWithScore`` wrappers are created, so the inputs' scores are not
    modified.
    """
    fused: dict[str, NodeWithScore] = {}

    for position, hit in enumerate(vector_results, start=1):
        fused[hit.node.node_id] = NodeWithScore(node=hit.node, score=1.0 / position)

    for position, hit in enumerate(bm25_results, start=1):
        key = hit.node.node_id
        contribution = 1.0 / position
        if key not in fused:
            fused[key] = NodeWithScore(node=hit.node, score=contribution)
            continue
        fused[key].score = (fused[key].score or 0.0) + contribution

    ranked = sorted(
        fused.values(),
        key=lambda item: item.score or float("-inf"),
        reverse=True,
    )
    return ranked[:top_k]
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
def evaluate_retrieval(
|
| 557 |
+
corpus: EvalCorpus,
|
| 558 |
+
index: VectorStoreIndex,
|
| 559 |
+
top_k: int,
|
| 560 |
+
use_reranker: bool = False,
|
| 561 |
+
use_hybrid: bool = False,
|
| 562 |
+
chunk_size: int = 512,
|
| 563 |
+
chunk_overlap: int = 64,
|
| 564 |
+
reranker_model_name: str = RERANKER_MODEL_NAME,
|
| 565 |
+
reranker_candidates: int = 25,
|
| 566 |
+
) -> dict[str, Any]:
|
| 567 |
+
retrieve_top_k = max(reranker_candidates, top_k) if use_reranker else max(top_k * 5, top_k)
|
| 568 |
+
retriever = index.as_retriever(similarity_top_k=retrieve_top_k)
|
| 569 |
+
bm25_retriever = (
|
| 570 |
+
build_bm25_retriever(corpus, chunk_size, chunk_overlap)
|
| 571 |
+
if use_hybrid
|
| 572 |
+
else None
|
| 573 |
+
)
|
| 574 |
+
reranker = CrossEncoderReranker(reranker_model_name) if use_reranker else None
|
| 575 |
cases = []
|
| 576 |
hit_counts = {1: 0, 3: 0, 5: 0, top_k: 0}
|
| 577 |
reciprocal_ranks = []
|
|
|
|
| 579 |
|
| 580 |
for query in corpus.queries:
|
| 581 |
relevant_doc_ids = corpus.qrels.get(query["query_id"], set())
|
| 582 |
+
vector_results = retriever.retrieve(query["question"])
|
| 583 |
+
results = vector_results
|
| 584 |
+
if bm25_retriever:
|
| 585 |
+
bm25_results = bm25_retriever.retrieve(query["question"], retrieve_top_k)
|
| 586 |
+
results = merge_eval_results(vector_results, bm25_results, retrieve_top_k)
|
| 587 |
+
if reranker:
|
| 588 |
+
results = reranker.rerank(
|
| 589 |
+
query["question"],
|
| 590 |
+
results,
|
| 591 |
+
top_n=max(top_k * 5, top_k),
|
| 592 |
+
)
|
| 593 |
retrieved = []
|
| 594 |
seen_doc_ids = set()
|
| 595 |
first_hit_rank = None
|
|
|
|
| 646 |
"top_k": top_k,
|
| 647 |
"mrr": sum(reciprocal_ranks) / total if total else 0.0,
|
| 648 |
"ndcg_at_k": sum(ndcg_scores) / total if total else 0.0,
|
| 649 |
+
"reranker_enabled": use_reranker,
|
| 650 |
+
"hybrid_enabled": use_hybrid,
|
| 651 |
}
|
| 652 |
for k, count in sorted(hit_counts.items()):
|
| 653 |
metrics[f"hit_at_{k}"] = count / total if total else 0.0
|
|
|
|
| 703 |
parser.add_argument("--max-corpus-docs", type=int, default=None)
|
| 704 |
parser.add_argument("--max-queries", type=int, default=None)
|
| 705 |
parser.add_argument("--rebuild", action="store_true")
|
| 706 |
+
parser.add_argument("--use-hybrid", action="store_true")
|
| 707 |
+
parser.add_argument("--use-reranker", action="store_true")
|
| 708 |
+
parser.add_argument("--reranker-model", default=RERANKER_MODEL_NAME)
|
| 709 |
+
parser.add_argument("--reranker-candidates", type=int, default=25)
|
| 710 |
return parser.parse_args()
|
| 711 |
|
| 712 |
|
|
|
|
| 714 |
args = parse_args()
|
| 715 |
corpus = load_eval_corpus(args)
|
| 716 |
index = build_index(corpus, args.chunk_size, args.chunk_overlap, args.rebuild)
|
| 717 |
+
report = evaluate_retrieval(
|
| 718 |
+
corpus,
|
| 719 |
+
index,
|
| 720 |
+
args.top_k,
|
| 721 |
+
use_reranker=args.use_reranker,
|
| 722 |
+
use_hybrid=args.use_hybrid,
|
| 723 |
+
chunk_size=args.chunk_size,
|
| 724 |
+
chunk_overlap=args.chunk_overlap,
|
| 725 |
+
reranker_model_name=args.reranker_model,
|
| 726 |
+
reranker_candidates=args.reranker_candidates,
|
| 727 |
+
)
|
| 728 |
json_path, md_path = write_reports(report)
|
| 729 |
print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
|
| 730 |
print(f"JSON report: {json_path}")
|
eval/run_eval_suite.py
CHANGED
|
@@ -57,6 +57,10 @@ def build_dataset_args(args: argparse.Namespace, dataset: str) -> SimpleNamespac
|
|
| 57 |
else defaults["max_corpus_docs"],
|
| 58 |
max_queries=args.max_queries if args.max_queries is not None else defaults["max_queries"],
|
| 59 |
rebuild=args.rebuild,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
| 61 |
|
| 62 |
|
|
@@ -65,7 +69,9 @@ def run_one(dataset: str, args: argparse.Namespace) -> DatasetRun:
|
|
| 65 |
print(
|
| 66 |
f"\n=== Running {dataset} "
|
| 67 |
f"(top_k={dataset_args.top_k}, max_corpus_docs={dataset_args.max_corpus_docs}, "
|
| 68 |
-
f"max_queries={dataset_args.max_queries}, rebuild={dataset_args.rebuild}
|
|
|
|
|
|
|
| 69 |
)
|
| 70 |
|
| 71 |
corpus = load_eval_corpus(dataset_args)
|
|
@@ -75,7 +81,17 @@ def run_one(dataset: str, args: argparse.Namespace) -> DatasetRun:
|
|
| 75 |
chunk_overlap=dataset_args.chunk_overlap,
|
| 76 |
rebuild=dataset_args.rebuild,
|
| 77 |
)
|
| 78 |
-
report = evaluate_retrieval(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
json_path, md_path = write_reports(report)
|
| 80 |
print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
|
| 81 |
|
|
@@ -132,6 +148,10 @@ def parse_args() -> argparse.Namespace:
|
|
| 132 |
parser.add_argument("--max-corpus-docs", type=int, default=None)
|
| 133 |
parser.add_argument("--max-queries", type=int, default=None)
|
| 134 |
parser.add_argument("--rebuild", action="store_true")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
parser.add_argument("--fail-fast", action="store_true")
|
| 136 |
parser.add_argument("--output-name", default=None, help="Suite report filename stem under eval/reports.")
|
| 137 |
return parser.parse_args()
|
|
|
|
| 57 |
else defaults["max_corpus_docs"],
|
| 58 |
max_queries=args.max_queries if args.max_queries is not None else defaults["max_queries"],
|
| 59 |
rebuild=args.rebuild,
|
| 60 |
+
use_hybrid=args.use_hybrid,
|
| 61 |
+
use_reranker=args.use_reranker,
|
| 62 |
+
reranker_model=args.reranker_model,
|
| 63 |
+
reranker_candidates=args.reranker_candidates,
|
| 64 |
)
|
| 65 |
|
| 66 |
|
|
|
|
| 69 |
print(
|
| 70 |
f"\n=== Running {dataset} "
|
| 71 |
f"(top_k={dataset_args.top_k}, max_corpus_docs={dataset_args.max_corpus_docs}, "
|
| 72 |
+
f"max_queries={dataset_args.max_queries}, rebuild={dataset_args.rebuild}, "
|
| 73 |
+
f"use_hybrid={dataset_args.use_hybrid}, "
|
| 74 |
+
f"use_reranker={dataset_args.use_reranker}) ==="
|
| 75 |
)
|
| 76 |
|
| 77 |
corpus = load_eval_corpus(dataset_args)
|
|
|
|
| 81 |
chunk_overlap=dataset_args.chunk_overlap,
|
| 82 |
rebuild=dataset_args.rebuild,
|
| 83 |
)
|
| 84 |
+
report = evaluate_retrieval(
|
| 85 |
+
corpus,
|
| 86 |
+
index,
|
| 87 |
+
dataset_args.top_k,
|
| 88 |
+
use_hybrid=dataset_args.use_hybrid,
|
| 89 |
+
chunk_size=dataset_args.chunk_size,
|
| 90 |
+
chunk_overlap=dataset_args.chunk_overlap,
|
| 91 |
+
use_reranker=dataset_args.use_reranker,
|
| 92 |
+
reranker_model_name=dataset_args.reranker_model,
|
| 93 |
+
reranker_candidates=dataset_args.reranker_candidates,
|
| 94 |
+
)
|
| 95 |
json_path, md_path = write_reports(report)
|
| 96 |
print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
|
| 97 |
|
|
|
|
| 148 |
parser.add_argument("--max-corpus-docs", type=int, default=None)
|
| 149 |
parser.add_argument("--max-queries", type=int, default=None)
|
| 150 |
parser.add_argument("--rebuild", action="store_true")
|
| 151 |
+
parser.add_argument("--use-hybrid", action="store_true")
|
| 152 |
+
parser.add_argument("--use-reranker", action="store_true")
|
| 153 |
+
parser.add_argument("--reranker-model", default="cross-encoder/ms-marco-MiniLM-L-6-v2")
|
| 154 |
+
parser.add_argument("--reranker-candidates", type=int, default=25)
|
| 155 |
parser.add_argument("--fail-fast", action="store_true")
|
| 156 |
parser.add_argument("--output-name", default=None, help="Suite report filename stem under eval/reports.")
|
| 157 |
return parser.parse_args()
|
market_data/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .analytics import (
|
| 2 |
+
black_scholes_greeks,
|
| 3 |
+
classify_volatility_regime,
|
| 4 |
+
rank_current_iv_against_rv,
|
| 5 |
+
realized_volatility,
|
| 6 |
+
summarize_option_chain,
|
| 7 |
+
)
|
| 8 |
+
from .providers import (
|
| 9 |
+
get_current_quote,
|
| 10 |
+
get_option_chain,
|
| 11 |
+
get_price_history,
|
| 12 |
+
list_option_expirations,
|
| 13 |
+
)
|
| 14 |
+
from .schemas import OptionChain, OptionContract, UnderlyingQuote, VolSnapshot
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"black_scholes_greeks",
|
| 18 |
+
"classify_volatility_regime",
|
| 19 |
+
"get_current_quote",
|
| 20 |
+
"get_option_chain",
|
| 21 |
+
"get_price_history",
|
| 22 |
+
"list_option_expirations",
|
| 23 |
+
"OptionChain",
|
| 24 |
+
"OptionContract",
|
| 25 |
+
"realized_volatility",
|
| 26 |
+
"rank_current_iv_against_rv",
|
| 27 |
+
"summarize_option_chain",
|
| 28 |
+
"UnderlyingQuote",
|
| 29 |
+
"VolSnapshot",
|
| 30 |
+
]
|
market_data/analytics.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
from statistics import NormalDist
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from .schemas import OptionChain
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
NORMAL = NormalDist()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def realized_volatility(
    prices: pd.Series,
    windows: tuple[int, ...] = (5, 10, 20, 30, 60),
    trading_days: int = 252,
) -> dict[str, float | None]:
    """Compute annualized realized volatility over several trailing windows.

    Simple percentage returns are taken from the non-missing prices. For each
    window, the sample standard deviation (``ddof=1``) of the most recent
    ``window`` returns is scaled by ``sqrt(trading_days)``.

    Args:
        prices: Price series; missing values are dropped before use.
        windows: Lookback lengths, counted in returns.
        trading_days: Annualization factor.

    Returns:
        Mapping from ``"<window>d"`` to the annualized volatility, or None
        when the window is shorter than two returns, there are fewer returns
        than the window, or the result is not a finite number.
    """
    close = prices.dropna().astype(float)
    returns = close.pct_change().dropna()
    annualizer = math.sqrt(trading_days)
    output: dict[str, float | None] = {}

    for window in windows:
        key = f"{window}d"
        # A sample standard deviation needs at least two observations; a
        # smaller (or non-positive) window would otherwise produce NaN.
        if window < 2 or len(returns) < window:
            output[key] = None
            continue
        value = float(returns.tail(window).std(ddof=1) * annualizer)
        # Infinite returns (e.g. after a zero price) make the deviation
        # undefined; report that as None rather than leaking NaN/inf.
        output[key] = value if math.isfinite(value) else None

    return output
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _norm_pdf(value: float) -> float:
|
| 34 |
+
return math.exp(-0.5 * value * value) / math.sqrt(2 * math.pi)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def black_scholes_greeks(
    spot: float,
    strike: float,
    time_to_expiry: float,
    volatility: float,
    risk_free_rate: float = 0.0,
    dividend_yield: float = 0.0,
    option_type: str = "call",
) -> dict[str, float | None]:
    """Compute Black-Scholes-Merton Greeks for a European option.

    Args:
        spot: Underlying price.
        strike: Option strike.
        time_to_expiry: Time to expiration in years.
        volatility: Annualized volatility as a decimal.
        risk_free_rate: Continuously compounded annual rate as a decimal.
        dividend_yield: Continuous annual dividend yield as a decimal.
        option_type: ``"put"`` (case-insensitive) selects the put formulas;
            any other value is treated as a call.

    Returns:
        Dict with ``delta``, ``gamma``, ``vega``, ``theta`` and ``rho``.
        ``vega`` and ``rho`` are scaled by 1/100 and ``theta`` by 1/365
        (annual theta expressed per calendar day). All values are ``None``
        when ``spot``, ``strike``, ``time_to_expiry`` or ``volatility`` is
        not strictly positive.
    """
    if spot <= 0 or strike <= 0 or time_to_expiry <= 0 or volatility <= 0:
        return {
            "delta": None,
            "gamma": None,
            "vega": None,
            "theta": None,
            "rho": None,
        }

    normal = NormalDist()
    sqrt_t = math.sqrt(time_to_expiry)
    d1 = (
        math.log(spot / strike)
        + (risk_free_rate - dividend_yield + 0.5 * volatility * volatility) * time_to_expiry
    ) / (volatility * sqrt_t)
    d2 = d1 - volatility * sqrt_t
    discount_dividend = math.exp(-dividend_yield * time_to_expiry)
    discount_rate = math.exp(-risk_free_rate * time_to_expiry)
    pdf_d1 = normal.pdf(d1)

    # Theta = dV/dt = -dV/dT. The diffusion (time-decay) term is shared by
    # calls and puts; the carry terms differ by option type.
    time_decay = -spot * discount_dividend * pdf_d1 * volatility / (2 * sqrt_t)
    dividend_carry = dividend_yield * spot * discount_dividend
    rate_carry = risk_free_rate * strike * discount_rate

    if option_type.lower() == "put":
        delta = discount_dividend * (normal.cdf(d1) - 1)
        # Put: -q*S*e^{-qT}*N(-d1) + r*K*e^{-rT}*N(-d2)
        annual_theta = (
            time_decay
            - dividend_carry * normal.cdf(-d1)
            + rate_carry * normal.cdf(-d2)
        )
        rho = -strike * time_to_expiry * discount_rate * normal.cdf(-d2) / 100
    else:
        delta = discount_dividend * normal.cdf(d1)
        # Call: +q*S*e^{-qT}*N(d1) - r*K*e^{-rT}*N(d2)
        annual_theta = (
            time_decay
            + dividend_carry * normal.cdf(d1)
            - rate_carry * normal.cdf(d2)
        )
        rho = strike * time_to_expiry * discount_rate * normal.cdf(d2) / 100

    theta = annual_theta / 365
    gamma = discount_dividend * pdf_d1 / (spot * volatility * sqrt_t)
    vega = spot * discount_dividend * pdf_d1 * sqrt_t / 100

    return {
        "delta": float(delta),
        "gamma": float(gamma),
        "vega": float(vega),
        "theta": float(theta),
        "rho": float(rho),
    }
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def nearest_atm_iv(chain: OptionChain) -> float | None:
    """Return the implied volatility of the contract struck closest to spot.

    Calls are scanned before puts, and the first contract at the minimum
    strike distance wins. Contracts without a strictly positive implied
    volatility are ignored. Returns ``None`` when the chain has no underlying
    price or no usable contract.
    """
    if chain.underlying_price is None:
        return None

    closest = None
    closest_gap = None
    for candidate in chain.calls + chain.puts:
        iv = candidate.implied_volatility
        if iv is None or not iv > 0:
            continue
        gap = abs(candidate.strike - chain.underlying_price)
        # Strict comparison keeps the earliest contract on ties.
        if closest is None or gap < closest_gap:
            closest, closest_gap = candidate, gap

    if closest is None:
        return None
    return closest.implied_volatility
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def simple_skew(chain: OptionChain) -> float | None:
    """Return put IV minus call IV for the nearest out-of-the-money strikes.

    Uses the highest-strike put below the underlying price and the
    lowest-strike call above it, considering only contracts with a truthy
    implied volatility. Returns ``None`` when the underlying price is missing
    or either side has no qualifying contract.
    """
    if chain.underlying_price is None:
        return None

    puts_below = [
        put
        for put in chain.puts
        if put.strike < chain.underlying_price and put.implied_volatility
    ]
    calls_above = [
        call
        for call in chain.calls
        if call.strike > chain.underlying_price and call.implied_volatility
    ]
    if not (puts_below and calls_above):
        return None

    nearest_put = max(puts_below, key=lambda put: put.strike)
    nearest_call = min(calls_above, key=lambda call: call.strike)
    put_iv = nearest_put.implied_volatility or 0
    call_iv = nearest_call.implied_volatility or 0
    return float(put_iv - call_iv)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def summarize_option_chain(chain: OptionChain, realized_vol_20d: float | None = None) -> dict:
    """Build a compact volatility summary for one option chain.

    Args:
        chain: Option chain to summarize.
        realized_vol_20d: Optional realized volatility used for the IV-RV
            spread; the spread is ``None`` when this or the ATM IV is missing.

    Returns:
        Dict with symbol, expiration, underlying price, ATM IV, IV-RV spread,
        put-minus-call skew, and the number of calls and puts.
    """
    atm_iv = nearest_atm_iv(chain)

    if atm_iv is None or realized_vol_20d is None:
        spread = None
    else:
        spread = float(atm_iv - realized_vol_20d)

    summary = {
        "symbol": chain.symbol,
        "expiration": chain.expiration,
        "underlying_price": chain.underlying_price,
        "atm_iv": atm_iv,
        "iv_rv_spread_20d": spread,
    }
    summary["skew_put_minus_call"] = simple_skew(chain)
    summary["call_count"] = len(chain.calls)
    summary["put_count"] = len(chain.puts)
    return summary
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def rank_current_iv_against_rv(
    current_iv: float | None,
    realized_vols: dict[str, float | None],
) -> float | None:
    """Place the current IV within the range of realized-volatility windows.

    Returns a value clamped to ``[0.0, 1.0]`` where 0 is the lowest and 1 the
    highest available realized volatility. Returns ``None`` when the IV is
    missing, fewer than two realized values exist, or they span no range.
    """
    if current_iv is None:
        return None

    observed = [vol for vol in realized_vols.values() if vol is not None]
    if len(observed) < 2:
        return None

    floor, ceiling = min(observed), max(observed)
    if ceiling <= floor:
        return None

    position = (current_iv - floor) / (ceiling - floor)
    return max(0.0, min(1.0, position))
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def classify_volatility_regime(
    current_iv: float | None,
    realized_vol_20d: float | None,
    term_structure_slope: float | None,
    skew: float | None,
) -> dict:
    """Label the volatility regime from the IV-RV spread, slope and skew.

    The regime is chosen from the IV minus realized-vol spread (above 0.08,
    below -0.04, or in between). Term-structure slope (beyond +/-0.04) and
    absolute skew (above 0.05) only contribute explanatory notes. Confidence
    is ``"medium"`` once at least two notes were collected, else ``"low"``.

    Returns:
        Dict with ``regime``, ``vol_signal``, ``confidence`` and ``notes``.
    """
    if current_iv is None or realized_vol_20d is None:
        return {
            "regime": "unknown",
            "vol_signal": "insufficient_iv_or_rv",
            "confidence": "low",
            "notes": ["Need both option implied volatility and realized volatility."],
        }

    spread = current_iv - realized_vol_20d
    if spread > 0.08:
        regime, vol_signal, headline = (
            "high_implied_vol_premium",
            "short_vol_candidate",
            "Current ATM IV is materially above 20D realized volatility.",
        )
    elif spread < -0.04:
        regime, vol_signal, headline = (
            "low_implied_vol_discount",
            "long_vol_candidate",
            "Current ATM IV is below 20D realized volatility.",
        )
    else:
        regime, vol_signal, headline = (
            "balanced_iv_vs_rv",
            "neutral_vol",
            "Current ATM IV is close to 20D realized volatility.",
        )

    notes = [headline]
    if term_structure_slope is not None:
        if term_structure_slope > 0.04:
            notes.append("Term structure is upward sloping.")
        elif term_structure_slope < -0.04:
            notes.append("Term structure is inverted or front-loaded.")
    if skew is not None and abs(skew) > 0.05:
        notes.append("Put-call skew is elevated in the sampled expiration.")

    return {
        "regime": regime,
        "vol_signal": vol_signal,
        "confidence": "low" if len(notes) < 2 else "medium",
        "notes": notes,
    }
|
market_data/providers.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import date, datetime
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import yfinance as yf
|
| 8 |
+
|
| 9 |
+
from .schemas import OptionChain, OptionContract, UnderlyingQuote
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def none_if_nan(value: Any) -> Any:
    """Return ``None`` when pandas considers ``value`` missing, else ``value``."""
    return None if pd.isna(value) else value
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def to_float(value: Any) -> float | None:
    """Convert ``value`` to ``float``, mapping missing values to ``None``."""
    # pd.isna covers both None and NaN.
    if pd.isna(value):
        return None
    return float(value)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def to_int(value: Any) -> int | None:
    """Convert ``value`` to ``int``, mapping missing values to ``None``."""
    # pd.isna covers both None and NaN.
    if pd.isna(value):
        return None
    return int(value)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_price_history(
    symbol: str,
    period: str = "1y",
    interval: str = "1d",
    start: str | None = None,
    end: str | None = None,
) -> pd.DataFrame:
    """Fetch price history for ``symbol`` from Yahoo Finance.

    The symbol is stripped and upper-cased; all other arguments are forwarded
    unchanged to ``yfinance.Ticker.history``.
    """
    normalized = symbol.strip().upper()
    request = {"period": period, "interval": interval, "start": start, "end": end}
    return yf.Ticker(normalized).history(**request)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_current_quote(symbol: str) -> UnderlyingQuote:
    """Return the latest quote for ``symbol``.

    The most recent 1-minute bar of today's session is preferred. When no
    usable intraday bar exists, the quote falls back to ``ticker.info``.

    Args:
        symbol: Ticker; stripped and upper-cased before use.

    Returns:
        ``UnderlyingQuote`` with ``data_type`` ``"intraday_1m"`` or
        ``"cached_info"``. In the fallback path ``current_price`` is ``None``
        when no price field is available.
    """
    # Local import keeps the module's top-level import block untouched.
    from datetime import timezone

    symbol = symbol.strip().upper()
    ticker = yf.Ticker(symbol)
    data = ticker.history(period="1d", interval="1m")

    # Bars without a close cannot provide a price; drop them so a NaN is
    # never reported as the current price.
    if not data.empty:
        data = data.dropna(subset=["Close"])

    if not data.empty:
        latest_row = data.iloc[-1]
        return UnderlyingQuote(
            symbol=symbol,
            current_price=float(latest_row["Close"]),
            # to_float/to_int map NaN to None instead of raising (int(NaN))
            # or leaking NaN into the quote.
            open=to_float(latest_row["Open"]),
            high=to_float(latest_row["High"]),
            low=to_float(latest_row["Low"]),
            volume=to_int(latest_row["Volume"]),
            timestamp=str(data.index[-1]),
            data_type="intraday_1m",
        )

    info = ticker.info or {}
    current_price = (
        info.get("regularMarketPrice")
        or info.get("previousClose")
        or info.get("ask")
        or info.get("bid")
    )
    # datetime.utcnow() is deprecated; build the same naive-UTC ISO string
    # from an aware timestamp so the output format is unchanged.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)
    return UnderlyingQuote(
        symbol=symbol,
        current_price=float(current_price) if current_price else None,
        open=to_float(info.get("regularMarketOpen") or info.get("open")),
        high=to_float(info.get("regularMarketDayHigh") or info.get("dayHigh")),
        low=to_float(info.get("regularMarketDayLow") or info.get("dayLow")),
        volume=to_int(info.get("regularMarketVolume") or info.get("volume")),
        timestamp=fetched_at.isoformat(timespec="seconds"),
        data_type="cached_info",
        short_name=info.get("shortName", ""),
    )
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def list_option_expirations(symbol: str) -> list[str]:
    """Return the option expiration dates Yahoo Finance lists for ``symbol``.

    The symbol is stripped and upper-cased. An empty list is returned when
    the provider reports no expirations.
    """
    available = yf.Ticker(symbol.strip().upper()).options
    return list(available) if available else []
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def liquidity_warnings(row: pd.Series) -> list[str]:
    """Collect liquidity warning codes for one option-chain row.

    Possible codes: ``missing_or_zero_bid_ask``, ``wide_bid_ask_spread``
    (spread above 25% of the ask), ``zero_volume`` and ``zero_open_interest``.
    Missing volume or open interest is treated as zero.
    """
    bid = to_float(row.get("bid"))
    ask = to_float(row.get("ask"))
    traded = to_int(row.get("volume")) or 0
    outstanding = to_int(row.get("openInterest")) or 0

    flags = []
    if bid is None or ask is None or bid <= 0 or ask <= 0:
        flags.append("missing_or_zero_bid_ask")
    elif (ask - bid) / ask > 0.25:
        # ask is known to be positive here, so the division is safe.
        flags.append("wide_bid_ask_spread")

    if traded <= 0:
        flags.append("zero_volume")
    if outstanding <= 0:
        flags.append("zero_open_interest")
    return flags
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def row_to_contract(row: pd.Series, option_type: str, expiration: str) -> OptionContract:
    """Convert one provider option-chain row into an ``OptionContract``.

    Args:
        row: Row with provider column names (``bid``, ``ask``, ``strike``,
            ``lastPrice``, ``volume``, ``openInterest``, ``impliedVolatility``,
            ``inTheMoney``, ``contractSymbol``).
        option_type: Stored as-is on the contract (callers pass ``"call"`` or
            ``"put"``).
        expiration: ISO date string (``YYYY-MM-DD``).

    Returns:
        The populated contract. ``mid`` is set only when both bid and ask are
        positive; ``days_to_expiration`` is floored at zero.
    """
    bid = to_float(row.get("bid"))
    ask = to_float(row.get("ask"))

    if bid is not None and ask is not None and bid > 0 and ask > 0:
        mid = (bid + ask) / 2
    else:
        mid = None

    # Days remaining relative to the local calendar date.
    remaining = (date.fromisoformat(expiration) - date.today()).days

    return OptionContract(
        contract_symbol=str(row.get("contractSymbol", "")),
        option_type=option_type,
        expiration=expiration,
        strike=float(row.get("strike")),
        bid=bid,
        ask=ask,
        mid=mid,
        last_price=to_float(row.get("lastPrice")),
        volume=to_int(row.get("volume")),
        open_interest=to_int(row.get("openInterest")),
        implied_volatility=to_float(row.get("impliedVolatility")),
        in_the_money=bool(row.get("inTheMoney", False)),
        days_to_expiration=max(remaining, 0),
        liquidity_warnings=liquidity_warnings(row),
    )
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_option_chain(symbol: str, expiration: str | None = None) -> OptionChain:
    """Fetch one expiration's option chain together with the underlying price.

    Args:
        symbol: Ticker; stripped and upper-cased before use.
        expiration: Expiration date string; the first listed expiration is
            used when omitted or empty.

    Returns:
        ``OptionChain`` holding the converted calls and puts.

    Raises:
        ValueError: If the symbol has no expirations or the requested
            expiration is not listed.
    """
    symbol = symbol.strip().upper()
    ticker = yf.Ticker(symbol)

    listed = list(ticker.options or [])
    if not listed:
        raise ValueError(f"No option expirations found for {symbol}.")

    expiration = expiration or listed[0]
    if expiration not in listed:
        raise ValueError(f"Expiration {expiration} is not available for {symbol}.")

    raw_chain = ticker.option_chain(expiration)
    quote = get_current_quote(symbol)

    def _convert(frame, kind: str) -> list[OptionContract]:
        # One OptionContract per provider row, in the provider's order.
        return [row_to_contract(row, kind, expiration) for _, row in frame.iterrows()]

    calls = _convert(raw_chain.calls, "call")
    puts = _convert(raw_chain.puts, "put")
    return OptionChain(
        symbol=symbol,
        expiration=expiration,
        underlying_price=quote.current_price,
        calls=calls,
        puts=puts,
    )
|
market_data/schemas.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import asdict, dataclass
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class UnderlyingQuote:
    """Price quote for an underlying asset."""

    symbol: str
    current_price: float | None
    open: float | None
    high: float | None
    low: float | None
    volume: int | None
    timestamp: str
    # Label describing where the quote came from.
    data_type: str
    short_name: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the quote's fields as a plain dictionary."""
        fields: dict[str, Any] = asdict(self)
        return fields
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class OptionContract:
    """Single option contract with quote, volume and liquidity fields."""

    contract_symbol: str
    option_type: str
    expiration: str
    strike: float
    bid: float | None
    ask: float | None
    mid: float | None
    last_price: float | None
    volume: int | None
    open_interest: int | None
    implied_volatility: float | None
    in_the_money: bool
    days_to_expiration: int
    liquidity_warnings: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the contract's fields as a plain dictionary."""
        fields: dict[str, Any] = asdict(self)
        return fields
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class OptionChain:
    """Calls and puts for one symbol and expiration, plus the underlying price."""

    symbol: str
    expiration: str
    underlying_price: float | None
    calls: list[OptionContract]
    puts: list[OptionContract]

    def to_dict(self) -> dict[str, Any]:
        """Return the chain as a dictionary, serializing each contract via ``to_dict``."""
        payload: dict[str, Any] = {
            "symbol": self.symbol,
            "expiration": self.expiration,
            "underlying_price": self.underlying_price,
        }
        payload["calls"] = [call.to_dict() for call in self.calls]
        payload["puts"] = [put.to_dict() for put in self.puts]
        return payload
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@dataclass
class VolSnapshot:
    """Volatility summary for one symbol across sampled expirations."""

    symbol: str
    current_price: float | None
    # Keyed by window label.
    realized_volatility: dict[str, float | None]
    # The three *_by_expiration mappings are keyed by expiration string.
    atm_iv_by_expiration: dict[str, float | None]
    iv_rv_spread_by_expiration: dict[str, float | None]
    term_structure_slope: float | None
    skew_by_expiration: dict[str, float | None]

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot's fields as a plain dictionary."""
        fields: dict[str, Any] = asdict(self)
        return fields
|
market_data/tools.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from smolagents import tool
|
| 6 |
+
|
| 7 |
+
from .analytics import (
|
| 8 |
+
black_scholes_greeks,
|
| 9 |
+
classify_volatility_regime,
|
| 10 |
+
rank_current_iv_against_rv,
|
| 11 |
+
realized_volatility,
|
| 12 |
+
summarize_option_chain,
|
| 13 |
+
)
|
| 14 |
+
from .providers import get_current_quote, get_option_chain, get_price_history, list_option_expirations
|
| 15 |
+
from .schemas import VolSnapshot
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def json_dumps(payload) -> str:
    """Serialize ``payload`` to indented JSON.

    Non-ASCII characters are kept as-is and objects the encoder cannot
    serialize are converted with ``str``.
    """
    encoder_options = {"ensure_ascii": False, "indent": 2, "default": str}
    return json.dumps(payload, **encoder_options)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@tool
def query_market_asset(symbol: str) -> str:
    """Query the current price and intraday quote data for an asset.

    Args:
        symbol: Yahoo Finance ticker, e.g. AAPL, SPY, ^VIX, BTC-USD, EURUSD=X.
    """
    # Failures are reported as a JSON error payload rather than raised.
    try:
        quote_fields = get_current_quote(symbol).to_dict()
        return json_dumps({"status": "success", **quote_fields})
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json_dumps(failure)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@tool
def query_price_history(symbol: str, period: str = "1y", interval: str = "1d") -> str:
    """Query historical OHLCV prices for an asset.

    Args:
        symbol: Yahoo Finance ticker.
        period: Yahoo Finance period such as 1mo, 6mo, 1y, 5y.
        interval: Yahoo Finance interval such as 1d, 1h, 15m.
    """
    try:
        frame = get_price_history(symbol, period=period, interval=interval)
        # Only the 20 most recent rows are returned to keep the payload small.
        latest = frame.tail(20).reset_index()
        rows = latest.to_dict(orient="records")
        response = {
            "status": "success",
            "symbol": symbol.upper(),
            "period": period,
            "interval": interval,
            "rows_returned": len(rows),
            "latest_rows": rows,
        }
        return json_dumps(response)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json_dumps(failure)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@tool
def query_realized_volatility(symbol: str, period: str = "1y") -> str:
    """Calculate realized volatility windows from historical close prices.

    Args:
        symbol: Yahoo Finance ticker.
        period: Yahoo Finance history period.
    """
    try:
        closes = get_price_history(symbol, period=period, interval="1d")["Close"]
        response = {
            "status": "success",
            "symbol": symbol.upper(),
            "realized_volatility": realized_volatility(closes),
        }
        return json_dumps(response)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json_dumps(failure)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@tool
def query_option_expirations(symbol: str) -> str:
    """List available option expiration dates for an underlying.

    Args:
        symbol: Yahoo Finance ticker.
    """
    try:
        listed = list_option_expirations(symbol)
        response = {"status": "success", "symbol": symbol.upper(), "expirations": listed}
        return json_dumps(response)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json_dumps(failure)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _contracts_near_spot(contracts: list[dict], spot: float | None, limit: int = 80) -> list[dict]:
    """Keep at most ``limit`` contract dicts, preferring strikes closest to ``spot``."""
    if spot is None or len(contracts) <= limit:
        return contracts[:limit]
    closest = sorted(contracts, key=lambda contract: abs(contract["strike"] - spot))[:limit]
    # Present the kept contracts in ascending strike order.
    return sorted(closest, key=lambda contract: contract["strike"])


@tool
def query_option_chain(symbol: str, expiration: str = "") -> str:
    """Query an option chain with liquidity warnings and implied volatility.

    Args:
        symbol: Yahoo Finance ticker.
        expiration: Expiration date in YYYY-MM-DD. Leave empty to use the nearest expiration.
    """
    try:
        chain = get_option_chain(symbol, expiration or None)
        summary = summarize_option_chain(chain)
        payload = chain.to_dict()
        payload["summary"] = summary
        # Cap each side at 80 contracts. Taking the first 80 rows would keep
        # only the lowest strikes and can drop the near-the-money contracts on
        # wide chains, so keep the strikes closest to the underlying price.
        spot = payload.get("underlying_price")
        payload["calls"] = _contracts_near_spot(payload["calls"], spot)
        payload["puts"] = _contracts_near_spot(payload["puts"], spot)
        return json_dumps({"status": "success", **payload})
    except Exception as exc:
        return json_dumps({"status": "error", "symbol": symbol, "message": str(exc)})
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
@tool
def query_volatility_snapshot(symbol: str, max_expirations: int = 4, history_period: str = "1y") -> str:
    """Summarize realized volatility, ATM IV, IV-RV spread, skew, and term structure.

    Args:
        symbol: Yahoo Finance ticker.
        max_expirations: Number of expirations to sample from the option chain.
        history_period: Yahoo Finance history period for realized volatility.
    """
    try:
        symbol = symbol.strip().upper()
        quote = get_current_quote(symbol)
        closes = get_price_history(symbol, period=history_period, interval="1d")["Close"]
        rv = realized_volatility(closes)
        rv_20d = rv.get("20d")
        sampled_expirations = list_option_expirations(symbol)[:max_expirations]

        # Per-expiration metrics, keyed by expiration in sampling order.
        atm_iv_by_expiration = {}
        iv_rv_spread_by_expiration = {}
        skew_by_expiration = {}
        for expiration in sampled_expirations:
            summary = summarize_option_chain(
                get_option_chain(symbol, expiration),
                realized_vol_20d=rv_20d,
            )
            atm_iv_by_expiration[expiration] = summary["atm_iv"]
            iv_rv_spread_by_expiration[expiration] = summary["iv_rv_spread_20d"]
            skew_by_expiration[expiration] = summary["skew_put_minus_call"]

        valid_term_ivs = [iv for iv in atm_iv_by_expiration.values() if iv is not None]
        sampled_skews = [skew for skew in skew_by_expiration.values() if skew is not None]

        # "Front" values come from the first sampled expiration that has data.
        current_atm_iv = valid_term_ivs[0] if valid_term_ivs else None
        front_skew = sampled_skews[0] if sampled_skews else None

        # Slope is the last available ATM IV minus the first.
        if len(valid_term_ivs) >= 2:
            term_structure_slope = float(valid_term_ivs[-1] - valid_term_ivs[0])
        else:
            term_structure_slope = None

        regime = classify_volatility_regime(
            current_iv=current_atm_iv,
            realized_vol_20d=rv_20d,
            term_structure_slope=term_structure_slope,
            skew=front_skew,
        )
        snapshot = VolSnapshot(
            symbol=symbol,
            current_price=quote.current_price,
            realized_volatility=rv,
            atm_iv_by_expiration=atm_iv_by_expiration,
            iv_rv_spread_by_expiration=iv_rv_spread_by_expiration,
            term_structure_slope=term_structure_slope,
            skew_by_expiration=skew_by_expiration,
        )
        response = {
            "status": "success",
            **snapshot.to_dict(),
            "front_atm_iv": current_atm_iv,
            "front_skew": front_skew,
            "iv_vs_rv_rank_proxy": rank_current_iv_against_rv(current_atm_iv, rv),
            "volatility_regime": regime,
            "limitations": [
                "IV rank/percentile is a proxy based on current ATM IV versus realized-volatility windows.",
                "True historical IV rank requires historical option-chain data from a richer provider.",
            ],
        }
        return json_dumps(response)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json_dumps(failure)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@tool
def calculate_option_greeks(
    spot: float,
    strike: float,
    time_to_expiry: float,
    volatility: float,
    option_type: str = "call",
    risk_free_rate: float = 0.0,
    dividend_yield: float = 0.0,
) -> str:
    """Calculate Black-Scholes-Merton Greeks for a single option.

    Args:
        spot: Current underlying price.
        strike: Option strike.
        time_to_expiry: Time to expiration in years.
        volatility: Annualized implied volatility as a decimal.
        option_type: call or put.
        risk_free_rate: Annualized risk-free rate as a decimal.
        dividend_yield: Annualized dividend yield as a decimal.
    """
    # Match the other tools in this module: report failures (e.g. arguments of
    # the wrong type) as a JSON error payload instead of raising.
    try:
        greeks = black_scholes_greeks(
            spot=spot,
            strike=strike,
            time_to_expiry=time_to_expiry,
            volatility=volatility,
            risk_free_rate=risk_free_rate,
            dividend_yield=dividend_yield,
            option_type=option_type,
        )
        return json_dumps({"status": "success", "greeks": greeks})
    except Exception as exc:
        return json_dumps({"status": "error", "message": str(exc)})
|
optimizer/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .vol_optimizer import optimize_volatility_signal
|
| 2 |
+
|
| 3 |
+
__all__ = ["optimize_volatility_signal"]
|
optimizer/tools.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from smolagents import tool
|
| 6 |
+
|
| 7 |
+
from market_data.providers import get_price_history
|
| 8 |
+
|
| 9 |
+
from .vol_optimizer import optimize_volatility_signal
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@tool
def optimize_volatility_signal_parameters(
    symbol: str,
    signal: str = "long_vol",
    period: str = "3y",
) -> str:
    """Scan simple realized-volatility signal parameters and compare best vs baseline.

    Args:
        symbol: Yahoo Finance ticker.
        signal: long_vol or short_vol.
        period: Yahoo Finance history period.
    """
    # Failures are reported as a JSON error payload rather than raised.
    try:
        closes = get_price_history(symbol, period=period, interval="1d")["Close"]
        scan = optimize_volatility_signal(closes, signal=signal)
        response = {"status": "success", "symbol": symbol.upper(), **scan}
        return json.dumps(response, ensure_ascii=False, indent=2)
    except Exception as exc:
        failure = {"status": "error", "symbol": symbol, "message": str(exc)}
        return json.dumps(failure, ensure_ascii=False, indent=2)
|
optimizer/vol_optimizer.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from backtest.vol_backtest import backtest_realized_vol_signal
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def optimize_volatility_signal(
    prices: pd.Series,
    signal: str = "long_vol",
    short_windows: tuple[int, ...] = (5, 10, 15),
    long_windows: tuple[int, ...] = (20, 30, 60),
    holding_days_options: tuple[int, ...] = (3, 5, 10),
) -> dict:
    """Grid-scan realized-vol signal parameters and rank the backtest runs.

    Every combination with ``short_window < long_window`` is backtested via
    ``backtest_realized_vol_signal``. Runs are ranked by total return proxy,
    then smaller absolute drawdown proxy, then win rate.

    Args:
        prices: Price series passed to each backtest.
        signal: Signal name forwarded to the backtest.
        short_windows: Candidate short lookback windows.
        long_windows: Candidate long lookback windows.
        holding_days_options: Candidate holding periods.

    Returns:
        Dict with the signal, the best run, the baseline run (10/30/5 when it
        was scanned, otherwise the best run), the top ten runs, the
        best-minus-baseline metric deltas, and an overfitting caveat.
    """
    metric_names = (
        "trade_count",
        "win_rate",
        "total_return_proxy",
        "max_drawdown_proxy",
        "avg_trade_pnl_proxy",
    )

    runs = []
    for short_window in short_windows:
        for long_window in long_windows:
            # The short lookback must be strictly shorter than the long one.
            if short_window >= long_window:
                continue
            for holding_days in holding_days_options:
                result = backtest_realized_vol_signal(
                    prices=prices,
                    short_window=short_window,
                    long_window=long_window,
                    holding_days=holding_days,
                    signal=signal,
                )
                run = {
                    "short_window": short_window,
                    "long_window": long_window,
                    "holding_days": holding_days,
                }
                run.update((name, result[name]) for name in metric_names)
                runs.append(run)

    def _ranking(run: dict) -> tuple:
        return (
            run["total_return_proxy"],
            -abs(run["max_drawdown_proxy"]),
            run["win_rate"],
        )

    runs = sorted(runs, key=_ranking, reverse=True)
    best = runs[0] if runs else None

    # Baseline is the 10/30/5 configuration, falling back to the best run.
    baseline = best
    for run in runs:
        if (run["short_window"], run["long_window"], run["holding_days"]) == (10, 30, 5):
            baseline = run
            break

    if best and baseline:
        metrics_delta = {
            "total_return_proxy_delta": best["total_return_proxy"] - baseline["total_return_proxy"],
            "win_rate_delta": best["win_rate"] - baseline["win_rate"],
            "max_drawdown_proxy_delta": best["max_drawdown_proxy"] - baseline["max_drawdown_proxy"],
        }
    else:
        metrics_delta = None

    return {
        "signal": signal,
        "best": best,
        "baseline": baseline,
        "top_runs": runs[:10],
        "metrics_delta": metrics_delta,
        "anti_overfit_note": (
            "This is an in-sample parameter scan. Use walk-forward or out-of-sample validation "
            "before trusting optimized parameters."
        ),
    }
|
prompts.yaml
CHANGED
|
@@ -9,6 +9,18 @@
|
|
| 9 |
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
|
| 10 |
In the end you have to return a final answer using the `final_answer` tool.
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
Here are a few examples using notional tools:
|
| 13 |
---
|
| 14 |
Task: "Generate an image of the oldest person in this document."
|
|
|
|
| 9 |
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
|
| 10 |
In the end you have to return a final answer using the `final_answer` tool.
|
| 11 |
|
| 12 |
+
You are also an options research agent focused on volatility trading. When the task concerns options, volatility, market data, strategy construction, or backtesting, follow these rules:
|
| 13 |
+
- Treat all outputs as research and education, not guaranteed investment advice.
|
| 14 |
+
- Prefer `query_knowledge` for stable options concepts, formulas, Greeks, volatility trading theory, and citations from local reference books.
|
| 15 |
+
- Use `web_search` and `visit_webpage` for recent market events, earnings dates, company announcements, macro events, exchange rules, and source verification.
|
| 16 |
+
- Use market data tools for current price, option chains, realized volatility, IV/RV spread, skew, term structure, and Greeks before proposing a strategy.
|
| 17 |
+
- For volatility strategies, state whether the idea is long vol, short vol, term-structure, skew, or event-vol driven.
|
| 18 |
+
- Every strategy discussion must include legs, expiration, strikes, net debit/credit, max loss, breakevens, major Greeks exposure, liquidity warnings, and event/IV-crush risk when relevant.
|
| 19 |
+
- Before presenting a final strategy, use payoff/backtest/optimization tools when sufficient data is available, and clearly label any proxy backtest limitations.
|
| 20 |
+
- Never present short premium strategies as low-risk. Explicitly mention tail risk, margin, assignment, liquidity, slippage, and gap risk.
|
| 21 |
+
- If required inputs are missing, ask for the missing symbol, outlook, time horizon, risk budget, or whether naked option selling is allowed.
|
| 22 |
+
- Final answers for options tasks should use this structure when applicable: market_context, volatility_view, strategy_candidates, selected_strategy, backtest_summary, risk_warnings, sources, limitations.
|
| 23 |
+
|
| 24 |
Here are a few examples using notional tools:
|
| 25 |
---
|
| 26 |
Task: "Generate an image of the oldest person in this document."
|
pyproject.toml
CHANGED
|
@@ -18,6 +18,8 @@ dependencies = [
|
|
| 18 |
"tokenizers>=0.22.0,<=0.23.0",
|
| 19 |
"transformers<5",
|
| 20 |
"pymupdf>=1.27.2.3",
|
|
|
|
|
|
|
| 21 |
]
|
| 22 |
|
| 23 |
[build-system]
|
|
|
|
| 18 |
"tokenizers>=0.22.0,<=0.23.0",
|
| 19 |
"transformers<5",
|
| 20 |
"pymupdf>=1.27.2.3",
|
| 21 |
+
"pandas>=2.0.0",
|
| 22 |
+
"yfinance>=0.2.0",
|
| 23 |
]
|
| 24 |
|
| 25 |
[build-system]
|
quantconnect/README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# QuantConnect / LEAN Option Backtest Template
|
| 2 |
+
|
| 3 |
+
这个目录放真正期权历史回测的 QuantConnect/LEAN 模板。它和本地
|
| 4 |
+
`backtest_long_straddle_csv` 的定位不同:
|
| 5 |
+
|
| 6 |
+
- 本地 CSV 回测:适合用供应商导出的历史期权 bid/ask quote 快速验证策略。
|
| 7 |
+
- QuantConnect/LEAN:适合用 LEAN 数据源、撮合模型、手续费模型、组合持仓和保证金模型做更完整的回测。
|
| 8 |
+
|
| 9 |
+
## 当前模板
|
| 10 |
+
|
| 11 |
+
`VolatilityStraddleAlgorithm.py` 实现一个 ATM long straddle 示例:
|
| 12 |
+
|
| 13 |
+
- 每隔 `entry_every_days` 天寻找目标 DTE 附近的期权到期日。
|
| 14 |
+
- 选择最接近 ATM 的 call 和 put。
|
| 15 |
+
- 用市场单买入 1 组 straddle。
|
| 16 |
+
- 持有 `holding_days` 后平仓。
|
| 17 |
+
- 使用 LEAN 的期权链、组合持仓、手续费/滑点/撮合模型能力。
|
| 18 |
+
|
| 19 |
+
## 使用方式
|
| 20 |
+
|
| 21 |
+
1. 在 QuantConnect 新建 Python algorithm。
|
| 22 |
+
2. 将 `VolatilityStraddleAlgorithm.py` 内容复制到 `main.py`。
|
| 23 |
+
3. 根据标的、日期、DTE、holding period 和资金规模修改参数。
|
| 24 |
+
4. 运行回测,导出 orders/trades/equity 后可再交给 agent 分析。
|
| 25 |
+
|
| 26 |
+
## 注意
|
| 27 |
+
|
| 28 |
+
真实期权回测必须有历史期权链或历史期权报价。`yfinance` 只能查当前/近期期权链,不能可靠提供历史期权链,所以不能单独支撑严肃的历史期权策略回测。
|
quantconnect/VolatilityStraddleAlgorithm.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from AlgorithmImports import *
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class VolatilityStraddleAlgorithm(QCAlgorithm):
    """ATM long straddle template for real option backtests in QuantConnect/LEAN.

    Once an entry is due, the algorithm buys the call and the put whose strikes
    are closest to the underlying price on the expiry nearest ``target_dte``.
    Each pair is tracked as a "group" and closed ``holding_days`` days after it
    was opened.
    """

    def Initialize(self):
        """Set the backtest window, cash, strategy parameters and subscriptions."""
        self.SetStartDate(2022, 1, 1)
        self.SetEndDate(2024, 1, 1)
        self.SetCash(100000)

        self.ticker = "SPY"
        self.target_dte = 30
        self.holding_days = 5
        self.entry_every_days = 5
        self.contract_quantity = 1

        equity = self.AddEquity(self.ticker, Resolution.Minute)
        option = self.AddOption(self.ticker, Resolution.Minute)
        option.SetFilter(self.OptionFilter)

        self.underlying = equity.Symbol
        self.option_symbol = option.Symbol
        self.next_entry_time = self.StartDate
        # Each entry is a dict with entry_time, exit_time, symbols and quantity.
        self.open_groups = []

    def OptionFilter(self, universe):
        """Restrict the option universe to +/-10 strikes and target_dte +/-10 days."""
        min_dte = max(1, self.target_dte - 10)
        max_dte = self.target_dte + 10
        return universe.IncludeWeeklys().Strikes(-10, 10).Expiration(min_dte, max_dte)

    def OnData(self, slice):
        """Close matured groups, then open a new straddle when an entry is due."""
        self.CloseExpiredHoldingGroups()

        if self.Time < self.next_entry_time:
            return

        chain = slice.OptionChains.get(self.option_symbol)
        if chain is None:
            return

        today = self.Time.date()
        contracts = [contract for contract in chain if contract.Expiry.date() > today]
        if not contracts:
            return

        spot = self.Securities[self.underlying].Price
        # A non-positive price would make "nearest strike to spot" meaningless
        # (presumably possible before the first underlying bar arrives - confirm).
        # Skip this slice and retry on the next one.
        if spot <= 0:
            return

        # Pick the expiry whose days-to-expiration is closest to the target.
        expiry = min(
            contracts,
            key=lambda contract: abs((contract.Expiry.date() - today).days - self.target_dte),
        ).Expiry
        expiry_contracts = [contract for contract in contracts if contract.Expiry == expiry]

        calls = [contract for contract in expiry_contracts if contract.Right == OptionRight.Call]
        puts = [contract for contract in expiry_contracts if contract.Right == OptionRight.Put]
        if not calls or not puts:
            return

        call = min(calls, key=lambda contract: abs(contract.Strike - spot))
        put = min(puts, key=lambda contract: abs(contract.Strike - spot))

        self.MarketOrder(call.Symbol, self.contract_quantity)
        self.MarketOrder(put.Symbol, self.contract_quantity)

        self.open_groups.append(
            {
                "entry_time": self.Time,
                "exit_time": self.Time + timedelta(days=self.holding_days),
                "symbols": [call.Symbol, put.Symbol],
                # Remember how many contracts this group bought so that closing
                # it does not touch contracts owned by other open groups.
                "quantity": self.contract_quantity,
            }
        )
        self.next_entry_time = self.Time + timedelta(days=self.entry_every_days)

        self.Debug(
            f"Opened ATM straddle {call.Symbol.Value}, {put.Symbol.Value}; "
            f"spot={spot:.2f}; expiry={expiry.date()}"
        )

    def CloseExpiredHoldingGroups(self):
        """Close every group whose exit_time has been reached and keep the rest.

        Only the quantity recorded for the group is sold. When holding_days is
        longer than entry_every_days, two open groups can hold the same
        contract; selling the whole position would close the younger group too.
        """
        remaining_groups = []
        for group in self.open_groups:
            if self.Time < group["exit_time"]:
                remaining_groups.append(group)
                continue

            for symbol in group["symbols"]:
                holding = self.Portfolio[symbol]
                if not holding.Invested:
                    continue
                # This template only opens long positions, so cap the sell size
                # at what is actually held.
                quantity = min(group["quantity"], holding.Quantity)
                if quantity > 0:
                    self.MarketOrder(symbol, -quantity)
            self.Debug(f"Closed straddle group from {group['entry_time']}")

        self.open_groups = remaining_groups

    def OnEndOfAlgorithm(self):
        """Log the final portfolio value."""
        self.Debug(f"Final portfolio value: {self.Portfolio.TotalPortfolioValue:.2f}")
|
rag_eval_interview_notes.md
ADDED
|
@@ -0,0 +1,544 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG 评测模块构建总结
|
| 2 |
+
|
| 3 |
+
本文档用于面试时说明:为什么需要 RAG 评测、如何设计 retrieval eval、如何接入公开数据集和自建 PDF 测试集,以及如何判断 RAG 优化是否真的有效。
|
| 4 |
+
|
| 5 |
+
## 背景问题
|
| 6 |
+
|
| 7 |
+
在优化 RAG 系统时,仅靠主观查看回答效果不稳定,也很难判断 PDF 解析、chunk 切分、embedding、reranker 或检索参数的改动是否真的带来提升。
|
| 8 |
+
|
| 9 |
+
因此我先搭建了一个独立的 RAG retrieval evaluation 模块,用固定测试集和固定指标来做 before/after 对比。
|
| 10 |
+
|
| 11 |
+
目标是:
|
| 12 |
+
|
| 13 |
+
- 能快速验证检索链路是否跑通。
|
| 14 |
+
- 能用公开 benchmark 做横向参考。
|
| 15 |
+
- 能用金融相关数据集贴近业务场景。
|
| 16 |
+
- 能用自己的期权 PDF 测试集验证 PDF 解析、公式抽取和章节切分是否有效。
|
| 17 |
+
- 每次改动后可以一条命令自动跑评测并生成报告。
|
| 18 |
+
|
| 19 |
+
## 数据集接入顺序
|
| 20 |
+
|
| 21 |
+
我按照由易到难、由通用到业务的顺序接入了 4 类测试集。
|
| 22 |
+
|
| 23 |
+
### 1. BEIR/scifact
|
| 24 |
+
|
| 25 |
+
`scifact` 是 BEIR 中比较小的科学事实检索数据集,适合快速跑通 retrieval eval。
|
| 26 |
+
|
| 27 |
+
接入它的目的不是追求业务贴合,而是验证:
|
| 28 |
+
|
| 29 |
+
- 数据下载和解析是否正常。
|
| 30 |
+
- corpus、query、qrels 能否正确对齐。
|
| 31 |
+
- 向量索引是否能构建。
|
| 32 |
+
- 检索指标是否能稳定输出。
|
| 33 |
+
|
| 34 |
+
### 2. BEIR/fiqa
|
| 35 |
+
|
| 36 |
+
`fiqa` 是金融问答相关数据集,比 `scifact` 更贴近金融场景。
|
| 37 |
+
|
| 38 |
+
接入它的目的:
|
| 39 |
+
|
| 40 |
+
- 验证金融语义检索能力。
|
| 41 |
+
- 检查 embedding 对金融术语、问答表达的适配情况。
|
| 42 |
+
- 作为后续期权 PDF 场景前的公开金融 benchmark。
|
| 43 |
+
|
| 44 |
+
### 3. Open RAGBench
|
| 45 |
+
|
| 46 |
+
Open RAGBench 更接近长文档、PDF、报告类 RAG 场景。
|
| 47 |
+
|
| 48 |
+
我选择了其中的 `pdf/arxiv` 子集,用来验证:
|
| 49 |
+
|
| 50 |
+
- 长文档解析后的检索效果。
|
| 51 |
+
- 多章节、多段落文档下的 chunk 检索表现。
|
| 52 |
+
- RAG 系统在 PDF-like 文档上的泛化能力。
|
| 53 |
+
|
| 54 |
+
### 4. 自建期权 PDF 测试集
|
| 55 |
+
|
| 56 |
+
最后补充自己的期权 PDF 测试集,因为公开 benchmark 无法完全覆盖当前项目中的业务难点。
|
| 57 |
+
|
| 58 |
+
自建测试集重点覆盖:
|
| 59 |
+
|
| 60 |
+
- 期权定价概念。
|
| 61 |
+
- PDF 中的公式内容。
|
| 62 |
+
- 章节标题和上下文定位。
|
| 63 |
+
- 公式编号、页码、章节等 metadata 是否能帮助检索。
|
| 64 |
+
|
| 65 |
+
## 模块设计
|
| 66 |
+
|
| 67 |
+
评测模块放在 `eval/` 目录下,核心文件包括:
|
| 68 |
+
|
| 69 |
+
- `eval/rag_eval.py`:单数据集 retrieval eval 入口。
|
| 70 |
+
- `eval/run_eval_suite.py`:批量评测多个数据集的 suite runner。
|
| 71 |
+
- `eval/local_options_eval.jsonl`:自建期权 PDF 测试集。
|
| 72 |
+
- `eval/README.md`:调用示例和使用说明。
|
| 73 |
+
|
| 74 |
+
整体流程如下:
|
| 75 |
+
|
| 76 |
+
```text
|
| 77 |
+
加载数据集
|
| 78 |
+
-> 构造 documents / queries / qrels
|
| 79 |
+
-> 构建 Chroma 向量索引
|
| 80 |
+
-> 执行 top-k retrieval
|
| 81 |
+
-> 按 doc_id 去重
|
| 82 |
+
-> 计算 Hit@K / MRR / NDCG@K
|
| 83 |
+
-> 生成 JSON 和 Markdown 报告
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## 为什么只先做 retrieval eval
|
| 87 |
+
|
| 88 |
+
RAG 的最终效果由两部分组成:
|
| 89 |
+
|
| 90 |
+
```text
|
| 91 |
+
RAG = Retrieval + Generation
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
如果检索阶段没有找到正确上下文,后面的 LLM 生成很容易幻觉。因此我先评估 retrieval:
|
| 95 |
+
|
| 96 |
+
- 问题对应的正确文档有没有被找回来。
|
| 97 |
+
- 正确文档排在第几名。
|
| 98 |
+
- top-k 结果排序是否合理。
|
| 99 |
+
|
| 100 |
+
这样可以先把问题定位在“检索是否正确”,再进一步评估生成答案。
|
| 101 |
+
|
| 102 |
+
## 指标设计
|
| 103 |
+
|
| 104 |
+
### Hit@K
|
| 105 |
+
|
| 106 |
+
`Hit@K` 表示前 K 个结果里是否包含正确文档。
|
| 107 |
+
|
| 108 |
+
例如 `Hit@5 = 1`,表示正确文档出现在前 5 个检索结果中。
|
| 109 |
+
|
| 110 |
+
它适合判断:
|
| 111 |
+
|
| 112 |
+
- 正确上下文有没有被召回。
|
| 113 |
+
- top-k 设大以后召回是否提升。
|
| 114 |
+
|
| 115 |
+
### MRR
|
| 116 |
+
|
| 117 |
+
`MRR` 是 Mean Reciprocal Rank,关注第一个正确结果出现的位置。
|
| 118 |
+
|
| 119 |
+
如果正确结果排第 1,得分是 `1`。
|
| 120 |
+
如果正确结果排第 2,得分是 `1/2`。
|
| 121 |
+
如果正确结果排第 5,得分是 `1/5`。
|
| 122 |
+
|
| 123 |
+
它适合判断:
|
| 124 |
+
|
| 125 |
+
- 正确文档是否排得足够靠前。
|
| 126 |
+
- 检索排序质量是否提升。
|
| 127 |
+
|
| 128 |
+
### NDCG@K
|
| 129 |
+
|
| 130 |
+
`NDCG@K` 衡量前 K 个结果的排序质量。
|
| 131 |
+
|
| 132 |
+
计算方式是:
|
| 133 |
+
|
| 134 |
+
```text
|
| 135 |
+
DCG@K = rel_1 / log2(2) + rel_2 / log2(3) + ... + rel_K / log2(K + 1)
|
| 136 |
+
NDCG@K = DCG@K / IDCG@K
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
其中 `rel_i = 1` 表示第 i 个结果相关,`rel_i = 0` 表示不相关。
|
| 140 |
+
|
| 141 |
+
NDCG 越接近 1,说明相关结果越靠前。
|
| 142 |
+
|
| 143 |
+
## 关键实现细节
|
| 144 |
+
|
| 145 |
+
### 1. 统一数据格式
|
| 146 |
+
|
| 147 |
+
不同数据集格式不同,因此我统一抽象成:
|
| 148 |
+
|
| 149 |
+
```python
|
| 150 |
+
documents = [
|
| 151 |
+
{
|
| 152 |
+
"doc_id": "...",
|
| 153 |
+
"title": "...",
|
| 154 |
+
"text": "...",
|
| 155 |
+
"metadata": {...}
|
| 156 |
+
}
|
| 157 |
+
]
|
| 158 |
+
|
| 159 |
+
queries = [
|
| 160 |
+
{
|
| 161 |
+
"query_id": "...",
|
| 162 |
+
"question": "...",
|
| 163 |
+
"relevant_doc_ids": [...]
|
| 164 |
+
}
|
| 165 |
+
]
|
| 166 |
+
|
| 167 |
+
qrels = {
|
| 168 |
+
"query_id": {"doc_id"}
|
| 169 |
+
}
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
这样后续索引构建和指标计算可以复用同一套逻辑。
|
| 173 |
+
|
| 174 |
+
### 2. 小样本评测必须包含 gold 文档
|
| 175 |
+
|
| 176 |
+
在做 smoke test 时,如果只取 corpus 前 N 篇文档,可能会出现 query 的正确文档不在测试 corpus 里,导致评测不公平。
|
| 177 |
+
|
| 178 |
+
所以我在加载 BEIR 和 Open RAGBench 时,会先读取 qrels,确定当前 query 需要哪些 gold documents,再优先把这些文档纳入 corpus。
|
| 179 |
+
|
| 180 |
+
这样小样本测试可以稳定评估检索能力,而不是被采样问题干扰。
|
| 181 |
+
|
| 182 |
+
### 3. 检索结果按 doc_id 去重
|
| 183 |
+
|
| 184 |
+
一个文档会被切成多个 chunk,检索时可能同一篇文档的多个 chunk 同时出现在 top-k 中。
|
| 185 |
+
|
| 186 |
+
如果不去重,会导致:
|
| 187 |
+
|
| 188 |
+
- 指标被重复 chunk 影响。
|
| 189 |
+
- NDCG 可能异常偏高。
|
| 190 |
+
- top-k 实际上不是 top-k documents,而是 top-k chunks。
|
| 191 |
+
|
| 192 |
+
因此评测时内部会多取一些 chunk,然后按 `doc_id` 去重,再计算 top-k 文档级指标。
|
| 193 |
+
|
| 194 |
+
### 4. 支持 rebuild
|
| 195 |
+
|
| 196 |
+
如果修改了:
|
| 197 |
+
|
| 198 |
+
- PDF 解析逻辑
|
| 199 |
+
- chunk 切分方式
|
| 200 |
+
- embedding 模型
|
| 201 |
+
- metadata 构造
|
| 202 |
+
- reranker 或检索参数
|
| 203 |
+
|
| 204 |
+
必须使用 `--rebuild` 重建索引,否则会复用旧索引,评测结果不能代表最新代码。
|
| 205 |
+
|
| 206 |
+
## 自动化评测脚本
|
| 207 |
+
|
| 208 |
+
单数据集评测:
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
uv --cache-dir .uv-cache run python -m eval.rag_eval \
|
| 212 |
+
--dataset local-options \
|
| 213 |
+
--max-queries 3 \
|
| 214 |
+
--top-k 5 \
|
| 215 |
+
--rebuild
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
批量评测:
|
| 219 |
+
|
| 220 |
+
```bash
|
| 221 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite --rebuild
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
只跑指定数据集:
|
| 225 |
+
|
| 226 |
+
```bash
|
| 227 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 228 |
+
--datasets local-options,beir/fiqa \
|
| 229 |
+
--top-k 5 \
|
| 230 |
+
--max-queries 20 \
|
| 231 |
+
--rebuild
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
对比不同 chunk 设置:
|
| 235 |
+
|
| 236 |
+
```bash
|
| 237 |
+
uv --cache-dir .uv-cache run python -m eval.run_eval_suite \
|
| 238 |
+
--datasets local-options \
|
| 239 |
+
--chunk-size 384 \
|
| 240 |
+
--chunk-overlap 64 \
|
| 241 |
+
--output-name local_options_chunk384 \
|
| 242 |
+
--rebuild
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
报告会输出到:
|
| 246 |
+
|
| 247 |
+
```text
|
| 248 |
+
eval/reports/
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
包括:
|
| 252 |
+
|
| 253 |
+
- 每个数据集的 JSON 报告。
|
| 254 |
+
- 每个数据集的 Markdown 报告。
|
| 255 |
+
- suite 级别的汇总报告。
|
| 256 |
+
|
| 257 |
+
## 遇到的问题和解决方案
|
| 258 |
+
|
| 259 |
+
### 问题 1:公开数据集需要联网下载
|
| 260 |
+
|
| 261 |
+
BEIR 和 Open RAGBench 都需要从公网下载数据。
|
| 262 |
+
|
| 263 |
+
解决方法:
|
| 264 |
+
|
| 265 |
+
- 第一次运行时下载并缓存到 `eval/data/`。
|
| 266 |
+
- 后续运行直接复用本地数据。
|
| 267 |
+
- 数据和索引分开存放,便于排查问题。
|
| 268 |
+
|
| 269 |
+
### 问题 2:Open RAGBench 实际目录结构和预期不一致
|
| 270 |
+
|
| 271 |
+
最开始预设路径是 `official/pdf/arxiv`,但实际下载后路径是 `pdf/arxiv`。
|
| 272 |
+
|
| 273 |
+
解决方法:
|
| 274 |
+
|
| 275 |
+
- loader 中兼容两种路径。
|
| 276 |
+
- 优先尝试 `pdf/arxiv`,不存在时再回退到 `official/pdf/arxiv`。
|
| 277 |
+
|
| 278 |
+
### 问题 3:小样本采样会漏掉 gold document
|
| 279 |
+
|
| 280 |
+
如果 `max_corpus_docs` 很小,直接截取 corpus 前 N 条可能不包含 qrels 中的正确文档。
|
| 281 |
+
|
| 282 |
+
解决方法:
|
| 283 |
+
|
| 284 |
+
- 先根据 qrels 选择 query。
|
| 285 |
+
- 再把对应 gold documents 强制纳入 corpus。
|
| 286 |
+
- 最后补充其他文档作为干扰项。
|
| 287 |
+
|
| 288 |
+
### 问题 4:chunk 重复导致指标异常
|
| 289 |
+
|
| 290 |
+
同一篇文档的多个 chunk 可能同时命中,导致 NDCG 等指标不合理。
|
| 291 |
+
|
| 292 |
+
解决方法:
|
| 293 |
+
|
| 294 |
+
- 检索时多取一些 chunk。
|
| 295 |
+
- 评估时按 `doc_id` 去重。
|
| 296 |
+
- 最终以 document-level top-k 计算指标。
|
| 297 |
+
|
| 298 |
+
### 问题 5:不重建索引可能复用旧结果
|
| 299 |
+
|
| 300 |
+
如果代码改了但没有 `--rebuild`,Chroma 可能复用旧索引。
|
| 301 |
+
|
| 302 |
+
解决方法:
|
| 303 |
+
|
| 304 |
+
- 文档中明确说明改动后必须加 `--rebuild`。
|
| 305 |
+
- suite runner 支持统一传入 `--rebuild`。
|
| 306 |
+
- 用 `--output-name` 固定报告名,方便 before/after 对比。
|
| 307 |
+
|
| 308 |
+
### 问题 6:RAG 只是独立模块,没有真正接入 Agent
|
| 309 |
+
|
| 310 |
+
最开始 RAG 已经能单独查询知识库,但主 `CodeAgent` 的 tools 里没有注册知识库工具。这样在真实对话里,agent 实际只能查行情和时间,不能主动调用本地期权知识库。
|
| 311 |
+
|
| 312 |
+
解决方法:
|
| 313 |
+
|
| 314 |
+
- 将 `QueryKnowledgeTool` 注册进主 agent。
|
| 315 |
+
- 优化 tool description,让模型知道它应该在期权概念、波动率、Greeks、策略、公式编号和书籍引用问题上调用该工具。
|
| 316 |
+
- 控制 tool 输出长度,只返回来源、页码、section、分数和截断后的片段,避免检索结果占满上下文。
|
| 317 |
+
|
| 318 |
+
面试可以强调:
|
| 319 |
+
|
| 320 |
+
> RAG 不是只要能单独跑 query 就算完成,必须作为 agent 的一个可调用工具接入主工作流。否则用户问期权概念时,agent 不一定会查知识库,仍然可能凭模型参数记忆回答。
|
| 321 |
+
|
| 322 |
+
### 问题 7:知识库目录和代码目录耦合
|
| 323 |
+
|
| 324 |
+
早期知识库放在 `tools/knowledge_base` 下,代码、原始资料和 Chroma 数据库混在一起。随着知识库变大,这种结构不利于维护,也不利于后续把工具代码、数据和缓存分开管理。
|
| 325 |
+
|
| 326 |
+
解决方法:
|
| 327 |
+
|
| 328 |
+
- 将知识库统一到项目根目录:
|
| 329 |
+
|
| 330 |
+
```text
|
| 331 |
+
OptionAgent/knowledge_base/
|
| 332 |
+
raw/
|
| 333 |
+
chroma_db/
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
- 工具代码中使用 `PROJECT_ROOT / "knowledge_base"` 作为主路径。
|
| 337 |
+
- 保留旧路径 fallback,避免迁移时旧数据立刻失效。
|
| 338 |
+
|
| 339 |
+
面试可以强调:
|
| 340 |
+
|
| 341 |
+
> 我把知识库从工具目录迁到项目根目录,并保留 legacy fallback。这样既完成了结构治理,也避免了迁移时破坏已有索引和原始文档。
|
| 342 |
+
|
| 343 |
+
### 问题 8:全量 rebuild 成本高
|
| 344 |
+
|
| 345 |
+
只要文档、解析方法或 embedding 模型变化,就全量重建索引。书籍变多后,这会浪费大量时间,而且不方便频繁更新笔记。
|
| 346 |
+
|
| 347 |
+
解决方法:
|
| 348 |
+
|
| 349 |
+
- 每个 chunk metadata 中保留:
|
| 350 |
+
|
| 351 |
+
```text
|
| 352 |
+
source_file
|
| 353 |
+
file_hash
|
| 354 |
+
embedding_model
|
| 355 |
+
extraction_method
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
- 启动时扫描当前 raw 文件,和 Chroma 中已有 metadata 对比:
|
| 359 |
+
|
| 360 |
+
```text
|
| 361 |
+
新增文件 -> 只入库新增文件
|
| 362 |
+
修改文件 -> 删除该文件旧 chunks,再重新入库
|
| 363 |
+
删除文件 -> 删除该文件对应 chunks
|
| 364 |
+
embedding/extraction 版本变化 -> 触发对应文件更新
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
面试可以强调:
|
| 368 |
+
|
| 369 |
+
> 我没有只依赖 collection 是否为空,而是基于 source_file、file_hash、embedding_model 和 extraction_method 做增量更新。这样文档更新后索引不会脏,也不用每次全量 rebuild。
|
| 370 |
+
|
| 371 |
+
### 问题 9:纯向量检索对公式编号和专有名词不稳定
|
| 372 |
+
|
| 373 |
+
期权书里有很多精确查询,例如:
|
| 374 |
+
|
| 375 |
+
```text
|
| 376 |
+
Equation 21.23
|
| 377 |
+
WITH ZERO CORRELATION
|
| 378 |
+
Black-Scholes-Merton
|
| 379 |
+
vega
|
| 380 |
+
gamma
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
这类问题不只是语义相似,还需要字面命中。纯 dense embedding 对概念解释很强,但对公式编号、章节标题、专有名词有时不如关键词检索稳定。
|
| 384 |
+
|
| 385 |
+
解决方法:
|
| 386 |
+
|
| 387 |
+
- 增加轻量 BM25 检索。
|
| 388 |
+
- 查询时同时跑:
|
| 389 |
+
|
| 390 |
+
```text
|
| 391 |
+
dense vector retrieval
|
| 392 |
+
BM25 keyword retrieval
|
| 393 |
+
```
|
| 394 |
+
|
| 395 |
+
- 使用 reciprocal-rank merge 合并结果。
|
| 396 |
+
- 再交给 cross-encoder reranker 做最终排序。
|
| 397 |
+
|
| 398 |
+
最终链路:
|
| 399 |
+
|
| 400 |
+
```text
|
| 401 |
+
query
|
| 402 |
+
-> dense top-k
|
| 403 |
+
-> BM25 top-k
|
| 404 |
+
-> merge / deduplicate
|
| 405 |
+
-> reranker
|
| 406 |
+
-> top results with citations
|
| 407 |
+
```
|
| 408 |
+
|
| 409 |
+
面试可以强调:
|
| 410 |
+
|
| 411 |
+
> 我做 hybrid search 是因为金融和期权文档里存在大量公式编号、章节名、ticker-like token 和专有名词。Dense retrieval 负责语义召回,BM25 负责精确词命中,reranker 负责最终排序。
|
| 412 |
+
|
| 413 |
+
### 问题 10:本地评测集太小
|
| 414 |
+
|
| 415 |
+
最初 `local-options` 只有 3 条 case,容易出现指标过高但不可泛化的问题。比如小样本里 Hit@5 为 1,并不代表系统在真实问题上稳定。
|
| 416 |
+
|
| 417 |
+
解决方法:
|
| 418 |
+
|
| 419 |
+
- 新增 `eval/generate_local_options_eval.py`。
|
| 420 |
+
- 从已解析的 PDF/MD 文档中随机抽样 chunk。
|
| 421 |
+
- 优先覆盖:
|
| 422 |
+
- 公式问题。
|
| 423 |
+
- 章节定位问题。
|
| 424 |
+
- 期权关键词问题。
|
| 425 |
+
- 波动率、Greeks、风险中性、策略等业务术语。
|
| 426 |
+
- 过滤前言、索引页、表格/图注噪声,避免生成低质量 query。
|
| 427 |
+
- 将本地 eval 扩充到 40 条。
|
| 428 |
+
|
| 429 |
+
面试可以强调:
|
| 430 |
+
|
| 431 |
+
> 我没有只手写少量 happy path case,而是做了一个本地 eval case generator,从真实 chunk 中抽样生成问题,并对噪声标题做过滤。这样可以更稳定地评估 PDF 解析和检索策略的变化。
|
| 432 |
+
|
| 433 |
+
## Hybrid Search 和 Reranker 对比实验
|
| 434 |
+
|
| 435 |
+
扩充到 40 条 local-options case 后,我做了三组对比:
|
| 436 |
+
|
| 437 |
+
```text
|
| 438 |
+
dense-only:
|
| 439 |
+
MRR 0.4708
|
| 440 |
+
NDCG@5 0.3468
|
| 441 |
+
Hit@1 0.4250
|
| 442 |
+
Hit@3 0.5250
|
| 443 |
+
Hit@5 0.5250
|
| 444 |
+
|
| 445 |
+
hybrid:
|
| 446 |
+
MRR 0.4833
|
| 447 |
+
NDCG@5 0.3190
|
| 448 |
+
Hit@1 0.4250
|
| 449 |
+
Hit@3 0.5250
|
| 450 |
+
Hit@5 0.5750
|
| 451 |
+
|
| 452 |
+
hybrid + reranker:
|
| 453 |
+
MRR 0.7125
|
| 454 |
+
NDCG@5 0.4717
|
| 455 |
+
Hit@1 0.7000
|
| 456 |
+
Hit@3 0.7250
|
| 457 |
+
Hit@5 0.7250
|
| 458 |
+
```
|
| 459 |
+
|
| 460 |
+
结果解释:
|
| 461 |
+
|
| 462 |
+
- Hybrid search 单独提升了 Hit@5,说明 BM25 补充了召回,尤其对精确术语和公式编号有帮助。
|
| 463 |
+
- Hybrid 的 NDCG 略降,说明召回增加后排序还不够好。
|
| 464 |
+
- 加上 reranker 后,MRR、NDCG、Hit@1、Hit@5 都明显提升,说明 reranker 有效改善了排序质量。
|
| 465 |
+
|
| 466 |
+
面试可以这样总结:
|
| 467 |
+
|
| 468 |
+
> 单独加 BM25 后,召回有提升但排序不一定更好;这符合预期,因为 BM25 会把更多字面相关结果拉进候选集。最终效果最好的是 dense + BM25 扩召回,再用 cross-encoder reranker 排序。这个实验也说明我不是凭感觉加组件,而是用 Hit@K、MRR 和 NDCG 验证每一步是否真的有效。
|
| 469 |
+
|
| 470 |
+
## 当前评测结果示例
|
| 471 |
+
|
| 472 |
+
早期小规模 smoke test 的结果示例:
|
| 473 |
+
|
| 474 |
+
```text
|
| 475 |
+
BEIR/scifact:
|
| 476 |
+
MRR = 0.9000
|
| 477 |
+
NDCG@5 = 0.9262
|
| 478 |
+
Hit@1 = 0.8000
|
| 479 |
+
Hit@5 = 1.0000
|
| 480 |
+
|
| 481 |
+
BEIR/fiqa:
|
| 482 |
+
MRR = 0.8000
|
| 483 |
+
NDCG@5 = 0.6582
|
| 484 |
+
Hit@1 = 0.8000
|
| 485 |
+
Hit@5 = 0.8000
|
| 486 |
+
|
| 487 |
+
local-options:
|
| 488 |
+
MRR = 1.0000
|
| 489 |
+
NDCG@5 = 0.7162
|
| 490 |
+
Hit@1 = 1.0000
|
| 491 |
+
Hit@5 = 1.0000
|
| 492 |
+
```
|
| 493 |
+
|
| 494 |
+
这些结果主要用于验证评测流程和小样本趋势,不能直接代表完整 benchmark 成绩。正式对比时需要扩大 `max_queries` 和 `max_corpus_docs`。
|
| 495 |
+
|
| 496 |
+
## 面试回答话术
|
| 497 |
+
|
| 498 |
+
可以这样回答:
|
| 499 |
+
|
| 500 |
+
> 我在优化 RAG 系统时发现,单纯看回答效果很难判断改动是否真的有效,所以先搭了一个 retrieval evaluation 模块。我的思路是先用 BEIR/scifact 快速跑通标准检索评测,再接 BEIR/fiqa 贴近金融场景,然后接 Open RAGBench 验证长文档和 PDF-like 场景,最后补自己的期权 PDF 测试集,用来覆盖项目里公式、章节和金融术语这些业务难点。
|
| 501 |
+
|
| 502 |
+
如果面试官问为什么先评估 retrieval:
|
| 503 |
+
|
| 504 |
+
> 因为 RAG 的生成质量高度依赖检索质量。如果检索阶段没有召回正确上下文,后面 LLM 很容易幻觉。所以我先用 Hit@K、MRR、NDCG@K 衡量正确文档是否被召回以及排序是否靠前,把 retrieval 问题和 generation 问题分开定位。
|
| 505 |
+
|
| 506 |
+
如果面试官问如何保证评测可靠:
|
| 507 |
+
|
| 508 |
+
> 我做了几个处理。第一,所有数据集统一成 documents、queries、qrels 三类结构。第二,小样本 smoke test 会优先把 qrels 需要的 gold document 放进 corpus,避免因为采样漏掉正确文档导致评测不公平。第三,检索结果按 doc_id 去重,避免同一篇文档多个 chunk 重复命中导致指标虚高。第四,修改解析、chunk、embedding 或检索逻辑后必须 rebuild 索引,保证评测对应的是最新系统。
|
| 509 |
+
|
| 510 |
+
如果面试官问这个模块怎么用:
|
| 511 |
+
|
| 512 |
+
> 我提供了单数据集入口和 suite 入口。单数据集可以用 `python -m eval.rag_eval --dataset local-options --rebuild`,批量评测可以用 `python -m eval.run_eval_suite --rebuild`。它会自动跑多个数据集,输出 JSON 和 Markdown 报告,便于做 before/after 对比。
|
| 513 |
+
|
| 514 |
+
如果面试官问为什么要做 hybrid search:
|
| 515 |
+
|
| 516 |
+
> 因为期权和金融文档里有两类查询。一类是语义型,比如“为什么临近到期 gamma 风险变大”,dense embedding 很适合;另一类是精确匹配型,比如 `Equation 21.23`、`WITH ZERO CORRELATION`、`Black-Scholes-Merton`,这些 BM25 更稳定。所以我用 dense retrieval 负责语义召回,BM25 负责关键词召回,然后合并候选,再用 cross-encoder reranker 排序。
|
| 517 |
+
|
| 518 |
+
如果面试官问 hybrid 是否真的提升了:
|
| 519 |
+
|
| 520 |
+
> 我用扩充后的 40 条 local-options eval 做了对比。Dense-only 的 Hit@5 是 0.525,MRR 是 0.471;加入 hybrid 后 Hit@5 提升到 0.575,说明召回变好,但 NDCG 有一点下降,说明排序还不够好;再加 reranker 后 Hit@5 到 0.725,MRR 到 0.713,Hit@1 到 0.700,说明 dense + BM25 + reranker 的组合最稳。
|
| 521 |
+
|
| 522 |
+
如果面试官问为什么不能每次全量 rebuild:
|
| 523 |
+
|
| 524 |
+
> 全量 rebuild 在文档少的时候可以,但参考书和笔记变多后成本会越来越高。我在 metadata 里记录 source_file、file_hash、embedding_model 和 extraction_method,启动时对比当前文件状态和 Chroma 中已有 metadata。新增文件只入库新增部分,修改文件只删除并重建该文件对应 chunks,删除文件同步清理旧 chunks。这样既保证索引新鲜,也避免无意义的全量重建。
|
| 525 |
+
|
| 526 |
+
如果面试官问 RAG 和 agent 怎么结合:
|
| 527 |
+
|
| 528 |
+
> 我把 RAG 封装成 `QueryKnowledgeTool` 注册到主 `CodeAgent`,而不是只做一个独立脚本。tool description 明确告诉模型在期权概念、波动率、Greeks、策略和公式编号问题上调用它。返回结果包含 source、page、section、content_type、score 和 excerpt,方便 agent 带引用地回答,而不是凭模型记忆回答。
|
| 529 |
+
|
| 530 |
+
如果面试官问如何避免本地 eval 过拟合:
|
| 531 |
+
|
| 532 |
+
> 早期我只有几条手写 case,很容易高估效果。后来我写了 local eval generator,从真实 PDF chunks 中抽样生成问题,同时过滤前言、索引、表格和图注噪声。这样测试集覆盖公式、章节、概念和金融术语,能更真实地暴露 retrieval 的召回和排序问题。
|
| 533 |
+
|
| 534 |
+
## 后续可扩展方向
|
| 535 |
+
|
| 536 |
+
后续还可以继续扩展:
|
| 537 |
+
|
| 538 |
+
- 增加 reranker 前后的对比实验。
|
| 539 |
+
- 增加 answer-level evaluation,评估最终回答是否正确。
|
| 540 |
+
- 增加 citation accuracy,判断引用来源是否准确。
|
| 541 |
+
- 增加公式检索专门测试集。
|
| 542 |
+
- 增加表格类 query 测试集。
|
| 543 |
+
- 对不同 chunk 策略、embedding 模型、top-k 参数做批量实验。
|
| 544 |
+
- 将报告接入 CI 或定期任务,防止 RAG 效果回退。
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ smolagents==1.13.0
|
|
| 3 |
requests
|
| 4 |
duckduckgo_search
|
| 5 |
pandas
|
|
|
|
| 6 |
pypdf
|
| 7 |
PyMuPDF
|
| 8 |
chromadb
|
|
|
|
| 3 |
requests
|
| 4 |
duckduckgo_search
|
| 5 |
pandas
|
| 6 |
+
yfinance
|
| 7 |
pypdf
|
| 8 |
PyMuPDF
|
| 9 |
chromadb
|
strategy/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .builder import generate_volatility_strategies
|
| 2 |
+
from .payoff import expiration_payoff, strategy_summary
|
| 3 |
+
from .schemas import OptionLeg, OptionStrategy
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"expiration_payoff",
|
| 7 |
+
"generate_volatility_strategies",
|
| 8 |
+
"OptionLeg",
|
| 9 |
+
"OptionStrategy",
|
| 10 |
+
"strategy_summary",
|
| 11 |
+
]
|
strategy/builder.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from market_data.schemas import OptionChain, OptionContract
|
| 4 |
+
|
| 5 |
+
from .payoff import estimate_breakevens
|
| 6 |
+
from .schemas import OptionLeg, OptionStrategy
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def usable_contracts(contracts: list[OptionContract]) -> list[OptionContract]:
    """Return the contracts that are priced and not flagged as untradable.

    A contract is kept when its ``mid`` is a positive number and its
    ``liquidity_warnings`` contain neither ``missing_or_zero_bid_ask`` nor
    ``zero_open_interest``. Input order is preserved.
    """
    blocking_warnings = {"missing_or_zero_bid_ask", "zero_open_interest"}
    tradable = []
    for candidate in contracts:
        # No usable quote: mid is missing or not strictly positive.
        if candidate.mid is None or not candidate.mid > 0:
            continue
        if blocking_warnings.intersection(candidate.liquidity_warnings):
            continue
        tradable.append(candidate)
    return tradable
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def nearest_contract(contracts: list[OptionContract], target_strike: float) -> OptionContract | None:
    """Pick the usable contract whose strike is closest to ``target_strike``.

    Contracts are first filtered through ``usable_contracts``. Returns ``None``
    when nothing survives the filter; on equal distance the earlier contract
    in the input wins.
    """
    candidates = usable_contracts(contracts)

    def strike_distance(candidate: OptionContract) -> float:
        return abs(candidate.strike - target_strike)

    return min(candidates, key=strike_distance) if candidates else None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def contract_to_leg(contract: OptionContract, action: str, quantity: int = 1) -> OptionLeg:
    """Build an ``OptionLeg`` for ``action`` from a quoted contract.

    The leg copies the contract's type, strike, expiration, implied volatility
    and liquidity warnings.
    """
    # Premium preference: mid quote, then last trade price, then 0.0.
    # Falsy values (None or 0) fall through to the next choice.
    premium = contract.mid or contract.last_price or 0.0
    return OptionLeg(
        action=action,
        option_type=contract.option_type,
        strike=contract.strike,
        expiration=contract.expiration,
        quantity=quantity,
        premium=premium,
        implied_volatility=contract.implied_volatility,
        liquidity_warnings=contract.liquidity_warnings,
    )
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def make_strategy(
    name: str,
    volatility_view: str,
    directional_view: str,
    legs: list[OptionLeg],
    rationale: str,
    risks: list[str],
    score: float,
) -> OptionStrategy:
    """Assemble an ``OptionStrategy`` and derive its headline risk figures.

    ``net_debit_or_credit`` is the negated sum of the legs' cash flows; the
    code below treats a positive value as a debit paid and a negative value
    as a credit received. ``max_profit`` / ``max_loss`` are filled in only
    for the strategy names handled here and stay ``None`` otherwise.
    Breakevens come from ``estimate_breakevens``.
    """
    net_debit_or_credit = -sum(leg.cash_flow() for leg in legs)
    breakevens = estimate_breakevens(legs)

    debit_paid = max(net_debit_or_credit, 0.0)
    credit_received = abs(min(net_debit_or_credit, 0.0))

    max_profit: float | str | None = None
    max_loss: float | str | None = None
    if name in ("long_straddle", "long_strangle"):
        # Long premium: the most that can be lost is what was paid.
        max_profit, max_loss = "unlimited", round(debit_paid, 2)
    elif name == "short_straddle":
        max_profit, max_loss = round(credit_received, 2), "unlimited"
    elif name == "iron_condor":
        call_strikes = sorted(leg.strike for leg in legs if leg.option_type == "call")
        put_strikes = sorted(leg.strike for leg in legs if leg.option_type == "put")
        call_width = call_strikes[-1] - call_strikes[0]
        put_width = put_strikes[-1] - put_strikes[0]
        # Worst case is the wider of the two spreads (x100), less the credit.
        width = max(call_width, put_width)
        max_profit = round(credit_received, 2)
        max_loss = round(width * 100 - credit_received, 2)
    elif name == "calendar_spread":
        max_profit, max_loss = "path_dependent", round(debit_paid, 2)

    return OptionStrategy(
        name=name,
        volatility_view=volatility_view,
        directional_view=directional_view,
        legs=legs,
        rationale=rationale,
        risks=risks,
        max_profit=max_profit,
        max_loss=max_loss,
        breakevens=breakevens,
        net_debit_or_credit=round(net_debit_or_credit, 2),
        score=score,
    )
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def generate_volatility_strategies(
    near_chain: OptionChain,
    volatility_view: str = "neutral",
    directional_view: str = "neutral",
    far_chain: OptionChain | None = None,
) -> list[OptionStrategy]:
    """Build candidate volatility strategies from one or two option chains.

    Strikes are chosen relative to ``near_chain.underlying_price``: the
    at-the-money legs target spot, the out-of-the-money call / put target
    105% / 95% of spot, and the iron-condor wings target 3% beyond the short
    strikes. Every contract is picked with ``nearest_contract``.

    Args:
        near_chain: Chain used for all single-expiration strategies and for
            the short leg of the calendar spread.
        volatility_view: Selects which strategies are generated. Long-vol
            structures are produced for ``long_vol``, ``neutral`` and
            ``vol_expansion``; short-vol structures for ``short_vol``,
            ``neutral`` and ``vol_compression``; the calendar spread for
            ``long_vol``, ``neutral`` and ``term_structure``.
        directional_view: Accepted for interface compatibility; it does not
            currently influence which strategies are produced.
        far_chain: Optional second chain; when provided a calendar spread
            may be added.

    Returns:
        Strategies sorted by descending ``score``. The list is empty when
        the near chain has no underlying price.
    """
    if near_chain.underlying_price is None:
        return []

    long_vol_views = {"long_vol", "neutral", "vol_expansion"}
    short_vol_views = {"short_vol", "neutral", "vol_compression"}
    calendar_views = {"long_vol", "neutral", "term_structure"}

    spot = near_chain.underlying_price
    atm_call = nearest_contract(near_chain.calls, spot)
    atm_put = nearest_contract(near_chain.puts, spot)
    otm_call = nearest_contract(near_chain.calls, spot * 1.05)
    otm_put = nearest_contract(near_chain.puts, spot * 0.95)
    strategies: list[OptionStrategy] = []

    if atm_call and atm_put:
        if volatility_view in long_vol_views:
            strategies.append(
                make_strategy(
                    name="long_straddle",
                    volatility_view="long_vol",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "buy"), contract_to_leg(atm_put, "buy")],
                    rationale="Benefits from a large realized move or IV expansion; risk is premium paid.",
                    risks=["theta_decay", "iv_crush", "requires_large_move"],
                    score=0.75,
                )
            )
        if volatility_view in short_vol_views:
            strategies.append(
                make_strategy(
                    name="short_straddle",
                    volatility_view="short_vol",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "sell"), contract_to_leg(atm_put, "sell")],
                    rationale="Benefits from realized volatility staying below implied volatility.",
                    risks=["unlimited_tail_risk", "gap_risk", "margin_requirement"],
                    score=0.45,
                )
            )

    if otm_call and otm_put and volatility_view in long_vol_views:
        strategies.append(
            make_strategy(
                name="long_strangle",
                volatility_view="long_vol",
                directional_view="neutral",
                legs=[contract_to_leg(otm_call, "buy"), contract_to_leg(otm_put, "buy")],
                rationale="Lower-cost long volatility expression than a straddle, but needs a larger move.",
                risks=["theta_decay", "wide_breakevens", "iv_crush"],
                score=0.65,
            )
        )

    if far_chain and atm_call and volatility_view in calendar_views:
        far_call = nearest_contract(far_chain.calls, atm_call.strike)
        if far_call:
            strategies.append(
                make_strategy(
                    name="calendar_spread",
                    volatility_view="term_structure",
                    directional_view="neutral",
                    legs=[contract_to_leg(atm_call, "sell"), contract_to_leg(far_call, "buy")],
                    rationale="Expresses a term-structure view and benefits if longer-dated IV holds up.",
                    risks=["path_dependency", "front_expiry_gamma", "term_structure_shift"],
                    score=0.60,
                )
            )

    if otm_call and otm_put and volatility_view in short_vol_views:
        long_call = nearest_contract(near_chain.calls, otm_call.strike * 1.03)
        long_put = nearest_contract(near_chain.puts, otm_put.strike * 0.97)
        # nearest_contract returns the short strike itself when the chain has
        # no usable strike further out. Selling and buying the same contract
        # would give a zero-width condor with no defined risk, so both wings
        # must sit strictly outside the short strikes.
        if (
            long_call
            and long_put
            and long_call.strike > otm_call.strike
            and long_put.strike < otm_put.strike
        ):
            strategies.append(
                make_strategy(
                    name="iron_condor",
                    volatility_view="short_vol",
                    directional_view="neutral",
                    legs=[
                        contract_to_leg(otm_put, "sell"),
                        contract_to_leg(long_put, "buy"),
                        contract_to_leg(otm_call, "sell"),
                        contract_to_leg(long_call, "buy"),
                    ],
                    rationale="Defined-risk short volatility strategy for range-bound markets.",
                    risks=["short_gamma", "tail_loss_to_width", "assignment_risk"],
                    score=0.70,
                )
            )

    return sorted(strategies, key=lambda strategy: strategy.score, reverse=True)
|
strategy/payoff.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from .schemas import OptionLeg, OptionStrategy
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def leg_expiration_payoff(leg: OptionLeg, underlying_price: float) -> float:
    """Return one leg's profit or loss at expiration for ``underlying_price``.

    The result is the leg's intrinsic value at that price, times its signed
    quantity, times 100, plus the leg's opening cash flow. Any
    ``option_type`` other than ``"call"`` is priced as a put.
    """
    if leg.option_type == "call":
        moneyness = underlying_price - leg.strike
    else:
        moneyness = leg.strike - underlying_price
    intrinsic = max(moneyness, 0.0)
    exercise_value = intrinsic * leg.signed_quantity() * 100
    return exercise_value + leg.cash_flow()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def expiration_payoff(legs: list[OptionLeg], underlying_price: float) -> float:
    """Return the combined expiration profit or loss of all ``legs``.

    Each leg is valued with ``leg_expiration_payoff`` at the same
    ``underlying_price`` and the results are added in leg order.
    """
    total = 0
    for leg in legs:
        total += leg_expiration_payoff(leg, underlying_price)
    return total
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def estimate_breakevens(legs: list[OptionLeg]) -> list[float]:
    """Estimate the underlying prices at which the position's payoff is zero.

    The expiration payoff is sampled on 401 evenly spaced prices between
    half of the lowest strike (floored at 0.01) and 1.5 times the highest
    strike. A breakeven is reported wherever a sample is exactly zero, and
    wherever two neighbouring samples have opposite signs (located by linear
    interpolation between them).

    Args:
        legs: Legs of the position; may be empty.

    Returns:
        Breakeven prices in ascending order, rounded to two decimals. An
        empty list is returned for an empty position.
    """
    # Without legs there are no strikes to span a price grid.
    if not legs:
        return []

    strikes = [leg.strike for leg in legs]
    low = max(min(strikes) * 0.5, 0.01)
    high = max(strikes) * 1.5
    steps = 400
    points = [low + (high - low) * index / steps for index in range(steps + 1)]
    payoffs = [expiration_payoff(legs, point) for point in points]

    breakevens: list[float] = []
    for left, right, previous, current in zip(points, points[1:], payoffs, payoffs[1:]):
        if previous == 0:
            breakevens.append(left)
        elif previous * current < 0:
            # Sign change inside the interval: interpolate the zero crossing.
            ratio = abs(previous) / (abs(previous) + abs(current))
            breakevens.append(left + (right - left) * ratio)

    # The loop only inspects the left end of each interval, so the final
    # grid point has to be checked separately.
    if payoffs[-1] == 0:
        breakevens.append(points[-1])

    return [round(value, 2) for value in breakevens]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def strategy_summary(strategy: OptionStrategy) -> dict:
    """Summarise a strategy's expiration payoff over a coarse price grid.

    The payoff is evaluated on 81 evenly spaced prices from 60% of the
    lowest strike (floored at 0.01) to 140% of the highest strike.

    Returns:
        A dict with the smallest and largest payoff on the grid, the payoff
        at the arithmetic mean of the leg strikes (key
        ``payoff_at_middle_strike``), and every tenth grid point as
        ``sample_points``. All figures are rounded to two decimals.
    """
    legs = strategy.legs
    strikes = [leg.strike for leg in legs]
    low = max(min(strikes) * 0.6, 0.01)
    high = max(strikes) * 1.4

    grid = [low + (high - low) * index / 80 for index in range(81)]
    payoffs = [expiration_payoff(legs, price) for price in grid]

    mean_strike = sum(strikes) / len(strikes)
    sample_points = [
        {"underlying_price": round(grid[index], 2), "pnl": round(payoffs[index], 2)}
        for index in range(0, len(grid), 10)
    ]
    return {
        "min_grid_payoff": round(min(payoffs), 2),
        "max_grid_payoff": round(max(payoffs), 2),
        "payoff_at_middle_strike": round(expiration_payoff(legs, mean_strike), 2),
        "sample_points": sample_points,
    }
|
strategy/schemas.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import asdict, dataclass
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class OptionLeg:
    """One option position inside a multi-leg strategy."""

    # "buy" marks a long leg; any other value is treated as a short leg.
    action: str
    option_type: str
    strike: float
    expiration: str
    quantity: int
    # Per-share premium; cash_flow() scales it by 100.
    premium: float
    implied_volatility: float | None = None
    delta: float | None = None
    liquidity_warnings: list[str] | None = None

    def signed_quantity(self) -> int:
        """Return ``quantity`` for a bought leg and ``-quantity`` otherwise."""
        if self.action == "buy":
            return self.quantity
        return -self.quantity

    def cash_flow(self) -> float:
        """Return the opening cash flow: negative when bought, positive when sold."""
        signed_contracts = self.signed_quantity()
        return -self.premium * signed_contracts * 100

    def to_dict(self) -> dict[str, Any]:
        """Return the leg's fields as a plain dictionary."""
        return asdict(self)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class OptionStrategy:
    """A named multi-leg option strategy together with its risk summary."""

    name: str
    volatility_view: str
    directional_view: str
    legs: list[OptionLeg]
    rationale: str
    risks: list[str]
    # A number, a descriptive label such as "unlimited", or None when unknown.
    max_profit: float | str | None
    max_loss: float | str | None
    breakevens: list[float]
    net_debit_or_credit: float
    score: float

    def to_dict(self) -> dict[str, Any]:
        """Return the strategy as a plain dictionary, legs via ``leg.to_dict()``."""
        # Re-assigning "legs" keeps the key in its original position.
        return {
            **asdict(self),
            "legs": [leg.to_dict() for leg in self.legs],
        }
|
strategy/tools.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from smolagents import tool
|
| 6 |
+
|
| 7 |
+
from market_data.providers import get_option_chain, list_option_expirations
|
| 8 |
+
|
| 9 |
+
from .builder import generate_volatility_strategies
|
| 10 |
+
from .payoff import strategy_summary
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@tool
def build_volatility_strategy(
    symbol: str,
    volatility_view: str = "neutral",
    directional_view: str = "neutral",
    near_expiration: str = "",
    far_expiration: str = "",
) -> str:
    """Build candidate volatility option strategies from the current option chain.

    Args:
        symbol: Yahoo Finance ticker.
        volatility_view: long_vol, short_vol, vol_expansion, vol_compression, term_structure, or neutral.
        directional_view: bullish, bearish, neutral, or range_bound.
        near_expiration: Near option expiration in YYYY-MM-DD. Empty uses nearest expiration.
        far_expiration: Far option expiration for calendar spreads. Empty uses a later available expiration.
    """
    # Every failure is reported as a JSON payload with status "error" rather
    # than raised, so the calling agent always receives a string.
    try:
        symbol = symbol.strip().upper()
        expirations = list_option_expirations(symbol)
        if not expirations:
            raise ValueError(f"No option expirations found for {symbol}.")
        near = near_expiration or expirations[0]
        # Default the far leg to the first listed expiration strictly after
        # the near one. Blindly taking expirations[1] would pick a date at or
        # before a caller-supplied near_expiration. YYYY-MM-DD strings order
        # chronologically when compared as text.
        far = far_expiration or next(
            (candidate for candidate in expirations if str(candidate) > str(near)),
            "",
        )
        near_chain = get_option_chain(symbol, near)
        far_chain = get_option_chain(symbol, far) if far else None
        strategies = generate_volatility_strategies(
            near_chain=near_chain,
            volatility_view=volatility_view,
            directional_view=directional_view,
            far_chain=far_chain,
        )
        return json.dumps(
            {
                "status": "success",
                "symbol": symbol,
                "near_expiration": near,
                "far_expiration": far or None,
                "strategies": [
                    {
                        **strategy.to_dict(),
                        "payoff_summary": strategy_summary(strategy),
                    }
                    for strategy in strategies
                ],
                "risk_note": (
                    "This is research output, not a trade recommendation. "
                    "Validate quotes, liquidity, margin, assignment risk, and event risk before trading."
                ),
            },
            ensure_ascii=False,
            indent=2,
            default=str,
        )
    except Exception as exc:
        return json.dumps(
            {"status": "error", "symbol": symbol, "message": str(exc)},
            ensure_ascii=False,
            indent=2,
        )
|
tools/query_knowledge.py
CHANGED
|
@@ -3,6 +3,7 @@ import asyncio
|
|
| 3 |
from collections import Counter
|
| 4 |
import hashlib
|
| 5 |
import logging
|
|
|
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Iterable, List, Optional
|
|
@@ -13,19 +14,29 @@ from chromadb.errors import NotFoundError
|
|
| 13 |
from pypdf import PdfReader
|
| 14 |
|
| 15 |
from llama_index.core import StorageContext, VectorStoreIndex
|
| 16 |
-
from llama_index.core.schema import Document, BaseNode
|
| 17 |
from llama_index.core.node_parser import SentenceSplitter
|
| 18 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
| 19 |
|
| 20 |
|
|
|
|
| 21 |
BASE_DIR = Path(__file__).resolve().parent
|
|
|
|
| 22 |
KNOWLEDGE_BASE_DIR = BASE_DIR / "knowledge_base"
|
|
|
|
|
|
|
| 23 |
RAW_DIR = KNOWLEDGE_BASE_DIR / "raw"
|
| 24 |
CHROMA_DB_DIR = KNOWLEDGE_BASE_DIR / "chroma_db"
|
| 25 |
-
HF_CACHE_DIR =
|
| 26 |
COLLECTION_NAME = "options_knowledge"
|
| 27 |
|
| 28 |
-
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
CHUNK_SIZE = 1000
|
| 30 |
CHUNK_OVERLAP = 150
|
| 31 |
PDF_REPEATED_LINE_MIN_PAGES = 3
|
|
@@ -68,31 +79,177 @@ def configure_model_cache() -> None:
|
|
| 68 |
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(
|
| 69 |
HF_CACHE_DIR / "sentence_transformers"))
|
| 70 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 71 |
-
|
| 72 |
-
HF_CACHE_DIR
|
| 73 |
-
/ "sentence_transformers"
|
| 74 |
-
/ f"models--{EMBED_MODEL_NAME.replace('/', '--')}"
|
| 75 |
-
)
|
| 76 |
-
if cached_model_dir.exists():
|
| 77 |
os.environ.setdefault("HF_HUB_OFFLINE", "1")
|
| 78 |
os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
|
| 79 |
|
| 80 |
|
| 81 |
-
def
|
| 82 |
cached_model_dir = (
|
| 83 |
HF_CACHE_DIR
|
| 84 |
/ "sentence_transformers"
|
| 85 |
-
/ f"models--{
|
| 86 |
)
|
| 87 |
snapshots_dir = cached_model_dir / "snapshots"
|
| 88 |
if snapshots_dir.exists():
|
| 89 |
snapshots = sorted(path for path in snapshots_dir.iterdir() if path.is_dir())
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
return EMBED_MODEL_NAME
|
| 94 |
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def file_sha256(path: Path) -> str:
|
| 97 |
digest = hashlib.sha256()
|
| 98 |
with path.open("rb") as file:
|
|
@@ -116,6 +273,49 @@ def load_md_file(path: Path) -> Document:
|
|
| 116 |
)
|
| 117 |
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def append_visual_fragment(line_parts: List[str], text: str, baseline_y: float, item: dict) -> None:
|
| 120 |
if not text:
|
| 121 |
return
|
|
@@ -1008,13 +1208,28 @@ def load_pdf_file(path: Path) -> List[Document]:
|
|
| 1008 |
|
| 1009 |
|
| 1010 |
def load_txt_file(path: Path) -> List[Document]:
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1014 |
|
| 1015 |
|
| 1016 |
def iter_source_files(raw_dir: Path) -> Iterable[Path]:
|
| 1017 |
-
supported_suffixes = {".md", ".markdown", ".pdf"}
|
| 1018 |
for path in sorted(raw_dir.rglob("*")):
|
| 1019 |
if path.is_file() and path.suffix.lower() in supported_suffixes:
|
| 1020 |
yield path
|
|
@@ -1022,12 +1237,13 @@ def iter_source_files(raw_dir: Path) -> Iterable[Path]:
|
|
| 1022 |
|
| 1023 |
def load_docs(raw_dir: Path = RAW_DIR) -> List[Document]:
|
| 1024 |
documents: List[Document] = []
|
|
|
|
| 1025 |
|
| 1026 |
for path in iter_source_files(raw_dir):
|
| 1027 |
suffix = path.suffix.lower()
|
| 1028 |
|
| 1029 |
if suffix in {".md", ".markdown"}:
|
| 1030 |
-
documents.
|
| 1031 |
elif suffix == ".pdf":
|
| 1032 |
documents.extend(load_pdf_file(path))
|
| 1033 |
elif suffix == ".txt":
|
|
@@ -1053,6 +1269,7 @@ def add_chunk_metadata(nodes: List[BaseNode]) -> List[BaseNode]:
|
|
| 1053 |
|
| 1054 |
node.metadata["chunk_id"] = chunk_id
|
| 1055 |
node.metadata["chunk_index"] = chunk_index
|
|
|
|
| 1056 |
node.id_ = chunk_id
|
| 1057 |
|
| 1058 |
return nodes
|
|
@@ -1073,8 +1290,7 @@ def validate_nodes(nodes: List[BaseNode]) -> None:
|
|
| 1073 |
f"PDF node {node.node_id} is missing page_number metadata.")
|
| 1074 |
|
| 1075 |
|
| 1076 |
-
def
|
| 1077 |
-
documents = load_docs(raw_dir)
|
| 1078 |
splitter = SentenceSplitter(
|
| 1079 |
chunk_size=CHUNK_SIZE,
|
| 1080 |
chunk_overlap=CHUNK_OVERLAP,
|
|
@@ -1085,7 +1301,123 @@ def build_nodes(raw_dir: Path = RAW_DIR) -> List[BaseNode]:
|
|
| 1085 |
return nodes
|
| 1086 |
|
| 1087 |
|
| 1088 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1089 |
if chroma_collection.count() == 0:
|
| 1090 |
return True
|
| 1091 |
|
|
@@ -1095,6 +1427,8 @@ def collection_needs_pdf_rebuild(chroma_collection) -> bool:
|
|
| 1095 |
return False
|
| 1096 |
|
| 1097 |
for metadata in sample.get("metadatas") or []:
|
|
|
|
|
|
|
| 1098 |
if metadata.get("file_type") == "pdf":
|
| 1099 |
return metadata.get("extraction_method") != PDF_EXTRACTION_METHOD
|
| 1100 |
|
|
@@ -1107,6 +1441,7 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
|
|
| 1107 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 1108 |
|
| 1109 |
load_dotenv()
|
|
|
|
| 1110 |
CHROMA_DB_DIR.mkdir(parents=True, exist_ok=True)
|
| 1111 |
|
| 1112 |
db = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
|
|
@@ -1118,9 +1453,6 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
|
|
| 1118 |
pass
|
| 1119 |
|
| 1120 |
chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
|
| 1121 |
-
if not rebuild and collection_needs_pdf_rebuild(chroma_collection):
|
| 1122 |
-
db.delete_collection(COLLECTION_NAME)
|
| 1123 |
-
chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
|
| 1124 |
|
| 1125 |
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
| 1126 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
@@ -1141,6 +1473,13 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
|
|
| 1141 |
f"Indexed {len(nodes)} chunks into collection '{COLLECTION_NAME}'")
|
| 1142 |
return index
|
| 1143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1144 |
print(
|
| 1145 |
f"Loaded existing collection '{COLLECTION_NAME}' with {chroma_collection.count()} chunks.")
|
| 1146 |
return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
|
|
@@ -1148,13 +1487,17 @@ async def build_index(raw_dir: Path = RAW_DIR, rebuild: bool = False) -> VectorS
|
|
| 1148 |
|
| 1149 |
class QueryKnowledgeTool(Tool):
|
| 1150 |
name = "query_knowledge"
|
| 1151 |
-
description =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1152 |
inputs = {'query': {'type': 'string',
|
| 1153 |
'description': 'The search query to perform.'}}
|
| 1154 |
output_type = "string"
|
| 1155 |
|
| 1156 |
@staticmethod
|
| 1157 |
-
def format_results(results):
|
| 1158 |
output = []
|
| 1159 |
|
| 1160 |
for result in results:
|
|
@@ -1166,6 +1509,8 @@ class QueryKnowledgeTool(Tool):
|
|
| 1166 |
formula_id = metadata.get("formula_id", "")
|
| 1167 |
score = result.score
|
| 1168 |
text = result.node.get_content()
|
|
|
|
|
|
|
| 1169 |
|
| 1170 |
output.append(
|
| 1171 |
f"source:{source}\n"
|
|
@@ -1174,20 +1519,139 @@ class QueryKnowledgeTool(Tool):
|
|
| 1174 |
f"content_type:{content_type}\n"
|
| 1175 |
f"formula_id:{formula_id or 'n/a'}\n"
|
| 1176 |
f"score:{score:.4f}\n"
|
|
|
|
|
|
|
| 1177 |
f"content:{text}"
|
| 1178 |
)
|
| 1179 |
|
| 1180 |
return "\n\n---\n\n".join(output)
|
| 1181 |
|
| 1182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1183 |
super().__init__()
|
| 1184 |
self.max_results = max_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1185 |
index = asyncio.run(build_index(rebuild=False))
|
| 1186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1187 |
|
| 1188 |
def forward(self, query: str) -> str:
|
| 1189 |
-
|
| 1190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1191 |
|
| 1192 |
|
| 1193 |
if __name__ == "__main__":
|
|
|
|
| 3 |
from collections import Counter
|
| 4 |
import hashlib
|
| 5 |
import logging
|
| 6 |
+
import math
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Iterable, List, Optional
|
|
|
|
| 14 |
from pypdf import PdfReader
|
| 15 |
|
| 16 |
from llama_index.core import StorageContext, VectorStoreIndex
|
| 17 |
+
from llama_index.core.schema import Document, BaseNode, NodeWithScore, TextNode
|
| 18 |
from llama_index.core.node_parser import SentenceSplitter
|
| 19 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
| 20 |
|
| 21 |
|
| 22 |
+
load_dotenv()
|
| 23 |
BASE_DIR = Path(__file__).resolve().parent
|
| 24 |
+
PROJECT_ROOT = BASE_DIR.parent
|
| 25 |
KNOWLEDGE_BASE_DIR = BASE_DIR / "knowledge_base"
|
| 26 |
+
LEGACY_KNOWLEDGE_BASE_DIR = BASE_DIR / "knowledge_base"
|
| 27 |
+
KNOWLEDGE_BASE_DIR = PROJECT_ROOT / "knowledge_base"
|
| 28 |
RAW_DIR = KNOWLEDGE_BASE_DIR / "raw"
|
| 29 |
CHROMA_DB_DIR = KNOWLEDGE_BASE_DIR / "chroma_db"
|
| 30 |
+
HF_CACHE_DIR = PROJECT_ROOT / "hf_cache"
|
| 31 |
COLLECTION_NAME = "options_knowledge"
|
| 32 |
|
| 33 |
+
EMBED_MODEL_NAME = os.getenv("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
|
| 34 |
+
RERANKER_MODEL_NAME = os.getenv(
|
| 35 |
+
"RAG_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
|
| 36 |
+
RERANKER_BATCH_SIZE = int(os.getenv("RAG_RERANKER_BATCH_SIZE", "16"))
|
| 37 |
+
EMBED_MODEL_METADATA_KEY = "embedding_model"
|
| 38 |
+
BM25_METADATA_KEY = "bm25_score"
|
| 39 |
+
VECTOR_METADATA_KEY = "vector_score"
|
| 40 |
CHUNK_SIZE = 1000
|
| 41 |
CHUNK_OVERLAP = 150
|
| 42 |
PDF_REPEATED_LINE_MIN_PAGES = 3
|
|
|
|
| 79 |
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(
|
| 80 |
HF_CACHE_DIR / "sentence_transformers"))
|
| 81 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 82 |
+
if local_model_snapshot(EMBED_MODEL_NAME):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
os.environ.setdefault("HF_HUB_OFFLINE", "1")
|
| 84 |
os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
|
| 85 |
|
| 86 |
|
| 87 |
+
def local_model_snapshot(model_name: str) -> Optional[Path]:
    """Locate a cached snapshot directory for ``model_name``, if one exists.

    Looks under ``HF_CACHE_DIR / "sentence_transformers"`` for the model's
    ``models--<org>--<name>/snapshots`` folder and returns the last snapshot
    directory (in sorted path order) that contains a ``config.json``.
    Returns ``None`` when the folder or a usable snapshot is missing.
    """
    repo_dir_name = f"models--{model_name.replace('/', '--')}"
    snapshots_dir = HF_CACHE_DIR / "sentence_transformers" / repo_dir_name / "snapshots"
    if not snapshots_dir.exists():
        return None

    candidates = sorted(
        (entry for entry in snapshots_dir.iterdir() if entry.is_dir()),
        reverse=True,
    )
    return next(
        (candidate for candidate in candidates if (candidate / "config.json").exists()),
        None,
    )
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def resolve_embed_model_name() -> str:
    """Return what to load the embedding model from.

    This is the path of a locally cached snapshot when
    ``local_model_snapshot`` finds one, otherwise ``EMBED_MODEL_NAME`` itself.
    """
    cached_snapshot = local_model_snapshot(EMBED_MODEL_NAME)
    return str(cached_snapshot) if cached_snapshot else EMBED_MODEL_NAME
|
| 109 |
|
| 110 |
|
| 111 |
+
def resolve_reranker_model_name(model_name: str = RERANKER_MODEL_NAME) -> str:
    """Return what to load the reranker model from.

    This is the path of a locally cached snapshot when
    ``local_model_snapshot`` finds one, otherwise ``model_name`` unchanged.
    """
    cached_snapshot = local_model_snapshot(model_name)
    return str(cached_snapshot) if cached_snapshot else model_name
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def env_flag(name: str, default: bool = False) -> bool:
    """Read the environment variable ``name`` as a boolean switch.

    Returns ``default`` when the variable is unset. Otherwise the value is
    true only if, after trimming and lower-casing, it is one of ``1``,
    ``true``, ``yes`` or ``on``.
    """
    raw = os.getenv(name)
    if raw is not None:
        return raw.strip().lower() in {"1", "true", "yes", "on"}
    return default
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def effective_raw_dir(raw_dir: Path = RAW_DIR) -> Path:
    """Choose the directory that source documents should be read from.

    ``raw_dir`` is used whenever ``iter_source_files`` yields anything for
    it. Otherwise the legacy ``LEGACY_KNOWLEDGE_BASE_DIR / "raw"`` folder is
    returned if it has source files, with a warning logged. When neither
    has files, ``raw_dir`` is returned.
    """
    has_sources = any(iter_source_files(raw_dir))
    if not has_sources:
        legacy_raw_dir = LEGACY_KNOWLEDGE_BASE_DIR / "raw"
        if any(iter_source_files(legacy_raw_dir)):
            logging.warning(
                "Using legacy knowledge base path %s. Move files to %s when convenient.",
                legacy_raw_dir,
                raw_dir,
            )
            return legacy_raw_dir
    return raw_dir
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class CrossEncoderReranker:
    """Re-score retrieved nodes against a query with a cross-encoder model.

    The ``sentence_transformers`` model is imported and loaded on first use
    and then cached on the instance.
    """

    def __init__(
        self,
        model_name: str = RERANKER_MODEL_NAME,
        batch_size: int = RERANKER_BATCH_SIZE,
    ):
        """Store the model name and batch size; no model is loaded yet."""
        self.model_name = model_name
        self.batch_size = batch_size
        self._model = None

    def _load_model(self):
        """Return the cross-encoder, creating it on the first call."""
        if self._model is not None:
            return self._model

        # Imported lazily so the dependency is only needed when reranking.
        from sentence_transformers import CrossEncoder

        self._model = CrossEncoder(
            resolve_reranker_model_name(self.model_name),
            max_length=512,
            cache_folder=str(HF_CACHE_DIR / "sentence_transformers"),
        )
        return self._model

    def rerank(
        self,
        query: str,
        results: list[NodeWithScore],
        top_n: Optional[int] = None,
    ) -> list[NodeWithScore]:
        """Score every result against ``query`` and return them best-first.

        Args:
            query: The user query paired with each node's content.
            results: Candidate nodes; their existing scores are ignored.
            top_n: Maximum number of results to return. ``None`` or ``0``
                returns all of them.

        Returns:
            New ``NodeWithScore`` objects carrying the cross-encoder score,
            sorted by descending score. Empty when ``results`` is empty (the
            model is not loaded in that case).
        """
        if not results:
            return []

        pairs = [
            (query, result.node.get_content())
            for result in results
        ]
        model = self._load_model()
        scores = model.predict(
            pairs,
            batch_size=self.batch_size,
            show_progress_bar=False,
        )

        reranked = [
            NodeWithScore(node=result.node, score=float(score))
            for result, score in zip(results, scores)
        ]
        # Sort on the score itself: every score was just set to a float, and
        # an `or float("-inf")` fallback would treat a genuine 0.0 as missing
        # and rank it below negative scores.
        reranked.sort(key=lambda item: item.score, reverse=True)
        return reranked[:top_n] if top_n else reranked
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class BM25Retriever:
    """In-memory BM25 keyword retriever over a fixed list of nodes."""

    def __init__(self, nodes: list[TextNode]):
        """Tokenize every node once and collect corpus statistics."""
        self.nodes = nodes
        self.tokenized_docs = [self.tokenize(node.get_content()) for node in nodes]

        # Document frequency: in how many documents each token appears.
        self.doc_freqs: Counter[str] = Counter()
        for doc_tokens in self.tokenized_docs:
            self.doc_freqs.update(set(doc_tokens))

        if self.tokenized_docs:
            total_tokens = sum(len(doc_tokens) for doc_tokens in self.tokenized_docs)
            self.avg_doc_len = total_tokens / len(self.tokenized_docs)
        else:
            self.avg_doc_len = 0.0

    @staticmethod
    def tokenize(text: str) -> list[str]:
        """Split ``text`` into lower-cased tokens.

        Tokens are words (optionally joined by ``-`` or ``'``), numbers
        (optionally with dotted parts), or single non-alphanumeric,
        non-space characters.
        """
        raw_tokens = re.findall(r"[A-Za-z]+(?:[-'][A-Za-z]+)*|\d+(?:\.\d+)*|[^\sA-Za-z0-9]", text)
        return [piece.lower() for piece in raw_tokens if piece.strip()]

    def score(self, query_tokens: list[str], doc_tokens: list[str]) -> float:
        """Return the BM25 score of one tokenized document for the query.

        Uses ``k1 = 1.5`` and ``b = 0.75``. Returns ``0.0`` when either token
        list is empty or no query token occurs in the document.
        """
        if not query_tokens or not doc_tokens:
            return 0.0

        k1 = 1.5
        b = 0.75
        term_counts = Counter(doc_tokens)
        corpus_size = len(self.tokenized_docs)
        # Length normalisation depends only on the document, not the token.
        length_norm = 1 - b + b * len(doc_tokens) / max(self.avg_doc_len, 1.0)

        total = 0.0
        for token in query_tokens:
            term_freq = term_counts.get(token, 0)
            if term_freq == 0:
                continue
            doc_freq = self.doc_freqs.get(token, 0)
            idf = math.log(1 + (corpus_size - doc_freq + 0.5) / (doc_freq + 0.5))
            denominator = term_freq + k1 * length_norm
            total += idf * (term_freq * (k1 + 1)) / denominator

        return total

    def retrieve(self, query: str, top_k: int) -> list[NodeWithScore]:
        """Return up to ``top_k`` nodes with a positive score, best first.

        Side effect: each matching node's metadata gets its score stored
        under ``BM25_METADATA_KEY``.
        """
        query_tokens = self.tokenize(query)
        hits: list[NodeWithScore] = []

        for node, doc_tokens in zip(self.nodes, self.tokenized_docs):
            relevance = self.score(query_tokens, doc_tokens)
            if relevance > 0:
                node.metadata[BM25_METADATA_KEY] = relevance
                hits.append(NodeWithScore(node=node, score=relevance))
            else:
                continue

        hits.sort(key=lambda item: item.score or float("-inf"), reverse=True)
        return hits[:top_k]
|
| 251 |
+
|
| 252 |
+
|
| 253 |
def file_sha256(path: Path) -> str:
|
| 254 |
digest = hashlib.sha256()
|
| 255 |
with path.open("rb") as file:
|
|
|
|
| 273 |
)
|
| 274 |
|
| 275 |
|
| 276 |
+
def load_md_documents(path: Path) -> List[Document]:
    """Split a Markdown file into one ``Document`` per heading-delimited section.

    Every ATX heading (``#`` through ``######``) starts a new section, and
    the heading line itself is kept as the first line of that section's
    text. Text before the first heading becomes a section with an empty
    title. Lines inside fenced code blocks (backtick or tilde fences) are
    never treated as headings, so ``# comment`` lines in embedded code
    samples do not split a section.

    Args:
        path: Markdown file, read as UTF-8.

    Returns:
        The non-empty sections in file order. If the file produces no
        section text at all, falls back to ``[load_md_file(path)]``.
    """
    text = path.read_text(encoding="utf-8")
    file_hash = file_sha256(path)
    documents: List[Document] = []
    current_heading = ""
    current_lines: List[str] = []
    # Opening marker of the fenced code block we are currently inside ("" = none).
    open_fence = ""

    def flush() -> None:
        """Emit the accumulated lines as a Document and reset the buffer."""
        nonlocal current_lines
        section_text = "\n".join(current_lines).strip()
        if not section_text:
            current_lines = []
            return
        documents.append(
            Document(
                text=section_text,
                metadata={
                    "source_file": str(path.resolve()),
                    "file_name": path.name,
                    "file_type": path.suffix.lower().lstrip("."),
                    "document_title": path.stem,
                    "file_hash": file_hash,
                    "content_type": "markdown_section",
                    "chapter_title": "",
                    "section_title": current_heading,
                    "section_path": current_heading,
                    "char_count": len(section_text),
                },
            )
        )
        current_lines = []

    for line in text.splitlines():
        fence_match = re.match(r"^ {0,3}(`{3,}|~{3,})", line)
        if fence_match:
            marker = fence_match.group(1)
            if not open_fence:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and line.strip() == marker
            ):
                # A closing fence uses the same character, is at least as long
                # as the opener, and carries no trailing text.
                open_fence = ""
        elif not open_fence:
            heading_match = re.match(r"^(#{1,6})\s+(.+?)\s*$", line)
            if heading_match:
                flush()
                current_heading = heading_match.group(2).strip()
        current_lines.append(line)

    flush()
    return documents or [load_md_file(path)]
|
| 317 |
+
|
| 318 |
+
|
| 319 |
def append_visual_fragment(line_parts: List[str], text: str, baseline_y: float, item: dict) -> None:
|
| 320 |
if not text:
|
| 321 |
return
|
|
|
|
| 1208 |
|
| 1209 |
|
| 1210 |
def load_txt_file(path: Path) -> List[Document]:
    """Load a plain-text file as a single ``Document``.

    Args:
        path: Text file, read as UTF-8.

    Returns:
        A one-element list whose document carries the whole file text plus
        source, hash and size metadata. Chapter and section fields are left
        empty.
    """
    body = path.read_text(encoding="utf-8")
    metadata = {
        "source_file": str(path.resolve()),
        "file_name": path.name,
        "file_type": "txt",
        "document_title": path.stem,
        "file_hash": file_sha256(path),
        "content_type": "text",
        "chapter_title": "",
        "section_title": "",
        "section_path": "",
        "char_count": len(body),
    }
    return [Document(text=body, metadata=metadata)]
|
| 1229 |
|
| 1230 |
|
| 1231 |
def iter_source_files(raw_dir: Path) -> Iterable[Path]:
    """Yield the supported knowledge-base files beneath *raw_dir*.

    The directory is searched recursively. Only regular files whose suffix
    (case-insensitive) is ``.md``, ``.markdown``, ``.pdf`` or ``.txt`` are
    yielded, in sorted path order.
    """
    allowed = {".md", ".markdown", ".pdf", ".txt"}
    candidates = sorted(raw_dir.rglob("*"))
    yield from (
        entry
        for entry in candidates
        if entry.is_file() and entry.suffix.lower() in allowed
    )
|
|
|
|
| 1237 |
|
| 1238 |
def load_docs(raw_dir: Path = RAW_DIR) -> List[Document]:
|
| 1239 |
documents: List[Document] = []
|
| 1240 |
+
raw_dir = effective_raw_dir(raw_dir)
|
| 1241 |
|
| 1242 |
for path in iter_source_files(raw_dir):
|
| 1243 |
suffix = path.suffix.lower()
|
| 1244 |
|
| 1245 |
if suffix in {".md", ".markdown"}:
|
| 1246 |
+
documents.extend(load_md_documents(path))
|
| 1247 |
elif suffix == ".pdf":
|
| 1248 |
documents.extend(load_pdf_file(path))
|
| 1249 |
elif suffix == ".txt":
|
|
|
|
| 1269 |
|
| 1270 |
node.metadata["chunk_id"] = chunk_id
|
| 1271 |
node.metadata["chunk_index"] = chunk_index
|
| 1272 |
+
node.metadata[EMBED_MODEL_METADATA_KEY] = EMBED_MODEL_NAME
|
| 1273 |
node.id_ = chunk_id
|
| 1274 |
|
| 1275 |
return nodes
|
|
|
|
| 1290 |
f"PDF node {node.node_id} is missing page_number metadata.")
|
| 1291 |
|
| 1292 |
|
| 1293 |
+
def split_documents(documents: List[Document]) -> List[BaseNode]:
|
|
|
|
| 1294 |
splitter = SentenceSplitter(
|
| 1295 |
chunk_size=CHUNK_SIZE,
|
| 1296 |
chunk_overlap=CHUNK_OVERLAP,
|
|
|
|
| 1301 |
return nodes
|
| 1302 |
|
| 1303 |
|
| 1304 |
+
def build_nodes(raw_dir: Path = RAW_DIR) -> List[BaseNode]:
    """Load every source document under *raw_dir* and split it into nodes.

    Thin composition of ``load_docs`` followed by ``split_documents``.
    """
    return split_documents(load_docs(raw_dir))
|
| 1307 |
+
|
| 1308 |
+
|
| 1309 |
+
def load_source_file(path: Path) -> List[Document]:
    """Load one source file with the loader that matches its suffix.

    Args:
        path: File to load; the suffix is compared case-insensitively.

    Returns:
        The documents produced by the Markdown, PDF or text loader, or an
        empty list when the suffix is not supported.
    """
    extension = path.suffix.lower()
    if extension == ".pdf":
        loader = load_pdf_file
    elif extension == ".txt":
        loader = load_txt_file
    elif extension in (".md", ".markdown"):
        loader = load_md_documents
    else:
        return []
    return loader(path)
|
| 1318 |
+
|
| 1319 |
+
|
| 1320 |
+
def list_current_sources(raw_dir: Path = RAW_DIR) -> dict[str, dict[str, str]]:
    """Describe every supported source file currently on disk.

    Args:
        raw_dir: Requested source directory; passed through
            ``effective_raw_dir`` before scanning.

    Returns:
        A mapping from each file's resolved path (as a string) to a dict
        with its ``file_hash`` (SHA-256) and ``file_type`` (lower-cased
        suffix without the leading dot).
    """
    root = effective_raw_dir(raw_dir)
    return {
        str(source.resolve()): {
            "file_hash": file_sha256(source),
            "file_type": source.suffix.lower().lstrip("."),
        }
        for source in iter_source_files(root)
    }
|
| 1330 |
+
|
| 1331 |
+
|
| 1332 |
+
def existing_source_metadata(chroma_collection) -> dict[str, dict[str, str]]:
    """Summarise which source files are already indexed in a Chroma collection.

    Pages through the collection 500 records at a time, reading metadata
    only. Records with no metadata, or with no ``source_file`` value, are
    skipped.

    Args:
        chroma_collection: Collection object exposing ``count()`` and
            ``get(limit=..., offset=..., include=[...])``.

    Returns:
        A mapping from ``source_file`` to a dict with ``file_hash``,
        ``file_type``, ``embedding_model`` and ``extraction_method``, each
        defaulting to ``""`` when absent. When several chunks share a
        source file, the values from the last chunk read win.
    """
    existing: dict[str, dict[str, str]] = {}
    if chroma_collection.count() == 0:
        return existing

    offset = 0
    limit = 500
    while True:
        batch = chroma_collection.get(
            limit=limit,
            offset=offset,
            include=["metadatas"],
        )
        metadatas = batch.get("metadatas") or []
        if not metadatas:
            break
        for raw_metadata in metadatas:
            # A record stored without metadata comes back as None; treat it
            # as empty, the same way load_bm25_nodes does.
            metadata = raw_metadata or {}
            source_file = metadata.get("source_file")
            if not source_file:
                continue
            existing[source_file] = {
                "file_hash": metadata.get("file_hash", ""),
                "file_type": metadata.get("file_type", ""),
                "embedding_model": metadata.get(EMBED_MODEL_METADATA_KEY, ""),
                "extraction_method": metadata.get("extraction_method", ""),
            }
        # A short page means the collection has been fully read.
        if len(metadatas) < limit:
            break
        offset += limit
    return existing
|
| 1362 |
+
|
| 1363 |
+
|
| 1364 |
+
def source_needs_update(current: dict[str, str], existing: dict[str, str] | None) -> bool:
|
| 1365 |
+
if not existing:
|
| 1366 |
+
return True
|
| 1367 |
+
if existing.get("file_hash") != current["file_hash"]:
|
| 1368 |
+
return True
|
| 1369 |
+
if existing.get("embedding_model") != EMBED_MODEL_NAME:
|
| 1370 |
+
return True
|
| 1371 |
+
if current["file_type"] == "pdf" and existing.get("extraction_method") != PDF_EXTRACTION_METHOD:
|
| 1372 |
+
return True
|
| 1373 |
+
return False
|
| 1374 |
+
|
| 1375 |
+
|
| 1376 |
+
def incremental_update_index(
    raw_dir: Path,
    chroma_collection,
    storage_context: StorageContext,
    embed_model,
) -> bool:
    """Bring the vector collection in line with the files currently on disk.

    Sources that vanished from disk have their chunks removed. Sources that
    are new or whose hash, embedding model or PDF extraction method changed
    are re-parsed, re-split and re-embedded.

    Replacement documents are loaded and split *before* any stale chunks
    are deleted, so a loader or splitter failure leaves the previously
    indexed chunks in place instead of dropping the source from the index.

    Args:
        raw_dir: Directory holding the source files.
        chroma_collection: Chroma collection backing the vector store.
        storage_context: Storage context wrapping that collection.
        embed_model: Embedding model used for the new chunks.

    Returns:
        ``True`` when anything was removed or re-indexed, ``False`` when the
        collection was already up to date.
    """
    current_sources = list_current_sources(raw_dir)
    existing_sources = existing_source_metadata(chroma_collection)

    deleted_sources = sorted(set(existing_sources) - set(current_sources))
    changed_sources = sorted(
        source_file
        for source_file, current in current_sources.items()
        if source_needs_update(current, existing_sources.get(source_file))
    )

    # Parse and split first: if this raises, nothing has been deleted yet.
    nodes: List[BaseNode] = []
    if changed_sources:
        documents: List[Document] = []
        for source_file in changed_sources:
            documents.extend(load_source_file(Path(source_file)))
        nodes = split_documents(documents)

    for source_file in deleted_sources + changed_sources:
        try:
            chroma_collection.delete(where={"source_file": source_file})
        except Exception as exc:
            # Best effort: a failed delete must not abort the whole refresh.
            logging.warning("Could not delete stale chunks for %s: %s", source_file, exc)

    if not changed_sources:
        if deleted_sources:
            print(f"Removed {len(deleted_sources)} stale source(s) from collection '{COLLECTION_NAME}'.")
        return bool(deleted_sources)

    # Constructing the index embeds the nodes and writes them to the store.
    VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        show_progress=True,
    )
    print(
        f"Incrementally indexed {len(nodes)} chunk(s) from {len(changed_sources)} source file(s)."
    )
    return True
|
| 1418 |
+
|
| 1419 |
+
|
| 1420 |
+
def collection_needs_rebuild(chroma_collection) -> bool:
|
| 1421 |
if chroma_collection.count() == 0:
|
| 1422 |
return True
|
| 1423 |
|
|
|
|
| 1427 |
return False
|
| 1428 |
|
| 1429 |
for metadata in sample.get("metadatas") or []:
|
| 1430 |
+
if metadata.get(EMBED_MODEL_METADATA_KEY) != EMBED_MODEL_NAME:
|
| 1431 |
+
return True
|
| 1432 |
if metadata.get("file_type") == "pdf":
|
| 1433 |
return metadata.get("extraction_method") != PDF_EXTRACTION_METHOD
|
| 1434 |
|
|
|
|
| 1441 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 1442 |
|
| 1443 |
load_dotenv()
|
| 1444 |
+
raw_dir = effective_raw_dir(raw_dir)
|
| 1445 |
CHROMA_DB_DIR.mkdir(parents=True, exist_ok=True)
|
| 1446 |
|
| 1447 |
db = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
|
|
|
|
| 1453 |
pass
|
| 1454 |
|
| 1455 |
chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
|
|
|
|
|
|
|
|
|
|
| 1456 |
|
| 1457 |
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
| 1458 |
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
|
|
| 1473 |
f"Indexed {len(nodes)} chunks into collection '{COLLECTION_NAME}'")
|
| 1474 |
return index
|
| 1475 |
|
| 1476 |
+
incremental_update_index(
|
| 1477 |
+
raw_dir=raw_dir,
|
| 1478 |
+
chroma_collection=chroma_collection,
|
| 1479 |
+
storage_context=storage_context,
|
| 1480 |
+
embed_model=embed_model,
|
| 1481 |
+
)
|
| 1482 |
+
|
| 1483 |
print(
|
| 1484 |
f"Loaded existing collection '{COLLECTION_NAME}' with {chroma_collection.count()} chunks.")
|
| 1485 |
return VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
|
|
|
|
| 1487 |
|
| 1488 |
class QueryKnowledgeTool(Tool):
|
| 1489 |
name = "query_knowledge"
|
| 1490 |
+
description = (
|
| 1491 |
+
"Searches the local options trading knowledge base. Use this for option "
|
| 1492 |
+
"concepts, volatility, Greeks, strategies, formulas, equation numbers, "
|
| 1493 |
+
"and citations from reference books."
|
| 1494 |
+
)
|
| 1495 |
inputs = {'query': {'type': 'string',
|
| 1496 |
'description': 'The search query to perform.'}}
|
| 1497 |
output_type = "string"
|
| 1498 |
|
| 1499 |
@staticmethod
|
| 1500 |
+
def format_results(results, max_chars: int = 800):
|
| 1501 |
output = []
|
| 1502 |
|
| 1503 |
for result in results:
|
|
|
|
| 1509 |
formula_id = metadata.get("formula_id", "")
|
| 1510 |
score = result.score
|
| 1511 |
text = result.node.get_content()
|
| 1512 |
+
if len(text) > max_chars:
|
| 1513 |
+
text = f"{text[:max_chars].rstrip()}..."
|
| 1514 |
|
| 1515 |
output.append(
|
| 1516 |
f"source:{source}\n"
|
|
|
|
| 1519 |
f"content_type:{content_type}\n"
|
| 1520 |
f"formula_id:{formula_id or 'n/a'}\n"
|
| 1521 |
f"score:{score:.4f}\n"
|
| 1522 |
+
f"vector_score:{metadata.get(VECTOR_METADATA_KEY, 'n/a')}\n"
|
| 1523 |
+
f"bm25_score:{metadata.get(BM25_METADATA_KEY, 'n/a')}\n"
|
| 1524 |
f"content:{text}"
|
| 1525 |
)
|
| 1526 |
|
| 1527 |
return "\n\n---\n\n".join(output)
|
| 1528 |
|
| 1529 |
+
@staticmethod
def load_bm25_nodes(collection_name: str = COLLECTION_NAME) -> list[TextNode]:
    """Load every stored chunk from Chroma as ``TextNode`` objects for BM25.

    The collection is read in pages of 500 records (documents and
    metadata). Missing text becomes ``""`` and missing metadata becomes an
    empty dict.

    Args:
        collection_name: Name of the Chroma collection to read.

    Returns:
        One ``TextNode`` per stored chunk, or an empty list when the
        collection cannot be opened. That failure is logged, because it
        means lexical retrieval will have no documents to search.
    """
    db = chromadb.PersistentClient(path=str(CHROMA_DB_DIR))
    try:
        collection = db.get_collection(collection_name)
    except Exception as exc:
        logging.warning(
            "Could not open collection '%s' for BM25; lexical retrieval will return nothing: %s",
            collection_name,
            exc,
        )
        return []

    nodes: list[TextNode] = []
    offset = 0
    limit = 500
    while True:
        batch = collection.get(
            limit=limit,
            offset=offset,
            include=["documents", "metadatas"],
        )
        documents = batch.get("documents") or []
        metadatas = batch.get("metadatas") or []
        ids = batch.get("ids") or []
        if not documents:
            break

        for index, text in enumerate(documents):
            # Guard both parallel lists: either may be shorter than
            # `documents`, and individual metadata entries may be None.
            raw_metadata = metadatas[index] if index < len(metadatas) else None
            metadata = dict(raw_metadata or {})
            node_id = ids[index] if index < len(ids) else metadata.get("chunk_id", "")
            nodes.append(TextNode(id_=node_id, text=text or "", metadata=metadata))

        # A short page means the collection has been fully read.
        if len(documents) < limit:
            break
        offset += limit

    return nodes
|
| 1562 |
+
|
| 1563 |
+
@staticmethod
def merge_results(
    vector_results: list[NodeWithScore],
    bm25_results: list[NodeWithScore],
    top_k: int,
) -> list[NodeWithScore]:
    """Fuse dense and BM25 rankings by summing reciprocal ranks.

    A hit at 1-based position ``p`` in either list contributes ``1 / p``;
    a node present in both lists receives the sum. The original retriever
    scores are kept on each node's metadata under ``VECTOR_METADATA_KEY``
    and ``BM25_METADATA_KEY``.

    Args:
        vector_results: Dense retrieval hits, best first.
        bm25_results: Lexical retrieval hits, best first.
        top_k: Maximum number of fused results to return.

    Returns:
        Up to ``top_k`` results ordered by fused score, highest first.
    """
    fused: dict[str, NodeWithScore] = {}

    for position, hit in enumerate(vector_results, start=1):
        hit.node.metadata[VECTOR_METADATA_KEY] = hit.score
        fused[hit.node.node_id] = NodeWithScore(
            node=hit.node,
            score=1.0 / position,
        )

    for position, hit in enumerate(bm25_results, start=1):
        hit.node.metadata[BM25_METADATA_KEY] = hit.score
        contribution = 1.0 / position
        key = hit.node.node_id
        if key not in fused:
            fused[key] = NodeWithScore(node=hit.node, score=contribution)
            continue
        shared = fused[key]
        shared.score = (shared.score or 0.0) + contribution
        # The fused entry wraps the dense node object, so copy the lexical
        # score onto it as well.
        shared.node.metadata[BM25_METADATA_KEY] = hit.score

    ranked = sorted(
        fused.values(),
        key=lambda item: item.score or float("-inf"),
        reverse=True,
    )
    return ranked[:top_k]
|
| 1595 |
+
|
| 1596 |
+
def __init__(
    self,
    max_results=20,
    top_k=5,
    use_reranker: Optional[bool] = None,
    use_hybrid: Optional[bool] = None,
    reranker_top_n: Optional[int] = None,
    reranker_model_name: Optional[str] = None,
    **kwargs,
):
    """Build the index-backed retrievers used by the tool.

    Args:
        max_results: Size of the candidate pool fetched before fusion or
            reranking.
        top_k: Number of results finally returned by ``forward``.
        use_reranker: Enable the cross-encoder reranker. ``None`` defers to
            ``env_flag("RAG_USE_RERANKER", True)``.
        use_hybrid: Enable BM25 + dense fusion. ``None`` defers to
            ``env_flag("RAG_USE_HYBRID", True)``.
        reranker_top_n: Number of results the reranker keeps; falls back to
            ``top_k`` when unset (or zero).
        reranker_model_name: Reranker model; falls back to
            ``RERANKER_MODEL_NAME``.
        **kwargs: Accepted for call compatibility; not forwarded.
    """
    super().__init__()
    self.max_results = max_results
    self.top_k = top_k
    self.use_reranker = (
        env_flag("RAG_USE_RERANKER", True)
        if use_reranker is None
        else use_reranker
    )
    self.use_hybrid = (
        env_flag("RAG_USE_HYBRID", True)
        if use_hybrid is None
        else use_hybrid
    )
    self.reranker_top_n = reranker_top_n or top_k
    self.reranker = (
        CrossEncoderReranker(reranker_model_name or RERANKER_MODEL_NAME)
        if self.use_reranker
        else None
    )
    index = asyncio.run(build_index(rebuild=False))
    # Both reranking and rank fusion need a candidate pool wider than the
    # final answer. BM25 always contributes `max_results` candidates, so the
    # dense side is widened whenever either stage is active; otherwise
    # fusion would compare `top_k` dense hits against `max_results` lexical
    # ones.
    needs_wide_pool = self.use_reranker or self.use_hybrid
    retrieve_top_k = max(max_results, top_k) if needs_wide_pool else top_k
    self.retriever = index.as_retriever(similarity_top_k=retrieve_top_k)
    self.bm25_retriever = (
        BM25Retriever(self.load_bm25_nodes())
        if self.use_hybrid
        else None
    )
|
| 1633 |
|
| 1634 |
def forward(self, query: str) -> str:
    """Answer *query* from the knowledge base and format the top hits.

    Dense retrieval always runs. When a BM25 retriever is configured its
    hits are fused with the dense ones, and when a reranker is configured
    the candidates are reranked. A reranker failure is logged and the
    un-reranked candidates are used instead.

    Args:
        query: The search query.

    Returns:
        The formatted text of at most ``self.top_k`` results.
    """
    candidates = self.retriever.retrieve(query)

    if self.bm25_retriever:
        lexical_hits = self.bm25_retriever.retrieve(query, self.max_results)
        pool_size = max(self.max_results, self.top_k)
        candidates = self.merge_results(
            vector_results=candidates,
            bm25_results=lexical_hits,
            top_k=pool_size,
        )

    if self.reranker:
        try:
            candidates = self.reranker.rerank(query, candidates, top_n=self.reranker_top_n)
        except Exception as exc:
            logging.warning("Reranker failed; falling back to vector ranking: %s", exc)
            candidates = candidates[:self.top_k]

    final_hits = candidates[:self.top_k]
    return QueryKnowledgeTool.format_results(final_hits)
|
| 1655 |
|
| 1656 |
|
| 1657 |
if __name__ == "__main__":
|
tools/todo.md
CHANGED
|
@@ -1,5 +1,437 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OptionAgent 构建 TODO
|
| 2 |
+
|
| 3 |
+
目标:构建一个能辅助期权交易研究的 agent。后续重点偏向波动率交易,而不是单纯方向性期权交易。它需要能查资料、查市场数据、分析 IV/RV/skew/term structure,生成波动率策略、回测策略,并根据回测结果迭代改进。所有输出必须带假设、风险、数据来源和限制说明,不直接承诺收益。
|
| 4 |
+
|
| 5 |
+
## 0. 当前已有能力
|
| 6 |
+
|
| 7 |
+
- [x] 本地 RAG:`QueryKnowledgeTool` 已接入主 agent,可查询期权书籍知识库。
|
| 8 |
+
- [x] PDF RAG 优化:支持 PyMuPDF 提取、公式块识别、章节 metadata、页码引用。
|
| 9 |
+
- [x] Hybrid search:dense retrieval + BM25 + reranker。
|
| 10 |
+
- [x] 本地 RAG eval:支持 local-options eval、BEIR/fiqa、Open RAGBench。
|
| 11 |
+
- [x] 基础市场价格工具:`query_market_asset` 可查询股票、指数、ETF、crypto、forex 的当前价格。
|
| 12 |
+
- [x] Web search/visit webpage 已接入主 agent,并通过工具初始化与网页解析 mock 测试。
|
| 13 |
+
- [x] 期权链、IV、Greeks、期限结构、偏斜等基础数据模块已完成 MVP,并通过 mock 测试。
|
| 14 |
+
- [x] 策略构建模块 MVP 已完成:支持 5 类波动率策略候选。
|
| 15 |
+
- [x] 回测模块 MVP 已完成:支持 payoff 情景分析、RV signal 历史回测 proxy、历史期权 bid/ask quote CSV 真实腿级回测。
|
| 16 |
+
- [x] QuantConnect/LEAN 期权回测模板已加入:可用于接入真实历史期权链、撮合、组合持仓和保证金模型。
|
| 17 |
+
- [x] 策略改进/优化闭环 MVP 已完成:支持参数扫描和 best vs baseline 对比。
|
| 18 |
+
|
| 19 |
+
## 1. Research 模块:资料检索
|
| 20 |
+
|
| 21 |
+
### 1.1 本地知识库 RAG
|
| 22 |
+
|
| 23 |
+
- [x] 查询本地书籍、PDF、Markdown 知识库。
|
| 24 |
+
- [x] 返回 source、page、section、content_type、score、excerpt。
|
| 25 |
+
- [x] 支持公式 chunk 和正文 chunk。
|
| 26 |
+
- [ ] 为期权策略类问题增加 query rewrite:
|
| 27 |
+
- 中文问题转英文检索词。
|
| 28 |
+
- 生成多个 query variants。
|
| 29 |
+
- 对公式/章节/策略/风险问题采用不同检索策略。
|
| 30 |
+
- [ ] 增加 citation policy:
|
| 31 |
+
- agent 最终回答必须引用 RAG 来源。
|
| 32 |
+
- 没查到资料时明确说“不确定/资料不足”。
|
| 33 |
+
|
| 34 |
+
### 1.2 Web Search
|
| 35 |
+
|
| 36 |
+
- [x] 将 `DuckDuckGoSearchTool` 和 `VisitWebpageTool` 接入 `app.py` 的 tools。
|
| 37 |
+
- [x] 修复 `VisitWebpageTool` 中缺失的 `re` import。
|
| 38 |
+
- [x] 给 web search 加使用边界:
|
| 39 |
+
- 用于查最新市场事件、宏观事件、财报日期、公司公告、交易所规则。
|
| 40 |
+
- 本地书籍知识优先用 RAG,实时信息优先用 web。
|
| 41 |
+
- [x] Web 结果返回标题、URL 和摘要;发布时间后续按数据源能力增强。
|
| 42 |
+
- [ ] 对高风险市场信息做多源交叉验证。
|
| 43 |
+
|
| 44 |
+
## 2. Market Data 模块:市场数据与期权数据
|
| 45 |
+
|
| 46 |
+
### 2.1 标的行情
|
| 47 |
+
|
| 48 |
+
- [x] 当前价格、日内 OHLC、成交量。
|
| 49 |
+
- [x] 增加历史价格接口:
|
| 50 |
+
- 日线、小时线、分钟线。
|
| 51 |
+
- 支持 start/end/period/interval 参数。
|
| 52 |
+
- 输出用于回测的标准 DataFrame/JSON。
|
| 53 |
+
- [x] 增加 realized volatility 计算:
|
| 54 |
+
- 10D/20D/30D/60D realized vol。
|
| 55 |
+
- Parkinson/Garman-Klass 可选。
|
| 56 |
+
|
| 57 |
+
### 2.2 期权链
|
| 58 |
+
|
| 59 |
+
- [x] 新增 `query_option_chain(symbol, expiration)` tool。
|
| 60 |
+
- [x] 返回 calls/puts:
|
| 61 |
+
- strike
|
| 62 |
+
- bid/ask/mid/last
|
| 63 |
+
- volume/open_interest
|
| 64 |
+
- implied_volatility
|
| 65 |
+
- in_the_money
|
| 66 |
+
- expiration
|
| 67 |
+
- days_to_expiration
|
| 68 |
+
- [x] 支持列出全部 expiration dates。
|
| 69 |
+
- [x] 对无流动性合约做标记:
|
| 70 |
+
- bid/ask 缺失
|
| 71 |
+
- spread 过宽
|
| 72 |
+
- volume/OI 过低
|
| 73 |
+
|
| 74 |
+
### 2.3 Greeks 与波动率结构
|
| 75 |
+
|
| 76 |
+
- [x] 新增 Greeks 计算模块:
|
| 77 |
+
- delta/gamma/vega/theta/rho。
|
| 78 |
+
- 支持 Black-Scholes-Merton。
|
| 79 |
+
- 支持 dividend yield / risk-free rate 参数。
|
| 80 |
+
- [x] 新增 IV surface / skew 分析 MVP:
|
| 81 |
+
- ATM IV。
|
| 82 |
+
- 近似 put-call skew。
|
| 83 |
+
- ATM IV term structure slope。
|
| 84 |
+
- IV percentile / rank 后续在 Milestone 2 完成。
|
| 85 |
+
- [x] 新增 volatility trading 专用指标 MVP:
|
| 86 |
+
- realized volatility: 5D/10D/20D/30D/60D。
|
| 87 |
+
- implied vs realized spread。
|
| 88 |
+
- volatility risk premium: IV - RV。
|
| 89 |
+
- IV term structure slope。
|
| 90 |
+
- skew slope / put-call skew。
|
| 91 |
+
- vol-of-vol proxy 后续增强。
|
| 92 |
+
- event IV premium 后续增强。
|
| 93 |
+
- [ ] 对 yfinance IV 字段做 sanity check:
|
| 94 |
+
- IV 为 0、缺失、异常值时标记。
|
| 95 |
+
- bid/ask/mid 不合理时不参与策略构建。
|
| 96 |
+
|
| 97 |
+
### 2.4 数据源抽象
|
| 98 |
+
|
| 99 |
+
- [x] 建立 `market_data/` 模块,避免所有行情逻辑堆在 `app.py`。
|
| 100 |
+
- [x] 设计统一 schema:
|
| 101 |
+
- `UnderlyingQuote`
|
| 102 |
+
- `OptionContract`
|
| 103 |
+
- `OptionChain`
|
| 104 |
+
- `VolSnapshot`
|
| 105 |
+
- [x] 第一阶段可用 yfinance,后续可接 Polygon/Tradier/IBKR。
|
| 106 |
+
|
| 107 |
+
## 3. Strategy Builder 模块:策略构建
|
| 108 |
+
|
| 109 |
+
后续策略构建以波动率观点为核心,方向观点为辅助变量。
|
| 110 |
+
|
| 111 |
+
### 3.1 用户意图解析
|
| 112 |
+
|
| 113 |
+
- [ ] 解析用户输入:
|
| 114 |
+
- 标的 symbol。
|
| 115 |
+
- 波动率观点:long vol / short vol / vol mean reversion / event vol / skew trade。
|
| 116 |
+
- 方向观点:bullish/bearish/neutral/range-bound。
|
| 117 |
+
- 时间周期。
|
| 118 |
+
- 风险承受。
|
| 119 |
+
- 账户约束/最大亏损。
|
| 120 |
+
- 是否允许裸卖。
|
| 121 |
+
- [ ] 如果关键信息缺失,agent 需要追问,而不是直接生成交易。
|
| 122 |
+
|
| 123 |
+
### 3.2 策略候选生成
|
| 124 |
+
|
| 125 |
+
- [ ] 支持基础策略模板:
|
| 126 |
+
- long call / long put
|
| 127 |
+
- covered call
|
| 128 |
+
- cash-secured put
|
| 129 |
+
- vertical spread
|
| 130 |
+
- calendar spread
|
| 131 |
+
- straddle / strangle
|
| 132 |
+
- iron condor
|
| 133 |
+
- collar
|
| 134 |
+
- [x] 支持波动率交易策略模板 MVP:
|
| 135 |
+
- long straddle / long strangle
|
| 136 |
+
- short straddle / short strangle
|
| 137 |
+
- delta-hedged straddle
|
| 138 |
+
- calendar spread
|
| 139 |
+
- diagonal spread
|
| 140 |
+
- variance-style option basket approximation
|
| 141 |
+
- skew trade: risk reversal / put spread vs call spread
|
| 142 |
+
- term structure trade: near-term short vol + longer-term long vol
|
| 143 |
+
- [x] 每个策略输出:
|
| 144 |
+
- legs
|
| 145 |
+
- expiration
|
| 146 |
+
- strike
|
| 147 |
+
- net debit/credit
|
| 148 |
+
- max profit
|
| 149 |
+
- max loss
|
| 150 |
+
- breakeven
|
| 151 |
+
- margin estimate
|
| 152 |
+
- Greeks exposure
|
| 153 |
+
- liquidity warnings
|
| 154 |
+
|
| 155 |
+
### 3.3 策略筛选规则
|
| 156 |
+
|
| 157 |
+
- [ ] 根据市场状态筛选策略:
|
| 158 |
+
- 高 IV:偏向 credit spread / iron condor / covered call。
|
| 159 |
+
- 低 IV:偏向 long options / calendar / debit spread。
|
| 160 |
+
- 趋势观点强:vertical spread / directional options。
|
| 161 |
+
- 震荡观点:short premium / condor。
|
| 162 |
+
- [ ] 根据波动率状态筛选策略:
|
| 163 |
+
- IV 明显高于 RV:考虑 short vol,但必须检查事件风险和尾部风险。
|
| 164 |
+
- IV 明显低于 RV:考虑 long vol,但必须检查 theta bleed。
|
| 165 |
+
- 近月 IV 异常高:考虑 calendar/diagonal 或 event vol 策略。
|
| 166 |
+
- skew 极端:考虑 risk reversal、put spread、skew mean reversion。
|
| 167 |
+
- term structure 陡峭:考虑跨期限 vol trade。
|
| 168 |
+
- [ ] 加入风险约束:
|
| 169 |
+
- max loss 不超过用户预算。
|
| 170 |
+
- spread 不能过宽。
|
| 171 |
+
- OI/volume 低的合约排除。
|
| 172 |
+
- 禁止默认裸卖期权。
|
| 173 |
+
- [ ] 输出多个候选策略并排序,而不是只给一个。
|
| 174 |
+
|
| 175 |
+
### 3.4 策略解释
|
| 176 |
+
|
| 177 |
+
- [ ] 每个策略必须解释:
|
| 178 |
+
- 为什么适合当前市场。
|
| 179 |
+
- 主要盈利条件。
|
| 180 |
+
- 主要亏损场景。
|
| 181 |
+
- Greeks 风险。
|
| 182 |
+
- IV crush / event risk。
|
| 183 |
+
- Vega / gamma / theta trade-off。
|
| 184 |
+
- Long vol 或 short vol 的核心假设。
|
| 185 |
+
- 流动性和滑点风险。
|
| 186 |
+
- [ ] 必须引用 RAG/web/market data 来源。
|
| 187 |
+
|
| 188 |
+
## 3.5 Volatility Research 模块:波动率交易研究
|
| 189 |
+
|
| 190 |
+
- [x] 构建 volatility dashboard MVP:
|
| 191 |
+
- current IV vs historical IV range。
|
| 192 |
+
- IV percentile / rank。
|
| 193 |
+
- realized volatility windows。
|
| 194 |
+
- IV-RV spread。
|
| 195 |
+
- term structure chart。
|
| 196 |
+
- skew chart。
|
| 197 |
+
- [x] 识别波动率 regime MVP:
|
| 198 |
+
- low vol regime。
|
| 199 |
+
- high vol regime。
|
| 200 |
+
- vol expansion。
|
| 201 |
+
- vol compression。
|
| 202 |
+
- event-driven vol。
|
| 203 |
+
- [ ] 事件模块:
|
| 204 |
+
- earnings date。
|
| 205 |
+
- CPI/FOMC/NFP 等宏观事件。
|
| 206 |
+
- event implied move。
|
| 207 |
+
- post-event IV crush risk。
|
| 208 |
+
- [x] 输出波动率观点 MVP:
|
| 209 |
+
- long vol / short vol / neutral。
|
| 210 |
+
- confidence。
|
| 211 |
+
- key assumptions。
|
| 212 |
+
- invalidation conditions。
|
| 213 |
+
|
| 214 |
+
## 4. Backtesting 模块:回测与情景分析
|
| 215 |
+
|
| 216 |
+
### 4.1 第一阶段:Payoff 与情景分析
|
| 217 |
+
|
| 218 |
+
- [x] 新增 `backtest/` 模块。
|
| 219 |
+
- [x] 实现到期 payoff 情景表:
|
| 220 |
+
- 不同标的价格下 PnL。
|
| 221 |
+
- breakeven。
|
| 222 |
+
- max loss/max profit。
|
| 223 |
+
- [x] 实现情景分析 MVP:
|
| 224 |
+
- underlying price shock。
|
| 225 |
+
- IV up/down。
|
| 226 |
+
- days passed / theta decay。
|
| 227 |
+
- Greeks approximation。
|
| 228 |
+
- [x] 增加波动率情景 MVP:
|
| 229 |
+
- IV crush。
|
| 230 |
+
- IV expansion。
|
| 231 |
+
- realized move vs implied move。
|
| 232 |
+
- gamma scalp breakeven move。
|
| 233 |
+
- delta-hedging frequency sensitivity。
|
| 234 |
+
- [x] 输出表格和 JSON,方便 agent 总结。
|
| 235 |
+
|
| 236 |
+
### 4.2 第二阶段:历史回测
|
| 237 |
+
|
| 238 |
+
- [x] 获取历史 underlying price。
|
| 239 |
+
- [ ] 获取或近似历史 IV:
|
| 240 |
+
- 优先真实历史 option chain。
|
| 241 |
+
- 没有数据时用 realized vol 或当前 IV 做近似,并明确标注限制。
|
| 242 |
+
- [x] 支持真实历史期权 quote CSV 输入:
|
| 243 |
+
- 必需字段:date、underlying_symbol、underlying_price、contract_symbol、option_type、expiration、strike、bid、ask。
|
| 244 |
+
- 可选字段:mid、delta、gamma、theta、vega、implied_volatility、volume、open_interest。
|
| 245 |
+
- 当前实现可做 ATM long straddle 的真实开仓/平仓腿级 PnL。
|
| 246 |
+
- 注意:yfinance 不能可靠提供历史 option chain,严肃回测需要 Polygon/ORATS/OptionMetrics/QuantConnect 等数据源。
|
| 247 |
+
- [x] 设计 entry/exit rules MVP:
|
| 248 |
+
- 入场条件。
|
| 249 |
+
- 出场条件。
|
| 250 |
+
- DTE 管理。
|
| 251 |
+
- 固定 holding period。
|
| 252 |
+
- 固定 entry frequency。
|
| 253 |
+
- [ ] 设计高级 entry/exit rules:
|
| 254 |
+
- 止盈止损。
|
| 255 |
+
- rolling 规则。
|
| 256 |
+
- [x] 为波动率策略增加专门规则 MVP:
|
| 257 |
+
- IV percentile 入场阈值。
|
| 258 |
+
- IV-RV spread 入场阈值。
|
| 259 |
+
- earnings 前后入场/退出。
|
| 260 |
+
- DTE bucket。
|
| 261 |
+
- delta hedge 频率。
|
| 262 |
+
- gamma scalp rule。
|
| 263 |
+
- [x] 计算指标 MVP:
|
| 264 |
+
- total PnL
|
| 265 |
+
- max drawdown
|
| 266 |
+
- win rate
|
| 267 |
+
- avg win/loss
|
| 268 |
+
- [ ] 计算高级指标:
|
| 269 |
+
- total return
|
| 270 |
+
- CAGR
|
| 271 |
+
- Sharpe/Sortino
|
| 272 |
+
- exposure time
|
| 273 |
+
- tail loss
|
| 274 |
+
- realized vs implied PnL attribution
|
| 275 |
+
- theta PnL
|
| 276 |
+
- vega PnL
|
| 277 |
+
- gamma scalping PnL
|
| 278 |
+
|
| 279 |
+
### 4.3 第三阶段:组合级回测
|
| 280 |
+
|
| 281 |
+
- [x] 支持单策略多笔交易 MVP。
|
| 282 |
+
- [ ] 支持多策略/多标的组合交易。
|
| 283 |
+
- [ ] 支持现金、保证金、仓位占用。
|
| 284 |
+
- [x] 支持交易成本、bid/ask slippage MVP。
|
| 285 |
+
- [ ] 支持 assignment / early exercise 风险近似。
|
| 286 |
+
- [x] 生成交易日志 MVP。
|
| 287 |
+
- [ ] 生成风险归因。
|
| 288 |
+
|
| 289 |
+
## 5. Strategy Optimizer 模块:回测后改进
|
| 290 |
+
|
| 291 |
+
- [x] 根据回测结果自动提出改进 MVP:
|
| 292 |
+
- 调整 expiration。
|
| 293 |
+
- 调整 strike/delta。
|
| 294 |
+
- 调整止盈止损。
|
| 295 |
+
- 限制入场市场环境。
|
| 296 |
+
- 避开财报/宏观事件。
|
| 297 |
+
- [ ] 对波动率策略提出专门改进:
|
| 298 |
+
- 调整 long/short vol 入场 IV percentile。
|
| 299 |
+
- 调整 straddle/strangle delta。
|
| 300 |
+
- 调整 delta hedge 频率。
|
| 301 |
+
- 调整 DTE bucket。
|
| 302 |
+
- 避开或利用 event vol。
|
| 303 |
+
- 加入 tail hedge。
|
| 304 |
+
- [x] 支持参数扫描 MVP:
|
| 305 |
+
- DTE range。
|
| 306 |
+
- delta target。
|
| 307 |
+
- width。
|
| 308 |
+
- profit target。
|
| 309 |
+
- stop loss。
|
| 310 |
+
- IV percentile threshold。
|
| 311 |
+
- IV-RV spread threshold。
|
| 312 |
+
- hedge frequency。
|
| 313 |
+
- [x] 输出对比表:
|
| 314 |
+
- baseline strategy
|
| 315 |
+
- improved strategy
|
| 316 |
+
- metrics delta
|
| 317 |
+
- trade-off
|
| 318 |
+
- [ ] 防止过拟合:
|
| 319 |
+
- train/test split。
|
| 320 |
+
- walk-forward analysis。
|
| 321 |
+
- out-of-sample period。
|
| 322 |
+
|
| 323 |
+
## 6. Agent Orchestrator 模块:完整工作流
|
| 324 |
+
|
| 325 |
+
- [ ] 定义标准工作流:
|
| 326 |
+
|
| 327 |
+
```text
|
| 328 |
+
用户提出目标
|
| 329 |
+
-> 解析意图和约束
|
| 330 |
+
-> 查询 RAG/web 背景资料
|
| 331 |
+
-> 查询标的行情和期权链
|
| 332 |
+
-> 分析 IV/Greeks/流动性
|
| 333 |
+
-> 生成多个策略候选
|
| 334 |
+
-> 初步风险筛选
|
| 335 |
+
-> 回测/情景分析
|
| 336 |
+
-> 改进策略
|
| 337 |
+
-> 输出最终报告
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
- [x] 增加 agent prompt 约束:
|
| 341 |
+
- 不承诺收益。
|
| 342 |
+
- 不给无风险建议。
|
| 343 |
+
- 必须说明假设和数据限制。
|
| 344 |
+
- 必须输出最大亏损。
|
| 345 |
+
- 必须说明流动性、滑点、IV、事件风险。
|
| 346 |
+
- [x] 增加结构化输出格式:
|
| 347 |
+
- `market_context`
|
| 348 |
+
- `strategy_candidates`
|
| 349 |
+
- `selected_strategy`
|
| 350 |
+
- `backtest_summary`
|
| 351 |
+
- `risk_warnings`
|
| 352 |
+
- `sources`
|
| 353 |
+
|
| 354 |
+
## 7. UI / Report 模块
|
| 355 |
+
|
| 356 |
+
- [ ] Gradio UI 支持输入:
|
| 357 |
+
- symbol
|
| 358 |
+
- outlook
|
| 359 |
+
- time horizon
|
| 360 |
+
- risk budget
|
| 361 |
+
- strategy preference
|
| 362 |
+
- [ ] 展示:
|
| 363 |
+
- 策略 legs 表格。
|
| 364 |
+
- payoff 图。
|
| 365 |
+
- Greeks 表格。
|
| 366 |
+
- 回测指标。
|
| 367 |
+
- 引用来源。
|
| 368 |
+
- [ ] 支持导出 Markdown/HTML report。
|
| 369 |
+
|
| 370 |
+
## 8. Evaluation 模块
|
| 371 |
+
|
| 372 |
+
- [x] RAG retrieval eval。
|
| 373 |
+
- [x] Market data tool 单元测试:已覆盖 RV、Greeks、历史价格 tool、期权链 tool、volatility snapshot mock。
|
| 374 |
+
- [x] Strategy builder 单元测试:
|
| 375 |
+
- payoff 计算正确。
|
| 376 |
+
- max loss/max profit 正确。
|
| 377 |
+
- breakeven 正确。
|
| 378 |
+
- [x] Backtest engine 测试:
|
| 379 |
+
- 单腿/多腿 payoff。
|
| 380 |
+
- 交易成本。
|
| 381 |
+
- rolling/exit rule。
|
| 382 |
+
- [ ] Agent end-to-end 测试:
|
| 383 |
+
- 给定 symbol + outlook,能完整输出策略、风险和来源。
|
| 384 |
+
|
| 385 |
+
## 9. 推荐实现顺序
|
| 386 |
+
|
| 387 |
+
### Milestone 1:Research + Market Data 可用
|
| 388 |
+
|
| 389 |
+
- [x] 接入 web search 和 visit webpage 到主 agent。
|
| 390 |
+
- [x] 修复 `VisitWebpageTool`。
|
| 391 |
+
- [x] 新增 option chain 查询工具。
|
| 392 |
+
- [x] 新增 Greeks/IV 基础计算。
|
| 393 |
+
- [x] 新增 IV/RV/skew/term structure 基础分析。
|
| 394 |
+
- [x] 将行情代码从 `app.py` 拆到独立模块。
|
| 395 |
+
|
| 396 |
+
### Milestone 2:Volatility Dashboard MVP
|
| 397 |
+
|
| 398 |
+
- [x] 计算 realized volatility windows。
|
| 399 |
+
- [x] 计算 ATM IV、IV rank/percentile proxy。
|
| 400 |
+
- [x] 计算 IV-RV spread。
|
| 401 |
+
- [x] 计算 skew 和 term structure。
|
| 402 |
+
- [x] 输出 volatility regime 判断。
|
| 403 |
+
|
| 404 |
+
### Milestone 3:波动率策略生成 MVP
|
| 405 |
+
|
| 406 |
+
- [x] 定义策略 leg schema。
|
| 407 |
+
- [x] 实现 5 个优先策略模板:
|
| 408 |
+
- long straddle
|
| 409 |
+
- long strangle
|
| 410 |
+
- short straddle
|
| 411 |
+
- calendar spread
|
| 412 |
+
- iron condor
|
| 413 |
+
- [x] 实现 payoff/max loss/breakeven 计算。
|
| 414 |
+
- [x] 根据 volatility regime 和 IV/RV 状态生成候选策略 MVP。
|
| 415 |
+
|
| 416 |
+
### Milestone 4:回测 MVP
|
| 417 |
+
|
| 418 |
+
- [x] 实现到期 payoff 和情景分析。
|
| 419 |
+
- [x] 实现历史 underlying 回测 MVP。
|
| 420 |
+
- [x] 实现 IV/RV 条件入场回测 MVP。
|
| 421 |
+
- [x] 实现历史期权 quote CSV 的真实 long straddle 回测 MVP。
|
| 422 |
+
- [x] 添加 QuantConnect/LEAN ATM long straddle 回测模板。
|
| 423 |
+
- [x] 实现 straddle/strangle 的 delta hedge 情景分析 proxy。
|
| 424 |
+
- [x] 输出核心指标和交易日志。
|
| 425 |
+
|
| 426 |
+
### Milestone 5:优化闭环
|
| 427 |
+
|
| 428 |
+
- [x] 参数扫描。
|
| 429 |
+
- [x] 策略改进建议 MVP。
|
| 430 |
+
- [x] 对比报告。
|
| 431 |
+
- [ ] 防过拟合验证。
|
| 432 |
+
|
| 433 |
+
### Milestone 6:完整 Agent 工作流
|
| 434 |
+
|
| 435 |
+
- [x] 统一 prompt 和输出格式。
|
| 436 |
+
- [ ] Gradio UI 展示策略、图表和回测。
|
| 437 |
+
- [ ] 端到端测试。
|
tools/visit_webpage.py
CHANGED
|
@@ -1,15 +1,20 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
from smolagents.tools import Tool
|
| 3 |
-
import requests
|
| 4 |
-
import markdownify
|
| 5 |
-
import smolagents
|
| 6 |
|
| 7 |
class VisitWebpageTool(Tool):
|
| 8 |
name = "visit_webpage"
|
| 9 |
-
description =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
|
| 11 |
output_type = "string"
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
def forward(self, url: str) -> str:
|
| 14 |
try:
|
| 15 |
import requests
|
|
@@ -40,6 +45,3 @@ class VisitWebpageTool(Tool):
|
|
| 40 |
return f"Error fetching the webpage: {str(e)}"
|
| 41 |
except Exception as e:
|
| 42 |
return f"An unexpected error occurred: {str(e)}"
|
| 43 |
-
|
| 44 |
-
def __init__(self, *args, **kwargs):
|
| 45 |
-
self.is_initialized = False
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
from smolagents.tools import Tool
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
class VisitWebpageTool(Tool):
|
| 6 |
name = "visit_webpage"
|
| 7 |
+
description = (
|
| 8 |
+
"Visits a webpage at the given URL and returns its readable Markdown content. "
|
| 9 |
+
"Use this after web_search when current market news, company events, "
|
| 10 |
+
"earnings information, exchange rules, or source verification is needed."
|
| 11 |
+
)
|
| 12 |
inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
|
| 13 |
output_type = "string"
|
| 14 |
|
| 15 |
+
def __init__(self, *args, **kwargs):
|
| 16 |
+
super().__init__(*args, **kwargs)
|
| 17 |
+
|
| 18 |
def forward(self, url: str) -> str:
|
| 19 |
try:
|
| 20 |
import requests
|
|
|
|
| 45 |
return f"Error fetching the webpage: {str(e)}"
|
| 46 |
except Exception as e:
|
| 47 |
return f"An unexpected error occurred: {str(e)}"
|
|
|
|
|
|
|
|
|
tools/web_search.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
-
from typing import Any, Optional
|
| 2 |
from smolagents.tools import Tool
|
| 3 |
-
import duckduckgo_search
|
| 4 |
|
| 5 |
class DuckDuckGoSearchTool(Tool):
|
| 6 |
name = "web_search"
|
| 7 |
-
description =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
|
| 9 |
output_type = "string"
|
| 10 |
|
|
|
|
|
|
|
| 1 |
from smolagents.tools import Tool
|
|
|
|
| 2 |
|
| 3 |
class DuckDuckGoSearchTool(Tool):
|
| 4 |
name = "web_search"
|
| 5 |
+
description = (
|
| 6 |
+
"Searches the web for current information. Use this for recent market events, "
|
| 7 |
+
"earnings dates, company announcements, macro events, current rules, or "
|
| 8 |
+
"source discovery. Prefer the local knowledge base for stable options concepts."
|
| 9 |
+
)
|
| 10 |
inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
|
| 11 |
output_type = "string"
|
| 12 |
|
uv.lock
CHANGED
|
@@ -2,9 +2,15 @@ version = 1
|
|
| 2 |
revision = 3
|
| 3 |
requires-python = ">=3.12"
|
| 4 |
resolution-markers = [
|
| 5 |
-
"python_full_version >= '3.14'",
|
| 6 |
-
"python_full_version =
|
| 7 |
-
"python_full_version
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
]
|
| 9 |
|
| 10 |
[[package]]
|
|
@@ -246,6 +252,19 @@ wheels = [
|
|
| 246 |
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
|
| 247 |
]
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
[[package]]
|
| 250 |
name = "build"
|
| 251 |
version = "1.5.0"
|
|
@@ -520,7 +539,7 @@ name = "cuda-bindings"
|
|
| 520 |
version = "12.9.4"
|
| 521 |
source = { registry = "https://pypi.org/simple" }
|
| 522 |
dependencies = [
|
| 523 |
-
{ name = "cuda-pathfinder" },
|
| 524 |
]
|
| 525 |
wheels = [
|
| 526 |
{ url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
|
|
@@ -538,6 +557,39 @@ wheels = [
|
|
| 538 |
{ url = "https://files.pythonhosted.org/packages/11/d0/c177e29701cf1d3008d7d2b16b5fc626592ce13bd535f8795c5f57187e0e/cuda_pathfinder-1.5.4-py3-none-any.whl", hash = "sha256:9563d3175ce1828531acf4b94e1c1c7d67208c347ca002493e2654878b26f4b7", size = 51657, upload-time = "2026-04-27T22:42:07.712Z" },
|
| 539 |
]
|
| 540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
[[package]]
|
| 542 |
name = "dataclasses-json"
|
| 543 |
version = "0.6.7"
|
|
@@ -660,10 +712,12 @@ dependencies = [
|
|
| 660 |
{ name = "llama-index-core" },
|
| 661 |
{ name = "llama-index-embeddings-huggingface" },
|
| 662 |
{ name = "llama-index-vector-stores-chroma" },
|
|
|
|
| 663 |
{ name = "pymupdf" },
|
| 664 |
{ name = "pypdf" },
|
| 665 |
{ name = "tokenizers" },
|
| 666 |
{ name = "transformers" },
|
|
|
|
| 667 |
]
|
| 668 |
|
| 669 |
[package.metadata]
|
|
@@ -674,10 +728,12 @@ requires-dist = [
|
|
| 674 |
{ name = "llama-index-core", specifier = ">=0.14.0" },
|
| 675 |
{ name = "llama-index-embeddings-huggingface", specifier = ">=0.6.0" },
|
| 676 |
{ name = "llama-index-vector-stores-chroma", specifier = ">=0.5.0" },
|
|
|
|
| 677 |
{ name = "pymupdf", specifier = ">=1.27.2.3" },
|
| 678 |
{ name = "pypdf", specifier = ">=6.0.0" },
|
| 679 |
{ name = "tokenizers", specifier = ">=0.22.0,<=0.23.0" },
|
| 680 |
{ name = "transformers", specifier = "<5" },
|
|
|
|
| 681 |
]
|
| 682 |
|
| 683 |
[[package]]
|
|
@@ -1675,6 +1731,15 @@ wheels = [
|
|
| 1675 |
{ url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
|
| 1676 |
]
|
| 1677 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1678 |
[[package]]
|
| 1679 |
name = "mypy-extensions"
|
| 1680 |
version = "1.1.0"
|
|
@@ -1815,7 +1880,7 @@ name = "nvidia-cudnn-cu12"
|
|
| 1815 |
version = "9.10.2.21"
|
| 1816 |
source = { registry = "https://pypi.org/simple" }
|
| 1817 |
dependencies = [
|
| 1818 |
-
{ name = "nvidia-cublas-cu12" },
|
| 1819 |
]
|
| 1820 |
wheels = [
|
| 1821 |
{ url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
|
|
@@ -1826,7 +1891,7 @@ name = "nvidia-cufft-cu12"
|
|
| 1826 |
version = "11.3.3.83"
|
| 1827 |
source = { registry = "https://pypi.org/simple" }
|
| 1828 |
dependencies = [
|
| 1829 |
-
{ name = "nvidia-nvjitlink-cu12" },
|
| 1830 |
]
|
| 1831 |
wheels = [
|
| 1832 |
{ url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
|
|
@@ -1853,9 +1918,9 @@ name = "nvidia-cusolver-cu12"
|
|
| 1853 |
version = "11.7.3.90"
|
| 1854 |
source = { registry = "https://pypi.org/simple" }
|
| 1855 |
dependencies = [
|
| 1856 |
-
{ name = "nvidia-cublas-cu12" },
|
| 1857 |
-
{ name = "nvidia-cusparse-cu12" },
|
| 1858 |
-
{ name = "nvidia-nvjitlink-cu12" },
|
| 1859 |
]
|
| 1860 |
wheels = [
|
| 1861 |
{ url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
|
|
@@ -1866,7 +1931,7 @@ name = "nvidia-cusparse-cu12"
|
|
| 1866 |
version = "12.5.8.93"
|
| 1867 |
source = { registry = "https://pypi.org/simple" }
|
| 1868 |
dependencies = [
|
| 1869 |
-
{ name = "nvidia-nvjitlink-cu12" },
|
| 1870 |
]
|
| 1871 |
wheels = [
|
| 1872 |
{ url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
|
|
@@ -2124,6 +2189,67 @@ wheels = [
|
|
| 2124 |
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
|
| 2125 |
]
|
| 2126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2127 |
[[package]]
|
| 2128 |
name = "pillow"
|
| 2129 |
version = "12.2.0"
|
|
@@ -2636,6 +2762,15 @@ wheels = [
|
|
| 2636 |
{ url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
|
| 2637 |
]
|
| 2638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2639 |
[[package]]
|
| 2640 |
name = "pyyaml"
|
| 2641 |
version = "6.0.3"
|
|
@@ -3088,6 +3223,15 @@ wheels = [
|
|
| 3088 |
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
| 3089 |
]
|
| 3090 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3091 |
[[package]]
|
| 3092 |
name = "sqlalchemy"
|
| 3093 |
version = "2.0.49"
|
|
@@ -3405,6 +3549,15 @@ wheels = [
|
|
| 3405 |
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
|
| 3406 |
]
|
| 3407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3408 |
[[package]]
|
| 3409 |
name = "urllib3"
|
| 3410 |
version = "2.7.0"
|
|
@@ -3778,6 +3931,28 @@ wheels = [
|
|
| 3778 |
{ url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" },
|
| 3779 |
]
|
| 3780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3781 |
[[package]]
|
| 3782 |
name = "zipp"
|
| 3783 |
version = "3.23.1"
|
|
|
|
| 2 |
revision = 3
|
| 3 |
requires-python = ">=3.12"
|
| 4 |
resolution-markers = [
|
| 5 |
+
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
| 6 |
+
"python_full_version >= '3.14' and sys_platform == 'emscripten'",
|
| 7 |
+
"python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
|
| 8 |
+
"python_full_version == '3.13.*' and sys_platform == 'win32'",
|
| 9 |
+
"python_full_version == '3.13.*' and sys_platform == 'emscripten'",
|
| 10 |
+
"python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
|
| 11 |
+
"python_full_version < '3.13' and sys_platform == 'win32'",
|
| 12 |
+
"python_full_version < '3.13' and sys_platform == 'emscripten'",
|
| 13 |
+
"python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
|
| 14 |
]
|
| 15 |
|
| 16 |
[[package]]
|
|
|
|
| 252 |
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
|
| 253 |
]
|
| 254 |
|
| 255 |
+
[[package]]
|
| 256 |
+
name = "beautifulsoup4"
|
| 257 |
+
version = "4.14.3"
|
| 258 |
+
source = { registry = "https://pypi.org/simple" }
|
| 259 |
+
dependencies = [
|
| 260 |
+
{ name = "soupsieve" },
|
| 261 |
+
{ name = "typing-extensions" },
|
| 262 |
+
]
|
| 263 |
+
sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
|
| 264 |
+
wheels = [
|
| 265 |
+
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
|
| 266 |
+
]
|
| 267 |
+
|
| 268 |
[[package]]
|
| 269 |
name = "build"
|
| 270 |
version = "1.5.0"
|
|
|
|
| 539 |
version = "12.9.4"
|
| 540 |
source = { registry = "https://pypi.org/simple" }
|
| 541 |
dependencies = [
|
| 542 |
+
{ name = "cuda-pathfinder", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 543 |
]
|
| 544 |
wheels = [
|
| 545 |
{ url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
|
|
|
|
| 557 |
{ url = "https://files.pythonhosted.org/packages/11/d0/c177e29701cf1d3008d7d2b16b5fc626592ce13bd535f8795c5f57187e0e/cuda_pathfinder-1.5.4-py3-none-any.whl", hash = "sha256:9563d3175ce1828531acf4b94e1c1c7d67208c347ca002493e2654878b26f4b7", size = 51657, upload-time = "2026-04-27T22:42:07.712Z" },
|
| 558 |
]
|
| 559 |
|
| 560 |
+
[[package]]
|
| 561 |
+
name = "curl-cffi"
|
| 562 |
+
version = "0.15.0"
|
| 563 |
+
source = { registry = "https://pypi.org/simple" }
|
| 564 |
+
dependencies = [
|
| 565 |
+
{ name = "certifi" },
|
| 566 |
+
{ name = "cffi" },
|
| 567 |
+
{ name = "rich" },
|
| 568 |
+
]
|
| 569 |
+
sdist = { url = "https://files.pythonhosted.org/packages/48/5b/89fcfebd3e5e85134147ac99e9f2b2271165fd4d71984fc65da5f17819b7/curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded", size = 196437, upload-time = "2026-04-03T11:12:31.525Z" }
|
| 570 |
+
wheels = [
|
| 571 |
+
{ url = "https://files.pythonhosted.org/packages/5e/42/54ddd442c795f30ce5dd4e49f87ce77505958d3777cd96a91567a3975d2a/curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28", size = 2795267, upload-time = "2026-04-03T11:11:46.48Z" },
|
| 572 |
+
{ url = "https://files.pythonhosted.org/packages/83/2d/3915e238579b3c5a92cead5c79130c3b8d20caaba7616cc4d894650e1d6b/curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b", size = 2573544, upload-time = "2026-04-03T11:11:47.951Z" },
|
| 573 |
+
{ url = "https://files.pythonhosted.org/packages/2a/b3/9d2f1057749a1b07ba1989db3c1503ce8bed998310bae9aea2c43aa64f20/curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527", size = 10515369, upload-time = "2026-04-03T11:11:50.126Z" },
|
| 574 |
+
{ url = "https://files.pythonhosted.org/packages/b5/1d/6d10dded5ce3fd8157e558ebd97d09e551b77a62cdc1c31e93d0a633cee5/curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3", size = 10160045, upload-time = "2026-04-03T11:11:52.664Z" },
|
| 575 |
+
{ url = "https://files.pythonhosted.org/packages/5c/12/c70b835487ace3b9ba1502631912e3440082b8ae3a162f60b59cb0b6444d/curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc", size = 11090433, upload-time = "2026-04-03T11:11:55.049Z" },
|
| 576 |
+
{ url = "https://files.pythonhosted.org/packages/ea/0d/78edcc4f71934225db99df68197a107386d59080742fc7bf6bb4d007924f/curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a", size = 10479178, upload-time = "2026-04-03T11:11:57.685Z" },
|
| 577 |
+
{ url = "https://files.pythonhosted.org/packages/5b/84/1e101c1acb1ea2f0b4992f5c3024f596d8e21db0d53540b9d583f673c4e7/curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b", size = 10317051, upload-time = "2026-04-03T11:12:00.295Z" },
|
| 578 |
+
{ url = "https://files.pythonhosted.org/packages/28/42/8ef236b22a6c23d096c85a1dc507efe37bfdfc7a2f8a4b34efb590197369/curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13", size = 11299660, upload-time = "2026-04-03T11:12:02.791Z" },
|
| 579 |
+
{ url = "https://files.pythonhosted.org/packages/1d/01/56aeb055d962da87a1be0d74c6c644e251c7e88129b5471dc44ac724e678/curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61", size = 11945049, upload-time = "2026-04-03T11:12:05.912Z" },
|
| 580 |
+
{ url = "https://files.pythonhosted.org/packages/d8/8c/2abf99a38d6340d66cf0557e0c750ef3f8883dfc5d450087e01c85861343/curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572", size = 1661649, upload-time = "2026-04-03T11:12:07.948Z" },
|
| 581 |
+
{ url = "https://files.pythonhosted.org/packages/3d/39/dfd54f2240d3a9b96d77bacc62b97813b35e2aa8ecf5cd5013c683f1ba96/curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d", size = 1410741, upload-time = "2026-04-03T11:12:10.073Z" },
|
| 582 |
+
{ url = "https://files.pythonhosted.org/packages/19/6a/c24df8a4fc22fa84070dcd94abeba43c15e08cc09e35869565c0bad196fd/curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92", size = 7190427, upload-time = "2026-04-03T11:12:12.142Z" },
|
| 583 |
+
{ url = "https://files.pythonhosted.org/packages/11/56/132225cb3491d07cc6adcce5fe395e059bde87c68cff1ef87a31c88c7819/curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d", size = 2795723, upload-time = "2026-04-03T11:12:13.668Z" },
|
| 584 |
+
{ url = "https://files.pythonhosted.org/packages/07/8f/f4f83cd303bef7e8f1749512e5dd157e7e5d08b0a36c8211f9640a2757bf/curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3", size = 2573739, upload-time = "2026-04-03T11:12:15.08Z" },
|
| 585 |
+
{ url = "https://files.pythonhosted.org/packages/e8/5c/643d65c7fc9acd742876aa55c2d7823c438cb7665810acd2e66c9976c4d9/curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5", size = 10521046, upload-time = "2026-04-03T11:12:17.034Z" },
|
| 586 |
+
{ url = "https://files.pythonhosted.org/packages/7f/0b/9b8037113c93f4c5323096163471fa7c35c7676c3f608eeaf1287cd99d58/curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805", size = 11096115, upload-time = "2026-04-03T11:12:19.694Z" },
|
| 587 |
+
{ url = "https://files.pythonhosted.org/packages/5f/96/fff2fcbd924ef4042e0d67379f751a8a4e3186a91e75e35a4cf218b306ee/curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce", size = 11305346, upload-time = "2026-04-03T11:12:22.151Z" },
|
| 588 |
+
{ url = "https://files.pythonhosted.org/packages/53/1b/304b253a45ab28691c8c5e8cca1e6cbb9cf8e46dfceae4648dd536f75e73/curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f", size = 11949834, upload-time = "2026-04-03T11:12:24.986Z" },
|
| 589 |
+
{ url = "https://files.pythonhosted.org/packages/5a/ff/4723d92f08259c707a974aba27a08d0a822b9555e35ca581bf18d055a364/curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa", size = 1702771, upload-time = "2026-04-03T11:12:28.201Z" },
|
| 590 |
+
{ url = "https://files.pythonhosted.org/packages/59/8c/36bbe06d66fa2b765e4a07199f643a59a9cd1a754207a96335402a9520f4/curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3", size = 1466312, upload-time = "2026-04-03T11:12:30.054Z" },
|
| 591 |
+
]
|
| 592 |
+
|
| 593 |
[[package]]
|
| 594 |
name = "dataclasses-json"
|
| 595 |
version = "0.6.7"
|
|
|
|
| 712 |
{ name = "llama-index-core" },
|
| 713 |
{ name = "llama-index-embeddings-huggingface" },
|
| 714 |
{ name = "llama-index-vector-stores-chroma" },
|
| 715 |
+
{ name = "pandas" },
|
| 716 |
{ name = "pymupdf" },
|
| 717 |
{ name = "pypdf" },
|
| 718 |
{ name = "tokenizers" },
|
| 719 |
{ name = "transformers" },
|
| 720 |
+
{ name = "yfinance" },
|
| 721 |
]
|
| 722 |
|
| 723 |
[package.metadata]
|
|
|
|
| 728 |
{ name = "llama-index-core", specifier = ">=0.14.0" },
|
| 729 |
{ name = "llama-index-embeddings-huggingface", specifier = ">=0.6.0" },
|
| 730 |
{ name = "llama-index-vector-stores-chroma", specifier = ">=0.5.0" },
|
| 731 |
+
{ name = "pandas", specifier = ">=2.0.0" },
|
| 732 |
{ name = "pymupdf", specifier = ">=1.27.2.3" },
|
| 733 |
{ name = "pypdf", specifier = ">=6.0.0" },
|
| 734 |
{ name = "tokenizers", specifier = ">=0.22.0,<=0.23.0" },
|
| 735 |
{ name = "transformers", specifier = "<5" },
|
| 736 |
+
{ name = "yfinance", specifier = ">=0.2.0" },
|
| 737 |
]
|
| 738 |
|
| 739 |
[[package]]
|
|
|
|
| 1731 |
{ url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
|
| 1732 |
]
|
| 1733 |
|
| 1734 |
+
[[package]]
|
| 1735 |
+
name = "multitasking"
|
| 1736 |
+
version = "0.0.13"
|
| 1737 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1738 |
+
sdist = { url = "https://files.pythonhosted.org/packages/be/c3/ac2cc9307fb15cc28ed6d4a9266b216c83ee7fe64299f0264047982bce88/multitasking-0.0.13.tar.gz", hash = "sha256:d896b5df877c9ca5eeddbf0e5994124694d6cb535aba698fb23344c7025155a1", size = 20585, upload-time = "2026-04-23T12:14:15.049Z" }
|
| 1739 |
+
wheels = [
|
| 1740 |
+
{ url = "https://files.pythonhosted.org/packages/d3/1c/24dbf69b247f287401c904a396233a43c89fd4fb9b7cd2e50e430e9cd57c/multitasking-0.0.13-py3-none-any.whl", hash = "sha256:ec9243af140c67bfe52dc98d7173c294512735a88e8425c458b250db99dc2b48", size = 16380, upload-time = "2026-04-23T12:14:13.776Z" },
|
| 1741 |
+
]
|
| 1742 |
+
|
| 1743 |
[[package]]
|
| 1744 |
name = "mypy-extensions"
|
| 1745 |
version = "1.1.0"
|
|
|
|
| 1880 |
version = "9.10.2.21"
|
| 1881 |
source = { registry = "https://pypi.org/simple" }
|
| 1882 |
dependencies = [
|
| 1883 |
+
{ name = "nvidia-cublas-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1884 |
]
|
| 1885 |
wheels = [
|
| 1886 |
{ url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
|
|
|
|
| 1891 |
version = "11.3.3.83"
|
| 1892 |
source = { registry = "https://pypi.org/simple" }
|
| 1893 |
dependencies = [
|
| 1894 |
+
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1895 |
]
|
| 1896 |
wheels = [
|
| 1897 |
{ url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
|
|
|
|
| 1918 |
version = "11.7.3.90"
|
| 1919 |
source = { registry = "https://pypi.org/simple" }
|
| 1920 |
dependencies = [
|
| 1921 |
+
{ name = "nvidia-cublas-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1922 |
+
{ name = "nvidia-cusparse-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1923 |
+
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1924 |
]
|
| 1925 |
wheels = [
|
| 1926 |
{ url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
|
|
|
|
| 1931 |
version = "12.5.8.93"
|
| 1932 |
source = { registry = "https://pypi.org/simple" }
|
| 1933 |
dependencies = [
|
| 1934 |
+
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 1935 |
]
|
| 1936 |
wheels = [
|
| 1937 |
{ url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
|
|
|
|
| 2189 |
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
|
| 2190 |
]
|
| 2191 |
|
| 2192 |
+
[[package]]
|
| 2193 |
+
name = "pandas"
|
| 2194 |
+
version = "3.0.3"
|
| 2195 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2196 |
+
dependencies = [
|
| 2197 |
+
{ name = "numpy" },
|
| 2198 |
+
{ name = "python-dateutil" },
|
| 2199 |
+
{ name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
|
| 2200 |
+
]
|
| 2201 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f8/87/4341c6252d1c47b08768c3d25ac487362bf403f0313ddae4a2a26c9b1b4c/pandas-3.0.3.tar.gz", hash = "sha256:696a4a00a2a2a35d4e5deb3fc946641b96c944f02230e4f76137fe35d806c4fc", size = 4651414, upload-time = "2026-05-11T18:54:29.21Z" }
|
| 2202 |
+
wheels = [
|
| 2203 |
+
{ url = "https://files.pythonhosted.org/packages/24/f1/392f8c5bfc16f66a0d2d41561c01627c228fe7ed2a0d056ef11315042570/pandas-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fed2ff7fd9779120e388e285fc029bd5cf9490cdd2e4166a9ee22c0e49a9ab09", size = 10357846, upload-time = "2026-05-11T18:52:36.143Z" },
|
| 2204 |
+
{ url = "https://files.pythonhosted.org/packages/cf/3d/b16412745651e855f357e5e66930248688378853a6e2698a214e331fba1f/pandas-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b168fc218fd80a6cbdbdbc1a97ddc7889ed057d7eb45f50d866ceab5f39904c4", size = 9899550, upload-time = "2026-05-11T18:52:38.976Z" },
|
| 2205 |
+
{ url = "https://files.pythonhosted.org/packages/31/a8/fa2535168fffcedf67f4f6de28d2dd903a747ca7c8ea6989451aaeb3a92f/pandas-3.0.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0383c72c75cdcca61a9e116e611143902dbfd08bff356829c2f6d1cf40a9ca8c", size = 10412965, upload-time = "2026-05-11T18:52:41.915Z" },
|
| 2206 |
+
{ url = "https://files.pythonhosted.org/packages/65/b6/09b01cdbc15224e2850365192d17b7bdebb8bdbd8780ed221fcdf0d9a515/pandas-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6dc0b3fd2169c9157deed50b4d519553a3655c8c6a96027136d654592be973a9", size = 10894600, upload-time = "2026-05-11T18:52:45.02Z" },
|
| 2207 |
+
{ url = "https://files.pythonhosted.org/packages/c9/a4/2eb28f2fccb4ced4a2c79ab2a5dee9ade1ebf44922ebad6fea158c9f95d4/pandas-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e65d5407dc0b394f509699650e4a2ec01c0514f21850f453fa60f3be79a5dbf", size = 11422824, upload-time = "2026-05-11T18:52:48.058Z" },
|
| 2208 |
+
{ url = "https://files.pythonhosted.org/packages/f8/45/830bb57f533a4604b355e07edcb8ea18cf88b5f94e5fca92f27052d7c597/pandas-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8894dc474d648fe7b6ff0ca9b0bd73950d19952bc1a6534540762c5d79d305c", size = 11950889, upload-time = "2026-05-11T18:52:50.905Z" },
|
| 2209 |
+
{ url = "https://files.pythonhosted.org/packages/b9/c5/fc1b368f303087d20e8c9bf3d6ceb186263cfac0ade735cd938538bea839/pandas-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:c7be265b62cef88e253a941e4698604973736dcfe242fdb5198f0f7bc473cdcc", size = 9755463, upload-time = "2026-05-11T18:52:53.386Z" },
|
| 2210 |
+
{ url = "https://files.pythonhosted.org/packages/86/bd/fda8f9705b1b09c6ebe14bfc0fa0e4ec8584d54ea673628f157ff55131af/pandas-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:557409bc4178e70ee8d9ddb494798e51ebf6ea59330f6be22c51bab2a7db6c49", size = 9066158, upload-time = "2026-05-11T18:52:56.038Z" },
|
| 2211 |
+
{ url = "https://files.pythonhosted.org/packages/c5/90/62d8302883c44308c477e222c3daf7c813a34c8e96985882fbd53d964352/pandas-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:67b3b64c11910cfa29f4e94a14d3bff9ee693b6fc76055e7cad549cee0aec5fa", size = 10331071, upload-time = "2026-05-11T18:52:58.838Z" },
|
| 2212 |
+
{ url = "https://files.pythonhosted.org/packages/7f/ae/6a6493c783a101f165e4356953ba3c74d6f77f0042fa7d753da9dfbb640c/pandas-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39436b377d56d2a2e52d0395bdbee171f01068e99af5250509aceeb929f765c7", size = 9875690, upload-time = "2026-05-11T18:53:01.431Z" },
|
| 2213 |
+
{ url = "https://files.pythonhosted.org/packages/62/7c/5df8e9f56c69a2769fbe9382a5ef8f2658c007e376434e1e2cbb57ad895f/pandas-3.0.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4be06d68f9ddcfc645b87534911da79a8fbffc7573c80e0edcf42a5020624d8", size = 10381634, upload-time = "2026-05-11T18:53:04.393Z" },
|
| 2214 |
+
{ url = "https://files.pythonhosted.org/packages/99/68/1237369725aa617bb358263d535803e3053fdbc593513ec5ed9c9896b5b6/pandas-3.0.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a4eeb6830daf35a71cc09649bd823e2b542dac246cdee9614c6e4bd65028cd6a", size = 10891243, upload-time = "2026-05-11T18:53:07.643Z" },
|
| 2215 |
+
{ url = "https://files.pythonhosted.org/packages/25/93/77d108e8af7222b4a503ebde0e30215b1c2e4f8e53a526431890f22d5586/pandas-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1928e07221f82db493cd4af1e23c1bfca524a19a4699887975bff68f49a72bfb", size = 11388659, upload-time = "2026-05-11T18:53:10.634Z" },
|
| 2216 |
+
{ url = "https://files.pythonhosted.org/packages/d0/bd/eff5b4399f332ac386c853f6cd2bd3fa2ca0061b9f36ecd9c4d7c4265649/pandas-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51b1fe551acb77dac643c6fda86084d8d446c10fe64b06a9cc29c4cc8540e7f2", size = 11942880, upload-time = "2026-05-11T18:53:13.536Z" },
|
| 2217 |
+
{ url = "https://files.pythonhosted.org/packages/2c/20/559ace4200982c3887d0b86bfd0d856a2143ef8ddab63cc07934951a964c/pandas-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:a82d532a3351d435432cd913edbccaf8b8e01d4dd0e5ced5a8d2e8ecd94c7e44", size = 9757091, upload-time = "2026-05-11T18:53:16.306Z" },
|
| 2218 |
+
{ url = "https://files.pythonhosted.org/packages/3a/66/69055a09fe200f29f922a3eeec4804611900b95f52d932ece3393c3c0c19/pandas-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:275c14e0fce14a2ec20eee474aecd305478ea3c1e6f6a9d8fe219a165542717e", size = 9057282, upload-time = "2026-05-11T18:53:18.768Z" },
|
| 2219 |
+
{ url = "https://files.pythonhosted.org/packages/57/0e/efe801b0e6811e8e650cd21b7f2608e30f08a7067e2bf6e8752b0d56ee3c/pandas-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:46997386d528eb40376ecd6b033cf4a8a1e5282580f68f43de875b78cba2199d", size = 10767016, upload-time = "2026-05-11T18:53:21.227Z" },
|
| 2220 |
+
{ url = "https://files.pythonhosted.org/packages/ea/dc/eb55135a1d5f0f0519f28da1f609a206d2cad1f9c35c32d51e38dd7261ae/pandas-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:261e308dfb22448384b7580cf719d2f998fe2966c92893c3e77d14008af1f066", size = 10420210, upload-time = "2026-05-11T18:53:23.982Z" },
|
| 2221 |
+
{ url = "https://files.pythonhosted.org/packages/c6/3e/b1d5d955ce33ffecb407465a60bc32769d74fcf68224b7ae67ae11d4dea4/pandas-3.0.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd1a5d1def6a46002e964510bdc67c368aa0951df5d1d9f8365336f5a1f490cd", size = 10336126, upload-time = "2026-05-11T18:53:26.731Z" },
|
| 2222 |
+
{ url = "https://files.pythonhosted.org/packages/f5/76/a01261711ab60a22d71b862f0de20e4c504bf80457270ad8cb42110f6abc/pandas-3.0.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d72828c20c6d6e83e1e22a6a3b47b326b71664112fa9705dcbccfd7a39b62085", size = 10728051, upload-time = "2026-05-11T18:53:29.125Z" },
|
| 2223 |
+
{ url = "https://files.pythonhosted.org/packages/e9/21/ea191195e587b18cf682e97f433f81b2d0fbe341380e80a3e0d6e4403c8e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d26cbe1fcfc12e8fd900e2454163e466b2d3af84f7c75481df7683ffc073d870", size = 11350796, upload-time = "2026-05-11T18:53:32.056Z" },
|
| 2224 |
+
{ url = "https://files.pythonhosted.org/packages/64/69/f0eaaf54939f0e8c6768fd06be9af2cef9b36048b96dfb9e1b2c685a807e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e91cec1879ada0624fc3dc9953c5cbd60208e59c0db28f540c5d6d47502422f", size = 11799741, upload-time = "2026-05-11T18:53:34.985Z" },
|
| 2225 |
+
{ url = "https://files.pythonhosted.org/packages/45/a4/865e0e510cae5fc2194de4db28be638952de942571ba9125934fd9c01d47/pandas-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:08d789b41f87e0905880e293cedf6197ce71fe67cc081358b1e148a491b9bd13", size = 10499958, upload-time = "2026-05-11T18:53:37.857Z" },
|
| 2226 |
+
{ url = "https://files.pythonhosted.org/packages/86/54/effdcc3c0ff7a08037889200e148ebe94c16c4f653be078c7b3675955df1/pandas-3.0.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3650109c0f22879df8bd6179ab9ee3d7f1d1d4e7e0094a3f0032d9f51e2e64ac", size = 10336065, upload-time = "2026-05-11T18:53:41.099Z" },
|
| 2227 |
+
{ url = "https://files.pythonhosted.org/packages/68/10/bf2d6738d72748b961a3751ab89522d58c54efc36a8e1a12161216cd45cf/pandas-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bab900348131a7db1f69a7309ef141fd5680f1487094193bcbbb61791573bf8f", size = 9926101, upload-time = "2026-05-11T18:53:43.515Z" },
|
| 2228 |
+
{ url = "https://files.pythonhosted.org/packages/ae/e9/e35cf11c8a136e757b956f5f0efdcaa50aecde85ea055f1898dfc68262f3/pandas-3.0.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba7e08b9ac1d54569cd1e256e3668975ed624d6826f7b68df0342b012007bddb", size = 10457553, upload-time = "2026-05-11T18:53:46.394Z" },
|
| 2229 |
+
{ url = "https://files.pythonhosted.org/packages/58/3b/1cdec6772bdbaf7b25dab360c59f03cadf05492dd724c6540af905389b07/pandas-3.0.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d71c63ae4ebdbf70209742096f1fc46a83a0613c99d4b23766cced9ff8cd62a", size = 10914065, upload-time = "2026-05-11T18:53:49.134Z" },
|
| 2230 |
+
{ url = "https://files.pythonhosted.org/packages/c4/c2/1ef644445fcd72e3627bceec77e3560636f87ddce4ed841afe76b83b5bf9/pandas-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e3a2ec42c98ffa2565a67e08e218d06d72576d758d90facb7c00805194d8f360", size = 11459188, upload-time = "2026-05-11T18:53:52.527Z" },
|
| 2231 |
+
{ url = "https://files.pythonhosted.org/packages/7e/49/4d8d4f42cbc9c4adc7a1870f269c02cbd6cd40d059622c06fb298addcbad/pandas-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:335f62418ed562cfc3c49e9e196375c28b729dcef8543abf4f9438e381bf3c76", size = 11982966, upload-time = "2026-05-11T18:53:55.043Z" },
|
| 2232 |
+
{ url = "https://files.pythonhosted.org/packages/38/55/792619469bab9882d8bbd5865d45a72f6478762d04a9af4bf0d08c503e95/pandas-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:3c20a521bbb85902f79f7270c80a59e1b5452d96d170c034f207181870f97ac5", size = 9876755, upload-time = "2026-05-11T18:53:58.067Z" },
|
| 2233 |
+
{ url = "https://files.pythonhosted.org/packages/2a/af/33c469653b0ba03b50c3a98192d4c07f0c75c66b263ceb097fce0ee97d31/pandas-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:a2d2dff8a04f3917b55ab3910c32990f8ddf7eceba114947838cefa976a68977", size = 9198658, upload-time = "2026-05-11T18:54:00.733Z" },
|
| 2234 |
+
{ url = "https://files.pythonhosted.org/packages/a2/fa/b8c257bd76b8bd060c3a9151c1fca05e9b9c5e3af5d0f549c0356f6d143d/pandas-3.0.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:0d589105b3c14645af1738ff279b2995102d8f7a03b0a66dc8d95550eb513e04", size = 10787242, upload-time = "2026-05-11T18:54:03.564Z" },
|
| 2235 |
+
{ url = "https://files.pythonhosted.org/packages/54/eb/f19206ffb0bf1919002969aa448b4702c6594845156a6f8050674855aac3/pandas-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:13fc1e853d9e04743d11ba75a985ccbc2a317fe07d8af61e445a6fd24dacd6a6", size = 10436369, upload-time = "2026-05-11T18:54:06.311Z" },
|
| 2236 |
+
{ url = "https://files.pythonhosted.org/packages/fd/24/c7c39fb4fe22b71a0c2d78bf0c585c600092d85f94f086d2b3b2f6ca27e2/pandas-3.0.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:819959dab7bbd0049c15623fbac4e29a191b9528160a61fb1032242d8ced2d9c", size = 10358306, upload-time = "2026-05-11T18:54:09.085Z" },
|
| 2237 |
+
{ url = "https://files.pythonhosted.org/packages/16/ec/dd2a9eb7fa1204df88c0864164e35b228ac581062ac612ba0a67fd812e4c/pandas-3.0.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:60ae316d3fd75d1858d450d0db0103ea2be3e7d4a95ec2f064f7e2ae63f7b028", size = 10758394, upload-time = "2026-05-11T18:54:11.956Z" },
|
| 2238 |
+
{ url = "https://files.pythonhosted.org/packages/95/6e/00c61ea8e85b4f6d8d35e11852a1a4998fc7fafc91c6a602d1cc9c972d64/pandas-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd3a518890b400d32f9023722dc9a9a5c969f00b415419a3c06c043f09bb5d7d", size = 11375717, upload-time = "2026-05-11T18:54:14.539Z" },
|
| 2239 |
+
{ url = "https://files.pythonhosted.org/packages/31/89/8fc1c268969fac43688d65fd92e67df24bd128d53cb4d2eee534cd307399/pandas-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c39be2d709d01fa972a0cabc522389fceca4f3969332ba25a7d6c5802cf976a", size = 11828897, upload-time = "2026-05-11T18:54:17.146Z" },
|
| 2240 |
+
{ url = "https://files.pythonhosted.org/packages/56/3b/e7d20dea247a3e6dc0bd8a6953854afbedc03951def4e7371e05e7263e25/pandas-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4db8c527972a821cf5286b40ccc57642a39bc62e62022b42f99f8a67fca8c3a1", size = 10900855, upload-time = "2026-05-11T18:54:19.72Z" },
|
| 2241 |
+
{ url = "https://files.pythonhosted.org/packages/0f/54/68a0978d1ef8502b8492099beaa6e7a0c1b32e3b5d4f677f5810cb08711c/pandas-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b2c95f8bfc1ee412bf482605d7bfd30c12d1d26bd59fdd91efeef1d4718decb1", size = 9466464, upload-time = "2026-05-11T18:54:22.754Z" },
|
| 2242 |
+
]
|
| 2243 |
+
|
| 2244 |
+
[[package]]
|
| 2245 |
+
name = "peewee"
|
| 2246 |
+
version = "4.0.6"
|
| 2247 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2248 |
+
sdist = { url = "https://files.pythonhosted.org/packages/9f/09/a3b2a32ce498f405dce4320267e99b1b076c1ea39ad01151a353bc7f81d7/peewee-4.0.6.tar.gz", hash = "sha256:ea2f78f24ff9e3660281dc5b0be8bc00d9a9514bdc40c98e416fcd042b66ac6a", size = 724591, upload-time = "2026-05-20T13:18:17.26Z" }
|
| 2249 |
+
wheels = [
|
| 2250 |
+
{ url = "https://files.pythonhosted.org/packages/69/6a/e1455b94ee48f5666f2e7831b6247098794bfe9747da457111be4d0bea10/peewee-4.0.6-py3-none-any.whl", hash = "sha256:5fa665913c410f0b5faef1469ed0aa9eceb9fef262665ebbb6f29408f826eeeb", size = 146222, upload-time = "2026-05-20T13:18:15.694Z" },
|
| 2251 |
+
]
|
| 2252 |
+
|
| 2253 |
[[package]]
|
| 2254 |
name = "pillow"
|
| 2255 |
version = "12.2.0"
|
|
|
|
| 2762 |
{ url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
|
| 2763 |
]
|
| 2764 |
|
| 2765 |
+
[[package]]
|
| 2766 |
+
name = "pytz"
|
| 2767 |
+
version = "2026.2"
|
| 2768 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2769 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ff/46/dd499ec9038423421951e4fad73051febaa13d2df82b4064f87af8b8c0c3/pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a", size = 320861, upload-time = "2026-05-04T01:35:29.667Z" }
|
| 2770 |
+
wheels = [
|
| 2771 |
+
{ url = "https://files.pythonhosted.org/packages/ec/dd/96da98f892250475bdf2328112d7468abdd4acc7b902b6af23f4ed958ea0/pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126", size = 510141, upload-time = "2026-05-04T01:35:27.408Z" },
|
| 2772 |
+
]
|
| 2773 |
+
|
| 2774 |
[[package]]
|
| 2775 |
name = "pyyaml"
|
| 2776 |
version = "6.0.3"
|
|
|
|
| 3223 |
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
| 3224 |
]
|
| 3225 |
|
| 3226 |
+
[[package]]
|
| 3227 |
+
name = "soupsieve"
|
| 3228 |
+
version = "2.8.4"
|
| 3229 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3230 |
+
sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" }
|
| 3231 |
+
wheels = [
|
| 3232 |
+
{ url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" },
|
| 3233 |
+
]
|
| 3234 |
+
|
| 3235 |
[[package]]
|
| 3236 |
name = "sqlalchemy"
|
| 3237 |
version = "2.0.49"
|
|
|
|
| 3549 |
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
|
| 3550 |
]
|
| 3551 |
|
| 3552 |
+
[[package]]
|
| 3553 |
+
name = "tzdata"
|
| 3554 |
+
version = "2026.2"
|
| 3555 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3556 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ba/19/1b9b0e29f30c6d35cb345486df41110984ea67ae69dddbc0e8a100999493/tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10", size = 198254, upload-time = "2026-04-24T15:22:08.651Z" }
|
| 3557 |
+
wheels = [
|
| 3558 |
+
{ url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321, upload-time = "2026-04-24T15:22:05.876Z" },
|
| 3559 |
+
]
|
| 3560 |
+
|
| 3561 |
[[package]]
|
| 3562 |
name = "urllib3"
|
| 3563 |
version = "2.7.0"
|
|
|
|
| 3931 |
{ url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" },
|
| 3932 |
]
|
| 3933 |
|
| 3934 |
+
[[package]]
|
| 3935 |
+
name = "yfinance"
|
| 3936 |
+
version = "1.4.0"
|
| 3937 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3938 |
+
dependencies = [
|
| 3939 |
+
{ name = "beautifulsoup4" },
|
| 3940 |
+
{ name = "curl-cffi" },
|
| 3941 |
+
{ name = "multitasking" },
|
| 3942 |
+
{ name = "numpy" },
|
| 3943 |
+
{ name = "pandas" },
|
| 3944 |
+
{ name = "peewee" },
|
| 3945 |
+
{ name = "platformdirs" },
|
| 3946 |
+
{ name = "protobuf" },
|
| 3947 |
+
{ name = "pytz" },
|
| 3948 |
+
{ name = "requests" },
|
| 3949 |
+
{ name = "websockets" },
|
| 3950 |
+
]
|
| 3951 |
+
sdist = { url = "https://files.pythonhosted.org/packages/21/e2/b81f9cac78f1c23e444164f2135e19f849a66774474f8b156fc3702280c3/yfinance-1.4.0.tar.gz", hash = "sha256:6b049c3f28b0d66be54c32d84838ffd60c429277ba378afb0202c4792013c911", size = 153715, upload-time = "2026-05-23T16:28:08.961Z" }
|
| 3952 |
+
wheels = [
|
| 3953 |
+
{ url = "https://files.pythonhosted.org/packages/95/58/31561402a60d317f9c36288223be99eabedc25b61f18d0b69f0889726545/yfinance-1.4.0-py2.py3-none-any.whl", hash = "sha256:6513654be21bd80a4e9e4e24193255fb4b1921618443113826494bf6efcedcb0", size = 137749, upload-time = "2026-05-23T16:28:07.656Z" },
|
| 3954 |
+
]
|
| 3955 |
+
|
| 3956 |
[[package]]
|
| 3957 |
name = "zipp"
|
| 3958 |
version = "3.23.1"
|