| |
| |
|
|
| import os |
| from pathlib import Path |
| from dotenv import load_dotenv |
|
|
| load_dotenv() |
|
|
| |
# Filesystem layout. ROOT_DIR is the parent of the directory that contains
# this file (after symlink resolution); every other path hangs off it.
ROOT_DIR = Path(__file__).resolve().parent.parent

DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

# Generated artifacts, all located under the processed-data directory.
CHROMA_DIR = PROCESSED_DATA_DIR.joinpath("chroma_store")
DUCKDB_PATH = PROCESSED_DATA_DIR.joinpath("financials.duckdb")
BM25_INDEX_PATH = PROCESSED_DATA_DIR.joinpath("bm25_index.pkl")
|
|
| |
# LLM provider selection, read from the environment.
# The value is trimmed and lower-cased so it can be used directly as a key
# into _DEFAULT_MODELS. An unset, empty, or whitespace-only LLM_PROVIDER
# (e.g. a bare `LLM_PROVIDER=` line in a .env file) falls back to "openai".
LLM_PROVIDER = (os.getenv("LLM_PROVIDER") or "").strip().lower() or "openai"

# Provider credentials; each defaults to an empty string when unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

# Model used for each known provider when LLM_MODEL is not set explicitly.
_DEFAULT_MODELS = {
    "openai": "gpt-4o-mini",
    "anthropic": "claude-sonnet-4-20250514",
    "groq": "llama-3.3-70b-versatile",
}

# An explicit, non-empty LLM_MODEL wins; otherwise use the provider's default.
# Providers missing from _DEFAULT_MODELS fall back to "gpt-4o-mini".
LLM_MODEL = os.getenv("LLM_MODEL") or _DEFAULT_MODELS.get(LLM_PROVIDER, "gpt-4o-mini")
|
|
| |
| |
# Embedding model name. `or` (rather than a getenv default) means an
# empty EMBEDDING_MODEL value is treated the same as an unset one.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL") or "all-MiniLM-L6-v2"
|
|
| |
# Retrieval tuning knobs, overridable through the environment.
# `getenv(...) or default` is used instead of a getenv default so that a
# variable that is present but empty (e.g. `TOP_K_RESULTS=` in a .env file)
# falls back to the default rather than raising ValueError at import time.
# Non-empty values that are not valid numbers still raise ValueError.
SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD") or "0.35")
TOP_K_RESULTS = int(os.getenv("TOP_K_RESULTS") or "10")
RERANK_TOP_N = int(os.getenv("RERANK_TOP_N") or "5")

# Chunking parameters are fixed in code (not read from the environment).
# NOTE(review): units (characters vs. tokens) are not visible here — confirm
# against the code that performs the splitting.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 100
|
|
| |
# Logging verbosity, from the environment. The value is trimmed and
# upper-cased because the standard `logging` module only recognises level
# names in upper case ("info" would be rejected). Unset or empty -> "INFO".
LOG_LEVEL = (os.getenv("LOG_LEVEL") or "").strip().upper() or "INFO"
|
|
| |
# Plain-text summary of the two data stores: the structured tables (DuckDB)
# and the unstructured article index (ChromaDB + BM25).
# NOTE(review): presumably embedded in LLM prompts — confirm against callers.
# NOTE: the text contains non-ASCII characters ("→" and "–"); this file must
# be saved and read as UTF-8 or they degrade into mojibake.
DATASET_DESCRIPTION = """
Available structured data (DuckDB → SQL queries):
Table: companies
Columns: company_name, ticker, sector, revenue_mn (in millions USD),
net_income_mn, total_assets_mn, market_cap_mn, year, quarter
Coverage: ~5,000 publicly traded US companies, 2020–2024 quarterly data

Available unstructured data (ChromaDB + BM25 → semantic & keyword search):
Source: Financial news articles and analyst reports
Fields per chunk: text, source, date, companies_mentioned, sector
Coverage: ~100,000 articles from 2020–2024
Topics: earnings, market analysis, sector trends, M&A, analyst opinions
""".strip()
|
|