Spaces:
Sleeping
Sleeping
| """ | |
| src/rag/loader.py | |
| Builds a FAISS index from Investopedia articles + FinDER dataset. | |
| Run once: | |
| uv run python -m src.rag.loader | |
| """ | |
| import os | |
| import time | |
| import json | |
| import yaml | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from datasets import load_dataset | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_openai import OpenAIEmbeddings | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| # ── Config ──────────────────────────────────────────────────────────────── | |
# Load the project settings once at import time. The encoding is pinned so the
# file parses identically regardless of the platform's default locale.
with open("config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# API key handed to OpenAIEmbeddings; None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# RAG settings.
INDEX_PATH = cfg["rag"]["index_path"]        # directory the FAISS index is saved to / loaded from
RAW_CACHE = cfg["rag"]["raw_cache"]          # JSON file caching the scraped articles
CHUNK_SIZE = cfg["rag"]["chunk_size"]        # chunk_size passed to the text splitter
CHUNK_OVERLAP = cfg["rag"]["chunk_overlap"]  # chunk_overlap passed to the text splitter

# Pause (seconds) between article requests while scraping.
# NOTE(review): read from the "market" section rather than "rag" — confirm
# that sharing this setting is intentional.
DELAY = cfg["market"]["delay_seconds"]

# Browser-like User-Agent sent with every scrape request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
| # ── Investopedia URLs ───────────────────────────────────────────────────── | |
# (title, url) pairs of the Investopedia pages to scrape, grouped by topic.
# Every pair must be unique: a repeated pair is scraped and indexed twice,
# which puts duplicate chunks into the retrieval results.
ARTICLES = [
    # Finance Q&A
    ("What Is Investing?", "https://www.investopedia.com/terms/i/investing.asp"),
    ("Dollar-Cost Averaging", "https://www.investopedia.com/terms/d/dollarcostaveraging.asp"),
    ("Compound Interest", "https://www.investopedia.com/terms/c/compoundinterest.asp"),
    ("Diversification", "https://www.investopedia.com/terms/d/diversification.asp"),
    ("Risk Tolerance", "https://www.investopedia.com/terms/r/risktolerance.asp"),
    ("Index Funds", "https://www.investopedia.com/terms/i/indexfund.asp"),
    ("ETF vs Mutual Fund", "https://www.investopedia.com/articles/exchangetradedfunds/08/etf-mutual-fund-difference.asp"),
    ("Stocks vs Bonds", "https://www.investopedia.com/ask/answers/09/difference-between-bond-stock-market.asp"),
    ("Liquidity", "https://www.investopedia.com/terms/l/liquidity.asp"),
    ("Bull vs Bear Market", "https://www.investopedia.com/insights/digging-deeper-bull-and-bear-markets/"),
    ("Time Value of Money", "https://www.investopedia.com/terms/t/timevalueofmoney.asp"),
    ("Inflation", "https://www.investopedia.com/terms/i/inflation.asp"),
    # Portfolio
    ("Asset Allocation", "https://www.investopedia.com/terms/a/assetallocation.asp"),
    ("Portfolio Rebalancing", "https://www.investopedia.com/terms/r/rebalancing.asp"),
    ("Modern Portfolio Theory", "https://www.investopedia.com/terms/m/modernportfoliotheory.asp"),
    ("Beta", "https://www.investopedia.com/terms/b/beta.asp"),
    ("Alpha", "https://www.investopedia.com/terms/a/alpha.asp"),
    ("Sharpe Ratio", "https://www.investopedia.com/terms/s/sharperatio.asp"),
    ("Standard Deviation", "https://www.investopedia.com/terms/s/standarddeviation.asp"),
    ("Correlation", "https://www.investopedia.com/terms/c/correlation.asp"),
    ("How to Build a Portfolio", "https://www.investopedia.com/articles/basics/06/invest1000.asp"),
    ("Expense Ratio", "https://www.investopedia.com/terms/e/expenseratio.asp"),
    # Market
    ("P/E Ratio", "https://www.investopedia.com/terms/p/price-earningsratio.asp"),
    ("Market Capitalization", "https://www.investopedia.com/terms/m/marketcapitalization.asp"),
    ("EPS", "https://www.investopedia.com/terms/e/eps.asp"),
    ("Dividend Yield", "https://www.investopedia.com/terms/d/dividendyield.asp"),
    ("P/B Ratio", "https://www.investopedia.com/terms/p/price-to-bookratio.asp"),
    ("Moving Averages", "https://www.investopedia.com/terms/m/movingaverage.asp"),
    ("Support and Resistance", "https://www.investopedia.com/trading/support-and-resistance-basics/"),
    ("Volume in Stock Trading", "https://www.investopedia.com/terms/v/volume.asp"),
    ("Market Sentiment", "https://www.investopedia.com/terms/m/marketsentiment.asp"),
    # Goal Planning
    ("Financial Goals", "https://www.investopedia.com/terms/f/financial_plan.asp"),
    ("Emergency Fund", "https://www.investopedia.com/terms/e/emergency_fund.asp"),
    ("Retirement Planning", "https://www.investopedia.com/terms/r/retirement-planning.asp"),
    ("Rule of 72", "https://www.investopedia.com/terms/r/ruleof72.asp"),
    ("Net Worth", "https://www.investopedia.com/terms/n/networth.asp"),
    ("Saving vs Investing", "https://www.investopedia.com/articles/investing/022516/saving-vs-investing-understanding-key-differences.asp"),
    ("Risk vs Reward", "https://www.investopedia.com/terms/r/riskreturntradeoff.asp"),
    ("SMART Financial Goals", "https://www.investopedia.com/articles/personal-finance/100516/setting-financial-goals/"),
    # News
    ("How Interest Rates Affect Markets", "https://www.investopedia.com/articles/stocks/09/how-interest-rates-affect-markets.asp"),
    ("How the Stock Market Works", "https://www.investopedia.com/articles/investing/082614/how-stock-market-works.asp"),
    ("Federal Reserve", "https://www.investopedia.com/terms/f/federalreservebank.asp"),
    ("GDP", "https://www.investopedia.com/ask/answers/what-is-gdp-why-its-important-to-economists-investors/"),
    # Tax
    ("Capital Gains Tax", "https://www.investopedia.com/terms/c/capital_gains_tax.asp"),
    ("401(k)", "https://www.investopedia.com/terms/1/401kplan.asp"),
    ("IRA", "https://www.investopedia.com/terms/i/ira.asp"),
    ("Roth IRA", "https://www.investopedia.com/terms/r/rothira.asp"),
    ("Tax-Loss Harvesting", "https://www.investopedia.com/terms/t/taxgainlossharvesting.asp"),
    ("Roth vs Traditional IRA", "https://www.investopedia.com/retirement/roth-vs-traditional-ira-which-is-right-for-you/"),
    ("52-Week High and Low", "https://www.investopedia.com/terms/1/52weekhighlow.asp"),
    # Derivatives
    ("Swaps", "https://www.investopedia.com/terms/s/swap.asp"),
    ("Options", "https://www.investopedia.com/terms/o/option.asp"),
    ("Futures", "https://www.investopedia.com/terms/f/futures.asp"),
    # Personal Finance
    # NOTE(review): "How to Save Money" points at the same URL as
    # "SMART Financial Goals" above — confirm the intended article.
    ("How to Save Money", "https://www.investopedia.com/articles/personal-finance/100516/setting-financial-goals/"),
    ("Budgeting Basics", "https://www.investopedia.com/terms/b/budget.asp"),
    ("50/30/20 Rule", "https://www.investopedia.com/ask/answers/022916/what-502030-budget-rule.asp"),
    ("Credit Score", "https://www.investopedia.com/terms/c/credit_score.asp"),
    ("Debt Management", "https://www.investopedia.com/terms/d/debtmanagement.asp"),
    ("Credit Cards", "https://www.investopedia.com/terms/c/creditcard.asp"),
    ("Repurchase Agreement (Repo)", "https://www.investopedia.com/terms/r/repurchaseagreement.asp"),
    ("Real Estate Investing", "https://www.investopedia.com/terms/r/realestate.asp"),
    ("REITs", "https://www.investopedia.com/terms/r/reit.asp"),
    ("How to Buy a Home", "https://www.investopedia.com/articles/mortgages-real-estate/08/first-time-homebuyer-tips.asp"),
    ("Mortgage Basics", "https://www.investopedia.com/terms/m/mortgage.asp"),
    ("Art as an Investment", "https://www.investopedia.com/articles/pf/08/fine-art.asp"),
    ("Commodities", "https://www.investopedia.com/terms/c/commodity.asp"),
    ("Gold as Investment", "https://www.investopedia.com/articles/basics/09/precious-metals-gold-silver-platinum.asp"),
    ("Cryptocurrency Basics", "https://www.investopedia.com/terms/c/cryptocurrency.asp"),
    # "Retirement Planning" is already listed under Goal Planning; the exact
    # duplicate that used to sit here was removed.
    ("Social Security", "https://www.investopedia.com/terms/s/socialsecurity.asp"),
    ("Medicare Basics", "https://www.investopedia.com/terms/m/medicare.asp"),
    ("Required Minimum Distributions", "https://www.investopedia.com/terms/r/requiredminimumdistribution.asp"),
    ("Annuities", "https://www.investopedia.com/terms/a/annuity.asp"),
    ("When to Retire", "https://www.investopedia.com/articles/retirement/when-can-you-retire.asp"),
    ("4% Rule", "https://www.investopedia.com/terms/f/four-percent-rule.asp"),
    ("Safe Withdrawal Rate", "https://www.investopedia.com/ask/answers/05/retirementmoneylast.asp"),
    ("Sequence of Returns Risk", "https://www.investopedia.com/terms/s/sequence-risk.asp"),
    ("Retirement Withdrawal Strategies", "https://www.investopedia.com/retirement/retirement-income-planning/"),
    ("Life Insurance", "https://www.investopedia.com/terms/l/lifeinsurance.asp"),
    ("Health Insurance Basics", "https://www.investopedia.com/terms/h/healthinsurance.asp"),
    ("Disability Insurance", "https://www.investopedia.com/terms/d/disability-insurance.asp"),
    ("Student Loans", "https://www.investopedia.com/terms/s/student-debt.asp"),
    ("How to Get Out of Debt", "https://www.investopedia.com/articles/pf/how-to-get-out-of-debt.asp"),
    ("Good Debt vs Bad Debt", "https://www.investopedia.com/articles/pf/12/good-debt-bad-debt.asp"),
    ("Wills and Trusts", "https://www.investopedia.com/terms/w/will.asp"),
    ("Estate Planning Basics", "https://www.investopedia.com/terms/e/estateplanning.asp"),
]
| # ── Investopedia Scraper ────────────────────────────────────────────────── | |
def scrape_article(title: str, url: str) -> dict | None:
    """Download one article page and extract its readable text.

    Progress and diagnostics are printed to stdout.

    Args:
        title: Label stored with the article and shown in the progress line.
        url: Page to fetch.

    Returns:
        A dict with the keys ``title``, ``url``, ``text`` and ``source``, or
        ``None`` when the request fails, no article container is found, or
        the extracted text is shorter than 300 characters.
    """
    print(f" [{title}]")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f" ERROR: {exc}")
        return None

    page = BeautifulSoup(response.text, "html.parser")

    # Remove markup that carries no article prose before extracting text.
    boilerplate_tags = ["script", "style", "nav", "footer", "aside", "form", "header"]
    for element in page(boilerplate_tags):
        element.decompose()

    # Try the candidate containers from most to least specific.
    container = page.find("article")
    if not container:
        container = page.find(class_="article-body-content")
    if not container:
        container = page.find(id="article-body")
    if not container:
        container = page.find("main")
    if not container:
        print(" WARNING: no body found")
        return None

    text = container.get_text(separator="\n", strip=True)
    char_count = len(text)
    if char_count < 300:
        # Very short pages are treated as failed extractions.
        print(f" WARNING: too short ({char_count} chars)")
        return None

    return {"title": title, "url": url, "text": text, "source": "investopedia"}
def load_investopedia() -> list[dict]:
    """Return the scraped Investopedia articles, using the JSON cache if present.

    When ``RAW_CACHE`` exists it is loaded and returned as-is. Otherwise every
    entry of ``ARTICLES`` is scraped (pausing ``DELAY`` seconds after each
    request) and the successful results are written to ``RAW_CACHE``.

    Returns:
        A list of article dicts as produced by ``scrape_article``.
    """
    if os.path.exists(RAW_CACHE):
        print(f"Loading cached articles from {RAW_CACHE}")
        with open(RAW_CACHE, encoding="utf-8") as f:
            return json.load(f)

    print(f"Scraping {len(ARTICLES)} Investopedia articles...")
    articles = []
    seen_urls = set()
    for i, (title, url) in enumerate(ARTICLES, 1):
        print(f"[{i}/{len(ARTICLES)}]", end=" ")
        # The same page listed twice would be embedded twice and crowd the
        # retrieval results with identical chunks, so fetch each URL once.
        if url in seen_urls:
            print(f" [{title}] skipped: URL already scraped")
            continue
        seen_urls.add(url)
        article = scrape_article(title, url)
        if article:
            articles.append(article)
        time.sleep(DELAY)

    if not articles:
        # Persisting an empty cache would make every later run load it and
        # never scrape again, so leave the cache absent instead.
        print("WARNING: no articles scraped; cache not written")
        return articles

    # Create the directory the cache file actually lives in, rather than
    # assuming a fixed location.
    cache_dir = os.path.dirname(RAW_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(RAW_CACHE, "w", encoding="utf-8") as f:
        json.dump(articles, f, indent=2)
    print(f"Saved {len(articles)} articles to {RAW_CACHE}")
    return articles
| # ── Investopedia Chunker ────────────────────────────────────────────────── | |
def chunk_investopedia(articles: list[dict]) -> list[Document]:
    """Split each article's text into chunks wrapped as ``Document`` objects.

    Args:
        articles: Article dicts providing the ``title``, ``url`` and ``text``
            keys.

    Returns:
        One ``Document`` per chunk, in article order. The metadata records the
        article title and URL, the fixed source tag ``"investopedia"``, and
        the chunk's position within its article.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " "],
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )

    chunked = [
        Document(
            page_content=piece,
            metadata={
                "title": entry["title"],
                "url": entry["url"],
                "source": "investopedia",
                "chunk": position,
            },
        )
        for entry in articles
        for position, piece in enumerate(text_splitter.split_text(entry["text"]))
    ]

    print(f"Investopedia: {len(chunked)} chunks from {len(articles)} articles")
    return chunked
| # ── FinDER Loader ───────────────────────────────────────────────────────── | |
def load_finder() -> list[Document]:
    """Load the FinDER train split and turn its reference passages into Documents.

    Each non-blank entry in a row's ``references`` becomes one ``Document``
    whose content is the stripped passage. Its metadata copies the row's
    ``_id`` (as ``title``), ``category``, ``type`` and ``answer``, plus the
    fixed source tag ``"finder"``.

    Returns:
        The passages in dataset order.
    """
    print("Loading FinDER dataset from HuggingFace...")
    dataset = load_dataset("Linq-AI-Research/FinDER", split="train")

    passages = []
    for record in dataset:
        for reference in record["references"]:
            body = reference.strip()
            if body:  # whitespace-only references are skipped
                passages.append(
                    Document(
                        page_content=body,
                        metadata={
                            "title": record["_id"],
                            "source": "finder",
                            "category": record["category"],
                            "type": record["type"],
                            "answer": record["answer"],
                        },
                    )
                )

    print(f"FinDER: {len(passages)} passages loaded")
    return passages
| # ── Build Index ─────────────────────────────────────────────────────────── | |
def build_index(docs: list[Document], force: bool = False) -> None:
    """Embed ``docs`` and save a FAISS index under ``INDEX_PATH``.

    Args:
        docs: Documents to embed and index.
        force: Rebuild even when ``INDEX_PATH/index.faiss`` already exists.

    Raises:
        ValueError: If a build is required but ``docs`` is empty.
    """
    index_file = os.path.join(INDEX_PATH, "index.faiss")
    if not force and os.path.exists(index_file):
        print(f"Index already exists at {INDEX_PATH}/, skipping build. Pass force=True to rebuild.")
        return

    # Fail with a clear message instead of letting the vector store choke on
    # an empty embedding list further down.
    if not docs:
        raise ValueError("Cannot build a FAISS index from an empty document list.")

    print(f"\nEmbedding {len(docs)} total chunks...")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=OPENAI_API_KEY,
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Index saved to {INDEX_PATH}/")
| # ── Smoke Test ──────────────────────────────────────────────────────────── | |
def smoke_test() -> None:
    """Load the saved index and print the top hit for a few sample queries.

    For each query the source tag and the first 100 characters of the best
    match are printed, with newlines replaced by spaces. A query that
    returns nothing is reported instead of raising.
    """
    print("\nSmoke test...")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=OPENAI_API_KEY,
    )
    # SECURITY: allow_dangerous_deserialization=True permits unpickling the
    # stored index data. Only point INDEX_PATH at an index this project built
    # itself; never load one from an untrusted source.
    vectorstore = FAISS.load_local(
        INDEX_PATH, embeddings, allow_dangerous_deserialization=True
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    queries = [
        "What is dollar cost averaging?",
        "How does a Roth IRA work?",
        "Analyze CrowdStrike revenue growth",
    ]
    for q in queries:
        results = retriever.invoke(q)
        print(f"\n Q: {q}")
        if not results:
            # An empty result list would otherwise crash on results[0].
            print(" -> (no results)")
            continue
        top = results[0]
        print(f" -> [{top.metadata['source']}] {top.page_content[:100].replace(chr(10),' ')}...")
| # ── Main ────────────────────────────────────────────────────────────────── | |
if __name__ == "__main__":
    # Script entry point: gather both corpora, build the index, then run a
    # quick retrieval check against it.
    banner_rule = "=" * 55
    print(banner_rule, " Finnie RAG Loader", banner_rule, sep="\n")

    # 1. Investopedia: scrape (or load the cache), then chunk.
    investopedia_docs = chunk_investopedia(load_investopedia())

    # 2. FinDER reference passages.
    finder_docs = load_finder()

    # 3. Merge both corpora and build the index.
    all_docs = [*investopedia_docs, *finder_docs]
    print(f"\nTotal documents: {len(all_docs)}")
    build_index(all_docs)

    # 4. Smoke test against the saved index.
    smoke_test()

    print("", banner_rule, " Done! RAG index ready.", banner_rule, sep="\n")