# finnie/src/rag/loader.py
# Vishnu Rama
# Fixing bugs
# 51c0848
"""
src/rag/loader.py
Builds a FAISS index from Investopedia articles + FinDER dataset.
Run once:
uv run python -m src.rag.loader
"""
import os
import time
import json
import yaml
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()
# ── Config ────────────────────────────────────────────────────────────────────
# Settings are read from config.yaml. The path is relative, so it is resolved
# against the current working directory at import time.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)
# Read from the environment; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Directory the FAISS index is saved to and loaded from.
INDEX_PATH = cfg["rag"]["index_path"]
# JSON file used to cache the scraped articles between runs.
RAW_CACHE = cfg["rag"]["raw_cache"]
# Both values are handed to RecursiveCharacterTextSplitter when chunking.
CHUNK_SIZE = cfg["rag"]["chunk_size"]
CHUNK_OVERLAP = cfg["rag"]["chunk_overlap"]
# Seconds to sleep between scrape requests.
# NOTE(review): read from the "market" section rather than "rag" -- confirm
# this sharing is intentional.
DELAY = cfg["market"]["delay_seconds"]
# Browser-like User-Agent sent with every scrape request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
# ── Investopedia URLs ─────────────────────────────────────────────────────────
# (title, url) pairs for the Investopedia pages to scrape. The inline section
# comments only group entries by topic; they have no effect at runtime.
ARTICLES = [
    # Finance Q&A
    ("What Is Investing?", "https://www.investopedia.com/terms/i/investing.asp"),
    ("Dollar-Cost Averaging", "https://www.investopedia.com/terms/d/dollarcostaveraging.asp"),
    ("Compound Interest", "https://www.investopedia.com/terms/c/compoundinterest.asp"),
    ("Diversification", "https://www.investopedia.com/terms/d/diversification.asp"),
    ("Risk Tolerance", "https://www.investopedia.com/terms/r/risktolerance.asp"),
    ("Index Funds", "https://www.investopedia.com/terms/i/indexfund.asp"),
    ("ETF vs Mutual Fund", "https://www.investopedia.com/articles/exchangetradedfunds/08/etf-mutual-fund-difference.asp"),
    ("Stocks vs Bonds", "https://www.investopedia.com/ask/answers/09/difference-between-bond-stock-market.asp"),
    ("Liquidity", "https://www.investopedia.com/terms/l/liquidity.asp"),
    ("Bull vs Bear Market", "https://www.investopedia.com/insights/digging-deeper-bull-and-bear-markets/"),
    ("Time Value of Money", "https://www.investopedia.com/terms/t/timevalueofmoney.asp"),
    ("Inflation", "https://www.investopedia.com/terms/i/inflation.asp"),
    # Portfolio
    ("Asset Allocation", "https://www.investopedia.com/terms/a/assetallocation.asp"),
    ("Portfolio Rebalancing", "https://www.investopedia.com/terms/r/rebalancing.asp"),
    ("Modern Portfolio Theory", "https://www.investopedia.com/terms/m/modernportfoliotheory.asp"),
    ("Beta", "https://www.investopedia.com/terms/b/beta.asp"),
    ("Alpha", "https://www.investopedia.com/terms/a/alpha.asp"),
    ("Sharpe Ratio", "https://www.investopedia.com/terms/s/sharperatio.asp"),
    ("Standard Deviation", "https://www.investopedia.com/terms/s/standarddeviation.asp"),
    ("Correlation", "https://www.investopedia.com/terms/c/correlation.asp"),
    ("How to Build a Portfolio", "https://www.investopedia.com/articles/basics/06/invest1000.asp"),
    ("Expense Ratio", "https://www.investopedia.com/terms/e/expenseratio.asp"),
    # Market
    ("P/E Ratio", "https://www.investopedia.com/terms/p/price-earningsratio.asp"),
    ("Market Capitalization", "https://www.investopedia.com/terms/m/marketcapitalization.asp"),
    ("EPS", "https://www.investopedia.com/terms/e/eps.asp"),
    ("Dividend Yield", "https://www.investopedia.com/terms/d/dividendyield.asp"),
    ("P/B Ratio", "https://www.investopedia.com/terms/p/price-to-bookratio.asp"),
    ("Moving Averages", "https://www.investopedia.com/terms/m/movingaverage.asp"),
    ("Support and Resistance", "https://www.investopedia.com/trading/support-and-resistance-basics/"),
    ("Volume in Stock Trading", "https://www.investopedia.com/terms/v/volume.asp"),
    ("Market Sentiment", "https://www.investopedia.com/terms/m/marketsentiment.asp"),
    # Goal Planning
    ("Financial Goals", "https://www.investopedia.com/terms/f/financial_plan.asp"),
    ("Emergency Fund", "https://www.investopedia.com/terms/e/emergency_fund.asp"),
    ("Retirement Planning", "https://www.investopedia.com/terms/r/retirement-planning.asp"),
    ("Rule of 72", "https://www.investopedia.com/terms/r/ruleof72.asp"),
    ("Net Worth", "https://www.investopedia.com/terms/n/networth.asp"),
    ("Saving vs Investing", "https://www.investopedia.com/articles/investing/022516/saving-vs-investing-understanding-key-differences.asp"),
    ("Risk vs Reward", "https://www.investopedia.com/terms/r/riskreturntradeoff.asp"),
    ("SMART Financial Goals", "https://www.investopedia.com/articles/personal-finance/100516/setting-financial-goals/"),
    # News
    ("How Interest Rates Affect Markets","https://www.investopedia.com/articles/stocks/09/how-interest-rates-affect-markets.asp"),
    ("How the Stock Market Works", "https://www.investopedia.com/articles/investing/082614/how-stock-market-works.asp"),
    ("Federal Reserve", "https://www.investopedia.com/terms/f/federalreservebank.asp"),
    ("GDP", "https://www.investopedia.com/ask/answers/what-is-gdp-why-its-important-to-economists-investors/"),
    # Tax
    ("Capital Gains Tax", "https://www.investopedia.com/terms/c/capital_gains_tax.asp"),
    ("401(k)", "https://www.investopedia.com/terms/1/401kplan.asp"),
    ("IRA", "https://www.investopedia.com/terms/i/ira.asp"),
    ("Roth IRA", "https://www.investopedia.com/terms/r/rothira.asp"),
    ("Tax-Loss Harvesting", "https://www.investopedia.com/terms/t/taxgainlossharvesting.asp"),
    ("Roth vs Traditional IRA", "https://www.investopedia.com/retirement/roth-vs-traditional-ira-which-is-right-for-you/"),
    ("52-Week High and Low", "https://www.investopedia.com/terms/1/52weekhighlow.asp"),
    # Derivatives
    ("Swaps", "https://www.investopedia.com/terms/s/swap.asp"),
    ("Options", "https://www.investopedia.com/terms/o/option.asp"),
    ("Futures", "https://www.investopedia.com/terms/f/futures.asp"),
    # Personal Finance
    # NOTE(review): the next entry uses the same URL as "SMART Financial Goals"
    # in the Goal Planning group -- confirm the intended article.
    ("How to Save Money", "https://www.investopedia.com/articles/personal-finance/100516/setting-financial-goals/"),
    ("Budgeting Basics", "https://www.investopedia.com/terms/b/budget.asp"),
    ("50/30/20 Rule", "https://www.investopedia.com/ask/answers/022916/what-502030-budget-rule.asp"),
    ("Credit Score", "https://www.investopedia.com/terms/c/credit_score.asp"),
    ("Debt Management", "https://www.investopedia.com/terms/d/debtmanagement.asp"),
    ("Credit Cards", "https://www.investopedia.com/terms/c/creditcard.asp"),
    ("Repurchase Agreement (Repo)", "https://www.investopedia.com/terms/r/repurchaseagreement.asp"),
    ("Real Estate Investing", "https://www.investopedia.com/terms/r/realestate.asp"),
    ("REITs", "https://www.investopedia.com/terms/r/reit.asp"),
    ("How to Buy a Home", "https://www.investopedia.com/articles/mortgages-real-estate/08/first-time-homebuyer-tips.asp"),
    ("Mortgage Basics", "https://www.investopedia.com/terms/m/mortgage.asp"),
    ("Art as an Investment", "https://www.investopedia.com/articles/pf/08/fine-art.asp"),
    ("Commodities", "https://www.investopedia.com/terms/c/commodity.asp"),
    ("Gold as Investment", "https://www.investopedia.com/articles/basics/09/precious-metals-gold-silver-platinum.asp"),
    ("Cryptocurrency Basics", "https://www.investopedia.com/terms/c/cryptocurrency.asp"),
    # NOTE(review): the next entry repeats the "Retirement Planning" entry in
    # the Goal Planning group (same title and URL).
    ("Retirement Planning", "https://www.investopedia.com/terms/r/retirement-planning.asp"),
    ("Social Security", "https://www.investopedia.com/terms/s/socialsecurity.asp"),
    ("Medicare Basics", "https://www.investopedia.com/terms/m/medicare.asp"),
    ("Required Minimum Distributions", "https://www.investopedia.com/terms/r/requiredminimumdistribution.asp"),
    ("Annuities", "https://www.investopedia.com/terms/a/annuity.asp"),
    ("When to Retire", "https://www.investopedia.com/articles/retirement/when-can-you-retire.asp"),
    ("4% Rule", "https://www.investopedia.com/terms/f/four-percent-rule.asp"),
    ("Safe Withdrawal Rate", "https://www.investopedia.com/ask/answers/05/retirementmoneylast.asp"),
    ("Sequence of Returns Risk", "https://www.investopedia.com/terms/s/sequence-risk.asp"),
    ("Retirement Withdrawal Strategies", "https://www.investopedia.com/retirement/retirement-income-planning/"),
    ("Life Insurance", "https://www.investopedia.com/terms/l/lifeinsurance.asp"),
    ("Health Insurance Basics", "https://www.investopedia.com/terms/h/healthinsurance.asp"),
    ("Disability Insurance", "https://www.investopedia.com/terms/d/disability-insurance.asp"),
    ("Student Loans", "https://www.investopedia.com/terms/s/student-debt.asp"),
    ("How to Get Out of Debt", "https://www.investopedia.com/articles/pf/how-to-get-out-of-debt.asp"),
    ("Good Debt vs Bad Debt", "https://www.investopedia.com/articles/pf/12/good-debt-bad-debt.asp"),
    ("Wills and Trusts", "https://www.investopedia.com/terms/w/will.asp"),
    ("Estate Planning Basics", "https://www.investopedia.com/terms/e/estateplanning.asp"),
]
# ── Investopedia Scraper ──────────────────────────────────────────────────────
def scrape_article(title: str, url: str) -> dict | None:
    """Download one page and extract the text of its main article container.

    Args:
        title: Label stored with the article and echoed in progress output.
        url: Page to fetch.

    Returns:
        A dict with ``title``, ``url``, ``text`` and ``source`` keys, or
        ``None`` when the request fails, no article container is found, or
        the extracted text is shorter than 300 characters.
    """
    print(f" [{title}]")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f" ERROR: {exc}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Drop page chrome and non-content elements before extracting text.
    noise_tags = ["script", "style", "nav", "footer", "aside", "form", "header"]
    for node in soup.find_all(noise_tags):
        node.decompose()

    # Try progressively more generic containers; the generator keeps the
    # lookups lazy so later selectors only run when earlier ones miss.
    selectors = (
        {"name": "article"},
        {"class_": "article-body-content"},
        {"id": "article-body"},
        {"name": "main"},
    )
    matches = (soup.find(**selector) for selector in selectors)
    body = next((match for match in matches if match), None)
    if body is None:
        print(" WARNING: no body found")
        return None

    text = body.get_text(separator="\n", strip=True)
    text_length = len(text)
    if text_length < 300:
        print(f" WARNING: too short ({text_length} chars)")
        return None

    return {
        "title": title,
        "url": url,
        "text": text,
        "source": "investopedia",
    }
def load_investopedia() -> list[dict]:
    """Return the scraped Investopedia articles, using the JSON cache if present.

    When ``RAW_CACHE`` exists its contents are returned as-is. Otherwise every
    distinct URL in ``ARTICLES`` is scraped (pausing ``DELAY`` seconds between
    requests) and the successful results are written to ``RAW_CACHE``.

    Returns:
        A list of article dicts as produced by ``scrape_article``. The list is
        empty when every scrape failed; in that case no cache file is written,
        so the next run retries instead of loading an empty cache.
    """
    if os.path.exists(RAW_CACHE):
        print(f"Loading cached articles from {RAW_CACHE}")
        with open(RAW_CACHE, encoding="utf-8") as f:
            return json.load(f)

    # Create the directory the cache will actually be written to (derived from
    # the configured path) up front, so a bad path fails before the slow scrape.
    cache_dir = os.path.dirname(RAW_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Scrape each URL once: a repeated URL would yield identical chunks that
    # crowd the top-k retrieval results. The first title listed wins.
    seen_urls: set[str] = set()
    targets: list[tuple[str, str]] = []
    for title, url in ARTICLES:
        if url not in seen_urls:
            seen_urls.add(url)
            targets.append((title, url))

    total = len(targets)
    print(f"Scraping {total} Investopedia articles...")
    articles = []
    for i, (title, url) in enumerate(targets, 1):
        print(f"[{i}/{total}]", end=" ")
        article = scrape_article(title, url)
        if article:
            articles.append(article)
        # Pause between requests only; no need to wait after the last one.
        if i < total:
            time.sleep(DELAY)

    if not articles:
        # Do not persist an empty result: a cached [] would be loaded on every
        # later run and the scrape would never be retried.
        print(f"WARNING: no articles scraped; not writing {RAW_CACHE}")
        return articles

    with open(RAW_CACHE, "w", encoding="utf-8") as f:
        json.dump(articles, f, indent=2)
    print(f"Saved {len(articles)} articles to {RAW_CACHE}")
    return articles
# ── Investopedia Chunker ──────────────────────────────────────────────────────
def chunk_investopedia(articles: list[dict]) -> list[Document]:
    """Split each article's text into chunks and wrap every chunk in a Document.

    Args:
        articles: Article dicts providing ``title``, ``url`` and ``text`` keys.

    Returns:
        One Document per chunk, in article order. Metadata carries the
        article's title and URL, the fixed source tag ``"investopedia"``, and
        the chunk's zero-based position within its article.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " "],
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    docs = [
        Document(
            page_content=piece,
            metadata={
                "title": entry["title"],
                "url": entry["url"],
                "source": "investopedia",
                "chunk": position,
            },
        )
        for entry in articles
        for position, piece in enumerate(text_splitter.split_text(entry["text"]))
    ]
    print(f"Investopedia: {len(docs)} chunks from {len(articles)} articles")
    return docs
# ── FinDER Loader ─────────────────────────────────────────────────────────────
def load_finder() -> list[Document]:
    """Load the FinDER train split and turn its reference passages into Documents.

    Every non-blank entry in a row's ``references`` becomes one Document whose
    content is the whitespace-stripped passage. The row's ``_id`` (stored as
    ``title``), ``category``, ``type`` and ``answer`` are copied into metadata
    along with the source tag ``"finder"``.

    Returns:
        The Documents in dataset order.
    """
    print("Loading FinDER dataset from HuggingFace...")
    dataset = load_dataset("Linq-AI-Research/FinDER", split="train")

    docs = []
    for record in dataset:
        for reference in record["references"]:
            passage = reference.strip()
            if passage:
                # A fresh metadata dict per Document keeps them independent.
                docs.append(
                    Document(
                        page_content=passage,
                        metadata={
                            "title": record["_id"],
                            "source": "finder",
                            "category": record["category"],
                            "type": record["type"],
                            "answer": record["answer"],
                        },
                    )
                )

    print(f"FinDER: {len(docs)} passages loaded")
    return docs
# ── Build Index ───────────────────────────────────────────────────────────────
def build_index(docs: list[Document], force: bool = False) -> None:
    """Embed ``docs`` and save a FAISS index under ``INDEX_PATH``.

    Args:
        docs: Documents to embed and index.
        force: Rebuild even when ``index.faiss`` already exists in
            ``INDEX_PATH``. By default an existing index is left untouched.

    Raises:
        ValueError: If a build is required but ``docs`` is empty.
    """
    index_file = os.path.join(INDEX_PATH, "index.faiss")
    if not force and os.path.exists(index_file):
        print(f"Index already exists at {INDEX_PATH}/, skipping build. Pass force=True to rebuild.")
        return

    # Fail early with a clear message: building a vector store from zero
    # documents otherwise errors out deep inside the FAISS wrapper.
    if not docs:
        raise ValueError("build_index() received no documents; nothing to embed.")

    print(f"\nEmbedding {len(docs)} total chunks...")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=OPENAI_API_KEY,
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Index saved to {INDEX_PATH}/")
# ── Smoke Test ────────────────────────────────────────────────────────────────
def smoke_test() -> None:
    """Load the saved index and print the top hit for a few sample queries.

    The index is read from ``INDEX_PATH`` with the same embedding model used
    to build it. For each query the best of three retrieved documents is
    summarized as its ``source`` tag plus the first 100 characters of content.
    A query with no hits is reported instead of raising.
    """
    print("\nSmoke test...")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=OPENAI_API_KEY,
    )
    # SECURITY: load_local unpickles data stored next to the index, which is
    # why the explicit opt-in flag is required. Only point INDEX_PATH at an
    # index this project wrote itself.
    vectorstore = FAISS.load_local(
        INDEX_PATH, embeddings, allow_dangerous_deserialization=True
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    queries = [
        "What is dollar cost averaging?",
        "How does a Roth IRA work?",
        "Analyze CrowdStrike revenue growth",
    ]
    for q in queries:
        results = retriever.invoke(q)
        print(f"\n Q: {q}")
        if not results:
            # An empty result list would otherwise raise IndexError below.
            print(" -> (no results)")
            continue
        top = results[0]
        # Tolerate documents indexed without a "source" tag.
        source = top.metadata.get("source", "unknown")
        # Flatten newlines so the preview stays on one line.
        preview = top.page_content[:100].replace("\n", " ")
        print(f" -> [{source}] {preview}...")
# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Run the pipeline: Investopedia scrape + chunk, FinDER load, index build, smoke test."""
    rule = "=" * 55
    print(rule)
    print(" Finnie RAG Loader")
    print(rule)

    # 1. Investopedia: scrape (or load from cache), then split into chunks.
    investopedia_docs = chunk_investopedia(load_investopedia())

    # 2. FinDER passages.
    finder_docs = load_finder()

    # 3. Merge both corpora and build the index.
    all_docs = investopedia_docs + finder_docs
    print(f"\nTotal documents: {len(all_docs)}")
    build_index(all_docs)

    # 4. Query the saved index as a sanity check.
    smoke_test()

    print("\n" + rule)
    print(" Done! RAG index ready.")
    print(rule)


if __name__ == "__main__":
    main()