fred-vdb / app.py
Jacob235's picture
agent API: clean /search endpoint, rich api_description, 33K count, short_description
37cd3ed verified
Raw
History Blame Contribute Delete
7.32 kB
"""
Hugging Face Space entrypoint for FRED-VDB semantic search.
This is a thin Gradio wrapper around fred_search.FREDSearcher. All the real
work lives in the package; this file only:
1. Downloads the prebuilt 118 MB LanceDB vector index from a HF Dataset.
2. Instantiates ONE FREDSearcher at module load (model + index loaded once).
3. Exposes a search box + frequency dropdown + popularity-boost toggle.
Search is fully offline at request time — no FRED API key is needed or used.
"""
from __future__ import annotations
import os
from pathlib import Path
import gradio as gr
from huggingface_hub import snapshot_download
from fred_search import FREDSearcher
from fred_search.models import FREDSearchResult
# ---------------------------------------------------------------------------
# 1. Pull the prebuilt vector index down from a Hugging Face Dataset.
#
# The dataset repo is expected to carry a `fred_vector_index/` folder at its
# root (the uploaded contents of data/fred_vector_index/). INDEX_DATASET names
# that repo as "<username>/<dataset-name>" and can be overridden through the
# environment variable of the same name. Per the original author's note, the
# download is reused after the first cold boot.
# ---------------------------------------------------------------------------
INDEX_DATASET = os.getenv("INDEX_DATASET", "Jacob235/fred-vector-index")

# Where the files are materialised; override with INDEX_LOCAL_DIR.
#
# NOTE: passing local_dir is deliberate. It yields REAL files rather than the
# default cache layout of symlinks into blobs. LanceDB walks and mmaps its
# multi-file table itself and cannot open it through that symlinked snapshot
# layout (it fails with "lance error: file size is too small").
_index_local_dir = os.getenv("INDEX_LOCAL_DIR", "/tmp/fred_index")

_index_root = Path(
    snapshot_download(
        repo_id=INDEX_DATASET,
        repo_type="dataset",
        local_dir=_index_local_dir,
    )
)
# ---------------------------------------------------------------------------
# 2. Build the one shared searcher at import time.
#
# Constructing it at module scope rather than inside the request handler
# means the model + index load (a few seconds) is paid once at boot; each
# query afterwards only costs the ~50ms vector scan.
# ---------------------------------------------------------------------------
_searcher = FREDSearcher(data_dir=_index_root)

# Choices offered by the frequency dropdown. "Any" is the sentinel the
# `search` handler translates into "no frequency filter".
FREQUENCIES = [
    "Any",
    "Daily",
    "Weekly",
    "Monthly",
    "Quarterly",
    "Annual",
]
def _render(results: list[FREDSearchResult]) -> str:
"""Format search results as Markdown for the Gradio output panel."""
if not results:
return "_No matching series found. Try rephrasing your query._"
blocks: list[str] = []
for i, r in enumerate(results, start=1):
url = f"https://fred.stlouisfed.org/series/{r.series_id}"
header = f"### {i}. [`{r.series_id}`]({url}) — {r.title}"
meta = (
f"**Similarity:** {r.similarity_score:.3f} · "
f"**Frequency:** {r.frequency} · "
f"**Units:** {r.units} · "
f"**Popularity:** {r.popularity}"
)
notes = (r.notes or "").strip()
if len(notes) > 300:
notes = notes[:300].rstrip() + "…"
blocks.append("\n\n".join(p for p in (header, meta, notes) if p))
return "\n\n---\n\n".join(blocks)
def search(query: str, frequency: str, popularity_boost: bool) -> str:
    """Handle one search request from the UI or the public API.

    Args:
        query: Free-text description of the wanted data. Missing or
            whitespace-only input short-circuits with a prompt message.
        frequency: One of the dropdown choices; "Any" disables the
            frequency filter.
        popularity_boost: Forwarded unchanged to the searcher.

    Returns:
        Markdown for the top 10 hits, or a hint asking for a query.
    """
    cleaned = query.strip() if query else ""
    if cleaned == "":
        return "_Enter a natural-language description of the data you want._"

    # The searcher takes None, not the "Any" sentinel, to mean "no filter".
    frequency_filter = frequency if frequency != "Any" else None
    hits = _searcher.search(
        cleaned,
        top_k=10,
        frequency=frequency_filter,
        popularity_boost=popularity_boost,
    )
    return _render(hits)
# Clickable sample inputs for the UI. Each row is ordered like the handler's
# inputs: [query, frequency, popularity_boost]. Every sample keeps the
# popularity boost switched on.
EXAMPLES = [
    [sample_query, sample_frequency, True]
    for sample_query, sample_frequency in (
        ("indicators of commercial real estate credit stress", "Any"),
        ("inflation expectations vs realized inflation", "Monthly"),
        ("risk-free rate benchmarks at various maturities", "Daily"),
        ("housing supply pipeline for multifamily", "Any"),
    )
]
# `api_description` for the public `/search` endpoint: the text an LLM agent
# sees when deciding whether and how to call this tool.
# TODO(you): refine the wording below — the draft is functional, but you know
# best how you want agents to understand and chain FRED-VDB. Keep the key
# facts: it returns series_ids, and series_id is the handle for fetching the
# actual data.
_SEARCH_API_DESCRIPTION = (
    "Semantic search over 33,000+ curated FRED (Federal Reserve Economic "
    "Data) time series — a high-signal subset filtered from FRED's full "
    "~840K-series catalog. Given a natural-language description of the data you want "
    "(e.g. 'labor market slack' or 'commercial real estate credit stress'), "
    "returns the top 10 matching series ranked by meaning (not keywords), "
    "formatted as Markdown. Each result includes the FRED series_id "
    "(e.g. UNRATE), title, similarity score, frequency, units, and popularity. "
    "Use the series_id to fetch the actual data via the FRED API or at "
    "https://fred.stlouisfed.org/series/{series_id}. Optional args: filter by "
    "`frequency` (Any/Daily/Weekly/Monthly/Quarterly/Annual) and toggle "
    "`popularity_boost` to favor well-known headline series. The ranking "
    "is a similarity heuristic, not an authoritative answer — the best "
    "match for your intent is not always result #1. Read the titles and "
    "metadata across the top 10 and use your own judgment to pick the "
    "series that actually fits the request (or refine the query and "
    "search again)."
)

with gr.Blocks(title="FRED-VDB — Semantic Search over FRED") as demo:
    gr.Markdown(
        "# FRED-VDB\n"
        "Semantic search over 33,000+ curated FRED economic series. Describe the "
        "data you want in plain language — matching is by *meaning*, not keywords."
    )

    # NOTE(review): the original indentation was not preserved; the query box
    # and frequency dropdown are placed in the row because they are the two
    # components carrying `scale=` — confirm against the deployed layout.
    with gr.Row():
        query_box = gr.Textbox(
            label="What are you looking for?",
            placeholder="e.g. indicators of labor market slack",
            scale=4,
        )
        freq_dropdown = gr.Dropdown(
            FREQUENCIES,
            value="Any",
            label="Frequency",
            scale=1,
        )

    boost_toggle = gr.Checkbox(
        value=True,
        label="Boost well-known series (popularity re-ranking)",
        info="On: surfaces headline series like UNRATE/DGS10. Off: pure similarity.",
    )
    search_btn = gr.Button("Search", variant="primary")
    output = gr.Markdown()

    # One inputs list, in the handler's parameter order, shared by the
    # examples widget and both event triggers.
    search_inputs = [query_box, freq_dropdown, boost_toggle]
    gr.Examples(EXAMPLES, inputs=search_inputs)

    # Agent-facing API surface. The button click and the textbox submit run
    # the same handler, but only the click is published as `/search`. The
    # submit is marked private so Enter-to-search keeps working in the browser
    # without adding a confusing duplicate endpoint to the schema agents read
    # via /gradio_api/info. (Gradio 6: use api_visibility, not the removed
    # show_api / api_name=False.)
    search_btn.click(
        search,
        inputs=search_inputs,
        outputs=output,
        api_name="search",
        api_description=_SEARCH_API_DESCRIPTION,
    )
    query_box.submit(
        search,
        inputs=search_inputs,
        outputs=output,
        api_visibility="private",
    )

# Launch the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()