# Page-scrape residue (commit header), kept as comments so the module parses:
# peter2000's picture
# Update app.py
# 4cf0f22 verified
"""
Connected Archives MCP Server
==============================
A lightweight BM25 retrieval server that exposes a HuggingFace dataset
as an MCP tool for AI agents. Returns results in the Connected Archives
citation envelope format.
Template usage: duplicate this Space, edit config.json, deploy.
"""
import json
import re
import hashlib
import time
from pathlib import Path
from typing import Optional
import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
CONFIG_PATH = Path(__file__).parent / "config.json"
# Read config.json (located next to this file) explicitly as UTF-8. Without an
# encoding argument open() uses the platform locale, which can mis-decode
# non-ASCII text in the config on hosts whose default encoding is not UTF-8.
with open(CONFIG_PATH, encoding="utf-8") as f:
    CONFIG = json.load(f)
# ---------------------------------------------------------------------------
# Dataset loading and BM25 index construction
# ---------------------------------------------------------------------------
print(f"Loading dataset: {CONFIG['dataset_id']}...", flush=True)
_start = time.time()
dataset = load_dataset(CONFIG["dataset_id"], split="train")

# Extract text and metadata from each row. texts[i] and metadata[i] describe
# the same chunk, so every row appends to both lists exactly once.
texts: list[str] = []
metadata: list[dict] = []
# Column names are loop-invariant; look them up once.
_text_column = CONFIG["text_column"]
_metadata_column = CONFIG["metadata_column"]
for row in dataset:
    text = row[_text_column]
    # A missing (None) text would crash tokenize() later; index it as empty
    # so the chunk keeps its position but can never match a query.
    texts.append("" if text is None else text)

    meta = row[_metadata_column]
    if isinstance(meta, str):
        # Metadata may be stored as a JSON-encoded string; malformed JSON
        # still raises here so a misconfigured column is noticed at startup.
        meta = json.loads(meta)
    # Every consumer calls meta.get(...); fall back to an empty mapping when
    # the value is missing or is not a JSON object.
    metadata.append(meta if isinstance(meta, dict) else {})
print(f"Loaded {len(texts)} chunks in {time.time() - _start:.1f}s", flush=True)
# Build document manifest: one entry per distinct filename, collecting the
# indices of all chunks that belong to it.
_doc_map: dict[str, dict] = {}  # filename -> {filename, year, chunk_count, chunk_indices, document_id}
for chunk_idx, chunk_meta in enumerate(metadata):
    doc_name = chunk_meta.get("filename", "unknown")
    entry = _doc_map.get(doc_name)
    if entry is None:
        # First chunk seen for this file: its year becomes the document's
        # year, and the ID is the first 12 hex digits of the filename's MD5.
        entry = {
            "filename": doc_name,
            "year": chunk_meta.get("year"),
            "chunk_count": 0,
            "chunk_indices": [],
            "document_id": hashlib.md5(doc_name.encode()).hexdigest()[:12],
        }
        _doc_map[doc_name] = entry
    entry["chunk_count"] += 1
    entry["chunk_indices"].append(chunk_idx)

# Flat list of manifest entries, in first-seen order.
DOCUMENTS = list(_doc_map.values())
print(f"Found {len(DOCUMENTS)} unique documents", flush=True)
# Tokenize and build BM25 index
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens of at least two characters.

    Tokens are maximal runs of ``\\w`` characters (letters, digits,
    underscore) found in the lowercased text; single-character tokens are
    discarded.
    """
    words = re.findall(r"\w+", text.lower())
    return [word for word in words if len(word) >= 2]
print("Building BM25 index...", flush=True)
_start = time.time()
# One token list per chunk, in the same order as `texts`, so positions in the
# BM25 score vector can be used directly as indices into `texts`/`metadata`.
tokenized_corpus = list(map(tokenize, texts))
bm25_index = BM25Okapi(tokenized_corpus)
print(f"BM25 index built in {time.time() - _start:.1f}s", flush=True)
# ---------------------------------------------------------------------------
# Helper: build citation envelope from search results
# ---------------------------------------------------------------------------
def _make_title(filename: str) -> str:
"""Derive a human-readable title from a filename."""
name = filename.rsplit(".", 1)[0] # drop extension
name = re.sub(r"^[A-Z]+-\d+_", "", name) # drop prefixes like TEST-01_
name = name.replace("_", " ").replace("-", " ")
name = re.sub(r"\s+", " ", name).strip()
return name if name else filename
def _build_envelope(
    query: str,
    indices: list[int],
    scores: list[float],
) -> str:
    """Serialize search hits as a Connected Archives citation envelope.

    Args:
        query: The query string, echoed back in the envelope.
        indices: Chunk indices into the module-level ``texts``/``metadata``.
        scores: Raw scores, paired position-by-position with ``indices``.

    Returns:
        A JSON string (2-space indent, non-ASCII kept as-is) with the library
        header, the query, the passage count and one entry per hit.
    """
    # Relevance is reported relative to the best score in this result set.
    top_score = max(scores) if scores else 1.0

    def _passage(chunk_idx: int, raw_score: float) -> dict:
        """Build the envelope entry for a single chunk."""
        chunk_meta = metadata[chunk_idx]
        doc_name = chunk_meta.get("filename", "unknown")
        doc_id = _doc_map.get(doc_name, {}).get("document_id", "unknown")
        if top_score > 0:
            relevance = round(raw_score / top_score, 4)
        else:
            relevance = 0
        return {
            "passage_text": texts[chunk_idx],
            "document_title": _make_title(doc_name),
            "document_id": doc_id,
            "filename": doc_name,
            "publication_date": chunk_meta.get("year"),
            "relevance_score": relevance,
            # Fields below are null until metadata enrichment
            "page_or_section": chunk_meta.get("page_number"),
            "authority_level": chunk_meta.get("authority_level"),
            "knowledge_type": chunk_meta.get("knowledge_type"),
            "country_iso": chunk_meta.get("country_iso"),
            "source_url": chunk_meta.get("source_url"),
        }

    passages = [_passage(i, s) for i, s in zip(indices, scores)]
    library = {
        "name": CONFIG["library_name"],
        "id": CONFIG["library_id"],
        "search_method": "bm25",
    }
    return json.dumps(
        {
            "library": library,
            "query": query,
            "total_results": len(passages),
            "passages": passages,
        },
        ensure_ascii=False,
        indent=2,
    )
# ---------------------------------------------------------------------------
# MCP Tools
# ---------------------------------------------------------------------------
def search(
    query: str,
    top_k: int = 5,
    year: Optional[str] = None,
) -> str:
    """
    Search the EnDev knowledge base for passages relevant to your query.
    Returns cited passages with provenance metadata (document title, year,
    relevance score). Use this for questions about energy access, renewable
    energy technologies, improved cookstoves, solar home systems, mini-grids,
    and EnDev programme results in developing countries.
    Args:
    query: The search query describing what information you need
    top_k: Number of results to return (1-20, default 5)
    year: Optional year filter, e.g. '2021' to restrict results to a specific year
    Returns:
    JSON object containing an array of cited passages with source metadata
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably surfaced to agents as the MCP tool description — confirm
    # before rewording it.

    # Coerce top_k defensively: API callers may send a float, a numeric
    # string, or null. Fall back to the signature default when unusable.
    try:
        top_k = int(top_k)
    except (TypeError, ValueError, OverflowError):
        top_k = 5
    top_k = max(1, min(top_k, CONFIG["search_defaults"]["max_top_k"]))

    tokenized_query = tokenize(query or "")
    if not tokenized_query:
        # Same library header and JSON formatting as a normal envelope, plus
        # a note explaining why there are no passages.
        return json.dumps({
            "library": {
                "name": CONFIG["library_name"],
                "id": CONFIG["library_id"],
                "search_method": "bm25",
            },
            "query": query,
            "total_results": 0,
            "passages": [],
            "note": "Query produced no searchable tokens",
        }, ensure_ascii=False, indent=2)

    # BM25 score for every chunk; position i corresponds to metadata[i].
    scores = bm25_index.get_scores(tokenized_query)
    threshold = CONFIG["search_defaults"]["score_threshold"]

    # Compare years as stripped text so an integer year in the metadata still
    # matches a filter such as '2021'. A blank filter means "no filter".
    year_filter = str(year).strip() if year is not None else ""

    def _year_matches(meta: dict) -> bool:
        """Return True when the chunk's year equals the requested year."""
        doc_year = meta.get("year")
        return doc_year is not None and str(doc_year).strip() == year_filter

    # Exclude filtered-out chunks outright instead of zeroing their score:
    # a zeroed score would still pass a negative score_threshold.
    candidates = [
        i
        for i, score in enumerate(scores)
        if score > threshold and (not year_filter or _year_matches(metadata[i]))
    ]
    # Only the surviving candidates need sorting. The sort is stable, so ties
    # stay in ascending chunk-index order.
    candidates.sort(key=lambda i: scores[i], reverse=True)

    top_indices = candidates[:top_k]
    # Plain floats keep the envelope independent of the scorer's numeric type.
    top_scores = [float(scores[i]) for i in top_indices]
    return _build_envelope(query, top_indices, top_scores)
def list_documents() -> str:
    """
    List all documents available in the EnDev knowledge base.
    Returns document titles, filenames, publication years, and chunk counts.
    Use this to understand what is in the collection before searching.
    Returns:
    JSON object containing the document catalogue
    """
    # NOTE(review): docstring kept verbatim; it is presumably surfaced as the
    # MCP tool description — confirm before rewording it.

    def _sort_key(doc: dict) -> tuple[str, str]:
        """Order by year (missing years first, as "0000"), then filename."""
        # The dataset schema is not visible here, so a year may be a str or an
        # int. Comparing as text avoids a TypeError when types are mixed or
        # when the "0000" fallback meets an integer year.
        return (str(doc.get("year") or "0000"), doc["filename"])

    docs = [
        {
            "document_title": _make_title(doc["filename"]),
            "filename": doc["filename"],
            "document_id": doc["document_id"],
            "year": doc.get("year"),
            "chunk_count": doc["chunk_count"],
        }
        for doc in sorted(DOCUMENTS, key=_sort_key)
    ]
    return json.dumps({
        "library": CONFIG["library_name"],
        "total_documents": len(docs),
        "documents": docs,
    }, ensure_ascii=False, indent=2)
def get_library_info() -> str:
    """
    Get information about this knowledge library — what it contains,
    what domains it covers, its geographic scope, and known limitations.
    Call this first to decide whether this library is relevant to your query.
    Returns:
    JSON object with library metadata for routing decisions
    """
    # NOTE(review): docstring kept verbatim; it is presumably surfaced as the
    # MCP tool description — confirm before rewording it.

    # Descriptive fields come straight from config.json; the two counts are
    # taken from the corpus loaded at import time.
    info = {
        "library_id": CONFIG["library_id"],
        "name": CONFIG["library_name"],
        "summary": CONFIG["library_summary"],
        "owner": CONFIG["library_owner"],
        "domains": CONFIG["library_domains"],
        "languages": CONFIG["library_languages"],
        "geographic_scope": CONFIG["library_geographic_scope"],
        "total_documents": len(DOCUMENTS),
        "total_chunks": len(texts),
        "search_method": "bm25",
        "protocol": "connected_archives_v0.1",
    }
    return json.dumps(info, ensure_ascii=False, indent=2)
# ---------------------------------------------------------------------------
# Gradio UI + MCP Server
# ---------------------------------------------------------------------------
# Declarative UI: components are created in the order written, so statement
# order inside the Blocks context is significant.
# NOTE(review): the original indentation was lost; the nesting below (which
# components sit inside the Row/Columns) is inferred — confirm the layout.
with gr.Blocks(title="EnDev Connected Archives") as demo:
    # Header: library name from config plus corpus statistics computed at
    # import time.
    gr.Markdown(f"# 📚 {CONFIG['library_name']} — Connected Archives MCP Server")
    # NOTE(review): the MCP URL below is hard-coded to one Space; a duplicated
    # template deployment would presumably need to change it — confirm.
    gr.Markdown(
        f"**{len(DOCUMENTS)}** documents, **{len(texts)}** searchable chunks. "
        f"BM25 keyword search. MCP endpoint active.\n\n"
        f"Connect via MCP: `https://giz-endev-mcp-server.hf.space/gradio_api/mcp/`"
    )
    with gr.Row():
        with gr.Column(scale=3):
            query_input = gr.Textbox(
                label="Search query",
                lines=2,
                placeholder="e.g. What are the results of cookstove programmes in Kenya?",
            )
        with gr.Column(scale=1):
            # NOTE(review): maximum=20 is fixed here, while search() clamps to
            # CONFIG["search_defaults"]["max_top_k"]; the two can disagree.
            top_k_input = gr.Number(label="Results", value=5, minimum=1, maximum=20, precision=0)
            year_input = gr.Textbox(label="Year filter (optional)", placeholder="e.g. 2021")
    search_btn = gr.Button("Search", variant="primary")
    output = gr.Textbox(label="Results (citation envelope)", lines=20)
    # Wire the button to search(); inputs are passed positionally as
    # (query, top_k, year) and the JSON string is shown in `output`.
    search_btn.click(
        fn=search,
        inputs=[query_input, top_k_input, year_input],
        outputs=output,
        api_name="search",
    )
    # Register additional tools for MCP (no UI, API-only)
    gr.api(list_documents, api_name="list_documents")
    gr.api(get_library_info, api_name="get_library_info")
if __name__ == "__main__":
    # Launch settings gathered in one place: listen on all interfaces at port
    # 7860, enable the MCP server, disable SSR mode and show errors.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "mcp_server": True,
        "ssr_mode": False,
        "show_error": True,
    }
    demo.launch(**launch_options)