"""
Connected Archives MCP Server
==============================
A lightweight BM25 retrieval server that exposes a HuggingFace dataset
as an MCP tool for AI agents. Returns results in the Connected Archives
citation envelope format.

Template usage: duplicate this Space, edit config.json, deploy.
"""

import hashlib
import heapq
import json
import re
import time
from pathlib import Path
from typing import Optional

import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

CONFIG_PATH = Path(__file__).parent / "config.json"

# Explicit encoding so the config parses identically regardless of the
# platform's default locale.
with open(CONFIG_PATH, encoding="utf-8") as f:
    CONFIG = json.load(f)

# ---------------------------------------------------------------------------
# Dataset loading and BM25 index construction
# ---------------------------------------------------------------------------

print(f"Loading dataset: {CONFIG['dataset_id']}...", flush=True)
_start = time.time()

dataset = load_dataset(CONFIG["dataset_id"], split="train")

# Extract text and metadata from each row. `texts[i]` and `metadata[i]`
# describe the same chunk; every later index refers to this position.
texts: list[str] = []
metadata: list[dict] = []

_text_column = CONFIG["text_column"]
_metadata_column = CONFIG["metadata_column"]

for row in dataset:
    # A missing text value becomes an empty chunk instead of crashing the
    # tokenizer during index construction.
    texts.append(row[_text_column] or "")
    meta = row[_metadata_column]
    if isinstance(meta, str):
        # Metadata may be stored as a JSON-encoded string column.
        meta = json.loads(meta)
    if not isinstance(meta, dict):
        # Missing / non-object metadata: keep the chunk searchable with
        # empty provenance rather than failing on `.get` later.
        meta = {}
    metadata.append(meta)

print(f"Loaded {len(texts)} chunks in {time.time() - _start:.1f}s", flush=True)

# Build document manifest (unique documents)
_doc_map: dict[str, dict] = {}  # filename -> {year, chunk_count, chunk_indices}

for idx, meta in enumerate(metadata):
    fname = meta.get("filename", "unknown")
    if fname not in _doc_map:
        _doc_map[fname] = {
            "filename": fname,
            # Year is taken from the first chunk seen for this filename.
            "year": meta.get("year"),
            "chunk_count": 0,
            "chunk_indices": [],
            # Short stable identifier, not a security hash;
            # `usedforsecurity=False` keeps this working on FIPS builds.
            "document_id": hashlib.md5(
                fname.encode(), usedforsecurity=False
            ).hexdigest()[:12],
        }
    _doc_map[fname]["chunk_count"] += 1
    _doc_map[fname]["chunk_indices"].append(idx)

DOCUMENTS = list(_doc_map.values())
print(f"Found {len(DOCUMENTS)} unique documents", flush=True)

# Tokenize and build BM25 index
_TOKEN_RE = re.compile(r"\w+")


def tokenize(text: str) -> list[str]:
    """Split text into lowercase word tokens, dropping single-character tokens.

    Tokens are maximal runs of word characters (regex ``\\w+``), so
    punctuation and whitespace both act as separators.
    """
    return [t for t in _TOKEN_RE.findall(text.lower()) if len(t) > 1]


print("Building BM25 index...", flush=True)
_start = time.time()

tokenized_corpus = [tokenize(t) for t in texts]
bm25_index = BM25Okapi(tokenized_corpus)

print(f"BM25 index built in {time.time() - _start:.1f}s", flush=True)


# ---------------------------------------------------------------------------
# Helper: build citation envelope from search results
# ---------------------------------------------------------------------------

def _make_title(filename: str) -> str:
    """Derive a human-readable title from a filename.

    Falls back to the raw filename when nothing is left after cleaning.
    """
    name = filename.rsplit(".", 1)[0]  # drop extension
    name = re.sub(r"^[A-Z]+-\d+_", "", name)  # drop prefixes like TEST-01_
    name = name.replace("_", " ").replace("-", " ")
    name = re.sub(r"\s+", " ", name).strip()
    return name if name else filename


def _build_envelope(
    query: str,
    indices: list[int],
    scores: list[float],
) -> str:
    """Format results as a Connected Archives citation envelope (JSON).

    Args:
        query: The original query string, echoed back in the envelope.
        indices: Chunk positions into ``texts`` / ``metadata``.
        scores: Raw scores, parallel to ``indices``.

    Returns:
        Pretty-printed JSON string. ``relevance_score`` is each raw score
        divided by the highest score in this result set (0 when that
        maximum is not positive).
    """
    max_score = max(scores) if scores else 1.0
    passages = []

    for idx, score in zip(indices, scores):
        meta = metadata[idx]
        fname = meta.get("filename", "unknown")
        doc_info = _doc_map.get(fname, {})

        passages.append({
            "passage_text": texts[idx],
            "document_title": _make_title(fname),
            "document_id": doc_info.get("document_id", "unknown"),
            "filename": fname,
            "publication_date": meta.get("year"),
            "relevance_score": round(score / max_score, 4) if max_score > 0 else 0,
            # Fields below are null until metadata enrichment
            "page_or_section": meta.get("page_number"),
            "authority_level": meta.get("authority_level"),
            "knowledge_type": meta.get("knowledge_type"),
            "country_iso": meta.get("country_iso"),
            "source_url": meta.get("source_url"),
        })

    envelope = {
        "library": {
            "name": CONFIG["library_name"],
            "id": CONFIG["library_id"],
            "search_method": "bm25",
        },
        "query": query,
        "total_results": len(passages),
        "passages": passages,
    }

    return json.dumps(envelope, ensure_ascii=False, indent=2)


# ---------------------------------------------------------------------------
# MCP Tools
# ---------------------------------------------------------------------------

# NOTE(review): the docstrings of the tool functions are presumably surfaced
# to MCP clients as tool descriptions — confirm before rewording them.
def search(
    query: str,
    top_k: int = 5,
    year: Optional[str] = None,
) -> str:
    """
    Search the EnDev knowledge base for passages relevant to your query.
    Returns cited passages with provenance metadata (document title, year, relevance score).
    Use this for questions about energy access, renewable energy technologies,
    improved cookstoves, solar home systems, mini-grids, and EnDev programme results
    in developing countries.

    Args:
        query: The search query describing what information you need
        top_k: Number of results to return (1-20, default 5)
        year: Optional year filter, e.g. '2021' to restrict results to a specific year

    Returns:
        JSON object containing an array of cited passages with source metadata
    """
    defaults = CONFIG["search_defaults"]

    # A cleared number field arrives as None; fall back to the declared
    # default. int() also accepts float values such as 5.0.
    if top_k is None:
        top_k = 5
    top_k = max(1, min(int(top_k), defaults["max_top_k"]))

    tokenized_query = tokenize(query)

    if not tokenized_query:
        return json.dumps({
            "library": {"name": CONFIG["library_name"], "id": CONFIG["library_id"]},
            "query": query,
            "total_results": 0,
            "passages": [],
            "note": "Query produced no searchable tokens",
        })

    # Get BM25 scores for all documents
    scores = bm25_index.get_scores(tokenized_query)

    # Apply year filter if specified. Both sides are compared as stripped
    # strings so a filter of '2021' matches metadata storing the year as
    # either "2021" or the integer 2021. Non-matching chunks are excluded
    # outright instead of being zero-scored, so they cannot slip through
    # when the configured threshold is below zero.
    wanted_year = str(year).strip() if year is not None else ""
    if wanted_year:
        candidates = [
            i for i, meta in enumerate(metadata)
            if str(meta.get("year")).strip() == wanted_year
        ]
    else:
        candidates = range(len(scores))

    # Top-k by score, descending. nlargest is equivalent to
    # sorted(..., reverse=True)[:top_k], without sorting the whole corpus.
    best = heapq.nlargest(top_k, candidates, key=lambda i: scores[i])

    # Filter by score threshold
    threshold = defaults["score_threshold"]
    top_indices = [i for i in best if scores[i] > threshold]
    # Plain floats keep the envelope independent of the scorer's array type.
    top_scores = [float(scores[i]) for i in top_indices]

    return _build_envelope(query, top_indices, top_scores)

def list_documents() -> str:
    """
    List all documents available in the EnDev knowledge base.
    Returns document titles, filenames, publication years, and chunk counts.
    Use this to understand what is in the collection before searching.

    Returns:
        JSON object containing the document catalogue
    """
    def _sort_key(doc: dict) -> tuple[str, str]:
        # Years are normalised to strings: a missing year sorts as "0000",
        # and integer years no longer raise TypeError when compared with
        # that string placeholder.
        return (str(doc.get("year") or "0000"), doc["filename"])

    docs = [
        {
            "document_title": _make_title(doc["filename"]),
            "filename": doc["filename"],
            "document_id": doc["document_id"],
            "year": doc.get("year"),
            "chunk_count": doc["chunk_count"],
        }
        for doc in sorted(DOCUMENTS, key=_sort_key)
    ]

    return json.dumps({
        "library": CONFIG["library_name"],
        "total_documents": len(docs),
        "documents": docs,
    }, ensure_ascii=False, indent=2)


def get_library_info() -> str:
    """
    Get information about this knowledge library — what it contains,
    what domains it covers, its geographic scope, and known limitations.
    Call this first to decide whether this library is relevant to your query.

    Returns:
        JSON object with library metadata for routing decisions
    """
    return json.dumps({
        "library_id": CONFIG["library_id"],
        "name": CONFIG["library_name"],
        "summary": CONFIG["library_summary"],
        "owner": CONFIG["library_owner"],
        "domains": CONFIG["library_domains"],
        "languages": CONFIG["library_languages"],
        "geographic_scope": CONFIG["library_geographic_scope"],
        "total_documents": len(DOCUMENTS),
        "total_chunks": len(texts),
        "search_method": "bm25",
        "protocol": "connected_archives_v0.1",
    }, ensure_ascii=False, indent=2)


# ---------------------------------------------------------------------------
# Gradio UI + MCP Server
# ---------------------------------------------------------------------------

# NOTE(review): the page title and the MCP URL shown below are hard-coded for
# one deployment; a duplicated Space will display the wrong endpoint until
# they are edited.
with gr.Blocks(title="EnDev Connected Archives") as demo:
    gr.Markdown(f"# 📚 {CONFIG['library_name']} — Connected Archives MCP Server")
    gr.Markdown(
        f"**{len(DOCUMENTS)}** documents, **{len(texts)}** searchable chunks. "
        f"BM25 keyword search. MCP endpoint active.\n\n"
        f"Connect via MCP: `https://giz-endev-mcp-server.hf.space/gradio_api/mcp/`"
    )

    with gr.Row():
        with gr.Column(scale=3):
            query_input = gr.Textbox(
                label="Search query",
                lines=2,
                placeholder="e.g. What are the results of cookstove programmes in Kenya?",
            )
        with gr.Column(scale=1):
            top_k_input = gr.Number(label="Results", value=5, minimum=1, maximum=20, precision=0)
            year_input = gr.Textbox(label="Year filter (optional)", placeholder="e.g. 2021")

    search_btn = gr.Button("Search", variant="primary")
    output = gr.Textbox(label="Results (citation envelope)", lines=20)

    # The button passes (query, top_k, year) positionally to `search`.
    search_btn.click(
        fn=search,
        inputs=[query_input, top_k_input, year_input],
        outputs=output,
        api_name="search",
    )

    # Register additional tools for MCP (no UI, API-only)
    gr.api(list_documents, api_name="list_documents")
    gr.api(get_library_info, api_name="get_library_info")


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
    )