Spaces:
Sleeping
Sleeping
"""
Connected Archives MCP Server
==============================
A lightweight BM25 retrieval server that exposes a HuggingFace dataset
as an MCP tool for AI agents. Returns results in the Connected Archives
citation envelope format.

Template usage: duplicate this Space, edit config.json, deploy.
"""
| import json | |
| import re | |
| import hashlib | |
| import time | |
| from pathlib import Path | |
| from typing import Optional | |
| import gradio as gr | |
| from datasets import load_dataset | |
| from rank_bm25 import BM25Okapi | |
| # --------------------------------------------------------------------------- | |
| # Configuration | |
| # --------------------------------------------------------------------------- | |
# Deployment settings live in config.json next to this module.
CONFIG_PATH = Path(__file__).parent / "config.json"
# JSON is UTF-8 by specification; pass the encoding explicitly so values with
# non-ASCII characters do not depend on the host's locale default.
with open(CONFIG_PATH, encoding="utf-8") as f:
    CONFIG = json.load(f)
| # --------------------------------------------------------------------------- | |
| # Dataset loading and BM25 index construction | |
| # --------------------------------------------------------------------------- | |
print(f"Loading dataset: {CONFIG['dataset_id']}...", flush=True)
_start = time.time()
dataset = load_dataset(CONFIG["dataset_id"], split="train")

# Parallel lists: texts[i] is a chunk's text and metadata[i] is its metadata.
texts: list[str] = []
metadata: list[dict] = []
for row in dataset:
    texts.append(row[CONFIG["text_column"]])
    meta = row[CONFIG["metadata_column"]]
    # Metadata stored as a JSON string is decoded; anything else is kept as-is.
    meta = json.loads(meta) if isinstance(meta, str) else meta
    metadata.append(meta)

print(f"Loaded {len(texts)} chunks in {time.time() - _start:.1f}s", flush=True)
# Build the document manifest: one record per distinct filename, collecting the
# indices of every chunk that belongs to it.
_doc_map: dict[str, dict] = {}  # filename -> {year, chunk_count, chunk_indices}
for idx, meta in enumerate(metadata):
    fname = meta.get("filename", "unknown")
    _entry = _doc_map.get(fname)
    if _entry is None:
        # First chunk seen for this file: its year becomes the document year,
        # and the document id is a short prefix of the filename's MD5 digest.
        _entry = _doc_map[fname] = {
            "filename": fname,
            "year": meta.get("year"),
            "chunk_count": 0,
            "chunk_indices": [],
            "document_id": hashlib.md5(fname.encode()).hexdigest()[:12],
        }
    _entry["chunk_count"] += 1
    _entry["chunk_indices"].append(idx)

DOCUMENTS = list(_doc_map.values())
print(f"Found {len(DOCUMENTS)} unique documents", flush=True)
| # Tokenize and build BM25 index | |
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is lowercased, then every maximal run of word characters
    (letters, digits, underscore) becomes a token. Tokens of a single
    character are dropped.

    Args:
        text: Raw text of a chunk or a query.

    Returns:
        The surviving tokens, in order of appearance.
    """
    words = re.findall(r"\w+", text.lower())
    return list(filter(lambda word: len(word) > 1, words))
print("Building BM25 index...", flush=True)
_start = time.time()
# One token list per chunk, in the same order as `texts`, so that BM25 score
# positions line up with indices into `texts` and `metadata`.
tokenized_corpus = list(map(tokenize, texts))
bm25_index = BM25Okapi(tokenized_corpus)
print(f"BM25 index built in {time.time() - _start:.1f}s", flush=True)
| # --------------------------------------------------------------------------- | |
| # Helper: build citation envelope from search results | |
| # --------------------------------------------------------------------------- | |
| def _make_title(filename: str) -> str: | |
| """Derive a human-readable title from a filename.""" | |
| name = filename.rsplit(".", 1)[0] # drop extension | |
| name = re.sub(r"^[A-Z]+-\d+_", "", name) # drop prefixes like TEST-01_ | |
| name = name.replace("_", " ").replace("-", " ") | |
| name = re.sub(r"\s+", " ", name).strip() | |
| return name if name else filename | |
def _build_envelope(
    query: str,
    indices: list[int],
    scores: list[float],
) -> str:
    """Serialize ranked hits as a Connected Archives citation envelope.

    Args:
        query: The query string, echoed back in the envelope.
        indices: Positions of the matching chunks in the module-level
            ``texts`` / ``metadata`` lists.
        scores: Raw scores, aligned one-to-one with ``indices``.

    Returns:
        A pretty-printed JSON string holding the library block, the query,
        the number of passages, and the passages themselves. Each passage's
        ``relevance_score`` is its score divided by the largest score given,
        rounded to four decimals (0 when the largest score is not positive).
    """
    top_score = max(scores) if scores else 1.0

    def _passage(chunk_idx, raw_score):
        chunk_meta = metadata[chunk_idx]
        source_file = chunk_meta.get("filename", "unknown")
        if top_score > 0:
            relevance = round(raw_score / top_score, 4)
        else:
            relevance = 0
        return {
            "passage_text": texts[chunk_idx],
            "document_title": _make_title(source_file),
            "document_id": _doc_map.get(source_file, {}).get("document_id", "unknown"),
            "filename": source_file,
            "publication_date": chunk_meta.get("year"),
            "relevance_score": relevance,
            # Fields below are null until metadata enrichment
            "page_or_section": chunk_meta.get("page_number"),
            "authority_level": chunk_meta.get("authority_level"),
            "knowledge_type": chunk_meta.get("knowledge_type"),
            "country_iso": chunk_meta.get("country_iso"),
            "source_url": chunk_meta.get("source_url"),
        }

    passages = [_passage(i, s) for i, s in zip(indices, scores)]
    library = {
        "name": CONFIG["library_name"],
        "id": CONFIG["library_id"],
        "search_method": "bm25",
    }
    payload = {
        "library": library,
        "query": query,
        "total_results": len(passages),
        "passages": passages,
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
| # --------------------------------------------------------------------------- | |
| # MCP Tools | |
| # --------------------------------------------------------------------------- | |
def search(
    query: str,
    top_k: int = 5,
    year: Optional[str] = None,
) -> str:
    """
    Search the EnDev knowledge base for passages relevant to your query.
    Returns cited passages with provenance metadata (document title, year,
    relevance score). Use this for questions about energy access, renewable
    energy technologies, improved cookstoves, solar home systems, mini-grids,
    and EnDev programme results in developing countries.
    Args:
        query: The search query describing what information you need
        top_k: Number of results to return (1-20, default 5)
        year: Optional year filter, e.g. '2021' to restrict results to a specific year
    Returns:
        JSON object containing an array of cited passages with source metadata
    """
    # The docstring above is kept verbatim because it doubles as the tool
    # description for API/MCP clients.
    defaults = CONFIG["search_defaults"]

    # The UI number field can deliver None (cleared) or a float; normalise to
    # an int before clamping to the configured maximum.
    requested = 5 if top_k is None else int(top_k)
    top_k = max(1, min(requested, defaults["max_top_k"]))

    tokenized_query = tokenize(query)
    if not tokenized_query:
        return json.dumps({
            "library": {"name": CONFIG["library_name"], "id": CONFIG["library_id"]},
            "query": query,
            "total_results": 0,
            "passages": [],
            "note": "Query produced no searchable tokens",
        })

    # One BM25 score per chunk, aligned with `metadata` / `texts`.
    scores = bm25_index.get_scores(tokenized_query)
    threshold = defaults["score_threshold"]

    # Compare years as stripped strings so metadata holding 2021 (int) still
    # matches a filter of "2021"; a blank filter means "no filter".
    wanted_year = str(year).strip() if year else ""

    # Exclude non-matching rows outright rather than zeroing their score, so
    # the filter holds even if the configured threshold is negative. Filtering
    # before sorting also keeps the sort limited to actual candidates.
    candidates = [
        i
        for i, meta in enumerate(metadata)
        if scores[i] > threshold
        and (not wanted_year or str(meta.get("year")).strip() == wanted_year)
    ]

    # Stable descending sort: ties keep corpus order.
    top_indices = sorted(candidates, key=lambda i: scores[i], reverse=True)[:top_k]
    top_scores = [scores[i] for i in top_indices]
    return _build_envelope(query, top_indices, top_scores)
def list_documents() -> str:
    """
    List all documents available in the EnDev knowledge base.
    Returns document titles, filenames, publication years, and chunk counts.
    Use this to understand what is in the collection before searching.
    Returns:
        JSON object containing the document catalogue
    """
    # The docstring above is kept verbatim because it doubles as the tool
    # description for API/MCP clients.

    def _sort_key(doc: dict) -> tuple[str, str]:
        # Normalise the year to a string so documents with integer years and
        # documents with no year (sorted first as "0000") can be compared
        # without a TypeError.
        return (str(doc.get("year") or "0000"), doc["filename"])

    docs = [
        {
            "document_title": _make_title(doc["filename"]),
            "filename": doc["filename"],
            "document_id": doc["document_id"],
            "year": doc.get("year"),
            "chunk_count": doc["chunk_count"],
        }
        for doc in sorted(DOCUMENTS, key=_sort_key)
    ]
    return json.dumps({
        "library": CONFIG["library_name"],
        "total_documents": len(docs),
        "documents": docs,
    }, ensure_ascii=False, indent=2)
def get_library_info() -> str:
    """
    Get information about this knowledge library — what it contains,
    what domains it covers, its geographic scope, and known limitations.
    Call this first to decide whether this library is relevant to your query.
    Returns:
        JSON object with library metadata for routing decisions
    """
    # Output key -> config.json key, in the order the keys appear in the JSON.
    config_fields = {
        "library_id": "library_id",
        "name": "library_name",
        "summary": "library_summary",
        "owner": "library_owner",
        "domains": "library_domains",
        "languages": "library_languages",
        "geographic_scope": "library_geographic_scope",
    }
    info = {out_key: CONFIG[cfg_key] for out_key, cfg_key in config_fields.items()}

    # Counts come from the loaded corpus; the last two values are fixed.
    info["total_documents"] = len(DOCUMENTS)
    info["total_chunks"] = len(texts)
    info["search_method"] = "bm25"
    info["protocol"] = "connected_archives_v0.1"
    return json.dumps(info, ensure_ascii=False, indent=2)
| # --------------------------------------------------------------------------- | |
| # Gradio UI + MCP Server | |
| # --------------------------------------------------------------------------- | |
# Browser UI for manual searches. The same Blocks app also carries the API
# endpoints registered below.
# NOTE(review): the nesting here was reconstructed from a flattened paste —
# confirm that year_input belongs in the narrow column and that the button and
# output sit below the row.
with gr.Blocks(title="EnDev Connected Archives") as demo:
    # Header: library name plus corpus size computed at start-up.
    gr.Markdown(f"# 📚 {CONFIG['library_name']} — Connected Archives MCP Server")
    gr.Markdown(
        f"**{len(DOCUMENTS)}** documents, **{len(texts)}** searchable chunks. "
        f"BM25 keyword search. MCP endpoint active.\n\n"
        f"Connect via MCP: `https://giz-endev-mcp-server.hf.space/gradio_api/mcp/`"
    )
    with gr.Row():
        # Wide column: the query text.
        with gr.Column(scale=3):
            query_input = gr.Textbox(
                label="Search query",
                lines=2,
                placeholder="e.g. What are the results of cookstove programmes in Kenya?",
            )
        # Narrow column: result count and optional year filter.
        with gr.Column(scale=1):
            top_k_input = gr.Number(label="Results", value=5, minimum=1, maximum=20, precision=0)
            year_input = gr.Textbox(label="Year filter (optional)", placeholder="e.g. 2021")
    search_btn = gr.Button("Search", variant="primary")
    output = gr.Textbox(label="Results (citation envelope)", lines=20)
    # The button calls search(query, top_k, year) and shows the returned JSON
    # string in the output box; the endpoint is exposed under the name "search".
    search_btn.click(
        fn=search,
        inputs=[query_input, top_k_input, year_input],
        outputs=output,
        api_name="search",
    )
    # Register additional tools for MCP (no UI, API-only)
    gr.api(list_documents, api_name="list_documents")
    gr.api(get_library_info, api_name="get_library_info")
if __name__ == "__main__":
    # Launch settings are collected in one mapping and passed as keyword
    # arguments to demo.launch().
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "mcp_server": True,
        "ssr_mode": False,
        "show_error": True,
    }
    demo.launch(**launch_options)