Spaces:

GIZ
/

endev_mcp_server

Sleeping

App Files Files Community

endev_mcp_server / app.py

peter2000

Update app.py

4cf0f22 verified about 1 month ago

raw

history blame contribute delete

10.2 kB

	"""
	Connected Archives MCP Server
	==============================
	A lightweight BM25 retrieval server that exposes a HuggingFace dataset
	as an MCP tool for AI agents. Returns results in the Connected Archives
	citation envelope format.

	Template usage: duplicate this Space, edit config.json, deploy.
	"""

	import json
	import re
	import hashlib
	import time
	from pathlib import Path
	from typing import Optional

	import gradio as gr
	from datasets import load_dataset
	from rank_bm25 import BM25Okapi

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# config.json lives next to this file and supplies every deployment-specific
	# setting read below (dataset id, column names, library description, search
	# defaults).
	CONFIG_PATH = Path(__file__).parent / "config.json"
	# Decode explicitly as UTF-8: without an encoding argument open() falls back
	# to the locale's preferred encoding, which would garble non-ASCII text in
	# the configuration on hosts whose locale is not UTF-8.
	with open(CONFIG_PATH, encoding="utf-8") as f:
	    CONFIG = json.load(f)

	# ---------------------------------------------------------------------------
	# Dataset loading and BM25 index construction
	# ---------------------------------------------------------------------------

	# Load the "train" split of the configured dataset once at import time and
	# copy it into two parallel lists: texts[i] is the chunk text and
	# metadata[i] is the metadata dict of the same row.
	print(f"Loading dataset: {CONFIG['dataset_id']}...", flush=True)
	_start = time.time()

	dataset = load_dataset(CONFIG["dataset_id"], split="train")

	# Extract text and metadata from each row
	texts: list[str] = []
	metadata: list[dict] = []

	for row in dataset:
	    text = row[CONFIG["text_column"]]
	    # A null text cell would crash tokenize() during index construction;
	    # index such a row as an empty chunk instead.
	    texts.append("" if text is None else text)

	    meta = row[CONFIG["metadata_column"]]
	    if isinstance(meta, str):
	        # Metadata may be stored as a JSON-encoded string; decode it.
	        meta = json.loads(meta)
	    if not isinstance(meta, dict):
	        # Everything downstream calls meta.get(...); a null or otherwise
	        # non-dict value is treated as "no metadata" rather than left to
	        # fail later at query time.
	        meta = {}
	    metadata.append(meta)

	print(f"Loaded {len(texts)} chunks in {time.time() - _start:.1f}s", flush=True)

	# Build document manifest (unique documents)
	# Group chunks by source filename. Each entry records the filename, the year
	# of the first chunk seen for it, how many chunks it has, their row indices,
	# and a short stable document id.
	_doc_map: dict[str, dict] = {}  # filename -> {filename, year, chunk_count, chunk_indices, document_id}
	for idx, meta in enumerate(metadata):
	    fname = meta.get("filename", "unknown")
	    if fname not in _doc_map:
	        _doc_map[fname] = {
	            "filename": fname,
	            "year": meta.get("year"),
	            "chunk_count": 0,
	            "chunk_indices": [],
	            # MD5 is used only to derive a short identifier, not for
	            # security. Declaring that keeps the call working on Python
	            # builds that block MD5 for security purposes; the digest (and
	            # therefore every document_id) is unchanged.
	            "document_id": hashlib.md5(
	                fname.encode(), usedforsecurity=False
	            ).hexdigest()[:12],
	        }
	    _doc_map[fname]["chunk_count"] += 1
	    _doc_map[fname]["chunk_indices"].append(idx)

	# Flat list of the per-document records, in first-seen order.
	DOCUMENTS = list(_doc_map.values())
	print(f"Found {len(DOCUMENTS)} unique documents", flush=True)

	# Tokenize and build BM25 index
	def tokenize(text: str) -> list[str]:
	    """Lowercase *text* and split it into word tokens.

	    Tokens are maximal runs of regex word characters (letters, digits and
	    underscore), so whitespace and punctuation both act as separators.
	    Tokens of a single character are dropped.
	    """
	    words = re.findall(r"\w+", text.lower())
	    return list(filter(lambda word: len(word) >= 2, words))

	# Tokenize every chunk once at startup and hand the token lists to BM25Okapi;
	# bm25_index is the object queried by search().
	print("Building BM25 index...", flush=True)
	_start = time.time()
	tokenized_corpus = list(map(tokenize, texts))
	bm25_index = BM25Okapi(tokenized_corpus)
	print(f"BM25 index built in {time.time() - _start:.1f}s", flush=True)


	# ---------------------------------------------------------------------------
	# Helper: build citation envelope from search results
	# ---------------------------------------------------------------------------

	def _make_title(filename: str) -> str:
	"""Derive a human-readable title from a filename."""
	name = filename.rsplit(".", 1)[0] # drop extension
	name = re.sub(r"^[A-Z]+-\d+_", "", name) # drop prefixes like TEST-01_
	name = name.replace("_", " ").replace("-", " ")
	name = re.sub(r"\s+", " ", name).strip()
	return name if name else filename


	def _build_envelope(
	    query: str,
	    indices: list[int],
	    scores: list[float],
	) -> str:
	    """Serialize search hits as a Connected Archives citation envelope.

	    Args:
	        query: The query string, echoed back in the envelope.
	        indices: Row indices into the module-level texts/metadata lists.
	        scores: Raw scores, one per entry of ``indices`` (same order).

	    Returns:
	        A pretty-printed JSON string with the library header, the query,
	        the number of passages and the passages themselves.
	    """
	    # Relevance is reported relative to the best hit of this result set.
	    top_score = max(scores) if scores else 1.0

	    def _passage(chunk_idx, raw_score):
	        chunk_meta = metadata[chunk_idx]
	        source_file = chunk_meta.get("filename", "unknown")
	        document = _doc_map.get(source_file, {})
	        if top_score > 0:
	            relevance = round(raw_score / top_score, 4)
	        else:
	            relevance = 0
	        return {
	            "passage_text": texts[chunk_idx],
	            "document_title": _make_title(source_file),
	            "document_id": document.get("document_id", "unknown"),
	            "filename": source_file,
	            "publication_date": chunk_meta.get("year"),
	            "relevance_score": relevance,
	            # Fields below are null until metadata enrichment
	            "page_or_section": chunk_meta.get("page_number"),
	            "authority_level": chunk_meta.get("authority_level"),
	            "knowledge_type": chunk_meta.get("knowledge_type"),
	            "country_iso": chunk_meta.get("country_iso"),
	            "source_url": chunk_meta.get("source_url"),
	        }

	    passages = [_passage(i, s) for i, s in zip(indices, scores)]

	    library_header = {
	        "name": CONFIG["library_name"],
	        "id": CONFIG["library_id"],
	        "search_method": "bm25",
	    }
	    return json.dumps(
	        {
	            "library": library_header,
	            "query": query,
	            "total_results": len(passages),
	            "passages": passages,
	        },
	        ensure_ascii=False,
	        indent=2,
	    )


	# ---------------------------------------------------------------------------
	# MCP Tools
	# ---------------------------------------------------------------------------

	def search(
	    query: str,
	    top_k: int = 5,
	    year: Optional[str] = None,
	) -> str:
	    """
	    Search the EnDev knowledge base for passages relevant to your query.
	    Returns cited passages with provenance metadata (document title, year,
	    relevance score). Use this for questions about energy access, renewable
	    energy technologies, improved cookstoves, solar home systems, mini-grids,
	    and EnDev programme results in developing countries.

	    Args:
	        query: The search query describing what information you need
	        top_k: Number of results to return (1-20, default 5)
	        year: Optional year filter, e.g. '2021' to restrict results to a specific year

	    Returns:
	        JSON object containing an array of cited passages with source metadata
	    """
	    # Callers are a UI number field and remote clients, so top_k may arrive
	    # as None, a float or a numeric string. Normalise to int and fall back
	    # to the signature default instead of crashing in min().
	    try:
	        top_k = int(top_k)
	    except (TypeError, ValueError, OverflowError):
	        top_k = 5
	    top_k = max(1, min(top_k, CONFIG["search_defaults"]["max_top_k"]))

	    # A missing query is treated like an empty one.
	    tokenized_query = tokenize(query or "")
	    if not tokenized_query:
	        return json.dumps({
	            "library": {"name": CONFIG["library_name"], "id": CONFIG["library_id"]},
	            "query": query,
	            "total_results": 0,
	            "passages": [],
	            "note": "Query produced no searchable tokens",
	        })

	    # Get BM25 scores for all chunks (one score per row of the corpus)
	    scores = bm25_index.get_scores(tokenized_query)

	    # Compare years as stripped strings so that a year stored as a number in
	    # the metadata, or a filter typed with stray spaces, still matches.
	    # An empty/None filter disables filtering, as before.
	    year_filter = str(year).strip() if year else ""
	    threshold = CONFIG["search_defaults"]["score_threshold"]

	    # Keep only chunks above the score threshold that also pass the year
	    # filter. Rows rejected by the year filter are excluded outright rather
	    # than zero-scored, so they cannot reappear when the configured
	    # threshold is negative.
	    candidates = [
	        i
	        for i, score in enumerate(scores)
	        if score > threshold
	        and (not year_filter or str(metadata[i].get("year")) == year_filter)
	    ]

	    # Sort the candidates only (not the whole corpus), best score first.
	    # The sort is stable, so ties keep corpus order.
	    candidates.sort(key=lambda i: scores[i], reverse=True)
	    top_indices = candidates[:top_k]
	    top_scores = [float(scores[i]) for i in top_indices]

	    return _build_envelope(query, top_indices, top_scores)


	def list_documents() -> str:
	    """
	    List all documents available in the EnDev knowledge base.
	    Returns document titles, filenames, publication years, and chunk counts.
	    Use this to understand what is in the collection before searching.

	    Returns:
	        JSON object containing the document catalogue
	    """
	    # Order by year, then filename; documents without a year sort first via
	    # the "0000" placeholder. The year is stringified so the sort key stays
	    # comparable even if some years are stored as numbers: comparing a
	    # number with the "0000" string would raise TypeError.
	    ordered = sorted(
	        DOCUMENTS,
	        key=lambda d: (str(d.get("year") or "0000"), d["filename"]),
	    )
	    docs = [
	        {
	            "document_title": _make_title(doc["filename"]),
	            "filename": doc["filename"],
	            "document_id": doc["document_id"],
	            "year": doc.get("year"),
	            "chunk_count": doc["chunk_count"],
	        }
	        for doc in ordered
	    ]

	    return json.dumps({
	        "library": CONFIG["library_name"],
	        "total_documents": len(docs),
	        "documents": docs,
	    }, ensure_ascii=False, indent=2)


	def get_library_info() -> str:
	    """
	    Get information about this knowledge library — what it contains,
	    what domains it covers, its geographic scope, and known limitations.
	    Call this first to decide whether this library is relevant to your query.

	    Returns:
	        JSON object with library metadata for routing decisions
	    """
	    # Descriptive fields come straight from config.json; the two totals are
	    # the sizes of the corpus loaded at startup.
	    info = {
	        "library_id": CONFIG["library_id"],
	        "name": CONFIG["library_name"],
	        "summary": CONFIG["library_summary"],
	        "owner": CONFIG["library_owner"],
	        "domains": CONFIG["library_domains"],
	        "languages": CONFIG["library_languages"],
	        "geographic_scope": CONFIG["library_geographic_scope"],
	        "total_documents": len(DOCUMENTS),
	        "total_chunks": len(texts),
	        "search_method": "bm25",
	        "protocol": "connected_archives_v0.1",
	    }
	    serialized = json.dumps(info, ensure_ascii=False, indent=2)
	    return serialized


	# ---------------------------------------------------------------------------
	# Gradio UI + MCP Server
	# ---------------------------------------------------------------------------

	# Build the Gradio app: a small manual search form wired to search(), plus
	# two API-only endpoints for list_documents() and get_library_info().
	with gr.Blocks(title="EnDev Connected Archives") as demo:
	    # Page header: library name and the corpus statistics computed at startup.
	    gr.Markdown(f"# 📚 {CONFIG['library_name']} — Connected Archives MCP Server")
	    # NOTE(review): the MCP URL shown below is a hard-coded literal, so a
	    # duplicated Space keeps displaying this address until it is edited here.
	    gr.Markdown(
	        f"{len(DOCUMENTS)} documents, {len(texts)} searchable chunks. "
	        f"BM25 keyword search. MCP endpoint active.\n\n"
	        f"Connect via MCP: `https://giz-endev-mcp-server.hf.space/gradio_api/mcp/`"
	    )

	    # Input row: a wide column for the query text and a narrow column for
	    # the result count and the optional year filter.
	    with gr.Row():
	        with gr.Column(scale=3):
	            query_input = gr.Textbox(
	                label="Search query",
	                lines=2,
	                placeholder="e.g. What are the results of cookstove programmes in Kenya?",
	            )
	        with gr.Column(scale=1):
	            top_k_input = gr.Number(label="Results", value=5, minimum=1, maximum=20, precision=0)
	            year_input = gr.Textbox(label="Year filter (optional)", placeholder="e.g. 2021")

	    search_btn = gr.Button("Search", variant="primary")

	    # The JSON envelope returned by search() is shown as plain text.
	    output = gr.Textbox(label="Results (citation envelope)", lines=20)

	    # Clicking the button calls search(query, top_k, year) with the three
	    # input values in that order and writes the returned string to `output`.
	    search_btn.click(
	        fn=search,
	        inputs=[query_input, top_k_input, year_input],
	        outputs=output,
	        api_name="search",
	    )

	    # Register additional tools for MCP (no UI, API-only)
	    gr.api(list_documents, api_name="list_documents")
	    gr.api(get_library_info, api_name="get_library_info")


	if __name__ == "__main__":
	    # Launch settings, passed to demo.launch() as keyword arguments: bind
	    # on all interfaces at port 7860, enable the MCP server, disable SSR
	    # mode, and show errors.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "mcp_server": True,
	        "ssr_mode": False,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)