Spaces:
Sleeping
Sleeping
| import os | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import StreamingResponse, HTMLResponse | |
| import lancedb | |
| from sentence_transformers import SentenceTransformer | |
| from huggingface_hub import snapshot_download | |
| import shutil | |
| import requests | |
| import io | |
# Base URL for downloading raw files from the HF structured-data dataset.
HF_DATASET_BASE_URL = "https://huggingface.co/datasets/theodi/ndl-core-structured-data"
# HF REST API endpoint used to list the dataset's file tree.
HF_API_BASE_URL = "https://huggingface.co/api/datasets/theodi/ndl-core-structured-data"
# Public URL of this Space; used to build self-referencing download links.
THIS_API_URL = "https://theodi-ndl-core-data-api.hf.space"

app = FastAPI()
# 1. Download ONLY the LanceDB folder (saves space/time by ignoring FAISS).
print("β³ Downloading LanceDB index...")
index_path = snapshot_download(
    repo_id="theodi/ndl-core-rag-index",
    repo_type="dataset",
    allow_patterns="lancedb_search_index/*",  # only need this folder, not the FAISS one
    force_download=True,  # ensure we get the latest version
)

# Copying out of the HF cache is mandatory to avoid "file size is too small"
# errors from LanceDB (the cache can hold symlinked/partial blobs).
dst = "/tmp/lancedb_search_index"
# Remove any stale copy from a previous run: copytree raises if dst exists,
# and merging over stale files could mix old and new index versions.
if os.path.exists(dst):
    shutil.rmtree(dst)
shutil.copytree(f"{index_path}/lancedb_search_index", dst)

# Verify files copied
for root, dirs, files in os.walk(dst):
    for f in files:
        p = os.path.join(root, f)
        print(p, os.path.getsize(p))

# 2. Connect DB and load model
db = lancedb.connect(dst)
table = db.open_table("ndl_core_datasets")
all_columns = table.schema.names
# Exclude the raw embedding vector from everything the API returns.
columns_to_select = [col for col in all_columns if col != "vector"]
model = SentenceTransformer('all-MiniLM-L6-v2')
def search(query: str, limit: int = 5):
    """Run a semantic vector search over the LanceDB table.

    Args:
        query: Free-text query; embedded with the sentence-transformer model.
        limit: Maximum number of results to return.

    Returns:
        A list of result dicts (vector column excluded), each with a
        truncated ``text`` preview and a ``download`` list of URLs.
    """
    embedding = model.encode(query)
    # Vector search; the metric must match the one the index was built with.
    builder = table.search(embedding).metric("cosine")
    # Explicit column selection keeps the embedding vector out of responses.
    hits = builder.select(columns_to_select).limit(limit).to_pandas()

    # Keep only a short preview of the (potentially large) text column.
    if "text" in hits.columns:
        hits["text"] = hits["text"].apply(truncate_text)

    # Attach download URLs to every result.
    records = hits.to_dict(orient='records')
    for record in records:
        record["download"] = generate_download_info(record)
    return records
def download_text_file(identifier: str):
    """
    Stream text content as a downloadable file.

    Args:
        identifier: The record identifier

    Returns:
        StreamingResponse with the text content as a downloadable file

    Raises:
        HTTPException: 404 if no record matches the identifier,
            400 if the matched record is not in "text" format.
    """
    record = find_record_by_identifier(identifier)
    if record is None:
        raise HTTPException(status_code=404, detail=f"No record found with identifier: {identifier}")

    record_format = record.get("format", "")
    if record_format != "text":
        raise HTTPException(status_code=400, detail=f"Record is not text format: {record_format}")

    text_data = record.get("text", "")
    # Serve the text from an in-memory buffer; nothing touches disk.
    file_stream = io.BytesIO(text_data.encode("utf-8"))
    return StreamingResponse(
        file_stream,
        media_type="text/plain",
        headers={
            # Quote the filename so identifiers containing spaces or other
            # special characters still yield a valid header (RFC 6266).
            "Content-Disposition": f'attachment; filename="{identifier}.txt"'
        }
    )
def truncate_text(text: str, max_length: int = 100) -> str:
    """Return first max_length characters of text with '...' if truncated, or empty string if no text."""
    if not text:
        return ""
    # Short enough already: hand back unchanged; otherwise cut and mark.
    return text if len(text) <= max_length else text[:max_length] + "..."
def get_folder_file_urls(folder_name: str) -> list:
    """Fetch all file URLs from a folder in the HuggingFace dataset.

    Best-effort: returns an empty list on any network/API failure, matching
    the existing behaviour for non-200 responses.
    """
    api_url = f"{HF_API_BASE_URL}/tree/main/{folder_name}"
    try:
        # Timeout so an unresponsive HF API cannot hang the request forever.
        response = requests.get(api_url, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    try:
        files = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page) — treat as no files.
        return []
    file_urls = []
    for file_info in files:
        # Only direct files; sub-directory entries are skipped.
        if file_info.get("type") == "file":
            file_path = file_info.get("path", "")
            file_urls.append(f"{HF_DATASET_BASE_URL}/resolve/main/{file_path}")
    return file_urls
def find_record_by_identifier(identifier: str):
    """Search for a record in LanceDB by exact identifier.

    Args:
        identifier: The identifier value to match.

    Returns:
        The first matching row (pandas Series), or None if no match.
    """
    # Escape single quotes so a quote inside the (caller-supplied) identifier
    # cannot break out of the SQL-style filter expression (filter injection).
    safe_identifier = identifier.replace("'", "''")
    results = (
        table.search()
        .where(f"identifier = '{safe_identifier}'")
        .select(columns_to_select)
        .limit(1)
        .to_pandas()
    )
    return results.iloc[0] if not results.empty else None
def generate_download_info(record: dict) -> list:
    """Generate download URLs for a search result record.

    Text records link to this API's /download/text endpoint; parquet records
    link to HF directly (one file, or every file in a folder). Any other
    format yields an empty list.
    """
    identifier = record.get("identifier", "")
    record_format = record.get("format", "")

    if record_format == "text":
        # Text lives in the index itself, so serve it through this API.
        return [f"{THIS_API_URL}/download/text/{identifier}"]

    if record_format == "parquet":
        data_file = record.get("data_file", "")
        if not data_file:
            return []
        if data_file.endswith(".parquet"):
            # A single parquet file: link straight to the HF blob.
            return [f"{HF_DATASET_BASE_URL}/resolve/main/{data_file}"]
        # Otherwise data_file is a folder (UUID) - list every file inside it.
        return get_folder_file_urls(data_file)

    return []
# Root endpoint with HTML response
# NOTE(review): no @app.get("/", response_class=HTMLResponse) decorator is
# visible here — confirm it was not lost, otherwise this handler is never routed.
def root():
    """Return the static HTML landing page describing the API's endpoints."""
    return """
<!DOCTYPE html>
<html>
<head>
<title>NDL Core Data API</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 720px;
margin: 40px auto;
line-height: 1.6;
}
code {
background: #f4f4f4;
padding: 2px 6px;
border-radius: 4px;
}
</style>
</head>
<body>
<h1>NDL Core Data API</h1>
<p>
This Space provides a <strong>FastAPI-based service</strong> for semantic search
and data download across NDL Core datasets.
</p>
<h3>Key Endpoints</h3>
<ul>
<li><code>GET /search</code> β Semantic search over NDL Core datasets</li>
<li><code>GET /download/text/{identifier}</code> β Download dataset text files</li>
</ul>
<p>
For detailed usage examples, parameters, and data definitions,
see the full project README:
</p>
<p>
π <a href="https://huggingface.co/spaces/theodi/ndl-core-data-api/blob/main/README.md" target="_blank">
Project README
</a>
</p>
<h3>Client Library</h3>
<p>
To easily interact with this API, use the official Python client library with built-in MCP server support:
</p>
<p>
π <a href="https://github.com/theodi/ndl-core-client" target="_blank">
ndl-core-client
</a>
</p>
</body>
</html>
"""