Spaces:
Running
Running
File size: 6,895 Bytes
1f42e44 0f0e7c3 787b56a 21e9c76 7340cef 0f0e7c3 579774f 0f0e7c3 579774f 21e9c76 579774f 21e9c76 052f966 21e9c76 0f0e7c3 7340cef d8d0e61 7340cef 1f42e44 21e9c76 7340cef 21e9c76 0f0e7c3 21e9c76 0f0e7c3 d8d0e61 0f0e7c3 d8d0e61 0f0e7c3 d8d0e61 0f0e7c3 d8d0e61 0f0e7c3 d8d0e61 0f0e7c3 d8d0e61 787b56a 72323dd 3857228 72323dd 787b56a 0f0e7c3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | import os
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, HTMLResponse
import lancedb
from sentence_transformers import SentenceTransformer
from huggingface_hub import snapshot_download
import shutil
import requests
import io
# Base URLs for the upstream HuggingFace dataset: file downloads (resolve/) and the tree API
HF_DATASET_BASE_URL = "https://huggingface.co/datasets/theodi/ndl-core-structured-data"
HF_API_BASE_URL = "https://huggingface.co/api/datasets/theodi/ndl-core-structured-data"
# Public URL of this Space itself, used to build self-referencing download links
THIS_API_URL = "https://theodi-ndl-core-data-api.hf.space"
app = FastAPI()
# 1. Download ONLY the LanceDB folder (saves space/time by ignoring FAISS)
print("Downloading LanceDB index...")
index_path = snapshot_download(
    repo_id="theodi/ndl-core-rag-index",
    repo_type="dataset",
    allow_patterns="lancedb_search_index/*",  # only need this folder, not the FAISS one
    force_download=True  # ensure we get the latest version
)
# Copying out of the HF cache is mandatory to avoid "file size is too small"
# errors from LanceDB (presumably because the cache stores symlinks, not
# real files - TODO confirm).
dst = "/tmp/lancedb_search_index"
# dirs_exist_ok avoids a FileExistsError when the app restarts in the same container
shutil.copytree(f"{index_path}/lancedb_search_index", dst, dirs_exist_ok=True)
# Verify files copied - log each path and its size for debugging truncated copies
for root, dirs, files in os.walk(dst):
    for f in files:
        p = os.path.join(root, f)
        print(p, os.path.getsize(p))
# 2. Connect DB and load model
db = lancedb.connect(dst)
table = db.open_table("ndl_core_datasets")
all_columns = table.schema.names
# Exclude the embedding column from query results - clients never need the raw vectors
columns_to_select = [col for col in all_columns if col != "vector"]
# Query encoder; assumed to be the same model the index was built with - TODO confirm
model = SentenceTransformer('all-MiniLM-L6-v2')
@app.get("/search")
def search(query: str, limit: int = 5):
    """Semantic search over the NDL Core index.

    Encodes the query, runs a cosine vector search, and returns up to
    `limit` records with truncated text previews and download links.
    """
    embedding = model.encode(query)
    hits = table.search(embedding)         # vector search
    hits = hits.metric("cosine")           # metric must match how the index was built
    hits = hits.select(columns_to_select)  # explicit columns - never return raw vectors
    frame = hits.limit(limit).to_pandas()
    # Show only a short preview of the (potentially large) text column
    if "text" in frame.columns:
        frame["text"] = frame["text"].apply(truncate_text)
    # Attach per-record download URLs
    payload = []
    for row in frame.to_dict(orient='records'):
        row["download"] = generate_download_info(row)
        payload.append(row)
    return payload
@app.get("/download/text/{identifier}")
def download_text_file(identifier: str):
    """
    Stream text content as a downloadable file.
    Args:
        identifier: The record identifier
    Returns:
        StreamingResponse with the text content as a downloadable file
    Raises:
        HTTPException: 404 if the identifier is unknown, 400 if the
        record is not in text format.
    """
    record = find_record_by_identifier(identifier)
    if record is None:
        raise HTTPException(status_code=404, detail=f"No record found with identifier: {identifier}")
    record_format = record.get("format", "")
    if record_format != "text":
        raise HTTPException(status_code=400, detail=f"Record is not text format: {record_format}")
    body = record.get("text", "")
    # Wrap the encoded text in an in-memory stream so it can be served as a file
    stream = io.BytesIO(body.encode("utf-8"))
    headers = {"Content-Disposition": f"attachment; filename={identifier}.txt"}
    return StreamingResponse(stream, media_type="text/plain", headers=headers)
def truncate_text(text: str, max_length: int = 100) -> str:
    """Return first max_length characters of text with '...' if truncated, or empty string if no text."""
    if not text:
        return ""
    return text if len(text) <= max_length else text[:max_length] + "..."
def get_folder_file_urls(folder_name: str) -> list:
    """Fetch all file download URLs from a folder in the HuggingFace dataset.

    Args:
        folder_name: Path of the folder inside the dataset repo.
    Returns:
        List of resolve/ download URLs; empty list on any network or API
        failure (best-effort, consistent with the non-200 handling below).
    """
    api_url = f"{HF_API_BASE_URL}/tree/main/{folder_name}"
    try:
        # timeout prevents a slow/hung HF API call from blocking the request handler forever
        response = requests.get(api_url, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    files = response.json()
    file_urls = []
    for file_info in files:
        if file_info.get("type") == "file":
            file_path = file_info.get("path", "")
            download_url = f"{HF_DATASET_BASE_URL}/resolve/main/{file_path}"
            file_urls.append(download_url)
    return file_urls
def find_record_by_identifier(identifier: str):
    """Search for a record in LanceDB by identifier.

    Args:
        identifier: The record identifier (user-supplied via the URL path).
    Returns:
        The matching row as a pandas Series, or None if no record matches.
    """
    # Escape single quotes so a quoted identifier cannot break out of (or
    # inject into) the SQL-style filter expression.
    safe_identifier = identifier.replace("'", "''")
    results = (
        table.search()
        .where(f"identifier = '{safe_identifier}'")
        .select(columns_to_select)
        .limit(1)
        .to_pandas()
    )
    return results.iloc[0] if not results.empty else None
def generate_download_info(record: dict) -> list:
    """Generate download URLs for a search result record.

    Text records point back at this API's streaming endpoint; parquet
    records point at the HuggingFace dataset (single file or whole folder).
    Unknown formats yield an empty list.
    """
    identifier = record.get("identifier", "")
    record_format = record.get("format", "")
    if record_format == "text":
        # Served by this API's own /download/text endpoint
        return [f"{THIS_API_URL}/download/text/{identifier}"]
    if record_format != "parquet":
        return []
    data_file = record.get("data_file", "")
    if not data_file:
        return []
    if data_file.endswith(".parquet"):
        # Single parquet file - link straight to the HF resolve URL
        return [f"{HF_DATASET_BASE_URL}/resolve/main/{data_file}"]
    # It's a folder (UUID) - fetch all files in the folder
    return get_folder_file_urls(data_file)
# Root endpoint with HTML response
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
def root():
    """Landing page: short human-readable description of the API.

    Excluded from the OpenAPI schema (include_in_schema=False) so it does
    not clutter /docs. NOTE(review): the original HTML contained mojibake
    ("β", "π" - corrupted emoji/bullet characters); replaced below with
    readable characters. Confirm intended originals if the source of this
    file is recoverable.
    """
    return """
    <!DOCTYPE html>
    <html>
    <head>
    <title>NDL Core Data API</title>
    <style>
    body {
    font-family: Arial, sans-serif;
    max-width: 720px;
    margin: 40px auto;
    line-height: 1.6;
    }
    code {
    background: #f4f4f4;
    padding: 2px 6px;
    border-radius: 4px;
    }
    </style>
    </head>
    <body>
    <h1>NDL Core Data API</h1>
    <p>
    This Space provides a <strong>FastAPI-based service</strong> for semantic search
    and data download across NDL Core datasets.
    </p>
    <h3>Key Endpoints</h3>
    <ul>
    <li><code>GET /search</code> &mdash; Semantic search over NDL Core datasets</li>
    <li><code>GET /download/text/{identifier}</code> &mdash; Download dataset text files</li>
    </ul>
    <p>
    For detailed usage examples, parameters, and data definitions,
    see the full project README:
    </p>
    <p>
    <a href="https://huggingface.co/spaces/theodi/ndl-core-data-api/blob/main/README.md" target="_blank">
    Project README
    </a>
    </p>
    <h3>Client Library</h3>
    <p>
    To easily interact with this API, use the official Python client library with built-in MCP server support:
    </p>
    <p>
    <a href="https://github.com/theodi/ndl-core-client" target="_blank">
    ndl-core-client
    </a>
    </p>
    </body>
    </html>
    """
|