Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- db.py +91 -0
- modules/config/divya_prabandham.py +1 -1
- modules/config/sri_stavam.py +1 -1
- server.py +71 -3
db.py
CHANGED
|
@@ -170,6 +170,97 @@ class SanatanDatabase:
|
|
| 170 |
"metadatas": [all_data["metadatas"][min_index]],
|
| 171 |
}
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
def search(
|
| 174 |
self,
|
| 175 |
collection_name: str,
|
|
|
|
| 170 |
"metadatas": [all_data["metadatas"][min_index]],
|
| 171 |
}
|
| 172 |
|
| 173 |
+
def fetch_all_matches(
    self,
    collection_name: str,
    metadata_where_clause: MetadataWhereClause = None,
    page: int = 1,
    page_size: int = 20,
):
    """
    Fetch every verse in a collection that matches the metadata filter,
    sorted by ``_global_index`` ascending, and return one page of them.

    The strict Chroma ``where`` filter is tried first. If it yields nothing,
    a fallback scans the whole collection and keeps rows whose metadata
    contains each string ``$eq`` filter value as a case-insensitive,
    diacritic-insensitive substring.

    Args:
        collection_name: Name of the Chroma collection to read.
        metadata_where_clause: Optional filter; ``None`` fetches everything.
        page: 1-based page number. Values below 1 are treated as 1.
        page_size: Rows per page. Negative values are treated as 0.

    Returns:
        A dict with ``ids``, ``documents`` and ``metadatas`` (the requested
        page only) plus ``total_matches`` (the match count before paging).
    """

    def normalize_for_match(s: str) -> str:
        # Decompose, then drop combining marks, so accented forms compare
        # equal to their base letters.
        decomposed = unicodedata.normalize("NFD", s)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    def empty_result() -> dict:
        # Built fresh on each call so callers never share mutable lists.
        return {"ids": [], "documents": [], "metadatas": [], "total_matches": 0}

    logger.info(
        "fetching all matches from [%s] | filters=%s | page=%s | page_size=%s",
        collection_name,
        metadata_where_clause,
        page,
        page_size,
    )

    # A non-positive page (or negative page_size) would produce a negative
    # slice start, which silently returns rows counted from the END of the
    # result set. Clamp instead.
    page = max(page, 1)
    page_size = max(page_size, 0)

    collection = self.chroma_client.get_or_create_collection(name=collection_name)
    where_clause = (
        metadata_where_clause.to_chroma_where() if metadata_where_clause else None
    )

    # First, try strict filter
    data = collection.get(include=["metadatas", "documents"], where=where_clause)

    if not data["metadatas"]:
        logger.warning("No data found using strict filter. Trying regex fallback.")

        if not metadata_where_clause or not metadata_where_clause.filters:
            return empty_result()

        # Compile one pattern per usable filter up front; the query side is
        # loop-invariant, so it should not be re-normalized for every row.
        # NOTE(review): filters that are not string `$eq` are ignored here,
        # so the fallback can match more broadly than the strict query.
        # Confirm that is intended.
        field_patterns = [
            (
                f.metadata_field,
                re.compile(
                    re.escape(normalize_for_match(f.metadata_value)),
                    flags=re.IGNORECASE,
                ),
            )
            for f in metadata_where_clause.filters
            if f.metadata_search_operator == "$eq"
            and isinstance(f.metadata_value, str)
        ]

        if field_patterns:
            all_data = collection.get(include=["metadatas", "documents"])
            # A row matches only if EVERY pattern is found in its field.
            matched_indices = [
                i
                for i, meta in enumerate(all_data["metadatas"])
                if all(
                    pattern.search(normalize_for_match(str(meta.get(field, ""))))
                    for field, pattern in field_patterns
                )
            ]

            data = {
                "ids": [all_data["ids"][i] for i in matched_indices],
                "documents": [all_data["documents"][i] for i in matched_indices],
                "metadatas": [all_data["metadatas"][i] for i in matched_indices],
            }

    total_matches = len(data["ids"])
    if total_matches == 0:
        return empty_result()

    # --- Sort by _global_index ascending ---
    # Rows without a _global_index sort last (key defaults to +inf).
    rows = sorted(
        zip(data["ids"], data["documents"], data["metadatas"]),
        key=lambda row: row[2].get("_global_index", float("inf")),
    )

    # Apply pagination
    start = (page - 1) * page_size
    page_rows = rows[start : start + page_size]

    return {
        "ids": [row[0] for row in page_rows],
        "documents": [row[1] for row in page_rows],
        "metadatas": [row[2] for row in page_rows],
        "total_matches": total_matches,
    }
|
| 262 |
+
|
| 263 |
+
|
| 264 |
def search(
|
| 265 |
self,
|
| 266 |
collection_name: str,
|
modules/config/divya_prabandham.py
CHANGED
|
@@ -235,7 +235,7 @@ divya_prabandham_config = {
|
|
| 235 |
"url": "https://www.youtube.com/@jagadacharya2405",
|
| 236 |
"role": "Upanyasam video provider",
|
| 237 |
"context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
|
| 238 |
-
"photo_url": "https://
|
| 239 |
},
|
| 240 |
{
|
| 241 |
"name": "Srirangam Vikram Vasudevan",
|
|
|
|
| 235 |
"url": "https://www.youtube.com/@jagadacharya2405",
|
| 236 |
"role": "Upanyasam video provider",
|
| 237 |
"context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
|
| 238 |
+
"photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
|
| 239 |
},
|
| 240 |
{
|
| 241 |
"name": "Srirangam Vikram Vasudevan",
|
modules/config/sri_stavam.py
CHANGED
|
@@ -98,7 +98,7 @@ sri_stavam_config = {
|
|
| 98 |
"url": "https://www.youtube.com/@jagadacharya2405",
|
| 99 |
"role": "Upanyasam video provider",
|
| 100 |
"context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
|
| 101 |
-
"photo_url": "https://
|
| 102 |
},
|
| 103 |
],
|
| 104 |
},
|
|
|
|
| 98 |
"url": "https://www.youtube.com/@jagadacharya2405",
|
| 99 |
"role": "Upanyasam video provider",
|
| 100 |
"context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
|
| 101 |
+
"photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
|
| 102 |
},
|
| 103 |
],
|
| 104 |
},
|
server.py
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
import json
|
| 3 |
import random
|
| 4 |
import traceback
|
| 5 |
-
from typing import Optional
|
| 6 |
import uuid
|
| 7 |
-
from fastapi import APIRouter, Request
|
| 8 |
from fastapi.responses import JSONResponse
|
| 9 |
import pycountry
|
| 10 |
from pydantic import BaseModel
|
|
@@ -286,7 +286,7 @@ async def get_scripture_configs():
|
|
| 286 |
|
| 287 |
|
| 288 |
@router.post("/scripture/{scripture_name}/search")
|
| 289 |
-
async def
|
| 290 |
scripture_name: str,
|
| 291 |
filter_obj: Optional[MetadataWhereClause] = None,
|
| 292 |
):
|
|
@@ -336,6 +336,74 @@ async def search_scripture(
|
|
| 336 |
logger.error("Error while searching %s", e, exc_info=True)
|
| 337 |
return {"error": str(e)}
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
@router.post("/audio")
|
| 341 |
async def generate_audio_urls(req: AudioRequest):
|
|
|
|
| 2 |
import json
|
| 3 |
import random
|
| 4 |
import traceback
|
| 5 |
+
from typing import List, Optional
|
| 6 |
import uuid
|
| 7 |
+
from fastapi import APIRouter, Request, Query
|
| 8 |
from fastapi.responses import JSONResponse
|
| 9 |
import pycountry
|
| 10 |
from pydantic import BaseModel
|
|
|
|
| 286 |
|
| 287 |
|
| 288 |
@router.post("/scripture/{scripture_name}/search")
|
| 289 |
+
async def search_scripture_find_first_match(
|
| 290 |
scripture_name: str,
|
| 291 |
filter_obj: Optional[MetadataWhereClause] = None,
|
| 292 |
):
|
|
|
|
| 336 |
logger.error("Error while searching %s", e, exc_info=True)
|
| 337 |
return {"error": str(e)}
|
| 338 |
|
| 339 |
+
class ScriptureMultiSearchRequest(BaseModel):
    """Request body for the paginated "search all" scripture endpoint."""

    # Metadata filter to apply; omitted / None means no filter is supplied.
    filter_obj: Optional[MetadataWhereClause] = None

    # 1-based page number.
    page: int = 1

    # Number of results per page.
    page_size: int = 20
|
| 343 |
+
|
| 344 |
+
@router.post("/scripture/{scripture_name}/search/all")
async def search_scripture_find_all_matches(
    scripture_name: str,
    req: ScriptureMultiSearchRequest,
):
    """
    Search scripture collection and return all matching results with pagination.
    - `scripture_name`: Name of the collection
    - `filter_obj`: MetadataWhereClause (filters, groups, operator)
    - `page`: 1-based page number (values below 1 are treated as 1)
    - `page_size`: Number of results per page (negative values are treated as 0)

    Returns a dict with `results` (canonicalized documents for the page),
    `total_matches`, `page` and `page_size`; on failure or unknown scripture
    returns `{"error": <message>}`.
    """
    filter_obj = req.filter_obj
    # Clamp paging values so a zero/negative page can never turn into a
    # negative slice offset downstream; the response echoes the values used.
    page = max(req.page, 1)
    page_size = max(req.page_size, 0)
    try:
        logger.info(
            "search_scripture_find_all_matches: searching for %s with filters %s | page=%s, page_size=%s",
            scripture_name,
            filter_obj,
            page,
            page_size,
        )

        db = SanatanDatabase()
        # Build the config object once and reuse it for the lookup and for
        # every canonicalize call, instead of re-instantiating it per row.
        sanatan_config = SanatanConfig()
        config = next(
            (s for s in sanatan_config.scriptures if s["name"] == scripture_name),
            None,
        )
        if not config:
            return {"error": f"Scripture '{scripture_name}' not found"}

        results = db.fetch_all_matches(
            collection_name=config["collection_name"],
            metadata_where_clause=filter_obj,
            page=page,
            page_size=page_size,
        )

        # Flatten + canonicalize results
        documents = results.get("documents")
        formatted_results = []
        for i, (doc_id, metadata_doc) in enumerate(
            zip(results["ids"], results["metadatas"])
        ):
            # The id travels inside the metadata dict handed to canonicalize.
            metadata_doc["id"] = doc_id
            document_text = documents[i] if documents else None
            formatted_results.append(
                sanatan_config.canonicalize_document(
                    scripture_name, document_text, metadata_doc
                )
            )

        return {
            "results": formatted_results,
            "total_matches": results.get("total_matches", 0),
            "page": page,
            "page_size": page_size,
        }

    except Exception as e:
        # Top-level endpoint boundary: log with traceback and report the
        # error in the response body, matching the sibling search endpoint.
        logger.error("Error while searching %s", e, exc_info=True)
        return {"error": str(e)}
|
| 407 |
|
| 408 |
@router.post("/audio")
|
| 409 |
async def generate_audio_urls(req: AudioRequest):
|