Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
db.py
CHANGED
|
@@ -188,6 +188,77 @@ class SanatanDatabase:
|
|
| 188 |
"metadatas": [all_data["metadatas"][min_index]],
|
| 189 |
}
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
def fetch_all_matches(
|
| 192 |
self,
|
| 193 |
collection_name: str,
|
|
|
|
| 188 |
"metadatas": [all_data["metadatas"][min_index]],
|
| 189 |
}
|
| 190 |
|
| 191 |
+
def count_where(
    self,
    collection_name: str,
    metadata_where_clause: MetadataWhereClause = None,
) -> int:
    """
    Count the records in a collection whose metadata matches a filter.

    Only metadata is requested from the collection; documents are not fetched.
    Intended to mirror the filtering and fallback behaviour of
    fetch_all_matches.

    Strategy:
      1. Strict pass: convert the clause with ``to_chroma_where()`` and let
         the collection apply it. If anything matches, that count is returned.
      2. Fallback pass (only when the strict pass finds nothing): every record's
         metadata is scanned and a record counts when *all* string ``$eq``
         filters match as a case-insensitive, diacritic-insensitive substring.
         Filters with other operators or non-string values are ignored here.

    Args:
        collection_name: Name of the collection to count in (created if absent,
            because ``get_or_create_collection`` is used).
        metadata_where_clause: Optional filter. ``None``, or a clause that
            converts to an empty dict, means "no filter".

    Returns:
        The number of matching records; 0 when nothing matches or when the
        fallback has no usable filters.
    """

    def normalize_for_match(s: str) -> str:
        # NFD-decompose, then drop combining marks so accented and unaccented
        # spellings compare equal.
        s = unicodedata.normalize("NFD", s)
        return "".join(ch for ch in s if not unicodedata.combining(ch))

    logger.info(
        "count_where: counting matches in [%s] | filters=%s",
        collection_name,
        metadata_where_clause,
    )

    collection = self.chroma_client.get_or_create_collection(name=collection_name)
    where_clause = (
        metadata_where_clause.to_chroma_where() if metadata_where_clause else None
    )

    # If conversion returns an empty dict, treat as None
    if isinstance(where_clause, dict) and not where_clause:
        where_clause = None

    # Strict filter first
    data = collection.get(include=["metadatas"], where=where_clause)
    strict_matches = data["metadatas"]
    if strict_matches:
        # Direct count
        return len(strict_matches)

    # Fallback: relaxed substring matching
    logger.warning("count_where: No matches found with strict filter. Trying regex fallback.")

    if not metadata_where_clause or not metadata_where_clause.filters:
        return 0

    # Normalise and compile each usable filter once, rather than once per
    # record as the scan proceeds.
    field_patterns = [
        (
            f.metadata_field,
            re.compile(
                re.escape(normalize_for_match(f.metadata_value)),
                flags=re.IGNORECASE,
            ),
        )
        for f in metadata_where_clause.filters
        if f.metadata_search_operator == "$eq"
        and isinstance(f.metadata_value, str)
    ]
    if not field_patterns:
        return 0

    all_data = collection.get(include=["metadatas"])
    matched_count = 0
    # The metadata list, or individual entries in it, may be None for records
    # stored without metadata; treat those as empty instead of crashing.
    for meta in all_data["metadatas"] or []:
        meta = meta or {}
        if all(
            pattern.search(normalize_for_match(str(meta.get(field, ""))))
            for field, pattern in field_patterns
        ):
            matched_count += 1
    return matched_count
|
| 261 |
+
|
| 262 |
def fetch_all_matches(
|
| 263 |
self,
|
| 264 |
collection_name: str,
|
server.py
CHANGED
|
@@ -443,22 +443,23 @@ async def search_scripture_find_all_matches(
|
|
| 443 |
- `filter_obj`: MetadataWhereClause (filters, groups, operator)
|
| 444 |
- `page`: 1-based page number
|
| 445 |
- `page_size`: Number of results per page
|
| 446 |
-
- `has_audio`
|
| 447 |
"""
|
| 448 |
filter_obj = req.filter_obj
|
| 449 |
-
page = req.page
|
| 450 |
-
page_size = req.page_size
|
| 451 |
has_audio = req.has_audio
|
| 452 |
-
try:
|
| 453 |
-
logger.info(
|
| 454 |
-
"search_scripture_find_all_matches: searching for %s with filters %s | page=%s, page_size=%s, has_audio=%s",
|
| 455 |
-
scripture_name,
|
| 456 |
-
filter_obj,
|
| 457 |
-
page,
|
| 458 |
-
page_size,
|
| 459 |
-
has_audio,
|
| 460 |
-
)
|
| 461 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
db = SanatanDatabase()
|
| 463 |
config = next(
|
| 464 |
(s for s in SanatanConfig().scriptures if s["name"] == scripture_name),
|
|
@@ -467,16 +468,17 @@ async def search_scripture_find_all_matches(
|
|
| 467 |
if not config:
|
| 468 |
return {"error": f"Scripture '{scripture_name}' not found"}
|
| 469 |
|
| 470 |
-
# 1️⃣
|
|
|
|
| 471 |
results = db.fetch_all_matches(
|
| 472 |
collection_name=config["collection_name"],
|
| 473 |
metadata_where_clause=filter_obj,
|
| 474 |
-
page=None
|
| 475 |
-
page_size=None,
|
| 476 |
)
|
| 477 |
|
| 478 |
formatted_results = []
|
| 479 |
-
all_indices = []
|
| 480 |
for i in range(len(results["metadatas"])):
|
| 481 |
doc_id = results["ids"][i]
|
| 482 |
metadata_doc = results["metadatas"][i]
|
|
@@ -491,10 +493,10 @@ async def search_scripture_find_all_matches(
|
|
| 491 |
formatted_results.append(canonical_doc)
|
| 492 |
all_indices.append(canonical_doc["_global_index"])
|
| 493 |
|
| 494 |
-
# 2️⃣ Apply
|
| 495 |
if has_audio:
|
| 496 |
if has_audio == AudioType.none:
|
| 497 |
-
#
|
| 498 |
all_audio_indices = set()
|
| 499 |
for atype in [
|
| 500 |
AudioType.recitation,
|
|
@@ -505,15 +507,14 @@ async def search_scripture_find_all_matches(
|
|
| 505 |
indices = await svc_get_indices_with_audio(scripture_name, atype)
|
| 506 |
all_audio_indices.update(indices)
|
| 507 |
|
| 508 |
-
# Keep only indices that are NOT in all_audio_indices
|
| 509 |
formatted_results = [
|
| 510 |
r
|
| 511 |
for r in formatted_results
|
| 512 |
if r["_global_index"] not in all_audio_indices
|
| 513 |
]
|
| 514 |
else:
|
|
|
|
| 515 |
if has_audio == AudioType.any:
|
| 516 |
-
# Combine indices for all audio types
|
| 517 |
audio_indices = set()
|
| 518 |
for atype in [
|
| 519 |
AudioType.recitation,
|
|
@@ -530,16 +531,26 @@ async def search_scripture_find_all_matches(
|
|
| 530 |
await svc_get_indices_with_audio(scripture_name, has_audio)
|
| 531 |
)
|
| 532 |
|
| 533 |
-
# Keep only indices that match
|
| 534 |
formatted_results = [
|
| 535 |
r for r in formatted_results if r["_global_index"] in audio_indices
|
| 536 |
]
|
| 537 |
|
| 538 |
-
# 3️⃣
|
| 539 |
-
total_matches =
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
return {
|
| 545 |
"results": paginated_results,
|
|
|
|
| 443 |
- `filter_obj`: MetadataWhereClause (filters, groups, operator)
|
| 444 |
- `page`: 1-based page number
|
| 445 |
- `page_size`: Number of results per page
|
| 446 |
+
- `has_audio`: optional. can take values any|none|recitation|virutham|upanyasam
|
| 447 |
"""
|
| 448 |
filter_obj = req.filter_obj
|
| 449 |
+
page = req.page or 1
|
| 450 |
+
page_size = req.page_size or 20
|
| 451 |
has_audio = req.has_audio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
+
logger.info(
|
| 454 |
+
"search_scripture_find_all_matches: searching for %s | filters=%s | page=%s | page_size=%s | has_audio=%s",
|
| 455 |
+
scripture_name,
|
| 456 |
+
filter_obj,
|
| 457 |
+
page,
|
| 458 |
+
page_size,
|
| 459 |
+
has_audio,
|
| 460 |
+
)
|
| 461 |
+
|
| 462 |
+
try:
|
| 463 |
db = SanatanDatabase()
|
| 464 |
config = next(
|
| 465 |
(s for s in SanatanConfig().scriptures if s["name"] == scripture_name),
|
|
|
|
| 468 |
if not config:
|
| 469 |
return {"error": f"Scripture '{scripture_name}' not found"}
|
| 470 |
|
| 471 |
+
# 1️⃣ Decide how much to fetch
|
| 472 |
+
fetch_all = has_audio is not None
|
| 473 |
results = db.fetch_all_matches(
|
| 474 |
collection_name=config["collection_name"],
|
| 475 |
metadata_where_clause=filter_obj,
|
| 476 |
+
page=None if fetch_all else page,
|
| 477 |
+
page_size=None if fetch_all else page_size,
|
| 478 |
)
|
| 479 |
|
| 480 |
formatted_results = []
|
| 481 |
+
all_indices = []
|
| 482 |
for i in range(len(results["metadatas"])):
|
| 483 |
doc_id = results["ids"][i]
|
| 484 |
metadata_doc = results["metadatas"][i]
|
|
|
|
| 493 |
formatted_results.append(canonical_doc)
|
| 494 |
all_indices.append(canonical_doc["_global_index"])
|
| 495 |
|
| 496 |
+
# 2️⃣ Apply audio filter only if requested
|
| 497 |
if has_audio:
|
| 498 |
if has_audio == AudioType.none:
|
| 499 |
+
# Remove anything that *has* any audio
|
| 500 |
all_audio_indices = set()
|
| 501 |
for atype in [
|
| 502 |
AudioType.recitation,
|
|
|
|
| 507 |
indices = await svc_get_indices_with_audio(scripture_name, atype)
|
| 508 |
all_audio_indices.update(indices)
|
| 509 |
|
|
|
|
| 510 |
formatted_results = [
|
| 511 |
r
|
| 512 |
for r in formatted_results
|
| 513 |
if r["_global_index"] not in all_audio_indices
|
| 514 |
]
|
| 515 |
else:
|
| 516 |
+
# Filter for specific or 'any'
|
| 517 |
if has_audio == AudioType.any:
|
|
|
|
| 518 |
audio_indices = set()
|
| 519 |
for atype in [
|
| 520 |
AudioType.recitation,
|
|
|
|
| 531 |
await svc_get_indices_with_audio(scripture_name, has_audio)
|
| 532 |
)
|
| 533 |
|
|
|
|
| 534 |
formatted_results = [
|
| 535 |
r for r in formatted_results if r["_global_index"] in audio_indices
|
| 536 |
]
|
| 537 |
|
| 538 |
+
# 3️⃣ Paginate *after* filtering if needed
|
| 539 |
+
total_matches = None
|
| 540 |
+
if fetch_all:
|
| 541 |
+
total_matches = len(formatted_results)
|
| 542 |
+
else:
|
| 543 |
+
total_matches = db.count_where(
|
| 544 |
+
collection_name=config["collection_name"],
|
| 545 |
+
metadata_where_clause=filter_obj,
|
| 546 |
+
)
|
| 547 |
+
|
| 548 |
+
if fetch_all:
|
| 549 |
+
start_idx = (page - 1) * page_size
|
| 550 |
+
end_idx = start_idx + page_size
|
| 551 |
+
paginated_results = formatted_results[start_idx:end_idx]
|
| 552 |
+
else:
|
| 553 |
+
paginated_results = formatted_results
|
| 554 |
|
| 555 |
return {
|
| 556 |
"results": paginated_results,
|