Spaces:

mkegel
/

post-n-RAG

Runtime error

App Files Community

MarlonKegel committed on May 28, 2025

Commit

8cd5cc6

1 Parent(s): 95f11ac

added diversity cap when searching across all sources

Browse files

Files changed (1) hide show

rag_ui.py +33 -2

rag_ui.py CHANGED Viewed

@@ -22,6 +22,7 @@ from rank_bm25 import BM25Okapi
 import io
 from docx import Document
 import hashlib
 # Caching for search results function
 @st.cache_data(show_spinner=False, max_entries=256)
@@ -182,11 +183,34 @@ def hybrid_search(query, chunk_idx_pool=None, k_sparse=TOPK_SPARSE, k_dense=TOPK
             if neighbor is not None:
                 extra_idxs.add(neighbor)
     all_final_idxs = list(dict.fromkeys(list(best_idxs) + list(extra_idxs)))
     selected_chunks = []
     for i in all_final_idxs:
         if i < len(chunks) and (chunk_idx_pool is None or i in chunk_idx_pool):
             rationale = []
-            # Trace rationale category:
             sparse_rank = sparse_ranks.get(i)
             dense_rank = dense_ranks.get(i)
             combined_rank = list(sorted(hybrid_scores, key=hybrid_scores.get, reverse=True)).index(i) if i in hybrid_scores else None
@@ -196,9 +220,16 @@ def hybrid_search(query, chunk_idx_pool=None, k_sparse=TOPK_SPARSE, k_dense=TOPK
                 rationale.append("high dense similarity (embedding rank top-3)")
             if combined_rank is not None and combined_rank < 3:
                 rationale.append("high combined score (RRF top-3)")
-            selected_chunk = dict(chunks[i])  # shallow copy, to avoid mutating source
             selected_chunk["retrieval_rationale"] = rationale if rationale else ["selected via hybrid search"]
             selected_chunks.append(selected_chunk)
     # --- Sort so that, within each paper_id, chunk_id is ascending ---
     selected_chunks.sort(key=lambda c: (c['paper_id'], c['chunk_id']))
     return selected_chunks

 import io
 from docx import Document
 import hashlib
+import math
 # Caching for search results function
 @st.cache_data(show_spinner=False, max_entries=256)
             if neighbor is not None:
                 extra_idxs.add(neighbor)
     all_final_idxs = list(dict.fromkeys(list(best_idxs) + list(extra_idxs)))
     selected_chunks = []
+    source_counts = {}
+    author_counts = {}
+    if chunk_idx_pool is None:  # Only apply capping when searching all sources
+        max_per_source = math.ceil(n_final * 0.5)
+        max_per_author = math.ceil(n_final * 0.7)
+    else:
+        # If subset, no caps
+        max_per_source = max_per_author = n_final
     for i in all_final_idxs:
         if i < len(chunks) and (chunk_idx_pool is None or i in chunk_idx_pool):
+            chunk = chunks[i]
+            meta = chunk["metadata"]
+            source_id = (meta.get("title", ""), meta.get("authors", ""))  # By title & authors (source)
+            author_id = meta.get("authors", "")
+            # Count how many from this source and author so far
+            s_count = source_counts.get(source_id, 0)
+            a_count = author_counts.get(author_id, 0)
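+            # Skip chunks whose source or author already hit its cap (with a source filter the caps equal n_final, so they never bind before the early stop)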
+            if s_count >= max_per_source or a_count >= max_per_author:
+                continue
             rationale = []
             sparse_rank = sparse_ranks.get(i)
             dense_rank = dense_ranks.get(i)
             combined_rank = list(sorted(hybrid_scores, key=hybrid_scores.get, reverse=True)).index(i) if i in hybrid_scores else None
                 rationale.append("high dense similarity (embedding rank top-3)")
             if combined_rank is not None and combined_rank < 3:
                 rationale.append("high combined score (RRF top-3)")
+            selected_chunk = dict(chunk)  # shallow copy, to avoid mutating source
             selected_chunk["retrieval_rationale"] = rationale if rationale else ["selected via hybrid search"]
             selected_chunks.append(selected_chunk)
+            # Update counts
+            source_counts[source_id] = s_count + 1
+            author_counts[author_id] = a_count + 1
+            # Stop early if we have enough
+            if len(selected_chunks) >= n_final:
+                break
     # --- Sort so that, within each paper_id, chunk_id is ascending ---
     selected_chunks.sort(key=lambda c: (c['paper_id'], c['chunk_id']))
     return selected_chunks