Spaces:

rbbist
/

RAG_System_with_Nepal_Kanun_Patrika_Dataset

Sleeping

App Files Files Community

rbbist committed on Aug 12, 2025

Commit

2b0fbff

verified ·

1 Parent(s): c369c5f

Update chromadb_semantic_search_for_dataset.py

Browse files

Files changed (1) hide show

chromadb_semantic_search_for_dataset.py +32 -23

chromadb_semantic_search_for_dataset.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import sqlite3
 import chromadb
 from chromadb.utils import embedding_functions
@@ -9,9 +10,9 @@ DB_PATH = "2080_data.db"
 CHROMA_COLLECTION_NAME = "my_collection"
 # Truncation / summary settings
-MAX_CHUNK_CHARS = 1500         # Reduced for performance
-SUMMARY_MAX_LENGTH = 100       # Reduced tokens/words budget
-COMBINED_CONTEXT_MAX_CHARS = 1500  # Reduced total chars for answer model
 # --- Load data from SQLite ---
 try:
@@ -22,20 +23,16 @@ try:
             FROM cases
         """)
         rows = cursor.fetchall()
-        print("SQLite rows loaded:", len(rows))
-        if rows:
-            print("Sample row:", rows[0])
 except sqlite3.Error as e:
     print(f"SQLite error: {e}")
     raise
-# --- Setup ChromaDB ---
 try:
     chroma_client = chromadb.Client()
     sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
         model_name="paraphrase-multilingual-mpnet-base-v2"
     )
-    print("Embedding model loaded:", sentence_transformer_ef is not None)
     collection = chroma_client.get_or_create_collection(
         name=CHROMA_COLLECTION_NAME,
         embedding_function=sentence_transformer_ef
@@ -44,13 +41,14 @@ except Exception as e:
     print(f"ChromaDB setup error: {e}")
     raise
-# --- Load DB rows into ChromaDB collection ---
 documents = []
 metadatas = []
 ids = []
 for i, row in enumerate(rows):
     link, decision_no, year, mudda_type, subject, nibedak, vipakshi, prakaran, thahar = row
     thahar_text = (thahar or "")[:MAX_CHUNK_CHARS]
     prakaran_text = (prakaran or "")[:MAX_CHUNK_CHARS]
     case_text = f"{mudda_type} {subject} {nibedak} {vipakshi} {prakaran_text} {thahar_text}"
@@ -67,14 +65,17 @@ for i, row in enumerate(rows):
     })
     ids.append(str(i))
 try:
     if len(documents) > 0:
         collection.add(documents=documents, metadatas=metadatas, ids=ids)
-        print("ChromaDB collection size:", collection.count())
 except Exception as e:
     print(f"Warning while adding to ChromaDB: {e}")
-def semantic_search(query: str, n_results: int = 2):
     """
     Returns:
       - formatted_text: user-facing Markdown/plaintext summary of top results
@@ -82,9 +83,6 @@ def semantic_search(query: str, n_results: int = 2):
       - combined_context: concatenated text of top docs (UNSUMMARIZED, truncated per doc)
     """
     start = time.time()
-    if not query.strip():
-        return "Error: Query cannot be empty.", [], ""
     results = collection.query(
         query_texts=[query],
         n_results=n_results,
@@ -93,14 +91,11 @@ def semantic_search(query: str, n_results: int = 2):
     docs = results.get("documents", [[]])[0]
     metas = results.get("metadatas", [[]])[0]
-    distances = results.get("distances", [[]])[0]
-    print("Semantic search results:", len(docs))
-    if not docs:
-        return "No results found for query.", [], ""
     top_docs = []
     for doc, meta, dist in zip(docs, metas, distances):
         try:
             similarity = 1.0 - float(dist)
         except Exception:
@@ -111,11 +106,12 @@ def semantic_search(query: str, n_results: int = 2):
             "similarity": similarity
         })
     lines = []
     for i, item in enumerate(top_docs, start=1):
         m = item["metadata"]
         sim_str = f"{item['similarity']:.4f}" if item["similarity"] is not None else "N/A"
-        snippet = (item["document"][:300] + "...") if len(item["document"]) > 300 else item["document"]
         lines.append(
             f"🔹 Case {i}\n"
             f"   📄 मुद्दाको किसिम: {m.get('mudda_type','')}\n"
@@ -130,12 +126,24 @@ def semantic_search(query: str, n_results: int = 2):
         )
     formatted_text = "\n\n".join(lines)
-    combined_items = [f"[Case {i}] {item['document'][:MAX_CHUNK_CHARS]}" for i, item in enumerate(top_docs, start=1)]
     combined_context = "\n\n".join(combined_items)
     elapsed = time.time() - start
-    print("Semantic search elapsed:", elapsed)
     return formatted_text, top_docs, combined_context
 def build_compact_context(summaries: List[str]) -> str:
     """
     Given a list of per-case summaries, concatenate them while keeping
@@ -145,6 +153,7 @@ def build_compact_context(summaries: List[str]) -> str:
     total = 0
     for i, s in enumerate(summaries, start=1):
         if total + len(s) + 10 > COMBINED_CONTEXT_MAX_CHARS:
             remaining = COMBINED_CONTEXT_MAX_CHARS - total - 10
             if remaining <= 0:
                 break

+# chromadb_semantic_search_for_dataset.py
 import sqlite3
 import chromadb
 from chromadb.utils import embedding_functions
 CHROMA_COLLECTION_NAME = "my_collection"
 # Truncation / summary settings
+MAX_CHUNK_CHARS = 2000         # truncate each full case to this before summarizing
+SUMMARY_MAX_LENGTH = 150       # tokens/words budget for each per-case summary (ADDED THIS LINE)
+COMBINED_CONTEXT_MAX_CHARS = 3000  # total chars to send to the answer model
 # --- Load data from SQLite ---
 try:
             FROM cases
         """)
         rows = cursor.fetchall()
 except sqlite3.Error as e:
     print(f"SQLite error: {e}")
     raise
+# --- Setup ChromaDB (in-memory client; assumes embeddings will be computed at startup) ---
 try:
     chroma_client = chromadb.Client()
     sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
         model_name="paraphrase-multilingual-mpnet-base-v2"
     )
     collection = chroma_client.get_or_create_collection(
         name=CHROMA_COLLECTION_NAME,
         embedding_function=sentence_transformer_ef
     print(f"ChromaDB setup error: {e}")
     raise
+# --- Load DB rows into ChromaDB collection (one-time per start) ---
 documents = []
 metadatas = []
 ids = []
 for i, row in enumerate(rows):
     link, decision_no, year, mudda_type, subject, nibedak, vipakshi, prakaran, thahar = row
+    # Build a single text blob for embedding; truncate the large field
     thahar_text = (thahar or "")[:MAX_CHUNK_CHARS]
     prakaran_text = (prakaran or "")[:MAX_CHUNK_CHARS]
     case_text = f"{mudda_type} {subject} {nibedak} {vipakshi} {prakaran_text} {thahar_text}"
     })
     ids.append(str(i))
+# Add to collection (if the collection already contains items, this may fail on duplicate ids; adjust as needed)
 try:
     if len(documents) > 0:
         collection.add(documents=documents, metadatas=metadatas, ids=ids)
+        print(f"Added {len(documents)} documents to ChromaDB collection")
 except Exception as e:
+    # If collection already contains these ids, you may see errors; ignore or handle as needed.
     print(f"Warning while adding to ChromaDB: {e}")
+# --- Semantic search function (returns nicely formatted top N + raw top docs) ---
+def semantic_search(query: str, n_results: int = 3):
     """
     Returns:
       - formatted_text: user-facing Markdown/plaintext summary of top results
       - combined_context: concatenated text of top docs (UNSUMMARIZED, truncated per doc)
     """
     start = time.time()
     results = collection.query(
         query_texts=[query],
         n_results=n_results,
     docs = results.get("documents", [[]])[0]
     metas = results.get("metadatas", [[]])[0]
+    distances = results.get("distances", [[]])[0]  # distances (Chroma uses 1 - cosine if using cosine)
     top_docs = []
     for doc, meta, dist in zip(docs, metas, distances):
+        # Convert distance -> cosine similarity (approx): cosine = 1 - distance
         try:
             similarity = 1.0 - float(dist)
         except Exception:
             "similarity": similarity
         })
+    # Build a formatted summary for display
     lines = []
     for i, item in enumerate(top_docs, start=1):
         m = item["metadata"]
         sim_str = f"{item['similarity']:.4f}" if item["similarity"] is not None else "N/A"
+        snippet = (item["document"][:400] + "...") if len(item["document"]) > 400 else item["document"]
         lines.append(
             f"🔹 Case {i}\n"
             f"   📄 मुद्दाको किसिम: {m.get('mudda_type','')}\n"
         )
     formatted_text = "\n\n".join(lines)
+    # Build combined_context (truncated per doc) for summarization/answering
+    combined_items = []
+    for i, item in enumerate(top_docs, start=1):
+        d = item["document"]
+        # ensure we don't exceed MAX_CHUNK_CHARS per doc (we already truncated at insertion)
+        combined_items.append(f"[Case {i}] {d[:MAX_CHUNK_CHARS]}")
     combined_context = "\n\n".join(combined_items)
     elapsed = time.time() - start
+    print(f"Semantic search completed in {elapsed:.2f}s")
     return formatted_text, top_docs, combined_context
+# --- Summarization + RAG preparation ---
+# We create the summarizer and answerer pipelines in app.py (to avoid loading TF/torch twice),
+# but provide a helper here that trims the combined context to a length budget.
 def build_compact_context(summaries: List[str]) -> str:
     """
     Given a list of per-case summaries, concatenate them while keeping
     total = 0
     for i, s in enumerate(summaries, start=1):
         if total + len(s) + 10 > COMBINED_CONTEXT_MAX_CHARS:
+            # take partial from summary if needed
             remaining = COMBINED_CONTEXT_MAX_CHARS - total - 10
             if remaining <= 0:
                 break