Spaces:

JustscrAPIng
/

CultourSearchAPI

Sleeping

App Files Files Community

JustscrAPIng committed on Dec 14, 2025

Commit

e45a23a

verified ·

1 Parent(s): ac37ae3

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -58

app.py CHANGED Viewed

@@ -7,21 +7,19 @@ from rank_bm25 import BM25Okapi
 import string
 import os
 import sys
 # --- 1. SETUP & MODEL LOADING ---
 print("⏳ Loading models...")
-# Detect Hardware (GPU vs CPU)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on: {device}")
-# Embedding Function (Must match what you used to create the DB)
 ef = SentenceTransformerEmbeddingFunction(
     model_name="BAAI/bge-m3",
     device=device
 )
-# Reranker Model
 reranker = CrossEncoder(
     "BAAI/bge-reranker-v2-m3",
     device=device,
@@ -32,104 +30,131 @@ reranker = CrossEncoder(
 print("✅ Models loaded!")
 # --- 2. LOAD PERSISTENT VECTOR DB ---
-DB_PATH = "./vector_db"  # This must match the folder name you uploaded
 if not os.path.exists(DB_PATH):
-    print(f"❌ Error: The folder '{DB_PATH}' was not found in the Space.")
-    print("Please upload your local 'vector_db' folder to the Files tab.")
-    # We don't exit here so you can see the error in logs, but the app will fail later.
 else:
-    print(f"wd: {os.getcwd()}") # Print working directory for debugging
-# Initialize Persistent Client
 client = chromadb.PersistentClient(path=DB_PATH)
-# Get the existing collection
-# Note: We use get_collection because we assume it already exists.
 try:
     collection = client.get_collection(name='ct_data', embedding_function=ef)
     print(f"✅ Loaded collection 'ct_data' with {collection.count()} documents.")
 except Exception as e:
     print(f"❌ Error loading collection: {e}")
-    # Fallback for debugging if name is wrong
-    print(f"Available collections: {[c.name for c in client.list_collections()]}")
     sys.exit(1)
 # --- 3. BUILD IN-MEMORY INDICES (BM25) ---
-# We need to fetch all documents from the DB to build the BM25 index
-# and the metadata cache. This avoids needing the CSV files.
 bm25_index = None
 doc_id_map = {}
 all_metadatas = {}
 def build_indices_from_db():
     global bm25_index, doc_id_map, all_metadatas
-    print("⏳ Fetching data from ChromaDB to build BM25 index...")
-    # Fetch all data (IDs, Documents, Metadatas)
-    # If you have >100k docs, you might want to paginate this, but for typical RAG it's fine.
     data = collection.get()
     ids = data['ids']
     documents = data['documents']
     metadatas = data['metadatas']
-    if not documents:
-        print("⚠️ Warning: Collection is empty!")
-        return
-    # Build BM25 Corpus
-    print(f"Processing {len(documents)} documents for Keyword Search...")
     tokenized_corpus = [
         doc.lower().translate(str.maketrans('', '', string.punctuation)).split()
         for doc in documents
     ]
     bm25_index = BM25Okapi(tokenized_corpus)
-    # Reconstruct Cache
     for idx, (doc_id, doc_text, meta) in enumerate(zip(ids, documents, metadatas)):
-        # Map integer index (from BM25) back to string ID
         doc_id_map[idx] = doc_id
-        # Store in fast lookup dict
-        all_metadatas[doc_id] = {
-            "document": doc_text,
-            "meta": meta if meta else {}
-        }
     print("✅ Hybrid Index Ready.")
-# Run this immediately
 build_indices_from_db()
-# --- 4. SEARCH LOGIC ---
-def reciprocal_rank_fusion(vector_results, bm25_results, k=60):
     fused_scores = {}
-    for rank, doc_id in enumerate(vector_results):
-        fused_scores[doc_id] = fused_scores.get(doc_id, 0) + (1 / (k + rank))
-    for rank, doc_id in enumerate(bm25_results):
-        fused_scores[doc_id] = fused_scores.get(doc_id, 0) + (1 / (k + rank))
     return sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
-def granular_search(query: str, initial_k: int = 15, final_k: int = 3):
     try:
-        # A. Vector Search
-        # Querying the persistent DB
-        vec_res = collection.query(query_texts=[query], n_results=initial_k)
         vector_ids = vec_res['ids'][0] if vec_res['ids'] else []
-        # B. BM25 Search
         bm25_ids = []
         if bm25_index:
             tokenized = query.lower().translate(str.maketrans('', '', string.punctuation)).split()
-            scores = bm25_index.get_scores(tokenized)
-            top_indices = scores.argsort()[::-1][:initial_k]
-            bm25_ids = [doc_id_map[i] for i in top_indices if scores[i] > 0]
-        # C. Fusion
-        candidates_ids = reciprocal_rank_fusion(vector_ids, bm25_ids)[:initial_k]
         if not candidates_ids:
             return {"data": [], "message": "No results found"}
@@ -144,8 +169,7 @@ def granular_search(query: str, initial_k: int = 15, final_k: int = 3):
                 metas.append(item['meta'])
         # E. Rerank
-        if not docs:
-            return {"data": []}
         pairs = [[query, doc] for doc in docs]
         scores = reranker.predict(pairs)
@@ -178,12 +202,13 @@ def granular_search(query: str, initial_k: int = 15, final_k: int = 3):
 demo = gr.Interface(
     fn=granular_search,
     inputs=[
-        gr.Textbox(label="Query", placeholder="Search for Vietnamese architecture..."),
         gr.Number(value=5, label="Initial K", visible=False),
-        gr.Number(value=1, label="Final K", visible=False)
     ],
     outputs=gr.JSON(label="Results"),
-    title="Granular Search API (Persistent)",
     flagging_mode="never"
 )

 import string
 import os
 import sys
+import numpy as np # Needed for normalization
 # --- 1. SETUP & MODEL LOADING ---
 print("⏳ Loading models...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on: {device}")
 ef = SentenceTransformerEmbeddingFunction(
     model_name="BAAI/bge-m3",
     device=device
 )
 reranker = CrossEncoder(
     "BAAI/bge-reranker-v2-m3",
     device=device,
 print("✅ Models loaded!")
 # --- 2. LOAD PERSISTENT VECTOR DB ---
+DB_PATH = "./vector_db"
 if not os.path.exists(DB_PATH):
+    print(f"❌ Error: The folder '{DB_PATH}' was not found.")
 else:
+    print(f"wd: {os.getcwd()}")
 client = chromadb.PersistentClient(path=DB_PATH)
 try:
     collection = client.get_collection(name='ct_data', embedding_function=ef)
     print(f"✅ Loaded collection 'ct_data' with {collection.count()} documents.")
 except Exception as e:
     print(f"❌ Error loading collection: {e}")
     sys.exit(1)
 # --- 3. BUILD IN-MEMORY INDICES (BM25) ---
 bm25_index = None
 doc_id_map = {}
 all_metadatas = {}
 def build_indices_from_db():
     """Build the in-memory BM25 index and lookup caches from the collection.

     Fetches every record from the module-level ``collection`` and then:

     * sets ``bm25_index`` to a ``BM25Okapi`` built over the lower-cased,
       punctuation-stripped, whitespace-split documents;
     * fills ``doc_id_map`` with ``corpus position -> document ID`` so BM25
       result indices can be translated back to IDs;
     * fills ``all_metadatas`` with
       ``document ID -> {"document": text, "meta": metadata dict}``.

     Returns early, leaving all three globals untouched, when the collection
     holds no documents.
     """
     global bm25_index, doc_id_map, all_metadatas
     print("⏳ Fetching data to build BM25 index...")
     data = collection.get()
     ids = data['ids']
     documents = data['documents']
     metadatas = data['metadatas']
     if not documents:
         return

     # Build the translation table once instead of once per document.
     strip_punctuation = str.maketrans('', '', string.punctuation)
     tokenized_corpus = [
         # A record stored without text comes back as None; index it as empty.
         (doc or "").lower().translate(strip_punctuation).split()
         for doc in documents
     ]
     bm25_index = BM25Okapi(tokenized_corpus)

     # Drop entries from any previous build so positions and IDs cannot go stale.
     doc_id_map.clear()
     all_metadatas.clear()
     for idx, (doc_id, doc_text, meta) in enumerate(zip(ids, documents, metadatas)):
         doc_id_map[idx] = doc_id
         all_metadatas[doc_id] = {"document": doc_text, "meta": meta if meta else {}}
     print("✅ Hybrid Index Ready.")
 build_indices_from_db()
+# --- 4. NEW: WEIGHTED FUSION LOGIC ---
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*.

    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    large-magnitude inputs saturate to 0.0 or 1.0 instead of overflowing
    inside ``exp`` and emitting a NumPy RuntimeWarning.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, or an array of the same shape as *x*.
    """
    return np.exp(-np.logaddexp(0.0, -np.asarray(x, dtype=float)))
def weighted_score_fusion(vector_results, vector_scores, bm25_results, bm25_scores, alpha=0.65):
    """Fuse vector-search and BM25 results into one ranked list of document IDs.

    Each document's fused score is::

        alpha * vector_score + (1 - alpha) * bm25_score

    where both components are first brought into the range [0, 1]. A document
    missing from one of the two result lists contributes 0.0 for that side.

    Args:
        vector_results: Document IDs returned by the vector search.
        vector_scores: Similarity scores aligned with ``vector_results``
            (higher is better). Values outside [0, 1] are clamped.
        bm25_results: Document IDs returned by the BM25 search.
        bm25_scores: Raw BM25 scores aligned with ``bm25_results``. They are
            min-max normalised over this list, so the weakest listed hit
            scores 0.0; if all scores are equal, each gets 1.0.
        alpha: Weight of the vector component; BM25 receives ``1 - alpha``.

    Returns:
        Every ID found in either list, highest fused score first. Ties keep
        first-seen order (vector results, then BM25 results), so the output
        is deterministic.
    """
    vector_results = list(vector_results)
    bm25_results = list(bm25_results)
    bm25_scores = list(bm25_scores)

    # BM25 scores are unbounded, so rescale them to [0, 1] with min-max.
    if bm25_scores:
        low, high = min(bm25_scores), max(bm25_scores)
        spread = high - low
        if spread == 0:
            norm_bm25 = [1.0] * len(bm25_scores)
        else:
            norm_bm25 = [(s - low) / spread for s in bm25_scores]
    else:
        norm_bm25 = []

    # Clamp similarities into [0, 1]: a value derived as "1 - distance" can
    # fall outside that range and would otherwise outweigh the BM25 side.
    vec_map = {
        doc_id: min(1.0, max(0.0, score))
        for doc_id, score in zip(vector_results, vector_scores)
    }
    bm25_map = dict(zip(bm25_results, norm_bm25))

    # dict.fromkeys gives an ordered, de-duplicated union. A plain set would
    # make the order of tied documents vary from one process to the next.
    fused_scores = {
        doc_id: (alpha * vec_map.get(doc_id, 0.0))
        + ((1.0 - alpha) * bm25_map.get(doc_id, 0.0))
        for doc_id in dict.fromkeys(vector_results + bm25_results)
    }

    # sorted() is stable, so equal scores stay in first-seen order.
    return sorted(fused_scores, key=fused_scores.get, reverse=True)
+def granular_search(query: str, initial_k: int = 15, final_k: int = 3, alpha: float = 0.65):
     try:
+        # A. Vector Search (Get Scores too)
+        # include=['documents', 'distances'] tells Chroma to return scores
+        vec_res = collection.query(query_texts=[query], n_results=initial_k, include=['documents', 'distances'])
         vector_ids = vec_res['ids'][0] if vec_res['ids'] else []
+        # Chroma returns Distances (Lower is better for L2/Cosine Distance)
+        # But BGE-M3 is usually Cosine Similarity.
+        # If score is Distance: Sim = 1 - Distance
+        vector_dists = vec_res['distances'][0] if vec_res['distances'] else []
+        vector_scores = [1 - d for d in vector_dists] # Convert distance to similarity
+        # B. BM25 Search (Get Scores too)
         bm25_ids = []
+        bm25_scores = []
         if bm25_index:
             tokenized = query.lower().translate(str.maketrans('', '', string.punctuation)).split()
+            # Get all scores
+            all_scores = bm25_index.get_scores(tokenized)
+            # Sort top K
+            top_indices = all_scores.argsort()[::-1][:initial_k]
+            for i in top_indices:
+                score = all_scores[i]
+                if score > 0:
+                    bm25_ids.append(doc_id_map[i])
+                    bm25_scores.append(score)
+        # C. Weighted Fusion (USING ALPHA)
+        candidates_ids = weighted_score_fusion(
+            vector_ids, vector_scores,
+            bm25_ids, bm25_scores,
+            alpha=alpha
+        )[:initial_k] # Keep top K after fusion
         if not candidates_ids:
             return {"data": [], "message": "No results found"}
                 metas.append(item['meta'])
         # E. Rerank
+        if not docs: return {"data": []}
         pairs = [[query, doc] for doc in docs]
         scores = reranker.predict(pairs)
 # Gradio front end that exposes granular_search as a JSON-returning endpoint.
 # The inputs are passed to the function positionally, in this order:
 # query, initial_k, final_k, alpha. The hidden defaults here (5 and 1) differ
 # from the function's own defaults (15 and 3) and are what UI callers get.
 demo = gr.Interface(
     fn=granular_search,
     inputs=[
         gr.Textbox(label="Query", placeholder="Search..."),
         # precision=0 makes Gradio deliver ints; the K values are used as
         # slice bounds, which reject floats such as 5.0.
         gr.Number(value=5, label="Initial K", visible=False, precision=0),
         gr.Number(value=1, label="Final K", visible=False, precision=0),
         gr.Number(value=0.65, label="Alpha (Vector Weight)", visible=False),  # Expose Alpha
     ],
     outputs=gr.JSON(label="Results"),
     title="Granular Search API (Weighted)",
     flagging_mode="never"
 )