broadfield-dev committed on
Commit
2e63d3b
·
verified ·
1 Parent(s): 5ef463a

Update build_rag.py

Browse files
Files changed (1) hide show
  1. build_rag.py +18 -18
build_rag.py CHANGED
@@ -1,8 +1,10 @@
 
 
1
  import json
2
  import os
3
  import pandas as pd
4
  import torch
5
- import torch.nn.functional as F # Import the functional module
6
  from transformers import AutoTokenizer, AutoModel
7
  import chromadb
8
  import sys
@@ -13,14 +15,14 @@ import traceback
13
  # --- Configuration ---
14
  CHROMA_PATH = "chroma_db"
15
  COLLECTION_NAME = "bible_verses"
16
- # *** CHANGE 1: UPDATE THE MODEL NAME ***
17
- MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
18
- # *** CHANGE 2: UPDATE THE DATASET REPO NAME TO AVOID CONFUSION ***
19
- DATASET_REPO = "broadfield-dev/bible-chromadb-mpnet"
20
  STATUS_FILE = "build_status.log"
21
  JSON_DIRECTORY = 'bible_json'
22
  CHUNK_SIZE = 3
23
- EMBEDDING_BATCH_SIZE = 16 # Adjust based on available VRAM
24
  # (BOOK_ID_TO_NAME dictionary remains the same)
25
  BOOK_ID_TO_NAME = {
26
  1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
@@ -44,13 +46,13 @@ def update_status(message):
44
  with open(STATUS_FILE, "w") as f:
45
  f.write(message)
46
 
47
- # Mean Pooling Function - Take attention mask into account for correct averaging
48
  def mean_pooling(model_output, attention_mask):
49
- token_embeddings = model_output[0] #First element of model_output contains all token embeddings
50
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
51
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
52
 
53
- def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
54
  # (This function is unchanged)
55
  all_verses = []
56
  if not os.path.exists(directory_path) or not os.listdir(directory_path):
@@ -92,36 +94,35 @@ def main():
92
 
93
  collection = client.create_collection(
94
  name=COLLECTION_NAME,
95
- metadata={"hnsw:space": "cosine"} # Use cosine distance
96
  )
97
 
98
  update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
99
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
100
  model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
101
 
102
- update_status("IN_PROGRESS: Step 4/5 - Generating and NORMALIZING embeddings...")
103
  for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
104
  batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
105
  texts = batch_df['text'].tolist()
106
 
107
- # *** CHANGE 3: USE THE CORRECT POOLING STRATEGY FOR SBERT MODELS ***
108
  encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
109
  with torch.no_grad():
110
  model_output = model(**encoded_input)
111
 
112
- # Perform pooling and normalization
113
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
114
- normalized_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
 
115
 
116
  collection.add(
117
  ids=[str(j) for j in range(i, i + len(batch_df))],
118
- embeddings=normalized_embeddings.cpu().tolist(), # Convert to list
119
  documents=texts,
120
  metadatas=batch_df[['reference', 'version']].to_dict('records')
121
  )
122
 
123
  update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
124
- # (This part is unchanged)
125
  create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
126
  api = HfApi()
127
  api.upload_folder(
@@ -136,7 +137,6 @@ if __name__ == "__main__":
136
  try:
137
  main()
138
  except Exception as e:
139
- # (Error handling is unchanged)
140
  error_message = traceback.format_exc()
141
  if "401" in str(e) or "Unauthorized" in str(e):
142
  update_status("FAILED: Hugging Face authentication error. Ensure your HF_TOKEN secret has WRITE permissions.")
 
1
+ # build_rag.py (Updated for a model with pre-normalized embeddings)
2
+
3
  import json
4
  import os
5
  import pandas as pd
6
  import torch
7
+ import torch.nn.functional as F
8
  from transformers import AutoTokenizer, AutoModel
9
  import chromadb
10
  import sys
 
15
  # --- Configuration ---
16
  CHROMA_PATH = "chroma_db"
17
  COLLECTION_NAME = "bible_verses"
18
+ # *** CHANGE 1: USE A MODEL WITH NORMALIZED EMBEDDINGS ***
19
+ MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
20
+ # *** CHANGE 2: USE A NEW REPO FOR THE NEW DATABASE ***
21
+ DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet"
22
  STATUS_FILE = "build_status.log"
23
  JSON_DIRECTORY = 'bible_json'
24
  CHUNK_SIZE = 3
25
+ EMBEDDING_BATCH_SIZE = 16
26
  # (BOOK_ID_TO_NAME dictionary remains the same)
27
  BOOK_ID_TO_NAME = {
28
  1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
 
46
  with open(STATUS_FILE, "w") as f:
47
  f.write(message)
48
 
49
# Mean pooling — collapse per-token embeddings into one sentence vector,
# ignoring padding positions via the attention mask (crucial for
# sentence-transformer checkpoints loaded through plain AutoModel).
def mean_pooling(model_output, attention_mask):
    """Return masked mean of token embeddings.

    model_output: HF model output; element 0 is the last hidden state
        of shape (batch, seq_len, hidden_dim).
    attention_mask: (batch, seq_len) tensor; 1 for real tokens, 0 for padding.
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so padded tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, 1)
    # Clamp avoids division by zero for an all-padding row.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
54
 
55
+ def process_bible_json_files(directory_path: str, chunk_size: int):
56
  # (This function is unchanged)
57
  all_verses = []
58
  if not os.path.exists(directory_path) or not os.listdir(directory_path):
 
94
 
95
  collection = client.create_collection(
96
  name=COLLECTION_NAME,
97
+ metadata={"hnsw:space": "cosine"}
98
  )
99
 
100
  update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
101
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
102
  model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
103
 
104
+ update_status("IN_PROGRESS: Step 4/5 - Generating embeddings (no normalization needed)...")
105
  for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
106
  batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
107
  texts = batch_df['text'].tolist()
108
 
 
109
  encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
110
  with torch.no_grad():
111
  model_output = model(**encoded_input)
112
 
113
+ embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
114
+
115
+ # *** REMOVED: NO LONGER NEED TO NORMALIZE THE EMBEDDINGS ***
116
+ # embeddings = F.normalize(embeddings, p=2, dim=1)
117
 
118
  collection.add(
119
  ids=[str(j) for j in range(i, i + len(batch_df))],
120
+ embeddings=embeddings.cpu().tolist(),
121
  documents=texts,
122
  metadatas=batch_df[['reference', 'version']].to_dict('records')
123
  )
124
 
125
  update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
 
126
  create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
127
  api = HfApi()
128
  api.upload_folder(
 
137
  try:
138
  main()
139
  except Exception as e:
 
140
  error_message = traceback.format_exc()
141
  if "401" in str(e) or "Unauthorized" in str(e):
142
  update_status("FAILED: Hugging Face authentication error. Ensure your HF_TOKEN secret has WRITE permissions.")