Spaces:

broadfield-dev
/

bible-app

Paused

App Files Files Community

broadfield-dev committed on Sep 13, 2025

Commit

08c0e4e

verified ·

1 Parent(s): 9439f92

Update build_rag.py

Browse files

Files changed (1) hide show

build_rag.py +60 -79

build_rag.py CHANGED Viewed

@@ -1,26 +1,25 @@
 import json
 import os
 import pandas as pd
-from datasets import Dataset
-from transformers import AutoTokenizer, AutoModel
 import torch
-from huggingface_hub import create_repo
 import sys
 # --- Configuration ---
-# The name of the Gemma model for creating embeddings.
-# Make sure this matches the model used in app.py
-MODEL_NAME = "google/gemma-2b"
-# The name for the new dataset repository on the Hugging Face Hub.
-# This MUST match the DATASET_REPO in app.py
-DATASET_REPO = "broadfield-dev/bible-rag-dataset-gemma"
-# The directory containing the Bible JSON files
 JSON_DIRECTORY = 'bible_json'
-CHUNK_SIZE = 3 # Number of verses to group into a single text chunk
-# This dictionary maps the numeric book ID from the JSON to a human-readable name.
 BOOK_ID_TO_NAME = {
     1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
     6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
@@ -39,117 +38,99 @@ BOOK_ID_TO_NAME = {
 }
 def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
     """Read Bible JSON files, chunk their verses, and return one DataFrame.

     Every ``*.json`` file in ``directory_path`` is loaded. The part of the file
     name before the first dot, upper-cased, is used as the version label.
     Rows are taken from ``data["resultset"]["row"]``; only rows whose
     ``"field"`` list has exactly five items
     ``(id, book_id, chapter, verse, text)`` are kept.

     Args:
         directory_path: Directory that holds the Bible JSON files.
         chunk_size: Number of consecutive verses merged into one text chunk.
             Must be at least 1.

     Returns:
         A DataFrame with the columns ``text``, ``reference`` and ``version``,
         one row per chunk. Chunks never span more than one chapter.

     Raises:
         ValueError: If ``chunk_size`` is smaller than 1.
         SystemExit: With exit code 1 if the directory is missing or empty, or
             if no verse could be parsed from the files.
     """
     # Fail early: a step of 0 makes range() raise an unrelated-looking error and
     # a negative step would silently produce no chunks at all.
     if chunk_size < 1:
         raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

     all_verses = []
     print(f"Reading JSON files from '{directory_path}'...")
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
         print(f"Error: Directory '{directory_path}' is empty or does not exist.", file=sys.stderr)
         print("Please add your Bible JSON files to this directory.", file=sys.stderr)
         sys.exit(1)

     # Sorted so that the parse order does not depend on the file system.
     for filename in sorted(os.listdir(directory_path)):
         if not filename.endswith('.json'):
             continue
         version_name = filename.split('.')[0].upper()
         file_path = os.path.join(directory_path, filename)
         # Explicit UTF-8: without it open() falls back to the platform's
         # locale encoding, which can mis-decode non-ASCII verse text.
         with open(file_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         rows = data.get("resultset", {}).get("row", [])
         for row in rows:
             field = row.get("field", [])
             if len(field) != 5:
                 continue  # Not an (id, book, chapter, verse, text) row.
             _id, book_id, chapter, verse, text = field
             book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
             all_verses.append({
                 'version': version_name,
                 'book_id': book_id,
                 'book_name': book_name,
                 'chapter': chapter,
                 'verse': verse,
                 'text': text.strip()
             })

     if not all_verses:
         print("Error: No verses were processed. Check the format of your JSON files.", file=sys.stderr)
         sys.exit(1)
     print(f"Successfully parsed {len(all_verses)} verses.")

     df = pd.DataFrame(all_verses)
     print(f"Chunking verses into groups of {chunk_size}...")
     all_chunks = []
     # Grouping by chapter guarantees a chunk never crosses a chapter boundary.
     for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
         group = group.sort_values('verse').reset_index(drop=True)
         for i in range(0, len(group), chunk_size):
             chunk_df = group.iloc[i:i+chunk_size]
             combined_text = " ".join(chunk_df['text'])
             start_verse = chunk_df.iloc[0]['verse']
             end_verse = chunk_df.iloc[-1]['verse']
             # A one-verse chunk is cited as "Book C:V", otherwise "Book C:V1-V2".
             if start_verse == end_verse:
                 reference = f"{book_name} {chapter}:{start_verse}"
             else:
                 reference = f"{book_name} {chapter}:{start_verse}-{end_verse}"
             all_chunks.append({
                 'text': combined_text,
                 'reference': reference,
                 'version': version,
             })

     final_df = pd.DataFrame(all_chunks)
     print(f"Created {len(final_df)} text chunks.")
     return final_df
 def _masked_mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
     """Average the token vectors of each sequence, ignoring padded positions.

     Args:
         last_hidden_state: Tensor indexed as (batch, token, hidden).
         attention_mask: Tensor indexed as (batch, token); 1 marks a real token
             and 0 marks padding.

     Returns:
         One pooled vector per sequence, indexed as (batch, hidden). A sequence
         whose mask is all zeros yields a zero vector instead of NaN.
     """
     mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
     summed = (last_hidden_state * mask).sum(dim=1)
     # clamp avoids a division by zero for a fully masked row.
     counts = mask.sum(dim=1).clamp(min=1)
     return summed / counts


 if __name__ == "__main__":
     print("--- Starting RAG Dataset Build Process ---")

     # 1. Process local JSON files
     print(f"\n--- Step 1: Processing JSON files from '{JSON_DIRECTORY}' ---")
     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

     # 2. Convert to Hugging Face Dataset
     print("\n--- Step 2: Converting to Hugging Face Dataset ---")
     hf_dataset = Dataset.from_pandas(bible_chunks_df)
     print(hf_dataset)

     # 3. Load embedding model
     print(f"\n--- Step 3: Loading embedding model: '{MODEL_NAME}' ---")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
     print("Model loaded successfully.")

     # 4. Generate embeddings
     print("\n--- Step 4: Generating embeddings (this may take a while) ---")

     def get_embeddings(batch):
         """Embed one batch of chunk texts; used as the ``Dataset.map`` callback."""
         inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt", max_length=512).to(model.device)
         with torch.no_grad():
             outputs = model(**inputs)
         # The batch is padded, so a plain mean over the token axis would mix
         # pad-token states into every text shorter than the longest one.
         pooled = _masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
         embeddings = pooled.cpu().numpy()
         return {'embeddings': embeddings}

     hf_dataset_with_embeddings = hf_dataset.map(get_embeddings, batched=True, batch_size=16)
     print("Embeddings generated successfully.")

     # 5. Add FAISS index
     print("\n--- Step 5: Creating and adding FAISS index ---")
     hf_dataset_with_embeddings.add_faiss_index(column="embeddings")
     print("FAISS index added successfully.")

     # 6. Push to Hub
     print(f"\n--- Step 6: Pushing dataset to Hub: '{DATASET_REPO}' ---")
     try:
         create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
         print(f"Repository '{DATASET_REPO}' created or already exists.")
         hf_dataset_with_embeddings.push_to_hub(DATASET_REPO)
         print("Dataset pushed successfully!")
     except Exception as e:
         # Top-level boundary of the script: report the failure and exit non-zero.
         print(f"An error occurred while pushing to the Hub: {e}", file=sys.stderr)
         sys.exit(1)

     print("\n--- RAG Build Process Complete! ---")
     print(f"The dataset is now available at: https://huggingface.co/datasets/{DATASET_REPO}")

 import json
 import os
 import pandas as pd
 import torch
+from transformers import AutoTokenizer, AutoModel
+import chromadb
 import sys
+from tqdm import tqdm
+from huggingface_hub import HfApi, create_repo
 # --- Configuration ---
+# Must match the settings in app.py
+CHROMA_PATH = "chroma_db"
+COLLECTION_NAME = "bible_verses"
+MODEL_NAME = "google/embeddinggemma-300m"
+DATASET_REPO = "broadfield-dev/bible-chromadb-gemma" # The HF Dataset to store the DB
 JSON_DIRECTORY = 'bible_json'
+CHUNK_SIZE = 3
+EMBEDDING_BATCH_SIZE = 16
+# --- Book ID Mapping (Unchanged) ---
 BOOK_ID_TO_NAME = {
     1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
     6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
 }
 def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
     """Read Bible JSON files, chunk their verses, and return one DataFrame.

     Every ``*.json`` file in ``directory_path`` is loaded. The part of the file
     name before the first dot, upper-cased, is used as the version label.
     Rows are taken from ``data["resultset"]["row"]``; only rows whose
     ``"field"`` list has exactly five items
     ``(id, book_id, chapter, verse, text)`` are kept.

     Args:
         directory_path: Directory that holds the Bible JSON files.
         chunk_size: Number of consecutive verses merged into one text chunk.
             Must be at least 1.

     Returns:
         A DataFrame with the columns ``text``, ``reference`` and ``version``,
         one row per chunk. Chunks never span more than one chapter.

     Raises:
         ValueError: If ``chunk_size`` is smaller than 1.
         SystemExit: With exit code 1 if the directory is missing or empty, or
             if no verse could be parsed from the files.
     """
     # Fail early: a step of 0 makes range() raise an unrelated-looking error and
     # a negative step would silently produce no chunks at all.
     if chunk_size < 1:
         raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

     all_verses = []
     print(f"Reading JSON files from '{directory_path}'...")
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
         print(f"Error: Directory '{directory_path}' is empty or does not exist.", file=sys.stderr)
         sys.exit(1)

     # Sorted so that the parse order does not depend on the file system.
     for filename in sorted(os.listdir(directory_path)):
         if not filename.endswith('.json'):
             continue
         version_name = filename.split('.')[0].upper()
         file_path = os.path.join(directory_path, filename)
         # Explicit UTF-8: without it open() falls back to the platform's
         # locale encoding, which can mis-decode non-ASCII verse text.
         with open(file_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         rows = data.get("resultset", {}).get("row", [])
         for row in rows:
             field = row.get("field", [])
             if len(field) != 5:
                 continue  # Not an (id, book, chapter, verse, text) row.
             _id, book_id, chapter, verse, text = field
             book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
             all_verses.append({
                 'version': version_name,
                 'book_name': book_name,
                 'chapter': chapter,
                 'verse': verse,
                 'text': text.strip(),
             })

     if not all_verses:
         print("Error: No verses were processed.", file=sys.stderr)
         sys.exit(1)

     df = pd.DataFrame(all_verses)
     all_chunks = []
     # Grouping by chapter guarantees a chunk never crosses a chapter boundary.
     for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
         group = group.sort_values('verse').reset_index(drop=True)
         for i in range(0, len(group), chunk_size):
             chunk_df = group.iloc[i:i+chunk_size]
             combined_text = " ".join(chunk_df['text'])
             start_verse = chunk_df.iloc[0]['verse']
             end_verse = chunk_df.iloc[-1]['verse']
             # A one-verse chunk is cited as "Book C:V", otherwise "Book C:V1-V2".
             if start_verse == end_verse:
                 reference = f"{book_name} {chapter}:{start_verse}"
             else:
                 reference = f"{book_name} {chapter}:{start_verse}-{end_verse}"
             all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})

     final_df = pd.DataFrame(all_chunks)
     print(f"Created {len(final_df)} text chunks.")
     return final_df
 def _masked_mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
     """Average the token vectors of each sequence, ignoring padded positions.

     Args:
         last_hidden_state: Tensor indexed as (batch, token, hidden).
         attention_mask: Tensor indexed as (batch, token); 1 marks a real token
             and 0 marks padding.

     Returns:
         One pooled vector per sequence, indexed as (batch, hidden). A sequence
         whose mask is all zeros yields a zero vector instead of NaN.
     """
     mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
     summed = (last_hidden_state * mask).sum(dim=1)
     # clamp avoids a division by zero for a fully masked row.
     counts = mask.sum(dim=1).clamp(min=1)
     return summed / counts


 if __name__ == "__main__":
     print("--- Starting Vector Database Build Process ---")

     # 1. Process JSON
     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

     # 2. Setup local ChromaDB (always rebuilt from scratch)
     print(f"\n--- Setting up local ChromaDB in '{CHROMA_PATH}' ---")
     if os.path.exists(CHROMA_PATH):
         import shutil
         print("Deleting old local database directory...")
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
     collection = client.create_collection(name=COLLECTION_NAME)

     # 3. Load embedding model
     print(f"\n--- Loading embedding model: '{MODEL_NAME}' ---")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")

     # 4. Generate embeddings and populate DB
     print("\n--- Generating embeddings and populating database ---")
     total_chunks = len(bible_chunks_df)
     for i in tqdm(range(0, total_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
         inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
         with torch.no_grad():
             outputs = model(**inputs)
         # The batch is padded, so a plain mean over the token axis would mix
         # pad-token states into every text shorter than the longest one.
         pooled = _masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
         # Cast to float32 so the values convert to plain Python floats even if
         # the model was loaded in a reduced-precision dtype.
         embeddings = pooled.float().cpu().tolist()
         collection.add(
             # IDs are the chunk's row position in bible_chunks_df, as strings.
             ids=[str(j) for j in range(i, i + len(batch_df))],
             embeddings=embeddings,
             documents=texts,
             metadatas=batch_df[['reference', 'version']].to_dict('records')
         )
     print(f"Successfully added {total_chunks} documents to the local ChromaDB.")

     # 5. Upload the database directory to Hugging Face Hub
     print(f"\n--- Pushing database to Hugging Face Hub: '{DATASET_REPO}' ---")
     try:
         # Ensure the repo exists
         create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
         # Upload the entire folder
         api = HfApi()
         api.upload_folder(
             folder_path=CHROMA_PATH,
             repo_id=DATASET_REPO,
             repo_type="dataset",
         )
         print("Database pushed successfully!")
     except Exception as e:
         # Top-level boundary of the script: report the failure and exit non-zero.
         print(f"An error occurred while pushing to the Hub: {e}", file=sys.stderr)
         print("Please ensure your HF_TOKEN secret has WRITE permissions.", file=sys.stderr)
         sys.exit(1)

     print("\n--- Build Process Complete! ---")