Update build_rag.py

build_rag.py  (+72 -35)  CHANGED
(Old version, removed in this commit: the previous build_rag.py gave process_bible_json_files a default of chunk_size: int = 3, carried inline comments such as "# Navigate the nested JSON structure", "# --- Chunking Logic ---", "# Group by version, book, and chapter to ensure chunks don't cross boundaries", and "# Create a clean reference string", and its "__main__" block stopped after building the chunk DataFrame, ending with the note "# ...followed by Gemma embedding and FAISS indexing." The update adds those embedding, indexing, and upload steps. The new file is shown below; lines prefixed with "+" are additions, and unchanged context collapsed by the diff viewer is shown as "...".)
  import json
  import os
  import pandas as pd
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from huggingface_hub import create_repo
+ import sys

+ # --- Configuration ---
+ # The name of the Gemma model for creating embeddings.
+ # Make sure this matches the model used in app.py
+ MODEL_NAME = "google/gemma-2b"
+
+ # The name for the new dataset repository on the Hugging Face Hub.
+ # This MUST match the DATASET_REPO in app.py
+ DATASET_REPO = "broadfield-dev/bible-rag-dataset-gemma"
+
+ # The directory containing the Bible JSON files
+ JSON_DIRECTORY = 'bible_json'
+ CHUNK_SIZE = 3  # Number of verses to group into a single text chunk
+
+ # This dictionary maps the numeric book ID from the JSON to a human-readable name.
  BOOK_ID_TO_NAME = {
      1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
      6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
      ...
      62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
  }

+ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
      """
      Reads all Bible JSON files from a directory, processes them, chunks them,
      and returns a single unified Pandas DataFrame.
      ...
      all_verses = []

      print(f"Reading JSON files from '{directory_path}'...")
+     if not os.path.exists(directory_path) or not os.listdir(directory_path):
+         print(f"Error: Directory '{directory_path}' is empty or does not exist.", file=sys.stderr)
+         print("Please add your Bible JSON files to this directory.", file=sys.stderr)
+         sys.exit(1)
+
      for filename in os.listdir(directory_path):
          if filename.endswith('.json'):
              version_name = filename.split('.')[0].upper()
              ...
              with open(file_path, 'r') as f:
                  data = json.load(f)

              rows = data.get("resultset", {}).get("row", [])
              for row in rows:
                  field = row.get("field", [])
                  if len(field) == 5:
                      _id, book_id, chapter, verse, text = field
                      book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
                      all_verses.append({
                          'version': version_name,
                          'book_id': book_id,
                          ...
                      })

      if not all_verses:
+         print("Error: No verses were processed. Check the format of your JSON files.", file=sys.stderr)
+         sys.exit(1)

+     print(f"Successfully parsed {len(all_verses)} verses.")
      df = pd.DataFrame(all_verses)

      print(f"Chunking verses into groups of {chunk_size}...")
      all_chunks = []
      for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
          group = group.sort_values('verse').reset_index(drop=True)
          for i in range(0, len(group), chunk_size):
              chunk_df = group.iloc[i:i+chunk_size]
              combined_text = " ".join(chunk_df['text'])
              start_verse = chunk_df.iloc[0]['verse']
              end_verse = chunk_df.iloc[-1]['verse']
              if start_verse == end_verse:
                  reference = f"{book_name} {chapter}:{start_verse}"
              else:
                  reference = f"{book_name} {chapter}:{start_verse}-{end_verse}"
              all_chunks.append({
                  'text': combined_text,
                  'reference': reference,
                  ...

      final_df = pd.DataFrame(all_chunks)
      print(f"Created {len(final_df)} text chunks.")
      return final_df

  if __name__ == "__main__":
+     print("--- Starting RAG Dataset Build Process ---")

+     # 1. Process local JSON files
+     print(f"\n--- Step 1: Processing JSON files from '{JSON_DIRECTORY}' ---")
+     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

+     # 2. Convert to Hugging Face Dataset
+     print("\n--- Step 2: Converting to Hugging Face Dataset ---")
+     hf_dataset = Dataset.from_pandas(bible_chunks_df)
+     print(hf_dataset)
+
+     # 3. Load embedding model
+     print(f"\n--- Step 3: Loading embedding model: '{MODEL_NAME}' ---")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
+     print("Model loaded successfully.")

+     # 4. Generate embeddings
+     print("\n--- Step 4: Generating embeddings (this may take a while) ---")
+     def get_embeddings(batch):
+         inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt", max_length=512).to(model.device)
+         with torch.no_grad():
+             outputs = model(**inputs)
+         embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
+         return {'embeddings': embeddings}
+
+     hf_dataset_with_embeddings = hf_dataset.map(get_embeddings, batched=True, batch_size=16)
+     print("Embeddings generated successfully.")
+
+     # 5. Add FAISS index
+     print("\n--- Step 5: Creating and adding FAISS index ---")
+     hf_dataset_with_embeddings.add_faiss_index(column="embeddings")
+     print("FAISS index added successfully.")
+
+     # 6. Push to Hub
+     print(f"\n--- Step 6: Pushing dataset to Hub: '{DATASET_REPO}' ---")
+     try:
+         create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
+         print(f"Repository '{DATASET_REPO}' created or already exists.")
+
+         hf_dataset_with_embeddings.push_to_hub(DATASET_REPO)
+         print("Dataset pushed successfully!")
+     except Exception as e:
+         print(f"An error occurred while pushing to the Hub: {e}", file=sys.stderr)
+         sys.exit(1)

+     print("\n--- RAG Build Process Complete! ---")
+     print(f"The dataset is now available at: https://huggingface.co/datasets/{DATASET_REPO}")
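For context, here is a minimal sketch (not part of this commit) of how the app.py referenced in the configuration comments might load and query the pushed dataset. The helper name embed_query, the example query text, and k=5 are illustrative assumptions; the only requirement taken from the script above is that queries be embedded with the same mean-pooled last_hidden_state as get_embeddings.

# Illustrative sketch only -- assumes the column layout ('text', 'reference', 'embeddings')
# produced by build_rag.py above.
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch

ds = load_dataset("broadfield-dev/bible-rag-dataset-gemma", split="train")
# Rebuild the FAISS index locally: push_to_hub uploads the rows themselves,
# so the consumer typically has to attach an index again before searching.
ds.add_faiss_index(column="embeddings")

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModel.from_pretrained("google/gemma-2b", device_map="auto")

def embed_query(text: str):
    # Same mean-pooled last_hidden_state as get_embeddings() in build_rag.py.
    inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]

scores, examples = ds.get_nearest_examples("embeddings", embed_query("love your neighbor"), k=5)
for reference, text in zip(examples["reference"], examples["text"]):
    print(reference, "-", text)

Note that google/gemma-2b is a gated model and push_to_hub writes to the Hub, so both build_rag.py and any consumer like the sketch above need an authenticated Hugging Face token with access to the model.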