Spaces:

broadfield-dev
/

bible-app

Paused

App Files Files Community

broadfield-dev committed on Sep 13, 2025

Commit

5a84a4e

verified ·

1 Parent(s): 8a74e77

Create build_rag.py

Browse files

Files changed (1) hide show

build_rag.py +118 -0

build_rag.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import json
+import os
+import pandas as pd
# Maps the numeric book ID found in the source JSON to a human-readable name.
# Covers the standard 66 books of the Protestant Bible canon, keyed in
# canonical order (1 = Genesis ... 66 = Revelation).
BOOK_ID_TO_NAME = {
    1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
    6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
    11: "1 Kings", 12: "2 Kings", 13: "1 Chronicles", 14: "2 Chronicles",
    15: "Ezra", 16: "Nehemiah", 17: "Esther", 18: "Job", 19: "Psalms",
    20: "Proverbs", 21: "Ecclesiastes", 22: "Song of Solomon", 23: "Isaiah",
    24: "Jeremiah", 25: "Lamentations", 26: "Ezekiel", 27: "Daniel", 28: "Hosea",
    29: "Joel", 30: "Amos", 31: "Obadiah", 32: "Jonah", 33: "Micah", 34: "Nahum",
    35: "Habakkuk", 36: "Zephaniah", 37: "Haggai", 38: "Zechariah", 39: "Malachi",
    40: "Matthew", 41: "Mark", 42: "Luke", 43: "John", 44: "Acts",
    45: "Romans", 46: "1 Corinthians", 47: "2 Corinthians", 48: "Galatians",
    49: "Ephesians", 50: "Philippians", 51: "Colossians", 52: "1 Thessalonians",
    53: "2 Thessalonians", 54: "1 Timothy", 55: "2 Timothy", 56: "Titus",
    57: "Philemon", 58: "Hebrews", 59: "James", 60: "1 Peter", 61: "2 Peter",
    62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
}


def process_bible_json_files(directory_path: str, chunk_size: int = 3) -> pd.DataFrame:
    """Parse every Bible JSON file in a directory and chunk the verses.

    Each ``*.json`` file is expected to look like
    ``{"resultset": {"row": [{"field": [id, book_id, chapter, verse, text]}]}}``.
    Rows whose ``field`` list does not have exactly five items are skipped.
    The version label is the part of the file name before the first dot,
    upper-cased.

    Verses are grouped per (version, book, chapter) so a chunk never crosses
    a chapter boundary, sorted by verse, and joined ``chunk_size`` at a time.

    Args:
        directory_path: Directory containing the JSON files.
        chunk_size: Maximum number of consecutive verses per chunk.

    Returns:
        A DataFrame with the columns ``text``, ``reference`` and ``version``,
        one row per chunk, ordered by version, book ID and chapter.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if no verse could
            be parsed from the directory.
    """
    # Fail fast: a zero step would only blow up later inside range(), and a
    # negative one would silently produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}.")

    all_verses = []
    print(f"Reading JSON files from '{directory_path}'...")

    # os.listdir() returns entries in arbitrary order; sort for reproducible
    # output and remember the list so the summary counts JSON files only.
    json_filenames = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )

    for filename in json_filenames:
        version_name = filename.split('.')[0].upper()
        file_path = os.path.join(directory_path, filename)
        # Decode explicitly instead of relying on the platform locale;
        # 'utf-8-sig' also tolerates a leading byte-order mark.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)

        # Navigate the nested JSON structure
        rows = data.get("resultset", {}).get("row", [])
        for row in rows:
            field = row.get("field", [])
            if len(field) != 5:
                continue
            _id, book_id, chapter, verse, text = field
            all_verses.append({
                'version': version_name,
                'book_id': book_id,
                'book_name': BOOK_ID_TO_NAME.get(book_id, "Unknown Book"),
                'chapter': chapter,
                'verse': verse,
                'text': text.strip()
            })

    if not all_verses:
        raise ValueError("No verses were processed. Check the directory path and JSON structure.")
    print(f"Successfully parsed {len(all_verses)} verses from {len(json_filenames)} files.")

    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame(all_verses)

    # --- Chunking Logic ---
    print(f"Chunking verses into groups of {chunk_size}...")
    all_chunks = []
    # Group on book_id rather than book_name: every unmapped ID shares the
    # "Unknown Book" label, so grouping by name would merge distinct books,
    # and it would also emit books alphabetically instead of canonically.
    for (version, _book_id, chapter), group in df.groupby(['version', 'book_id', 'chapter']):
        group = group.sort_values('verse', kind='stable')
        book_name = group['book_name'].iloc[0]
        verses = group['verse'].tolist()
        texts = group['text'].tolist()

        for start in range(0, len(verses), chunk_size):
            chunk_verses = verses[start:start + chunk_size]
            start_verse, end_verse = chunk_verses[0], chunk_verses[-1]
            # Create a clean reference string
            if start_verse == end_verse:
                reference = f"{book_name} {chapter}:{start_verse}"
            else:
                reference = f"{book_name} {chapter}:{start_verse}-{end_verse}"
            all_chunks.append({
                'text': " ".join(texts[start:start + chunk_size]),
                'reference': reference,
                'version': version,
            })

    # Name the columns explicitly so the schema holds even with zero chunks.
    final_df = pd.DataFrame(all_chunks, columns=['text', 'reference', 'version'])
    print(f"Created {len(final_df)} text chunks.")
    return final_df
# --- Main execution ---
# Script entry point: parse the JSON files, chunk them, and print a summary.
if __name__ == "__main__":
    # 1. Set the path to your directory containing the JSON files
    json_directory = 'bible_json'
    # 2. Run the processing and chunking function
    bible_chunks_df = process_bible_json_files(json_directory, chunk_size=3)
    # 3. Display the result
    print("\n--- Processing Complete ---")
    print("DataFrame Info:")
    bible_chunks_df.info()
    print("\n--- Example Chunks ---")
    print(bible_chunks_df.head())
    print("\n")
    # DataFrame.sample() raises ValueError when asked for more rows than the
    # frame holds, so cap the sample size for small inputs.
    print(bible_chunks_df.sample(min(5, len(bible_chunks_df))))
    # This DataFrame is now ready for the next step:
    # `hf_dataset = Dataset.from_pandas(bible_chunks_df)`
    # ...followed by Gemma embedding and FAISS indexing.