broadfield-dev committed on
Commit
5a84a4e
·
verified ·
1 Parent(s): 8a74e77

Create build_rag.py

Browse files
Files changed (1) hide show
  1. build_rag.py +118 -0
build_rag.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pandas as pd
4
+
5
+ # This dictionary maps the numeric book ID from your JSON to a human-readable name.
6
+ # It covers the standard 66 books of the Protestant Bible canon.
7
+ BOOK_ID_TO_NAME = {
8
+ 1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
9
+ 6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
10
+ 11: "1 Kings", 12: "2 Kings", 13: "1 Chronicles", 14: "2 Chronicles",
11
+ 15: "Ezra", 16: "Nehemiah", 17: "Esther", 18: "Job", 19: "Psalms",
12
+ 20: "Proverbs", 21: "Ecclesiastes", 22: "Song of Solomon", 23: "Isaiah",
13
+ 24: "Jeremiah", 25: "Lamentations", 26: "Ezekiel", 27: "Daniel", 28: "Hosea",
14
+ 29: "Joel", 30: "Amos", 31: "Obadiah", 32: "Jonah", 33: "Micah", 34: "Nahum",
15
+ 35: "Habakkuk", 36: "Zephaniah", 37: "Haggai", 38: "Zechariah", 39: "Malachi",
16
+ 40: "Matthew", 41: "Mark", 42: "Luke", 43: "John", 44: "Acts",
17
+ 45: "Romans", 46: "1 Corinthians", 47: "2 Corinthians", 48: "Galatians",
18
+ 49: "Ephesians", 50: "Philippians", 51: "Colossians", 52: "1 Thessalonians",
19
+ 53: "2 Thessalonians", 54: "1 Timothy", 55: "2 Timothy", 56: "Titus",
20
+ 57: "Philemon", 58: "Hebrews", 59: "James", 60: "1 Peter", 61: "2 Peter",
21
+ 62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
22
+ }
23
+
24
+ def process_bible_json_files(directory_path: str, chunk_size: int = 3) -> pd.DataFrame:
25
+ """
26
+ Reads all Bible JSON files from a directory, processes them, chunks them,
27
+ and returns a single unified Pandas DataFrame.
28
+ """
29
+ all_verses = []
30
+
31
+ print(f"Reading JSON files from '{directory_path}'...")
32
+ for filename in os.listdir(directory_path):
33
+ if filename.endswith('.json'):
34
+ version_name = filename.split('.')[0].upper()
35
+ file_path = os.path.join(directory_path, filename)
36
+
37
+ with open(file_path, 'r') as f:
38
+ data = json.load(f)
39
+
40
+ # Navigate the nested JSON structure
41
+ rows = data.get("resultset", {}).get("row", [])
42
+ for row in rows:
43
+ field = row.get("field", [])
44
+ if len(field) == 5:
45
+ _id, book_id, chapter, verse, text = field
46
+
47
+ book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
48
+
49
+ all_verses.append({
50
+ 'version': version_name,
51
+ 'book_id': book_id,
52
+ 'book_name': book_name,
53
+ 'chapter': chapter,
54
+ 'verse': verse,
55
+ 'text': text.strip()
56
+ })
57
+
58
+ if not all_verses:
59
+ raise ValueError("No verses were processed. Check the directory path and JSON structure.")
60
+
61
+ print(f"Successfully parsed {len(all_verses)} verses from {len(os.listdir(directory_path))} files.")
62
+
63
+ # Convert to DataFrame for easier manipulation
64
+ df = pd.DataFrame(all_verses)
65
+
66
+ # --- Chunking Logic ---
67
+ print(f"Chunking verses into groups of {chunk_size}...")
68
+ all_chunks = []
69
+ # Group by version, book, and chapter to ensure chunks don't cross boundaries
70
+ for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
71
+ group = group.sort_values('verse').reset_index(drop=True)
72
+
73
+ for i in range(0, len(group), chunk_size):
74
+ chunk_df = group.iloc[i:i+chunk_size]
75
+
76
+ combined_text = " ".join(chunk_df['text'])
77
+
78
+ start_verse = chunk_df.iloc[0]['verse']
79
+ end_verse = chunk_df.iloc[-1]['verse']
80
+
81
+ # Create a clean reference string
82
+ if start_verse == end_verse:
83
+ reference = f"{book_name} {chapter}:{start_verse}"
84
+ else:
85
+ reference = f"{book_name} {chapter}:{start_verse}-{end_verse}"
86
+
87
+ all_chunks.append({
88
+ 'text': combined_text,
89
+ 'reference': reference,
90
+ 'version': version,
91
+ })
92
+
93
+ final_df = pd.DataFrame(all_chunks)
94
+ print(f"Created {len(final_df)} text chunks.")
95
+
96
+ return final_df
97
+
98
+ # --- Main execution ---
99
+ if __name__ == "__main__":
100
+ # 1. Set the path to your directory containing the JSON files
101
+ json_directory = 'bible_json'
102
+
103
+ # 2. Run the processing and chunking function
104
+ bible_chunks_df = process_bible_json_files(json_directory, chunk_size=3)
105
+
106
+ # 3. Display the result
107
+ print("\n--- Processing Complete ---")
108
+ print("DataFrame Info:")
109
+ bible_chunks_df.info()
110
+
111
+ print("\n--- Example Chunks ---")
112
+ print(bible_chunks_df.head())
113
+ print("\n")
114
+ print(bible_chunks_df.sample(5))
115
+
116
+ # This DataFrame is now ready for the next step:
117
+ # `hf_dataset = Dataset.from_pandas(bible_chunks_df)`
118
+ # ...followed by Gemma embedding and FAISS indexing.