broadfield-dev committed on
Commit
1dde6a2
·
verified ·
1 Parent(s): f75ebe1

Update build_rag.py

Browse files
Files changed (1) hide show
  1. build_rag.py +39 -48
build_rag.py CHANGED
@@ -7,19 +7,17 @@ import chromadb
7
  import sys
8
  from tqdm import tqdm
9
  from huggingface_hub import HfApi, create_repo
 
10
 
11
  # --- Configuration ---
12
- # Must match the settings in app.py
13
  CHROMA_PATH = "chroma_db"
14
  COLLECTION_NAME = "bible_verses"
15
  MODEL_NAME = "google/embeddinggemma-300m"
16
- DATASET_REPO = "broadfield-dev/bible-chromadb-gemma" # The HF Dataset to store the DB
17
-
18
  JSON_DIRECTORY = 'bible_json'
19
  CHUNK_SIZE = 3
20
  EMBEDDING_BATCH_SIZE = 16
21
-
22
- # --- Book ID Mapping (Unchanged) ---
23
  BOOK_ID_TO_NAME = {
24
  1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
25
  6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
@@ -37,14 +35,17 @@ BOOK_ID_TO_NAME = {
37
  62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
38
  }
39
 
 
 
 
 
 
 
40
  def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
41
- """Reads, processes, and chunks Bible JSON files into a Pandas DataFrame."""
42
- # (This function's internal logic remains unchanged)
43
  all_verses = []
44
- print(f"Reading JSON files from '{directory_path}'...")
45
  if not os.path.exists(directory_path) or not os.listdir(directory_path):
46
- print(f"Error: Directory '{directory_path}' is empty or does not exist.", file=sys.stderr)
47
- sys.exit(1)
48
  for filename in os.listdir(directory_path):
49
  if filename.endswith('.json'):
50
  version_name = filename.split('.')[0].upper()
@@ -57,9 +58,7 @@ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFra
57
  _id, book_id, chapter, verse, text = field
58
  book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
59
  all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
60
- if not all_verses:
61
- print("Error: No verses were processed.", file=sys.stderr)
62
- sys.exit(1)
63
  df = pd.DataFrame(all_verses)
64
  all_chunks = []
65
  for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
@@ -70,67 +69,59 @@ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFra
70
  start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
71
  reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
72
  all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})
73
- final_df = pd.DataFrame(all_chunks)
74
- print(f"Created {len(final_df)} text chunks.")
75
- return final_df
76
 
77
- if __name__ == "__main__":
78
- print("--- Starting Vector Database Build Process ---")
79
-
80
- # 1. Process JSON
81
  bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)
82
 
83
- # 2. Setup local ChromaDB
84
- print(f"\n--- Setting up local ChromaDB in '{CHROMA_PATH}' ---")
85
  if os.path.exists(CHROMA_PATH):
86
  import shutil
87
- print("Deleting old local database directory...")
88
  shutil.rmtree(CHROMA_PATH)
89
  client = chromadb.PersistentClient(path=CHROMA_PATH)
90
  collection = client.create_collection(name=COLLECTION_NAME)
91
 
92
- # 3. Load embedding model
93
- print(f"\n--- Loading embedding model: '{MODEL_NAME}' ---")
94
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
95
  model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
96
 
97
- # 4. Generate embeddings and populate DB
98
- print(f"\n--- Generating embeddings and populating database ---")
99
  total_chunks = len(bible_chunks_df)
100
  for i in tqdm(range(0, total_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
101
  batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
102
  texts = batch_df['text'].tolist()
103
-
104
  inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
105
  with torch.no_grad():
106
  outputs = model(**inputs)
107
  embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist()
108
-
109
  collection.add(
110
  ids=[str(j) for j in range(i, i + len(batch_df))],
111
  embeddings=embeddings,
112
  documents=texts,
113
  metadatas=batch_df[['reference', 'version']].to_dict('records')
114
  )
115
- print(f"Successfully added {total_chunks} documents to the local ChromaDB.")
116
 
117
- # 5. Upload the database directory to Hugging Face Hub
118
- print(f"\n--- Pushing database to Hugging Face Hub: '{DATASET_REPO}' ---")
 
 
 
 
 
 
 
 
 
 
119
  try:
120
- # Ensure the repo exists
121
- create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
122
-
123
- # Upload the entire folder
124
- api = HfApi()
125
- api.upload_folder(
126
- folder_path=CHROMA_PATH,
127
- repo_id=DATASET_REPO,
128
- repo_type="dataset",
129
- )
130
- print("Database pushed successfully!")
131
  except Exception as e:
132
- print(f"An error occurred while pushing to the Hub: {e}", file=sys.stderr)
133
- print("Please ensure your HF_TOKEN secret has WRITE permissions.", file=sys.stderr)
134
- sys.exit(1)
135
-
136
- print("\n--- Build Process Complete! ---")
 
 
 
7
  import sys
8
  from tqdm import tqdm
9
  from huggingface_hub import HfApi, create_repo
10
+ import traceback
11
 
12
  # --- Configuration ---
 
13
  CHROMA_PATH = "chroma_db"
14
  COLLECTION_NAME = "bible_verses"
15
  MODEL_NAME = "google/embeddinggemma-300m"
16
+ DATASET_REPO = "broadfield-dev/bible-chromadb-gemma"
17
+ STATUS_FILE = "build_status.log"
18
  JSON_DIRECTORY = 'bible_json'
19
  CHUNK_SIZE = 3
20
  EMBEDDING_BATCH_SIZE = 16
 
 
21
  BOOK_ID_TO_NAME = {
22
  1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
23
  6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
 
35
  62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
36
  }
37
 
38
def update_status(message):
    """Write *message* to the build-status log file and echo it to stdout.

    The status file (STATUS_FILE) is overwritten on every call, so it always
    contains exactly the most recent status line; presumably another process
    (e.g. the Space UI) polls this file to display build progress — confirm
    against the reader side.

    Args:
        message: Human-readable status line, e.g. "IN_PROGRESS: Step 1/5 ...".
    """
    print(message)  # Also print to Space logs
    # Pin the encoding: the default locale encoding is not UTF-8 everywhere
    # (e.g. Windows cp1252), and status messages may contain non-ASCII text.
    with open(STATUS_FILE, "w", encoding="utf-8") as f:
        f.write(message)
44
  def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
45
+ # (This function's internal logic is unchanged)
 
46
  all_verses = []
 
47
  if not os.path.exists(directory_path) or not os.listdir(directory_path):
48
+ raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
 
49
  for filename in os.listdir(directory_path):
50
  if filename.endswith('.json'):
51
  version_name = filename.split('.')[0].upper()
 
58
  _id, book_id, chapter, verse, text = field
59
  book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
60
  all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
61
+ if not all_verses: raise ValueError("No verses were processed.")
 
 
62
  df = pd.DataFrame(all_verses)
63
  all_chunks = []
64
  for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
 
69
  start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
70
  reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
71
  all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})
72
+ return pd.DataFrame(all_chunks)
 
 
73
 
74
def main():
    """Main build process."""
    # Step 1: flatten the raw JSON into chunked verse rows.
    update_status("IN_PROGRESS: Step 1/5 - Processing JSON files...")
    bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

    # Step 2: start from a clean local database directory every run.
    update_status("IN_PROGRESS: Step 2/5 - Setting up local ChromaDB...")
    if os.path.exists(CHROMA_PATH):
        import shutil
        shutil.rmtree(CHROMA_PATH)
    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    verse_collection = chroma_client.create_collection(name=COLLECTION_NAME)

    # Step 3: load tokenizer + encoder; device_map="auto" picks GPU if present.
    update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    embed_model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")

    # Step 4: embed in batches and insert into the collection.
    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings and populating database...")
    n_chunks = len(bible_chunks_df)
    for batch_start in tqdm(range(0, n_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
        batch = bible_chunks_df.iloc[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
        batch_texts = batch['text'].tolist()
        encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(embed_model.device)
        with torch.no_grad():
            model_out = embed_model(**encoded)
        # Plain mean over the sequence axis; padding tokens are included in
        # the mean. NOTE(review): presumably this matches the query-side
        # pooling in app.py — verify before changing to masked pooling.
        batch_vectors = model_out.last_hidden_state.mean(dim=1).cpu().tolist()
        verse_collection.add(
            ids=[str(idx) for idx in range(batch_start, batch_start + len(batch))],
            embeddings=batch_vectors,
            documents=batch_texts,
            metadatas=batch[['reference', 'version']].to_dict('records')
        )

    # Step 5: publish the whole on-disk DB directory as an HF dataset repo.
    update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
    create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
    hub_api = HfApi()
    hub_api.upload_folder(
        folder_path=CHROMA_PATH,
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

    update_status("SUCCESS: Build complete! The application is ready.")
116
+
117
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Keep the full traceback for the Space logs; the status file only
        # receives a short, human-readable summary.
        full_trace = traceback.format_exc()
        # Authentication failures get an actionable message of their own.
        is_auth_error = any(marker in str(e) for marker in ("401", "Unauthorized"))
        if is_auth_error:
            update_status("FAILED: Hugging Face authentication error. Please ensure your HF_TOKEN secret is set correctly and has WRITE permissions.")
        else:
            update_status(f"FAILED: An unexpected error occurred. Check Space logs for details. Error: {e}")
        print(full_trace, file=sys.stderr)
        # NOTE(review): the pre-commit version called sys.exit(1) here, so this
        # script now exits 0 even on failure — confirm that is intentional
        # (e.g. to keep the Space from restart-looping on a failed build).