Spaces:

broadfield-dev
/

bible-app

Paused

App Files Community

broadfield-dev committed on Sep 13, 2025

Commit

1dde6a2

verified ·

1 Parent(s): f75ebe1

Update build_rag.py

Browse files

Files changed (1) hide show

build_rag.py +39 -48

build_rag.py CHANGED Viewed

@@ -7,19 +7,17 @@ import chromadb
 import sys
 from tqdm import tqdm
 from huggingface_hub import HfApi, create_repo
 # --- Configuration ---
-# Must match the settings in app.py
 CHROMA_PATH = "chroma_db"
 COLLECTION_NAME = "bible_verses"
 MODEL_NAME = "google/embeddinggemma-300m"
-DATASET_REPO = "broadfield-dev/bible-chromadb-gemma" # The HF Dataset to store the DB
 JSON_DIRECTORY = 'bible_json'
 CHUNK_SIZE = 3
 EMBEDDING_BATCH_SIZE = 16
-# --- Book ID Mapping (Unchanged) ---
 BOOK_ID_TO_NAME = {
     1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
     6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
@@ -37,14 +35,17 @@ BOOK_ID_TO_NAME = {
     62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
 }
 def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
-    """Reads, processes, and chunks Bible JSON files into a Pandas DataFrame."""
-    # (This function's internal logic remains unchanged)
     all_verses = []
-    print(f"Reading JSON files from '{directory_path}'...")
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
-        print(f"Error: Directory '{directory_path}' is empty or does not exist.", file=sys.stderr)
-        sys.exit(1)
     for filename in os.listdir(directory_path):
         if filename.endswith('.json'):
             version_name = filename.split('.')[0].upper()
@@ -57,9 +58,7 @@ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFra
                     _id, book_id, chapter, verse, text = field
                     book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
                     all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
-    if not all_verses:
-        print("Error: No verses were processed.", file=sys.stderr)
-        sys.exit(1)
     df = pd.DataFrame(all_verses)
     all_chunks = []
     for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
@@ -70,67 +69,59 @@ def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFra
             start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
             reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
             all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})
-    final_df = pd.DataFrame(all_chunks)
-    print(f"Created {len(final_df)} text chunks.")
-    return final_df
-if __name__ == "__main__":
-    print("--- Starting Vector Database Build Process ---")
-    # 1. Process JSON
     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)
-    # 2. Setup local ChromaDB
-    print(f"\n--- Setting up local ChromaDB in '{CHROMA_PATH}' ---")
     if os.path.exists(CHROMA_PATH):
         import shutil
-        print("Deleting old local database directory...")
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
     collection = client.create_collection(name=COLLECTION_NAME)
-    # 3. Load embedding model
-    print(f"\n--- Loading embedding model: '{MODEL_NAME}' ---")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
-    # 4. Generate embeddings and populate DB
-    print(f"\n--- Generating embeddings and populating database ---")
     total_chunks = len(bible_chunks_df)
     for i in tqdm(range(0, total_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
         inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
         with torch.no_grad():
             outputs = model(**inputs)
         embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist()
         collection.add(
             ids=[str(j) for j in range(i, i + len(batch_df))],
             embeddings=embeddings,
             documents=texts,
             metadatas=batch_df[['reference', 'version']].to_dict('records')
         )
-    print(f"Successfully added {total_chunks} documents to the local ChromaDB.")
-    # 5. Upload the database directory to Hugging Face Hub
-    print(f"\n--- Pushing database to Hugging Face Hub: '{DATASET_REPO}' ---")
     try:
-        # Ensure the repo exists
-        create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
-        # Upload the entire folder
-        api = HfApi()
-        api.upload_folder(
-            folder_path=CHROMA_PATH,
-            repo_id=DATASET_REPO,
-            repo_type="dataset",
-        )
-        print("Database pushed successfully!")
     except Exception as e:
-        print(f"An error occurred while pushing to the Hub: {e}", file=sys.stderr)
-        print("Please ensure your HF_TOKEN secret has WRITE permissions.", file=sys.stderr)
-        sys.exit(1)
-    print("\n--- Build Process Complete! ---")

 import sys
 from tqdm import tqdm
 from huggingface_hub import HfApi, create_repo
+import traceback
 # --- Configuration ---
 CHROMA_PATH = "chroma_db"
 COLLECTION_NAME = "bible_verses"
 MODEL_NAME = "google/embeddinggemma-300m"
+DATASET_REPO = "broadfield-dev/bible-chromadb-gemma"
+STATUS_FILE = "build_status.log"
 JSON_DIRECTORY = 'bible_json'
 CHUNK_SIZE = 3
 EMBEDDING_BATCH_SIZE = 16
 BOOK_ID_TO_NAME = {
     1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
     6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
     62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
 }
+def update_status(message):
+    """Writes a new status to the log file."""
+    print(message)  # Also print to Space logs
+    with open(STATUS_FILE, "w") as f:
+        f.write(message)
 def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
+    # (This function's internal logic is unchanged)
     all_verses = []
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
+        raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
     for filename in os.listdir(directory_path):
         if filename.endswith('.json'):
             version_name = filename.split('.')[0].upper()
                     _id, book_id, chapter, verse, text = field
                     book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
                     all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
+    if not all_verses: raise ValueError("No verses were processed.")
     df = pd.DataFrame(all_verses)
     all_chunks = []
     for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
             start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
             reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
             all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})
+    return pd.DataFrame(all_chunks)
+def main():
+    """Main build process."""
+    update_status("IN_PROGRESS: Step 1/5 - Processing JSON files...")
     bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)
+    update_status("IN_PROGRESS: Step 2/5 - Setting up local ChromaDB...")
     if os.path.exists(CHROMA_PATH):
         import shutil
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
     collection = client.create_collection(name=COLLECTION_NAME)
+    update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
+    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings and populating database...")
     total_chunks = len(bible_chunks_df)
     for i in tqdm(range(0, total_chunks, EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
         inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
         with torch.no_grad():
             outputs = model(**inputs)
         embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist()
         collection.add(
             ids=[str(j) for j in range(i, i + len(batch_df))],
             embeddings=embeddings,
             documents=texts,
             metadatas=batch_df[['reference', 'version']].to_dict('records')
         )
+    update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
+    create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
+    api = HfApi()
+    api.upload_folder(
+        folder_path=CHROMA_PATH,
+        repo_id=DATASET_REPO,
+        repo_type="dataset",
+    )
+    update_status("SUCCESS: Build complete! The application is ready.")
+if __name__ == "__main__":
     try:
+        main()
     except Exception as e:
+        error_message = traceback.format_exc()
+        # Be specific about token errors
+        if "401" in str(e) or "Unauthorized" in str(e):
+            update_status("FAILED: Hugging Face authentication error. Please ensure your HF_TOKEN secret is set correctly and has WRITE permissions.")
+        else:
+            update_status(f"FAILED: An unexpected error occurred. Check Space logs for details. Error: {e}")
+        print(error_message, file=sys.stderr)