""" Data preparation script for Church Fathers Commentary API Copies commentary embedding JSON files from church-fathers repo or generates new ones """ import json import shutil from pathlib import Path import argparse OLD_TESTAMENT_BOOKS = [ "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth", "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra", "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon", "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos", "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah", "malachi" ] NEW_TESTAMENT_BOOKS = [ "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians", "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians", "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter", "2peter", "1john", "2john", "3john", "jude", "revelation" ] ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS def copy_embeddings_from_source(source_dir: Path, output_dir: Path): """ Copy commentary embeddings from church-fathers repository Args: source_dir: Path to church-fathers commentary_embeddings directory output_dir: Path to output directory (e.g., ./data) """ # Create output directory output_dir.mkdir(parents=True, exist_ok=True) copied_count = 0 total_entries = 0 missing_books = [] print(f"Copying embeddings from: {source_dir}") print(f"Output directory: {output_dir}") print("-" * 60) for book in ALL_BOOKS: book_dir = source_dir / book if not book_dir.exists(): print(f"✗ {book} directory not found") missing_books.append(book) continue # Copy all JSON files for this book json_files = list(book_dir.glob("*.json")) if not json_files: print(f"✗ No JSON files found for {book}") missing_books.append(book) continue # Create book subdirectory in output book_output_dir = output_dir / book book_output_dir.mkdir(exist_ok=True) book_entries = 0 for json_file in json_files: try: # Validate JSON structure with open(json_file, 'r') as f: data = json.load(f) # Verify required fields if 'content' not in data or 'metadata' not in data or 'embedding' not in data: print(f" ✗ Skipping {json_file.name}: missing required fields") continue # Verify embedding is valid if not isinstance(data['embedding'], list) or len(data['embedding']) == 0: print(f" ✗ Skipping {json_file.name}: invalid embedding") continue # Copy file output_file = book_output_dir / json_file.name shutil.copy2(json_file, output_file) book_entries += 1 except Exception as e: print(f" ✗ Error processing {json_file.name}: {e}") continue if book_entries > 0: print(f"✓ {book}: copied {book_entries} files") copied_count += len(json_files) total_entries += book_entries else: print(f"✗ {book}: no valid files found") missing_books.append(book) print("-" * 60) print(f"\nCopy complete:") print(f" Total files copied: {copied_count}") print(f" Total entries: {total_entries}") print(f" Books processed: {len(ALL_BOOKS) - len(missing_books)}/{len(ALL_BOOKS)}") if missing_books: print(f" Missing books: {', '.join(missing_books)}") # Calculate total size if output_dir.exists(): total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json")) print(f" Total size: {total_size / 1024 / 1024:.2f} MB") return total_entries, missing_books def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str): """ Generate embeddings from SQLite database using commentary.py approach Args: db_file: Path to SQLite database (data.sqlite) output_dir: Path to output directory (e.g., ./data) model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5) """ import sqlite3 from datetime import datetime from tqdm import tqdm from sentence_transformers import SentenceTransformer print("WARNING: This will generate embeddings from scratch. This may take a long time!") print(f"Using model: {model_name}") print(f"Database: {db_file}") print("-" * 60) # Create output directory output_dir.mkdir(parents=True, exist_ok=True) # Load model print("Loading embedding model...") model = SentenceTransformer(model_name) # Connect to database connection = sqlite3.connect(db_file) cursor = connection.cursor() # Query church fathers (NT-only, 9 fathers) top_authors = [ "Augustine of Hippo", "Athanasius of Alexandria", "Basil of Caesarea", "Gregory of Nazianzus", "Gregory of Nyssa", "Cyril of Alexandria", "Irenaeus", "Cyprian", "Origen of Alexandria" ] query = """ SELECT id, father_name, file_name, append_to_author_name, ts, book, location_start, location_end, txt, source_url, source_title FROM commentary WHERE father_name IN ({}) AND book IN ({}) AND append_to_author_name NOT LIKE '%quoted by Aquinas%' AND LENGTH(txt) >= 1000 AND source_title IS NOT NULL AND source_title != '' """.format( ','.join('?' * len(top_authors)), ','.join('?' * len(NEW_TESTAMENT_BOOKS)) ) cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS) rows = cursor.fetchall() print(f"Found {len(rows)} commentary entries to process") # Process each row for row in tqdm(rows, desc="Generating embeddings"): id, father_name, file_name, append_to_author_name, ts, book, \ location_start, location_end, txt, source_url, source_title = row # Generate embedding embedding = model.encode(txt, normalize_embeddings=True).tolist() # Prepare data data = { "content": txt, "metadata": { "id": id, "father_name": father_name, "book": book, "location_start": location_start, "location_end": location_end, "source_url": source_url, "source_title": source_title, "append_to_author_name": append_to_author_name }, "embedding": embedding } # Save to file book_dir = output_dir / book book_dir.mkdir(exist_ok=True) # Generate unique filename safe_father = father_name.replace(' ', '_') filename = f"{book}_{safe_father}_{id}.json" with open(book_dir / filename, 'w') as f: json.dump(data, f) cursor.close() connection.close() print("✓ Embedding generation complete!") def main(): """Main entry point""" parser = argparse.ArgumentParser(description="Prepare Church Fathers commentary embeddings for HF Spaces") parser.add_argument( "--source", type=str, help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)" ) parser.add_argument( "--output", type=str, default="./data", help="Output directory for JSON files (default: ./data)" ) parser.add_argument( "--generate", action="store_true", help="Generate embeddings from SQLite database instead of copying" ) parser.add_argument( "--db", type=str, help="Path to SQLite database file (required if --generate is used)" ) parser.add_argument( "--model", type=str, default="BAAI/bge-large-en-v1.5", help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)" ) args = parser.parse_args() output_dir = Path(args.output) if args.generate: # Generate embeddings from database if not args.db: print("Error: --db is required when using --generate") return 1 db_file = Path(args.db) if not db_file.exists(): print(f"Error: Database file not found: {db_file}") return 1 generate_embeddings_from_db(db_file, output_dir, args.model) else: # Copy embeddings from source if not args.source: print("Error: --source is required unless using --generate") print("\nUsage:") print(" Copy embeddings: python prepare_data.py --source ../church-fathers/commentary_embeddings") print(" Generate new: python prepare_data.py --generate --db path/to/data.sqlite") return 1 source_dir = Path(args.source) if not source_dir.exists(): print(f"Error: Source directory does not exist: {source_dir}") return 1 total, missing = copy_embeddings_from_source(source_dir, output_dir) if total == 0: print("\nNo embeddings were copied. Please check the source directory.") return 1 print("\n✓ Data preparation complete!") print("\nNext steps:") print(" 1. Review the data/ directory") print(" 2. Test locally: uvicorn app:app --reload") print(" 3. Deploy to Hugging Face Spaces") return 0 if __name__ == "__main__": exit(main())