Spaces:
Sleeping
Sleeping
| """ | |
| Data preparation script for Church Fathers Commentary API | |
| Copies commentary embedding JSON files from church-fathers repo or generates new ones | |
| """ | |
| import json | |
| import shutil | |
| from pathlib import Path | |
| import argparse | |
# Lowercase, no-space book identifiers. These double as directory names in
# both the church-fathers source repository and the generated data layout,
# so they must match the on-disk folder names exactly.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# Combined canonical ordering: Old Testament first, then New Testament.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS
def copy_embeddings_from_source(source_dir: Path, output_dir: Path):
    """
    Copy and validate commentary embedding JSON files from the
    church-fathers repository layout (<source_dir>/<book>/*.json).

    Each file must contain 'content', 'metadata', and a non-empty list
    'embedding'; files failing validation are skipped with a message and
    are NOT counted as copied.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)

    Returns:
        (total_entries, missing_books): count of files actually copied and
        the list of books with no usable data.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    copied_count = 0
    total_entries = 0
    missing_books = []

    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)

    for book in ALL_BOOKS:
        book_dir = source_dir / book
        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue

        json_files = list(book_dir.glob("*.json"))
        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue

        # Mirror the per-book subdirectory layout in the output.
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)

        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying.
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Verify required top-level fields.
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f" ✗ Skipping {json_file.name}: missing required fields")
                    continue

                # Verify embedding is a non-empty list.
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f" ✗ Skipping {json_file.name}: invalid embedding")
                    continue

                # copy2 preserves file metadata (timestamps etc.).
                output_file = book_output_dir / json_file.name
                shutil.copy2(json_file, output_file)
                book_entries += 1
            except Exception as e:
                # Best-effort: a corrupt file should not abort the whole run.
                print(f" ✗ Error processing {json_file.name}: {e}")
                continue

        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUG FIX: count only files actually copied (book_entries),
            # not every file found — skipped/invalid files were previously
            # inflating the "Total files copied" summary.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)

    print("-" * 60)
    print("\nCopy complete:")
    print(f" Total files copied: {copied_count}")
    print(f" Total entries: {total_entries}")
    print(f" Books processed: {len(ALL_BOOKS) - len(missing_books)}/{len(ALL_BOOKS)}")
    if missing_books:
        print(f" Missing books: {', '.join(missing_books)}")

    # Report the on-disk footprint of everything now under output_dir.
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f" Total size: {total_size / 1024 / 1024:.2f} MB")

    return total_entries, missing_books
def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from a SQLite database using the commentary.py approach.

    Selects New Testament commentary rows by a fixed set of nine church
    fathers (length >= 1000 chars, with a source title, excluding Aquinas
    quotations), encodes each text with a SentenceTransformer model, and
    writes one JSON file per entry under <output_dir>/<book>/.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Local imports: these heavy dependencies are only needed for the
    # --generate workflow, not for the plain copy workflow.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer

    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)

    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading embedding model...")
    model = SentenceTransformer(model_name)

    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()

        # Church fathers whose commentary is included (NT-only, 9 fathers).
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]

        # Parameterized IN-lists: one '?' placeholder per author/book value,
        # so no user data is interpolated into the SQL text itself.
        query = """
        SELECT id, father_name, file_name, append_to_author_name, ts, book,
        location_start, location_end, txt, source_url, source_title
        FROM commentary
        WHERE father_name IN ({})
        AND book IN ({})
        AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
        AND LENGTH(txt) >= 1000
        AND source_title IS NOT NULL
        AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )
        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()
        print(f"Found {len(rows)} commentary entries to process")

        for row in tqdm(rows, desc="Generating embeddings"):
            # BUG FIX: unpack the primary key into entry_id instead of `id`,
            # which shadowed the builtin id().
            (entry_id, father_name, file_name, append_to_author_name, ts, book,
             location_start, location_end, txt, source_url, source_title) = row

            # Normalized embeddings allow cosine similarity via dot product.
            embedding = model.encode(txt, normalize_embeddings=True).tolist()

            data = {
                "content": txt,
                "metadata": {
                    "id": entry_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }

            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)

            # Unique filename: book + underscored author + DB primary key.
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{entry_id}.json"
            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)

        cursor.close()
    finally:
        # Ensure the DB handle is released even if encoding/writing fails.
        connection.close()

    print("✓ Embedding generation complete!")
def main():
    """
    Command-line entry point.

    Parses arguments, then dispatches to either the generate-from-database
    workflow (--generate) or the copy-from-repository workflow (--source).

    Returns:
        Process exit code: 0 on success, 1 on usage or data errors.
    """
    parser = argparse.ArgumentParser(
        description="Prepare Church Fathers commentary embeddings for HF Spaces")
    parser.add_argument(
        "--source", type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)")
    parser.add_argument(
        "--output", type=str, default="./data",
        help="Output directory for JSON files (default: ./data)")
    parser.add_argument(
        "--generate", action="store_true",
        help="Generate embeddings from SQLite database instead of copying")
    parser.add_argument(
        "--db", type=str,
        help="Path to SQLite database file (required if --generate is used)")
    parser.add_argument(
        "--model", type=str, default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)")
    opts = parser.parse_args()

    out_path = Path(opts.output)

    if opts.generate:
        # Database-driven generation workflow.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1
        database = Path(opts.db)
        if not database.exists():
            print(f"Error: Database file not found: {database}")
            return 1
        generate_embeddings_from_db(database, out_path, opts.model)
    else:
        # Copy-from-repository workflow.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print(" Copy embeddings: python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print(" Generate new: python prepare_data.py --generate --db path/to/data.sqlite")
            return 1
        src_path = Path(opts.source)
        if not src_path.exists():
            print(f"Error: Source directory does not exist: {src_path}")
            return 1
        copied_total, _missing = copy_embeddings_from_source(src_path, out_path)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1

    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print(" 1. Review the data/ directory")
    print(" 2. Test locally: uvicorn app:app --reload")
    print(" 3. Deploy to Hugging Face Spaces")
    return 0
if __name__ == "__main__":
    # raise SystemExit instead of the site-module exit() helper, which is
    # intended for interactive use and absent when run with `python -S`.
    raise SystemExit(main())