File size: 9,965 Bytes
b773b72
 
 
 
 
 
 
 
 
 
 
a356e85
 
 
 
 
 
 
 
 
b773b72
 
 
 
 
 
 
a356e85
 
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a356e85
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a356e85
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
"""
Data preparation script for Church Fathers Commentary API
Copies commentary embedding JSON files from church-fathers repo or generates new ones
"""

import json
import shutil
from pathlib import Path
import argparse


# Canonical 39 Old Testament books, lowercased with spaces removed, matching the
# directory names used by the church-fathers commentary_embeddings layout.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]

# Canonical 27 New Testament books, same naming convention. Also used directly by
# generate_embeddings_from_db(), which processes NT commentary only.
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]

# Full 66-book canon; the default set processed when copying embeddings.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS


def copy_embeddings_from_source(source_dir: Path, output_dir: Path, books=None):
    """
    Copy commentary embeddings from the church-fathers repository.

    Each JSON file is validated before copying: it must contain 'content',
    'metadata', and a non-empty list under 'embedding'. Invalid or unreadable
    files are reported and skipped.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)
        books: Optional iterable of book directory names to process.
            Defaults to ALL_BOOKS, preserving the original behavior.

    Returns:
        (total_entries, missing_books): count of valid files copied, and the
        list of books that had no directory, no JSON files, or no valid files.
    """
    if books is None:
        books = ALL_BOOKS

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    copied_count = 0
    total_entries = 0
    missing_books = []

    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)

    for book in books:
        book_dir = source_dir / book

        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue

        # Copy all JSON files for this book
        json_files = list(book_dir.glob("*.json"))

        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue

        # Create book subdirectory in output
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)

        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Verify required fields
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f"  ✗ Skipping {json_file.name}: missing required fields")
                    continue

                # Verify embedding is valid
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f"  ✗ Skipping {json_file.name}: invalid embedding")
                    continue

                # Copy file (copy2 preserves metadata/timestamps)
                output_file = book_output_dir / json_file.name
                shutil.copy2(json_file, output_file)
                book_entries += 1

            except Exception as e:
                print(f"  ✗ Error processing {json_file.name}: {e}")
                continue

        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUGFIX: previously added len(json_files), which counted files
            # that were skipped as invalid or failed to copy.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)

    print("-" * 60)
    print(f"\nCopy complete:")
    print(f"  Total files copied: {copied_count}")
    print(f"  Total entries: {total_entries}")
    print(f"  Books processed: {len(books) - len(missing_books)}/{len(books)}")

    if missing_books:
        print(f"  Missing books: {', '.join(missing_books)}")

    # Calculate total size
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f"  Total size: {total_size / 1024 / 1024:.2f} MB")

    return total_entries, missing_books


def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from a SQLite database using the commentary.py approach.

    Selects substantial (>= 1000 chars) New Testament commentary entries by a
    fixed set of nine church fathers, embeds each text with a
    SentenceTransformer model, and writes one JSON file per entry under
    output_dir/<book>/.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Heavy/third-party dependencies are imported lazily so the copy-only
    # code path does not require them to be installed.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer

    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load model
    print("Loading embedding model...")
    model = SentenceTransformer(model_name)

    # Connect to database; ensure it is closed even if embedding fails midway
    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()

        # Query church fathers (NT-only, 9 fathers)
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]

        # Placeholders are generated to match the author and book lists;
        # Aquinas-quoted fragments and short/unsourced entries are excluded.
        query = """
            SELECT id, father_name, file_name, append_to_author_name, ts, book,
                   location_start, location_end, txt, source_url, source_title
            FROM commentary
            WHERE father_name IN ({})
            AND book IN ({})
            AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
            AND LENGTH(txt) >= 1000
            AND source_title IS NOT NULL
            AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )

        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()

        print(f"Found {len(rows)} commentary entries to process")

        # Process each row
        for row in tqdm(rows, desc="Generating embeddings"):
            # 'row_id' avoids shadowing the builtin 'id'
            row_id, father_name, file_name, append_to_author_name, ts, book, \
            location_start, location_end, txt, source_url, source_title = row

            # Generate a normalized embedding (unit length) for cosine search
            embedding = model.encode(txt, normalize_embeddings=True).tolist()

            # Prepare data
            data = {
                "content": txt,
                "metadata": {
                    "id": row_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }

            # Save to file
            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)

            # Generate unique filename (row id guarantees uniqueness)
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{row_id}.json"

            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)

        cursor.close()
    finally:
        connection.close()

    print("✓ Embedding generation complete!")


def main():
    """CLI entry point: copy existing embeddings or generate them from a DB.

    Returns 0 on success, 1 on a usage or input error (suitable for exit()).
    """
    arg_parser = argparse.ArgumentParser(
        description="Prepare Church Fathers commentary embeddings for HF Spaces")
    arg_parser.add_argument(
        "--source", type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)")
    arg_parser.add_argument(
        "--output", type=str, default="./data",
        help="Output directory for JSON files (default: ./data)")
    arg_parser.add_argument(
        "--generate", action="store_true",
        help="Generate embeddings from SQLite database instead of copying")
    arg_parser.add_argument(
        "--db", type=str,
        help="Path to SQLite database file (required if --generate is used)")
    arg_parser.add_argument(
        "--model", type=str, default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)")

    opts = arg_parser.parse_args()
    out_path = Path(opts.output)

    if opts.generate:
        # Generation mode: --db must point at an existing SQLite file.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1

        db_path = Path(opts.db)
        if not db_path.exists():
            print(f"Error: Database file not found: {db_path}")
            return 1

        generate_embeddings_from_db(db_path, out_path, opts.model)
    else:
        # Copy mode: --source must point at an existing directory.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print("  Copy embeddings:    python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print("  Generate new:       python prepare_data.py --generate --db path/to/data.sqlite")
            return 1

        src_path = Path(opts.source)
        if not src_path.exists():
            print(f"Error: Source directory does not exist: {src_path}")
            return 1

        copied_total, _missing = copy_embeddings_from_source(src_path, out_path)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1

    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print("  1. Review the data/ directory")
    print("  2. Test locally: uvicorn app:app --reload")
    print("  3. Deploy to Hugging Face Spaces")

    return 0


if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit(), which is intended
    # for interactive use and is absent when Python runs with -S.
    raise SystemExit(main())