# biblos-cf-api / prepare_data.py
# Author: rdmlx — commit a9936e3
# "Update to NT-only dataset with 9 Church Fathers"
"""
Data preparation script for Church Fathers Commentary API
Copies commentary embedding JSON files from church-fathers repo or generates new ones
"""
import json
import shutil
from pathlib import Path
import argparse
# Lowercase, concatenated book identifiers (e.g. "1samuel", "songofsolomon").
# These double as the on-disk directory names for the embedding JSON files
# (<output>/<book>/*.json) and as the values of the `book` column in the
# commentary SQLite table — TODO confirm against the church-fathers repo.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]
# The 27 New Testament books. Only these are queried by
# generate_embeddings_from_db (the generated dataset is NT-only).
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# Full canon, used when copying pre-built embeddings from a source checkout.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS
def copy_embeddings_from_source(source_dir: Path, output_dir: Path):
    """
    Copy commentary embeddings from a church-fathers repository checkout.

    Walks every book in ALL_BOOKS, validates each JSON file (must parse and
    contain 'content', 'metadata', and a non-empty 'embedding' list), and
    copies valid files into <output_dir>/<book>/. Invalid or unreadable
    files are reported and skipped.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)

    Returns:
        Tuple of (total_entries, missing_books): the number of files
        actually copied, and the books for which no usable data was found.
    """
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    copied_count = 0
    total_entries = 0
    missing_books = []
    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)
    for book in ALL_BOOKS:
        book_dir = source_dir / book
        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue
        # Copy all JSON files for this book
        json_files = list(book_dir.glob("*.json"))
        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue
        # Create book subdirectory in output
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)
        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying. Explicit encoding
                # so platforms with a non-UTF-8 default locale don't choke
                # on non-ASCII commentary text.
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Verify required fields
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f"  ✗ Skipping {json_file.name}: missing required fields")
                    continue
                # Verify embedding is a non-empty vector
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f"  ✗ Skipping {json_file.name}: invalid embedding")
                    continue
                # copy2 preserves metadata (timestamps etc.) of the original
                output_file = book_output_dir / json_file.name
                shutil.copy2(json_file, output_file)
                book_entries += 1
            except Exception as e:
                print(f"  ✗ Error processing {json_file.name}: {e}")
                continue
        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUGFIX: count only files actually copied, not every JSON file
            # found — some may have been skipped as invalid or unreadable.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)
    print("-" * 60)
    print("\nCopy complete:")
    print(f"  Total files copied: {copied_count}")
    print(f"  Total entries: {total_entries}")
    print(f"  Books processed: {len(ALL_BOOKS) - len(missing_books)}/{len(ALL_BOOKS)}")
    if missing_books:
        print(f"  Missing books: {', '.join(missing_books)}")
    # Report the on-disk footprint of everything now under output_dir
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f"  Total size: {total_size / 1024 / 1024:.2f} MB")
    return total_entries, missing_books
def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from a SQLite database using the commentary.py approach.

    Selects New Testament commentary entries (length >= 1000 chars, with a
    non-empty source_title, excluding Aquinas quotations) from a fixed list
    of nine Church Fathers, encodes each entry's text with a
    SentenceTransformer model, and writes one JSON file per entry to
    <output_dir>/<book>/.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Heavy third-party deps imported lazily: only needed for --generate.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer
    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Load model
    print("Loading embedding model...")
    model = SentenceTransformer(model_name)
    # Connect to database; close it even if encoding/query fails mid-run.
    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()
        # Query church fathers (NT-only, 9 fathers)
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]
        # Placeholders are generated per-list so the query stays fully
        # parameterized (no string interpolation of values).
        query = """
            SELECT id, father_name, file_name, append_to_author_name, ts, book,
                   location_start, location_end, txt, source_url, source_title
            FROM commentary
            WHERE father_name IN ({})
              AND book IN ({})
              AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
              AND LENGTH(txt) >= 1000
              AND source_title IS NOT NULL
              AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )
        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()
        print(f"Found {len(rows)} commentary entries to process")
        # Process each row
        for row in tqdm(rows, desc="Generating embeddings"):
            # `entry_id` (not `id`) to avoid shadowing the builtin.
            (entry_id, father_name, file_name, append_to_author_name, ts, book,
             location_start, location_end, txt, source_url, source_title) = row
            # Generate a normalized embedding vector for the entry text
            embedding = model.encode(txt, normalize_embeddings=True).tolist()
            # Prepare data in the same shape copy_embeddings_from_source expects
            data = {
                "content": txt,
                "metadata": {
                    "id": entry_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }
            # Save to file under a per-book subdirectory
            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)
            # Generate a unique, filesystem-safe filename
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{entry_id}.json"
            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)
        cursor.close()
    finally:
        connection.close()
    print("✓ Embedding generation complete!")
def main():
    """Main entry point"""
    cli = argparse.ArgumentParser(
        description="Prepare Church Fathers commentary embeddings for HF Spaces"
    )
    cli.add_argument(
        "--source", type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)",
    )
    cli.add_argument(
        "--output", type=str, default="./data",
        help="Output directory for JSON files (default: ./data)",
    )
    cli.add_argument(
        "--generate", action="store_true",
        help="Generate embeddings from SQLite database instead of copying",
    )
    cli.add_argument(
        "--db", type=str,
        help="Path to SQLite database file (required if --generate is used)",
    )
    cli.add_argument(
        "--model", type=str, default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)",
    )
    opts = cli.parse_args()
    dest_dir = Path(opts.output)
    if opts.generate:
        # Generation mode: needs an existing SQLite database.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1
        db_path = Path(opts.db)
        if not db_path.exists():
            print(f"Error: Database file not found: {db_path}")
            return 1
        generate_embeddings_from_db(db_path, dest_dir, opts.model)
    else:
        # Copy mode: needs an existing source directory of embeddings.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print("  Copy embeddings: python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print("  Generate new: python prepare_data.py --generate --db path/to/data.sqlite")
            return 1
        src_dir = Path(opts.source)
        if not src_dir.exists():
            print(f"Error: Source directory does not exist: {src_dir}")
            return 1
        copied_total, _missing = copy_embeddings_from_source(src_dir, dest_dir)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1
    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print("  1. Review the data/ directory")
    print("  2. Test locally: uvicorn app:app --reload")
    print("  3. Deploy to Hugging Face Spaces")
    return 0
if __name__ == "__main__":
exit(main())