# biblos-cf-api / prepare_data.py
# Author: rdmlx — commit a9936e3
# "Update to NT-only dataset with 9 Church Fathers"
"""
Data preparation script for Church Fathers Commentary API
Copies commentary embedding JSON files from church-fathers repo or generates new ones
"""
import json
import shutil
from pathlib import Path
import argparse
# Lowercase, concatenated book identifiers (e.g. "1samuel", "songofsolomon").
# These double as the on-disk directory names for the embedding JSON files
# (<output>/<book>/*.json) and as the values of the `book` column in the
# commentary SQLite table — TODO confirm against the church-fathers repo.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]
# The 27 New Testament books. Only these are queried by
# generate_embeddings_from_db (the generated dataset is NT-only).
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# Full canon, used when copying pre-built embeddings from a source checkout.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS
def copy_embeddings_from_source(source_dir: Path, output_dir: Path):
    """
    Copy commentary embeddings from a church-fathers repository checkout.

    Walks every book in ALL_BOOKS, validates each JSON file (must parse and
    contain 'content', 'metadata', and a non-empty 'embedding' list), and
    copies valid files into <output_dir>/<book>/. Invalid or unreadable
    files are reported and skipped.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)

    Returns:
        Tuple of (total_entries, missing_books): the number of files
        actually copied, and the books for which no usable data was found.
    """
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    copied_count = 0
    total_entries = 0
    missing_books = []
    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)
    for book in ALL_BOOKS:
        book_dir = source_dir / book
        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue
        # Copy all JSON files for this book
        json_files = list(book_dir.glob("*.json"))
        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue
        # Create book subdirectory in output
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)
        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying. Explicit encoding
                # so platforms with a non-UTF-8 default locale don't choke
                # on non-ASCII commentary text.
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Verify required fields
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f"  ✗ Skipping {json_file.name}: missing required fields")
                    continue
                # Verify embedding is a non-empty vector
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f"  ✗ Skipping {json_file.name}: invalid embedding")
                    continue
                # copy2 preserves metadata (timestamps etc.) of the original
                output_file = book_output_dir / json_file.name
                shutil.copy2(json_file, output_file)
                book_entries += 1
            except Exception as e:
                print(f"  ✗ Error processing {json_file.name}: {e}")
                continue
        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUGFIX: count only files actually copied, not every JSON file
            # found — some may have been skipped as invalid or unreadable.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)
    print("-" * 60)
    print("\nCopy complete:")
    print(f"  Total files copied: {copied_count}")
    print(f"  Total entries: {total_entries}")
    print(f"  Books processed: {len(ALL_BOOKS) - len(missing_books)}/{len(ALL_BOOKS)}")
    if missing_books:
        print(f"  Missing books: {', '.join(missing_books)}")
    # Report the on-disk footprint of everything now under output_dir
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f"  Total size: {total_size / 1024 / 1024:.2f} MB")
    return total_entries, missing_books
def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from a SQLite database using the commentary.py approach.

    Selects New Testament commentary entries (length >= 1000 chars, with a
    non-empty source_title, excluding Aquinas quotations) from a fixed list
    of nine Church Fathers, encodes each entry's text with a
    SentenceTransformer model, and writes one JSON file per entry to
    <output_dir>/<book>/.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Heavy third-party deps imported lazily: only needed for --generate.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer
    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Load model
    print("Loading embedding model...")
    model = SentenceTransformer(model_name)
    # Connect to database; close it even if encoding/query fails mid-run.
    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()
        # Query church fathers (NT-only, 9 fathers)
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]
        # Placeholders are generated per-list so the query stays fully
        # parameterized (no string interpolation of values).
        query = """
            SELECT id, father_name, file_name, append_to_author_name, ts, book,
                   location_start, location_end, txt, source_url, source_title
            FROM commentary
            WHERE father_name IN ({})
              AND book IN ({})
              AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
              AND LENGTH(txt) >= 1000
              AND source_title IS NOT NULL
              AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )
        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()
        print(f"Found {len(rows)} commentary entries to process")
        # Process each row
        for row in tqdm(rows, desc="Generating embeddings"):
            # `entry_id` (not `id`) to avoid shadowing the builtin.
            (entry_id, father_name, file_name, append_to_author_name, ts, book,
             location_start, location_end, txt, source_url, source_title) = row
            # Generate a normalized embedding vector for the entry text
            embedding = model.encode(txt, normalize_embeddings=True).tolist()
            # Prepare data in the same shape copy_embeddings_from_source expects
            data = {
                "content": txt,
                "metadata": {
                    "id": entry_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }
            # Save to file under a per-book subdirectory
            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)
            # Generate a unique, filesystem-safe filename
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{entry_id}.json"
            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)
        cursor.close()
    finally:
        connection.close()
    print("✓ Embedding generation complete!")
def main():
    """Main entry point"""
    cli = argparse.ArgumentParser(
        description="Prepare Church Fathers commentary embeddings for HF Spaces"
    )
    cli.add_argument(
        "--source", type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)",
    )
    cli.add_argument(
        "--output", type=str, default="./data",
        help="Output directory for JSON files (default: ./data)",
    )
    cli.add_argument(
        "--generate", action="store_true",
        help="Generate embeddings from SQLite database instead of copying",
    )
    cli.add_argument(
        "--db", type=str,
        help="Path to SQLite database file (required if --generate is used)",
    )
    cli.add_argument(
        "--model", type=str, default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)",
    )
    opts = cli.parse_args()
    dest_dir = Path(opts.output)
    if opts.generate:
        # Generation mode: needs an existing SQLite database.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1
        db_path = Path(opts.db)
        if not db_path.exists():
            print(f"Error: Database file not found: {db_path}")
            return 1
        generate_embeddings_from_db(db_path, dest_dir, opts.model)
    else:
        # Copy mode: needs an existing source directory of embeddings.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print("  Copy embeddings: python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print("  Generate new: python prepare_data.py --generate --db path/to/data.sqlite")
            return 1
        src_dir = Path(opts.source)
        if not src_dir.exists():
            print(f"Error: Source directory does not exist: {src_dir}")
            return 1
        copied_total, _missing = copy_embeddings_from_source(src_dir, dest_dir)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1
    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print("  1. Review the data/ directory")
    print("  2. Test locally: uvicorn app:app --reload")
    print("  3. Deploy to Hugging Face Spaces")
    return 0
if __name__ == "__main__":
exit(main())