"""
Data preparation script for Church Fathers Commentary API
Copies commentary embedding JSON files from church-fathers repo or generates new ones
"""
import json
import shutil
from pathlib import Path
import argparse
# Lowercase book slugs; each one doubles as a per-book directory name for
# the embedding JSON files (see copy_embeddings_from_source /
# generate_embeddings_from_db).
# 39 Old Testament books in canonical order.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]
# 27 New Testament books in canonical order; also used to filter the SQL
# query in generate_embeddings_from_db (NT-only generation).
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# Full 66-book canon; drives the copy loop in copy_embeddings_from_source.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS
def copy_embeddings_from_source(source_dir: Path, output_dir: Path, books=None):
    """
    Copy and validate commentary embeddings from the church-fathers repository.

    Each JSON file under ``source_dir/<book>/`` is loaded, checked for the
    required ``content``/``metadata``/``embedding`` fields and a non-empty
    embedding list, and copied into a matching ``output_dir/<book>/``
    subdirectory. Files that fail validation are reported and skipped.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)
        books: Optional iterable of book slugs to process; defaults to
            ALL_BOOKS (backward-compatible with the original signature).

    Returns:
        Tuple of (total valid entries copied, list of books that had no
        usable data).
    """
    if books is None:
        books = ALL_BOOKS

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    copied_count = 0
    total_entries = 0
    missing_books = []

    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)

    for book in books:
        book_dir = source_dir / book
        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue

        # Copy all JSON files for this book
        json_files = list(book_dir.glob("*.json"))
        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue

        # Create book subdirectory in output
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)

        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying anything
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Verify required fields
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f" ✗ Skipping {json_file.name}: missing required fields")
                    continue

                # Verify embedding is a non-empty vector
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f" ✗ Skipping {json_file.name}: invalid embedding")
                    continue

                # Copy file (copy2 preserves mtime/metadata)
                shutil.copy2(json_file, book_output_dir / json_file.name)
                book_entries += 1
            except (OSError, ValueError) as e:
                # OSError: unreadable file; ValueError: bad JSON / bad encoding
                print(f" ✗ Error processing {json_file.name}: {e}")
                continue

        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUG FIX: previously added len(json_files) here, counting
            # skipped/invalid files in the "Total files copied" report.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)

    print("-" * 60)
    print("\nCopy complete:")
    print(f" Total files copied: {copied_count}")
    print(f" Total entries: {total_entries}")
    print(f" Books processed: {len(books) - len(missing_books)}/{len(books)}")
    if missing_books:
        print(f" Missing books: {', '.join(missing_books)}")

    # Calculate total size of everything now in the output tree
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f" Total size: {total_size / 1024 / 1024:.2f} MB")

    return total_entries, missing_books
def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from SQLite database using commentary.py approach

    Selects NT-only commentary from 9 church fathers, embeds each entry with
    a SentenceTransformer model, and writes one JSON file per entry under
    ``output_dir/<book>/``.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Deferred imports: heavy third-party deps only needed on this code path.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer

    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load model
    print("Loading embedding model...")
    model = SentenceTransformer(model_name)

    # Connect to database; try/finally guarantees the connection is closed
    # even if embedding generation fails part-way through.
    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()

        # Query church fathers (NT-only, 9 fathers)
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]

        # Placeholders are parameterized (?) — only the placeholder COUNT is
        # interpolated via format(), so this is not an injection risk.
        query = """
            SELECT id, father_name, file_name, append_to_author_name, ts, book,
                   location_start, location_end, txt, source_url, source_title
            FROM commentary
            WHERE father_name IN ({})
            AND book IN ({})
            AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
            AND LENGTH(txt) >= 1000
            AND source_title IS NOT NULL
            AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )

        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()
        print(f"Found {len(rows)} commentary entries to process")

        # Process each row
        for row in tqdm(rows, desc="Generating embeddings"):
            # Renamed from `id` to avoid shadowing the builtin.
            entry_id, father_name, file_name, append_to_author_name, ts, book, \
                location_start, location_end, txt, source_url, source_title = row

            # Generate a unit-normalized embedding vector
            embedding = model.encode(txt, normalize_embeddings=True).tolist()

            # Prepare data (JSON schema consumed by copy_embeddings_from_source)
            data = {
                "content": txt,
                "metadata": {
                    "id": entry_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }

            # Save to per-book directory
            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)

            # Generate unique filename from book, author, and row id
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{entry_id}.json"
            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)

        cursor.close()
    finally:
        connection.close()

    print("✓ Embedding generation complete!")
def main():
    """Command-line entry point.

    Parses arguments and dispatches to either the copy path (default) or
    the generate-from-database path (--generate). Returns a process exit
    code: 0 on success, 1 on bad usage or missing inputs.
    """
    arg_parser = argparse.ArgumentParser(description="Prepare Church Fathers commentary embeddings for HF Spaces")
    arg_parser.add_argument(
        "--source",
        type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)"
    )
    arg_parser.add_argument(
        "--output",
        type=str,
        default="./data",
        help="Output directory for JSON files (default: ./data)"
    )
    arg_parser.add_argument(
        "--generate",
        action="store_true",
        help="Generate embeddings from SQLite database instead of copying"
    )
    arg_parser.add_argument(
        "--db",
        type=str,
        help="Path to SQLite database file (required if --generate is used)"
    )
    arg_parser.add_argument(
        "--model",
        type=str,
        default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)"
    )
    opts = arg_parser.parse_args()

    destination = Path(opts.output)

    if opts.generate:
        # Generation path requires a database file.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1
        database = Path(opts.db)
        if not database.exists():
            print(f"Error: Database file not found: {database}")
            return 1
        generate_embeddings_from_db(database, destination, opts.model)
    else:
        # Copy path requires a source directory.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print(" Copy embeddings: python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print(" Generate new: python prepare_data.py --generate --db path/to/data.sqlite")
            return 1
        embeddings_root = Path(opts.source)
        if not embeddings_root.exists():
            print(f"Error: Source directory does not exist: {embeddings_root}")
            return 1
        copied_total, _missing = copy_embeddings_from_source(embeddings_root, destination)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1

    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print(" 1. Review the data/ directory")
    print(" 2. Test locally: uvicorn app:app --reload")
    print(" 3. Deploy to Hugging Face Spaces")
    return 0
if __name__ == "__main__":
    # raise SystemExit instead of exit(): exit() is a site-module convenience
    # binding that is absent when Python runs with -S; SystemExit always works
    # and propagates main()'s return value as the process exit code.
    raise SystemExit(main())