# biblos-cf-api / app.py
# Author: rdmlx — commit a9936e3: "Update to NT-only dataset with 9 Church Fathers"
"""
Church Fathers Commentary Semantic Search API
Hugging Face Spaces deployment with FastAPI
Keeps model in memory for fast responses (~50-100ms after initial load)
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Optional
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import json
import os
from pathlib import Path
import logging
# Configure logging
# INFO level so startup progress and per-request timing messages show up
# in the Spaces container logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize FastAPI app
# Title/description/version feed the auto-generated OpenAPI docs at /docs.
app = FastAPI(
    title="Church Fathers Commentary Search API",
    description="Semantic search over Church Fathers commentaries using BGE embeddings",
    version="1.0.0"
)
# Enable CORS for all origins
# NOTE(review): the CORS spec forbids a wildcard origin together with
# credentials, and Starlette will not echo "*" when allow_credentials=True —
# confirm whether credentialed (cookie/auth) cross-origin requests are
# actually needed; if not, drop allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request/Response models
# Request/Response models
class SearchRequest(BaseModel):
    """Body of POST /search: a query string plus optional book/father filters."""
    query: str = Field(..., description="Search query text", min_length=1, max_length=500)
    limit: int = Field(10, description="Number of results to return", ge=1, le=100)
    books: Optional[List[str]] = Field(None, description="Filter by specific NT books (e.g., ['matthew', 'john'])")
    fathers: Optional[List[str]] = Field(None, description="Filter by specific Church Fathers")
class SearchResult(BaseModel):
    """One scored commentary hit returned by /search."""
    book: str                               # lowercase NT book key (e.g. "matthew")
    father_name: str
    source_title: str
    content: str                            # the commentary text itself
    similarity: float                       # cosine similarity vs. the query
    location_start: Optional[str] = None    # verse/section reference, if available
    location_end: Optional[str] = None
class SearchResponse(BaseModel):
    """Envelope for /search: echoed query, ranked results, and timing info."""
    query: str
    results: List[SearchResult]
    total_searched: int          # number of candidate entries scored
    execution_time_ms: float
# Global variables for model and data
# All four are populated once by load_model_and_data() at startup and read
# by the request handlers afterwards.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
tokenizer = None              # transformers tokenizer, set at startup
model = None                  # transformers model, set at startup
commentary_embeddings = {}    # book -> np.ndarray of shape (n_entries, dim), float32
commentary_metadata = {}      # book -> list of metadata dicts, parallel to embeddings
# Book and Father mappings
# All 27 New Testament books, as lowercase keys matching the data files.
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# The nine Church Fathers covered by this dataset; names must match the
# 'father_name' field in the metadata for the /search father filter to work.
CHURCH_FATHERS = [
    "Augustine of Hippo",
    "Athanasius of Alexandria",
    "Basil of Caesarea",
    "Gregory of Nazianzus",
    "Gregory of Nyssa",
    "Cyril of Alexandria",
    "Irenaeus",
    "Cyprian",
    "Origen of Alexandria"
]
@app.on_event("startup")
async def load_model_and_data():
    """Load the embedding model and commentary embeddings into memory at startup.

    Populates the module-level ``tokenizer``, ``model``,
    ``commentary_embeddings`` and ``commentary_metadata`` globals.  A failure
    to load the model aborts startup (re-raised); a missing ``data/``
    directory is tolerated — the API starts but /search returns 503 until
    embeddings exist.
    """
    global tokenizer, model, commentary_embeddings, commentary_metadata

    logger.info("Loading model and tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)
        model.eval()  # inference only: disable dropout etc.
        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        logger.info("Model loaded successfully on %s", device)
    except Exception:
        logger.exception("Error loading model")
        raise

    logger.info("Loading commentary embeddings...")
    try:
        data_dir = Path("data")
        if not data_dir.exists():
            logger.warning("Data directory not found. Embeddings will be empty.")
            return

        total_entries = 0
        # Each JSON file is expected to hold one entry shaped like
        # {'embedding': [...], 'content': str, 'metadata': {...}} — TODO
        # confirm against the data-generation pipeline.
        for json_file in data_dir.rglob("*.json"):
            try:
                # Explicit UTF-8: commentary text routinely contains
                # non-ASCII characters, and the platform default may differ.
                with open(json_file, 'r', encoding='utf-8') as f:
                    entry = json.load(f)

                book = entry['metadata'].get('book', 'unknown')
                # setdefault keeps the embedding and metadata lists in lockstep
                # so index i refers to the same entry in both.
                commentary_embeddings.setdefault(book, [])
                commentary_metadata.setdefault(book, [])

                commentary_embeddings[book].append(entry['embedding'])
                commentary_metadata[book].append({
                    'content': entry['content'],
                    'father_name': entry['metadata'].get('father_name', 'Unknown'),
                    'source_title': entry['metadata'].get('source_title', ''),
                    'location_start': entry['metadata'].get('location_start', ''),
                    'location_end': entry['metadata'].get('location_end', ''),
                })
                total_entries += 1
            except Exception as e:
                # Best-effort load: skip a malformed file rather than abort startup.
                logger.warning("Error loading %s: %s", json_file, e)
                continue

        # Convert per-book lists to float32 matrices so similarity search is a
        # single vectorized matmul per book.
        for book in commentary_embeddings:
            commentary_embeddings[book] = np.array(commentary_embeddings[book], dtype=np.float32)
            logger.info("Loaded %d entries for %s", len(commentary_embeddings[book]), book)

        logger.info("Successfully loaded %d total entries across %d books",
                    total_entries, len(commentary_embeddings))
    except Exception:
        logger.exception("Error loading embeddings")
        raise
def generate_embedding(text: str) -> np.ndarray:
    """Embed *text* with the globally loaded BGE model.

    Prepends the retrieval instruction expected by BGE query encoding,
    mean-pools the final hidden states and L2-normalizes, returning a
    1-D unit-length float vector.
    """
    # BGE models are trained with an instruction prefix on the query side.
    query_instruction = "Represent the Religious Bible verse commentary text for semantic search:"
    prompt = f"{query_instruction} {text}"

    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Keep the inputs on whatever device the model currently lives on.
    target_device = next(model.parameters()).device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
        pooled = hidden.mean(dim=1)                                 # mean pooling over tokens
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)  # unit length

    return pooled.cpu().numpy()[0]
def cosine_similarity(query_embedding: np.ndarray, doc_embeddings: np.ndarray) -> np.ndarray:
    """Return cosine similarities between one query vector and a matrix of document vectors.

    ``query_embedding`` is 1-D; ``doc_embeddings`` is 2-D with one row per
    document.  Both sides are unit-normalized so the dot product equals the
    cosine of the angle between them.
    """
    unit_query = query_embedding / np.linalg.norm(query_embedding)
    row_norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    unit_docs = doc_embeddings / row_norms
    # Matrix-vector product: one similarity per document row.
    return unit_docs @ unit_query
@app.get("/")
async def root():
    """Health check and API info: model name, corpus size, and device."""
    entry_count = sum(len(vectors) for vectors in commentary_embeddings.values())
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return {
        "status": "online",
        "model": MODEL_NAME,
        "books_loaded": len(commentary_embeddings),
        "total_entries": entry_count,
        "device": device,
        "available_books": list(commentary_embeddings.keys()),
    }
@app.get("/health")
async def health_check():
    """Detailed readiness probe: reports which startup stages completed."""
    status = {
        "model_loaded": model is not None,
        "tokenizer_loaded": tokenizer is not None,
        "embeddings_loaded": bool(commentary_embeddings),
        "books_available": list(commentary_embeddings.keys()),
        "fathers_available": CHURCH_FATHERS,
    }
    return status
@app.post("/search", response_model=SearchResponse)
async def search(request: SearchRequest):
    """
    Perform semantic search over Church Fathers commentaries
    - **query**: The search query text
    - **limit**: Number of results to return (1-100)
    - **books**: Optional list of NT books to filter by
    - **fathers**: Optional list of Church Fathers to filter by

    Raises 503 when the model or embeddings are not loaded yet, 400 when
    the requested book filter matches nothing, 500 on unexpected errors.
    """
    import time
    start_time = time.time()

    # Validate model is loaded
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet. Please try again in a moment.")
    # Validate we have embeddings
    if len(commentary_embeddings) == 0:
        raise HTTPException(status_code=503, detail="Commentary embeddings not loaded. Please check data directory.")

    # Resolve the book filter OUTSIDE the broad try block below: previously the
    # 400 raised here was caught by `except Exception` and returned as a 500.
    if request.books:
        books_to_search = [b for b in request.books if b in commentary_embeddings]
        if not books_to_search:
            raise HTTPException(status_code=400, detail="None of the specified books are available")
    else:
        books_to_search = list(commentary_embeddings.keys())

    try:
        logger.info("Generating embedding for query: %s...", request.query[:50])
        query_embedding = generate_embedding(request.query)

        all_results = []
        total_searched = 0
        for book in books_to_search:
            book_embeddings = commentary_embeddings[book]
            book_metadata = commentary_metadata[book]

            # One vectorized pass per book.
            similarities = cosine_similarity(query_embedding, book_embeddings)

            for i, similarity in enumerate(similarities):
                # isfinite already excludes NaN (and +/-inf from zero-norm rows).
                if not np.isfinite(similarity):
                    continue
                metadata = book_metadata[i]
                # Apply father filter if specified
                if request.fathers and metadata['father_name'] not in request.fathers:
                    continue
                all_results.append({
                    "book": book,
                    "father_name": metadata['father_name'],
                    "source_title": metadata['source_title'],
                    "content": metadata['content'],
                    "similarity": float(similarity),
                    "location_start": str(metadata.get('location_start', '')),
                    "location_end": str(metadata.get('location_end', ''))
                })
            total_searched += len(similarities)

        # Rank globally across books and keep the top `limit`.
        all_results.sort(key=lambda x: x['similarity'], reverse=True)
        top_results = all_results[:request.limit]

        execution_time = (time.time() - start_time) * 1000  # Convert to ms
        logger.info("Search completed in %.2fms, returning %d results",
                    execution_time, len(top_results))

        return SearchResponse(
            query=request.query,
            results=top_results,
            total_searched=total_searched,
            execution_time_ms=round(execution_time, 2)
        )
    except HTTPException:
        # Never convert deliberate HTTP errors into a generic 500.
        raise
    except Exception as e:
        logger.exception("Error during search")
        raise HTTPException(status_code=500, detail=str(e))
# Run a standalone dev server when executed directly; port 7860 is the
# Hugging Face Spaces convention.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)