# biblos-cf-api / app.py
# Author: rdmlx — commit a9936e3: "Update to NT-only dataset with 9 Church Fathers"
"""
Church Fathers Commentary Semantic Search API
Hugging Face Spaces deployment with FastAPI
Keeps model in memory for fast responses (~50-100ms after initial load)
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Optional
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import json
import os
from pathlib import Path
import logging
# Configure logging
# INFO level so startup progress and per-request timing messages show up
# in the Spaces container logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize FastAPI app
# Title/description/version feed the auto-generated OpenAPI docs at /docs.
app = FastAPI(
    title="Church Fathers Commentary Search API",
    description="Semantic search over Church Fathers commentaries using BGE embeddings",
    version="1.0.0"
)
# Enable CORS for all origins
# NOTE(review): the CORS spec forbids a wildcard origin together with
# credentials, and Starlette will not echo "*" when allow_credentials=True —
# confirm whether credentialed (cookie/auth) cross-origin requests are
# actually needed; if not, drop allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request/Response models
# Request/Response models
class SearchRequest(BaseModel):
    """Body of POST /search: a query string plus optional book/father filters."""
    query: str = Field(..., description="Search query text", min_length=1, max_length=500)
    limit: int = Field(10, description="Number of results to return", ge=1, le=100)
    books: Optional[List[str]] = Field(None, description="Filter by specific NT books (e.g., ['matthew', 'john'])")
    fathers: Optional[List[str]] = Field(None, description="Filter by specific Church Fathers")
class SearchResult(BaseModel):
    """One scored commentary hit returned by /search."""
    book: str                               # lowercase NT book key (e.g. "matthew")
    father_name: str
    source_title: str
    content: str                            # the commentary text itself
    similarity: float                       # cosine similarity vs. the query
    location_start: Optional[str] = None    # verse/section reference, if available
    location_end: Optional[str] = None
class SearchResponse(BaseModel):
    """Envelope for /search: echoed query, ranked results, and timing info."""
    query: str
    results: List[SearchResult]
    total_searched: int          # number of candidate entries scored
    execution_time_ms: float
# Global variables for model and data
# All four are populated once by load_model_and_data() at startup and read
# by the request handlers afterwards.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
tokenizer = None              # transformers tokenizer, set at startup
model = None                  # transformers model, set at startup
commentary_embeddings = {}    # book -> np.ndarray of shape (n_entries, dim), float32
commentary_metadata = {}      # book -> list of metadata dicts, parallel to embeddings
# Book and Father mappings
# All 27 New Testament books, as lowercase keys matching the data files.
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]
# The nine Church Fathers covered by this dataset; names must match the
# 'father_name' field in the metadata for the /search father filter to work.
CHURCH_FATHERS = [
    "Augustine of Hippo",
    "Athanasius of Alexandria",
    "Basil of Caesarea",
    "Gregory of Nazianzus",
    "Gregory of Nyssa",
    "Cyril of Alexandria",
    "Irenaeus",
    "Cyprian",
    "Origen of Alexandria"
]
@app.on_event("startup")
async def load_model_and_data():
    """Load the embedding model and commentary embeddings into memory at startup.

    Populates the module-level ``tokenizer``, ``model``,
    ``commentary_embeddings`` and ``commentary_metadata`` globals.  A failure
    to load the model aborts startup (re-raised); a missing ``data/``
    directory is tolerated — the API starts but /search returns 503 until
    embeddings exist.
    """
    global tokenizer, model, commentary_embeddings, commentary_metadata

    logger.info("Loading model and tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)
        model.eval()  # inference only: disable dropout etc.
        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        logger.info("Model loaded successfully on %s", device)
    except Exception:
        logger.exception("Error loading model")
        raise

    logger.info("Loading commentary embeddings...")
    try:
        data_dir = Path("data")
        if not data_dir.exists():
            logger.warning("Data directory not found. Embeddings will be empty.")
            return

        total_entries = 0
        # Each JSON file is expected to hold one entry shaped like
        # {'embedding': [...], 'content': str, 'metadata': {...}} — TODO
        # confirm against the data-generation pipeline.
        for json_file in data_dir.rglob("*.json"):
            try:
                # Explicit UTF-8: commentary text routinely contains
                # non-ASCII characters, and the platform default may differ.
                with open(json_file, 'r', encoding='utf-8') as f:
                    entry = json.load(f)

                book = entry['metadata'].get('book', 'unknown')
                # setdefault keeps the embedding and metadata lists in lockstep
                # so index i refers to the same entry in both.
                commentary_embeddings.setdefault(book, [])
                commentary_metadata.setdefault(book, [])

                commentary_embeddings[book].append(entry['embedding'])
                commentary_metadata[book].append({
                    'content': entry['content'],
                    'father_name': entry['metadata'].get('father_name', 'Unknown'),
                    'source_title': entry['metadata'].get('source_title', ''),
                    'location_start': entry['metadata'].get('location_start', ''),
                    'location_end': entry['metadata'].get('location_end', ''),
                })
                total_entries += 1
            except Exception as e:
                # Best-effort load: skip a malformed file rather than abort startup.
                logger.warning("Error loading %s: %s", json_file, e)
                continue

        # Convert per-book lists to float32 matrices so similarity search is a
        # single vectorized matmul per book.
        for book in commentary_embeddings:
            commentary_embeddings[book] = np.array(commentary_embeddings[book], dtype=np.float32)
            logger.info("Loaded %d entries for %s", len(commentary_embeddings[book]), book)

        logger.info("Successfully loaded %d total entries across %d books",
                    total_entries, len(commentary_embeddings))
    except Exception:
        logger.exception("Error loading embeddings")
        raise
def generate_embedding(text: str) -> np.ndarray:
    """Embed *text* with the globally loaded BGE model.

    Prepends the retrieval instruction expected by BGE query encoding,
    mean-pools the final hidden states and L2-normalizes, returning a
    1-D unit-length float vector.
    """
    # BGE models are trained with an instruction prefix on the query side.
    query_instruction = "Represent the Religious Bible verse commentary text for semantic search:"
    prompt = f"{query_instruction} {text}"

    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Keep the inputs on whatever device the model currently lives on.
    target_device = next(model.parameters()).device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
        pooled = hidden.mean(dim=1)                                 # mean pooling over tokens
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)  # unit length

    return pooled.cpu().numpy()[0]
def cosine_similarity(query_embedding: np.ndarray, doc_embeddings: np.ndarray) -> np.ndarray:
    """Return cosine similarities between one query vector and a matrix of document vectors.

    ``query_embedding`` is 1-D; ``doc_embeddings`` is 2-D with one row per
    document.  Both sides are unit-normalized so the dot product equals the
    cosine of the angle between them.
    """
    unit_query = query_embedding / np.linalg.norm(query_embedding)
    row_norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    unit_docs = doc_embeddings / row_norms
    # Matrix-vector product: one similarity per document row.
    return unit_docs @ unit_query
@app.get("/")
async def root():
    """Health check and API info: model name, corpus size, and device."""
    entry_count = sum(len(vectors) for vectors in commentary_embeddings.values())
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return {
        "status": "online",
        "model": MODEL_NAME,
        "books_loaded": len(commentary_embeddings),
        "total_entries": entry_count,
        "device": device,
        "available_books": list(commentary_embeddings.keys()),
    }
@app.get("/health")
async def health_check():
    """Detailed readiness probe: reports which startup stages completed."""
    status = {
        "model_loaded": model is not None,
        "tokenizer_loaded": tokenizer is not None,
        "embeddings_loaded": bool(commentary_embeddings),
        "books_available": list(commentary_embeddings.keys()),
        "fathers_available": CHURCH_FATHERS,
    }
    return status
@app.post("/search", response_model=SearchResponse)
async def search(request: SearchRequest):
    """
    Perform semantic search over Church Fathers commentaries
    - **query**: The search query text
    - **limit**: Number of results to return (1-100)
    - **books**: Optional list of NT books to filter by
    - **fathers**: Optional list of Church Fathers to filter by

    Raises 503 when the model or embeddings are not loaded yet, 400 when
    the requested book filter matches nothing, 500 on unexpected errors.
    """
    import time
    start_time = time.time()

    # Validate model is loaded
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet. Please try again in a moment.")
    # Validate we have embeddings
    if len(commentary_embeddings) == 0:
        raise HTTPException(status_code=503, detail="Commentary embeddings not loaded. Please check data directory.")

    # Resolve the book filter OUTSIDE the broad try block below: previously the
    # 400 raised here was caught by `except Exception` and returned as a 500.
    if request.books:
        books_to_search = [b for b in request.books if b in commentary_embeddings]
        if not books_to_search:
            raise HTTPException(status_code=400, detail="None of the specified books are available")
    else:
        books_to_search = list(commentary_embeddings.keys())

    try:
        logger.info("Generating embedding for query: %s...", request.query[:50])
        query_embedding = generate_embedding(request.query)

        all_results = []
        total_searched = 0
        for book in books_to_search:
            book_embeddings = commentary_embeddings[book]
            book_metadata = commentary_metadata[book]

            # One vectorized pass per book.
            similarities = cosine_similarity(query_embedding, book_embeddings)

            for i, similarity in enumerate(similarities):
                # isfinite already excludes NaN (and +/-inf from zero-norm rows).
                if not np.isfinite(similarity):
                    continue
                metadata = book_metadata[i]
                # Apply father filter if specified
                if request.fathers and metadata['father_name'] not in request.fathers:
                    continue
                all_results.append({
                    "book": book,
                    "father_name": metadata['father_name'],
                    "source_title": metadata['source_title'],
                    "content": metadata['content'],
                    "similarity": float(similarity),
                    "location_start": str(metadata.get('location_start', '')),
                    "location_end": str(metadata.get('location_end', ''))
                })
            total_searched += len(similarities)

        # Rank globally across books and keep the top `limit`.
        all_results.sort(key=lambda x: x['similarity'], reverse=True)
        top_results = all_results[:request.limit]

        execution_time = (time.time() - start_time) * 1000  # Convert to ms
        logger.info("Search completed in %.2fms, returning %d results",
                    execution_time, len(top_results))

        return SearchResponse(
            query=request.query,
            results=top_results,
            total_searched=total_searched,
            execution_time_ms=round(execution_time, 2)
        )
    except HTTPException:
        # Never convert deliberate HTTP errors into a generic 500.
        raise
    except Exception as e:
        logger.exception("Error during search")
        raise HTTPException(status_code=500, detail=str(e))
# Run a standalone dev server when executed directly; port 7860 is the
# Hugging Face Spaces convention.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)