"""Davidic Sermon Intelligence API.

FastAPI service exposing three model-backed endpoints:
  /embed   -- MiniLM sentence embeddings
  /rerank  -- cross-encoder relevance scores for (query, document) pairs
  /insight -- long-form commentary generated by TinyLlama-1.1B-Chat
"""
import os  # retained: unused here but may be relied on by deployment tooling

import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = FastAPI(title="Davidic Sermon Intelligence API")

# NOTE(review): browsers reject `Access-Control-Allow-Origin: *` on
# credentialed requests, so allow_credentials=True is ineffective with a
# wildcard origin. If cookie/auth requests are needed, list the concrete
# origins instead of "*". Behavior left as-is pending that decision.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Models are loaded once at import time so every request reuses them.
print("Loading Embedding model...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Loading Reranker model...")
reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

print("Loading Tiny LLM (TinyLlama-1.1B)...")
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # fp32: CPU-friendly; half precision would need a GPU
    low_cpu_mem_usage=True
)

# Pipeline WITHOUT generation config to avoid warnings; every generation
# parameter is instead passed explicitly per call in /insight.
llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer
)
print("All models loaded Ready.")


class EmbedRequest(BaseModel):
    # Raw text to embed.
    text: str


class RerankRequest(BaseModel):
    # Search query plus the candidate documents to score against it.
    query: str
    documents: list[str]


class InsightRequest(BaseModel):
    # User question plus retrieved transcript context handed to the LLM.
    query: str
    context: str


@app.get("/")
def health_check():
    """Liveness probe: returns a static status payload."""
    return {"status": "running"}


@app.post("/embed")
def embed(request: EmbedRequest):
    """Return the MiniLM embedding of ``request.text`` as a plain list of floats."""
    try:
        return embedding_model.encode(request.text).tolist()
    except Exception as e:
        # HTTP boundary: surface any model failure as a 500 with detail.
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/rerank")
def rerank(request: RerankRequest):
    """Score each document against the query with the cross-encoder.

    Returns a list of floats, one per document, in the input order.
    """
    try:
        pairs = [[request.query, doc] for doc in request.documents]
        return reranker_model.predict(pairs).tolist()
    except Exception as e:
        # HTTP boundary: surface any model failure as a 500 with detail.
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/insight")
def generate_insight(request: InsightRequest):
    """Generate long-form commentary on the context, grounded in the question.

    Returns ``{"insight": <generated text>}``; any failure becomes a 500.
    """
    try:
        print(f"Generating insight for: {request.query}")

        # TinyLlama's chat format terminates each turn with </s>. Build the
        # prompt through the tokenizer's own chat template so the separators
        # match exactly what the model was trained on (the previous
        # hand-rolled prompt was missing the </s> turn delimiters).
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a helpful spiritual assistant for Davidic Generation Church. "
                    "Explain the spiritual context of the videos below based on their transcripts.\n"
                    "RULES:\n"
                    "1. Refer to videos like this: 'In [Video 1], Pastor explains...'.\n"
                    "2. Summarize WHY this moment is relevant to the question.\n"
                    "3. Do NOT just repeat the transcript. Explain the meaning.\n"
                    "4. Be thorough and long-form."
                ),
            },
            {
                "role": "user",
                "content": f"CONTEXT:\n{request.context}\n\nQUESTION: {request.query}",
            },
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Explicitly set ALL generation parameters here (the pipeline was
        # deliberately built without a generation config).
        output = llm_pipeline(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        result = output[0]['generated_text']

        # The pipeline echoes the prompt; keep only the assistant's turn.
        if "<|assistant|>" in result:
            insight = result.split("<|assistant|>")[-1].strip()
        else:
            insight = result[len(prompt):].strip()
        return {"insight": insight}
    except Exception as e:
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)