Spaces:
Sleeping
Sleeping
| import csv | |
| import os | |
| from pathlib import Path | |
| import faiss | |
| import numpy as np | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel, Field | |
| from sentence_transformers import SentenceTransformer | |
| DATA_PATH = Path(os.getenv("FAQ_CSV_PATH", "data/lauki_qna.csv")) | |
| MODEL_NAME = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2") | |
| app = FastAPI(title="Lauki FAQ Retrieval Service") | |
| class SearchRequest(BaseModel): | |
| query: str = Field(min_length=1) | |
| k: int = Field(default=3, ge=1, le=10) | |
| class SearchResult(BaseModel): | |
| rank: int | |
| score: float | |
| question: str | |
| answer: str | |
| content: str | |
| class SearchResponse(BaseModel): | |
| query: str | |
| k: int | |
| context: str | |
| results: list[SearchResult] | |
| def load_rows(path: Path) -> list[dict[str, str]]: | |
| if not path.exists(): | |
| raise FileNotFoundError(f"FAQ CSV not found: {path}") | |
| rows: list[dict[str, str]] = [] | |
| with path.open("r", encoding="utf-8") as f: | |
| reader = csv.DictReader(f) | |
| for row in reader: | |
| question = row["question"].strip() | |
| answer = row["answer"].strip() | |
| rows.append( | |
| { | |
| "question": question, | |
| "answer": answer, | |
| "content": f"Q: {question}\nA: {answer}", | |
| } | |
| ) | |
| return rows | |
| rows = load_rows(DATA_PATH) | |
| model = SentenceTransformer(MODEL_NAME) | |
| embeddings = model.encode( | |
| [row["content"] for row in rows], | |
| convert_to_numpy=True, | |
| normalize_embeddings=True, | |
| ) | |
| embeddings = np.asarray(embeddings, dtype="float32") | |
| index = faiss.IndexFlatIP(embeddings.shape[1]) | |
| index.add(embeddings) | |
| def health() -> dict[str, int | str]: | |
| return {"status": "ok", "documents": len(rows), "model": MODEL_NAME} | |
| def search(req: SearchRequest) -> SearchResponse: | |
| if not rows: | |
| raise HTTPException(status_code=503, detail="FAQ index is empty") | |
| query_embedding = model.encode( | |
| [req.query], | |
| convert_to_numpy=True, | |
| normalize_embeddings=True, | |
| ) | |
| query_embedding = np.asarray(query_embedding, dtype="float32") | |
| scores, indices = index.search(query_embedding, min(req.k, len(rows))) | |
| results: list[SearchResult] = [] | |
| for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1): | |
| if idx < 0: | |
| continue | |
| row = rows[int(idx)] | |
| results.append( | |
| SearchResult( | |
| rank=rank, | |
| score=float(score), | |
| question=row["question"], | |
| answer=row["answer"], | |
| content=row["content"], | |
| ) | |
| ) | |
| context = "\n\n---\n\n".join( | |
| f"FAQ Entry {result.rank}:\n{result.content}" for result in results | |
| ) | |
| return SearchResponse(query=req.query, k=req.k, context=context, results=results) | |
| if __name__ == "__main__": | |
| import uvicorn | |
| uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860"))) | |