Create app.py
app.py
ADDED
"""
Khalifa University Library RAG Backend
LangChain + FAISS + FastAPI on Hugging Face Spaces

This app:
1. Loads scraped library pages from the 'knowledge/' folder
2. Chunks and embeds them using OpenAI embeddings
3. Stores them in a FAISS vector store
4. Exposes a /rag endpoint that retrieves relevant chunks and generates grounded answers

Environment variables (set as HF Space Secrets):
    OPENAI_API_KEY — for embeddings + LLM
"""
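
# Illustrative requirements.txt for this Space (package names inferred from
# the imports below, not taken from the repo; pin versions as needed):
#   fastapi
#   uvicorn
#   pydantic
#   langchain
#   langchain-openai
#   langchain-community
#   faiss-cpu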

import os
import glob
import shutil
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# ===== CONFIG =====
KNOWLEDGE_DIR = "knowledge"
FAISS_INDEX_PATH = "faiss_index"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100
EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"
TOP_K = 5

# ===== GLOBAL STATE =====
qa_chain = None
vectorstore = None


def load_documents():
    """Load all .txt files from the knowledge directory."""
    docs = []
    files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
    print(f"Found {len(files)} knowledge files")

    for filepath in files:
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()

            # Extract metadata from first two lines
            lines = content.split("\n", 3)
            source = ""
            title = ""
            text = content

            for line in lines[:2]:
                if line.startswith("SOURCE:"):
                    source = line.replace("SOURCE:", "").strip()
                elif line.startswith("TITLE:"):
                    title = line.replace("TITLE:", "").strip()

            if source or title:
                text = "\n".join(lines[2:]).strip()

            docs.append(Document(
                page_content=text,
                metadata={"source": source, "title": title, "file": os.path.basename(filepath)}
            ))
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Loaded {len(docs)} documents")
    return docs
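
# Illustrative knowledge-file layout that load_documents() expects. The file
# name, URL, and body below are hypothetical placeholders, not real content:
#
#   SOURCE: https://library.ku.ac.ae/some-page
#   TITLE: Some Library Page
#
#   Body text scraped from the page...
#
# Files without SOURCE:/TITLE: header lines are still loaded; the whole file
# becomes page_content and source/title stay empty.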


def build_vectorstore(docs):
    """Chunk documents and create FAISS vector store."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = splitter.split_documents(docs)
    print(f"Split into {len(chunks)} chunks")

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    # Try to load existing index first. Note: when an index is found on disk,
    # the chunks computed above are discarded and the stored vectors are used
    # as-is; hit /rebuild to re-embed after changing knowledge files.
    if os.path.exists(FAISS_INDEX_PATH):
        print("Loading existing FAISS index...")
        store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print(f"Loaded FAISS index with {store.index.ntotal} vectors")
        return store

    # Build new index
    print("Building new FAISS index...")
    store = FAISS.from_documents(chunks, embeddings)
    store.save_local(FAISS_INDEX_PATH)
    print(f"Created FAISS index with {store.index.ntotal} vectors")
    return store
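
# Quick retrieval smoke test (illustrative; the query string is made up).
# Kept commented out so it never runs inside the Space:
#
#   store = build_vectorstore(load_documents())
#   for doc in store.similarity_search("library opening hours", k=3):
#       print(doc.metadata.get("title"), "->", doc.page_content[:80])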


def build_chain(store):
    """Build the LangChain RetrievalQA chain."""
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0.2, max_tokens=500)

    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""You are the Khalifa University Library AI Assistant in Abu Dhabi, UAE.
KU means Khalifa University, NOT Kuwait University.

Use ONLY the following context from the Khalifa University Library website to answer the question.
If the context doesn't contain enough information, say "I don't have specific information about this in our library knowledge base" and suggest contacting Ask a Librarian at https://library.ku.ac.ae/AskUs

Always include relevant URLs from the context when available.
Keep answers concise (2-4 sentences) and helpful.

Context:
{context}

Question: {question}

Answer:"""
    )

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=store.as_retriever(search_kwargs={"k": TOP_K}),
        chain_type_kwargs={"prompt": prompt_template},
        return_source_documents=True,
    )
    return chain
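
# Example invocation (illustrative question): RetrievalQA takes the question
# under the "query" key and, with return_source_documents=True, returns both
# "result" and "source_documents"; the /rag endpoint below relies on this.
#
#   out = chain.invoke({"query": "How do I contact a librarian?"})
#   print(out["result"], len(out["source_documents"]))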


# ===== STARTUP =====
@asynccontextmanager
async def lifespan(app: FastAPI):
    global qa_chain, vectorstore
    print("=== Starting KU Library RAG Backend ===")

    docs = load_documents()
    if docs:
        vectorstore = build_vectorstore(docs)
        qa_chain = build_chain(vectorstore)
        print("RAG chain ready!")
    else:
        print("WARNING: No knowledge files found. RAG will not work.")
        print(f"Please add .txt files to the '{KNOWLEDGE_DIR}/' directory.")

    yield

    print("Shutting down...")


# ===== FASTAPI APP =====
app = FastAPI(title="KU Library RAG", lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Restrict to your domains in production
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
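
# Illustrative production setting (the origin below is an assumption about
# where the chat widget would be hosted, not taken from this repo):
#   allow_origins=["https://library.ku.ac.ae"]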


class QueryRequest(BaseModel):
    question: str
    top_k: int = 5  # Accepted but currently unused; retrieval uses the fixed TOP_K


class SourceDoc(BaseModel):
    title: str
    source: str
    snippet: str


class QueryResponse(BaseModel):
    answer: str
    sources: list[SourceDoc]
    error: str | None = None


@app.get("/")
def health():
    return {
        "status": "ok",
        "rag_ready": qa_chain is not None,
        "service": "KU Library RAG Backend",
    }


@app.post("/rag", response_model=QueryResponse)
async def rag_query(req: QueryRequest):
    if not qa_chain:
        return QueryResponse(
            answer="RAG system not initialized. Knowledge base may be empty.",
            sources=[],
            error="No knowledge files loaded"
        )

    try:
        result = qa_chain.invoke({"query": req.question})
        answer = result.get("result", "No answer generated.")
        source_docs = result.get("source_documents", [])

        # Deduplicate sources by URL (or title) across the retrieved chunks
        sources = []
        seen = set()
        for doc in source_docs:
            src = doc.metadata.get("source", "")
            title = doc.metadata.get("title", "")
            key = src or title
            if key and key not in seen:
                seen.add(key)
                sources.append(SourceDoc(
                    title=title,
                    source=src,
                    snippet=doc.page_content[:200] + "..."
                ))

        return QueryResponse(answer=answer, sources=sources)

    except Exception as e:
        return QueryResponse(
            answer="Sorry, I encountered an error processing your question.",
            sources=[],
            error=str(e)
        )
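
# Illustrative request/response (the Space hostname is a placeholder):
#
#   curl -X POST https://<your-space>.hf.space/rag \
#        -H "Content-Type: application/json" \
#        -d '{"question": "How do I contact a librarian?"}'
#
# -> {"answer": "...", "sources": [{"title": "...", "source": "...",
#     "snippet": "..."}], "error": null}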


@app.post("/rebuild")
async def rebuild_index():
    """Force rebuild the FAISS index from knowledge files."""
    # Note: this endpoint is unauthenticated; anyone who can reach the Space
    # can trigger a full re-embed (which costs OpenAI API credits).
    global qa_chain, vectorstore
    try:
        if os.path.exists(FAISS_INDEX_PATH):
            shutil.rmtree(FAISS_INDEX_PATH)

        docs = load_documents()
        if not docs:
            return {"error": "No knowledge files found"}

        vectorstore = build_vectorstore(docs)
        qa_chain = build_chain(vectorstore)
        return {"status": "ok", "chunks": vectorstore.index.ntotal}
    except Exception as e:
        return {"error": str(e)}
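
# Hardening sketch (assumption: a REBUILD_TOKEN secret would be added to the
# Space; neither the secret nor this check exists in the app above). A minimal
# way to gate /rebuild behind a shared token sent as an HTTP header:
#
#   from fastapi import Header, HTTPException
#
#   @app.post("/rebuild")
#   async def rebuild_index(x_rebuild_token: str = Header(default="")):
#       if x_rebuild_token != os.environ.get("REBUILD_TOKEN", ""):
#           raise HTTPException(status_code=403, detail="Forbidden")
#       ...  # same rebuild logic as above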