Kalpokoch committed · verified
Commit 0e44477
1 Parent(s): 8066ccb

Update app/app.py

Files changed (1):
  app/app.py +125 -38
app/app.py CHANGED
@@ -1,49 +1,136 @@
- from fastapi import FastAPI
  from pydantic import BaseModel
  from llama_cpp import Llama
  import os
- import requests

- from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

- MODEL_URL = "https://huggingface.co/Kalpokoch/QuantizedFineTunedPhi1.5/resolve/main/dop-phi-1.5-Q4_K_M.gguf"
- MODEL_PATH = "/tmp/models/dop-phi-1.5-Q4_K_M.gguf"
- CHUNKS_PATH = "/app/processed_chunks.json"
-
- # Download the model if not already present
- def download_model():
-     if not os.path.exists(MODEL_PATH):
-         os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
-         print("🔽 Downloading model...")
-         url = "https://huggingface.co/Kalpokoch/QuantizedFineTunedPhi1.5/resolve/main/dop-phi-1.5-Q4_K_M.gguf"
-         response = requests.get(url, stream=True)
-         if response.status_code == 200:
-             with open(MODEL_PATH, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-             print("✅ Model downloaded successfully.")
-         else:
-             raise Exception(f"Failed to download model: {response.status_code}")
-
- download_model()
-
- # Initialize model and vector database
- llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=4)
- vector_db = PolicyVectorDB(CHUNKS_PATH)
- ensure_db_populated(vector_db)
-
- # FastAPI app setup
  app = FastAPI()

  class Query(BaseModel):
      question: str

- @app.post("/ask")
- async def ask_question(query: Query):
-     question = query.question
-     results = vector_db.query(question)
-     context_text = "\n".join([item["text"] for item in results])
-     prompt = f"Context:\n{context_text}\n\nQuestion: {question}\nAnswer:"

-     output = llm(prompt=prompt, max_tokens=512)
-     return {"answer": output["choices"][0]["text"].strip()}
+ # Complete and final app.py
+
+ from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from llama_cpp import Llama
+ import logging
+ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
+ import asyncio
  import os

+ # -----------------------------
+ # ✅ Logging Configuration
+ # -----------------------------
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger("app")

+ # -----------------------------
+ # ✅ Initialize FastAPI App
+ # -----------------------------
  app = FastAPI()

+ @app.get("/")
+ async def root():
+     return {"status": "✅ Server is running and ready."}
+
+ # -----------------------------
+ # ✅ Feedback Collection
+ # -----------------------------
+ class Feedback(BaseModel):
+     question: str
+     answer: str
+     feedback: str
+
+ @app.post("/feedback")
+ async def collect_feedback(feedback: Feedback):
+     logger.info(f"[FEEDBACK] Question: {feedback.question} | Answer: {feedback.answer} | Feedback: {feedback.feedback}")
+     return {"status": "✅ Feedback recorded. Thank you!"}
+
+ # -----------------------------
+ # ✅ Vector DB Configuration
+ # -----------------------------
+ DB_PERSIST_DIRECTORY = "/app/vector_database"
+ CHUNKS_FILE_PATH = "/app/processed_chunks.json"
+ logger.info("[INFO] Initializing vector DB...")
+ db = PolicyVectorDB(
+     persist_directory=DB_PERSIST_DIRECTORY,
+     top_k_default=5,
+     relevance_threshold=0.2
+ )
+ if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+     logger.warning("[WARNING] DB not populated. RAG will not function correctly.")
+ else:
+     logger.info("[INFO] Vector DB ready.")
+
+ # -----------------------------
+ # ✅ Load Your GGUF Model
+ # -----------------------------
+ # <-- UPDATED: Points to the new local model file downloaded in the Dockerfile
+ MODEL_PATH = "/app/phi1.5_dop_q4_k_m.gguf"
+
+ logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
+
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,
+     n_threads=2,
+     n_gpu_layers=0,
+     verbose=False
+ )
+ logger.info("[INFO] Model loaded successfully.")
+
+ # -----------------------------
+ # ✅ Query Schema
+ # -----------------------------
  class Query(BaseModel):
      question: str

+ # -----------------------------
+ # ✅ Chat Endpoint
+ # -----------------------------
+ LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
+ logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+
+ async def generate_llm_response(prompt: str):
+     """Run the blocking llama.cpp call in a worker thread so asyncio.wait_for
+     can enforce the timeout without stalling the event loop."""
+     response = await asyncio.to_thread(
+         llm, prompt, max_tokens=384, stop=["Instruct:", "Output:", "###"], temperature=0.2, echo=False
+     )
+     answer = response["choices"][0]["text"].strip()
+     if not answer:
+         raise ValueError("Empty response from LLM")
+     return answer
+
+ @app.post("/chat")
+ async def chat(query: Query):
+     question = query.question.strip()
+     logger.info(f"[QUERY] {question}")
+
+     search_results = db.search(question)
+     filtered = sorted(
+         [r for r in search_results if r["relevance_score"] > db.relevance_threshold],
+         key=lambda x: x["relevance_score"],
+         reverse=True
+     )
+
+     if not filtered:
+         logger.info("[RESPONSE] No relevant context found.")
+         return {
+             "question": question,
+             "context_used": "No relevant context found.",
+             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+         }
+
+     context = filtered[0]["text"]
+     logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+
+     # This prompt format matches how you fine-tuned Phi-1.5
+     prompt = f"""Instruct: Use the following context to answer the question.
+ Context: {context}
+ Question: {question}
+ Output:"""
+
+     answer = "Sorry, I couldn't process your request right now. Please try again later."
+     try:
+         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
+     except asyncio.TimeoutError:
+         logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+         answer = "Sorry, the request took too long to process. Please try again with a simpler question."
+     except Exception as e:
+         logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
+         answer = "Sorry, an unexpected error occurred while generating a response."

+     logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+     return {
+         "question": question,
+         "context_used": context,
+         "answer": answer
+     }
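
For reference, a minimal client sketch (not part of this commit) showing how the updated root, /chat, and /feedback endpoints could be exercised once the app is running. The base URL and port, the sample question, and the feedback string are assumptions; only the request and response field names follow the Query and Feedback schemas in the diff above.

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port; adjust to your deployment

# Liveness check against the new root endpoint
print(requests.get(f"{BASE_URL}/").json())

# Ask a question through the RAG /chat endpoint
resp = requests.post(
    f"{BASE_URL}/chat",
    json={"question": "What is the delegation of power for minor works?"},  # sample question (assumption)
    timeout=90,
)
data = resp.json()
print(data["context_used"])
print(data["answer"])

# Record feedback on the returned answer via /feedback
requests.post(
    f"{BASE_URL}/feedback",
    json={"question": data["question"], "answer": data["answer"], "feedback": "helpful"},
)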