Kalpokoch committed · verified
Commit 0e44477
1 Parent(s): 8066ccb

Update app/app.py

Files changed (1):
  app/app.py +125 -38
app/app.py CHANGED
@@ -1,49 +1,136 @@
- from fastapi import FastAPI
  from pydantic import BaseModel
  from llama_cpp import Llama
  import os
- import requests

- from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

- MODEL_URL = "https://huggingface.co/Kalpokoch/QuantizedFineTunedPhi1.5/resolve/main/dop-phi-1.5-Q4_K_M.gguf"
- MODEL_PATH = "/tmp/models/dop-phi-1.5-Q4_K_M.gguf"
- CHUNKS_PATH = "/app/processed_chunks.json"
-
- # Download the model if not already present
- def download_model():
-     if not os.path.exists(MODEL_PATH):
-         os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
-         print("🔽 Downloading model...")
-         url = "https://huggingface.co/Kalpokoch/QuantizedFineTunedPhi1.5/resolve/main/dop-phi-1.5-Q4_K_M.gguf"
-         response = requests.get(url, stream=True)
-         if response.status_code == 200:
-             with open(MODEL_PATH, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-             print("✅ Model downloaded successfully.")
-         else:
-             raise Exception(f"Failed to download model: {response.status_code}")
-
- download_model()
-
- # Initialize model and vector database
- llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=4)
- vector_db = PolicyVectorDB(CHUNKS_PATH)
- ensure_db_populated(vector_db)
-
- # FastAPI app setup
  app = FastAPI()

  class Query(BaseModel):
      question: str

- @app.post("/ask")
- async def ask_question(query: Query):
-     question = query.question
-     results = vector_db.query(question)
-     context_text = "\n".join([item["text"] for item in results])
-     prompt = f"Context:\n{context_text}\n\nQuestion: {question}\nAnswer:"

-     output = llm(prompt=prompt, max_tokens=512)
-     return {"answer": output["choices"][0]["text"].strip()}
+ # Complete and final app.py
+
+ from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from llama_cpp import Llama
+ import logging
+ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
+ import asyncio
  import os

+ # -----------------------------
+ # ✅ Logging Configuration
+ # -----------------------------
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger("app")

+ # -----------------------------
+ # ✅ Initialize FastAPI App
+ # -----------------------------
  app = FastAPI()

+ @app.get("/")
+ async def root():
+     return {"status": "✅ Server is running and ready."}
+
+ # -----------------------------
+ # ✅ Feedback Collection
+ # -----------------------------
+ class Feedback(BaseModel):
+     question: str
+     answer: str
+     feedback: str
+
+ @app.post("/feedback")
+ async def collect_feedback(feedback: Feedback):
+     logger.info(f"[FEEDBACK] Question: {feedback.question} | Answer: {feedback.answer} | Feedback: {feedback.feedback}")
+     return {"status": "✅ Feedback recorded. Thank you!"}
+
+ # -----------------------------
+ # ✅ Vector DB Configuration
+ # -----------------------------
+ DB_PERSIST_DIRECTORY = "/app/vector_database"
+ CHUNKS_FILE_PATH = "/app/processed_chunks.json"
+ logger.info("[INFO] Initializing vector DB...")
+ db = PolicyVectorDB(
+     persist_directory=DB_PERSIST_DIRECTORY,
+     top_k_default=5,
+     relevance_threshold=0.2
+ )
+ if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+     logger.warning("[WARNING] DB not populated. RAG will not function correctly.")
+ else:
+     logger.info("[INFO] Vector DB ready.")
+
+ # -----------------------------
+ # ✅ Load Your GGUF Model
+ # -----------------------------
+ # <-- UPDATED: Points to the new local model file downloaded in the Dockerfile
+ MODEL_PATH = "/app/phi1.5_dop_q4_k_m.gguf"
+
+ logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
+
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,
+     n_threads=2,
+     n_gpu_layers=0,
+     verbose=False
+ )
+ logger.info("[INFO] Model loaded successfully.")
+
+ # -----------------------------
+ # ✅ Query Schema
+ # -----------------------------
  class Query(BaseModel):
      question: str

+ # -----------------------------
+ # ✅ Chat Endpoint
+ # -----------------------------
+ LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
+ logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+
+ async def generate_llm_response(prompt: str):
+     """Run the blocking llama.cpp call in a worker thread so asyncio.wait_for
+     can enforce the timeout without stalling the event loop."""
+     response = await asyncio.to_thread(
+         llm, prompt, max_tokens=384, stop=["Instruct:", "Output:", "###"], temperature=0.2, echo=False
+     )
+     answer = response["choices"][0]["text"].strip()
+     if not answer:
+         raise ValueError("Empty response from LLM")
+     return answer
+
+ @app.post("/chat")
+ async def chat(query: Query):
+     question = query.question.strip()
+     logger.info(f"[QUERY] {question}")
+
+     search_results = db.search(question)
+     filtered = sorted(
+         [r for r in search_results if r["relevance_score"] > db.relevance_threshold],
+         key=lambda x: x["relevance_score"],
+         reverse=True
+     )
+
+     if not filtered:
+         logger.info("[RESPONSE] No relevant context found.")
+         return {
+             "question": question,
+             "context_used": "No relevant context found.",
+             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+         }
+
+     context = filtered[0]["text"]
+     logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+
+     # This prompt format matches how you fine-tuned Phi-1.5
+     prompt = f"""Instruct: Use the following context to answer the question.
+ Context: {context}
+ Question: {question}
+ Output:"""
+
+     answer = "Sorry, I couldn't process your request right now. Please try again later."
+     try:
+         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
+     except asyncio.TimeoutError:
+         logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+         answer = "Sorry, the request took too long to process. Please try again with a simpler question."
+     except Exception as e:
+         logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
+         answer = "Sorry, an unexpected error occurred while generating a response."

+     logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+     return {
+         "question": question,
+         "context_used": context,
+         "answer": answer
+     }
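
For reference, a minimal client sketch (not part of this commit) showing how the updated root, /chat, and /feedback endpoints could be exercised once the app is running. The base URL and port, the sample question, and the feedback string are assumptions; only the request and response field names follow the Query and Feedback schemas in the diff above.

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port; adjust to your deployment

# Liveness check against the new root endpoint
print(requests.get(f"{BASE_URL}/").json())

# Ask a question through the RAG /chat endpoint
resp = requests.post(
    f"{BASE_URL}/chat",
    json={"question": "What is the delegation of power for minor works?"},  # sample question (assumption)
    timeout=90,
)
data = resp.json()
print(data["context_used"])
print(data["answer"])

# Record feedback on the returned answer via /feedback
requests.post(
    f"{BASE_URL}/feedback",
    json={"question": data["question"], "answer": data["answer"], "feedback": "helpful"},
)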