Kalpokoch committed on
Commit
a47545a
·
1 Parent(s): e0fa8c4
Files changed (4) hide show
  1. Dockerfile +16 -25
  2. app/app.py +10 -17
  3. app/policy_vector_db.py +8 -15
  4. processed_chunks.json +0 -0
Dockerfile CHANGED
@@ -1,45 +1,36 @@
1
- # FINAL DOCKERFILE
2
 
3
- # Use the standard Python 3.11 image for maximum compatibility
4
- FROM python:3.11
5
-
6
- # Install system dependencies needed for compilation
7
  RUN apt-get update && apt-get install -y \
8
- curl build-essential cmake \
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
11
  WORKDIR /app
12
 
13
  # Create writable directories
14
  RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
15
 
16
- # Set environment variables. CMAKE_ARGS is critical for the build process.
17
  ENV TRANSFORMERS_CACHE=/app/.cache \
18
  HF_HOME=/app/.cache \
19
- CHROMADB_DISABLE_TELEMETRY=true \
20
- CMAKE_ARGS="-DLLAMA_CUBLAS=OFF"
 
 
21
 
22
- # Copy and install Python requirements.
23
- # This step is slow (15-30 minutes) and that is normal.
24
  COPY requirements.txt .
25
  RUN pip install --no-cache-dir -r requirements.txt
26
 
27
- # Copy the application code (app/ and create_granular_chunks.py)
28
  COPY ./app ./app
29
- COPY ./create_granular_chunks.py .
30
- COPY ./combined_context.jsonl .
31
-
32
- # Generate the processed data file
33
- RUN python create_granular_chunks.py
34
-
35
- # Clean up source files
36
- RUN rm combined_context.jsonl create_granular_chunks.py
37
 
38
- # --- CORRECTED FILENAME in URL and output file name ---
39
- # Download your re-quantized, compatible GGUF model
40
- RUN curl -fL -o /app/phi1.5_dop_q4_k_m.gguf \
41
- https://huggingface.co/Kalpokoch/Phi1.5QuantizedFineTuned/resolve/main/phi1.5_dop_q4_k_m.gguf \
42
- && echo "✅ Model downloaded."
43
 
44
  # Expose the application port
45
  EXPOSE 7860
 
1
+ FROM python:3.11-slim
2
 
3
+ # Install required system dependencies
 
 
 
4
  RUN apt-get update && apt-get install -y \
5
+ git curl build-essential cmake \
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
+ # Set working directory
9
  WORKDIR /app
10
 
11
  # Create writable directories
12
  RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
13
 
14
+ # Set environment variables
15
  ENV TRANSFORMERS_CACHE=/app/.cache \
16
  HF_HOME=/app/.cache \
17
+ CHROMADB_DISABLE_TELEMETRY=true
18
+
19
+ # Pre-install the specific, known-working version of llama-cpp-python for TinyLlama
20
+ RUN pip install --no-cache-dir llama-cpp-python==0.2.61
21
 
22
+ # Install other dependencies from requirements.txt
 
23
  COPY requirements.txt .
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
26
+ # Copy the application code and data file
27
  COPY ./app ./app
28
+ COPY ./processed_chunks.json .
 
 
 
 
 
 
 
29
 
30
+ # Download your fine-tuned TinyLlama GGUF model
31
+ RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
32
+ https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
33
+ && echo "✅ TinyLlama model downloaded."
 
34
 
35
  # Expose the application port
36
  EXPOSE 7860
app/app.py CHANGED
@@ -26,35 +26,32 @@ async def root():
26
  # ✅ Vector DB and Data Configuration
27
  # -----------------------------
28
  DB_PERSIST_DIRECTORY = "/app/vector_database"
29
- # This file is generated by the create_granular_chunks.py script in the Dockerfile
30
  CHUNKS_FILE_PATH = "/app/processed_chunks.json"
31
 
32
  logger.info("[INFO] Initializing vector DB...")
33
  db = PolicyVectorDB(
34
  persist_directory=DB_PERSIST_DIRECTORY,
35
  top_k_default=5,
36
- relevance_threshold=0.35 # Start with a reasonable threshold for granular chunks
37
  )
38
 
39
- # This function now runs on startup to populate the DB if it's empty
40
  if not ensure_db_populated(db, CHUNKS_FILE_PATH):
41
- logger.warning("[WARNING] DB not populated. Chunks file may be missing or empty. RAG will not function correctly.")
42
  else:
43
  logger.info("[INFO] Vector DB is ready.")
44
 
45
  # -----------------------------
46
- # ✅ Load Your Re-Quantized GGUF Model
47
  # -----------------------------
48
- # Points to the compatible GGUF file downloaded in the Dockerfile
49
- MODEL_PATH = "/app/phi1.5_dop_q4_k_m.gguf"
50
-
51
  logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
52
 
53
  llm = Llama(
54
  model_path=MODEL_PATH,
55
- n_ctx=2048,
56
  n_threads=2,
57
- n_gpu_layers=0,
 
58
  verbose=False
59
  )
60
  logger.info("[INFO] Model loaded successfully.")
@@ -73,12 +70,12 @@ class Feedback(BaseModel):
73
  # -----------------------------
74
  # ✅ Endpoints
75
  # -----------------------------
76
- LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
77
  logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
78
 
79
  async def generate_llm_response(prompt: str):
80
  """Helper function to run synchronous LLM inference."""
81
- response = llm(prompt, max_tokens=384, stop=["Instruct:", "Output:", "###"], temperature=0.2, echo=False)
82
  answer = response["choices"][0]["text"].strip()
83
  if not answer:
84
  raise ValueError("Empty response from LLM")
@@ -107,11 +104,7 @@ async def chat(query: Query):
107
  context = filtered[0]["text"]
108
  logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
109
 
110
- # This prompt format matches how you fine-tuned Phi-1.5
111
- prompt = f"""Instruct: Use the following context to answer the question.
112
- Context: {context}
113
- Question: {question}
114
- Output:"""
115
 
116
  answer = "Sorry, I couldn't process your request right now. Please try again later."
117
  try:
 
26
  # ✅ Vector DB and Data Configuration
27
  # -----------------------------
28
  DB_PERSIST_DIRECTORY = "/app/vector_database"
 
29
  CHUNKS_FILE_PATH = "/app/processed_chunks.json"
30
 
31
  logger.info("[INFO] Initializing vector DB...")
32
  db = PolicyVectorDB(
33
  persist_directory=DB_PERSIST_DIRECTORY,
34
  top_k_default=5,
35
+ relevance_threshold=0.2
36
  )
37
 
 
38
  if not ensure_db_populated(db, CHUNKS_FILE_PATH):
39
+ logger.warning("[WARNING] DB not populated. RAG will not function correctly.")
40
  else:
41
  logger.info("[INFO] Vector DB is ready.")
42
 
43
  # -----------------------------
44
+ # ✅ Load TinyLlama GGUF Model
45
  # -----------------------------
46
+ MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 
 
47
  logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
48
 
49
  llm = Llama(
50
  model_path=MODEL_PATH,
51
+ n_ctx=2048, # Increased context window to prevent errors
52
  n_threads=2,
53
+ n_batch=8,
54
+ use_mlock=False,
55
  verbose=False
56
  )
57
  logger.info("[INFO] Model loaded successfully.")
 
70
  # -----------------------------
71
  # ✅ Endpoints
72
  # -----------------------------
73
+ LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
74
  logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
75
 
76
  async def generate_llm_response(prompt: str):
77
  """Helper function to run synchronous LLM inference."""
78
+ response = llm(prompt, max_tokens=384, stop=["###"], temperature=0.2, echo=False)
79
  answer = response["choices"][0]["text"].strip()
80
  if not answer:
81
  raise ValueError("Empty response from LLM")
 
104
  context = filtered[0]["text"]
105
  logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
106
 
107
+ prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
 
 
 
 
108
 
109
  answer = "Sorry, I couldn't process your request right now. Please try again later."
110
  try:
app/policy_vector_db.py CHANGED
@@ -14,8 +14,7 @@ class PolicyVectorDB:
14
  self.persist_directory = persist_directory
15
  self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
16
  self.collection_name = "neepco_dop_policies"
17
- # Using a faster, smaller model is recommended for better performance on CPU
18
- self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')
19
  self.collection = None
20
  self.top_k_default = top_k_default
21
  self.relevance_threshold = relevance_threshold
@@ -37,23 +36,18 @@ class PolicyVectorDB:
37
  logger.info("No chunks provided to add.")
38
  return
39
 
40
- existing_ids = set()
41
- try:
42
- # Check for existing IDs to avoid trying to re-insert them
43
- existing_ids = set(collection.get(ids=[str(c['id']) for c in chunks if c.get('id')])['ids'])
44
- except Exception:
45
- logger.warning("Could not efficiently retrieve existing IDs. Proceeding with add, ChromaDB will handle duplicates.")
46
- existing_ids = set()
47
-
48
-
49
- new_chunks = [chunk for chunk in chunks if chunk.get('id') and str(chunk.get('id')) not in existing_ids]
50
 
51
  if not new_chunks:
52
  logger.info("No new chunks to add to the database.")
53
  return
54
 
55
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
56
- batch_size = 64 # Smaller batch size can be more stable for large embeddings
57
  for i in range(0, len(new_chunks), batch_size):
58
  batch = new_chunks[i:i + batch_size]
59
  texts = [chunk['text'] for chunk in batch]
@@ -62,7 +56,7 @@ class PolicyVectorDB:
62
  metadatas = []
63
  for chunk in batch:
64
  meta = chunk.get('metadata')
65
- if not meta: # Handles cases where metadata is missing or empty
66
  meta = {"description": "General information chunk."}
67
  metadatas.append(self._flatten_metadata(meta))
68
 
@@ -100,7 +94,6 @@ def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
100
  logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
101
  return False
102
 
103
- # This is the correct method for a standard .json file
104
  with open(chunks_file_path, 'r', encoding='utf-8') as f:
105
  chunks_to_add = json.load(f)
106
 
 
14
  self.persist_directory = persist_directory
15
  self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
16
  self.collection_name = "neepco_dop_policies"
17
+ self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
 
18
  self.collection = None
19
  self.top_k_default = top_k_default
20
  self.relevance_threshold = relevance_threshold
 
36
  logger.info("No chunks provided to add.")
37
  return
38
 
39
+ # Ensure all IDs are strings before checking for existence
40
+ new_chunks = [chunk for chunk in chunks if chunk.get('id')]
41
+ existing_ids = set(collection.get(ids=[str(c['id']) for c in new_chunks])['ids'])
42
+
43
+ new_chunks = [chunk for chunk in new_chunks if str(chunk.get('id')) not in existing_ids]
 
 
 
 
 
44
 
45
  if not new_chunks:
46
  logger.info("No new chunks to add to the database.")
47
  return
48
 
49
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
50
+ batch_size = 64
51
  for i in range(0, len(new_chunks), batch_size):
52
  batch = new_chunks[i:i + batch_size]
53
  texts = [chunk['text'] for chunk in batch]
 
56
  metadatas = []
57
  for chunk in batch:
58
  meta = chunk.get('metadata')
59
+ if not meta:
60
  meta = {"description": "General information chunk."}
61
  metadatas.append(self._flatten_metadata(meta))
62
 
 
94
  logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
95
  return False
96
 
 
97
  with open(chunks_file_path, 'r', encoding='utf-8') as f:
98
  chunks_to_add = json.load(f)
99
 
processed_chunks.json ADDED
The diff for this file is too large to render. See raw diff