Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 4, 2025

Commit

a47545a

1 Parent(s): e0fa8c4

updates

Browse files

Files changed (4) hide show

Dockerfile +16 -25
app/app.py +10 -17
app/policy_vector_db.py +8 -15
processed_chunks.json +0 -0

Dockerfile CHANGED Viewed

@@ -1,45 +1,36 @@
-# FINAL DOCKERFILE
-# Use the standard Python 3.11 image for maximum compatibility
-FROM python:3.11
-# Install system dependencies needed for compilation
 RUN apt-get update && apt-get install -y \
-    curl build-essential cmake \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 # Create writable directories
 RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
-# Set environment variables. CMAKE_ARGS is critical for the build process.
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache \
-    CHROMADB_DISABLE_TELEMETRY=true \
-    CMAKE_ARGS="-DLLAMA_CUBLAS=OFF"
-# Copy and install Python requirements.
-# This step is slow (15-30 minutes) and that is normal.
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-# Copy the application code (app/ and create_granular_chunks.py)
 COPY ./app ./app
-COPY ./create_granular_chunks.py .
-COPY ./combined_context.jsonl .
-# Generate the processed data file
-RUN python create_granular_chunks.py
-# Clean up source files
-RUN rm combined_context.jsonl create_granular_chunks.py
-# --- CORRECTED FILENAME in URL and output file name ---
-# Download your re-quantized, compatible GGUF model
-RUN curl -fL -o /app/phi1.5_dop_q4_k_m.gguf \
-    https://huggingface.co/Kalpokoch/Phi1.5QuantizedFineTuned/resolve/main/phi1.5_dop_q4_k_m.gguf \
-    && echo "✅ Model downloaded."
 # Expose the application port
 EXPOSE 7860

+FROM python:3.11-slim
+# Install required system dependencies
 RUN apt-get update && apt-get install -y \
+    git curl build-essential cmake \
     && rm -rf /var/lib/apt/lists/*
+# Set working directory
 WORKDIR /app
 # Create writable directories
 RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
+# Set environment variables
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache \
+    CHROMADB_DISABLE_TELEMETRY=true
+# Pre-install the specific, known-working version of llama-cpp-python for TinyLlama
+RUN pip install --no-cache-dir llama-cpp-python==0.2.61
+# Install other dependencies from requirements.txt
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code and data file
 COPY ./app ./app
+COPY ./processed_chunks.json .
+# Download your fine-tuned TinyLlama GGUF model
+RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
+    https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
+    && echo "✅ TinyLlama model downloaded."
 # Expose the application port
 EXPOSE 7860

app/app.py CHANGED Viewed

@@ -26,35 +26,32 @@ async def root():
 # ✅ Vector DB and Data Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = "/app/vector_database"
-# This file is generated by the create_granular_chunks.py script in the Dockerfile
 CHUNKS_FILE_PATH = "/app/processed_chunks.json"
 logger.info("[INFO] Initializing vector DB...")
 db = PolicyVectorDB(
     persist_directory=DB_PERSIST_DIRECTORY,
     top_k_default=5,
-    relevance_threshold=0.35 # Start with a reasonable threshold for granular chunks
 )
-# This function now runs on startup to populate the DB if it's empty
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-    logger.warning("[WARNING] DB not populated. Chunks file may be missing or empty. RAG will not function correctly.")
 else:
     logger.info("[INFO] Vector DB is ready.")
 # -----------------------------
-# ✅ Load Your Re-Quantized GGUF Model
 # -----------------------------
-# Points to the compatible GGUF file downloaded in the Dockerfile
-MODEL_PATH = "/app/phi1.5_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=2048,
     n_threads=2,
-    n_gpu_layers=0,
     verbose=False
 )
 logger.info("[INFO] Model loaded successfully.")
@@ -73,12 +70,12 @@ class Feedback(BaseModel):
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
 logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
 async def generate_llm_response(prompt: str):
     """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=384, stop=["Instruct:", "Output:", "###"], temperature=0.2, echo=False)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
@@ -107,11 +104,7 @@ async def chat(query: Query):
     context = filtered[0]["text"]
     logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
-    # This prompt format matches how you fine-tuned Phi-1.5
-    prompt = f"""Instruct: Use the following context to answer the question.
-Context: {context}
-Question: {question}
-Output:"""
     answer = "Sorry, I couldn't process your request right now. Please try again later."
     try:

 # ✅ Vector DB and Data Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = "/app/vector_database"
 CHUNKS_FILE_PATH = "/app/processed_chunks.json"
 logger.info("[INFO] Initializing vector DB...")
 db = PolicyVectorDB(
     persist_directory=DB_PERSIST_DIRECTORY,
     top_k_default=5,
+    relevance_threshold=0.2
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+    logger.warning("[WARNING] DB not populated. RAG will not function correctly.")
 else:
     logger.info("[INFO] Vector DB is ready.")
 # -----------------------------
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
+MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 llm = Llama(
     model_path=MODEL_PATH,
+    n_ctx=2048, # Increased context window to prevent errors
     n_threads=2,
+    n_batch=8,
+    use_mlock=False,
     verbose=False
 )
 logger.info("[INFO] Model loaded successfully.")
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
 logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
 async def generate_llm_response(prompt: str):
     """Helper function to run synchronous LLM inference."""
+    response = llm(prompt, max_tokens=384, stop=["###"], temperature=0.2, echo=False)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
     context = filtered[0]["text"]
     logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
     answer = "Sorry, I couldn't process your request right now. Please try again later."
     try:

app/policy_vector_db.py CHANGED Viewed

@@ -14,8 +14,7 @@ class PolicyVectorDB:
         self.persist_directory = persist_directory
         self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
         self.collection_name = "neepco_dop_policies"
-        # Using a faster, smaller model is recommended for better performance on CPU
-        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')
         self.collection = None
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold
@@ -37,23 +36,18 @@ class PolicyVectorDB:
             logger.info("No chunks provided to add.")
             return
-        existing_ids = set()
-        try:
-            # Check for existing IDs to avoid trying to re-insert them
-            existing_ids = set(collection.get(ids=[str(c['id']) for c in chunks if c.get('id')])['ids'])
-        except Exception:
-            logger.warning("Could not efficiently retrieve existing IDs. Proceeding with add, ChromaDB will handle duplicates.")
-            existing_ids = set()
-        new_chunks = [chunk for chunk in chunks if chunk.get('id') and str(chunk.get('id')) not in existing_ids]
         if not new_chunks:
             logger.info("No new chunks to add to the database.")
             return
         logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
-        batch_size = 64 # Smaller batch size can be more stable for large embeddings
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
             texts = [chunk['text'] for chunk in batch]
@@ -62,7 +56,7 @@ class PolicyVectorDB:
             metadatas = []
             for chunk in batch:
                 meta = chunk.get('metadata')
-                if not meta:  # Handles cases where metadata is missing or empty
                     meta = {"description": "General information chunk."}
                 metadatas.append(self._flatten_metadata(meta))
@@ -100,7 +94,6 @@ def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
                 logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
                 return False
-            # This is the correct method for a standard .json file
             with open(chunks_file_path, 'r', encoding='utf-8') as f:
                 chunks_to_add = json.load(f)

         self.persist_directory = persist_directory
         self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
         self.collection_name = "neepco_dop_policies"
+        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
         self.collection = None
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold
             logger.info("No chunks provided to add.")
             return
+        # Ensure all IDs are strings before checking for existence
+        new_chunks = [chunk for chunk in chunks if chunk.get('id')]
+        existing_ids = set(collection.get(ids=[str(c['id']) for c in new_chunks])['ids'])
+        new_chunks = [chunk for chunk in new_chunks if str(chunk.get('id')) not in existing_ids]
         if not new_chunks:
             logger.info("No new chunks to add to the database.")
             return
         logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
+        batch_size = 64
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
             texts = [chunk['text'] for chunk in batch]
             metadatas = []
             for chunk in batch:
                 meta = chunk.get('metadata')
+                if not meta:
                     meta = {"description": "General information chunk."}
                 metadatas.append(self._flatten_metadata(meta))
                 logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
                 return False
             with open(chunks_file_path, 'r', encoding='utf-8') as f:
                 chunks_to_add = json.load(f)

processed_chunks.json ADDED Viewed

The diff for this file is too large to render. See raw diff