Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed on Aug 1, 2025

Commit

fb236cf

1 Parent(s): 7502aed

chatbot updated

Browse files

Files changed (4) hide show

Dockerfile +2 -0
app.py +30 -29
backend/services/codingo_chatbot.py +319 -0
requirements.txt +5 -2

Dockerfile CHANGED Viewed

@@ -5,6 +5,8 @@ FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
     python3 python3-pip ffmpeg git libsndfile1 \
     && rm -rf /var/lib/apt/lists/*
 # Set up Python environment

 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
     python3 python3-pip ffmpeg git libsndfile1 \
+    # Development tools required to compile native extensions such as llama-cpp-python
+    build-essential cmake libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 # Set up Python environment

app.py CHANGED Viewed

@@ -32,27 +32,34 @@ import re
 import json
 # -----------------------------------------------------------------------------
-# Chatbot setup
 #
-# The chatbot uses a local vector database (Chroma) to search the
-# ``chatbot/chatbot.txt`` knowledge base.  Retrieved passages are fed to
-# a lightweight conversational model from Hugging Face.  To avoid the
-# expensive model and database initialisation on every request, embeddings
-# and the vector collection are loaded lazily the first time a chat query
-# is processed.  Subsequent requests reuse the same global objects.  All
-# chatbot logic resides in ``chatbot/chatbot.py``.
-# Paths for the chatbot knowledge base and persistent vector store.  We
-# compute these relative to the current file so that the app can be deployed
-# anywhere without needing to change configuration.  The ``chroma_db``
-# directory will be created automatically by the Chroma client if it does not
-# exist.
-# The internal chatbot logic has been extracted to ``chatbot/chatbot.py``.  See
-# that module for details.  We import the ``get_chatbot_response`` function
-# here so that the Flask route can delegate queries directly to it.  This
-# prevents ``app.py`` from depending on the heavy ML libraries and keeps
-# the application entry point lean.
-from chatbot.chatbot import get_chatbot_response
 # Initialize Flask app
 app = Flask(
@@ -348,17 +355,11 @@ if __name__ == '__main__':
     with app.app_context():
         db.create_all()
-        # Pre-initialize the chatbot on startup for faster first response.  We
-        # deliberately trigger a dummy query here to force loading of the
-        # sentence encoder, vector store and conversational model.  Any
-        # exceptions during warm‑up are logged but do not stop the app from
-        # starting.
         print("Initializing chatbot...")
         try:
-            # Import inside the block to ensure the module has been
-            # properly loaded with the current environment settings.
-            from chatbot.chatbot import get_chatbot_response
-            _ = get_chatbot_response("Hello!")
             print("Chatbot initialized successfully")
         except Exception as e:
             print(f"Chatbot initialization warning: {e}")

 import json
 # -----------------------------------------------------------------------------
+# Chatbot integration
 #
+# We delegate all chatbot logic to the ``codingo_chatbot`` module within
+# ``backend/services``.  This module handles loading the knowledge base,
+# building embeddings, initialising the TinyLlama model and generating
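+# responses.  Note that this module-level import pulls in the heavy
+# libraries (``sentence_transformers``, ``chromadb``, ``llama_cpp``) as soon
+# as ``app.py`` is loaded; only the embedder, vector store and LLM objects
+# themselves are initialised lazily, on the first chatbot query.  See
+# ``backend/services/codingo_chatbot.py`` for implementation details.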
+from backend.services.codingo_chatbot import get_response as _codingo_get_response
+def get_chatbot_response(query: str) -> str:
+    """Proxy to the codingo_chatbot implementation.
+    This function exists to preserve the original public API of
+    ``app.get_chatbot_response`` while redirecting calls to the new
+    implementation.  It catches any exceptions and returns a user
+    friendly message, ensuring the Flask route never raises.
+    """
+    try:
+        return _codingo_get_response(query)
+    except Exception as exc:
+        print(f"Chatbot error: {exc}", file=sys.stderr)
+        return (
+            "I'm having trouble processing your request. Please try again or ask "
+            "about Codingo's features, job matching, or how to use the platform."
+        )
 # Initialize Flask app
 app = Flask(
     with app.app_context():
         db.create_all()
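+        # Pre-initialize chatbot on startup for faster first response.
+        # NOTE(review): ``init_chatbot`` and ``init_hf_model`` are neither
+        # defined nor imported in the hunks shown here -- confirm they exist;
+        # otherwise the resulting NameError is caught below and only printed
+        # as a warning, so no warm-up actually happens.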
         print("Initializing chatbot...")
         try:
+            init_chatbot()
+            init_hf_model()
             print("Chatbot initialized successfully")
         except Exception as e:
             print(f"Chatbot initialization warning: {e}")

backend/services/codingo_chatbot.py ADDED Viewed

	@@ -0,0 +1,319 @@

+"""
+codingo_chatbot.py
+===================
+This module encapsulates the logic for Codingo's website chatbot.  It
+loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
+database using Chroma and SentenceTransformers, and uses a local LLM
+powered by ``llama‑cpp‑python`` to generate answers constrained to the
+retrieved context.  The code is written to initialise all heavy
+resources lazily on first use and to cache them for subsequent
+requests.  This prevents repeated model downloads and avoids
+recomputing embeddings for every chat query.
+The underlying LLM is the TinyLlama 1.1B chat model distributed via
+Hugging Face in GGUF format.  When the model file is not present
+locally it is downloaded automatically using ``huggingface_hub``.
+Depending on the environment the model will run on GPU if CUDA is
+available or fall back to CPU otherwise.  See the ``init_llm``
+function for details.
+Note: This module deliberately contains no references to OpenAI.  It
+relies solely on open‑source libraries available on PyPI (such as
+``llama‑cpp‑python`` and ``chromadb``) so that it can be used on
+Hugging Face Spaces without requiring proprietary API keys.
+"""
+from __future__ import annotations
+import os
+import threading
+from typing import List
+import numpy as np
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+import chromadb
+from chromadb.config import Settings
+from huggingface_hub import hf_hub_download
+try:
+    from llama_cpp import Llama  # type: ignore
+except Exception as exc:  # pragma: no cover - import may fail until dependency installed
+    # Provide a helpful error if llama_cpp isn't installed.
+    raise ImportError(
+        "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
+        "to your requirements.txt"
+    ) from exc
+# ---------------------------------------------------------------------------
+# Configuration
+#
+# Compute the absolute path to the chatbot knowledge base.  We derive this
+# relative to this file so that the module works regardless of the working
+# directory.  The project structure places ``chatbot.txt`` at
+# ``Codingo12/chatbot/chatbot.txt``.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
+# Directory where Chroma will persist its database.  This location is
+# writable on both local machines and Hugging Face Spaces.  It is
+# intentionally distinct from the web app instance path to avoid
+# permission issues.
+CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")
+# Settings for the TinyLlama model.  These can be overridden via
+# environment variables if desired (for example to switch to a
+# different quantisation or to test with a smaller model).  See
+# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
+# available filenames.
+LLAMA_REPO = os.getenv(
+    "LLAMA_REPO",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+)
+LLAMA_FILE = os.getenv(
+    "LLAMA_FILE",
+    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+)
+# Local directory where the GGUF model file will be stored.  Using
+# ``/tmp`` avoids writing into the read‑only repository filesystem on
+# Hugging Face Spaces.  The directory will be created as needed.
+LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")
+# Generation parameters.  These values mirror those used in the
+# provided Jupyter notebook.  They can be tweaked via environment
+# variables if necessary to trade off quality against speed.
+MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
+TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
+TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
+REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))
+# Thread lock to guard lazy initialisation in multi‑threaded Flask
+# environments.  Without this lock multiple concurrent requests may
+# attempt to download the model or populate the database at the same
+# time, leading to redundant work or race conditions.
+_init_lock = threading.Lock()
+# Global singletons for embedder, vector collection and LLM.  These
+# variables are populated on first use and reused thereafter.
+_embedder: SentenceTransformer | None = None
+_collection: chromadb.Collection | None = None
+_llm: Llama | None = None
+def _load_chatbot_text() -> str:
+    """Read the chatbot knowledge base from disk.
+    If the file is missing, a small default description of Codingo is
+    returned.  This ensures the chatbot still provides a sensible
+    answer rather than crashing.
+    """
+    try:
+        with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
+            return f.read()
+    except FileNotFoundError:
+        # Fallback content if the knowledge base file is missing
+        return (
+            "Codingo is an AI‑powered recruitment platform designed to "
+            "streamline job applications, candidate screening and hiring. "
+            "We make hiring smarter, faster and fairer through automation "
+            "and intelligent recommendations."
+        )
+def init_embedder_and_db() -> None:
+    """Initialise the SentenceTransformer embedder and Chroma vector DB.
+    This function is idempotent: if the embedder and collection are
+    already initialised it returns immediately.  Otherwise it reads
+    ``chatbot.txt``, splits it into overlapping chunks and computes
+    their embeddings.  The chunks are added to the persistent Chroma
+    collection only when that collection is still empty; an already
+    populated collection is reused as is.  The resulting
+    ``SentenceTransformer`` and collection objects are saved in global
+    variables for later reuse.
+    """
+    global _embedder, _collection
+    if _embedder is not None and _collection is not None:
+        return
+    with _init_lock:
+        if _embedder is not None and _collection is not None:
+            return
+        # Ensure persistence directory exists
+        os.makedirs(CHROMA_DB_DIR, exist_ok=True)
+        # Read knowledge base
+        text = _load_chatbot_text()
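+        # Split into chunks on paragraph boundaries: "\n\n" is the only
+        # separator given, so the splitter has nothing finer to fall back
+        # on and a paragraph longer than ``chunk_size`` presumably stays
+        # in one piece (verify against the splitter).  Overlap helps the
+        # model maintain context across neighbouring chunks.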
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=300,
+            chunk_overlap=100,
+            separators=["\n\n"],
+        )
+        docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]
+        # Initialise embedder (MiniLM).  No device is passed here, so
+        # SentenceTransformer falls back to its own default device choice.
+        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+        embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)
+        # Initialise Chroma client
+        client = chromadb.Client(
+            Settings(
+                persist_directory=CHROMA_DB_DIR,
+                anonymized_telemetry=False,
+                is_persistent=True,
+            )
+        )
+        # Create or get collection.  This returns an existing collection if
+        # already present on disk.
+        collection = client.get_or_create_collection("codingo_chatbot")
+        # Populate the collection only if it is empty.  Any exception
+        # raised by ``collection.get(limit=1)`` is treated as an empty DB.
+        # Note: the collection is created without ``hnsw:space`` metadata,
+        # so queries use Chroma's default distance metric (squared L2)
+        # rather than cosine similarity.
+        need_populate = False
+        try:
+            existing = collection.get(limit=1)
+            if not existing or not existing.get("documents"):
+                need_populate = True
+        except Exception:
+            need_populate = True
+        if need_populate:
+            ids = [f"doc_{i}" for i in range(len(docs))]
+            collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
+        _embedder = embedder
+        _collection = collection
+def init_llm() -> None:
+    """Initialise the llama‑cpp model for response generation.
+    This function lazily downloads the GGUF model from Hugging Face if
+    necessary and instantiates a ``llama_cpp.Llama`` object.  The
+    resulting instance is stored in the global ``_llm`` variable.  To
+    control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
+    variable or override ``LLAMA_N_GPU_LAYERS``.  By default we use one
+    GPU layer when CUDA is available, otherwise the model runs on CPU.
+    """
+    global _llm
+    if _llm is not None:
+        return
+    with _init_lock:
+        if _llm is not None:
+            return
+        # Ensure the model directory exists
+        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
+        # Download model if not already present
+        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
+        if not os.path.exists(local_path):
+            # The file will be downloaded to LLAMA_LOCAL_DIR.  Use
+            # ``local_dir_use_symlinks=False`` to avoid creating
+            # symlinks that may break on certain filesystems.
+            local_path = hf_hub_download(
+                repo_id=LLAMA_REPO,
+                filename=LLAMA_FILE,
+                local_dir=LLAMA_LOCAL_DIR,
+                local_dir_use_symlinks=False,
+            )
+        # Determine GPU usage.  We default to one GPU layer if CUDA
+        # appears available.  Users can override via LLAMA_N_GPU_LAYERS.
+        try:
+            import torch  # type: ignore
+            use_cuda = torch.cuda.is_available()
+        except Exception:
+            use_cuda = False
+        n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
+        if n_gpu_layers_env:
+            try:
+                n_gpu_layers = int(n_gpu_layers_env)
+            except ValueError:
+                n_gpu_layers = 0
+        else:
+            n_gpu_layers = 1 if use_cuda else 0
+        # Construct the Llama instance.  The context window is set
+        # generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
+        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
+        # Use half the available CPU cores for inference threads to
+        # balance responsiveness and resource use.
+        try:
+            n_threads = max(1, os.cpu_count() // 2)
+        except Exception:
+            n_threads = 2
+        _llm = Llama(
+            model_path=local_path,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_gpu_layers=n_gpu_layers,
+        )
+def _build_prompt(query: str, context: str) -> str:
+    """Construct the full prompt for the TinyLlama chat model.
+    The prompt format follows the conventions used by the model as
+    illustrated in the provided notebook.  We include a system message
+    instructing the model to answer only using the given context and to
+    politely decline if the information is unavailable.
+    """
+    system_prompt = (
+        "You are the official chatbot of Codingo. "
+        "Answer ONLY by using the CONTEXT. "
+        "If the information is not available for you, say it politely."
+    )
+    prompt = (
+        f"<|system|>\n{system_prompt}</s>\n"
+        f"<|user|>\n{query}\n\nCONTEXTE:\n{context}</s>\n"
+        f"<|assistant|>\n"
+    )
+    return prompt
+def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
+    """Return a chatbot response for the given query.
+    This function performs the following steps:
+    1. Ensures the embedder, vector database and LLM are initialised.
+    2. Embeds the user's query and retrieves the top ``k`` most
+       similar documents from the Chroma collection.
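+    3. Keeps only documents whose distance is strictly below
+       ``score_threshold`` (larger distances indicate less similarity).
+       The distance is whatever metric the Chroma collection uses; the
+       collection is created without an explicit ``hnsw:space``, so this
+       is Chroma's default metric rather than cosine distance.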
+    4. Builds a prompt containing the user query and the concatenated
+       relevant context.
+    5. Feeds the prompt to the TinyLlama model and returns its
+       response, trimming trailing whitespace.
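+    If no relevant context is found, a fallback message is returned.
+    An empty or whitespace-only ``query`` returns a short prompt asking
+    for a question, before any model or database is initialised.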
+    """
+    if not query or not query.strip():
+        return "Please type a question about the Codingo platform."
+    init_embedder_and_db()
+    init_llm()
+    assert _embedder is not None and _collection is not None and _llm is not None
+    # Embed query and search collection
+    query_vector = _embedder.encode([query])[0]
+    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
+    docs = results.get("documents", [[]])[0] if results else []
+    distances = results.get("distances", [[]])[0] if results else []
+    # Filter by score
+    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
+    if not relevant:
+        return "Sorry, I don't have enough information to answer that question."
+    context = "\n\n".join(relevant)
+    prompt = _build_prompt(query, context)
+    # Generate completion
+    output = _llm(
+        prompt,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        repeat_penalty=REPEAT_PENALTY,
+        stop=["</s>"]
+    )
+    text = output["choices"][0]["text"].strip()
+    return text or "I'm here to answer your questions about Codingo. What would you like to know?"

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 flask
 flask_login
 flask_sqlalchemy
@@ -55,5 +56,7 @@ pydub>=0.25.1
 requests>=2.31.0
 # Additional dependencies for improved chatbot functionality
-# Note: We're using DialoGPT which requires transformers (already included above)
-# No OpenAI dependency needed - using Hugging Face models instead

 flask
 flask_login
 flask_sqlalchemy
 requests>=2.31.0
 # Additional dependencies for improved chatbot functionality
+# Note: The chatbot now uses a local Llama model via ``llama-cpp-python``.
+# We include the dependency here so that it is installed on Hugging Face
+# Spaces.  The version is pinned for reproducibility and compatibility.
+llama-cpp-python==0.2.27