Spaces:

tharunchndrn
/

Syslink_Chatbot

Build error

App Files Files Community

tharunchndrn committed 23 days ago

Commit

d2509c7

verified ·

1 Parent(s): e33803e

Upload 8 files

Browse files

Files changed (8) hide show

backend_app/config.py +34 -0
backend_app/email_service.py +65 -0
backend_app/fetcher.py +80 -0
backend_app/flows.py +222 -0
backend_app/ingest.py +112 -0
backend_app/rag_hf.py +136 -0
backend_app/suggestions.py +79 -0
backend_app/web_search.py +20 -0

backend_app/config.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+from dotenv import load_dotenv
+load_dotenv()
+# Storage: every on-disk artifact is placed under DATA_DIR
+# (overridable through the DATA_DIR environment variable, default "data").
+DATA_DIR = os.getenv("DATA_DIR", "data")
+# FAISS index file written by ingest.py and read by rag_hf.py
+FAISS_INDEX_PATH = os.path.join(DATA_DIR, "faiss.index")
+# Pickled list of chunk dicts that accompanies the FAISS index
+DOCSTORE_PATH = os.path.join(DATA_DIR, "docs.pkl")
+# JSON-lines cache of fetched pages (one JSON object per line, see fetcher.py)
+RAW_CACHE_PATH = os.path.join(DATA_DIR, "raw_cache.jsonl")
+# JSON file listing the URLs to ingest (see ingest.load_urls)
+URLS_PATH = os.path.join(DATA_DIR, "urls.json")
+# Retrieval + Web fallback tuning
+# NOTE: float()/int() raise ValueError at import time if the env value is malformed.
+# MIN_TOP_SCORE: rag_hf.py falls back to web search when the best local score is below this.
+MIN_TOP_SCORE = float(os.getenv("MIN_TOP_SCORE", "0.30"))
+# WEB_MAX_RESULTS: cap on the number of web results fetched for the fallback.
+WEB_MAX_RESULTS = int(os.getenv("WEB_MAX_RESULTS", "3"))
+# Embeddings (free local) - model name handed to SentenceTransformer
+EMBED_MODEL_NAME = os.getenv(
+    "EMBED_MODEL_NAME",
+    "sentence-transformers/all-MiniLM-L6-v2"
+)
+# LLM Provider (free local via Ollama)
+# NOTE(review): none of the modules in this commit read these three settings;
+# rag_hf.py talks to the Hugging Face InferenceClient instead - confirm they are still needed.
+LLM_PROVIDER = os.getenv("LLM_PROVIDER", "ollama").lower()
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi3")
+# Chatbot identity (UI + greeting)
+# presumably consumed by the API/UI layer, which is not part of this commit
+BOT_NAME = os.getenv("BOT_NAME", "SysLink Assistant")
+BOT_WELCOME = os.getenv(
+    "BOT_WELCOME",
+    "Welcome to SysLink Food System 👋 How can I help you today?"
+)
+BOT_LOGO_URL = os.getenv("BOT_LOGO_URL", "/assets/bot-logo.png")

backend_app/email_service.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import os
+import json
+import time
+import smtplib
+from email.message import EmailMessage
+from typing import Dict
+from .config import DATA_DIR
+# JSON-lines log that receives a copy of every contact submission
+CONTACT_LOG_PATH = os.path.join(DATA_DIR, "contact_messages.jsonl")
+def _save_locally(user_email: str, user_message: str) -> None:
+    os.makedirs(DATA_DIR, exist_ok=True)
+    entry = {
+        "email": user_email,
+        "message": user_message,
+        "created_at": int(time.time())
+    }
+    with open(CONTACT_LOG_PATH, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+def send_contact_email(user_email: str, user_message: str) -> Dict:
+    """
+    Free method: Use SMTP with your email account.
+    If SMTP isn't configured, we store messages locally and return ok=False.
+    """
+    # Always store a copy locally (good for backup)
+    _save_locally(user_email, user_message)
+    # SMTP settings from environment
+    smtp_host = os.getenv("SMTP_HOST", "")
+    smtp_port = int(os.getenv("SMTP_PORT", "0") or "0")
+    smtp_user = os.getenv("SMTP_USER", "")
+    smtp_pass = os.getenv("SMTP_PASS", "")
+    to_email = os.getenv("CONTACT_TO_EMAIL", "")
+    # If not configured, don't fail the whole chatbot
+    if not (smtp_host and smtp_port and smtp_user and smtp_pass and to_email):
+        return {"ok": False, "error": "SMTP not configured"}
+    try:
+        msg = EmailMessage()
+        msg["Subject"] = "New Contact Message - SysLink Food System"
+        msg["From"] = smtp_user
+        msg["To"] = to_email
+        msg.set_content(
+            f"User Email: {user_email}\n\n"
+            f"Message:\n{user_message}\n"
+        )
+        # TLS connection
+        with smtplib.SMTP(smtp_host, smtp_port, timeout=30) as server:
+            server.starttls()
+            server.login(smtp_user, smtp_pass)
+            server.send_message(msg)
+        return {"ok": True}
+    except Exception as e:
+        return {"ok": False, "error": str(e)}

backend_app/fetcher.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import os
+import json
+import re
+import time
+from typing import Dict
+import requests
+from bs4 import BeautifulSoup
+from .config import RAW_CACHE_PATH
+# User-Agent header sent with every page request made by fetch_page_text
+USER_AGENT = "SysLinkBot/1.0 (RAG educational project)"
+def _clean_text(text: str) -> str:
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+def _load_cache() -> dict:
+    if not os.path.exists(RAW_CACHE_PATH):
+        return {}
+    cache = {}
+    with open(RAW_CACHE_PATH, "r", encoding="utf-8") as f:
+        for line in f:
+            try:
+                obj = json.loads(line)
+                cache[obj["url"]] = obj
+            except:
+                continue
+    return cache
+def _append_cache(entry: Dict):
+    os.makedirs(os.path.dirname(RAW_CACHE_PATH), exist_ok=True)
+    with open(RAW_CACHE_PATH, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+def fetch_page_text(url: str, use_cache: bool = True) -> Dict:
+    """
+    Fetch webpage content and return cleaned main text.
+    Caches pages to reduce repeated web delays.
+    """
+    cache = _load_cache()
+    if use_cache and url in cache:
+        return cache[url]
+    headers = {"User-Agent": USER_AGENT}
+    resp = requests.get(url, headers=headers, timeout=30)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "lxml")
+    # Remove noisy tags
+    for tag in soup(["script", "style", "noscript", "svg", "footer", "nav"]):
+        tag.decompose()
+    main = soup.find("main") or soup.body
+    if not main:
+        raise ValueError("No readable content found")
+    text = _clean_text(main.get_text(separator=" "))
+    title = soup.title.get_text(strip=True) if soup.title else url
+    result = {
+        "url": url,
+        "title": title,
+        "text": text,
+        "fetched_at": int(time.time())
+    }
+    _append_cache(result)
+    return result

backend_app/flows.py ADDED Viewed

	@@ -0,0 +1,222 @@

+from __future__ import annotations
+from typing import Dict, List, Optional
+import re
+from .email_service import send_contact_email
+from .suggestions import (
+    default_suggestions,
+    suggestions_for_intent,
+    suggestions_from_text,
+)
+class FlowManager:
+    """
+    Manages lightweight session state for:
+    - Contact flow (collect message + email)
+    - Language flow (choose language/region)
+    """
+    def __init__(self):
+        # session_id -> state
+        self.sessions: Dict[str, Dict] = {}
+    # ---------- Suggestions ----------
+    def default_suggestions(self) -> List[str]:
+        return default_suggestions()
+    # ---------- Session helpers ----------
+    def _get(self, session_id: str) -> Dict:
+        if session_id not in self.sessions:
+            self.sessions[session_id] = {
+                "mode": "normal",          # normal | contact_wait_msg | contact_wait_email | lang_wait
+                "contact_msg": None,
+                "lang": None,              # e.g. "Sinhala", "Tamil", "English"
+            }
+        return self.sessions[session_id]
+    # ---------- Intents ----------
+    def _detect_intents(self, text: str) -> List[str]:
+        t = text.lower()
+        intents = []
+        if any(k in t for k in ["contact", "support", "help desk", "reach", "email us", "contact us"]):
+            intents.append("contact")
+        if any(k in t for k in ["language", "sinhala", "tamil", "english", "change language", "translate"]):
+            intents.append("language")
+        if any(k in t for k in ["service", "services", "what do you do", "features", "what is syslink", "about"]):
+            intents.append("services")
+        return intents or ["rag"]
+    # ---------- Main entry ----------
+    def handle_message(self, session_id: str, user_message: str) -> Dict:
+        """
+        Returns dict:
+        {
+          "action": "flow" | "rag",
+          "answer": "...",
+          "suggestions": [...]
+          "lang": optional preferred language for RAG
+        }
+        """
+        state = self._get(session_id)
+        msg = user_message.strip()
+        # 1) If we're in the middle of a flow, handle it first
+        if state["mode"].startswith("contact_"):
+            return self._handle_contact_flow(state, msg)
+        if state["mode"] == "lang_wait":
+            return self._handle_language_flow(state, msg)
+        # 2) Not in a flow: detect intent(s)
+        intents = self._detect_intents(msg)
+        # If user typed custom prompt, we replace suggestions with new related ones
+        dynamic_suggestions = suggestions_from_text(msg)
+        # 3) Multi-intent handling (2+ in one message)
+        # We'll handle flow intents first, then allow RAG for remaining.
+        if "contact" in intents and "language" in intents:
+            # Ask language first (quick), then contact
+            state["mode"] = "lang_wait"
+            return {
+                "action": "flow",
+                "answer": "Sure. Which language would you like (Sinhala / Tamil / English)?",
+                "suggestions": suggestions_for_intent("language"),
+                "lang": state.get("lang"),
+            }
+        if "language" in intents:
+            state["mode"] = "lang_wait"
+            return {
+                "action": "flow",
+                "answer": "Sure. Which language would you like (Sinhala / Tamil / English)?",
+                "suggestions": suggestions_for_intent("language"),
+                "lang": state.get("lang"),
+            }
+        if "contact" in intents:
+            state["mode"] = "contact_wait_msg"
+            return {
+                "action": "flow",
+                "answer": "Sure — please type your message for our team.",
+                "suggestions": suggestions_for_intent("contact"),
+                "lang": state.get("lang"),
+            }
+        if "services" in intents:
+            # Let RAG answer, but provide service-related suggestions
+            return {
+                "action": "rag",
+                "answer": "",
+                "suggestions": suggestions_for_intent("services"),
+                "lang": state.get("lang"),
+            }
+        # 4) Default: RAG
+        return {
+            "action": "rag",
+            "answer": "",
+            "suggestions": dynamic_suggestions,
+            "lang": state.get("lang"),
+        }
+    # ---------- Contact flow ----------
+    def _handle_contact_flow(self, state: Dict, msg: str) -> Dict:
+        if state["mode"] == "contact_wait_msg":
+            state["contact_msg"] = msg
+            state["mode"] = "contact_wait_email"
+            return {
+                "action": "flow",
+                "answer": "Thanks. Now please enter your email address.",
+                "suggestions": [],
+                "lang": state.get("lang"),
+            }
+        if state["mode"] == "contact_wait_email":
+            if not self._is_valid_email(msg):
+                return {
+                    "action": "flow",
+                    "answer": "That email doesn’t look valid. Please type a valid email (example: name@gmail.com).",
+                    "suggestions": [],
+                    "lang": state.get("lang"),
+                }
+            # Send email (free SMTP). If not configured, we still store and confirm.
+            email = msg
+            message = state.get("contact_msg") or ""
+            result = send_contact_email(user_email=email, user_message=message)
+            # Reset flow state
+            state["mode"] = "normal"
+            state["contact_msg"] = None
+            if result["ok"]:
+                return {
+                    "action": "flow",
+                    "answer": "✅ Sent! Thanks — our team will contact you soon.",
+                    "suggestions": default_suggestions(),
+                    "lang": state.get("lang"),
+                }
+            return {
+                "action": "flow",
+                "answer": (
+                    "✅ I saved your message, but email sending isn’t configured yet on the server.\n"
+                    "Our team can still contact you using the details you provided."
+                ),
+                "suggestions": default_suggestions(),
+                "lang": state.get("lang"),
+            }
+        # fallback
+        state["mode"] = "normal"
+        return {"action": "rag", "answer": "", "suggestions": default_suggestions(), "lang": state.get("lang")}
+    def submit_contact(self, session_id: str, email: str, message: str) -> Dict:
+        """
+        Optional endpoint use.
+        """
+        state = self._get(session_id)
+        result = send_contact_email(user_email=email, user_message=message)
+        if result["ok"]:
+            return {"ok": True, "message": "Sent"}
+        return {"ok": False, "message": "Not configured"}
+    def _is_valid_email(self, s: str) -> bool:
+        return bool(re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", s.strip()))
+    # ---------- Language flow ----------
+    def _handle_language_flow(self, state: Dict, msg: str) -> Dict:
+        t = msg.strip().lower()
+        # Accept direct language choice
+        if "sinhala" in t or t in ["si", "sinhala", "sin"]:
+            state["lang"] = "Sinhala"
+        elif "tamil" in t or t in ["ta", "tamil"]:
+            state["lang"] = "Tamil"
+        elif "english" in t or t in ["en", "english"]:
+            state["lang"] = "English"
+        else:
+            # Accept region words -> map quickly
+            # (You can expand this later)
+            if any(k in t for k in ["sri lanka", "colombo", "kandy", "galle", "jaffna"]):
+                state["lang"] = "Sinhala"
+            else:
+                return {
+                    "action": "flow",
+                    "answer": "Please type the language you want: Sinhala / Tamil / English.",
+                    "suggestions": suggestions_for_intent("language"),
+                    "lang": state.get("lang"),
+                }
+        # Finish language flow
+        state["mode"] = "normal"
+        return {
+            "action": "flow",
+            "answer": f"✅ Done. I’ll reply in {state['lang']} from now on.",
+            "suggestions": default_suggestions(),
+            "lang": state.get("lang"),
+        }

backend_app/ingest.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import os
+import json
+import pickle
+from typing import List, Dict
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from .config import (
+    DATA_DIR,
+    URLS_PATH,
+    FAISS_INDEX_PATH,
+    DOCSTORE_PATH,
+    EMBED_MODEL_NAME,
+)
+from .fetcher import fetch_page_text
+def ensure_data_dir():
+    os.makedirs(DATA_DIR, exist_ok=True)
+def load_urls() -> List[str]:
+    """
+    Expects data/urls.json like:
+    {
+      "urls": ["https://...", "https://..."]
+    }
+    """
+    if not os.path.exists(URLS_PATH):
+        raise FileNotFoundError(
+            f"Missing {URLS_PATH}. Create it with your 4 URLs."
+        )
+    with open(URLS_PATH, "r", encoding="utf-8") as f:
+        obj = json.load(f)
+    urls = obj.get("urls", [])
+    if not urls:
+        raise ValueError("urls.json has no URLs. Add at least 1 URL.")
+    return urls
+def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
+    """
+    Simple word-based chunking (fast + reliable).
+    """
+    words = text.split()
+    chunks = []
+    i = 0
+    step = max(1, chunk_size_words - overlap_words)
+    while i < len(words):
+        chunk = words[i:i + chunk_size_words]
+        chunks.append(" ".join(chunk))
+        i += step
+    return chunks
+def build_docs_from_urls(urls: List[str]) -> List[Dict]:
+    docs: List[Dict] = []
+    for url in urls:
+        page = fetch_page_text(url, use_cache=True)
+        chunks = chunk_text(page["text"])
+        for idx, ch in enumerate(chunks):
+            docs.append({
+                "text": ch,
+                "meta": {
+                    "url": page["url"],
+                    "title": page["title"],
+                    "chunk": idx
+                }
+            })
+    return docs
+def build_faiss_index(docs: List[Dict]) -> None:
+    model = SentenceTransformer(EMBED_MODEL_NAME)
+    texts = [d["text"] for d in docs]
+    emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
+    emb = np.array(emb, dtype="float32")
+    index = faiss.IndexFlatIP(emb.shape[1])
+    index.add(emb)
+    faiss.write_index(index, FAISS_INDEX_PATH)
+    with open(DOCSTORE_PATH, "wb") as f:
+        pickle.dump(docs, f)
+def run_ingestion():
+    ensure_data_dir()
+    urls = load_urls()
+    docs = build_docs_from_urls(urls)
+    if not docs:
+        raise RuntimeError("No documents created from URLs. Check your URLs/pages.")
+    build_faiss_index(docs)
+    print("✅ Ingestion complete")
+    print(f"URLs: {len(urls)}")
+    print(f"Chunks: {len(docs)}")
+    print(f"Saved index: {FAISS_INDEX_PATH}")
+    print(f"Saved docs:  {DOCSTORE_PATH}")
+# Script entry point: run the ingestion only when this module is executed directly.
+if __name__ == "__main__":
+    run_ingestion()

backend_app/rag_hf.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import pickle
+from typing import List, Dict, Optional, Tuple
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
+from .config import (
+    FAISS_INDEX_PATH,
+    DOCSTORE_PATH,
+    EMBED_MODEL_NAME,
+    MIN_TOP_SCORE,
+    WEB_MAX_RESULTS,
+)
+from .fetcher import fetch_page_text
+from .web_search import web_search
+# Hugging Face access token; defaults to an empty string when HF_TOKEN is unset.
+HF_TOKEN = os.getenv("HF_TOKEN", "")
+# Model id passed to InferenceClient; override with the HF_MODEL env var.
+HF_MODEL = os.getenv("HF_MODEL", "HuggingFaceH4/zephyr-7b-beta")  # you can change later
+class RAGEngineHF:
+    def __init__(self):
+        self.embedder = SentenceTransformer(EMBED_MODEL_NAME)
+        self.index = faiss.read_index(FAISS_INDEX_PATH)
+        with open(DOCSTORE_PATH, "rb") as f:
+            self.docs: List[Dict] = pickle.load(f)
+        self.client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
+        self.TOP_K = 5
+        self.MAX_CONTEXT_CHARS_PER_DOC = 1800
+    def retrieve_local(self, query: str, k: int = 5) -> List[Dict]:
+        q_emb = self.embedder.encode([query], normalize_embeddings=True)
+        q_emb = np.array(q_emb, dtype="float32")
+        scores, ids = self.index.search(q_emb, k)
+        out = []
+        for rank, doc_id in enumerate(ids[0]):
+            if doc_id == -1:
+                continue
+            d = self.docs[int(doc_id)]
+            out.append({
+                "rank": rank + 1,
+                "score": float(scores[0][rank]),
+                "text": d["text"],
+                "meta": d["meta"],
+            })
+        return out
+    def _needs_web_fallback(self, contexts: List[Dict]) -> bool:
+        return (not contexts) or (contexts[0]["score"] < MIN_TOP_SCORE)
+    def fetch_web_context(self, query: str) -> Tuple[List[Dict], List[Dict]]:
+        queries = [f"site:foodsystemsdashboard.org {query}", query]
+        links, seen = [], set()
+        for q in queries:
+            for r in web_search(q, max_results=WEB_MAX_RESULTS):
+                if r["url"] not in seen:
+                    links.append(r)
+                    seen.add(r["url"])
+            if len(links) >= WEB_MAX_RESULTS:
+                break
+        contexts, sources = [], []
+        for r in links[:WEB_MAX_RESULTS]:
+            try:
+                page = fetch_page_text(r["url"], use_cache=True)
+                contexts.append({
+                    "rank": len(contexts) + 1,
+                    "score": 0.0,
+                    "text": page["text"],
+                    "meta": {"url": page["url"], "title": page["title"], "chunk": 0},
+                })
+                sources.append({"title": page["title"], "url": page["url"]})
+            except:
+                continue
+        return contexts, sources
+    def answer(self, query: str, preferred_lang: Optional[str] = None) -> Dict:
+        local = self.retrieve_local(query, k=self.TOP_K)
+        used = "local"
+        contexts = local
+        sources = self._unique_sources(local)
+        if self._needs_web_fallback(local):
+            web_ctx, web_src = self.fetch_web_context(query)
+            if web_ctx:
+                used = "web"
+                contexts = web_ctx
+                sources = web_src
+        context_block = "\n\n".join(
+            [f"[{i+1}] {c['meta']['title']}\n{c['text'][:self.MAX_CONTEXT_CHARS_PER_DOC]}"
+             for i, c in enumerate(contexts)]
+        )
+        lang_line = f"Respond in {preferred_lang}.\n" if preferred_lang else ""
+        prompt = f"""
+You are the SysLink Food System assistant.
+Use ONLY the context below. Do not invent facts.
+Write in simple language, MEDIUM length (8–14 lines). Not too brief.
+If info is missing, say what is missing.
+{lang_line}
+QUESTION: {query}
+CONTEXT:
+{context_block}
+ANSWER:
+""".strip()
+        out = self.client.text_generation(
+            prompt,
+            max_new_tokens=250,
+            temperature=0.2,
+            return_full_text=False,
+        ).strip()
+        if not out:
+            out = "I couldn’t find enough reliable information in the provided sources. Please rephrase or share more details."
+        return {"answer": out, "sources": sources, "used": used}
+    def _unique_sources(self, contexts: List[Dict]) -> List[Dict]:
+        seen, out = set(), []
+        for c in contexts:
+            u = c["meta"]["url"]
+            if u not in seen:
+                out.append({"title": c["meta"]["title"], "url": u})
+                seen.add(u)
+        return out

backend_app/suggestions.py ADDED Viewed

	@@ -0,0 +1,79 @@

+# backend_app/suggestions.py
+from typing import List
+def default_suggestions() -> List[str]:
+    """
+    Suggestions shown when chat opens and after flows complete.
+    """
+    return [
+        "Tell us about your services",
+        "Contact us",
+        "Change response language",
+    ]
+def suggestions_for_intent(intent: str) -> List[str]:
+    """
+    Suggestions used inside specific flows/intents.
+    """
+    intent = (intent or "").lower()
+    if intent == "contact":
+        return [
+            "I want to contact support",
+            "Back to main menu",
+        ]
+    if intent == "language":
+        return [
+            "Sinhala",
+            "Tamil",
+            "English",
+        ]
+    if intent == "services":
+        return [
+            "What is Food SysLink?",
+            "How does it work?",
+            "What problems does it solve?",
+            "Change response language",
+            "Contact us",
+        ]
+    return default_suggestions()
+def suggestions_from_text(user_text: str) -> List[str]:
+    """
+    When user types a custom prompt, old suggestions should disappear
+    and new related ones should appear.
+    This is a lightweight keyword-based approach (fast and free).
+    You can improve it later using embeddings or an LLM.
+    """
+    t = (user_text or "").lower()
+    # If they ask about language, show language options
+    if any(k in t for k in ["language", "sinhala", "tamil", "english", "translate"]):
+        return ["Sinhala", "Tamil", "English"]
+    # If they ask about contact/support
+    if any(k in t for k in ["contact", "support", "help", "email", "reach"]):
+        return ["Contact us", "Tell us about your services", "Change response language"]
+    # If they ask about services/features/about
+    if any(k in t for k in ["service", "services", "feature", "about", "what is", "syslink"]):
+        return [
+            "Tell us about your services",
+            "What is Food SysLink?",
+            "How does it work?",
+            "Contact us",
+        ]
+    # Default suggestions
+    return [
+        "Tell us about your services",
+        "Contact us",
+        "Change response language",
+    ]

backend_app/web_search.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# backend_app/web_search.py
+from typing import List, Dict
+from duckduckgo_search import DDGS
+def web_search(query: str, max_results: int = 3) -> List[Dict]:
+    """
+    DuckDuckGo web search (free).
+    Returns: [{"title": "...", "url": "..."}]
+    """
+    results: List[Dict] = []
+    with DDGS() as ddgs:
+        for r in ddgs.text(query, max_results=max_results):
+            href = r.get("href")
+            title = r.get("title")
+            if href and title:
+                results.append({"title": title, "url": href})
+    return results