Spaces:

decodingdatascience
/

dds-hr-rag-chatbot

Sleeping

App Files Files Community

decodingdatascience committed 14 days ago

Commit

ef780a9

verified ·

1 Parent(s): 3a846c2

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -35

app.py CHANGED Viewed

@@ -41,18 +41,13 @@ FAQ_ITEMS = [
 LOGO_RAW_URL = "https://raw.githubusercontent.com/Decoding-Data-Science/airesidency/main/dds-logo-removebg-preview.png"
-# PDFs live in repo under ./data/pdfs
 PDF_DIR = Path("data/pdfs")
-# Use persistent disk if available
 PERSIST_ROOT = Path("/data") if Path("/data").exists() else Path(".")
 VDB_DIR = PERSIST_ROOT / "chroma"
-# Optional HF speed optimization when persistent disk exists
-# (HF docs mention setting HF_HOME to /data/.huggingface to speed restarts)
-if Path("/data").exists():
-    os.environ.setdefault("HF_HOME", "/data/.huggingface")
 # -----------------------------
 # Helpers
 # -----------------------------
@@ -74,23 +69,21 @@ def download_logo() -> str | None:
         return None
 def build_or_load_index():
-    # Guard: ensure OpenAI key exists
     if not os.getenv("OPENAI_API_KEY"):
         raise RuntimeError("OPENAI_API_KEY is not set. Add it in Space Settings → Repository secrets.")
     if not PDF_DIR.exists():
-        raise RuntimeError(f"PDF folder not found: {PDF_DIR}. Add your PDFs under data/pdfs/ in the Space repo.")
     pdfs = sorted(PDF_DIR.glob("*.pdf"))
     if not pdfs:
-        raise RuntimeError(f"No PDFs found in {PDF_DIR}. Upload your 4 HR PDFs there.")
     # LlamaIndex settings
     Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL)
     Settings.llm = LIOpenAI(model=LLM_MODEL, temperature=0.0)
     Settings.node_parser = SentenceSplitter(chunk_size=900, chunk_overlap=150)
-    # Read documents
     docs = SimpleDirectoryReader(
         input_dir=str(PDF_DIR),
         required_exts=[".pdf"],
@@ -101,21 +94,22 @@ def build_or_load_index():
     VDB_DIR.mkdir(parents=True, exist_ok=True)
     chroma_client = chromadb.PersistentClient(path=str(VDB_DIR))
-    # Reuse existing collection if present; otherwise create/build
     try:
         col = chroma_client.get_collection(COLLECTION_NAME)
-        # If count works and >0, reuse
         try:
             if col.count() > 0:
                 vector_store = ChromaVectorStore(chroma_collection=col)
                 storage_context = StorageContext.from_defaults(vector_store=vector_store)
-                return VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)
         except Exception:
             pass
     except Exception:
         pass
-    # Create/build fresh
     try:
         chroma_client.delete_collection(COLLECTION_NAME)
     except Exception:
@@ -127,7 +121,23 @@ def build_or_load_index():
     return VectorStoreIndex.from_documents(docs, storage_context=storage_context)
-# Build index at startup
 INDEX = build_or_load_index()
 CHAT_ENGINE = INDEX.as_chat_engine(
@@ -136,7 +146,11 @@ CHAT_ENGINE = INDEX.as_chat_engine(
     system_prompt=SYSTEM_PROMPT,
 )
-def answer(user_msg: str, history: list[tuple[str, str]], show_sources: bool):
     user_msg = (user_msg or "").strip()
     if not user_msg:
         return history, ""
@@ -145,20 +159,13 @@ def answer(user_msg: str, history: list[tuple[str, str]], show_sources: bool):
     text = str(resp).strip()
     if show_sources:
-        srcs = getattr(resp, "source_nodes", None) or []
-        if srcs:
-            lines = ["", "Sources:"]
-            for i, sn in enumerate(srcs[:5], start=1):
-                md = sn.node.metadata or {}
-                doc = _md_get(md, ["file_name", "filename", "doc_name", "source"], "unknown_doc")
-                page = _md_get(md, ["page_label", "page", "page_number"], "?")
-                score = sn.score if sn.score is not None else float("nan")
-                lines.append(f"{i}) {doc} | page {page} | score {score:.3f}")
-            text = text + "\n" + "\n".join(lines)
-        else:
-            text = text + "\n\nSources: (none returned)"
-    history = history + [(user_msg, text)]
     return history, ""
 def load_faq(faq_choice: str):
@@ -168,7 +175,7 @@ def clear_chat():
     return [], ""
 # -----------------------------
-# Gradio UI
 # -----------------------------
 logo_path = download_logo()
@@ -178,8 +185,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             gr.Image(value=logo_path, show_label=False, height=70, width=70, container=False)
         gr.Markdown(
             "# DDS HR Chatbot (RAG Demo)\n"
-            "Ask HR policy questions. The assistant answers **only from the provided DDS policy PDFs** "
-            "and can show sources."
         )
     with gr.Row():
@@ -193,7 +199,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             clear_btn = gr.Button("Clear chat")
         with gr.Column(scale=2, min_width=520):
-            chatbot = gr.Chatbot(label="DDS HR Assistant", height=520)
             user_input = gr.Textbox(label="Your question", placeholder="Ask a policy question and press Enter")
             send_btn = gr.Button("Send")

 LOGO_RAW_URL = "https://raw.githubusercontent.com/Decoding-Data-Science/airesidency/main/dds-logo-removebg-preview.png"
+# PDFs in repo
 PDF_DIR = Path("data/pdfs")
+# Persistent disk if enabled on Spaces
 PERSIST_ROOT = Path("/data") if Path("/data").exists() else Path(".")
 VDB_DIR = PERSIST_ROOT / "chroma"
 # -----------------------------
 # Helpers
 # -----------------------------
         return None
 def build_or_load_index():
     if not os.getenv("OPENAI_API_KEY"):
         raise RuntimeError("OPENAI_API_KEY is not set. Add it in Space Settings → Repository secrets.")
     if not PDF_DIR.exists():
+        raise RuntimeError(f"PDF folder not found: {PDF_DIR}. Add PDFs under data/pdfs/.")
     pdfs = sorted(PDF_DIR.glob("*.pdf"))
     if not pdfs:
+        raise RuntimeError(f"No PDFs found in {PDF_DIR}. Upload your HR PDFs there.")
     # LlamaIndex settings
     Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL)
     Settings.llm = LIOpenAI(model=LLM_MODEL, temperature=0.0)
     Settings.node_parser = SentenceSplitter(chunk_size=900, chunk_overlap=150)
     docs = SimpleDirectoryReader(
         input_dir=str(PDF_DIR),
         required_exts=[".pdf"],
     VDB_DIR.mkdir(parents=True, exist_ok=True)
     chroma_client = chromadb.PersistentClient(path=str(VDB_DIR))
+    # Reuse existing collection if it already has vectors
     try:
         col = chroma_client.get_collection(COLLECTION_NAME)
         try:
             if col.count() > 0:
                 vector_store = ChromaVectorStore(chroma_collection=col)
                 storage_context = StorageContext.from_defaults(vector_store=vector_store)
+                return VectorStoreIndex.from_vector_store(
+                    vector_store=vector_store, storage_context=storage_context
+                )
         except Exception:
             pass
     except Exception:
         pass
+    # Build fresh collection
     try:
         chroma_client.delete_collection(COLLECTION_NAME)
     except Exception:
     return VectorStoreIndex.from_documents(docs, storage_context=storage_context)
+def format_sources(resp, max_sources=5) -> str:
+    srcs = getattr(resp, "source_nodes", None) or []
+    if not srcs:
+        return "Sources: (none returned)"
+    lines = ["Sources:"]
+    for i, sn in enumerate(srcs[:max_sources], start=1):
+        md = sn.node.metadata or {}
+        doc = _md_get(md, ["file_name", "filename", "doc_name", "source"], "unknown_doc")
+        page = _md_get(md, ["page_label", "page", "page_number"], "?")
+        score = sn.score if sn.score is not None else float("nan")
+        lines.append(f"{i}) {doc} | page {page} | score {score:.3f}")
+    return "\n".join(lines)
+# -----------------------------
+# Build index + chat engine
+# -----------------------------
 INDEX = build_or_load_index()
 CHAT_ENGINE = INDEX.as_chat_engine(
     system_prompt=SYSTEM_PROMPT,
 )
+# -----------------------------
+# Gradio callbacks (MESSAGES format)
+# history is: [{"role":"user","content":"..."}, {"role":"assistant","content":"..."}, ...]
+# -----------------------------
+def answer(user_msg: str, history: list, show_sources: bool):
     user_msg = (user_msg or "").strip()
     if not user_msg:
         return history, ""
     text = str(resp).strip()
     if show_sources:
+        text = text + "\n\n" + format_sources(resp)
+    # Append the turn as role/content dicts (the "messages" history format) rather than (user, bot) tuples
+    history = (history or []) + [
+        {"role": "user", "content": user_msg},
+        {"role": "assistant", "content": text},
+    ]
     return history, ""
 def load_faq(faq_choice: str):
     return [], ""
 # -----------------------------
+# UI
 # -----------------------------
 logo_path = download_logo()
             gr.Image(value=logo_path, show_label=False, height=70, width=70, container=False)
         gr.Markdown(
             "# DDS HR Chatbot (RAG Demo)\n"
+            "Ask HR policy questions. The assistant answers **only from the DDS HR PDFs** and can show sources."
         )
     with gr.Row():
             clear_btn = gr.Button("Clear chat")
         with gr.Column(scale=2, min_width=520):
+            # IMPORTANT: type="messages" so the Chatbot accepts history as a list of {"role", "content"} dicts
+            chatbot = gr.Chatbot(label="DDS HR Assistant", height=520, type="messages")
             user_input = gr.Textbox(label="Your question", placeholder="Ask a policy question and press Enter")
             send_btn = gr.Button("Send")