Commit 6e1d29c
Parent(s): 4cbbb67

Improve retrieval: LLM query rewriting, clarification, and chat UI

Files changed:
- app.py (+100 -41)
- src/rag.py (+97 -9)
app.py
CHANGED
Before this commit, app.py imported only answer_question from src.rag and ended with a one-shot Q&A form ("💬 Ask a Question"): a single text input, a spinner, a try/except around answer_question(question), and citations written out one by one with st.write. The commit replaces that flow with a persistent chat UI plus an LLM clarification step. The updated file, with unchanged sections elided as "# …":
```python
import os
import streamlit as st

# … (config imports, incl. UPLOAD_DIR, unchanged and elided in the diff)
from src.parsers import read_pdf, read_docx
from src.chunking import chunk_text
from src.embeddings import embed_texts
from src.vectorstore import add_documents, reset_collection
from src.rag import answer_question, clarification_question


# ---------------- Page config ----------------
st.set_page_config(
    page_title="Document Chatbot (RAG)",
    layout="wide"
)

st.title("📄 Document Chatbot (RAG)")
st.caption(
    "Upload PDF/DOCX → Build Index → Chat using document knowledge with citations"
)

# ---------------- Sidebar ----------------
with st.sidebar:
    # … (sidebar settings unchanged and elided in the diff)

    if st.button("🧹 Clear Index"):
        reset_collection()
        st.success("Index cleared.")

    if st.button("🗑️ Clear Chat"):
        st.session_state.messages = []
        st.session_state.pending_question = None
        st.session_state.clarification = None
        st.success("Chat cleared.")

# ---------------- Ensure folders ----------------
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs("./data", exist_ok=True)

# ---------------- Upload ----------------
st.subheader("📤 Upload Documents")

uploaded_files = st.file_uploader(
    # … (uploader arguments elided in the diff)
)

if st.button("✅ Build Index"):
    if not uploaded_files:
        st.warning("Please upload at least one document.")
    else:
        with st.spinner("Indexing documents..."):
            documents, metadatas, ids = [], [], []

            for uploaded_file in uploaded_files:
                file_name = uploaded_file.name
                file_bytes = uploaded_file.read()

                if file_name.lower().endswith(".pdf"):
                    pages = read_pdf(file_bytes)
                elif file_name.lower().endswith(".docx"):
                    pages = read_docx(file_bytes)  # elided in the diff; inferred
                else:
                    continue  # unsupported file type

                for page_no, text in pages:
                    for i, chunk in enumerate(chunk_text(text)):
                        documents.append(chunk)
                        metadatas.append({
                            "file": file_name,
                            "page": page_no,  # elided in the diff; inferred
                        })
                        ids.append(f"{file_name}_p{page_no}_c{i}")

            if not documents:
                st.error("No text extracted. Scanned PDFs need OCR.")
            else:
                vectors = embed_texts(documents)
                add_documents(
                    documents=documents,  # elided in the diff; inferred
                    embeddings=vectors,   # elided in the diff; inferred
                    metadatas=metadatas,
                    ids=ids
                )
                st.success(
                    f"Indexed {len(documents)} chunks "
                    f"from {len(uploaded_files)} file(s)."
                )

st.divider()

# ===================== CHAT UI =====================

st.subheader("💬 Chat with your documents")

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "pending_question" not in st.session_state:
    st.session_state.pending_question = None
if "clarification" not in st.session_state:
    st.session_state.clarification = None

# Render chat history
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

# Chat input
user_input = st.chat_input("Ask something about the uploaded documents...")

if user_input:
    st.session_state.messages.append({
        "role": "user",
        "content": user_input
    })

    # Ask LLM if clarification is needed
    clarify = clarification_question(user_input)

    if clarify:
        st.session_state.pending_question = user_input
        st.session_state.clarification = clarify

        st.session_state.messages.append({
            "role": "assistant",
            "content": clarify
        })
        st.rerun()

    else:
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                answer, citations = answer_question(user_input)

            final = answer
            if citations:
                final += "\n\n**Sources:**\n" + "\n".join(f"- {c}" for c in citations)

            st.markdown(final)
            st.session_state.messages.append({
                "role": "assistant",
                "content": final
            })

# ---------- Clarification buttons ----------
if st.session_state.pending_question:
    col1, col2 = st.columns(2)

    if col1.button("✅ Yes, that's what I mean"):
        q = st.session_state.pending_question
        st.session_state.pending_question = None
        st.session_state.clarification = None

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                answer, citations = answer_question(q)

            final = answer
            if citations:
                final += "\n\n**Sources:**\n" + "\n".join(f"- {c}" for c in citations)

            st.markdown(final)
            st.session_state.messages.append({
                "role": "assistant",
                "content": final
            })
        st.rerun()

    if col2.button("❌ No, something else"):
        st.session_state.pending_question = None
        st.session_state.clarification = None

        st.session_state.messages.append({
            "role": "assistant",
            "content": "Okay — please type your question with a bit more detail."
        })
        st.rerun()
```
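The indexing loop calls chunk_text(text) from src/chunking.py, which this commit does not touch. Below is a minimal sketch of the interface it appears to assume; the fixed-size sliding window and the size values are illustrative guesses, not the Space's actual chunker:

```python
# Hypothetical stand-in for src/chunking.py's chunk_text; the real
# implementation may be sentence- or token-aware. Sizes are assumptions.
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list[str]:
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap  # slide forward, keeping some overlap
    return chunks
```

Whatever the real chunker does, app.py only relies on it returning an iterable of strings per page, since the chunk index i feeds the {file}_p{page}_c{i} ID scheme.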
src/rag.py
CHANGED
Before this commit, src/rag.py exposed only retrieve_context and answer_question: the raw question was embedded once and sent straight to the vector store, and the answer prompt ended with a bare "Return: 1) Answer 2) Sources used". The commit adds LLM query rewriting, an ambiguity check (clarification_question), and multi-query retrieval with deduplication. The updated file:
```python
from typing import List, Tuple, Dict, Any, Optional

from src.embeddings import embed_texts
from src.vectorstore import query_by_embedding
from src.openai_client import get_client
from src.config import CHAT_MODEL, TOP_K


# ---------------- Query Rewrite (Domain-agnostic) ----------------
def rewrite_queries(question: str, n: int = 4) -> List[str]:
    """
    Creates multiple semantic variants of the user query to improve recall.
    Works for any domain (medical/legal/finance/etc.) without hardcoded synonyms.
    """
    client = get_client()
    prompt = f"""
You help a RAG system retrieve relevant document chunks.

Rewrite the user query into {n} short alternative search queries that capture the same intent.
Include abbreviations, synonyms, and likely wording that might appear in documents.
Return ONLY the queries, one per line. No numbering, no extra text.

User query: {question}
"""
    resp = client.responses.create(model=CHAT_MODEL, input=prompt)
    lines = [ln.strip() for ln in resp.output_text.splitlines() if ln.strip()]

    # Always include original first + dedupe
    out = [question] + lines
    seen = set()
    final = []
    for q in out:
        k = q.lower()
        if k not in seen:
            seen.add(k)
            final.append(q)

    return final[: n + 1]


# ---------------- Clarification (Domain-agnostic) ----------------
def clarification_question(user_query: str) -> Optional[str]:
    """
    If the query is too short/ambiguous, returns a clarification question.
    Otherwise returns None.
    """
    client = get_client()
    prompt = f"""
Decide if this user query is too short or ambiguous for document retrieval.
If clarification is needed, return ONE short clarification question.
If not needed, return exactly: NO

User query: {user_query}
"""
    resp = client.responses.create(model=CHAT_MODEL, input=prompt)
    out = resp.output_text.strip()
    if out.upper() == "NO":
        return None
    return out


# ---------------- Multi-query Retrieval + Dedupe ----------------
def retrieve_context(question: str, top_k: int = TOP_K) -> Tuple[str, List[str]]:
    """
    Retrieves context using multi-query rewrite to improve semantic matches.
    Returns (context_string, citations_list).
    """
    queries = rewrite_queries(question, n=4)

    all_docs: List[str] = []
    all_metas: List[Dict[str, Any]] = []

    for q in queries:
        q_vec = embed_texts([q])[0]
        docs, metas = query_by_embedding(q_vec, top_k=top_k)
        all_docs.extend(docs)
        all_metas.extend(metas)

    # Deduplicate by (file, page, snippet)
    seen = set()
    final_docs: List[str] = []
    final_metas: List[Dict[str, Any]] = []

    for d, m in zip(all_docs, all_metas):
        fp = (m.get("file"), m.get("page"), (d[:160] if d else ""))
        if fp not in seen:
            seen.add(fp)
            final_docs.append(d)
            final_metas.append(m)

    final_docs = final_docs[:top_k]
    final_metas = final_metas[:top_k]

    context_blocks = []
    citations = []

    for i, (doc, meta) in enumerate(zip(final_docs, final_metas), start=1):
        citations.append(f"[{i}] {meta.get('file')} (page {meta.get('page')})")
        context_blocks.append(
            f"Source {i}: {meta.get('file')} (page {meta.get('page')})\n{doc}"
        )

    return "\n\n---\n\n".join(context_blocks), citations


def answer_question(question: str) -> Tuple[str, List[str]]:
    """
    Answers grounded in retrieved sources.
    """
    context, citations = retrieve_context(question, top_k=TOP_K)

    prompt = f"""
You are a document assistant.
Answer using the SOURCES below.
If the answer is not in the sources, say: "I don't know from the uploaded documents."

SOURCES:
{context}

QUESTION:
{question}

Rules:
- Be helpful and concise.
- It's okay to paraphrase, but do not invent facts.
- At the end, list: Sources used: [numbers only]

Return:
1) Answer
2) Sources used: [..]
"""
    client = get_client()
    resp = client.responses.create(model=CHAT_MODEL, input=prompt)
    return resp.output_text.strip(), citations
```
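Both files depend on src/vectorstore.py (add_documents, reset_collection, query_by_embedding), which is also outside this commit. Below is a minimal sketch of the interface retrieve_context assumes, written against Chroma as a guess; the collection name and persistence path are invented for illustration:

```python
# Hypothetical stand-in for src/vectorstore.py. The Chroma backend,
# collection name, and path are assumptions, not the Space's actual code.
from typing import Any, Dict, List, Tuple

import chromadb

_client = chromadb.PersistentClient(path="./data/chroma")
_NAME = "documents"


def _col():
    return _client.get_or_create_collection(_NAME)


def add_documents(documents: List[str], embeddings: List[List[float]],
                  metadatas: List[Dict[str, Any]], ids: List[str]) -> None:
    # Store raw chunk text alongside precomputed embeddings and metadata.
    _col().add(documents=documents, embeddings=embeddings,
               metadatas=metadatas, ids=ids)


def reset_collection() -> None:
    _client.delete_collection(_NAME)


def query_by_embedding(embedding: List[float],
                       top_k: int = 5) -> Tuple[List[str], List[Dict[str, Any]]]:
    # Return parallel lists (texts, metadatas), matching how
    # retrieve_context unpacks the result.
    res = _col().query(query_embeddings=[embedding], n_results=top_k)
    return res["documents"][0], res["metadatas"][0]
```

Under an interface like this, each rewritten query costs one embedding call plus one vector search, so retrieve_context with n=4 rewrites performs up to five lookups before deduplication trims the pool back to top_k chunks.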