Spaces:

amitcoolll
/

Chatbot-Documents

Sleeping

App Files Files Community

amitcoolll committed on Dec 30, 2025

Commit

c4233b7

1 Parent(s): 28b62bf

Initial RAG document chatbot deployment

Browse files

Files changed (14) hide show

Dockerfile +13 -0
app.py +92 -0
requirements.txt +7 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-312.pyc +0 -0
src/__pycache__/config.cpython-312.pyc +0 -0
src/__pycache__/vectorstore.cpython-312.pyc +0 -0
src/chunking.py +25 -0
src/config.py +13 -0
src/embeddings.py +10 -0
src/openai_client.py +10 -0
src/parsers.py +18 -0
src/rag.py +48 -0
src/vectorstore.py +51 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+FROM python:3.11-slim
+# Hugging Face Docker Spaces run the container as UID 1000, and the app writes
+# uploads and the Chroma index under ./data, so /app must be owned by that user.
+RUN useradd -m -u 1000 user \
+ && mkdir -p /app \
+ && chown user:user /app
+ENV HOME=/home/user
+WORKDIR /app
+# Copy only the dependency list first so the pip layer stays cached until
+# requirements.txt itself changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -U pip \
+ && pip install --no-cache-dir -r requirements.txt
+COPY --chown=user:user . .
+USER user
+# HF Spaces expects the app on port 7860 (best practice)
+EXPOSE 7860
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true"]

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+import streamlit as st
+from src.config import UPLOAD_DIR, CHUNK_TOKENS, CHUNK_OVERLAP, TOP_K
+from src.parsers import read_pdf, read_docx
+from src.chunking import chunk_text
+from src.embeddings import embed_texts
+from src.vectorstore import add_documents, reset_collection
+from src.rag import answer_question
+st.set_page_config(page_title="Document Chatbot (RAG)", layout="wide")
+st.title("📄 Document Chatbot (RAG) — Streamlit")
+st.caption("Upload multiple PDF/DOCX → Build Index → Ask questions → Answers from docs only + citations")
+# Sidebar settings display (optional)
+with st.sidebar:
+    st.header("Settings")
+    st.write(f"Chunk size: {CHUNK_TOKENS} tokens")
+    st.write(f"Overlap: {CHUNK_OVERLAP} tokens")
+    st.write(f"Top-K retrieval: {TOP_K}")
+    if st.button("🧹 Clear Index"):
+        reset_collection()
+        st.success("Index cleared.")
+# Ensure folders exist
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+os.makedirs("./data", exist_ok=True)
+# Upload
+st.subheader("📤 Upload Documents")
+uploaded_files = st.file_uploader(
+    "Upload PDF/DOCX files",
+    type=["pdf", "docx"],
+    accept_multiple_files=True
+)
+# Build Index
+if st.button("✅ Build Index"):
+    if not uploaded_files:
+        st.warning("Please upload at least one document.")
+    else:
+        with st.spinner("Indexing documents..."):
+            documents, metadatas, ids = [], [], []
+            for f in uploaded_files:
+                save_path = os.path.join(UPLOAD_DIR, f.name)
+                with open(save_path, "wb") as out:
+                    out.write(f.getbuffer())
+                if f.name.lower().endswith(".pdf"):
+                    pages = read_pdf(save_path)
+                elif f.name.lower().endswith(".docx"):
+                    pages = read_docx(save_path)
+                else:
+                    continue
+                for page, text in pages:
+                    for i, chunk in enumerate(chunk_text(text)):
+                        documents.append(chunk)
+                        metadatas.append({"file": f.name, "page": page})
+                        ids.append(f"{f.name}_p{page}_c{i}")
+            if not documents:
+                st.error("No text could be extracted. If PDF is scanned, OCR is needed.")
+            else:
+                vectors = embed_texts(documents)
+                add_documents(documents, vectors, metadatas, ids)
+                st.success(f"✅ Indexed {len(documents)} chunks from {len(uploaded_files)} file(s).")
+st.divider()
+# Ask
+st.subheader("💬 Ask a question")
+question = st.text_input("Type your question")
+if st.button("Ask"):
+    if not question.strip():
+        st.warning("Please enter a question.")
+    else:
+        with st.spinner("Thinking..."):
+            try:
+                answer, citations = answer_question(question)
+                st.markdown("### ✅ Answer")
+                st.write(answer)
+                st.markdown("### 📌 Citations (retrieved)")
+                for c in citations:
+                    st.write(c)
+            except Exception as e:
+                st.error(str(e))

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+streamlit
+openai
+chromadb
+pypdf
+python-docx
+tiktoken

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (121 Bytes). View file

src/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (419 Bytes). View file

src/__pycache__/vectorstore.cpython-312.pyc ADDED Viewed

Binary file (2.19 kB). View file

src/chunking.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from typing import List
+import tiktoken
+from src.config import TOKEN_ENCODING, CHUNK_TOKENS, CHUNK_OVERLAP
+_enc = tiktoken.get_encoding(TOKEN_ENCODING)
+def chunk_text(text: str, chunk_tokens: int = CHUNK_TOKENS, overlap_tokens: int = CHUNK_OVERLAP) -> List[str]:
+    tokens = _enc.encode(text)
+    chunks = []
+    start = 0
+    while start < len(tokens):
+        end = min(start + chunk_tokens, len(tokens))
+        chunk = _enc.decode(tokens[start:end]).strip()
+        if chunk:
+            chunks.append(chunk)
+        start = end - overlap_tokens
+        if start < 0:
+            start = 0
+        if end == len(tokens):
+            break
+    return chunks

src/config.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Central settings shared by the app and the src package.
+# Model name passed to the embeddings endpoint (src/embeddings.py).
+EMBED_MODEL = "text-embedding-3-small"
+# Model name passed to the responses endpoint when answering (src/rag.py).
+CHAT_MODEL = "gpt-3.5-turbo"
+# Default chunk window size, in tokens, used by chunk_text.
+CHUNK_TOKENS = 900
+# Default number of tokens shared between consecutive chunks.
+CHUNK_OVERLAP = 150
+# Default number of chunks retrieved per question.
+TOP_K = 6
+# Name of the Chroma collection that holds the indexed chunks.
+COLLECTION_NAME = "docs"
+# Directory handed to chromadb.PersistentClient.
+CHROMA_DIR = "./data/chroma"
+# Directory where app.py saves uploaded files before parsing them.
+UPLOAD_DIR = "./data/uploads"
+# tiktoken encoding name used to tokenize text for chunking.
+TOKEN_ENCODING = "cl100k_base"

src/embeddings.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from typing import List
+from src.openai_client import get_client
+from src.config import EMBED_MODEL
+def embed_texts(texts: List[str]) -> List[List[float]]:
+    client = get_client()
+    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
+    return [d.embedding for d in resp.data]

src/openai_client.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import os
+from openai import OpenAI
+def get_client() -> OpenAI:
+    key = os.getenv("OPENAI_API_KEY")
+    if not key:
+        raise RuntimeError("OPENAI_API_KEY not set. Add it in Hugging Face Space -> Settings -> Secrets.")
+    return OpenAI(api_key=key)

src/parsers.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from typing import List, Tuple
+from pypdf import PdfReader
+from docx import Document
+def read_pdf(path: str) -> List[Tuple[int, str]]:
+    reader = PdfReader(path)
+    pages = []
+    for i, page in enumerate(reader.pages):
+        text = (page.extract_text() or "").strip()
+        if text:
+            pages.append((i + 1, text))
+    return pages
+def read_docx(path: str) -> List[Tuple[int, str]]:
+    doc = Document(path)
+    text = "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()
+    return [(1, text)] if text else []

src/rag.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from typing import List, Tuple
+from src.embeddings import embed_texts
+from src.vectorstore import query_by_embedding
+from src.openai_client import get_client
+from src.config import CHAT_MODEL, TOP_K
+def retrieve_context(question: str, top_k: int = TOP_K) -> Tuple[str, List[str]]:
+    q_vec = embed_texts([question])[0]
+    docs, metas = query_by_embedding(q_vec, top_k=top_k)
+    context_blocks = []
+    citations = []
+    for i, (doc, meta) in enumerate(zip(docs, metas), start=1):
+        citations.append(f"[{i}] {meta.get('file')} (page {meta.get('page')})")
+        context_blocks.append(
+            f"Source {i}: {meta.get('file')} (page {meta.get('page')})\n{doc}"
+        )
+    return "\n\n---\n\n".join(context_blocks), citations
+def answer_question(question: str) -> Tuple[str, List[str]]:
+    context, citations = retrieve_context(question, top_k=TOP_K)
+    prompt = f"""
+You are a document assistant.
+Answer ONLY using the SOURCES below.
+If the answer is not in the sources, say: "I don't know from the uploaded documents."
+SOURCES:
+{context}
+QUESTION:
+{question}
+Return:
+1) Answer (clear & concise)
+2) Sources used (numbers only)
+"""
+    client = get_client()
+    resp = client.responses.create(model=CHAT_MODEL, input=prompt)
+    return resp.output_text.strip(), citations

src/vectorstore.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+from typing import List, Dict, Any, Tuple
+import chromadb
+from src.config import CHROMA_DIR, COLLECTION_NAME
+# ---------------- COLLECTION ----------------
+def get_collection():
+    os.makedirs(CHROMA_DIR, exist_ok=True)
+    client = chromadb.PersistentClient(path=CHROMA_DIR)
+    return client.get_or_create_collection(COLLECTION_NAME)
+# ---------------- ADD DOCUMENTS ----------------
+def add_documents(
+    docs: List[str],
+    embeddings: List[List[float]],
+    metadatas: List[Dict[str, Any]],
+    ids: List[str]
+) -> None:
+    col = get_collection()
+    col.add(
+        documents=docs,
+        embeddings=embeddings,
+        metadatas=metadatas,
+        ids=ids
+    )
+# ---------------- QUERY ----------------
+def query_by_embedding(
+    q_embedding: List[float],
+    top_k: int
+) -> Tuple[List[str], List[Dict[str, Any]]]:
+    col = get_collection()
+    res = col.query(
+        query_embeddings=[q_embedding],
+        n_results=top_k,
+        include=["documents", "metadatas"]
+    )
+    return res["documents"][0], res["metadatas"][0]
+# ---------------- RESET ----------------
+def reset_collection() -> None:
+    os.makedirs(CHROMA_DIR, exist_ok=True)
+    client = chromadb.PersistentClient(path=CHROMA_DIR)
+    try:
+        client.delete_collection(COLLECTION_NAME)
+    except Exception:
+        pass
+    client.get_or_create_collection(COLLECTION_NAME)