amitcoolll committed on
Commit
c4233b7
·
1 Parent(s): 28b62bf

Initial RAG document chatbot deployment

Browse files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so this (slow) layer is cached and only
# rebuilt when requirements.txt changes — not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -U pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy the application code after dependency installation for better caching.
COPY . .

# HF Spaces expects the app on port 7860 (best practice)
EXPOSE 7860

CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true"]
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit entry point for the RAG document chatbot.

Flow: upload PDF/DOCX files -> build a vector index from token chunks ->
ask questions answered only from the indexed documents, with citations.
"""

import os
import streamlit as st

from src.config import UPLOAD_DIR, CHUNK_TOKENS, CHUNK_OVERLAP, TOP_K
from src.parsers import read_pdf, read_docx
from src.chunking import chunk_text
from src.embeddings import embed_texts
from src.vectorstore import add_documents, reset_collection
from src.rag import answer_question

st.set_page_config(page_title="Document Chatbot (RAG)", layout="wide")
st.title("📄 Document Chatbot (RAG) — Streamlit")
st.caption("Upload multiple PDF/DOCX → Build Index → Ask questions → Answers from docs only + citations")

# Sidebar settings display (optional)
with st.sidebar:
    st.header("Settings")
    st.write(f"Chunk size: {CHUNK_TOKENS} tokens")
    st.write(f"Overlap: {CHUNK_OVERLAP} tokens")
    st.write(f"Top-K retrieval: {TOP_K}")
    # Drops and recreates the Chroma collection, discarding all indexed chunks.
    if st.button("🧹 Clear Index"):
        reset_collection()
        st.success("Index cleared.")

# Ensure folders exist
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs("./data", exist_ok=True)

# Upload
st.subheader("📤 Upload Documents")
uploaded_files = st.file_uploader(
    "Upload PDF/DOCX files",
    type=["pdf", "docx"],
    accept_multiple_files=True
)

# Build Index
if st.button("✅ Build Index"):
    if not uploaded_files:
        st.warning("Please upload at least one document.")
    else:
        with st.spinner("Indexing documents..."):
            # Parallel lists fed to the vector store: chunk text, its
            # source metadata, and a unique id per (file, page, chunk).
            documents, metadatas, ids = [], [], []

            for f in uploaded_files:
                # Persist the upload to disk; parsers read from a path.
                save_path = os.path.join(UPLOAD_DIR, f.name)
                with open(save_path, "wb") as out:
                    out.write(f.getbuffer())

                # Dispatch by extension; unknown types are skipped silently.
                if f.name.lower().endswith(".pdf"):
                    pages = read_pdf(save_path)
                elif f.name.lower().endswith(".docx"):
                    pages = read_docx(save_path)
                else:
                    continue

                # Chunk each page; the id encodes file, page and chunk index
                # so re-indexing the same file reuses the same ids.
                for page, text in pages:
                    for i, chunk in enumerate(chunk_text(text)):
                        documents.append(chunk)
                        metadatas.append({"file": f.name, "page": page})
                        ids.append(f"{f.name}_p{page}_c{i}")

            if not documents:
                st.error("No text could be extracted. If PDF is scanned, OCR is needed.")
            else:
                vectors = embed_texts(documents)
                add_documents(documents, vectors, metadatas, ids)
                st.success(f"✅ Indexed {len(documents)} chunks from {len(uploaded_files)} file(s).")

st.divider()

# Ask
st.subheader("💬 Ask a question")
question = st.text_input("Type your question")

if st.button("Ask"):
    if not question.strip():
        st.warning("Please enter a question.")
    else:
        with st.spinner("Thinking..."):
            # Broad catch is deliberate at this UI boundary: any failure
            # (missing API key, empty index, network) is surfaced to the user.
            try:
                answer, citations = answer_question(question)
                st.markdown("### ✅ Answer")
                st.write(answer)

                st.markdown("### 📌 Citations (retrieved)")
                for c in citations:
                    st.write(c)
            except Exception as e:
                st.error(str(e))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
# Runtime dependencies for the RAG document chatbot.
# NOTE: versions are unpinned — latest releases are installed at build time;
# pin versions for reproducible deployments.
streamlit
openai
chromadb
pypdf
python-docx
tiktoken
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (121 Bytes). View file
 
src/__pycache__/config.cpython-312.pyc ADDED
Binary file (419 Bytes). View file
 
src/__pycache__/vectorstore.cpython-312.pyc ADDED
Binary file (2.19 kB). View file
 
src/chunking.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import List
3
+ import tiktoken
4
+ from src.config import TOKEN_ENCODING, CHUNK_TOKENS, CHUNK_OVERLAP
5
+
6
+ _enc = tiktoken.get_encoding(TOKEN_ENCODING)
7
+
8
def chunk_text(text: str, chunk_tokens: int = CHUNK_TOKENS, overlap_tokens: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into token-based chunks with a sliding-window overlap.

    Args:
        text: Raw text to split.
        chunk_tokens: Maximum tokens per chunk; must be positive.
        overlap_tokens: Tokens shared between consecutive chunks; clamped
            into ``[0, chunk_tokens - 1]`` so the window always advances.

    Returns:
        List of non-empty, stripped chunk strings ([] for empty input).

    Raises:
        ValueError: If ``chunk_tokens`` is not positive.
    """
    if chunk_tokens <= 0:
        raise ValueError("chunk_tokens must be positive")
    # An overlap >= chunk size would make `start` stop advancing and spin
    # forever; clamp so each iteration moves forward by at least one token.
    if overlap_tokens >= chunk_tokens:
        overlap_tokens = chunk_tokens - 1
    if overlap_tokens < 0:
        overlap_tokens = 0

    tokens = _enc.encode(text)
    chunks: List[str] = []
    start = 0

    while start < len(tokens):
        end = min(start + chunk_tokens, len(tokens))
        chunk = _enc.decode(tokens[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        if end == len(tokens):
            break
        # Step back by the overlap so adjacent chunks share context.
        start = end - overlap_tokens

    return chunks
src/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# --- OpenAI model selection ---
EMBED_MODEL = "text-embedding-3-small"  # embedding model for chunks/queries
CHAT_MODEL = "gpt-3.5-turbo"            # model used to generate answers

# --- Chunking & retrieval knobs ---
CHUNK_TOKENS = 900    # max tokens per chunk
CHUNK_OVERLAP = 150   # tokens shared between consecutive chunks
TOP_K = 6             # chunks retrieved per question

# --- Persistence locations ---
COLLECTION_NAME = "docs"
CHROMA_DIR = "./data/chroma"
UPLOAD_DIR = "./data/uploads"

# tiktoken encoding used to measure chunk sizes
TOKEN_ENCODING = "cl100k_base"
src/embeddings.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from typing import List
4
+ from src.openai_client import get_client
5
+ from src.config import EMBED_MODEL
6
+
7
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured OpenAI embedding model.

    Args:
        texts: Chunk strings to embed.

    Returns:
        One embedding vector per input string, in input order. Returns []
        for an empty input without calling the API (an empty batch would
        otherwise raise an API error).
    """
    if not texts:
        return []
    client = get_client()
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    return [d.embedding for d in resp.data]
src/openai_client.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import os
4
+ from openai import OpenAI
5
+
6
def get_client() -> OpenAI:
    """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

    Raises:
        RuntimeError: If the key is not present in the environment.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        return OpenAI(api_key=api_key)
    raise RuntimeError("OPENAI_API_KEY not set. Add it in Hugging Face Space -> Settings -> Secrets.")
src/parsers.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import List, Tuple
3
+ from pypdf import PdfReader
4
+ from docx import Document
5
+
6
def read_pdf(path: str) -> List[Tuple[int, str]]:
    """Extract text from a PDF, one entry per page.

    Returns (1-based page number, stripped text) pairs; pages that yield
    no extractable text are omitted.
    """
    extracted = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        content = (page.extract_text() or "").strip()
        if content:
            extracted.append((page_no, content))
    return extracted
14
+
15
def read_docx(path: str) -> List[Tuple[int, str]]:
    """Read a DOCX file as a single "page" of newline-joined paragraphs.

    Blank paragraphs are dropped. Returns [] when the document has no text;
    otherwise a one-element list [(1, text)] (DOCX has no page structure).
    """
    non_blank = [p.text for p in Document(path).paragraphs if p.text.strip()]
    combined = "\n".join(non_blank).strip()
    if not combined:
        return []
    return [(1, combined)]
src/rag.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+ from typing import List, Tuple
5
+ from src.embeddings import embed_texts
6
+ from src.vectorstore import query_by_embedding
7
+ from src.openai_client import get_client
8
+ from src.config import CHAT_MODEL, TOP_K
9
+
10
+
11
+
12
def retrieve_context(question: str, top_k: int = TOP_K) -> Tuple[str, List[str]]:
    """Embed the question and fetch the top-k most similar chunks.

    Returns:
        (context, citations): ``context`` is the retrieved chunks joined by
        "---" dividers, each prefixed with a numbered source header;
        ``citations`` is the parallel list of "[n] file (page p)" labels.
    """
    query_vector = embed_texts([question])[0]
    chunks, metadata = query_by_embedding(query_vector, top_k=top_k)

    blocks: List[str] = []
    labels: List[str] = []
    for rank, (chunk, meta) in enumerate(zip(chunks, metadata), start=1):
        labels.append(f"[{rank}] {meta.get('file')} (page {meta.get('page')})")
        blocks.append(
            f"Source {rank}: {meta.get('file')} (page {meta.get('page')})\n{chunk}"
        )

    return "\n\n---\n\n".join(blocks), labels
26
+
27
def answer_question(question: str) -> Tuple[str, List[str]]:
    """Answer *question* using only the retrieved document context.

    Returns:
        (answer, citations): the model's answer text, plus the citation
        labels of every retrieved chunk (all retrieved, not only those
        the model actually used).
    """
    context, citations = retrieve_context(question, top_k=TOP_K)

    # Grounding prompt: restricts the model to the retrieved SOURCES and
    # gives it an explicit "I don't know" escape hatch.
    prompt = f"""
You are a document assistant.
Answer ONLY using the SOURCES below.
If the answer is not in the sources, say: "I don't know from the uploaded documents."

SOURCES:
{context}

QUESTION:
{question}

Return:
1) Answer (clear & concise)
2) Sources used (numbers only)
"""

    client = get_client()
    # NOTE(review): this calls the Responses API with CHAT_MODEL
    # ("gpt-3.5-turbo") — confirm that model is accepted by
    # `client.responses.create`; chat-completions-era models may not be.
    resp = client.responses.create(model=CHAT_MODEL, input=prompt)
    return resp.output_text.strip(), citations
src/vectorstore.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import os
4
+ from typing import List, Dict, Any, Tuple
5
+ import chromadb
6
+
7
+ from src.config import CHROMA_DIR, COLLECTION_NAME
8
+
9
# ---------------- COLLECTION ----------------
def get_collection():
    """Open (creating if needed) the persistent Chroma collection."""
    os.makedirs(CHROMA_DIR, exist_ok=True)
    store = chromadb.PersistentClient(path=CHROMA_DIR)
    return store.get_or_create_collection(COLLECTION_NAME)
14
+
15
# ---------------- ADD DOCUMENTS ----------------
def add_documents(
    docs: List[str],
    embeddings: List[List[float]],
    metadatas: List[Dict[str, Any]],
    ids: List[str]
) -> None:
    """Store pre-embedded chunks in the collection.

    All four lists are parallel (same length); ``ids`` must be unique
    within the collection.
    """
    get_collection().add(
        documents=docs,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
    )
29
+
30
# ---------------- QUERY ----------------
def query_by_embedding(
    q_embedding: List[float],
    top_k: int
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """Return the ``top_k`` nearest chunks to *q_embedding*.

    Returns:
        (documents, metadatas): parallel lists for the single query,
        ordered by similarity.
    """
    result = get_collection().query(
        query_embeddings=[q_embedding],
        n_results=top_k,
        include=["documents", "metadatas"],
    )
    # Chroma returns one result list per query embedding; we sent one.
    docs = result["documents"][0]
    metas = result["metadatas"][0]
    return docs, metas
42
+
43
# ---------------- RESET ----------------
def reset_collection() -> None:
    """Delete and recreate the collection, discarding every indexed chunk."""
    os.makedirs(CHROMA_DIR, exist_ok=True)
    store = chromadb.PersistentClient(path=CHROMA_DIR)
    try:
        store.delete_collection(COLLECTION_NAME)
    except Exception:
        # Nothing to delete on first run / already-cleared index — best effort.
        pass
    store.get_or_create_collection(COLLECTION_NAME)