import streamlit as st import chromadb from sentence_transformers import SentenceTransformer import fitz # PyMuPDF import os import requests import re import hashlib # ─── Page Config ────────────────────────────────────────────────────────────── st.set_page_config( page_title="PDF RAG · Upload & Ask", page_icon="📂", layout="wide", initial_sidebar_state="expanded" ) # ─── CSS ────────────────────────────────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True) # ─── Session State ──────────────────────────────────────────────────────────── if "indexed_files" not in st.session_state: st.session_state.indexed_files = {} # filename → {chunks, pages, size} if "chroma_collection" not in st.session_state: st.session_state.chroma_collection = None if "chroma_client" not in st.session_state: st.session_state.chroma_client = None if "total_chunks" not in st.session_state: st.session_state.total_chunks = 0 # ─── Load embedding model (cached globally) ─────────────────────────────────── @st.cache_resource(show_spinner=False) def load_embed_model(): return SentenceTransformer('all-MiniLM-L6-v2') # ─── PDF Extraction ─────────────────────────────────────────────────────────── def extract_text_from_pdf(pdf_bytes: bytes) -> list[dict]: """Returns list of {page, text} dicts.""" doc = fitz.open(stream=pdf_bytes, filetype="pdf") pages = [] for page_num, page in enumerate(doc, start=1): text = page.get_text("text").strip() if text: pages.append({"page": page_num, "text": text}) doc.close() return pages # ─── Chunking ───────────────────────────────────────────────────────────────── def chunk_text(pages: list[dict], chunk_size: int = 400, overlap: int = 60) -> list[dict]: """Splits page text into overlapping word-based chunks.""" chunks = [] for p in pages: words = p["text"].split() start = 0 while start < len(words): end = start + chunk_size chunk_words = words[start:end] chunk_text_str = " ".join(chunk_words).strip() if len(chunk_text_str) > 60: chunks.append({"text": chunk_text_str, "page": p["page"]}) start += chunk_size - overlap return chunks # ─── Index PDF into ChromaDB ────────────────────────────────────────────────── def index_pdf(filename: str, pdf_bytes: bytes, embed_model): # Init or reuse ChromaDB if st.session_state.chroma_client is None: st.session_state.chroma_client = chromadb.Client() st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection( name="pdf_rag", metadata={"hnsw:space": "cosine"} ) collection = st.session_state.chroma_collection # Extract & chunk pages = extract_text_from_pdf(pdf_bytes) chunks = chunk_text(pages) if not chunks: return 0, 0 # Embed & add texts = [c["text"] for c in chunks] embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist() ids, docs, metas, embeds = [], [], [], [] for i, (chunk, emb) in enumerate(zip(chunks, embeddings)): chunk_id = f"{hashlib.md5(filename.encode()).hexdigest()[:8]}_chunk_{i}" ids.append(chunk_id) docs.append(chunk["text"]) metas.append({"filename": filename, "page": chunk["page"]}) embeds.append(emb) collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas) st.session_state.indexed_files[filename] = { "chunks": len(chunks), "pages": len(pages), "size_kb": round(len(pdf_bytes) / 1024, 1) } st.session_state.total_chunks += len(chunks) return len(chunks), len(pages) # ─── RAG Query ──────────────────────────────────────────────────────────────── def rag_query(question: str, embed_model, top_k: int, api_key: str): collection = st.session_state.chroma_collection q_emb = embed_model.encode(question).tolist() results = collection.query(query_embeddings=[q_emb], n_results=top_k) chunks = [] for i in range(len(results["documents"][0])): distance = results["distances"][0][i] chunks.append({ "text": results["documents"][0][i], "filename": results["metadatas"][0][i]["filename"], "page": results["metadatas"][0][i]["page"], "relevance": round((1 - distance) * 100, 1), }) context = "\n\n".join([ f"[Source: {c['filename']}, Page {c['page']}]\n{c['text']}" for c in chunks ]) prompt = f"""You are a helpful assistant. Answer the user's question using ONLY the document context provided below. Be concise and clear. Always mention the source filename and page number when referencing specific information. If the answer cannot be found in the provided context, say "I couldn't find that information in the uploaded documents." Document Context: {context} Question: {question} Answer:""" headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} payload = { "model": "llama-3.3-70b-versatile", "messages": [{"role": "user", "content": prompt}], "max_tokens": 600, "temperature": 0.2, } r = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload, timeout=30) r.raise_for_status() answer = r.json()["choices"][0]["message"]["content"] return answer, chunks # ─── Determine current phase ────────────────────────────────────────────────── has_docs = len(st.session_state.indexed_files) > 0 phase = 1 if not has_docs else 2 # ─── Sidebar ────────────────────────────────────────────────────────────────── with st.sidebar: st.markdown("## 📂 PDF RAG Demo") st.markdown("
Upload any PDF documents · They get extracted, chunked, embedded, and indexed · Then ask questions across all of them
No documents uploaded yet
Upload one or more PDF files above to get started.
Any topic works — reports, manuals, research papers, policies.