Spaces:

aneeb15
/

multi-doc-rag-v2

Sleeping

App Files Files Community

multi-doc-rag-v2 / app.py

aneeb15

Add pysqlite3 hack to app.py

d99b1e5 verified 27 days ago

raw

history blame contribute delete

15.5 kB


	# Register pysqlite3 under the name 'sqlite3' in sys.modules so that every
	# later `import sqlite3` (including those done inside third-party packages)
	# resolves to pysqlite3. This must run before the imports below.
	# NOTE(review): presumably needed because Chroma wants a newer SQLite than
	# the host image ships — confirm against the deployment environment.
	__import__('pysqlite3')
	import sys
	sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

	import html
	import os
	import tempfile
	import time
	import uuid

	import chromadb
	import numpy as np
	import streamlit as st
	import torch
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import Chroma
	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from sklearn.metrics.pairwise import cosine_similarity

	# Browser-tab title/icon and the overall page layout; Streamlit requires
	# this to be configured before other UI elements are rendered.
	_page_settings = {
	    "page_title": "RAG Research Assistant",
	    "page_icon": "⬡",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_page_settings)

	# ── Global styles ─────────────────────────────────────────────
	# Injects one <style> block that themes the whole app (dark palette,
	# Inter / JetBrains Mono fonts) and defines the CSS classes used by the
	# HTML fragments rendered further down: .rag-header/.rag-title/.rag-sub,
	# .log-terminal/.log-*, .answer-card, .source-tag, .score-*, .sidebar-label.
	# unsafe_allow_html=True is required for the raw <style> tag to be emitted.
	st.markdown("""
	<style>
	@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Inter:wght@300;400;600;800&display=swap');

	* { font-family: 'Inter', sans-serif; }
	html, body, [data-testid="stAppViewContainer"] {
	background: #080C10;
	color: #E2E8F0;
	}
	[data-testid="stSidebar"] {
	background: #0D1117 !important;
	border-right: 1px solid #1E2D40;
	}
	.rag-header { padding: 2rem 0 1rem 0; border-bottom: 1px solid #1E2D40; margin-bottom: 2rem; }
	.rag-title { font-size: 2.2rem; font-weight: 800; letter-spacing: -0.03em; color: #F1F5F9; margin: 0; }
	.rag-title span { color: #38BDF8; }
	.rag-sub { font-size: 0.85rem; color: #64748B; margin-top: 0.3rem; font-family: 'JetBrains Mono', monospace; }
	.log-terminal {
	background: #0D1117; border: 1px solid #1E2D40; border-radius: 8px;
	padding: 1rem 1.2rem; font-family: 'JetBrains Mono', monospace;
	font-size: 0.78rem; color: #94A3B8; min-height: 80px; max-height: 200px; overflow-y: auto;
	}
	.log-line { margin: 2px 0; }
	.log-ok { color: #34D399; }
	.log-info { color: #38BDF8; }
	.log-warn { color: #FBBF24; }
	.log-dim { color: #475569; }
	.answer-card {
	background: #0D1117; border: 1px solid #1E2D40; border-left: 3px solid #38BDF8;
	border-radius: 8px; padding: 1.2rem 1.5rem; margin: 1rem 0;
	line-height: 1.7; color: #CBD5E1;
	}
	.source-tag {
	display: inline-block; background: #1E2D40; border: 1px solid #2D3F55;
	border-radius: 4px; padding: 3px 10px; font-size: 0.75rem;
	font-family: 'JetBrains Mono', monospace; color: #94A3B8; margin: 3px 4px 3px 0;
	}
	.score-row { display: flex; gap: 12px; margin-top: 1rem; }
	.score-card {
	flex: 1; background: #0D1117; border: 1px solid #1E2D40;
	border-radius: 8px; padding: 1rem; text-align: center;
	}
	.score-label {
	font-size: 0.7rem; color: #64748B; font-family: 'JetBrains Mono', monospace;
	text-transform: uppercase; letter-spacing: 0.08em; margin-bottom: 0.4rem;
	}
	.score-value { font-size: 1.8rem; font-weight: 800; font-family: 'JetBrains Mono', monospace; }
	.score-high { color: #34D399; }
	.score-mid { color: #FBBF24; }
	.score-low { color: #F87171; }
	.stTextInput input, .stTextArea textarea {
	background: #0D1117 !important; border: 1px solid #1E2D40 !important;
	color: #E2E8F0 !important; border-radius: 6px !important;
	font-family: 'JetBrains Mono', monospace !important;
	}
	.stTextInput input:focus, .stTextArea textarea:focus {
	border-color: #38BDF8 !important; box-shadow: 0 0 0 1px #38BDF8 !important;
	}
	.stButton button {
	background: #38BDF8 !important; color: #080C10 !important;
	border: none !important; border-radius: 6px !important;
	font-weight: 700 !important; font-family: 'JetBrains Mono', monospace !important;
	letter-spacing: 0.05em !important; padding: 0.5rem 1.5rem !important;
	}
	.stButton button:hover { background: #7DD3FC !important; }
	hr { border-color: #1E2D40 !important; }
	.sidebar-label {
	font-size: 0.7rem; color: #475569; font-family: 'JetBrains Mono', monospace;
	text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 0.3rem;
	}
	</style>
	""", unsafe_allow_html=True)

	# ── Sidebar ───────────────────────────────────────────────────
	# Sidebar: collects the API key, the Gemini model name and the retrieval
	# settings. Defines the module-level names read by the main UI further
	# down: api_key, use_custom_model, model_choice, top_k and chunk_size.
	with st.sidebar:
	    st.markdown('<p class="sidebar-label">⬡ RAG Research Assistant</p>', unsafe_allow_html=True)
	    st.markdown("---")

	    st.markdown('<p class="sidebar-label">API Configuration</p>', unsafe_allow_html=True)
	    use_own_key = st.toggle("Use my own API key", value=False)
	    if use_own_key:
	        # Strip whitespace that tends to tag along with a pasted key.
	        api_key = st.text_input(
	            "Google AI API Key",
	            type="password",
	            placeholder="AIza..",
	            help="Get free key at aistudio.google.com"
	        ).strip()
	    else:
	        try:
	            from google.colab import userdata
	            DEFAULT_API_KEY = userdata.get('GOOGLE_API_KEY')
	        except Exception:
	            # Not running in Colab (or the secret is unavailable there).
	            # `Exception` rather than a bare `except:` so KeyboardInterrupt
	            # and SystemExit are not swallowed.
	            try:
	                from dotenv import load_dotenv
	            except ImportError:
	                # python-dotenv is optional: without it we rely on the
	                # process environment alone instead of crashing.
	                pass
	            else:
	                load_dotenv()
	            DEFAULT_API_KEY = os.getenv("GOOGLE_API_KEY", "")
	        api_key = DEFAULT_API_KEY
	        if api_key:
	            st.markdown(
	                '<p style="color:#34D399;font-size:0.75rem;font-family:JetBrains Mono">✓ Using default API key</p>',
	                unsafe_allow_html=True
	            )
	        else:
	            st.markdown(
	                '<p style="color:#F87171;font-size:0.75rem;font-family:JetBrains Mono">⚠ No API key found. Add it in sidebar or .env</p>',
	                unsafe_allow_html=True
	            )

	    st.markdown("---")
	    st.markdown('<p class="sidebar-label">Model Settings</p>', unsafe_allow_html=True)

	    use_custom_model = st.toggle("Use custom model", value=False)
	    if use_custom_model:
	        # Stripped so a whitespace-only entry counts as "no model given".
	        model_choice = st.text_input(
	            "Model name",
	            placeholder="gemini-1.5-pro, gemini-1.5-flash, gemini-3.1-flash-lite-preview...",
	            help="Enter exact model string from Google AI Studio"
	        ).strip()
	        st.markdown(
	            '<p style="color:#475569;font-size:0.72rem;font-family:JetBrains Mono">Find model names at aistudio.google.com</p>',
	            unsafe_allow_html=True
	        )
	    else:
	        model_choice = st.selectbox(
	            "Gemini Model",
	            ["gemini-1.5-flash", "gemini-1.5-pro", "gemini-3.1-flash-lite-preview"],
	            index=0
	        )

	    top_k = st.slider("Chunks to retrieve (k)", 3, 8, 5)
	    # The value is passed as chunk_size to RecursiveCharacterTextSplitter,
	    # which measures length in characters by default — hence the label.
	    chunk_size = st.slider("Chunk size (characters)", 256, 1024, 512, step=128)

	    st.markdown("---")
	    st.markdown('<p class="sidebar-label">About</p>', unsafe_allow_html=True)
	    st.markdown("""
	<p style="font-size:0.75rem;color:#475569;line-height:1.6">
	Multi-document RAG with<br>
	semantic retrieval, source<br>
	citations & quality evaluation.<br><br>
	Built by <span style="color:#38BDF8">Aneeb Naqvi</span>
	</p>
	""", unsafe_allow_html=True)

	# ── Header ────────────────────────────────────────────────────
	# Title banner at the top of the main column; styled by the .rag-* classes.
	_header_html = """
	<div class="rag-header">
	<h1 class="rag-title">Research <span>Assistant</span></h1>
	<p class="rag-sub">// semantic search · source citations · retrieval evaluation</p>
	</div>
	"""
	st.markdown(_header_html, unsafe_allow_html=True)

	# ── Model loader ──────────────────────────────────────────────
	@st.cache_resource
	def load_embedding_model():
	    """Build the all-MiniLM-L6-v2 embedding model, on CUDA when available.

	    Wrapped in ``st.cache_resource`` so Streamlit caches the returned object.
	    """
	    if torch.cuda.is_available():
	        device = "cuda"
	    else:
	        device = "cpu"
	    return HuggingFaceEmbeddings(
	        model_name="all-MiniLM-L6-v2",
	        model_kwargs={"device": device},
	    )

	def get_llm(api_key, model):
	    """Return a ChatGoogleGenerativeAI client for ``model`` using ``api_key``.

	    The sampling temperature is fixed at 0.3.
	    """
	    llm_settings = {
	        "model": model,
	        "google_api_key": api_key,
	        "temperature": 0.3,
	    }
	    return ChatGoogleGenerativeAI(**llm_settings)

	# ── Processing ────────────────────────────────────────────────
	def process_pdfs(uploaded_files, embedding_model, chunk_size, log_placeholder):
	    """Load uploaded PDFs, split them into chunks and index them in Chroma.

	    Progress is streamed into ``log_placeholder`` as a small HTML "terminal".

	    Args:
	        uploaded_files: Iterable of uploaded file objects exposing ``.name``
	            and ``.getvalue()`` (as returned by ``st.file_uploader``).
	        embedding_model: Embedding object handed to ``Chroma.from_documents``.
	        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``
	            (chunk overlap is fixed at 50).
	        log_placeholder: Streamlit placeholder whose ``.markdown`` is called
	            with the accumulated log HTML after every message.

	    Returns:
	        The Chroma vector store holding every chunk of every uploaded file.
	    """
	    all_chunks = []
	    logs = []
	    css_by_level = {"ok": "log-ok", "info": "log-info", "warn": "log-warn", "dim": "log-dim"}

	    def update_log(msg, level="info"):
	        tag = css_by_level.get(level, "log-info")
	        # Messages embed user-controlled file names and are rendered with
	        # unsafe_allow_html, so escape them before building the markup.
	        logs.append(f'<div class="log-line {tag}">{html.escape(str(msg))}</div>')
	        log_placeholder.markdown(
	            f'<div class="log-terminal">{"".join(logs)}</div>',
	            unsafe_allow_html=True
	        )

	    update_log("// initializing document pipeline", "dim")
	    time.sleep(0.3)

	    # The splitter does not depend on the file, so build it once.
	    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=50)

	    for uploaded_file in uploaded_files:
	        update_log(f"→ loading [{uploaded_file.name}]", "info")
	        time.sleep(0.2)
	        # PyPDFLoader needs a filesystem path, so spill the upload to disk.
	        # getvalue() returns the whole buffer regardless of the current read
	        # position (read() yields nothing once the buffer has been consumed).
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
	            f.write(uploaded_file.getvalue())
	            temp_path = f.name
	        try:
	            documents = PyPDFLoader(temp_path).load()
	        finally:
	            # delete=False means nobody else will clean this file up.
	            os.remove(temp_path)
	        # The loader records the temp path as the source; replace it with
	        # the real upload name so citations are meaningful to the user.
	        for document in documents:
	            document.metadata["source"] = uploaded_file.name
	        update_log(f" pages extracted: {len(documents)}", "dim")
	        chunks = splitter.split_documents(documents)
	        all_chunks.extend(chunks)
	        update_log(f" chunks created: {len(chunks)}", "dim")

	    update_log(f"→ embedding {len(all_chunks)} chunks into vector space", "info")
	    time.sleep(0.3)

	    client = chromadb.EphemeralClient()
	    # Use a fresh collection per run: in-process ephemeral clients can share
	    # state, so reusing one fixed name would keep appending to the vectors
	    # of earlier runs and return duplicated / stale chunks.
	    vectorstore = Chroma.from_documents(
	        documents=all_chunks,
	        embedding=embedding_model,
	        client=client,
	        collection_name=f"rag_docs_{uuid.uuid4().hex}"
	    )
	    update_log(f"✓ vectorstore ready — {vectorstore._collection.count()} vectors indexed", "ok")
	    update_log("// system ready for queries", "dim")
	    return vectorstore

	# ── RAG pipeline ──────────────────────────────────────────────
	def answer_question(query, vectorstore, llm, k):
	    """Retrieve the top-``k`` chunks for ``query`` and ask the LLM to answer.

	    Args:
	        query: The user's question.
	        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
	            returns documents with ``page_content`` and ``metadata``.
	        llm: Object exposing ``invoke(prompt)`` whose result has ``.content``
	            (either a string or a list of content parts).
	        k: Number of chunks to retrieve.

	    Returns:
	        dict with ``"answer"`` (the response text), ``"sources"`` (one
	        ``{"source", "page"}`` dict per retrieved chunk, page made 1-based by
	        adding 1 to the stored value) and ``"contexts"`` (the chunk texts).
	    """
	    retrieved_docs = vectorstore.similarity_search(query, k=k)
	    contexts = [doc.page_content for doc in retrieved_docs]
	    sources = [
	        {
	            "source": doc.metadata.get('source', 'unknown'),
	            "page": doc.metadata.get('page', 0) + 1,
	        }
	        for doc in retrieved_docs
	    ]
	    # Every chunk is followed by a blank line, including the last one.
	    context = "".join(f"{text}\n\n" for text in contexts)

	    prompt = f"""Answer the question based only on the context below.
	Be specific and detailed. If not in context, say "I don't know".

	Context:
	{context}

	Question: {query}

	Answer:"""

	    response = llm.invoke(prompt)
	    content = response.content
	    if isinstance(content, list):
	        # List-form content may mix plain strings with typed dict blocks;
	        # keep the strings and the 'text' blocks, skip everything else.
	        parts = []
	        for block in content:
	            if isinstance(block, str):
	                parts.append(block)
	            elif isinstance(block, dict) and block.get('type') == 'text':
	                parts.append(block.get('text', ''))
	        answer = " ".join(parts)
	    else:
	        answer = content

	    return {"answer": answer, "sources": sources, "contexts": contexts}

	# ── Evaluator ─────────────────────────────────────────────────
	def evaluate_rag(query, result, embedding_model):
	    """Compute three heuristic quality scores for one RAG answer.

	    Args:
	        query: The user's question.
	        result: dict with ``'answer'`` (str) and ``'contexts'`` (list of str).
	        embedding_model: Object exposing ``embed_query`` and
	            ``embed_documents``; only used when there is at least one context.

	    Returns:
	        dict of values rounded to two decimals:
	        ``"grounding"``   – share of distinct answer words found in the contexts;
	        ``"relevance"``   – mean cosine similarity between the query embedding
	                            and each context embedding (0.0 with no contexts);
	        ``"completeness"`` – share of distinct context words found in the answer.
	        Words are lower-cased, whitespace-separated tokens.
	    """
	    contexts = result['contexts']
	    answer_words = set(result['answer'].lower().split())
	    context_words = set(" ".join(contexts).lower().split())
	    shared = len(answer_words & context_words)

	    grounding = shared / len(answer_words) if answer_words else 0
	    completeness = shared / len(context_words) if context_words else 0

	    if contexts:
	        query_vec = embedding_model.embed_query(query)
	        chunk_vecs = embedding_model.embed_documents(contexts)
	        sims = cosine_similarity([query_vec], chunk_vecs)[0]
	        retrieval_relevance = float(np.mean(sims))
	    else:
	        # Nothing was retrieved: there is nothing to compare against, and
	        # cosine_similarity would raise on an empty matrix.
	        retrieval_relevance = 0.0

	    return {
	        "grounding": round(grounding, 2),
	        "relevance": round(retrieval_relevance, 2),
	        "completeness": round(completeness, 2)
	    }

	def score_color(val):
	    """Map a score to its CSS class: >= 0.7 high, >= 0.4 mid, otherwise low."""
	    for floor, css_class in ((0.7, "score-high"), (0.4, "score-mid")):
	        if val >= floor:
	            return css_class
	    return "score-low"

	# ── Main UI ───────────────────────────────────────────────────
	# Two-column page body: the left column uploads and indexes PDFs, the right
	# column takes a question and renders the answer, its sources and scores.
	embedding_model = load_embedding_model()

	col_upload, col_query = st.columns([1, 1], gap="large")

	with col_upload:
	    st.markdown('<p class="sidebar-label">01 / Upload Documents</p>', unsafe_allow_html=True)
	    uploaded_files = st.file_uploader(
	        "Drop PDF files here",
	        type="pdf",
	        accept_multiple_files=True,
	        label_visibility="collapsed"
	    )
	    log_placeholder = st.empty()
	    log_placeholder.markdown(
	        '<div class="log-terminal"><div class="log-line log-dim">// awaiting documents...</div></div>',
	        unsafe_allow_html=True
	    )
	    if uploaded_files and st.button("⬡ Process Documents", use_container_width=True):
	        if not api_key:
	            st.error("Add your API key in the sidebar first.")
	        elif use_custom_model and not model_choice:
	            st.error("Enter a model name in the sidebar.")
	        else:
	            # Kept in session_state so the index and client survive the
	            # script re-runs Streamlit performs on every interaction.
	            st.session_state.vectorstore = process_pdfs(
	                uploaded_files, embedding_model, chunk_size, log_placeholder
	            )
	            st.session_state.llm = get_llm(api_key, model_choice)

	with col_query:
	    st.markdown('<p class="sidebar-label">02 / Ask a Question</p>', unsafe_allow_html=True)
	    query = st.text_input(
	        "Query",
	        placeholder="What does this document say about...",
	        label_visibility="collapsed"
	    )

	    if query and "vectorstore" in st.session_state:
	        with st.spinner(""):
	            result = answer_question(query, st.session_state.vectorstore, st.session_state.llm, top_k)
	            scores = evaluate_rag(query, result, embedding_model)

	        st.markdown('<p class="sidebar-label">Answer</p>', unsafe_allow_html=True)
	        # The answer is model output derived from uploaded documents; escape
	        # it so it cannot inject markup into the unsafe_allow_html fragment.
	        safe_answer = html.escape(str(result["answer"]))
	        st.markdown(f'<div class="answer-card">{safe_answer}</div>', unsafe_allow_html=True)

	        st.markdown('<p class="sidebar-label" style="margin-top:1rem">Sources</p>', unsafe_allow_html=True)
	        source_tags = []
	        seen = set()
	        for s in result['sources']:
	            key = f"{s['source']}:p{s['page']}"
	            if key in seen:
	                continue
	            seen.add(key)
	            # basename handles the platform's path separator; the name is
	            # escaped because it comes from the uploaded file's metadata.
	            name = html.escape(os.path.basename(str(s['source'])))
	            source_tags.append(f'<span class="source-tag">📄 {name} · p{s["page"]}</span>')
	        st.markdown("".join(source_tags), unsafe_allow_html=True)

	        st.markdown('<p class="sidebar-label" style="margin-top:1.5rem">Retrieval Quality</p>', unsafe_allow_html=True)
	        st.markdown(f"""
	<div class="score-row">
	<div class="score-card">
	<div class="score-label">Grounding</div>
	<div class="score-value {score_color(scores['grounding'])}">{scores['grounding']}</div>
	</div>
	<div class="score-card">
	<div class="score-label">Relevance</div>
	<div class="score-value {score_color(scores['relevance'])}">{scores['relevance']}</div>
	</div>
	<div class="score-card">
	<div class="score-label">Completeness</div>
	<div class="score-value {score_color(scores['completeness'])}">{scores['completeness']}</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	    elif query:
	        # A question was typed but nothing has been indexed yet.
	        st.warning("Upload and process documents first.")