Spaces:
Sleeping
Sleeping
# Swap the stdlib sqlite3 module for pysqlite3. This stays ahead of every
# other import so that later imports of sqlite3 resolve to the replacement.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import html
import os
import tempfile
import time
import uuid

import chromadb
import numpy as np
import streamlit as st
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
# Browser-tab metadata and overall layout, kept as a mapping so the settings
# read as data and are handed to Streamlit in a single call.
_PAGE_SETTINGS = {
    "page_title": "RAG Research Assistant",
    "page_icon": "⬑",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# Dark theme for the whole app. The stylesheet lives in a named constant and
# is injected once through st.markdown with raw HTML enabled.
_THEME_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Inter:wght@300;400;600;800&display=swap');
* { font-family: 'Inter', sans-serif; }
html, body, [data-testid="stAppViewContainer"] {
background: #080C10;
color: #E2E8F0;
}
[data-testid="stSidebar"] {
background: #0D1117 !important;
border-right: 1px solid #1E2D40;
}
.rag-header { padding: 2rem 0 1rem 0; border-bottom: 1px solid #1E2D40; margin-bottom: 2rem; }
.rag-title { font-size: 2.2rem; font-weight: 800; letter-spacing: -0.03em; color: #F1F5F9; margin: 0; }
.rag-title span { color: #38BDF8; }
.rag-sub { font-size: 0.85rem; color: #64748B; margin-top: 0.3rem; font-family: 'JetBrains Mono', monospace; }
.log-terminal {
background: #0D1117; border: 1px solid #1E2D40; border-radius: 8px;
padding: 1rem 1.2rem; font-family: 'JetBrains Mono', monospace;
font-size: 0.78rem; color: #94A3B8; min-height: 80px; max-height: 200px; overflow-y: auto;
}
.log-line { margin: 2px 0; }
.log-ok { color: #34D399; }
.log-info { color: #38BDF8; }
.log-warn { color: #FBBF24; }
.log-dim { color: #475569; }
.answer-card {
background: #0D1117; border: 1px solid #1E2D40; border-left: 3px solid #38BDF8;
border-radius: 8px; padding: 1.2rem 1.5rem; margin: 1rem 0;
line-height: 1.7; color: #CBD5E1;
}
.source-tag {
display: inline-block; background: #1E2D40; border: 1px solid #2D3F55;
border-radius: 4px; padding: 3px 10px; font-size: 0.75rem;
font-family: 'JetBrains Mono', monospace; color: #94A3B8; margin: 3px 4px 3px 0;
}
.score-row { display: flex; gap: 12px; margin-top: 1rem; }
.score-card {
flex: 1; background: #0D1117; border: 1px solid #1E2D40;
border-radius: 8px; padding: 1rem; text-align: center;
}
.score-label {
font-size: 0.7rem; color: #64748B; font-family: 'JetBrains Mono', monospace;
text-transform: uppercase; letter-spacing: 0.08em; margin-bottom: 0.4rem;
}
.score-value { font-size: 1.8rem; font-weight: 800; font-family: 'JetBrains Mono', monospace; }
.score-high { color: #34D399; }
.score-mid { color: #FBBF24; }
.score-low { color: #F87171; }
.stTextInput input, .stTextArea textarea {
background: #0D1117 !important; border: 1px solid #1E2D40 !important;
color: #E2E8F0 !important; border-radius: 6px !important;
font-family: 'JetBrains Mono', monospace !important;
}
.stTextInput input:focus, .stTextArea textarea:focus {
border-color: #38BDF8 !important; box-shadow: 0 0 0 1px #38BDF8 !important;
}
.stButton button {
background: #38BDF8 !important; color: #080C10 !important;
border: none !important; border-radius: 6px !important;
font-weight: 700 !important; font-family: 'JetBrains Mono', monospace !important;
letter-spacing: 0.05em !important; padding: 0.5rem 1.5rem !important;
}
.stButton button:hover { background: #7DD3FC !important; }
hr { border-color: #1E2D40 !important; }
.sidebar-label {
font-size: 0.7rem; color: #475569; font-family: 'JetBrains Mono', monospace;
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 0.3rem;
}
</style>
"""
st.markdown(_THEME_CSS, unsafe_allow_html=True)
# ── Sidebar ───────────────────────────────────────────────────
# Sidebar: collects the API key, the Gemini model name and the retrieval
# settings (top_k, chunk_size) that the main page reads further down.
with st.sidebar:
    st.markdown('<p class="sidebar-label">⬑ RAG Research Assistant</p>', unsafe_allow_html=True)
    st.markdown("---")

    # API key: typed in by the user, or looked up from Colab secrets / .env.
    st.markdown('<p class="sidebar-label">API Configuration</p>', unsafe_allow_html=True)
    use_own_key = st.toggle("Use my own API key", value=False)
    if use_own_key:
        api_key = st.text_input(
            "Google AI API Key",
            type="password",
            placeholder="AIza..",
            help="Get free key at aistudio.google.com"
        )
    else:
        try:
            from google.colab import userdata
            DEFAULT_API_KEY = userdata.get('GOOGLE_API_KEY')
        except Exception:
            # Any failure of the Colab lookup falls back to the environment.
            # Exception (rather than a bare except) leaves KeyboardInterrupt
            # and SystemExit alone.
            from dotenv import load_dotenv
            load_dotenv()
            DEFAULT_API_KEY = os.getenv("GOOGLE_API_KEY", "")
        api_key = DEFAULT_API_KEY
        if api_key:
            st.markdown(
                '<p style="color:#34D399;font-size:0.75rem;font-family:JetBrains Mono">β Using default API key</p>',
                unsafe_allow_html=True
            )
        else:
            st.markdown(
                '<p style="color:#F87171;font-size:0.75rem;font-family:JetBrains Mono">β No API key found. Add it in sidebar or .env</p>',
                unsafe_allow_html=True
            )
    st.markdown("---")

    # Model: free-text entry or a pick from the preset list.
    st.markdown('<p class="sidebar-label">Model Settings</p>', unsafe_allow_html=True)
    use_custom_model = st.toggle("Use custom model", value=False)
    if use_custom_model:
        model_choice = st.text_input(
            "Model name",
            placeholder="gemini-1.5-pro, gemini-1.5-flash, gemini-3.1-flash-lite-preview...",
            help="Enter exact model string from Google AI Studio"
        )
        st.markdown(
            '<p style="color:#475569;font-size:0.72rem;font-family:JetBrains Mono">Find model names at aistudio.google.com</p>',
            unsafe_allow_html=True
        )
    else:
        model_choice = st.selectbox(
            "Gemini Model",
            ["gemini-1.5-flash", "gemini-1.5-pro", "gemini-3.1-flash-lite-preview"],
            index=0
        )

    top_k = st.slider("Chunks to retrieve (k)", 3, 8, 5)
    # The value is passed as chunk_size to RecursiveCharacterTextSplitter,
    # which measures length in characters by default, so the label says so.
    chunk_size = st.slider("Chunk size (characters)", 256, 1024, 512, step=128)
    st.markdown("---")

    st.markdown('<p class="sidebar-label">About</p>', unsafe_allow_html=True)
    st.markdown("""
<p style="font-size:0.75rem;color:#475569;line-height:1.6">
Multi-document RAG with<br>
semantic retrieval, source<br>
citations & quality evaluation.<br><br>
Built by <span style="color:#38BDF8">Aneeb Naqvi</span>
</p>
""", unsafe_allow_html=True)
# ── Header ────────────────────────────────────────────────────
# Page banner: app title plus a one-line summary. The separators in the
# summary are middle dots; the mis-decoded "Β·" sequences are restored.
st.markdown("""
<div class="rag-header">
<h1 class="rag-title">Research <span>Assistant</span></h1>
<p class="rag-sub">// semantic search · source citations · retrieval evaluation</p>
</div>
""", unsafe_allow_html=True)
# ── Model loader ──────────────────────────────────────────────
@st.cache_resource
def load_embedding_model():
    """Return the ``all-MiniLM-L6-v2`` embedding model.

    The loader is called at script level, and Streamlit re-executes the
    script on every interaction; ``st.cache_resource`` keeps one instance
    alive instead of rebuilding the model on each rerun.

    Returns:
        A ``HuggingFaceEmbeddings`` instance placed on CUDA when
        ``torch.cuda.is_available()`` is true, otherwise on the CPU.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )
def get_llm(api_key, model):
    """Create the Gemini chat client used to answer questions.

    Args:
        api_key: Google AI API key forwarded as ``google_api_key``.
        model: Model name forwarded as ``model``.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance with temperature 0.3.
    """
    client_settings = {
        "model": model,
        "google_api_key": api_key,
        "temperature": 0.3,
    }
    return ChatGoogleGenerativeAI(**client_settings)
# ── Processing ────────────────────────────────────────────────
def process_pdfs(uploaded_files, embedding_model, chunk_size, log_placeholder):
    """Load the uploaded PDFs, split them into chunks and index them in Chroma.

    Each upload is written to a temporary file so ``PyPDFLoader`` can read it;
    the temporary file is removed again once its pages are loaded. Progress
    messages are rendered into ``log_placeholder`` as an HTML log.

    Args:
        uploaded_files: Iterable of uploaded PDFs exposing ``name`` and
            ``read()``.
        embedding_model: Embedding object handed to ``Chroma.from_documents``.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
        log_placeholder: Object whose ``markdown`` method receives the full
            log markup after every message.

    Returns:
        The Chroma vector store holding every chunk of every file.
    """
    all_chunks = []
    logs = []
    level_classes = {"ok": "log-ok", "info": "log-info", "warn": "log-warn", "dim": "log-dim"}

    def update_log(msg, level="info"):
        tag = level_classes.get(level, "log-info")
        # Messages embed user-supplied file names, so escape them before they
        # are placed into markup rendered with unsafe_allow_html.
        logs.append(f'<div class="log-line {tag}">{html.escape(msg)}</div>')
        log_placeholder.markdown(
            f'<div class="log-terminal">{"".join(logs)}</div>',
            unsafe_allow_html=True
        )

    # The splitter settings do not depend on the file, so build it once.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=50)

    update_log("// initializing document pipeline", "dim")
    time.sleep(0.3)
    for uploaded_file in uploaded_files:
        update_log(f"β loading [{uploaded_file.name}]", "info")
        time.sleep(0.2)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
            f.write(uploaded_file.read())
            temp_path = f.name
        try:
            documents = PyPDFLoader(temp_path).load()
        finally:
            # delete=False leaves the file on disk; remove it ourselves.
            os.unlink(temp_path)
        # Record the uploaded file name as the source so citations show it
        # instead of the temporary path the loader read from.
        for document in documents:
            document.metadata["source"] = uploaded_file.name
        update_log(f" pages extracted: {len(documents)}", "dim")
        chunks = splitter.split_documents(documents)
        all_chunks.extend(chunks)
        update_log(f" chunks created: {len(chunks)}", "dim")

    update_log(f"β embedding {len(all_chunks)} chunks into vector space", "info")
    time.sleep(0.3)
    client = chromadb.EphemeralClient()
    # A fresh collection name per call keeps chunks from an earlier upload out
    # of this index should the in-memory client be shared within the process.
    vectorstore = Chroma.from_documents(
        documents=all_chunks,
        embedding=embedding_model,
        client=client,
        collection_name=f"rag_docs_{uuid.uuid4().hex}"
    )
    update_log(f"β vectorstore ready β {vectorstore._collection.count()} vectors indexed", "ok")
    update_log("// system ready for queries", "dim")
    return vectorstore
# ── RAG pipeline ──────────────────────────────────────────────
def _content_to_text(content):
    """Flatten a model response's ``content`` into a single string.

    Non-list content is returned unchanged. For a list, plain strings and
    dict blocks whose ``type`` is ``'text'`` are joined with single spaces;
    every other entry is ignored.
    """
    if not isinstance(content, list):
        return content
    parts = []
    for block in content:
        if isinstance(block, str):
            parts.append(block)
        elif isinstance(block, dict) and block.get('type') == 'text':
            parts.append(block['text'])
    return " ".join(parts)


def answer_question(query, vectorstore, llm, k):
    """Retrieve the top ``k`` chunks for ``query`` and ask the model to answer.

    Args:
        query: The user's question.
        vectorstore: Object exposing ``similarity_search(query, k=k)`` that
            returns documents with ``page_content`` and ``metadata``.
        llm: Object exposing ``invoke(prompt)`` whose result has ``content``.
        k: Number of chunks to retrieve.

    Returns:
        A dict with ``answer`` (the response text), ``sources`` (one
        ``{"source", "page"}`` dict per chunk; the ``page`` metadata value
        plus one, with a missing value treated as 0) and ``contexts`` (the
        raw chunk texts in retrieval order).
    """
    retrieved_docs = vectorstore.similarity_search(query, k=k)
    contexts = [doc.page_content for doc in retrieved_docs]
    sources = [
        {
            "source": doc.metadata.get('source', 'unknown'),
            "page": doc.metadata.get('page', 0) + 1
        }
        for doc in retrieved_docs
    ]
    # Every chunk is followed by a blank line, including the last one.
    context = "".join(f"{text}\n\n" for text in contexts)
    prompt = f"""Answer the question based only on the context below.
Be specific and detailed. If not in context, say "I don't know".
Context:
{context}
Question: {query}
Answer:"""
    response = llm.invoke(prompt)
    answer = _content_to_text(response.content)
    return {"answer": answer, "sources": sources, "contexts": contexts}
# ── Evaluator ─────────────────────────────────────────────────
def evaluate_rag(query, result, embedding_model):
    """Score an answer with three lightweight heuristics.

    Words are the distinct lowercase whitespace-separated tokens of the text.

    Args:
        query: The user's question.
        result: Dict with ``answer`` (str) and ``contexts`` (list of str).
        embedding_model: Object exposing ``embed_query`` and
            ``embed_documents``.

    Returns:
        A dict of values rounded to two decimals:
        ``grounding`` is the share of answer words found in the contexts,
        ``relevance`` is the mean cosine similarity between the query
        embedding and each context embedding, and ``completeness`` is the
        share of context words found in the answer. A metric whose
        denominator would be empty is reported as 0.0.
    """
    contexts = result['contexts']
    answer_words = set(result['answer'].lower().split())
    context_words = set(" ".join(contexts).lower().split())
    overlap = len(answer_words & context_words)

    grounding = overlap / len(answer_words) if answer_words else 0.0
    completeness = overlap / len(context_words) if context_words else 0.0

    if contexts:
        query_vec = embedding_model.embed_query(query)
        chunk_vecs = embedding_model.embed_documents(contexts)
        sims = cosine_similarity([query_vec], chunk_vecs)[0]
        retrieval_relevance = float(np.mean(sims))
    else:
        # Nothing was retrieved, so there is nothing to compare the query to.
        retrieval_relevance = 0.0

    return {
        "grounding": round(grounding, 2),
        "relevance": round(retrieval_relevance, 2),
        "completeness": round(completeness, 2)
    }
def score_color(val):
    """Return the CSS class used to colour a score value.

    Scores of at least 0.7 map to ``score-high``, scores of at least 0.4 to
    ``score-mid`` and everything else to ``score-low``.
    """
    bands = ((0.7, "score-high"), (0.4, "score-mid"))
    for lower_bound, css_class in bands:
        if val >= lower_bound:
            return css_class
    return "score-low"
# ── Main UI ───────────────────────────────────────────────────
# Two-column page: the left column uploads and indexes PDFs, the right column
# takes a question and shows the answer, its sources and the quality scores.
embedding_model = load_embedding_model()
col_upload, col_query = st.columns([1, 1], gap="large")

with col_upload:
    st.markdown('<p class="sidebar-label">01 / Upload Documents</p>', unsafe_allow_html=True)
    uploaded_files = st.file_uploader(
        "Drop PDF files here",
        type="pdf",
        accept_multiple_files=True,
        label_visibility="collapsed"
    )
    log_placeholder = st.empty()
    log_placeholder.markdown(
        '<div class="log-terminal"><div class="log-line log-dim">// awaiting documents...</div></div>',
        unsafe_allow_html=True
    )
    # The button is only rendered once at least one file has been uploaded.
    if uploaded_files and st.button("⬑ Process Documents", use_container_width=True):
        if not api_key:
            st.error("Add your API key in the sidebar first.")
        elif use_custom_model and not model_choice:
            st.error("Enter a model name in the sidebar.")
        else:
            vectorstore = process_pdfs(
                uploaded_files, embedding_model, chunk_size, log_placeholder
            )
            # Kept in session state so the index and client survive reruns.
            st.session_state.vectorstore = vectorstore
            st.session_state.llm = get_llm(api_key, model_choice)

with col_query:
    st.markdown('<p class="sidebar-label">02 / Ask a Question</p>', unsafe_allow_html=True)
    query = st.text_input(
        "Query",
        placeholder="What does this document say about...",
        label_visibility="collapsed"
    )
    if query and "vectorstore" in st.session_state:
        with st.spinner(""):
            result = answer_question(query, st.session_state.vectorstore, st.session_state.llm, top_k)
            scores = evaluate_rag(query, result, embedding_model)

        st.markdown('<p class="sidebar-label">Answer</p>', unsafe_allow_html=True)
        # The answer is model output derived from uploaded documents; escape
        # it so stray markup cannot break out of the answer card.
        safe_answer = html.escape(result["answer"])
        st.markdown(f'<div class="answer-card">{safe_answer}</div>', unsafe_allow_html=True)

        st.markdown('<p class="sidebar-label" style="margin-top:1rem">Sources</p>', unsafe_allow_html=True)
        source_tags = []
        seen = set()
        for s in result['sources']:
            key = f"{s['source']}:p{s['page']}"
            if key in seen:
                continue  # show each (file, page) pair once
            seen.add(key)
            name = html.escape(os.path.basename(s['source']))
            source_tags.append(f'<span class="source-tag">π {name} · p{s["page"]}</span>')
        st.markdown("".join(source_tags), unsafe_allow_html=True)

        st.markdown('<p class="sidebar-label" style="margin-top:1.5rem">Retrieval Quality</p>', unsafe_allow_html=True)
        st.markdown(f"""
<div class="score-row">
<div class="score-card">
<div class="score-label">Grounding</div>
<div class="score-value {score_color(scores['grounding'])}">{scores['grounding']}</div>
</div>
<div class="score-card">
<div class="score-label">Relevance</div>
<div class="score-value {score_color(scores['relevance'])}">{scores['relevance']}</div>
</div>
<div class="score-card">
<div class="score-label">Completeness</div>
<div class="score-value {score_color(scores['completeness'])}">{scores['completeness']}</div>
</div>
</div>
""", unsafe_allow_html=True)
    elif query:
        # A question was typed but nothing has been indexed yet.
        st.warning("Upload and process documents first.")