"""ResearchMind — a Streamlit RAG assistant.

Retrieves context from an uploaded PDF, DuckDuckGo web search and/or a small
built-in AI/ML knowledge base, then asks a Groq-hosted Llama model to answer
the user's question with numbered source citations.
"""

import hashlib
import html
import os
import re

import chromadb
import fitz  # PyMuPDF
import requests
import streamlit as st
from chromadb.utils import embedding_functions
from duckduckgo_search import DDGS
from groq import Groq
from sentence_transformers import SentenceTransformer

# ── Constants ─────────────────────────────────────────────────
GROQ_MODEL = "llama-3.1-8b-instant"
MODEL_LABEL = "Llama 3.1 8B Instant (Groq)"
PDF_COLLECTION = "pdf_collection"
KB_COLLECTION = "kb_collection"
KB_SOURCE = "AI/ML Knowledge Base"
MIN_CHUNK_CHARS = 100    # PDF chunks at or below this length are dropped
MAX_CONTEXT_CHUNKS = 6   # upper bound on chunks sent to the LLM

DEFAULT_PDF_SAMPLES = [
    "What is the main topic of this document?",
    "What are the key findings?",
    "What methodology was used?",
    "What are the conclusions?",
]
DEFAULT_SAMPLES = [
    "What is Retrieval Augmented Generation?",
    "Explain transformer architecture",
    "What are vector databases used for?",
    "How does fine-tuning work?",
]

# ── Page config ──────────────────────────────────────────────
st.set_page_config(
    page_title="ResearchMind — RAG Assistant",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ── Styling ──────────────────────────────────────────────────
st.markdown(""" """, unsafe_allow_html=True)

# ── Initialize session state ──────────────────────────────────
# The dict is rebuilt on every script run, so the list default is never shared.
for _key, _default in {
    'chat_history': [],
    'pdf_processed': False,
    'pdf_name': None,
    'api_configured': False,
    'pdf_sample_questions': None,
    'checked_api_key': None,   # last key that went through validation
}.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default


# ── Load embedding model ──────────────────────────────────────
@st.cache_resource
def load_embedder():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer('all-MiniLM-L6-v2')


@st.cache_resource
def get_chroma_client():
    """Create the in-memory ChromaDB client once per server process."""
    # NOTE(review): cache_resource objects are shared by every session, and the
    # PDF collection name is fixed, so concurrent users would overwrite each
    # other's uploaded PDF — confirm this is acceptable for the deployment.
    return chromadb.Client()


embedder = load_embedder()
chroma_client = get_chroma_client()


# ── Helper functions ──────────────────────────────────────────
def render_block(text):
    """Render *text* as Markdown padded by blank lines, with HTML allowed."""
    st.markdown(f"\n\n{text}\n\n", unsafe_allow_html=True)


def extract_pdf_text(pdf_file, chunk_size=100, overlap=20):
    """Extract overlapping word-window chunks from an uploaded PDF.

    Args:
        pdf_file: file-like object whose ``read()`` returns the PDF bytes.
        chunk_size: number of words per chunk.
        overlap: number of words shared by consecutive chunks.

    Returns:
        A list of dicts with keys ``text``, ``source`` and ``page``
        (1-based page number).

    Raises:
        ValueError: if ``overlap`` is not smaller than ``chunk_size``.
    """
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")
    step = chunk_size - overlap

    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    chunks = []
    try:
        for page_num, page in enumerate(doc, start=1):
            words = page.get_text().split()
            for start in range(0, len(words), step):
                chunk = ' '.join(words[start:start + chunk_size])
                if len(chunk) > MIN_CHUNK_CHARS:  # skip very short chunks
                    chunks.append({
                        'text': chunk,
                        'source': f"PDF Page {page_num}",
                        'page': page_num
                    })
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()
    return chunks


def search_web(query, max_results=5):
    """Search DuckDuckGo and return result chunks.

    Returns a list of dicts with keys ``text``, ``source`` (the result URL)
    and ``title``. Any search failure yields an empty list so the caller can
    fall back to answering without web context.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(
                query,
                max_results=max_results,
                region='wt-wt',
                safesearch='off'
            ))
    except Exception:
        # Best effort: the search backend may be blocked or rate limited.
        return []

    chunks = []
    for result in results:
        title = result.get('title', '')
        body = result.get('body', '')
        if body and len(body) > 50:
            chunks.append({
                'text': f"{title}. {body}",
                'source': result.get('href', 'Web'),
                'title': title
            })
    return chunks


def get_kb_chunks():
    """Built-in AI/ML knowledge base."""
    texts = [
        'Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.',
        'Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.',
        'Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
        'Transformer models are a type of neural network architecture that has revolutionized NLP. The attention mechanism allows the model to focus on different parts of the input when producing an output. BERT, GPT, and T5 are popular transformer models.',
        'Retrieval Augmented Generation (RAG) is a technique that combines retrieval of relevant documents with generative AI to produce more accurate and grounded responses. It reduces hallucinations by providing real context to the LLM.',
        'Large Language Models (LLMs) are AI systems trained on vast amounts of text data. They can generate human-like text, answer questions, summarize content, and perform many language tasks. Examples include GPT-4, Gemini, Claude, and LLaMA.',
        'Vector databases store data as high-dimensional vectors (embeddings) and enable fast similarity search. They are essential for RAG systems. Popular options include ChromaDB, Pinecone, Weaviate, and FAISS.',
        'Fine-tuning is the process of taking a pre-trained model and training it further on a specific dataset for a specific task. It allows models to adapt to domain-specific knowledge while retaining general capabilities.',
        'Embeddings are dense vector representations of text that capture semantic meaning. Similar texts have similar embeddings. They are used in search, recommendation systems, and RAG pipelines.',
        'Prompt engineering is the practice of designing and optimizing prompts to effectively communicate with AI language models. It involves crafting instructions that guide the model to produce desired outputs.',
        'Data Science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.',
        'Computer Vision is a field of AI that trains computers to interpret and understand the visual world. Using digital images from cameras and videos and deep learning models, machines can accurately identify and classify objects.',
    ]
    return [{'text': text, 'source': KB_SOURCE} for text in texts]


def store_in_vectordb(chunks, collection_name):
    """Replace the named ChromaDB collection with the given chunks.

    Args:
        chunks: list of dicts, each with ``text`` and ``source`` keys.
        collection_name: collection to (re)create.

    Returns:
        The freshly created collection (left empty when ``chunks`` is empty).
    """
    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        # The collection does not exist yet; the exception type raised for
        # that differs between chromadb versions.
        pass
    collection = chroma_client.create_collection(collection_name)
    if not chunks:
        return collection

    texts = [c['text'] for c in chunks]
    # Content hash plus index keeps ids unique even for duplicate texts.
    ids = [hashlib.md5(t.encode()).hexdigest()[:16] + str(i) for i, t in enumerate(texts)]
    collection.add(
        documents=texts,
        embeddings=embedder.encode(texts).tolist(),
        metadatas=[{'source': c['source']} for c in chunks],
        ids=ids
    )
    return collection


@st.cache_resource
def get_kb_collection():
    """Embed the built-in knowledge base once and reuse the collection."""
    return store_in_vectordb(get_kb_chunks(), KB_COLLECTION)


def retrieve_context(query, collection, n_results=4):
    """Retrieve the chunks most similar to the query.

    Returns a list of dicts with ``text`` and ``source`` keys; empty when the
    collection holds no documents.
    """
    available = collection.count()
    if available == 0:
        return []
    results = collection.query(
        query_embeddings=embedder.encode([query]).tolist(),
        n_results=min(n_results, available)
    )
    return [
        {'text': doc, 'source': meta['source']}
        for doc, meta in zip(results['documents'][0], results['metadatas'][0])
    ]


def generate_answer(query, context_chunks, gemini_model=None):
    """Generate an answer from the Groq model using the retrieved context.

    Args:
        query: the user's question.
        context_chunks: list of dicts with ``text`` and ``source`` keys.
        gemini_model: unused; kept so existing callers keep working. The
            client is always read from ``st.session_state.groq_client``.

    Returns:
        The model's answer text.
    """
    context_text = "\n\n".join(
        f"[Source {i}: {c['source']}]\n{c['text']}"
        for i, c in enumerate(context_chunks, start=1)
    )
    prompt = f"""You are ResearchMind, an expert AI research assistant.
Answer the question below using ONLY the provided context.
Be comprehensive, accurate, and cite sources by number [1], [2], etc.
If the context doesn't contain enough information, say so honestly.

CONTEXT:
{context_text}

QUESTION: {query}

INSTRUCTIONS:
- Give a detailed, well-structured answer
- Cite sources inline like [1], [2]
- Use bullet points or numbered lists where appropriate
- End with a brief summary
- Be honest if information is limited

ANSWER:"""
    client = st.session_state.groq_client
    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024
    )
    return response.choices[0].message.content


def answer_without_context(query):
    """Answer from the model's own knowledge when no context was retrieved."""
    response = st.session_state.groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[{"role": "user", "content": f"Answer this question comprehensively: {query}"}],
        max_tokens=1024
    )
    return response.choices[0].message.content


def connect_groq(api_key):
    """Build a Groq client and probe it with a tiny request.

    Returns the client, or ``None`` when the service rejects the key.
    """
    client = Groq(api_key=api_key)
    try:
        # Test the connection with a simple call
        client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=5
        )
    except Exception as e:
        detail = str(e).lower()
        if "invalid_api_key" in detail or "401" in detail:
            return None
        # Any other failure does not prove the key is bad — still connect.
    return client


def generate_pdf_questions():
    """Ask the model for up to four sample questions about the loaded PDF."""
    collection = chroma_client.get_collection(PDF_COLLECTION)
    results = collection.get(limit=3)
    sample_text = " ".join(results['documents'][:3])[:1000]
    prompt = f"""Based on this document excerpt, generate exactly 4 short, specific questions a researcher would ask.
Return ONLY the 4 questions, one per line, no numbering, no extra text.

Document: {sample_text}

Questions:"""
    response = st.session_state.groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200
    )
    questions_text = response.choices[0].message.content
    return [q.strip() for q in questions_text.strip().split('\n') if q.strip()][:4]


def gather_context(query, mode):
    """Collect context chunks from every source enabled by the selected mode."""
    use_all = "All" in mode
    all_chunks = []

    if ("PDF" in mode or use_all) and st.session_state.pdf_processed:
        try:
            collection = chroma_client.get_collection(PDF_COLLECTION)
            all_chunks.extend(retrieve_context(query, collection, n_results=3))
        except Exception as e:
            # Keep going with the other sources, but tell the user.
            st.warning(f"PDF retrieval failed: {e}")

    if "Web" in mode or use_all:
        all_chunks.extend(search_web(query, max_results=4)[:3])

    if "Knowledge" in mode or use_all:
        all_chunks.extend(retrieve_context(query, get_kb_collection(), n_results=3))

    return all_chunks


# ── Sidebar ───────────────────────────────────────────────────
with st.sidebar:
    st.markdown("""

🧠 RESEARCHMIND

        RAG-POWERED ASSISTANT
    

""", unsafe_allow_html=True)

    # API Key
    render_block("Configuration")

    # Use secret from HuggingFace or user input
    # NOTE(review): pre-filling the widget hands the server-side key to the
    # text input's value — confirm that exposing it this way is intended.
    hf_api_key = os.environ.get("GROQ_API_KEY", "")
    api_key = st.text_input(
        "Groq API Key",
        value=hf_api_key,
        type="password",
        placeholder="gsk_...",
        help="Get free key at console.groq.com"
    )

    if not api_key:
        st.session_state.api_configured = False
        st.session_state.checked_api_key = None
    else:
        # Validate only when the key changes; the script reruns on every
        # widget interaction and must not hit the API each time.
        if api_key != st.session_state.checked_api_key:
            groq_client = connect_groq(api_key)
            st.session_state.checked_api_key = api_key
            st.session_state.api_configured = groq_client is not None
            if groq_client is not None:
                st.session_state.groq_client = groq_client
        if st.session_state.api_configured:
            st.success("✅ API Connected")
        else:
            st.error("❌ Invalid API key")

    # Mode selection
    render_block("Knowledge Source")
    mode = st.selectbox(
        "Source",
        ["📄 PDF Document", "🌐 Web Search", "📚 AI/ML Knowledge Base", "🔀 All Sources"],
        label_visibility="collapsed"
    )

    # PDF upload
    if "PDF" in mode or "All" in mode:
        render_block("Upload PDF")
        uploaded_pdf = st.file_uploader(
            "Upload PDF",
            type=['pdf'],
            label_visibility="collapsed"
        )
        if uploaded_pdf and uploaded_pdf.name != st.session_state.pdf_name:
            with st.spinner("Processing PDF..."):
                try:
                    chunks = extract_pdf_text(uploaded_pdf)
                except Exception as e:
                    # Unreadable or corrupt upload: report instead of crashing.
                    chunks = None
                    st.error(f"Could not read PDF: {e}")
                if chunks:
                    store_in_vectordb(chunks, PDF_COLLECTION)
                    st.session_state.pdf_processed = True
                    st.session_state.pdf_name = uploaded_pdf.name
                    st.session_state.pdf_sample_questions = None  # reset questions for new PDF
                    st.success(f"✅ {len(chunks)} chunks extracted")
                elif chunks is not None:
                    st.error("Could not extract text from PDF")

    # Stats
    render_block("Stats")
    st.markdown(f"""

        Questions asked: {len(st.session_state.chat_history)}

        PDF loaded: {'Yes' if st.session_state.pdf_processed else 'No'}

        Model: {MODEL_LABEL}

        Embeddings: MiniLM-L6-v2

""", unsafe_allow_html=True)

    # Clear history
    if st.button("🗑️ Clear History"):
        st.session_state.chat_history = []
        st.rerun()

# ── Main area ─────────────────────────────────────────────────
st.markdown("""

ResearchMind

Ask anything. Get AI-powered answers with cited sources. Upload PDFs, search the web, or query the knowledge base.

""", unsafe_allow_html=True)

# Mode indicator
mode_labels = {
    "📄 PDF Document": ("mode-pdf", "📄 PDF MODE"),
    "🌐 Web Search": ("mode-web", "🌐 WEB SEARCH MODE"),
    "📚 AI/ML Knowledge Base": ("mode-kb", "📚 KNOWLEDGE BASE MODE"),
    "🔀 All Sources": ("mode-web", "🔀 ALL SOURCES MODE")
}
# The CSS class is not referenced by the markup below; only the text is shown.
_mode_class, mode_text = mode_labels.get(mode, ("mode-kb", "MODE"))
render_block(mode_text)

# Sample questions
render_block("Sample Questions")

# Generate PDF-specific questions if PDF is loaded
if "PDF" in mode and st.session_state.pdf_processed:
    if st.session_state.pdf_sample_questions is None and st.session_state.api_configured:
        with st.spinner("Generating questions from PDF..."):
            try:
                st.session_state.pdf_sample_questions = generate_pdf_questions()
            except Exception:
                # Remember the failure as "no questions" so the request is
                # not retried on every rerun; generic samples are used.
                st.session_state.pdf_sample_questions = []
    samples = st.session_state.pdf_sample_questions or DEFAULT_PDF_SAMPLES
else:
    # Reset PDF questions when switching modes
    st.session_state.pdf_sample_questions = None
    samples = DEFAULT_SAMPLES

cols = st.columns(4)
for i, (col, sample) in enumerate(zip(cols, samples)):
    with col:
        if st.button(sample, key=f"sample_{i}"):
            st.session_state.query = sample

# Query input
render_block("Ask Your Question")
query = st.text_area(
    "Question",
    value=st.session_state.get('query', ''),
    height=100,
    placeholder="Ask anything... e.g. 'What is the main contribution of this paper?' or 'Explain how RAG works'",
    label_visibility="collapsed"
)
render_block(f"{len(query)} characters")

st.markdown('\n\n', unsafe_allow_html=True)
ask_btn = st.button("🔍 SEARCH & ANSWER", use_container_width=True)
st.markdown('\n\n', unsafe_allow_html=True)

# ── Process query ─────────────────────────────────────────────
answered_now = False  # True when this run rendered a fresh answer above the history
if ask_btn:
    if not query.strip():
        st.error("Please enter a question!")
    elif not st.session_state.api_configured:
        st.error("Please enter your API key in the sidebar!")
    else:
        with st.spinner("🔍 Retrieving context and generating answer..."):
            sources = gather_context(query, mode)[:MAX_CONTEXT_CHUNKS]
            try:
                if sources:
                    answer = generate_answer(query, sources, st.session_state.groq_client)
                else:
                    # No context retrieved — use LLM direct knowledge
                    answer = answer_without_context(query)
            except Exception as e:
                st.error(f"Error: {str(e)}")
            else:
                st.session_state.chat_history.append(
                    {'query': query, 'answer': answer, 'sources': sources}
                )
                answered_now = True

                render_block("Answer")
                render_block(answer)

                if sources:
                    render_block("Sources Used")
                    for i, chunk in enumerate(sources, start=1):
                        # Web results are untrusted and this markdown allows
                        # raw HTML, so escape before interpolating.
                        source = html.escape(str(chunk['source']))
                        preview = html.escape(chunk['text'][:200])
                        st.markdown(f"""

SOURCE {i} · {source}

{preview}...

""", unsafe_allow_html=True)
                elif "Web" in mode or "All" in mode:
                    st.info("ℹ️ Web search unavailable on this server. Answer generated from LLM knowledge.")
                else:
                    st.info("ℹ️ No document context available. Answer generated from LLM knowledge.")

# ── Chat history ──────────────────────────────────────────────
# Skip the newest entry only when it was just rendered above; on any other
# rerun it must appear here or it would not be visible anywhere.
history = st.session_state.chat_history
past_items = history[:-1] if answered_now else history
if past_items:
    render_block("Previous Questions")
    for item in reversed(past_items):
        with st.expander(f"Q: {item['query'][:60]}..."):
            render_block(item["answer"])

# Footer
st.markdown("""

    RESEARCHMIND v1.0 · BUILT BY VISHAL · GROQ + RAG + CHROMADB

""", unsafe_allow_html=True)