import streamlit as st from groq import Groq from sentence_transformers import SentenceTransformer import chromadb from chromadb.utils import embedding_functions import fitz # PyMuPDF import requests from duckduckgo_search import DDGS import os import hashlib import re # ── Page config ────────────────────────────────────────────── st.set_page_config( page_title="ResearchMind — RAG Assistant", page_icon="🧠", layout="wide", initial_sidebar_state="expanded" ) # ── Styling ────────────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True) # ── Initialize session state ────────────────────────────────── if 'chat_history' not in st.session_state: st.session_state.chat_history = [] if 'pdf_processed' not in st.session_state: st.session_state.pdf_processed = False if 'pdf_name' not in st.session_state: st.session_state.pdf_name = None if 'api_configured' not in st.session_state: st.session_state.api_configured = False # ── Load embedding model ────────────────────────────────────── @st.cache_resource def load_embedder(): return SentenceTransformer('all-MiniLM-L6-v2') @st.cache_resource def get_chroma_client(): return chromadb.Client() embedder = load_embedder() chroma_client = get_chroma_client() # ── Helper functions ────────────────────────────────────────── def extract_pdf_text(pdf_file): """Extract text from uploaded PDF.""" doc = fitz.open(stream=pdf_file.read(), filetype="pdf") chunks = [] for page_num, page in enumerate(doc): text = page.get_text() # Split into chunks of ~500 chars with overlap words = text.split() chunk_size = 100 # words overlap = 20 for i in range(0, len(words), chunk_size - overlap): chunk = ' '.join(words[i:i + chunk_size]) if len(chunk) > 100: # skip very short chunks chunks.append({ 'text': chunk, 'source': f"PDF Page {page_num + 1}", 'page': page_num + 1 }) return chunks def search_web(query, max_results=5): """Search DuckDuckGo for relevant results.""" try: with DDGS() as ddgs: results = list(ddgs.text( query, max_results=max_results, region='wt-wt', safesearch='off' )) chunks = [] for r in results: title = r.get('title', '') body = r.get('body', '') url = r.get('href', 'Web') if body and len(body) > 50: chunks.append({ 'text': f"{title}. {body}", 'source': url, 'title': title }) if chunks: return chunks raise Exception("No results") except Exception as e: # Fallback — use Groq to answer from its own knowledge return [] def get_kb_chunks(): """Built-in AI/ML knowledge base.""" return [ {'text': 'Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Transformer models are a type of neural network architecture that has revolutionized NLP. The attention mechanism allows the model to focus on different parts of the input when producing an output. BERT, GPT, and T5 are popular transformer models.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Retrieval Augmented Generation (RAG) is a technique that combines retrieval of relevant documents with generative AI to produce more accurate and grounded responses. It reduces hallucinations by providing real context to the LLM.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Large Language Models (LLMs) are AI systems trained on vast amounts of text data. They can generate human-like text, answer questions, summarize content, and perform many language tasks. Examples include GPT-4, Gemini, Claude, and LLaMA.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Vector databases store data as high-dimensional vectors (embeddings) and enable fast similarity search. They are essential for RAG systems. Popular options include ChromaDB, Pinecone, Weaviate, and FAISS.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Fine-tuning is the process of taking a pre-trained model and training it further on a specific dataset for a specific task. It allows models to adapt to domain-specific knowledge while retaining general capabilities.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Embeddings are dense vector representations of text that capture semantic meaning. Similar texts have similar embeddings. They are used in search, recommendation systems, and RAG pipelines.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Prompt engineering is the practice of designing and optimizing prompts to effectively communicate with AI language models. It involves crafting instructions that guide the model to produce desired outputs.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Data Science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.', 'source': 'AI/ML Knowledge Base'}, {'text': 'Computer Vision is a field of AI that trains computers to interpret and understand the visual world. Using digital images from cameras and videos and deep learning models, machines can accurately identify and classify objects.', 'source': 'AI/ML Knowledge Base'}, ] def store_in_vectordb(chunks, collection_name): """Store chunks in ChromaDB.""" try: chroma_client.delete_collection(collection_name) except: pass collection = chroma_client.create_collection(collection_name) texts = [c['text'] for c in chunks] sources = [c['source'] for c in chunks] ids = [hashlib.md5(t.encode()).hexdigest()[:16] + str(i) for i, t in enumerate(texts)] embeddings = embedder.encode(texts).tolist() collection.add( documents=texts, embeddings=embeddings, metadatas=[{'source': s} for s in sources], ids=ids ) return collection def retrieve_context(query, collection, n_results=4): """Retrieve most relevant chunks for the query.""" query_embedding = embedder.encode([query]).tolist() results = collection.query( query_embeddings=query_embedding, n_results=min(n_results, collection.count()) ) chunks = [] for doc, meta in zip(results['documents'][0], results['metadatas'][0]): chunks.append({'text': doc, 'source': meta['source']}) return chunks def generate_answer(query, context_chunks, gemini_model): """Generate answer using Gemini with retrieved context.""" context_text = "\n\n".join([ f"[Source {i+1}: {c['source']}]\n{c['text']}" for i, c in enumerate(context_chunks) ]) prompt = f"""You are ResearchMind, an expert AI research assistant. Answer the question below using ONLY the provided context. Be comprehensive, accurate, and cite sources by number [1], [2], etc. If the context doesn't contain enough information, say so honestly. CONTEXT: {context_text} QUESTION: {query} INSTRUCTIONS: - Give a detailed, well-structured answer - Cite sources inline like [1], [2] - Use bullet points or numbered lists where appropriate - End with a brief summary - Be honest if information is limited ANSWER:""" client = st.session_state.groq_client response = client.chat.completions.create( model="llama-3.1-8b-instant", messages=[{"role": "user", "content": prompt}], max_tokens=1024 ) return response.choices[0].message.content # ── Sidebar ─────────────────────────────────────────────────── with st.sidebar: st.markdown("""
Ask anything. Get AI-powered answers with cited sources. Upload PDFs, search the web, or query the knowledge base.
{len(query)} characters
", unsafe_allow_html=True) st.markdown('