"""SmartDoc RAG Chatbot — upload PDFs and ask questions answered only from their content.

Pipeline: extract page text with pypdf → fixed-size character chunking →
sentence-transformer embeddings → cosine-similarity retrieval → Groq chat completion.
"""

import os

import gradio as gr
import numpy as np
import requests
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------- CONFIG ----------------
# NOTE(review): the env-var name looks like a project name rather than a key name —
# confirm this matches the secret configured in HF Spaces.
GROQ_API_KEY = os.environ.get("smartdoc_rag_chatbot")  # must be set in HF Secrets
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
MODEL_NAME = "llama-3.1-8b-instant"
CHUNK_SIZE = 500        # characters per retrieval chunk (no overlap)
REQUEST_TIMEOUT = 60    # seconds; keeps a stalled API call from hanging the UI

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# In-memory retrieval index, rebuilt on every load_pdfs() call.
chunks = []             # list of {"content", "page", "doc"} dicts
chunk_embeddings = []   # ndarray of embeddings, row-parallel to `chunks`


# ---------------- PDF LOADING ----------------
def load_pdfs(pdf_files):
    """Extract, chunk, and embed the uploaded PDFs.

    Rebuilds the module-level `chunks` / `chunk_embeddings` index.

    Args:
        pdf_files: list of uploaded PDF file paths from gr.File (or None).

    Returns:
        A human-readable status string for the UI.
    """
    global chunks, chunk_embeddings
    if not pdf_files:
        return "❌ Please upload at least one PDF."

    documents = []
    for doc_id, pdf in enumerate(pdf_files):
        reader = PdfReader(pdf)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            # Skip pages with no extractable text (e.g. scanned images).
            if text:
                documents.append({
                    "text": text,
                    "page": page_num + 1,
                    "doc": f"Document {doc_id + 1}",
                })

    # Fixed-size character chunking; each chunk keeps its source page/doc
    # so answers can be attributed.
    chunks = []
    for doc in documents:
        text = doc["text"]
        for i in range(0, len(text), CHUNK_SIZE):
            chunks.append({
                "content": text[i:i + CHUNK_SIZE],
                "page": doc["page"],
                "doc": doc["doc"],
            })

    if not chunks:
        # Every page was empty/image-only: report it instead of claiming
        # success with 0 chunks and crashing at retrieval time.
        return "❌ No extractable text found in the uploaded PDF(s)."

    chunk_embeddings = embedder.encode([c["content"] for c in chunks])
    return f"✅ Loaded {len(pdf_files)} PDF(s) with {len(chunks)} chunks."


# ---------------- RETRIEVAL ----------------
def retrieve_context(query, k=3):
    """Return (context, source) for the chunks most similar to `query`.

    Args:
        query: the user's question.
        k: number of chunks to retrieve (clamped to the index size).

    Returns:
        context: the top-k chunk texts joined with newlines, most similar first.
        source: the single best-matching chunk dict (for attribution).
    """
    query_embedding = embedder.encode([query])
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    k = min(k, len(chunks))  # guard: index may hold fewer than k chunks
    # argsort is ascending; take the last k and reverse so the most relevant
    # chunk leads the prompt context.
    top_k = np.argsort(similarities)[-k:][::-1]
    selected = [chunks[i] for i in top_k]
    context = "\n".join(c["content"] for c in selected)
    return context, selected[0]


# ---------------- GROQ CALL ----------------
def ask_question(question):
    """Answer `question` from the retrieved PDF context via the Groq chat API.

    Returns the model's answer with a source citation, or a user-facing
    warning string when the index is empty or the API call fails.
    """
    if not chunks:
        return "⚠️ Please load PDFs first."

    context, source = retrieve_context(question)

    prompt = f"""
You are SmartDoc RAG Chatbot.
Answer the question using ONLY the context below.

Context:
{context}

Question: {question}
"""

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(
            GROQ_URL,
            headers=headers,
            json={
                "model": MODEL_NAME,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2,
            },
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        # Error payloads lack "choices" — KeyError/IndexError are caught below
        # instead of crashing the Gradio handler.
        answer = response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, ValueError) as err:
        return f"⚠️ API request failed: {err}"

    return f"""{answer}

📄 Source: {source['doc']} — Page {source['page']}"""


# ---------------- UI ----------------
css = """
body { background: linear-gradient(120deg, #e0f2ff, #f8fbff); }
h1, h3 { text-align: center; }
.gr-textbox textarea { font-size: 15px; }
.gr-button-primary { font-weight: bold; }
"""

with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="cyan",
        neutral_hue="slate",
        font=["Inter", "sans-serif"]
    ),
    css=css
) as demo:
    gr.Markdown("""
# 📄 SmartDoc RAG Chatbot
### Retrieval‑Augmented AI for Document Question Answering
Upload PDFs and ask questions based **only** on their content.
""")

    with gr.Row():
        # LEFT PANEL: document upload and indexing controls.
        with gr.Column(scale=1):
            pdf_files = gr.File(
                file_types=[".pdf"],
                file_count="multiple",
                label="📂 Upload PDF Documents"
            )
            load_btn = gr.Button("📥 Load Documents", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)

        # RIGHT PANEL: question input and answer display.
        with gr.Column(scale=2):
            with gr.Row():
                question = gr.Textbox(
                    placeholder="Type your question here…",
                    lines=1,
                    scale=8
                )
                send_btn = gr.Button("➤", scale=1)
            answer = gr.Textbox(
                label="Answer",
                lines=8
            )

    # EVENTS: both the send button and Enter submit the question,
    # then clear the input box.
    load_btn.click(load_pdfs, inputs=pdf_files, outputs=status)

    send_btn.click(
        ask_question,
        inputs=question,
        outputs=answer
    ).then(lambda: "", None, question)

    question.submit(
        ask_question,
        inputs=question,
        outputs=answer
    ).then(lambda: "", None, question)

demo.launch()