"""Gradio RAG chatbot: upload PDFs, embed page chunks, answer with Groq + sources."""

import os

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
from groq import Groq
from sentence_transformers import SentenceTransformer

# --- Clients & models --------------------------------------------------------
key = os.getenv("GROQ_API_KEY")
if not key:
    raise ValueError("No API key found")
groq_client = Groq(api_key=key)
model = "llama3-8b-8192"
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Global in-memory state for the loaded corpus and its vector index.
state = {
    "document_chunks": [],  # raw text of each page-level chunk
    "metadata": [],         # parallel list of {"file": ..., "page": ...}
    "index": None,          # faiss.IndexFlatL2 over the chunk embeddings
    "embeddings": None,     # np.ndarray (n_chunks, dim), float32
}


def extract_text_from_pdf(file_path):
    """Extract non-empty page texts from a PDF.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        List of {"text": str, "page": int} dicts; page numbers are 1-based.
        Pages whose extracted text is empty after stripping are skipped.
    """
    texts = []
    doc = fitz.open(file_path)
    try:
        for i, page in enumerate(doc):
            text = page.get_text().strip()
            if text:
                texts.append({"text": text, "page": i + 1})
    finally:
        # BUGFIX: the original never closed the document, leaking a file
        # handle per upload.
        doc.close()
    return texts


def process_pdfs(files):
    """Chunk the uploaded PDFs per page, embed them, and build a FAISS index.

    Args:
        files: List of Gradio file objects (each with a ``.name`` path),
            or None when nothing was selected.

    Returns:
        A status string for the UI.
    """
    # BUGFIX: clicking the button with no files selected passed None and
    # crashed on iteration.
    if not files:
        return "⚠️ Please select at least one PDF file."

    state["document_chunks"] = []
    state["metadata"] = []
    for file in files:
        file_name = os.path.basename(file.name)
        chunks = extract_text_from_pdf(file.name)
        for chunk in chunks:
            state["document_chunks"].append(chunk['text'])
            state["metadata"].append({"file": file_name, "page": chunk['page']})

    # BUGFIX: embedding an empty list crashed on ``embeddings.shape[1]``
    # below (e.g. scanned/image-only PDFs with no extractable text).
    if not state["document_chunks"]:
        return "⚠️ No extractable text found in the uploaded PDF(s)."

    embeddings = embedder.encode(state["document_chunks"], show_progress_bar=True)
    # FAISS requires contiguous float32 input.
    embeddings = np.asarray(embeddings, dtype=np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    state["index"] = index
    state["embeddings"] = embeddings
    return "✅ Book(s) loaded successfully!"


def retrieve_chunks(question, top_k=3):
    """Return up to ``top_k`` (chunk_text, metadata) pairs nearest the question.

    Returns an empty list when no index has been built yet.
    """
    # Explicit None check: truthiness of a SWIG-wrapped FAISS index is not
    # a reliable "is built" signal.
    if state["index"] is None:
        return []
    q_embedding = np.asarray(embedder.encode([question]), dtype=np.float32)
    D, I = state["index"].search(q_embedding, top_k)
    # BUGFIX: FAISS pads the id array with -1 when the corpus holds fewer
    # than ``top_k`` vectors; indexing with -1 silently returned the *last*
    # chunk. Keep only valid ids.
    n = len(state["document_chunks"])
    return [
        (state["document_chunks"][i], state["metadata"][i])
        for i in I[0]
        if 0 <= i < n
    ]


def generate_answer(context, question):
    """Ask the Groq LLM to answer ``question`` from ``context``, citing sources.

    Args:
        context: List of (chunk_text, metadata) pairs from ``retrieve_chunks``.
        question: The user's question.

    Returns:
        The model's answer text (expected to include file/page references).
    """
    context_text = "\n\n".join(
        f"{chunk}\n\n[Source: {meta['file']}, Page: {meta['page']}]"
        for chunk, meta in context
    )
    prompt = f"""You are a helpful assistant. Use the context below to answer the question. Include the source references (file name and page number) in your answer. 
Context: {context_text} Question: {question} Answer (with sources):"""
    response = groq_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,  # low temperature for grounded, factual answers
    )
    return response.choices[0].message.content


def chatbot_interface_fn(message, history):
    """Gradio ChatInterface callback: RAG answer, or a hint if nothing is loaded."""
    if not state["document_chunks"]:
        return "⚠️ Please upload PDF files first."
    context = retrieve_chunks(message)
    return generate_answer(context, message)


# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="RAG Chatbot") as demo:
    gr.Markdown("# 📚 Enhanced RAG Chatbot\nUpload books and chat naturally!")
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="📂 Upload PDFs")
        upload_btn = gr.Button("Upload & Process PDFs")
    status = gr.Textbox(label="Status", interactive=False)
    upload_btn.click(process_pdfs, inputs=[pdf_input], outputs=[status])
    gr.ChatInterface(
        fn=chatbot_interface_fn,
        chatbot=gr.Chatbot(height=400, type="messages"),
        textbox=gr.Textbox(placeholder="Ask about the PDFs...", scale=7),
        title="📖 PDF Chat",
        description="Ask questions based on uploaded PDF content.",
        submit_btn="Send",
    )

if __name__ == "__main__":
    demo.launch()