import gradio as gr
import fitz  # PyMuPDF
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from groq import Groq

# --- Model clients -----------------------------------------------------------
# Groq chat client; fail fast at import time if the key is missing.
key = os.getenv("GROQ_API_KEY")
if key is None or key == "":
    raise ValueError("No API key found")
groq_client = Groq(api_key=key)
model = "llama3-8b-8192"  # Groq-hosted chat model used for answer generation

# Local sentence-embedding model, used for both document chunks and queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Global state shared between the upload handler and the chat handler.
#   document_chunks: page-level text chunks, aligned index-for-index with metadata
#   metadata:        {"file", "page"} provenance for each chunk
#   index:           FAISS index over chunk embeddings (None until PDFs processed)
#   embeddings:      raw embedding matrix kept alongside the index
state = dict(
    document_chunks=[],
    metadata=[],
    index=None,
    embeddings=None,
)

# Extract text from PDF using file path
def extract_text_from_pdf(file_path):
    """Extract non-empty page texts from a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        List of dicts ``{"text": <page text>, "page": <1-based page number>}``,
        skipping pages that yield no extractable text.
    """
    texts = []
    # Context manager guarantees the document handle is closed even if
    # extraction raises mid-way (the original leaked the open document).
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            if text:
                texts.append({"text": text, "page": page_number})
    return texts

# Process PDFs
def process_pdfs(files):
    """Chunk uploaded PDFs (one chunk per page), embed them, and build a FAISS index.

    Args:
        files: Gradio file objects, each exposing a ``.name`` file path.

    Returns:
        A status string for the UI.
    """
    state["document_chunks"] = []
    state["metadata"] = []

    if not files:
        return "No files provided. Please upload at least one PDF."

    for file in files:
        file_name = os.path.basename(file.name)
        for chunk in extract_text_from_pdf(file.name):
            state["document_chunks"].append(chunk['text'])
            state["metadata"].append({"file": file_name, "page": chunk['page']})

    # Guard: encoding an empty list yields an embeddings array with no second
    # dimension, so `embeddings.shape[1]` below would raise IndexError.
    if not state["document_chunks"]:
        state["index"] = None
        state["embeddings"] = None
        return "No extractable text found in the uploaded PDF(s)."

    embeddings = embedder.encode(state["document_chunks"], show_progress_bar=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    # FAISS requires float32 input; coerce explicitly instead of relying on
    # the encoder's default dtype.
    index.add(np.array(embeddings, dtype=np.float32))
    state["index"] = index
    state["embeddings"] = embeddings

    return "βœ… Book(s) loaded successfully!"

# Retrieve top chunks
def retrieve_chunks(question, top_k=3):
    """Return up to ``top_k`` (chunk_text, metadata) pairs most similar to the question.

    Args:
        question: Natural-language query string.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        List of ``(chunk_text, metadata)`` tuples; empty when no index exists.
    """
    index = state["index"]
    if index is None:  # explicit None check instead of truthiness on a FAISS object
        return []
    q_embedding = embedder.encode([question])
    # Never ask FAISS for more neighbours than there are stored vectors:
    # it pads missing results with -1, and indexing with -1 would silently
    # return the *last* chunk instead of failing.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    _, ids = index.search(np.array(q_embedding, dtype=np.float32), k)
    return [
        (state["document_chunks"][i], state["metadata"][i])
        for i in ids[0]
        if i != -1  # defensive: skip any padded slots
    ]

# Generate answer with source references
def generate_answer(context, question):
    """Ask the LLM to answer `question`, grounded in the retrieved `context`.

    Args:
        context: Iterable of ``(chunk_text, metadata)`` pairs where metadata
            carries ``"file"`` and ``"page"`` keys.
        question: The user's question.

    Returns:
        The model's answer text (source citations are requested in-prompt).
    """
    # Append a provenance line to every chunk before joining them.
    sourced_chunks = []
    for chunk, meta in context:
        sourced_chunks.append(
            f"{chunk}\n\n[Source: {meta['file']}, Page: {meta['page']}]"
        )
    context_text = "\n\n".join(sourced_chunks)

    prompt = f"""You are a helpful assistant. Use the context below to answer the question. 
Include the source references (file name and page number) in your answer.

Context:
{context_text}

Question:
{question}

Answer (with sources):"""

    messages = [{"role": "user", "content": prompt}]
    completion = groq_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2
    )
    return completion.choices[0].message.content

# Chat function for ChatInterface
def chatbot_interface_fn(message, history):
    """Gradio ChatInterface callback: answer `message` from the indexed PDFs.

    `history` is required by the ChatInterface signature but unused here,
    since every answer is grounded in freshly retrieved chunks.
    """
    if not state["document_chunks"]:
        return "⚠️ Please upload PDF files first."
    relevant = retrieve_chunks(message)
    return generate_answer(relevant, message)

# Gradio UI
# NOTE: component-creation order defines the on-screen layout, so the
# statements below are intentionally kept in this exact order.
with gr.Blocks(title="RAG Chatbot") as demo:
    gr.Markdown("# πŸ“š Enhanced RAG Chatbot\nUpload books and chat naturally!")

    with gr.Row():
        # Multi-file PDF picker, processing trigger, and read-only status box.
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="πŸ“‚ Upload PDFs")
        upload_btn = gr.Button("Upload & Process PDFs")
        status = gr.Textbox(label="Status", interactive=False)

    # Wire the button to the indexing pipeline; its status string lands in `status`.
    upload_btn.click(process_pdfs, inputs=[pdf_input], outputs=[status])

    # Chat panel; each user message is answered by chatbot_interface_fn via RAG.
    gr.ChatInterface(
        fn=chatbot_interface_fn,
        chatbot=gr.Chatbot(height=400, type="messages"),
        textbox=gr.Textbox(placeholder="Ask about the PDFs...", scale=7),
        title="πŸ“– PDF Chat",
        description="Ask questions based on uploaded PDF content.",
        submit_btn="Send"
    )

# Launch the web app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()