# app.py — RAG PDF Chatbot (HuggingFace Space by Wosqa, commit 34551e8)
import os
import gradio as gr
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
# ------------------ CONFIG ------------------
# Groq-hosted LLM used for answer generation.
LLM_MODEL = "llama-3.1-8b-instant"
# Requires the GROQ_API_KEY environment variable; .get() means a missing key
# fails at first API call rather than at import time.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Initialize sentence transformer model
# Small, fast embedding model used for both document and query embeddings.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Global storage for documents and embeddings
# These three lists are kept index-aligned: documents[i] is the raw page text,
# embeddings[i] its vector, metadata[i] its "file - Page N" label.
# NOTE(review): module-level globals mean state is shared across all users of
# the app — acceptable for a single-user Space demo, not for multi-user use.
documents = []
embeddings = []
metadata = []
# ------------------ PDF Processing ------------------
def process_pdf(pdf_file):
    """Extract text from each page of *pdf_file* and embed it.

    Repopulates the module-level ``documents`` / ``embeddings`` / ``metadata``
    stores (index-aligned, one entry per page with extractable text).

    Parameters:
        pdf_file: Gradio ``File`` object; ``pdf_file.name`` is the temp path.

    Returns:
        A human-readable status string for display in the UI.
    """
    global documents, embeddings, metadata
    # Reset ALL stores up front — including embeddings. Previously embeddings
    # was not cleared, so a failed/empty upload left stale vectors behind.
    documents = []
    embeddings = []
    metadata = []
    # Guard: the button can be clicked before any file is chosen.
    if pdf_file is None:
        return "Please upload a PDF first."
    try:
        reader = PyPDF2.PdfReader(pdf_file.name)
    except Exception as e:
        # Corrupt or non-PDF uploads should surface as a message, not a crash.
        return f"Could not read PDF: {e}"
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        # Skip pages with no extractable text (e.g. scanned images).
        if text and text.strip():
            documents.append(text)
            metadata.append(f"{pdf_file.name} - Page {i+1}")
    if not documents:
        return "No text extracted from PDF. Are you sure it contains text?"
    embeddings = embedder.encode(documents)
    return f"βœ… Processed {len(documents)} text chunks from PDF: {pdf_file.name}"
# ------------------ Retrieve Context ------------------
def retrieve_context(question, top_k=5):
    """Return ``(context, sources)`` for the chunks most similar to *question*.

    Parameters:
        question: user query string, embedded with the same model as the docs.
        top_k: maximum number of chunks to retrieve (clamped to corpus size).

    Returns:
        context: retrieved chunk texts, each followed by a newline.
        sources: metadata labels aligned with the retrieved chunks, best first.
    """
    # Guard: cosine_similarity raises on an empty corpus; callers that forget
    # to check get an empty result instead of a traceback.
    if len(documents) == 0:
        return "", []
    q_emb = embedder.encode([question])
    scores = cosine_similarity(q_emb, embeddings)[0]
    # argsort ascending, take the last k, reverse -> indices by descending score.
    k = min(top_k, len(documents))
    top_indices = np.argsort(scores)[-k:][::-1]
    sources = [metadata[idx] for idx in top_indices]
    # join() is linear (vs quadratic +=) and keeps the original trailing "\n".
    context = "".join(documents[idx] + "\n" for idx in top_indices)
    return context, sources
# ------------------ Chat with Groq ------------------
def chat(question):
    """Answer *question* with RAG: retrieve PDF context, ask the Groq LLM.

    Parameters:
        question: user question string from the UI textbox.

    Returns:
        The model's answer followed by a deduplicated source list, or a
        human-readable error/status message (never raises to the UI).
    """
    if not documents:
        return "Please upload and process a PDF first."
    # Avoid a pointless API round-trip on an empty/whitespace question.
    if not question or not question.strip():
        return "Please enter a question."
    context, sources = retrieve_context(question)
    try:
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant answering questions based on the provided PDF context."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
            ],
            # Low temperature: answers should stick closely to the context.
            temperature=0.2
        )
        answer = response.choices[0].message.content
        # Deduplicate sources while preserving rank order (dict keeps
        # insertion order); the same page shouldn't be listed twice.
        unique_sources = list(dict.fromkeys(sources))
        answer += "\n\nSources:\n" + "\n".join(unique_sources)
        return answer
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: any API/network
        # failure becomes a message instead of a traceback in the browser.
        return f"Error communicating with Groq: {e}"
# ------------------ GRADIO UI ------------------
# Layout: upload + process row on top, question/answer below. All components
# are wired to the module-level functions above via button click events.
with gr.Blocks(title="RAG PDF Chatbot") as demo:
    gr.Markdown("# πŸ“„ RAG PDF Chatbot")
    gr.Markdown("Upload a PDF, process it, and ask questions based on its content.")
    pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])  # βœ… single PDF only
    # Status line updated by process_pdf's return value.
    process_status = gr.Markdown()
    process_btn = gr.Button("Process PDF")
    question = gr.Textbox(label="Ask a question")
    ask_btn = gr.Button("Ask")
    answer = gr.Markdown(label="Answer")
    # click(fn, inputs, outputs): function return value populates the output.
    process_btn.click(process_pdf, pdf_input, process_status)
    ask_btn.click(chat, question, answer)
# NOTE(review): launch() at module level — runs on import; standard for Spaces.
demo.launch()