# chatbot_rag / app.py
# yashalhussain's picture
# Create app.py
# 31fc4f4 verified
import os
import gradio as gr
import PyPDF2
import requests
# ================= CONFIG =================
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")  # supplied via environment (e.g. HF Space secret); empty string disables API calls
GROQ_MODEL = "llama-3.1-8b-instant"  # Groq-hosted model id used for every completion
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"  # Groq's OpenAI-compatible chat endpoint
# In-memory document cache: {base filename: [text chunks]}.
# Filled by preview_documents(), read by retrieve_context()/process_documents().
processed_texts = {}
# ================= PDF HANDLING =================
def extract_pdf_text(file):
    """Return the text of every page of a PDF, one page per newline-terminated run.

    Pages where PyPDF2 cannot extract any text contribute just the trailing
    newline, matching ``extract_text() or ""``.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
def chunk_text(text, chunk_size=400, overlap=50):
    """Split *text* into word-window chunks of up to *chunk_size* words.

    Consecutive chunks share *overlap* words so retrieval does not lose
    context at chunk boundaries.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Maximum words per chunk.
        overlap: Words repeated between consecutive chunks. Must be smaller
            than chunk_size, otherwise the window would never advance.

    Returns:
        List of space-joined word chunks; empty list for empty/blank text.

    Raises:
        ValueError: If overlap >= chunk_size (the original loop would spin
            forever in that case).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed positive by the guard above
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def preview_documents(files):
    """Extract, chunk, and cache the text of each uploaded PDF.

    Repopulates the module-level ``processed_texts`` cache (keyed by base
    filename) and returns one table row per file for the preview DataFrame:
    [name, word count, first 300 chars, "<n> chunks"].

    Args:
        files: Uploaded PDFs from gr.File — may be None when the user clicks
            Preview before uploading anything.

    Returns:
        List of rows for the UI table; empty list when nothing was uploaded.
    """
    processed_texts.clear()
    if not files:
        # Button clicked with no upload — previously crashed iterating None.
        return []
    rows = []
    for f in files:
        # Gradio returns plain path strings in recent versions but file-like
        # objects (with a .name path) in older ones; support both.
        path = getattr(f, "name", f)
        text = extract_pdf_text(f)
        name = os.path.basename(path)
        chunks = chunk_text(text)
        processed_texts[name] = chunks
        rows.append([
            name,
            len(text.split()),
            text[:300],
            f"{len(chunks)} chunks",
        ])
    return rows
def process_documents(files):
    """Report whether documents have already been previewed into the cache.

    The *files* argument is unused; processing actually happens in
    preview_documents(), so this only checks the ``processed_texts`` cache.
    """
    count = len(processed_texts)
    if count == 0:
        return "❌ Preview documents first."
    return f"βœ… {count} document(s) processed."
# ================= GROQ CALL =================
def query_groq(prompt):
    """Send *prompt* to the Groq chat-completions API and return the reply text.

    All failure modes (missing key, network error, non-200 status, malformed
    response body) are returned as human-readable "❌ ..." strings instead of
    raising, so errors surface directly in the chat UI.

    Args:
        prompt: Full user-turn content (document context + question).

    Returns:
        The model's reply text, or an error string on failure.
    """
    if not GROQ_API_KEY:
        return "❌ GROQ_API_KEY not set."
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": "Answer ONLY from the provided documents."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,  # low temperature: stick closely to the documents
        "max_tokens": 400,
    }
    try:
        r = requests.post(GROQ_URL, headers=headers, json=payload, timeout=30)
    except requests.RequestException as e:
        # Timeouts / DNS / connection errors previously crashed the handler.
        return f"❌ Groq request failed: {e}"
    if r.status_code == 200:
        try:
            return r.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # 200 with an unexpected body shouldn't take the UI down either.
            return f"❌ Groq response parse error: {e}"
    return f"❌ Groq Error {r.status_code}: {r.text}"
# ================= RAG =================
def retrieve_context(question):
    """Return the cached chunk sharing the most distinct words with *question*.

    Naive bag-of-words retrieval over every chunk in ``processed_texts``:
    score = number of distinct lowercase words in common. Ties keep the
    earliest chunk seen. The winner is truncated to 1000 characters; ""
    is returned when no chunk overlaps at all.
    """
    query_terms = set(question.lower().split())
    top_chunk, top_score = "", 0
    for doc_chunks in processed_texts.values():
        for candidate in doc_chunks:
            overlap = len(query_terms.intersection(candidate.lower().split()))
            if overlap > top_score:
                top_chunk, top_score = candidate, overlap
    return top_chunk[:1000] if top_chunk else ""
def answer_question(question, history):
    """Gradio chat handler: retrieve document context and ask the Groq model.

    Args:
        question: The user's question from the textbox.
        history: Current Chatbot history as (user, bot) tuples, or None.

    Returns:
        (updated history, "") — the empty string clears the input textbox.
    """
    if history is None:
        history = []
    # Ignore blank/whitespace-only submissions instead of appending an empty
    # turn and querying the model with it.
    if not question or not question.strip():
        return history, ""
    # Show the user message immediately; the bot reply is filled in below.
    history.append((question, ""))
    if not processed_texts:
        history[-1] = (question, "⚠️ Upload and process PDFs first.")
        return history, ""
    context = retrieve_context(question)
    if not context:
        history[-1] = (question, "❌ No relevant information found in documents.")
        return history, ""
    prompt = f"""
DOCUMENT CONTEXT:
{context}
QUESTION:
{question}
Answer clearly using the document context only.
"""
    # Replace the placeholder empty reply with the model's answer.
    answer = query_groq(prompt)
    history[-1] = (question, answer)
    return history, ""
# ================= UI =================
# ================= UI =================
with gr.Blocks(title="RAG PDF Chatbot") as demo:
    gr.Markdown("# πŸ“š RAG PDF Chatbot (Groq)")
    gr.Markdown("*Upload PDFs β†’ Preview β†’ Ask questions*")

    # Upload and document-processing controls.
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
        btn_preview = gr.Button("πŸ“„ Preview")
        btn_process = gr.Button("πŸš€ Process")

    status_box = gr.Textbox(label="Status")
    preview_table = gr.DataFrame(
        headers=["File", "Words", "Preview", "Chunks"],
        interactive=False,
    )

    # Chat area.
    chat_window = gr.Chatbot(height=420)
    question_box = gr.Textbox(
        placeholder="Ask a question from the documents...",
        lines=2,
    )
    btn_send = gr.Button("Send")

    # Event wiring — Send button and Enter-key submit share one handler.
    btn_preview.click(preview_documents, pdf_input, preview_table)
    btn_process.click(process_documents, pdf_input, status_box)
    btn_send.click(answer_question, [question_box, chat_window], [chat_window, question_box])
    question_box.submit(answer_question, [question_box, chat_window], [chat_window, question_box])

demo.launch()