# SmartDoc RAG Chatbot — HF Space app.py (author: sakanat, commit 75576a8, verified)
import gradio as gr
import os
import requests
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------- CONFIG ----------------
# NOTE(review): the secret name "smartdoc_rag_chatbot" is unusual for an API key —
# it must be added under that exact name in the HF Spaces Secrets panel.
GROQ_API_KEY = os.environ.get("smartdoc_rag_chatbot")  # must be set in HF Spaces Secrets
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"  # OpenAI-compatible Groq endpoint
MODEL_NAME = "llama-3.1-8b-instant"
# Sentence embedding model used for both document chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Module-level state populated by load_pdfs() and read by retrieve_context()/ask_question().
chunks = []
chunk_embeddings = []
# ---------------- PDF LOADING ----------------
def load_pdfs(pdf_files):
    """Extract text from the uploaded PDFs, split it into fixed-size chunks,
    and embed the chunks into module-level state.

    Args:
        pdf_files: list of uploaded PDF file paths from the gr.File component.

    Returns:
        A status string describing success or the reason for failure.

    Side effects:
        Rebinds the module-level ``chunks`` and ``chunk_embeddings``.
    """
    global chunks, chunk_embeddings
    if not pdf_files:
        return "❌ Please upload at least one PDF."
    # Collect per-page text, remembering page number and which upload it came from
    # so answers can cite their source.
    documents = []
    for doc_id, pdf in enumerate(pdf_files):
        reader = PdfReader(pdf)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                documents.append({
                    "text": text,
                    "page": page_num + 1,          # 1-based page numbers for display
                    "doc": f"Document {doc_id + 1}"
                })
    # Fixed-size chunking (500 characters, no overlap) preserving source metadata.
    chunks = []
    for doc in documents:
        text = doc["text"]
        for i in range(0, len(text), 500):
            chunks.append({
                "content": text[i:i+500],
                "page": doc["page"],
                "doc": doc["doc"]
            })
    # Guard: scanned/image-only PDFs yield no text; embedding an empty list would
    # leave inconsistent state and crash cosine_similarity later.
    if not chunks:
        chunk_embeddings = []
        return "❌ No extractable text found in the uploaded PDF(s)."
    texts = [c["content"] for c in chunks]
    chunk_embeddings = embedder.encode(texts)
    return f"✅ Loaded {len(pdf_files)} PDF(s) with {len(chunks)} chunks."
# ---------------- RETRIEVAL ----------------
def retrieve_context(query, k=3):
    """Return the ``k`` most similar chunks to ``query`` as a single context
    string, plus the best-matching chunk for source citation.

    Args:
        query: the user's question.
        k: number of chunks to retrieve (clamped implicitly by slicing).

    Returns:
        (context, source): ``context`` is the selected chunk texts joined by
        newlines, most relevant first; ``source`` is the single most similar
        chunk dict (keys: content/page/doc).
    """
    query_embedding = embedder.encode([query])
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # Explicitly descending similarity: the original ascending argsort buried
    # the most relevant chunk at the end and cited the source only via that
    # implicit ordering.
    top_k = np.argsort(similarities)[::-1][:k]
    selected = [chunks[i] for i in top_k]
    context = "\n".join(c["content"] for c in selected)
    source = selected[0]  # most similar chunk is the citation
    return context, source
# ---------------- GROQ CALL ----------------
def ask_question(question):
    """Answer ``question`` using only the retrieved PDF context via the Groq API.

    Args:
        question: free-text user question.

    Returns:
        The model's answer followed by a source citation, or a user-facing
        error string (no exceptions propagate to the UI).
    """
    if not chunks:
        return "⚠️ Please load PDFs first."
    context, source = retrieve_context(question)
    # Prompt body kept at column 0 so the triple-quoted literal is unchanged.
    prompt = f"""
You are SmartDoc RAG Chatbot.
Answer the question using ONLY the context below.
Context:
{context}
Question:
{question}
"""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.post(
            GROQ_URL,
            headers=headers,
            json={
                "model": MODEL_NAME,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2
            },
            timeout=60  # never hang the UI on a stalled request
        )
        response.raise_for_status()  # surface HTTP errors instead of KeyError below
        answer = response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, ValueError) as e:
        # Covers network failures, non-2xx responses, and unexpected payloads.
        return f"❌ Groq API request failed: {e}"
    return f"""{answer}
📄 Source: {source['doc']} — Page {source['page']}"""
# ---------------- UI ----------------
# Custom CSS layered on top of the Soft theme: page gradient, centered
# headings, and minor textbox/button tweaks.
css = """
body {
background: linear-gradient(120deg, #e0f2ff, #f8fbff);
}
h1, h3 {
text-align: center;
}
.gr-textbox textarea {
font-size: 15px;
}
.gr-button-primary {
font-weight: bold;
}
"""
# Theme configured once up front so the Blocks call stays readable.
soft_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=["Inter", "sans-serif"]
)


def _clear_question():
    """Reset the question box after a question is sent."""
    return ""


with gr.Blocks(theme=soft_theme, css=css) as demo:
    # Page header.
    gr.Markdown("""
# 📄 SmartDoc RAG Chatbot
### Retrieval‑Augmented AI for Document Question Answering
Upload PDFs and ask questions based **only** on their content.
""")

    with gr.Row():
        # Left panel: document upload and status.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                file_types=[".pdf"],
                file_count="multiple",
                label="📂 Upload PDF Documents"
            )
            load_button = gr.Button("📥 Load Documents", variant="primary")
            status_box = gr.Textbox(label="Status", interactive=False)

        # Right panel: question entry and answer display.
        with gr.Column(scale=2):
            with gr.Row():
                question_box = gr.Textbox(
                    placeholder="Type your question here…",
                    lines=1,
                    scale=8
                )
                send_button = gr.Button("➤", scale=1)
            answer_box = gr.Textbox(label="Answer", lines=8)

    # Event wiring: load PDFs; answer on button click or Enter, then clear input.
    load_button.click(load_pdfs, inputs=pdf_input, outputs=status_box)
    send_button.click(
        ask_question, inputs=question_box, outputs=answer_box
    ).then(_clear_question, None, question_box)
    question_box.submit(
        ask_question, inputs=question_box, outputs=answer_box
    ).then(_clear_question, None, question_box)

demo.launch()