# RAG_PDF_app / app.py
# (Hugging Face Space metadata: uploaded by FurqanIshaq — "Update app.py", commit b605a56, verified)
# ============================================
# πŸ“˜ Study Supervisor RAG App (Colab Compatible)
# ============================================
# πŸ”§ STEP 1: Dependencies.
# NOTE: the original `!pip -q install ...` line is IPython/Colab shell magic and
# is a SyntaxError in a plain .py file. Install the packages beforehand:
#   pip install gradio faiss-cpu sentence-transformers PyPDF2 requests
# (on a Hugging Face Space, list them in requirements.txt instead).

# πŸ”§ STEP 2: Imports
import os
from io import BytesIO

import faiss
import gradio as gr
import requests
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# βœ… CONFIGURATION
# Prefer the environment variable (never commit secrets to source); the empty
# fallback keeps the old "paste your key here" workflow working.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # πŸ” Set your Groq API key
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
EMBED_MODEL = "all-MiniLM-L6-v2"  # sentence-transformers model name
CHUNK_SIZE = 500                  # characters per text chunk
TOP_K = 5                         # number of chunks retrieved per query

# βœ… Load the embedding model once at startup (downloads weights on first run).
embedder = SentenceTransformer(EMBED_MODEL)

# βœ… Global state shared between the "process" and "ask" handlers.
faiss_index = None  # faiss.IndexFlatL2 built from the uploaded PDF
chunk_texts = []    # list[str] of raw text chunks, aligned with the index rows
# πŸ”„ STEP 3: Process PDF, Chunk, Embed, Index
def process_pdf(file_obj):
    """Extract text from an uploaded PDF, chunk it, embed it, and build the
    global FAISS index that ask_question() searches.

    Args:
        file_obj: The value Gradio passes from gr.File. Depending on the
            Gradio version this is a filesystem path (str), a tempfile
            wrapper exposing ``.name``, or a file-like with ``.read()`` —
            all three are accepted.

    Returns:
        str: A human-readable status message for the UI.
    """
    global faiss_index, chunk_texts
    # BUG FIX: newer Gradio hands the handler a path string (or a wrapper
    # with .name), not a file-like — the original unconditional
    # file_obj.read() crashed there. Normalise every shape into raw bytes.
    if file_obj is None:
        return "❌ No extractable text found."
    if isinstance(file_obj, (str, os.PathLike)):
        with open(file_obj, "rb") as fh:
            raw = fh.read()
    elif hasattr(file_obj, "read"):
        raw = file_obj.read()
    else:
        with open(file_obj.name, "rb") as fh:
            raw = fh.read()

    pdf = PdfReader(BytesIO(raw))
    # extract_text() returns None for image-only pages — skip those, and keep
    # the original one-newline-per-page separator so chunking is unchanged.
    full_text = "".join(
        text + "\n" for page in pdf.pages if (text := page.extract_text())
    )
    if not full_text.strip():
        return "❌ No extractable text found."

    # Fixed-size character chunks (no overlap), embedded in one batch.
    chunk_texts = [
        full_text[i:i + CHUNK_SIZE] for i in range(0, len(full_text), CHUNK_SIZE)
    ]
    embeddings = embedder.encode(chunk_texts)  # float32 numpy (n, dim)
    # Flat L2 index: exact nearest-neighbour search, no training required.
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(embeddings)
    return f"βœ… PDF processed: {len(chunk_texts)} chunks indexed."
# 🧠 STEP 4: Call Groq LLaMA 3 for Answer Generation
def call_groq_llm(context, question):
    """Ask Groq's OpenAI-compatible chat-completions API to answer a question.

    Args:
        context: Retrieved document text to ground the answer in.
        question: The student's question.

    Returns:
        str: The model's answer, or a human-readable "❌ ..." error string on
        any failure (missing key, HTTP error, unexpected payload, exception).
        Errors are returned rather than raised so the Gradio UI shows them.
    """
    if not GROQ_API_KEY:
        return "❌ API Key is missing. Set your GROQ_API_KEY."
    prompt = f"""You are an academic supervisor helping a student understand a research paper.
Context:
{context}
Student's Question:
{question}
Answer:"""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": "You are a knowledgeable and supportive supervisor guiding a student through a research paper. Respond clearly and academically."},
            {"role": "user", "content": prompt},
        ],
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)
        if response.status_code != 200:
            return f"❌ Groq API Error {response.status_code}: {response.text}"
        payload = response.json()
        # BUG FIX: a 200 with an unexpected body previously surfaced as a raw
        # KeyError/IndexError message; report it explicitly instead.
        try:
            return payload['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            return f"❌ Unexpected API response: {response.text}"
    except Exception as e:
        # Broad catch is deliberate: any network/JSON failure becomes a UI
        # message rather than a crash of the request handler.
        return f"❌ Exception: {str(e)}"
# πŸ’¬ STEP 5: Query Handler
def ask_question(query, history=None):
    """Answer a chat question using the indexed PDF (gr.ChatInterface handler).

    BUG FIX: gr.ChatInterface calls its fn with (message, history); the
    original single-argument signature raised TypeError on every message.
    `history` defaults to None so existing one-argument callers still work
    (the history is not used — each question is answered independently).

    Args:
        query: The student's question.
        history: Chat history supplied by gr.ChatInterface; ignored.

    Returns:
        str: The LLM's answer, or an error message if no PDF is indexed.
    """
    if faiss_index is None or not chunk_texts:
        return "❌ Please upload and process a PDF first."
    query_embed = embedder.encode([query])
    # BUG FIX: never ask FAISS for more neighbours than indexed vectors —
    # it pads missing results with id -1, which the original list-index
    # silently turned into chunk_texts[-1] (the last chunk, duplicated).
    k = min(TOP_K, len(chunk_texts))
    _, ids = faiss_index.search(query_embed, k)
    retrieved_chunks = [chunk_texts[i] for i in ids[0] if 0 <= i < len(chunk_texts)]
    context = "\n---\n".join(retrieved_chunks)
    # Trim the context so the prompt stays within a safe size.
    return call_groq_llm(context[:3000], query)
# 🎨 STEP 6: Gradio Interface
# 🎨 STEP 6: Gradio Interface
# Top-level UI wiring: an upload/process row plus a chat panel, then launch.
with gr.Blocks() as app:
    gr.Markdown("πŸ“˜ **Student Study Assistant** - Upload a research paper and ask questions.")
    with gr.Row():
        # File picker and the button that triggers PDF indexing.
        file_input = gr.File(label="πŸ“Ž Upload PDF")
        process_button = gr.Button("πŸ“₯ Process Document")
    # Shows the "βœ… ... chunks indexed" / "❌ ..." message from process_pdf.
    status_output = gr.Textbox(label="Processing Status")
    # NOTE(review): gr.ChatInterface invokes fn with (message, history) —
    # confirm ask_question's signature accepts both arguments.
    chatbot = gr.ChatInterface(
        fn=ask_question,
        title="πŸŽ“ Study Supervisor",
        description="Ask your supervisor questions about the uploaded paper.",
        theme="soft"
    )
    # Clicking the button runs process_pdf(file) and writes its status string.
    process_button.click(fn=process_pdf, inputs=file_input, outputs=status_output)

# share=True requests a public tunnel URL (useful in Colab; has no effect on
# platforms that manage their own serving).
app.launch(share=True)