# PDF_Supervisor / app.py
# (Hugging Face Space page header — uploader/commit metadata removed so the
# file parses as Python.)
import os
import gradio as gr
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import groq
import traceback
# πŸ” Set your GROQ API Key as a HF Space secret (recommended)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "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") # or set here temporarily
groq_client = groq.Groq(api_key=GROQ_API_KEY)
# ==========================
# 🔧 Prompt Templates
# ==========================
# System role sent on every request; user template is filled with the
# retrieved context plus the student's question (see answer_question()).
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"
# ==========================
# 🧠 Embedding Model
# ==========================
# Small, fast sentence-embedding model used for both chunk and query vectors.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# πŸ“„ PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        One string containing each page's text, separated by "\\n".
    """
    # Context manager guarantees the document handle is closed even if a page
    # fails to parse — the original left the handle open (leak per upload).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping word-window chunks.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Number of words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If ``overlap >= chunk_size`` — previously an overlap
            larger than chunk_size made the range step negative and the
            function silently returned [], and an equal overlap raised an
            opaque "range() arg 3 must not be zero" error.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def create_vector_store(chunks):
    """Embed the text chunks and build an in-memory FAISS L2 index over them.

    Args:
        chunks: List of chunk strings to index.

    Returns:
        Tuple of (faiss index, the same chunks list, the embedding matrix).
    """
    vectors = np.asarray(embedder.encode(chunks))
    dim = vectors.shape[1]
    store = faiss.IndexFlatL2(dim)  # exact L2 search; fine for one document
    store.add(vectors)
    return store, chunks, vectors
def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the k chunks most similar to the question, joined by blank lines.

    Args:
        question: The student's question text.
        index: FAISS index built over ``chunks``.
        chunks: The chunk strings backing the index, in insertion order.
        embeddings: Chunk embedding matrix (unused here; kept for interface
            compatibility with existing callers).
        k: Maximum number of chunks to retrieve.

    Returns:
        Retrieved chunks concatenated with "\\n\\n" separators.
    """
    # FAISS pads results with id -1 when k exceeds the number of stored
    # vectors; the original then did chunks[-1], silently duplicating the
    # last chunk. Clamp k and filter defensively.
    k = min(k, len(chunks))
    question_embedding = embedder.encode([question])
    D, I = index.search(np.array(question_embedding), k)
    return "\n\n".join(chunks[i] for i in I[0] if i >= 0)
def call_llama3(system, user):
    """Send a system + user message pair to LLaMA 3 on Groq and return the reply text.

    Args:
        system: System-role instruction string.
        user: User-role prompt string.

    Returns:
        The assistant message content from the first completion choice.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = groq_client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
# ==========================
# 🌐 Gradio App
# ==========================
# Module-level RAG state: written by process_pdf(), read by answer_question().
# Single-document, single-user state — each new upload replaces the previous.
vector_index = None  # FAISS index over the current document's chunks
stored_chunks = None  # list of chunk strings backing the index
stored_embeddings = None  # chunk embedding matrix returned by create_vector_store
def process_pdf(file):
    """Ingest an uploaded PDF: extract its text, chunk it, and build the vector store.

    On success the module-level RAG state (vector_index, stored_chunks,
    stored_embeddings) is replaced. Always returns a human-readable status
    string for the Gradio status textbox — errors are reported, not raised.
    """
    global vector_index, stored_chunks, stored_embeddings
    try:
        # Gradio may deliver either a plain path string or a file-like
        # object exposing .name; reject anything else.
        if isinstance(file, str):
            file_path = file
        elif hasattr(file, "name"):
            file_path = file.name
        else:
            return "❌ Error: Unsupported file format."

        text = extract_text_from_pdf(file_path)
        if not text.strip():
            return "❌ Error: No text found in the PDF. It might be image-based or encrypted."

        chunks = chunk_text(text)
        if len(chunks) == 0:
            return "❌ Error: Could not generate chunks from text."

        vector_index, stored_chunks, stored_embeddings = create_vector_store(chunks)
        return f"✅ Successfully processed the document with {len(chunks)} chunks."
    except Exception as e:
        # Surface the full traceback in the status box so Space users can
        # report actionable errors without server log access.
        return f"❌ Failed to process PDF:\n{str(e)}\n\n{traceback.format_exc()}"
def answer_question(question):
    """Answer a question about the processed PDF via retrieval + LLaMA 3.

    Args:
        question: The student's question string.

    Returns:
        The model's answer, or a warning string if no PDF has been processed.
    """
    # Explicit None check: the original used `if not vector_index`, which
    # relies on the truthiness of a swig-wrapped FAISS object; `is None`
    # states the actual condition (no document processed yet).
    if vector_index is None:
        return "⚠️ Please upload and process a PDF first."
    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)
# Build the two-row UI: upload/process controls on top, Q&A below.
with gr.Blocks() as app:
    gr.Markdown("# πŸ“š RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions β€” powered by LLaMA 3 and semantic search.")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    # Read-only status box fed by process_pdf()'s return string.
    upload_output = gr.Textbox(label="Status", interactive=False)
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the paper")
        ask_btn = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer", lines=6)
    # Wire buttons to the handlers defined above.
    upload_btn.click(process_pdf, inputs=pdf_upload, outputs=upload_output)
    ask_btn.click(answer_question, inputs=question, outputs=answer)
app.launch()