# PDF_Supervisor / app.py
# (Hugging Face Space page header — uploader/commit metadata removed so the
# file parses as Python.)
import os
import gradio as gr
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import groq
import traceback
# πŸ” Set your GROQ API Key as a HF Space secret (recommended)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "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") # or set here temporarily
groq_client = groq.Groq(api_key=GROQ_API_KEY)
# ==========================
# 🔧 Prompt Templates
# ==========================
# System role sent on every request; user template is filled with the
# retrieved context plus the student's question (see answer_question()).
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"
# ==========================
# 🧠 Embedding Model
# ==========================
# Small, fast sentence-embedding model used for both chunk and query vectors.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# πŸ“„ PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        One string containing each page's text, separated by "\\n".
    """
    # Context manager guarantees the document handle is closed even if a page
    # fails to parse — the original left the handle open (leak per upload).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping word-window chunks.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Number of words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If ``overlap >= chunk_size`` — previously an overlap
            larger than chunk_size made the range step negative and the
            function silently returned [], and an equal overlap raised an
            opaque "range() arg 3 must not be zero" error.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def create_vector_store(chunks):
    """Embed the text chunks and build an in-memory FAISS L2 index over them.

    Args:
        chunks: List of chunk strings to index.

    Returns:
        Tuple of (faiss index, the same chunks list, the embedding matrix).
    """
    vectors = np.asarray(embedder.encode(chunks))
    dim = vectors.shape[1]
    store = faiss.IndexFlatL2(dim)  # exact L2 search; fine for one document
    store.add(vectors)
    return store, chunks, vectors
def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the k chunks most similar to the question, joined by blank lines.

    Args:
        question: The student's question text.
        index: FAISS index built over ``chunks``.
        chunks: The chunk strings backing the index, in insertion order.
        embeddings: Chunk embedding matrix (unused here; kept for interface
            compatibility with existing callers).
        k: Maximum number of chunks to retrieve.

    Returns:
        Retrieved chunks concatenated with "\\n\\n" separators.
    """
    # FAISS pads results with id -1 when k exceeds the number of stored
    # vectors; the original then did chunks[-1], silently duplicating the
    # last chunk. Clamp k and filter defensively.
    k = min(k, len(chunks))
    question_embedding = embedder.encode([question])
    D, I = index.search(np.array(question_embedding), k)
    return "\n\n".join(chunks[i] for i in I[0] if i >= 0)
def call_llama3(system, user):
    """Send a system + user message pair to LLaMA 3 on Groq and return the reply text.

    Args:
        system: System-role instruction string.
        user: User-role prompt string.

    Returns:
        The assistant message content from the first completion choice.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = groq_client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
# ==========================
# 🌐 Gradio App
# ==========================
# Module-level RAG state: written by process_pdf(), read by answer_question().
# Single-document, single-user state — each new upload replaces the previous.
vector_index = None  # FAISS index over the current document's chunks
stored_chunks = None  # list of chunk strings backing the index
stored_embeddings = None  # chunk embedding matrix returned by create_vector_store
def process_pdf(file):
    """Ingest an uploaded PDF: extract its text, chunk it, and build the vector store.

    On success the module-level RAG state (vector_index, stored_chunks,
    stored_embeddings) is replaced. Always returns a human-readable status
    string for the Gradio status textbox — errors are reported, not raised.
    """
    global vector_index, stored_chunks, stored_embeddings
    try:
        # Gradio may deliver either a plain path string or a file-like
        # object exposing .name; reject anything else.
        if isinstance(file, str):
            file_path = file
        elif hasattr(file, "name"):
            file_path = file.name
        else:
            return "❌ Error: Unsupported file format."

        text = extract_text_from_pdf(file_path)
        if not text.strip():
            return "❌ Error: No text found in the PDF. It might be image-based or encrypted."

        chunks = chunk_text(text)
        if len(chunks) == 0:
            return "❌ Error: Could not generate chunks from text."

        vector_index, stored_chunks, stored_embeddings = create_vector_store(chunks)
        return f"✅ Successfully processed the document with {len(chunks)} chunks."
    except Exception as e:
        # Surface the full traceback in the status box so Space users can
        # report actionable errors without server log access.
        return f"❌ Failed to process PDF:\n{str(e)}\n\n{traceback.format_exc()}"
def answer_question(question):
    """Answer a question about the processed PDF via retrieval + LLaMA 3.

    Args:
        question: The student's question string.

    Returns:
        The model's answer, or a warning string if no PDF has been processed.
    """
    # Explicit None check: the original used `if not vector_index`, which
    # relies on the truthiness of a swig-wrapped FAISS object; `is None`
    # states the actual condition (no document processed yet).
    if vector_index is None:
        return "⚠️ Please upload and process a PDF first."
    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)
# Build the two-row UI: upload/process controls on top, Q&A below.
with gr.Blocks() as app:
    gr.Markdown("# πŸ“š RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions β€” powered by LLaMA 3 and semantic search.")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    # Read-only status box fed by process_pdf()'s return string.
    upload_output = gr.Textbox(label="Status", interactive=False)
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the paper")
        ask_btn = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer", lines=6)
    # Wire buttons to the handlers defined above.
    upload_btn.click(process_pdf, inputs=pdf_upload, outputs=upload_output)
    ask_btn.click(answer_question, inputs=question, outputs=answer)
app.launch()