import os
import traceback

import faiss
import fitz  # PyMuPDF
import gradio as gr
import groq
import numpy as np
from sentence_transformers import SentenceTransformer

# 🔐 Read the Groq API key from the environment (set it as a HF Space secret).
# SECURITY: the original code embedded a real API key as a fallback literal.
# A key committed to source is compromised — it has been removed; revoke it
# and configure GROQ_API_KEY as a secret instead.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set. Configure it as an environment secret.")
groq_client = groq.Groq(api_key=GROQ_API_KEY)

# ==========================
# 🔧 Prompt Templates
# ==========================
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"

# ==========================
# 🧠 Embedding Model
# ==========================
embedder = SentenceTransformer("all-MiniLM-L6-v2")


# ==========================
# 📄 PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*, joined by newlines."""
    # Context manager closes the document — the original leaked the file handle.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into word-based chunks of *chunk_size* words with *overlap* words shared between consecutive chunks.

    Raises:
        ValueError: if overlap >= chunk_size (the window would never advance).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]


def create_vector_store(chunks):
    """Embed *chunks* and build an L2 FAISS index over them.

    Returns:
        (index, chunks, embeddings) — the FAISS index, the original chunk list,
        and the raw embedding matrix.
    """
    embeddings = embedder.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    return index, chunks, embeddings


def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the *k* chunks most similar to *question*, joined by blank lines.

    The *embeddings* parameter is kept for interface compatibility; only the
    index and chunk list are consulted here.
    """
    question_embedding = embedder.encode([question])
    D, I = index.search(np.array(question_embedding), k)
    # FAISS pads missing neighbours with -1 when k exceeds the index size;
    # the original let chunks[-1] silently duplicate the last chunk.
    return "\n\n".join(chunks[i] for i in I[0] if i != -1)


def call_llama3(system, user):
    """Send a system+user message pair to LLaMA 3 (8B) via Groq and return the reply text."""
    response = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# ==========================
# 🌐 Gradio App
# ==========================
# Module-level state: populated by process_pdf, read by answer_question.
vector_index = None
stored_chunks = None
stored_embeddings = None


def process_pdf(file):
    """Extract, chunk, embed, and index the uploaded PDF.

    Accepts either a filesystem path (str) or a Gradio file object with a
    ``.name`` attribute. Returns a human-readable status string; never raises.
    """
    global vector_index, stored_chunks, stored_embeddings
    try:
        if isinstance(file, str):
            file_path = file
        elif hasattr(file, "name"):
            file_path = file.name
        else:
            return "❌ Error: Unsupported file format."

        text = extract_text_from_pdf(file_path)
        if not text.strip():
            return "❌ Error: No text found in the PDF. It might be image-based or encrypted."

        chunks = chunk_text(text)
        if len(chunks) == 0:
            return "❌ Error: Could not generate chunks from text."

        vector_index, stored_chunks, stored_embeddings = create_vector_store(chunks)
        return f"✅ Successfully processed the document with {len(chunks)} chunks."
    except Exception as e:
        # Surface the traceback in the UI — this is a demo app, not production.
        return f"❌ Failed to process PDF:\n{str(e)}\n\n{traceback.format_exc()}"


def answer_question(question):
    """Answer *question* from the indexed document via retrieval + LLaMA 3."""
    # `is None` rather than truthiness: a FAISS index object may not define
    # __bool__, and an empty-but-valid index must not look like "no index".
    if vector_index is None:
        return "⚠️ Please upload and process a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."
    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)


with gr.Blocks() as app:
    gr.Markdown("# 📚 RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions — powered by LLaMA 3 and semantic search.")

    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    upload_output = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question = gr.Textbox(label="Ask a question about the paper")
        ask_btn = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer", lines=6)

    upload_btn.click(process_pdf, inputs=pdf_upload, outputs=upload_output)
    ask_btn.click(answer_question, inputs=question, outputs=answer)

# Guard so importing this module (e.g. from tests) doesn't start the server;
# HF Spaces executes the file as __main__, so deployment behavior is unchanged.
if __name__ == "__main__":
    app.launch()