"""Gradio app: index a PDF (FAISS + sentence-transformers) and answer questions via Groq."""

import os
import re
import uuid

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
from groq import Groq
from sentence_transformers import SentenceTransformer

# Embedding model used for both document chunks and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Module-level state for the currently indexed document.
document_chunks = []   # list[str]: text chunks of the indexed PDF
doc_embeddings = []    # np.ndarray after indexing: float32 chunk embeddings
doc_ids = []           # list[str]: one UUID per chunk (informational only)
index = None           # faiss.IndexFlatL2, or None until a PDF is indexed

# Groq client; GROQ_API_KEY must be set in the environment.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    # Context manager closes the document handle (the original leaked it).
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def chunk_text(text, max_tokens=500):
    """Split *text* into chunks of at most ~*max_tokens* whitespace-separated words.

    Splits on sentence boundaries (., !, ?); a single sentence longer than
    *max_tokens* becomes its own oversized chunk. Returns a list of stripped,
    non-empty strings.
    """
    sentences = re.split(r"(?<=[.!?]) +", text)
    chunks = []
    current = ""
    tokens = 0
    for sentence in sentences:
        sentence_tokens = len(sentence.split())
        if tokens + sentence_tokens > max_tokens:
            # Fix: the original appended an empty chunk when the very first
            # sentence alone exceeded max_tokens (current was still "").
            if current.strip():
                chunks.append(current.strip())
            current = sentence
            tokens = sentence_tokens
        else:
            current += " " + sentence
            tokens += sentence_tokens
    if current.strip():
        chunks.append(current.strip())
    return chunks


def index_pdf(pdf_file):
    """Extract, chunk, embed, and FAISS-index the uploaded PDF.

    Returns a human-readable status string for the Gradio status textbox.
    """
    global document_chunks, doc_embeddings, doc_ids, index
    if not pdf_file:
        return "❌ Please upload a PDF file."
    # gr.File(type="filepath") yields a plain path string in current Gradio;
    # older versions passed a tempfile-like object with .name. Accept both.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    document_chunks = chunk_text(text)
    if not document_chunks:
        # Fix: empty/scanned PDFs previously crashed on shape[1] of an
        # empty embedding array.
        return "❌ No extractable text found in this PDF."
    doc_embeddings = np.asarray(embedder.encode(document_chunks), dtype="float32")
    dimension = doc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(doc_embeddings)
    doc_ids = [str(uuid.uuid4()) for _ in document_chunks]
    return "✅ PDF indexed successfully. You can now ask questions."


def retrieve_relevant_chunks(query, k=3):
    """Return up to *k* indexed chunks most similar to *query* (L2 distance)."""
    query_embedding = embedder.encode([query]).astype("float32")
    # Clamp k to the corpus size: FAISS pads missing results with index -1,
    # which the original silently turned into document_chunks[-1].
    k = min(k, len(document_chunks))
    _, indices = index.search(query_embedding, k)
    return [document_chunks[i] for i in indices[0] if i >= 0]


def generate_answer(user_query):
    """Answer *user_query* with the Groq LLM, grounded in the indexed PDF."""
    if index is None:
        return "❌ Please upload and index a PDF first."
    top_chunks = retrieve_relevant_chunks(user_query, k=3)
    context = "\n\n".join(top_chunks)
    messages = [
        {
            "role": "system",
            "content": "You are a helpful academic assistant who answers questions based on uploaded PDF papers.",
        },
        {
            "role": "user",
            "content": f"Context: {context}\n\nQuestion: {user_query}",
        },
    ]
    try:
        response = client.chat.completions.create(
            messages=messages,
            model="llama3-8b-8192",
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Surface API failures in the UI instead of crashing the app.
        return f"❌ Error generating response: {e}"


# --- Gradio UI ---
with gr.Blocks(title="📘 PDF Question Assistant") as demo:
    gr.Markdown("# 📘 Ask Questions About Your PDF")

    with gr.Tab("📄 Upload & Index"):
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"])
            upload_btn = gr.Button("🔍 Index PDF", variant="primary")
        upload_status = gr.Textbox(label="", interactive=False, placeholder="Status will appear here...")

    with gr.Tab("❓ Ask a Question"):
        with gr.Row():
            query = gr.Textbox(label="Ask something from the PDF", placeholder="E.g. What is the main argument of the paper?")
            query_btn = gr.Button("🧠 Get Answer")
        answer = gr.Textbox(label="Answer", placeholder="AI-generated answer will appear here...", lines=8)

    upload_btn.click(fn=index_pdf, inputs=[pdf_input], outputs=[upload_status])
    query_btn.click(fn=generate_answer, inputs=[query], outputs=[answer])

if __name__ == "__main__":
    demo.launch()