"""Gradio RAG demo: index an uploaded PDF with FAISS and answer questions via Groq."""

import gradio as gr
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import requests
import json
from typing import List

# Sentence-embedding model used for both document chunks and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS index over L2 distance; 384 is the embedding size of MiniLM-L6-v2.
dimension = 384
index = faiss.IndexFlatL2(dimension)
stored_chunks = []      # raw text chunks, parallel to the vectors inside `index`
stored_embeddings = []  # kept for parity/debugging; retrieval itself uses `index`

# SECURITY: never hard-code API keys in source. The key previously committed
# here must be treated as leaked and revoked. Read it from the environment.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
LLM_MODEL = "llama3-8b-8192"


def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page of a PDF.

    `pdf_file` is a binary file-like object (as supplied by the Gradio File
    component) whose full contents are read into memory.
    """
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()  # release the PyMuPDF document handle (was leaked before)


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split `text` into overlapping chunks of up to `chunk_size` words.

    Consecutive chunks share `overlap` words so context is not cut mid-thought.
    Raises ValueError if the overlap would make the stride non-positive.
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks


def embed_and_store(chunks: List[str]) -> None:
    """Embed `chunks` and append them to the global FAISS index and chunk store."""
    global stored_chunks, stored_embeddings
    if not chunks:
        # Nothing to index; avoid handing FAISS an empty/zero-dim array.
        return
    embeddings = embedder.encode(chunks)
    index.add(np.array(embeddings, dtype=np.float32))
    stored_chunks.extend(chunks)
    stored_embeddings.extend(embeddings)


def query_groq(prompt: str) -> str:
    """Send `prompt` to the Groq chat-completions API and return the reply text.

    Raises requests.HTTPError on a non-2xx response instead of the opaque
    KeyError the unchecked ["choices"] lookup used to produce.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful academic supervisor helping students study uploaded research papers."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.2,
    }
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


def retrieve_answer(user_query: str) -> str:
    """Retrieve the top-k matching chunks for `user_query` and ask the LLM."""
    embedded_query = embedder.encode([user_query])
    # Clamp k: FAISS pads results with id -1 when k exceeds the number of
    # stored vectors, and stored_chunks[-1] would silently grab the wrong chunk.
    k = min(3, index.ntotal)
    D, I = index.search(np.array(embedded_query, dtype=np.float32), k=k)
    context = "\n\n".join(stored_chunks[i] for i in I[0] if i != -1)
    prompt = (
        f"Based on the following context:\n\n{context}\n\n"
        f"Answer this question:\n{user_query}"
    )
    return query_groq(prompt)


def handle_upload(file) -> str:
    """Gradio callback: extract, chunk, embed, and index the uploaded PDF."""
    if file is None:
        return "Please choose a PDF file first."
    text = extract_text_from_pdf(file)
    embed_and_store(chunk_text(text))
    return "PDF processed and indexed. You can now ask questions."


def handle_question(question: str) -> str:
    """Gradio callback: answer `question` from the indexed chunks."""
    if not stored_chunks:
        return "Please upload a PDF first."
    return retrieve_answer(question)


with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload your PDF")
        upload_btn = gr.Button("Process PDF")
    output_text = gr.Textbox(label="Status / Answer")
    upload_btn.click(fn=handle_upload, inputs=file_input, outputs=output_text)
    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        query_btn = gr.Button("Submit")
    query_btn.click(fn=handle_question, inputs=query_input, outputs=output_text)

demo.launch()