Hugging Face Spaces — app status: Runtime error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import os | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| import faiss | |
| import uuid | |
| from groq import Groq | |
# Sentence-embedding model used to vectorize both document chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# In-memory stores for the currently indexed PDF. These are module-level
# state, fully rebuilt on every call to index_pdf (single-document app).
document_chunks = []   # list[str]: text chunks of the uploaded PDF
doc_embeddings = []    # becomes a float32 numpy array after indexing
doc_ids = []           # one UUID string per chunk (parallel to document_chunks)
index = None           # faiss.IndexFlatL2 once a PDF has been indexed

# Groq client; GROQ_API_KEY must be set in the environment (e.g. Space secret).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
| # Load and split PDF | |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of the PDF at pdf_path.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF.

    Returns:
        A single string with the text of all pages in document order.

    The document handle is closed even if text extraction raises (the
    original version leaked the open handle).
    """
    doc = fitz.open(pdf_path)
    try:
        # str.join avoids quadratic string concatenation across many pages.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()
| # Chunking logic | |
def chunk_text(text, max_tokens=500):
    """Split text into chunks of at most ~max_tokens whitespace-separated words.

    Sentences (split on ., ! or ? followed by spaces) are kept whole; a chunk
    is flushed when adding the next sentence would exceed max_tokens. A single
    sentence longer than max_tokens becomes its own oversized chunk.

    Args:
        text: Raw document text.
        max_tokens: Soft word-count cap per chunk.

    Returns:
        list[str] of non-empty chunks; [] for empty/whitespace-only input.
        (The original version could emit an empty first chunk when the first
        sentence alone exceeded max_tokens, and returned a bogus chunk for
        empty input.)
    """
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current = []   # sentences of the chunk being built
    tokens = 0     # word count of `current`
    for sentence in sentences:
        n = len(sentence.split())
        # Only flush when there is something to flush — never emit "".
        if current and tokens + n > max_tokens:
            chunks.append(" ".join(current).strip())
            current = [sentence]
            tokens = n
        else:
            current.append(sentence)
            tokens += n
    if current and tokens:
        chunks.append(" ".join(current).strip())
    return chunks
| # Indexing | |
def index_pdf(pdf_file):
    """Extract, chunk, embed and FAISS-index the uploaded PDF.

    Rebuilds all module-level stores (document_chunks, doc_embeddings,
    doc_ids, index) on every call.

    Args:
        pdf_file: Either a filepath string (what gr.File(type="filepath")
            passes in current Gradio) or a tempfile-like object exposing
            .name (older Gradio). The original version assumed .name and
            crashed with AttributeError on a plain string path.

    Returns:
        A human-readable status message for the UI.
    """
    global document_chunks, doc_embeddings, doc_ids, index
    if not pdf_file:
        return "β Please upload a PDF file."
    # Accept both the string path and the legacy file-wrapper forms.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    document_chunks = chunk_text(text)
    # Guard scanned/empty PDFs: embedder.encode([]) would fail downstream.
    if not document_chunks:
        return "β No extractable text found in this PDF."
    doc_embeddings = np.array(embedder.encode(document_chunks)).astype("float32")
    dimension = doc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(doc_embeddings)
    doc_ids = [str(uuid.uuid4()) for _ in range(len(document_chunks))]
    return "β PDF indexed successfully. You can now ask questions."
| # Retrieve top chunks | |
def retrieve_relevant_chunks(query, k=3):
    """Return up to k document chunks most similar to query (L2 distance).

    Args:
        query: Free-text question.
        k: Maximum number of chunks to return.

    Returns:
        list[str] of at most k chunks, best match first.

    When k exceeds the number of indexed vectors, FAISS pads the result
    with index -1; the original code then returned document_chunks[-1]
    (the wrong chunk) silently. We clamp k and filter sentinels.
    """
    query_embedding = embedder.encode([query]).astype("float32")
    k = min(k, len(document_chunks))
    distances, indices = index.search(query_embedding, k)
    return [document_chunks[i] for i in indices[0] if 0 <= i < len(document_chunks)]
| # Generate answer using Groq | |
def generate_answer(user_query):
    """Answer user_query with Groq's Llama 3, grounded on the indexed PDF.

    Retrieves the top-3 most relevant chunks, stuffs them into the prompt
    as context, and returns the model's reply (or a status/error string —
    the UI displays whatever string comes back).

    Args:
        user_query: Question typed by the user.

    Returns:
        The model answer, or a message prompting the user / reporting an error.
    """
    if index is None:
        return "β Please upload and index a PDF first."
    # Guard blank queries before spending an embedding + API call.
    if not user_query or not user_query.strip():
        return "β Please enter a question."
    top_chunks = retrieve_relevant_chunks(user_query, k=3)
    context = "\n\n".join(top_chunks)
    messages = [
        {"role": "system", "content": "You are a helpful academic assistant who answers questions based on uploaded PDF papers."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_query}"}
    ]
    try:
        response = client.chat.completions.create(
            messages=messages,
            model="llama3-8b-8192",
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Surface API failures (auth, rate limit, network) in the UI rather
        # than crashing the Gradio handler.
        return f"β Error generating response: {e}"
| # Gradio UI | |
# Gradio UI: one tab to upload & index a PDF, one tab to ask questions.
with gr.Blocks(title="π PDF Question Assistant") as demo:
    gr.Markdown("# π Ask Questions About Your PDF")
    with gr.Tab("π Upload & Index"):
        with gr.Row():
            # type="filepath" means the handler receives a path string.
            pdf_input = gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"])
            upload_btn = gr.Button("π Index PDF", variant="primary")
        upload_status = gr.Textbox(label="", interactive=False, placeholder="Status will appear here...")
    with gr.Tab("β Ask a Question"):
        with gr.Row():
            query = gr.Textbox(label="Ask something from the PDF", placeholder="E.g. What is the main argument of the paper?")
            query_btn = gr.Button("π§ Get Answer")
        answer = gr.Textbox(label="Answer", placeholder="AI-generated answer will appear here...", lines=8)
    # Wire buttons to the module-level handlers defined above.
    upload_btn.click(fn=index_pdf, inputs=[pdf_input], outputs=[upload_status])
    query_btn.click(fn=generate_answer, inputs=[query], outputs=[answer])

if __name__ == "__main__":
    demo.launch()