# Source: Hugging Face Space snapshot (commit ef66c93, file size 3,858 bytes).
# NOTE(review): the Space page reported "Runtime error" at capture time; the
# scrape also stripped code indentation and mangled emoji characters below.
import gradio as gr
import fitz # PyMuPDF
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import uuid
from groq import Groq
# Sentence-embedding model, used both to index PDF chunks and to encode queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Module-level state shared by index_pdf / retrieve_relevant_chunks / generate_answer:
#   document_chunks - list[str], sentence-grouped chunks of the current PDF
#   doc_embeddings  - float32 matrix of chunk embeddings (rows align with document_chunks)
#   doc_ids         - one UUID string per chunk
#   index           - FAISS L2 index over doc_embeddings; None until a PDF is indexed
document_chunks = []
doc_embeddings = []
doc_ids = []
index = None
# Groq chat client; GROQ_API_KEY must be set in the environment
# (client creation does not validate the key — failures surface at request time).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
# Load and split PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in the PDF at *pdf_path*.

    Fix vs. original: the fitz Document was never closed (file handle leak);
    a context manager now guarantees cleanup. Text is collected in a list and
    joined once instead of quadratic string `+=`.
    """
    pages = []
    # fitz.Document supports the context-manager protocol and closes on exit.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)
# Chunking logic
def chunk_text(text, max_tokens=500):
    """Split *text* into chunks of at most ~max_tokens whitespace-separated words,
    breaking only on sentence boundaries (., !, ? followed by spaces).

    A single sentence longer than max_tokens still becomes its own chunk.

    Fixes vs. original:
    - empty/whitespace input returned [""] (one bogus empty chunk that would
      then be embedded and indexed); it now returns [].
    - when the very first sentence exceeded max_tokens, an empty string was
      appended before it; empty chunks are no longer emitted.
    """
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current = ""
    token_count = 0
    for sentence in sentences:
        if not sentence:
            continue
        sentence_tokens = len(sentence.split())
        # Flush the running chunk only if it is non-empty and adding this
        # sentence would exceed the budget.
        if token_count + sentence_tokens > max_tokens and current.strip():
            chunks.append(current.strip())
            current = sentence
            token_count = sentence_tokens
        else:
            current += " " + sentence
            token_count += sentence_tokens
    tail = current.strip()
    if tail:
        chunks.append(tail)
    return chunks
# Indexing
def index_pdf(pdf_file):
    """Extract, chunk, embed, and FAISS-index the uploaded PDF.

    Rebinds the module globals document_chunks / doc_embeddings / doc_ids / index.
    Returns a user-facing status string for the Gradio status textbox.

    Fixes vs. original:
    - the success message string was corrupted/split across two lines in the
      scraped source; reconstructed as one literal (emoji presumed ✅/❌ from
      the mojibake — confirm against the live Space).
    - a PDF with no extractable text previously crashed on
      embedder.encode([]).shape[1]; it now returns an error message instead.
    """
    global document_chunks, doc_embeddings, doc_ids, index
    if not pdf_file:
        return "❌ Please upload a PDF file."
    text = extract_text_from_pdf(pdf_file.name)
    document_chunks = chunk_text(text)
    if not document_chunks:
        return "❌ No extractable text found in this PDF."
    doc_embeddings = np.asarray(embedder.encode(document_chunks), dtype="float32")
    dimension = doc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(doc_embeddings)
    doc_ids = [str(uuid.uuid4()) for _ in document_chunks]
    return "✅ PDF indexed successfully. You can now ask questions."
# Retrieve top chunks
def retrieve_relevant_chunks(query, k=3):
    """Return up to *k* document chunks most similar (L2) to *query*.

    Fix vs. original: when k exceeded the number of indexed vectors, FAISS
    pads the result with id -1, and document_chunks[-1] silently returned the
    LAST chunk (possibly duplicated). k is now clamped and -1 ids filtered.
    Assumes a PDF has already been indexed (index is not None).
    """
    query_embedding = embedder.encode([query]).astype("float32")
    k = min(k, len(document_chunks))
    distances, indices = index.search(query_embedding, k)
    # Filter any -1 padding ids defensively.
    return [document_chunks[i] for i in indices[0] if i >= 0]
# Generate answer using Groq
def generate_answer(user_query):
    """Answer *user_query* from the indexed PDF via Groq chat completion.

    Retrieves the top-3 relevant chunks, stuffs them into the prompt as
    context, and returns the model's reply (or a user-facing error string).

    Notes vs. original: the mojibake "β" prefix in error strings is presumed
    to be ❌ (confirm against the live Space); a blank query is now rejected
    before making a paid API call.
    """
    if index is None:
        return "❌ Please upload and index a PDF first."
    if not user_query or not user_query.strip():
        return "❌ Please enter a question."
    top_chunks = retrieve_relevant_chunks(user_query, k=3)
    context = "\n\n".join(top_chunks)
    messages = [
        {
            "role": "system",
            "content": "You are a helpful academic assistant who answers questions based on uploaded PDF papers.",
        },
        {
            "role": "user",
            "content": f"Context: {context}\n\nQuestion: {user_query}",
        },
    ]
    try:
        response = client.chat.completions.create(
            messages=messages,
            model="llama3-8b-8192",
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Broad catch is deliberate: any SDK/network failure becomes a
        # user-visible message rather than a Gradio stack trace.
        return f"❌ Error generating response: {e}"
# Gradio UI
# Gradio UI: one tab to upload/index a PDF, one tab to ask questions.
# NOTE(review): emoji in titles/labels were mojibake ("π", "β") in the scraped
# source; plausible glyphs are restored here — confirm against the live Space.
with gr.Blocks(title="📄 PDF Question Assistant") as demo:
    gr.Markdown("# 📄 Ask Questions About Your PDF")

    with gr.Tab("📤 Upload & Index"):
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"])
            upload_btn = gr.Button("📄 Index PDF", variant="primary")
        upload_status = gr.Textbox(label="", interactive=False, placeholder="Status will appear here...")

    with gr.Tab("❓ Ask a Question"):
        with gr.Row():
            query = gr.Textbox(label="Ask something from the PDF", placeholder="E.g. What is the main argument of the paper?")
            query_btn = gr.Button("🧠 Get Answer")
        answer = gr.Textbox(label="Answer", placeholder="AI-generated answer will appear here...", lines=8)

    upload_btn.click(fn=index_pdf, inputs=[pdf_input], outputs=[upload_status])
    query_btn.click(fn=generate_answer, inputs=[query], outputs=[answer])

if __name__ == "__main__":
    demo.launch()