Spaces:
Sleeping
Sleeping
File size: 4,210 Bytes
import os
import traceback

import faiss
import fitz  # PyMuPDF
import gradio as gr
import groq
import numpy as np
from sentence_transformers import SentenceTransformer

# 🔑 Read the Groq API key from the environment (set it as an HF Space secret).
# SECURITY: the previous version embedded a live API key in source as the
# fallback value — that key must be considered compromised and revoked.
# Never commit secrets; fail fast instead so misconfiguration is obvious.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set. Add it as a Space secret.")
groq_client = groq.Groq(api_key=GROQ_API_KEY)
# ==========================
# 🧠 Prompt Templates
# ==========================
# System prompt: fixes the assistant persona for every LLM request.
SYSTEM_TEMPLATE = "You are an expert academic supervisor helping students understand academic papers. Be concise, clear, and encouraging."
# User prompt: filled with the retrieved context chunks and the student's question.
USER_TEMPLATE = "Based on the following context, answer the student's question.\n\nContext:\n{context}\n\nQuestion:\n{question}"
# ==========================
# 🧠 Embedding Model
# ==========================
# Sentence-embedding model used for both chunk indexing and query encoding.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# π PDF Text Extraction
# ==========================
def extract_text_from_pdf(pdf_path):
    """Return all text of the PDF at *pdf_path*, with pages joined by newlines.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        A single string containing the text of every page.
    """
    # Context manager ensures the document handle is closed even on error
    # (the original opened the document and never closed it — a handle leak).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Source text; split on whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty when *text* has no words).

    Raises:
        ValueError: If overlap >= chunk_size (the stride would be <= 0,
            which previously produced an opaque range() error or silently
            returned no chunks).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    stride = chunk_size - overlap
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), stride)
    ]
def create_vector_store(chunks):
    """Embed *chunks* and build an exact (brute-force) L2 FAISS index.

    Args:
        chunks: List of text chunks to index.

    Returns:
        Tuple of (faiss index, the same chunks list, the embedding matrix).
    """
    vectors = embedder.encode(chunks)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors))
    return index, chunks, vectors
def retrieve_relevant_chunks(question, index, chunks, embeddings, k=5):
    """Return the *k* chunks nearest to *question*, joined by blank lines.

    Args:
        question: The user's query string.
        index: FAISS index built over the chunk embeddings.
        chunks: Chunk texts, in the same order the index was built.
        embeddings: Accepted for interface compatibility but unused here.
        k: Number of nearest chunks to retrieve.

    Returns:
        The selected chunk texts concatenated with "\n\n" separators.
    """
    query_vec = np.array(embedder.encode([question]))
    _distances, neighbor_ids = index.search(query_vec, k)
    selected = [chunks[idx] for idx in neighbor_ids[0]]
    return "\n\n".join(selected)
def call_llama3(system, user):
    """Send one system+user message pair to Groq's LLaMA 3 8B model.

    Args:
        system: System-prompt text (assistant persona).
        user: User-prompt text (context + question).

    Returns:
        The assistant's reply as a plain string.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = groq_client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# ==========================
# 🚀 Gradio App
# ==========================
# Module-level state shared between the upload handler (process_pdf, which
# writes these) and the question handler (answer_question, which reads them).
vector_index = None  # FAISS index over the current document's chunks
stored_chunks = None  # list of chunk strings backing the index
stored_embeddings = None  # embedding matrix for the stored chunks
def process_pdf(file):
    """Extract, chunk, embed, and index an uploaded PDF.

    Accepts either a filesystem path (str) or a file-like object exposing a
    ``.name`` attribute (as Gradio's File component provides). Populates the
    module-level vector_index / stored_chunks / stored_embeddings state.

    Args:
        file: Path string or uploaded-file object.

    Returns:
        A human-readable status string; failures are reported in the string
        (including a traceback) rather than raised, so the UI stays alive.
    """
    global vector_index, stored_chunks, stored_embeddings
    try:
        if isinstance(file, str):
            file_path = file
        elif hasattr(file, "name"):
            file_path = file.name
        else:
            return "❌ Error: Unsupported file format."
        text = extract_text_from_pdf(file_path)
        if not text.strip():
            return "❌ Error: No text found in the PDF. It might be image-based or encrypted."
        chunks = chunk_text(text)
        if not chunks:
            return "❌ Error: Could not generate chunks from text."
        vector_index, stored_chunks, stored_embeddings = create_vector_store(chunks)
        # NOTE: this success message was previously an f-string broken across
        # two source lines (an encoding/scrape artifact); reconstructed here
        # as a single literal.
        return f"✅ Successfully processed the document with {len(chunks)} chunks."
    except Exception as e:
        return f"❌ Failed to process PDF:\n{str(e)}\n\n{traceback.format_exc()}"
def answer_question(question):
    """Answer *question* using context retrieved from the indexed PDF.

    Args:
        question: The student's question about the uploaded paper.

    Returns:
        The LLM's answer, or a warning string when no PDF has been processed.
    """
    # Explicit None check: truth-testing a FAISS index object is not a
    # reliable "has been initialized" signal (the original used `not vector_index`).
    if vector_index is None:
        return "⚠️ Please upload and process a PDF first."
    context = retrieve_relevant_chunks(question, vector_index, stored_chunks, stored_embeddings)
    prompt = USER_TEMPLATE.format(context=context, question=question)
    return call_llama3(SYSTEM_TEMPLATE, prompt)
# UI layout: upload row + status box on top, question row + answer box below.
# Mojibake in the user-facing Markdown (broken emoji / em dash) restored.
with gr.Blocks() as app:
    gr.Markdown("# 📚 RAG Paper Supervisor (LLaMA 3 via Groq)")
    gr.Markdown("Upload an academic PDF and ask questions — powered by LLaMA 3 and semantic search.")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    upload_output = gr.Textbox(label="Status", interactive=False)
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the paper")
        ask_btn = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer", lines=6)
    # Wire handlers: upload → status textbox; question → answer textbox.
    upload_btn.click(process_pdf, inputs=pdf_upload, outputs=upload_output)
    ask_btn.click(answer_question, inputs=question, outputs=answer)

app.launch()
|