Spaces:
Sleeping
Sleeping
File size: 3,393 Bytes
8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import fitz # PyMuPDF
import faiss
import numpy as np
import gradio as gr
from groq import Groq
from sentence_transformers import SentenceTransformer
# Load Groq API key from Hugging Face Secrets
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# === PDF → Text extraction ===
def extract_text_from_pdf(pdf_path):
    """Return the full plain text of the PDF at *pdf_path*.

    Pages are read in order with PyMuPDF and concatenated.
    """
    pages = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)
# === Chunking text ===
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most roughly *chunk_size* characters.

    The text is split on sentence boundaries (". ") and sentences are
    greedily packed into chunks so that no sentence is cut in half. A
    sentence longer than *chunk_size* becomes a chunk of its own.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty chunk strings (empty input yields []).
    """
    sentences = text.split(". ")
    chunks, current = [], ""
    for sentence in sentences:
        if not sentence:
            # Skip empties from consecutive separators or empty input
            # (the original returned ["."] for empty text).
            continue
        if len(current) + len(sentence) < chunk_size:
            current += sentence + ". "
        else:
            # Guard: the original appended an empty chunk when the very
            # first sentence already exceeded chunk_size.
            if current.strip():
                chunks.append(current.strip())
            current = sentence + ". "
    if current.strip():
        chunks.append(current.strip())
    return chunks
# === Vector store (FAISS) ===
class VectorStore:
    """In-memory FAISS store pairing embedding vectors with source chunks.

    Args:
        dim: Dimensionality of the embedding vectors. Defaults to 384,
            which matches the all-MiniLM-L6-v2 model used by this app.
    """

    def __init__(self, dim=384):
        # Flat (exact) L2 index — fine for the small corpus one PDF yields.
        self.index = faiss.IndexFlatL2(dim)
        self.chunks = []

    def add(self, embeddings, texts):
        """Index a batch of embeddings and remember their source texts."""
        # FAISS requires float32 input.
        self.index.add(np.asarray(embeddings, dtype="float32"))
        self.chunks.extend(texts)

    def search(self, query, top_k=5):
        """Return up to *top_k* stored chunks most similar to *query*."""
        vec = embedding_model.encode([query])
        _, ids = self.index.search(np.asarray(vec, dtype="float32"), top_k)
        # FAISS pads results with -1 when fewer than top_k vectors are
        # indexed; the original crashed (or mis-indexed) on those.
        return [self.chunks[i] for i in ids[0] if i >= 0]
# Single shared vector store for the session (populated on PDF upload).
vs = VectorStore()
# Persona sent as the system message with every Groq chat completion.
system_prompt = "You are a study supervisor helping students understand their uploaded documents."
# === Ask LLaMA 3 using Groq ===
def ask_llama3(system_prompt, user_prompt):
    """Send a system+user prompt pair to LLaMA 3 (8B) via the Groq API.

    Args:
        system_prompt: Instructions establishing the assistant's persona.
        user_prompt: The user's message (question plus retrieved context).

    Returns:
        The model's reply text, or an error string if the API call fails.
    """
    try:
        result = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return result.choices[0].message.content
    except Exception as e:
        # Surface API failures to the UI instead of crashing the handler.
        # (Restores the ❌ emoji that was garbled to "β" in extraction.)
        return f"❌ Groq API Error: {e}"
# === PDF upload handler ===
def upload_pdf(pdf_file):
    """Gradio handler: extract, chunk, embed, and index an uploaded PDF.

    Args:
        pdf_file: Gradio file object; its .name attribute is the temp path.

    Returns:
        A status message for display in the Answer box.
    """
    try:
        text = extract_text_from_pdf(pdf_file.name)
        chunks = chunk_text(text)
        embeddings = embedding_model.encode(chunks)
        vs.add(embeddings, chunks)
        # Original string literal was broken across two lines by the
        # extraction (a syntax error); restored as one line.
        return "✅ Document uploaded and processed!"
    except Exception as e:
        return f"❌ PDF Processing Error: {e}"
# === QA handler ===
def ask_question(question):
    """Gradio handler: answer *question* from the indexed document chunks.

    Retrieves the top matching chunks, builds a context-grounded prompt,
    and delegates to ask_llama3. Returns a warning if no PDF is indexed yet.
    """
    if not vs.chunks:
        # Restores the ⚠️ emoji garbled to "β οΈ" in extraction.
        return "⚠️ Please upload and process a PDF document first."
    try:
        docs = vs.search(question)
        context = "\n".join(docs)
        prompt = f"Use the context below to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}"
        return ask_llama3(system_prompt, prompt)
    except Exception as e:
        return f"❌ Question Answering Error: {e}"
# === Gradio UI ===
with gr.Blocks() as demo:
    # Heading emoji was garbled to "π" in extraction; 📄 assumed — confirm.
    gr.Markdown("## 📄 RAG PDF QA using LLaMA3 via Groq API")
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF Document")
        upload_button = gr.Button("Process PDF")
    with gr.Row():
        question = gr.Textbox(label="Ask a question from the document")
        ask_button = gr.Button("Ask")
    answer = gr.Textbox(label="Answer", lines=6)
    # Both buttons write into the same Answer box (status messages and answers).
    upload_button.click(upload_pdf, inputs=pdf_file, outputs=answer)
    ask_button.click(ask_question, inputs=question, outputs=answer)

# NOTE: a stray "|" (file-viewer gutter junk, a syntax error) followed
# demo.launch() in the scraped source; removed.
demo.launch()