Spaces:
Sleeping
Sleeping
File size: 3,393 Bytes
8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 722f7a0 8107894 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import fitz # PyMuPDF
import faiss
import numpy as np
import gradio as gr
from groq import Groq
from sentence_transformers import SentenceTransformer
# Load Groq API key from Hugging Face Secrets
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# === PDF → Text extraction ===
def extract_text_from_pdf(pdf_path):
    """Return the full plain text of the PDF at *pdf_path*.

    Pages are read in order with PyMuPDF and concatenated.
    """
    pages = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)
# === Chunking text ===
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most roughly *chunk_size* characters.

    The text is split on sentence boundaries (". ") and sentences are
    greedily packed into chunks so that no sentence is cut in half. A
    sentence longer than *chunk_size* becomes a chunk of its own.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty chunk strings (empty input yields []).
    """
    sentences = text.split(". ")
    chunks, current = [], ""
    for sentence in sentences:
        if not sentence:
            # Skip empties from consecutive separators or empty input
            # (the original returned ["."] for empty text).
            continue
        if len(current) + len(sentence) < chunk_size:
            current += sentence + ". "
        else:
            # Guard: the original appended an empty chunk when the very
            # first sentence already exceeded chunk_size.
            if current.strip():
                chunks.append(current.strip())
            current = sentence + ". "
    if current.strip():
        chunks.append(current.strip())
    return chunks
# === Vector store (FAISS) ===
class VectorStore:
    """In-memory FAISS store pairing embedding vectors with source chunks.

    Args:
        dim: Dimensionality of the embedding vectors. Defaults to 384,
            which matches the all-MiniLM-L6-v2 model used by this app.
    """

    def __init__(self, dim=384):
        # Flat (exact) L2 index — fine for the small corpus one PDF yields.
        self.index = faiss.IndexFlatL2(dim)
        self.chunks = []

    def add(self, embeddings, texts):
        """Index a batch of embeddings and remember their source texts."""
        # FAISS requires float32 input.
        self.index.add(np.asarray(embeddings, dtype="float32"))
        self.chunks.extend(texts)

    def search(self, query, top_k=5):
        """Return up to *top_k* stored chunks most similar to *query*."""
        vec = embedding_model.encode([query])
        _, ids = self.index.search(np.asarray(vec, dtype="float32"), top_k)
        # FAISS pads results with -1 when fewer than top_k vectors are
        # indexed; the original crashed (or mis-indexed) on those.
        return [self.chunks[i] for i in ids[0] if i >= 0]
# Single shared vector store for the session (populated on PDF upload).
vs = VectorStore()
# Persona sent as the system message with every Groq chat completion.
system_prompt = "You are a study supervisor helping students understand their uploaded documents."
# === Ask LLaMA 3 using Groq ===
def ask_llama3(system_prompt, user_prompt):
    """Send a system+user prompt pair to LLaMA 3 (8B) via the Groq API.

    Args:
        system_prompt: Instructions establishing the assistant's persona.
        user_prompt: The user's message (question plus retrieved context).

    Returns:
        The model's reply text, or an error string if the API call fails.
    """
    try:
        result = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return result.choices[0].message.content
    except Exception as e:
        # Surface API failures to the UI instead of crashing the handler.
        # (Restores the ❌ emoji that was garbled to "β" in extraction.)
        return f"❌ Groq API Error: {e}"
# === PDF upload handler ===
def upload_pdf(pdf_file):
    """Gradio handler: extract, chunk, embed, and index an uploaded PDF.

    Args:
        pdf_file: Gradio file object; its .name attribute is the temp path.

    Returns:
        A status message for display in the Answer box.
    """
    try:
        text = extract_text_from_pdf(pdf_file.name)
        chunks = chunk_text(text)
        embeddings = embedding_model.encode(chunks)
        vs.add(embeddings, chunks)
        # Original string literal was broken across two lines by the
        # extraction (a syntax error); restored as one line.
        return "✅ Document uploaded and processed!"
    except Exception as e:
        return f"❌ PDF Processing Error: {e}"
# === QA handler ===
def ask_question(question):
    """Gradio handler: answer *question* from the indexed document chunks.

    Retrieves the top matching chunks, builds a context-grounded prompt,
    and delegates to ask_llama3. Returns a warning if no PDF is indexed yet.
    """
    if not vs.chunks:
        # Restores the ⚠️ emoji garbled to "β οΈ" in extraction.
        return "⚠️ Please upload and process a PDF document first."
    try:
        docs = vs.search(question)
        context = "\n".join(docs)
        prompt = f"Use the context below to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}"
        return ask_llama3(system_prompt, prompt)
    except Exception as e:
        return f"❌ Question Answering Error: {e}"
# === Gradio UI ===
with gr.Blocks() as demo:
    # Heading emoji was garbled to "π" in extraction; 📄 assumed — confirm.
    gr.Markdown("## 📄 RAG PDF QA using LLaMA3 via Groq API")
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF Document")
        upload_button = gr.Button("Process PDF")
    with gr.Row():
        question = gr.Textbox(label="Ask a question from the document")
        ask_button = gr.Button("Ask")
    answer = gr.Textbox(label="Answer", lines=6)
    # Both buttons write into the same Answer box (status messages and answers).
    upload_button.click(upload_pdf, inputs=pdf_file, outputs=answer)
    ask_button.click(ask_question, inputs=question, outputs=answer)

# NOTE: a stray "|" (file-viewer gutter junk, a syntax error) followed
# demo.launch() in the scraped source; removed.
demo.launch()