Spaces:

tayy786
/

RAG-based-application

Runtime error

File size: 4,543 Bytes

import os
import faiss
import numpy as np
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq

# -----------------------------
# Initialize Models
# -----------------------------
# Sentence-embedding model; this one instance encodes both document chunks
# and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Groq chat client, keyed by the value of the "Tgb" environment variable.
# NOTE(review): os.environ.get returns None when "Tgb" is unset -- confirm the
# deployment defines it.
client = Groq(api_key=os.environ.get("Tgb"))

# -----------------------------
# Global Variables
# -----------------------------
# Text chunks of the currently loaded document; replaced by create_index().
documents = []
# Vector index over the embeddings of `documents`; None until a PDF has been
# processed.
index = None

# -----------------------------
# PDF Processing
# -----------------------------
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Passed unchanged to ``PdfReader`` as its first argument.

    Returns:
        The text of all pages concatenated in page order, with no separator
        between pages. Pages whose ``extract_text()`` result is empty or
        otherwise falsy contribute nothing, so the result is ``""`` when no
        page yields text.
    """
    reader = PdfReader(file)
    # Extract each page exactly once: calling extract_text() for the
    # truthiness check and again for the value would do the extraction twice.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(text for text in page_texts if text)


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters consecutive chunks share; must be
            smaller than ``chunk_size`` so that the window always advances.

    Returns:
        The chunks in document order. A new window starts every
        ``chunk_size - overlap`` characters; windows near the end of the text
        may be shorter than ``chunk_size``. Empty text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``overlap >= chunk_size``. The latter would make the step zero or
            negative, so the window would never reach the end of the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Slicing past the end of the string is safe: the last windows are simply
    # truncated.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


# -----------------------------
# Create FAISS Index
# -----------------------------
def create_index(chunks):
    """Embed *chunks* and rebuild the module-level FAISS index from them.

    Args:
        chunks: Non-empty list of text chunks to embed and index.

    Raises:
        ValueError: If *chunks* is empty; there would be no embedding
            dimension to build the index with.

    Side effects:
        Rebinds the globals ``index`` and ``documents``. Both are assigned
        only after the new index has been fully built, so an exception while
        embedding or indexing leaves the previous index/documents pair
        untouched and still consistent with each other.
    """
    global index, documents

    if len(chunks) == 0:
        raise ValueError("Cannot build an index from zero chunks.")

    # Hand FAISS a C-contiguous float32 matrix; this is a no-op when the
    # encoder already returns one.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish the index and its chunk list together so lookups by position
    # never pair vectors with the wrong texts.
    index, documents = new_index, chunks


# -----------------------------
# Retrieval with Relevance Check
# -----------------------------
def retrieve(query, k=3, threshold=1.2):
    """Return the indexed chunks nearest to *query* plus a confidence label.

    Args:
        query: The question text to embed and search for.
        k: Number of nearest neighbours requested from the index.
        threshold: Neighbours whose distance is not strictly below this value
            are discarded.

    Returns:
        A ``(chunks, confidence)`` tuple. ``chunks`` holds the surviving
        neighbours in the order the index returned them. ``confidence`` is
        ``"High"``, ``"Medium"`` or ``"Low"`` depending on whether the mean
        distance of the survivors is below 0.5, below 1.0, or neither. It is
        ``None`` when nothing survives or when no index has been built yet.
    """
    if index is None:
        return [], None

    query_vector = np.array(embedder.encode([query]))
    distances, indices = index.search(query_vector, k)

    # Pair every accepted neighbour's text with its distance; row 0 is the
    # only row because a single query was searched.
    hits = [
        (documents[position], distance)
        for position, distance in zip(indices[0], distances[0])
        if distance < threshold
    ]
    if not hits:
        return [], None

    relevant_chunks = [chunk for chunk, _ in hits]
    mean_distance = np.mean([distance for _, distance in hits])

    # Lower distance means a closer match, hence higher confidence.
    if mean_distance < 0.5:
        confidence = "High"
    elif mean_distance < 1.0:
        confidence = "Medium"
    else:
        confidence = "Low"

    return relevant_chunks, confidence


# -----------------------------
# Ask Groq LLM
# -----------------------------
def ask_groq(context_chunks, question):
    """Ask the Groq chat model to answer *question* from the given chunks.

    Args:
        context_chunks: Strings that are newline-joined to form the context
            section of the prompt.
        question: The user's question, inserted into the prompt verbatim.

    Returns:
        The message content of the first choice in the completion response.
    """
    context = "\n".join(context_chunks)

    prompt = f"""
You are an intelligent assistant.

Rules:
1. If the answer is clearly present in the context, answer normally.
2. If the answer is NOT directly present but somewhat related, say:
   "This is not explicitly mentioned in the document, but based on related context..."
   then give a helpful answer.
3. If the context is completely irrelevant, say:
   "The document does not contain information related to this question."

Context:
{context}

Question:
{question}
"""

    # The whole prompt is sent as a single user turn.
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=messages,
    )

    first_choice = response.choices[0]
    return first_choice.message.content


# -----------------------------
# Main Pipeline
# -----------------------------
def process_pdf(file):
    """Read an uploaded PDF, chunk its text and index the chunks.

    Args:
        file: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A status message: a prompt to upload a file, a notice that no text
        could be extracted, or a success line reporting the chunk count.
    """
    if file is None:
        return "Please upload a PDF first."

    extracted = read_pdf(file)
    # Whitespace-only text is treated the same as no text at all.
    if extracted.strip() == "":
        return "Could not extract text from PDF."

    pieces = chunk_text(extracted)
    create_index(pieces)

    return "PDF processed successfully! Total chunks: " + str(len(pieces))


def answer_question(question):
    """Answer *question* from the indexed PDF.

    Args:
        question: The user's question text.

    Returns:
        A prompt to process a PDF when no index exists, a fixed "not in the
        document" message when retrieval finds nothing, or otherwise the
        model's answer, prefixed with a confidence line when retrieval
        supplied one.
    """
    if index is None:
        return "Please upload and process a PDF first."

    context_chunks, confidence = retrieve(question)
    if not context_chunks:
        return "The document does not contain information related to this question."

    reply = ask_groq(context_chunks, question)
    if not confidence:
        return reply

    return f"(Confidence: {confidence})\n\n" + reply


# -----------------------------
# Gradio UI
# -----------------------------
# Assemble the UI. Components are created in the order written here, so the
# statement order below is significant.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Q&A App (Groq + FAISS)")

    # Upload controls; `status` receives the string returned by process_pdf.
    file_input = gr.File(label="Upload PDF")
    upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")

    # Question input and the box that receives answer_question's return value.
    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")

    # Clicking the button runs process_pdf on the uploaded file.
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    # The question box's submit event runs answer_question on its contents.
    question.submit(answer_question, inputs=question, outputs=answer)


# -----------------------------
# Run App
# -----------------------------
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()