# Source: Hugging Face Space tayy786/RAGbasedDocumentreader
# Space status when captured: Runtime error
# File size: 4,831 bytes
import os
import faiss
import numpy as np
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq

# -----------------------------
# Initialize Models
# -----------------------------
# Sentence-embedding model shared by indexing (document chunks) and
# retrieval (queries), so both are embedded in the same vector space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load the Groq API key from the environment. The key is looked up under
# "Rag" first; "GROQ_API_KEY" is accepted as a fallback because that is the
# name the in-app error message in ask_groq() tells users to set.
GROQ_API_KEY = os.getenv("Rag") or os.getenv("GROQ_API_KEY")
# Leave the client unset when no key is available so the app can still start;
# ask_groq() reports the missing key at question time.
client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None

# -----------------------------
# Global Storage
# -----------------------------
# FAISS index over the chunk embeddings of the most recently processed PDF.
# Stays None until create_index() has run; retrieve() and answer_question()
# treat None as "no document loaded".
index = None
# Text chunks of the most recently processed PDF. retrieve() uses the ids
# returned by index.search() as positions in this list, so it must always be
# replaced together with `index`.
documents = []

# -----------------------------
# PDF Processing
# -----------------------------
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The uploaded PDF. Either an object that exposes the file path
            as ``.name`` or a plain path/stream that ``PdfReader`` accepts
            directly.

    Returns:
        The concatenated text of all pages that yield text (pages without
        extractable text are skipped). If anything goes wrong, a message
        starting with ``"Error reading PDF: "`` is returned instead of
        raising, so the caller can show it to the user.
    """
    try:
        # Uploads may arrive as a wrapper carrying the path in ``.name`` or
        # as the bare path itself; fall back to the object when there is no
        # ``.name``.
        source = getattr(file, "name", file)
        reader = PdfReader(source)
        # Call extract_text() once per page and drop empty/None results.
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *text* is empty).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check an ``overlap >= chunk_size`` would never advance the
            window and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]


# -----------------------------
# Create FAISS Index
# -----------------------------
def create_index(chunks):
    """Embed *chunks* and rebuild the global FAISS index over them.

    Replaces the module-level ``index`` and ``documents``. Both are swapped
    only after embedding and indexing have succeeded, so an exception from
    the embedder or FAISS leaves the previously indexed document intact
    instead of pairing an old index with new chunk texts.

    Args:
        chunks: Iterable of text chunks. It is copied, so later changes to
            the caller's list do not affect the stored documents.

    Returns:
        None. With no chunks, the stored state is cleared (``index`` becomes
        ``None`` and ``documents`` becomes empty).
    """
    global index, documents

    new_documents = list(chunks)
    if not new_documents:
        # An empty batch has no embedding dimension to size the index with.
        index = None
        documents = []
        return

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(
        embedder.encode(new_documents), dtype="float32"
    )

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both globals together now that nothing can fail any more.
    index = new_index
    documents = new_documents


# -----------------------------
# Retrieval
# -----------------------------
def retrieve(query, k=3, threshold=1.2):
    """Find the indexed chunks closest to *query*.

    Args:
        query: The question text to embed and search for.
        k: Maximum number of neighbours to request from the index.
        threshold: Only hits whose distance (as reported by
            ``index.search``) is strictly below this value are kept.

    Returns:
        A ``(chunks, confidence)`` tuple. ``chunks`` is the list of matching
        chunk texts in the order the index returned them. ``confidence`` is
        ``"High"`` (mean kept distance < 0.5), ``"Medium"`` (< 1.0) or
        ``"Low"`` (otherwise), or ``None`` when nothing matched or no index
        has been built.
    """
    if index is None:
        return [], None

    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots with id -1.
    k = min(k, index.ntotal)
    if k <= 0:
        return [], None

    query_embedding = np.ascontiguousarray(
        embedder.encode([query]), dtype="float32"
    )
    distances, indices = index.search(query_embedding, k)

    # `0 <= i` rejects padded (-1) ids; a bare upper-bound check would let
    # them through and silently pick documents[-1].
    hits = [
        (documents[i], float(dist))
        for i, dist in zip(indices[0], distances[0])
        if 0 <= i < len(documents) and dist < threshold
    ]
    if not hits:
        return [], None

    relevant_chunks = [chunk for chunk, _ in hits]
    avg = np.mean([dist for _, dist in hits])

    # Confidence
    if avg < 0.5:
        confidence = "High"
    elif avg < 1.0:
        confidence = "Medium"
    else:
        confidence = "Low"

    return relevant_chunks, confidence


# -----------------------------
# LLM (Groq)
# -----------------------------
def ask_groq(context_chunks, question):
    """Ask the Groq chat model to answer *question* from retrieved chunks.

    The chunks are joined with newlines and embedded, together with the
    question, in a single user message that also carries the answering
    rules.

    Args:
        context_chunks: Chunk texts to present to the model as context.
        question: The user's question.

    Returns:
        The text of the model's first choice. If no client is configured or
        the API call fails, a human-readable error string is returned
        instead of raising.
    """
    if client is None:
        return "Error: GROQ_API_KEY not set in Hugging Face Secrets."

    context = "\n".join(context_chunks)

    prompt = f"""
You are an intelligent assistant.

Rules:
1. If answer is clearly in context → answer normally.
2. If related but not exact → say:
   "This is not explicitly mentioned in the document, but based on related context..."
3. If irrelevant → say:
   "The document does not contain information related to this question."

Context:
{context}

Question:
{question}
"""

    chat_messages = [{"role": "user", "content": prompt}]

    try:
        completion = client.chat.completions.create(
            messages=chat_messages,
            model="llama-3.3-70b-versatile",
        )
        # Unpacking the reply stays inside the try so a malformed response
        # is reported the same way as a failed request.
        reply = completion.choices[0].message.content
    except Exception as exc:
        return f"Groq API Error: {str(exc)}"

    return reply


# -----------------------------
# Main Functions
# -----------------------------
def process_pdf(file):
    """Read an uploaded PDF, chunk its text and build the search index.

    Args:
        file: The uploaded file as delivered by the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        A status message for the UI: a prompt to upload a file, the error
        reported by ``read_pdf``, a notice that the PDF has no extractable
        text, or a success line with the number of chunks indexed.
    """
    if file is None:
        return "Please upload a PDF."

    text = read_pdf(file)

    if not text or not text.strip():
        return "No extractable text found in the PDF."

    # read_pdf reports failures in-band as a message with this prefix. Match
    # the prefix rather than the bare word "Error", which can legitimately
    # occur anywhere in a document's text.
    if text.startswith("Error reading PDF:"):
        return text

    chunks = chunk_text(text)
    create_index(chunks)

    return f"✅ PDF processed! Chunks: {len(chunks)}"


def answer_question(question):
    """Answer *question* from the currently indexed PDF.

    Args:
        question: The text entered by the user.

    Returns:
        The text to display: a prompt to process a PDF or enter a question,
        a "not in the document" notice when retrieval finds nothing, an
        error message from ``ask_groq``, or the model's answer prefixed with
        the retrieval confidence.
    """
    if index is None:
        return "Please upload and process a PDF first."

    # Don't spend an embedding and an LLM call on a blank question.
    if not question or not question.strip():
        return "Please enter a question."

    context_chunks, confidence = retrieve(question)

    if not context_chunks:
        return "The document does not contain information related to this question."

    answer = ask_groq(context_chunks, question)

    # ask_groq reports failures in-band; a retrieval confidence label in
    # front of an error message would be misleading.
    if answer and answer.startswith(("Error:", "Groq API Error:")):
        return answer

    if confidence:
        answer = f"(Confidence: {confidence})\n\n{answer}"

    return answer


# -----------------------------
# Gradio UI
# -----------------------------
# Single-page UI: an upload/processing section followed by a question/answer
# section. `demo` is the app object launched at the bottom of the file.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 RAG PDF Q&A (Groq + FAISS)")

    # Upload section: the file is only read and indexed when the button is
    # clicked; the result message is shown in `status`.
    file_input = gr.File(label="Upload PDF")
    upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")

    # Q&A section: `question` is the input box, `answer` shows the reply.
    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")

    # Event wiring. There is no separate "ask" button: the question is sent
    # via the textbox's submit event.
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    question.submit(answer_question, inputs=question, outputs=answer)

# -----------------------------
# Launch
# -----------------------------
# Start the web server only when this file is run as a script, not when it is
# imported.
if __name__ == "__main__":
    # Listen on all network interfaces at port 7860.
    # NOTE(review): presumably the address/port the hosting Space expects — confirm.
    demo.launch(server_name="0.0.0.0", server_port=7860)