# app.py — RAG PDF Q&A Space (upload metadata: author tayy786, commit f8ad5a3)
import os
import faiss
import numpy as np
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq
# -----------------------------
# Initialize Models
# -----------------------------
# Sentence-embedding model shared by indexing (create_index) and querying (retrieve).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Groq API client; the key comes from the "Tgb" environment variable.
# NOTE(review): "Tgb" looks like a Space secret name — confirm it is the
# intended variable; if unset, api_key is None and API calls will fail.
client = Groq(
    api_key=os.environ.get("Tgb"),
)
# -----------------------------
# Global Variables
# -----------------------------
# FAISS index over chunk embeddings; stays None until a PDF is processed.
index = None
# Text chunks, aligned row-for-row with the FAISS index.
documents = []
# -----------------------------
# PDF Processing
# -----------------------------
def read_pdf(file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file: A path or file-like object accepted by pypdf.PdfReader.

    Returns:
        str: The text of all pages concatenated in order. Pages with no
        extractable text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(file)
    parts = []
    for page in reader.pages:
        # extract_text() is expensive — call it once per page instead of
        # twice (the original called it for the check and again to append).
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text)
    # join avoids quadratic string concatenation on large documents
    return "".join(parts)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping fixed-size chunks.

    Args:
        text: The input string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        list[str]: Chunks covering the whole text in order ([] for "").

    Raises:
        ValueError: If chunk_size is not positive or overlap >= chunk_size.
            (In the original, overlap >= chunk_size made the loop step
            non-positive and looped forever.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
# -----------------------------
# Create FAISS Index
# -----------------------------
def create_index(chunks):
    """Embed text chunks and build a FAISS L2 index over them.

    Stores the chunks in the module-level `documents` list and the index
    in the module-level `index` so that `retrieve` can search them.

    Args:
        chunks: list[str] of text chunks to embed and index.
    """
    global index, documents
    documents = chunks
    embeddings = embedder.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    # FAISS requires contiguous float32 input; encode() usually returns
    # float32 already, but coerce explicitly so a float64 array cannot
    # break index.add().
    index.add(np.asarray(embeddings, dtype="float32"))
# -----------------------------
# Retrieval with Relevance Check
# -----------------------------
def retrieve(query, k=3, threshold=1.2):
    """Return document chunks relevant to `query` plus a confidence label.

    Args:
        query: The user's question.
        k: Number of nearest neighbours requested from FAISS.
        threshold: Maximum L2 distance for a chunk to count as relevant.

    Returns:
        tuple[list[str], str | None]: The relevant chunks and a
        "High"/"Medium"/"Low" confidence label (None when no index has
        been built or nothing passed the threshold).
    """
    if index is None:
        return [], None
    query_embedding = embedder.encode([query])
    distances, indices = index.search(
        np.asarray(query_embedding, dtype="float32"), k
    )
    relevant_chunks = []
    valid_distances = []
    for i, dist in zip(indices[0], distances[0]):
        # FAISS pads `indices` with -1 when fewer than k vectors exist;
        # without this guard, documents[-1] silently returns the last chunk.
        if i != -1 and dist < threshold:
            relevant_chunks.append(documents[i])
            valid_distances.append(dist)
    # Confidence score (lower average distance = better match)
    confidence = None
    if valid_distances:
        avg_dist = np.mean(valid_distances)
        if avg_dist < 0.5:
            confidence = "High"
        elif avg_dist < 1.0:
            confidence = "Medium"
        else:
            confidence = "Low"
    return relevant_chunks, confidence
# -----------------------------
# Ask Groq LLM
# -----------------------------
def ask_groq(context_chunks, question):
    """Ask the Groq LLM to answer `question` using the retrieved chunks.

    Args:
        context_chunks: list[str] of document chunks used as context.
        question: The user's question.

    Returns:
        str: The model's answer text.
    """
    joined_context = "\n".join(context_chunks)
    prompt = f"""
You are an intelligent assistant.
Rules:
1. If the answer is clearly present in the context, answer normally.
2. If the answer is NOT directly present but somewhat related, say:
"This is not explicitly mentioned in the document, but based on related context..."
then give a helpful answer.
3. If the context is completely irrelevant, say:
"The document does not contain information related to this question."
Context:
{joined_context}
Question:
{question}
"""
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
# -----------------------------
# Main Pipeline
# -----------------------------
def process_pdf(file):
    """Read an uploaded PDF, chunk its text, and build the FAISS index.

    Args:
        file: The uploaded file object from Gradio (None if nothing uploaded).

    Returns:
        str: A human-readable status message for the UI.
    """
    # Guard clauses: no file, then no extractable text.
    if file is None:
        return "Please upload a PDF first."
    extracted = read_pdf(file)
    if not extracted.strip():
        return "Could not extract text from PDF."
    pieces = chunk_text(extracted)
    create_index(pieces)
    return f"PDF processed successfully! Total chunks: {len(pieces)}"
def answer_question(question):
    """Answer a question against the indexed PDF.

    Prefixes the LLM's reply with a confidence label when retrieval
    produced one.

    Args:
        question: The user's question string.

    Returns:
        str: The answer (or a status message when no index/context exists).
    """
    if index is None:
        return "Please upload and process a PDF first."
    chunks, confidence = retrieve(question)
    if not chunks:
        return "The document does not contain information related to this question."
    reply = ask_groq(chunks, question)
    if confidence:
        reply = f"(Confidence: {confidence})\n\n" + reply
    return reply
# -----------------------------
# Gradio UI
# -----------------------------
# Two-step UI: (1) upload + process a PDF, (2) ask questions against it.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Q&A App (Groq + FAISS)")
    file_input = gr.File(label="Upload PDF")
    upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")
    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")
    # Button click builds the index; pressing Enter in the question
    # textbox runs retrieval + LLM answering.
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    question.submit(answer_question, inputs=question, outputs=answer)
# -----------------------------
# Run App
# -----------------------------
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()