# app.py — RAG PDF Q&A demo (Groq + FAISS + Gradio), deployed on Hugging Face Spaces.
import os
import faiss
import numpy as np
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq
# -----------------------------
# Initialize Models
# -----------------------------
# Sentence-embedding model; used for both document chunks and queries so
# they live in the same vector space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Safely load API key
# NOTE(review): the Space secret is apparently named "Rag" and holds the
# Groq API key — confirm against the Space settings.
GROQ_API_KEY = os.getenv("Rag")
# Left as None when the key is missing; ask_groq() reports the error lazily.
client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
# -----------------------------
# Global Storage
# -----------------------------
index = None  # FAISS IndexFlatL2, built by create_index()
documents = []  # chunk strings, row-aligned with the index
# -----------------------------
# PDF Processing
# -----------------------------
def read_pdf(file):
    """Extract all text from an uploaded PDF.

    Args:
        file: Gradio upload object; the real filesystem path is on
            ``file.name`` (required on Hugging Face Spaces).

    Returns:
        The concatenated text of every page, or a string starting with
        ``"Error reading PDF:"`` on failure (callers check that prefix).
    """
    try:
        reader = PdfReader(file.name)  # FIX for Hugging Face
        parts = []
        for page in reader.pages:
            # extract_text() can return None/"" for image-only pages;
            # call it once per page (the old code called it twice).
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
        # join() avoids the quadratic cost of repeated string +=.
        return "".join(parts)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size chunks with overlap between neighbours.

    Args:
        text: Source string to split.
        chunk_size: Maximum characters per chunk (must be positive).
        overlap: Characters shared between consecutive chunks
            (must be smaller than ``chunk_size``).

    Returns:
        List of chunk strings; ``[]`` for empty input.

    Raises:
        ValueError: If the parameters would make the loop never advance
            (``overlap >= chunk_size``) or ``chunk_size <= 0``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # The original advanced by chunk_size - overlap; a non-positive
        # step would loop forever — fail loudly instead.
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
# -----------------------------
# Create FAISS Index
# -----------------------------
def create_index(chunks):
    """Embed *chunks* and (re)build the global FAISS L2 index over them.

    Side effects: rebinds the module globals ``documents`` and ``index``.
    """
    global index, documents
    documents = chunks
    # FAISS requires contiguous float32 vectors.
    vectors = np.asarray(embedder.encode(chunks), dtype="float32")
    flat = faiss.IndexFlatL2(vectors.shape[1])
    flat.add(vectors)
    index = flat
# -----------------------------
# Retrieval
# -----------------------------
def retrieve(query, k=3, threshold=1.2):
    """Return up to *k* document chunks relevant to *query*.

    Args:
        query: User question to embed and search for.
        k: Number of nearest neighbours to request from FAISS.
        threshold: Maximum L2 distance for a hit to count as relevant.

    Returns:
        ``(chunks, confidence)`` — the matching chunk strings and a
        "High"/"Medium"/"Low" label derived from the mean distance, or
        ``([], None)`` when there is no index / nothing passed the
        threshold.
    """
    global index
    if index is None:
        return [], None
    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    distances, indices = index.search(query_embedding, k)
    relevant_chunks = []
    valid_distances = []
    for i, dist in zip(indices[0], distances[0]):
        # FAISS pads results with index -1 when the index holds fewer
        # than k vectors; the old check (i < len) let -1 silently alias
        # documents[-1], so require a non-negative index as well.
        if 0 <= i < len(documents) and dist < threshold:
            relevant_chunks.append(documents[i])
            valid_distances.append(dist)
    # Map the mean L2 distance of accepted hits to a coarse label.
    confidence = None
    if valid_distances:
        avg = np.mean(valid_distances)
        if avg < 0.5:
            confidence = "High"
        elif avg < 1.0:
            confidence = "Medium"
        else:
            confidence = "Low"
    return relevant_chunks, confidence
# -----------------------------
# LLM (Groq)
# -----------------------------
def ask_groq(context_chunks, question):
    """Answer *question* with the Groq LLM, grounded in *context_chunks*.

    Args:
        context_chunks: Retrieved document chunks to ground the answer.
        question: The user's question.

    Returns:
        The model's answer, or a human-readable error string when the
        client is unconfigured or the API call fails.
    """
    if client is None:
        return "Error: GROQ_API_KEY not set in Hugging Face Secrets."
    context = "\n".join(context_chunks)
    # The arrows below were mojibake ("β†’") in the scraped source;
    # restored to the intended "→" so the model sees the rules as written.
    prompt = f"""
You are an intelligent assistant.
Rules:
1. If answer is clearly in context → answer normally.
2. If related but not exact → say:
"This is not explicitly mentioned in the document, but based on related context..."
3. If irrelevant → say:
"The document does not contain information related to this question."
Context:
{context}
Question:
{question}
"""
    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Groq API Error: {str(e)}"
# -----------------------------
# Main Functions
# -----------------------------
def process_pdf(file):
    """Read, chunk, and index an uploaded PDF; return a status message.

    Args:
        file: Gradio upload object, or None when nothing was uploaded.

    Returns:
        A user-facing status string (success or error).
    """
    if file is None:
        return "Please upload a PDF."
    text = read_pdf(file)
    # read_pdf signals failure with an "Error reading PDF:" prefix.
    # The old check ('"Error" in text') also rejected valid PDFs that
    # merely contain the word "Error" anywhere in their text.
    if text.startswith("Error reading PDF:"):
        return text
    if not text:
        return "Error reading PDF: no extractable text found."
    chunks = chunk_text(text)
    create_index(chunks)
    # "✅" was mojibake ("βœ…") in the scraped source.
    return f"✅ PDF processed! Chunks: {len(chunks)}"
def answer_question(question):
    """Retrieve context for *question*, ask the LLM, and prefix confidence."""
    if index is None:
        return "Please upload and process a PDF first."
    chunks, confidence = retrieve(question)
    if not chunks:
        return "The document does not contain information related to this question."
    reply = ask_groq(chunks, question)
    return f"(Confidence: {confidence})\n\n{reply}" if confidence else reply
# -----------------------------
# Gradio UI
# -----------------------------
# Minimal two-step UI: process a PDF, then ask questions against it.
with gr.Blocks() as demo:
    # "📄" was mojibake ("πŸ“„") in the scraped source.
    gr.Markdown("## 📄 RAG PDF Q&A (Groq + FAISS)")
    file_input = gr.File(label="Upload PDF")
    upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")
    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    question.submit(answer_question, inputs=question, outputs=answer)

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind address/port for HF Spaces.
    demo.launch(server_name="0.0.0.0", server_port=7860)