# RAG Document Q&A System — Gradio app (Hugging Face Space)
| import gradio as gr | |
| import os | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import PyPDF2 | |
| import docx | |
| import requests | |
| import json | |
| from typing import List | |
| import logging | |
# Configure logging
# Root logger at INFO so document processing and API errors are visible in the Space logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger, stdlib convention
class RAGSystem:
    """Retrieval-Augmented Generation over user-uploaded documents.

    Files are converted to plain text, split into ~500-character chunks,
    and embedded with a SentenceTransformer model. At query time the
    chunks most similar to the question are retrieved by cosine
    similarity and passed as context to the Groq chat-completions API.
    """

    def __init__(self):
        # Small, fast general-purpose embedding model.
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents: List[str] = []  # text chunks from all processed files
        self.embeddings = None          # array of chunk embeddings, or None before processing
        self.groq_api_key = None
        self.groq_base_url = "https://api.groq.com/openai/v1/chat/completions"

    def set_api_key(self, api_key: str):
        """Set the Groq API key used by query_groq()."""
        self.groq_api_key = api_key

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file; return "" if extraction fails."""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    # extract_text() may return None (e.g. image-only pages);
                    # fall back to "" so one bad page doesn't lose the whole file.
                    text += (page.extract_text() or "") + "\n"
                return text
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file; return "" if extraction fails."""
        try:
            doc = docx.Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except Exception as e:
            logger.error("Error extracting text from DOCX: %s", e)
            return ""

    def extract_text_from_txt(self, file_path: str) -> str:
        """Extract text from a plain-text file; return "" if reading fails."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error("Error extracting text from TXT: %s", e)
            return ""

    def process_documents(self, files) -> str:
        """Extract, chunk, and embed the uploaded files.

        Replaces any previously indexed documents. Returns a user-facing
        status message.
        """
        if not files:
            return "No files uploaded."
        self.documents = []
        all_text = ""
        for file in files:
            file_path = file.name  # gradio file objects expose the temp path via .name
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension == '.pdf':
                text = self.extract_text_from_pdf(file_path)
            elif file_extension == '.docx':
                text = self.extract_text_from_docx(file_path)
            elif file_extension == '.txt':
                text = self.extract_text_from_txt(file_path)
            else:
                # Unsupported extension: skip.
                continue
            if text.strip():
                chunks = self.split_text(text)
                self.documents.extend(chunks)
                all_text += text + "\n"
        if self.documents:
            # Embed every chunk once, up front; queries only embed themselves.
            self.embeddings = self.embedder.encode(self.documents)
            return f"✅ Processed {len(files)} files with {len(self.documents)} text chunks."
        else:
            return "⚠️ No text could be extracted from the uploaded files."

    def split_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split *text* into chunks of roughly chunk_size characters.

        Sentences (naively delimited by '.') are greedily packed into
        chunks; a single sentence longer than chunk_size becomes its own
        chunk.
        """
        sentences = text.split('.')
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            # Skip empty fragments (e.g. the trailing '' after a final '.')
            # so chunks don't end in a spurious extra period.
            if not sentence:
                continue
            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + "."
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + "."
        if current_chunk:
            chunks.append(current_chunk.strip())
        return [chunk for chunk in chunks if chunk.strip()]

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3) -> List[str]:
        """Return the top_k document chunks most similar to *query*.

        Returns [] when no documents have been indexed yet.
        """
        if not self.documents or self.embeddings is None:
            return []
        query_embedding = self.embedder.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        # Indices of the top_k highest-similarity chunks, best first.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [self.documents[i] for i in top_indices]

    def query_groq(self, prompt: str) -> str:
        """Send *prompt* to the Groq chat-completions API and return the reply.

        Never raises: HTTP errors, timeouts, and malformed responses are
        turned into user-facing error strings.
        """
        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."
        headers = {
            "Authorization": f"Bearer {self.groq_api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "llama-3.1-8b-instant",  # ✅ Valid Groq model
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Answer questions based on the provided context. If the context doesn't contain enough information to answer the question, say so clearly."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.7,
            "max_tokens": 1024,
            "stream": False
        }
        result = None  # safe value for the error branch below
        try:
            # Explicit timeout so a stalled connection cannot hang the UI forever.
            response = requests.post(self.groq_base_url, headers=headers,
                                     json=data, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"]
        except requests.exceptions.RequestException as e:
            logger.error("Error querying Groq API: %s", e)
            return f"Error querying Groq API: {str(e)}"
        except (KeyError, IndexError, TypeError, ValueError):
            # Malformed JSON body or a response missing the expected fields.
            logger.error("Unexpected Groq API response: %s", result)
            return f"Unexpected Groq API response: {json.dumps(result, indent=2)}"

    def answer_query(self, query: str) -> str:
        """Answer *query* with RAG: retrieve relevant chunks, then ask Groq."""
        if not self.documents:
            return "⚠️ No documents have been processed yet. Please upload and process documents first."
        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."
        relevant_chunks = self.retrieve_relevant_chunks(query)
        if not relevant_chunks:
            return "⚠️ No relevant information found in the documents."
        # Join the retrieved chunks into a single context block for the LLM.
        context = "\n\n".join(relevant_chunks)
        prompt = f"""Context from documents:
{context}

Question: {query}

Please answer the question based on the provided context. If the context doesn't contain enough information to fully answer the question, please mention what information is missing."""
        return self.query_groq(prompt)
# Initialize RAG system
# Module-level singleton shared by all Gradio event handlers below.
rag_system = RAGSystem()
| # Gradio interface functions | |
def set_api_key(api_key):
    """Gradio handler: store the Groq API key on the shared RAG system.

    Rejects blank input so the UI doesn't report success for an empty key
    (consistent with the validation in process_files/answer_question).
    """
    if not api_key or not api_key.strip():
        return "⚠️ Please enter a valid API key."
    rag_system.set_api_key(api_key.strip())
    return "✅ API key set successfully!"
def process_files(files):
    """Gradio handler: index the uploaded files into the shared RAG system."""
    if files:
        return rag_system.process_documents(files)
    return "⚠️ Please upload at least one file."
def answer_question(query):
    """Gradio handler: answer a user question against the indexed documents."""
    if query.strip():
        return rag_system.answer_query(query)
    return "⚠️ Please enter a question."
# Create Gradio interface
# Two tabs: "Setup" (API key + document upload) and "Ask Questions" (Q&A).
with gr.Blocks(title="RAG Document Q&A System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 RAG Document Q&A System")
    gr.Markdown("Upload documents and ask questions about their content using AI!")

    # --- Setup tab: API key entry and document upload/processing ---
    with gr.Tab("Setup"):
        gr.Markdown("## Step 1: Set your Groq API Key")
        gr.Markdown("Get your free API key from [Groq Console](https://console.groq.com/)")
        with gr.Row():
            api_key_input = gr.Textbox(
                type="password",  # mask the key in the UI
                label="Groq API Key",
                placeholder="Enter your Groq API key here..."
            )
            set_key_btn = gr.Button("Set API Key", variant="primary")
        api_key_status = gr.Textbox(label="Status", interactive=False)

        gr.Markdown("## Step 2: Upload Documents")
        gr.Markdown("Upload PDF, DOCX, or TXT files")
        file_upload = gr.Files(
            file_types=[".pdf", ".docx", ".txt"],  # matches the extractors in RAGSystem
            label="Upload Documents",
            file_count="multiple"
        )
        process_btn = gr.Button("Process Documents", variant="primary")
        process_status = gr.Textbox(label="Processing Status", interactive=False)

    # --- Q&A tab: question input and model answer ---
    with gr.Tab("Ask Questions"):
        gr.Markdown("## Ask Questions About Your Documents")
        with gr.Row():
            with gr.Column(scale=4):
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask a question about your documents...",
                    lines=2
                )
            with gr.Column(scale=1):
                ask_btn = gr.Button("Ask Question", variant="primary")
        answer_output = gr.Textbox(
            label="Answer",
            lines=10,
            interactive=False
        )
        # Example questions (clicking one fills the question box)
        gr.Markdown("### Example Questions:")
        examples = gr.Examples(
            examples=[
                ["What is the main topic of the document?"],
                ["Can you summarize the key points?"],
                ["What are the conclusions mentioned?"],
                ["Are there any specific dates or numbers mentioned?"]
            ],
            inputs=query_input
        )

    # Event handlers: wire each control to its module-level handler function
    set_key_btn.click(
        fn=set_api_key,
        inputs=[api_key_input],
        outputs=[api_key_status]
    )
    process_btn.click(
        fn=process_files,
        inputs=[file_upload],
        outputs=[process_status]
    )
    ask_btn.click(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )
    # Allow Enter key to submit questions
    query_input.submit(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )
if __name__ == "__main__":
    # share=True requests a public gradio.live link when run locally;
    # NOTE(review): presumably this is ignored when hosted on HF Spaces — confirm.
    demo.launch(share=True)