# app.py — RAG Document Q&A system (Gradio UI + Groq chat completions).
import gradio as gr
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
import docx
import requests
import json
from typing import List
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RAGSystem:
    """Retrieval-augmented generation (RAG) over user-uploaded documents.

    Uploaded PDF/DOCX/TXT files are split into ~500-character chunks and
    embedded with a SentenceTransformer. At query time, the chunks most
    similar to the question (cosine similarity) are passed as context to
    the Groq chat-completions API.
    """

    def __init__(self):
        # Small, fast, general-purpose embedding model.
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents = []     # text chunks from all processed uploads
        self.embeddings = None  # ndarray of chunk embeddings (None until processed)
        self.groq_api_key = None
        self.groq_base_url = "https://api.groq.com/openai/v1/chat/completions"

    def set_api_key(self, api_key: str):
        """Store the Groq API key used by query_groq()."""
        self.groq_api_key = api_key

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file; returns "" on failure.

        Fix: PyPDF2's extract_text() can return None for image-only
        pages; previously that raised TypeError and discarded the whole
        document. Empty pages are now treated as "".
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # join() avoids quadratic string concatenation on large files.
                return "".join((page.extract_text() or "") + "\n"
                               for page in pdf_reader.pages)
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file; returns "" on failure."""
        try:
            doc = docx.Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {e}")
            return ""

    def extract_text_from_txt(self, file_path: str) -> str:
        """Read a text file as UTF-8; returns "" on failure.

        errors="replace" keeps partially mis-encoded files readable
        instead of discarding the whole file on a single decode error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error extracting text from TXT: {e}")
            return ""

    def process_documents(self, files) -> str:
        """Extract, chunk and embed the uploaded files.

        Replaces any previously processed corpus and returns a
        human-readable status message for the UI.
        """
        if not files:
            return "No files uploaded."
        self.documents = []
        # Fix: also drop stale embeddings from a previous corpus so they
        # can never be mismatched against the new (possibly empty) chunks.
        self.embeddings = None
        for file in files:
            # Gradio may hand back tempfile wrappers (with .name) or plain
            # path strings depending on version; support both.
            file_path = getattr(file, "name", file)
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension == '.pdf':
                text = self.extract_text_from_pdf(file_path)
            elif file_extension == '.docx':
                text = self.extract_text_from_docx(file_path)
            elif file_extension == '.txt':
                text = self.extract_text_from_txt(file_path)
            else:
                continue  # silently skip unsupported extensions
            if text.strip():
                # Split text into chunks and add them to the corpus.
                self.documents.extend(self.split_text(text))
        if self.documents:
            # One batch encode for the whole corpus.
            self.embeddings = self.embedder.encode(self.documents)
            return f"✅ Processed {len(files)} files with {len(self.documents)} text chunks."
        return "⚠️ No text could be extracted from the uploaded files."

    def split_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of roughly chunk_size characters.

        Naive sentence split on '.'; fix: empty fragments (from "..",
        ellipses or trailing periods) are skipped so chunks no longer
        pick up stray "." suffixes.
        """
        sentences = [s for s in text.split('.') if s.strip()]
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + "."
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + "."
        if current_chunk:
            chunks.append(current_chunk.strip())
        return [chunk for chunk in chunks if chunk.strip()]

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3) -> List[str]:
        """Return the top_k chunks most similar to query (cosine similarity)."""
        if not self.documents or self.embeddings is None:
            return []
        query_embedding = self.embedder.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        # Indices of the highest-similarity chunks, best first.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [self.documents[i] for i in top_indices]

    def query_groq(self, prompt: str) -> str:
        """Send prompt to the Groq chat-completions API and return the reply.

        Returns an error string instead of raising, so the Gradio UI can
        display failures directly.
        """
        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."
        headers = {
            "Authorization": f"Bearer {self.groq_api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "llama-3.1-8b-instant",  # valid Groq model id
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Answer questions based on the provided context. If the context doesn't contain enough information to answer the question, say so clearly."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.7,
            "max_tokens": 1024,
            "stream": False
        }
        result = None  # defined up-front so the error path below can log it
        try:
            # Fix: a timeout prevents the UI hanging forever on a stalled
            # connection (requests has no default timeout).
            response = requests.post(self.groq_base_url, headers=headers,
                                     json=data, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"]
        except requests.exceptions.RequestException as e:
            logger.error(f"Error querying Groq API: {e}")
            return f"Error querying Groq API: {str(e)}"
        except (KeyError, IndexError, TypeError):
            # Fix: also covers IndexError/TypeError from a malformed but
            # successfully decoded response body (original caught KeyError only).
            logger.error(f"Unexpected Groq API response: {result}")
            return f"Unexpected Groq API response: {json.dumps(result, indent=2)}"

    def answer_query(self, query: str) -> str:
        """Answer query with RAG: retrieve context, then ask the LLM."""
        if not self.documents:
            return "⚠️ No documents have been processed yet. Please upload and process documents first."
        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."
        # Retrieve relevant chunks and join them into the LLM context.
        relevant_chunks = self.retrieve_relevant_chunks(query)
        if not relevant_chunks:
            return "⚠️ No relevant information found in the documents."
        context = "\n\n".join(relevant_chunks)
        prompt = f"""Context from documents:
{context}
Question: {query}
Please answer the question based on the provided context. If the context doesn't contain enough information to fully answer the question, please mention what information is missing."""
        return self.query_groq(prompt)
# Single shared RAG engine used by every Gradio callback below.
rag_system = RAGSystem()


def set_api_key(api_key):
    """Gradio callback: store the Groq API key on the shared engine."""
    rag_system.set_api_key(api_key)
    return "✅ API key set successfully!"


def process_files(files):
    """Gradio callback: ingest the uploaded files (or warn if none)."""
    if files:
        return rag_system.process_documents(files)
    return "⚠️ Please upload at least one file."


def answer_question(query):
    """Gradio callback: answer a question about the ingested documents."""
    if not query.strip():
        return "⚠️ Please enter a question."
    return rag_system.answer_query(query)
# Create Gradio interface
# Two tabs: "Setup" (API key + document upload) and "Ask Questions".
# Component creation order defines the on-screen layout.
with gr.Blocks(title="RAG Document Q&A System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 RAG Document Q&A System")
    gr.Markdown("Upload documents and ask questions about their content using AI!")
    with gr.Tab("Setup"):
        gr.Markdown("## Step 1: Set your Groq API Key")
        gr.Markdown("Get your free API key from [Groq Console](https://console.groq.com/)")
        with gr.Row():
            api_key_input = gr.Textbox(
                type="password",  # mask the key in the UI
                label="Groq API Key",
                placeholder="Enter your Groq API key here..."
            )
            set_key_btn = gr.Button("Set API Key", variant="primary")
        api_key_status = gr.Textbox(label="Status", interactive=False)
        gr.Markdown("## Step 2: Upload Documents")
        gr.Markdown("Upload PDF, DOCX, or TXT files")
        file_upload = gr.Files(
            file_types=[".pdf", ".docx", ".txt"],
            label="Upload Documents",
            file_count="multiple"
        )
        process_btn = gr.Button("Process Documents", variant="primary")
        process_status = gr.Textbox(label="Processing Status", interactive=False)
    with gr.Tab("Ask Questions"):
        gr.Markdown("## Ask Questions About Your Documents")
        with gr.Row():
            with gr.Column(scale=4):
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask a question about your documents...",
                    lines=2
                )
            with gr.Column(scale=1):
                ask_btn = gr.Button("Ask Question", variant="primary")
        answer_output = gr.Textbox(
            label="Answer",
            lines=10,
            interactive=False
        )
        # Example questions (clicking one fills the question box)
        gr.Markdown("### Example Questions:")
        examples = gr.Examples(
            examples=[
                ["What is the main topic of the document?"],
                ["Can you summarize the key points?"],
                ["What are the conclusions mentioned?"],
                ["Are there any specific dates or numbers mentioned?"]
            ],
            inputs=query_input
        )
    # Event handlers: wire buttons to the module-level callback wrappers.
    set_key_btn.click(
        fn=set_api_key,
        inputs=[api_key_input],
        outputs=[api_key_status]
    )
    process_btn.click(
        fn=process_files,
        inputs=[file_upload],
        outputs=[process_status]
    )
    ask_btn.click(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )
    # Allow Enter key to submit questions
    query_input.submit(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )

if __name__ == "__main__":
    # share=True requests a public tunnel link when run locally.
    demo.launch(share=True)