Spaces:

faryalnimra
/

RAG_BASED_APPLICATION

Sleeping

File size: 10,757 Bytes

import gradio as gr
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
import docx
import requests
import json
from typing import List
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RAGSystem:
    """Small retrieval-augmented generation (RAG) pipeline.

    Text extracted from uploaded files is split into chunks and embedded with
    a sentence-transformer model. At question time the chunks most similar to
    the query are pasted into a prompt that is sent to the Groq
    chat-completions endpoint.
    """

    # Seconds to wait for the Groq API before giving up, so a stalled
    # connection cannot block the caller indefinitely.
    request_timeout = 60

    def __init__(self):
        """Load the embedding model and start with an empty document index."""
        # Initialize sentence transformer for embeddings
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        # Text chunks; entry i corresponds to row i of self.embeddings.
        self.documents = []
        # Output of self.embedder.encode(self.documents), or None before indexing.
        self.embeddings = None
        self.groq_api_key = None
        self.groq_base_url = "https://api.groq.com/openai/v1/chat/completions"

    def set_api_key(self, api_key: str):
        """Set the Groq API key used for subsequent requests."""
        self.groq_api_key = api_key

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file.

        Returns the text of every page, each followed by a newline, or an
        empty string if the file cannot be read (the error is logged).
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # A page that yields no text must not abort the whole document,
                # so a missing result is treated as an empty page.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file.

        Returns every paragraph followed by a newline, or an empty string if
        the file cannot be read (the error is logged).
        """
        try:
            doc = docx.Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {e}")
            return ""

    def extract_text_from_txt(self, file_path: str) -> str:
        """Extract text from a TXT file, decoded as UTF-8.

        Undecodable bytes are replaced rather than raising, so one bad byte
        does not discard the whole file. Returns an empty string if the file
        cannot be opened (the error is logged).
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error extracting text from TXT: {e}")
            return ""

    def process_documents(self, files) -> str:
        """Extract, chunk and embed the uploaded files, replacing the index.

        Args:
            files: Iterable of uploads; each item is either an object exposing
                the file path as ``.name`` or a plain path string.

        Returns:
            A human-readable status message. The file count in the success
            message covers only files that actually yielded text.
        """
        if not files:
            return "No files uploaded."

        extractors = {
            '.pdf': self.extract_text_from_pdf,
            '.docx': self.extract_text_from_docx,
            '.txt': self.extract_text_from_txt,
        }
        chunks = []
        processed_count = 0

        for file in files:
            file_path = getattr(file, "name", file)
            file_extension = os.path.splitext(file_path)[1].lower()
            extractor = extractors.get(file_extension)
            if extractor is None:
                # Unsupported file type: skip it silently.
                continue

            text = extractor(file_path)
            if text.strip():
                # Split text into chunks (sentences or paragraphs)
                chunks.extend(self.split_text(text))
                processed_count += 1

        if not chunks:
            # Clear both halves of the index together so they never disagree.
            self.documents = []
            self.embeddings = None
            return "⚠️ No text could be extracted from the uploaded files."

        # Encode before touching instance state: if encoding fails, the
        # previous documents/embeddings pair stays intact and consistent.
        embeddings = self.embedder.encode(chunks)
        self.documents = chunks
        self.embeddings = embeddings
        return f"✅ Processed {processed_count} files with {len(chunks)} text chunks."

    def split_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks at period boundaries.

        Sentences are accumulated into a chunk while the running length stays
        below ``chunk_size``. A single sentence longer than ``chunk_size`` is
        kept whole as its own chunk. No characters are invented: a period is
        only emitted where the input had one.

        Returns:
            The whitespace-stripped, non-empty chunks in document order.
        """
        fragments = text.split('.')
        last_index = len(fragments) - 1
        chunks = []
        current_chunk = ""

        for index, fragment in enumerate(fragments):
            # Only fragments that were followed by a period in the source get
            # one back; the final fragment is whatever trailed the last period.
            piece = fragment + "." if index < last_index else fragment

            if not fragment.strip():
                # Blank fragments come from a trailing period or runs such as
                # "..."; keep their punctuation attached to the current chunk
                # instead of letting them open a chunk of their own.
                current_chunk += piece
                continue

            if len(current_chunk) + len(fragment) < chunk_size:
                current_chunk += piece
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = piece

        if current_chunk:
            chunks.append(current_chunk.strip())

        return [chunk for chunk in chunks if chunk]

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3) -> List[str]:
        """Return up to ``top_k`` chunks most similar to the query.

        Similarity is the cosine similarity between the query embedding and
        the stored chunk embeddings; results are ordered most similar first.
        Returns an empty list when nothing is indexed or ``top_k`` is not
        positive.
        """
        if not self.documents or self.embeddings is None or top_k <= 0:
            return []

        # Encode the query
        query_embedding = self.embedder.encode([query])

        # Calculate similarities
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # argsort is ascending, so reverse it to get the best matches first.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [self.documents[i] for i in top_indices]

    def query_groq(self, prompt: str) -> str:
        """Send the prompt to the Groq chat-completions API.

        Returns:
            The assistant's reply text, or a human-readable error message if
            no API key is set, the request fails, or the response does not
            have the expected shape. This method does not raise for those
            failures.
        """
        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."

        headers = {
            "Authorization": f"Bearer {self.groq_api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "llama-3.1-8b-instant",  # ✅ Valid Groq model
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Answer questions based on the provided context. If the context doesn't contain enough information to answer the question, say so clearly."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.7,
            "max_tokens": 1024,
            "stream": False
        }

        try:
            response = requests.post(
                self.groq_base_url,
                headers=headers,
                json=data,
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions where that error is not a RequestException.
            logger.error(f"Error querying Groq API: {e}")
            return f"Error querying Groq API: {str(e)}"

        try:
            return result["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            # Missing key, empty "choices" list, or a payload that is not a dict.
            logger.error(f"Unexpected Groq API response: {result}")
            return f"Unexpected Groq API response: {json.dumps(result, indent=2)}"

    def answer_query(self, query: str) -> str:
        """Answer a query using RAG.

        Retrieves the most relevant chunks, builds a context-plus-question
        prompt and returns the model's reply. Returns a warning message
        instead when no documents are indexed, no API key is set, or nothing
        could be retrieved.
        """
        if not self.documents:
            return "⚠️ No documents have been processed yet. Please upload and process documents first."

        if not self.groq_api_key:
            return "⚠️ Please set your Groq API key first."

        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(query)

        if not relevant_chunks:
            return "⚠️ No relevant information found in the documents."

        # Create context from relevant chunks
        context = "\n\n".join(relevant_chunks)

        # Create prompt for the LLM
        prompt = f"""Context from documents:
{context}

Question: {query}

Please answer the question based on the provided context. If the context doesn't contain enough information to fully answer the question, please mention what information is missing."""

        # Get response from Groq
        return self.query_groq(prompt)

# Initialize RAG system
# A single module-level instance is used by every Gradio callback below;
# constructing it loads the sentence-transformer model (see RAGSystem.__init__).
# NOTE(review): because this instance is global, the stored API key and the
# indexed documents appear to be shared by everyone using the running app —
# confirm that is intended.
rag_system = RAGSystem()

# Gradio interface functions
def set_api_key(api_key):
    """Store the Groq API key entered in the UI and return a status message.

    Surrounding whitespace (common when a key is pasted) is removed before
    the key is stored, since it would otherwise end up inside the
    Authorization header. A blank or missing value is rejected and leaves any
    previously stored key untouched.
    """
    cleaned_key = (api_key or "").strip()
    if not cleaned_key:
        return "⚠️ Please enter a valid API key."
    rag_system.set_api_key(cleaned_key)
    return "✅ API key set successfully!"

def process_files(files):
    """Gradio callback: index the uploaded files and return a status message.

    When nothing was uploaded, a warning is returned and the RAG system is
    not touched.
    """
    has_uploads = bool(files)
    if has_uploads:
        return rag_system.process_documents(files)
    return "⚠️ Please upload at least one file."

def answer_question(query):
    """Gradio callback: answer the user's question from the indexed documents.

    A missing, empty or whitespace-only question returns a warning instead of
    being sent through the RAG pipeline.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError.
    if not query or not query.strip():
        return "⚠️ Please enter a question."
    return rag_system.answer_query(query)

# Create Gradio interface
# The UI has two tabs: "Setup" (API key entry and document upload) and
# "Ask Questions" (question box, answer box and example questions). The
# event handlers at the bottom connect the widgets to the callbacks
# set_api_key, process_files and answer_question defined above.
with gr.Blocks(title="RAG Document Q&A System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 RAG Document Q&A System")
    gr.Markdown("Upload documents and ask questions about their content using AI!")
    
    # --- Tab 1: API key and document upload ---
    with gr.Tab("Setup"):
        gr.Markdown("## Step 1: Set your Groq API Key")
        gr.Markdown("Get your free API key from [Groq Console](https://console.groq.com/)")
        
        with gr.Row():
            # The key is entered in a password-type textbox.
            api_key_input = gr.Textbox(
                type="password",
                label="Groq API Key",
                placeholder="Enter your Groq API key here..."
            )
            set_key_btn = gr.Button("Set API Key", variant="primary")
        
        # Read-only box that shows the message returned by set_api_key.
        api_key_status = gr.Textbox(label="Status", interactive=False)
        
        gr.Markdown("## Step 2: Upload Documents")
        gr.Markdown("Upload PDF, DOCX, or TXT files")
        
        # The accepted extensions match the ones RAGSystem.process_documents handles.
        file_upload = gr.Files(
            file_types=[".pdf", ".docx", ".txt"],
            label="Upload Documents",
            file_count="multiple"
        )
        
        process_btn = gr.Button("Process Documents", variant="primary")
        # Read-only box that shows the message returned by process_files.
        process_status = gr.Textbox(label="Processing Status", interactive=False)
    
    # --- Tab 2: question answering ---
    with gr.Tab("Ask Questions"):
        gr.Markdown("## Ask Questions About Your Documents")
        
        with gr.Row():
            with gr.Column(scale=4):
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask a question about your documents...",
                    lines=2
                )
            with gr.Column(scale=1):
                ask_btn = gr.Button("Ask Question", variant="primary")
        
        # Read-only box that shows the string returned by answer_question.
        answer_output = gr.Textbox(
            label="Answer",
            lines=10,
            interactive=False
        )
        
        # Example questions
        gr.Markdown("### Example Questions:")
        # Each example is a one-element list because there is a single input (query_input).
        examples = gr.Examples(
            examples=[
                ["What is the main topic of the document?"],
                ["Can you summarize the key points?"],
                ["What are the conclusions mentioned?"],
                ["Are there any specific dates or numbers mentioned?"]
            ],
            inputs=query_input
        )
    
    # Event handlers
    # "Set API Key" button -> set_api_key; its status message goes to api_key_status.
    set_key_btn.click(
        fn=set_api_key,
        inputs=[api_key_input],
        outputs=[api_key_status]
    )
    
    # "Process Documents" button -> process_files; its status message goes to process_status.
    process_btn.click(
        fn=process_files,
        inputs=[file_upload],
        outputs=[process_status]
    )
    
    # "Ask Question" button -> answer_question; the answer goes to answer_output.
    ask_btn.click(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )
    
    # Allow Enter key to submit questions
    # (same callback and output as the "Ask Question" button above)
    query_input.submit(
        fn=answer_question,
        inputs=[query_input],
        outputs=[answer_output]
    )

# Launch the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link in addition
    # to the local server — confirm a public link is wanted for this deployment.
    demo.launch(share=True)