File size: 4,250 Bytes
f26f589
a961ba1
 
88bb79e
 
983c9b5
88bb79e
a961ba1
983c9b5
88bb79e
983c9b5
 
 
 
 
 
 
 
a961ba1
 
 
 
 
88bb79e
 
a961ba1
88bb79e
a961ba1
983c9b5
a961ba1
 
 
88bb79e
 
983c9b5
8b03d21
983c9b5
 
 
 
8b03d21
88bb79e
983c9b5
 
 
 
 
 
 
a961ba1
 
 
 
 
983c9b5
 
 
 
a961ba1
983c9b5
 
a961ba1
 
 
 
 
 
88bb79e
 
a961ba1
 
 
 
 
c292876
a961ba1
 
 
 
 
 
 
 
 
 
 
 
88bb79e
 
983c9b5
88bb79e
983c9b5
 
 
 
 
 
88bb79e
983c9b5
88bb79e
a961ba1
983c9b5
88bb79e
 
983c9b5
88bb79e
a961ba1
 
88bb79e
 
 
 
983c9b5
88bb79e
 
a961ba1
88bb79e
 
a961ba1
88bb79e
 
8b03d21
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import os
from groq import Groq
from PyPDF2 import PdfReader
import re
from datasets import load_dataset

# Function to read the uploaded PDFs and return the text
def read_pdf_from_dataset(file_name):
    """Load one PDF from the 'akazmi/legal-documents' dataset and return its text.

    Returns the concatenated text of every page on success, or an
    "Error reading PDF: ..." string if the download, lookup, or parsing fails.
    """
    try:
        # Load the dataset containing the PDF files
        dataset = load_dataset("akazmi/legal-documents")

        # NOTE(review): indexing a split with a string selects a COLUMN in
        # `datasets`, not a row — confirm this lookup matches the dataset schema.
        document = dataset["train"][file_name]
        file_path = document["file"]

        # Read the PDF file content
        text_parts = []
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            for page in reader.pages:
                # extract_text() may return None (e.g. image-only pages);
                # coalesce to "" so one blank page doesn't abort the whole read.
                text_parts.append(page.extract_text() or "")
        return "".join(text_parts)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

# Function to chunk large text for Groq model to avoid token limits
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    return [text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)]

# Function to perform document retrieval (find the relevant chunks)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of *document_text* most relevant to *user_question*.

    Relevance is scored by the word-overlap `similarity` helper; ties resolve
    to the earliest chunk (max() keeps the first maximal element).
    """
    text_chunks = chunk_text(document_text)

    # Bug fix: chunk_text("") is [], and max() over an empty sequence raises
    # ValueError — fall back to an empty chunk instead of crashing.
    relevant_chunk = max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )
    return relevant_chunk

# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
def similarity(query, text):
    """Score overlap between *query* and *text*.

    Returns the number of distinct words (case-insensitive, whitespace-split)
    that appear in both strings. A crude stand-in for embedding similarity.
    """
    return len(set(query.lower().split()) & set(text.lower().split()))

# Initialize Groq client
def initialize_groq():
    """Build a Groq API client using the GROQ_API_KEY environment variable."""
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)

# Function to handle document selection and answer generation using RAG
def answer_question(selected_document, user_question):
    """Answer *user_question* using the selected document via a RAG pipeline.

    Reads the document, retrieves the most relevant text chunk, and asks the
    Groq chat model. Always returns a string: either the model's answer or a
    human-readable error message.
    """
    # Check if document is selected
    if selected_document is None:
        return "Please select a document before asking a question."

    # Read the content from the selected document
    document_text = read_pdf_from_dataset(selected_document)

    # If document text is empty, return an error message
    if not document_text:
        return "Error: The document content is empty or could not be extracted."

    # Bug fix: a failed read returns a non-empty "Error reading PDF: ..."
    # string, which previously slipped past the emptiness check and was fed
    # to the model as document content. Surface the error to the user instead.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    # Perform document retrieval: get the most relevant chunk
    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    # Prepare the query for the model, including the relevant chunk of text
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    # Initialize Groq client
    client = initialize_groq()

    try:
        # Generate the answer from the Groq model
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        # Return the model's response
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"

# Create Gradio Interface
def create_interface():
    """Assemble the Gradio Blocks UI: document picker, question box, answer box."""
    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the selected document")

        # Document picker, defaulting to the first available PDF
        available_docs = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]
        doc_selector = gr.Dropdown(
            label="Select Document",
            choices=available_docs,
            value="Income Tax Ordinance.pdf"
        )

        # Free-text question from the user
        user_query = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the selected document..."
        )

        # Read-only box that displays the generated answer
        model_answer = gr.Textbox(label="Answer", interactive=False)

        # Wire the button to the RAG pipeline
        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[doc_selector, user_query],
            outputs=model_answer
        )

    return demo

# Run the interface
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    create_interface().launch()