Spaces:

akazmi
/

Legal2

Runtime error

App Files Files Community

akazmi committed on Nov 18, 2024

Commit

983c9b5

verified ·

1 Parent(s): a961ba1

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -44

app.py CHANGED Viewed

@@ -3,10 +3,19 @@ import os
 from groq import Groq
 from PyPDF2 import PdfReader
 import re
 # Function to read the uploaded PDFs and return the text
-def read_pdf(file_path):
     try:
         with open(file_path, "rb") as file:
             reader = PdfReader(file)
             text = ""
@@ -17,58 +26,39 @@ def read_pdf(file_path):
         return f"Error reading PDF: {str(e)}"
 # Function to chunk large text for Groq model to avoid token limits
-def chunk_text(text, chunk_size=1000):
     chunks = []
-    # Split the text into chunks
     for i in range(0, len(text), chunk_size):
         chunks.append(text[i:i + chunk_size])
     return chunks
-# Function to retrieve the relevant chunk of text based on user question
 def retrieve_relevant_document(user_question, document_text):
-    # Extract keywords from the user question
-    keywords = re.findall(r"\b\w+\b", user_question.lower())
-    # Split text into smaller chunks for searching
-    text_chunks = chunk_text(document_text, chunk_size=1000)
-    # Find the chunk with the most keyword matches
-    relevant_chunk = ""
-    max_score = 0
-    for chunk in text_chunks:
-        # Count keyword matches in the chunk
-        chunk_score = sum(chunk.lower().count(keyword) for keyword in keywords)
-        if chunk_score > max_score:
-            max_score = chunk_score
-            relevant_chunk = chunk
-    # If no chunk is relevant, return a default message
-    if max_score == 0:
-        return "No relevant section found in the document."
-    # Return the most relevant chunk with highlighted keywords
-    for keyword in keywords:
-        relevant_chunk = re.sub(
-            fr"\b({keyword})\b", r"**\1**", relevant_chunk, flags=re.IGNORECASE
-        )
     return relevant_chunk
 # Initialize Groq client
 def initialize_groq():
     return Groq(api_key=os.getenv("GROQ_API_KEY"))
 # Function to handle document selection and answer generation using RAG
-def answer_question(uploaded_file, user_question):
-    # Check if file is uploaded
-    if uploaded_file is None:
-        return "Please upload a file before asking a question."
-    # Get the file path from Gradio's uploaded file component
-    file_path = uploaded_file.name
-    # Read the content from the uploaded PDF file
-    document_text = read_pdf(file_path)
     # If document text is empty, return an error message
     if not document_text:
@@ -97,16 +87,22 @@ def answer_question(uploaded_file, user_question):
 # Create Gradio Interface
 def create_interface():
     with gr.Blocks() as demo:
-        gr.Markdown("### Ask questions based on the uploaded document")
-        # File upload component (for users to upload documents)
-        file_input = gr.File(label="Upload a document (PDF)", file_count="single")
         question_input = gr.Textbox(
             label="Enter your question",
-            placeholder="Ask something related to the uploaded document..."
         )
         answer_output = gr.Textbox(label="Answer", interactive=False)
         # Button to submit the question and get the answer
@@ -114,7 +110,7 @@ def create_interface():
         submit_button.click(
             fn=answer_question,
-            inputs=[file_input, question_input],
             outputs=answer_output
         )

 from groq import Groq
 from PyPDF2 import PdfReader
 import re
+from datasets import load_dataset
 # Function to read the selected PDF from the dataset and return the text
+def read_pdf_from_dataset(file_name):
     try:
+        # Load the dataset containing the PDF files
+        dataset = load_dataset("akazmi/legal-documents")
+        # Get the content of the selected document
+        document = dataset["train"][file_name]
+        file_path = document["file"]
+        # Read the PDF file content
         with open(file_path, "rb") as file:
             reader = PdfReader(file)
             text = ""
         return f"Error reading PDF: {str(e)}"
 # Function to chunk large text for Groq model to avoid token limits
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that, concatenated in order, reproduce *text*.
        An empty *text* yields an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # document; reject it explicitly instead (zero already raised ValueError).
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
# Function to perform document retrieval (find the most relevant chunk)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of *document_text* that best matches *user_question*.

    The document is split with ``chunk_text`` and each chunk is scored with
    ``similarity``; the highest-scoring chunk is returned.

    Args:
        user_question: The question to match against the document.
        document_text: The full document text (may be empty or None).

    Returns:
        The best-matching chunk, or an empty string when there is no text
        to search.
    """
    # Without this guard an empty document produces no chunks and max()
    # raises ValueError on the empty sequence.
    if not document_text:
        return ""
    text_chunks = chunk_text(document_text)
    # Find the chunk with the highest relevance to the user's question
    return max(text_chunks, key=lambda chunk: similarity(user_question, chunk))
+# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
def similarity(query, text):
    """Count the distinct words that *query* and *text* have in common.

    Both strings are lower-cased and tokenized into runs of word characters,
    so punctuation attached to a word (e.g. the "?" in "tax?") does not
    prevent it from matching.

    Args:
        query: The user's question.
        text: The candidate text to compare against.

    Returns:
        The number of distinct shared words, as an int.
    """
    query_words = set(re.findall(r"\w+", query.lower()))
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)
 # Initialize Groq client
 def initialize_groq():
     """Create a Groq client using the key read from the GROQ_API_KEY environment variable."""
     api_key = os.getenv("GROQ_API_KEY")
     return Groq(api_key=api_key)
 # Function to handle document selection and answer generation using RAG
+def answer_question(selected_document, user_question):
+    # Check if document is selected
+    if selected_document is None:
+        return "Please select a document before asking a question."
+    # Read the content from the selected document
+    document_text = read_pdf_from_dataset(selected_document)
     # If document text is empty, return an error message
     if not document_text:
 # Create Gradio Interface
 def create_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("### Ask questions based on the selected document")
+        # Dropdown to select the document
+        document_dropdown = gr.Dropdown(
+            label="Select Document",
+            choices=["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"],
+            value="Income Tax Ordinance.pdf"
+        )
+        # Input for the user's question
         question_input = gr.Textbox(
             label="Enter your question",
+            placeholder="Ask something related to the selected document..."
         )
+        # Output area for the answer
         answer_output = gr.Textbox(label="Answer", interactive=False)
         # Button to submit the question and get the answer
         submit_button.click(
             fn=answer_question,
+            inputs=[document_dropdown, question_input],
             outputs=answer_output
         )