Spaces:

akazmi
/

Legal2

Runtime error

App Files Community

akazmi committed on Nov 18, 2024

Commit

a961ba1

verified ·

1 Parent(s): 8b03d21

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -62

app.py CHANGED Viewed

@@ -1,97 +1,126 @@
 import gradio as gr
-import requests
 from PyPDF2 import PdfReader
 import re
-# URLs for your PDF files hosted on Hugging Face
-PDF_URLS = {
-    "Income Tax Ordinance": "https://huggingface.co/datasets/akazmi/legal-documents/resolve/main/Income%20Tax%20Ordinance.pdf",
-    "Companies Act 1984": "https://huggingface.co/datasets/akazmi/legal-documents/resolve/main/Companies%20Act%201984.pdf",
-}
-# Function to download and read the PDF from a URL
-def read_pdf_from_url(pdf_url):
     try:
-        response = requests.get(pdf_url)
-        response.raise_for_status()  # Check for errors
-        with open("temp.pdf", "wb") as f:
-            f.write(response.content)
-        # Read PDF content
-        reader = PdfReader("temp.pdf")
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
         return text
     except Exception as e:
-        return f"Error reading PDF from URL: {str(e)}"
-# Function to chunk large text
-def chunk_text(text, chunk_size=3000):
-    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
     return chunks
-# A simple similarity function
-def similarity(query, text):
-    query_words = set(query.lower().split())
-    text_words = set(text.lower().split())
-    return len(query_words.intersection(text_words))
-# Function to retrieve the most relevant chunk
 def retrieve_relevant_document(user_question, document_text):
-    text_chunks = chunk_text(document_text)
-    relevant_chunk = max(text_chunks, key=lambda chunk: similarity(user_question, chunk))
     return relevant_chunk
-# Function to handle question answering
-def answer_question(document_name, user_question):
-    if not document_name or not user_question:
-        return "Please select a document and enter a question."
-    # Fetch the selected document's text
-    pdf_url = PDF_URLS[document_name]
-    document_text = read_pdf_from_url(pdf_url)
-    if "Error" in document_text:
-        return document_text  # Return error message if PDF reading failed
-    # Retrieve the most relevant chunk
     relevant_chunk = retrieve_relevant_document(user_question, document_text)
-    # Simulate model response (replace with Groq or other model integration)
-    response = f"Relevant Section:\n{relevant_chunk[:500]}...\n\nThis section might help answer your question."
-    return response
-# Create Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
-        gr.Markdown("## Legal Document Q&A\nSelect a document and ask questions based on its content.")
-        document_dropdown = gr.Dropdown(
-            label="Select a Document",
-            choices=list(PDF_URLS.keys()),
-            value="Income Tax Ordinance",
-        )
         question_input = gr.Textbox(
-            label="Enter your question",
-            placeholder="Ask something related to the selected document..."
         )
         answer_output = gr.Textbox(label="Answer", interactive=False)
         submit_button = gr.Button("Ask")
-        # Connect inputs and outputs
         submit_button.click(
             fn=answer_question,
-            inputs=[document_dropdown, question_input],
             outputs=answer_output
         )
     return demo
-# Run the app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()

 import gradio as gr
+import os
+from groq import Groq
 from PyPDF2 import PdfReader
 import re
# Function to read the uploaded PDFs and return the text
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The concatenated text of all pages, or a string starting with
        "Error reading PDF:" when the file cannot be opened or parsed.
    """
    try:
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # A page without a text layer may not yield a usable string
            # (behaviour depends on the PyPDF2 version); treat it as empty
            # instead of failing the whole document on `str + None`.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # UI boundary: report the failure as text so the caller can show it.
        return f"Error reading PDF: {str(e)}"
# Function to chunk large text for Groq model to avoid token limits
def chunk_text(text, chunk_size=1000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (default 1000).

    Returns:
        A list of substrings in document order; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is not positive. A negative step would
            otherwise silently produce no chunks and drop the whole text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
# Function to retrieve the relevant chunk of text based on user question
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of *document_text* that best matches the question.

    The question is split into lowercase word tokens. The document is cut
    into 1000-character chunks and each chunk is scored by the number of
    case-insensitive substring occurrences of those tokens. The best chunk
    is returned with whole-word occurrences of the tokens wrapped in ``**``.

    Args:
        user_question: The question text; ``None`` is treated as empty.
        document_text: The full document text to search.

    Returns:
        The highlighted best-matching chunk, or
        "No relevant section found in the document." when no chunk contains
        any token of the question.
    """
    # Extract keywords from the user question
    keywords = re.findall(r"\b\w+\b", (user_question or "").lower())
    # Split text into smaller chunks for searching
    text_chunks = chunk_text(document_text, chunk_size=1000)

    # Find the chunk with the most keyword matches
    relevant_chunk = ""
    max_score = 0
    for chunk in text_chunks:
        lowered = chunk.lower()  # lowercase once per chunk, not once per keyword
        chunk_score = sum(lowered.count(keyword) for keyword in keywords)
        if chunk_score > max_score:  # strict '>' keeps the earliest chunk on ties
            max_score = chunk_score
            relevant_chunk = chunk

    # If no chunk is relevant, return a default message
    if max_score == 0:
        return "No relevant section found in the document."

    # Highlight in a single pass over the de-duplicated keywords. Substituting
    # once per keyword re-wrapped a word that appears twice in the question,
    # producing "****word****". A positive score guarantees `keywords` is
    # non-empty, so the alternation below is never empty.
    unique_keywords = dict.fromkeys(keywords)
    highlight = re.compile(
        r"\b(" + "|".join(re.escape(word) for word in unique_keywords) + r")\b",
        flags=re.IGNORECASE,
    )
    return highlight.sub(r"**\1**", relevant_chunk)
# Initialize Groq client
def initialize_groq():
    """Create a Groq client from the GROQ_API_KEY environment variable."""
    groq_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=groq_key)
# Function to handle document selection and answer generation using RAG
def answer_question(uploaded_file, user_question):
    """Answer *user_question* from the uploaded PDF using retrieval plus Groq.

    Args:
        uploaded_file: The value passed by the file-upload component: either
            an object exposing the path as ``.name`` or a plain path string.
            ``None`` means nothing was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when input is
        missing, the PDF cannot be read, or the model call fails.
    """
    # Check if file is uploaded
    if uploaded_file is None:
        return "Please upload a file before asking a question."
    # Reject an empty question before doing any file or network work.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Accept both a file-like value with `.name` and a bare path string.
    file_path = getattr(uploaded_file, "name", uploaded_file)

    # Read the content from the uploaded PDF file
    document_text = read_pdf(file_path)
    # Whitespace-only extraction (e.g. a scanned PDF) counts as empty.
    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."
    # read_pdf reports failures as text; show them to the user instead of
    # passing the error message to the model as if it were document content.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    # Perform document retrieval: get the most relevant chunk
    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    # Prepare the query for the model, including the relevant chunk of text
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Build the client inside the try so a missing or invalid API key is
        # reported as a message rather than crashing the UI callback.
        client = initialize_groq()
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        # Return the model's response
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"
# Create Gradio Interface
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The UI has a PDF upload, a question box, a read-only answer box and an
    "Ask" button that calls ``answer_question(file, question)``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the uploaded document")
        # Creation order is unchanged from the original; presumably it
        # determines the on-page order of the components.
        pdf_upload = gr.File(label="Upload a document (PDF)", file_count="single")
        question_box = gr.Textbox(
            placeholder="Ask something related to the uploaded document...",
            label="Enter your question",
        )
        answer_box = gr.Textbox(label="Answer", interactive=False)
        ask_button = gr.Button("Ask")
        # Wire the button: inputs are (file, question), output is the answer box.
        ask_button.click(
            fn=answer_question,
            outputs=answer_box,
            inputs=[pdf_upload, question_box],
        )
    return demo
# Run the interface
# Build and launch the UI only when this file is executed as a script,
# not when it is imported. The module-level name `demo` is kept as-is.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()