File size: 4,250 Bytes
f26f589 a961ba1 88bb79e 983c9b5 88bb79e a961ba1 983c9b5 88bb79e 983c9b5 a961ba1 88bb79e a961ba1 88bb79e a961ba1 983c9b5 a961ba1 88bb79e 983c9b5 8b03d21 983c9b5 8b03d21 88bb79e 983c9b5 a961ba1 983c9b5 a961ba1 983c9b5 a961ba1 88bb79e a961ba1 c292876 a961ba1 88bb79e 983c9b5 88bb79e 983c9b5 88bb79e 983c9b5 88bb79e a961ba1 983c9b5 88bb79e 983c9b5 88bb79e a961ba1 88bb79e 983c9b5 88bb79e a961ba1 88bb79e a961ba1 88bb79e 8b03d21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import gradio as gr
import os
from groq import Groq
from PyPDF2 import PdfReader
import re
from datasets import load_dataset
# Function to read the selected PDF from the Hugging Face dataset and return its text
def read_pdf_from_dataset(file_name):
    """Return the extracted text of one PDF from the legal-documents dataset.

    Args:
        file_name: Key used to look the document up in the ``train`` split of
            the ``akazmi/legal-documents`` dataset.

    Returns:
        The concatenated text of every page, or a string starting with
        ``"Error reading PDF: "`` if anything goes wrong. Failures are
        reported through the return value rather than raised, so callers
        must check for that prefix.
    """
    try:
        # Load the dataset containing the PDF files
        dataset = load_dataset("akazmi/legal-documents")

        # NOTE(review): in `datasets`, indexing a split with a string normally
        # selects a column rather than a row - confirm this lookup really
        # returns the record for `file_name`.
        document = dataset["train"][file_name]
        file_path = document["file"]

        # Read the PDF file content
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # Join once instead of repeated `+=`; `or ""` keeps a page that
            # yields no text from breaking the concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberate catch-all: the UI shows this message instead of crashing.
        return f"Error reading PDF: {str(e)}"
# Function to chunk large text for Groq model to avoid token limits
def chunk_text(text, chunk_size=3000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of substrings in original order. Every chunk except possibly
        the last has exactly ``chunk_size`` characters. Empty input yields
        an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail with a clear message instead of a cryptic range() error (size 0)
    # or a silently empty result (negative size).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
# Function to perform document retrieval (find the relevant chunks)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of ``document_text`` that best matches ``user_question``.

    The document is split with ``chunk_text`` and each chunk is scored with
    ``similarity``. The highest-scoring chunk wins; on a tie the earliest
    chunk is returned.

    Args:
        user_question: The question to match against.
        document_text: Full text of the document to search.

    Returns:
        The best-matching chunk, or ``""`` when ``document_text`` is empty.
    """
    text_chunks = chunk_text(document_text)
    # `default` stops max() from raising ValueError when there are no chunks.
    return max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )
# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
def similarity(query, text):
    """Count the distinct words that ``query`` and ``text`` have in common.

    Both inputs are lower-cased and tokenised into runs of word characters,
    so attached punctuation ("tax?" vs "tax") does not block a match.

    Args:
        query: The user's question.
        text: The candidate passage.

    Returns:
        The number of distinct shared words (0 if there are none).
    """
    # Tokenise on \w+ rather than whitespace so punctuation is dropped.
    query_words = set(re.findall(r"\w+", query.lower()))
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)
# Initialize Groq client
def initialize_groq():
    """Build a Groq client from the ``GROQ_API_KEY`` environment variable."""
    groq_api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=groq_api_key)
# Function to handle document selection and answer generation using RAG
def answer_question(selected_document, user_question):
    """Answer ``user_question`` using the most relevant part of a document.

    Steps: validate the inputs, read the document text, pick the chunk that
    best matches the question, and send question plus chunk to the Groq chat
    model.

    Args:
        selected_document: Name of the document chosen in the UI.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message describing what went
        wrong. This function reports problems through its return value and
        does not raise for them.
    """
    # Validate inputs before any expensive work.
    if not selected_document:
        return "Please select a document before asking a question."
    if not user_question or not user_question.strip():
        return "Please enter a question before submitting."

    # Read the content from the selected document
    document_text = read_pdf_from_dataset(selected_document)

    # Whitespace-only text counts as empty too.
    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."

    # The reader reports failures as an "Error reading PDF: ..." string.
    # Show it to the user instead of passing it to the model as context.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    # Perform document retrieval: get the most relevant chunk
    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    # Prepare the query for the model, including the relevant chunk of text
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Build the client inside the try so a construction failure (for
        # example a missing API key) is reported like any other API error.
        client = initialize_groq()
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Catch-all at the UI boundary: show the failure in the answer box.
        return f"Error generating answer: {str(e)}"
# Create Gradio Interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is a heading, a document dropdown, a question box, a
    read-only answer box, and an "Ask" button that calls ``answer_question``.
    """
    available_documents = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]

    with gr.Blocks() as blocks_app:
        gr.Markdown("### Ask questions based on the selected document")

        # Document selector; the first entry is preselected.
        doc_picker = gr.Dropdown(
            choices=available_documents,
            value=available_documents[0],
            label="Select Document",
        )

        # Free-text question from the user.
        question_box = gr.Textbox(
            placeholder="Ask something related to the selected document...",
            label="Enter your question",
        )

        # Read-only box that shows the answer.
        answer_box = gr.Textbox(interactive=False, label="Answer")

        # Clicking the button sends (document, question) to answer_question.
        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[doc_picker, question_box],
            outputs=answer_box,
        )

    return blocks_app
# Run the interface
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    demo = create_interface()
    demo.launch()
|