akazmi committed on
Commit
983c9b5
·
verified ·
1 Parent(s): a961ba1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -44
app.py CHANGED
@@ -3,10 +3,19 @@ import os
3
  from groq import Groq
4
  from PyPDF2 import PdfReader
5
  import re
 
6
 
7
  # Function to read the uploaded PDFs and return the text
8
- def read_pdf(file_path):
9
  try:
 
 
 
 
 
 
 
 
10
  with open(file_path, "rb") as file:
11
  reader = PdfReader(file)
12
  text = ""
@@ -17,58 +26,39 @@ def read_pdf(file_path):
17
  return f"Error reading PDF: {str(e)}"
18
 
19
  # Function to chunk large text for Groq model to avoid token limits
20
- def chunk_text(text, chunk_size=1000):
21
  chunks = []
22
- # Split the text into chunks
23
  for i in range(0, len(text), chunk_size):
24
  chunks.append(text[i:i + chunk_size])
25
  return chunks
26
 
27
- # Function to retrieve the relevant chunk of text based on user question
28
  def retrieve_relevant_document(user_question, document_text):
29
- # Extract keywords from the user question
30
- keywords = re.findall(r"\b\w+\b", user_question.lower())
31
-
32
- # Split text into smaller chunks for searching
33
- text_chunks = chunk_text(document_text, chunk_size=1000)
34
-
35
- # Find the chunk with the most keyword matches
36
- relevant_chunk = ""
37
- max_score = 0
38
- for chunk in text_chunks:
39
- # Count keyword matches in the chunk
40
- chunk_score = sum(chunk.lower().count(keyword) for keyword in keywords)
41
- if chunk_score > max_score:
42
- max_score = chunk_score
43
- relevant_chunk = chunk
44
-
45
- # If no chunk is relevant, return a default message
46
- if max_score == 0:
47
- return "No relevant section found in the document."
48
-
49
- # Return the most relevant chunk with highlighted keywords
50
- for keyword in keywords:
51
- relevant_chunk = re.sub(
52
- fr"\b({keyword})\b", r"**\1**", relevant_chunk, flags=re.IGNORECASE
53
- )
54
-
55
  return relevant_chunk
56
 
 
 
 
 
 
 
 
57
  # Initialize Groq client
58
  def initialize_groq():
59
  return Groq(api_key=os.getenv("GROQ_API_KEY"))
60
 
61
  # Function to handle document selection and answer generation using RAG
62
- def answer_question(uploaded_file, user_question):
63
- # Check if file is uploaded
64
- if uploaded_file is None:
65
- return "Please upload a file before asking a question."
66
-
67
- # Get the file path from Gradio's uploaded file component
68
- file_path = uploaded_file.name
69
 
70
- # Read the content from the uploaded PDF file
71
- document_text = read_pdf(file_path)
72
 
73
  # If document text is empty, return an error message
74
  if not document_text:
@@ -97,16 +87,22 @@ def answer_question(uploaded_file, user_question):
97
  # Create Gradio Interface
98
  def create_interface():
99
  with gr.Blocks() as demo:
100
- gr.Markdown("### Ask questions based on the uploaded document")
101
 
102
- # File upload component (for users to upload documents)
103
- file_input = gr.File(label="Upload a document (PDF)", file_count="single")
 
 
 
 
104
 
 
105
  question_input = gr.Textbox(
106
  label="Enter your question",
107
- placeholder="Ask something related to the uploaded document..."
108
  )
109
 
 
110
  answer_output = gr.Textbox(label="Answer", interactive=False)
111
 
112
  # Button to submit the question and get the answer
@@ -114,7 +110,7 @@ def create_interface():
114
 
115
  submit_button.click(
116
  fn=answer_question,
117
- inputs=[file_input, question_input],
118
  outputs=answer_output
119
  )
120
 
 
3
  from groq import Groq
4
  from PyPDF2 import PdfReader
5
  import re
6
+ from datasets import load_dataset
7
 
8
  # Function to read the selected PDF from the dataset and return its text
9
+ def read_pdf_from_dataset(file_name):
10
  try:
11
+ # Load the dataset containing the PDF files
12
+ dataset = load_dataset("akazmi/legal-documents")
13
+
14
+ # Get the content of the selected document
15
+ document = dataset["train"][file_name]
16
+ file_path = document["file"]
17
+
18
+ # Read the PDF file content
19
  with open(file_path, "rb") as file:
20
  reader = PdfReader(file)
21
  text = ""
 
26
  return f"Error reading PDF: {str(e)}"
27
 
28
  # Function to chunk large text for Groq model to avoid token limits
29
def chunk_text(text, chunk_size=3000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in their original order. Every chunk except
        possibly the last holds exactly ``chunk_size`` characters. Empty
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # range() rejects a zero step with an unrelated-looking message, and a
    # negative step produces no chunks at all, silently dropping the text.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
34
 
35
# Function to perform document retrieval (find the most relevant chunk)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of ``document_text`` that best matches ``user_question``.

    The text is split with ``chunk_text`` (using its default chunk size) and
    each chunk is scored with ``similarity``. The highest-scoring chunk is
    returned; on a tie the earliest chunk wins, because ``max`` keeps the
    first maximal element.

    Args:
        user_question: The question to match against the document.
        document_text: Full text of the document to search.

    Returns:
        The best-matching chunk, or an empty string when ``document_text``
        is empty.
    """
    # max() raises ValueError on an empty sequence, so bail out before
    # chunking when there is nothing to search.
    if not document_text:
        return ""

    text_chunks = chunk_text(document_text)

    # Find chunk with the highest relevance to the user's question
    return max(text_chunks, key=lambda chunk: similarity(user_question, chunk))
42
 
43
+ # A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
44
def similarity(query, text):
    """Count the distinct words that ``query`` and ``text`` have in common.

    Both strings are lower-cased and tokenized into runs of word characters,
    so punctuation attached to a word ("tax?", "income,") does not prevent a
    match. Repeated words count once.

    Args:
        query: The user's question.
        text: The candidate text to compare against.

    Returns:
        The number of distinct words present in both strings.
    """
    # Splitting on whitespace alone would leave "tax?" and "tax" as different
    # tokens; extracting \w+ runs strips the surrounding punctuation.
    query_words = set(re.findall(r"\w+", query.lower()))
    if not query_words:
        return 0
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)
49
+
50
  # Initialize Groq client
51
  def initialize_groq():
52
  return Groq(api_key=os.getenv("GROQ_API_KEY"))
53
 
54
  # Function to handle document selection and answer generation using RAG
55
+ def answer_question(selected_document, user_question):
56
+ # Check if document is selected
57
+ if selected_document is None:
58
+ return "Please select a document before asking a question."
 
 
 
59
 
60
+ # Read the content from the selected document
61
+ document_text = read_pdf_from_dataset(selected_document)
62
 
63
  # If document text is empty, return an error message
64
  if not document_text:
 
87
  # Create Gradio Interface
88
  def create_interface():
89
  with gr.Blocks() as demo:
90
+ gr.Markdown("### Ask questions based on the selected document")
91
 
92
+ # Dropdown to select the document
93
+ document_dropdown = gr.Dropdown(
94
+ label="Select Document",
95
+ choices=["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"],
96
+ value="Income Tax Ordinance.pdf"
97
+ )
98
 
99
+ # Input for the user's question
100
  question_input = gr.Textbox(
101
  label="Enter your question",
102
+ placeholder="Ask something related to the selected document..."
103
  )
104
 
105
+ # Output area for the answer
106
  answer_output = gr.Textbox(label="Answer", interactive=False)
107
 
108
  # Button to submit the question and get the answer
 
110
 
111
  submit_button.click(
112
  fn=answer_question,
113
+ inputs=[document_dropdown, question_input],
114
  outputs=answer_output
115
  )
116