akazmi committed on
Commit
a961ba1
·
verified ·
1 Parent(s): 8b03d21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -62
app.py CHANGED
@@ -1,97 +1,126 @@
1
  import gradio as gr
2
- import requests
 
3
  from PyPDF2 import PdfReader
4
  import re
5
 
6
# Display name -> direct-download URL for each PDF hosted on Hugging Face.
PDF_URLS = {
    "Income Tax Ordinance": (
        "https://huggingface.co/datasets/akazmi/legal-documents/"
        "resolve/main/Income%20Tax%20Ordinance.pdf"
    ),
    "Companies Act 1984": (
        "https://huggingface.co/datasets/akazmi/legal-documents/"
        "resolve/main/Companies%20Act%201984.pdf"
    ),
}
11
-
12
def read_pdf_from_url(pdf_url, timeout=30):
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_url: Direct-download URL of the PDF.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The extracted text, or a message starting with
        "Error reading PDF from URL:" if downloading or parsing fails.
    """
    import io  # local import: only this function needs an in-memory stream

    try:
        # Without a timeout a stalled server would hang the request forever.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # Parse straight from memory rather than a fixed "temp.pdf" on disk,
        # so simultaneous requests cannot overwrite each other's download.
        reader = PdfReader(io.BytesIO(response.content))

        # `or ""` is defensive: a page that yields no text must not break
        # the join.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Failures are reported as text because the caller shows the return
        # value directly in the UI.
        return f"Error reading PDF from URL: {str(e)}"
28
 
29
def chunk_text(text, chunk_size=3000):
    """Split text into consecutive pieces of at most chunk_size characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in document order; empty when text is empty.

    Raises:
        ValueError: If chunk_size is zero or negative.
    """
    # Reject bad sizes explicitly: 0 would raise an opaque range() error and
    # a negative value would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
33
 
34
def similarity(query, text):
    """Count the distinct words that query and text have in common.

    Both strings are lower-cased and tokenised on word characters, so
    punctuation attached to a word ("tax?" or "levied.") does not prevent
    a match.

    Args:
        query: The user's question.
        text: The candidate passage.

    Returns:
        The number of distinct shared words (0 when nothing overlaps).
    """
    query_words = set(re.findall(r"\w+", query.lower()))
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)
39
-
40
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of document_text that best matches the question.

    The document is split with chunk_text() and each chunk is scored with
    similarity(); the highest-scoring chunk wins (the first one on ties).

    Args:
        user_question: The question typed by the user.
        document_text: Full text of the selected document.

    Returns:
        The best-matching chunk, or "" when the document has no text.
    """
    text_chunks = chunk_text(document_text)
    # default="" keeps an empty document from raising ValueError in max().
    relevant_chunk = max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )
    return relevant_chunk
45
 
46
def answer_question(document_name, user_question):
    """Answer a question by quoting the most relevant part of a document.

    Args:
        document_name: A key of PDF_URLS chosen in the dropdown.
        user_question: The question typed by the user.

    Returns:
        A user-facing string: a prompt to fill in missing input, an error
        message, or the first 500 characters of the best-matching section.
    """
    if not document_name or not user_question or not user_question.strip():
        return "Please select a document and enter a question."

    # .get() turns an unexpected name into a readable message instead of an
    # unhandled KeyError.
    pdf_url = PDF_URLS.get(document_name)
    if pdf_url is None:
        return f"Unknown document: {document_name}"

    document_text = read_pdf_from_url(pdf_url)

    # read_pdf_from_url() signals failure with this exact prefix. Matching
    # the prefix, not the bare word "Error", avoids rejecting documents that
    # merely contain that word.
    if document_text.startswith("Error reading PDF from URL:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    # Placeholder response until a real model (e.g. Groq) is integrated.
    return (
        f"Relevant Section:\n{relevant_chunk[:500]}...\n\n"
        "This section might help answer your question."
    )
 
 
64
 
65
def create_interface():
    """Assemble the Gradio Blocks UI for the legal-document Q&A app.

    Returns:
        The gr.Blocks object, ready to be launched by the caller.
    """
    with gr.Blocks() as interface:
        gr.Markdown("## Legal Document Q&A\nSelect a document and ask questions based on its content.")

        # Inputs: which document to search, and the question to ask.
        doc_picker = gr.Dropdown(
            label="Select a Document",
            choices=list(PDF_URLS),
            value="Income Tax Ordinance",
        )
        question_box = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the selected document...",
        )

        # Output: read-only box that receives answer_question()'s string.
        answer_box = gr.Textbox(label="Answer", interactive=False)

        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[doc_picker, question_box],
            outputs=answer_box,
        )

    return interface
93
 
94
# Build and launch the UI only when this file is executed as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
 
1
  import gradio as gr
2
+ import os
3
+ from groq import Groq
4
  from PyPDF2 import PdfReader
5
  import re
6
 
7
def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF file.

    Args:
        file_path: Path of the PDF on disk.

    Returns:
        The extracted text, or a message starting with "Error reading PDF:"
        if the file cannot be opened or parsed.
    """
    try:
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # Join once instead of repeated `+=`; `or ""` is defensive so a
            # page that yields no text cannot break the concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Failures are reported as text because the caller surfaces the
        # return value in the UI.
        return f"Error reading PDF: {str(e)}"
18
 
19
def chunk_text(text, chunk_size=1000):
    """Split text into consecutive pieces of at most chunk_size characters.

    Small chunks keep the prompt sent to the model within its token limit.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in document order; empty when text is empty.

    Raises:
        ValueError: If chunk_size is zero or negative.
    """
    # Reject bad sizes explicitly: 0 would raise an opaque range() error and
    # a negative value would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
26
 
27
def retrieve_relevant_document(user_question, document_text):
    """Find the chunk of the document that best matches the question.

    The document is split into 1000-character chunks. Each chunk is scored
    by how many times the question's words occur in it (case-insensitive
    substring count), and the first chunk with the highest score is chosen.
    Whole-word occurrences of the question's words are then wrapped in
    ``**`` markers.

    Args:
        user_question: The question typed by the user.
        document_text: Full text of the uploaded document.

    Returns:
        The highlighted best chunk, or
        "No relevant section found in the document." when no chunk contains
        any of the question's words.
    """
    keywords = re.findall(r"\b\w+\b", user_question.lower())
    if not keywords:
        # Nothing to search for, so every chunk would score 0 anyway.
        return "No relevant section found in the document."

    relevant_chunk = ""
    max_score = 0
    for chunk in chunk_text(document_text, chunk_size=1000):
        lowered = chunk.lower()  # lower-case once per chunk, not per keyword
        chunk_score = sum(lowered.count(keyword) for keyword in keywords)
        if chunk_score > max_score:
            max_score = chunk_score
            relevant_chunk = chunk

    if max_score == 0:
        return "No relevant section found in the document."

    # Highlight in a single pass over the de-duplicated keywords. Running one
    # substitution per keyword re-wrapped a word each time it was repeated in
    # the question, producing "****word****".
    unique_keywords = dict.fromkeys(keywords)
    highlight = re.compile(
        r"\b(" + "|".join(re.escape(word) for word in unique_keywords) + r")\b",
        flags=re.IGNORECASE,
    )
    return highlight.sub(r"**\1**", relevant_chunk)
56
 
57
def initialize_groq():
    """Create a Groq client from the GROQ_API_KEY environment variable.

    Returns:
        A Groq client constructed with whatever os.getenv returns for
        GROQ_API_KEY (None if the variable is unset).
    """
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)
60
+
61
def answer_question(uploaded_file, user_question):
    """Answer a question about an uploaded PDF using retrieval plus Groq.

    The PDF is read, the chunk most relevant to the question is selected,
    and the question together with that chunk is sent to the Groq chat model.

    Args:
        uploaded_file: Value supplied by the Gradio File component: None, a
            path string, or an object whose ``name`` attribute is the path.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a user-facing message describing what went
        wrong (missing input, unreadable PDF, model/API failure).
    """
    if uploaded_file is None:
        return "Please upload a file before asking a question."
    if not user_question or not user_question.strip():
        return "Please enter a question before submitting."

    # Depending on the Gradio version / File `type` setting, the component
    # hands over either a plain path string or a tempfile-like object with
    # a .name attribute; accept both.
    file_path = getattr(uploaded_file, "name", uploaded_file)

    document_text = read_pdf(file_path)

    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."
    # read_pdf() reports failures as text with this prefix; show it to the
    # user instead of sending it to the model as if it were the document.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Client creation is inside the try so a missing or invalid API key
        # is reported like any other model error rather than crashing.
        client = initialize_groq()
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"
96
+
97
def create_interface():
    """Assemble the Gradio Blocks UI for asking questions about a PDF.

    Returns:
        The gr.Blocks object, ready to be launched by the caller.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the uploaded document")

        # Restrict uploads to PDFs: read_pdf() can only parse that format, so
        # other files would just produce an error after upload.
        file_input = gr.File(
            label="Upload a document (PDF)",
            file_count="single",
            file_types=[".pdf"],
        )

        question_input = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the uploaded document...",
        )

        # Read-only box that receives answer_question()'s return string.
        answer_output = gr.Textbox(label="Answer", interactive=False)

        submit_button = gr.Button("Ask")
        submit_button.click(
            fn=answer_question,
            inputs=[file_input, question_input],
            outputs=answer_output,
        )

    return demo
122
 
123
# Build and launch the UI only when this file is executed as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()