akazmi committed on
Commit
155b271
·
verified ·
1 Parent(s): 1ee87ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -37
app.py CHANGED
@@ -3,10 +3,25 @@ import os
3
  from groq import Groq
4
  from PyPDF2 import PdfReader
5
 
6
- # Function to read the uploaded PDF file and return its text
7
- def read_pdf(file_obj):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  try:
9
- reader = PdfReader(file_obj)
10
  text = ""
11
  for page in reader.pages:
12
  page_text = page.extract_text()
@@ -16,65 +31,72 @@ def read_pdf(file_obj):
16
  except Exception as e:
17
  return f"Error reading PDF: {str(e)}"
18
 
19
- # Chunk text for better performance with LLM
20
  def chunk_text(text, chunk_size=3000):
21
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
22
 
23
- # A simple similarity function (token overlap)
24
  def similarity(query, text):
25
  query_words = set(query.lower().split())
26
  text_words = set(text.lower().split())
27
- return len(query_words.intersection(text_words))
28
 
29
- # Retrieve the most relevant chunk for a given query
30
  def retrieve_relevant_document(user_question, document_text):
31
- text_chunks = chunk_text(document_text)
32
- return max(text_chunks, key=lambda chunk: similarity(user_question, chunk))
 
 
33
 
34
- # Initialize Groq client
35
- def initialize_groq():
36
- return Groq(api_key=os.getenv("GROQ_API_KEY"))
 
37
 
38
- # Handle question-answering using uploaded PDF
39
- def answer_question(uploaded_file, user_question):
40
- if uploaded_file is None:
41
- return "❗ Please upload a PDF document first."
42
-
43
- document_text = read_pdf(uploaded_file)
44
- if not document_text.strip():
45
- return "❗ No readable text found in the uploaded PDF."
46
 
47
  relevant_chunk = retrieve_relevant_document(user_question, document_text)
48
- prompt = f"{user_question}\n\nRelevant Document:\n{relevant_chunk}"
 
 
 
 
 
 
 
 
 
 
49
 
50
  try:
51
- client = initialize_groq()
52
- response = client.chat.completions.create(
53
  model="llama3-8b-8192",
54
- messages=[{"role": "user", "content": prompt}]
55
  )
56
- return response.choices[0].message.content
57
  except Exception as e:
58
  return f"Error generating answer: {str(e)}"
59
 
60
- # Gradio interface
61
  def create_interface():
62
  with gr.Blocks() as demo:
63
- gr.Markdown("### 🧠 Ask Questions Based on Your PDF Document")
64
 
65
- file_input = gr.File(label="Upload a PDF Document", type="binary")
66
- question_input = gr.Textbox(label="Enter your question")
67
- answer_output = gr.Textbox(label="Answer", interactive=False)
68
- ask_button = gr.Button("Ask")
69
 
70
- ask_button.click(
71
- fn=answer_question,
72
- inputs=[file_input, question_input],
73
- outputs=answer_output
74
- )
75
 
76
  return demo
77
 
 
78
  if __name__ == "__main__":
79
  demo = create_interface()
80
  demo.launch()
 
3
  from groq import Groq
4
  from PyPDF2 import PdfReader
5
 
6
+ # Initialize Groq client
7
+ def initialize_groq():
8
+ return Groq(api_key=os.getenv("GROQ_API_KEY"))
9
+
10
+ # Clean common typos in user questions
11
+ def clean_question(user_question):
12
+ corrections = {
13
+ "slaps": "slabs",
14
+ "salried": "salaried",
15
+ "slabbs": "slabs"
16
+ }
17
+ for wrong, correct in corrections.items():
18
+ user_question = user_question.replace(wrong, correct)
19
+ return user_question
20
+
21
+ # Read uploaded PDF and return its text
22
+ def read_pdf(uploaded_file):
23
  try:
24
+ reader = PdfReader(uploaded_file)
25
  text = ""
26
  for page in reader.pages:
27
  page_text = page.extract_text()
 
31
  except Exception as e:
32
  return f"Error reading PDF: {str(e)}"
33
 
34
+ # Split text into chunks for retrieval
35
  def chunk_text(text, chunk_size=3000):
36
+ return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
37
 
38
+ # Basic keyword overlap similarity
39
  def similarity(query, text):
40
  query_words = set(query.lower().split())
41
  text_words = set(text.lower().split())
42
+ return len(query_words & text_words)
43
 
44
+ # Get most relevant chunk of document
45
  def retrieve_relevant_document(user_question, document_text):
46
+ chunks = chunk_text(document_text)
47
+ if not chunks:
48
+ return "No readable content in the PDF."
49
+ return max(chunks, key=lambda chunk: similarity(user_question, chunk))
50
 
51
+ # Generate answer using Groq model
52
+ def answer_question(file, user_question):
53
+ if file is None:
54
+ return "Please upload a PDF document."
55
 
56
+ user_question = clean_question(user_question)
57
+ document_text = read_pdf(file)
58
+
59
+ if not document_text or "error" in document_text.lower():
60
+ return "Unable to read document or it's empty."
 
 
 
61
 
62
  relevant_chunk = retrieve_relevant_document(user_question, document_text)
63
+
64
+ # Build the prompt for the LLM
65
+ prompt = f"""You are a tax and law expert. Read the document and answer the user query concisely.
66
+
67
+ User Question: {user_question}
68
+
69
+ Relevant Extract from Document:
70
+ {relevant_chunk}
71
+ """
72
+
73
+ client = initialize_groq()
74
 
75
  try:
76
+ chat_completion = client.chat.completions.create(
77
+ messages=[{"role": "user", "content": prompt}],
78
  model="llama3-8b-8192",
 
79
  )
80
+ return chat_completion.choices[0].message.content
81
  except Exception as e:
82
  return f"Error generating answer: {str(e)}"
83
 
84
+ # Create Gradio Interface
85
  def create_interface():
86
  with gr.Blocks() as demo:
87
+ gr.Markdown("## 📄 Legal Document Q&A Chatbot\nUpload a PDF and ask questions based on its contents.")
88
 
89
+ file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
90
+ question_input = gr.Textbox(label="Enter your question", placeholder="E.g., What are the tax slabs for salaried individuals?")
91
+ answer_output = gr.Textbox(label="Answer")
 
92
 
93
+ submit_btn = gr.Button("Ask")
94
+
95
+ submit_btn.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
 
 
96
 
97
  return demo
98
 
99
+ # Launch the app
100
  if __name__ == "__main__":
101
  demo = create_interface()
102
  demo.launch()