akazmi committed on
Commit
b373765
·
verified ·
1 Parent(s): 7d298ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -46
app.py CHANGED
@@ -1,102 +1,98 @@
1
  import gradio as gr
2
  import os
3
  from groq import Groq
4
- from PyPDF2 import PdfReader
 
 
 
 
 
5
 
6
- # Initialize Groq client
7
  def initialize_groq():
8
  return Groq(api_key=os.getenv("GROQ_API_KEY"))
9
 
10
- # Clean common typos in user questions
11
  def clean_question(user_question):
12
- corrections = {
13
- "slaps": "slabs",
14
- "salried": "salaried",
15
- "slabbs": "slabs"
16
- }
17
- for wrong, correct in corrections.items():
18
- user_question = user_question.replace(wrong, correct)
19
  return user_question
20
 
21
- # Read uploaded PDF and return its text
22
  def read_pdf(uploaded_file):
23
  try:
24
- reader = PdfReader(uploaded_file)
25
- text = ""
26
- for page in reader.pages:
27
- page_text = page.extract_text()
28
- if page_text:
29
- text += page_text
30
- return text
 
 
 
 
 
 
 
31
  except Exception as e:
32
- return f"Error reading PDF: {str(e)}"
33
 
34
- # Split text into chunks for retrieval
35
  def chunk_text(text, chunk_size=3000):
36
  return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
37
 
38
- # Basic keyword overlap similarity
39
  def similarity(query, text):
40
  query_words = set(query.lower().split())
41
  text_words = set(text.lower().split())
42
  return len(query_words & text_words)
43
 
44
- # Get most relevant chunk of document
45
  def retrieve_relevant_document(user_question, document_text):
46
  chunks = chunk_text(document_text)
47
- if not chunks:
48
- return "No readable content in the PDF."
49
- return max(chunks, key=lambda chunk: similarity(user_question, chunk))
50
 
51
- # Generate answer using Groq model
52
  def answer_question(file, user_question):
53
- if file is None:
54
  return "Please upload a PDF document."
55
 
56
  user_question = clean_question(user_question)
57
  document_text = read_pdf(file)
58
 
59
- if not document_text or "error" in document_text.lower():
60
- return "Unable to read document or it's empty."
61
 
62
  relevant_chunk = retrieve_relevant_document(user_question, document_text)
63
 
64
- # Build the prompt for the LLM
65
- prompt = f"""You are a tax and law expert. Read the document and answer the user query concisely.
66
-
67
  User Question: {user_question}
68
 
69
  Relevant Extract from Document:
70
  {relevant_chunk}
71
  """
72
 
73
- client = initialize_groq()
74
-
75
  try:
76
- chat_completion = client.chat.completions.create(
 
77
  messages=[{"role": "user", "content": prompt}],
78
- model="llama3-8b-8192",
79
  )
80
- return chat_completion.choices[0].message.content
81
  except Exception as e:
82
- return f"Error generating answer: {str(e)}"
 
 
83
 
84
- # Create Gradio Interface
85
  def create_interface():
86
  with gr.Blocks() as demo:
87
- gr.Markdown("## 📄 Legal Document Q&A Chatbot\nUpload a PDF and ask questions based on its contents.")
88
-
89
  file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
90
- question_input = gr.Textbox(label="Enter your question", placeholder="E.g., What are the tax slabs for salaried individuals?")
91
  answer_output = gr.Textbox(label="Answer")
92
 
93
- submit_btn = gr.Button("Ask")
94
-
95
- submit_btn.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
96
 
97
  return demo
98
 
99
- # Launch the app
100
  if __name__ == "__main__":
101
  demo = create_interface()
102
  demo.launch()
 
1
  import gradio as gr
2
  import os
3
  from groq import Groq
4
+ import pdfplumber
5
+ import pytesseract
6
+ from PIL import Image
7
+ from pdf2image import convert_from_path
8
+
9
+ # --- Helper Functions ---
10
 
 
def initialize_groq():
    """Build a Groq API client.

    The API key is read from the ``GROQ_API_KEY`` environment variable; when
    the variable is unset, ``None`` is handed to ``Groq`` unchanged.

    Returns:
        A new ``Groq`` client instance.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=api_key)
13
 
 
def clean_question(user_question):
    """Correct known misspellings in the user's question.

    Only whole words are corrected, so a typo that merely appears inside a
    longer word (e.g. "backslaps") is left alone. Matching ignores case and
    the replacement keeps the casing style of the word it replaces.

    Args:
        user_question: Raw question text; ``None`` or an empty string is
            treated as an empty question.

    Returns:
        The question with known typos corrected, or ``""`` for empty input.
    """
    import re

    corrections = {"slaps": "slabs", "salried": "salaried"}
    if not user_question:
        return ""

    # \b anchors keep the substitution from firing inside longer words.
    typo_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(typo) for typo in corrections) + r")\b",
        re.IGNORECASE,
    )

    def _fix(match):
        word = match.group(0)
        fixed = corrections[word.lower()]
        if word.isupper():
            return fixed.upper()
        if word[0].isupper():
            return fixed.capitalize()
        return fixed

    return typo_pattern.sub(_fix, user_question)
19
 
 
def read_pdf(uploaded_file):
    """Extract the text of an uploaded PDF, falling back to OCR.

    The embedded text layer is read with pdfplumber first. If that yields
    nothing but whitespace, each page is rendered to an image and passed
    through Tesseract instead.

    Args:
        uploaded_file: Either a filesystem path (what a ``gr.File`` created
            with ``type="filepath"`` delivers) or an object that exposes the
            path as ``.name``.

    Returns:
        The stripped document text. Failures are not raised: a message
        starting with ``"Error reading PDF:"`` is returned instead.
    """
    # Accept a plain path as well as a file wrapper: reading ``.name`` from a
    # str path raised AttributeError, so every upload ended in the error branch.
    pdf_path = getattr(uploaded_file, "name", uploaded_file)
    if not pdf_path:
        return "Error reading PDF: no file provided"

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Join pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        full_text = "\n".join(text for text in page_texts if text)

        if not full_text.strip():
            # OCR fallback for scanned PDFs that carry no text layer.
            page_images = convert_from_path(pdf_path)
            full_text = "\n".join(
                pytesseract.image_to_string(image) for image in page_images
            )
        return full_text.strip()
    except Exception as e:
        return f"Error reading PDF: {e}"
38
 
 
def chunk_text(text, chunk_size=3000):
    """Split text into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split; ``None`` or ``""`` yields no chunks.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings in document order; the last one may be shorter
        than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative size
            used to return an empty list and silently drop the document.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
41
 
 
def similarity(query, text):
    """Count the distinct words shared by ``query`` and ``text``.

    Comparison is case-insensitive. Leading and trailing punctuation is
    stripped from every word, so "slabs?" in a question matches "slabs:" in
    the document.

    Args:
        query: The user's question.
        text: A chunk of document text.

    Returns:
        The number of distinct words present in both inputs.
    """
    import string

    def _tokens(value):
        trimmed = (word.strip(string.punctuation) for word in value.lower().split())
        # Drop tokens that were punctuation only and are now empty.
        return {word for word in trimmed if word}

    return len(_tokens(query) & _tokens(text))
46
 
 
def retrieve_relevant_document(user_question, document_text):
    """Pick the document chunk that best matches the question.

    The document is split with ``chunk_text`` and each chunk is scored with
    ``similarity`` against the question; the first highest-scoring chunk wins.

    Returns:
        The best-matching chunk, or ``""`` when the document yields no chunks.
    """
    pieces = chunk_text(document_text)
    if not pieces:
        return ""

    def overlap(piece):
        return similarity(user_question, piece)

    return max(pieces, key=overlap)
 
 
50
 
 
def answer_question(file, user_question):
    """Answer a question about an uploaded PDF using a Groq chat model.

    Args:
        file: The uploaded PDF as passed by the Gradio file input; a falsy
            value means nothing was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when input is
        missing, the PDF cannot be read, or the API call fails. This function
        does not raise for those cases.
    """
    if not file:
        return "Please upload a PDF document."
    # Reject blank questions before doing any PDF or API work.
    if not (user_question or "").strip():
        return "Please enter a question."

    user_question = clean_question(user_question)
    document_text = read_pdf(file)

    if not document_text:
        return " Document appears empty or unreadable. Please try a different file."
    # read_pdf reports failures as a string instead of raising. Surface that
    # message rather than sending it to the model as if it were the document.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    prompt = (
        "You are a tax/legal assistant. Read the following extract and answer the user's query.\n"
        "\n"
        f"User Question: {user_question}\n"
        "\n"
        "Relevant Extract from Document:\n"
        f"{relevant_chunk}\n"
    )

    try:
        client = initialize_groq()
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            # The model can be overridden without a code change.
            # NOTE(review): confirm the default id is still served by Groq.
            model=os.getenv("GROQ_MODEL", "llama3-8b-8192"),
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {e}"
80
+
81
+ # --- Gradio UI ---
82
 
 
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is a heading, a PDF upload (delivered to the handler as a file
    path), a question box, an answer box, and an "Ask" button whose click
    runs ``answer_question`` on the upload and the question.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    with gr.Blocks() as app:
        gr.Markdown("## 📄 Legal Document Q&A\nUpload a PDF and ask questions based on its content.")

        pdf_upload = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        question_box = gr.Textbox(label="Your Question")
        answer_box = gr.Textbox(label="Answer")

        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[pdf_upload, question_box],
            outputs=answer_box,
        )

    return app
94
 
# Build and serve the UI only when this file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()