Prashanthsrn committed on
Commit ae03db9 · verified · 1 Parent(s): 912f1b8

Update app.py

Files changed (1): app.py (+94, -27)
app.py CHANGED
@@ -4,6 +4,12 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 from sklearn.metrics.pairwise import cosine_similarity
+import logging
+import re
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Global variables to store models and processed data
 model = None
@@ -13,38 +19,88 @@ embeddings = None
 
 def load_models():
     global model, generator
-    if model is None or generator is None:
+    try:
         model = SentenceTransformer('all-MiniLM-L6-v2')
         generator = pipeline('text-generation', model='facebook/bart-large-cnn')
-    return "Models loaded successfully!"
+        return "✅ Models loaded successfully!"
+    except Exception as e:
+        logger.error(f"Error loading models: {e}")
+        return f"❌ Error loading models: {str(e)}"
+
+def clean_text(text):
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters and digits
+    text = re.sub(r'[^\w\s]', '', text)
+    return text.strip()
+
+def split_text(text, chunk_size=512):
+    # Split text into sentences (crude approximation)
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = ""
+
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) < chunk_size:
+            current_chunk += sentence + " "
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " "
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
 
 def extract_text_from_pdf(file):
-    global chunks, embeddings
+    global chunks, embeddings, model
+
+    if model is None:
+        return "❌ Please load the models first."
+
     if file is None:
-        return "Please upload a PDF file."
+        return "❌ Please upload a PDF file."
 
     try:
         pdf_reader = PyPDF2.PdfReader(file)
         full_text = ""
+
         for page in pdf_reader.pages:
-            full_text += page.extract_text()
+            text = page.extract_text()
+            if text:
+                cleaned_text = clean_text(text)
+                if cleaned_text:
+                    full_text += cleaned_text + " "
+
+        if not full_text.strip():
+            return "❌ No readable text found in the PDF. The file might be scanned or contain only images."
 
         # Split text into chunks
-        chunks = [full_text[i:i + 512] for i in range(0, len(full_text), 512)]
+        chunks = split_text(full_text)
+
+        if not chunks:
+            return "❌ Could not create meaningful text chunks from the PDF."
 
         # Generate embeddings
        embeddings = model.encode(chunks)
 
-        return f"PDF processed successfully! Extracted {len(chunks)} text chunks."
+        return f"✅ PDF processed successfully! Extracted {len(chunks)} text chunks."
     except Exception as e:
-        return f"Error processing PDF: {str(e)}"
+        logger.error(f"Error processing PDF: {e}")
+        return f"❌ Error processing PDF: {str(e)}"
 
 def answer_question(question):
+    global model, generator, chunks, embeddings
+
+    if model is None or generator is None:
+        return "❌ Please load the models first."
+
     if not chunks or embeddings is None:
-        return "Please upload a PDF document first."
+        return "❌ Please upload and process a PDF document first."
 
     if not question:
-        return "Please enter a question."
+        return "❌ Please enter a question."
 
     try:
         # Embed the question
@@ -56,36 +112,47 @@ def answer_question(question):
         context = chunks[most_similar_idx]
 
         # Generate answer
-        prompt = f"Question: {question}\nContext: {context}"
+        prompt = f"Question: {question}\nContext: {context}\nAnswer:"
         response = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
 
         return response
     except Exception as e:
-        return f"Error generating answer: {str(e)}"
+        logger.error(f"Error generating answer: {e}")
+        return f"❌ Error generating answer: {str(e)}"
 
 # Create the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# RAG Chatbot using Sentence-BERT and BART")
+with gr.Blocks(title="PDF Q&A Bot") as demo:
+    gr.Markdown("# PDF Question-Answering Bot")
+    gr.Markdown("### Step 1: Load the necessary models")
 
     with gr.Row():
-        with gr.Column():
-            load_button = gr.Button("Load Models")
-            model_status = gr.Textbox(label="Model Status")
-            load_button.click(load_models, outputs=model_status)
+        load_button = gr.Button("1️⃣ Load Models", variant="primary")
+        model_status = gr.Textbox(label="Model Status", interactive=False)
 
+    gr.Markdown("### Step 2: Upload a PDF document")
     with gr.Row():
-        with gr.Column():
-            pdf_input = gr.File(label="Upload PDF")
-            pdf_status = gr.Textbox(label="PDF Status")
-            pdf_input.change(extract_text_from_pdf, inputs=pdf_input, outputs=pdf_status)
+        pdf_input = gr.File(label="2️⃣ Upload PDF")
+        pdf_status = gr.Textbox(label="PDF Status", interactive=False)
 
+    gr.Markdown("### Step 3: Ask questions about the document")
     with gr.Row():
-        with gr.Column():
-            question_input = gr.Textbox(label="Ask a question about the PDF")
-            answer_output = gr.Textbox(label="Answer")
-            question_button = gr.Button("Get Answer")
+        question_input = gr.Textbox(label="3️⃣ Ask a question about the PDF")
+        answer_button = gr.Button("Get Answer", variant="primary")
+        answer_output = gr.Textbox(label="Answer", interactive=False)
+
+    # Event handlers
+    load_button.click(load_models, outputs=model_status)
+    pdf_input.change(extract_text_from_pdf, inputs=pdf_input, outputs=pdf_status)
+    answer_button.click(answer_question, inputs=question_input, outputs=answer_output)
+
+    gr.Markdown("""
+    ## How to use:
+    1. Click 'Load Models' and wait for confirmation
+    2. Upload a PDF document and wait for it to be processed
+    3. Type your question and click 'Get Answer'
 
-    question_button.click(answer_question, inputs=question_input, outputs=answer_output)
+    Note: This tool works best with PDFs that contain readable text. It may not work well with scanned documents or PDFs that are primarily images.
+    """)
 
 # Launch the app
 demo.launch()
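Some notes on the change. The new `split_text` replaces the old fixed 512-character slicing with sentence packing, so chunks no longer cut words or sentences in half. A quick standalone check of its behavior (the function body is copied from this commit; the sample text and `chunk_size=20` are made up for illustration):

```python
import re

def split_text(text, chunk_size=512):
    # As committed: split on sentence boundaries, then pack whole
    # sentences into chunks of at most ~chunk_size characters.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text = "One. Two two. Three three three. Four four four four."
print(split_text(text, chunk_size=20))
# -> ['One. Two two.', 'Three three three.', 'Four four four four.']
```

A sentence longer than `chunk_size` still ends up as its own oversized chunk, since the packing loop never splits inside a sentence.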
 
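One interaction worth flagging: `clean_text` runs on every page before `split_text` ever sees the text, and its `[^\w\s]` pattern strips `.`, `!`, and `?` (while keeping digits, despite the "and digits" comment, since `\w` matches them). With the sentence terminators gone, the sentence regex in `split_text` finds no boundaries, so the whole document tends to collapse into a single oversized chunk. A minimal variant that keeps the terminators (hypothetical, not part of this commit):

```python
import re

def clean_text(text):
    # Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Drop punctuation but keep . ! ? so the downstream
    # sentence splitter still has boundaries to work with.
    text = re.sub(r'[^\w\s.!?]', '', text)
    return text.strip()

print(clean_text("Revenue grew  12.5%!\nNext line."))
# -> 'Revenue grew 12.5! Next line.'
```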
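The `@@ -56,36 +112,47 @@` hunk hides the retrieval lines between `# Embed the question` and `context = chunks[most_similar_idx]`. Given the `numpy` and `cosine_similarity` imports and the variable name, the elided step presumably looks something like this (a plausible reconstruction, not the commit's exact code):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Plausible reconstruction of the elided retrieval step:
# embed the question, score it against every chunk embedding,
# and take the best-matching chunk as context.
question_embedding = model.encode([question])                     # shape (1, dim)
similarities = cosine_similarity(question_embedding, embeddings)  # shape (1, n_chunks)
most_similar_idx = int(np.argmax(similarities))
context = chunks[most_similar_idx]
```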
 
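A caveat on a line this commit leaves untouched: `facebook/bart-large-cnn` is a seq2seq checkpoint fine-tuned for summarization, while the `text-generation` pipeline task targets causal language models, so the pairing may behave unexpectedly and tends to produce poor answers. If BART is the intended model, the `text2text-generation` task is the usual fit for encoder-decoder checkpoints; a sketch of that alternative, not what the commit does:

```python
from transformers import pipeline

# Sketch: drive the encoder-decoder checkpoint through the
# seq2seq pipeline task instead of the causal-LM one.
generator = pipeline('text2text-generation', model='facebook/bart-large-cnn')

prompt = "Question: What is the report about?\nContext: ...\nAnswer:"
response = generator(prompt, max_length=100)[0]['generated_text']
print(response)
```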