Rezuwan committed on
Commit
0633b39
·
verified ·
1 Parent(s): a016803

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -74
app.py CHANGED
@@ -33,69 +33,19 @@ vectorstore = FAISS.load_local(
33
  "faiss_index_unmad_magz", embeddings, allow_dangerous_deserialization=True
34
  )
35
 
36
- def clean_bangla_content(text):
37
- """
38
- Clean the retrieved content to remove English watermarks, scan text, and unwanted content.
39
- Keep only Bengali content.
40
- """
41
- # Common English watermarks and scan text to remove
42
- english_patterns = [
43
- r'scanned by \w+',
44
- r'found in \w+',
45
- r'www\.\w+\.\w+',
46
- r'http[s]?://[^\s]+',
47
- r'\.pdf',
48
- r'\.com',
49
- r'\.org',
50
- r'\.net',
51
- r'banglapdf',
52
- r'sadaqpdf',
53
- r'pdf scanner',
54
- r'scan by',
55
- r'converted by',
56
- r'page \d+',
57
- r'source:',
58
- r'reference:',
59
- r'[a-zA-Z]+@[a-zA-Z]+\.[a-zA-Z]+', # emails
60
- r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', # English names
61
- r'\b[A-Z]{2,}\b', # Uppercase abbreviations
62
- ]
63
-
64
- # Remove lines containing English patterns
65
- lines = text.split('\n')
66
- cleaned_lines = []
67
-
68
- for line in lines:
69
- line = line.strip()
70
-
71
- # Skip empty lines
72
- if not line:
73
- continue
74
-
75
- # Check if line contains English patterns
76
- contains_english = False
77
- for pattern in english_patterns:
78
- if re.search(pattern, line, re.IGNORECASE):
79
- contains_english = True
80
- break
81
-
82
- # Check if line is mostly English (contains more English than Bengali)
83
- english_chars = len(re.findall(r'[a-zA-Z]', line))
84
- bengali_chars = len(re.findall(r'[\u0980-\u09FF]', line)) # Bengali Unicode range
85
-
86
- # If line has more English than Bengali, skip it
87
- if english_chars > bengali_chars and english_chars > 3:
88
- contains_english = True
89
-
90
- # Only keep lines that don't contain English patterns and have Bengali content
91
- if not contains_english and bengali_chars > 0:
92
- cleaned_lines.append(line)
93
-
94
- return '\n'.join(cleaned_lines)
95
-
96
  def maximal_marginal_relevance_search(query, vectorstore, k=10, lambda_param=0.5, top_k=3):
97
  """
98
  Implement Maximal Marginal Relevance (MMR) for diverse document retrieval.
 
 
 
 
 
 
 
 
 
 
99
  """
100
  # Get initial candidate documents (more than needed)
101
  candidate_docs = vectorstore.similarity_search_with_score(query, k=k)
@@ -163,18 +113,89 @@ llm = ChatOpenAI(
163
  openai_api_key=OPENAI_API_KEY
164
  )
165
 
166
- # Satirical QA function with MMR and content cleaning
167
- def custom_unmad_satirical_bot(message, history, top_k=3):
168
- # Use MMR search with default parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  docs = maximal_marginal_relevance_search(
170
  query=message,
171
  vectorstore=vectorstore,
172
  k=15, # Consider more candidates for better diversity
173
- lambda_param=0.6, # Slightly favor relevance over diversity
174
  top_k=top_k
175
  )
176
 
177
- # Extract context from MMR-selected documents with cleaning
178
  if docs:
179
  # Clean each document's content before joining
180
  cleaned_contexts = []
@@ -187,8 +208,20 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
187
  top_contexts = "\n\n---\n\n".join(cleaned_contexts)
188
  else:
189
  top_contexts = "প্রাসঙ্গিক বাংলা তথ্য পাওয়া যায়নি।"
 
 
 
 
 
 
 
 
 
 
 
190
  else:
191
  top_contexts = "কোন প্রাসঙ্গিক তথ্য পাওয়া যায়নি।"
 
192
 
193
  messages = [
194
  SystemMessage(content="""
@@ -202,9 +235,11 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
202
  ৪। প্রসঙ্গের মধ্যে যেসব ইংরেজি টেক্সট, স্ক্যান ওয়াটারমার্ক, ওয়েবসাইট নাম, বা প্রযুক্তিগত শব্দ আছে সেগুলো একেবারেই উল্লেখ করবে না।
203
  ৫। শুধুমাত্র বাংলা ভাষায় লেখা বিষয়বস্তু ব্যবহার করবে।
204
  ৬। যদি প্রসঙ্গে কোন বাংলা কন্টেন্ট না থাকে, তাহলে নিজের সাধারণ জ্ঞান দিয়ে উত্তর দেবে।
 
 
205
  """),
206
  HumanMessage(content=f"""
207
- প্রসঙ্গ (নির্বাচিত বাংলা তথ্য):
208
  {top_contexts}
209
 
210
  প্রশ্ন: {message}
@@ -217,25 +252,68 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
217
  history.append((message, response))
218
  return "", history
219
 
220
- # Gradio UI
221
  with gr.Blocks(css=".gradio-container {padding-top: 80px;}") as demo:
222
- gr.Markdown("# USB: Unmad Satirical Bot", elem_id="title", elem_classes="title-text")
 
223
 
224
  with gr.Row():
225
  gr.Image("images/c1.png", width=450, show_label=False, container=False)
226
 
227
- chatbot = gr.Chatbot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  with gr.Row():
230
- msg = gr.Textbox(placeholder="কি চলে আপনার মনে বলেন শুনি?", scale=8, show_label=False)
 
 
 
 
231
  send = gr.Button("Send", variant="primary", scale=1)
232
 
233
- clear = gr.Button("Clear")
234
  state = gr.State([])
235
 
236
- # Connect interaction
237
- msg.submit(custom_unmad_satirical_bot, [msg, state], [msg, chatbot])
238
- send.click(custom_unmad_satirical_bot, [msg, state], [msg, chatbot])
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  clear.click(lambda: ([], ""), None, [chatbot, msg])
240
 
241
  if __name__ == "__main__":
 
33
  "faiss_index_unmad_magz", embeddings, allow_dangerous_deserialization=True
34
  )
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def maximal_marginal_relevance_search(query, vectorstore, k=10, lambda_param=0.5, top_k=3):
37
  """
38
  Implement Maximal Marginal Relevance (MMR) for diverse document retrieval.
39
+
40
+ Args:
41
+ query: Search query string
42
+ vectorstore: FAISS vectorstore instance
43
+ k: Number of candidate documents to consider
44
+ lambda_param: Trade-off between relevance and diversity (0-1)
45
+ top_k: Number of final documents to return
46
+
47
+ Returns:
48
+ List of selected documents with MMR ranking
49
  """
50
  # Get initial candidate documents (more than needed)
51
  candidate_docs = vectorstore.similarity_search_with_score(query, k=k)
 
113
  openai_api_key=OPENAI_API_KEY
114
  )
115
 
116
def clean_bangla_content(text):
    """
    Strip English watermarks, scan credits and other non-Bengali lines from text.

    The text is processed line by line. A line is kept only when all of the
    following hold:

    * it is not blank after stripping surrounding whitespace;
    * it matches none of the watermark patterns (URLs, e-mail addresses,
      "scanned by ...", page markers, ...), compared case-insensitively;
    * it contains no capitalised English name ("John Smith") and no
      upper-case abbreviation ("PDF"), compared case-sensitively;
    * it is not dominated by Latin letters (more Latin than Bengali letters
      and more than three Latin letters);
    * it contains at least one character from the Bengali Unicode block.

    Args:
        text: Raw document content, possibly spanning several lines.
            ``None`` and the empty string are accepted.

    Returns:
        The surviving lines, stripped and joined with newlines. An empty
        string when nothing survives.
    """
    import re

    if not text:
        return ""

    # Watermark / scan-credit fragments. Letter case carries no meaning here,
    # so these are matched case-insensitively.
    watermark_patterns = (
        r'scanned by \w+',
        r'found in \w+',
        r'www\.\w+\.\w+',
        r'http[s]?://[^\s]+',
        r'\.pdf',
        r'\.com',
        r'\.org',
        r'\.net',
        r'banglapdf',
        r'sadaqpdf',
        r'pdf scanner',
        r'scan by',
        r'converted by',
        r'page \d+',
        r'source:',
        r'reference:',
        r'[a-zA-Z]+@[a-zA-Z]+\.[a-zA-Z]+',  # emails
    )
    # These two patterns are defined by letter case. Matching them with
    # re.IGNORECASE would make them hit every Latin word of two or more
    # letters, discarding Bengali lines that merely contain one stray word.
    cased_patterns = (
        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # English names
        r'\b[A-Z]{2,}\b',  # Uppercase abbreviations
    )

    # Compile once per call instead of searching every pattern on every line.
    watermark_re = re.compile(
        '|'.join(f'(?:{p})' for p in watermark_patterns), re.IGNORECASE
    )
    cased_re = re.compile('|'.join(f'(?:{p})' for p in cased_patterns))
    latin_re = re.compile(r'[a-zA-Z]')
    bengali_re = re.compile(r'[\u0980-\u09FF]')  # Bengali Unicode range

    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if watermark_re.search(line) or cased_re.search(line):
            continue

        latin_count = len(latin_re.findall(line))
        bengali_count = len(bengali_re.findall(line))

        # Lines without any Bengali content are never useful as context.
        if bengali_count == 0:
            continue
        # Mostly-English line: more Latin than Bengali letters, and more than
        # a token amount of Latin text.
        if latin_count > bengali_count and latin_count > 3:
            continue

        kept.append(line)

    return '\n'.join(kept)
177
+
178
+ # Enhanced Satirical QA function with MMR and content cleaning
179
+ def custom_unmad_satirical_bot(message, history, top_k=3, lambda_param=0.6):
180
+ """
181
+ Enhanced satirical bot using MMR for diverse and relevant content retrieval.
182
+
183
+ Args:
184
+ message: User query
185
+ history: Chat history
186
+ top_k: Number of documents to retrieve
187
+ lambda_param: MMR trade-off (0.6 = slightly favor relevance over diversity)
188
+ """
189
+ # Use MMR search instead of standard retriever
190
  docs = maximal_marginal_relevance_search(
191
  query=message,
192
  vectorstore=vectorstore,
193
  k=15, # Consider more candidates for better diversity
194
+ lambda_param=lambda_param,
195
  top_k=top_k
196
  )
197
 
198
+ # Extract context from MMR-selected documents
199
  if docs:
200
  # Clean each document's content before joining
201
  cleaned_contexts = []
 
208
  top_contexts = "\n\n---\n\n".join(cleaned_contexts)
209
  else:
210
  top_contexts = "প্রাসঙ্গিক বাংলা তথ্য পাওয়া যায়নি।"
211
+
212
+ # Add metadata about source diversity (optional)
213
+ source_info = []
214
+ for i, doc in enumerate(docs, 1):
215
+ source = doc.metadata.get('source', 'অজানা উৎস')
216
+ page = doc.metadata.get('page', 'অজানা পৃষ্ঠা')
217
+ # Clean source info too
218
+ if not re.search(r'[a-zA-Z]', source): # Only if source doesn't contain English
219
+ source_info.append(f"[{i}] {source} - {page}")
220
+
221
+ source_context = "উৎস: " + " | ".join(source_info[:3]) if source_info else "" # Removed emoji
222
  else:
223
  top_contexts = "কোন প্রাসঙ্গিক তথ্য পাওয়া যায়নি।"
224
+ source_context = ""
225
 
226
  messages = [
227
  SystemMessage(content="""
 
235
  ৪। প্রসঙ্গের মধ্যে যেসব ইংরেজি টেক্সট, স্ক্যান ওয়াটারমার্ক, ওয়েবসাইট নাম, বা প্রযুক্তিগত শব্দ আছে সেগুলো একেবারেই উল্লেখ করবে না।
236
  ৫। শুধুমাত্র বাংলা ভাষায় লেখা বিষয়বস্তু ব্যবহার করবে।
237
  ৬। যদি প্রসঙ্গে কোন বাংলা কন্টেন্ট না থাকে, তাহলে নিজের সাধারণ জ্ঞান দিয়ে উত্তর দেবে।
238
+ ৭। বিভিন্ন উৎস থেকে তথ্য মিলিয়ে একটি সমন্বিত উত্তর দেবে।
239
+ ৮। কোন ধরনের ওয়েবসাইট বা পিডিএফ রেফারেন্স দেবে না।
240
  """),
241
  HumanMessage(content=f"""
242
+ প্রসঙ্গ (বিভিন্ন উৎস থেকে সংগৃহীত):
243
  {top_contexts}
244
 
245
  প্রশ্ন: {message}
 
252
  history.append((message, response))
253
  return "", history
254
 
255
+ # Enhanced Gradio UI with MMR controls
256
# NOTE(review): the nesting below was reconstructed from a flattened diff view;
# confirm the Row/Column layout against the real app.py before merging.
with gr.Blocks(css=".gradio-container {padding-top: 80px;}") as demo:
    gr.Markdown("# USB: Unmad Satirical Bot (with MMR)", elem_id="title", elem_classes="title-text")
    gr.Markdown("### 🔍 Enhanced with Maximal Marginal Relevance for diverse content retrieval")

    with gr.Row():
        gr.Image("images/c1.png", width=450, show_label=False, container=False)

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot()

        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ MMR Settings")

            lambda_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.6,
                step=0.1,
                label="λ (Relevance vs Diversity)",
                info="0.0 = Pure Diversity, 1.0 = Pure Relevance",
            )

            top_k_slider = gr.Slider(
                minimum=1,
                maximum=8,
                value=3,
                step=1,
                label="Documents to Retrieve",
                info="Number of diverse documents",
            )

            gr.Markdown()

    with gr.Row():
        msg = gr.Textbox(
            placeholder="কি চলে আপনার মনে বলেন শুনি?",
            scale=8,
            show_label=False,
        )
        send = gr.Button("Send", variant="primary", scale=1)

    clear = gr.Button("Clear Chat")
    # Per-session chat history as a list of (user_message, bot_response) pairs.
    state = gr.State([])

    def chat_with_mmr(message, history, lambda_val, top_k_val):
        """
        Forward a chat turn to the bot using the slider-selected MMR settings.

        Args:
            message: Text currently in the input box.
            history: Chat history held in ``state``.
            lambda_val: Value of the relevance/diversity slider.
            top_k_val: Value of the "Documents to Retrieve" slider.

        Returns:
            A ``(textbox_value, chatbot_value, state_value)`` triple.
        """
        # Do not spend a retrieval + LLM call on an empty submission.
        if not message or not message.strip():
            return "", history, history

        cleared_input, updated_history = custom_unmad_satirical_bot(
            message,
            history,
            top_k=int(top_k_val),
            lambda_param=float(lambda_val),
        )
        # The history is written back to `state` explicitly rather than
        # relying on the bot mutating the list in place.
        return cleared_input, updated_history, updated_history

    # Pressing Enter in the textbox and clicking "Send" trigger the same handler.
    for register in (msg.submit, send.click):
        register(
            chat_with_mmr,
            [msg, state, lambda_slider, top_k_slider],
            [msg, chatbot, state],
        )

    # Reset the stored history too; clearing only the widgets would make the
    # old conversation reappear on the next message.
    clear.click(lambda: ([], "", []), None, [chatbot, msg, state])
318
 
319
  if __name__ == "__main__":