Spaces:

Rezuwan
/

USB_Unmad_Satirical_Bot

Sleeping

App Files Files Community

Rezuwan committed on Aug 28, 2025

Commit

0633b39

verified ·

1 Parent(s): a016803

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -74

app.py CHANGED Viewed

@@ -33,69 +33,19 @@ vectorstore = FAISS.load_local(
     "faiss_index_unmad_magz", embeddings, allow_dangerous_deserialization=True
 )
-def clean_bangla_content(text):
-    """
-    Clean the retrieved content to remove English watermarks, scan text, and unwanted content.
-    Keep only Bengali content.
-    """
-    # Common English watermarks and scan text to remove
-    english_patterns = [
-        r'scanned by \w+',
-        r'found in \w+',
-        r'www\.\w+\.\w+',
-        r'http[s]?://[^\s]+',
-        r'\.pdf',
-        r'\.com',
-        r'\.org',
-        r'\.net',
-        r'banglapdf',
-        r'sadaqpdf',
-        r'pdf scanner',
-        r'scan by',
-        r'converted by',
-        r'page \d+',
-        r'source:',
-        r'reference:',
-        r'[a-zA-Z]+@[a-zA-Z]+\.[a-zA-Z]+',  # emails
-        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',     # English names
-        r'\b[A-Z]{2,}\b',                    # Uppercase abbreviations
-    ]
-    # Remove lines containing English patterns
-    lines = text.split('\n')
-    cleaned_lines = []
-    for line in lines:
-        line = line.strip()
-        # Skip empty lines
-        if not line:
-            continue
-        # Check if line contains English patterns
-        contains_english = False
-        for pattern in english_patterns:
-            if re.search(pattern, line, re.IGNORECASE):
-                contains_english = True
-                break
-        # Check if line is mostly English (contains more English than Bengali)
-        english_chars = len(re.findall(r'[a-zA-Z]', line))
-        bengali_chars = len(re.findall(r'[\u0980-\u09FF]', line))  # Bengali Unicode range
-        # If line has more English than Bengali, skip it
-        if english_chars > bengali_chars and english_chars > 3:
-            contains_english = True
-        # Only keep lines that don't contain English patterns and have Bengali content
-        if not contains_english and bengali_chars > 0:
-            cleaned_lines.append(line)
-    return '\n'.join(cleaned_lines)
 def maximal_marginal_relevance_search(query, vectorstore, k=10, lambda_param=0.5, top_k=3):
     """
     Implement Maximal Marginal Relevance (MMR) for diverse document retrieval.
     """
     # Get initial candidate documents (more than needed)
     candidate_docs = vectorstore.similarity_search_with_score(query, k=k)
@@ -163,18 +113,89 @@ llm = ChatOpenAI(
     openai_api_key=OPENAI_API_KEY
 )
-# Satirical QA function with MMR and content cleaning
-def custom_unmad_satirical_bot(message, history, top_k=3):
-    # Use MMR search with default parameters
     docs = maximal_marginal_relevance_search(
         query=message,
         vectorstore=vectorstore,
         k=15,  # Consider more candidates for better diversity
-        lambda_param=0.6,  # Slightly favor relevance over diversity
         top_k=top_k
     )
-    # Extract context from MMR-selected documents with cleaning
     if docs:
         # Clean each document's content before joining
         cleaned_contexts = []
@@ -187,8 +208,20 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
             top_contexts = "\n\n---\n\n".join(cleaned_contexts)
         else:
             top_contexts = "প্রাসঙ্গিক বাংলা তথ্য পাওয়া যায়নি।"
     else:
         top_contexts = "কোন প্রাসঙ্গিক তথ্য পাওয়া যায়নি।"
     messages = [
         SystemMessage(content="""
@@ -202,9 +235,11 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
 ৪। প্রসঙ্গের মধ্যে যেসব ইংরেজি টেক্সট, স্ক্যান ওয়াটারমার্ক, ওয়েবসাইট নাম, বা প্রযুক্তিগত শব্দ আছে সেগুলো একেবারেই উল্লেখ করবে না।
 ৫। শুধুমাত্র বাংলা ভাষায় লেখা বিষয়বস্তু ব্যবহার করবে।
 ৬। যদি প্রসঙ্গে কোন বাংলা কন্টেন্ট না থাকে, তাহলে নিজের সাধারণ জ্ঞান দিয়ে উত্তর দেবে।
 """),
         HumanMessage(content=f"""
-প্রসঙ্গ (নির্বাচিত বাংলা তথ্য):
 {top_contexts}
 প্রশ্ন: {message}
@@ -217,25 +252,68 @@ def custom_unmad_satirical_bot(message, history, top_k=3):
     history.append((message, response))
     return "", history
-# Gradio UI
 with gr.Blocks(css=".gradio-container {padding-top: 80px;}") as demo:
-    gr.Markdown("# USB: Unmad Satirical Bot", elem_id="title", elem_classes="title-text")
     with gr.Row():
         gr.Image("images/c1.png", width=450, show_label=False, container=False)
-    chatbot = gr.Chatbot()
     with gr.Row():
-        msg = gr.Textbox(placeholder="কি চলে আপনার মনে বলেন শুনি?", scale=8, show_label=False)
         send = gr.Button("Send", variant="primary", scale=1)
-    clear = gr.Button("Clear")
     state = gr.State([])
-    # Connect interaction
-    msg.submit(custom_unmad_satirical_bot, [msg, state], [msg, chatbot])
-    send.click(custom_unmad_satirical_bot, [msg, state], [msg, chatbot])
     clear.click(lambda: ([], ""), None, [chatbot, msg])
 if __name__ == "__main__":

     "faiss_index_unmad_magz", embeddings, allow_dangerous_deserialization=True
 )
 def maximal_marginal_relevance_search(query, vectorstore, k=10, lambda_param=0.5, top_k=3):
     """
     Implement Maximal Marginal Relevance (MMR) for diverse document retrieval.
+    Args:
+        query: Search query string
+        vectorstore: FAISS vectorstore instance
+        k: Number of candidate documents to consider
+        lambda_param: Trade-off between relevance and diversity (0-1)
+        top_k: Number of final documents to return
+    Returns:
+        List of selected documents with MMR ranking
     """
     # Get initial candidate documents (more than needed)
     candidate_docs = vectorstore.similarity_search_with_score(query, k=k)
     openai_api_key=OPENAI_API_KEY
 )
+def clean_bangla_content(text):
+    """
+    Clean the retrieved content to remove English watermarks, scan text, and unwanted content.
+    Keep only Bengali content.
+    """
+    import re
+    # Common English watermarks and scan text to remove
+    english_patterns = [
+        r'scanned by \w+',
+        r'found in \w+',
+        r'www\.\w+\.\w+',
+        r'http[s]?://[^\s]+',
+        r'\.pdf',
+        r'\.com',
+        r'\.org',
+        r'\.net',
+        r'banglapdf',
+        r'sadaqpdf',
+        r'pdf scanner',
+        r'scan by',
+        r'converted by',
+        r'page \d+',
+        r'source:',
+        r'reference:',
+        r'[a-zA-Z]+@[a-zA-Z]+\.[a-zA-Z]+',  # emails
+        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',     # English names
+        r'\b[A-Z]{2,}\b',                    # Uppercase abbreviations
+    ]
+    # Remove lines containing English patterns
+    lines = text.split('\n')
+    cleaned_lines = []
+    for line in lines:
+        line = line.strip()
+        # Skip empty lines
+        if not line:
+            continue
+        # Check if line contains English patterns
+        contains_english = False
+        for pattern in english_patterns:
+            if re.search(pattern, line, re.IGNORECASE):
+                contains_english = True
+                break
+        # Check if line is mostly English (contains more English than Bengali)
+        english_chars = len(re.findall(r'[a-zA-Z]', line))
+        bengali_chars = len(re.findall(r'[\u0980-\u09FF]', line))  # Bengali Unicode range
+        # If line has more English than Bengali, skip it
+        if english_chars > bengali_chars and english_chars > 3:
+            contains_english = True
+        # Only keep lines that don't contain English patterns and have Bengali content
+        if not contains_english and bengali_chars > 0:
+            cleaned_lines.append(line)
+    return '\n'.join(cleaned_lines)
+# Enhanced Satirical QA function with MMR and content cleaning
+def custom_unmad_satirical_bot(message, history, top_k=3, lambda_param=0.6):
+    """
+    Enhanced satirical bot using MMR for diverse and relevant content retrieval.
+    Args:
+        message: User query
+        history: Chat history
+        top_k: Number of documents to retrieve
+        lambda_param: MMR trade-off (0.6 = slightly favor relevance over diversity)
+    """
+    # Use MMR search instead of standard retriever
     docs = maximal_marginal_relevance_search(
         query=message,
         vectorstore=vectorstore,
         k=15,  # Consider more candidates for better diversity
+        lambda_param=lambda_param,
         top_k=top_k
     )
+    # Extract context from MMR-selected documents
     if docs:
         # Clean each document's content before joining
         cleaned_contexts = []
             top_contexts = "\n\n---\n\n".join(cleaned_contexts)
         else:
             top_contexts = "প্রাসঙ্গিক বাংলা তথ্য পাওয়া যায়নি।"
+        # Add metadata about source diversity (optional)
+        source_info = []
+        for i, doc in enumerate(docs, 1):
+            source = doc.metadata.get('source', 'অজানা উৎস')
+            page = doc.metadata.get('page', 'অজানা পৃষ্ঠা')
+            # Clean source info too
+            if not re.search(r'[a-zA-Z]', source):  # Only if source doesn't contain English
+                source_info.append(f"[{i}] {source} - {page}")
+        source_context = "উৎস: " + " | ".join(source_info[:3]) if source_info else ""  # Removed emoji
     else:
         top_contexts = "কোন প্রাসঙ্গিক তথ্য পাওয়া যায়নি।"
+        source_context = ""
     messages = [
         SystemMessage(content="""
 ৪। প্রসঙ্গের মধ্যে যেসব ইংরেজি টেক্সট, স্ক্যান ওয়াটারমার্ক, ওয়েবসাইট নাম, বা প্রযুক্তিগত শব্দ আছে সেগুলো একেবারেই উল্লেখ করবে না।
 ৫। শুধুমাত্র বাংলা ভাষায় লেখা বিষয়বস্তু ব্যবহার করবে।
 ৬। যদি প্রসঙ্গে কোন বাংলা কন্টেন্ট না থাকে, তাহলে নিজের সাধারণ জ্ঞান দিয়ে উত্তর দেবে।
+৭। বিভিন্ন উৎস থেকে তথ্য মিলিয়ে একটি সমন্বিত উত্তর দেবে।
+৮। কোন ধরনের ওয়েবসাইট বা পিডিএফ রেফারেন্স দেবে না।
 """),
         HumanMessage(content=f"""
+প্রসঙ্গ (বিভিন্ন উৎস থেকে সংগৃহীত):
 {top_contexts}
 প্রশ্ন: {message}
     history.append((message, response))
     return "", history
+# Enhanced Gradio UI with MMR controls
 with gr.Blocks(css=".gradio-container {padding-top: 80px;}") as demo:
+    gr.Markdown("# USB: Unmad Satirical Bot (with MMR)", elem_id="title", elem_classes="title-text")
+    gr.Markdown("### 🔍 Enhanced with Maximal Marginal Relevance for diverse content retrieval")
     with gr.Row():
         gr.Image("images/c1.png", width=450, show_label=False, container=False)
+    with gr.Row():
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot()
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ MMR Settings")
+            lambda_slider = gr.Slider(
+                minimum=0.0,
+                maximum=1.0,
+                value=0.6,
+                step=0.1,
+                label="λ (Relevance vs Diversity)",
+                info="0.0 = Pure Diversity, 1.0 = Pure Relevance"
+            )
+            top_k_slider = gr.Slider(
+                minimum=1,
+                maximum=8,
+                value=3,
+                step=1,
+                label="Documents to Retrieve",
+                info="Number of diverse documents"
+            )
+            gr.Markdown()
     with gr.Row():
+        msg = gr.Textbox(
+            placeholder="কি চলে আপনার মনে বলেন শুনি?",
+            scale=8,
+            show_label=False
+        )
         send = gr.Button("Send", variant="primary", scale=1)
+    clear = gr.Button("Clear Chat")
     state = gr.State([])
+    # Connect interactions with MMR parameters
+    def chat_with_mmr(message, history, lambda_val, top_k_val):
+        return custom_unmad_satirical_bot(message, history, top_k=int(top_k_val), lambda_param=lambda_val)
+    msg.submit(
+        chat_with_mmr,
+        [msg, state, lambda_slider, top_k_slider],
+        [msg, chatbot]
+    )
+    send.click(
+        chat_with_mmr,
+        [msg, state, lambda_slider, top_k_slider],
+        [msg, chatbot]
+    )
     clear.click(lambda: ([], ""), None, [chatbot, msg])
 if __name__ == "__main__":