Spaces:
Running
Running
| import os | |
| import re | |
| import json | |
| import warnings | |
| from typing import List, Dict, Any, Optional | |
| import lancedb | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| from openai import OpenAI | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Work around a Gradio schema-parsing issue: replace gradio_client's
# json_schema_to_python_type with a stub that always answers "string".
try:
    import gradio_client.utils
    gradio_client.utils.json_schema_to_python_type = lambda schema, defs=None: "string"
except ImportError:
    # gradio_client.utils is not importable, so there is nothing to patch.
    pass

# Load environment variables (a local .env file is honoured via python-dotenv).
load_dotenv()

# NOTE: the key is read from OPENAI_API_KEY_Project, not OPENAI_API_KEY, so the
# error message names that exact variable to avoid sending users to the wrong one.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_Project")
if not OPENAI_API_KEY:
    raise ValueError(
        "Missing OPENAI_API_KEY_Project. Please set it in your environment variables."
    )

# Suppress all warnings process-wide.
warnings.filterwarnings("ignore")
class LanceDBRAG:
    """Embed queries with OpenAI and run vector searches against a LanceDB table."""

    def __init__(self,
                 db_path: str = "lance_unmad_db",
                 table_name: str = "unmad_documents"):
        """Create the OpenAI client and open the LanceDB table.

        Args:
            db_path: Path handed to ``lancedb.connect``.
            table_name: Name of the table opened from that database.

        Raises:
            ConnectionError: If connecting to the database or opening the
                table fails; the underlying exception is chained as the cause.
        """
        self.db_path = db_path
        self.table_name = table_name

        # Initialize OpenAI client
        self.client = OpenAI(api_key=OPENAI_API_KEY)

        # Connect to LanceDB
        try:
            self.db = lancedb.connect(self.db_path)
            self.table = self.db.open_table(self.table_name)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ConnectionError(f"Failed to connect to LanceDB: {e}") from e
        print(f"Connected to LanceDB: {self.db_path}/{self.table_name}")

    def get_embedding(self, text: str) -> Optional[List[float]]:
        """Return the ``text-embedding-3-small`` embedding for ``text``.

        Args:
            text: Query text to embed.

        Returns:
            The embedding of the first item in the API response, or ``None``
            when ``text`` is blank / not a string or the API call fails.
        """
        # A blank query cannot produce a meaningful embedding; skip the API call.
        if not isinstance(text, str) or not text.strip():
            print("Error getting embedding: empty query text")
            return None
        try:
            response = self.client.embeddings.create(
                model="text-embedding-3-small",
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            # Best effort: callers treat None as "no embedding available".
            print(f"Error getting embedding: {e}")
            return None

    def search_similar_content(self, query: str, limit: int = 10) -> pd.DataFrame:
        """Run a vector search for ``query`` and return the hits as a DataFrame.

        Args:
            query: Free-text search query.
            limit: Maximum number of rows requested from the table search.

        Returns:
            The search results converted with ``to_pandas()``, or an empty
            DataFrame when no embedding could be obtained or the search fails.
        """
        print(f"Searching: '{query}'")

        # Get query embedding
        query_embedding = self.get_embedding(query)
        if not query_embedding:
            return pd.DataFrame()

        # Perform vector search
        try:
            results = self.table.search(query_embedding).limit(limit).to_pandas()
        except Exception as e:
            print(f"Search error: {e}")
            return pd.DataFrame()

        if results.empty:
            print("No results found")
        else:
            print(f"Found {len(results)} relevant results")
        return results
# Initialize global RAG instance.
# This runs at import time: LanceDBRAG() connects to LanceDB and opens the
# table, and raises ConnectionError if that fails.
rag_system = LanceDBRAG()
def maximal_marginal_relevance_search(query, rag_instance, k=10, lambda_param=0.6, top_k=3):
    """
    Implement Maximal Marginal Relevance (MMR) for diverse document retrieval using LanceDB.

    Relevance is ``1 / (1 + _distance)`` and redundancy is the highest Jaccard
    similarity (over whitespace-separated words) to any already selected
    document. Each round picks the candidate maximising
    ``lambda_param * relevance - (1 - lambda_param) * redundancy``.

    Args:
        query: Search query string
        rag_instance: Object exposing ``search_similar_content(query, limit=...)``
            that returns a DataFrame with ``text``, ``magazine_name``,
            ``page_number`` and ``_distance`` columns (``chunk_id`` optional)
        k: Number of candidate documents to consider
        lambda_param: Trade-off between relevance and diversity (0-1)
        top_k: Number of final documents to return

    Returns:
        List of selected document dicts (``page_content``, ``metadata``,
        ``score``) in MMR selection order; empty when the search has no hits.
    """
    # Get initial candidate documents using LanceDB search
    search_results = rag_instance.search_similar_content(query, limit=k)
    if search_results.empty:
        return []

    # Convert rows to document-like dicts for compatibility
    docs = [
        {
            'page_content': row['text'],
            'metadata': {
                'source': row['magazine_name'],
                'page': row['page_number'],
                'chunk': row.get('chunk_id', 0),
            },
            'score': row['_distance'],
        }
        for _, row in search_results.iterrows()
    ]

    # Not enough candidates to need re-ranking: return them all as-is.
    if len(docs) <= top_k:
        return docs
    if top_k <= 0:
        return []

    # Loop-invariant work, done once per candidate instead of once per
    # (candidate, selected) pair per round. Non-string text (e.g. a missing
    # value) is treated as having no words rather than crashing on .split().
    word_sets = [
        set(doc['page_content'].split()) if isinstance(doc['page_content'], str) else set()
        for doc in docs
    ]
    relevance = [1 / (1 + doc['score']) for doc in docs]
    # Highest similarity of each candidate to anything selected so far.
    max_similarity = [0] * len(docs)

    selected_docs = []
    remaining_indices = list(range(len(docs)))
    for _ in range(min(top_k, len(docs))):
        # max() keeps the first of equal scores, i.e. the earlier search hit.
        best_idx = max(
            remaining_indices,
            key=lambda i: lambda_param * relevance[i] - (1 - lambda_param) * max_similarity[i],
        )
        selected_docs.append(docs[best_idx])
        remaining_indices.remove(best_idx)

        # Fold the newly selected document into each candidate's redundancy.
        chosen_words = word_sets[best_idx]
        if not chosen_words:
            continue
        for i in remaining_indices:
            words = word_sets[i]
            if words:
                similarity = len(words & chosen_words) / len(words | chosen_words)
                if similarity > max_similarity[i]:
                    max_similarity[i] = similarity

    return selected_docs
# Watermark / scan-residue markers. These are matched case-insensitively.
_WATERMARK_RE = re.compile(
    "|".join(f"(?:{p})" for p in [
        r'scanned by \w+',
        r'found in \w+',
        r'www\.\w+\.\w+',
        r'http[s]?://[^\s]+',
        r'\.pdf',
        r'\.com',
        r'\.org',
        r'\.net',
        r'banglapdf',
        r'sadaqpdf',
        r'pdf scanner',
        r'scan by',
        r'converted by',
        r'page \d+',
        r'source:',
        r'reference:',
        r'[a-zA-Z]+@[a-zA-Z]+\.[a-zA-Z]+',  # emails
    ]),
    re.IGNORECASE,
)

# Patterns whose meaning depends on letter case, so they must NOT be matched
# with IGNORECASE (otherwise `[A-Z]{2,}` matches every 2+ letter ASCII word).
_CASED_ENGLISH_RE = re.compile(
    r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'  # English names
    r'|\b[A-Z]{2,}\b'               # Uppercase abbreviations
)

_LATIN_LETTER_RE = re.compile(r'[a-zA-Z]')
_BENGALI_CHAR_RE = re.compile(r'[\u0980-\u09FF]')  # Bengali Unicode range


def clean_bangla_content(text):
    """
    Clean the retrieved content to remove English watermarks, scan text, and unwanted content.

    The text is processed line by line. A line is dropped when it:
      * is empty after stripping,
      * matches a watermark pattern (case-insensitive),
      * contains a capitalised two-word name or an uppercase abbreviation,
      * has more than 3 Latin letters and more Latin than Bengali characters, or
      * contains no Bengali character at all.

    Args:
        text: Raw document text; anything that is not a non-empty string
            yields an empty result.

    Returns:
        The surviving lines, stripped and joined with newlines.
    """
    if not isinstance(text, str) or not text:
        return ''

    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # Skip empty lines
        if not line:
            continue

        bengali_chars = len(_BENGALI_CHAR_RE.findall(line))
        # Lines without any Bengali content are never kept.
        if bengali_chars == 0:
            continue

        if _WATERMARK_RE.search(line) or _CASED_ENGLISH_RE.search(line):
            continue

        # If line has more English than Bengali, skip it
        english_chars = len(_LATIN_LETTER_RE.findall(line))
        if english_chars > bengali_chars and english_chars > 3:
            continue

        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)
def _build_unmad_context(docs):
    """Join the cleaned Bengali text of the retrieved docs, or return the fallback notice."""
    cleaned_contexts = []
    for doc in docs or []:
        cleaned_content = clean_bangla_content(doc['page_content'])
        if cleaned_content.strip():  # Only add if there's meaningful Bengali content
            cleaned_contexts.append(cleaned_content)
    if cleaned_contexts:
        return "\n\n---\n\n".join(cleaned_contexts)
    return "No relevant information were found"


# Enhanced Satirical QA function with MMR and content cleaning
def custom_unmad_satirical_bot(message, history, top_k=3, lambda_param=0.6):
    """
    Enhanced satirical bot using MMR for diverse and relevant content retrieval.

    Retrieves context with ``maximal_marginal_relevance_search`` (15
    candidates), cleans it with ``clean_bangla_content``, and asks the
    ``gpt-4o`` chat model for an answer in the magazine's satirical voice.

    Args:
        message: User query
        history: Chat history as a list of ``(user, bot)`` pairs; it is
            appended to in place. ``None`` is treated as an empty history.
        top_k: Number of documents to retrieve
        lambda_param: MMR trade-off (0.6 = slightly favor relevance over diversity)

    Returns:
        A ``("", history)`` tuple: an empty string to clear the input box and
        the updated history. On an API failure a Bengali error message is
        appended instead of an answer.
    """
    # Keep the caller's list object when one is given (it is mutated in place).
    if history is None:
        history = []

    # Nothing to answer: avoid spending an embedding and a chat request.
    if not message or not str(message).strip():
        return "", history

    # Use MMR search with LanceDB
    docs = maximal_marginal_relevance_search(
        query=message,
        rag_instance=rag_system,
        k=15,  # Consider more candidates for better diversity
        lambda_param=lambda_param,
        top_k=top_k
    )

    # Extract context from MMR-selected documents
    top_contexts = _build_unmad_context(docs)

    # Prepare system prompt
    system_prompt = """
তুমি 'উন্মাদ' ম্যাগাজিনের একজন পুরানো ব্যঙ্গাত্মক লেখক। তোমার কাজ হলো ব্যবহারকারীর প্রশ্ন শুনে স্যাটায়ার, কটাক্ষ, রসিকতা, ঠাট্টা, আর একটু জ্ঞান মিশিয়ে উত্তর দেওয়া — যাতে লোক হাসে, চিন্তা করে, আবার নতুন কিছু শিখে। তুমি কখনোই একদম সোজাসাপ্টা উত্তর দেবে না — বরং একটু অভিনয় করে, অবাক হয়ে, ঠাট্টা করে, খোঁচা মেরে দেবে।
**এই নির্দেশনাগুলো অবশ্যই মেনে চলবে - কোন ব্যতিক্রম নেই**
১। কোন ইমোজি (EMOJI) ব্যবহার করবে না - একটিও না।
২। কোন ইংরেজি টেক্সট ব্যবহার করবে না - একটি শব্দও না।
৩। কোন ইংরেজি সংখ্যা বা চিহ্ন লিখবে না (যেমন: PDF, URL, www, .com, scanned by, found in ইত্যাদি)।
৪। প্রসঙ্গের মধ্যে যেসব ইংরেজি টেক্সট, স্ক্যান ওয়াটারমার্ক, ওয়েবসাইট নাম, বা প্রযুক্তিগত শব্দ আছে সেগুলো একেবারেই উল্লেখ করবে না।
৫। শুধুমাত্র বাংলা ভাষায় লেখা বিষয়বস্তু ব্যবহার করবে।
৬। যদি প্রসঙ্গে কোন বাংলা কন্টেন্ট না থাকে, তাহলে নিজের সাধারণ জ্ঞান দিয়ে উত্তর দেবে।
৭। বিভিন্ন উৎস থেকে তথ্য মিলিয়ে একটি সমন্বিত উত্তর দেবে।
৮। কোন ধরনের ওয়েবসাইট বা পিডিএফ রেফারেন্স দেবে না।
"""

    # The question label sits on its own line, after the retrieved context.
    user_prompt = f"""
প্রসঙ্গ (বিভিন্ন উৎস থেকে সংগৃহীত):
{top_contexts}
প্রশ্ন: {message}
নির্দেশনা: উপরের প্রসঙ্গ থেকে শুধুমাত্র বাংলা ভাষার বিষয়বস্তু ব্যবহার করে উন্মাদ ম্যাগাজিনের স্টাইলে উত্তর দাও। কোন ইংরেজি শব্দ, ইমোজি, বা স্ক্যান ওয়াটারমার্ক উল্লেখ করবে না। সম্পূর্ণ বাংলায় ব্যঙ্গাত্মক ও মজার উত্তর লেখো।
"""

    # Generate response using OpenAI
    try:
        response = rag_system.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.7,
            max_tokens=700
        )
        ai_response = response.choices[0].message.content
    except Exception as e:
        # Show a friendly message to the user but keep the cause in the logs.
        print(f"Error generating response: {e}")
        ai_response = "উত্তর তৈরিতে সমস্যা হয়েছে। আবার চেষ্টা করুন।"

    history.append((message, ai_response))
    return "", history
# Enhanced Gradio UI with MMR (simplified)
# NOTE(review): the layout nesting below (which widgets sit inside each
# gr.Row) is reconstructed; confirm against the intended design.
with gr.Blocks(css=".gradio-container {padding-top: 80px;}") as demo:
    gr.Markdown("# USB: Unmad Satirical Bot", elem_id="title", elem_classes="title-text")
    gr.Markdown("### A chatbot that impersonates the satirical character UNMAD")

    with gr.Row():
        try:
            gr.Image("images/c1.png", width=450, show_label=False, container=False)
        except Exception:
            # Fall back to a text placeholder if the image component cannot be
            # built; a bare `except:` would also trap KeyboardInterrupt/SystemExit.
            gr.Markdown("*[UNMAD Logo would appear here]*")

    chatbot = gr.Chatbot()

    with gr.Row():
        msg = gr.Textbox(
            placeholder="কি চলে আপনার মনে বলেন শুনি?",
            scale=8,
            show_label=False
        )
        send = gr.Button("Send", variant="primary", scale=1)

    clear = gr.Button("Clear Chat")

    # Per-session conversation history as a list of (user, bot) pairs.
    state = gr.State([])

    # Connect interactions with fixed MMR parameters
    def chat_with_fixed_mmr(message, history):
        """Answer ``message`` and return (cleared textbox, chatbot value, new state)."""
        cleared_text, updated_history = custom_unmad_satirical_bot(
            message, history, top_k=3, lambda_param=0.6
        )
        # The history is returned twice: once for display and once to store it
        # back into `state` explicitly instead of relying on in-place mutation.
        return cleared_text, updated_history, updated_history

    msg.submit(
        chat_with_fixed_mmr,
        [msg, state],
        [msg, chatbot, state]
    )
    send.click(
        chat_with_fixed_mmr,
        [msg, state],
        [msg, chatbot, state]
    )

    # Reset the stored history too; otherwise the old conversation would
    # reappear in the chatbot on the next message after "Clear Chat".
    clear.click(lambda: ([], "", []), None, [chatbot, msg, state])

if __name__ == "__main__":
    demo.launch()