# RAG_System_For_Nepali_Law_V2 / enhanced_semantic_search.py
# Repository page residue kept for reference (not part of the program):
#   "rbbist's picture" / "Update enhanced_semantic_search.py" / "d8517b3 verified"
import sqlite3
import chromadb
from chromadb.utils import embedding_functions
import os
import pickle
import hashlib
import re
from google import genai
import PyPDF2
import fitz # PyMuPDF - better for multilingual PDFs
from io import BytesIO
# Gemini client setup: the API key has to come from the environment, and the
# module refuses to import without it.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Please set GEMINI_API_KEY environment variable")
client = genai.Client(api_key=api_key)

# Shared ChromaDB handles. Both start out as None and are rebound by
# init_database().
chroma_client, collection = None, None

# Storage locations. VECTOR_DB_PATH is handed to the persistent ChromaDB
# client; DB_CACHE_FILE is not referenced anywhere in the visible source.
DB_CACHE_FILE = "legal_cases_cache.pkl"
VECTOR_DB_PATH = "./legal_vector_db"
def init_database():
    """Open (or create) the persistent vector store and make sure it has data.

    Rebinds the module-level ``chroma_client`` and ``collection``. The
    collection ``legal_cases_collection`` is opened when it already exists and
    created otherwise. Whenever the resulting collection is empty, the cases
    are loaded from the SQLite file ``after_2061.db`` through
    ``load_data_to_vector_db``.

    Raises:
        Exception: any failure from ChromaDB, the embedding function or the
            loader is printed and then re-raised unchanged.
    """
    global chroma_client, collection
    db_path = "after_2061.db"
    try:
        chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)
        # Small multilingual sentence-transformer model, chosen to keep the
        # stored index compact.
        sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="paraphrase-multilingual-MiniLM-L12-v2"
        )
        try:
            collection = chroma_client.get_collection(
                name="legal_cases_collection",
                embedding_function=sentence_transformer_ef,
            )
            print("✅ Loaded existing vector database")
        except Exception:
            # NOTE(review): the exception type raised for a missing collection
            # appears to differ between chromadb releases, so this stays broad
            # (but no longer swallows KeyboardInterrupt/SystemExit) - confirm
            # before narrowing further.
            collection = chroma_client.create_collection(
                name="legal_cases_collection",
                embedding_function=sentence_transformer_ef,
            )
            print("📦 Created new vector database")
        # Checked for existing collections too: a collection that was created
        # by an earlier run whose load failed would otherwise stay empty
        # forever.
        if collection.count() == 0:
            load_data_to_vector_db(db_path)
    except Exception as e:
        print(f"Database initialization error: {e}")
        raise
def _none_to_empty(value):
    """Return ``value`` unchanged, except that ``None`` becomes ``""``."""
    return "" if value is None else value


def load_data_to_vector_db(db_path):
    """Copy every row of the SQLite ``cases`` table into the ChromaDB collection.

    Each row becomes one document (a labelled, newline-separated rendering of
    the case fields) plus a metadata dict holding the individual columns. Ids
    are ``case_<row index>``. Rows are added in batches of 100.

    Args:
        db_path: path of the SQLite database file to read.

    Raises:
        Exception: if the module-level ``collection`` has not been set.
        sqlite3.Error: if the database cannot be opened or queried (the error
            is printed first).
    """
    global collection
    if collection is None:
        # Fail before reading the whole table rather than at the first add().
        raise Exception("Vector database not initialized")

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
SELECT लिङ्क, निर्णय_नं, साल, मुद्दाको_किसिम, विषय, निवेदक, विपक्षी, प्रकरण, ठहर
FROM cases
""")
            rows = cursor.fetchall()
        finally:
            # ``with sqlite3.connect(...)`` only commits/rolls back; it does
            # not close the connection, so close it explicitly.
            conn.close()
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        raise

    documents = []
    metadatas = []
    ids = []
    for i, row in enumerate(rows):
        # SQL NULLs arrive as None. Replace them so the document text does not
        # contain the literal word "None" and every metadata value is a plain
        # scalar that later code can slice and format as text.
        # NOTE(review): chromadb releases differ on whether None metadata
        # values are accepted at all - confirm for the pinned version.
        (link, decision_no, year, mudda_type, subject,
         nibedak, vipakshi, prakaran, thahar) = (_none_to_empty(col) for col in row)

        case_text = "\n".join([
            f"मुद्दाको किसिम: {mudda_type}",
            f"विषय: {subject}",
            f"निवेदक: {nibedak}",
            f"विपक्षी: {vipakshi}",
            f"प्रकरण: {prakaran}",
            f"ठहर: {thahar}",
        ])
        documents.append(case_text.strip())
        metadatas.append({
            "link": link,
            "decision_no": decision_no,
            "year": year,
            "mudda_type": mudda_type,
            "subject": subject,
            "nibedak": nibedak,
            "vipakshi": vipakshi,
            "prakaran": prakaran,
            "thahar": thahar,
        })
        ids.append(f"case_{i}")

    # Add in fixed-size batches so a single add() call stays small.
    batch_size = 100
    for start in range(0, len(documents), batch_size):
        end = start + batch_size
        collection.add(
            documents=documents[start:end],
            metadatas=metadatas[start:end],
            ids=ids[start:end],
        )
    print(f"✅ Loaded {len(documents)} cases to vector database")
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, trying PyMuPDF first and PyPDF2 second.

    PyMuPDF (``fitz``) is used first; if it yields only whitespace, the file is
    read again with PyPDF2.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The concatenated page text with surrounding whitespace stripped. This
        may be an empty string when neither library finds any text.

    Raises:
        Exception: on any failure. The message is prefixed with
            ``"PDF पढ्न सकिएन: "`` and the original error is chained as
            ``__cause__``.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            text = "".join(
                doc.load_page(page_num).get_text() for page_num in range(len(doc))
            )
        finally:
            # Release the document even when a page fails to load or render.
            doc.close()
        text = text.strip()
        if text:
            return text

        # PyMuPDF found nothing usable; fall back to PyPDF2.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` guards against a page whose extraction yields None.
            # NOTE(review): whether extract_text() can return None depends on
            # the PyPDF2 version - the guard is harmless either way.
            text = "".join((page.extract_text() or "") for page in pdf_reader.pages)
        return text.strip()
    except Exception as e:
        print(f"PDF extraction error: {e}")
        raise Exception(f"PDF पढ्न सकिएन: {str(e)}") from e
def detect_language(text):
    """Classify ``text`` as ``"nepali"`` or ``"english"``.

    The decision is based on the share of Devanagari code points
    (U+0900-U+097F) among all non-whitespace characters: strictly more than
    30% means ``"nepali"``. Text without any non-whitespace character is
    reported as ``"english"``.
    """
    non_space_count = len(re.sub(r'\s', '', text))
    if non_space_count == 0:
        return "english"

    devanagari_count = len(re.findall(r'[\u0900-\u097F]', text))
    if devanagari_count / non_space_count > 0.3:
        return "nepali"
    return "english"
def _preview_text(value, limit=100):
    """Return at most ``limit`` characters of ``value`` as text, or ``'N/A'`` for ``None``."""
    if value is None:
        return 'N/A'
    return str(value)[:limit]


def search_cases(query: str) -> tuple:
    """Find the five stored cases closest to ``query`` and format them.

    Args:
        query: free-text description of the user's case.

    Returns:
        A 3-tuple ``(formatted_output, rag_context, query)``:
        ``formatted_output`` is a human-readable listing of the hits,
        ``rag_context`` is a longer text block describing the user's case and
        every hit (meant to be passed to the chat model), and ``query`` is the
        original query string. If anything fails while searching or
        formatting, ``("Search error: ...", "", "")`` is returned instead of
        raising.

    Raises:
        Exception: if the module-level ``collection`` has not been set.
    """
    global collection
    if collection is None:
        raise Exception("Vector database not initialized")
    try:
        processed_query = query.strip()
        if not processed_query:
            # A blank query would be embedded and matched against arbitrary
            # neighbours; report it through the normal error path instead.
            raise ValueError("query is empty")

        results = collection.query(
            query_texts=[processed_query],
            n_results=5,
            include=["documents", "metadatas", "distances"],
        )
        matches = zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )

        output_lines = []
        raw_cases_data = []
        for case_number, (document, meta, distance) in enumerate(matches, start=1):
            # NOTE(review): ``1 - distance`` is only a meaningful similarity
            # when the collection uses cosine distance; no distance setting is
            # visible where the collection is created - confirm. Values that
            # would go negative are clamped to 0.
            similarity_score = max(0, (1 - distance) * 100)

            output_lines.append(
                f"📋 समान केस {case_number} (Similarity: {similarity_score:.1f}%)\n"
                f" ⚖️ प्रकार: {meta['mudda_type']}\n"
                f" 📑 विषय: {meta['subject']}\n"
                f" 👤 निवेदक: {meta['nibedak']}\n"
                f" 👤 विपक्षी: {meta['vipakshi']}\n"
                f" 🆔 निर्णय नं: {meta['decision_no']} | साल: {meta['year']}\n"
                # _preview_text tolerates missing/None values, which plain
                # slicing of meta.get(...) does not.
                f" 📄 प्रकरण: {_preview_text(meta.get('prakaran'))}...\n"
                f" ⚖️ ठहर: {_preview_text(meta.get('thahar'))}...\n"
                f" 🔗 {meta['link']}\n"
            )
            raw_cases_data.append({
                "case_number": case_number,
                "similarity_score": similarity_score,
                "mudda_type": meta['mudda_type'],
                "subject": meta['subject'],
                "nibedak": meta['nibedak'],
                "vipakshi": meta['vipakshi'],
                "decision_no": meta['decision_no'],
                "year": meta['year'],
                # ``or ''`` keeps a stored None from showing up as "None".
                "prakaran": meta.get('prakaran') or '',
                "thahar": meta.get('thahar') or '',
                "full_text": document,
                "link": meta['link'],
            })

        # The rule gets its own line; previously operator precedence glued it
        # directly onto the first result.
        formatted_output = "\n" + "=" * 60 + "\n" + "\n".join(output_lines)

        user_language = detect_language(query)
        context_parts = [
            "\n"
            "=== प्रयोगकर्ताको हालको मुद्दा / User's Current Case ===\n"
            f"भाषा/Language: {user_language}\n"
            f"मुद्दा/Case: {query}\n"
            "=== डेटाबेसबाट भेटिएका समान मुद्दाहरू / Retrieved Similar Cases ===\n"
        ]
        for case in raw_cases_data:
            context_parts.append(
                "\n"
                f"【समान केस {case['case_number']} - समानता: {case['similarity_score']:.1f}%】\n"
                f"• मुद्दाको किसिम: {case['mudda_type']}\n"
                f"• विषय: {case['subject']}\n"
                f"• निवेदक: {case['nibedak']}\n"
                f"• विपक्षी: {case['vipakshi']}\n"
                f"• निर्णय नं: {case['decision_no']} | साल: {case['year']}\n"
                f"• प्रकरण विवरण: {case['prakaran']}\n"
                f"• अदालतको ठहर: {case['thahar']}\n"
                f"• पूर्ण केस विवरण: {case['full_text']}\n"
                f"• सन्दर्भ लिङ्क: {case['link']}\n"
                "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
            )
        rag_context = "".join(context_parts)
        return formatted_output, rag_context, query
    except Exception as e:
        # Callers receive the failure as text in the first tuple slot.
        error_msg = f"Search error: {e}"
        return error_msg, "", ""
# System prompts used by chat_with_cases, one per (mode, language) pair.
_CASE_ANALYSIS_PROMPT_NE = """तपाईं एक अनुभवी नेपाली कानूनी विश्लेषण विशेषज्ञ हुनुहुन्छ। तपाईंको विशेषज्ञता:
🎯 मुख्य कार्यहरू:
• प्रयोगकर्ताको मुद्दालाई समान पुराना केसहरूसँग तुलना गर्ने
• कानूनी precedent र pattern पहिचान गर्ने
• संभावित परिणाम र रणनीतिक सुझावहरू दिने
• समानता र भिन्नताहरूको गहिरो विश्लेषण
📋 विश्लेषण ढाँचा:
• स्पष्ट रूपमा "समान केस १", "समान केस २" भनेर उल्लेख गर्नुहोस्
• प्रत्येक समान केसको relevance बताउनुहोस्
• कानूनी तर्कहरू र precedent cite गर्नुहोस्
• व्यावहारिक सुझावहरू दिनुहोस्
⚖️ महत्वपूर्ण: तपाईं कानूनी सल्लाह दिनुहुन्न, केवल जानकारीमूलक comparative analysis गर्नुहुन्छ।
वकिलहरूलाई उनीहरूको case preparation मा सहयोग गर्ने उद्देश्यले विश्लेषण गर्नुहोस्।"""

_CASE_ANALYSIS_PROMPT_EN = """You are an experienced Nepali legal analysis expert. Your expertise includes:
🎯 Primary Functions:
• Compare user's case with similar historical cases
• Identify legal precedents and patterns
• Provide insights on potential outcomes and strategic considerations
• Conduct deep analysis of similarities and differences
📋 Analysis Format:
• Clearly reference cases as "समान केस १", "समान केस २", etc.
• Explain relevance of each similar case
• Cite legal reasoning and precedents
• Provide practical insights for legal professionals
⚖️ Important: You provide informational comparative analysis only, not legal advice.
Focus on helping lawyers with case preparation and research."""

_GENERAL_CHAT_PROMPT_NE = """तपाईं एक नेपाली कानूनी जानकारी विशेषज्ञ हुनुहुन्छ। तपाईंले:
🏛️ नेपाली कानूनी प्रणाली:
• नेपालको संविधान, ऐन कानूनका बारेमा जानकारी
• अदालती प्रक्रिया र न्यायिक प्रणालीको व्याख्या
• कानूनी अधिकार र कर्तव्यहरूको जानकारी
📚 सेवाहरू:
• कानूनी प्रक्रियाको स्पष्टीकरण
• अधिकार र कर्तव्यको जानकारी
• कानूनी वैधता/अवैधताको सामान्य जानकारी
⚠️ सीमाहरू:
• व्यक्तिगत कानूनी सल्लाह दिनु हुँदैन
• केवल सामान्य जानकारीमूलक सहायता
• विशिष्ट केसका लागि योग्य वकिलसँग सल्लाह लिन सुझाव"""

_GENERAL_CHAT_PROMPT_EN = """You are a Nepali legal information expert. You provide:
🏛️ Nepali Legal System Information:
• Constitution, laws and legal framework of Nepal
• Court procedures and judicial system explanations
• Legal rights and duties information
📚 Services:
• Legal process clarification
• Rights and obligations information
• General legal validity/invalidity information
⚠️ Limitations:
• Do NOT provide personal legal advice
• Only general informational assistance
• Always recommend consulting qualified lawyers for specific cases"""

# Phrases whose presence is taken to mean the reply already has a disclaimer.
_DISCLAIMER_MARKERS = ("legal advice", "कानूनी सल्लाह")


def chat_with_cases(message: str, cases_context: str = "", user_case: str = "", mode: str = "general") -> str:
    """Answer a legal question with Gemini, optionally grounded in retrieved cases.

    Args:
        message: the user's question. If it contains any Devanagari character
            the Nepali prompts and messages are used, otherwise the English
            ones.
        cases_context: text describing retrieved similar cases. It is only
            used when ``mode`` is ``"case_analysis"`` and it is non-empty.
        user_case: accepted for interface compatibility; not used here.
        mode: ``"case_analysis"`` selects the comparative-analysis prompt (when
            ``cases_context`` is given); anything else selects the general
            legal-information prompt.

    Returns:
        The model's reply, stripped. In ``"case_analysis"`` mode a disclaimer
        is appended unless the reply already mentions legal advice in English
        or Nepali. If generation fails for any reason, a short apology in the
        user's language is returned instead of raising.
    """
    # Computed once, outside the try, so the error path can reuse it.
    is_nepali = bool(re.search(r'[\u0900-\u097F]', message))
    try:
        if mode == "case_analysis" and cases_context:
            system_prompt = _CASE_ANALYSIS_PROMPT_NE if is_nepali else _CASE_ANALYSIS_PROMPT_EN
            prompt = (
                f"{system_prompt}\n"
                f"{cases_context}\n"
                f"प्रयोगकर्ताको प्रश्न: {message}\n"
                "Professional Legal Analysis:"
            )
        else:
            system_prompt = _GENERAL_CHAT_PROMPT_NE if is_nepali else _GENERAL_CHAT_PROMPT_EN
            prompt = (
                f"{system_prompt}\n"
                f"User Question: {message}\n"
                "Response:"
            )

        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt,
        )
        # ``text`` may be missing when the model returns no usable candidate;
        # route that through the normal failure path with a clear log line.
        response_text = (response.text or "").strip()
        if not response_text:
            raise RuntimeError("model returned an empty response")

        if mode == "case_analysis":
            lowered = response_text.lower()
            # Check the Nepali phrase as well: looking only for the English
            # one meant Nepali replies always got a second disclaimer.
            has_disclaimer = any(marker in lowered for marker in _DISCLAIMER_MARKERS)
            if not has_disclaimer:
                if is_nepali:
                    response_text += "\n\n⚠️ नोट: यो केवल जानकारीमूलक विश्लेषण हो। विशिष्ट कानूनी सल्लाहका लागि योग्य वकिलसँग सल्लाह लिनुहोस्।"
                else:
                    response_text += "\n\n⚠️ Note: This is informational analysis only. Consult qualified lawyers for specific legal advice."
        return response_text
    except Exception as e:
        print(f"Chat error: {e}")
        if is_nepali:
            return "माफ गर्नुहोस्, प्राविधिक समस्या भयो। कृपया केही बेरमा फेरि प्रयास गर्नुहोस्।"
        return "Sorry, there was a technical issue. Please try again in a moment."
# System prompts used by get_legal_advice_general, one per language.
_GENERAL_ADVICE_PROMPT_NE = """तपाईं नेपाली कानूनी जानकारी सहायक हुनुहुन्छ।
🎯 तपाईंको भूमिका:
• नेपाली कानूनका बारेमा सामान्य जानकारी प्रदान गर्ने
• अदालती प्रक्रिया र न्यायिक प्रणालीको बारेमा बताउने
• नागरिक अधिकार र कर्तव्यहरूको जानकारी दिने
• कानूनी प्रक्रियाहरूको स्पष्टीकरण
⚖️ सिद्धान्तहरू:
• स्पष्ट र बुझ्ने भाषामा जवाफ दिनुहोस्
• व्यावहारिक जानकारी प्रदान गर्नुहोस्
• सधैं योग्य वकिलसँग सल्लाह लिन सुझाव दिनुहोस्
• व्यक्तिगत कानूनी सल्लाह नदिनुहोस्"""

_GENERAL_ADVICE_PROMPT_EN = """You are a Nepali legal information assistant.
🎯 Your Role:
• Provide general information about Nepali law
• Explain court procedures and judicial system
• Share information about civil rights and duties
• Clarify legal processes
⚖️ Principles:
• Respond in clear, understandable language
• Provide practical information
• Always recommend consulting qualified lawyers
• Do not give personal legal advice"""


def get_legal_advice_general(question: str) -> str:
    """Answer a general legal question with Gemini, without any case context.

    Args:
        question: the user's question. Any Devanagari character selects the
            Nepali system prompt and error text; otherwise English is used.

    Returns:
        The model's reply, stripped. On failure (including an empty reply) a
        message of the form ``"Sorry, error occurred: <details>"`` or its
        Nepali counterpart is returned instead of raising.
    """
    is_nepali = bool(re.search(r'[\u0900-\u097F]', question))
    system_prompt = _GENERAL_ADVICE_PROMPT_NE if is_nepali else _GENERAL_ADVICE_PROMPT_EN
    # The question/answer labels are Nepali for both languages.
    prompt = (
        f"{system_prompt}\n"
        f"प्रश्न: {question}\n"
        "जानकारीमूलक उत्तर:"
    )
    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt,
        )
        # Guard against a reply without text so the user sees a meaningful
        # reason instead of an AttributeError message.
        answer = (response.text or "").strip()
        if not answer:
            raise RuntimeError("model returned an empty response")
        return answer
    except Exception as e:
        if is_nepali:
            return f"माफ गर्नुहोस्, त्रुटि: {str(e)}"
        return f"Sorry, error occurred: {str(e)}"
# Additional utility functions for enhanced features
def _matrix_cell(value):
    """Return ``value`` as text for a table cell, using ``'N/A'`` for ``None``."""
    return 'N/A' if value is None else str(value)


def generate_case_comparison_matrix(cases_data):
    """Render a fixed-width text table comparing several cases.

    Args:
        cases_data: iterable of dicts with the keys ``case_number``,
            ``mudda_type``, ``year`` and optionally ``similarity_score``.
            Missing or ``None`` values are shown as ``N/A`` (``0.0%`` for the
            score) instead of raising.

    Returns:
        The table as one string: a title, a rule, a header row, a rule, and
        one row per case, each terminated by a newline. An empty or ``None``
        ``cases_data`` yields ``""``.
    """
    if not cases_data:
        return ""

    lines = [
        "\n📊 केस तुलना म्याट्रिक्स (Case Comparison Matrix)",
        "=" * 60,
        f"{'केस नं':<8} {'प्रकार':<20} {'साल':<8} {'समानता':<10}",
        "-" * 60,
    ]
    for case in cases_data:
        case_number = _matrix_cell(case.get('case_number'))
        # Truncated to 18 characters so it fits the 20-wide column.
        mudda_type = _matrix_cell(case.get('mudda_type'))[:18]
        year = _matrix_cell(case.get('year'))
        similarity = case.get('similarity_score') or 0
        lines.append(f"{case_number:<8} {mudda_type:<20} {year:<8} {similarity:.1f}%")
    return "\n".join(lines) + "\n"
def extract_key_legal_points(case_text):
    """Ask Gemini to summarise the key legal points of a case.

    The prompt (in Nepali) asks for three labelled sections: main legal
    issues, applicable laws, and required evidence.

    Args:
        case_text: the case text to analyse; it is inserted into the prompt
            verbatim.

    Returns:
        The model's reply, stripped. If the request fails or the reply is
        empty, the fixed Nepali message ``"कानूनी बुँदाहरू निकाल्न सकिएन।"`` is
        returned instead of raising.
    """
    failure_message = "कानूनी बुँदाहरू निकाल्न सकिएन।"
    prompt = (
        "\n"
        "यो नेपाली कानूनी केसबाट मुख्य कानूनी बुँदाहरू निकाल्नुहोस्:\n"
        f"{case_text}\n"
        "निम्न ढाँचामा उत्तर दिनुहोस्:\n"
        "🔍 मुख्य कानूनी मुद्दाहरू:\n"
        "⚖️ लागू हुने कानूनहरू:\n"
        "📋 आवश्यक प्रमाणहरू:\n"
    )
    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt,
        )
        # ``text`` may be missing when the model returns no usable candidate.
        points = (response.text or "").strip()
    except Exception as e:
        # Best-effort helper: log the cause (previously discarded by a bare
        # ``except:``) and fall back to the fixed message.
        print(f"Key legal points extraction error: {e}")
        return failure_message
    return points or failure_message
# Database health check
def check_database_health():
    """Report the state of the vector database as a one-line status string.

    Returns:
        ``"✅ Database healthy: <n> cases loaded"`` when the module-level
        ``collection`` is set, ``"❌ Database not initialized"`` when it is
        not, and ``"❌ Database error: <details>"`` if anything raises while
        checking.
    """
    global collection
    try:
        if not collection:
            return "❌ Database not initialized"
        return f"✅ Database healthy: {collection.count()} cases loaded"
    except Exception as e:
        return f"❌ Database error: {e}"