Spaces:
Sleeping
Sleeping
import contextlib
import hashlib
import os
import pickle
import re
import sqlite3
from io import BytesIO

import chromadb
import fitz  # PyMuPDF - better for multilingual PDFs
import PyPDF2
from chromadb.utils import embedding_functions
from google import genai
# Storage locations. DB_CACHE_FILE is not referenced by the code visible in
# this file; VECTOR_DB_PATH is where the persistent Chroma store lives.
DB_CACHE_FILE = "legal_cases_cache.pkl"
VECTOR_DB_PATH = "./legal_vector_db"

# Module-level handles populated by init_database().
chroma_client = None
collection = None

# Gemini client: importing this module fails fast when no API key is set.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise ValueError("Please set GEMINI_API_KEY environment variable")
client = genai.Client(api_key=api_key)
def init_database():
    """Open the persistent vector store and make sure it holds the case data.

    Binds the module-level ``chroma_client`` and ``collection`` globals to a
    persistent Chroma client at ``VECTOR_DB_PATH`` and to the
    ``legal_cases_collection`` collection, creating the collection when it
    cannot be fetched. Whenever the resulting collection is empty, the cases
    are loaded from the SQLite file ``after_2061.db``.

    Raises:
        Exception: Any failure while opening the store or loading the data
            is printed and re-raised unchanged.
    """
    global chroma_client, collection
    db_path = "after_2061.db"
    collection_name = "legal_cases_collection"

    try:
        chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)

        # Smaller multilingual model, chosen to save storage.
        sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="paraphrase-multilingual-MiniLM-L12-v2"
        )

        try:
            collection = chroma_client.get_collection(
                name=collection_name,
                embedding_function=sentence_transformer_ef,
            )
            print("✅ Loaded existing vector database")
        except Exception:
            # A failed lookup is treated as "collection does not exist yet".
            # `except Exception` (rather than a bare except) keeps that
            # fallback without trapping KeyboardInterrupt / SystemExit.
            collection = chroma_client.create_collection(
                name=collection_name,
                embedding_function=sentence_transformer_ef,
            )
            print("📦 Created new vector database")

        # Deliberately checked for an existing collection as well: if an
        # earlier run created the collection but failed before loading any
        # rows, the data is loaded now instead of staying empty forever.
        if collection.count() == 0:
            load_data_to_vector_db(db_path)
    except Exception as e:
        print(f"Database initialization error: {e}")
        raise
def load_data_to_vector_db(db_path):
    """Copy every row of the SQLite ``cases`` table into the Chroma collection.

    Each row becomes one document (a labelled text block built from the case
    type, subject, parties, details and verdict) plus a metadata dict holding
    all nine selected columns. Documents get ids ``case_0``, ``case_1``, ...
    in the order the rows are returned, and are added to the module-level
    ``collection`` in batches of 100.

    Args:
        db_path: Path of the SQLite database file to read.

    Raises:
        sqlite3.Error: If the query fails; the error is printed first.
    """

    def _clean(value):
        # SQLite NULL arrives as None. Store an empty string instead so the
        # embedded text never contains the literal "None" and every metadata
        # value stays a plain scalar.
        return "" if value is None else value

    try:
        # sqlite3's own context manager only commits or rolls back; closing()
        # is what actually releases the connection.
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            rows = conn.execute("""
                SELECT लिङ्क, निर्णय_नं, साल, मुद्दाको_किसिम, विषय, निवेदक, विपक्षी, प्रकरण, ठहर
                FROM cases
            """).fetchall()
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        raise

    documents = []
    metadatas = []
    ids = []

    for i, row in enumerate(rows):
        (link, decision_no, year, mudda_type, subject,
         nibedak, vipakshi, prakaran, thahar) = (_clean(value) for value in row)

        # Labelled text block used as the document for semantic search.
        documents.append("\n".join((
            f"मुद्दाको किसिम: {mudda_type}",
            f"विषय: {subject}",
            f"निवेदक: {nibedak}",
            f"विपक्षी: {vipakshi}",
            f"प्रकरण: {prakaran}",
            f"ठहर: {thahar}",
        )))
        metadatas.append({
            "link": link,
            "decision_no": decision_no,
            "year": year,
            "mudda_type": mudda_type,
            "subject": subject,
            "nibedak": nibedak,
            "vipakshi": vipakshi,
            "prakaran": prakaran,
            "thahar": thahar,
        })
        ids.append(f"case_{i}")

    # Add in batches to avoid memory issues with one huge insert.
    batch_size = 100
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )

    print(f"✅ Loaded {len(documents)} cases to vector database")
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, trying PyMuPDF first and PyPDF2 second.

    PyMuPDF (``fitz``) is tried first because it handles multilingual
    (Nepali/English) PDFs better. PyPDF2 is used when PyMuPDF either fails or
    yields only whitespace.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped (possibly an
        empty string when neither library finds any text).

    Raises:
        Exception: With the message ``PDF पढ्न सकिएन: <cause>`` when the file
            cannot be read; the original error is chained as ``__cause__``.
    """
    try:
        text = ""
        try:
            doc = fitz.open(pdf_path)
            try:
                text = "".join(
                    doc.load_page(page_num).get_text()
                    for page_num in range(len(doc))
                )
            finally:
                # Release the document even if a page fails to extract.
                doc.close()
        except Exception as e:
            # A PyMuPDF failure should not prevent the PyPDF2 attempt below.
            print(f"PyMuPDF extraction failed, falling back to PyPDF2: {e}")
            text = ""

        if text.strip():
            return text.strip()

        # Fallback to PyPDF2.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page whose extraction yields None.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return text.strip()
    except Exception as e:
        print(f"PDF extraction error: {e}")
        raise Exception(f"PDF पढ्न सकिएन: {str(e)}") from e
def detect_language(text: str, threshold: float = 0.3) -> str:
    """Classify text as primarily Nepali or English.

    The decision is based on the share of Devanagari code points
    (U+0900-U+097F) among all non-whitespace characters.

    Args:
        text: The text to classify. ``None`` or empty/whitespace-only text is
            reported as ``"english"``.
        threshold: Devanagari share that must be strictly exceeded for the
            text to count as Nepali. Defaults to 0.3.

    Returns:
        ``"nepali"`` or ``"english"``.
    """
    if not text:
        return "english"

    devanagari_count = len(re.findall(r'[\u0900-\u097F]', text))
    visible_count = len(re.findall(r'\S', text))
    if visible_count == 0:
        return "english"

    nepali_ratio = devanagari_count / visible_count
    return "nepali" if nepali_ratio > threshold else "english"
def _case_excerpt(meta, key, limit=100):
    """Return at most ``limit`` characters of ``meta[key]``, or 'N/A' if absent/None."""
    value = meta.get(key, 'N/A')
    if value is None:
        value = 'N/A'
    return str(value)[:limit]


def _format_case_summary(case_number, similarity_score, meta):
    """Render one retrieved case as the multi-line block shown to the user."""
    return (
        f"📋 समान केस {case_number} (Similarity: {similarity_score:.1f}%)\n"
        f" ⚖️ प्रकार: {meta['mudda_type']}\n"
        f" 📑 विषय: {meta['subject']}\n"
        f" 👤 निवेदक: {meta['nibedak']}\n"
        f" 👤 विपक्षी: {meta['vipakshi']}\n"
        f" 🆔 निर्णय नं: {meta['decision_no']} | साल: {meta['year']}\n"
        f" 📄 प्रकरण: {_case_excerpt(meta, 'prakaran')}...\n"
        f" ⚖️ ठहर: {_case_excerpt(meta, 'thahar')}...\n"
        f" 🔗 {meta['link']}\n"
    )


def _build_rag_context(query, user_language, cases):
    """Assemble the prompt context: the user's case followed by each retrieved case."""
    parts = [
        "\n"
        "=== प्रयोगकर्ताको हालको मुद्दा / User's Current Case ===\n"
        f"भाषा/Language: {user_language}\n"
        f"मुद्दा/Case: {query}\n"
        "=== डेटाबेसबाट भेटिएका समान मुद्दाहरू / Retrieved Similar Cases ===\n"
    ]
    for case in cases:
        parts.append(
            "\n"
            f"【समान केस {case['case_number']} - समानता: {case['similarity_score']:.1f}%】\n"
            f"• मुद्दाको किसिम: {case['mudda_type']}\n"
            f"• विषय: {case['subject']}\n"
            f"• निवेदक: {case['nibedak']}\n"
            f"• विपक्षी: {case['vipakshi']}\n"
            f"• निर्णय नं: {case['decision_no']} | साल: {case['year']}\n"
            f"• प्रकरण विवरण: {case['prakaran']}\n"
            f"• अदालतको ठहर: {case['thahar']}\n"
            f"• पूर्ण केस विवरण: {case['full_text']}\n"
            f"• सन्दर्भ लिङ्क: {case['link']}\n"
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        )
    return "".join(parts)


def search_cases(query: str) -> tuple:
    """Find the five most similar stored cases for a query.

    Args:
        query: Free-text description of the user's case.

    Returns:
        A 3-tuple ``(formatted_output, rag_context, query)``:
        ``formatted_output`` is the human-readable result list,
        ``rag_context`` is the text block intended as model context, and
        ``query`` is the caller's original query. If anything fails during
        the search, ``("Search error: <detail>", "", "")`` is returned
        instead.

    Raises:
        Exception: If the module-level ``collection`` has not been set.
    """
    if collection is None:
        raise Exception("Vector database not initialized")

    try:
        processed_query = query.strip()
        results = collection.query(
            query_texts=[processed_query],
            n_results=5,
            include=["documents", "metadatas", "distances"],
        )

        # One result list per query text; only one query text was sent.
        hits = results["documents"][0]
        metas = results["metadatas"][0]
        distances = results["distances"][0]

        output_lines = []
        raw_cases_data = []
        for case_number, (full_text, meta, distance) in enumerate(
            zip(hits, metas, distances), start=1
        ):
            # NOTE(review): `1 - distance` only reads as a percentage when
            # the collection's distance is bounded by 1 (e.g. cosine). The
            # metric is not configured in this function - confirm.
            similarity_score = max(0, (1 - distance) * 100)

            output_lines.append(
                _format_case_summary(case_number, similarity_score, meta)
            )
            raw_cases_data.append({
                "case_number": case_number,
                "similarity_score": similarity_score,
                "mudda_type": meta['mudda_type'],
                "subject": meta['subject'],
                "nibedak": meta['nibedak'],
                "vipakshi": meta['vipakshi'],
                "decision_no": meta['decision_no'],
                "year": meta['year'],
                "prakaran": meta.get('prakaran', ''),
                "thahar": meta.get('thahar', ''),
                "full_text": full_text,
                "link": meta['link'],
            })

        # The rule gets its own line; previously the first case was glued
        # directly onto the end of the "=====" separator.
        formatted_output = "\n" + "=" * 60 + "\n" + "\n".join(output_lines)

        rag_context = _build_rag_context(
            query, detect_language(query), raw_cases_data
        )
        return formatted_output, rag_context, query
    except Exception as e:
        error_msg = f"Search error: {e}"
        return error_msg, "", ""
def _case_analysis_system_prompt(is_nepali):
    """Return the system prompt for comparative case analysis in the chosen language."""
    if is_nepali:
        return "\n".join((
            "तपाईं एक अनुभवी नेपाली कानूनी विश्लेषण विशेषज्ञ हुनुहुन्छ। तपाईंको विशेषज्ञता:",
            "🎯 मुख्य कार्यहरू:",
            "• प्रयोगकर्ताको मुद्दालाई समान पुराना केसहरूसँग तुलना गर्ने",
            "• कानूनी precedent र pattern पहिचान गर्ने",
            "• संभावित परिणाम र रणनीतिक सुझावहरू दिने",
            "• समानता र भिन्नताहरूको गहिरो विश्लेषण",
            "📋 विश्लेषण ढाँचा:",
            '• स्पष्ट रूपमा "समान केस १", "समान केस २" भनेर उल्लेख गर्नुहोस्',
            "• प्रत्येक समान केसको relevance बताउनुहोस्",
            "• कानूनी तर्कहरू र precedent cite गर्नुहोस्",
            "• व्यावहारिक सुझावहरू दिनुहोस्",
            "⚖️ महत्वपूर्ण: तपाईं कानूनी सल्लाह दिनुहुन्न, केवल जानकारीमूलक comparative analysis गर्नुहुन्छ।",
            "वकिलहरूलाई उनीहरूको case preparation मा सहयोग गर्ने उद्देश्यले विश्लेषण गर्नुहोस्।",
        ))
    return "\n".join((
        "You are an experienced Nepali legal analysis expert. Your expertise includes:",
        "🎯 Primary Functions:",
        "• Compare user's case with similar historical cases",
        "• Identify legal precedents and patterns",
        "• Provide insights on potential outcomes and strategic considerations",
        "• Conduct deep analysis of similarities and differences",
        "📋 Analysis Format:",
        '• Clearly reference cases as "समान केस १", "समान केस २", etc.',
        "• Explain relevance of each similar case",
        "• Cite legal reasoning and precedents",
        "• Provide practical insights for legal professionals",
        "⚖️ Important: You provide informational comparative analysis only, not legal advice.",
        "Focus on helping lawyers with case preparation and research.",
    ))


def _general_chat_system_prompt(is_nepali):
    """Return the system prompt for general legal questions in the chosen language."""
    if is_nepali:
        return "\n".join((
            "तपाईं एक नेपाली कानूनी जानकारी विशेषज्ञ हुनुहुन्छ। तपाईंले:",
            "🏛️ नेपाली कानूनी प्रणाली:",
            "• नेपालको संविधान, ऐन कानूनका बारेमा जानकारी",
            "• अदालती प्रक्रिया र न्यायिक प्रणालीको व्याख्या",
            "• कानूनी अधिकार र कर्तव्यहरूको जानकारी",
            "📚 सेवाहरू:",
            "• कानूनी प्रक्रियाको स्पष्टीकरण",
            "• अधिकार र कर्तव्यको जानकारी",
            "• कानूनी वैधता/अवैधताको सामान्य जानकारी",
            "⚠️ सीमाहरू:",
            "• व्यक्तिगत कानूनी सल्लाह दिनु हुँदैन",
            "• केवल सामान्य जानकारीमूलक सहायता",
            "• विशिष्ट केसका लागि योग्य वकिलसँग सल्लाह लिन सुझाव",
        ))
    return "\n".join((
        "You are a Nepali legal information expert. You provide:",
        "🏛️ Nepali Legal System Information:",
        "• Constitution, laws and legal framework of Nepal",
        "• Court procedures and judicial system explanations",
        "• Legal rights and duties information",
        "📚 Services:",
        "• Legal process clarification",
        "• Rights and obligations information",
        "• General legal validity/invalidity information",
        "⚠️ Limitations:",
        "• Do NOT provide personal legal advice",
        "• Only general informational assistance",
        "• Always recommend consulting qualified lawyers for specific cases",
    ))


def chat_with_cases(message: str, cases_context: str = "", user_case: str = "", mode: str = "general") -> str:
    """Answer a user message with Gemini, optionally grounded in retrieved cases.

    Args:
        message: The user's question. Its language (any Devanagari character
            means Nepali) selects the prompt and the fallback messages.
        cases_context: Retrieved-case context to embed in the prompt. Only
            used when ``mode`` is ``"case_analysis"`` and this is non-empty.
        user_case: Accepted for interface compatibility; not used here.
        mode: ``"case_analysis"`` for comparative analysis over
            ``cases_context``; any other value uses the general chat prompt.

    Returns:
        The model's reply, with an informational-only disclaimer appended in
        ``"case_analysis"`` mode when the reply does not already mention
        legal advice. On any failure a generic apology in the user's
        language is returned instead.
    """
    # Detected once, up front, so the error path below can reuse it.
    is_nepali = bool(re.search(r'[\u0900-\u097F]', message or ""))

    try:
        if mode == "case_analysis" and cases_context:
            # RAG mode: system prompt, retrieved cases, then the question.
            system_prompt = _case_analysis_system_prompt(is_nepali)
            prompt = (
                f"{system_prompt}\n"
                f"{cases_context}\n"
                f"प्रयोगकर्ताको प्रश्न: {message}\n"
                "Professional Legal Analysis:"
            )
        else:
            system_prompt = _general_chat_system_prompt(is_nepali)
            prompt = (
                f"{system_prompt}\n"
                f"User Question: {message}\n"
                "Response:"
            )

        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt
        )

        raw_text = response.text
        if raw_text is None:
            # Guard for a response that carries no text; handled by the
            # generic error path below with a clearer log line.
            raise ValueError("model response contained no text")
        response_text = raw_text.strip()

        # Append the disclaimer only if the reply has none. Both the English
        # and the Nepali phrase are checked; checking English alone meant a
        # Nepali reply always received a second disclaimer.
        has_disclaimer = (
            "legal advice" in response_text.lower()
            or "कानूनी सल्लाह" in response_text
        )
        if mode == "case_analysis" and not has_disclaimer:
            if is_nepali:
                response_text += "\n\n⚠️ नोट: यो केवल जानकारीमूलक विश्लेषण हो। विशिष्ट कानूनी सल्लाहका लागि योग्य वकिलसँग सल्लाह लिनुहोस्।"
            else:
                response_text += "\n\n⚠️ Note: This is informational analysis only. Consult qualified lawyers for specific legal advice."
        return response_text
    except Exception as e:
        print(f"Chat error: {e}")
        if is_nepali:
            return "माफ गर्नुहोस्, प्राविधिक समस्या भयो। कृपया केही बेरमा फेरि प्रयास गर्नुहोस्।"
        return "Sorry, there was a technical issue. Please try again in a moment."
def get_legal_advice_general(question: str) -> str:
    """Answer a general legal question with Gemini, without any case context.

    The question's language (any Devanagari character means Nepali) selects
    the system prompt and the language of the error message.

    Args:
        question: The user's question.

    Returns:
        The model's reply with surrounding whitespace stripped, or an error
        message containing the exception text if the request fails.
    """
    nepali_system_prompt = "\n".join((
        "तपाईं नेपाली कानूनी जानकारी सहायक हुनुहुन्छ।",
        "🎯 तपाईंको भूमिका:",
        "• नेपाली कानूनका बारेमा सामान्य जानकारी प्रदान गर्ने",
        "• अदालती प्रक्रिया र न्यायिक प्रणालीको बारेमा बताउने",
        "• नागरिक अधिकार र कर्तव्यहरूको जानकारी दिने",
        "• कानूनी प्रक्रियाहरूको स्पष्टीकरण",
        "⚖️ सिद्धान्तहरू:",
        "• स्पष्ट र बुझ्ने भाषामा जवाफ दिनुहोस्",
        "• व्यावहारिक जानकारी प्रदान गर्नुहोस्",
        "• सधैं योग्य वकिलसँग सल्लाह लिन सुझाव दिनुहोस्",
        "• व्यक्तिगत कानूनी सल्लाह नदिनुहोस्",
    ))
    english_system_prompt = "\n".join((
        "You are a Nepali legal information assistant.",
        "🎯 Your Role:",
        "• Provide general information about Nepali law",
        "• Explain court procedures and judicial system",
        "• Share information about civil rights and duties",
        "• Clarify legal processes",
        "⚖️ Principles:",
        "• Respond in clear, understandable language",
        "• Provide practical information",
        "• Always recommend consulting qualified lawyers",
        "• Do not give personal legal advice",
    ))

    is_nepali = re.search(r'[\u0900-\u097F]', question) is not None
    system_prompt = nepali_system_prompt if is_nepali else english_system_prompt
    prompt = f"{system_prompt}\nप्रश्न: {question}\nजानकारीमूलक उत्तर:"

    try:
        reply = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt
        )
        return reply.text.strip()
    except Exception as err:
        error_template = (
            "माफ गर्नुहोस्, त्रुटि: {}" if is_nepali else "Sorry, error occurred: {}"
        )
        return error_template.format(str(err))
# Additional utility functions for enhanced features
def generate_case_comparison_matrix(cases_data):
    """Render a fixed-width text table comparing several cases.

    Args:
        cases_data: Iterable of dicts with the keys ``case_number``,
            ``mudda_type`` and ``year``, and optionally ``similarity_score``.
            A ``None`` value in any of these is rendered as an empty cell
            (or ``0.0%`` for the score) instead of raising.

    Returns:
        The table as a string ending in a newline, or ``""`` when
        ``cases_data`` is empty or ``None``. The case type is truncated to
        18 characters.
    """
    if not cases_data:
        return ""

    def _cell(value):
        # None cannot be padded with a format spec such as "<8".
        return "" if value is None else value

    lines = [
        "\n📊 केस तुलना म्याट्रिक्स (Case Comparison Matrix)",
        "=" * 60,
        f"{'केस नं':<8} {'प्रकार':<20} {'साल':<8} {'समानता':<10}",
        "-" * 60,
    ]
    for case in cases_data:
        mudda_type = str(_cell(case['mudda_type']))[:18]
        similarity = case.get('similarity_score') or 0
        lines.append(
            f"{_cell(case['case_number']):<8} {mudda_type:<20} "
            f"{_cell(case['year']):<8} {similarity:.1f}%"
        )
    return "\n".join(lines) + "\n"
def extract_key_legal_points(case_text):
    """Ask Gemini to extract the key legal points from a case description.

    The prompt (in Nepali) asks for three headed sections: main legal
    issues, applicable laws, and required evidence.

    Args:
        case_text: The case text to analyse; interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or the fixed
        Nepali message "कानूनी बुँदाहरू निकाल्न सकिएन।" if the request fails.
    """
    prompt = (
        "\n"
        "यो नेपाली कानूनी केसबाट मुख्य कानूनी बुँदाहरू निकाल्नुहोस्:\n"
        f"{case_text}\n"
        "निम्न ढाँचामा उत्तर दिनुहोस्:\n"
        "🔍 मुख्य कानूनी मुद्दाहरू:\n"
        "⚖️ लागू हुने कानूनहरू:\n"
        "📋 आवश्यक प्रमाणहरू:\n"
    )
    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt
        )
        return response.text.strip()
    except Exception as e:
        # Was a bare `except:`, which also trapped KeyboardInterrupt and
        # SystemExit and hid the cause; the failure is now logged.
        print(f"Key point extraction error: {e}")
        return "कानूनी बुँदाहरू निकाल्न सकिएन।"
# Database health check
def check_database_health():
    """Report the state of the vector collection as a one-line status string.

    Returns:
        "✅ Database healthy: <n> cases loaded" when the module-level
        ``collection`` is set, "❌ Database not initialized" when it is not,
        or "❌ Database error: <detail>" if checking it raises.
    """
    try:
        if not collection:
            return "❌ Database not initialized"
        case_count = collection.count()
        return f"✅ Database healthy: {case_count} cases loaded"
    except Exception as err:
        return f"❌ Database error: {str(err)}"