""" Iroh Legal Research Assistant Powered by Kenya Law database — Legislation + Case Law """ import sys from pathlib import Path sys.path.append(str(Path(__file__).parent.parent)) import streamlit as st import chromadb from sentence_transformers import SentenceTransformer from openai import OpenAI import random import os from huggingface_hub import snapshot_download if not os.path.exists("data/chroma_db/chroma.sqlite3"): os.makedirs("data/chroma_db", exist_ok=True) snapshot_download( repo_id="Daudipdg/iroh-chroma-db", repo_type="dataset", local_dir="data/chroma_db" ) # --- Config --- CHROMA_DIR = "data/chroma_db" COLLECTION_NAME = "kenya_law" EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "") TOP_K = 15 GREETINGS = {"hey", "hi", "hello", "helo", "sup", "yo", "howdy", "hii", "hey there"} # --- Loading messages --- RETRIEVAL_MESSAGES = [ "📂 Pulling files from the registry...", "🔍 Scanning the Kenya Law Reports...", "📜 Consulting the statute books...", "⚖️ Weighing the evidence...", "🗂️ Cross-referencing case law...", "📋 Checking the court records...", "🏛️ Walking the corridors of the High Court...", "📌 Pinning down the relevant provisions...", ] SYNTHESIS_MESSAGES = [ "🧠 Counsel is reviewing the brief...", "✍️ Drafting the legal position...", "📖 Citing chapter and verse...", "🔎 Stress-testing the argument...", "⚖️ Balancing the scales...", "📝 Putting pen to paper...", "🏛️ Consulting precedent...", "🎯 Sharpening the legal strategy...", ] # --- Page Config --- st.set_page_config( page_title="Iroh — Kenyan Legal Research", page_icon="⚖️", layout="wide" ) # --- Custom CSS --- st.markdown(""" """, unsafe_allow_html=True) # --- Load models --- @st.cache_resource def load_models(): client = chromadb.PersistentClient(path=CHROMA_DIR) collection = client.get_collection(COLLECTION_NAME) embedder = SentenceTransformer(EMBED_MODEL) llm = OpenAI( api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com" ) return collection, embedder, llm collection, embedder, llm = load_models() # --- Prompts --- SYSTEM_PROMPT = """You are Iroh, a senior Kenyan advocate and legal research assistant. You have deep expertise in Kenyan law — legislation, case law, civil and criminal procedure. When someone greets you or asks what you can do, respond warmly and explain your capabilities. For legal queries, your responses must: 1. Identify the applicable legal framework (Acts, Rules, case law) 2. State the legal position clearly 3. Cite specific sections of Acts or case references from the sources provided 4. Outline the procedure — who does what, where, in what order, within what timelines 5. Provide concrete next steps the lawyer should take immediately 6. Flag any gaps where more information is needed Only cite sources provided to you. Do not invent citations. If sources are insufficient, say so explicitly and suggest where to look.""" DRAFT_SYSTEM_PROMPT = """You are Iroh, a senior Kenyan advocate specializing in legal drafting. You have access to Kenyan legislation, Civil Procedure Rules, Criminal Procedure Code, and sample legal documents. For every drafting request: 1. LEGAL BASIS — identify the governing statute and specific sections 2. PROCEDURE — step by step process: who files what, in which court, in what order, with what timelines 3. DRAFT DOCUMENT — produce a complete, properly formatted draft using correct Kenyan legal language 4. PRESCRIBED FORMS — list any statutory forms required, which Act prescribes them, and the Kenya Law link 5. IMPORTANT NOTES — what the lawyer must verify or obtain before filing Your drafts must: - Follow the exact format and structure used in Kenyan courts - Use the sample documents provided as formatting references - Cite the sections that authorize or require each element of the document - Be complete enough that a lawyer can review and file with minimal changes Only use sources provided. Do not invent forms or citations.""" # --- Core functions --- def is_conversational(text: str) -> bool: t = text.strip().lower() if t in GREETINGS: return True if len(t.split()) <= 5 and any(t.startswith(g) for g in GREETINGS): return True conversational_phrases = [ "how can you help", "what can you do", "what is iroh", "who are you", "what are you", "help me", "how do you work", "what do you do", "can you help" ] return any(phrase in t for phrase in conversational_phrases) def rewrite_query(user_input: str) -> str: response = llm.chat.completions.create( model="deepseek-chat", messages=[ { "role": "system", "content": """You are a Kenyan legal search query optimizer. Convert the user's legal question into a concise 5-10 word search query. Focus on: the specific legal document type, the governing Act name, and the legal concept. Examples: - 'draft a plaint for breach of contract' → 'plaint institution of suit Civil Procedure Act Order 4' - 'how do I file at ELRC for dismissal' → 'unfair termination Employment Act ELRC claim' - 'bail application for murder charge' → 'bail application criminal charge Criminal Procedure Code' - 'petition for divorce' → 'divorce petition Marriage Act matrimonial cause' - 'landlord wants to evict my client' → 'eviction notice landlord tenant Rent Restriction Act' - 'how to file a succession matter' → 'grant of probate succession Law of Succession Act' - 'company director breach of fiduciary duty' → 'director fiduciary duty breach Companies Act' Return ONLY the search query, nothing else.""" }, {"role": "user", "content": user_input} ], max_tokens=50 ) return response.choices[0].message.content.strip() def retrieve(query: str, n_results: int = TOP_K) -> list[dict]: embedding = embedder.encode([query]).tolist() results = collection.query(query_embeddings=embedding, n_results=n_results) chunks = [] for doc, meta in zip(results["documents"][0], results["metadatas"][0]): chunks.append({ "text": doc, "title": meta.get("title", ""), "url": meta.get("url", ""), "section": meta.get("section", ""), "type": meta.get("type", "") }) return chunks def build_context(chunks: list[dict]) -> str: context = "RELEVANT KENYAN LAW & PROCEDURE:\n\n" for i, chunk in enumerate(chunks): source = chunk["title"] if chunk["section"]: source += f" — {chunk['section']}" if chunk["url"]: source += f"\nSource: {chunk['url']}" context += f"[{i+1}] {source}\n{chunk['text']}\n\n---\n\n" return context def get_conversational_response(message: str) -> str: response = llm.chat.completions.create( model="deepseek-chat", messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": message} ], max_tokens=500 ) return response.choices[0].message.content def synthesize(fact_pattern: str, chunks: list[dict]) -> str: context = build_context(chunks) user_message = f"""FACT PATTERN: {fact_pattern} {context} Based on the Kenyan law and procedure provided above, analyze this fact pattern and provide: 1. APPLICABLE LAW — which statutes/sections apply and why 2. LEGAL POSITION — what the law says about this situation 3. PROCEDURE — the step by step process the lawyer must follow 4. PRACTICAL IMPLICATIONS — what this means for the client 5. NEXT STEPS — concrete, numbered actions the lawyer should take immediately 6. GAPS — what additional facts or documents are needed 7. CITATIONS — list all sources used with their URLs""" response = llm.chat.completions.create( model="deepseek-reasoner", messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_message} ], max_tokens=3000 ) return response.choices[0].message.content def synthesize_draft(problem: str, chunks: list[dict]) -> str: context = build_context(chunks) user_message = f"""LEGAL PROBLEM: {problem} {context} Based on the Kenyan law and procedure provided, produce: 1. LEGAL BASIS — which statutes/sections govern this matter 2. PROCEDURE — step by step: who files what, in which court, in what order, within what timelines 3. DRAFT DOCUMENT — a complete, properly formatted draft ready for lawyer review 4. PRESCRIBED FORMS — list any statutory forms required with the Act and Kenya Law link 5. IMPORTANT NOTES — anything the lawyer must verify before filing""" response = llm.chat.completions.create( model="deepseek-chat", messages=[ {"role": "system", "content": DRAFT_SYSTEM_PROMPT}, {"role": "user", "content": user_message} ], max_tokens=4000 ) return response.choices[0].message.content def render_sources(chunks: list[dict]): st.subheader("📚 Sources Retrieved") seen = set() for chunk in chunks: if chunk["title"] not in seen: seen.add(chunk["title"]) type_icons = { "legislation": "📜", "case_law": "⚖️", "procedure_guide": "📋", "template": "📄" } icon = type_icons.get(chunk["type"], "📌") with st.expander(f"{icon} {chunk['title'][:70]}"): if chunk["url"]: st.markdown(f"**Link:** [{chunk['url']}]({chunk['url']})") st.markdown(f"**Type:** `{chunk['type']}`") st.caption(chunk["text"][:400] + "...") # --- UI --- st.markdown('