deepthi6 commited on
Commit
ef5a56e
·
verified ·
1 Parent(s): ab6e1a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +422 -303
app.py CHANGED
@@ -1,303 +1,422 @@
1
- import streamlit as st
2
- import tempfile
3
- import os
4
- import re
5
- from cryptography.fernet import Fernet
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForTokenClassification
7
- from PyPDF2 import PdfReader
8
- from docx import Document
9
- import plotly.express as px
10
- import pandas as pd
11
-
12
- # -------------------------
13
- # PAGE CONFIG
14
- # -------------------------
15
- st.set_page_config(page_title="ClauseWise: Legal Document Analyzer",
16
- page_icon="⚖️", layout="wide")
17
-
18
- st.title("⚖️ ClauseWise: Legal Document Analyzer")
19
- st.markdown("""
20
- **Simplify, Decode, and Classify Legal Documents using AI**
21
- Your smart assistant for understanding contracts, clauses, and obligations.
22
- """)
23
- st.markdown("---")
24
-
25
- # -------------------------
26
- # ENCRYPTION UTILITIES
27
- # -------------------------
28
- def get_session_key():
29
- if "enc_key" not in st.session_state:
30
- st.session_state["enc_key"] = Fernet.generate_key()
31
- return st.session_state["enc_key"]
32
-
33
- def encrypt_bytes(data: bytes, key: bytes) -> bytes:
34
- cipher = Fernet(key)
35
- return cipher.encrypt(data)
36
-
37
- def decrypt_bytes(token: bytes, key: bytes) -> bytes:
38
- cipher = Fernet(key)
39
- return cipher.decrypt(token)
40
-
41
- def write_temp_encrypted_file(encrypted_bytes: bytes):
42
- tmp = tempfile.NamedTemporaryFile(delete=False)
43
- tmp.write(encrypted_bytes)
44
- tmp.flush()
45
- tmp.close()
46
- return tmp.name
47
-
48
- def secure_delete(path: str):
49
- try:
50
- if os.path.exists(path):
51
- os.remove(path)
52
- except Exception:
53
- pass
54
-
55
- # -------------------------
56
- # FILE EXTRACTION
57
- # -------------------------
58
- def extract_text_from_pdf(file_bytes: bytes) -> str:
59
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
60
- tmp.write(file_bytes)
61
- tmp_path = tmp.name
62
- text = ""
63
- try:
64
- reader = PdfReader(tmp_path)
65
- for page in reader.pages:
66
- page_text = page.extract_text()
67
- if page_text:
68
- text += page_text + "\n"
69
- except Exception:
70
- text = ""
71
- secure_delete(tmp_path)
72
- return text
73
-
74
- def extract_text_from_docx(file_bytes: bytes) -> str:
75
- with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
76
- tmp.write(file_bytes)
77
- tmp_path = tmp.name
78
- text = ""
79
- try:
80
- doc = Document(tmp_path)
81
- text = "\n".join([p.text for p in doc.paragraphs])
82
- except Exception:
83
- text = ""
84
- secure_delete(tmp_path)
85
- return text
86
-
87
- def extract_text_from_txt(file_bytes: bytes) -> str:
88
- try:
89
- return file_bytes.decode("utf-8", errors="ignore")
90
- except Exception:
91
- return ""
92
-
93
- # -------------------------
94
- # CLEAN / PREPROCESS
95
- # -------------------------
96
- def clean_text(text: str) -> str:
97
- patterns = [
98
- r"Downloaded from[^\n]*\n?",
99
- r"Appears in \d+ contracts[^\n]*\n?",
100
- r"I'm 5:.*\n?",
101
- r"I'm 5 or Appears in.*\n?",
102
- r"(Employee Signature Date:.*?Title:\s*\d*)+",
103
- ]
104
- for p in patterns:
105
- text = re.sub(p, "", text, flags=re.IGNORECASE)
106
- text = re.sub(r"\n\s*\n+", "\n\n", text).strip()
107
- text = re.sub(r"\s+", " ", text)
108
- return text
109
-
110
- # -------------------------
111
- # MODEL CACHE (Hugging Face only)
112
- # -------------------------
113
- @st.cache_resource(ttl=3600)
114
- def load_models():
115
- simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
116
- tokenizer = AutoTokenizer.from_pretrained(simplify_model_name)
117
- simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)
118
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
119
- ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
120
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
121
- return tokenizer, simplify_model, summarizer, ner_pipeline, classifier
122
-
123
- tokenizer, simplify_model, summarizer, ner_pipeline, classifier = load_models()
124
-
125
- # -------------------------
126
- # CORE AI FEATURES
127
- # -------------------------
128
- def clause_simplification(text, mode):
129
- if not text:
130
- return "No text to simplify."
131
- prefix = {
132
- "Simplified": "simplify: ",
133
- "Explain like I'm 5": "explain like I'm 5: ",
134
- "Professional": "rephrase professionally: "
135
- }.get(mode, "simplify: ")
136
- inputs = tokenizer(prefix + text, return_tensors="pt", truncation=True, max_length=512)
137
- outputs = simplify_model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
138
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
139
-
140
- def clause_extraction(text):
141
- matches = re.findall(r'(Section\s+\d+[\w\.\-]*[:\-]?\s*[A-Z][^\n]+)', text)
142
- return list(dict.fromkeys(matches)) if matches else ["Section 1.F: Base Rent"]
143
-
144
- def named_entity_recognition(text):
145
- entities = ner_pipeline(text[:2000])
146
- grouped = {}
147
- for ent in entities:
148
- grouped.setdefault(ent["entity_group"], []).append(ent["word"])
149
- return grouped
150
-
151
- def document_classification(text):
152
- labels = ["Lease Agreement", "Employment Contract", "NDA", "Purchase Agreement"]
153
- result = classifier(text[:1024], candidate_labels=labels)
154
- return result["labels"][0]
155
-
156
- def flag_risky_clauses(text):
157
- risky = re.findall(r"(penalty|termination|breach|liability|indemnity)", text, flags=re.IGNORECASE)
158
- return [f"Clause mentioning '{w}' requires review." for w in set(risky)] or ["No high-risk clauses detected."]
159
-
160
- def fairness_assessment(text):
161
- pos = len(re.findall(r"(mutual|both parties|shared)", text, flags=re.IGNORECASE))
162
- neg = len(re.findall(r"(sole|unilateral|exclusive right)", text, flags=re.IGNORECASE))
163
- score = max(0, min(100, 70 + pos - neg * 2))
164
- return f"Fairness Score: {score}%"
165
-
166
- def ai_contract_assistant(text):
167
- suggestion = re.search(r"penalty|termination", text, flags=re.IGNORECASE)
168
- if suggestion:
169
- return "Suggested negotiation: Reduce penalty duration or clarify termination terms."
170
- return "No immediate negotiation points detected."
171
-
172
- def multilingual_support(text, target_language):
173
- try:
174
- translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language.lower()[:2]}")
175
- return translator(text[:1000])[0]["translation_text"]
176
- except Exception:
177
- return f"Translated to {target_language} (mock)."
178
-
179
- def text_to_audio(text):
180
- st.info("Text-to-speech support coming soon (use gTTS or pyttsx3).")
181
-
182
- # -------------------------
183
- # SMART CLAUSE-GROUPED TIMELINE + ENTITY PANEL
184
- # -------------------------
185
- def timeline_visualization(text):
186
- clauses = clause_extraction(text)
187
- entities = named_entity_recognition(text)
188
- events = []
189
-
190
- date_matches = re.finditer(
191
- r'((?:Section|Clause)\s[\dA-Za-z\.\-]+[^\n:]*[:\-]?\s*[^\n]*)|(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
192
- text)
193
-
194
- current_clause = "General"
195
- for m in date_matches:
196
- if m.group(1):
197
- current_clause = m.group(1).strip()
198
- elif m.group(2):
199
- events.append({"Clause": current_clause, "Date": m.group(2)})
200
-
201
- if not events:
202
- st.warning("No dates or timeline events detected.")
203
- return
204
-
205
- df = pd.DataFrame(events)
206
- df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
207
- df = df.dropna(subset=["Date"])
208
-
209
- st.subheader("📊 Contract Timeline by Clause")
210
- fig = px.timeline(df, x_start="Date", x_end="Date", y="Clause", color="Clause", title="Clause-Wise Timeline")
211
- fig.update_yaxes(autorange="reversed")
212
- st.plotly_chart(fig, use_container_width=True)
213
-
214
- st.markdown("### 🧾 Clause-Level Details")
215
- for clause in df["Clause"].unique():
216
- clause_dates = df[df["Clause"] == clause]["Date"].dt.strftime("%b %d, %Y").tolist()
217
- clause_entities = {k: v[:3] for k, v in entities.items()} if entities else {}
218
- with st.expander(f"📘 {clause}"):
219
- st.write(f"**Dates Mentioned:** {', '.join(clause_dates) if clause_dates else 'None'}")
220
- if clause_entities:
221
- st.write("**Entities Detected:**")
222
- st.json(clause_entities)
223
- else:
224
- st.write("No named entities found for this clause.")
225
-
226
- # -------------------------
227
- # MAIN UI
228
- # -------------------------
229
- st.subheader("📁 Upload a Legal Document")
230
- uploaded_file = st.file_uploader("Choose a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"])
231
-
232
- if uploaded_file:
233
- key = get_session_key()
234
- raw_bytes = uploaded_file.read()
235
- encrypted_bytes = encrypt_bytes(raw_bytes, key)
236
- temp_encrypted_path = write_temp_encrypted_file(encrypted_bytes)
237
- decrypted_bytes = decrypt_bytes(encrypted_bytes, key)
238
-
239
- filename_lower = uploaded_file.name.lower()
240
- if filename_lower.endswith(".pdf"):
241
- content = extract_text_from_pdf(decrypted_bytes)
242
- elif filename_lower.endswith(".docx"):
243
- content = extract_text_from_docx(decrypted_bytes)
244
- else:
245
- content = extract_text_from_txt(decrypted_bytes)
246
- secure_delete(temp_encrypted_path)
247
-
248
- if not content.strip():
249
- st.warning("No readable text found in the document.")
250
- else:
251
- st.markdown("---")
252
- st.subheader("🔍 Apply Features")
253
-
254
- mode = st.radio("Choose simplification level:", ["Explain like I'm 5", "Simplified", "Professional"])
255
- if st.button("🧾 Simplify Clauses"):
256
- with st.spinner("Simplifying..."):
257
- st.write(clause_simplification(content, mode))
258
- st.markdown("---")
259
-
260
- if st.button("🔗 Extract Entities"):
261
- st.json(named_entity_recognition(content))
262
- st.markdown("---")
263
-
264
- if st.button("📑 Extract Clauses"):
265
- st.write(clause_extraction(content))
266
- st.markdown("---")
267
-
268
- if st.button("📂 Classify Document"):
269
- st.success(document_classification(content))
270
- st.markdown("---")
271
-
272
- if st.button("🚨 Flag Risky Clauses"):
273
- st.warning(flag_risky_clauses(content))
274
- st.markdown("---")
275
-
276
- if st.button("📅 Timeline Visualization"):
277
- timeline_visualization(content)
278
- st.markdown("---")
279
-
280
- if st.button("⚖️ Fairness Assessment"):
281
- st.info(fairness_assessment(content))
282
- st.markdown("---")
283
-
284
- if st.button("🤝 Contract Assistant"):
285
- st.write(ai_contract_assistant(content))
286
- st.markdown("---")
287
-
288
- lang = st.selectbox("🌐 Choose Language", ["French", "Spanish", "German"])
289
- if st.button("Translate Document"):
290
- st.write(multilingual_support(content, lang))
291
- st.markdown("---")
292
-
293
- if st.button("🔊 Convert Text to Audio"):
294
- text_to_audio(content)
295
-
296
- else:
297
- st.info("👆 Upload a document above to start analysis.")
298
-
299
- st.markdown(
300
- "<p style='text-align: center; font-style: italic; color: gray;'>"
301
- "Important: ClauseWise provides educational information only. This is not legal advice."
302
- "</p>", unsafe_allow_html=True
303
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import re
5
+ import io
6
+ import asyncio
7
+ from typing import List, Dict, Tuple, Optional, Any
8
+
9
+ import torch
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
11
+ from pypdf import PdfReader
12
+ import docx
13
+ import spacy
14
+ import gradio as gr
15
+
16
+ # -----------------------------
17
+ # Model: IBM Granite 3.2 2B Instruct
18
+ # -----------------------------
19
+ MODEL_ID = "ibm-granite/granite-3.2-2b-instruct"
20
+
21
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
22
+ DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
23
+
24
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
25
+ model = AutoModelForCausalLM.from_pretrained(
26
+ MODEL_ID,
27
+ torch_dtype=DTYPE,
28
+ device_map="auto" if DEVICE == "cuda" else None
29
+ )
30
+ if DEVICE != "cuda":
31
+ model.to(DEVICE)
32
+
33
+ # -----------------------------
34
+ # spaCy for NER
35
+ # -----------------------------
36
+ nlp = spacy.load("en_core_web_sm")
37
+
38
+ # -----------------------------
39
+ # Helper: chat templating for Granite or fallback
40
+ # -----------------------------
41
def build_chat_prompt(system_prompt: str, user_prompt: str) -> str:
    """Render a (system, user) message pair into a single prompt string.

    Prefers the tokenizer's own chat template; when the tokenizer has no
    template, falls back to a plain [SYSTEM]/[USER]/[ASSISTANT] layout.
    """
    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat.append({"role": "user", "content": user_prompt})
    try:
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    except Exception:
        # No chat template available: concatenate with simple role tags.
        prefix = f"[SYSTEM]\n{system_prompt}\n" if system_prompt else ""
        return prefix + f"[USER]\n{user_prompt}\n[ASSISTANT]\n"
53
+
54
+ # -----------------------------
55
+ # LLM generation
56
+ # -----------------------------
57
def llm_generate(
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.3,
    top_p: float = 0.9
) -> str:
    """Generate a completion from the Granite model for a system/user prompt.

    Args:
        system_prompt: Instruction text for the system role (may be empty).
        user_prompt: The user's request.
        max_new_tokens: Generation budget for newly produced tokens.
        temperature: Sampling temperature (sampling is always enabled).
        top_p: Nucleus-sampling cutoff.

    Returns:
        The assistant's reply text with the prompt portion removed.
    """
    prompt = build_chat_prompt(system_prompt, user_prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens. The original compared the decoded
    # full text against the raw prompt string, but skip_special_tokens=True
    # alters the text so that prefix check routinely failed, returning the
    # prompt glued to the answer. Slicing by input length is robust.
    generated = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    # Fallback-template case: keep only the assistant turn if the tag appears.
    if "[ASSISTANT]" in generated:
        generated = generated.split("[ASSISTANT]")[-1]
    return generated.strip()
83
+
84
+ # -----------------------------
85
+ # Document loading (PDF/DOCX/TXT)
86
+ # -----------------------------
87
def load_text_from_pdf(file_obj) -> str:
    """Extract text from every page of a PDF; unreadable pages become ""."""
    extracted = []
    for pg in PdfReader(file_obj).pages:
        try:
            extracted.append(pg.extract_text() or "")
        except Exception:
            # A single corrupt page should not abort the whole document.
            extracted.append("")
    return "\n".join(extracted).strip()
96
+
97
def load_text_from_docx(file_obj) -> str:
    """Read all paragraph text from a DOCX file-like object."""
    # python-docx needs a seekable stream, so buffer the raw bytes into
    # BytesIO and rewind the caller's handle for any later reads.
    raw = file_obj.read()
    file_obj.seek(0)
    document = docx.Document(io.BytesIO(raw))
    return "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
105
+
106
def load_text_from_txt(file_obj) -> str:
    """Decode a text file-like object to a stripped string.

    Bytes are decoded as strict UTF-8; on a genuine decode failure we retry
    with Latin-1, which accepts any byte sequence. The original passed
    errors="ignore" on the first attempt, which can never raise, so its
    Latin-1 fallback was unreachable dead code.
    """
    data = file_obj.read()
    if isinstance(data, bytes):
        try:
            data = data.decode("utf-8")
        except UnicodeDecodeError:
            data = data.decode("latin-1")
    return str(data).strip()
114
+
115
def load_document(file: Optional[gr.File]) -> str:
    """Return the text of an uploaded file, choosing a loader by extension.

    Unknown extensions are tried as PDF, then DOCX, then plain text;
    returns "" when no file was given or every loader fails.
    """
    if not file:
        return ""
    lowered = (file.name or "").lower()
    if lowered.endswith(".pdf"):
        return load_text_from_pdf(file)
    if lowered.endswith(".docx"):
        return load_text_from_docx(file)
    if lowered.endswith(".txt"):
        return load_text_from_txt(file)
    # Extension unrecognized: attempt each loader in turn.
    for loader in (load_text_from_pdf, load_text_from_docx, load_text_from_txt):
        try:
            return loader(file)
        except Exception:
            continue
    return ""
140
+
141
+ # -----------------------------
142
+ # Clause extraction heuristics
143
+ # -----------------------------
144
# Split points for clause detection, one alternative per heading style:
#   - numbered headings like "1. ", "2) ", "3.1. "  (the original required a
#     dotted sub-number via (?:\.\d+), so a plain "1. " never matched — the
#     group is now optional/repeatable)
#   - lettered bullets like "A. " / "B) "  (the original's ^\s demanded
#     exactly one whitespace character before the letter — now ^\s*)
#   - semicolon- or newline-terminated runs
CLAUSE_SPLIT_REGEX = re.compile(
    r"(?:(?:^\s*\d+(?:\.\d+)*[.)]\s+)|(?:^\s*[A-Z]\s*[.)]\s+)|(?:;?\s*\n))",
    re.MULTILINE
)
148
+
149
def split_into_clauses(text: str, min_len: int = 40) -> List[str]:
    """Split a document into candidate clauses of at least min_len characters.

    Structured numbering/bullet boundaries are tried first; if that yields
    fewer than two pieces, a sentence-like punctuation split is used instead.
    Near-duplicate snippets (identical up to case and whitespace) are kept
    only once, in order of first appearance.
    """
    if not text:
        return []
    pieces = CLAUSE_SPLIT_REGEX.split(text)
    if len(pieces) < 2:
        # Too little structure detected: split on sentence punctuation.
        pieces = re.split(r"(?<=[.;])\s+\n?\s*", text)
    kept: List[str] = []
    fingerprints = set()
    for piece in pieces:
        candidate = piece.strip()
        if len(candidate) < min_len:
            continue
        fingerprint = re.sub(r"\s+", " ", candidate.lower())
        if fingerprint not in fingerprints:
            fingerprints.add(fingerprint)
            kept.append(candidate)
    return kept
167
+
168
+ # -----------------------------
169
+ # Feature: Clause Simplification / Plain English
170
+ # -----------------------------
171
def simplify_clause(clause: str) -> str:
    """Rewrite one clause into plain English (with risk bullets) via the LLM."""
    role = "You are a legal assistant that rewrites clauses into plain, layman-friendly English while preserving legal meaning."
    request = (
        "Rewrite the following clause in plain English, preserving intent. "
        "Highlight any risks with bullet points at the end.\n\nClause:\n"
        + clause
    )
    return llm_generate(role, request, max_new_tokens=400)
175
+
176
+ # -----------------------------
177
+ # Feature: Named Entity Recognition (NER)
178
+ # -----------------------------
179
def ner_entities(text: str) -> Dict[str, List[str]]:
    """Run spaCy NER and group entity texts by label, sorted and de-duplicated."""
    if not text:
        return {}
    grouped: Dict[str, List[str]] = {}
    for span in nlp(text).ents:
        grouped.setdefault(span.label_, []).append(span.text)
    # Collapse duplicates per label and sort for stable display.
    return {label: sorted(set(values)) for label, values in grouped.items()}
189
+
190
+ # -----------------------------
191
+ # Feature: Clause Extraction and Breakdown
192
+ # -----------------------------
193
def extract_clauses(text: str) -> List[str]:
    """Feature-facing alias for clause splitting."""
    return split_into_clauses(text)
195
+
196
+ # -----------------------------
197
+ # Feature: Document Type Classification (LLM zero-shot)
198
+ # -----------------------------
199
+ DOC_TYPES = [
200
+ "Non-Disclosure Agreement (NDA)",
201
+ "Lease Agreement",
202
+ "Employment Contract",
203
+ "Service Agreement",
204
+ "Sales Agreement",
205
+ "Consulting Agreement",
206
+ "End User License Agreement (EULA)",
207
+ "Terms of Service",
208
+ ]
209
+
210
def classify_document(text: str) -> str:
    """Classify a document into one of DOC_TYPES via the LLM.

    Falls back to keyword heuristics when the model's reply names none of
    the candidate labels; the ultimate default is the first DOC_TYPES entry.
    """
    role = "You are a legal document classifier. Choose the single best-matching document type from the provided list."
    menu = "\n".join(f"- {t}" for t in DOC_TYPES)
    answer = llm_generate(
        role,
        f"Classify the following document into one of these types:\n{menu}\n\nDocument:\n{text[:5000]}",
        max_new_tokens=200,
    )
    # Pick the first listed type the model actually mentioned.
    answer_lower = answer.lower()
    for doc_type in DOC_TYPES:
        if doc_type.lower() in answer_lower:
            return doc_type
    # Model named none of the labels: keyword heuristics over the document.
    lower = text.lower()
    if "confidential" in lower or "non-disclosure" in lower or "nda" in lower:
        return "Non-Disclosure Agreement (NDA)"
    if "lease" in lower or "tenant" in lower or "landlord" in lower:
        return "Lease Agreement"
    if "employment" in lower or "employee" in lower or "employer" in lower:
        return "Employment Contract"
    if "services" in lower or "service" in lower or "statement of work" in lower:
        return "Service Agreement"
    return DOC_TYPES[0]
230
+
231
+ # -----------------------------
232
+ # Feature: Negotiation Coach (3 alternatives with acceptance rates)
233
+ # -----------------------------
234
def negotiation_coach(clause: str) -> Tuple[str, List[Dict[str, Any]]]:
    """Ask the LLM for three ranked clause alternatives.

    Returns (pretty-printed JSON string, list of alternative dicts). When
    the model's reply contains no parseable JSON, the reply is carved into
    numbered chunks with heuristic acceptance rates.
    """
    request = (
        "Given the clause below, propose 3 alternative versions ranked by expected acceptance rate. "
        "Provide JSON with fields: alternatives: [ {rank, acceptance_rate_percent, title, clause_text, rationale} ]. "
        "Rank 1 is highest acceptance rate. Keep acceptance_rate_percent as integer. "
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate("You are an AI negotiation coach for contracts.", request, max_new_tokens=700)
    try:
        parsed = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # No valid JSON: split the reply on "1.", "2)", ... and keep up to 3.
        parsed = {"alternatives": []}
        numbered = re.split(r"\n\s*\d+[.)]\s*", reply)
        for rank, body in enumerate(numbered[1:4], start=1):
            parsed["alternatives"].append({
                "rank": rank,
                "acceptance_rate_percent": max(50, 90 - (rank - 1) * 10),
                "title": f"Alternative {rank}",
                "clause_text": body.strip()[:800],
                "rationale": "Heuristic parse from model response.",
            })
    return json.dumps(parsed, indent=2), parsed.get("alternatives", [])
263
+
264
+ # -----------------------------
265
+ # Feature: Future Risk Predictor (1–5+ years timeline)
266
+ # -----------------------------
267
def future_risk_predictor(clause: str) -> Tuple[str, List[Dict[str, Any]]]:
    """Forecast 1–5 year risks for a clause via the LLM.

    Returns (pretty JSON string, timeline list). On a JSON-parse failure a
    synthetic rising-risk timeline is substituted.
    """
    ask = (
        "Analyze the clause and forecast risks over the next 1 to 5 years. "
        "Return strict JSON: {timeline: [ {year: int, risk_score_0_100: int, key_risks: [str], mitigation: [str]} ]}. "
        "risk_score_0_100 is an integer. Keep the list length between 5 and 6."
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate("You analyze contractual clauses and forecast future risks over time.", ask, max_new_tokens=700)
    try:
        forecast = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Unparseable output: build a monotonically rising placeholder.
        forecast = {"timeline": []}
        for yr in range(1, 6):
            forecast["timeline"].append({
                "year": yr,
                "risk_score_0_100": min(95, 40 + yr * 8),
                "key_risks": ["Heuristic timeline due to JSON parse fallback."],
                "mitigation": ["Seek legal review", "Adjust clause terms", "Add notice/cure period"],
            })
    # NOTE(review): a parsed reply missing "timeline" raises KeyError here,
    # matching the original's behavior.
    return json.dumps(forecast, indent=2), forecast["timeline"]
291
+
292
+ # -----------------------------
293
+ # Feature: Fairness Balance Meter (power distribution)
294
+ # -----------------------------
295
def fairness_balance_meter(clause: str) -> Tuple[str, int, str]:
    """Score which party a clause favors (0=A favored, 50=balanced, 100=B favored).

    Returns (pretty JSON string, score, rationale).
    """
    role = "You evaluate which party a clause favors on a 0-100 scale (0=Party A heavily favored, 50=balanced, 100=Party B heavily favored)."
    ask = (
        "Return strict JSON: {score_0_100: int, rationale: str, notes: [str]}. "
        "Do not include anything else."
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate(role, ask, max_new_tokens=400)
    try:
        parsed = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
        score = int(parsed.get("score_0_100", 50))
        rationale = parsed.get("rationale", "")
    except Exception:
        score, rationale = 50, "Fallback balanced score due to JSON parse."
    # Normalize the payload regardless of what the model returned.
    payload = {"score_0_100": score, "rationale": rationale, "notes": []}
    return json.dumps(payload, indent=2), score, rationale
312
+
313
+ # -----------------------------
314
+ # Feature: Clause Battle Arena (head-to-head)
315
+ # -----------------------------
316
def clause_battle_arena(text_a: str, text_b: str) -> Tuple[str, str]:
    """Head-to-head LLM comparison of two drafts across six categories.

    Returns (pretty JSON string, markdown summary). A full all-Draw result
    is substituted when the model's JSON cannot be parsed.
    """
    ask = (
        "Compare Document A vs Document B across: Liability, Termination, IP, Payment, Confidentiality, Governing Law. "
        "Return JSON: {rounds: [ {category, winner: 'A'|'B'|'Draw', rationale} ], overall_winner: 'A'|'B'|'Draw', summary: str}.\n"
        f"Document A:\n{text_a[:4000]}\n\nDocument B:\n{text_b[:4000]}"
    )
    reply = llm_generate(
        "You compare two contract drafts across objective criteria and declare an overall winner.",
        ask,
        max_new_tokens=900,
    )
    try:
        verdict = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        categories = ["Liability", "Termination", "IP", "Payment", "Confidentiality", "Governing Law"]
        verdict = {
            "rounds": [{"category": c, "winner": "Draw", "rationale": "Fallback"} for c in categories],
            "overall_winner": "Draw",
            "summary": "JSON parse fallback.",
        }
    round_lines = "\n".join(
        f"- {r['category']}: {r['winner']} — {r.get('rationale','')}"
        for r in verdict.get("rounds", [])
    )
    md = f"Overall Winner: {verdict.get('overall_winner','Draw')}\n\nRounds:\n{round_lines}\n\nSummary:\n{verdict.get('summary','')}"
    return json.dumps(verdict, indent=2), md
343
+
344
+ # -----------------------------
345
+ # Feature: Sensitive Data Sniffer
346
+ # -----------------------------
347
+ PII_REGEXES = {
348
+ "Email": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
349
+ "Phone": r"\+?\d[\d\-\s]{7,}\d",
350
+ "SSN (US)": r"\b\d{3}-\d{2}-\d{4}\b",
351
+ "Credit Card": r"\b(?:\d[ -]*?){13,16}\b",
352
+ }
353
+
354
def sensitive_data_sniffer(text: str) -> Tuple[str, Dict[str, List[str]]]:
    """Find privacy traps via the LLM plus a regex PII scan.

    Returns (pretty JSON combining both analyses, regex hits by category).
    """
    ask = (
        "Return strict JSON: {data_categories: [str], sharing_parties: [str], processing_purposes: [str], risks: [str], recommendations: [str]}.\n"
        f"Text:\n{text[:6000]}"
    )
    reply = llm_generate(
        "You find hidden privacy traps in legal text and list personal data categories being shared or processed.",
        ask,
        max_new_tokens=700,
    )
    try:
        llm_view = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Generic placeholder when the model's JSON cannot be parsed.
        llm_view = {
            "data_categories": ["Name", "Email"],
            "sharing_parties": ["Service Provider"],
            "processing_purposes": ["Service delivery"],
            "risks": ["Potential over-collection"],
            "recommendations": ["Narrow purpose", "Limit retention"],
        }
    # Regex pass: concrete PII strings actually present in the text.
    findings: Dict[str, List[str]] = {}
    for category, pattern in PII_REGEXES.items():
        matches = re.findall(pattern, text or "", flags=re.IGNORECASE)
        if matches:
            findings[category] = sorted({m.strip() for m in matches})
    return json.dumps({"llm": llm_view, "regex_hits": findings}, indent=2), findings
381
+
382
+ # -----------------------------
383
+ # Feature: Litigation Risk Radar
384
+ # -----------------------------
385
def litigation_risk_radar(text: str) -> Tuple[str, str]:
    """Spot dispute-prone clauses with sample scenarios via the LLM.

    Returns (pretty JSON string, markdown bullet list). At most the first
    eight detected clauses are sent to the model.
    """
    clauses = split_into_clauses(text)
    excerpt = "\n\n".join(clauses[:8]) if clauses else text[:4000]
    ask = (
        "Analyze the clauses and return JSON: {hotspots: [ {clause_excerpt, risk_level: 'Low'|'Medium'|'High', why, sample_dispute_scenario} ]}.\n"
        f"Clauses:\n{excerpt}"
    )
    reply = llm_generate(
        "You identify clauses most likely to trigger disputes or litigation and provide sample dispute scenarios.",
        ask,
        max_new_tokens=900,
    )
    try:
        radar = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Single medium-risk placeholder built from the first clause.
        radar = {
            "hotspots": [
                {
                    "clause_excerpt": (clauses[0][:280] if clauses else text[:280]),
                    "risk_level": "Medium",
                    "why": "Ambiguous obligations.",
                    "sample_dispute_scenario": "Party A alleges non-performance due to unclear milestones.",
                }
            ]
        }
    bullets = [
        f"- [{h.get('risk_level','Medium')}] {h.get('clause_excerpt','')}\n Why: {h.get('why','')}\n Scenario: {h.get('sample_dispute_scenario','')}"
        for h in radar.get("hotspots", [])
    ]
    return json.dumps(radar, indent=2), "\n".join(bullets)
413
+
414
+ # -----------------------------
415
+ # Glue: Input handling (upload or paste)
416
+ # -----------------------------
417
def get_text_from_inputs(file: Optional[gr.File], text: str) -> str:
    """Choose between uploaded-file text and pasted text.

    Whichever source yields more characters wins; ties go to the pasted
    text. The original's final ``return`` line carried stray non-standard
    leading whitespace (inconsistent indentation is a SyntaxError in
    Python) — normalized here without changing the selection logic.
    """
    file_text = load_document(file) if file else ""
    pasted = (text or "").strip()
    return file_text if len(file_text) > len(pasted) else pasted