Spaces:

deepthi6
/

clausewise_full_project_v2

Sleeping

App Files Community

deepthi6 committed on Nov 8, 2025

Commit

9c30488

verified ·

1 Parent(s): 3f3fd40

Update util.py

Browse files

Files changed (1) hide show

util.py +211 -86

util.py CHANGED Viewed

@@ -1,96 +1,221 @@
-import re
-from pypdf import PdfReader
-import docx
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# -------------------------------------------------------
-# ✅ Extract text from uploaded files
-# -------------------------------------------------------
-def extract_text_from_file(uploaded):
-    name = uploaded.name.lower()
-    if name.endswith(".pdf"):
-        reader = PdfReader(uploaded)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
-        return text
-    elif name.endswith(".txt"):
-        return uploaded.read().decode("utf-8")
-    elif name.endswith(".docx"):
-        doc = docx.Document(uploaded)
-        return "\n".join([p.text for p in doc.paragraphs])
-    return ""
-# -------------------------------------------------------
-# ✅ Split into clauses
-# -------------------------------------------------------
-def split_into_clauses(text):
-    return [c.strip() for c in re.split(r"\n+|\.\s+", text) if len(c.strip()) > 20]
-# -------------------------------------------------------
-# ✅ Clause Simplifier
-# -------------------------------------------------------
-def simplify_clause(clause, lang, mode):
-    if "Explain Like" in mode:
-        return f"This clause basically means: {clause[:120]}..."
-    if "Professional" in mode:
-        return f"A more formal interpretation: {clause}"
-    return f"Simply put: {clause[:160]}..."
-# -------------------------------------------------------
-# ✅ Risk Analyzer
-# -------------------------------------------------------
-def get_risks(text):
-    return [
-        "Broad confidentiality scope",
-        "Long-term obligations",
-        "Unilateral termination rights",
-        "Strong liability clause",
-        "Missing dispute resolution"
-    ]
-# -------------------------------------------------------
-# ✅ Fairness Meter
-# -------------------------------------------------------
-def get_fairness_score(text):
-    return 40, 80
-# -------------------------------------------------------
-# ✅ Entities
-# -------------------------------------------------------
-def extract_entities(text):
-    names = re.findall(r"[A-Z][a-z]+ [A-Z][a-z]+", text)
-    dates = re.findall(r"\b\d{1,2} \w+ \d{4}\b", text)
-    return {"names": names[:5], "dates": dates[:5]}
-# -------------------------------------------------------
-# ✅ Alternative Clause Suggestions
-# -------------------------------------------------------
-def suggest_alternatives(text):
-    return [
-        "Consider adding a mutual confidentiality clause.",
-        "Limit NDA duration to 2–3 years.",
-        "Specify permitted disclosures clearly.",
-    ]
-# -------------------------------------------------------
-# ✅ Load Local Chat Model
-# -------------------------------------------------------
 def load_chat_model():
     model_name = "distilgpt2"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
     return model, tokenizer
-# -------------------------------------------------------
-# ✅ Chat with Model
-# -------------------------------------------------------
-def chat_with_model(model, tokenizer, user_text, history):
-    text = " ".join([f"{u}:{m}" for u, m in history]) + user_text
-    inputs = tokenizer.encode(text, return_tensors="pt")
-    outputs = model.generate(inputs, max_length=200)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)

+import streamlit as st
+import os
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from multilingual import UI_TEXT, translate_text
+from util import extract_text, split_into_clauses, simplify_clause, chat_with_model
+# ---------------------------------------------------
+# ✅ PAGE CONFIG
+# ---------------------------------------------------
+st.set_page_config(
+    page_title="ClauseWise – NDA Assistant",
+    layout="wide"
+)
+st.markdown(
+    "<h2 style='text-align:center;'>ClauseWise – Multilingual NDA Legal Assistant</h2>",
+    unsafe_allow_html=True
+)
+# ---------------------------------------------------
+# ✅ LANGUAGE HANDLING
+# ---------------------------------------------------
+LANGUAGES = {
+    "English": "en",
+    "हिन्दी (Hindi)": "hi",
+    "தமிழ் (Tamil)": "ta",
+    "తెలుగు (Telugu)": "te",
+    "ಕನ್ನಡ (Kannada)": "kn"
+}
+if "lang" not in st.session_state:
+    st.session_state.lang = "en"
+selected_label = st.selectbox("🌐 Language", list(LANGUAGES.keys()))
+st.session_state.lang = LANGUAGES[selected_label]
+T = {k: v[st.session_state.lang] for k, v in UI_TEXT.items()}
+# ---------------------------------------------------
+# ✅ LOAD CHAT MODEL (DistilGPT2 – HF SAFE)
+# ---------------------------------------------------
+@st.cache_resource
 def load_chat_model():
     model_name = "distilgpt2"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+    model.config.pad_token_id = tokenizer.eos_token_id
     return model, tokenizer
+model, tokenizer = load_chat_model()
+# Chat history
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+# ---------------------------------------------------
+# ✅ FILE UPLOAD
+# ---------------------------------------------------
+st.subheader(T["upload_title"])
+uploaded = st.file_uploader(T["upload_instruction"], type=["pdf", "txt", "docx"])
+if uploaded:
+    st.info("⏳ Reading file...")
+    text = extract_text(uploaded)
+    # ---------------------------------------------------
+    # ✅ STRICT NDA DETECTION
+    # ---------------------------------------------------
+    NDA_KEYWORDS = [
+        "non-disclosure", "non disclosure", "nda",
+        "confidential information", "disclosing party",
+        "receiving party", "confidentiality",
+        "confidential materials", "protected information"
+    ]
+    if len(text) < 50 or not any(k.lower() in text.lower() for k in NDA_KEYWORDS):
+        st.error(T["error_not_nda"])
+        st.stop()
+    st.success(T["success_nda"])
+    # ---------------------------------------------------
+    # ✅ ANALYSIS TABS
+    # ---------------------------------------------------
+    st.subheader(T["analysis_title"])
+    tabs = st.tabs([
+        T["tab_clauses"],
+        T["tab_risks"],
+        T["tab_fairness"],
+        T["tab_entities"],
+        T["tab_alternatives"],
+        T["tab_chat"],
+    ])
+    # ===================================================
+    # ✅ TAB 1 — CLAUSE SIMPLIFICATION
+    # ===================================================
+    with tabs[0]:
+        st.markdown(f"### {T['clause_simplify']}")
+        mode = st.radio(
+            T["choose_mode"],
+            [("eli5", T["eli5"]), ("simple", T["simple"]), ("pro", T["pro"])],
+            format_func=lambda x: x[1]
+        )[0]
+        clauses = split_into_clauses(text)
+        for i, c in enumerate(clauses):
+            with st.expander(f"Clause {i+1}"):
+                st.write("**Original:**")
+                st.write(c)
+                st.write("**Explanation:**")
+                st.write(simplify_clause(c, mode))
+    # ===================================================
+    # ✅ TAB 2 — RISK ANALYSIS
+    # ===================================================
+    with tabs[1]:
+        st.markdown(f"### {T['risk_title']}")
+        # Simple risk detector
+        RISK_PATTERNS = {
+            "Broad confidentiality definition": ["broad", "all information", "any information"],
+            "Unlimited liability": ["unlimited", "full liability", "all damages"],
+            "One-sided obligations": ["shall not", "only the receiving party"],
+            "Long duration (>5 years)": ["5 years", "7 years", "perpetual"],
+            "No termination rights": ["cannot terminate", "no termination"]
+        }
+        risks_found = []
+        for clause in clauses:
+            lower_c = clause.lower()
+            for risk_label, kws in RISK_PATTERNS.items():
+                if any(k in lower_c for k in kws):
+                    risks_found.append(risk_label)
+        risks_found = list(dict.fromkeys(risks_found))[:5]  # top 5
+        if not risks_found:
+            st.success("✅ No major risks detected.")
+        else:
+            for r in risks_found:
+                st.error("⚠️ " + r)
+    # ===================================================
+    # ✅ TAB 3 — FAIRNESS METER
+    # ===================================================
+    with tabs[2]:
+        st.markdown(f"### {T['fairness_title']}")
+        fairness_score = max(20, min(90, 50 - len(risks_found) * 7))
+        st.write(f"**{T['your_position']}:** {fairness_score}%")
+        st.write(f"**{T['company_position']}:** {100 - fairness_score}%")
+        st.progress(fairness_score / 100)
+    # ===================================================
+    # ✅ TAB 4 — ENTITIES
+    # ===================================================
+    with tabs[3]:
+        st.markdown(f"### {T['entities_title']}")
+        parties = []
+        dates = []
+        money = []
+        import re
+        for clause in clauses:
+            if "party" in clause.lower():
+                parties.append(clause[:80] + "...")
+            money.extend(re.findall(r"\$[\d,]+", clause))
+            dates.extend(re.findall(r"\b(?:\d{1,2}\/\d{1,2}\/\d{2,4}|20\d{2})\b", clause))
+        st.write("**Parties:**", list(set(parties)))
+        st.write("**Dates:**", list(set(dates)))
+        st.write("**Amounts:**", list(set(money)))
+    # ===================================================
+    # ✅ TAB 5 — ALTERNATIVE CLAUSES
+    # ===================================================
+    with tabs[4]:
+        st.markdown(f"### {T['alt_title']}")
+        ALTS = [
+            "A mutual confidentiality clause where both parties share equal protection.",
+            "A time-limited confidentiality period of 2–3 years.",
+            "Liability capped at a fixed reasonable amount."
+        ]
+        for alt in ALTS:
+            st.info(alt)
+    # ===================================================
+    # ✅ TAB 6 — LEGAL CHAT ASSISTANT
+    # ===================================================
+    with tabs[5]:
+        st.markdown(f"### {T['chat_title']}")
+        user_input = st.text_input(T["chat_placeholder"])
+        if user_input:
+            reply = chat_with_model(model, tokenizer, user_input, st.session_state.chat_history)
+            st.session_state.chat_history.append(("User", user_input))
+            st.session_state.chat_history.append(("AI", reply))
+        for role, msg in st.session_state.chat_history[-10:]:
+            if role == "User":
+                st.markdown(f"🧑 **You:** {msg}")
+            else:
+                st.markdown(f"🤖 **ClauseWise:** {msg}")