Spaces:

JAYASREESS
/

final_year

Running

App Files Files Community

jayasrees committed on Apr 7

Commit

59a54c2

1 Parent(s): ccecc05

Change model to TinyLlama from Hub and remove local-only loading

Browse files

Files changed (2) hide show

analysis/llama_legal_verifier.py +6 -6
app.py +107 -35

analysis/llama_legal_verifier.py CHANGED Viewed

@@ -13,17 +13,13 @@ class LlamaLegalVerifier:
     """
     def __init__(self, model_path: str):
-        if not os.path.isdir(model_path):
-            raise FileNotFoundError(f"Model path not found: {model_path}")
         self.model_path = model_path
         self.device = 0 if torch.cuda.is_available() else -1
         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-        tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
-            local_files_only=True,
             torch_dtype=dtype,
         )
         if tokenizer.pad_token_id is None:
@@ -41,7 +37,11 @@ class LlamaLegalVerifier:
         lowered = text.lower()
         if "contradiction" in lowered:
             return "Contradiction"
-        if "entailment" in lowered or "duplicate" in lowered or "same meaning" in lowered:
             return "Entailment"
         if "neutral" in lowered:
             return "Neutral"

     """
     def __init__(self, model_path: str):
         self.model_path = model_path
         self.device = 0 if torch.cuda.is_available() else -1
         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=dtype,
         )
         if tokenizer.pad_token_id is None:
         lowered = text.lower()
         if "contradiction" in lowered:
             return "Contradiction"
+        if (
+            "entailment" in lowered
+            or "duplicate" in lowered
+            or "same meaning" in lowered
+        ):
             return "Entailment"
         if "neutral" in lowered:
             return "Neutral"

app.py CHANGED Viewed

@@ -3,9 +3,6 @@ import sys
 from pathlib import Path
 import importlib
 import json
 import base64
@@ -14,9 +11,10 @@ import re
 import pandas as pd
 import plotly.express as px
 import streamlit as st
 sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
-#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from preprocessing.text_extractor import extract_text_from_file
 from preprocessing.clause_extraction import extract_clauses
@@ -25,6 +23,7 @@ from storage.faiss_index import create_faiss_index
 from analysis.similarity_search import get_similar
 import analysis.common_analyzer
 importlib.reload(analysis.common_analyzer)
 from analysis.common_analyzer import analyze_pair
@@ -35,7 +34,7 @@ from auth.user_store import authenticate_user, create_user
 APP_TITLE = "Legal Semantic Integrity"
-DEFAULT_MODEL_PATH = "merged_tinyllama_instruction"
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
@@ -73,7 +72,9 @@ def _extract_party_name(text: str, role: str) -> str:
         if m:
             name = " ".join(m.group(1).split())
             # Filter generic captures like "hereinafter called"
-            if name and not re.search(r"hereinafter|called|referred|party|agreement", name, re.IGNORECASE):
                 return name[:80]
     if re.search(rf"\b{role_l}\b", t, flags=re.IGNORECASE):
@@ -121,7 +122,9 @@ def _extract_document_parties(text_data):
                 parties[role] = cleaned
                 break
         # Secondary fallback: explicit role in text without name
-        if parties[role] == "Not found" and re.search(rf"\b{role.lower()}\b", compact, flags=re.IGNORECASE):
             parties[role] = f"{role} mentioned (name not parsed)"
     return parties
@@ -137,9 +140,15 @@ def _extract_parties(text1: str, text2: str, doc_parties=None):
         vendee = _extract_party_name(text2, "vendee")
     if doc_parties:
-        if vendor in ["Not found", "Vendor mentioned (name not parsed)"] and doc_parties.get("Vendor"):
             vendor = doc_parties.get("Vendor")
-        if vendee in ["Not found", "Vendee mentioned (name not parsed)"] and doc_parties.get("Vendee"):
             vendee = doc_parties.get("Vendee")
     return vendor, vendee
@@ -299,7 +308,9 @@ def login_page():
         )
     with col_auth:
-        st.markdown('<div class="step">Step 1 of 3: Login</div>', unsafe_allow_html=True)
         tab_login, tab_signup = st.tabs(["Sign In", "Create Account"])
         with tab_login:
@@ -338,7 +349,9 @@ def login_page():
     st.caption("Local accounts are saved in data/users.db")
-def run_analysis(uploaded_file, sensitivity: float, backend: str, llama_model_path: str):
     file_ext = uploaded_file.name.split(".")[-1].lower()
     with st.spinner("Extracting text..."):
@@ -412,9 +425,13 @@ def run_analysis(uploaded_file, sensitivity: float, backend: str, llama_model_pa
             result["Vendee"] = vendee_name
             if backend == "llama":
-                _, llm_conf, llm_label, llm_reason = verifier.predict(result["Clause 1"], result["Clause 2"])
             else:
-                _, llm_conf, llm_label = verifier.predict(result["Clause 1"], result["Clause 2"])
                 llm_reason = f"NLI label: {llm_label}"
             if llm_label == "Neutral":
@@ -483,7 +500,9 @@ def upload_page():
         """,
         unsafe_allow_html=True,
     )
-    st.markdown('<div class="step">Step 2 of 3: Upload Document</div>', unsafe_allow_html=True)
     with st.sidebar:
         st.header("Scan Settings")
@@ -514,7 +533,7 @@ def upload_page():
             f"""
             <div class="mini-card">
                 <div class="mini-label">Active Mode</div>
-                <div class="mini-value">{scan_mode.split('(')[0].strip()}</div>
                 <div class="mono">Sensitivity: {sensitivity} | Backend: {model_backend}</div>
             </div>
             """,
@@ -578,7 +597,9 @@ def dashboard_page():
         """,
         unsafe_allow_html=True,
     )
-    st.markdown('<div class="step">Step 3 of 3: Dashboard</div>', unsafe_allow_html=True)
     results = st.session_state.results
     line_issues = st.session_state.line_issues
@@ -653,10 +674,16 @@ def dashboard_page():
                 st.caption(f"Single issue page: {page_min}")
                 page_sel = (page_min, page_max)
             else:
-                page_sel = st.slider("Page Range (analytics)", page_min, page_max, (page_min, page_max))
         with filter_col3:
-            vendors = ["All"] + sorted(line_df["Vendor"].dropna().astype(str).unique().tolist())
-            vendees = ["All"] + sorted(line_df["Vendee"].dropna().astype(str).unique().tolist())
             vendor_sel = st.selectbox("Vendor", vendors, index=0)
             vendee_sel = st.selectbox("Vendee", vendees, index=0)
@@ -664,7 +691,9 @@ def dashboard_page():
         if issue_sel:
             filtered = filtered[filtered["Issue Type"].isin(issue_sel)]
         filtered = filtered[filtered["Confidence"] >= conf_min]
-        filtered = filtered[(filtered["Page"] >= page_sel[0]) & (filtered["Page"] <= page_sel[1])]
         if vendor_sel != "All":
             filtered = filtered[filtered["Vendor"] == vendor_sel]
         if vendee_sel != "All":
@@ -672,9 +701,13 @@ def dashboard_page():
         total_issues = len(filtered)
         conflict_rate = (len(issues_df) / len(df) * 100.0) if len(df) else 0.0
-        top_issue = filtered["Issue Type"].mode().iloc[0] if not filtered.empty else "N/A"
         highest_risk_page = (
-            int(filtered.groupby("Page")["Confidence"].mean().idxmax()) if not filtered.empty else "N/A"
         )
         k1, k2, k3, k4 = st.columns(4)
         k1.metric("Filtered Issues", total_issues)
@@ -697,10 +730,23 @@ def dashboard_page():
             pie_fig.update_layout(margin=dict(l=10, r=10, t=50, b=10))
             st.plotly_chart(pie_fig, use_container_width=True)
-            top_lines = filtered.sort_values(by=["Confidence"], ascending=False).head(10)
             st.markdown("**Top 10 High-Risk Lines**")
             st.dataframe(
-                top_lines[["Issue Type", "Confidence", "Page", "Line", "Vendor", "Vendee", "Snippet", "Reason"]],
                 use_container_width=True,
             )
     else:
@@ -757,19 +803,35 @@ def dashboard_page():
                 st.caption(f"Only one page with issues: Page {page_min}")
                 page_range = (page_min, page_max)
             else:
-                page_range = st.slider("Page range", page_min, page_max, (page_min, page_max))
             if selected:
                 line_df = line_df[line_df["Issue Type"].isin(selected)]
-            line_df = line_df[(line_df["Page"] >= page_range[0]) & (line_df["Page"] <= page_range[1])]
             st.dataframe(line_df, use_container_width=True)
             st.markdown("**Issue Occurrence By Line With Parties**")
             by_line = line_df.copy()
-            by_line = by_line.sort_values(by=["Page", "Line", "Confidence"], ascending=[True, True, False])
             st.dataframe(
-                by_line[["Issue Type", "Page", "Line", "Vendor", "Vendee", "Confidence", "Reason"]],
                 use_container_width=True,
             )
@@ -778,10 +840,14 @@ def dashboard_page():
                 line_df = line_df.reset_index(drop=True)
                 line_df.insert(0, "Item", range(1, len(line_df) + 1))
                 line_df["Jump"] = line_df.apply(
-                    lambda r: f"#{r['Item']} | Pg {int(r['Page'])}, Ln {int(r['Line'])} | {r['Issue Type']}",
                     axis=1,
                 )
-                selected_jump = st.selectbox("Select issue line", line_df["Jump"].tolist())
                 chosen = line_df[line_df["Jump"] == selected_jump].iloc[0]
                 c1, c2 = st.columns([1.1, 1], gap="large")
@@ -790,8 +856,8 @@ def dashboard_page():
                         f"""
                         <div class="mini-card">
                             <div class="mini-label">Selected Line</div>
-                            <div class="mini-value">Pg {int(chosen['Page'])} · Ln {int(chosen['Line'])}</div>
-                            <div class="mono">{chosen['Issue Type']} | Confidence: {float(chosen['Confidence']):.2f}</div>
                         </div>
                         """,
                         unsafe_allow_html=True,
@@ -806,7 +872,9 @@ def dashboard_page():
                     if is_pdf and st.session_state.uploaded_bytes:
                         st.caption("PDF Preview (jumped to selected page)")
                         page_number = int(chosen["Page"])
-                        pdf_b64 = base64.b64encode(st.session_state.uploaded_bytes).decode("utf-8")
                         pdf_html = f"""
                         <iframe
                             src="data:application/pdf;base64,{pdf_b64}#page={page_number}&zoom=110"
@@ -817,7 +885,9 @@ def dashboard_page():
                         """
                         st.markdown(pdf_html, unsafe_allow_html=True)
                     else:
-                        st.info("Inline PDF preview is available for PDF uploads. Current file is not PDF.")
         else:
             st.info("No line-level issues to display.")
@@ -830,7 +900,9 @@ def dashboard_page():
             file_name="semantic_integrity_report.json",
             mime="application/json",
         )
-        pdf_bytes = generate_pdf_report([r for r in results if r["Label"] != "NO_CONFLICT"])
         st.download_button(
             label="Download PDF Report",
             data=pdf_bytes,

 from pathlib import Path
 import importlib
 import json
 import base64
 import pandas as pd
 import plotly.express as px
 import streamlit as st
 sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from preprocessing.text_extractor import extract_text_from_file
 from preprocessing.clause_extraction import extract_clauses
 from analysis.similarity_search import get_similar
 import analysis.common_analyzer
 importlib.reload(analysis.common_analyzer)
 from analysis.common_analyzer import analyze_pair
 APP_TITLE = "Legal Semantic Integrity"
+DEFAULT_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
         if m:
             name = " ".join(m.group(1).split())
             # Filter generic captures like "hereinafter called"
+            if name and not re.search(
+                r"hereinafter|called|referred|party|agreement", name, re.IGNORECASE
+            ):
                 return name[:80]
     if re.search(rf"\b{role_l}\b", t, flags=re.IGNORECASE):
                 parties[role] = cleaned
                 break
         # Secondary fallback: explicit role in text without name
+        if parties[role] == "Not found" and re.search(
+            rf"\b{role.lower()}\b", compact, flags=re.IGNORECASE
+        ):
             parties[role] = f"{role} mentioned (name not parsed)"
     return parties
         vendee = _extract_party_name(text2, "vendee")
     if doc_parties:
+        if vendor in [
+            "Not found",
+            "Vendor mentioned (name not parsed)",
+        ] and doc_parties.get("Vendor"):
             vendor = doc_parties.get("Vendor")
+        if vendee in [
+            "Not found",
+            "Vendee mentioned (name not parsed)",
+        ] and doc_parties.get("Vendee"):
             vendee = doc_parties.get("Vendee")
     return vendor, vendee
         )
     with col_auth:
+        st.markdown(
+            '<div class="step">Step 1 of 3: Login</div>', unsafe_allow_html=True
+        )
         tab_login, tab_signup = st.tabs(["Sign In", "Create Account"])
         with tab_login:
     st.caption("Local accounts are saved in data/users.db")
+def run_analysis(
+    uploaded_file, sensitivity: float, backend: str, llama_model_path: str
+):
     file_ext = uploaded_file.name.split(".")[-1].lower()
     with st.spinner("Extracting text..."):
             result["Vendee"] = vendee_name
             if backend == "llama":
+                _, llm_conf, llm_label, llm_reason = verifier.predict(
+                    result["Clause 1"], result["Clause 2"]
+                )
             else:
+                _, llm_conf, llm_label = verifier.predict(
+                    result["Clause 1"], result["Clause 2"]
+                )
                 llm_reason = f"NLI label: {llm_label}"
             if llm_label == "Neutral":
         """,
         unsafe_allow_html=True,
     )
+    st.markdown(
+        '<div class="step">Step 2 of 3: Upload Document</div>', unsafe_allow_html=True
+    )
     with st.sidebar:
         st.header("Scan Settings")
             f"""
             <div class="mini-card">
                 <div class="mini-label">Active Mode</div>
+                <div class="mini-value">{scan_mode.split("(")[0].strip()}</div>
                 <div class="mono">Sensitivity: {sensitivity} | Backend: {model_backend}</div>
             </div>
             """,
         """,
         unsafe_allow_html=True,
     )
+    st.markdown(
+        '<div class="step">Step 3 of 3: Dashboard</div>', unsafe_allow_html=True
+    )
     results = st.session_state.results
     line_issues = st.session_state.line_issues
                 st.caption(f"Single issue page: {page_min}")
                 page_sel = (page_min, page_max)
             else:
+                page_sel = st.slider(
+                    "Page Range (analytics)", page_min, page_max, (page_min, page_max)
+                )
         with filter_col3:
+            vendors = ["All"] + sorted(
+                line_df["Vendor"].dropna().astype(str).unique().tolist()
+            )
+            vendees = ["All"] + sorted(
+                line_df["Vendee"].dropna().astype(str).unique().tolist()
+            )
             vendor_sel = st.selectbox("Vendor", vendors, index=0)
             vendee_sel = st.selectbox("Vendee", vendees, index=0)
         if issue_sel:
             filtered = filtered[filtered["Issue Type"].isin(issue_sel)]
         filtered = filtered[filtered["Confidence"] >= conf_min]
+        filtered = filtered[
+            (filtered["Page"] >= page_sel[0]) & (filtered["Page"] <= page_sel[1])
+        ]
         if vendor_sel != "All":
             filtered = filtered[filtered["Vendor"] == vendor_sel]
         if vendee_sel != "All":
         total_issues = len(filtered)
         conflict_rate = (len(issues_df) / len(df) * 100.0) if len(df) else 0.0
+        top_issue = (
+            filtered["Issue Type"].mode().iloc[0] if not filtered.empty else "N/A"
+        )
         highest_risk_page = (
+            int(filtered.groupby("Page")["Confidence"].mean().idxmax())
+            if not filtered.empty
+            else "N/A"
         )
         k1, k2, k3, k4 = st.columns(4)
         k1.metric("Filtered Issues", total_issues)
             pie_fig.update_layout(margin=dict(l=10, r=10, t=50, b=10))
             st.plotly_chart(pie_fig, use_container_width=True)
+            top_lines = filtered.sort_values(by=["Confidence"], ascending=False).head(
+                10
+            )
             st.markdown("**Top 10 High-Risk Lines**")
             st.dataframe(
+                top_lines[
+                    [
+                        "Issue Type",
+                        "Confidence",
+                        "Page",
+                        "Line",
+                        "Vendor",
+                        "Vendee",
+                        "Snippet",
+                        "Reason",
+                    ]
+                ],
                 use_container_width=True,
             )
     else:
                 st.caption(f"Only one page with issues: Page {page_min}")
                 page_range = (page_min, page_max)
             else:
+                page_range = st.slider(
+                    "Page range", page_min, page_max, (page_min, page_max)
+                )
             if selected:
                 line_df = line_df[line_df["Issue Type"].isin(selected)]
+            line_df = line_df[
+                (line_df["Page"] >= page_range[0]) & (line_df["Page"] <= page_range[1])
+            ]
             st.dataframe(line_df, use_container_width=True)
             st.markdown("**Issue Occurrence By Line With Parties**")
             by_line = line_df.copy()
+            by_line = by_line.sort_values(
+                by=["Page", "Line", "Confidence"], ascending=[True, True, False]
+            )
             st.dataframe(
+                by_line[
+                    [
+                        "Issue Type",
+                        "Page",
+                        "Line",
+                        "Vendor",
+                        "Vendee",
+                        "Confidence",
+                        "Reason",
+                    ]
+                ],
                 use_container_width=True,
             )
                 line_df = line_df.reset_index(drop=True)
                 line_df.insert(0, "Item", range(1, len(line_df) + 1))
                 line_df["Jump"] = line_df.apply(
+                    lambda r: (
+                        f"#{r['Item']} | Pg {int(r['Page'])}, Ln {int(r['Line'])} | {r['Issue Type']}"
+                    ),
                     axis=1,
                 )
+                selected_jump = st.selectbox(
+                    "Select issue line", line_df["Jump"].tolist()
+                )
                 chosen = line_df[line_df["Jump"] == selected_jump].iloc[0]
                 c1, c2 = st.columns([1.1, 1], gap="large")
                         f"""
                         <div class="mini-card">
                             <div class="mini-label">Selected Line</div>
+                            <div class="mini-value">Pg {int(chosen["Page"])} · Ln {int(chosen["Line"])}</div>
+                            <div class="mono">{chosen["Issue Type"]} | Confidence: {float(chosen["Confidence"]):.2f}</div>
                         </div>
                         """,
                         unsafe_allow_html=True,
                     if is_pdf and st.session_state.uploaded_bytes:
                         st.caption("PDF Preview (jumped to selected page)")
                         page_number = int(chosen["Page"])
+                        pdf_b64 = base64.b64encode(
+                            st.session_state.uploaded_bytes
+                        ).decode("utf-8")
                         pdf_html = f"""
                         <iframe
                             src="data:application/pdf;base64,{pdf_b64}#page={page_number}&zoom=110"
                         """
                         st.markdown(pdf_html, unsafe_allow_html=True)
                     else:
+                        st.info(
+                            "Inline PDF preview is available for PDF uploads. Current file is not PDF."
+                        )
         else:
             st.info("No line-level issues to display.")
             file_name="semantic_integrity_report.json",
             mime="application/json",
         )
+        pdf_bytes = generate_pdf_report(
+            [r for r in results if r["Label"] != "NO_CONFLICT"]
+        )
         st.download_button(
             label="Download PDF Report",
             data=pdf_bytes,