solfedge committed on
Commit
1f8cd6e
·
verified ·
1 Parent(s): b13c645

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +140 -0
  2. llm_reviewer.py +132 -0
  3. parser.py +38 -0
  4. spacy_matcher.py +77 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import os
4
+ import traceback
5
+ import time
6
+
7
+
8
# Working directories: uploads land in data/, reports in output/, model weights in models/.
OUTPUT_DIR = "output"
for _dir in ("data", OUTPUT_DIR, "models"):
    os.makedirs(_dir, exist_ok=True)
12
+
13
def process_contract(file):
    """Analyze an uploaded contract and stream progress to the Gradio UI.

    Generator used as a streaming callback.  Yields 4-tuples of
    (status message, markdown review text, JSON report path, PDF report path);
    intermediate yields carry ``None`` placeholders for not-yet-ready outputs.

    Args:
        file: Gradio file object exposing a ``.name`` path, or ``None``.
    """
    try:
        # Validate the upload BEFORE clearing old reports, so clicking with no
        # (or an unsupported) file does not wipe previously generated output.
        if file is None:
            yield " No file uploaded.", None, None, None
            return

        file_path = file.name
        ext = os.path.splitext(file_path)[1].lower()

        if ext not in [".pdf", ".docx"]:
            yield f" Unsupported format: {ext}", None, None, None
            return

        # Clear previous outputs now that we know real work is starting.
        for f in os.listdir(OUTPUT_DIR):
            path = os.path.join(OUTPUT_DIR, f)
            if os.path.isfile(path):  # guard against stray subdirectories
                os.remove(path)

        yield " Extracting text...", None, None, None
        time.sleep(0.1)

        # Imports deferred so the UI starts fast; heavy deps load lazily.
        from parser import extract_text_from_pdf, extract_text_from_docx
        text = extract_text_from_pdf(file_path) if ext == ".pdf" else extract_text_from_docx(file_path)

        if not text or len(text.strip()) < 10:
            yield "⚠ Failed to extract meaningful text.", None, None, None
            return

        yield " Finding clauses...", None, None, None
        time.sleep(0.1)

        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)

        from spacy_matcher import find_clauses
        matches = find_clauses(text)
        if not matches:
            yield " No clauses detected.", None, None, None
            return

        yield f" Analyzing {len(matches)} clauses with LLM...", None, None, None
        time.sleep(0.1)

        # Analyzing each matched clause with the LLM reviewer.
        from llm_reviewer import review_clause_with_llm, get_clause_section
        results = []
        for label, _, start, end in matches:
            section = get_clause_section(doc, start, end, window_size=30)
            review = review_clause_with_llm(label, section)
            results.append({
                "label": label,
                "section": section,
                "review": review,
            })

        from llm_reviewer import export_to_json, export_to_pdf
        json_path = export_to_json(results, os.path.join(OUTPUT_DIR, "clause_reviews.json"))
        pdf_path = export_to_pdf(results, os.path.join(OUTPUT_DIR, "clause_reviews.pdf"))

        # Render a markdown summary of all reviews for the UI panel.
        output_text = "## Clause Reviews\n\n"
        for r in results:
            output_text += f" **{r['label'].replace('_', ' ').title()}**\n\n"
            output_text += f" *Excerpt:* {r['section'][:300]}...\n\n"
            output_text += f" *Review:* {r['review']}\n\n---\n\n"

        found_types = sorted(set(r['label'].replace('_', ' ').title() for r in results))
        clause_list = ", ".join(found_types)

        yield (
            f"Found {len(results)} clauses across {len(found_types)} types:\n\n{clause_list}",
            output_text,
            json_path,
            pdf_path
        )

    except Exception as e:
        # Surface the full traceback in the status box rather than crashing.
        tb = traceback.format_exc()
        error_msg = f" Error: {str(e)}\n\n```\n{tb}\n```"
        yield error_msg, None, None, None
100
+
101
+
102
# ---------------- Gradio interface ----------------
# NOTE: the Blocks object must stay named `demo` — HF Spaces looks for it.
with gr.Blocks(title="ClauseLens - Legal Contract Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ClauseLens: Legal Contract Analyzer")
    gr.Markdown("Upload a legal contract (PDF or DOCX) for clause detection and LLM-powered review.")

    with gr.Row():
        contract_file = gr.File(label="Upload Contract", file_types=[".pdf", ".docx"])

    with gr.Row():
        analyze_btn = gr.Button(" Analyze Contract", variant="primary")

    with gr.Row():
        status_box = gr.Textbox(label="Status")

    with gr.Row():
        review_md = gr.Markdown(label="Clause Reviews")

    with gr.Row():
        gr.Markdown("### 📎 Download Reports")

    with gr.Row():
        json_file = gr.File(label="Download JSON Report")
        pdf_file = gr.File(label="Download PDF Report")

    # Wire the button to the streaming generator defined above.
    analyze_btn.click(
        fn=process_contract,
        inputs=contract_file,
        outputs=[status_box, review_md, json_file, pdf_file],
    )

# Queuing is required so generator callbacks can stream partial results.
demo.queue()

if __name__ == "__main__":
    try:
        demo.launch(share=True)
    except Exception as e:
        print(f"Launch failed: {e}")
llm_reviewer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from huggingface_hub import hf_hub_download
3
+ from llama_cpp import Llama
4
+ import os
5
+
6
+
7
# --- Model configuration -------------------------------------------------
MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
CACHE_DIR = "models"
MODEL_PATH = os.path.join(CACHE_DIR, MODEL_FILE)

os.makedirs(CACHE_DIR, exist_ok=True)

# Fetch the quantized GGUF weights once; later runs reuse the local copy.
if os.path.exists(MODEL_PATH):
    print(f" Loaded cached model from {MODEL_PATH}")
else:
    print(" Downloading TinyLlama-1.1B-Chat (Q4_K_M) from Hugging Face...")
    MODEL_PATH = hf_hub_download(
        repo_id=MODEL_NAME,
        filename=MODEL_FILE,
        local_dir=CACHE_DIR,
    )

# CPU-only llama.cpp instance shared by every review call in this module.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=0,
    verbose=False,
)
34
+
35
+
36
+ # Clause Context Extraction
37
+
38
def get_clause_section(doc, start_token, end_token, window_size=30):
    """Return the text of ``doc`` surrounding a matched clause.

    The token span [start_token, end_token) is widened by ``window_size``
    tokens on each side, clamped to the document bounds.
    """
    lo = start_token - window_size
    if lo < 0:
        lo = 0
    hi = end_token + window_size
    doc_len = len(doc)
    if hi > doc_len:
        hi = doc_len
    return doc[lo:hi].text
45
+
46
+
47
+ # LLM Clause Review
48
+
49
def review_clause_with_llm(clause_type, clause_text):
    """Send a clause to TinyLlama for expert legal review.

    Args:
        clause_type: matcher label such as ``"NON_COMPETE"``.
        clause_text: clause excerpt; truncated to 800 chars to fit the context.

    Returns:
        The model's structured feedback string, or an error string on failure.
    """
    # Truncate long clauses OUTSIDE the template — the original embedded a
    # '# Truncate long clauses' comment inside the f-string, which was sent
    # to the model verbatim as part of the prompt.
    excerpt = clause_text[:800]
    prompt = f"""
You are a senior legal expert reviewing a contract clause.

Evaluate:
- Is this clause fair, balanced, and standard?
- Does it overly favor one party?
- Are critical terms missing or ambiguous?

Respond in this format:
- Risk Level: [Low/Medium/High]
- Feedback: Brief professional analysis
- Suggestions: Bullet points for improvement

CLAUSE TYPE: {clause_type.replace('_', ' ').title()}
CLAUSE TEXT: {excerpt}
""".strip()

    try:
        output = llm(
            prompt,
            max_tokens=512,
            temperature=0.3,
            stop=["\n\n", "User:", "###"]
        )
        return output['choices'][0]['text'].strip()
    except Exception as e:
        # Best-effort: report the failure inline instead of crashing the run.
        return f" LLM Review Error: {str(e)}"
81
+
82
+
83
+ # Exporting Results
84
+
85
def export_to_json(matched_clauses, filename="clause_reviews.json"):
    """Write clause reviews to ``filename`` as pretty-printed UTF-8 JSON.

    Args:
        matched_clauses: list of {"label", "section", "review"} dicts.
        filename: destination path.

    Returns:
        The path that was written.
    """
    import json
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(matched_clauses, f, indent=2, ensure_ascii=False)
    # Fixed: the message previously printed a broken "(unknown)" placeholder
    # instead of the actual destination path.
    print(f" JSON report saved to {filename}")
    return filename
94
+
95
def export_to_pdf(matched_clauses, filename="clause_reviews.pdf"):
    """Write clause reviews to ``filename`` as a simple PDF report.

    Args:
        matched_clauses: list of {"label", "section", "review"} dicts.
        filename: destination path.

    Returns:
        The path that was written.
    """
    from fpdf import FPDF

    def _latin1(text):
        # FPDF's built-in Arial font only covers Latin-1; replace anything
        # else so exotic characters degrade gracefully instead of raising
        # UnicodeEncodeError mid-report.
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 10, "ClauseLens - Legal Contract Review Report", align='C', ln=True)
    pdf.ln(10)

    for clause in matched_clauses:
        # Clause label heading
        pdf.set_font("Arial", 'B', 12)
        label = clause.get('label', 'Unknown').replace('_', ' ').title()
        pdf.cell(0, 8, txt=_latin1(f"Clause: {label}"), ln=True)
        pdf.ln(2)

        # Section excerpt
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="Excerpt:", ln=True)
        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 6, txt=_latin1(clause.get('section', 'N/A')))
        pdf.ln(4)

        # LLM review
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="LLM Review:", ln=True)
        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 6, txt=_latin1(clause.get('review', 'No review available')))
        pdf.ln(8)

    pdf.output(filename)
    # Fixed: the message previously printed a broken "(unknown)" placeholder
    # instead of the actual destination path.
    print(f" PDF report saved to {filename}")
    return filename
parser.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import fitz
3
+ from docx import Document
4
+ import os
5
+
6
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyMuPDF.

    Returns:
        str: concatenated text of all pages, or "" on any failure.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # join() avoids quadratic += concatenation over many pages.
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()  # release the file handle even if extraction fails
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
17
+
18
def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file using python-docx.

    Returns:
        str: paragraph texts joined by newlines, or "" on any failure.
    """
    try:
        document = Document(docx_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        print(f"Error reading DOCX {docx_path}: {e}")
        return ""
26
+
27
def load_documents(folder="data"):
    """Load text from every supported document (.pdf/.docx) in ``folder``.

    Returns:
        list[str]: extracted text, one entry per recognized file (entries may
        be "" when extraction fails).
    """
    texts = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # Compare extensions case-insensitively so ".PDF"/".DOCX" are not
        # silently skipped (the original used case-sensitive endswith()).
        ext = os.path.splitext(file)[1].lower()
        if ext == ".pdf":
            texts.append(extract_text_from_pdf(path))
        elif ext == ".docx":
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {file}")
    return texts
spacy_matcher.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import spacy
3
+ from spacy.matcher import Matcher
4
+
5
+
6
# Shared spaCy pipeline and token matcher used by find_clauses().
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Token patterns per clause category; each label has several phrasing variants.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non-disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        [{"LOWER": "non-compete"}],
        [{"LOWER": "non"}, {"LOWER": "compete"}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [{"LOWER": "due"}, {"LOWER": "within"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["days", "weeks"]}}],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ],
}

# Register every phrasing variant under its clause label.
for clause_label, variants in clause_patterns.items():
    for variant in variants:
        matcher.add(clause_label, [variant])

print("Clause matcher loaded with extended patterns.")
63
+
64
def find_clauses(text, window_size=30):
    """Run the clause matcher over ``text`` and attach surrounding context.

    Returns:
        list of (label, context, start, end) tuples, where context is the
        matched span widened by ``window_size`` tokens on each side and
        start/end are token offsets of the raw match.
    """
    doc = nlp(text)
    doc_len = len(doc)
    hits = []
    for match_id, begin, finish in matcher(doc):
        clause_label = nlp.vocab.strings[match_id]
        # Widen the match into a context window, clamped to the doc bounds.
        left = max(0, begin - window_size)
        right = min(doc_len, finish + window_size)
        hits.append((clause_label, doc[left:right].text, begin, finish))
    return hits