solfedge committed on
Commit
1f8cd6e
·
verified ·
1 Parent(s): b13c645

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +140 -0
  2. llm_reviewer.py +132 -0
  3. parser.py +38 -0
  4. spacy_matcher.py +77 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import os
4
+ import traceback
5
+ import time
6
+
7
+
8
# Working directories: uploads land in data/, reports in output/, model weights in models/.
OUTPUT_DIR = "output"
for _dir in ("data", OUTPUT_DIR, "models"):
    os.makedirs(_dir, exist_ok=True)
12
+
13
def process_contract(file):
    """Analyze an uploaded contract and stream progress to the Gradio UI.

    Generator used as a streaming callback.  Yields 4-tuples of
    (status message, markdown review text, JSON report path, PDF report path);
    intermediate yields carry ``None`` placeholders for not-yet-ready outputs.

    Args:
        file: Gradio file object exposing a ``.name`` path, or ``None``.
    """
    try:
        # Validate the upload BEFORE clearing old reports, so clicking with no
        # (or an unsupported) file does not wipe previously generated output.
        if file is None:
            yield " No file uploaded.", None, None, None
            return

        file_path = file.name
        ext = os.path.splitext(file_path)[1].lower()

        if ext not in [".pdf", ".docx"]:
            yield f" Unsupported format: {ext}", None, None, None
            return

        # Clear previous outputs now that we know real work is starting.
        for f in os.listdir(OUTPUT_DIR):
            path = os.path.join(OUTPUT_DIR, f)
            if os.path.isfile(path):  # guard against stray subdirectories
                os.remove(path)

        yield " Extracting text...", None, None, None
        time.sleep(0.1)

        # Imports deferred so the UI starts fast; heavy deps load lazily.
        from parser import extract_text_from_pdf, extract_text_from_docx
        text = extract_text_from_pdf(file_path) if ext == ".pdf" else extract_text_from_docx(file_path)

        if not text or len(text.strip()) < 10:
            yield "⚠ Failed to extract meaningful text.", None, None, None
            return

        yield " Finding clauses...", None, None, None
        time.sleep(0.1)

        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)

        from spacy_matcher import find_clauses
        matches = find_clauses(text)
        if not matches:
            yield " No clauses detected.", None, None, None
            return

        yield f" Analyzing {len(matches)} clauses with LLM...", None, None, None
        time.sleep(0.1)

        # Analyzing each matched clause with the LLM reviewer.
        from llm_reviewer import review_clause_with_llm, get_clause_section
        results = []
        for label, _, start, end in matches:
            section = get_clause_section(doc, start, end, window_size=30)
            review = review_clause_with_llm(label, section)
            results.append({
                "label": label,
                "section": section,
                "review": review,
            })

        from llm_reviewer import export_to_json, export_to_pdf
        json_path = export_to_json(results, os.path.join(OUTPUT_DIR, "clause_reviews.json"))
        pdf_path = export_to_pdf(results, os.path.join(OUTPUT_DIR, "clause_reviews.pdf"))

        # Render a markdown summary of all reviews for the UI panel.
        output_text = "## Clause Reviews\n\n"
        for r in results:
            output_text += f" **{r['label'].replace('_', ' ').title()}**\n\n"
            output_text += f" *Excerpt:* {r['section'][:300]}...\n\n"
            output_text += f" *Review:* {r['review']}\n\n---\n\n"

        found_types = sorted(set(r['label'].replace('_', ' ').title() for r in results))
        clause_list = ", ".join(found_types)

        yield (
            f"Found {len(results)} clauses across {len(found_types)} types:\n\n{clause_list}",
            output_text,
            json_path,
            pdf_path
        )

    except Exception as e:
        # Surface the full traceback in the status box rather than crashing.
        tb = traceback.format_exc()
        error_msg = f" Error: {str(e)}\n\n```\n{tb}\n```"
        yield error_msg, None, None, None
100
+
101
+
102
# ---------------- Gradio interface ----------------
# NOTE: the Blocks object must stay named `demo` — HF Spaces looks for it.
with gr.Blocks(title="ClauseLens - Legal Contract Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ClauseLens: Legal Contract Analyzer")
    gr.Markdown("Upload a legal contract (PDF or DOCX) for clause detection and LLM-powered review.")

    with gr.Row():
        contract_file = gr.File(label="Upload Contract", file_types=[".pdf", ".docx"])

    with gr.Row():
        analyze_btn = gr.Button(" Analyze Contract", variant="primary")

    with gr.Row():
        status_box = gr.Textbox(label="Status")

    with gr.Row():
        review_md = gr.Markdown(label="Clause Reviews")

    with gr.Row():
        gr.Markdown("### 📎 Download Reports")

    with gr.Row():
        json_file = gr.File(label="Download JSON Report")
        pdf_file = gr.File(label="Download PDF Report")

    # Wire the button to the streaming generator defined above.
    analyze_btn.click(
        fn=process_contract,
        inputs=contract_file,
        outputs=[status_box, review_md, json_file, pdf_file],
    )

# Queuing is required so generator callbacks can stream partial results.
demo.queue()

if __name__ == "__main__":
    try:
        demo.launch(share=True)
    except Exception as e:
        print(f"Launch failed: {e}")
llm_reviewer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from huggingface_hub import hf_hub_download
3
+ from llama_cpp import Llama
4
+ import os
5
+
6
+
7
# --- Model configuration -------------------------------------------------
MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
CACHE_DIR = "models"
MODEL_PATH = os.path.join(CACHE_DIR, MODEL_FILE)

os.makedirs(CACHE_DIR, exist_ok=True)

# Fetch the quantized GGUF weights once; later runs reuse the local copy.
if os.path.exists(MODEL_PATH):
    print(f" Loaded cached model from {MODEL_PATH}")
else:
    print(" Downloading TinyLlama-1.1B-Chat (Q4_K_M) from Hugging Face...")
    MODEL_PATH = hf_hub_download(
        repo_id=MODEL_NAME,
        filename=MODEL_FILE,
        local_dir=CACHE_DIR,
    )

# CPU-only llama.cpp instance shared by every review call in this module.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=0,
    verbose=False,
)
34
+
35
+
36
+ # Clause Context Extraction
37
+
38
def get_clause_section(doc, start_token, end_token, window_size=30):
    """Return the text of ``doc`` surrounding a matched clause.

    The token span [start_token, end_token) is widened by ``window_size``
    tokens on each side, clamped to the document bounds.
    """
    lo = start_token - window_size
    if lo < 0:
        lo = 0
    hi = end_token + window_size
    doc_len = len(doc)
    if hi > doc_len:
        hi = doc_len
    return doc[lo:hi].text
45
+
46
+
47
+ # LLM Clause Review
48
+
49
def review_clause_with_llm(clause_type, clause_text):
    """Send a clause to TinyLlama for expert legal review.

    Args:
        clause_type: matcher label such as ``"NON_COMPETE"``.
        clause_text: clause excerpt; truncated to 800 chars to fit the context.

    Returns:
        The model's structured feedback string, or an error string on failure.
    """
    # Truncate long clauses OUTSIDE the template — the original embedded a
    # '# Truncate long clauses' comment inside the f-string, which was sent
    # to the model verbatim as part of the prompt.
    excerpt = clause_text[:800]
    prompt = f"""
You are a senior legal expert reviewing a contract clause.

Evaluate:
- Is this clause fair, balanced, and standard?
- Does it overly favor one party?
- Are critical terms missing or ambiguous?

Respond in this format:
- Risk Level: [Low/Medium/High]
- Feedback: Brief professional analysis
- Suggestions: Bullet points for improvement

CLAUSE TYPE: {clause_type.replace('_', ' ').title()}
CLAUSE TEXT: {excerpt}
""".strip()

    try:
        output = llm(
            prompt,
            max_tokens=512,
            temperature=0.3,
            stop=["\n\n", "User:", "###"]
        )
        return output['choices'][0]['text'].strip()
    except Exception as e:
        # Best-effort: report the failure inline instead of crashing the run.
        return f" LLM Review Error: {str(e)}"
81
+
82
+
83
+ # Exporting Results
84
+
85
def export_to_json(matched_clauses, filename="clause_reviews.json"):
    """Write clause reviews to ``filename`` as pretty-printed UTF-8 JSON.

    Args:
        matched_clauses: list of {"label", "section", "review"} dicts.
        filename: destination path.

    Returns:
        The path that was written.
    """
    import json
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(matched_clauses, f, indent=2, ensure_ascii=False)
    # Fixed: the message previously printed a broken "(unknown)" placeholder
    # instead of the actual destination path.
    print(f" JSON report saved to {filename}")
    return filename
94
+
95
def export_to_pdf(matched_clauses, filename="clause_reviews.pdf"):
    """Write clause reviews to ``filename`` as a simple PDF report.

    Args:
        matched_clauses: list of {"label", "section", "review"} dicts.
        filename: destination path.

    Returns:
        The path that was written.
    """
    from fpdf import FPDF

    def _latin1(text):
        # FPDF's built-in Arial font only covers Latin-1; replace anything
        # else so exotic characters degrade gracefully instead of raising
        # UnicodeEncodeError mid-report.
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 10, "ClauseLens - Legal Contract Review Report", align='C', ln=True)
    pdf.ln(10)

    for clause in matched_clauses:
        # Clause label heading
        pdf.set_font("Arial", 'B', 12)
        label = clause.get('label', 'Unknown').replace('_', ' ').title()
        pdf.cell(0, 8, txt=_latin1(f"Clause: {label}"), ln=True)
        pdf.ln(2)

        # Section excerpt
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="Excerpt:", ln=True)
        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 6, txt=_latin1(clause.get('section', 'N/A')))
        pdf.ln(4)

        # LLM review
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, txt="LLM Review:", ln=True)
        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 6, txt=_latin1(clause.get('review', 'No review available')))
        pdf.ln(8)

    pdf.output(filename)
    # Fixed: the message previously printed a broken "(unknown)" placeholder
    # instead of the actual destination path.
    print(f" PDF report saved to {filename}")
    return filename
parser.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import fitz
3
+ from docx import Document
4
+ import os
5
+
6
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyMuPDF.

    Returns:
        str: concatenated text of all pages, or "" on any failure.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # join() avoids quadratic += concatenation over many pages.
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()  # release the file handle even if extraction fails
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
17
+
18
def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file using python-docx.

    Returns:
        str: paragraph texts joined by newlines, or "" on any failure.
    """
    try:
        document = Document(docx_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        print(f"Error reading DOCX {docx_path}: {e}")
        return ""
26
+
27
def load_documents(folder="data"):
    """Load text from every supported document (.pdf/.docx) in ``folder``.

    Returns:
        list[str]: extracted text, one entry per recognized file (entries may
        be "" when extraction fails).
    """
    texts = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # Compare extensions case-insensitively so ".PDF"/".DOCX" are not
        # silently skipped (the original used case-sensitive endswith()).
        ext = os.path.splitext(file)[1].lower()
        if ext == ".pdf":
            texts.append(extract_text_from_pdf(path))
        elif ext == ".docx":
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {file}")
    return texts
spacy_matcher.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import spacy
3
+ from spacy.matcher import Matcher
4
+
5
+
6
# Shared spaCy pipeline and token matcher used by find_clauses().
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Token patterns per clause category; each label has several phrasing variants.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non-disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        [{"LOWER": "non-compete"}],
        [{"LOWER": "non"}, {"LOWER": "compete"}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [{"LOWER": "due"}, {"LOWER": "within"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["days", "weeks"]}}],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ],
}

# Register every phrasing variant under its clause label.
for clause_label, variants in clause_patterns.items():
    for variant in variants:
        matcher.add(clause_label, [variant])

print("Clause matcher loaded with extended patterns.")
63
+
64
def find_clauses(text, window_size=30):
    """Run the clause matcher over ``text`` and attach surrounding context.

    Returns:
        list of (label, context, start, end) tuples, where context is the
        matched span widened by ``window_size`` tokens on each side and
        start/end are token offsets of the raw match.
    """
    doc = nlp(text)
    doc_len = len(doc)
    hits = []
    for match_id, begin, finish in matcher(doc):
        clause_label = nlp.vocab.strings[match_id]
        # Widen the match into a context window, clamped to the doc bounds.
        left = max(0, begin - window_size)
        right = min(doc_len, finish + window_size)
        hits.append((clause_label, doc[left:right].text, begin, finish))
    return hits