Commit ·
9cb0411
1
Parent(s): f7ab2c4
changed code
Browse files- Dockerfile +27 -13
- app.py +202 -94
- requirements.txt +7 -20
Dockerfile
CHANGED
|
@@ -1,18 +1,32 @@
|
|
| 1 |
-
FROM
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
-
COPY requirements.txt /app/
|
| 5 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 6 |
-
COPY . /app/
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
CMD ["python", "app.py"]
|
|
|
|
| 1 |
+
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Keep Python from writing .pyc files, flush stdout/stderr immediately so logs
# show up in the container output, and stop apt from prompting during the build.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    DEBIAN_FRONTEND=noninteractive

# Gradio binds to 127.0.0.1 by default, which is unreachable from outside the
# container. app.py calls demo.launch() with no arguments, so the bind address
# and port are supplied through Gradio's environment variables instead.
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        python3-dev \
        build-essential \
        git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user (uid 1000) with a writable home directory so the
# Hugging Face model cache under ~/.cache can be created at runtime.
RUN useradd --create-home --uid 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory
WORKDIR /app

# Copy the requirements file on its own so the dependency layer stays cached
# when only app.py changes.
COPY requirements.txt .

# Install Python dependencies
RUN pip3 install --no-cache-dir -U pip setuptools wheel \
    && pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user:user app.py .

USER user

# Port the Gradio server listens on (see GRADIO_SERVER_PORT above).
EXPOSE 7860

# Set the default command to run the application
CMD ["python3", "app.py"]
|
|
|
app.py
CHANGED
|
@@ -7,9 +7,11 @@ from io import BytesIO, IOBase
|
|
| 7 |
import tempfile
|
| 8 |
import re
|
| 9 |
import datetime
|
| 10 |
-
import
|
| 11 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
###############################################################################
|
| 15 |
# 1) Logging Configuration
|
|
@@ -21,127 +23,234 @@ logging.basicConfig(
|
|
| 21 |
logger = logging.getLogger("LLM-Legal-App")
|
| 22 |
|
| 23 |
###############################################################################
|
| 24 |
-
# 2)
|
| 25 |
###############################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
if not api_key:
|
| 31 |
-
logger.error("Hugging Face API key not found in environment variables.")
|
| 32 |
-
raise ValueError("Hugging Face API key not found. Set it with `os.environ['HUGGINGFACE_API_KEY'] = 'your_api_key'`")
|
| 33 |
-
|
| 34 |
-
logger.info("Successfully retrieved Hugging Face API key.")
|
| 35 |
-
|
| 36 |
|
| 37 |
###############################################################################
|
| 38 |
-
# 3)
|
| 39 |
###############################################################################
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def generate_legal_document(doc_type, party_a, party_b, context, country):
|
| 49 |
"""
|
| 50 |
-
Uses
|
| 51 |
"""
|
| 52 |
logger.info(f"Starting generation for doc_type={doc_type!r}.")
|
|
|
|
| 53 |
party_a = party_a if party_a else "[Party A Not Provided]"
|
| 54 |
party_b = party_b if party_b else "[Party B Not Provided]"
|
| 55 |
context = context if context else "[Context Not Provided]"
|
| 56 |
|
| 57 |
prompt = f"""
|
| 58 |
-
Generate a {doc_type} for:
|
| 59 |
-
1) {party_a}
|
| 60 |
-
2) {party_b}
|
| 61 |
-
|
| 62 |
-
Context/brief of the agreement:
|
| 63 |
-
{context}.
|
| 64 |
-
|
| 65 |
-
The document should include:
|
| 66 |
-
- Purpose of the {doc_type}
|
| 67 |
-
- Responsibilities and obligations of each party
|
| 68 |
-
- Confidentiality terms
|
| 69 |
-
- Payment terms (use [To Be Determined] if not specified)
|
| 70 |
-
- Term (duration) and termination
|
| 71 |
-
- Governing law: {country}
|
| 72 |
-
- Jurisdiction: [Appropriate region in {country} if not provided]
|
| 73 |
-
- Signature blocks
|
| 74 |
-
|
| 75 |
-
Use formal language, but keep it relatively clear and readable.
|
| 76 |
-
For any missing information, use placeholders like [To Be Determined].
|
| 77 |
-
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
|
| 78 |
-
"""
|
| 79 |
logger.debug(f"Generated prompt:\n{prompt}")
|
| 80 |
|
| 81 |
-
|
| 82 |
-
# Use the Hugging Face pipeline
|
| 83 |
-
generated_text = generator(
|
| 84 |
-
prompt,
|
| 85 |
-
max_length=1400, # Adjust as needed
|
| 86 |
-
num_return_sequences=1,
|
| 87 |
-
temperature=0.3, # Adjust as needed
|
| 88 |
-
)[0]['generated_text']
|
| 89 |
-
|
| 90 |
-
logger.info("Document generation complete.")
|
| 91 |
-
return generated_text
|
| 92 |
-
|
| 93 |
-
except Exception as e:
|
| 94 |
-
logger.exception("Error generating legal document.")
|
| 95 |
-
return f"Error generating document: {e}"
|
| 96 |
-
|
| 97 |
|
| 98 |
def review_legal_document(doc_text, doc_type, party_a, party_b):
|
| 99 |
-
"""
|
|
|
|
|
|
|
| 100 |
logger.info("Starting document review (rule-based and wording).")
|
| 101 |
|
| 102 |
# --- Rule-Based Review ---
|
| 103 |
rule_based_prompt = f"""
|
| 104 |
-
|
|
|
|
|
|
|
| 105 |
|
| 106 |
Document text:
|
| 107 |
\"\"\"
|
| 108 |
{doc_text}
|
| 109 |
\"\"\"
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"""
|
| 112 |
logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
|
| 113 |
|
| 114 |
try:
|
| 115 |
-
rule_based_review =
|
| 116 |
-
rule_based_prompt,
|
| 117 |
-
max_length=2000,
|
| 118 |
-
num_return_sequences=1,
|
| 119 |
-
temperature=0.3,
|
| 120 |
-
)[0]['generated_text']
|
| 121 |
except Exception as e:
|
| 122 |
logger.exception("Error during rule-based review.")
|
| 123 |
return f"Error during rule-based review: {e}"
|
| 124 |
|
| 125 |
# --- Wording Analysis ---
|
| 126 |
wording_analysis_prompt = f"""
|
| 127 |
-
|
| 128 |
|
| 129 |
Document text:
|
| 130 |
\"\"\"
|
| 131 |
{doc_text}
|
| 132 |
\"\"\"
|
| 133 |
|
| 134 |
-
Provide
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
"""
|
| 136 |
logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
|
| 137 |
|
| 138 |
try:
|
| 139 |
-
wording_analysis =
|
| 140 |
-
wording_analysis_prompt,
|
| 141 |
-
max_length=1400,
|
| 142 |
-
num_return_sequences=1,
|
| 143 |
-
temperature=0.3,
|
| 144 |
-
)[0]['generated_text']
|
| 145 |
except Exception as e:
|
| 146 |
logger.exception("Error during wording analysis.")
|
| 147 |
return f"Error during wording analysis: {e}"
|
|
@@ -149,15 +258,13 @@ Provide an analysis covering: ... (rest of prompt from previous turns) ...
|
|
| 149 |
combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
|
| 150 |
return combined_review
|
| 151 |
|
| 152 |
-
|
| 153 |
###############################################################################
|
| 154 |
# 4) File Parsing (PDF, DOCX)
|
| 155 |
###############################################################################
|
| 156 |
|
| 157 |
def parse_bytesio(file_data: BytesIO) -> str:
|
| 158 |
-
"""Parses a BytesIO object."""
|
| 159 |
logger.info("Parsing BytesIO object...")
|
| 160 |
-
# ... (rest of parse_bytesio function from previous turns) ...
|
| 161 |
try:
|
| 162 |
# Attempt to determine file type from content
|
| 163 |
try:
|
|
@@ -181,7 +288,6 @@ def parse_bytesio(file_data: BytesIO) -> str:
|
|
| 181 |
|
| 182 |
def parse_uploaded_file_path(file_data) -> str:
|
| 183 |
"""Takes file data, determines type, extracts text."""
|
| 184 |
-
# ... (rest of parse_uploaded_file_path from previous turns)
|
| 185 |
if not file_data:
|
| 186 |
logger.warning("No file provided.")
|
| 187 |
return ""
|
|
@@ -222,7 +328,6 @@ def parse_uploaded_file_path(file_data) -> str:
|
|
| 222 |
|
| 223 |
def clean_markdown(text):
|
| 224 |
"""Removes common Markdown formatting."""
|
| 225 |
-
# ... (rest of clean_markdown from previous turns)
|
| 226 |
if not text: return ""
|
| 227 |
text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
|
| 228 |
text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
|
|
@@ -237,14 +342,13 @@ def clean_markdown(text):
|
|
| 237 |
def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
|
| 238 |
"""Creates DOCX, adds review, saves to temp file, returns path."""
|
| 239 |
logger.debug("Creating and saving DOCX.")
|
| 240 |
-
# ... (rest of create_and_save_docx from previous turns) ...
|
| 241 |
document = docx.Document()
|
| 242 |
|
| 243 |
now = datetime.datetime.now()
|
| 244 |
timestamp = now.strftime("%Y%m%d_%H%M%S")
|
| 245 |
-
file_name = f"
|
| 246 |
|
| 247 |
-
title = f"
|
| 248 |
document.add_heading(title, level=1)
|
| 249 |
|
| 250 |
if doc_text:
|
|
@@ -277,7 +381,6 @@ def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a
|
|
| 277 |
else: # Other sections (if any)
|
| 278 |
document.add_paragraph(section)
|
| 279 |
|
| 280 |
-
|
| 281 |
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
|
| 282 |
document.save(tmpfile.name)
|
| 283 |
logger.debug(f"DOCX saved to: {tmpfile.name}")
|
|
@@ -292,7 +395,7 @@ def generate_document_interface(doc_type, party_a, party_b, context, country):
|
|
| 292 |
logger.info(f"User requested doc generation: {doc_type}, {country}")
|
| 293 |
doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
|
| 294 |
if doc_text.startswith("Error"):
|
| 295 |
-
|
| 296 |
docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
|
| 297 |
return doc_text, docx_file_path
|
| 298 |
|
|
@@ -316,6 +419,7 @@ def review_document_interface(file_data, doc_type, party_a, party_b):
|
|
| 316 |
###############################################################################
|
| 317 |
# 7) Build & Launch Gradio App
|
| 318 |
###############################################################################
|
|
|
|
| 319 |
custom_css = """
|
| 320 |
.tab-one {
|
| 321 |
background-color: #D1EEFC; /* Light blue */
|
|
@@ -325,13 +429,15 @@ custom_css = """
|
|
| 325 |
background-color: #FCEED1; /* Light orange */
|
| 326 |
color: #333;
|
| 327 |
}
|
|
|
|
|
|
|
| 328 |
"""
|
| 329 |
|
| 330 |
def build_app():
|
| 331 |
with gr.Blocks(css=custom_css) as demo:
|
| 332 |
gr.Markdown(
|
| 333 |
"""
|
| 334 |
-
# UST Global
|
| 335 |
|
| 336 |
**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
|
| 337 |
|
|
@@ -339,7 +445,7 @@ def build_app():
|
|
| 339 |
"""
|
| 340 |
)
|
| 341 |
with gr.Tabs(selected=1):
|
| 342 |
-
with gr.Tab("Generate Document",visible=False):
|
| 343 |
doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
|
| 344 |
party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
|
| 345 |
party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
|
|
@@ -354,7 +460,7 @@ def build_app():
|
|
| 354 |
outputs=[gen_output_text, gen_output_file]
|
| 355 |
)
|
| 356 |
|
| 357 |
-
with gr.Tab("Review Document",elem_classes="tab-one", id=1):
|
| 358 |
# Hidden inputs to store values from Generate tab
|
| 359 |
doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
|
| 360 |
party_a_review = gr.Textbox(label="Party A Name", visible=False)
|
|
@@ -372,11 +478,13 @@ def build_app():
|
|
| 372 |
# Copy values from Generate to Review tab (hidden fields)
|
| 373 |
gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
|
| 374 |
|
| 375 |
-
|
| 376 |
-
|
| 377 |
return demo
|
| 378 |
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
import tempfile
|
| 8 |
import re
|
| 9 |
import datetime
|
| 10 |
+
import torch
|
|
|
|
| 11 |
|
| 12 |
+
import gradio as gr
|
| 13 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 14 |
+
import huggingface_hub
|
| 15 |
|
| 16 |
###############################################################################
|
| 17 |
# 1) Logging Configuration
|
|
|
|
| 23 |
logger = logging.getLogger("LLM-Legal-App")
|
| 24 |
|
| 25 |
###############################################################################
|
| 26 |
+
# 2) Initialize Hugging Face Model
|
| 27 |
###############################################################################
|
| 28 |
+
def initialize_model():
    """Load the DocumentCogito model and tokenizer from the Hugging Face Hub.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If the tokenizer or model cannot be loaded. The original
            exception is chained as ``__cause__`` so the root failure stays
            visible in tracebacks.
    """
    logger.info("Initializing DocumentCogito model and tokenizer...")
    model_name = "Daemontatox/DocumentCogito"

    # Half precision is only a safe default on a GPU; fall back to float32
    # when no CUDA device is available so CPU-only hosts can still run.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",
            # NOTE(review): trust_remote_code executes Python code shipped in
            # the model repository. Confirm the repository is trusted, or pin
            # a revision, before deploying.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized DocumentCogito model and tokenizer.")
    return model, tokenizer
|
| 48 |
|
| 49 |
+
# Initialize model and tokenizer
|
| 50 |
+
model, tokenizer = initialize_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
###############################################################################
|
| 53 |
+
# 3) LLM Utility Functions (Generation & Review)
|
| 54 |
###############################################################################
|
| 55 |
+
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Text to feed to the model.
        max_length: Upper bound on the number of newly generated tokens. It is
            passed to ``generate`` as ``max_new_tokens``, so the prompt's own
            tokens do not count against it.
        temperature: Sampling temperature. A value of 0 or less switches to
            greedy decoding.

    Returns:
        The generated continuation, without the prompt and with surrounding
        whitespace stripped. On failure a string beginning with
        ``"Error generating text: "`` is returned rather than raising.
    """
    logger.info("Generating text with DocumentCogito model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        use_sampling = temperature > 0
        generation_config = {
            "max_new_tokens": max_length,
            "do_sample": use_sampling,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # temperature/top_p only apply when sampling; passing them alongside
        # do_sample=False is at best ignored with a warning.
        if use_sampling:
            generation_config["temperature"] = temperature
            generation_config["top_p"] = 0.9

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # The output sequence starts with the prompt tokens. Drop them by
        # position instead of comparing decoded text, which silently fails
        # whenever decoding does not reproduce the prompt character for
        # character.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
|
| 86 |
|
| 87 |
def generate_legal_document(doc_type, party_a, party_b, context, country):
    """Build a drafting prompt and generate a legal document from it.

    Args:
        doc_type: Kind of document to draft; interpolated into the prompt.
        party_a: Name of the first party; a placeholder is used when empty.
        party_b: Name of the second party; a placeholder is used when empty.
        context: Brief of the agreement; a placeholder is used when empty.
        country: Governing-law country; a placeholder is used when empty.

    Returns:
        Whatever ``generate_with_model`` returns for the prompt: the document
        text, or its error string if generation failed.
    """
    logger.info("Starting generation for doc_type=%r.", doc_type)

    # Fill placeholders if fields are missing. ``country`` is included so an
    # empty value does not end up in the prompt as "None" or a blank.
    party_a = party_a or "[Party A Not Provided]"
    party_b = party_b or "[Party B Not Provided]"
    context = context or "[Context Not Provided]"
    country = country or "[Country Not Provided]"

    prompt = f"""
You are a helpful legal assistant. Generate a {doc_type} for:
1) {party_a}
2) {party_b}

Context/brief of the agreement:
{context}.

The document should include:
- Purpose of the {doc_type}
- Responsibilities and obligations of each party
- Confidentiality terms
- Payment terms (use [To Be Determined] if not specified)
- Term (duration) and termination
- Governing law: {country}
- Jurisdiction: [Appropriate region in {country} if not provided]
- Signature blocks

Use formal language, but keep it relatively clear and readable.
For any missing information, use placeholders like [To Be Determined].
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
"""
    logger.debug("Generated prompt:\n%s", prompt)

    return generate_with_model(prompt, max_length=1400, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
def review_legal_document(doc_text, doc_type, party_a, party_b):
|
| 124 |
+
"""
|
| 125 |
+
Reviews document: first with rule-based checks, then wording analysis.
|
| 126 |
+
"""
|
| 127 |
logger.info("Starting document review (rule-based and wording).")
|
| 128 |
|
| 129 |
# --- Rule-Based Review ---
|
| 130 |
rule_based_prompt = f"""
|
| 131 |
+
You are a legal AI assistant reviewing a document. Provide a review,
|
| 132 |
+
structured into the following numbered sections. Be concise and factual. Do NOT
|
| 133 |
+
use Markdown. Use plain text labels for each section.
|
| 134 |
|
| 135 |
Document text:
|
| 136 |
\"\"\"
|
| 137 |
{doc_text}
|
| 138 |
\"\"\"
|
| 139 |
+
|
| 140 |
+
Review Sections:
|
| 141 |
+
|
| 142 |
+
1) Parties and Authority:
|
| 143 |
+
- Confirm the full legal names of all parties.
|
| 144 |
+
- Make sure the people signing can legally commit their organizations.
|
| 145 |
+
|
| 146 |
+
2) Scope of Work / Obligations:
|
| 147 |
+
- Check that the contract clearly describes what each side must do.
|
| 148 |
+
- Look for deadlines, milestones, or deliverables.
|
| 149 |
+
- Ensure everything is realistic and not overly vague.
|
| 150 |
+
|
| 151 |
+
3) Definitions and Key Terms:
|
| 152 |
+
- See if there's a section that explains important terms.
|
| 153 |
+
- Ensure those terms are used the same way throughout the contract.
|
| 154 |
+
- Avoid or clarify any ambiguous language.
|
| 155 |
+
|
| 156 |
+
4) Payment Terms (If Applicable):
|
| 157 |
+
- Check how much is owed, the currency, and when it's due.
|
| 158 |
+
- Look for penalties, interest, or late fees.
|
| 159 |
+
- Note how and when invoices are sent or paid.
|
| 160 |
+
|
| 161 |
+
5) Term and Termination:
|
| 162 |
+
- Identify when the contract starts and ends.
|
| 163 |
+
- Understand how it can be renewed.
|
| 164 |
+
- See the conditions and notice required for ending the contract early.
|
| 165 |
+
|
| 166 |
+
6) Intellectual Property (IP) Rights:
|
| 167 |
+
- Confirm who owns any work created under the agreement.
|
| 168 |
+
- Note if licenses are granted for using the IP, and for how long.
|
| 169 |
+
|
| 170 |
+
7) Confidentiality and Privacy:
|
| 171 |
+
- Check what is considered confidential information.
|
| 172 |
+
- Look for exceptions (like already public info).
|
| 173 |
+
- See how long the confidentiality rules apply.
|
| 174 |
+
|
| 175 |
+
8) Warranties and Representations:
|
| 176 |
+
- Note any performance guarantees or quality promises.
|
| 177 |
+
- Look for disclaimers (like "as is" clauses).
|
| 178 |
+
|
| 179 |
+
9) Indemnification:
|
| 180 |
+
- See who will pay legal costs or damages if there's a lawsuit or claim.
|
| 181 |
+
- Check any limits on what's covered.
|
| 182 |
+
|
| 183 |
+
10) Limitation of Liability:
|
| 184 |
+
- Check if there's a maximum amount one side can claim in damages.
|
| 185 |
+
- Look for excluded damages, like lost profits.
|
| 186 |
+
|
| 187 |
+
11) Dispute Resolution and Governing Law:
|
| 188 |
+
- See if disputes go to arbitration, mediation, or court.
|
| 189 |
+
- Note which state or country's laws will apply.
|
| 190 |
+
|
| 191 |
+
12) Force Majeure (Unforeseen Events):
|
| 192 |
+
- Look for events like natural disasters or war that could suspend obligations.
|
| 193 |
+
- See if there are notice requirements for these events.
|
| 194 |
+
|
| 195 |
+
13) Notices and Amendments:
|
| 196 |
+
- Check how official notices must be sent (email, mail, etc.).
|
| 197 |
+
- Find out how to properly change the contract (in writing, signatures, etc.).
|
| 198 |
+
|
| 199 |
+
14) Entire Agreement and Severability:
|
| 200 |
+
- Confirm that this contract replaces all previous agreements.
|
| 201 |
+
- Ensure that if one clause is invalid, the rest still stands.
|
| 202 |
+
|
| 203 |
+
15) Signatures and Dates:
|
| 204 |
+
- Make sure the right people sign in their proper roles.
|
| 205 |
+
- Verify the date of signature and when the contract goes into effect.
|
| 206 |
+
|
| 207 |
+
16) Ambiguities, Contradictions, and Hidden Clauses:
|
| 208 |
+
- Watch for contradictory statements or clauses that conflict.
|
| 209 |
+
- Beware of vague phrases like "best efforts" without clear guidelines.
|
| 210 |
+
- Check for hidden or "buried" clauses in fine print or attachments.
|
| 211 |
+
|
| 212 |
+
17) Compliance and Regulatory Alignment:
|
| 213 |
+
- Ensure the contract follows relevant laws and rules.
|
| 214 |
+
- Check for industry-specific requirements.
|
| 215 |
+
|
| 216 |
+
18) Practical Considerations:
|
| 217 |
+
- Make sure deadlines and other requirements are doable.
|
| 218 |
+
- Confirm all negotiations are reflected in writing.
|
| 219 |
+
- Avoid blank or undefined items (like fees or dates "to be decided").
|
| 220 |
"""
|
| 221 |
logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
|
| 222 |
|
| 223 |
try:
|
| 224 |
+
rule_based_review = generate_with_model(rule_based_prompt, max_length=2000, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
except Exception as e:
|
| 226 |
logger.exception("Error during rule-based review.")
|
| 227 |
return f"Error during rule-based review: {e}"
|
| 228 |
|
| 229 |
# --- Wording Analysis ---
|
| 230 |
wording_analysis_prompt = f"""
|
| 231 |
+
You are a legal AI assistant. Analyze the following legal document for its wording:
|
| 232 |
|
| 233 |
Document text:
|
| 234 |
\"\"\"
|
| 235 |
{doc_text}
|
| 236 |
\"\"\"
|
| 237 |
|
| 238 |
+
Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:
|
| 239 |
+
|
| 240 |
+
1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
|
| 241 |
+
2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
|
| 242 |
+
3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
|
| 243 |
+
4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
|
| 244 |
+
5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
|
| 245 |
+
6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
|
| 246 |
+
7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.
|
| 247 |
+
|
| 248 |
+
Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
|
| 249 |
"""
|
| 250 |
logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
|
| 251 |
|
| 252 |
try:
|
| 253 |
+
wording_analysis = generate_with_model(wording_analysis_prompt, max_length=1000, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
except Exception as e:
|
| 255 |
logger.exception("Error during wording analysis.")
|
| 256 |
return f"Error during wording analysis: {e}"
|
|
|
|
| 258 |
combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
|
| 259 |
return combined_review
|
| 260 |
|
|
|
|
| 261 |
###############################################################################
|
| 262 |
# 4) File Parsing (PDF, DOCX)
|
| 263 |
###############################################################################
|
| 264 |
|
| 265 |
def parse_bytesio(file_data: BytesIO) -> str:
|
| 266 |
+
"""Parses a BytesIO object representing a PDF or DOCX."""
|
| 267 |
logger.info("Parsing BytesIO object...")
|
|
|
|
| 268 |
try:
|
| 269 |
# Attempt to determine file type from content
|
| 270 |
try:
|
|
|
|
| 288 |
|
| 289 |
def parse_uploaded_file_path(file_data) -> str:
|
| 290 |
"""Takes file data, determines type, extracts text."""
|
|
|
|
| 291 |
if not file_data:
|
| 292 |
logger.warning("No file provided.")
|
| 293 |
return ""
|
|
|
|
| 328 |
|
| 329 |
def clean_markdown(text):
|
| 330 |
"""Removes common Markdown formatting."""
|
|
|
|
| 331 |
if not text: return ""
|
| 332 |
text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
|
| 333 |
text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
|
|
|
|
| 342 |
def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
|
| 343 |
"""Creates DOCX, adds review, saves to temp file, returns path."""
|
| 344 |
logger.debug("Creating and saving DOCX.")
|
|
|
|
| 345 |
document = docx.Document()
|
| 346 |
|
| 347 |
now = datetime.datetime.now()
|
| 348 |
timestamp = now.strftime("%Y%m%d_%H%M%S")
|
| 349 |
+
file_name = f"HF_AI_Review_{doc_type}_{timestamp}.docx"
|
| 350 |
|
| 351 |
+
title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
|
| 352 |
document.add_heading(title, level=1)
|
| 353 |
|
| 354 |
if doc_text:
|
|
|
|
| 381 |
else: # Other sections (if any)
|
| 382 |
document.add_paragraph(section)
|
| 383 |
|
|
|
|
| 384 |
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
|
| 385 |
document.save(tmpfile.name)
|
| 386 |
logger.debug(f"DOCX saved to: {tmpfile.name}")
|
|
|
|
| 395 |
logger.info(f"User requested doc generation: {doc_type}, {country}")
|
| 396 |
doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
|
| 397 |
if doc_text.startswith("Error"):
|
| 398 |
+
return doc_text, None
|
| 399 |
docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
|
| 400 |
return doc_text, docx_file_path
|
| 401 |
|
|
|
|
| 419 |
###############################################################################
|
| 420 |
# 7) Build & Launch Gradio App
|
| 421 |
###############################################################################
|
| 422 |
+
# Define custom CSS in a string.
|
| 423 |
custom_css = """
|
| 424 |
.tab-one {
|
| 425 |
background-color: #D1EEFC; /* Light blue */
|
|
|
|
| 429 |
background-color: #FCEED1; /* Light orange */
|
| 430 |
color: #333;
|
| 431 |
}
|
| 432 |
+
/* If you want to style the tab label differently, you may need to target
|
| 433 |
+
specific child elements (like a .tab__header) within the class. */
|
| 434 |
"""
|
| 435 |
|
| 436 |
def build_app():
|
| 437 |
with gr.Blocks(css=custom_css) as demo:
|
| 438 |
gr.Markdown(
|
| 439 |
"""
|
| 440 |
+
# UST Global Legal Document Analyzer (Hugging Face Version)
|
| 441 |
|
| 442 |
**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
|
| 443 |
|
|
|
|
| 445 |
"""
|
| 446 |
)
|
| 447 |
with gr.Tabs(selected=1):
|
| 448 |
+
with gr.Tab("Generate Document", visible=False):
|
| 449 |
doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
|
| 450 |
party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
|
| 451 |
party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
|
|
|
|
| 460 |
outputs=[gen_output_text, gen_output_file]
|
| 461 |
)
|
| 462 |
|
| 463 |
+
with gr.Tab("Review Document", elem_classes="tab-one", id=1):
|
| 464 |
# Hidden inputs to store values from Generate tab
|
| 465 |
doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
|
| 466 |
party_a_review = gr.Textbox(label="Party A Name", visible=False)
|
|
|
|
| 478 |
# Copy values from Generate to Review tab (hidden fields)
|
| 479 |
gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
|
| 480 |
|
| 481 |
+
gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
|
|
|
|
| 482 |
return demo
|
| 483 |
|
| 484 |
+
# For Hugging Face Spaces deployment
|
| 485 |
+
if __name__ == "__main__":
    # Script entry point: build the Gradio Blocks UI, then start serving it.
    logger.info("Initializing Gradio interface...")
    demo = build_app()

    logger.info("Launching Gradio app.")
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,20 +1,7 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
# Other commonly used packages (you might already have these)
|
| 10 |
-
# - If you encounter installation issues, you can try removing or commenting out
|
| 11 |
-
# lines for packages you believe are already installed correctly in your
|
| 12 |
-
# environment. However, it's generally good practice to include them
|
| 13 |
-
# for reproducibility.
|
| 14 |
-
typing-extensions>=4.0.0 # For type hints and compatibility (often a dependency)
|
| 15 |
-
requests>=2.0.0 # Used by transformers and other libraries
|
| 16 |
-
filelock>=3.0.0 # Used by transformers for managing cache
|
| 17 |
-
packaging>=20.0 # For version handling
|
| 18 |
-
regex!=2019.12.17 # Used for text processing
|
| 19 |
-
tqdm>=4.27 # For progress bars (used by transformers)
|
| 20 |
-
numpy>=1.17 # Fundamental numerical computing
|
|
|
|
| 1 |
+
gradio>=3.50.2
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
python-docx>=0.8.11
|
| 5 |
+
PyPDF2>=3.0.0
|
| 6 |
+
huggingface_hub>=0.19.0
|
| 7 |
+
accelerate>=0.20.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|