Spaces:
Sleeping
Sleeping
Commit ·
d15efc9
1
Parent(s): 6fd8649
code refactor
Browse files- api/__init__.py +0 -0
- api/routes.py +0 -0
- api/schemas.py +0 -0
- app.py +733 -0
- config/model_config.py +79 -28
- config/risk_rules.py +75 -102
- config/settings.py +34 -29
- launch.py +145 -0
- model_manager/llm_manager.py +1 -1
- model_manager/model_cache.py +1 -0
- model_manager/model_loader.py +5 -1
- model_manager/model_registry.py +3 -1
- reporter/pdf_generator.py +496 -0
- requirements.txt +29 -12
- services/clause_extractor.py +369 -428
- services/contract_classifier.py +215 -216
- static/app.js +0 -0
- static/index.html +1404 -0
- static/style.css +0 -0
- utils/text_processor.py +213 -191
api/__init__.py
DELETED
|
File without changes
|
api/routes.py
DELETED
|
File without changes
|
api/schemas.py
DELETED
|
File without changes
|
app.py
CHANGED
|
@@ -0,0 +1,733 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI Application for AI Contract Risk Analyzer
|
| 3 |
+
Complete pre-loading approach: All models loaded at startup
|
| 4 |
+
Direct synchronous flow: Upload → Analyze → Return Results + PDF
|
| 5 |
+
"""
|
| 6 |
+
from fastapi.responses import JSONResponse, FileResponse, Response
|
| 7 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from fastapi.staticfiles import StaticFiles
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
from typing import List, Optional, Dict, Any
|
| 12 |
+
import uuid
|
| 13 |
+
import os
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import sys
|
| 17 |
+
import tempfile
|
| 18 |
+
import io
|
| 19 |
+
|
| 20 |
+
# Add parent directory to path
|
| 21 |
+
sys.path.append(str(Path(__file__).parent))
|
| 22 |
+
|
| 23 |
+
# Import all services
|
| 24 |
+
from config.settings import settings
|
| 25 |
+
from config.risk_rules import ContractType
|
| 26 |
+
from model_manager.model_loader import ModelLoader
|
| 27 |
+
from utils.document_reader import DocumentReader
|
| 28 |
+
from utils.validators import ContractValidator
|
| 29 |
+
from utils.text_processor import TextProcessor
|
| 30 |
+
from utils.logger import ContractAnalyzerLogger, log_info, log_error
|
| 31 |
+
|
| 32 |
+
from services.contract_classifier import ContractClassifier
|
| 33 |
+
from services.clause_extractor import ClauseExtractor
|
| 34 |
+
from services.risk_analyzer import MultiFactorRiskAnalyzer
|
| 35 |
+
from services.term_analyzer import TermAnalyzer
|
| 36 |
+
from services.protection_checker import ProtectionChecker
|
| 37 |
+
from services.llm_interpreter import LLMClauseInterpreter
|
| 38 |
+
from services.negotiation_engine import NegotiationEngine
|
| 39 |
+
from services.market_comparator import MarketComparator
|
| 40 |
+
|
| 41 |
+
# Import PDF generator
|
| 42 |
+
from reporter.pdf_generator import generate_pdf_report
|
| 43 |
+
|
| 44 |
+
# Initialize logger
|
| 45 |
+
ContractAnalyzerLogger.setup(log_dir="logs", app_name="contract_analyzer")
|
| 46 |
+
logger = ContractAnalyzerLogger.get_logger()
|
| 47 |
+
|
| 48 |
+
# ============================================================================
|
| 49 |
+
# PYDANTIC SCHEMAS
|
| 50 |
+
# ============================================================================
|
| 51 |
+
|
| 52 |
+
class HealthResponse(BaseModel):
    """Response schema for GET /api/v1/health."""
    # Overall liveness indicator; this app always reports "healthy" once up.
    status: str
    # Application version, sourced from settings.APP_VERSION.
    version: str
    # ISO-8601 timestamp of when this health check was produced.
    timestamp: str
    # Count of ML models currently held by the model registry.
    models_loaded: int
    # Count of analysis services whose status string contains "loaded".
    services_loaded: int
    # Memory delta (MB) measured across service pre-loading; 0.0 if psutil missing.
    memory_usage_mb: float
|
| 60 |
+
|
| 61 |
+
class AnalysisOptions(BaseModel):
    """Caller-tunable knobs controlling how much work one analysis does."""
    # Upper bound on clauses to extract; pydantic rejects values outside 5..30.
    max_clauses: int = Field(default=15, ge=5, le=30)
    # Run the (optional) LLM interpreter over extracted clauses.
    interpret_clauses: bool = Field(default=True)
    # Produce negotiation points from risk/term/protection findings.
    generate_negotiation_points: bool = Field(default=True)
    # Compare extracted clauses against market-standard terms.
    compare_to_market: bool = Field(default=True)
|
| 67 |
+
|
| 68 |
+
class AnalysisResult(BaseModel):
    """Complete analysis result returned by the /analyze endpoints."""
    # Server-generated UUID identifying this analysis run.
    analysis_id: str
    # ISO-8601 timestamp of when the analysis finished.
    timestamp: str
    # Contract classification payload (classifier's .to_dict()).
    classification: Dict[str, Any]
    # Extracted clauses, each a clause .to_dict().
    clauses: List[Dict[str, Any]]
    # Multi-factor risk score payload; includes "overall_score" / "risk_level".
    risk_analysis: Dict[str, Any]
    # Terms flagged as unfavorable to the uploading party.
    unfavorable_terms: List[Dict[str, Any]]
    # Standard protections the contract appears to lack.
    missing_protections: List[Dict[str, Any]]
    # None when interpretation was disabled or the interpreter is unavailable;
    # empty list when interpretation was attempted but failed.
    clause_interpretations: Optional[List[Dict[str, Any]]] = None
    # Same None/[] convention as clause_interpretations.
    negotiation_points: Optional[List[Dict[str, Any]]] = None
    # Same None/[] convention as clause_interpretations.
    market_comparisons: Optional[List[Dict[str, Any]]] = None
    # One-paragraph human-readable summary of the findings.
    executive_summary: str
    # Text stats, resolved contract type, and the options used for this run.
    metadata: Dict[str, Any]
    # Always True in this deployment: /generate-pdf can render this result.
    pdf_available: bool = True
|
| 83 |
+
|
| 84 |
+
class ErrorResponse(BaseModel):
    """Generic error payload schema (declared for API documentation)."""
    # Short machine-friendly error label.
    error: str
    # Human-readable explanation of what failed.
    detail: str
    # ISO-8601 timestamp of when the error occurred.
    timestamp: str
|
| 89 |
+
|
| 90 |
+
# ============================================================================
|
| 91 |
+
# SERVICE INITIALIZATION WITH FULL PRE-LOADING
|
| 92 |
+
# ============================================================================
|
| 93 |
+
|
| 94 |
+
class PreloadedAnalysisService:
    """Analysis service with complete pre-loading of all models.

    Construction eagerly builds every classifier, extractor, risk analyzer
    and (optionally) the LLM-backed services, so request handling never pays
    model-loading latency. Mandatory services that fail to load abort
    construction; optional ones (interpreter, negotiation engine) degrade to
    None and are skipped at analysis time.
    """

    def __init__(self):
        # Shared loader/registry handed to every model-backed service.
        self.model_loader = ModelLoader()
        # name -> service instance; optional services may be None.
        self.services = {}
        # name -> human-readable load status ("loaded" / "failed: ...").
        self.service_status = {}
        # Memory delta (MB) across pre-loading; stays 0 if psutil is absent.
        self.memory_usage_mb = 0
        self._preload_all_services()

    def _preload_all_services(self):
        """Pre-load ALL services and models at initialization.

        Raises:
            Exception: re-raised if any mandatory service (steps 1-6) fails.
        """
        log_info("PRE-LOADING ALL AI MODELS AND SERVICES")

        try:
            # Track memory usage
            initial_memory = self._get_memory_usage()

            # 1. Pre-load core classifier
            log_info("🔄 Pre-loading Contract Classifier...")
            self.services["classifier"] = ContractClassifier(self.model_loader)
            self.service_status["classifier"] = "loaded"
            log_info("✅ Contract Classifier loaded")

            # 2. Pre-load Term Analyzer
            log_info("🔄 Pre-loading Term Analyzer...")
            self.services["term_analyzer"] = TermAnalyzer()
            self.service_status["term_analyzer"] = "loaded"
            log_info("✅ Term Analyzer loaded")

            # 3. Pre-load Protection Checker
            log_info("🔄 Pre-loading Protection Checker...")
            self.services["protection_checker"] = ProtectionChecker()
            self.service_status["protection_checker"] = "loaded"
            log_info("✅ Protection Checker loaded")

            # 4. Pre-load Market Comparator
            log_info("🔄 Pre-loading Market Comparator...")
            self.services["market_comparator"] = MarketComparator(self.model_loader)
            self.service_status["market_comparator"] = "loaded"
            log_info("✅ Market Comparator loaded")

            # 5. Pre-load Clause Extractors for all major contract types.
            # Per-category failures are tolerated (slot set to None) so one
            # bad extractor model cannot block startup.
            log_info("🔄 Pre-loading Clause Extractors...")
            self.services["extractors"] = {}
            major_categories = ["employment", "consulting", "nda", "software", "service", "partnership"]

            for category in major_categories:
                try:
                    self.services["extractors"][category] = ClauseExtractor(
                        self.model_loader, contract_category=category
                    )
                    log_info(f" ✅ Clause Extractor for {category} loaded")
                except Exception as e:
                    log_error(f"Failed to load extractor for {category}: {e}")
                    self.services["extractors"][category] = None

            self.service_status["extractors"] = f"loaded for {len(major_categories)} categories"
            log_info("✅ All Clause Extractors loaded")

            # 6. Pre-load Risk Analyzers for all contract types.
            # Keyed by ContractType.value; failures tolerated as None.
            log_info("🔄 Pre-loading Risk Analyzers...")
            self.services["risk_analyzers"] = {}
            contract_types = [
                ContractType.EMPLOYMENT, ContractType.CONSULTING, ContractType.NDA,
                ContractType.SOFTWARE, ContractType.SERVICE, ContractType.PARTNERSHIP,
                ContractType.LEASE, ContractType.PURCHASE, ContractType.GENERAL
            ]

            for contract_type in contract_types:
                try:
                    self.services["risk_analyzers"][contract_type.value] = MultiFactorRiskAnalyzer(
                        contract_type=contract_type
                    )
                    log_info(f" ✅ Risk Analyzer for {contract_type.value} loaded")
                except Exception as e:
                    log_error(f"Failed to load risk analyzer for {contract_type.value}: {e}")
                    self.services["risk_analyzers"][contract_type.value] = None

            self.service_status["risk_analyzers"] = f"loaded for {len(contract_types)} types"
            log_info("✅ All Risk Analyzers loaded")

            # 7. Pre-load LLM Interpreter (if available) — optional service.
            log_info("🔄 Pre-loading LLM Interpreter...")
            try:
                self.services["interpreter"] = LLMClauseInterpreter()
                self.service_status["interpreter"] = "loaded"
                log_info("✅ LLM Interpreter loaded")
            except Exception as e:
                self.services["interpreter"] = None
                self.service_status["interpreter"] = f"failed: {str(e)}"
                log_info("⚠️ LLM Interpreter not available (will skip interpretation)")

            # 8. Pre-load Negotiation Engine (if available) — optional service.
            log_info("🔄 Pre-loading Negotiation Engine...")
            try:
                self.services["negotiation_engine"] = NegotiationEngine()
                self.service_status["negotiation_engine"] = "loaded"
                log_info("✅ Negotiation Engine loaded")
            except Exception as e:
                self.services["negotiation_engine"] = None
                self.service_status["negotiation_engine"] = f"failed: {str(e)}"
                log_info("⚠️ Negotiation Engine not available (will skip negotiation points)")

            # Calculate memory usage as the delta across pre-loading.
            final_memory = self._get_memory_usage()
            self.memory_usage_mb = final_memory - initial_memory

            log_info("🎉 ALL SERVICES PRE-LOADED SUCCESSFULLY!")
            log_info(f"📊 Memory Usage: {self.memory_usage_mb:.2f} MB")
            log_info(f"🔧 Services Loaded: {len(self.service_status)}")

        except Exception as e:
            # Mandatory service failed: the app cannot serve requests, so
            # propagate and let import/startup fail loudly.
            log_error(f"CRITICAL: Failed to pre-load services: {e}")
            raise

    def _get_memory_usage(self) -> float:
        """Get current process RSS in MB; 0.0 when psutil is not installed."""
        try:
            import psutil
            process = psutil.Process()
            return process.memory_info().rss / 1024 / 1024
        except ImportError:
            return 0.0

    def get_service_status(self) -> Dict[str, Any]:
        """Get detailed service status for the /health and /status endpoints."""
        model_stats = self.model_loader.get_registry_stats()
        return {
            "services": self.service_status,
            "models": model_stats,
            "memory_usage_mb": self.memory_usage_mb,
            # Counts any status string containing "loaded" (covers the
            # "loaded for N ..." aggregate entries as well).
            "total_services_loaded": len([s for s in self.service_status.values() if "loaded" in str(s)]),
            "total_models_loaded": model_stats.get("loaded_models", 0)
        }

    def analyze_contract(self, contract_text: str, options: AnalysisOptions) -> Dict[str, Any]:
        """Synchronous contract analysis using pre-loaded services.

        Pipeline: classify -> extract clauses -> risk score -> unfavorable
        terms -> missing protections -> optional interpretation, negotiation
        points and market comparison -> executive summary.

        Args:
            contract_text: Full plain-text contract.
            options: Per-request analysis toggles and limits.

        Returns:
            Dict shaped like the AnalysisResult schema.

        Raises:
            Exception: re-raised from any mandatory pipeline stage; optional
                stages fail soft (logged, field set to []).
        """
        try:
            log_info("Starting contract analysis with pre-loaded services...")

            # Step 1: Classify contract
            classification = self.services["classifier"].classify_contract(contract_text)
            classification_dict = classification.to_dict()
            actual_category = classification.category

            log_info(f"Contract classified as: {actual_category}")

            # Step 2: Get appropriate extractor
            extractor = self.services["extractors"].get(actual_category)
            if not extractor:
                # Fallback to first available extractor or create new one
                available_categories = [cat for cat, ext in self.services["extractors"].items() if ext is not None]
                if available_categories:
                    fallback_category = available_categories[0]
                    extractor = self.services["extractors"][fallback_category]
                    log_info(f"Using fallback extractor for: {fallback_category}")
                else:
                    # Create new extractor for this category (cached for reuse)
                    extractor = ClauseExtractor(self.model_loader, contract_category=actual_category)
                    self.services["extractors"][actual_category] = extractor

            # Extract clauses
            clauses = extractor.extract_clauses(contract_text, options.max_clauses)
            clauses_dict = [clause.to_dict() for clause in clauses]
            log_info(f"Extracted {len(clauses)} clauses")

            # Step 3: Map to ContractType and get appropriate risk analyzer.
            # Classifier categories and ContractType values differ; unmapped
            # categories fall back to ContractType.GENERAL.
            contract_type_mapping = {
                'employment': ContractType.EMPLOYMENT,
                'consulting': ContractType.CONSULTING,
                'nda': ContractType.NDA,
                'technology': ContractType.SOFTWARE,
                'software': ContractType.SOFTWARE,
                'service_agreement': ContractType.SERVICE,
                'business': ContractType.PARTNERSHIP,
                'real_estate': ContractType.LEASE,
                'sales': ContractType.PURCHASE,
            }
            contract_type = contract_type_mapping.get(actual_category, ContractType.GENERAL)

            risk_analyzer = self.services["risk_analyzers"].get(contract_type.value)
            if not risk_analyzer:
                # Fallback to general analyzer
                risk_analyzer = self.services["risk_analyzers"]["general"]

            # Analyze risk
            risk_score = risk_analyzer.analyze_risk(contract_text, clauses)
            risk_dict = risk_score.to_dict()
            log_info(f"Risk analysis completed: {risk_dict['overall_score']}/100")

            # Step 4: Find unfavorable terms
            unfavorable_terms = self.services["term_analyzer"].analyze_unfavorable_terms(contract_text, clauses)
            unfavorable_dict = [term.to_dict() for term in unfavorable_terms]
            log_info(f"Found {len(unfavorable_terms)} unfavorable terms")

            # Step 5: Check missing protections
            missing_protections = self.services["protection_checker"].check_missing_protections(contract_text, clauses)
            missing_dict = [prot.to_dict() for prot in missing_protections]
            log_info(f"Found {len(missing_protections)} missing protections")

            # Optional steps: None = skipped/unavailable, [] = attempted but failed.
            interpretations_dict = None
            negotiation_dict = None
            market_dict = None

            if options.interpret_clauses and self.services["interpreter"]:
                try:
                    interpretations = self.services["interpreter"].interpret_clauses(
                        clauses, min(10, options.max_clauses)
                    )
                    interpretations_dict = [interp.to_dict() for interp in interpretations]
                    log_info(f"Interpreted {len(interpretations)} clauses")
                except Exception as e:
                    log_error(f"Clause interpretation failed: {e}")
                    interpretations_dict = []

            if options.generate_negotiation_points and self.services["negotiation_engine"]:
                try:
                    negotiation_points = self.services["negotiation_engine"].generate_negotiation_points(
                        risk_score, unfavorable_terms, missing_protections, clauses, 7
                    )
                    negotiation_dict = [point.to_dict() for point in negotiation_points]
                    log_info(f"Generated {len(negotiation_points)} negotiation points")
                except Exception as e:
                    log_error(f"Negotiation points generation failed: {e}")
                    negotiation_dict = []

            if options.compare_to_market:
                try:
                    market_comparisons = self.services["market_comparator"].compare_to_market(clauses)
                    market_dict = [comp.to_dict() for comp in market_comparisons]
                    log_info(f"Compared {len(market_comparisons)} clauses to market")
                except Exception as e:
                    log_error(f"Market comparison failed: {e}")
                    market_dict = []

            # Generate executive summary
            executive_summary = self._generate_executive_summary(
                classification_dict, risk_dict, unfavorable_dict, missing_dict
            )

            # Build result
            result = {
                "analysis_id": str(uuid.uuid4()),
                "timestamp": datetime.now().isoformat(),
                "classification": classification_dict,
                "clauses": clauses_dict,
                "risk_analysis": risk_dict,
                "unfavorable_terms": unfavorable_dict,
                "missing_protections": missing_dict,
                "clause_interpretations": interpretations_dict,
                "negotiation_points": negotiation_dict,
                "market_comparisons": market_dict,
                "executive_summary": executive_summary,
                "metadata": {
                    "text_length": len(contract_text),
                    "word_count": len(contract_text.split()),
                    "num_clauses": len(clauses),
                    "contract_type": contract_type.value,
                    "actual_category": actual_category,
                    "options": options.dict()
                },
                "pdf_available": True
            }

            log_info("Contract analysis completed successfully")
            return result

        except Exception as e:
            log_error(f"Contract analysis failed: {e}")
            raise

    def _generate_executive_summary(self, classification: Dict, risk_score: Dict,
                                    unfavorable_terms: List, missing_protections: List) -> str:
        """Generate a one-paragraph executive summary from the dict payloads."""
        category = classification.get("category", "Unknown")
        score = risk_score.get("overall_score", 0)
        risk_level = risk_score.get("risk_level", "UNKNOWN")

        critical_terms = sum(1 for t in unfavorable_terms if t.get('severity') == 'critical')
        critical_protections = sum(1 for p in missing_protections if p.get('importance') == 'critical')

        # Headline message is banded by the overall 0-100 score.
        if score >= 80:
            risk_msg = "CRITICAL ATTENTION REQUIRED"
        elif score >= 60:
            risk_msg = "SIGNIFICANT CONCERNS"
        elif score >= 40:
            risk_msg = "MODERATE RISK"
        else:
            risk_msg = "LOW RISK"

        return f"This {category} contract scored {score}/100 ({risk_level.upper()} risk). {risk_msg}. Found {len(unfavorable_terms)} unfavorable terms ({critical_terms} critical) and {len(missing_protections)} missing protections ({critical_protections} critical). Review detailed analysis below."
|
| 387 |
+
|
| 388 |
+
# ============================================================================
|
| 389 |
+
# FASTAPI APP
|
| 390 |
+
# ============================================================================
|
| 391 |
+
|
| 392 |
+
app = FastAPI(
    title=settings.APP_NAME,
    version=settings.APP_VERSION,
    description="AI-powered contract risk analysis with complete model pre-loading",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)

# Serve frontend assets (JS/CSS/HTML) from ./static under /static.
app.mount("/static", StaticFiles(directory="static"), name="static")

# CORS middleware — origins/credentials/methods/headers all come from config.settings.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
    allow_methods=settings.CORS_ALLOW_METHODS,
    allow_headers=settings.CORS_ALLOW_HEADERS
)

# Initialize pre-loaded analysis service.
# NOTE: this runs at import time and loads every model before the first
# request can be served — slow startup is intentional (see class docstring).
analysis_service = PreloadedAnalysisService()
|
| 414 |
+
|
| 415 |
+
# ============================================================================
|
| 416 |
+
# HELPER FUNCTIONS
|
| 417 |
+
# ============================================================================
|
| 418 |
+
|
| 419 |
+
def validate_file(file: UploadFile) -> tuple[bool, str]:
    """Validate an uploaded file's name, extension and size against settings.

    Args:
        file: The incoming multipart upload.

    Returns:
        (True, "OK") when the file is acceptable, otherwise
        (False, <human-readable reason>).
    """
    # UploadFile.filename is optional in Starlette/FastAPI; guard before
    # splitting the extension or os.path.splitext would raise on None.
    if not file.filename:
        return False, "Missing filename"

    file_ext = os.path.splitext(file.filename)[1].lower()
    if file_ext not in settings.ALLOWED_EXTENSIONS:
        return False, f"Invalid file type. Allowed: {', '.join(settings.ALLOWED_EXTENSIONS)}"

    # Measure size by seeking to the end, then rewind so downstream
    # readers (DocumentReader) still see the full stream.
    file.file.seek(0, 2)
    size = file.file.tell()
    file.file.seek(0)

    if size > settings.MAX_UPLOAD_SIZE:
        return False, f"File too large. Max size: {settings.MAX_UPLOAD_SIZE / (1024*1024)}MB"

    if size == 0:
        return False, "File is empty"

    return True, "OK"
|
| 436 |
+
|
| 437 |
+
def read_contract_file(file: UploadFile) -> str:
    """Extract the plain text of an uploaded contract via DocumentReader."""
    extension = os.path.splitext(file.filename)[1].lower()
    if extension == ".pdf":
        file_type = "pdf"
    elif extension == ".docx":
        file_type = "docx"
    else:
        file_type = "txt"

    contents = DocumentReader().read_file(file.file, file_type)

    # DocumentReader may hand back a plain string or a dict payload;
    # normalize both shapes to a string.
    if isinstance(contents, dict):
        return contents.get('text', '') or contents.get('content', '')
    return str(contents)
|
| 450 |
+
|
| 451 |
+
def validate_contract_text(text: str) -> tuple[bool, str]:
    """Check contract text is non-blank and within configured length bounds."""
    if not text or not text.strip():
        return False, "Contract text is empty"

    length = len(text)
    if length < settings.MIN_CONTRACT_LENGTH:
        return False, f"Contract text too short. Minimum {settings.MIN_CONTRACT_LENGTH} characters required."
    if length > settings.MAX_CONTRACT_LENGTH:
        return False, f"Contract text too long. Maximum {settings.MAX_CONTRACT_LENGTH} characters allowed."

    return True, "OK"
|
| 463 |
+
|
| 464 |
+
# ============================================================================
|
| 465 |
+
# API ROUTES
|
| 466 |
+
# ============================================================================
|
| 467 |
+
|
| 468 |
+
@app.get("/")
async def serve_frontend():
    """Serve the single-page frontend entry point."""
    index_page = "static/index.html"
    return FileResponse(index_page)
|
| 472 |
+
|
| 473 |
+
@app.get("/api/v1/health", response_model=HealthResponse)
async def health_check():
    """Report liveness plus a summary of loaded models and services."""
    snapshot = analysis_service.get_service_status()
    return HealthResponse(
        status="healthy",
        version=settings.APP_VERSION,
        timestamp=datetime.now().isoformat(),
        models_loaded=snapshot["total_models_loaded"],
        services_loaded=snapshot["total_services_loaded"],
        memory_usage_mb=snapshot["memory_usage_mb"],
    )
|
| 486 |
+
|
| 487 |
+
@app.get("/api/v1/status")
async def get_detailed_status():
    """Expose the full per-service / per-model status breakdown."""
    detailed_status = analysis_service.get_service_status()
    return detailed_status
|
| 491 |
+
|
| 492 |
+
@app.post("/api/v1/analyze/file", response_model=AnalysisResult)
async def analyze_contract_file(
    file: UploadFile = File(...),
    max_clauses: int = Form(15),
    interpret_clauses: bool = Form(True),
    generate_negotiation_points: bool = Form(True),
    compare_to_market: bool = Form(True)
):
    """Analyze an uploaded contract file — direct synchronous flow.

    Validates the upload, extracts its text, confirms it looks like a
    contract, then runs the full pre-loaded analysis pipeline.

    Raises:
        HTTPException: 400 for validation failures, 500 for pipeline errors.
    """
    try:
        # Validate file (extension, size, non-empty)
        is_valid, message = validate_file(file)
        if not is_valid:
            raise HTTPException(status_code=400, detail=message)

        # Read contract text
        contract_text = read_contract_file(file)

        # Validate contract text length
        is_valid_text, text_message = validate_contract_text(contract_text)
        if not is_valid_text:
            raise HTTPException(status_code=400, detail=text_message)

        # Validate contract structure using ContractValidator
        validator = ContractValidator()
        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)

        if not is_valid_contract:
            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")

        # Create analysis options. Clamp max_clauses into the schema's valid
        # range (ge=5): a too-small form value would otherwise raise a
        # pydantic ValidationError inside this handler and surface as a 500.
        options = AnalysisOptions(
            max_clauses=max(5, min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE)),
            interpret_clauses=interpret_clauses,
            generate_negotiation_points=generate_negotiation_points,
            compare_to_market=compare_to_market
        )

        # Perform analysis (SYNCHRONOUS with pre-loaded services)
        result = analysis_service.analyze_contract(contract_text, options)

        # Plain string (was a placeholder-less f-string); context goes in kwargs.
        log_info("File analysis completed",
                 filename=file.filename,
                 analysis_id=result["analysis_id"],
                 risk_score=result["risk_analysis"]["overall_score"])

        return AnalysisResult(**result)

    except HTTPException:
        # Re-raise deliberate 4xx responses untouched.
        raise
    except Exception as e:
        log_error(f"File analysis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
|
| 545 |
+
|
| 546 |
+
@app.post("/api/v1/analyze/text", response_model=AnalysisResult)
async def analyze_contract_text(
    contract_text: str = Form(..., description="Contract text to analyze"),
    max_clauses: int = Form(15),
    interpret_clauses: bool = Form(True),
    generate_negotiation_points: bool = Form(True),
    compare_to_market: bool = Form(True)
):
    """Analyze pasted contract text — direct synchronous flow.

    Mirrors /analyze/file but skips the upload/extraction steps.

    Raises:
        HTTPException: 400 for validation failures, 500 for pipeline errors.
    """
    try:
        # Validate contract text length
        is_valid, message = validate_contract_text(contract_text)
        if not is_valid:
            raise HTTPException(status_code=400, detail=message)

        # Validate contract structure using ContractValidator
        validator = ContractValidator()
        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)

        if not is_valid_contract:
            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")

        # Create analysis options. Clamp max_clauses into the schema's valid
        # range (ge=5): a too-small form value would otherwise raise a
        # pydantic ValidationError inside this handler and surface as a 500.
        options = AnalysisOptions(
            max_clauses=max(5, min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE)),
            interpret_clauses=interpret_clauses,
            generate_negotiation_points=generate_negotiation_points,
            compare_to_market=compare_to_market
        )

        # Perform analysis (SYNCHRONOUS with pre-loaded services)
        result = analysis_service.analyze_contract(contract_text, options)

        # Plain string (was a placeholder-less f-string); context goes in kwargs.
        log_info("Text analysis completed",
                 analysis_id=result["analysis_id"],
                 risk_score=result["risk_analysis"]["overall_score"])

        return AnalysisResult(**result)

    except HTTPException:
        # Re-raise deliberate 4xx responses untouched.
        raise
    except Exception as e:
        log_error(f"Text analysis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
|
| 590 |
+
|
| 591 |
+
@app.post("/api/v1/generate-pdf")
|
| 592 |
+
async def generate_pdf_from_analysis(analysis_result: Dict[str, Any]):
|
| 593 |
+
"""Generate PDF from analysis results"""
|
| 594 |
+
try:
|
| 595 |
+
pdf_buffer = generate_pdf_report(analysis_result)
|
| 596 |
+
|
| 597 |
+
analysis_id = analysis_result.get('analysis_id', 'report')
|
| 598 |
+
return Response(
|
| 599 |
+
content=pdf_buffer.getvalue(),
|
| 600 |
+
media_type="application/pdf",
|
| 601 |
+
headers={
|
| 602 |
+
"Content-Disposition": f"attachment; filename=contract_analysis_{analysis_id}.pdf"
|
| 603 |
+
}
|
| 604 |
+
)
|
| 605 |
+
except Exception as e:
|
| 606 |
+
log_error(f"PDF generation failed: {e}")
|
| 607 |
+
raise HTTPException(status_code=500, detail=f"Failed to generate PDF: {str(e)}")
|
| 608 |
+
|
| 609 |
+
@app.get("/api/v1/categories")
|
| 610 |
+
async def get_contract_categories():
|
| 611 |
+
"""Get list of supported contract categories"""
|
| 612 |
+
try:
|
| 613 |
+
categories = analysis_service.services["classifier"].get_all_categories()
|
| 614 |
+
return {"categories": categories}
|
| 615 |
+
except Exception as e:
|
| 616 |
+
log_error(f"Categories fetch failed: {e}")
|
| 617 |
+
raise HTTPException(status_code=500, detail=f"Failed to get categories: {str(e)}")
|
| 618 |
+
|
| 619 |
+
@app.post("/api/v1/validate/file")
|
| 620 |
+
async def validate_contract_file(file: UploadFile = File(...)):
|
| 621 |
+
"""Quick validation endpoint"""
|
| 622 |
+
try:
|
| 623 |
+
is_valid, message = validate_file(file)
|
| 624 |
+
if not is_valid:
|
| 625 |
+
return {"valid": False, "message": message}
|
| 626 |
+
|
| 627 |
+
contract_text = read_contract_file(file)
|
| 628 |
+
|
| 629 |
+
# Validate text length
|
| 630 |
+
is_valid_text, text_message = validate_contract_text(contract_text)
|
| 631 |
+
if not is_valid_text:
|
| 632 |
+
return {"valid": False, "message": text_message}
|
| 633 |
+
|
| 634 |
+
# Validate contract structure using ContractValidator
|
| 635 |
+
validator = ContractValidator()
|
| 636 |
+
report = validator.get_validation_report(contract_text)
|
| 637 |
+
|
| 638 |
+
return {
|
| 639 |
+
"valid": report["scores"]["total"] > 50 and is_valid_text,
|
| 640 |
+
"message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
|
| 641 |
+
"confidence": report["scores"]["total"],
|
| 642 |
+
"report": report
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
except Exception as e:
|
| 646 |
+
log_error(f"File validation failed: {e}")
|
| 647 |
+
raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
|
| 648 |
+
|
| 649 |
+
@app.post("/api/v1/validate/text")
|
| 650 |
+
async def validate_contract_text_endpoint(contract_text: str = Form(...)):
|
| 651 |
+
"""Validate pasted contract text"""
|
| 652 |
+
try:
|
| 653 |
+
# Validate text length
|
| 654 |
+
is_valid, message = validate_contract_text(contract_text)
|
| 655 |
+
if not is_valid:
|
| 656 |
+
return {"valid": False, "message": message}
|
| 657 |
+
|
| 658 |
+
# Validate contract structure using ContractValidator
|
| 659 |
+
validator = ContractValidator()
|
| 660 |
+
report = validator.get_validation_report(contract_text)
|
| 661 |
+
|
| 662 |
+
return {
|
| 663 |
+
"valid": report["scores"]["total"] > 50 and is_valid,
|
| 664 |
+
"message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
|
| 665 |
+
"confidence": report["scores"]["total"],
|
| 666 |
+
"report": report
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
except Exception as e:
|
| 670 |
+
log_error(f"Text validation failed: {e}")
|
| 671 |
+
raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
|
| 672 |
+
|
| 673 |
+
# ============================================================================
|
| 674 |
+
# ERROR HANDLERS
|
| 675 |
+
# ============================================================================
|
| 676 |
+
|
| 677 |
+
@app.exception_handler(HTTPException)
|
| 678 |
+
async def http_exception_handler(request, exc):
|
| 679 |
+
"""Handle HTTP exceptions"""
|
| 680 |
+
return JSONResponse(
|
| 681 |
+
status_code=exc.status_code,
|
| 682 |
+
content=ErrorResponse(
|
| 683 |
+
error=exc.detail,
|
| 684 |
+
detail=str(exc.detail),
|
| 685 |
+
timestamp=datetime.now().isoformat()
|
| 686 |
+
).dict()
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
@app.exception_handler(Exception)
|
| 690 |
+
async def general_exception_handler(request, exc):
|
| 691 |
+
"""Handle general exceptions"""
|
| 692 |
+
log_error(f"Unhandled exception: {exc}")
|
| 693 |
+
return JSONResponse(
|
| 694 |
+
status_code=500,
|
| 695 |
+
content=ErrorResponse(
|
| 696 |
+
error="Internal server error",
|
| 697 |
+
detail=str(exc),
|
| 698 |
+
timestamp=datetime.now().isoformat()
|
| 699 |
+
).dict()
|
| 700 |
+
)
|
| 701 |
+
|
| 702 |
+
# ============================================================================
|
| 703 |
+
# STARTUP & SHUTDOWN
|
| 704 |
+
# ============================================================================
|
| 705 |
+
|
| 706 |
+
@app.on_event("startup")
|
| 707 |
+
async def startup_event():
|
| 708 |
+
"""Startup event - Services are already pre-loaded"""
|
| 709 |
+
log_info(f"🚀 {settings.APP_NAME} v{settings.APP_VERSION} STARTED")
|
| 710 |
+
log_info(f"📍 Server: {settings.HOST}:{settings.PORT}")
|
| 711 |
+
log_info(f"🔧 All models and services pre-loaded")
|
| 712 |
+
|
| 713 |
+
@app.on_event("shutdown")
|
| 714 |
+
async def shutdown_event():
|
| 715 |
+
"""Shutdown event"""
|
| 716 |
+
log_info("🛑 Shutting down server...")
|
| 717 |
+
log_info("✅ Server shutdown complete")
|
| 718 |
+
|
| 719 |
+
# ============================================================================
|
| 720 |
+
# MAIN
|
| 721 |
+
# ============================================================================
|
| 722 |
+
|
| 723 |
+
if __name__ == "__main__":
|
| 724 |
+
import uvicorn
|
| 725 |
+
|
| 726 |
+
uvicorn.run(
|
| 727 |
+
"app:app",
|
| 728 |
+
host=settings.HOST,
|
| 729 |
+
port=settings.PORT,
|
| 730 |
+
reload=settings.RELOAD,
|
| 731 |
+
workers=1, # Single worker for synchronous flow
|
| 732 |
+
log_level=settings.LOG_LEVEL.lower()
|
| 733 |
+
)
|
config/model_config.py
CHANGED
|
@@ -4,40 +4,91 @@ from pathlib import Path
|
|
| 4 |
|
| 5 |
class ModelConfig:
|
| 6 |
"""
|
| 7 |
-
|
| 8 |
"""
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
-
#
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
@classmethod
|
| 38 |
-
def
|
| 39 |
"""
|
| 40 |
-
|
| 41 |
"""
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
class ModelConfig:
|
| 6 |
"""
|
| 7 |
+
Model-specific configurations - FOR AI MODEL SETTINGS ONLY
|
| 8 |
"""
|
| 9 |
+
# Model Architecture Settings
|
| 10 |
+
LEGAL_BERT = {"model_name" : "nlpaueb/legal-bert-base-uncased",
|
| 11 |
+
"task" : "clause-extraction",
|
| 12 |
+
"max_length" : 512,
|
| 13 |
+
"batch_size" : 16,
|
| 14 |
+
"hidden_dim" : 768,
|
| 15 |
+
"num_layers" : 12,
|
| 16 |
+
"attention_heads" : 12,
|
| 17 |
+
}
|
| 18 |
|
| 19 |
+
# Embedding Model Settings
|
| 20 |
+
EMBEDDING_MODEL = {"model_name" : "sentence-transformers/all-MiniLM-L6-v2",
|
| 21 |
+
"dimension" : 384,
|
| 22 |
+
"pooling" : "mean",
|
| 23 |
+
"normalize" : True,
|
| 24 |
+
"similarity_threshold" : 0.7,
|
| 25 |
+
}
|
| 26 |
|
| 27 |
+
# Classification Model Settings
|
| 28 |
+
CLASSIFIER_MODEL = {"embedding_dim" : 384,
|
| 29 |
+
"hidden_dim" : 256,
|
| 30 |
+
"num_categories" : 12,
|
| 31 |
+
"dropout_rate" : 0.1,
|
| 32 |
+
"learning_rate" : 2e-5,
|
| 33 |
+
"max_seq_length" : 512,
|
| 34 |
+
}
|
| 35 |
|
| 36 |
+
# Clause Extraction Settings
|
| 37 |
+
CLAUSE_EXTRACTION = {"min_clause_length" : 50,
|
| 38 |
+
"max_clause_length" : 2000,
|
| 39 |
+
"confidence_threshold" : 0.7,
|
| 40 |
+
"overlap_threshold" : 0.3,
|
| 41 |
+
"max_clauses_per_doc" : 50,
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
# Risk Analysis Settings
|
| 45 |
+
RISK_ANALYSIS = {"score_ranges" : {"low" : (0, 40),
|
| 46 |
+
"medium" : (40, 60),
|
| 47 |
+
"high" : (60, 80),
|
| 48 |
+
"critical" : (80, 100),
|
| 49 |
+
},
|
| 50 |
+
"weight_decay" : 0.1,
|
| 51 |
+
"smoothing_factor" : 0.5,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
# Market Comparison Settings
|
| 55 |
+
MARKET_COMPARISON = {"similarity_threshold" : 0.75,
|
| 56 |
+
"min_matches_required" : 3,
|
| 57 |
+
"max_comparisons" : 20,
|
| 58 |
+
"embedding_cache_size" : 1000,
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
# LLM Generation Settings
|
| 62 |
+
LLM_GENERATION = {"max_tokens" : 5000,
|
| 63 |
+
"temperature" : 0.1,
|
| 64 |
+
"top_p" : 0.9,
|
| 65 |
+
"frequency_penalty" : 0.1,
|
| 66 |
+
"presence_penalty" : 0.1,
|
| 67 |
+
"stop_sequences" : ["\n\n", "###", "---"],
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
# Text Processing Settings
|
| 71 |
+
TEXT_PROCESSING = {"chunk_size" : 512,
|
| 72 |
+
"chunk_overlap" : 50,
|
| 73 |
+
"min_sentence_length" : 10,
|
| 74 |
+
"max_sentence_length" : 200,
|
| 75 |
+
"entity_confidence" : 0.8,
|
| 76 |
+
}
|
| 77 |
|
| 78 |
|
| 79 |
@classmethod
|
| 80 |
+
def get_model_config(cls, model_type: str) -> dict:
|
| 81 |
"""
|
| 82 |
+
Get configuration for specific model type
|
| 83 |
"""
|
| 84 |
+
config_map = {"legal_bert" : cls.LEGAL_BERT,
|
| 85 |
+
"embedding" : cls.EMBEDDING_MODEL,
|
| 86 |
+
"classifier" : cls.CLASSIFIER_MODEL,
|
| 87 |
+
"clause_extraction" : cls.CLAUSE_EXTRACTION,
|
| 88 |
+
"risk_analysis" : cls.RISK_ANALYSIS,
|
| 89 |
+
"market_comparison" : cls.MARKET_COMPARISON,
|
| 90 |
+
"llm_generation" : cls.LLM_GENERATION,
|
| 91 |
+
"text_processing" : cls.TEXT_PROCESSING,
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
return config_map.get(model_type, {})
|
config/risk_rules.py
CHANGED
|
@@ -33,71 +33,91 @@ class RiskRules:
|
|
| 33 |
}
|
| 34 |
|
| 35 |
# Contract-specific weight adjustments
|
| 36 |
-
CONTRACT_TYPE_ADJUSTMENTS = {
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# KEYWORD SEVERITY SCORING (Multi-tier system)
|
| 42 |
# Critical keywords (Tier 1: 20-25 points each)
|
| 43 |
-
CRITICAL_KEYWORDS
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
| 54 |
|
| 55 |
# High-risk keywords (Tier 2: 12-18 points)
|
| 56 |
-
HIGH_RISK_KEYWORDS
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
# Medium-risk keywords (Tier 3: 6-10 points)
|
| 72 |
-
MEDIUM_RISK_KEYWORDS
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
# STRUCTURAL PATTERN ANALYSIS (Pattern-based risk detection)
|
| 87 |
-
RISKY_PATTERNS
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
| 98 |
|
| 99 |
# CLAUSE-LEVEL RISK FACTORS (Detailed clause analysis)
|
| 100 |
-
CLAUSE_RISK_FACTORS
|
|
|
|
| 101 |
"base_risk": 70,
|
| 102 |
"duration_check": {
|
| 103 |
# months: risk_adjustment
|
|
@@ -189,40 +209,6 @@ class RiskRules:
|
|
| 189 |
"work for hire limited": -10
|
| 190 |
}
|
| 191 |
},
|
| 192 |
-
|
| 193 |
-
"liability": {
|
| 194 |
-
"base_risk": 65,
|
| 195 |
-
"red_flags": {
|
| 196 |
-
"unlimited": +30,
|
| 197 |
-
"consequential damages": +15,
|
| 198 |
-
"indirect damages": +12,
|
| 199 |
-
"punitive damages": +18,
|
| 200 |
-
"no cap": +25
|
| 201 |
-
},
|
| 202 |
-
"protections": {
|
| 203 |
-
"liability cap": -20,
|
| 204 |
-
"mutual cap": -15,
|
| 205 |
-
"limited to fees paid": -18
|
| 206 |
-
}
|
| 207 |
-
},
|
| 208 |
-
|
| 209 |
-
"confidentiality": {
|
| 210 |
-
"base_risk": 45,
|
| 211 |
-
"red_flags": {
|
| 212 |
-
"perpetual": +20,
|
| 213 |
-
"forever": +20,
|
| 214 |
-
"indefinite": +18,
|
| 215 |
-
"all information": +15,
|
| 216 |
-
"any information": +15
|
| 217 |
-
},
|
| 218 |
-
"reasonable_terms": {
|
| 219 |
-
"3 years": -5,
|
| 220 |
-
"5 years": 0,
|
| 221 |
-
"7 years": +5,
|
| 222 |
-
"marked confidential": -8,
|
| 223 |
-
"reasonably necessary": -10
|
| 224 |
-
}
|
| 225 |
-
}
|
| 226 |
}
|
| 227 |
|
| 228 |
# =========================================================================
|
|
@@ -249,13 +235,6 @@ class RiskRules:
|
|
| 249 |
"consulting": {"generous": 3, "standard": 1, "restrictive": 0.5},
|
| 250 |
"general": {"generous": 12, "standard": 6, "restrictive": 1}
|
| 251 |
},
|
| 252 |
-
|
| 253 |
-
"ip_assignment_scope": {
|
| 254 |
-
"tech": "work_product_only", # Standard
|
| 255 |
-
"creative": "commissioned_work_only", # Standard
|
| 256 |
-
"consulting": "deliverables_only", # Standard
|
| 257 |
-
"general": "work_for_hire" # Standard
|
| 258 |
-
}
|
| 259 |
}
|
| 260 |
|
| 261 |
# =========================================================================
|
|
@@ -298,11 +277,6 @@ class RiskRules:
|
|
| 298 |
"risk_if_missing": 15,
|
| 299 |
"categories": ["general"]
|
| 300 |
},
|
| 301 |
-
"change_control_process": {
|
| 302 |
-
"importance": "medium",
|
| 303 |
-
"risk_if_missing": 10,
|
| 304 |
-
"categories": ["general"]
|
| 305 |
-
}
|
| 306 |
}
|
| 307 |
|
| 308 |
# =========================================================================
|
|
@@ -329,5 +303,4 @@ class RiskRules:
|
|
| 329 |
|
| 330 |
# Normalize to sum to 100
|
| 331 |
total = sum(adjusted.values())
|
| 332 |
-
return {k: (v / total) * 100 for k, v in adjusted.items()}
|
| 333 |
-
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
# Contract-specific weight adjustments
|
| 36 |
+
CONTRACT_TYPE_ADJUSTMENTS = {
|
| 37 |
+
ContractType.EMPLOYMENT : {
|
| 38 |
+
"restrictive_covenants" : 1.3,
|
| 39 |
+
"compensation_benefits" : 1.4,
|
| 40 |
+
"termination_rights" : 1.2,
|
| 41 |
+
},
|
| 42 |
+
ContractType.SOFTWARE : {
|
| 43 |
+
"intellectual_property" : 1.5,
|
| 44 |
+
"penalties_liability" : 1.3,
|
| 45 |
+
},
|
| 46 |
+
ContractType.NDA : {
|
| 47 |
+
"restrictive_covenants" : 1.8,
|
| 48 |
+
"penalties_liability" : 1.2,
|
| 49 |
+
},
|
| 50 |
+
ContractType.CONSULTING : {
|
| 51 |
+
"compensation_benefits" : 1.3,
|
| 52 |
+
"termination_rights" : 1.1,
|
| 53 |
+
},
|
| 54 |
+
}
|
| 55 |
|
| 56 |
# KEYWORD SEVERITY SCORING (Multi-tier system)
|
| 57 |
# Critical keywords (Tier 1: 20-25 points each)
|
| 58 |
+
CRITICAL_KEYWORDS = {
|
| 59 |
+
"non-compete" : 25,
|
| 60 |
+
"non-solicit" : 23,
|
| 61 |
+
"non-solicitation" : 23,
|
| 62 |
+
"forfeit" : 25,
|
| 63 |
+
"liquidated damages" : 24,
|
| 64 |
+
"wage withholding" : 25,
|
| 65 |
+
"unlimited liability" : 25,
|
| 66 |
+
"joint and several" : 23,
|
| 67 |
+
"perpetual" : 22,
|
| 68 |
+
"irrevocable" : 20,
|
| 69 |
+
}
|
| 70 |
|
| 71 |
# High-risk keywords (Tier 2: 12-18 points)
|
| 72 |
+
HIGH_RISK_KEYWORDS = {
|
| 73 |
+
"indemnify" : 18,
|
| 74 |
+
"indemnification" : 18,
|
| 75 |
+
"hold harmless" : 17,
|
| 76 |
+
"penalty" : 18,
|
| 77 |
+
"damages" : 15,
|
| 78 |
+
"breach" : 15,
|
| 79 |
+
"default" : 14,
|
| 80 |
+
"immediate termination" : 16,
|
| 81 |
+
"without cause" : 15,
|
| 82 |
+
"sole discretion" : 17,
|
| 83 |
+
"at-will" : 14,
|
| 84 |
+
"waive" : 16,
|
| 85 |
+
"release" : 15,
|
| 86 |
+
}
|
| 87 |
|
| 88 |
# Medium-risk keywords (Tier 3: 6-10 points)
|
| 89 |
+
MEDIUM_RISK_KEYWORDS = {
|
| 90 |
+
"confidential" : 8,
|
| 91 |
+
"proprietary" : 8,
|
| 92 |
+
"trade secret" : 10,
|
| 93 |
+
"terminate" : 7,
|
| 94 |
+
"termination" : 7,
|
| 95 |
+
"assignment" : 6,
|
| 96 |
+
"exclusive" : 9,
|
| 97 |
+
"warranty" : 8,
|
| 98 |
+
"representation" : 7,
|
| 99 |
+
"covenant" : 8,
|
| 100 |
+
"jurisdiction" : 6,
|
| 101 |
+
"governing law" : 6,
|
| 102 |
+
}
|
| 103 |
|
| 104 |
# STRUCTURAL PATTERN ANALYSIS (Pattern-based risk detection)
|
| 105 |
+
RISKY_PATTERNS = [
|
| 106 |
+
(r'\d+\s*(year|yr|month|mo)s?\s*(non-compete|non-solicit)', 20, "Long duration restrictive covenant"),
|
| 107 |
+
(r'(entire|all|worldwide|global)\s*(industry|market|territory)', 18, "Overly broad geographic/industry scope"),
|
| 108 |
+
(r'notice\s+period.*\d+\s*days.*employee.*\d+\s*days.*employer', 15, "Unequal notice periods"),
|
| 109 |
+
(r'(may|can|shall)\s+(withhold|deduct|retain).*compensation', 22, "Wage withholding clause"),
|
| 110 |
+
(r'(unlimited|no\s+limit|without\s+limitation).*liability', 25, "Unlimited liability exposure"),
|
| 111 |
+
(r'(sole|absolute|unfettered)\s+discretion', 18, "One-sided discretionary power"),
|
| 112 |
+
(r'penalty.*(?:equal\s+to|of|amount).*\$?\d+', 16, "Specific penalty amount"),
|
| 113 |
+
(r'(automatically|immediately)\s+(renew|extend)', 12, "Auto-renewal clause"),
|
| 114 |
+
(r'waive.*right.*arbitration', 20, "Arbitration rights waiver"),
|
| 115 |
+
(r'(all|any).*intellectual\s+property.*created', 17, "Broad IP assignment"),
|
| 116 |
+
]
|
| 117 |
|
| 118 |
# CLAUSE-LEVEL RISK FACTORS (Detailed clause analysis)
|
| 119 |
+
CLAUSE_RISK_FACTORS = {
|
| 120 |
+
"non-compete": {
|
| 121 |
"base_risk": 70,
|
| 122 |
"duration_check": {
|
| 123 |
# months: risk_adjustment
|
|
|
|
| 209 |
"work for hire limited": -10
|
| 210 |
}
|
| 211 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
}
|
| 213 |
|
| 214 |
# =========================================================================
|
|
|
|
| 235 |
"consulting": {"generous": 3, "standard": 1, "restrictive": 0.5},
|
| 236 |
"general": {"generous": 12, "standard": 6, "restrictive": 1}
|
| 237 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
}
|
| 239 |
|
| 240 |
# =========================================================================
|
|
|
|
| 277 |
"risk_if_missing": 15,
|
| 278 |
"categories": ["general"]
|
| 279 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
}
|
| 281 |
|
| 282 |
# =========================================================================
|
|
|
|
| 303 |
|
| 304 |
# Normalize to sum to 100
|
| 305 |
total = sum(adjusted.values())
|
| 306 |
+
return {k: (v / total) * 100 for k, v in adjusted.items()}
|
|
|
config/settings.py
CHANGED
|
@@ -7,12 +7,12 @@ from pydantic_settings import BaseSettings
|
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
"""
|
| 10 |
-
Application-wide settings
|
| 11 |
"""
|
| 12 |
# Application Info
|
| 13 |
APP_NAME : str = "AI Contract Risk Analyzer"
|
| 14 |
APP_VERSION : str = "1.0.0"
|
| 15 |
-
API_PREFIX : str = "/api/"
|
| 16 |
|
| 17 |
# Server Configuration
|
| 18 |
HOST : str = "0.0.0.0"
|
|
@@ -28,43 +28,48 @@ class Settings(BaseSettings):
|
|
| 28 |
|
| 29 |
# File Upload Settings
|
| 30 |
MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
|
| 31 |
-
ALLOWED_EXTENSIONS : list = [".pdf", ".docx"]
|
| 32 |
UPLOAD_DIR : Path = Path("uploads")
|
| 33 |
|
| 34 |
-
# Model Settings
|
| 35 |
-
MODEL_CACHE_SIZE : int
|
| 36 |
-
MODEL_DOWNLOAD_TIMEOUT : int
|
| 37 |
-
USE_GPU : bool
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
OLLAMA_BASE_URL : str
|
| 41 |
-
OLLAMA_MODEL : str
|
| 42 |
-
OLLAMA_TIMEOUT : int
|
| 43 |
-
OLLAMA_TEMPERATURE : float
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
MAX_CLAUSES_TO_ANALYZE : int = 15
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
LOG_FILE : Optional[Path] = Path("logs/app.log")
|
| 54 |
|
| 55 |
# Cache Settings
|
| 56 |
-
ENABLE_CACHE : bool
|
| 57 |
-
CACHE_TTL : int
|
| 58 |
-
CACHE_DIR : Path
|
| 59 |
|
| 60 |
-
# Rate Limiting
|
| 61 |
-
RATE_LIMIT_ENABLED : bool
|
| 62 |
-
RATE_LIMIT_REQUESTS : int
|
| 63 |
-
RATE_LIMIT_PERIOD : int
|
| 64 |
|
| 65 |
# PDF Report Settings
|
| 66 |
-
PDF_FONT_SIZE : int
|
| 67 |
-
PDF_MARGIN : float
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
class Config:
|
|
@@ -84,4 +89,4 @@ class Settings(BaseSettings):
|
|
| 84 |
|
| 85 |
|
| 86 |
# Global settings instance
|
| 87 |
-
settings = Settings()
|
|
|
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
"""
|
| 10 |
+
Application-wide settings: primary configuration source
|
| 11 |
"""
|
| 12 |
# Application Info
|
| 13 |
APP_NAME : str = "AI Contract Risk Analyzer"
|
| 14 |
APP_VERSION : str = "1.0.0"
|
| 15 |
+
API_PREFIX : str = "/api/v1/"
|
| 16 |
|
| 17 |
# Server Configuration
|
| 18 |
HOST : str = "0.0.0.0"
|
|
|
|
| 28 |
|
| 29 |
# File Upload Settings
|
| 30 |
MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
|
| 31 |
+
ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
|
| 32 |
UPLOAD_DIR : Path = Path("uploads")
|
| 33 |
|
| 34 |
+
# Model Management Settings
|
| 35 |
+
MODEL_CACHE_SIZE : int = 3 # Number of models to keep in memory
|
| 36 |
+
MODEL_DOWNLOAD_TIMEOUT : int = 1800 # 30 minutes
|
| 37 |
+
USE_GPU : bool = True # Automatically detect and use GPU if available
|
| 38 |
|
| 39 |
+
# External API Settings
|
| 40 |
+
OLLAMA_BASE_URL : str = "http://localhost:11434"
|
| 41 |
+
OLLAMA_MODEL : str = "llama3:8b"
|
| 42 |
+
OLLAMA_TIMEOUT : int = 300
|
| 43 |
+
OLLAMA_TEMPERATURE : float = 0.1
|
| 44 |
|
| 45 |
+
# External API Keys
|
| 46 |
+
OPENAI_API_KEY : Optional[str] = None
|
| 47 |
+
ANTHROPIC_API_KEY : Optional[str] = None
|
|
|
|
| 48 |
|
| 49 |
+
# Analysis Limits
|
| 50 |
+
MIN_CONTRACT_LENGTH : int = 300 # Minimum characters for valid contract
|
| 51 |
+
MAX_CONTRACT_LENGTH : int = 500000 # Maximum characters (500KB text)
|
| 52 |
+
MAX_CLAUSES_TO_ANALYZE : int = 15
|
| 53 |
+
|
| 54 |
+
# Logging Settings
|
| 55 |
+
LOG_LEVEL : str = "INFO"
|
| 56 |
+
LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 57 |
LOG_FILE : Optional[Path] = Path("logs/app.log")
|
| 58 |
|
| 59 |
# Cache Settings
|
| 60 |
+
ENABLE_CACHE : bool = True
|
| 61 |
+
CACHE_TTL : int = 3600 # 1 hour
|
| 62 |
+
CACHE_DIR : Path = Path("cache")
|
| 63 |
|
| 64 |
+
# Rate Limiting Settings
|
| 65 |
+
RATE_LIMIT_ENABLED : bool = True
|
| 66 |
+
RATE_LIMIT_REQUESTS : int = 10
|
| 67 |
+
RATE_LIMIT_PERIOD : int = 60 # seconds
|
| 68 |
|
| 69 |
# PDF Report Settings
|
| 70 |
+
PDF_FONT_SIZE : int = 10
|
| 71 |
+
PDF_MARGIN : float = 0.5 # inches
|
| 72 |
+
PDF_PAGE_SIZE : str = "letter"
|
| 73 |
|
| 74 |
|
| 75 |
class Config:
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
# Global settings instance
|
| 92 |
+
settings = Settings()
|
launch.py
CHANGED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Launch script for AI Contract Risk Analyzer
|
| 3 |
+
Starts both API and frontend (if available)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
import requests
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
def check_ollama():
|
| 13 |
+
"""Check if Ollama is running"""
|
| 14 |
+
try:
|
| 15 |
+
response = requests.get("http://localhost:11434/api/tags", timeout=5)
|
| 16 |
+
if response.status_code == 200:
|
| 17 |
+
print("✓ Ollama is running")
|
| 18 |
+
return True
|
| 19 |
+
except:
|
| 20 |
+
pass
|
| 21 |
+
|
| 22 |
+
print("✗ Ollama not running. Start with: ollama serve")
|
| 23 |
+
return False
|
| 24 |
+
|
| 25 |
+
def check_models():
|
| 26 |
+
"""Check if required models are available"""
|
| 27 |
+
try:
|
| 28 |
+
response = requests.get("http://localhost:11434/api/tags", timeout=5)
|
| 29 |
+
models = response.json().get('models', [])
|
| 30 |
+
model_names = [m['name'] for m in models]
|
| 31 |
+
|
| 32 |
+
required = "llama3:8b"
|
| 33 |
+
if any(required in name for name in model_names):
|
| 34 |
+
print(f"✓ Model {required} available")
|
| 35 |
+
return True
|
| 36 |
+
else:
|
| 37 |
+
print(f"✗ Model {required} not found. Pull with: ollama pull llama3:8b")
|
| 38 |
+
return False
|
| 39 |
+
except:
|
| 40 |
+
return False
|
| 41 |
+
|
| 42 |
+
def start_api():
|
| 43 |
+
"""Start FastAPI server"""
|
| 44 |
+
print("\n" + "="*60)
|
| 45 |
+
print("Starting FastAPI Server...")
|
| 46 |
+
print("="*60)
|
| 47 |
+
|
| 48 |
+
subprocess.Popen([
|
| 49 |
+
sys.executable, "-m", "uvicorn",
|
| 50 |
+
"app:app",
|
| 51 |
+
"--host", "0.0.0.0",
|
| 52 |
+
"--port", "8000",
|
| 53 |
+
"--reload"
|
| 54 |
+
])
|
| 55 |
+
|
| 56 |
+
# Wait for server to start
|
| 57 |
+
time.sleep(3)
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
response = requests.get("http://localhost:8000/api/v1/health", timeout=5)
|
| 61 |
+
if response.status_code == 200:
|
| 62 |
+
print("✓ API Server running at: http://localhost:8000")
|
| 63 |
+
print("✓ Documentation at: http://localhost:8000/api/docs")
|
| 64 |
+
return True
|
| 65 |
+
except:
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
print("✗ Failed to start API server")
|
| 69 |
+
return False
|
| 70 |
+
|
| 71 |
+
def start_frontend():
|
| 72 |
+
"""Start frontend server (if available)"""
|
| 73 |
+
if not Path("static/index.html").exists():
|
| 74 |
+
print("\n✗ Frontend not found at static/index.html")
|
| 75 |
+
return False
|
| 76 |
+
|
| 77 |
+
print("\n" + "="*60)
|
| 78 |
+
print("Starting Frontend Server...")
|
| 79 |
+
print("="*60)
|
| 80 |
+
|
| 81 |
+
subprocess.Popen([
|
| 82 |
+
sys.executable, "-m", "http.server", "3000",
|
| 83 |
+
"--directory", "static"
|
| 84 |
+
])
|
| 85 |
+
|
| 86 |
+
time.sleep(2)
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
response = requests.get("http://localhost:3000", timeout=5)
|
| 90 |
+
if response.status_code == 200:
|
| 91 |
+
print("✓ Frontend running at: http://localhost:3000")
|
| 92 |
+
return True
|
| 93 |
+
except:
|
| 94 |
+
pass
|
| 95 |
+
|
| 96 |
+
print("✗ Failed to start frontend server")
|
| 97 |
+
return False
|
| 98 |
+
|
| 99 |
+
def main():
|
| 100 |
+
"""Main launch function"""
|
| 101 |
+
print("="*60)
|
| 102 |
+
print("AI Contract Risk Analyzer - Launch Script")
|
| 103 |
+
print("="*60)
|
| 104 |
+
|
| 105 |
+
# Pre-flight checks
|
| 106 |
+
print("\nPre-flight checks:")
|
| 107 |
+
print("-"*60)
|
| 108 |
+
|
| 109 |
+
ollama_ok = check_ollama()
|
| 110 |
+
models_ok = check_models() if ollama_ok else False
|
| 111 |
+
|
| 112 |
+
if not ollama_ok:
|
| 113 |
+
print("\n⚠️ Warning: Ollama not running. Some features may not work.")
|
| 114 |
+
response = input("Continue anyway? (y/n): ")
|
| 115 |
+
if response.lower() != 'y':
|
| 116 |
+
return
|
| 117 |
+
|
| 118 |
+
# Start services
|
| 119 |
+
api_ok = start_api()
|
| 120 |
+
|
| 121 |
+
if not api_ok:
|
| 122 |
+
print("\n✗ Failed to start API. Exiting.")
|
| 123 |
+
return
|
| 124 |
+
|
| 125 |
+
frontend_ok = start_frontend()
|
| 126 |
+
|
| 127 |
+
# Summary
|
| 128 |
+
print("\n" + "="*60)
|
| 129 |
+
print("Launch Complete!")
|
| 130 |
+
print("="*60)
|
| 131 |
+
print(f"API Server: {'✓' if api_ok else '✗'} http://localhost:8000")
|
| 132 |
+
print(f"API Docs: {'✓' if api_ok else '✗'} http://localhost:8000/api/docs")
|
| 133 |
+
print(f"Frontend: {'✓' if frontend_ok else '✗'} http://localhost:3000")
|
| 134 |
+
print("\nPress Ctrl+C to stop all services")
|
| 135 |
+
print("="*60)
|
| 136 |
+
|
| 137 |
+
try:
|
| 138 |
+
while True:
|
| 139 |
+
time.sleep(1)
|
| 140 |
+
except KeyboardInterrupt:
|
| 141 |
+
print("\n\nShutting down...")
|
| 142 |
+
sys.exit(0)
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
main()
|
model_manager/llm_manager.py
CHANGED
|
@@ -141,7 +141,7 @@ class LLMManager:
|
|
| 141 |
Check if Ollama server is available
|
| 142 |
"""
|
| 143 |
try:
|
| 144 |
-
response = requests.get(f"{self.ollama_base_url}/api/tags", timeout=
|
| 145 |
available = (response.status_code == 200)
|
| 146 |
|
| 147 |
if available:
|
|
|
|
| 141 |
Check if Ollama server is available
|
| 142 |
"""
|
| 143 |
try:
|
| 144 |
+
response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
|
| 145 |
available = (response.status_code == 200)
|
| 146 |
|
| 147 |
if available:
|
model_manager/model_cache.py
CHANGED
|
@@ -27,6 +27,7 @@ class ModelCache:
|
|
| 27 |
def __init__(self, cache_dir: Path, ttl_seconds: int = 3600):
|
| 28 |
self.cache_dir = Path(cache_dir)
|
| 29 |
self.cache_dir.mkdir(parents = True, exist_ok = True)
|
|
|
|
| 30 |
self.ttl_seconds = ttl_seconds
|
| 31 |
self.logger = ContractAnalyzerLogger.get_logger()
|
| 32 |
|
|
|
|
| 27 |
def __init__(self, cache_dir: Path, ttl_seconds: int = 3600):
|
| 28 |
self.cache_dir = Path(cache_dir)
|
| 29 |
self.cache_dir.mkdir(parents = True, exist_ok = True)
|
| 30 |
+
|
| 31 |
self.ttl_seconds = ttl_seconds
|
| 32 |
self.logger = ContractAnalyzerLogger.get_logger()
|
| 33 |
|
model_manager/model_loader.py
CHANGED
|
@@ -58,7 +58,11 @@ class ModelLoader:
|
|
| 58 |
return info.model, info.tokenizer
|
| 59 |
|
| 60 |
# Mark as loading
|
| 61 |
-
self.registry.register(ModelType.LEGAL_BERT,ModelInfo(name
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
try:
|
| 64 |
config = self.config.LEGAL_BERT
|
|
|
|
| 58 |
return info.model, info.tokenizer
|
| 59 |
|
| 60 |
# Mark as loading
|
| 61 |
+
self.registry.register(ModelType.LEGAL_BERT, ModelInfo(name = "legal-bert",
|
| 62 |
+
type = ModelType.LEGAL_BERT,
|
| 63 |
+
status = ModelStatus.LOADING,
|
| 64 |
+
)
|
| 65 |
+
)
|
| 66 |
|
| 67 |
try:
|
| 68 |
config = self.config.LEGAL_BERT
|
model_manager/model_registry.py
CHANGED
|
@@ -54,6 +54,7 @@ class ModelInfo:
|
|
| 54 |
last_accessed : Optional[datetime] = None
|
| 55 |
metadata : Dict[str, Any] = field(default_factory = dict)
|
| 56 |
|
|
|
|
| 57 |
def mark_accessed(self):
|
| 58 |
"""
|
| 59 |
Update access statistics
|
|
@@ -83,7 +84,7 @@ class ModelRegistry:
|
|
| 83 |
if cls._instance is None:
|
| 84 |
with cls._lock:
|
| 85 |
if cls._instance is None:
|
| 86 |
-
cls._instance
|
| 87 |
cls._instance._initialized = False
|
| 88 |
|
| 89 |
return cls._instance
|
|
@@ -123,6 +124,7 @@ class ModelRegistry:
|
|
| 123 |
"""
|
| 124 |
with self._model_lock:
|
| 125 |
info = self._registry.get(model_type)
|
|
|
|
| 126 |
if info:
|
| 127 |
info.mark_accessed()
|
| 128 |
log_info(f"Model accessed: {model_type.value}",
|
|
|
|
| 54 |
last_accessed : Optional[datetime] = None
|
| 55 |
metadata : Dict[str, Any] = field(default_factory = dict)
|
| 56 |
|
| 57 |
+
|
| 58 |
def mark_accessed(self):
|
| 59 |
"""
|
| 60 |
Update access statistics
|
|
|
|
| 84 |
if cls._instance is None:
|
| 85 |
with cls._lock:
|
| 86 |
if cls._instance is None:
|
| 87 |
+
cls._instance = super().__new__(cls)
|
| 88 |
cls._instance._initialized = False
|
| 89 |
|
| 90 |
return cls._instance
|
|
|
|
| 124 |
"""
|
| 125 |
with self._model_lock:
|
| 126 |
info = self._registry.get(model_type)
|
| 127 |
+
|
| 128 |
if info:
|
| 129 |
info.mark_accessed()
|
| 130 |
log_info(f"Model accessed: {model_type.value}",
|
reporter/pdf_generator.py
CHANGED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DEPENDENCIES
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
from typing import Dict
|
| 6 |
+
from typing import List
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from reportlab.lib import colors
|
| 10 |
+
from reportlab.pdfgen import canvas
|
| 11 |
+
from reportlab.platypus import Image
|
| 12 |
+
from reportlab.platypus import Table
|
| 13 |
+
from reportlab.lib.units import inch
|
| 14 |
+
from reportlab.platypus import Spacer
|
| 15 |
+
from reportlab.lib.enums import TA_LEFT
|
| 16 |
+
from reportlab.platypus import Paragraph
|
| 17 |
+
from reportlab.platypus import PageBreak
|
| 18 |
+
from reportlab.graphics import renderPDF
|
| 19 |
+
from reportlab.platypus import TableStyle
|
| 20 |
+
from reportlab.lib.enums import TA_CENTER
|
| 21 |
+
from reportlab.lib.enums import TA_JUSTIFY
|
| 22 |
+
from reportlab.lib.pagesizes import letter
|
| 23 |
+
from reportlab.platypus import KeepTogether
|
| 24 |
+
from reportlab.graphics.shapes import Circle
|
| 25 |
+
from reportlab.graphics.shapes import String
|
| 26 |
+
from reportlab.graphics.shapes import Drawing
|
| 27 |
+
from reportlab.lib.styles import ParagraphStyle
|
| 28 |
+
from reportlab.platypus import SimpleDocTemplate
|
| 29 |
+
from reportlab.lib.styles import getSampleStyleSheet
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class PDFReportGenerator:
    """Generate professional PDF contract-risk reports matching the sample style.

    Report layout:
        Page 1  - title, risk-score dial, executive summary,
                  unfavorable terms, missing protections
        Page 2  - negotiation points
        Page 3  - risk category breakdown table
        Page 4+ - clause-by-clause analysis table
    """

    def __init__(self):
        self.styles = getSampleStyleSheet()
        self._setup_custom_styles()

    def _setup_custom_styles(self):
        """Register the custom paragraph styles used throughout the report.

        NOTE: ``getSampleStyleSheet()`` already defines a style named
        'BodyText', and ``StyleSheet1.add()`` raises ``KeyError`` for
        duplicate names.  The report body style is therefore registered
        under the unique name 'ReportBody' (bug fix: the previous code
        crashed on instantiation when re-adding 'BodyText').
        """
        # Title style
        self.styles.add(ParagraphStyle(
            name='ReportTitle',
            parent=self.styles['Heading1'],
            fontSize=24,
            textColor=colors.HexColor('#1a1a1a'),
            spaceAfter=20,
            alignment=TA_LEFT,
            fontName='Helvetica-Bold',
        ))

        # Section heading
        self.styles.add(ParagraphStyle(
            name='SectionHeading',
            parent=self.styles['Heading2'],
            fontSize=16,
            textColor=colors.HexColor('#1a1a1a'),
            spaceAfter=12,
            spaceBefore=20,
            fontName='Helvetica-Bold',
        ))

        # Body text (unique name -- see docstring note above)
        self.styles.add(ParagraphStyle(
            name='ReportBody',
            parent=self.styles['Normal'],
            fontSize=10,
            leading=14,
            textColor=colors.HexColor('#333333'),
            alignment=TA_JUSTIFY,
            fontName='Helvetica',
        ))

        # Bullet point
        self.styles.add(ParagraphStyle(
            name='BulletPoint',
            parent=self.styles['Normal'],
            fontSize=10,
            leading=14,
            textColor=colors.HexColor('#333333'),
            leftIndent=20,
            bulletIndent=10,
            fontName='Helvetica',
        ))

        # Table header
        self.styles.add(ParagraphStyle(
            name='TableHeader',
            parent=self.styles['Normal'],
            fontSize=10,
            textColor=colors.HexColor('#1a1a1a'),
            fontName='Helvetica-Bold',
        ))

        # Footer
        self.styles.add(ParagraphStyle(
            name='Footer',
            parent=self.styles['Normal'],
            fontSize=8,
            textColor=colors.HexColor('#666666'),
            alignment=TA_CENTER,
            fontName='Helvetica',
        ))

    def _get_risk_color(self, score: int) -> colors.Color:
        """Map a 0-100 risk score onto the report's traffic-light palette.

        >= 80 red (critical), >= 60 orange (high), >= 40 yellow (medium),
        otherwise green (low).
        """
        if score >= 80:
            return colors.HexColor('#dc2626')
        elif score >= 60:
            return colors.HexColor('#f97316')
        elif score >= 40:
            return colors.HexColor('#ca8a04')
        else:
            return colors.HexColor('#16a34a')

    def _draw_risk_score_circle(self, score: int) -> Drawing:
        """Draw the risk-score dial: colored ring with the numeric score inside."""
        d = Drawing(150, 150)

        # Single source of truth for score -> color (was duplicated inline)
        color = self._get_risk_color(score)

        # Background circle
        bg_circle = Circle(75, 75, 60)
        bg_circle.fillColor = colors.HexColor('#f0f0f0')
        bg_circle.strokeColor = None
        d.add(bg_circle)

        # Score circle (the colored ring)
        score_circle = Circle(75, 75, 55)
        score_circle.fillColor = color
        score_circle.strokeColor = None
        d.add(score_circle)

        # Inner white circle, leaving only a colored ring visible
        inner_circle = Circle(75, 75, 45)
        inner_circle.fillColor = colors.white
        inner_circle.strokeColor = None
        d.add(inner_circle)

        # Score text, centered in the dial
        score_text = String(75, 70, str(score), textAnchor='middle')
        score_text.fontSize = 36
        score_text.fontName = 'Helvetica-Bold'
        score_text.fillColor = color
        d.add(score_text)

        return d

    def _create_header_footer(self, canv, doc):
        """Draw the per-page header and footer.

        Invoked by platypus as ``onFirstPage``/``onLaterPages`` callbacks.
        Parameter renamed from ``canvas`` to ``canv`` to avoid shadowing the
        ``reportlab.pdfgen.canvas`` module import.
        """
        canv.saveState()

        # Header
        canv.setFont('Helvetica-Bold', 12)
        canv.drawString(0.75 * inch, letter[1] - 0.5 * inch,
                        "AI Contract Risk Analysis Report")

        # Footer
        canv.setFont('Helvetica', 8)
        canv.setFillColor(colors.HexColor('#666666'))

        # Page number (SimpleDocTemplate has no page_count attribute,
        # hence the '?' fallback)
        page_num = f"Page {doc.page} of {doc.page_count if hasattr(doc, 'page_count') else '?'}"
        canv.drawString(7 * inch, 0.5 * inch, page_num)

        # Legal disclaimer
        disclaimer = "For informational purposes only. Not legal advice."
        canv.drawCentredString(letter[0] / 2, 0.5 * inch, disclaimer)

        canv.restoreState()

    def generate_report(self, analysis_result: Dict[str, Any],
                        output_path: Optional[str] = None) -> BytesIO:
        """
        Generate a PDF report from analysis results.

        Args:
            analysis_result: Analysis result dictionary from the API
            output_path: Optional file path to save the PDF; when given,
                the document is written to disk and the returned buffer
                stays empty.

        Returns:
            BytesIO buffer containing the PDF (when no output_path given)
        """
        buffer = BytesIO()

        # Write to the file path when supplied, otherwise to the buffer
        doc = SimpleDocTemplate(
            buffer if not output_path else output_path,
            pagesize=letter,
            rightMargin=0.75 * inch,
            leftMargin=0.75 * inch,
            topMargin=1 * inch,
            bottomMargin=1 * inch,
        )

        # Assemble the flowable "story" page by page
        story = []

        # Title and Risk Score (Page 1)
        story.extend(self._build_page_1(analysis_result))
        story.append(PageBreak())

        # Negotiation Points (Page 2)
        story.extend(self._build_page_2(analysis_result))
        story.append(PageBreak())

        # Risk Category Breakdown (Page 3)
        story.extend(self._build_page_3(analysis_result))

        # Clause-by-Clause Analysis (Page 4+)
        story.append(PageBreak())
        story.extend(self._build_clause_analysis(analysis_result))

        # Render with header/footer on every page
        doc.build(story, onFirstPage=self._create_header_footer,
                  onLaterPages=self._create_header_footer)

        # Rewind the buffer so callers can stream it directly
        if not output_path:
            buffer.seek(0)

        return buffer

    def _build_page_1(self, result: Dict) -> List:
        """Build page 1: title, risk dial, executive summary, key items.

        Expects ``result`` to contain 'risk_analysis.overall_score',
        'executive_summary', 'unfavorable_terms' and 'missing_protections'
        (schema per the API result this module renders).
        """
        elements = []

        # Title
        elements.append(Paragraph("AI Contract Risk Analysis Report",
                                  self.styles['ReportTitle']))
        elements.append(Spacer(1, 0.1 * inch))

        # Risk Score Circle
        risk_score = result['risk_analysis']['overall_score']
        elements.append(self._draw_risk_score_circle(risk_score))
        elements.append(Spacer(1, 0.2 * inch))

        # Executive Summary
        elements.append(Paragraph("Executive Summary",
                                  self.styles['SectionHeading']))
        elements.append(Paragraph(result['executive_summary'],
                                  self.styles['ReportBody']))
        elements.append(Spacer(1, 0.2 * inch))

        # Unfavorable Terms
        elements.append(Paragraph("Unfavorable Terms",
                                  self.styles['SectionHeading']))

        for term in result['unfavorable_terms'][:8]:  # Limit to 8 items
            bullet_text = f"<bullet>•</bullet> <b>{term.get('clause_reference', term['term'])}:</b> {term['explanation']}"
            elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
            elements.append(Spacer(1, 0.05 * inch))

        elements.append(Spacer(1, 0.2 * inch))

        # Missing Protections
        elements.append(Paragraph("Missing Protections",
                                  self.styles['SectionHeading']))

        for protection in result['missing_protections'][:6]:  # Limit to 6 items
            bullet_text = f"<bullet>•</bullet> <b>{protection['protection']}:</b> {protection['explanation']}"
            elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
            elements.append(Spacer(1, 0.05 * inch))

        return elements

    def _build_page_2(self, result: Dict) -> List:
        """Build page 2: negotiation points.

        Falls back to the unfavorable terms' suggested fixes when the
        result has no 'negotiation_points' list.
        """
        elements = []

        elements.append(Paragraph("Negotiation Points",
                                  self.styles['SectionHeading']))
        elements.append(Spacer(1, 0.1 * inch))

        negotiation_points = result.get('negotiation_points', [])

        if negotiation_points:
            for point in negotiation_points[:7]:  # Limit to 7 points
                bullet_text = f"<bullet>•</bullet> {point['issue']}: {point['rationale']}"
                elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
                elements.append(Spacer(1, 0.1 * inch))
        else:
            # Fallback to unfavorable terms if negotiation points not available
            for term in result['unfavorable_terms'][:7]:
                if term.get('suggested_fix'):
                    bullet_text = f"<bullet>•</bullet> {term['term']}: {term['suggested_fix']}"
                    elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
                    elements.append(Spacer(1, 0.1 * inch))

        return elements

    def _build_page_3(self, result: Dict) -> List:
        """Build page 3: risk category breakdown rendered as a 3-column table."""
        elements = []

        elements.append(Paragraph("Risk Category Breakdown",
                                  self.styles['SectionHeading']))
        elements.append(Spacer(1, 0.15 * inch))

        # Header row
        table_data = [
            [
                Paragraph('<b>Category</b>', self.styles['TableHeader']),
                Paragraph('<b>Score</b>', self.styles['TableHeader']),
                Paragraph('<b>Summary</b>', self.styles['TableHeader']),
            ]
        ]

        risk_breakdown = result['risk_analysis'].get('risk_breakdown', [])

        for category in risk_breakdown:
            score_color = self._get_risk_color(category['score'])

            category_cell = Paragraph(category['category'], self.styles['ReportBody'])
            score_cell = Paragraph(
                f'<font color="{score_color.hexval()}"><b>{category["score"]}</b></font>',
                self.styles['TableHeader'],
            )
            summary_cell = Paragraph(category['summary'], self.styles['ReportBody'])

            table_data.append([category_cell, score_cell, summary_cell])

        # Create table
        table = Table(table_data, colWidths=[1.8 * inch, 0.7 * inch, 4 * inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (1, 0), (1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('TOPPADDING', (0, 1), (-1, -1), 10),
            ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ]))

        elements.append(table)

        return elements

    def _build_clause_analysis(self, result: Dict) -> List:
        """Build the clause-by-clause analysis table from unfavorable terms."""
        elements = []

        elements.append(Paragraph("Clause-by-Clause Analysis",
                                  self.styles['SectionHeading']))
        elements.append(Spacer(1, 0.15 * inch))

        # Header row
        table_data = [
            [
                Paragraph('<b>Clause</b>', self.styles['TableHeader']),
                Paragraph('<b>Risk Level</b>', self.styles['TableHeader']),
                Paragraph('<b>Analysis</b>', self.styles['TableHeader']),
                Paragraph('<b>Recommendation</b>', self.styles['TableHeader']),
            ]
        ]

        unfavorable_terms = result.get('unfavorable_terms', [])

        for term in unfavorable_terms[:10]:  # Limit to 10 clauses
            clause_ref = term.get('clause_reference', term['term'])

            analysis_text = term['explanation']
            recommendation_text = term.get('suggested_fix', 'Negotiate or seek legal advice.')

            # Map severity -> displayed risk level and color
            severity = term.get('severity', 'high')
            if severity == 'critical':
                risk_level = 'Critical'
                risk_color = colors.HexColor('#dc2626')
            elif severity == 'high':
                risk_level = 'High'
                risk_color = colors.HexColor('#f97316')
            else:
                risk_level = 'Medium'
                risk_color = colors.HexColor('#ca8a04')

            clause_cell = Paragraph(clause_ref, self.styles['ReportBody'])
            risk_cell = Paragraph(
                f'<font color="{risk_color.hexval()}"><b>{risk_level}</b></font>',
                self.styles['TableHeader'],
            )
            analysis_cell = Paragraph(analysis_text, self.styles['ReportBody'])
            recommendation_cell = Paragraph(recommendation_text, self.styles['ReportBody'])

            table_data.append([clause_cell, risk_cell, analysis_cell, recommendation_cell])

        # Create table
        table = Table(table_data, colWidths=[1.5 * inch, 0.8 * inch, 2.2 * inch, 2 * inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (1, 0), (1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('TOPPADDING', (0, 1), (-1, -1), 10),
            ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ]))

        elements.append(table)

        return elements
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def generate_pdf_report(analysis_result: Dict[str, Any],
                        output_path: Optional[str] = None) -> BytesIO:
    """Build a contract-risk PDF report via a throwaway PDFReportGenerator.

    Args:
        analysis_result: Complete analysis result from the API.
        output_path: Optional file path; when provided the PDF is written
            to disk instead of being returned in the buffer.

    Returns:
        BytesIO buffer containing the PDF.
    """
    return PDFReportGenerator().generate_report(analysis_result, output_path)
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
if __name__ == "__main__":
    # Smoke test: render a representative analysis payload to test_report.pdf.
    # The dict mirrors the API result schema consumed by PDFReportGenerator:
    # risk_analysis (score + breakdown), executive_summary, unfavorable_terms,
    # missing_protections, negotiation_points.
    sample_result = {
        "analysis_id": "test-123",
        "timestamp": datetime.now().isoformat(),
        "risk_analysis": {
            "overall_score": 85,
            "risk_level": "CRITICAL",
            "risk_breakdown": [
                {
                    "category": "Restrictive Covenants",
                    "score": 95,
                    "summary": "The agreement contains exceptionally broad and long-lasting non-compete (24 months) and non-solicitation (5 years) clauses."
                },
                {
                    "category": "Penalties & Termination",
                    "score": 90,
                    "summary": "The contract includes severe penalties for breach, including forfeiture of earned salary."
                }
            ]
        },
        "executive_summary": "This employment agreement is heavily skewed in favor of the Employer, presenting a very high risk.",
        "unfavorable_terms": [
            {
                "term": "Undefined Post-Probation Salary",
                "clause_reference": "Clause 8.2",
                "severity": "critical",
                "explanation": "Post-probation salary is undefined ('as discussed').",
                "suggested_fix": "Insist that the exact salary be explicitly stated."
            }
        ],
        "missing_protections": [
            {
                "protection": "Defined Post-Probation Salary",
                "importance": "critical",
                "explanation": "The contract lacks a specific, written salary commitment."
            }
        ],
        "negotiation_points": [
            {
                "issue": "Post-probation salary",
                "rationale": "Must be explicitly defined in writing before signing."
            }
        ]
    }

    # Writes test_report.pdf to the current working directory.
    buffer = generate_pdf_report(sample_result, "test_report.pdf")
    print("Test PDF generated successfully!")
|
requirements.txt
CHANGED
|
@@ -1,15 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# LLM Providers
|
| 2 |
-
openai>=1.
|
| 3 |
-
anthropic>=0.18.0
|
|
|
|
| 4 |
Ollama
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
pydantic
|
|
|
|
| 1 |
+
# FastAPI & Server
|
| 2 |
+
fastapi==0.104.1
|
| 3 |
+
uvicorn[standard]==0.24.0
|
| 4 |
+
python-multipart==0.0.6
|
| 5 |
+
|
| 6 |
+
# ML & NLP
|
| 7 |
+
transformers==4.35.2
|
| 8 |
+
torch==2.1.1
|
| 9 |
+
sentence-transformers==2.2.2
|
| 10 |
+
spacy
|
| 11 |
+
|
| 12 |
+
# Document Processing
|
| 13 |
+
PyPDF2==3.0.1
|
| 14 |
+
PyMuPDF==1.23.8
|
| 15 |
+
python-docx==1.1.0
|
| 16 |
+
|
| 17 |
+
|
| 18 |
# LLM Providers
|
| 19 |
+
openai>=1.3.0
|
| 20 |
+
anthropic>=0.18.0
|
| 21 |
+
requests==2.31.0
|
| 22 |
# Ollama
|
| 23 |
|
| 24 |
+
# Data & Validation
|
| 25 |
+
pydantic==2.5.0
|
| 26 |
+
pydantic-settings==2.1.0
|
| 27 |
+
|
| 28 |
+
# Utilities
|
| 29 |
+
python-dotenv==1.0.0
|
| 30 |
+
|
| 31 |
+
# PDF report generation
|
| 32 |
+
reportlab>=4.0.0
|
|
|
services/clause_extractor.py
CHANGED
|
@@ -1,23 +1,26 @@
|
|
| 1 |
-
|
| 2 |
-
Advanced Clause Extractor using Legal-BERT + Structural Patterns
|
| 3 |
-
Uses nlpaueb/legal-bert-base-uncased for semantic clause understanding
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import torch
|
| 7 |
import re
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
from collections import defaultdict
|
| 11 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from sentence_transformers import util
|
| 13 |
|
| 14 |
# Import utilities
|
| 15 |
-
import sys
|
| 16 |
-
from pathlib import Path
|
| 17 |
sys.path.append(str(Path(__file__).parent.parent))
|
| 18 |
|
| 19 |
-
from utils.logger import
|
|
|
|
| 20 |
from utils.text_processor import TextProcessor
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
@dataclass
|
|
@@ -25,37 +28,40 @@ class ExtractedClause:
|
|
| 25 |
"""
|
| 26 |
Extracted clause with comprehensive metadata
|
| 27 |
"""
|
| 28 |
-
text: str
|
| 29 |
-
reference: str
|
| 30 |
-
category: str
|
| 31 |
-
confidence: float # 0.0-1.0
|
| 32 |
-
start_pos: int
|
| 33 |
-
end_pos: int
|
| 34 |
-
extraction_method: str
|
| 35 |
-
risk_indicators: List[str]
|
| 36 |
-
embeddings: Optional[np.ndarray] = None
|
| 37 |
-
subclauses: List[str]
|
| 38 |
-
legal_bert_score: float
|
|
|
|
| 39 |
|
| 40 |
def to_dict(self) -> Dict[str, Any]:
|
| 41 |
-
"""
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
class ClauseExtractor:
|
| 57 |
"""
|
| 58 |
-
|
| 59 |
|
| 60 |
Process:
|
| 61 |
1. Structural extraction (numbered sections like "5.2", "Article III")
|
|
@@ -64,222 +70,123 @@ class ClauseExtractor:
|
|
| 64 |
4. Category classification using Legal-BERT + keyword matching
|
| 65 |
5. Deduplication and ranking
|
| 66 |
"""
|
| 67 |
-
|
| 68 |
-
# =========================================================================
|
| 69 |
# CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
CLAUSE_CATEGORIES = {
|
| 73 |
-
'compensation': {
|
| 74 |
-
'keywords': ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus',
|
| 75 |
-
'commission', 'remuneration', 'fee', 'rate', 'benefits'],
|
| 76 |
-
'representative_text': (
|
| 77 |
-
"The Employee shall receive an annual base salary of One Hundred Thousand Dollars "
|
| 78 |
-
"payable in accordance with the Company's standard payroll practices. "
|
| 79 |
-
"Additional compensation may include performance bonuses and stock options."
|
| 80 |
-
),
|
| 81 |
-
'weight': 1.0
|
| 82 |
-
},
|
| 83 |
-
'termination': {
|
| 84 |
-
'keywords': ['termination', 'terminate', 'notice period', 'resignation',
|
| 85 |
-
'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
|
| 86 |
-
'representative_text': (
|
| 87 |
-
"Either party may terminate this Agreement upon thirty days written notice. "
|
| 88 |
-
"The Company may terminate for cause immediately upon written notice to Employee. "
|
| 89 |
-
"Upon termination, Employee shall receive severance compensation."
|
| 90 |
-
),
|
| 91 |
-
'weight': 1.2
|
| 92 |
-
},
|
| 93 |
-
'non_compete': {
|
| 94 |
-
'keywords': ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant',
|
| 95 |
-
'competitive', 'competition', 'competing business', 'competitive activities'],
|
| 96 |
-
'representative_text': (
|
| 97 |
-
"Employee agrees not to engage in any competitive business activities for a period "
|
| 98 |
-
"of twelve months following termination within a fifty-mile radius. "
|
| 99 |
-
"Employee shall not solicit Company clients or employees during this period."
|
| 100 |
-
),
|
| 101 |
-
'weight': 1.5
|
| 102 |
-
},
|
| 103 |
-
'confidentiality': {
|
| 104 |
-
'keywords': ['confidential', 'proprietary', 'trade secret', 'disclosure',
|
| 105 |
-
'confidentiality', 'secret', 'private', 'non-disclosure'],
|
| 106 |
-
'representative_text': (
|
| 107 |
-
"Employee shall maintain the confidentiality of all proprietary information "
|
| 108 |
-
"and trade secrets of the Company. Confidential Information includes business plans, "
|
| 109 |
-
"customer lists, and technical data. These obligations survive termination."
|
| 110 |
-
),
|
| 111 |
-
'weight': 1.1
|
| 112 |
-
},
|
| 113 |
-
'indemnification': {
|
| 114 |
-
'keywords': ['indemnify', 'indemnification', 'hold harmless', 'defend',
|
| 115 |
-
'liability', 'claims', 'losses', 'damages'],
|
| 116 |
-
'representative_text': (
|
| 117 |
-
"Party A shall indemnify and hold harmless Party B from any claims, losses, "
|
| 118 |
-
"or damages arising from Party A's breach or negligence. This indemnification "
|
| 119 |
-
"includes reasonable attorneys' fees and costs of defense."
|
| 120 |
-
),
|
| 121 |
-
'weight': 1.3
|
| 122 |
-
},
|
| 123 |
-
'intellectual_property': {
|
| 124 |
-
'keywords': ['intellectual property', 'ip', 'copyright', 'patent', 'trademark',
|
| 125 |
-
'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
|
| 126 |
-
'representative_text': (
|
| 127 |
-
"All work product and inventions created by Employee during employment shall be "
|
| 128 |
-
"the exclusive property of the Company. Employee assigns all intellectual property "
|
| 129 |
-
"rights including patents, copyrights, and trade secrets to the Company."
|
| 130 |
-
),
|
| 131 |
-
'weight': 1.2
|
| 132 |
-
},
|
| 133 |
-
'liability': {
|
| 134 |
-
'keywords': ['liable', 'liability', 'damages', 'limitation', 'consequential',
|
| 135 |
-
'indirect', 'punitive', 'cap', 'limited liability'],
|
| 136 |
-
'representative_text': (
|
| 137 |
-
"In no event shall either party be liable for indirect, incidental, or consequential "
|
| 138 |
-
"damages. Total liability under this Agreement shall not exceed the amounts paid "
|
| 139 |
-
"in the twelve months preceding the claim."
|
| 140 |
-
),
|
| 141 |
-
'weight': 1.2
|
| 142 |
-
},
|
| 143 |
-
'warranty': {
|
| 144 |
-
'keywords': ['warranty', 'warrant', 'representation', 'guarantee',
|
| 145 |
-
'assurance', 'promise', 'warranties'],
|
| 146 |
-
'representative_text': (
|
| 147 |
-
"Company warrants that the Services will be performed in a professional manner. "
|
| 148 |
-
"EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, "
|
| 149 |
-
"INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."
|
| 150 |
-
),
|
| 151 |
-
'weight': 0.9
|
| 152 |
-
},
|
| 153 |
-
'dispute_resolution': {
|
| 154 |
-
'keywords': ['arbitration', 'mediation', 'dispute', 'jurisdiction',
|
| 155 |
-
'governing law', 'venue', 'forum', 'resolution'],
|
| 156 |
-
'representative_text': (
|
| 157 |
-
"Any disputes arising under this Agreement shall be resolved through binding arbitration "
|
| 158 |
-
"in accordance with the rules of the American Arbitration Association. "
|
| 159 |
-
"This Agreement shall be governed by the laws of the State of California."
|
| 160 |
-
),
|
| 161 |
-
'weight': 0.9
|
| 162 |
-
},
|
| 163 |
-
'insurance': {
|
| 164 |
-
'keywords': ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
|
| 165 |
-
'representative_text': (
|
| 166 |
-
"Contractor shall maintain general liability insurance with minimum coverage of "
|
| 167 |
-
"one million dollars per occurrence. Proof of insurance shall be provided to Client. "
|
| 168 |
-
"Company shall be named as additional insured on all policies."
|
| 169 |
-
),
|
| 170 |
-
'weight': 0.8
|
| 171 |
-
},
|
| 172 |
-
'assignment': {
|
| 173 |
-
'keywords': ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
|
| 174 |
-
'representative_text': (
|
| 175 |
-
"This Agreement may not be assigned by either party without the prior written consent "
|
| 176 |
-
"of the other party. This Agreement shall be binding upon and inure to the benefit "
|
| 177 |
-
"of the parties' successors and permitted assigns."
|
| 178 |
-
),
|
| 179 |
-
'weight': 0.8
|
| 180 |
-
},
|
| 181 |
-
'amendment': {
|
| 182 |
-
'keywords': ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
|
| 183 |
-
'representative_text': (
|
| 184 |
-
"This Agreement may not be amended or modified except by written instrument signed "
|
| 185 |
-
"by both parties. No waiver of any provision shall be effective unless in writing. "
|
| 186 |
-
"All modifications must be mutually agreed upon."
|
| 187 |
-
),
|
| 188 |
-
'weight': 0.7
|
| 189 |
-
},
|
| 190 |
-
'force_majeure': {
|
| 191 |
-
'keywords': ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
|
| 192 |
-
'representative_text': (
|
| 193 |
-
"Neither party shall be liable for failure to perform due to causes beyond its reasonable "
|
| 194 |
-
"control including acts of God, war, strikes, or natural disasters. "
|
| 195 |
-
"Performance shall be suspended during the force majeure event."
|
| 196 |
-
),
|
| 197 |
-
'weight': 0.7
|
| 198 |
-
},
|
| 199 |
-
'entire_agreement': {
|
| 200 |
-
'keywords': ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
|
| 201 |
-
'representative_text': (
|
| 202 |
-
"This Agreement constitutes the entire agreement between the parties and supersedes "
|
| 203 |
-
"all prior agreements, whether written or oral. No other representations or warranties "
|
| 204 |
-
"shall be binding unless incorporated herein."
|
| 205 |
-
),
|
| 206 |
-
'weight': 0.6
|
| 207 |
-
},
|
| 208 |
-
'general': {
|
| 209 |
-
'keywords': ['provision', 'term', 'condition', 'obligation', 'requirement'],
|
| 210 |
-
'representative_text': (
|
| 211 |
-
"The parties agree to the following terms and conditions governing their relationship. "
|
| 212 |
-
"Each party shall perform its obligations in good faith and in accordance with "
|
| 213 |
-
"industry standards and applicable law."
|
| 214 |
-
),
|
| 215 |
-
'weight': 0.5
|
| 216 |
-
}
|
| 217 |
-
}
|
| 218 |
-
|
| 219 |
-
# =========================================================================
|
| 220 |
# RISK INDICATOR PATTERNS
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
'unlimited liability', 'perpetual', 'irrevocable', 'forfeit',
|
| 226 |
-
'liquidated damages', 'wage withholding', 'joint and several'
|
| 227 |
-
],
|
| 228 |
-
'high': [
|
| 229 |
-
'non-compete', 'non-solicit', 'penalty', 'without cause',
|
| 230 |
-
'sole discretion', 'immediate termination', 'at-will'
|
| 231 |
-
],
|
| 232 |
-
'medium': [
|
| 233 |
-
'indemnify', 'hold harmless', 'confidential', 'proprietary',
|
| 234 |
-
'exclusive', 'terminate', 'default', 'breach'
|
| 235 |
-
]
|
| 236 |
-
}
|
| 237 |
|
| 238 |
-
|
| 239 |
# INITIALIZATION
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
def __init__(self, model_loader, contract_category: Optional[str] = None):
|
| 243 |
"""
|
| 244 |
Initialize clause extractor with Legal-BERT
|
| 245 |
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
"""
|
| 250 |
-
self.model_loader
|
| 251 |
-
self.contract_category
|
| 252 |
|
| 253 |
# Models (lazy loaded)
|
| 254 |
-
self.legal_bert_model
|
| 255 |
self.legal_bert_tokenizer = None
|
| 256 |
-
self.embedding_model
|
| 257 |
-
self.device
|
| 258 |
|
| 259 |
# Category embeddings (computed from representative texts)
|
| 260 |
-
self.category_embeddings
|
| 261 |
|
| 262 |
# Text processor
|
| 263 |
-
self.text_processor
|
| 264 |
|
| 265 |
# Logger
|
| 266 |
-
self.logger
|
| 267 |
|
| 268 |
# Lazy load
|
| 269 |
self._lazy_load()
|
| 270 |
|
|
|
|
| 271 |
def _lazy_load(self):
|
| 272 |
-
"""
|
|
|
|
|
|
|
| 273 |
if self.legal_bert_model is None:
|
| 274 |
try:
|
| 275 |
log_info("Loading Legal-BERT for clause extraction...")
|
| 276 |
|
| 277 |
# Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
|
| 278 |
self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
|
| 279 |
-
self.device
|
| 280 |
|
| 281 |
# Load sentence transformer for embeddings
|
| 282 |
-
self.embedding_model
|
| 283 |
|
| 284 |
# Prepare category embeddings using Legal-BERT
|
| 285 |
self._prepare_category_embeddings()
|
|
@@ -287,59 +194,61 @@ class ClauseExtractor:
|
|
| 287 |
log_info("Clause extractor models loaded successfully")
|
| 288 |
|
| 289 |
except Exception as e:
|
| 290 |
-
log_error(e, context={"component": "ClauseExtractor", "operation": "model_loading"})
|
| 291 |
raise
|
|
|
|
| 292 |
|
| 293 |
def _prepare_category_embeddings(self):
|
| 294 |
"""
|
| 295 |
Pre-compute Legal-BERT embeddings for category representative texts
|
|
|
|
| 296 |
This enables semantic similarity matching for clause classification
|
| 297 |
"""
|
| 298 |
log_info("Computing Legal-BERT embeddings for clause categories...")
|
| 299 |
|
| 300 |
for category, config in self.CLAUSE_CATEGORIES.items():
|
| 301 |
-
representative_text
|
| 302 |
|
| 303 |
# Get Legal-BERT embedding (using [CLS] token)
|
| 304 |
-
embedding
|
|
|
|
| 305 |
self.category_embeddings[category] = embedding
|
| 306 |
|
| 307 |
log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
|
| 308 |
|
|
|
|
| 309 |
def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
|
| 310 |
"""
|
| 311 |
Get Legal-BERT embedding for text using [CLS] token
|
| 312 |
|
| 313 |
-
|
| 314 |
-
|
|
|
|
| 315 |
|
| 316 |
Returns:
|
| 317 |
-
|
|
|
|
| 318 |
"""
|
| 319 |
# Tokenize
|
| 320 |
-
inputs = self.legal_bert_tokenizer(
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
).to(self.device)
|
| 327 |
|
| 328 |
# Get embeddings
|
| 329 |
with torch.no_grad():
|
| 330 |
-
outputs
|
| 331 |
# Use [CLS] token embedding (first token)
|
| 332 |
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
|
| 333 |
|
| 334 |
return cls_embedding
|
| 335 |
|
| 336 |
-
# =========================================================================
|
| 337 |
-
# MAIN EXTRACTION METHOD
|
| 338 |
-
# =========================================================================
|
| 339 |
|
|
|
|
| 340 |
@ContractAnalyzerLogger.log_execution_time("extract_clauses")
|
| 341 |
-
def extract_clauses(self, contract_text: str,
|
| 342 |
-
max_clauses: int = 15) -> List[ExtractedClause]:
|
| 343 |
"""
|
| 344 |
Extract and classify clauses from contract using hybrid approach
|
| 345 |
|
|
@@ -349,44 +258,45 @@ class ClauseExtractor:
|
|
| 349 |
3. Legal-BERT classification
|
| 350 |
4. Deduplicate and rank by confidence
|
| 351 |
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
| 355 |
|
| 356 |
Returns:
|
| 357 |
-
|
|
|
|
| 358 |
"""
|
| 359 |
|
| 360 |
log_info("Starting clause extraction",
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
|
|
|
| 364 |
|
| 365 |
-
#
|
| 366 |
structural_clauses = self._extract_structural_clauses(contract_text)
|
| 367 |
log_info(f"Extracted {len(structural_clauses)} structural clauses")
|
| 368 |
|
| 369 |
-
#
|
| 370 |
-
semantic_chunks
|
| 371 |
log_info(f"Created {len(semantic_chunks)} semantic chunks")
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
all_candidates
|
| 375 |
log_info(f"Total candidates: {len(all_candidates)}")
|
| 376 |
|
| 377 |
-
#
|
| 378 |
classified_clauses = self._classify_clauses_with_legal_bert(all_candidates)
|
| 379 |
log_info(f"Classified {len(classified_clauses)} clauses")
|
| 380 |
|
| 381 |
-
#
|
| 382 |
-
final_clauses
|
| 383 |
log_info(f"Final output: {len(final_clauses)} clauses")
|
| 384 |
|
| 385 |
return final_clauses
|
| 386 |
|
| 387 |
-
# =========================================================================
|
| 388 |
-
# STEP 1: STRUCTURAL EXTRACTION
|
| 389 |
-
# =========================================================================
|
| 390 |
|
| 391 |
def _extract_structural_clauses(self, text: str) -> List[Dict]:
|
| 392 |
"""
|
|
@@ -398,27 +308,22 @@ class ClauseExtractor:
|
|
| 398 |
- "Article III. Text"
|
| 399 |
- "Clause 11. Text"
|
| 400 |
"""
|
| 401 |
-
candidates =
|
| 402 |
|
| 403 |
# Clean text
|
| 404 |
-
text
|
| 405 |
|
| 406 |
# Patterns for legal numbering
|
| 407 |
-
patterns
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
(r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
|
| 414 |
-
# Match: "Clause 1.1. Text"
|
| 415 |
-
(r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
|
| 416 |
-
# Match: "(a) Text", "(i) Text" - sub-clauses
|
| 417 |
-
(r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\([a-z]|[ivxlcdm]+\)|\n\n|$)', 'subclause')
|
| 418 |
-
]
|
| 419 |
|
| 420 |
for pattern, ref_type in patterns:
|
| 421 |
matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
|
|
|
|
| 422 |
for match in matches:
|
| 423 |
clause_text = match.group(2).strip()
|
| 424 |
|
|
@@ -426,69 +331,105 @@ class ClauseExtractor:
|
|
| 426 |
if not self._is_boilerplate(clause_text):
|
| 427 |
# Check for meaningful content
|
| 428 |
if self._has_meaningful_content(clause_text):
|
| 429 |
-
candidates.append({
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
})
|
| 437 |
|
| 438 |
# Remove overlapping clauses
|
| 439 |
candidates = self._remove_overlapping(candidates)
|
| 440 |
|
| 441 |
return candidates
|
|
|
|
| 442 |
|
| 443 |
def _is_boilerplate(self, text: str) -> bool:
|
| 444 |
-
"""
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
# Must have at least one strong indicator AND be definition-heavy
|
| 453 |
-
has_indicator
|
| 454 |
-
is_short_definition
|
| 455 |
|
| 456 |
return has_indicator or is_short_definition
|
| 457 |
|
|
|
|
| 458 |
def _has_meaningful_content(self, text: str) -> bool:
|
| 459 |
-
"""
|
|
|
|
|
|
|
| 460 |
# Must have minimum length
|
| 461 |
-
if len(text.split()) < 15:
|
| 462 |
return False
|
| 463 |
|
| 464 |
# Check for legal action verbs
|
| 465 |
-
action_verbs
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
# Check for legal subjects
|
| 475 |
-
legal_subjects = [
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
return has_action or has_subject
|
| 484 |
|
|
|
|
| 485 |
def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
|
| 486 |
-
"""
|
|
|
|
|
|
|
| 487 |
if not candidates:
|
| 488 |
return []
|
| 489 |
|
| 490 |
# Sort by start position
|
| 491 |
-
candidates.sort(key=lambda x: x['start'])
|
| 492 |
|
| 493 |
non_overlapping = [candidates[0]]
|
| 494 |
|
|
@@ -496,41 +437,35 @@ class ClauseExtractor:
|
|
| 496 |
last = non_overlapping[-1]
|
| 497 |
|
| 498 |
# Check if overlaps
|
| 499 |
-
if candidate['start'] >= last['end']:
|
| 500 |
non_overlapping.append(candidate)
|
| 501 |
-
|
|
|
|
| 502 |
# Keep longer clause if overlapping
|
| 503 |
non_overlapping[-1] = candidate
|
| 504 |
|
| 505 |
return non_overlapping
|
| 506 |
|
| 507 |
-
# =========================================================================
|
| 508 |
-
# STEP 2: SEMANTIC CHUNKING
|
| 509 |
-
# =========================================================================
|
| 510 |
|
| 511 |
-
def _semantic_chunking(self, text: str,
|
| 512 |
-
structural_clauses: List[Dict],
|
| 513 |
-
chunk_size: int = 200) -> List[Dict]:
|
| 514 |
"""
|
| 515 |
-
Chunk unstructured text semantically
|
| 516 |
-
Uses sentence boundaries to find natural clause boundaries
|
| 517 |
"""
|
| 518 |
-
|
| 519 |
# Get covered ranges from structural clauses
|
| 520 |
covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
|
| 521 |
|
| 522 |
# Split into sentences
|
| 523 |
-
sentences
|
| 524 |
|
| 525 |
-
chunks
|
| 526 |
-
current_chunk
|
| 527 |
current_length = 0
|
| 528 |
-
current_start
|
| 529 |
|
| 530 |
for sentence in sentences:
|
| 531 |
# Check if sentence is already covered by structural extraction
|
| 532 |
sentence_start = text.find(sentence, current_start)
|
| 533 |
-
if sentence_start == -1:
|
| 534 |
continue
|
| 535 |
|
| 536 |
if self._is_in_range(sentence_start, covered_ranges):
|
|
@@ -541,21 +476,20 @@ class ClauseExtractor:
|
|
| 541 |
current_length += len(sentence.split())
|
| 542 |
|
| 543 |
# Create chunk when reaching size limit
|
| 544 |
-
if current_length >= chunk_size:
|
| 545 |
chunk_text = ' '.join(current_chunk).strip()
|
| 546 |
|
| 547 |
-
if len(chunk_text) >= 50 and not self._is_boilerplate(chunk_text):
|
| 548 |
if self._has_meaningful_content(chunk_text):
|
| 549 |
-
chunks.append({
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
})
|
| 557 |
|
| 558 |
-
current_chunk
|
| 559 |
current_length = 0
|
| 560 |
|
| 561 |
current_start = sentence_start + len(sentence)
|
|
@@ -563,145 +497,145 @@ class ClauseExtractor:
|
|
| 563 |
# Add final chunk if exists
|
| 564 |
if current_chunk:
|
| 565 |
chunk_text = ' '.join(current_chunk).strip()
|
| 566 |
-
|
|
|
|
| 567 |
if self._has_meaningful_content(chunk_text):
|
| 568 |
sentence_start = text.find(current_chunk[0])
|
| 569 |
-
chunks.append({
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
})
|
| 577 |
|
| 578 |
return chunks
|
| 579 |
|
|
|
|
| 580 |
def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
|
| 581 |
-
"""
|
|
|
|
|
|
|
| 582 |
return any(start <= position <= end for start, end in ranges)
|
| 583 |
|
| 584 |
-
# =========================================================================
|
| 585 |
-
# STEP 3: LEGAL-BERT CLASSIFICATION
|
| 586 |
-
# =========================================================================
|
| 587 |
|
| 588 |
def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
|
| 589 |
"""
|
| 590 |
Classify clauses using Legal-BERT embeddings + keyword matching
|
| 591 |
"""
|
| 592 |
-
classified =
|
| 593 |
|
| 594 |
for candidate in candidates:
|
| 595 |
# Get Legal-BERT embedding for clause
|
| 596 |
-
clause_embedding
|
| 597 |
|
| 598 |
# Classify using hybrid approach
|
| 599 |
-
category, confidence, legal_bert_score = self._classify_single_clause(
|
| 600 |
-
candidate['text'],
|
| 601 |
-
clause_embedding
|
| 602 |
-
)
|
| 603 |
|
| 604 |
# Extract risk indicators
|
| 605 |
-
risk_indicators
|
| 606 |
|
| 607 |
# Extract sub-clauses if any
|
| 608 |
-
subclauses
|
| 609 |
|
| 610 |
-
classified.append(ExtractedClause(
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
|
| 624 |
return classified
|
| 625 |
|
| 626 |
-
|
| 627 |
-
|
| 628 |
"""
|
| 629 |
Classify single clause using Legal-BERT + keyword matching
|
| 630 |
|
| 631 |
Returns:
|
| 632 |
-
|
|
|
|
| 633 |
"""
|
| 634 |
-
text_lower
|
| 635 |
|
| 636 |
-
#
|
| 637 |
-
keyword_scores =
|
|
|
|
| 638 |
for category, config in self.CLAUSE_CATEGORIES.items():
|
| 639 |
-
keywords
|
| 640 |
-
weight
|
| 641 |
|
| 642 |
-
keyword_count
|
| 643 |
keyword_scores[category] = (keyword_count / len(keywords)) * weight
|
| 644 |
|
| 645 |
-
#
|
| 646 |
-
semantic_scores
|
| 647 |
clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
|
| 648 |
|
| 649 |
for category, cat_embedding in self.category_embeddings.items():
|
| 650 |
-
cat_embedding_tensor
|
| 651 |
-
similarity
|
| 652 |
-
clause_embedding_tensor,
|
| 653 |
-
cat_embedding_tensor
|
| 654 |
-
).item()
|
| 655 |
semantic_scores[category] = similarity
|
| 656 |
|
| 657 |
# Combine scores (70% semantic, 30% keyword)
|
| 658 |
-
combined_scores =
|
|
|
|
| 659 |
for category in self.CLAUSE_CATEGORIES.keys():
|
| 660 |
-
combined
|
| 661 |
-
semantic_scores.get(category, 0) * 0.70 +
|
| 662 |
-
keyword_scores.get(category, 0) * 0.30
|
| 663 |
-
)
|
| 664 |
combined_scores[category] = combined
|
| 665 |
|
| 666 |
# Get best category
|
| 667 |
-
best_category
|
| 668 |
-
confidence
|
| 669 |
legal_bert_score = semantic_scores[best_category]
|
| 670 |
|
| 671 |
return best_category, confidence, legal_bert_score
|
| 672 |
|
|
|
|
| 673 |
def _extract_risk_indicators(self, text: str) -> List[str]:
|
| 674 |
-
"""
|
| 675 |
-
|
| 676 |
-
|
|
|
|
|
|
|
| 677 |
|
| 678 |
for severity, indicators in self.RISK_INDICATORS.items():
|
| 679 |
for indicator in indicators:
|
| 680 |
if indicator in text_lower:
|
| 681 |
found_indicators.append(indicator)
|
| 682 |
|
| 683 |
-
|
|
|
|
| 684 |
|
|
|
|
| 685 |
def _extract_subclauses(self, text: str) -> List[str]:
|
| 686 |
-
"""
|
|
|
|
|
|
|
| 687 |
# Pattern for sub-clauses: (a), (i), etc.
|
| 688 |
subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\([a-z]|[ivxlcdm]+\)|$)'
|
| 689 |
-
matches
|
| 690 |
|
| 691 |
-
subclauses
|
|
|
|
| 692 |
for ref, subtext in matches:
|
| 693 |
clean_text = subtext.strip()
|
| 694 |
-
|
|
|
|
| 695 |
subclauses.append(f"({ref}) {clean_text}")
|
| 696 |
|
| 697 |
-
|
|
|
|
| 698 |
|
| 699 |
-
# =========================================================================
|
| 700 |
-
# STEP 4: DEDUPLICATION AND RANKING
|
| 701 |
-
# =========================================================================
|
| 702 |
|
| 703 |
-
def _deduplicate_and_rank(self, clauses: List[ExtractedClause],
|
| 704 |
-
max_clauses: int) -> List[ExtractedClause]:
|
| 705 |
"""
|
| 706 |
Remove duplicates and rank by confidence + legal_bert_score
|
| 707 |
"""
|
|
@@ -709,24 +643,22 @@ class ClauseExtractor:
|
|
| 709 |
return []
|
| 710 |
|
| 711 |
# Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
|
| 712 |
-
clauses.sort(
|
| 713 |
-
key=lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4),
|
| 714 |
-
reverse=True
|
| 715 |
-
)
|
| 716 |
|
| 717 |
# Deduplicate by text similarity
|
| 718 |
-
unique_clauses =
|
| 719 |
-
seen_texts
|
| 720 |
|
| 721 |
for clause in clauses:
|
| 722 |
# Simple deduplication by first 100 chars
|
| 723 |
-
text_key
|
| 724 |
|
| 725 |
# Also check similarity to already added clauses
|
| 726 |
is_duplicate = False
|
|
|
|
| 727 |
for existing in unique_clauses:
|
| 728 |
similarity = self._text_similarity(clause.text, existing.text)
|
| 729 |
-
if similarity > 0.85:
|
| 730 |
is_duplicate = True
|
| 731 |
break
|
| 732 |
|
|
@@ -734,28 +666,31 @@ class ClauseExtractor:
|
|
| 734 |
unique_clauses.append(clause)
|
| 735 |
seen_texts.add(text_key)
|
| 736 |
|
| 737 |
-
if len(unique_clauses) >= max_clauses:
|
| 738 |
break
|
| 739 |
|
| 740 |
return unique_clauses
|
| 741 |
|
|
|
|
| 742 |
def _text_similarity(self, text1: str, text2: str) -> float:
|
| 743 |
-
"""
|
| 744 |
-
|
| 745 |
-
|
|
|
|
|
|
|
| 746 |
|
| 747 |
intersection = len(words1 & words2)
|
| 748 |
-
union
|
| 749 |
|
| 750 |
return intersection / union if union > 0 else 0.0
|
| 751 |
|
| 752 |
-
# =========================================================================
|
| 753 |
-
# UTILITY METHODS
|
| 754 |
-
# =========================================================================
|
| 755 |
|
| 756 |
def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
|
| 757 |
-
"""
|
|
|
|
|
|
|
| 758 |
distribution = defaultdict(int)
|
|
|
|
| 759 |
for clause in clauses:
|
| 760 |
distribution[clause.category] += 1
|
| 761 |
|
|
@@ -763,9 +698,15 @@ class ClauseExtractor:
|
|
| 763 |
|
| 764 |
return dict(distribution)
|
| 765 |
|
|
|
|
| 766 |
def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
|
| 767 |
-
"""
|
|
|
|
|
|
|
| 768 |
risky = [c for c in clauses if c.risk_indicators]
|
| 769 |
-
|
|
|
|
| 770 |
|
| 771 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# DEPENDENCIES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
+
import sys
|
| 4 |
+
import torch
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
+
from typing import Any
|
| 7 |
+
from typing import List
|
| 8 |
+
from typing import Dict
|
| 9 |
+
from typing import Tuple
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
+
from dataclasses import field
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from collections import defaultdict
|
| 15 |
from sentence_transformers import util
|
| 16 |
|
| 17 |
# Import utilities
|
|
|
|
|
|
|
| 18 |
sys.path.append(str(Path(__file__).parent.parent))
|
| 19 |
|
| 20 |
+
from utils.logger import log_info
|
| 21 |
+
from utils.logger import log_error
|
| 22 |
from utils.text_processor import TextProcessor
|
| 23 |
+
from utils.logger import ContractAnalyzerLogger
|
| 24 |
|
| 25 |
|
| 26 |
@dataclass
|
|
|
|
| 28 |
"""
|
| 29 |
Extracted clause with comprehensive metadata
|
| 30 |
"""
|
| 31 |
+
text : str
|
| 32 |
+
reference : str # e.g., "Section 5.2", "Clause 11.1"
|
| 33 |
+
category : str # e.g., "termination", "compensation", "indemnification"
|
| 34 |
+
confidence : float # 0.0-1.0
|
| 35 |
+
start_pos : int
|
| 36 |
+
end_pos : int
|
| 37 |
+
extraction_method : str # "structural", "semantic", "hybrid"
|
| 38 |
+
risk_indicators : List[str] = field(default_factory = list)
|
| 39 |
+
embeddings : Optional[np.ndarray] = None
|
| 40 |
+
subclauses : List[str] = field(default_factory = list)
|
| 41 |
+
legal_bert_score : float = 0.0
|
| 42 |
+
|
| 43 |
|
| 44 |
def to_dict(self) -> Dict[str, Any]:
|
| 45 |
+
"""
|
| 46 |
+
Convert to dictionary for serialization
|
| 47 |
+
"""
|
| 48 |
+
return {"text" : self.text,
|
| 49 |
+
"reference" : self.reference,
|
| 50 |
+
"category" : self.category,
|
| 51 |
+
"confidence" : round(self.confidence, 3),
|
| 52 |
+
"start_pos" : self.start_pos,
|
| 53 |
+
"end_pos" : self.end_pos,
|
| 54 |
+
"extraction_method" : self.extraction_method,
|
| 55 |
+
"risk_indicators" : self.risk_indicators,
|
| 56 |
+
"subclauses" : self.subclauses,
|
| 57 |
+
"legal_bert_score" : round(self.legal_bert_score, 3),
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
|
| 61 |
|
| 62 |
class ClauseExtractor:
|
| 63 |
"""
|
| 64 |
+
Clause extraction using Legal-BERT + structural patterns
|
| 65 |
|
| 66 |
Process:
|
| 67 |
1. Structural extraction (numbered sections like "5.2", "Article III")
|
|
|
|
| 70 |
4. Category classification using Legal-BERT + keyword matching
|
| 71 |
5. Deduplication and ranking
|
| 72 |
"""
|
|
|
|
|
|
|
| 73 |
# CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
|
| 74 |
+
CLAUSE_CATEGORIES = {'compensation' : {'keywords' : ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus', 'commission', 'remuneration', 'fee', 'rate', 'benefits'],
|
| 75 |
+
'representative_text' : ("The Employee shall receive an annual base salary of One Hundred Thousand Dollars payable in accordance with the Company's standard payroll practices. Additional compensation may include performance bonuses and stock options."),
|
| 76 |
+
'weight' : 1.0,
|
| 77 |
+
},
|
| 78 |
+
'termination' : {'keywords' : ['termination', 'terminate', 'notice period', 'resignation', 'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
|
| 79 |
+
'representative_text' : ("Either party may terminate this Agreement upon thirty days written notice. The Company may terminate for cause immediately upon written notice to Employee. Upon termination, Employee shall receive severance compensation."),
|
| 80 |
+
'weight' : 1.2,
|
| 81 |
+
},
|
| 82 |
+
'non_compete' : {'keywords' : ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant', 'competitive', 'competition', 'competing business', 'competitive activities'],
|
| 83 |
+
'representative_text' : ("Employee agrees not to engage in any competitive business activities for a period of twelve months following termination within a fifty-mile radius. Employee shall not solicit Company clients or employees during this period."),
|
| 84 |
+
'weight' : 1.5,
|
| 85 |
+
},
|
| 86 |
+
'confidentiality' : {'keywords' : ['confidential', 'proprietary', 'trade secret', 'disclosure', 'confidentiality', 'secret', 'private', 'non-disclosure'],
|
| 87 |
+
'representative_text' : ("Employee shall maintain the confidentiality of all proprietary information and trade secrets of the Company. Confidential Information includes business plans, customer lists, and technical data. These obligations survive termination."),
|
| 88 |
+
'weight' : 1.1,
|
| 89 |
+
},
|
| 90 |
+
'indemnification' : {'keywords' : ['indemnify', 'indemnification', 'hold harmless', 'defend', 'liability', 'claims', 'losses', 'damages'],
|
| 91 |
+
'representative_text' : ("Party A shall indemnify and hold harmless Party B from any claims, losses, or damages arising from Party A's breach or negligence. This indemnification includes reasonable attorneys' fees and costs of defense."),
|
| 92 |
+
'weight' : 1.3,
|
| 93 |
+
},
|
| 94 |
+
'intellectual_property' : {'keywords' : ['intellectual property', 'ip', 'copyright', 'patent', 'trademark', 'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
|
| 95 |
+
'representative_text' : ("All work product and inventions created by Employee during employment shall be the exclusive property of the Company. Employee assigns all intellectual property rights including patents, copyrights, and trade secrets to the Company."),
|
| 96 |
+
'weight' : 1.2,
|
| 97 |
+
},
|
| 98 |
+
'liability' : {'keywords' : ['liable', 'liability', 'damages', 'limitation', 'consequential', 'indirect', 'punitive', 'cap', 'limited liability'],
|
| 99 |
+
'representative_text' : ("In no event shall either party be liable for indirect, incidental, or consequential damages. Total liability under this Agreement shall not exceed the amounts paid in the twelve months preceding the claim."),
|
| 100 |
+
'weight' : 1.2,
|
| 101 |
+
},
|
| 102 |
+
'warranty' : {'keywords' : ['warranty', 'warrant', 'representation', 'guarantee', 'assurance', 'promise', 'warranties'],
|
| 103 |
+
'representative_text' : ("Company warrants that the Services will be performed in a professional manner. EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."),
|
| 104 |
+
'weight' : 0.9,
|
| 105 |
+
},
|
| 106 |
+
'dispute_resolution' : {'keywords' : ['arbitration', 'mediation', 'dispute', 'jurisdiction', 'governing law', 'venue', 'forum', 'resolution'],
|
| 107 |
+
'representative_text' : ("Any disputes arising under this Agreement shall be resolved through binding arbitration in accordance with the rules of the American Arbitration Association. This Agreement shall be governed by the laws of the State of California."),
|
| 108 |
+
'weight' : 0.9,
|
| 109 |
+
},
|
| 110 |
+
'insurance' : {'keywords' : ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
|
| 111 |
+
'representative_text' : ("Contractor shall maintain general liability insurance with minimum coverage of one million dollars per occurrence. Proof of insurance shall be provided to Client. Company shall be named as additional insured on all policies."),
|
| 112 |
+
'weight' : 0.8,
|
| 113 |
+
},
|
| 114 |
+
'assignment' : {'keywords' : ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
|
| 115 |
+
'representative_text' : ("This Agreement may not be assigned by either party without the prior written consent of the other party. This Agreement shall be binding upon and inure to the benefit of the parties' successors and permitted assigns."),
|
| 116 |
+
'weight' : 0.8,
|
| 117 |
+
},
|
| 118 |
+
'amendment' : {'keywords' : ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
|
| 119 |
+
'representative_text' : ("This Agreement may not be amended or modified except by written instrument signed by both parties. No waiver of any provision shall be effective unless in writing. All modifications must be mutually agreed upon."),
|
| 120 |
+
'weight' : 0.7,
|
| 121 |
+
},
|
| 122 |
+
'force_majeure' : {'keywords' : ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
|
| 123 |
+
'representative_text' : ("Neither party shall be liable for failure to perform due to causes beyond its reasonable control including acts of God, war, strikes, or natural disasters. Performance shall be suspended during the force majeure event."),
|
| 124 |
+
'weight' : 0.7,
|
| 125 |
+
},
|
| 126 |
+
'entire_agreement' : {'keywords' : ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
|
| 127 |
+
'representative_text' : ("This Agreement constitutes the entire agreement between the parties and supersedes all prior agreements, whether written or oral. No other representations or warranties shall be binding unless incorporated herein."),
|
| 128 |
+
'weight' : 0.6,
|
| 129 |
+
},
|
| 130 |
+
'general' : {'keywords' : ['provision', 'term', 'condition', 'obligation', 'requirement'],
|
| 131 |
+
'representative_text' : ("The parties agree to the following terms and conditions governing their relationship. Each party shall perform its obligations in good faith and in accordance with industry standards and applicable law."),
|
| 132 |
+
'weight' : 0.5,
|
| 133 |
+
}
|
| 134 |
+
}
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
# RISK INDICATOR PATTERNS
|
| 137 |
+
RISK_INDICATORS = {'critical' : ['unlimited liability', 'perpetual', 'irrevocable', 'forfeit', 'liquidated damages', 'wage withholding', 'joint and several'],
|
| 138 |
+
'high' : ['non-compete', 'non-solicit', 'penalty', 'without cause', 'sole discretion', 'immediate termination', 'at-will'],
|
| 139 |
+
'medium' : ['indemnify', 'hold harmless', 'confidential', 'proprietary', 'exclusive', 'terminate', 'default', 'breach'],
|
| 140 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
|
| 143 |
# INITIALIZATION
|
| 144 |
+
def __init__(self, model_loader: ModelLoader, contract_category: Optional[str] = None):
|
|
|
|
|
|
|
| 145 |
"""
|
| 146 |
Initialize clause extractor with Legal-BERT
|
| 147 |
|
| 148 |
+
Arguments:
|
| 149 |
+
----------
|
| 150 |
+
model_loader { ModelLoader } : ModelLoader instance for accessing Legal-BERT
|
| 151 |
+
|
| 152 |
+
contract_category { str } : Optional contract category for context-aware extraction
|
| 153 |
"""
|
| 154 |
+
self.model_loader = model_loader
|
| 155 |
+
self.contract_category = contract_category
|
| 156 |
|
| 157 |
# Models (lazy loaded)
|
| 158 |
+
self.legal_bert_model = None
|
| 159 |
self.legal_bert_tokenizer = None
|
| 160 |
+
self.embedding_model = None
|
| 161 |
+
self.device = None
|
| 162 |
|
| 163 |
# Category embeddings (computed from representative texts)
|
| 164 |
+
self.category_embeddings = dict()
|
| 165 |
|
| 166 |
# Text processor
|
| 167 |
+
self.text_processor = TextProcessor(use_spacy = False)
|
| 168 |
|
| 169 |
# Logger
|
| 170 |
+
self.logger = ContractAnalyzerLogger.get_logger()
|
| 171 |
|
| 172 |
# Lazy load
|
| 173 |
self._lazy_load()
|
| 174 |
|
| 175 |
+
|
| 176 |
def _lazy_load(self):
|
| 177 |
+
"""
|
| 178 |
+
Lazy load Legal-BERT and embedding models
|
| 179 |
+
"""
|
| 180 |
if self.legal_bert_model is None:
|
| 181 |
try:
|
| 182 |
log_info("Loading Legal-BERT for clause extraction...")
|
| 183 |
|
| 184 |
# Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
|
| 185 |
self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
|
| 186 |
+
self.device = self.model_loader.device
|
| 187 |
|
| 188 |
# Load sentence transformer for embeddings
|
| 189 |
+
self.embedding_model = self.model_loader.load_embedding_model()
|
| 190 |
|
| 191 |
# Prepare category embeddings using Legal-BERT
|
| 192 |
self._prepare_category_embeddings()
|
|
|
|
| 194 |
log_info("Clause extractor models loaded successfully")
|
| 195 |
|
| 196 |
except Exception as e:
|
| 197 |
+
log_error(e, context = {"component": "ClauseExtractor", "operation": "model_loading"})
|
| 198 |
raise
|
| 199 |
+
|
| 200 |
|
| 201 |
def _prepare_category_embeddings(self):
|
| 202 |
"""
|
| 203 |
Pre-compute Legal-BERT embeddings for category representative texts
|
| 204 |
+
|
| 205 |
This enables semantic similarity matching for clause classification
|
| 206 |
"""
|
| 207 |
log_info("Computing Legal-BERT embeddings for clause categories...")
|
| 208 |
|
| 209 |
for category, config in self.CLAUSE_CATEGORIES.items():
|
| 210 |
+
representative_text = config['representative_text']
|
| 211 |
|
| 212 |
# Get Legal-BERT embedding (using [CLS] token)
|
| 213 |
+
embedding = self._get_legal_bert_embedding(representative_text)
|
| 214 |
+
|
| 215 |
self.category_embeddings[category] = embedding
|
| 216 |
|
| 217 |
log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
|
| 218 |
|
| 219 |
+
|
| 220 |
def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
|
| 221 |
"""
|
| 222 |
Get Legal-BERT embedding for text using [CLS] token
|
| 223 |
|
| 224 |
+
Arguments:
|
| 225 |
+
----------
|
| 226 |
+
text { str } : Input text
|
| 227 |
|
| 228 |
Returns:
|
| 229 |
+
--------
|
| 230 |
+
{ np.ndarray } : Embedding vector as numpy array
|
| 231 |
"""
|
| 232 |
# Tokenize
|
| 233 |
+
inputs = self.legal_bert_tokenizer(text,
|
| 234 |
+
return_tensors = "pt",
|
| 235 |
+
padding = True,
|
| 236 |
+
truncation = True,
|
| 237 |
+
max_length = 512,
|
| 238 |
+
).to(self.device)
|
|
|
|
| 239 |
|
| 240 |
# Get embeddings
|
| 241 |
with torch.no_grad():
|
| 242 |
+
outputs = self.legal_bert_model(**inputs)
|
| 243 |
# Use [CLS] token embedding (first token)
|
| 244 |
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
|
| 245 |
|
| 246 |
return cls_embedding
|
| 247 |
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
+
|
| 250 |
@ContractAnalyzerLogger.log_execution_time("extract_clauses")
def extract_clauses(self, contract_text: str, max_clauses: int = 15) -> List[ExtractedClause]:
    """Extract and classify contract clauses with the hybrid pipeline.

    Pipeline: structural pattern extraction -> semantic chunking of the
    uncovered text -> Legal-BERT classification -> de-duplication and
    confidence ranking.

    Arguments:
    ----------
    contract_text { str } : Full contract text

    max_clauses { int } : Maximum number of clauses to return

    Returns:
    --------
    { list } : List of ExtractedClause objects sorted by confidence
    """
    log_info("Starting clause extraction",
             text_length=len(contract_text),
             contract_category=self.contract_category,
             max_clauses=max_clauses,
             )

    # Stage 1: clauses anchored to explicit legal numbering.
    anchored = self._extract_structural_clauses(contract_text)
    log_info(f"Extracted {len(anchored)} structural clauses")

    # Stage 2: chunk whatever text the structural pass did not cover.
    chunks = self._semantic_chunking(contract_text, anchored)
    log_info(f"Created {len(chunks)} semantic chunks")

    pool = anchored + chunks
    log_info(f"Total candidates: {len(pool)}")

    # Stage 3: hybrid Legal-BERT + keyword classification.
    labelled = self._classify_clauses_with_legal_bert(pool)
    log_info(f"Classified {len(labelled)} clauses")

    # Stage 4: drop near-duplicates and keep the top-ranked clauses.
    ranked = self._deduplicate_and_rank(labelled, max_clauses)
    log_info(f"Final output: {len(ranked)} clauses")

    return ranked
|
| 299 |
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
def _extract_structural_clauses(self, text: str) -> List[Dict]:
|
| 302 |
"""
|
|
|
|
| 308 |
- "Article III. Text"
|
| 309 |
- "Clause 11. Text"
|
| 310 |
"""
|
| 311 |
+
candidates = list()
|
| 312 |
|
| 313 |
# Clean text
|
| 314 |
+
text = re.sub(r'\s+', ' ', text)
|
| 315 |
|
| 316 |
# Patterns for legal numbering
|
| 317 |
+
patterns = [(r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=\d+\.\d+(?:\.\d+)*\.|$)', 'numbered'),
|
| 318 |
+
(r'(Article\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+))\.\s*([^\n]{30,800}?)(?=Article\s+(?:\d+|[IVXLCDM]+)|$)', 'article'),
|
| 319 |
+
(r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
|
| 320 |
+
(r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
|
| 321 |
+
(r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\([a-z]|[ivxlcdm]+\)|\n\n|$)', 'subclause'),
|
| 322 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
for pattern, ref_type in patterns:
|
| 325 |
matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
|
| 326 |
+
|
| 327 |
for match in matches:
|
| 328 |
clause_text = match.group(2).strip()
|
| 329 |
|
|
|
|
| 331 |
if not self._is_boilerplate(clause_text):
|
| 332 |
# Check for meaningful content
|
| 333 |
if self._has_meaningful_content(clause_text):
|
| 334 |
+
candidates.append({'text' : clause_text,
|
| 335 |
+
'reference' : match.group(1).strip(),
|
| 336 |
+
'start' : match.start(),
|
| 337 |
+
'end' : match.end(),
|
| 338 |
+
'type' : 'structural',
|
| 339 |
+
'ref_type' : ref_type,
|
| 340 |
+
})
|
|
|
|
| 341 |
|
| 342 |
# Remove overlapping clauses
|
| 343 |
candidates = self._remove_overlapping(candidates)
|
| 344 |
|
| 345 |
return candidates
|
| 346 |
+
|
| 347 |
|
| 348 |
def _is_boilerplate(self, text: str) -> bool:
|
| 349 |
+
"""
|
| 350 |
+
Check if text is boilerplate/definitional rather than substantive
|
| 351 |
+
"""
|
| 352 |
+
boilerplate_indicators = ['shall mean',
|
| 353 |
+
'means and includes',
|
| 354 |
+
'defined as',
|
| 355 |
+
'definition of',
|
| 356 |
+
'hereinafter referred to',
|
| 357 |
+
'for purposes of this',
|
| 358 |
+
'interpretation of',
|
| 359 |
+
'as used in this',
|
| 360 |
+
'the term',
|
| 361 |
+
'shall include',
|
| 362 |
+
'includes but not limited',
|
| 363 |
+
]
|
| 364 |
+
|
| 365 |
+
text_lower = text.lower()
|
| 366 |
# Must have at least one strong indicator AND be definition-heavy
|
| 367 |
+
has_indicator = any(indicator in text_lower for indicator in boilerplate_indicators)
|
| 368 |
+
is_short_definition = len(text.split()) < 50 and '"' in text
|
| 369 |
|
| 370 |
return has_indicator or is_short_definition
|
| 371 |
|
| 372 |
+
|
| 373 |
def _has_meaningful_content(self, text: str) -> bool:
|
| 374 |
+
"""
|
| 375 |
+
Check if text has meaningful legal content
|
| 376 |
+
"""
|
| 377 |
# Must have minimum length
|
| 378 |
+
if (len(text.split()) < 15):
|
| 379 |
return False
|
| 380 |
|
| 381 |
# Check for legal action verbs
|
| 382 |
+
action_verbs = ['shall',
|
| 383 |
+
'must',
|
| 384 |
+
'will',
|
| 385 |
+
'may',
|
| 386 |
+
'agrees',
|
| 387 |
+
'undertakes',
|
| 388 |
+
'covenants',
|
| 389 |
+
'warrants',
|
| 390 |
+
'represents',
|
| 391 |
+
'acknowledges',
|
| 392 |
+
'certifies',
|
| 393 |
+
'indemnifies',
|
| 394 |
+
'waives',
|
| 395 |
+
'terminates',
|
| 396 |
+
]
|
| 397 |
+
|
| 398 |
+
text_lower = text.lower()
|
| 399 |
+
has_action = any(verb in text_lower for verb in action_verbs)
|
| 400 |
|
| 401 |
# Check for legal subjects
|
| 402 |
+
legal_subjects = ['party',
|
| 403 |
+
'parties',
|
| 404 |
+
'employee',
|
| 405 |
+
'employer',
|
| 406 |
+
'company',
|
| 407 |
+
'contractor',
|
| 408 |
+
'consultant',
|
| 409 |
+
'client',
|
| 410 |
+
'vendor',
|
| 411 |
+
'buyer',
|
| 412 |
+
'seller',
|
| 413 |
+
'landlord',
|
| 414 |
+
'tenant',
|
| 415 |
+
'licensor',
|
| 416 |
+
'licensee',
|
| 417 |
+
]
|
| 418 |
+
|
| 419 |
+
has_subject = any(subj in text_lower for subj in legal_subjects)
|
| 420 |
|
| 421 |
return has_action or has_subject
|
| 422 |
|
| 423 |
+
|
| 424 |
def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
|
| 425 |
+
"""
|
| 426 |
+
Remove overlapping clause extractions
|
| 427 |
+
"""
|
| 428 |
if not candidates:
|
| 429 |
return []
|
| 430 |
|
| 431 |
# Sort by start position
|
| 432 |
+
candidates.sort(key = lambda x: x['start'])
|
| 433 |
|
| 434 |
non_overlapping = [candidates[0]]
|
| 435 |
|
|
|
|
| 437 |
last = non_overlapping[-1]
|
| 438 |
|
| 439 |
# Check if overlaps
|
| 440 |
+
if (candidate['start'] >= last['end']):
|
| 441 |
non_overlapping.append(candidate)
|
| 442 |
+
|
| 443 |
+
elif (len(candidate['text']) > len(last['text'])):
|
| 444 |
# Keep longer clause if overlapping
|
| 445 |
non_overlapping[-1] = candidate
|
| 446 |
|
| 447 |
return non_overlapping
|
| 448 |
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
+
def _semantic_chunking(self, text: str, structural_clauses: List[Dict], chunk_size: int = 200) -> List[Dict]:
|
|
|
|
|
|
|
| 451 |
"""
|
| 452 |
+
Chunk unstructured text semantically uses sentence boundaries to find natural clause boundaries
|
|
|
|
| 453 |
"""
|
|
|
|
| 454 |
# Get covered ranges from structural clauses
|
| 455 |
covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
|
| 456 |
|
| 457 |
# Split into sentences
|
| 458 |
+
sentences = self.text_processor.extract_sentences(text)
|
| 459 |
|
| 460 |
+
chunks = list()
|
| 461 |
+
current_chunk = list()
|
| 462 |
current_length = 0
|
| 463 |
+
current_start = 0
|
| 464 |
|
| 465 |
for sentence in sentences:
|
| 466 |
# Check if sentence is already covered by structural extraction
|
| 467 |
sentence_start = text.find(sentence, current_start)
|
| 468 |
+
if (sentence_start == -1):
|
| 469 |
continue
|
| 470 |
|
| 471 |
if self._is_in_range(sentence_start, covered_ranges):
|
|
|
|
| 476 |
current_length += len(sentence.split())
|
| 477 |
|
| 478 |
# Create chunk when reaching size limit
|
| 479 |
+
if (current_length >= chunk_size):
|
| 480 |
chunk_text = ' '.join(current_chunk).strip()
|
| 481 |
|
| 482 |
+
if (len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text)):
|
| 483 |
if self._has_meaningful_content(chunk_text):
|
| 484 |
+
chunks.append({'text' : chunk_text,
|
| 485 |
+
'reference' : f'Semantic-{len(chunks)+1}',
|
| 486 |
+
'start' : sentence_start,
|
| 487 |
+
'end' : sentence_start + len(chunk_text),
|
| 488 |
+
'type' : 'semantic',
|
| 489 |
+
'ref_type' : 'semantic',
|
| 490 |
+
})
|
|
|
|
| 491 |
|
| 492 |
+
current_chunk = list()
|
| 493 |
current_length = 0
|
| 494 |
|
| 495 |
current_start = sentence_start + len(sentence)
|
|
|
|
| 497 |
# Add final chunk if exists
|
| 498 |
if current_chunk:
|
| 499 |
chunk_text = ' '.join(current_chunk).strip()
|
| 500 |
+
|
| 501 |
+
if ((len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text))):
|
| 502 |
if self._has_meaningful_content(chunk_text):
|
| 503 |
sentence_start = text.find(current_chunk[0])
|
| 504 |
+
chunks.append({'text' : chunk_text,
|
| 505 |
+
'reference' : f'Semantic-{len(chunks)+1}',
|
| 506 |
+
'start' : sentence_start,
|
| 507 |
+
'end' : sentence_start + len(chunk_text),
|
| 508 |
+
'type' : 'semantic',
|
| 509 |
+
'ref_type' : 'semantic',
|
| 510 |
+
})
|
|
|
|
| 511 |
|
| 512 |
return chunks
|
| 513 |
|
| 514 |
+
|
| 515 |
def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
|
| 516 |
+
"""
|
| 517 |
+
Check if position is within any of the ranges
|
| 518 |
+
"""
|
| 519 |
return any(start <= position <= end for start, end in ranges)
|
| 520 |
|
|
|
|
|
|
|
|
|
|
| 521 |
|
| 522 |
def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
    """Turn raw candidate spans into classified ExtractedClause records.

    For each candidate: embed with Legal-BERT, pick a category with the
    hybrid keyword/semantic scorer, then attach risk indicators and any
    lettered sub-clauses found in the text.
    """
    results = []

    for item in candidates:
        body = item['text']

        # Embed once; the vector is reused for scoring and stored on the result.
        embedding = self._get_legal_bert_embedding(body)
        label, score, bert_score = self._classify_single_clause(body, embedding)

        results.append(
            ExtractedClause(
                text=body,
                reference=item['reference'],
                category=label,
                confidence=score,
                start_pos=item['start'],
                end_pos=item['end'],
                extraction_method=item['type'],
                risk_indicators=self._extract_risk_indicators(body),
                embeddings=embedding,
                subclauses=self._extract_subclauses(body),
                legal_bert_score=bert_score,
            )
        )

    return results
|
| 556 |
|
| 557 |
+
|
| 558 |
+
def _classify_single_clause(self, text: str, clause_embedding: np.ndarray) -> Tuple[str, float, float]:
|
| 559 |
"""
|
| 560 |
Classify single clause using Legal-BERT + keyword matching
|
| 561 |
|
| 562 |
Returns:
|
| 563 |
+
--------
|
| 564 |
+
{ tuple } : (category, confidence, legal_bert_score)
|
| 565 |
"""
|
| 566 |
+
text_lower = text.lower()
|
| 567 |
|
| 568 |
+
# Keyword matching
|
| 569 |
+
keyword_scores = dict()
|
| 570 |
+
|
| 571 |
for category, config in self.CLAUSE_CATEGORIES.items():
|
| 572 |
+
keywords = config['keywords']
|
| 573 |
+
weight = config['weight']
|
| 574 |
|
| 575 |
+
keyword_count = sum(1 for kw in keywords if kw in text_lower)
|
| 576 |
keyword_scores[category] = (keyword_count / len(keywords)) * weight
|
| 577 |
|
| 578 |
+
# Legal-BERT semantic similarity
|
| 579 |
+
semantic_scores = dict()
|
| 580 |
clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
|
| 581 |
|
| 582 |
for category, cat_embedding in self.category_embeddings.items():
|
| 583 |
+
cat_embedding_tensor = torch.tensor(cat_embedding).unsqueeze(0)
|
| 584 |
+
similarity = torch.nn.functional.cosine_similarity(clause_embedding_tensor, cat_embedding_tensor).item()
|
|
|
|
|
|
|
|
|
|
| 585 |
semantic_scores[category] = similarity
|
| 586 |
|
| 587 |
# Combine scores (70% semantic, 30% keyword)
|
| 588 |
+
combined_scores = dict()
|
| 589 |
+
|
| 590 |
for category in self.CLAUSE_CATEGORIES.keys():
|
| 591 |
+
combined = (semantic_scores.get(category, 0) * 0.70 + keyword_scores.get(category, 0) * 0.30)
|
|
|
|
|
|
|
|
|
|
| 592 |
combined_scores[category] = combined
|
| 593 |
|
| 594 |
# Get best category
|
| 595 |
+
best_category = max(combined_scores, key = combined_scores.get)
|
| 596 |
+
confidence = combined_scores[best_category]
|
| 597 |
legal_bert_score = semantic_scores[best_category]
|
| 598 |
|
| 599 |
return best_category, confidence, legal_bert_score
|
| 600 |
|
| 601 |
+
|
| 602 |
def _extract_risk_indicators(self, text: str) -> List[str]:
|
| 603 |
+
"""
|
| 604 |
+
Extract risk indicator keywords from clause text
|
| 605 |
+
"""
|
| 606 |
+
text_lower = text.lower()
|
| 607 |
+
found_indicators = dict()
|
| 608 |
|
| 609 |
for severity, indicators in self.RISK_INDICATORS.items():
|
| 610 |
for indicator in indicators:
|
| 611 |
if indicator in text_lower:
|
| 612 |
found_indicators.append(indicator)
|
| 613 |
|
| 614 |
+
# Top 25 risk indicators
|
| 615 |
+
return found_indicators[:25]
|
| 616 |
|
| 617 |
+
|
| 618 |
def _extract_subclauses(self, text: str) -> List[str]:
|
| 619 |
+
"""
|
| 620 |
+
Extract sub-clauses from main clause (e.g., (a), (b), (i), (ii))
|
| 621 |
+
"""
|
| 622 |
# Pattern for sub-clauses: (a), (i), etc.
|
| 623 |
subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\([a-z]|[ivxlcdm]+\)|$)'
|
| 624 |
+
matches = re.findall(subclause_pattern, text, re.IGNORECASE)
|
| 625 |
|
| 626 |
+
subclauses = list()
|
| 627 |
+
|
| 628 |
for ref, subtext in matches:
|
| 629 |
clean_text = subtext.strip()
|
| 630 |
+
|
| 631 |
+
if (len(clean_text) >= 20):
|
| 632 |
subclauses.append(f"({ref}) {clean_text}")
|
| 633 |
|
| 634 |
+
# Max 25 sub-clauses
|
| 635 |
+
return subclauses[:25]
|
| 636 |
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
+
def _deduplicate_and_rank(self, clauses: List[ExtractedClause], max_clauses: int) -> List[ExtractedClause]:
|
|
|
|
| 639 |
"""
|
| 640 |
Remove duplicates and rank by confidence + legal_bert_score
|
| 641 |
"""
|
|
|
|
| 643 |
return []
|
| 644 |
|
| 645 |
# Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
|
| 646 |
+
clauses.sort(key = lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4), reverse = True)
|
|
|
|
|
|
|
|
|
|
| 647 |
|
| 648 |
# Deduplicate by text similarity
|
| 649 |
+
unique_clauses = list()
|
| 650 |
+
seen_texts = set()
|
| 651 |
|
| 652 |
for clause in clauses:
|
| 653 |
# Simple deduplication by first 100 chars
|
| 654 |
+
text_key = clause.text[:100].lower().strip()
|
| 655 |
|
| 656 |
# Also check similarity to already added clauses
|
| 657 |
is_duplicate = False
|
| 658 |
+
|
| 659 |
for existing in unique_clauses:
|
| 660 |
similarity = self._text_similarity(clause.text, existing.text)
|
| 661 |
+
if (similarity > 0.85):
|
| 662 |
is_duplicate = True
|
| 663 |
break
|
| 664 |
|
|
|
|
| 666 |
unique_clauses.append(clause)
|
| 667 |
seen_texts.add(text_key)
|
| 668 |
|
| 669 |
+
if (len(unique_clauses) >= max_clauses):
|
| 670 |
break
|
| 671 |
|
| 672 |
return unique_clauses
|
| 673 |
|
| 674 |
+
|
| 675 |
def _text_similarity(self, text1: str, text2: str) -> float:
|
| 676 |
+
"""
|
| 677 |
+
Calculate text similarity (simple Jaccard similarity)
|
| 678 |
+
"""
|
| 679 |
+
words1 = set(text1.lower().split())
|
| 680 |
+
words2 = set(text2.lower().split())
|
| 681 |
|
| 682 |
intersection = len(words1 & words2)
|
| 683 |
+
union = len(words1 | words2)
|
| 684 |
|
| 685 |
return intersection / union if union > 0 else 0.0
|
| 686 |
|
|
|
|
|
|
|
|
|
|
| 687 |
|
| 688 |
def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
|
| 689 |
+
"""
|
| 690 |
+
Get distribution of clause categories
|
| 691 |
+
"""
|
| 692 |
distribution = defaultdict(int)
|
| 693 |
+
|
| 694 |
for clause in clauses:
|
| 695 |
distribution[clause.category] += 1
|
| 696 |
|
|
|
|
| 698 |
|
| 699 |
return dict(distribution)
|
| 700 |
|
| 701 |
+
|
| 702 |
def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
|
| 703 |
+
"""
|
| 704 |
+
Get clauses with risk indicators
|
| 705 |
+
"""
|
| 706 |
risky = [c for c in clauses if c.risk_indicators]
|
| 707 |
+
|
| 708 |
+
risky.sort(key = lambda x: len(x.risk_indicators), reverse = True)
|
| 709 |
|
| 710 |
+
top_25_risky_clauses = risky[:25]
|
| 711 |
+
|
| 712 |
+
return top_25_risky_clauses
|
services/contract_classifier.py
CHANGED
|
@@ -232,7 +232,7 @@ class ContractClassifier:
|
|
| 232 |
Arguments:
|
| 233 |
----------
|
| 234 |
model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
|
| 235 |
-
|
| 236 |
self.model_loader = model_loader
|
| 237 |
self.embedding_model = None
|
| 238 |
self.legal_bert_model = None
|
|
@@ -294,7 +294,7 @@ class ContractClassifier:
|
|
| 294 |
|
| 295 |
log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
|
| 296 |
|
| 297 |
-
|
| 298 |
# MAIN CLASSIFICATION METHOD
|
| 299 |
@ContractAnalyzerLogger.log_execution_time("classify_contract")
|
| 300 |
def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
|
|
@@ -325,103 +325,99 @@ class ContractClassifier:
|
|
| 325 |
raise ValueError("Contract text too short for classification")
|
| 326 |
|
| 327 |
# Preprocess text (use first 3000 chars for efficiency)
|
| 328 |
-
text_excerpt = contract_text
|
| 329 |
|
| 330 |
log_info("Starting contract classification",
|
| 331 |
-
|
| 332 |
-
|
|
|
|
| 333 |
|
| 334 |
# Step 1: Keyword scoring
|
| 335 |
-
keyword_scores
|
| 336 |
|
| 337 |
# Step 2: Semantic similarity
|
| 338 |
-
semantic_scores
|
| 339 |
|
| 340 |
# Step 3: Legal-BERT enhanced (optional - can be expensive)
|
| 341 |
-
|
| 342 |
|
| 343 |
# Step 4: Combine scores (weighted average)
|
| 344 |
-
combined_scores
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
)
|
| 349 |
|
| 350 |
# Step 5: Get primary category
|
| 351 |
if not combined_scores:
|
| 352 |
log_info("No categories detected, defaulting to 'general'")
|
| 353 |
-
return ContractCategory(
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
)
|
| 360 |
|
| 361 |
-
primary_category
|
| 362 |
-
confidence
|
| 363 |
|
| 364 |
# Step 6: Detect subcategory
|
| 365 |
-
subcategory
|
| 366 |
|
| 367 |
# Step 7: Generate reasoning
|
| 368 |
-
reasoning
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
)
|
| 376 |
|
| 377 |
# Step 8: Extract detected keywords
|
| 378 |
-
detected_keywords
|
| 379 |
-
|
| 380 |
-
# Step 9: Get alternative categories
|
| 381 |
-
alternative_categories = sorted(
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
alternative_categories=alternative_categories
|
| 394 |
-
)
|
| 395 |
|
| 396 |
log_info("Contract classified successfully",
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
|
|
|
| 400 |
|
| 401 |
return result
|
| 402 |
|
| 403 |
-
# =========================================================================
|
| 404 |
-
# SCORING METHODS
|
| 405 |
-
# =========================================================================
|
| 406 |
|
| 407 |
def _score_keywords(self, text_lower: str) -> Dict[str, float]:
|
| 408 |
"""
|
| 409 |
Score each category based on keyword presence
|
| 410 |
|
| 411 |
-
|
| 412 |
-
|
|
|
|
| 413 |
|
| 414 |
Returns:
|
| 415 |
-
|
|
|
|
| 416 |
"""
|
| 417 |
-
scores =
|
| 418 |
|
| 419 |
for category, config in self.CATEGORY_HIERARCHY.items():
|
| 420 |
-
keywords
|
| 421 |
-
weight
|
| 422 |
|
| 423 |
# Count keyword matches
|
| 424 |
-
keyword_count
|
| 425 |
|
| 426 |
# Normalize by number of keywords and apply weight
|
| 427 |
normalized_score = (keyword_count / len(keywords)) * weight
|
|
@@ -430,91 +426,92 @@ class ContractClassifier:
|
|
| 430 |
|
| 431 |
return scores
|
| 432 |
|
|
|
|
| 433 |
def _semantic_similarity(self, text: str) -> Dict[str, float]:
|
| 434 |
"""
|
| 435 |
Calculate semantic similarity to category templates using embeddings
|
| 436 |
|
| 437 |
-
|
| 438 |
-
|
|
|
|
| 439 |
|
| 440 |
Returns:
|
| 441 |
-
|
|
|
|
| 442 |
"""
|
| 443 |
# Encode contract text
|
| 444 |
-
text_embedding = self.embedding_model.encode(text, convert_to_tensor=True)
|
| 445 |
|
| 446 |
# Calculate similarity to each category
|
| 447 |
-
similarities
|
|
|
|
| 448 |
for category, cat_embedding in self.category_embeddings.items():
|
| 449 |
-
similarity
|
| 450 |
similarities[category] = similarity
|
| 451 |
|
| 452 |
return similarities
|
|
|
|
| 453 |
|
| 454 |
def _legal_bert_classification(self, text: str) -> Dict[str, float]:
|
| 455 |
"""
|
| 456 |
Use Legal-BERT for classification (optional - computationally expensive)
|
| 457 |
|
| 458 |
-
|
| 459 |
-
|
|
|
|
| 460 |
|
| 461 |
Returns:
|
| 462 |
-
|
|
|
|
| 463 |
"""
|
| 464 |
-
# This is a placeholder for Legal-BERT classification
|
| 465 |
-
# In production, you'd fine-tune Legal-BERT on labeled contract data
|
| 466 |
-
|
| 467 |
# Tokenize
|
| 468 |
-
inputs = self.legal_bert_tokenizer(
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
).to(self.device)
|
| 475 |
|
| 476 |
# Get embeddings
|
| 477 |
with torch.no_grad():
|
| 478 |
-
outputs
|
| 479 |
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
|
| 480 |
|
| 481 |
-
# For now, return uniform scores (placeholder)
|
| 482 |
-
# In production, you'd use a trained classifier head
|
| 483 |
return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
|
| 484 |
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
|
| 488 |
"""
|
| 489 |
Combine scores from different methods (weighted average)
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
Returns:
|
| 497 |
-
|
|
|
|
| 498 |
"""
|
| 499 |
-
combined
|
| 500 |
|
| 501 |
# Weights for each method
|
| 502 |
-
keyword_weight
|
| 503 |
-
semantic_weight
|
| 504 |
legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
|
| 505 |
|
| 506 |
if legal_bert_scores:
|
| 507 |
# Normalize weights
|
| 508 |
-
total_weight
|
| 509 |
-
keyword_weight
|
| 510 |
-
semantic_weight
|
| 511 |
legal_bert_weight /= total_weight
|
| 512 |
|
| 513 |
for category in self.CATEGORY_HIERARCHY.keys():
|
| 514 |
-
score = (
|
| 515 |
-
keyword_scores.get(category, 0) * keyword_weight +
|
| 516 |
-
semantic_scores.get(category, 0) * semantic_weight
|
| 517 |
-
)
|
| 518 |
|
| 519 |
if legal_bert_scores:
|
| 520 |
score += legal_bert_scores.get(category, 0) * legal_bert_weight
|
|
@@ -523,202 +520,204 @@ class ContractClassifier:
|
|
| 523 |
|
| 524 |
return combined
|
| 525 |
|
| 526 |
-
# =========================================================================
|
| 527 |
-
# SUBCATEGORY DETECTION
|
| 528 |
-
# =========================================================================
|
| 529 |
|
| 530 |
def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
|
| 531 |
"""
|
| 532 |
Detect specific subcategory within primary category
|
| 533 |
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
|
|
|
|
|
|
| 537 |
|
| 538 |
Returns:
|
| 539 |
-
|
|
|
|
| 540 |
"""
|
| 541 |
-
text_lower
|
| 542 |
|
| 543 |
# Get subcategories for this category
|
| 544 |
subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
|
| 545 |
|
| 546 |
# Score each subcategory
|
| 547 |
-
subcat_scores =
|
|
|
|
| 548 |
for subcat in subcategories:
|
| 549 |
if subcat in self.SUBCATEGORY_PATTERNS:
|
| 550 |
-
patterns
|
| 551 |
-
score
|
| 552 |
subcat_scores[subcat] = score
|
| 553 |
|
| 554 |
# Return best match if any
|
| 555 |
-
if subcat_scores and max(subcat_scores.values()) > 0:
|
| 556 |
-
best_subcat = max(subcat_scores, key=subcat_scores.get)
|
| 557 |
log_info(f"Detected subcategory: {best_subcat}",
|
| 558 |
-
|
| 559 |
-
|
|
|
|
|
|
|
| 560 |
return best_subcat
|
| 561 |
|
| 562 |
return None
|
| 563 |
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
def _generate_reasoning(self, contract_text: str, primary_category: str,
|
| 569 |
-
subcategory: Optional[str],
|
| 570 |
-
keyword_scores: Dict[str, float],
|
| 571 |
-
semantic_scores: Dict[str, float],
|
| 572 |
-
combined_scores: Dict[str, float]) -> List[str]:
|
| 573 |
"""
|
| 574 |
Generate human-readable reasoning for classification
|
| 575 |
|
| 576 |
Returns:
|
| 577 |
-
|
|
|
|
| 578 |
"""
|
| 579 |
-
reasoning
|
| 580 |
|
| 581 |
# Primary category reasoning
|
| 582 |
-
keyword_match
|
| 583 |
semantic_match = semantic_scores.get(primary_category, 0)
|
| 584 |
|
| 585 |
-
if keyword_match > 0.5:
|
| 586 |
-
reasoning.append(
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
elif keyword_match > 0.3:
|
| 591 |
-
reasoning.append(
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
f"(similarity: {semantic_match:.2f})"
|
| 605 |
-
)
|
| 606 |
|
| 607 |
# Subcategory reasoning
|
| 608 |
if subcategory:
|
| 609 |
-
reasoning.append(
|
| 610 |
-
f"Specific subcategory identified: {subcategory.replace('_', ' ')}"
|
| 611 |
-
)
|
| 612 |
|
| 613 |
# Alternative categories (if close)
|
| 614 |
-
sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
|
| 615 |
-
|
|
|
|
| 616 |
alt_category, alt_score = sorted_scores[1]
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
|
| 622 |
# If no strong reasoning
|
| 623 |
if not reasoning:
|
| 624 |
reasoning.append("Classification based on general contract structure and terminology")
|
| 625 |
|
| 626 |
return reasoning
|
|
|
|
| 627 |
|
| 628 |
def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
|
| 629 |
"""
|
| 630 |
Extract which specific keywords were found
|
| 631 |
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
|
|
|
|
|
|
| 635 |
|
| 636 |
Returns:
|
| 637 |
-
|
|
|
|
| 638 |
"""
|
| 639 |
text_lower = text.lower()
|
| 640 |
-
keywords
|
| 641 |
|
| 642 |
-
detected
|
| 643 |
-
|
|
|
|
|
|
|
| 644 |
|
| 645 |
-
# =========================================================================
|
| 646 |
-
# MULTI-LABEL CLASSIFICATION
|
| 647 |
-
# =========================================================================
|
| 648 |
|
| 649 |
@ContractAnalyzerLogger.log_execution_time("classify_multi_label")
|
| 650 |
-
def classify_multi_label(self, text: str,
|
| 651 |
-
threshold: float = 0.45) -> List[ContractCategory]:
|
| 652 |
"""
|
| 653 |
-
Classify as multiple categories if applicable
|
| 654 |
-
(e.g., Employment + NDA, Consulting + IP Assignment)
|
| 655 |
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
| 659 |
|
| 660 |
Returns:
|
| 661 |
-
|
|
|
|
| 662 |
"""
|
| 663 |
-
log_info("Starting multi-label classification", threshold=threshold)
|
| 664 |
|
| 665 |
# Get scores
|
| 666 |
-
keyword_scores
|
| 667 |
-
semantic_scores = self._semantic_similarity(text
|
| 668 |
combined_scores = self._combine_scores(keyword_scores, semantic_scores)
|
| 669 |
|
| 670 |
# Get all categories above threshold
|
| 671 |
-
matches
|
|
|
|
| 672 |
for category, score in combined_scores.items():
|
| 673 |
-
if score >= threshold:
|
| 674 |
subcategory = self._detect_subcategory(text, category)
|
| 675 |
-
reasoning
|
| 676 |
-
|
| 677 |
-
keyword_scores, semantic_scores, combined_scores
|
| 678 |
-
)
|
| 679 |
-
keywords = self._extract_detected_keywords(text, category)
|
| 680 |
|
| 681 |
-
matches.append(ContractCategory(
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
|
| 689 |
# Sort by confidence
|
| 690 |
-
matches.sort(key=lambda x: x.confidence, reverse=True)
|
| 691 |
|
| 692 |
log_info(f"Multi-label classification found {len(matches)} categories")
|
| 693 |
|
| 694 |
return matches if matches else [self.classify_contract(text)]
|
| 695 |
|
| 696 |
-
|
| 697 |
-
# UTILITY METHODS
|
| 698 |
-
# =========================================================================
|
| 699 |
-
|
| 700 |
def get_category_description(self, category: str) -> str:
|
| 701 |
-
"""
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
| 716 |
return descriptions.get(category, 'General contract agreement')
|
|
|
|
| 717 |
|
| 718 |
def get_all_categories(self) -> List[str]:
|
| 719 |
-
"""
|
|
|
|
|
|
|
| 720 |
return list(self.CATEGORY_HIERARCHY.keys())
|
| 721 |
|
|
|
|
| 722 |
def get_subcategories(self, category: str) -> List[str]:
|
| 723 |
-
"""
|
| 724 |
-
|
|
|
|
|
|
|
|
|
| 232 |
Arguments:
|
| 233 |
----------
|
| 234 |
model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
|
| 235 |
+
"""
|
| 236 |
self.model_loader = model_loader
|
| 237 |
self.embedding_model = None
|
| 238 |
self.legal_bert_model = None
|
|
|
|
| 294 |
|
| 295 |
log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
|
| 296 |
|
| 297 |
+
|
| 298 |
# MAIN CLASSIFICATION METHOD
|
| 299 |
@ContractAnalyzerLogger.log_execution_time("classify_contract")
|
| 300 |
def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
|
|
|
|
| 325 |
raise ValueError("Contract text too short for classification")
|
| 326 |
|
| 327 |
# Preprocess text (use first 3000 chars for efficiency)
|
| 328 |
+
text_excerpt = contract_text
|
| 329 |
|
| 330 |
log_info("Starting contract classification",
|
| 331 |
+
text_length = len(contract_text),
|
| 332 |
+
excerpt_length = len(text_excerpt),
|
| 333 |
+
)
|
| 334 |
|
| 335 |
# Step 1: Keyword scoring
|
| 336 |
+
keyword_scores = self._score_keywords(contract_text.lower())
|
| 337 |
|
| 338 |
# Step 2: Semantic similarity
|
| 339 |
+
semantic_scores = self._semantic_similarity(text_excerpt)
|
| 340 |
|
| 341 |
# Step 3: Legal-BERT enhanced (optional - can be expensive)
|
| 342 |
+
legal_bert_scores = self._legal_bert_classification(text_excerpt)
|
| 343 |
|
| 344 |
# Step 4: Combine scores (weighted average)
|
| 345 |
+
combined_scores = self._combine_scores(keyword_scores = keyword_scores,
|
| 346 |
+
semantic_scores = semantic_scores,
|
| 347 |
+
legal_bert_scores = legal_bert_scores,
|
| 348 |
+
)
|
|
|
|
| 349 |
|
| 350 |
# Step 5: Get primary category
|
| 351 |
if not combined_scores:
|
| 352 |
log_info("No categories detected, defaulting to 'general'")
|
| 353 |
+
return ContractCategory(category = "general",
|
| 354 |
+
subcategory = None,
|
| 355 |
+
confidence = 0.5,
|
| 356 |
+
reasoning = ["Unable to determine specific contract type"],
|
| 357 |
+
detected_keywords = [],
|
| 358 |
+
)
|
|
|
|
| 359 |
|
| 360 |
+
primary_category = max(combined_scores, key = combined_scores.get)
|
| 361 |
+
confidence = combined_scores[primary_category]
|
| 362 |
|
| 363 |
# Step 6: Detect subcategory
|
| 364 |
+
subcategory = self._detect_subcategory(contract_text, primary_category)
|
| 365 |
|
| 366 |
# Step 7: Generate reasoning
|
| 367 |
+
reasoning = self._generate_reasoning(contract_text = contract_text,
|
| 368 |
+
primary_category = primary_category,
|
| 369 |
+
subcategory = subcategory,
|
| 370 |
+
keyword_scores = keyword_scores,
|
| 371 |
+
semantic_scores = semantic_scores,
|
| 372 |
+
combined_scores = combined_scores,
|
| 373 |
+
)
|
|
|
|
| 374 |
|
| 375 |
# Step 8: Extract detected keywords
|
| 376 |
+
detected_keywords = self._extract_detected_keywords(contract_text, primary_category)
|
| 377 |
+
|
| 378 |
+
# Step 9: Get alternative categories: Top 3 alternatives
|
| 379 |
+
alternative_categories = sorted([(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
|
| 380 |
+
key = lambda x: x[1],
|
| 381 |
+
reverse = True,
|
| 382 |
+
)[:3]
|
| 383 |
+
|
| 384 |
+
result = ContractCategory(category = primary_category,
|
| 385 |
+
subcategory = subcategory,
|
| 386 |
+
confidence = confidence,
|
| 387 |
+
reasoning = reasoning,
|
| 388 |
+
detected_keywords = detected_keywords,
|
| 389 |
+
alternative_categories = alternative_categories,
|
| 390 |
+
)
|
|
|
|
|
|
|
| 391 |
|
| 392 |
log_info("Contract classified successfully",
|
| 393 |
+
category = primary_category,
|
| 394 |
+
subcategory = subcategory,
|
| 395 |
+
confidence = confidence,
|
| 396 |
+
)
|
| 397 |
|
| 398 |
return result
|
| 399 |
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
def _score_keywords(self, text_lower: str) -> Dict[str, float]:
|
| 402 |
"""
|
| 403 |
Score each category based on keyword presence
|
| 404 |
|
| 405 |
+
Arguments:
|
| 406 |
+
----------
|
| 407 |
+
text_lower { str } : Lowercase contract text
|
| 408 |
|
| 409 |
Returns:
|
| 410 |
+
--------
|
| 411 |
+
{ dict } : Dictionary of {category: score}
|
| 412 |
"""
|
| 413 |
+
scores = dict()
|
| 414 |
|
| 415 |
for category, config in self.CATEGORY_HIERARCHY.items():
|
| 416 |
+
keywords = config['keywords']
|
| 417 |
+
weight = config['weight']
|
| 418 |
|
| 419 |
# Count keyword matches
|
| 420 |
+
keyword_count = sum(1 for keyword in keywords if keyword in text_lower)
|
| 421 |
|
| 422 |
# Normalize by number of keywords and apply weight
|
| 423 |
normalized_score = (keyword_count / len(keywords)) * weight
|
|
|
|
| 426 |
|
| 427 |
return scores
|
| 428 |
|
| 429 |
+
|
| 430 |
def _semantic_similarity(self, text: str) -> Dict[str, float]:
|
| 431 |
"""
|
| 432 |
Calculate semantic similarity to category templates using embeddings
|
| 433 |
|
| 434 |
+
Arguments:
|
| 435 |
+
----------
|
| 436 |
+
text { str } : Contract text excerpt
|
| 437 |
|
| 438 |
Returns:
|
| 439 |
+
--------
|
| 440 |
+
{ dict } : Dictionary of {category: similarity_score}
|
| 441 |
"""
|
| 442 |
# Encode contract text
|
| 443 |
+
text_embedding = self.embedding_model.encode(text, convert_to_tensor = True)
|
| 444 |
|
| 445 |
# Calculate similarity to each category
|
| 446 |
+
similarities = dict()
|
| 447 |
+
|
| 448 |
for category, cat_embedding in self.category_embeddings.items():
|
| 449 |
+
similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
|
| 450 |
similarities[category] = similarity
|
| 451 |
|
| 452 |
return similarities
|
| 453 |
+
|
| 454 |
|
| 455 |
def _legal_bert_classification(self, text: str) -> Dict[str, float]:
|
| 456 |
"""
|
| 457 |
Use Legal-BERT for classification (optional - computationally expensive)
|
| 458 |
|
| 459 |
+
Arguments:
|
| 460 |
+
----------
|
| 461 |
+
text { str } : Contract text excerpt
|
| 462 |
|
| 463 |
Returns:
|
| 464 |
+
--------
|
| 465 |
+
{ dict } : Dictionary of {category: score}
|
| 466 |
"""
|
|
|
|
|
|
|
|
|
|
| 467 |
# Tokenize
|
| 468 |
+
inputs = self.legal_bert_tokenizer(text,
|
| 469 |
+
return_tensors = "pt",
|
| 470 |
+
padding = True,
|
| 471 |
+
truncation = True,
|
| 472 |
+
max_length = 512,
|
| 473 |
+
).to(self.device)
|
|
|
|
| 474 |
|
| 475 |
# Get embeddings
|
| 476 |
with torch.no_grad():
|
| 477 |
+
outputs = self.legal_bert_model(**inputs)
|
| 478 |
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
|
| 479 |
|
|
|
|
|
|
|
| 480 |
return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
|
| 481 |
|
| 482 |
+
|
| 483 |
+
def _combine_scores(self, keyword_scores: Dict[str, float], semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
|
|
|
|
| 484 |
"""
|
| 485 |
Combine scores from different methods (weighted average)
|
| 486 |
|
| 487 |
+
Arguments:
|
| 488 |
+
----------
|
| 489 |
+
keyword_scores { dict } : Keyword-based scores
|
| 490 |
+
|
| 491 |
+
semantic_scores { dict } : Semantic similarity scores
|
| 492 |
+
|
| 493 |
+
legal_bert_scores { dict } : Legal-BERT scores (optional)
|
| 494 |
|
| 495 |
Returns:
|
| 496 |
+
--------
|
| 497 |
+
{ dict } : Combined scores dictionary
|
| 498 |
"""
|
| 499 |
+
combined = dict()
|
| 500 |
|
| 501 |
# Weights for each method
|
| 502 |
+
keyword_weight = 0.40
|
| 503 |
+
semantic_weight = 0.60
|
| 504 |
legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
|
| 505 |
|
| 506 |
if legal_bert_scores:
|
| 507 |
# Normalize weights
|
| 508 |
+
total_weight = keyword_weight + semantic_weight + legal_bert_weight
|
| 509 |
+
keyword_weight /= total_weight
|
| 510 |
+
semantic_weight /= total_weight
|
| 511 |
legal_bert_weight /= total_weight
|
| 512 |
|
| 513 |
for category in self.CATEGORY_HIERARCHY.keys():
|
| 514 |
+
score = (keyword_scores.get(category, 0) * keyword_weight + semantic_scores.get(category, 0) * semantic_weight)
|
|
|
|
|
|
|
|
|
|
| 515 |
|
| 516 |
if legal_bert_scores:
|
| 517 |
score += legal_bert_scores.get(category, 0) * legal_bert_weight
|
|
|
|
| 520 |
|
| 521 |
return combined
|
| 522 |
|
|
|
|
|
|
|
|
|
|
| 523 |
|
| 524 |
def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
|
| 525 |
"""
|
| 526 |
Detect specific subcategory within primary category
|
| 527 |
|
| 528 |
+
Arguments:
|
| 529 |
+
----------
|
| 530 |
+
text { str } : Full contract text
|
| 531 |
+
|
| 532 |
+
primary_category { str } : Detected primary category
|
| 533 |
|
| 534 |
Returns:
|
| 535 |
+
--------
|
| 536 |
+
{ str } : Subcategory name or None
|
| 537 |
"""
|
| 538 |
+
text_lower = text.lower()
|
| 539 |
|
| 540 |
# Get subcategories for this category
|
| 541 |
subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
|
| 542 |
|
| 543 |
# Score each subcategory
|
| 544 |
+
subcat_scores = dict()
|
| 545 |
+
|
| 546 |
for subcat in subcategories:
|
| 547 |
if subcat in self.SUBCATEGORY_PATTERNS:
|
| 548 |
+
patterns = self.SUBCATEGORY_PATTERNS[subcat]
|
| 549 |
+
score = sum(1 for pattern in patterns if pattern in text_lower)
|
| 550 |
subcat_scores[subcat] = score
|
| 551 |
|
| 552 |
# Return best match if any
|
| 553 |
+
if (subcat_scores and (max(subcat_scores.values()) > 0)):
|
| 554 |
+
best_subcat = max(subcat_scores, key = subcat_scores.get)
|
| 555 |
log_info(f"Detected subcategory: {best_subcat}",
|
| 556 |
+
category = primary_category,
|
| 557 |
+
score = subcat_scores[best_subcat],
|
| 558 |
+
)
|
| 559 |
+
|
| 560 |
return best_subcat
|
| 561 |
|
| 562 |
return None
|
| 563 |
|
| 564 |
+
|
| 565 |
+
def _generate_reasoning(self, contract_text: str, primary_category: str, subcategory: Optional[str], keyword_scores: Dict[str, float], semantic_scores: Dict[str, float],
|
| 566 |
+
combined_scores: Dict[str, float]) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
"""
|
| 568 |
Generate human-readable reasoning for classification
|
| 569 |
|
| 570 |
Returns:
|
| 571 |
+
--------
|
| 572 |
+
{ list } : List of reasoning statements
|
| 573 |
"""
|
| 574 |
+
reasoning = list()
|
| 575 |
|
| 576 |
# Primary category reasoning
|
| 577 |
+
keyword_match = keyword_scores.get(primary_category, 0)
|
| 578 |
semantic_match = semantic_scores.get(primary_category, 0)
|
| 579 |
|
| 580 |
+
if (keyword_match > 0.5):
|
| 581 |
+
reasoning.append(f"Strong keyword indicators for {primary_category.replace('_', ' ')} category "
|
| 582 |
+
f"({int(keyword_match * 100)}% keyword match)"
|
| 583 |
+
)
|
| 584 |
+
|
| 585 |
+
elif (keyword_match > 0.3):
|
| 586 |
+
reasoning.append(f"Moderate keyword presence for {primary_category.replace('_', ' ')} "
|
| 587 |
+
f"({int(keyword_match * 100)}% keyword match)"
|
| 588 |
+
)
|
| 589 |
+
|
| 590 |
+
if (semantic_match > 0.65):
|
| 591 |
+
reasoning.append(f"Contract language semantically similar to {primary_category.replace('_', ' ')} agreements "
|
| 592 |
+
f"(similarity: {semantic_match:.2f})"
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
elif (semantic_match > 0.50):
|
| 596 |
+
reasoning.append(f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts "
|
| 597 |
+
f"(similarity: {semantic_match:.2f})"
|
| 598 |
+
)
|
|
|
|
|
|
|
| 599 |
|
| 600 |
# Subcategory reasoning
|
| 601 |
if subcategory:
|
| 602 |
+
reasoning.append(f"Specific subcategory identified: {subcategory.replace('_', ' ')}")
|
|
|
|
|
|
|
| 603 |
|
| 604 |
# Alternative categories (if close)
|
| 605 |
+
sorted_scores = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)
|
| 606 |
+
|
| 607 |
+
if ((len(sorted_scores) > 1) and (sorted_scores[1][1] > 0.40)):
|
| 608 |
alt_category, alt_score = sorted_scores[1]
|
| 609 |
+
|
| 610 |
+
reasoning.append(f"Also contains elements of {alt_category.replace('_', ' ')} "
|
| 611 |
+
f"(secondary match: {alt_score:.2f})"
|
| 612 |
+
)
|
| 613 |
|
| 614 |
# If no strong reasoning
|
| 615 |
if not reasoning:
|
| 616 |
reasoning.append("Classification based on general contract structure and terminology")
|
| 617 |
|
| 618 |
return reasoning
|
| 619 |
+
|
| 620 |
|
| 621 |
def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
|
| 622 |
"""
|
| 623 |
Extract which specific keywords were found
|
| 624 |
|
| 625 |
+
Arguments:
|
| 626 |
+
----------
|
| 627 |
+
text { str } : Contract text
|
| 628 |
+
|
| 629 |
+
category { str } : Detected category
|
| 630 |
|
| 631 |
Returns:
|
| 632 |
+
--------
|
| 633 |
+
{ list } : List of detected keywords
|
| 634 |
"""
|
| 635 |
text_lower = text.lower()
|
| 636 |
+
keywords = self.CATEGORY_HIERARCHY[category]['keywords']
|
| 637 |
|
| 638 |
+
detected = [kw for kw in keywords if kw in text_lower]
|
| 639 |
+
|
| 640 |
+
# Top 10 keywords
|
| 641 |
+
return detected[:10]
|
| 642 |
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
@ContractAnalyzerLogger.log_execution_time("classify_multi_label")
|
| 645 |
+
def classify_multi_label(self, text: str, threshold: float = 0.45) -> List[ContractCategory]:
|
|
|
|
| 646 |
"""
|
| 647 |
+
Classify as multiple categories if applicable (e.g., Employment + NDA, Consulting + IP Assignment)
|
|
|
|
| 648 |
|
| 649 |
+
Arguments:
|
| 650 |
+
----------
|
| 651 |
+
text { str } : Contract text
|
| 652 |
+
|
| 653 |
+
threshold { float } : Minimum confidence threshold for multi-label
|
| 654 |
|
| 655 |
Returns:
|
| 656 |
+
--------
|
| 657 |
+
{ list } : List of ContractCategory objects (sorted by confidence)
|
| 658 |
"""
|
| 659 |
+
log_info("Starting multi-label classification", threshold = threshold)
|
| 660 |
|
| 661 |
# Get scores
|
| 662 |
+
keyword_scores = self._score_keywords(text.lower())
|
| 663 |
+
semantic_scores = self._semantic_similarity(text)
|
| 664 |
combined_scores = self._combine_scores(keyword_scores, semantic_scores)
|
| 665 |
|
| 666 |
# Get all categories above threshold
|
| 667 |
+
matches = list()
|
| 668 |
+
|
| 669 |
for category, score in combined_scores.items():
|
| 670 |
+
if (score >= threshold):
|
| 671 |
subcategory = self._detect_subcategory(text, category)
|
| 672 |
+
reasoning = self._generate_reasoning(text, category, subcategory, keyword_scores, semantic_scores, combined_scores)
|
| 673 |
+
keywords = self._extract_detected_keywords(text, category)
|
|
|
|
|
|
|
|
|
|
| 674 |
|
| 675 |
+
matches.append(ContractCategory(category = category,
|
| 676 |
+
subcategory = subcategory,
|
| 677 |
+
confidence = score,
|
| 678 |
+
reasoning = reasoning,
|
| 679 |
+
detected_keywords = keywords,
|
| 680 |
+
)
|
| 681 |
+
)
|
| 682 |
|
| 683 |
# Sort by confidence
|
| 684 |
+
matches.sort(key = lambda x: x.confidence, reverse = True)
|
| 685 |
|
| 686 |
log_info(f"Multi-label classification found {len(matches)} categories")
|
| 687 |
|
| 688 |
return matches if matches else [self.classify_contract(text)]
|
| 689 |
|
| 690 |
+
|
|
|
|
|
|
|
|
|
|
| 691 |
def get_category_description(self, category: str) -> str:
|
| 692 |
+
"""
|
| 693 |
+
Get human-readable description of a category
|
| 694 |
+
"""
|
| 695 |
+
descriptions = {'employment' : 'Employment agreements governing employer-employee relationships',
|
| 696 |
+
'consulting' : 'Consulting and independent contractor agreements',
|
| 697 |
+
'nda' : 'Non-disclosure and confidentiality agreements',
|
| 698 |
+
'technology' : 'Software licensing and technology service agreements',
|
| 699 |
+
'intellectual_property' : 'IP assignment, licensing, and protection agreements',
|
| 700 |
+
'real_estate' : 'Property lease, rental, and purchase agreements',
|
| 701 |
+
'financial' : 'Loan, credit, and financial service agreements',
|
| 702 |
+
'business' : 'Partnership, joint venture, and corporate agreements',
|
| 703 |
+
'sales' : 'Sales, purchase, and distribution agreements',
|
| 704 |
+
'service_agreement' : 'Professional service and maintenance agreements',
|
| 705 |
+
'vendor' : 'Vendor, supplier, and procurement agreements',
|
| 706 |
+
'agency' : 'Agency and representation agreements',
|
| 707 |
+
}
|
| 708 |
+
|
| 709 |
return descriptions.get(category, 'General contract agreement')
|
| 710 |
+
|
| 711 |
|
| 712 |
def get_all_categories(self) -> List[str]:
|
| 713 |
+
"""
|
| 714 |
+
Get list of all supported categories
|
| 715 |
+
"""
|
| 716 |
return list(self.CATEGORY_HIERARCHY.keys())
|
| 717 |
|
| 718 |
+
|
| 719 |
def get_subcategories(self, category: str) -> List[str]:
|
| 720 |
+
"""
|
| 721 |
+
Get subcategories for a specific category
|
| 722 |
+
"""
|
| 723 |
+
return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])
|
static/app.js
DELETED
|
File without changes
|
static/index.html
CHANGED
|
@@ -0,0 +1,1404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>AI Contract Risk Analyzer - Legal Intelligence Platform</title>
|
| 7 |
+
<style>
|
| 8 |
+
* {
|
| 9 |
+
margin: 0;
|
| 10 |
+
padding: 0;
|
| 11 |
+
box-sizing: border-box;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
body {
|
| 15 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
| 16 |
+
background: #ffffff;
|
| 17 |
+
color: #333;
|
| 18 |
+
line-height: 1.6;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
/* Header */
|
| 22 |
+
.header {
|
| 23 |
+
background: white;
|
| 24 |
+
border-bottom: 1px solid #e5e5e5;
|
| 25 |
+
padding: 1rem 2rem;
|
| 26 |
+
display: flex;
|
| 27 |
+
justify-content: space-between;
|
| 28 |
+
align-items: center;
|
| 29 |
+
position: fixed;
|
| 30 |
+
width: 100%;
|
| 31 |
+
top: 0;
|
| 32 |
+
z-index: 1000;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
.logo {
|
| 36 |
+
display: flex;
|
| 37 |
+
align-items: center;
|
| 38 |
+
gap: 0.5rem;
|
| 39 |
+
font-size: 1.25rem;
|
| 40 |
+
font-weight: 600;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
.logo-icon {
|
| 44 |
+
width: 28px;
|
| 45 |
+
height: 28px;
|
| 46 |
+
background: #4169e1;
|
| 47 |
+
border-radius: 6px;
|
| 48 |
+
display: flex;
|
| 49 |
+
align-items: center;
|
| 50 |
+
justify-content: center;
|
| 51 |
+
color: white;
|
| 52 |
+
font-size: 18px;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.subtitle {
|
| 56 |
+
color: #666;
|
| 57 |
+
font-size: 0.9rem;
|
| 58 |
+
font-weight: 400;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.container {
|
| 62 |
+
max-width: 1200px;
|
| 63 |
+
margin: 0 auto;
|
| 64 |
+
padding: 0 2rem;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
/* Landing Page Styles - Updated to match screenshot */
|
| 68 |
+
.landing-screen {
|
| 69 |
+
padding-top: 80px;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.hero-section {
|
| 73 |
+
text-align: center;
|
| 74 |
+
padding: 6rem 0 4rem;
|
| 75 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 76 |
+
color: white;
|
| 77 |
+
margin-bottom: 4rem;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.hero-title {
|
| 81 |
+
font-size: 3rem;
|
| 82 |
+
font-weight: 700;
|
| 83 |
+
margin-bottom: 1.5rem;
|
| 84 |
+
line-height: 1.2;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
.hero-subtitle {
|
| 88 |
+
font-size: 1.3rem;
|
| 89 |
+
margin-bottom: 2.5rem;
|
| 90 |
+
opacity: 0.95;
|
| 91 |
+
max-width: 600px;
|
| 92 |
+
margin-left: auto;
|
| 93 |
+
margin-right: auto;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.cta-button {
|
| 97 |
+
background: white;
|
| 98 |
+
color: #4169e1;
|
| 99 |
+
border: none;
|
| 100 |
+
padding: 1rem 3rem;
|
| 101 |
+
border-radius: 50px;
|
| 102 |
+
font-size: 1.1rem;
|
| 103 |
+
font-weight: 600;
|
| 104 |
+
cursor: pointer;
|
| 105 |
+
transition: all 0.3s ease;
|
| 106 |
+
box-shadow: 0 4px 15px rgba(0,0,0,0.2);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.cta-button:hover {
|
| 110 |
+
transform: translateY(-2px);
|
| 111 |
+
box-shadow: 0 8px 25px rgba(0,0,0,0.3);
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
.section {
|
| 115 |
+
padding: 4rem 0;
|
| 116 |
+
text-align: center;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.section-title {
|
| 120 |
+
font-size: 2.2rem;
|
| 121 |
+
font-weight: 600;
|
| 122 |
+
margin-bottom: 3rem;
|
| 123 |
+
color: #333;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.section-subtitle {
|
| 127 |
+
font-size: 1.2rem;
|
| 128 |
+
color: #666;
|
| 129 |
+
margin-bottom: 3rem;
|
| 130 |
+
max-width: 800px;
|
| 131 |
+
margin-left: auto;
|
| 132 |
+
margin-right: auto;
|
| 133 |
+
line-height: 1.8;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.features-grid {
|
| 137 |
+
display: grid;
|
| 138 |
+
grid-template-columns: repeat(3, 1fr);
|
| 139 |
+
gap: 3rem;
|
| 140 |
+
margin-bottom: 4rem;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.feature-card {
|
| 144 |
+
text-align: center;
|
| 145 |
+
padding: 2rem;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.feature-icon {
|
| 149 |
+
font-size: 3rem;
|
| 150 |
+
margin-bottom: 1.5rem;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
.feature-title {
|
| 154 |
+
font-size: 1.4rem;
|
| 155 |
+
font-weight: 600;
|
| 156 |
+
margin-bottom: 1rem;
|
| 157 |
+
color: #333;
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
.feature-description {
|
| 161 |
+
color: #666;
|
| 162 |
+
line-height: 1.7;
|
| 163 |
+
font-size: 1rem;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.steps-section {
|
| 167 |
+
background: #f8f9fa;
|
| 168 |
+
padding: 5rem 0;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.steps-grid {
|
| 172 |
+
display: grid;
|
| 173 |
+
grid-template-columns: repeat(3, 1fr);
|
| 174 |
+
gap: 3rem;
|
| 175 |
+
margin-top: 3rem;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.step-card {
|
| 179 |
+
text-align: center;
|
| 180 |
+
padding: 2rem;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
.step-number {
|
| 184 |
+
width: 60px;
|
| 185 |
+
height: 60px;
|
| 186 |
+
background: #4169e1;
|
| 187 |
+
color: white;
|
| 188 |
+
border-radius: 50%;
|
| 189 |
+
display: flex;
|
| 190 |
+
align-items: center;
|
| 191 |
+
justify-content: center;
|
| 192 |
+
font-size: 1.5rem;
|
| 193 |
+
font-weight: 700;
|
| 194 |
+
margin: 0 auto 1.5rem;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.step-title {
|
| 198 |
+
font-size: 1.3rem;
|
| 199 |
+
font-weight: 600;
|
| 200 |
+
margin-bottom: 1rem;
|
| 201 |
+
color: #333;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
.step-description {
|
| 205 |
+
color: #666;
|
| 206 |
+
line-height: 1.7;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.footer {
|
| 210 |
+
text-align: center;
|
| 211 |
+
padding: 3rem 2rem;
|
| 212 |
+
color: #999;
|
| 213 |
+
font-size: 0.9rem;
|
| 214 |
+
border-top: 1px solid #e5e5e5;
|
| 215 |
+
background: #f8f9fa;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
/* Analyzer Styles */
|
| 219 |
+
.analyzer-screen {
|
| 220 |
+
display: none;
|
| 221 |
+
padding-top: 80px;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
.hero-section-analyzer {
|
| 225 |
+
text-align: center;
|
| 226 |
+
margin-bottom: 3rem;
|
| 227 |
+
padding: 2rem 0;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.hero-title-analyzer {
|
| 231 |
+
font-size: 2.5rem;
|
| 232 |
+
font-weight: 700;
|
| 233 |
+
margin-bottom: 1rem;
|
| 234 |
+
color: #1a1a1a;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
.hero-description {
|
| 238 |
+
font-size: 1.1rem;
|
| 239 |
+
color: #666;
|
| 240 |
+
margin-bottom: 2rem;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
.upload-card {
|
| 244 |
+
background: white;
|
| 245 |
+
border-radius: 12px;
|
| 246 |
+
padding: 2.5rem;
|
| 247 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
|
| 248 |
+
max-width: 700px;
|
| 249 |
+
margin: 0 auto;
|
| 250 |
+
position: relative;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
.tabs {
|
| 254 |
+
display: flex;
|
| 255 |
+
gap: 1rem;
|
| 256 |
+
border-bottom: 2px solid #e5e5e5;
|
| 257 |
+
margin-bottom: 2rem;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
.tab {
|
| 261 |
+
padding: 0.75rem 1.5rem;
|
| 262 |
+
background: none;
|
| 263 |
+
border: none;
|
| 264 |
+
font-size: 1rem;
|
| 265 |
+
color: #666;
|
| 266 |
+
cursor: pointer;
|
| 267 |
+
border-bottom: 3px solid transparent;
|
| 268 |
+
margin-bottom: -2px;
|
| 269 |
+
transition: all 0.2s;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
.tab.active {
|
| 273 |
+
color: #4169e1;
|
| 274 |
+
border-bottom-color: #4169e1;
|
| 275 |
+
font-weight: 500;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
.tab-content {
|
| 279 |
+
display: none;
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
.tab-content.active {
|
| 283 |
+
display: block;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
.textarea {
|
| 287 |
+
width: 100%;
|
| 288 |
+
min-height: 250px;
|
| 289 |
+
padding: 1rem;
|
| 290 |
+
border: 2px solid #e5e5e5;
|
| 291 |
+
border-radius: 8px;
|
| 292 |
+
font-size: 0.95rem;
|
| 293 |
+
font-family: inherit;
|
| 294 |
+
resize: vertical;
|
| 295 |
+
transition: border-color 0.2s;
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
.textarea:focus {
|
| 299 |
+
outline: none;
|
| 300 |
+
border-color: #4169e1;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
.textarea::placeholder {
|
| 304 |
+
color: #999;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.file-upload-area {
|
| 308 |
+
border: 2px dashed #d0d0d0;
|
| 309 |
+
border-radius: 8px;
|
| 310 |
+
padding: 3rem 2rem;
|
| 311 |
+
text-align: center;
|
| 312 |
+
cursor: pointer;
|
| 313 |
+
transition: all 0.2s;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.file-upload-area:hover {
|
| 317 |
+
border-color: #4169e1;
|
| 318 |
+
background: #f8f9ff;
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
.file-upload-area.dragover {
|
| 322 |
+
border-color: #4169e1;
|
| 323 |
+
background: #f0f4ff;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
.file-input {
|
| 327 |
+
display: none;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.upload-icon {
|
| 331 |
+
font-size: 3rem;
|
| 332 |
+
color: #999;
|
| 333 |
+
margin-bottom: 1rem;
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
.upload-text {
|
| 337 |
+
font-size: 1rem;
|
| 338 |
+
color: #666;
|
| 339 |
+
margin-bottom: 0.5rem;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
.upload-hint {
|
| 343 |
+
font-size: 0.875rem;
|
| 344 |
+
color: #999;
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
.selected-file {
|
| 348 |
+
display: flex;
|
| 349 |
+
align-items: center;
|
| 350 |
+
gap: 1rem;
|
| 351 |
+
padding: 1rem;
|
| 352 |
+
background: #f8f9ff;
|
| 353 |
+
border-radius: 8px;
|
| 354 |
+
margin-top: 1rem;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
.file-icon {
|
| 358 |
+
font-size: 2rem;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.file-info {
|
| 362 |
+
flex: 1;
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.file-name {
|
| 366 |
+
font-weight: 500;
|
| 367 |
+
margin-bottom: 0.25rem;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.file-size {
|
| 371 |
+
font-size: 0.875rem;
|
| 372 |
+
color: #666;
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
.remove-file {
|
| 376 |
+
background: none;
|
| 377 |
+
border: none;
|
| 378 |
+
color: #999;
|
| 379 |
+
cursor: pointer;
|
| 380 |
+
font-size: 1.5rem;
|
| 381 |
+
padding: 0.25rem;
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
.analyze-btn-container {
|
| 385 |
+
display: flex;
|
| 386 |
+
justify-content: center;
|
| 387 |
+
margin-top: 2rem;
|
| 388 |
+
width: 100%;
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
.analyze-btn {
|
| 392 |
+
background: #4169e1;
|
| 393 |
+
color: white;
|
| 394 |
+
border: none;
|
| 395 |
+
padding: 1rem 3rem;
|
| 396 |
+
border-radius: 8px;
|
| 397 |
+
font-size: 1.1rem;
|
| 398 |
+
font-weight: 600;
|
| 399 |
+
cursor: pointer;
|
| 400 |
+
display: flex;
|
| 401 |
+
align-items: center;
|
| 402 |
+
gap: 0.5rem;
|
| 403 |
+
transition: all 0.3s ease;
|
| 404 |
+
min-width: 200px;
|
| 405 |
+
justify-content: center;
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
.analyze-btn:hover {
|
| 409 |
+
background: #3154c5;
|
| 410 |
+
transform: translateY(-2px);
|
| 411 |
+
box-shadow: 0 4px 12px rgba(49, 84, 197, 0.3);
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
.loading-screen {
|
| 415 |
+
display: none;
|
| 416 |
+
text-align: center;
|
| 417 |
+
padding: 4rem 2rem;
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
.loading-screen.active {
|
| 421 |
+
display: block;
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
.spinner {
|
| 425 |
+
width: 80px;
|
| 426 |
+
height: 80px;
|
| 427 |
+
border: 6px solid #e5e5e5;
|
| 428 |
+
border-top-color: #4169e1;
|
| 429 |
+
border-radius: 50%;
|
| 430 |
+
animation: spin 1s linear infinite;
|
| 431 |
+
margin: 0 auto 2rem;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
@keyframes spin {
|
| 435 |
+
to { transform: rotate(360deg); }
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
.loading-title {
|
| 439 |
+
font-size: 1.5rem;
|
| 440 |
+
font-weight: 600;
|
| 441 |
+
margin-bottom: 0.5rem;
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
.loading-text {
|
| 445 |
+
color: #666;
|
| 446 |
+
font-size: 1rem;
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
.results-screen {
|
| 450 |
+
display: none;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.results-screen.active {
|
| 454 |
+
display: block;
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
.back-to-landing {
|
| 458 |
+
background: none;
|
| 459 |
+
border: none;
|
| 460 |
+
color: #4169e1;
|
| 461 |
+
cursor: pointer;
|
| 462 |
+
font-size: 1rem;
|
| 463 |
+
display: flex;
|
| 464 |
+
align-items: center;
|
| 465 |
+
gap: 0.5rem;
|
| 466 |
+
margin-bottom: 2rem;
|
| 467 |
+
padding: 0.5rem 1rem;
|
| 468 |
+
border-radius: 6px;
|
| 469 |
+
transition: background 0.2s;
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
.back-to-landing:hover {
|
| 473 |
+
background: #f8f9ff;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
.api-status {
|
| 477 |
+
text-align: center;
|
| 478 |
+
margin: 1rem 0;
|
| 479 |
+
padding: 1rem;
|
| 480 |
+
border-radius: 8px;
|
| 481 |
+
font-size: 0.9rem;
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
.api-status.connected {
|
| 485 |
+
background: #dcfce7;
|
| 486 |
+
color: #16a34a;
|
| 487 |
+
border: 1px solid #bbf7d0;
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
.api-status.disconnected {
|
| 491 |
+
background: #fee;
|
| 492 |
+
color: #dc2626;
|
| 493 |
+
border: 1px solid #fecaca;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
/* Results screen styles */
|
| 497 |
+
.results-header {
|
| 498 |
+
display: flex;
|
| 499 |
+
justify-content: space-between;
|
| 500 |
+
align-items: center;
|
| 501 |
+
margin-bottom: 2rem;
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
.results-title {
|
| 505 |
+
font-size: 2rem;
|
| 506 |
+
font-weight: 700;
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
.results-actions {
|
| 510 |
+
display: flex;
|
| 511 |
+
gap: 1rem;
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
.btn {
|
| 515 |
+
padding: 0.75rem 1.5rem;
|
| 516 |
+
border-radius: 8px;
|
| 517 |
+
font-size: 0.95rem;
|
| 518 |
+
font-weight: 500;
|
| 519 |
+
cursor: pointer;
|
| 520 |
+
border: none;
|
| 521 |
+
transition: all 0.2s;
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
.btn-primary {
|
| 525 |
+
background: #4169e1;
|
| 526 |
+
color: white;
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
.btn-primary:hover {
|
| 530 |
+
background: #3154c5;
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
.btn-secondary {
|
| 534 |
+
background: white;
|
| 535 |
+
color: #4169e1;
|
| 536 |
+
border: 2px solid #4169e1;
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
.btn-secondary:hover {
|
| 540 |
+
background: #f8f9ff;
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
.results-grid {
|
| 544 |
+
display: grid;
|
| 545 |
+
grid-template-columns: 1fr 2fr;
|
| 546 |
+
gap: 1.5rem;
|
| 547 |
+
margin-bottom: 2rem;
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
.card {
|
| 551 |
+
background: white;
|
| 552 |
+
border-radius: 12px;
|
| 553 |
+
padding: 2rem;
|
| 554 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
|
| 555 |
+
}
|
| 556 |
+
|
| 557 |
+
.card-title {
|
| 558 |
+
font-size: 1.25rem;
|
| 559 |
+
font-weight: 600;
|
| 560 |
+
margin-bottom: 1.5rem;
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
.risk-score-container {
|
| 564 |
+
text-align: center;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
.risk-circle {
|
| 568 |
+
width: 200px;
|
| 569 |
+
height: 200px;
|
| 570 |
+
margin: 0 auto 1rem;
|
| 571 |
+
position: relative;
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
.risk-circle svg {
|
| 575 |
+
transform: rotate(-90deg);
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
.risk-score-value {
|
| 579 |
+
position: absolute;
|
| 580 |
+
top: 50%;
|
| 581 |
+
left: 50%;
|
| 582 |
+
transform: translate(-50%, -50%);
|
| 583 |
+
font-size: 3rem;
|
| 584 |
+
font-weight: 700;
|
| 585 |
+
color: #dc2626;
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
.risk-level {
|
| 589 |
+
display: inline-block;
|
| 590 |
+
padding: 0.5rem 1rem;
|
| 591 |
+
border-radius: 6px;
|
| 592 |
+
font-weight: 600;
|
| 593 |
+
font-size: 0.9rem;
|
| 594 |
+
margin-top: 1rem;
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
.risk-critical {
|
| 598 |
+
background: #fee;
|
| 599 |
+
color: #dc2626;
|
| 600 |
+
}
|
| 601 |
+
|
| 602 |
+
.risk-high {
|
| 603 |
+
background: #fff4e6;
|
| 604 |
+
color: #f97316;
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
.risk-medium {
|
| 608 |
+
background: #fef9c3;
|
| 609 |
+
color: #ca8a04;
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
.risk-low {
|
| 613 |
+
background: #dcfce7;
|
| 614 |
+
color: #16a34a;
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
.executive-summary {
|
| 618 |
+
font-size: 1rem;
|
| 619 |
+
line-height: 1.8;
|
| 620 |
+
color: #444;
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
.three-column-grid {
|
| 624 |
+
display: grid;
|
| 625 |
+
grid-template-columns: repeat(3, 1fr);
|
| 626 |
+
gap: 1.5rem;
|
| 627 |
+
margin-bottom: 2rem;
|
| 628 |
+
}
|
| 629 |
+
|
| 630 |
+
.card-icon {
|
| 631 |
+
font-size: 1.5rem;
|
| 632 |
+
margin-bottom: 0.5rem;
|
| 633 |
+
}
|
| 634 |
+
|
| 635 |
+
.icon-warning { color: #f97316; }
|
| 636 |
+
.icon-shield { color: #dc2626; }
|
| 637 |
+
.icon-book { color: #4169e1; }
|
| 638 |
+
|
| 639 |
+
.item-list {
|
| 640 |
+
list-style: none;
|
| 641 |
+
}
|
| 642 |
+
|
| 643 |
+
.item-list li {
|
| 644 |
+
padding: 0.75rem 0;
|
| 645 |
+
border-bottom: 1px solid #f0f0f0;
|
| 646 |
+
display: flex;
|
| 647 |
+
align-items: flex-start;
|
| 648 |
+
gap: 0.5rem;
|
| 649 |
+
}
|
| 650 |
+
|
| 651 |
+
.item-list li:last-child {
|
| 652 |
+
border-bottom: none;
|
| 653 |
+
}
|
| 654 |
+
|
| 655 |
+
.item-icon {
|
| 656 |
+
color: #4169e1;
|
| 657 |
+
margin-top: 0.25rem;
|
| 658 |
+
}
|
| 659 |
+
|
| 660 |
+
.item-text {
|
| 661 |
+
flex: 1;
|
| 662 |
+
font-size: 0.95rem;
|
| 663 |
+
}
|
| 664 |
+
|
| 665 |
+
.category-breakdown {
|
| 666 |
+
margin-top: 2rem;
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
.category-item {
|
| 670 |
+
margin-bottom: 2rem;
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
.category-header {
|
| 674 |
+
display: flex;
|
| 675 |
+
justify-content: space-between;
|
| 676 |
+
align-items: center;
|
| 677 |
+
margin-bottom: 0.75rem;
|
| 678 |
+
}
|
| 679 |
+
|
| 680 |
+
.category-name {
|
| 681 |
+
font-weight: 600;
|
| 682 |
+
font-size: 1rem;
|
| 683 |
+
}
|
| 684 |
+
|
| 685 |
+
.category-score {
|
| 686 |
+
font-weight: 700;
|
| 687 |
+
font-size: 1.1rem;
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
.score-critical { color: #dc2626; }
|
| 691 |
+
.score-high { color: #f97316; }
|
| 692 |
+
.score-medium { color: #ca8a04; }
|
| 693 |
+
.score-low { color: #16a34a; }
|
| 694 |
+
|
| 695 |
+
.progress-bar {
|
| 696 |
+
height: 8px;
|
| 697 |
+
background: #f0f0f0;
|
| 698 |
+
border-radius: 4px;
|
| 699 |
+
overflow: hidden;
|
| 700 |
+
margin-bottom: 0.5rem;
|
| 701 |
+
}
|
| 702 |
+
|
| 703 |
+
.progress-fill {
|
| 704 |
+
height: 100%;
|
| 705 |
+
transition: width 0.5s ease;
|
| 706 |
+
}
|
| 707 |
+
|
| 708 |
+
.progress-critical { background: #dc2626; }
|
| 709 |
+
.progress-high { background: #f97316; }
|
| 710 |
+
.progress-medium { background: #ca8a04; }
|
| 711 |
+
.progress-low { background: #16a34a; }
|
| 712 |
+
|
| 713 |
+
.category-description {
|
| 714 |
+
font-size: 0.9rem;
|
| 715 |
+
color: #666;
|
| 716 |
+
line-height: 1.6;
|
| 717 |
+
}
|
| 718 |
+
|
| 719 |
+
.clause-analysis {
|
| 720 |
+
margin-top: 2rem;
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.clause-item {
|
| 724 |
+
border: 1px solid #e5e5e5;
|
| 725 |
+
border-left: 4px solid #dc2626;
|
| 726 |
+
border-radius: 8px;
|
| 727 |
+
padding: 1.5rem;
|
| 728 |
+
margin-bottom: 1rem;
|
| 729 |
+
background: white;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
+
.clause-item.high {
|
| 733 |
+
border-left-color: #f97316;
|
| 734 |
+
}
|
| 735 |
+
|
| 736 |
+
.clause-item.medium {
|
| 737 |
+
border-left-color: #ca8a04;
|
| 738 |
+
}
|
| 739 |
+
|
| 740 |
+
.clause-header {
|
| 741 |
+
display: flex;
|
| 742 |
+
justify-content: space-between;
|
| 743 |
+
align-items: flex-start;
|
| 744 |
+
margin-bottom: 1rem;
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
.clause-label {
|
| 748 |
+
font-size: 0.75rem;
|
| 749 |
+
text-transform: uppercase;
|
| 750 |
+
font-weight: 600;
|
| 751 |
+
color: #999;
|
| 752 |
+
margin-bottom: 0.5rem;
|
| 753 |
+
}
|
| 754 |
+
|
| 755 |
+
.clause-text {
|
| 756 |
+
font-size: 0.95rem;
|
| 757 |
+
font-weight: 500;
|
| 758 |
+
color: #333;
|
| 759 |
+
line-height: 1.6;
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
.severity-badge {
|
| 763 |
+
padding: 0.375rem 0.875rem;
|
| 764 |
+
border-radius: 6px;
|
| 765 |
+
font-size: 0.8rem;
|
| 766 |
+
font-weight: 600;
|
| 767 |
+
}
|
| 768 |
+
|
| 769 |
+
.badge-critical {
|
| 770 |
+
background: #fee;
|
| 771 |
+
color: #dc2626;
|
| 772 |
+
}
|
| 773 |
+
|
| 774 |
+
.badge-high {
|
| 775 |
+
background: #fff4e6;
|
| 776 |
+
color: #f97316;
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
.badge-medium {
|
| 780 |
+
background: #fef9c3;
|
| 781 |
+
color: #ca8a04;
|
| 782 |
+
}
|
| 783 |
+
|
| 784 |
+
.clause-section {
|
| 785 |
+
margin-top: 1rem;
|
| 786 |
+
}
|
| 787 |
+
|
| 788 |
+
.clause-section-title {
|
| 789 |
+
font-weight: 600;
|
| 790 |
+
font-size: 0.9rem;
|
| 791 |
+
margin-bottom: 0.5rem;
|
| 792 |
+
color: #333;
|
| 793 |
+
}
|
| 794 |
+
|
| 795 |
+
.clause-section-text {
|
| 796 |
+
font-size: 0.9rem;
|
| 797 |
+
color: #555;
|
| 798 |
+
line-height: 1.7;
|
| 799 |
+
}
|
| 800 |
+
|
| 801 |
+
@media (max-width: 1024px) {
|
| 802 |
+
.features-grid,
|
| 803 |
+
.steps-grid {
|
| 804 |
+
grid-template-columns: 1fr;
|
| 805 |
+
gap: 2rem;
|
| 806 |
+
}
|
| 807 |
+
|
| 808 |
+
.results-grid {
|
| 809 |
+
grid-template-columns: 1fr;
|
| 810 |
+
}
|
| 811 |
+
|
| 812 |
+
.three-column-grid {
|
| 813 |
+
grid-template-columns: 1fr;
|
| 814 |
+
}
|
| 815 |
+
}
|
| 816 |
+
|
| 817 |
+
@media (max-width: 768px) {
|
| 818 |
+
.hero-title {
|
| 819 |
+
font-size: 2.2rem;
|
| 820 |
+
}
|
| 821 |
+
|
| 822 |
+
.hero-title-analyzer {
|
| 823 |
+
font-size: 2rem;
|
| 824 |
+
}
|
| 825 |
+
|
| 826 |
+
.section-title {
|
| 827 |
+
font-size: 1.8rem;
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
.results-header {
|
| 831 |
+
flex-direction: column;
|
| 832 |
+
align-items: flex-start;
|
| 833 |
+
gap: 1rem;
|
| 834 |
+
}
|
| 835 |
+
|
| 836 |
+
.results-actions {
|
| 837 |
+
width: 100%;
|
| 838 |
+
flex-direction: column;
|
| 839 |
+
}
|
| 840 |
+
|
| 841 |
+
.btn {
|
| 842 |
+
width: 100%;
|
| 843 |
+
}
|
| 844 |
+
|
| 845 |
+
.analyze-btn {
|
| 846 |
+
width: 100%;
|
| 847 |
+
padding: 1rem 2rem;
|
| 848 |
+
}
|
| 849 |
+
}
|
| 850 |
+
</style>
|
| 851 |
+
</head>
|
| 852 |
+
<body>
|
| 853 |
+
<!-- Header -->
|
| 854 |
+
<header class="header">
|
| 855 |
+
<div class="logo">
|
| 856 |
+
<div class="logo-icon">✓</div>
|
| 857 |
+
<span>AI Contract Risk Analyzer</span>
|
| 858 |
+
</div>
|
| 859 |
+
<div class="subtitle">Legal Intelligence Platform</div>
|
| 860 |
+
</header>
|
| 861 |
+
|
| 862 |
+
<!-- Landing Screen -->
|
| 863 |
+
<div id="landingScreen" class="landing-screen">
|
| 864 |
+
<!-- Hero Section -->
|
| 865 |
+
<section class="hero-section">
|
| 866 |
+
<div class="container">
|
| 867 |
+
<h1 class="hero-title">Unlock Legal Intelligence<br>Analyze Contracts with AI</h1>
|
| 868 |
+
<p class="hero-subtitle">
|
| 869 |
+
Instantly identify risks, uncover unfavorable terms, and gain actionable negotiation points.
|
| 870 |
+
Our AI-powered platform gives you the clarity and confidence to sign better contracts.
|
| 871 |
+
</p>
|
| 872 |
+
<button class="cta-button" id="getStartedBtn">Try Now for Free</button>
|
| 873 |
+
</div>
|
| 874 |
+
</section>
|
| 875 |
+
|
| 876 |
+
<!-- Main Content Section -->
|
| 877 |
+
<section class="section">
|
| 878 |
+
<div class="container">
|
| 879 |
+
<h2 class="section-title">A Smarter Way to Review Legal Documents</h2>
|
| 880 |
+
<p class="section-subtitle">
|
| 881 |
+
Our platform goes beyond simple keyword searches to provide a deep, contextual understanding of your contracts.
|
| 882 |
+
</p>
|
| 883 |
+
|
| 884 |
+
<div class="features-grid">
|
| 885 |
+
<div class="feature-card">
|
| 886 |
+
<div class="feature-icon">🔍</div>
|
| 887 |
+
<h3 class="feature-title">In-Depth Analysis</h3>
|
| 888 |
+
<p class="feature-description">
|
| 889 |
+
Our AI performs a comprehensive, clause-by-clause review, assessing risk levels and explaining complex legal jargon in plain English.
|
| 890 |
+
</p>
|
| 891 |
+
</div>
|
| 892 |
+
|
| 893 |
+
<div class="feature-card">
|
| 894 |
+
<div class="feature-icon">💡</div>
|
| 895 |
+
<h3 class="feature-title">Actionable Insights</h3>
|
| 896 |
+
<p class="feature-description">
|
| 897 |
+
Receive a prioritized list of negotiation points, suggestions for missing clauses, and clear recommendations to strengthen your position.
|
| 898 |
+
</p>
|
| 899 |
+
</div>
|
| 900 |
+
|
| 901 |
+
<div class="feature-card">
|
| 902 |
+
<div class="feature-icon">🔒</div>
|
| 903 |
+
<h3 class="feature-title">Secure & Confidential</h3>
|
| 904 |
+
<p class="feature-description">
|
| 905 |
+
Your documents are encrypted and processed with the utmost privacy. We never store your contract data after analysis.
|
| 906 |
+
</p>
|
| 907 |
+
</div>
|
| 908 |
+
</div>
|
| 909 |
+
</div>
|
| 910 |
+
</section>
|
| 911 |
+
|
| 912 |
+
<!-- Steps Section -->
|
| 913 |
+
<section class="steps-section">
|
| 914 |
+
<div class="container">
|
| 915 |
+
<h2 class="section-title">Get Your Analysis in 3 Simple Steps</h2>
|
| 916 |
+
|
| 917 |
+
<div class="steps-grid">
|
| 918 |
+
<div class="step-card">
|
| 919 |
+
<div class="step-number">1</div>
|
| 920 |
+
<h3 class="step-title">Upload or Paste</h3>
|
| 921 |
+
<p class="step-description">
|
| 922 |
+
Securely provide your contract by pasting the text or uploading a DOCX/PDF file.
|
| 923 |
+
</p>
|
| 924 |
+
</div>
|
| 925 |
+
|
| 926 |
+
<div class="step-card">
|
| 927 |
+
<div class="step-number">2</div>
|
| 928 |
+
<h3 class="step-title">AI Analyzes</h3>
|
| 929 |
+
<p class="step-description">
|
| 930 |
+
Our intelligent engine scrutinizes every detail of your document in seconds.
|
| 931 |
+
</p>
|
| 932 |
+
</div>
|
| 933 |
+
|
| 934 |
+
<div class="step-card">
|
| 935 |
+
<div class="step-number">3</div>
|
| 936 |
+
<h3 class="step-title">Get Your Report</h3>
|
| 937 |
+
<p class="step-description">
|
| 938 |
+
Receive a comprehensive, easy-to-understand report with your risk score and key findings.
|
| 939 |
+
</p>
|
| 940 |
+
</div>
|
| 941 |
+
</div>
|
| 942 |
+
</div>
|
| 943 |
+
</section>
|
| 944 |
+
|
| 945 |
+
<footer class="footer">
|
| 946 |
+
© 2025 AI Contract Risk Analyzer. For informational purposes only. Not legal advice.
|
| 947 |
+
</footer>
|
| 948 |
+
</div>
|
| 949 |
+
|
| 950 |
+
<!-- Analyzer Screen -->
|
| 951 |
+
<div id="analyzerScreen" class="analyzer-screen">
|
| 952 |
+
<div class="container">
|
| 953 |
+
<button class="back-to-landing" id="backToLandingBtn">
|
| 954 |
+
← Back to Overview
|
| 955 |
+
</button>
|
| 956 |
+
|
| 957 |
+
<div class="hero-section-analyzer">
|
| 958 |
+
<h1 class="hero-title-analyzer">Analyze Your Contract in Seconds</h1>
|
| 959 |
+
<p class="hero-description">Paste your contract or upload a file to get an instant, AI-powered risk assessment.</p>
|
| 960 |
+
</div>
|
| 961 |
+
|
| 962 |
+
<!-- API Status Indicator -->
|
| 963 |
+
<div id="apiStatus" class="api-status" style="display: none;">
|
| 964 |
+
Checking backend connection...
|
| 965 |
+
</div>
|
| 966 |
+
|
| 967 |
+
<div class="upload-card">
|
| 968 |
+
<div class="tabs">
|
| 969 |
+
<button class="tab active" data-tab="paste">Paste Text</button>
|
| 970 |
+
<button class="tab" data-tab="upload">Upload File</button>
|
| 971 |
+
</div>
|
| 972 |
+
|
| 973 |
+
<div id="pasteTab" class="tab-content active">
|
| 974 |
+
<textarea class="textarea" id="contractText" placeholder="Paste your full contract text here..."></textarea>
|
| 975 |
+
</div>
|
| 976 |
+
|
| 977 |
+
<div id="uploadTab" class="tab-content">
|
| 978 |
+
<div class="file-upload-area" id="fileUploadArea">
|
| 979 |
+
<input type="file" id="fileInput" class="file-input" accept=".pdf,.docx,.txt">
|
| 980 |
+
<div class="upload-icon">📄</div>
|
| 981 |
+
<div class="upload-text">Click to upload or drag and drop</div>
|
| 982 |
+
<div class="upload-hint">PDF, DOCX, or TXT files (Max 10MB)</div>
|
| 983 |
+
</div>
|
| 984 |
+
<div id="selectedFile" class="selected-file" style="display: none;">
|
| 985 |
+
<div class="file-icon">📄</div>
|
| 986 |
+
<div class="file-info">
|
| 987 |
+
<div class="file-name" id="fileName"></div>
|
| 988 |
+
<div class="file-size" id="fileSize"></div>
|
| 989 |
+
</div>
|
| 990 |
+
<button class="remove-file" id="removeFile">×</button>
|
| 991 |
+
</div>
|
| 992 |
+
</div>
|
| 993 |
+
|
| 994 |
+
<div class="analyze-btn-container">
|
| 995 |
+
<button class="analyze-btn" id="analyzeBtn">
|
| 996 |
+
<span>🔍</span>
|
| 997 |
+
<span>Analyze Contract</span>
|
| 998 |
+
</button>
|
| 999 |
+
</div>
|
| 1000 |
+
</div>
|
| 1001 |
+
|
| 1002 |
+
<!-- Loading Screen -->
|
| 1003 |
+
<div id="loadingScreen" class="loading-screen">
|
| 1004 |
+
<div class="spinner"></div>
|
| 1005 |
+
<h2 class="loading-title">Performing in-depth analysis...</h2>
|
| 1006 |
+
<p class="loading-text">This may take a moment for large documents.</p>
|
| 1007 |
+
</div>
|
| 1008 |
+
|
| 1009 |
+
<!-- Results Screen -->
|
| 1010 |
+
<div id="resultsScreen" class="results-screen">
|
| 1011 |
+
<div class="results-header">
|
| 1012 |
+
<h1 class="results-title">Analysis Report</h1>
|
| 1013 |
+
<div class="results-actions">
|
| 1014 |
+
<button class="btn btn-primary" id="downloadBtn">📥 Download PDF Report</button>
|
| 1015 |
+
<button class="btn btn-secondary" id="analyzeAnotherBtn">Analyze Another Contract</button>
|
| 1016 |
+
</div>
|
| 1017 |
+
</div>
|
| 1018 |
+
|
| 1019 |
+
<div class="results-grid">
|
| 1020 |
+
<div class="card">
|
| 1021 |
+
<h2 class="card-title">Overall Risk Score</h2>
|
| 1022 |
+
<div class="risk-score-container">
|
| 1023 |
+
<div class="risk-circle">
|
| 1024 |
+
<svg width="200" height="200">
|
| 1025 |
+
<circle cx="100" cy="100" r="85" fill="none" stroke="#f0f0f0" stroke-width="20"/>
|
| 1026 |
+
<circle id="riskCircle" cx="100" cy="100" r="85" fill="none" stroke="#dc2626" stroke-width="20" stroke-dasharray="534" stroke-dashoffset="534" stroke-linecap="round"/>
|
| 1027 |
+
</svg>
|
| 1028 |
+
<div class="risk-score-value" id="riskScoreValue">0</div>
|
| 1029 |
+
</div>
|
| 1030 |
+
<div class="risk-level" id="riskLevel">NO RISK</div>
|
| 1031 |
+
</div>
|
| 1032 |
+
</div>
|
| 1033 |
+
|
| 1034 |
+
<div class="card">
|
| 1035 |
+
<h2 class="card-title">Executive Summary</h2>
|
| 1036 |
+
<p class="executive-summary" id="executiveSummary">
|
| 1037 |
+
Analysis results will appear here...
|
| 1038 |
+
</p>
|
| 1039 |
+
</div>
|
| 1040 |
+
</div>
|
| 1041 |
+
|
| 1042 |
+
<div class="three-column-grid">
|
| 1043 |
+
<div class="card">
|
| 1044 |
+
<div class="card-icon icon-warning">⚠️</div>
|
| 1045 |
+
<h3 class="card-title">Unfavorable Terms</h3>
|
| 1046 |
+
<ul class="item-list" id="unfavorableTermsList">
|
| 1047 |
+
<li>No unfavorable terms detected yet</li>
|
| 1048 |
+
</ul>
|
| 1049 |
+
</div>
|
| 1050 |
+
|
| 1051 |
+
<div class="card">
|
| 1052 |
+
<div class="card-icon icon-shield">🛡️</div>
|
| 1053 |
+
<h3 class="card-title">Missing Protections</h3>
|
| 1054 |
+
<ul class="item-list" id="missingProtectionsList">
|
| 1055 |
+
<li>No missing protections detected yet</li>
|
| 1056 |
+
</ul>
|
| 1057 |
+
</div>
|
| 1058 |
+
|
| 1059 |
+
<div class="card">
|
| 1060 |
+
<div class="card-icon icon-book">📖</div>
|
| 1061 |
+
<h3 class="card-title">Negotiation Points</h3>
|
| 1062 |
+
<ul class="item-list" id="negotiationPointsList">
|
| 1063 |
+
<li>No negotiation points generated yet</li>
|
| 1064 |
+
</ul>
|
| 1065 |
+
</div>
|
| 1066 |
+
</div>
|
| 1067 |
+
|
| 1068 |
+
<div class="card category-breakdown">
|
| 1069 |
+
<h2 class="card-title">Risk Category Breakdown</h2>
|
| 1070 |
+
<div id="categoryBreakdown">
|
| 1071 |
+
<div class="category-item">
|
| 1072 |
+
<div class="category-header">
|
| 1073 |
+
<span class="category-name">Waiting for analysis...</span>
|
| 1074 |
+
<span class="category-score score-low">0/100</span>
|
| 1075 |
+
</div>
|
| 1076 |
+
<div class="progress-bar">
|
| 1077 |
+
<div class="progress-fill progress-low" style="width: 0%"></div>
|
| 1078 |
+
</div>
|
| 1079 |
+
</div>
|
| 1080 |
+
</div>
|
| 1081 |
+
</div>
|
| 1082 |
+
|
| 1083 |
+
<div class="card clause-analysis">
|
| 1084 |
+
<h2 class="card-title">Clause-by-Clause Analysis</h2>
|
| 1085 |
+
<div id="clauseAnalysis">
|
| 1086 |
+
<div class="clause-item">
|
| 1087 |
+
<div class="clause-header">
|
| 1088 |
+
<div>
|
| 1089 |
+
<div class="clause-label">STATUS</div>
|
| 1090 |
+
<div class="clause-text">Upload a contract to begin analysis</div>
|
| 1091 |
+
</div>
|
| 1092 |
+
</div>
|
| 1093 |
+
</div>
|
| 1094 |
+
</div>
|
| 1095 |
+
</div>
|
| 1096 |
+
</div>
|
| 1097 |
+
</div>
|
| 1098 |
+
</div>
|
| 1099 |
+
|
| 1100 |
+
<script>
|
| 1101 |
+
const API_BASE_URL = window.location.hostname === 'localhost'
|
| 1102 |
+
? 'http://localhost:8000/api/v1'
|
| 1103 |
+
: '/api/v1';
|
| 1104 |
+
|
| 1105 |
+
let selectedFile = null;
|
| 1106 |
+
let currentJobId = null;
|
| 1107 |
+
let pollInterval = null;
|
| 1108 |
+
|
| 1109 |
+
// Screen management
|
| 1110 |
+
function showScreen(screenName) {
|
| 1111 |
+
document.getElementById('landingScreen').style.display = 'none';
|
| 1112 |
+
document.getElementById('analyzerScreen').style.display = 'none';
|
| 1113 |
+
document.getElementById('loadingScreen').classList.remove('active');
|
| 1114 |
+
document.getElementById('resultsScreen').classList.remove('active');
|
| 1115 |
+
|
| 1116 |
+
if (screenName === 'landing') {
|
| 1117 |
+
document.getElementById('landingScreen').style.display = 'block';
|
| 1118 |
+
} else if (screenName === 'analyzer') {
|
| 1119 |
+
document.getElementById('analyzerScreen').style.display = 'block';
|
| 1120 |
+
checkBackendConnection();
|
| 1121 |
+
} else if (screenName === 'loading') {
|
| 1122 |
+
document.getElementById('analyzerScreen').style.display = 'block';
|
| 1123 |
+
document.getElementById('loadingScreen').classList.add('active');
|
| 1124 |
+
} else if (screenName === 'results') {
|
| 1125 |
+
document.getElementById('analyzerScreen').style.display = 'block';
|
| 1126 |
+
document.getElementById('resultsScreen').classList.add('active');
|
| 1127 |
+
}
|
| 1128 |
+
}
|
| 1129 |
+
|
| 1130 |
+
// Check backend connection
|
| 1131 |
+
async function checkBackendConnection() {
|
| 1132 |
+
const statusElement = document.getElementById('apiStatus');
|
| 1133 |
+
statusElement.style.display = 'block';
|
| 1134 |
+
statusElement.textContent = 'Checking backend connection...';
|
| 1135 |
+
statusElement.className = 'api-status';
|
| 1136 |
+
|
| 1137 |
+
try {
|
| 1138 |
+
const response = await fetch(`${API_BASE_URL}/health`, {
|
| 1139 |
+
method: 'GET',
|
| 1140 |
+
headers: {
|
| 1141 |
+
'Accept': 'application/json'
|
| 1142 |
+
}
|
| 1143 |
+
});
|
| 1144 |
+
|
| 1145 |
+
if (response.ok) {
|
| 1146 |
+
statusElement.textContent = '✓ Backend connected successfully';
|
| 1147 |
+
statusElement.className = 'api-status connected';
|
| 1148 |
+
} else {
|
| 1149 |
+
throw new Error('Backend not responding properly');
|
| 1150 |
+
}
|
| 1151 |
+
} catch (error) {
|
| 1152 |
+
console.error('Backend connection failed:', error);
|
| 1153 |
+
statusElement.textContent = '✗ Cannot connect to backend. Make sure the server is running on port 8000.';
|
| 1154 |
+
statusElement.className = 'api-status disconnected';
|
| 1155 |
+
|
| 1156 |
+
setTimeout(() => {
|
| 1157 |
+
statusElement.style.display = 'none';
|
| 1158 |
+
}, 5000);
|
| 1159 |
+
}
|
| 1160 |
+
}
|
| 1161 |
+
|
| 1162 |
+
// Navigation
|
| 1163 |
+
document.getElementById('getStartedBtn').addEventListener('click', () => {
|
| 1164 |
+
showScreen('analyzer');
|
| 1165 |
+
});
|
| 1166 |
+
|
| 1167 |
+
document.getElementById('backToLandingBtn').addEventListener('click', () => {
|
| 1168 |
+
showScreen('landing');
|
| 1169 |
+
});
|
| 1170 |
+
|
| 1171 |
+
// Tab switching
|
| 1172 |
+
document.querySelectorAll('.tab').forEach(tab => {
|
| 1173 |
+
tab.addEventListener('click', (e) => {
|
| 1174 |
+
e.preventDefault();
|
| 1175 |
+
const tabName = tab.dataset.tab;
|
| 1176 |
+
|
| 1177 |
+
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
|
| 1178 |
+
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
|
| 1179 |
+
|
| 1180 |
+
tab.classList.add('active');
|
| 1181 |
+
document.getElementById(tabName + 'Tab').classList.add('active');
|
| 1182 |
+
});
|
| 1183 |
+
});
|
| 1184 |
+
|
| 1185 |
+
// File upload handling
|
| 1186 |
+
const fileUploadArea = document.getElementById('fileUploadArea');
|
| 1187 |
+
const fileInput = document.getElementById('fileInput');
|
| 1188 |
+
const selectedFileDiv = document.getElementById('selectedFile');
|
| 1189 |
+
const fileNameSpan = document.getElementById('fileName');
|
| 1190 |
+
const fileSizeSpan = document.getElementById('fileSize');
|
| 1191 |
+
const removeFileBtn = document.getElementById('removeFile');
|
| 1192 |
+
|
| 1193 |
+
fileUploadArea.addEventListener('click', () => fileInput.click());
|
| 1194 |
+
|
| 1195 |
+
fileUploadArea.addEventListener('dragover', (e) => {
|
| 1196 |
+
e.preventDefault();
|
| 1197 |
+
fileUploadArea.classList.add('dragover');
|
| 1198 |
+
});
|
| 1199 |
+
|
| 1200 |
+
fileUploadArea.addEventListener('dragleave', () => {
|
| 1201 |
+
fileUploadArea.classList.remove('dragover');
|
| 1202 |
+
});
|
| 1203 |
+
|
| 1204 |
+
fileUploadArea.addEventListener('drop', (e) => {
|
| 1205 |
+
e.preventDefault();
|
| 1206 |
+
fileUploadArea.classList.remove('dragover');
|
| 1207 |
+
const file = e.dataTransfer.files[0];
|
| 1208 |
+
handleFileSelect(file);
|
| 1209 |
+
});
|
| 1210 |
+
|
| 1211 |
+
fileInput.addEventListener('change', (e) => {
|
| 1212 |
+
const file = e.target.files[0];
|
| 1213 |
+
handleFileSelect(file);
|
| 1214 |
+
});
|
| 1215 |
+
|
| 1216 |
+
removeFileBtn.addEventListener('click', (e) => {
|
| 1217 |
+
e.stopPropagation();
|
| 1218 |
+
selectedFile = null;
|
| 1219 |
+
fileInput.value = '';
|
| 1220 |
+
selectedFileDiv.style.display = 'none';
|
| 1221 |
+
fileUploadArea.style.display = 'block';
|
| 1222 |
+
});
|
| 1223 |
+
|
| 1224 |
+
function handleFileSelect(file) {
|
| 1225 |
+
if (!file) return;
|
| 1226 |
+
|
| 1227 |
+
const validTypes = [
|
| 1228 |
+
'application/pdf',
|
| 1229 |
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
| 1230 |
+
'text/plain'
|
| 1231 |
+
];
|
| 1232 |
+
|
| 1233 |
+
const isValidType = validTypes.includes(file.type) ||
|
| 1234 |
+
file.name.match(/\.(pdf|docx|txt)$/i);
|
| 1235 |
+
|
| 1236 |
+
if (!isValidType) {
|
| 1237 |
+
alert('Please upload a PDF, DOCX, or TXT file');
|
| 1238 |
+
return;
|
| 1239 |
+
}
|
| 1240 |
+
|
| 1241 |
+
if (file.size > 10 * 1024 * 1024) {
|
| 1242 |
+
alert('File size must be less than 10MB');
|
| 1243 |
+
return;
|
| 1244 |
+
}
|
| 1245 |
+
|
| 1246 |
+
selectedFile = file;
|
| 1247 |
+
fileNameSpan.textContent = file.name;
|
| 1248 |
+
fileSizeSpan.textContent = formatFileSize(file.size);
|
| 1249 |
+
fileUploadArea.style.display = 'none';
|
| 1250 |
+
selectedFileDiv.style.display = 'flex';
|
| 1251 |
+
}
|
| 1252 |
+
|
| 1253 |
+
function formatFileSize(bytes) {
|
| 1254 |
+
if (bytes < 1024) return bytes + ' B';
|
| 1255 |
+
if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(2) + ' KB';
|
| 1256 |
+
return (bytes / (1024 * 1024)).toFixed(2) + ' MB';
|
| 1257 |
+
}
|
| 1258 |
+
|
| 1259 |
+
// Analyze button
|
| 1260 |
+
document.getElementById('analyzeBtn').addEventListener('click', async () => {
|
| 1261 |
+
const activeTab = document.querySelector('.tab.active').dataset.tab;
|
| 1262 |
+
const analyzeBtn = document.getElementById('analyzeBtn');
|
| 1263 |
+
|
| 1264 |
+
try {
|
| 1265 |
+
analyzeBtn.disabled = true;
|
| 1266 |
+
analyzeBtn.innerHTML = '<span>⏳</span><span>Processing...</span>';
|
| 1267 |
+
|
| 1268 |
+
if (activeTab === 'paste') {
|
| 1269 |
+
const text = document.getElementById('contractText').value.trim();
|
| 1270 |
+
if (!text) {
|
| 1271 |
+
alert('Please paste contract text');
|
| 1272 |
+
return;
|
| 1273 |
+
}
|
| 1274 |
+
const blob = new Blob([text], { type: 'text/plain' });
|
| 1275 |
+
const file = new File([blob], 'contract.txt', { type: 'text/plain' });
|
| 1276 |
+
await analyzeContract(file);
|
| 1277 |
+
} else {
|
| 1278 |
+
if (!selectedFile) {
|
| 1279 |
+
alert('Please select a file');
|
| 1280 |
+
return;
|
| 1281 |
+
}
|
| 1282 |
+
await analyzeContract(selectedFile);
|
| 1283 |
+
}
|
| 1284 |
+
} catch (error) {
|
| 1285 |
+
console.error('Analysis error:', error);
|
| 1286 |
+
alert('Error starting analysis: ' + error.message);
|
| 1287 |
+
} finally {
|
| 1288 |
+
analyzeBtn.disabled = false;
|
| 1289 |
+
analyzeBtn.innerHTML = '<span>🔍</span><span>Analyze Contract</span>';
|
| 1290 |
+
}
|
| 1291 |
+
});
|
| 1292 |
+
|
| 1293 |
+
async function analyzeContract(file) {
|
| 1294 |
+
try {
|
| 1295 |
+
showScreen('loading');
|
| 1296 |
+
|
| 1297 |
+
const formData = new FormData();
|
| 1298 |
+
formData.append('file', file);
|
| 1299 |
+
formData.append('max_clauses', '15');
|
| 1300 |
+
formData.append('interpret_clauses', 'true');
|
| 1301 |
+
formData.append('generate_negotiation_points', 'true');
|
| 1302 |
+
formData.append('compare_to_market', 'true');
|
| 1303 |
+
formData.append('llm_provider', 'ollama');
|
| 1304 |
+
|
| 1305 |
+
const response = await fetch(`${API_BASE_URL}/analyze`, {
|
| 1306 |
+
method: 'POST',
|
| 1307 |
+
body: formData
|
| 1308 |
+
});
|
| 1309 |
+
|
| 1310 |
+
if (!response.ok) {
|
| 1311 |
+
let errorDetail = 'Analysis failed';
|
| 1312 |
+
try {
|
| 1313 |
+
const errorData = await response.json();
|
| 1314 |
+
errorDetail = errorData.detail || errorData.error || errorDetail;
|
| 1315 |
+
} catch (e) {
|
| 1316 |
+
errorDetail = `Server error: ${response.status} ${response.statusText}`;
|
| 1317 |
+
}
|
| 1318 |
+
throw new Error(errorDetail);
|
| 1319 |
+
}
|
| 1320 |
+
|
| 1321 |
+
const job = await response.json();
|
| 1322 |
+
currentJobId = job.job_id;
|
| 1323 |
+
|
| 1324 |
+
pollInterval = setInterval(() => pollJobStatus(currentJobId), 2000);
|
| 1325 |
+
|
| 1326 |
+
} catch (error) {
|
| 1327 |
+
console.error('Error:', error);
|
| 1328 |
+
alert('Error analyzing contract: ' + error.message);
|
| 1329 |
+
showScreen('analyzer');
|
| 1330 |
+
}
|
| 1331 |
+
}
|
| 1332 |
+
|
| 1333 |
+
async function pollJobStatus(jobId) {
|
| 1334 |
+
try {
|
| 1335 |
+
const response = await fetch(`${API_BASE_URL}/jobs/${jobId}`);
|
| 1336 |
+
if (!response.ok) throw new Error('Failed to fetch job status');
|
| 1337 |
+
|
| 1338 |
+
const job = await response.json();
|
| 1339 |
+
|
| 1340 |
+
if (job.status === 'completed') {
|
| 1341 |
+
clearInterval(pollInterval);
|
| 1342 |
+
displayResults(job.result);
|
| 1343 |
+
showScreen('results');
|
| 1344 |
+
} else if (job.status === 'failed') {
|
| 1345 |
+
clearInterval(pollInterval);
|
| 1346 |
+
alert('Analysis failed: ' + job.error);
|
| 1347 |
+
showScreen('analyzer');
|
| 1348 |
+
}
|
| 1349 |
+
} catch (error) {
|
| 1350 |
+
console.error('Polling error:', error);
|
| 1351 |
+
}
|
| 1352 |
+
}
|
| 1353 |
+
|
| 1354 |
+
function displayResults(result) {
    // Render the analysis result object onto the results screen.
    const score = result.risk_analysis.overall_score;
    const riskLevel = result.risk_analysis.risk_level;

    document.getElementById('riskScoreValue').textContent = score;
    document.getElementById('riskLevel').textContent = riskLevel.toUpperCase();
    document.getElementById('riskLevel').className = 'risk-level risk-' + getRiskClass(score);

    // SVG progress ring: 534 is the full circumference of the risk circle.
    const circumference = 534;
    const offset = circumference - (score / 100) * circumference;
    const circle = document.getElementById('riskCircle');
    circle.style.strokeDashoffset = offset;
    circle.style.stroke = getRiskColor(score);

    document.getElementById('executiveSummary').textContent = result.executive_summary;

    // Update other result sections...
    const unfavorableList = document.getElementById('unfavorableTermsList');
    unfavorableList.innerHTML = '';
    if (result.unfavorable_terms && result.unfavorable_terms.length > 0) {
        result.unfavorable_terms.slice(0, 8).forEach(term => {
            // SECURITY FIX: term.term / term.explanation come from the server
            // response and were previously interpolated into innerHTML, which
            // allows HTML/script injection (XSS). Build DOM nodes and assign
            // text via textContent instead so the strings are always literal.
            const li = document.createElement('li');

            const icon = document.createElement('span');
            icon.className = 'item-icon';
            icon.textContent = '›';

            const textSpan = document.createElement('span');
            textSpan.className = 'item-text';

            const strong = document.createElement('strong');
            strong.textContent = term.term + ':';

            textSpan.appendChild(strong);
            textSpan.appendChild(document.createTextNode(' ' + term.explanation));

            li.appendChild(icon);
            li.appendChild(textSpan);
            unfavorableList.appendChild(li);
        });
    } else {
        // Static markup only — safe to assign via innerHTML.
        unfavorableList.innerHTML = '<li>No unfavorable terms detected</li>';
    }

    // Similar updates for other sections...
}
|
| 1385 |
+
|
| 1386 |
+
function getRiskClass(score) {
    // Map a 0-100 risk score onto a CSS class suffix.
    const bands = [[80, 'critical'], [60, 'high'], [40, 'medium']];
    for (const [threshold, label] of bands) {
        if (score >= threshold) {
            return label;
        }
    }
    return 'low';
}
|
| 1392 |
+
|
| 1393 |
+
function getRiskColor(score) {
    // Colour for the risk ring; thresholds mirror getRiskClass().
    if (score >= 80) {
        return '#dc2626'; // critical — red
    }
    if (score >= 60) {
        return '#f97316'; // high — orange
    }
    return score >= 40 ? '#ca8a04' : '#16a34a'; // medium — yellow / low — green
}
|
| 1399 |
+
|
| 1400 |
+
// Initialize
|
| 1401 |
+
showScreen('landing');
|
| 1402 |
+
</script>
|
| 1403 |
+
</body>
|
| 1404 |
+
</html>
|
static/style.css
DELETED
|
File without changes
|
utils/text_processor.py
CHANGED
|
@@ -54,13 +54,17 @@ class TextProcessor:
|
|
| 54 |
"""
|
| 55 |
Normalize text for analysis
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
-
|
|
|
|
| 64 |
"""
|
| 65 |
if lowercase:
|
| 66 |
text = text.lower()
|
|
@@ -74,17 +78,21 @@ class TextProcessor:
|
|
| 74 |
|
| 75 |
return text.strip()
|
| 76 |
|
|
|
|
| 77 |
@staticmethod
|
| 78 |
def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
|
| 79 |
"""
|
| 80 |
Split text into paragraphs
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
Returns:
|
| 87 |
-
|
|
|
|
| 88 |
"""
|
| 89 |
# Split on double newlines
|
| 90 |
paragraphs = re.split(r'\n\s*\n', text)
|
|
@@ -92,17 +100,21 @@ class TextProcessor:
|
|
| 92 |
# Filter short and empty paragraphs
|
| 93 |
return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
|
| 94 |
|
|
|
|
| 95 |
@staticmethod
|
| 96 |
def extract_sentences(text: str, min_length: int = 10) -> List[str]:
|
| 97 |
"""
|
| 98 |
Extract sentences from text (basic method)
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
| 103 |
|
| 104 |
Returns:
|
| 105 |
-
|
|
|
|
| 106 |
"""
|
| 107 |
# Simple sentence splitting on .!?
|
| 108 |
sentences = re.split(r'[.!?]+', text)
|
|
@@ -112,6 +124,7 @@ class TextProcessor:
|
|
| 112 |
|
| 113 |
return sentences
|
| 114 |
|
|
|
|
| 115 |
def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
|
| 116 |
"""
|
| 117 |
Extract sentences with NER and metadata using spaCy
|
|
@@ -125,87 +138,87 @@ class TextProcessor:
|
|
| 125 |
if not self.nlp:
|
| 126 |
# Fallback to basic extraction
|
| 127 |
basic_sentences = self.extract_sentences(text)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
|
|
|
| 133 |
|
| 134 |
for sent in doc.sents:
|
| 135 |
-
sentences.append({
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
})
|
| 142 |
|
| 143 |
return sentences
|
| 144 |
|
| 145 |
-
# =========================================================================
|
| 146 |
-
# LEGAL-SPECIFIC EXTRACTION
|
| 147 |
-
# =========================================================================
|
| 148 |
|
| 149 |
@staticmethod
|
| 150 |
def extract_legal_entities(text: str) -> Dict[str, List[str]]:
|
| 151 |
"""
|
| 152 |
Extract legal-specific entities (parties, dates, amounts, references)
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
|
|
|
| 156 |
|
| 157 |
Returns:
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
|
| 170 |
# Party names (PARTY A, "the Employee", Company Name Inc.)
|
| 171 |
-
party_patterns = [
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
for pattern in party_patterns:
|
| 178 |
matches = re.findall(pattern, text)
|
|
|
|
| 179 |
entities["parties"].extend(matches)
|
| 180 |
|
| 181 |
# Dates (various formats)
|
| 182 |
-
date_patterns = [
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
for pattern in date_patterns:
|
| 188 |
matches = re.findall(pattern, text, re.IGNORECASE)
|
|
|
|
| 189 |
entities["dates"].extend(matches)
|
| 190 |
|
| 191 |
# Legal references (Section 5.2, Clause 11.1, Article III)
|
| 192 |
-
ref_patterns = [
|
| 193 |
-
|
| 194 |
-
]
|
| 195 |
for pattern in ref_patterns:
|
| 196 |
matches = re.findall(pattern, text, re.IGNORECASE)
|
|
|
|
| 197 |
entities["references"].extend(matches)
|
| 198 |
|
| 199 |
# Monetary amounts
|
| 200 |
-
entities["amounts"]
|
| 201 |
|
| 202 |
# Email addresses
|
| 203 |
-
email_pattern
|
| 204 |
-
entities["emails"]
|
| 205 |
|
| 206 |
# Phone numbers (US format)
|
| 207 |
-
phone_pattern
|
| 208 |
-
phone_matches
|
| 209 |
entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
|
| 210 |
|
| 211 |
# Deduplicate
|
|
@@ -214,37 +227,46 @@ class TextProcessor:
|
|
| 214 |
|
| 215 |
return entities
|
| 216 |
|
|
|
|
| 217 |
@staticmethod
|
| 218 |
def count_words(text: str) -> int:
|
| 219 |
-
"""
|
|
|
|
|
|
|
| 220 |
return len(text.split())
|
| 221 |
|
|
|
|
| 222 |
@staticmethod
|
| 223 |
def extract_numbers(text: str) -> List[str]:
|
| 224 |
-
"""
|
|
|
|
|
|
|
| 225 |
return re.findall(r'\d+', text)
|
| 226 |
|
|
|
|
| 227 |
@staticmethod
|
| 228 |
def extract_monetary_amounts(text: str) -> List[str]:
|
| 229 |
"""
|
| 230 |
Extract monetary amounts from text
|
| 231 |
|
| 232 |
Returns:
|
| 233 |
-
|
|
|
|
| 234 |
"""
|
| 235 |
# Match patterns like $1,000 or $1000.00 or USD 1,000
|
| 236 |
-
patterns = [
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
for pattern in patterns:
|
| 245 |
amounts.extend(re.findall(pattern, text, re.IGNORECASE))
|
| 246 |
|
| 247 |
return amounts
|
|
|
|
| 248 |
|
| 249 |
@staticmethod
|
| 250 |
def extract_durations(text: str) -> List[Dict[str, str]]:
|
|
@@ -252,150 +274,149 @@ class TextProcessor:
|
|
| 252 |
Extract time durations (e.g., "6 months", "2 years")
|
| 253 |
|
| 254 |
Returns:
|
| 255 |
-
|
|
|
|
| 256 |
"""
|
| 257 |
pattern = r'(\d+)\s*(day|week|month|year)s?'
|
| 258 |
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 259 |
|
| 260 |
-
return [
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
]
|
| 264 |
-
|
| 265 |
@staticmethod
|
| 266 |
def extract_percentages(text: str) -> List[str]:
|
| 267 |
-
"""
|
|
|
|
|
|
|
| 268 |
return re.findall(r'\d+(?:\.\d+)?%', text)
|
| 269 |
|
| 270 |
-
|
| 271 |
-
# TEXT CHUNKING FOR EMBEDDINGS
|
| 272 |
-
# =========================================================================
|
| 273 |
-
|
| 274 |
@staticmethod
|
| 275 |
-
def chunk_text_for_embedding(text: str,
|
| 276 |
-
chunk_size: int = 512,
|
| 277 |
-
overlap: int = 50) -> List[Dict[str, Any]]:
|
| 278 |
"""
|
| 279 |
Chunk text with overlap for embedding models (preserves sentence boundaries)
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
Returns:
|
| 287 |
-
|
|
|
|
| 288 |
"""
|
| 289 |
-
sentences
|
| 290 |
-
chunks
|
| 291 |
-
current_chunk
|
| 292 |
-
current_length
|
| 293 |
start_sentence_idx = 0
|
| 294 |
|
| 295 |
for i, sentence in enumerate(sentences):
|
| 296 |
-
sentence_words
|
| 297 |
sentence_length = len(sentence_words)
|
| 298 |
|
| 299 |
-
if current_length + sentence_length > chunk_size and current_chunk:
|
| 300 |
# Save current chunk
|
| 301 |
-
chunks.append({
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
})
|
| 308 |
|
| 309 |
# Start new chunk with overlap
|
| 310 |
-
overlap_sentences
|
| 311 |
-
current_chunk
|
| 312 |
-
current_length
|
| 313 |
start_sentence_idx = max(0, i - len(overlap_sentences))
|
|
|
|
| 314 |
else:
|
| 315 |
current_chunk.append(sentence)
|
| 316 |
current_length += sentence_length
|
| 317 |
|
| 318 |
# Add final chunk
|
| 319 |
if current_chunk:
|
| 320 |
-
chunks.append({
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
})
|
| 327 |
|
| 328 |
return chunks
|
| 329 |
|
| 330 |
-
|
| 331 |
-
# TEXT SIMILARITY & DEDUPLICATION
|
| 332 |
-
# =========================================================================
|
| 333 |
-
|
| 334 |
@staticmethod
|
| 335 |
def text_similarity(text1: str, text2: str) -> float:
|
| 336 |
"""
|
| 337 |
Calculate similarity between two texts (0-1 scale)
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
| 342 |
|
| 343 |
Returns:
|
| 344 |
-
|
|
|
|
| 345 |
"""
|
| 346 |
return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
|
| 347 |
|
|
|
|
| 348 |
@staticmethod
|
| 349 |
def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
|
| 350 |
"""
|
| 351 |
Remove near-duplicate clauses
|
| 352 |
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
| 356 |
|
| 357 |
Returns:
|
| 358 |
-
|
|
|
|
| 359 |
"""
|
| 360 |
-
unique =
|
| 361 |
|
| 362 |
for clause in clauses:
|
| 363 |
-
is_duplicate = any(
|
| 364 |
-
|
| 365 |
-
for existing in unique
|
| 366 |
-
)
|
| 367 |
if not is_duplicate:
|
| 368 |
unique.append(clause)
|
| 369 |
|
| 370 |
return unique
|
| 371 |
|
| 372 |
-
|
| 373 |
-
# LANGUAGE DETECTION
|
| 374 |
-
# =========================================================================
|
| 375 |
-
|
| 376 |
@staticmethod
|
| 377 |
def detect_language(text: str) -> str:
|
| 378 |
"""
|
| 379 |
Detect text language
|
| 380 |
|
| 381 |
-
|
| 382 |
-
|
|
|
|
| 383 |
|
| 384 |
Returns:
|
| 385 |
-
|
|
|
|
| 386 |
"""
|
| 387 |
if not LANGDETECT_AVAILABLE:
|
| 388 |
-
|
|
|
|
| 389 |
|
| 390 |
try:
|
| 391 |
# Use first 1000 chars for detection
|
| 392 |
return detect(text[:1000])
|
|
|
|
| 393 |
except LangDetectException:
|
| 394 |
return "en"
|
| 395 |
|
| 396 |
-
# =========================================================================
|
| 397 |
-
# TEXT STATISTICS
|
| 398 |
-
# =========================================================================
|
| 399 |
|
| 400 |
@staticmethod
|
| 401 |
def get_text_statistics(text: str) -> Dict[str, Any]:
|
|
@@ -403,49 +424,46 @@ class TextProcessor:
|
|
| 403 |
Get comprehensive text statistics
|
| 404 |
|
| 405 |
Returns:
|
| 406 |
-
|
|
|
|
| 407 |
"""
|
| 408 |
-
sentences
|
| 409 |
paragraphs = TextProcessor.split_into_paragraphs(text)
|
| 410 |
-
words
|
| 411 |
-
|
| 412 |
-
return {
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
}
|
| 421 |
-
|
| 422 |
-
# =========================================================================
|
| 423 |
-
# KEYWORD HIGHLIGHTING
|
| 424 |
-
# =========================================================================
|
| 425 |
|
|
|
|
| 426 |
@staticmethod
|
| 427 |
-
def highlight_keywords(text: str, keywords: List[str],
|
| 428 |
-
highlight_format: str = "**{}**") -> str:
|
| 429 |
"""
|
| 430 |
Highlight keywords in text (for display purposes)
|
| 431 |
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
Returns:
|
| 438 |
-
|
|
|
|
| 439 |
"""
|
| 440 |
for keyword in keywords:
|
| 441 |
pattern = re.compile(re.escape(keyword), re.IGNORECASE)
|
| 442 |
-
text
|
| 443 |
|
| 444 |
return text
|
| 445 |
|
| 446 |
-
# =========================================================================
|
| 447 |
-
# CLAUSE SEGMENTATION HELPERS
|
| 448 |
-
# =========================================================================
|
| 449 |
|
| 450 |
@staticmethod
|
| 451 |
def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
|
|
@@ -453,57 +471,61 @@ class TextProcessor:
|
|
| 453 |
Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)
|
| 454 |
|
| 455 |
Returns:
|
| 456 |
-
|
|
|
|
| 457 |
"""
|
| 458 |
-
patterns = [
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
]
|
| 464 |
|
| 465 |
-
sections =
|
|
|
|
| 466 |
for pattern, section_type in patterns:
|
| 467 |
matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
|
|
|
|
| 468 |
for match in matches:
|
| 469 |
-
sections.append({
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
})
|
| 476 |
|
| 477 |
# Sort by position
|
| 478 |
-
sections.sort(key=lambda x: x['start_pos'])
|
| 479 |
|
| 480 |
return sections
|
| 481 |
|
|
|
|
| 482 |
@staticmethod
|
| 483 |
def clean_legal_text(text: str) -> str:
|
| 484 |
"""
|
| 485 |
Clean legal text by removing boilerplate artifacts
|
| 486 |
|
| 487 |
-
|
| 488 |
-
|
|
|
|
| 489 |
|
| 490 |
Returns:
|
| 491 |
-
|
|
|
|
| 492 |
"""
|
| 493 |
# Remove "Page X of Y" markers
|
| 494 |
-
text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
|
| 495 |
|
| 496 |
# Remove "[Signature Page Follows]" type markers
|
| 497 |
-
text = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', text, flags=re.IGNORECASE)
|
| 498 |
|
| 499 |
# Remove excessive underscores (signature lines)
|
| 500 |
text = re.sub(r'_{3,}', '', text)
|
| 501 |
|
| 502 |
# Remove "CONFIDENTIAL" watermarks
|
| 503 |
-
text = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', text, flags=re.IGNORECASE)
|
| 504 |
|
| 505 |
# Clean up resulting whitespace
|
| 506 |
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 507 |
text = re.sub(r' {2,}', ' ', text)
|
| 508 |
|
| 509 |
-
return text.strip()
|
|
|
|
| 54 |
"""
|
| 55 |
Normalize text for analysis
|
| 56 |
|
| 57 |
+
Arguments:
|
| 58 |
+
----------
|
| 59 |
+
text { str } : Input text
|
| 60 |
+
|
| 61 |
+
lowercase { bool } : Convert to lowercase
|
| 62 |
+
|
| 63 |
+
remove_special_chars { bool } : Remove special characters
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
+
--------
|
| 67 |
+
{ str } : Normalized text
|
| 68 |
"""
|
| 69 |
if lowercase:
|
| 70 |
text = text.lower()
|
|
|
|
| 78 |
|
| 79 |
return text.strip()
|
| 80 |
|
| 81 |
+
|
| 82 |
@staticmethod
|
| 83 |
def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
    """
    Split text into paragraphs

    Arguments:
    ----------
    text { str } : Input text

    min_length { int } : Minimum paragraph length in characters

    Returns:
    --------
    { list } : List of paragraphs
    """
    # Paragraph boundaries are blank lines (a newline, optional whitespace, newline)
    raw_blocks = re.split(r'\n\s*\n', text)

    # Trim each block and drop fragments shorter than the minimum length
    stripped = (block.strip() for block in raw_blocks)
    return [block for block in stripped if len(block) >= min_length]
|
| 102 |
|
| 103 |
+
|
| 104 |
@staticmethod
|
| 105 |
def extract_sentences(text: str, min_length: int = 10) -> List[str]:
|
| 106 |
"""
|
| 107 |
Extract sentences from text (basic method)
|
| 108 |
|
| 109 |
+
Arguments:
|
| 110 |
+
----------
|
| 111 |
+
text { str } : Input text
|
| 112 |
+
|
| 113 |
+
min_length { int } : Minimum sentence length in characters
|
| 114 |
|
| 115 |
Returns:
|
| 116 |
+
--------
|
| 117 |
+
{ list } : List of sentences
|
| 118 |
"""
|
| 119 |
# Simple sentence splitting on .!?
|
| 120 |
sentences = re.split(r'[.!?]+', text)
|
|
|
|
| 124 |
|
| 125 |
return sentences
|
| 126 |
|
| 127 |
+
|
| 128 |
def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
    """
    Extract sentences with NER and metadata using spaCy

    Arguments:
    ----------
    text { str } : Input text

    Returns:
    --------
    { list } : Sentence dicts with text, entities, char offsets and tokens
    """
    if not self.nlp:
        # spaCy model unavailable: fall back to the basic splitter and
        # return the same dict shape with empty metadata.
        fallback = self.extract_sentences(text)
        return [{"text" : s, "entities" : [], "start_char" : 0, "end_char" : 0} for s in fallback]

    # Limit to 100K chars for performance
    doc = self.nlp(text[:100000])

    return [{"text" : sent.text.strip(),
             "entities" : [(ent.text, ent.label_) for ent in sent.ents],
             "start_char" : sent.start_char,
             "end_char" : sent.end_char,
             "tokens" : [tok.text for tok in sent],
             }
            for sent in doc.sents]
|
| 157 |
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
@staticmethod
|
| 160 |
def extract_legal_entities(text: str) -> Dict[str, List[str]]:
|
| 161 |
"""
|
| 162 |
Extract legal-specific entities (parties, dates, amounts, references)
|
| 163 |
|
| 164 |
+
Arguments:
|
| 165 |
+
----------
|
| 166 |
+
text { str } : Input text
|
| 167 |
|
| 168 |
Returns:
|
| 169 |
+
--------
|
| 170 |
+
{ dict } : Dictionary of extracted entities by type
|
| 171 |
+
"""
|
| 172 |
+
entities = {"parties" : [],
|
| 173 |
+
"dates" : [],
|
| 174 |
+
"amounts" : [],
|
| 175 |
+
"addresses" : [],
|
| 176 |
+
"references" : [],
|
| 177 |
+
"emails" : [],
|
| 178 |
+
"phone_numbers" : [],
|
| 179 |
+
}
|
| 180 |
|
| 181 |
# Party names (PARTY A, "the Employee", Company Name Inc.)
|
| 182 |
+
party_patterns = [r'(?:PARTY|Party)\s+[A-Z]',
|
| 183 |
+
r'"the\s+\w+"',
|
| 184 |
+
r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd|Limited|Company)\.?',
|
| 185 |
+
r'(?:the\s+)?(Employer|Employee|Consultant|Contractor|Client|Vendor|Supplier|Landlord|Tenant|Buyer|Seller)',
|
| 186 |
+
]
|
| 187 |
+
|
| 188 |
for pattern in party_patterns:
|
| 189 |
matches = re.findall(pattern, text)
|
| 190 |
+
|
| 191 |
entities["parties"].extend(matches)
|
| 192 |
|
| 193 |
# Dates (various formats)
|
| 194 |
+
date_patterns = [r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
|
| 195 |
+
r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
|
| 196 |
+
r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
|
| 197 |
+
]
|
| 198 |
+
|
| 199 |
for pattern in date_patterns:
|
| 200 |
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 201 |
+
|
| 202 |
entities["dates"].extend(matches)
|
| 203 |
|
| 204 |
# Legal references (Section 5.2, Clause 11.1, Article III)
|
| 205 |
+
ref_patterns = [r'(?:Section|Clause|Article|Paragraph|Exhibit|Schedule|Appendix)\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+)']
|
| 206 |
+
|
|
|
|
| 207 |
for pattern in ref_patterns:
|
| 208 |
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 209 |
+
|
| 210 |
entities["references"].extend(matches)
|
| 211 |
|
| 212 |
# Monetary amounts
|
| 213 |
+
entities["amounts"] = TextProcessor.extract_monetary_amounts(text)
|
| 214 |
|
| 215 |
# Email addresses
|
| 216 |
+
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
|
| 217 |
+
entities["emails"] = re.findall(email_pattern, text)
|
| 218 |
|
| 219 |
# Phone numbers (US format)
|
| 220 |
+
phone_pattern = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
|
| 221 |
+
phone_matches = re.findall(phone_pattern, text)
|
| 222 |
entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
|
| 223 |
|
| 224 |
# Deduplicate
|
|
|
|
| 227 |
|
| 228 |
return entities
|
| 229 |
|
| 230 |
+
|
| 231 |
@staticmethod
|
| 232 |
def count_words(text: str) -> int:
    """
    Count words in text

    Returns:
    --------
    { int } : Number of whitespace-separated tokens
    """
    # str.split() with no argument collapses runs of whitespace
    words = text.split()
    return len(words)
|
| 237 |
|
| 238 |
+
|
| 239 |
@staticmethod
|
| 240 |
def extract_numbers(text: str) -> List[str]:
    """
    Extract all numbers from text

    Returns:
    --------
    { list } : Digit runs found in the text, as strings
    """
    digit_run = re.compile(r'\d+')
    return digit_run.findall(text)
|
| 245 |
|
| 246 |
+
|
| 247 |
@staticmethod
|
| 248 |
def extract_monetary_amounts(text: str) -> List[str]:
    """
    Extract monetary amounts from text

    Returns:
    --------
    { list } : List of monetary amounts (e.g., ['$1,000', '$2,500.00'])
    """
    # Match patterns like $1,000 or $1000.00 or USD 1,000
    currency_patterns = (r'\$[\d,]+(?:\.\d{2})?',
                         r'USD\s*[\d,]+(?:\.\d{2})?',
                         r'EUR\s*[\d,]+(?:\.\d{2})?',
                         r'GBP\s*[\d,]+(?:\.\d{2})?')

    found = []
    for currency_pattern in currency_patterns:
        found += re.findall(currency_pattern, text, re.IGNORECASE)

    return found
|
| 269 |
+
|
| 270 |
|
| 271 |
@staticmethod
|
| 272 |
def extract_durations(text: str) -> List[Dict[str, str]]:
    """
    Extract time durations (e.g., "6 months", "2 years")

    Returns:
    --------
    { list } : List of duration dictionaries with 'amount' and 'unit'
    """
    # Optional trailing 's' so both "1 month" and "6 months" match
    duration_re = re.compile(r'(\d+)\s*(day|week|month|year)s?', re.IGNORECASE)

    durations = []
    for amount, unit in duration_re.findall(text):
        durations.append({"amount": amount, "unit": unit.lower()})
    return durations
|
| 284 |
+
|
| 285 |
+
|
|
|
|
|
|
|
| 286 |
@staticmethod
|
| 287 |
def extract_percentages(text: str) -> List[str]:
    """
    Extract percentages from text

    Returns:
    --------
    { list } : Percentage strings such as '5%' or '12.5%'
    """
    percent_re = re.compile(r'\d+(?:\.\d+)?%')
    return percent_re.findall(text)
|
| 292 |
|
| 293 |
+
|
|
|
|
|
|
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
+
def chunk_text_for_embedding(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
    """
    Chunk text with overlap for embedding models (preserves sentence boundaries)

    Arguments:
    ----------
    text { str } : Input text

    chunk_size { int } : Maximum chunk size in words

    overlap { int } : Approximate number of words carried over between chunks

    Returns:
    --------
    { list } : List of chunk dictionaries with metadata
    """
    sentences = TextProcessor.extract_sentences(text)
    chunks = list()
    current_chunk = list()
    current_length = 0
    start_sentence_idx = 0

    for i, sentence in enumerate(sentences):
        sentence_length = len(sentence.split())

        if current_chunk and (current_length + sentence_length) > chunk_size:
            # Save current chunk
            chunks.append({"text" : " ".join(current_chunk),
                           "start_sentence" : start_sentence_idx,
                           "end_sentence" : i - 1,
                           "word_count" : current_length,
                           "chunk_id" : len(chunks),
                           })

            # Start new chunk with overlap.
            # BUG FIX: the `overlap` parameter was previously ignored —
            # a fixed "last 2 sentences" overlap was always used. Now
            # trailing sentences are carried over until ~`overlap` words.
            overlap_sentences = []
            overlap_words = 0
            for prev in reversed(current_chunk):
                prev_words = len(prev.split())
                if overlap_words + prev_words > overlap:
                    break
                overlap_sentences.insert(0, prev)
                overlap_words += prev_words

            current_chunk = overlap_sentences + [sentence]
            current_length = sum(len(s.split()) for s in current_chunk)
            start_sentence_idx = max(0, i - len(overlap_sentences))
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    # Add final chunk
    if current_chunk:
        chunks.append({"text" : " ".join(current_chunk),
                       "start_sentence" : start_sentence_idx,
                       "end_sentence" : len(sentences) - 1,
                       "word_count" : current_length,
                       "chunk_id" : len(chunks),
                       })

    return chunks
|
| 350 |
|
| 351 |
+
|
|
|
|
|
|
|
|
|
|
| 352 |
@staticmethod
|
| 353 |
def text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts (0-1 scale)

    Arguments:
    ----------
    text1 { str } : First text

    text2 { str } : Second text

    Returns:
    --------
    { float } : Similarity score (0.0 = completely different, 1.0 = identical)
    """
    # Case-insensitive comparison via difflib's ratio metric
    a = text1.lower()
    b = text2.lower()
    return SequenceMatcher(None, a, b).ratio()
|
| 368 |
|
| 369 |
+
|
| 370 |
@staticmethod
|
| 371 |
def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
    """
    Remove near-duplicate clauses

    Arguments:
    ----------
    clauses { list } : List of clause texts

    threshold { float } : Similarity threshold for deduplication (0.0-1.0)

    Returns:
    --------
    { list } : List of unique clauses, in original order
    """
    unique = []

    for candidate in clauses:
        # Keep a clause only if it is not too similar to any clause already kept
        if all(TextProcessor.text_similarity(candidate, kept) <= threshold for kept in unique):
            unique.append(candidate)

    return unique
|
| 394 |
|
| 395 |
+
|
|
|
|
|
|
|
|
|
|
| 396 |
@staticmethod
|
| 397 |
def detect_language(text: str) -> str:
    """
    Detect text language

    Arguments:
    ----------
    text { str } : Input text

    Returns:
    --------
    { str } : ISO 639-1 language code (e.g., 'en', 'es', 'fr')
    """
    if not LANGDETECT_AVAILABLE:
        # langdetect not installed — default to English
        return "en"

    # The first 1000 chars are enough for reliable detection
    sample = text[:1000]

    try:
        return detect(sample)
    except LangDetectException:
        # Detection can fail on very short or ambiguous input
        return "en"
|
| 419 |
|
|
|
|
|
|
|
|
|
|
| 420 |
|
| 421 |
@staticmethod
|
| 422 |
def get_text_statistics(text: str) -> Dict[str, Any]:
    """
    Get comprehensive text statistics

    Returns:
    --------
    { dict } : Dictionary with character count, word count, sentence count, etc.
    """
    sentence_list = TextProcessor.extract_sentences(text)
    paragraph_list = TextProcessor.split_into_paragraphs(text)
    word_list = text.split()

    n_chars = len(text)
    n_words = len(word_list)
    n_sentences = len(sentence_list)

    # Averages guard against division by zero on empty input
    return {"character_count" : n_chars,
            "word_count" : n_words,
            "sentence_count" : n_sentences,
            "paragraph_count" : len(paragraph_list),
            "avg_words_per_sentence" : n_words / n_sentences if sentence_list else 0,
            "avg_chars_per_word" : n_chars / n_words if word_list else 0,
            "language" : TextProcessor.detect_language(text),
            }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
+
|
| 444 |
@staticmethod
|
| 445 |
+
def highlight_keywords(text: str, keywords: List[str], highlight_format: str = "**{}**") -> str:
    """
    Highlight keywords in text (for display purposes)

    Arguments:
    ----------
    text { str } : Input text

    keywords { list } : List of keywords to highlight

    highlight_format { str } : Format string with {} placeholder (default: Markdown bold)

    Returns:
    --------
    { str } : Text with highlighted keywords
    """
    highlighted = text

    for kw in keywords:
        # Escape the keyword so regex metacharacters are treated literally;
        # m.group(0) preserves the original casing of each match.
        kw_re = re.compile(re.escape(kw), re.IGNORECASE)
        highlighted = kw_re.sub(lambda m: highlight_format.format(m.group(0)), highlighted)

    return highlighted
|
| 466 |
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
@staticmethod
|
| 469 |
def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
    """
    Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)

    Returns:
    --------
    { list } : List of section dictionaries with number and text
    """
    # (pattern, type-label) pairs; each pattern captures the section
    # reference and the section body (at least 20 chars, up to the next
    # section header, a blank line, or end of text).
    section_patterns = ((r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\n\s*\d+\.\d+|\n\n|$)', 'numbered'),
                        (r'(Article\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nArticle|\n\n|$)', 'article'),
                        (r'(Section\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nSection|\n\n|$)', 'section'),
                        (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\nClause|\n\n|$)', 'clause'),
                        )

    sections = []

    for pattern, kind in section_patterns:
        for m in re.finditer(pattern, text, re.IGNORECASE | re.DOTALL):
            sections.append({"reference" : m.group(1).strip(),
                             "text" : m.group(2).strip(),
                             "type" : kind,
                             "start_pos" : m.start(),
                             "end_pos" : m.end(),
                             })

    # Order by where each section appears in the document
    sections.sort(key=lambda s: s['start_pos'])

    return sections
|
| 500 |
|
| 501 |
+
|
| 502 |
@staticmethod
|
| 503 |
def clean_legal_text(text: str) -> str:
    """
    Clean legal text by removing boilerplate artifacts

    Arguments:
    ----------
    text { str } : Input legal text

    Returns:
    --------
    { str } : Cleaned text
    """
    cleaned = text

    # Strip "Page X of Y" pagination markers
    cleaned = re.sub(r'Page\s+\d+\s+of\s+\d+', '', cleaned, flags=re.IGNORECASE)

    # Strip "[Signature Page Follows]" style bracketed markers
    cleaned = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', cleaned, flags=re.IGNORECASE)

    # Strip signature lines made of repeated underscores
    cleaned = re.sub(r'_{3,}', '', cleaned)

    # Strip watermark words
    cleaned = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', cleaned, flags=re.IGNORECASE)

    # Collapse the whitespace left behind by the removals
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r' {2,}', ' ', cleaned)

    return cleaned.strip()
|