satyaki-mitra committed
Commit d15efc9 · 1 Parent(s): 6fd8649

code refactor
api/__init__.py DELETED
File without changes
api/routes.py DELETED
File without changes
api/schemas.py DELETED
File without changes
app.py CHANGED
@@ -0,0 +1,733 @@
+"""
+FastAPI Application for AI Contract Risk Analyzer
+Complete pre-loading approach: All models loaded at startup
+Direct synchronous flow: Upload → Analyze → Return Results + PDF
+"""
+from fastapi.responses import JSONResponse, FileResponse, Response
+from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+import uuid
+import os
+from datetime import datetime
+from pathlib import Path
+import sys
+import tempfile
+import io
+
+# Add parent directory to path
+sys.path.append(str(Path(__file__).parent))
+
+# Import all services
+from config.settings import settings
+from config.risk_rules import ContractType
+from model_manager.model_loader import ModelLoader
+from utils.document_reader import DocumentReader
+from utils.validators import ContractValidator
+from utils.text_processor import TextProcessor
+from utils.logger import ContractAnalyzerLogger, log_info, log_error
+
+from services.contract_classifier import ContractClassifier
+from services.clause_extractor import ClauseExtractor
+from services.risk_analyzer import MultiFactorRiskAnalyzer
+from services.term_analyzer import TermAnalyzer
+from services.protection_checker import ProtectionChecker
+from services.llm_interpreter import LLMClauseInterpreter
+from services.negotiation_engine import NegotiationEngine
+from services.market_comparator import MarketComparator
+
+# Import PDF generator
+from reporter.pdf_generator import generate_pdf_report
+
+# Initialize logger
+ContractAnalyzerLogger.setup(log_dir="logs", app_name="contract_analyzer")
+logger = ContractAnalyzerLogger.get_logger()
+
+# ============================================================================
+# PYDANTIC SCHEMAS
+# ============================================================================
+
+class HealthResponse(BaseModel):
+    """Health check response"""
+    status: str
+    version: str
+    timestamp: str
+    models_loaded: int
+    services_loaded: int
+    memory_usage_mb: float
+
+class AnalysisOptions(BaseModel):
+    """Analysis options"""
+    max_clauses: int = Field(default=15, ge=5, le=30)
+    interpret_clauses: bool = Field(default=True)
+    generate_negotiation_points: bool = Field(default=True)
+    compare_to_market: bool = Field(default=True)
+
+class AnalysisResult(BaseModel):
+    """Complete analysis result"""
+    analysis_id: str
+    timestamp: str
+    classification: Dict[str, Any]
+    clauses: List[Dict[str, Any]]
+    risk_analysis: Dict[str, Any]
+    unfavorable_terms: List[Dict[str, Any]]
+    missing_protections: List[Dict[str, Any]]
+    clause_interpretations: Optional[List[Dict[str, Any]]] = None
+    negotiation_points: Optional[List[Dict[str, Any]]] = None
+    market_comparisons: Optional[List[Dict[str, Any]]] = None
+    executive_summary: str
+    metadata: Dict[str, Any]
+    pdf_available: bool = True
+
+class ErrorResponse(BaseModel):
+    """Error response"""
+    error: str
+    detail: str
+    timestamp: str
+
+# ============================================================================
+# SERVICE INITIALIZATION WITH FULL PRE-LOADING
+# ============================================================================
+
+class PreloadedAnalysisService:
+    """Analysis service with complete pre-loading of all models"""
+
+    def __init__(self):
+        self.model_loader = ModelLoader()
+        self.services = {}
+        self.service_status = {}
+        self.memory_usage_mb = 0
+        self._preload_all_services()
+
+    def _preload_all_services(self):
+        """Pre-load ALL services and models at initialization"""
+        log_info("PRE-LOADING ALL AI MODELS AND SERVICES")
+
+        try:
+            # Track memory usage
+            initial_memory = self._get_memory_usage()
+
+            # 1. Pre-load core classifier
+            log_info("🔄 Pre-loading Contract Classifier...")
+            self.services["classifier"] = ContractClassifier(self.model_loader)
+            self.service_status["classifier"] = "loaded"
+            log_info("✅ Contract Classifier loaded")
+
+            # 2. Pre-load Term Analyzer
+            log_info("🔄 Pre-loading Term Analyzer...")
+            self.services["term_analyzer"] = TermAnalyzer()
+            self.service_status["term_analyzer"] = "loaded"
+            log_info("✅ Term Analyzer loaded")
+
+            # 3. Pre-load Protection Checker
+            log_info("🔄 Pre-loading Protection Checker...")
+            self.services["protection_checker"] = ProtectionChecker()
+            self.service_status["protection_checker"] = "loaded"
+            log_info("✅ Protection Checker loaded")
+
+            # 4. Pre-load Market Comparator
+            log_info("🔄 Pre-loading Market Comparator...")
+            self.services["market_comparator"] = MarketComparator(self.model_loader)
+            self.service_status["market_comparator"] = "loaded"
+            log_info("✅ Market Comparator loaded")
+
+            # 5. Pre-load Clause Extractors for all major contract types
+            log_info("🔄 Pre-loading Clause Extractors...")
+            self.services["extractors"] = {}
+            major_categories = ["employment", "consulting", "nda", "software", "service", "partnership"]
+
+            for category in major_categories:
+                try:
+                    self.services["extractors"][category] = ClauseExtractor(
+                        self.model_loader, contract_category=category
+                    )
+                    log_info(f" ✅ Clause Extractor for {category} loaded")
+                except Exception as e:
+                    log_error(f"Failed to load extractor for {category}: {e}")
+                    self.services["extractors"][category] = None
+
+            self.service_status["extractors"] = f"loaded for {len(major_categories)} categories"
+            log_info("✅ All Clause Extractors loaded")
+
+            # 6. Pre-load Risk Analyzers for all contract types
+            log_info("🔄 Pre-loading Risk Analyzers...")
+            self.services["risk_analyzers"] = {}
+            contract_types = [
+                ContractType.EMPLOYMENT, ContractType.CONSULTING, ContractType.NDA,
+                ContractType.SOFTWARE, ContractType.SERVICE, ContractType.PARTNERSHIP,
+                ContractType.LEASE, ContractType.PURCHASE, ContractType.GENERAL
+            ]
+
+            for contract_type in contract_types:
+                try:
+                    self.services["risk_analyzers"][contract_type.value] = MultiFactorRiskAnalyzer(
+                        contract_type=contract_type
+                    )
+                    log_info(f" ✅ Risk Analyzer for {contract_type.value} loaded")
+                except Exception as e:
+                    log_error(f"Failed to load risk analyzer for {contract_type.value}: {e}")
+                    self.services["risk_analyzers"][contract_type.value] = None
+
+            self.service_status["risk_analyzers"] = f"loaded for {len(contract_types)} types"
+            log_info("✅ All Risk Analyzers loaded")
+
+            # 7. Pre-load LLM Interpreter (if available)
+            log_info("🔄 Pre-loading LLM Interpreter...")
+            try:
+                self.services["interpreter"] = LLMClauseInterpreter()
+                self.service_status["interpreter"] = "loaded"
+                log_info("✅ LLM Interpreter loaded")
+            except Exception as e:
+                self.services["interpreter"] = None
+                self.service_status["interpreter"] = f"failed: {str(e)}"
+                log_info("⚠️ LLM Interpreter not available (will skip interpretation)")
+
+            # 8. Pre-load Negotiation Engine (if available)
+            log_info("🔄 Pre-loading Negotiation Engine...")
+            try:
+                self.services["negotiation_engine"] = NegotiationEngine()
+                self.service_status["negotiation_engine"] = "loaded"
+                log_info("✅ Negotiation Engine loaded")
+            except Exception as e:
+                self.services["negotiation_engine"] = None
+                self.service_status["negotiation_engine"] = f"failed: {str(e)}"
+                log_info("⚠️ Negotiation Engine not available (will skip negotiation points)")
+
+            # Calculate memory usage
+            final_memory = self._get_memory_usage()
+            self.memory_usage_mb = final_memory - initial_memory
+
+            log_info("🎉 ALL SERVICES PRE-LOADED SUCCESSFULLY!")
+            log_info(f"📊 Memory Usage: {self.memory_usage_mb:.2f} MB")
+            log_info(f"🔧 Services Loaded: {len(self.service_status)}")
+
+        except Exception as e:
+            log_error(f"CRITICAL: Failed to pre-load services: {e}")
+            raise
+
+    def _get_memory_usage(self) -> float:
+        """Get current memory usage in MB"""
+        try:
+            import psutil
+            process = psutil.Process()
+            return process.memory_info().rss / 1024 / 1024
+        except ImportError:
+            return 0.0
+
+    def get_service_status(self) -> Dict[str, Any]:
+        """Get detailed service status"""
+        model_stats = self.model_loader.get_registry_stats()
+        return {
+            "services": self.service_status,
+            "models": model_stats,
+            "memory_usage_mb": self.memory_usage_mb,
+            "total_services_loaded": len([s for s in self.service_status.values() if "loaded" in str(s)]),
+            "total_models_loaded": model_stats.get("loaded_models", 0)
+        }
+
+    def analyze_contract(self, contract_text: str, options: AnalysisOptions) -> Dict[str, Any]:
+        """Synchronous contract analysis using pre-loaded services"""
+        try:
+            log_info("Starting contract analysis with pre-loaded services...")
+
+            # Step 1: Classify contract
+            classification = self.services["classifier"].classify_contract(contract_text)
+            classification_dict = classification.to_dict()
+            actual_category = classification.category
+
+            log_info(f"Contract classified as: {actual_category}")
+
+            # Step 2: Get appropriate extractor
+            extractor = self.services["extractors"].get(actual_category)
+            if not extractor:
+                # Fallback to first available extractor or create new one
+                available_categories = [cat for cat, ext in self.services["extractors"].items() if ext is not None]
+                if available_categories:
+                    fallback_category = available_categories[0]
+                    extractor = self.services["extractors"][fallback_category]
+                    log_info(f"Using fallback extractor for: {fallback_category}")
+                else:
+                    # Create new extractor for this category
+                    extractor = ClauseExtractor(self.model_loader, contract_category=actual_category)
+                    self.services["extractors"][actual_category] = extractor
+
+            # Extract clauses
+            clauses = extractor.extract_clauses(contract_text, options.max_clauses)
+            clauses_dict = [clause.to_dict() for clause in clauses]
+            log_info(f"Extracted {len(clauses)} clauses")
+
+            # Step 3: Map to ContractType and get appropriate risk analyzer
+            contract_type_mapping = {
+                'employment': ContractType.EMPLOYMENT,
+                'consulting': ContractType.CONSULTING,
+                'nda': ContractType.NDA,
+                'technology': ContractType.SOFTWARE,
+                'software': ContractType.SOFTWARE,
+                'service_agreement': ContractType.SERVICE,
+                'business': ContractType.PARTNERSHIP,
+                'real_estate': ContractType.LEASE,
+                'sales': ContractType.PURCHASE,
+            }
+            contract_type = contract_type_mapping.get(actual_category, ContractType.GENERAL)
+
+            risk_analyzer = self.services["risk_analyzers"].get(contract_type.value)
+            if not risk_analyzer:
+                # Fallback to general analyzer
+                risk_analyzer = self.services["risk_analyzers"]["general"]
+
+            # Analyze risk
+            risk_score = risk_analyzer.analyze_risk(contract_text, clauses)
+            risk_dict = risk_score.to_dict()
+            log_info(f"Risk analysis completed: {risk_dict['overall_score']}/100")
+
+            # Step 4: Find unfavorable terms
+            unfavorable_terms = self.services["term_analyzer"].analyze_unfavorable_terms(contract_text, clauses)
+            unfavorable_dict = [term.to_dict() for term in unfavorable_terms]
+            log_info(f"Found {len(unfavorable_terms)} unfavorable terms")
+
+            # Step 5: Check missing protections
+            missing_protections = self.services["protection_checker"].check_missing_protections(contract_text, clauses)
+            missing_dict = [prot.to_dict() for prot in missing_protections]
+            log_info(f"Found {len(missing_protections)} missing protections")
+
+            # Optional steps
+            interpretations_dict = None
+            negotiation_dict = None
+            market_dict = None
+
+            if options.interpret_clauses and self.services["interpreter"]:
+                try:
+                    interpretations = self.services["interpreter"].interpret_clauses(
+                        clauses, min(10, options.max_clauses)
+                    )
+                    interpretations_dict = [interp.to_dict() for interp in interpretations]
+                    log_info(f"Interpreted {len(interpretations)} clauses")
+                except Exception as e:
+                    log_error(f"Clause interpretation failed: {e}")
+                    interpretations_dict = []
+
+            if options.generate_negotiation_points and self.services["negotiation_engine"]:
+                try:
+                    negotiation_points = self.services["negotiation_engine"].generate_negotiation_points(
+                        risk_score, unfavorable_terms, missing_protections, clauses, 7
+                    )
+                    negotiation_dict = [point.to_dict() for point in negotiation_points]
+                    log_info(f"Generated {len(negotiation_points)} negotiation points")
+                except Exception as e:
+                    log_error(f"Negotiation points generation failed: {e}")
+                    negotiation_dict = []
+
+            if options.compare_to_market:
+                try:
+                    market_comparisons = self.services["market_comparator"].compare_to_market(clauses)
+                    market_dict = [comp.to_dict() for comp in market_comparisons]
+                    log_info(f"Compared {len(market_comparisons)} clauses to market")
+                except Exception as e:
+                    log_error(f"Market comparison failed: {e}")
+                    market_dict = []
+
+            # Generate executive summary
+            executive_summary = self._generate_executive_summary(
+                classification_dict, risk_dict, unfavorable_dict, missing_dict
+            )
+
+            # Build result
+            result = {
+                "analysis_id": str(uuid.uuid4()),
+                "timestamp": datetime.now().isoformat(),
+                "classification": classification_dict,
+                "clauses": clauses_dict,
+                "risk_analysis": risk_dict,
+                "unfavorable_terms": unfavorable_dict,
+                "missing_protections": missing_dict,
+                "clause_interpretations": interpretations_dict,
+                "negotiation_points": negotiation_dict,
+                "market_comparisons": market_dict,
+                "executive_summary": executive_summary,
+                "metadata": {
+                    "text_length": len(contract_text),
+                    "word_count": len(contract_text.split()),
+                    "num_clauses": len(clauses),
+                    "contract_type": contract_type.value,
+                    "actual_category": actual_category,
+                    "options": options.dict()
+                },
+                "pdf_available": True
+            }
+
+            log_info("Contract analysis completed successfully")
+            return result
+
+        except Exception as e:
+            log_error(f"Contract analysis failed: {e}")
+            raise
+
+    def _generate_executive_summary(self, classification: Dict, risk_score: Dict,
+                                    unfavorable_terms: List, missing_protections: List) -> str:
+        """Generate executive summary"""
+        category = classification.get("category", "Unknown")
+        score = risk_score.get("overall_score", 0)
+        risk_level = risk_score.get("risk_level", "UNKNOWN")
+
+        critical_terms = sum(1 for t in unfavorable_terms if t.get('severity') == 'critical')
+        critical_protections = sum(1 for p in missing_protections if p.get('importance') == 'critical')
+
+        if score >= 80:
+            risk_msg = "CRITICAL ATTENTION REQUIRED"
+        elif score >= 60:
+            risk_msg = "SIGNIFICANT CONCERNS"
+        elif score >= 40:
+            risk_msg = "MODERATE RISK"
+        else:
+            risk_msg = "LOW RISK"
+
+        return f"This {category} contract scored {score}/100 ({risk_level.upper()} risk). {risk_msg}. Found {len(unfavorable_terms)} unfavorable terms ({critical_terms} critical) and {len(missing_protections)} missing protections ({critical_protections} critical). Review detailed analysis below."
+
+# ============================================================================
+# FASTAPI APP
+# ============================================================================
+
+app = FastAPI(
+    title=settings.APP_NAME,
+    version=settings.APP_VERSION,
+    description="AI-powered contract risk analysis with complete model pre-loading",
+    docs_url="/api/docs",
+    redoc_url="/api/redoc"
+)
+
+# Serve static files
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.CORS_ORIGINS,
+    allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
+    allow_methods=settings.CORS_ALLOW_METHODS,
+    allow_headers=settings.CORS_ALLOW_HEADERS
+)
+
+# Initialize pre-loaded analysis service
+analysis_service = PreloadedAnalysisService()
+
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+
+def validate_file(file: UploadFile) -> tuple[bool, str]:
+    """File validation using settings from config"""
+    file_ext = os.path.splitext(file.filename)[1].lower()
+    if file_ext not in settings.ALLOWED_EXTENSIONS:
+        return False, f"Invalid file type. Allowed: {', '.join(settings.ALLOWED_EXTENSIONS)}"
+
+    file.file.seek(0, 2)
+    size = file.file.tell()
+    file.file.seek(0)
+
+    if size > settings.MAX_UPLOAD_SIZE:
+        return False, f"File too large. Max size: {settings.MAX_UPLOAD_SIZE / (1024*1024)}MB"
+
+    if size == 0:
+        return False, "File is empty"
+
+    return True, "OK"
+
+def read_contract_file(file: UploadFile) -> str:
+    """Read contract text from file using DocumentReader"""
+    file_ext = os.path.splitext(file.filename)[1].lower()
+    file_type = "pdf" if file_ext == ".pdf" else "docx" if file_ext == ".docx" else "txt"
+
+    reader = DocumentReader()
+    file_contents = reader.read_file(file.file, file_type)
+
+    # Handle both string and dict return types from DocumentReader
+    if isinstance(file_contents, dict):
+        return file_contents.get('text', '') or file_contents.get('content', '')
+    else:
+        return str(file_contents)
+
+def validate_contract_text(text: str) -> tuple[bool, str]:
+    """Validate contract text using settings"""
+    if not text or not text.strip():
+        return False, "Contract text is empty"
+
+    if len(text) < settings.MIN_CONTRACT_LENGTH:
+        return False, f"Contract text too short. Minimum {settings.MIN_CONTRACT_LENGTH} characters required."
+
+    if len(text) > settings.MAX_CONTRACT_LENGTH:
+        return False, f"Contract text too long. Maximum {settings.MAX_CONTRACT_LENGTH} characters allowed."
+
+    return True, "OK"
+
+# ============================================================================
+# API ROUTES
+# ============================================================================
+
+@app.get("/")
+async def serve_frontend():
+    """Serve the frontend"""
+    return FileResponse("static/index.html")
+
+@app.get("/api/v1/health", response_model=HealthResponse)
+async def health_check():
+    """Health check endpoint with service status"""
+    service_status = analysis_service.get_service_status()
+
+    return HealthResponse(
+        status="healthy",
+        version=settings.APP_VERSION,
+        timestamp=datetime.now().isoformat(),
+        models_loaded=service_status["total_models_loaded"],
+        services_loaded=service_status["total_services_loaded"],
+        memory_usage_mb=service_status["memory_usage_mb"]
+    )
+
+@app.get("/api/v1/status")
+async def get_detailed_status():
+    """Get detailed service status"""
+    return analysis_service.get_service_status()
+
+@app.post("/api/v1/analyze/file", response_model=AnalysisResult)
+async def analyze_contract_file(
+    file: UploadFile = File(...),
+    max_clauses: int = Form(15),
+    interpret_clauses: bool = Form(True),
+    generate_negotiation_points: bool = Form(True),
+    compare_to_market: bool = Form(True)
+):
+    """Analyze uploaded contract file - DIRECT SYNC FLOW"""
+    try:
+        # Validate file
+        is_valid, message = validate_file(file)
+        if not is_valid:
+            raise HTTPException(status_code=400, detail=message)
+
+        # Read contract text
+        contract_text = read_contract_file(file)
+
+        # Validate contract text
+        is_valid_text, text_message = validate_contract_text(contract_text)
+        if not is_valid_text:
+            raise HTTPException(status_code=400, detail=text_message)
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)
+
+        if not is_valid_contract:
+            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")
+
+        # Create analysis options
+        options = AnalysisOptions(
+            max_clauses=min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE),
+            interpret_clauses=interpret_clauses,
+            generate_negotiation_points=generate_negotiation_points,
+            compare_to_market=compare_to_market
+        )
+
+        # Perform analysis (SYNCHRONOUS with pre-loaded services)
+        result = analysis_service.analyze_contract(contract_text, options)
+
+        log_info(f"File analysis completed",
+                 filename=file.filename,
+                 analysis_id=result["analysis_id"],
+                 risk_score=result["risk_analysis"]["overall_score"])
+
+        return AnalysisResult(**result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        log_error(f"File analysis failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+@app.post("/api/v1/analyze/text", response_model=AnalysisResult)
+async def analyze_contract_text(
+    contract_text: str = Form(..., description="Contract text to analyze"),
+    max_clauses: int = Form(15),
+    interpret_clauses: bool = Form(True),
+    generate_negotiation_points: bool = Form(True),
+    compare_to_market: bool = Form(True)
+):
+    """Analyze pasted contract text - DIRECT SYNC FLOW"""
+    try:
+        # Validate contract text
+        is_valid, message = validate_contract_text(contract_text)
+        if not is_valid:
+            raise HTTPException(status_code=400, detail=message)
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)
+
+        if not is_valid_contract:
+            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")
+
+        # Create analysis options
+        options = AnalysisOptions(
+            max_clauses=min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE),
+            interpret_clauses=interpret_clauses,
+            generate_negotiation_points=generate_negotiation_points,
+            compare_to_market=compare_to_market
+        )
+
+        # Perform analysis (SYNCHRONOUS with pre-loaded services)
+        result = analysis_service.analyze_contract(contract_text, options)
+
+        log_info(f"Text analysis completed",
+                 analysis_id=result["analysis_id"],
+                 risk_score=result["risk_analysis"]["overall_score"])
+
+        return AnalysisResult(**result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        log_error(f"Text analysis failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+@app.post("/api/v1/generate-pdf")
+async def generate_pdf_from_analysis(analysis_result: Dict[str, Any]):
+    """Generate PDF from analysis results"""
+    try:
+        pdf_buffer = generate_pdf_report(analysis_result)
+
+        analysis_id = analysis_result.get('analysis_id', 'report')
+        return Response(
+            content=pdf_buffer.getvalue(),
+            media_type="application/pdf",
+            headers={
+                "Content-Disposition": f"attachment; filename=contract_analysis_{analysis_id}.pdf"
+            }
+        )
+    except Exception as e:
+        log_error(f"PDF generation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to generate PDF: {str(e)}")
+
+@app.get("/api/v1/categories")
+async def get_contract_categories():
+    """Get list of supported contract categories"""
+    try:
+        categories = analysis_service.services["classifier"].get_all_categories()
+        return {"categories": categories}
+    except Exception as e:
+        log_error(f"Categories fetch failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to get categories: {str(e)}")
+
+@app.post("/api/v1/validate/file")
+async def validate_contract_file(file: UploadFile = File(...)):
+    """Quick validation endpoint"""
+    try:
+        is_valid, message = validate_file(file)
+        if not is_valid:
+            return {"valid": False, "message": message}
+
+        contract_text = read_contract_file(file)
+
+        # Validate text length
+        is_valid_text, text_message = validate_contract_text(contract_text)
+        if not is_valid_text:
+            return {"valid": False, "message": text_message}
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        report = validator.get_validation_report(contract_text)
+
+        return {
+            "valid": report["scores"]["total"] > 50 and is_valid_text,
+            "message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
+            "confidence": report["scores"]["total"],
+            "report": report
+        }
+
+    except Exception as e:
+        log_error(f"File validation failed: {e}")
+        raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
+
+@app.post("/api/v1/validate/text")
+async def validate_contract_text_endpoint(contract_text: str = Form(...)):
+    """Validate pasted contract text"""
+    try:
+        # Validate text length
+        is_valid, message = validate_contract_text(contract_text)
+        if not is_valid:
+            return {"valid": False, "message": message}
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        report = validator.get_validation_report(contract_text)
+
+        return {
+            "valid": report["scores"]["total"] > 50 and is_valid,
+            "message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
+            "confidence": report["scores"]["total"],
+            "report": report
+        }
+
+    except Exception as e:
+        log_error(f"Text validation failed: {e}")
+        raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
+
+# ============================================================================
+# ERROR HANDLERS
+# ============================================================================
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request, exc):
+    """Handle HTTP exceptions"""
+    return JSONResponse(
+        status_code=exc.status_code,
+        content=ErrorResponse(
+            error=exc.detail,
+            detail=str(exc.detail),
+            timestamp=datetime.now().isoformat()
+        ).dict()
+    )
+
+@app.exception_handler(Exception)
+async def general_exception_handler(request, exc):
+    """Handle general exceptions"""
+    log_error(f"Unhandled exception: {exc}")
+    return JSONResponse(
+        status_code=500,
+        content=ErrorResponse(
+            error="Internal server error",
+            detail=str(exc),
+            timestamp=datetime.now().isoformat()
+        ).dict()
+    )
+
+# ============================================================================
+# STARTUP & SHUTDOWN
+# ============================================================================
+
+@app.on_event("startup")
+async def startup_event():
+    """Startup event - Services are already pre-loaded"""
+    log_info(f"🚀 {settings.APP_NAME} v{settings.APP_VERSION} STARTED")
+    log_info(f"📍 Server: {settings.HOST}:{settings.PORT}")
+    log_info(f"🔧 All models and services pre-loaded")
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Shutdown event"""
+    log_info("🛑 Shutting down server...")
+    log_info("✅ Server shutdown complete")
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(
+        "app:app",
+        host=settings.HOST,
+        port=settings.PORT,
+        reload=settings.RELOAD,
+        workers=1,  # Single worker for synchronous flow
+        log_level=settings.LOG_LEVEL.lower()
+    )
config/model_config.py CHANGED
@@ -4,40 +4,91 @@ from pathlib import Path
 
 class ModelConfig:
     """
-    Central configuration for all models
+    Model-specific configurations - FOR AI MODEL SETTINGS ONLY
     """
-    # Base directories
-    BASE_DIR = Path(__file__).parent.parent
-    MODEL_DIR = BASE_DIR / "models"
-    CACHE_DIR = BASE_DIR / "cache"
+    # Model Architecture Settings
+    LEGAL_BERT = {"model_name"      : "nlpaueb/legal-bert-base-uncased",
+                  "task"            : "clause-extraction",
+                  "max_length"      : 512,
+                  "batch_size"      : 16,
+                  "hidden_dim"      : 768,
+                  "num_layers"      : 12,
+                  "attention_heads" : 12,
+                  }
 
-    # Legal-BERT Configuration (for clause extraction)
-    LEGAL_BERT = {"model_name" : "nlpaueb/legal-bert-base-uncased",
-                  "task"       : "clause-extraction",
-                  "max_length" : 512,
-                  "batch_size" : 16,
-                  "local_path" : MODEL_DIR / "legal-bert",
-                  }
+    # Embedding Model Settings
+    EMBEDDING_MODEL = {"model_name"           : "sentence-transformers/all-MiniLM-L6-v2",
+                       "dimension"            : 384,
+                       "pooling"              : "mean",
+                       "normalize"            : True,
+                       "similarity_threshold" : 0.7,
+                       }
 
-    # Embedding Model for Semantic Search
-    EMBEDDING_MODEL = {"model_name" : "sentence-transformers/all-MiniLM-L6-v2",
-                       "dimension"  : 384,
-                       "local_path" : MODEL_DIR / "embeddings",
-                       }
+    # Classification Model Settings
+    CLASSIFIER_MODEL = {"embedding_dim"  : 384,
+                        "hidden_dim"     : 256,
+                        "num_categories" : 12,
+                        "dropout_rate"   : 0.1,
+                        "learning_rate"  : 2e-5,
+                        "max_seq_length" : 512,
+                        }
 
-    # LLM for Analysis (Ollama)
-    LLM_CONFIG = {"base_url"    : "http://localhost:11434",
-                  "model"       : "mistral:7b",
-                  "temperature" : 0.1,
-                  "max_tokens"  : 5000,
-                  "timeout"     : 120,
-                  }
+    # Clause Extraction Settings
+    CLAUSE_EXTRACTION = {"min_clause_length"    : 50,
+                         "max_clause_length"    : 2000,
+                         "confidence_threshold" : 0.7,
+                         "overlap_threshold"    : 0.3,
+                         "max_clauses_per_doc"  : 50,
+                         }
+
+    # Risk Analysis Settings
+    RISK_ANALYSIS = {"score_ranges" : {"low"      : (0, 40),
+                                       "medium"   : (40, 60),
+                                       "high"     : (60, 80),
+                                       "critical" : (80, 100),
+                                       },
+                     "weight_decay"     : 0.1,
+                     "smoothing_factor" : 0.5,
+                     }
+
+    # Market Comparison Settings
+    MARKET_COMPARISON = {"similarity_threshold" : 0.75,
+                         "min_matches_required" : 3,
+                         "max_comparisons"      : 20,
+                         "embedding_cache_size" : 1000,
+                         }
+
+    # LLM Generation Settings
+    LLM_GENERATION = {"max_tokens"        : 5000,
+                      "temperature"       : 0.1,
+                      "top_p"             : 0.9,
+                      "frequency_penalty" : 0.1,
+                      "presence_penalty"  : 0.1,
+                      "stop_sequences"    : ["\n\n", "###", "---"],
+                      }
+
+    # Text Processing Settings
+    TEXT_PROCESSING = {"chunk_size"          : 512,
+                       "chunk_overlap"       : 50,
+                       "min_sentence_length" : 10,
+                       "max_sentence_length" : 200,
+                       "entity_confidence"   : 0.8,
+                       }
 
 
     @classmethod
-    def ensure_directories(cls):
+    def get_model_config(cls, model_type: str) -> dict:
         """
-        Create necessary directories
+        Get configuration for specific model type
         """
-        cls.MODEL_DIR.mkdir(parents = True, exist_ok = True)
-        cls.CACHE_DIR.mkdir(parents = True, exist_ok = True)
+        config_map = {"legal_bert"        : cls.LEGAL_BERT,
+                      "embedding"         : cls.EMBEDDING_MODEL,
+                      "classifier"        : cls.CLASSIFIER_MODEL,
+                      "clause_extraction" : cls.CLAUSE_EXTRACTION,
+                      "risk_analysis"     : cls.RISK_ANALYSIS,
+                      "market_comparison" : cls.MARKET_COMPARISON,
+                      "llm_generation"    : cls.LLM_GENERATION,
+                      "text_processing"   : cls.TEXT_PROCESSING,
+                      }
+
+        return config_map.get(model_type, {})
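A quick sketch of how the new accessor is meant to be consumed (the lookup keys come from config_map above; the printed values are the defaults defined in this file):

    from config.model_config import ModelConfig

    bert_cfg = ModelConfig.get_model_config("legal_bert")
    print(bert_cfg["model_name"])   # nlpaueb/legal-bert-base-uncased
    print(bert_cfg["max_length"])   # 512

    # Unknown model types fall back to an empty dict instead of raising
    assert ModelConfig.get_model_config("nonexistent") == {}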
config/risk_rules.py CHANGED
@@ -33,71 +33,91 @@ class RiskRules:
     }
 
     # Contract-specific weight adjustments
-    CONTRACT_TYPE_ADJUSTMENTS = {ContractType.EMPLOYMENT : {"restrictive_covenants" : 1.3, "compensation_benefits": 1.4, "termination_rights": 1.2},
-                                 ContractType.SOFTWARE   : {"intellectual_property" : 1.5, "penalties_liability" : 1.3},
-                                 ContractType.NDA        : {"restrictive_covenants" : 1.8, "penalties_liability" : 1.2},
-                                 }
+    CONTRACT_TYPE_ADJUSTMENTS = {
+        ContractType.EMPLOYMENT : {
+            "restrictive_covenants" : 1.3,
+            "compensation_benefits" : 1.4,
+            "termination_rights"    : 1.2,
+        },
+        ContractType.SOFTWARE : {
+            "intellectual_property" : 1.5,
+            "penalties_liability"   : 1.3,
+        },
+        ContractType.NDA : {
+            "restrictive_covenants" : 1.8,
+            "penalties_liability"   : 1.2,
+        },
+        ContractType.CONSULTING : {
+            "compensation_benefits" : 1.3,
+            "termination_rights"    : 1.1,
+        },
+    }
 
     # KEYWORD SEVERITY SCORING (Multi-tier system)
     # Critical keywords (Tier 1: 20-25 points each)
-    CRITICAL_KEYWORDS = {"non-compete"         : 25,
-                         "non-solicit"         : 23,
-                         "non-solicitation"    : 23,
-                         "forfeit"             : 25,
-                         "liquidated damages"  : 24,
-                         "wage withholding"    : 25,
-                         "unlimited liability" : 25,
-                         "joint and several"   : 23,
-                         "perpetual"           : 22,
-                         "irrevocable"         : 20,
-                         }
+    CRITICAL_KEYWORDS = {
+        "non-compete"         : 25,
+        "non-solicit"         : 23,
+        "non-solicitation"    : 23,
+        "forfeit"             : 25,
+        "liquidated damages"  : 24,
+        "wage withholding"    : 25,
+        "unlimited liability" : 25,
+        "joint and several"   : 23,
+        "perpetual"           : 22,
+        "irrevocable"         : 20,
+    }
 
     # High-risk keywords (Tier 2: 12-18 points)
-    HIGH_RISK_KEYWORDS = {"indemnify"             : 18,
-                          "indemnification"       : 18,
-                          "hold harmless"         : 17,
-                          "penalty"               : 18,
-                          "damages"               : 15,
-                          "breach"                : 15,
-                          "default"               : 14,
-                          "immediate termination" : 16,
-                          "without cause"         : 15,
-                          "sole discretion"       : 17,
-                          "at-will"               : 14,
-                          "waive"                 : 16,
-                          "release"               : 15,
-                          }
+    HIGH_RISK_KEYWORDS = {
+        "indemnify"             : 18,
+        "indemnification"       : 18,
+        "hold harmless"         : 17,
+        "penalty"               : 18,
+        "damages"               : 15,
+        "breach"                : 15,
+        "default"               : 14,
+        "immediate termination" : 16,
+        "without cause"         : 15,
+        "sole discretion"       : 17,
+        "at-will"               : 14,
+        "waive"                 : 16,
+        "release"               : 15,
+    }
 
     # Medium-risk keywords (Tier 3: 6-10 points)
-    MEDIUM_RISK_KEYWORDS = {"confidential"   : 8,
-                            "proprietary"    : 8,
-                            "trade secret"   : 10,
-                            "terminate"      : 7,
-                            "termination"    : 7,
-                            "assignment"     : 6,
-                            "exclusive"      : 9,
-                            "warranty"       : 8,
-                            "representation" : 7,
-                            "covenant"       : 8,
-                            "jurisdiction"   : 6,
-                            "governing law"  : 6,
-                            }
+    MEDIUM_RISK_KEYWORDS = {
+        "confidential"   : 8,
+        "proprietary"    : 8,
+        "trade secret"   : 10,
+        "terminate"      : 7,
+        "termination"    : 7,
+        "assignment"     : 6,
+        "exclusive"      : 9,
+        "warranty"       : 8,
+        "representation" : 7,
+        "covenant"       : 8,
+        "jurisdiction"   : 6,
+        "governing law"  : 6,
+    }
 
     # STRUCTURAL PATTERN ANALYSIS (Pattern-based risk detection)
-    RISKY_PATTERNS = [(r'\d+\s*(year|yr|month|mo)s?\s*(non-compete|non-solicit)', 20, "Long duration restrictive covenant"),
-                      (r'(entire|all|worldwide|global)\s*(industry|market|territory)', 18, "Overly broad geographic/industry scope"),
-                      (r'notice\s+period.*\d+\s*days.*employee.*\d+\s*days.*employer', 15, "Unequal notice periods"),
-                      (r'(may|can|shall)\s+(withhold|deduct|retain).*compensation', 22, "Wage withholding clause"),
-                      (r'(unlimited|no\s+limit|without\s+limitation).*liability', 25, "Unlimited liability exposure"),
-                      (r'(sole|absolute|unfettered)\s+discretion', 18, "One-sided discretionary power"),
-                      (r'penalty.*(?:equal\s+to|of|amount).*\$?\d+', 16, "Specific penalty amount"),
-                      (r'(automatically|immediately)\s+(renew|extend)', 12, "Auto-renewal clause"),
-                      (r'waive.*right.*arbitration', 20, "Arbitration rights waiver"),
-                      (r'(all|any).*intellectual\s+property.*created', 17, "Broad IP assignment"),
-                      ]
+    RISKY_PATTERNS = [
+        (r'\d+\s*(year|yr|month|mo)s?\s*(non-compete|non-solicit)', 20, "Long duration restrictive covenant"),
+        (r'(entire|all|worldwide|global)\s*(industry|market|territory)', 18, "Overly broad geographic/industry scope"),
+        (r'notice\s+period.*\d+\s*days.*employee.*\d+\s*days.*employer', 15, "Unequal notice periods"),
+        (r'(may|can|shall)\s+(withhold|deduct|retain).*compensation', 22, "Wage withholding clause"),
+        (r'(unlimited|no\s+limit|without\s+limitation).*liability', 25, "Unlimited liability exposure"),
+        (r'(sole|absolute|unfettered)\s+discretion', 18, "One-sided discretionary power"),
+        (r'penalty.*(?:equal\s+to|of|amount).*\$?\d+', 16, "Specific penalty amount"),
+        (r'(automatically|immediately)\s+(renew|extend)', 12, "Auto-renewal clause"),
+        (r'waive.*right.*arbitration', 20, "Arbitration rights waiver"),
+        (r'(all|any).*intellectual\s+property.*created', 17, "Broad IP assignment"),
+    ]
 
     # CLAUSE-LEVEL RISK FACTORS (Detailed clause analysis)
-    CLAUSE_RISK_FACTORS = {"non-compete": {
+    CLAUSE_RISK_FACTORS = {
+        "non-compete": {
             "base_risk": 70,
             "duration_check": {
                 # months: risk_adjustment
@@ -189,40 +209,6 @@ class RiskRules:
                 "work for hire limited": -10
             }
         },
-
-        "liability": {
-            "base_risk": 65,
-            "red_flags": {
-                "unlimited": +30,
-                "consequential damages": +15,
-                "indirect damages": +12,
-                "punitive damages": +18,
-                "no cap": +25
-            },
-            "protections": {
-                "liability cap": -20,
-                "mutual cap": -15,
-                "limited to fees paid": -18
-            }
-        },
-
-        "confidentiality": {
-            "base_risk": 45,
-            "red_flags": {
-                "perpetual": +20,
-                "forever": +20,
-                "indefinite": +18,
-                "all information": +15,
-                "any information": +15
-            },
-            "reasonable_terms": {
-                "3 years": -5,
-                "5 years": 0,
-                "7 years": +5,
-                "marked confidential": -8,
-                "reasonably necessary": -10
-            }
-        }
     }
 
     # =========================================================================
@@ -249,13 +235,6 @@ class RiskRules:
            "consulting": {"generous": 3, "standard": 1, "restrictive": 0.5},
            "general": {"generous": 12, "standard": 6, "restrictive": 1}
         },
-
-        "ip_assignment_scope": {
-            "tech": "work_product_only",           # Standard
-            "creative": "commissioned_work_only",  # Standard
-            "consulting": "deliverables_only",     # Standard
-            "general": "work_for_hire"             # Standard
-        }
     }
 
     # =========================================================================
@@ -298,11 +277,6 @@ class RiskRules:
            "risk_if_missing": 15,
            "categories": ["general"]
         },
-        "change_control_process": {
-            "importance": "medium",
-            "risk_if_missing": 10,
-            "categories": ["general"]
-        }
     }
 
     # =========================================================================
@@ -329,5 +303,4 @@ class RiskRules:
 
         # Normalize to sum to 100
         total = sum(adjusted.values())
-        return {k: (v / total) * 100 for k, v in adjusted.items()}
-
+        return {k: (v / total) * 100 for k, v in adjusted.items()}
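A worked example of the adjust-then-normalize pattern that closes this file (the base category weights here are illustrative; only the multiplier table and the final normalization line are visible in the diff):

    # Equal base weights across five risk categories (hypothetical)
    base = {"restrictive_covenants": 20, "compensation_benefits": 20,
            "termination_rights": 20, "intellectual_property": 20,
            "penalties_liability": 20}

    # NDA row from CONTRACT_TYPE_ADJUSTMENTS above
    multipliers = {"restrictive_covenants": 1.8, "penalties_liability": 1.2}

    adjusted = {k: v * multipliers.get(k, 1.0) for k, v in base.items()}
    total = sum(adjusted.values())
    normalized = {k: (v / total) * 100 for k, v in adjusted.items()}

    assert abs(sum(normalized.values()) - 100) < 1e-9
    print(round(normalized["restrictive_covenants"], 1))  # 30.0, up from 20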
config/settings.py CHANGED
@@ -7,12 +7,12 @@ from pydantic_settings import BaseSettings
 
 class Settings(BaseSettings):
     """
-    Application-wide settings
+    Application-wide settings: primary configuration source
     """
     # Application Info
     APP_NAME    : str = "AI Contract Risk Analyzer"
     APP_VERSION : str = "1.0.0"
-    API_PREFIX  : str = "/api/"
+    API_PREFIX  : str = "/api/v1/"
 
     # Server Configuration
     HOST : str = "0.0.0.0"
@@ -28,43 +28,48 @@ class Settings(BaseSettings):
 
     # File Upload Settings
     MAX_UPLOAD_SIZE    : int = 10 * 1024 * 1024   # 10 MB
-    ALLOWED_EXTENSIONS : list = [".pdf", ".docx"]
+    ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
     UPLOAD_DIR         : Path = Path("uploads")
 
-    # Model Settings
-    MODEL_CACHE_SIZE       : int = 3      # Number of models to keep in memory
-    MODEL_DOWNLOAD_TIMEOUT : int = 300    # 5 minutes
-    USE_GPU                : bool = True  # Automatically detect and use GPU if available
+    # Model Management Settings
+    MODEL_CACHE_SIZE       : int = 3      # Number of models to keep in memory
+    MODEL_DOWNLOAD_TIMEOUT : int = 1800   # 30 minutes
+    USE_GPU                : bool = True  # Automatically detect and use GPU if available
 
-    # Ollama Settings
-    OLLAMA_BASE_URL    : str = "http://localhost:11434"
-    OLLAMA_MODEL       : str = "llama3:8b"
-    OLLAMA_TIMEOUT     : int = 120
-    OLLAMA_TEMPERATURE : float = 0.1
+    # External API Settings
+    OLLAMA_BASE_URL    : str = "http://localhost:11434"
+    OLLAMA_MODEL       : str = "llama3:8b"
+    OLLAMA_TIMEOUT     : int = 300
+    OLLAMA_TEMPERATURE : float = 0.1
 
-    # Analysis Settings
-    MIN_CONTRACT_LENGTH    : int = 300     # Minimum characters for valid contract
-    MAX_CONTRACT_LENGTH    : int = 500000  # Maximum characters (500KB text)
-    MAX_CLAUSES_TO_ANALYZE : int = 15
+    # External API Keys
+    OPENAI_API_KEY    : Optional[str] = None
+    ANTHROPIC_API_KEY : Optional[str] = None
 
-    # Logging
-    LOG_LEVEL  : str = "INFO"
-    LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    # Analysis Limits
+    MIN_CONTRACT_LENGTH    : int = 300     # Minimum characters for valid contract
+    MAX_CONTRACT_LENGTH    : int = 500000  # Maximum characters (500KB text)
+    MAX_CLAUSES_TO_ANALYZE : int = 15
+
+    # Logging Settings
+    LOG_LEVEL  : str = "INFO"
+    LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
     LOG_FILE   : Optional[Path] = Path("logs/app.log")
 
     # Cache Settings
-    ENABLE_CACHE : bool = True
-    CACHE_TTL    : int = 3600   # 1 hour
-    CACHE_DIR    : Path = Path("cache")
+    ENABLE_CACHE : bool = True
+    CACHE_TTL    : int = 3600   # 1 hour
+    CACHE_DIR    : Path = Path("cache")
 
-    # Rate Limiting
-    RATE_LIMIT_ENABLED  : bool = True
-    RATE_LIMIT_REQUESTS : int = 10
-    RATE_LIMIT_PERIOD   : int = 60   # seconds
+    # Rate Limiting Settings
+    RATE_LIMIT_ENABLED  : bool = True
+    RATE_LIMIT_REQUESTS : int = 10
+    RATE_LIMIT_PERIOD   : int = 60   # seconds
 
     # PDF Report Settings
-    PDF_FONT_SIZE : int = 10
-    PDF_MARGIN    : float = 0.5   # inches
+    PDF_FONT_SIZE : int = 10
+    PDF_MARGIN    : float = 0.5   # inches
+    PDF_PAGE_SIZE : str = "letter"
 
 
     class Config:
@@ -84,4 +89,4 @@ class Settings(BaseSettings):
 
 
 # Global settings instance
-settings = Settings()
+settings = Settings()
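Because Settings extends pydantic's BaseSettings, every field above can be overridden from the environment without touching the code. A small sketch, with the override values chosen arbitrarily:

    import os

    # Environment variables matching field names take precedence over defaults
    os.environ["OLLAMA_MODEL"] = "mistral:7b"
    os.environ["MAX_CLAUSES_TO_ANALYZE"] = "20"

    from config.settings import Settings

    s = Settings()
    print(s.OLLAMA_MODEL)            # mistral:7b
    print(s.MAX_CLAUSES_TO_ANALYZE)  # 20, parsed from the env string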
launch.py CHANGED
@@ -0,0 +1,145 @@
+"""
+Launch script for AI Contract Risk Analyzer
+Starts both API and frontend (if available)
+"""
+
+import subprocess
+import sys
+import time
+import requests
+from pathlib import Path
+
+def check_ollama():
+    """Check if Ollama is running"""
+    try:
+        response = requests.get("http://localhost:11434/api/tags", timeout=5)
+        if response.status_code == 200:
+            print("✓ Ollama is running")
+            return True
+    except:
+        pass
+
+    print("✗ Ollama not running. Start with: ollama serve")
+    return False
+
+def check_models():
+    """Check if required models are available"""
+    try:
+        response = requests.get("http://localhost:11434/api/tags", timeout=5)
+        models = response.json().get('models', [])
+        model_names = [m['name'] for m in models]
+
+        required = "llama3:8b"
+        if any(required in name for name in model_names):
+            print(f"✓ Model {required} available")
+            return True
+        else:
+            print(f"✗ Model {required} not found. Pull with: ollama pull llama3:8b")
+            return False
+    except:
+        return False
+
+def start_api():
+    """Start FastAPI server"""
+    print("\n" + "="*60)
+    print("Starting FastAPI Server...")
+    print("="*60)
+
+    subprocess.Popen([
+        sys.executable, "-m", "uvicorn",
+        "app:app",
+        "--host", "0.0.0.0",
+        "--port", "8000",
+        "--reload"
+    ])
+
+    # Wait for server to start
+    time.sleep(3)
+
+    try:
+        response = requests.get("http://localhost:8000/api/v1/health", timeout=5)
+        if response.status_code == 200:
+            print("✓ API Server running at: http://localhost:8000")
+            print("✓ Documentation at: http://localhost:8000/api/docs")
+            return True
+    except:
+        pass
+
+    print("✗ Failed to start API server")
+    return False
+
+def start_frontend():
+    """Start frontend server (if available)"""
+    if not Path("static/index.html").exists():
+        print("\n✗ Frontend not found at static/index.html")
+        return False
+
+    print("\n" + "="*60)
+    print("Starting Frontend Server...")
+    print("="*60)
+
+    subprocess.Popen([
+        sys.executable, "-m", "http.server", "3000",
+        "--directory", "static"
+    ])
+
+    time.sleep(2)
+
+    try:
+        response = requests.get("http://localhost:3000", timeout=5)
+        if response.status_code == 200:
+            print("✓ Frontend running at: http://localhost:3000")
+            return True
+    except:
+        pass
+
+    print("✗ Failed to start frontend server")
+    return False
+
+def main():
+    """Main launch function"""
+    print("="*60)
+    print("AI Contract Risk Analyzer - Launch Script")
+    print("="*60)
+
+    # Pre-flight checks
+    print("\nPre-flight checks:")
+    print("-"*60)
+
+    ollama_ok = check_ollama()
+    models_ok = check_models() if ollama_ok else False
+
+    if not ollama_ok:
+        print("\n⚠️ Warning: Ollama not running. Some features may not work.")
+        response = input("Continue anyway? (y/n): ")
+        if response.lower() != 'y':
+            return
+
+    # Start services
+    api_ok = start_api()
+
+    if not api_ok:
+        print("\n✗ Failed to start API. Exiting.")
+        return
+
+    frontend_ok = start_frontend()
+
+    # Summary
+    print("\n" + "="*60)
+    print("Launch Complete!")
+    print("="*60)
+    print(f"API Server: {'✓' if api_ok else '✗'} http://localhost:8000")
+    print(f"API Docs:   {'✓' if api_ok else '✗'} http://localhost:8000/api/docs")
+    print(f"Frontend:   {'✓' if frontend_ok else '✗'} http://localhost:3000")
+    print("\nPress Ctrl+C to stop all services")
+    print("="*60)
+
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("\n\nShutting down...")
+        sys.exit(0)
+
+if __name__ == "__main__":
+    main()
model_manager/llm_manager.py CHANGED
@@ -141,7 +141,7 @@ class LLMManager:
         Check if Ollama server is available
         """
         try:
-            response = requests.get(f"{self.ollama_base_url}/api/tags", timeout=5)
+            response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
             available = (response.status_code == 200)
 
             if available:
model_manager/model_cache.py CHANGED
@@ -27,6 +27,7 @@ class ModelCache:
     def __init__(self, cache_dir: Path, ttl_seconds: int = 3600):
         self.cache_dir = Path(cache_dir)
         self.cache_dir.mkdir(parents = True, exist_ok = True)
+
         self.ttl_seconds = ttl_seconds
         self.logger = ContractAnalyzerLogger.get_logger()
 
model_manager/model_loader.py CHANGED
@@ -58,7 +58,11 @@ class ModelLoader:
             return info.model, info.tokenizer
 
         # Mark as loading
-        self.registry.register(ModelType.LEGAL_BERT,ModelInfo(name = "legal-bert", type = ModelType.LEGAL_BERT, status = ModelStatus.LOADING))
+        self.registry.register(ModelType.LEGAL_BERT, ModelInfo(name = "legal-bert",
+                                                               type = ModelType.LEGAL_BERT,
+                                                               status = ModelStatus.LOADING,
+                                                               )
+                               )
 
         try:
             config = self.config.LEGAL_BERT
model_manager/model_registry.py CHANGED
@@ -54,6 +54,7 @@ class ModelInfo:
     last_accessed : Optional[datetime] = None
     metadata      : Dict[str, Any] = field(default_factory = dict)
 
+
     def mark_accessed(self):
         """
         Update access statistics
@@ -83,7 +84,7 @@ class ModelRegistry:
         if cls._instance is None:
             with cls._lock:
                 if cls._instance is None:
-                    cls._instance = super().__new__(cls)
+                    cls._instance = super().__new__(cls)
                     cls._instance._initialized = False
 
         return cls._instance
@@ -123,6 +124,7 @@ class ModelRegistry:
         """
         with self._model_lock:
             info = self._registry.get(model_type)
+
            if info:
                info.mark_accessed()
                log_info(f"Model accessed: {model_type.value}",
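The double-checked locking restored above makes ModelRegistry a process-wide singleton; a quick illustrative check (assuming the module imports cleanly on its own):

    from model_manager.model_registry import ModelRegistry

    a = ModelRegistry()
    b = ModelRegistry()
    assert a is b   # every caller shares one registry instance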
reporter/pdf_generator.py CHANGED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DEPENDENCIES
2
+ import os
3
+ from typing import Any
4
+ from io import BytesIO
5
+ from typing import Dict
6
+ from typing import List
7
+ from typing import Optional
8
+ from datetime import datetime
9
+ from reportlab.lib import colors
10
+ from reportlab.pdfgen import canvas
11
+ from reportlab.platypus import Image
12
+ from reportlab.platypus import Table
13
+ from reportlab.lib.units import inch
14
+ from reportlab.platypus import Spacer
15
+ from reportlab.lib.enums import TA_LEFT
16
+ from reportlab.platypus import Paragraph
17
+ from reportlab.platypus import PageBreak
18
+ from reportlab.graphics import renderPDF
19
+ from reportlab.platypus import TableStyle
20
+ from reportlab.lib.enums import TA_CENTER
21
+ from reportlab.lib.enums import TA_JUSTIFY
22
+ from reportlab.lib.pagesizes import letter
23
+ from reportlab.platypus import KeepTogether
24
+ from reportlab.graphics.shapes import Circle
25
+ from reportlab.graphics.shapes import String
26
+ from reportlab.graphics.shapes import Drawing
27
+ from reportlab.lib.styles import ParagraphStyle
28
+ from reportlab.platypus import SimpleDocTemplate
29
+ from reportlab.lib.styles import getSampleStyleSheet
30
+
31
+
32
+
33
+ class PDFReportGenerator:
34
+ """
35
+ Generate professional PDF reports matching the sample style
36
+ """
37
+ def __init__(self):
38
+ self.styles = getSampleStyleSheet()
39
+ self._setup_custom_styles()
40
+
41
+
42
+ def _setup_custom_styles(self):
43
+ """
44
+ Set up custom paragraph styles
45
+ """
46
+ # Title style
47
+ self.styles.add(ParagraphStyle(name = 'ReportTitle',
48
+ parent = self.styles['Heading1'],
49
+ fontSize = 24,
50
+ textColor = colors.HexColor('#1a1a1a'),
51
+ spaceAfter = 20,
52
+ alignment = TA_LEFT,
53
+ fontName = 'Helvetica-Bold',
54
+ )
55
+ )
56
+
57
+ # Section heading
58
+ self.styles.add(ParagraphStyle(name = 'SectionHeading',
59
+ parent = self.styles['Heading2'],
60
+ fontSize = 16,
61
+ textColor = colors.HexColor('#1a1a1a'),
62
+ spaceAfter = 12,
63
+ spaceBefore = 20,
64
+ fontName = 'Helvetica-Bold',
65
+ )
66
+ )
67
+
68
+ # Body text
69
+ self.styles.add(ParagraphStyle(
70
+ name='ReportBody',  # renamed: getSampleStyleSheet() already defines 'BodyText', and StyleSheet1.add() raises KeyError on duplicate names
71
+ parent=self.styles['Normal'],
72
+ fontSize=10,
73
+ leading=14,
74
+ textColor=colors.HexColor('#333333'),
75
+ alignment=TA_JUSTIFY,
76
+ fontName='Helvetica'
77
+ ))
78
+
79
+ # Bullet point
80
+ self.styles.add(ParagraphStyle(
81
+ name='BulletPoint',
82
+ parent=self.styles['Normal'],
83
+ fontSize=10,
84
+ leading=14,
85
+ textColor=colors.HexColor('#333333'),
86
+ leftIndent=20,
87
+ bulletIndent=10,
88
+ fontName='Helvetica'
89
+ ))
90
+
91
+ # Table header
92
+ self.styles.add(ParagraphStyle(
93
+ name='TableHeader',
94
+ parent=self.styles['Normal'],
95
+ fontSize=10,
96
+ textColor=colors.HexColor('#1a1a1a'),
97
+ fontName='Helvetica-Bold'
98
+ ))
99
+
100
+ # Footer
101
+ self.styles.add(ParagraphStyle(
102
+ name='Footer',
103
+ parent=self.styles['Normal'],
104
+ fontSize=8,
105
+ textColor=colors.HexColor('#666666'),
106
+ alignment=TA_CENTER,
107
+ fontName='Helvetica'
108
+ ))
109
+
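Note that `getSampleStyleSheet()` ships with predefined names such as 'Normal' and 'BodyText', and `StyleSheet1.add()` raises `KeyError` on a duplicate name (hence the 'ReportBody' rename above). A defensive sketch using the stylesheet's `byName` mapping (`_add_style` is a hypothetical helper, not part of this commit):

    def _add_style(self, style):
        # register only styles whose names the sample stylesheet does not already define
        if style.name not in self.styles.byName:
            self.styles.add(style)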
110
+ def _draw_risk_score_circle(self, score: int) -> Drawing:
111
+ """Draw the risk score circle graphic"""
112
+ d = Drawing(150, 150)
113
+
114
+ # Determine color based on score
115
+ if score >= 80:
116
+ color = colors.HexColor('#dc2626')
117
+ elif score >= 60:
118
+ color = colors.HexColor('#f97316')
119
+ elif score >= 40:
120
+ color = colors.HexColor('#ca8a04')
121
+ else:
122
+ color = colors.HexColor('#16a34a')
123
+
124
+ # Background circle
125
+ bg_circle = Circle(75, 75, 60)
126
+ bg_circle.fillColor = colors.HexColor('#f0f0f0')
127
+ bg_circle.strokeColor = None
128
+ d.add(bg_circle)
129
+
130
+ # Score circle
131
+ score_circle = Circle(75, 75, 55)
132
+ score_circle.fillColor = color
133
+ score_circle.strokeColor = None
134
+ d.add(score_circle)
135
+
136
+ # Inner white circle
137
+ inner_circle = Circle(75, 75, 45)
138
+ inner_circle.fillColor = colors.white
139
+ inner_circle.strokeColor = None
140
+ d.add(inner_circle)
141
+
142
+ # Score text
143
+ score_text = String(75, 70, str(score), textAnchor='middle')
144
+ score_text.fontSize = 36
145
+ score_text.fontName = 'Helvetica-Bold'
146
+ score_text.fillColor = color
147
+ d.add(score_text)
148
+
149
+ return d
150
+
151
+ def _get_risk_color(self, score: int) -> colors.Color:
152
+ """Get color based on risk score"""
153
+ if score >= 80:
154
+ return colors.HexColor('#dc2626')
155
+ elif score >= 60:
156
+ return colors.HexColor('#f97316')
157
+ elif score >= 40:
158
+ return colors.HexColor('#ca8a04')
159
+ else:
160
+ return colors.HexColor('#16a34a')
161
+
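The two color methods above share the same four bands; a worked mapping (illustrative scores only):

    # >= 80 -> red #dc2626, >= 60 -> orange #f97316, >= 40 -> yellow #ca8a04, else green #16a34a
    bands = [(80, 'red'), (60, 'orange'), (40, 'yellow'), (0, 'green')]
    def band(score):
        return next(label for floor, label in bands if score >= floor)
    print([band(s) for s in (85, 72, 45, 20)])  # ['red', 'orange', 'yellow', 'green']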
162
+ def _create_header_footer(self, canvas, doc):
163
+ """Add header and footer to each page"""
164
+ canvas.saveState()
165
+
166
+ # Header
167
+ canvas.setFont('Helvetica-Bold', 12)
168
+ canvas.drawString(0.75 * inch, letter[1] - 0.5 * inch,
169
+ "AI Contract Risk Analysis Report")
170
+
171
+ # Footer
172
+ canvas.setFont('Helvetica', 8)
173
+ canvas.setFillColor(colors.HexColor('#666666'))
174
+
175
+ # Page number
176
+ page_num = f"Page {doc.page}"  # total page count is not known during a single-pass build
177
+ canvas.drawString(7 * inch, 0.5 * inch, page_num)
178
+
179
+ # Legal disclaimer
180
+ disclaimer = "For informational purposes only. Not legal advice."
181
+ canvas.drawCentredString(letter[0] / 2, 0.5 * inch, disclaimer)
182
+
183
+ canvas.restoreState()
184
+
185
+ def generate_report(self, analysis_result: Dict[str, Any],
186
+ output_path: Optional[str] = None) -> BytesIO:
187
+ """
188
+ Generate PDF report from analysis results
189
+
190
+ Args:
191
+ analysis_result: Analysis result dictionary from the API
192
+ output_path: Optional file path to save PDF
193
+
194
+ Returns:
195
+ BytesIO buffer containing the PDF
196
+ """
197
+ # Create buffer
198
+ buffer = BytesIO()
199
+
200
+ # Create document
201
+ doc = SimpleDocTemplate(
202
+ buffer if not output_path else output_path,
203
+ pagesize=letter,
204
+ rightMargin=0.75*inch,
205
+ leftMargin=0.75*inch,
206
+ topMargin=1*inch,
207
+ bottomMargin=1*inch
208
+ )
209
+
210
+ # Build story
211
+ story = []
212
+
213
+ # Title and Risk Score (Page 1)
214
+ story.extend(self._build_page_1(analysis_result))
215
+ story.append(PageBreak())
216
+
217
+ # Negotiation Points (Page 2)
218
+ story.extend(self._build_page_2(analysis_result))
219
+ story.append(PageBreak())
220
+
221
+ # Risk Category Breakdown (Page 3)
222
+ story.extend(self._build_page_3(analysis_result))
223
+
224
+ # Clause-by-Clause Analysis (Page 4+)
225
+ story.append(PageBreak())
226
+ story.extend(self._build_clause_analysis(analysis_result))
227
+
228
+ # Build PDF
229
+ doc.build(story, onFirstPage=self._create_header_footer,
230
+ onLaterPages=self._create_header_footer)
231
+
232
+ # Rewind and return the in-memory buffer; when output_path is set the PDF goes to disk and the returned buffer stays empty
233
+ if not output_path:
234
+ buffer.seek(0)
235
+ return buffer
236
+
237
+ return buffer
238
+
239
+ def _build_page_1(self, result: Dict) -> List:
240
+ """Build page 1 content: Title, Risk Score, Executive Summary, Key Items"""
241
+ elements = []
242
+
243
+ # Title
244
+ elements.append(Paragraph("AI Contract Risk Analysis Report",
245
+ self.styles['ReportTitle']))
246
+ elements.append(Spacer(1, 0.1*inch))
247
+
248
+ # Risk Score Circle
249
+ risk_score = result['risk_analysis']['overall_score']
250
+ elements.append(self._draw_risk_score_circle(risk_score))
251
+ elements.append(Spacer(1, 0.2*inch))
252
+
253
+ # Executive Summary
254
+ elements.append(Paragraph("Executive Summary",
255
+ self.styles['SectionHeading']))
256
+ elements.append(Paragraph(result['executive_summary'],
257
+ self.styles['ReportBody']))
258
+ elements.append(Spacer(1, 0.2*inch))
259
+
260
+ # Unfavorable Terms
261
+ elements.append(Paragraph("Unfavorable Terms",
262
+ self.styles['SectionHeading']))
263
+
264
+ for term in result['unfavorable_terms'][:8]: # Limit to 8 items
265
+ bullet_text = f"<bullet>•</bullet> <b>{term.get('clause_reference', term['term'])}:</b> {term['explanation']}"
266
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
267
+ elements.append(Spacer(1, 0.05*inch))
268
+
269
+ elements.append(Spacer(1, 0.2*inch))
270
+
271
+ # Missing Protections
272
+ elements.append(Paragraph("Missing Protections",
273
+ self.styles['SectionHeading']))
274
+
275
+ for protection in result['missing_protections'][:6]: # Limit to 6 items
276
+ bullet_text = f"<bullet>•</bullet> <b>{protection['protection']}:</b> {protection['explanation']}"
277
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
278
+ elements.append(Spacer(1, 0.05*inch))
279
+
280
+ return elements
281
+
282
+ def _build_page_2(self, result: Dict) -> List:
283
+ """Build page 2 content: Negotiation Points"""
284
+ elements = []
285
+
286
+ elements.append(Paragraph("Negotiation Points",
287
+ self.styles['SectionHeading']))
288
+ elements.append(Spacer(1, 0.1*inch))
289
+
290
+ negotiation_points = result.get('negotiation_points', [])
291
+
292
+ if negotiation_points:
293
+ for point in negotiation_points[:7]: # Limit to 7 points
294
+ bullet_text = f"<bullet>•</bullet> {point['issue']}: {point['rationale']}"
295
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
296
+ elements.append(Spacer(1, 0.1*inch))
297
+ else:
298
+ # Fallback to unfavorable terms if negotiation points not available
299
+ for term in result['unfavorable_terms'][:7]:
300
+ if term.get('suggested_fix'):
301
+ bullet_text = f"<bullet>•</bullet> {term['term']}: {term['suggested_fix']}"
302
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
303
+ elements.append(Spacer(1, 0.1*inch))
304
+
305
+ return elements
306
+
307
+ def _build_page_3(self, result: Dict) -> List:
308
+ """Build page 3 content: Risk Category Breakdown"""
309
+ elements = []
310
+
311
+ elements.append(Paragraph("Risk Category Breakdown",
312
+ self.styles['SectionHeading']))
313
+ elements.append(Spacer(1, 0.15*inch))
314
+
315
+ # Create table data
316
+ table_data = [
317
+ [
318
+ Paragraph('<b>Category</b>', self.styles['TableHeader']),
319
+ Paragraph('<b>Score</b>', self.styles['TableHeader']),
320
+ Paragraph('<b>Summary</b>', self.styles['TableHeader'])
321
+ ]
322
+ ]
323
+
324
+ risk_breakdown = result['risk_analysis'].get('risk_breakdown', [])
325
+
326
+ for category in risk_breakdown:
327
+ score_color = self._get_risk_color(category['score'])
328
+
329
+ category_cell = Paragraph(category['category'], self.styles['ReportBody'])
330
+ score_cell = Paragraph(
331
+ f'<font color="{score_color.hexval()}"><b>{category["score"]}</b></font>',
332
+ self.styles['TableHeader']
333
+ )
334
+ summary_cell = Paragraph(category['summary'], self.styles['ReportBody'])
335
+
336
+ table_data.append([category_cell, score_cell, summary_cell])
337
+
338
+ # Create table
339
+ table = Table(table_data, colWidths=[1.8*inch, 0.7*inch, 4*inch])
340
+ table.setStyle(TableStyle([
341
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
342
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
343
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
344
+ ('ALIGN', (1, 0), (1, -1), 'CENTER'),
345
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
346
+ ('FONTSIZE', (0, 0), (-1, -1), 10),
347
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
348
+ ('TOPPADDING', (0, 1), (-1, -1), 10),
349
+ ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
350
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
351
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
352
+ ]))
353
+
354
+ elements.append(table)
355
+
356
+ return elements
357
+
358
+ def _build_clause_analysis(self, result: Dict) -> List:
359
+ """Build clause-by-clause analysis section"""
360
+ elements = []
361
+
362
+ elements.append(Paragraph("Clause-by-Clause Analysis",
363
+ self.styles['SectionHeading']))
364
+ elements.append(Spacer(1, 0.15*inch))
365
+
366
+ # Create table data
367
+ table_data = [
368
+ [
369
+ Paragraph('<b>Clause</b>', self.styles['TableHeader']),
370
+ Paragraph('<b>Risk Level</b>', self.styles['TableHeader']),
371
+ Paragraph('<b>Analysis</b>', self.styles['TableHeader']),
372
+ Paragraph('<b>Recommendation</b>', self.styles['TableHeader'])
373
+ ]
374
+ ]
375
+
376
+ # Get unfavorable terms and interpretations
377
+ unfavorable_terms = result.get('unfavorable_terms', [])
378
+ interpretations = result.get('clause_interpretations', [])
379
+
380
+ # Combine and process
381
+ processed_clauses = []
382
+
383
+ for term in unfavorable_terms[:10]: # Limit to 10 clauses
384
+ clause_ref = term.get('clause_reference', term['term'])
385
+
386
+ # Find matching interpretation if available
387
+ analysis_text = term['explanation']
388
+ recommendation_text = term.get('suggested_fix', 'Negotiate or seek legal advice.')
389
+
390
+ # Determine risk level
391
+ severity = term.get('severity', 'high')
392
+ if severity == 'critical':
393
+ risk_level = 'Critical'
394
+ risk_color = colors.HexColor('#dc2626')
395
+ elif severity == 'high':
396
+ risk_level = 'High'
397
+ risk_color = colors.HexColor('#f97316')
398
+ else:
399
+ risk_level = 'Medium'
400
+ risk_color = colors.HexColor('#ca8a04')
401
+
402
+ clause_cell = Paragraph(clause_ref, self.styles['ReportBody'])
403
+ risk_cell = Paragraph(
404
+ f'<font color="{risk_color.hexval()}"><b>{risk_level}</b></font>',
405
+ self.styles['TableHeader']
406
+ )
407
+ analysis_cell = Paragraph(analysis_text, self.styles['ReportBody'])
408
+ recommendation_cell = Paragraph(recommendation_text, self.styles['ReportBody'])
409
+
410
+ table_data.append([clause_cell, risk_cell, analysis_cell, recommendation_cell])
411
+
412
+ # Create table
413
+ table = Table(table_data, colWidths=[1.5*inch, 0.8*inch, 2.2*inch, 2*inch])
414
+ table.setStyle(TableStyle([
415
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
416
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
417
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
418
+ ('ALIGN', (1, 0), (1, -1), 'CENTER'),
419
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
420
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
421
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
422
+ ('TOPPADDING', (0, 1), (-1, -1), 10),
423
+ ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
424
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
425
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
426
+ ]))
427
+
428
+ elements.append(table)
429
+
430
+ return elements
431
+
432
+
433
+ def generate_pdf_report(analysis_result: Dict[str, Any],
434
+ output_path: Optional[str] = None) -> BytesIO:
435
+ """
436
+ Convenience function to generate PDF report
437
+
438
+ Args:
439
+ analysis_result: Complete analysis result from the API
440
+ output_path: Optional file path to save PDF
441
+
442
+ Returns:
443
+ BytesIO buffer containing the PDF
444
+ """
445
+ generator = PDFReportGenerator()
446
+ return generator.generate_report(analysis_result, output_path)
447
+
448
+
449
+ if __name__ == "__main__":
450
+ # Test with sample data
451
+ sample_result = {
452
+ "analysis_id": "test-123",
453
+ "timestamp": datetime.now().isoformat(),
454
+ "risk_analysis": {
455
+ "overall_score": 85,
456
+ "risk_level": "CRITICAL",
457
+ "risk_breakdown": [
458
+ {
459
+ "category": "Restrictive Covenants",
460
+ "score": 95,
461
+ "summary": "The agreement contains exceptionally broad and long-lasting non-compete (24 months) and non-solicitation (5 years) clauses."
462
+ },
463
+ {
464
+ "category": "Penalties & Termination",
465
+ "score": 90,
466
+ "summary": "The contract includes severe penalties for breach, including forfeiture of earned salary."
467
+ }
468
+ ]
469
+ },
470
+ "executive_summary": "This employment agreement is heavily skewed in favor of the Employer, presenting a very high risk.",
471
+ "unfavorable_terms": [
472
+ {
473
+ "term": "Undefined Post-Probation Salary",
474
+ "clause_reference": "Clause 8.2",
475
+ "severity": "critical",
476
+ "explanation": "Post-probation salary is undefined ('as discussed').",
477
+ "suggested_fix": "Insist that the exact salary be explicitly stated."
478
+ }
479
+ ],
480
+ "missing_protections": [
481
+ {
482
+ "protection": "Defined Post-Probation Salary",
483
+ "importance": "critical",
484
+ "explanation": "The contract lacks a specific, written salary commitment."
485
+ }
486
+ ],
487
+ "negotiation_points": [
488
+ {
489
+ "issue": "Post-probation salary",
490
+ "rationale": "Must be explicitly defined in writing before signing."
491
+ }
492
+ ]
493
+ }
494
+
495
+ buffer = generate_pdf_report(sample_result, "test_report.pdf")
496
+ print("Test PDF generated successfully!")
requirements.txt CHANGED
@@ -1,15 +1,32 @@
1
  # LLM Providers
2
- openai>=1.0.0 # OpenAI API
3
- anthropic>=0.18.0 # Anthropic Claude API
 
4
  ollama  # Ollama Python client (pip package name is lowercase)
5
 
6
- requests # For Ollama
7
- transformers # For Legal-BERT
8
- sentence-transformers # For embeddings
9
- torch # PyTorch
10
- pypdf2
11
- python_docx
12
- PyMuPDF
13
- spacy
14
- pydantic-settings
15
- pydantic
 
1
+ # FastAPI & Server
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ python-multipart==0.0.6
5
+
6
+ # ML & NLP
7
+ transformers==4.35.2
8
+ torch==2.1.1
9
+ sentence-transformers==2.2.2
10
+ spacy
11
+
12
+ # Document Processing
13
+ PyPDF2==3.0.1
14
+ PyMuPDF==1.23.8
15
+ python-docx==1.1.0
16
+
17
+
18
  # LLM Providers
19
+ openai>=1.3.0
20
+ anthropic>=0.18.0
21
+ requests==2.31.0
22
  ollama  # Ollama Python client (pip package name is lowercase)
23
 
24
+ # Data & Validation
25
+ pydantic==2.5.0
26
+ pydantic-settings==2.1.0
27
+
28
+ # Utilities
29
+ python-dotenv==1.0.0
30
+
31
+ # PDF report generation
32
+ reportlab>=4.0.0
 
services/clause_extractor.py CHANGED
@@ -1,23 +1,26 @@
1
- """
2
- Advanced Clause Extractor using Legal-BERT + Structural Patterns
3
- Uses nlpaueb/legal-bert-base-uncased for semantic clause understanding
4
- """
5
-
6
- import torch
7
  import re
8
- from typing import List, Dict, Tuple, Optional, Any
9
- from dataclasses import dataclass, field
10
- from collections import defaultdict
11
  import numpy as np
12
  from sentence_transformers import util
13
 
14
  # Import utilities
15
- import sys
16
- from pathlib import Path
17
  sys.path.append(str(Path(__file__).parent.parent))
18
 
19
- from utils.logger import ContractAnalyzerLogger, log_info, log_error
 
20
  from utils.text_processor import TextProcessor
 
21
 
22
 
23
  @dataclass
@@ -25,37 +28,40 @@ class ExtractedClause:
25
  """
26
  Extracted clause with comprehensive metadata
27
  """
28
- text: str
29
- reference: str # e.g., "Section 5.2", "Clause 11.1"
30
- category: str # e.g., "termination", "compensation", "indemnification"
31
- confidence: float # 0.0-1.0
32
- start_pos: int
33
- end_pos: int
34
- extraction_method: str # "structural", "semantic", "hybrid"
35
- risk_indicators: List[str] = field(default_factory=list)
36
- embeddings: Optional[np.ndarray] = None
37
- subclauses: List[str] = field(default_factory=list)
38
- legal_bert_score: float = 0.0
 
39
 
40
  def to_dict(self) -> Dict[str, Any]:
41
- """Convert to dictionary for serialization"""
42
- return {
43
- "text": self.text,
44
- "reference": self.reference,
45
- "category": self.category,
46
- "confidence": round(self.confidence, 3),
47
- "start_pos": self.start_pos,
48
- "end_pos": self.end_pos,
49
- "extraction_method": self.extraction_method,
50
- "risk_indicators": self.risk_indicators,
51
- "subclauses": self.subclauses,
52
- "legal_bert_score": round(self.legal_bert_score, 3)
53
- }
 
 
54
 
55
 
56
  class ClauseExtractor:
57
  """
58
- Advanced clause extraction using Legal-BERT + structural patterns
59
 
60
  Process:
61
  1. Structural extraction (numbered sections like "5.2", "Article III")
@@ -64,222 +70,123 @@ class ClauseExtractor:
64
  4. Category classification using Legal-BERT + keyword matching
65
  5. Deduplication and ranking
66
  """
67
-
68
- # =========================================================================
69
  # CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
70
- # =========================================================================
 
 
71
 
72
- CLAUSE_CATEGORIES = {
73
- 'compensation': {
74
- 'keywords': ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus',
75
- 'commission', 'remuneration', 'fee', 'rate', 'benefits'],
76
- 'representative_text': (
77
- "The Employee shall receive an annual base salary of One Hundred Thousand Dollars "
78
- "payable in accordance with the Company's standard payroll practices. "
79
- "Additional compensation may include performance bonuses and stock options."
80
- ),
81
- 'weight': 1.0
82
- },
83
- 'termination': {
84
- 'keywords': ['termination', 'terminate', 'notice period', 'resignation',
85
- 'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
86
- 'representative_text': (
87
- "Either party may terminate this Agreement upon thirty days written notice. "
88
- "The Company may terminate for cause immediately upon written notice to Employee. "
89
- "Upon termination, Employee shall receive severance compensation."
90
- ),
91
- 'weight': 1.2
92
- },
93
- 'non_compete': {
94
- 'keywords': ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant',
95
- 'competitive', 'competition', 'competing business', 'competitive activities'],
96
- 'representative_text': (
97
- "Employee agrees not to engage in any competitive business activities for a period "
98
- "of twelve months following termination within a fifty-mile radius. "
99
- "Employee shall not solicit Company clients or employees during this period."
100
- ),
101
- 'weight': 1.5
102
- },
103
- 'confidentiality': {
104
- 'keywords': ['confidential', 'proprietary', 'trade secret', 'disclosure',
105
- 'confidentiality', 'secret', 'private', 'non-disclosure'],
106
- 'representative_text': (
107
- "Employee shall maintain the confidentiality of all proprietary information "
108
- "and trade secrets of the Company. Confidential Information includes business plans, "
109
- "customer lists, and technical data. These obligations survive termination."
110
- ),
111
- 'weight': 1.1
112
- },
113
- 'indemnification': {
114
- 'keywords': ['indemnify', 'indemnification', 'hold harmless', 'defend',
115
- 'liability', 'claims', 'losses', 'damages'],
116
- 'representative_text': (
117
- "Party A shall indemnify and hold harmless Party B from any claims, losses, "
118
- "or damages arising from Party A's breach or negligence. This indemnification "
119
- "includes reasonable attorneys' fees and costs of defense."
120
- ),
121
- 'weight': 1.3
122
- },
123
- 'intellectual_property': {
124
- 'keywords': ['intellectual property', 'ip', 'copyright', 'patent', 'trademark',
125
- 'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
126
- 'representative_text': (
127
- "All work product and inventions created by Employee during employment shall be "
128
- "the exclusive property of the Company. Employee assigns all intellectual property "
129
- "rights including patents, copyrights, and trade secrets to the Company."
130
- ),
131
- 'weight': 1.2
132
- },
133
- 'liability': {
134
- 'keywords': ['liable', 'liability', 'damages', 'limitation', 'consequential',
135
- 'indirect', 'punitive', 'cap', 'limited liability'],
136
- 'representative_text': (
137
- "In no event shall either party be liable for indirect, incidental, or consequential "
138
- "damages. Total liability under this Agreement shall not exceed the amounts paid "
139
- "in the twelve months preceding the claim."
140
- ),
141
- 'weight': 1.2
142
- },
143
- 'warranty': {
144
- 'keywords': ['warranty', 'warrant', 'representation', 'guarantee',
145
- 'assurance', 'promise', 'warranties'],
146
- 'representative_text': (
147
- "Company warrants that the Services will be performed in a professional manner. "
148
- "EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, "
149
- "INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."
150
- ),
151
- 'weight': 0.9
152
- },
153
- 'dispute_resolution': {
154
- 'keywords': ['arbitration', 'mediation', 'dispute', 'jurisdiction',
155
- 'governing law', 'venue', 'forum', 'resolution'],
156
- 'representative_text': (
157
- "Any disputes arising under this Agreement shall be resolved through binding arbitration "
158
- "in accordance with the rules of the American Arbitration Association. "
159
- "This Agreement shall be governed by the laws of the State of California."
160
- ),
161
- 'weight': 0.9
162
- },
163
- 'insurance': {
164
- 'keywords': ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
165
- 'representative_text': (
166
- "Contractor shall maintain general liability insurance with minimum coverage of "
167
- "one million dollars per occurrence. Proof of insurance shall be provided to Client. "
168
- "Company shall be named as additional insured on all policies."
169
- ),
170
- 'weight': 0.8
171
- },
172
- 'assignment': {
173
- 'keywords': ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
174
- 'representative_text': (
175
- "This Agreement may not be assigned by either party without the prior written consent "
176
- "of the other party. This Agreement shall be binding upon and inure to the benefit "
177
- "of the parties' successors and permitted assigns."
178
- ),
179
- 'weight': 0.8
180
- },
181
- 'amendment': {
182
- 'keywords': ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
183
- 'representative_text': (
184
- "This Agreement may not be amended or modified except by written instrument signed "
185
- "by both parties. No waiver of any provision shall be effective unless in writing. "
186
- "All modifications must be mutually agreed upon."
187
- ),
188
- 'weight': 0.7
189
- },
190
- 'force_majeure': {
191
- 'keywords': ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
192
- 'representative_text': (
193
- "Neither party shall be liable for failure to perform due to causes beyond its reasonable "
194
- "control including acts of God, war, strikes, or natural disasters. "
195
- "Performance shall be suspended during the force majeure event."
196
- ),
197
- 'weight': 0.7
198
- },
199
- 'entire_agreement': {
200
- 'keywords': ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
201
- 'representative_text': (
202
- "This Agreement constitutes the entire agreement between the parties and supersedes "
203
- "all prior agreements, whether written or oral. No other representations or warranties "
204
- "shall be binding unless incorporated herein."
205
- ),
206
- 'weight': 0.6
207
- },
208
- 'general': {
209
- 'keywords': ['provision', 'term', 'condition', 'obligation', 'requirement'],
210
- 'representative_text': (
211
- "The parties agree to the following terms and conditions governing their relationship. "
212
- "Each party shall perform its obligations in good faith and in accordance with "
213
- "industry standards and applicable law."
214
- ),
215
- 'weight': 0.5
216
- }
217
- }
218
-
219
- # =========================================================================
220
  # RISK INDICATOR PATTERNS
221
- # =========================================================================
222
-
223
- RISK_INDICATORS = {
224
- 'critical': [
225
- 'unlimited liability', 'perpetual', 'irrevocable', 'forfeit',
226
- 'liquidated damages', 'wage withholding', 'joint and several'
227
- ],
228
- 'high': [
229
- 'non-compete', 'non-solicit', 'penalty', 'without cause',
230
- 'sole discretion', 'immediate termination', 'at-will'
231
- ],
232
- 'medium': [
233
- 'indemnify', 'hold harmless', 'confidential', 'proprietary',
234
- 'exclusive', 'terminate', 'default', 'breach'
235
- ]
236
- }
237
 
238
- # =========================================================================
239
  # INITIALIZATION
240
- # =========================================================================
241
-
242
- def __init__(self, model_loader, contract_category: Optional[str] = None):
243
  """
244
  Initialize clause extractor with Legal-BERT
245
 
246
- Args:
247
- model_loader: ModelLoader instance for accessing Legal-BERT
248
- contract_category: Optional contract category for context-aware extraction
 
 
249
  """
250
- self.model_loader = model_loader
251
- self.contract_category = contract_category
252
 
253
  # Models (lazy loaded)
254
- self.legal_bert_model = None
255
  self.legal_bert_tokenizer = None
256
- self.embedding_model = None
257
- self.device = None
258
 
259
  # Category embeddings (computed from representative texts)
260
- self.category_embeddings = {}
261
 
262
  # Text processor
263
- self.text_processor = TextProcessor(use_spacy=False)
264
 
265
  # Logger
266
- self.logger = ContractAnalyzerLogger.get_logger()
267
 
268
  # Lazy load
269
  self._lazy_load()
270
 
 
271
  def _lazy_load(self):
272
- """Lazy load Legal-BERT and embedding models"""
 
 
273
  if self.legal_bert_model is None:
274
  try:
275
  log_info("Loading Legal-BERT for clause extraction...")
276
 
277
  # Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
278
  self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
279
- self.device = self.model_loader.device
280
 
281
  # Load sentence transformer for embeddings
282
- self.embedding_model = self.model_loader.load_embedding_model()
283
 
284
  # Prepare category embeddings using Legal-BERT
285
  self._prepare_category_embeddings()
@@ -287,59 +194,61 @@ class ClauseExtractor:
287
  log_info("Clause extractor models loaded successfully")
288
 
289
  except Exception as e:
290
- log_error(e, context={"component": "ClauseExtractor", "operation": "model_loading"})
291
  raise
 
292
 
293
  def _prepare_category_embeddings(self):
294
  """
295
  Pre-compute Legal-BERT embeddings for category representative texts
 
296
  This enables semantic similarity matching for clause classification
297
  """
298
  log_info("Computing Legal-BERT embeddings for clause categories...")
299
 
300
  for category, config in self.CLAUSE_CATEGORIES.items():
301
- representative_text = config['representative_text']
302
 
303
  # Get Legal-BERT embedding (using [CLS] token)
304
- embedding = self._get_legal_bert_embedding(representative_text)
 
305
  self.category_embeddings[category] = embedding
306
 
307
  log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
308
 
 
309
  def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
310
  """
311
  Get Legal-BERT embedding for text using [CLS] token
312
 
313
- Args:
314
- text: Input text
 
315
 
316
  Returns:
317
- Embedding vector as numpy array
 
318
  """
319
  # Tokenize
320
- inputs = self.legal_bert_tokenizer(
321
- text,
322
- return_tensors="pt",
323
- padding=True,
324
- truncation=True,
325
- max_length=512
326
- ).to(self.device)
327
 
328
  # Get embeddings
329
  with torch.no_grad():
330
- outputs = self.legal_bert_model(**inputs)
331
  # Use [CLS] token embedding (first token)
332
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
333
 
334
  return cls_embedding
335
 
336
- # =========================================================================
337
- # MAIN EXTRACTION METHOD
338
- # =========================================================================
339
 
 
340
  @ContractAnalyzerLogger.log_execution_time("extract_clauses")
341
- def extract_clauses(self, contract_text: str,
342
- max_clauses: int = 15) -> List[ExtractedClause]:
343
  """
344
  Extract and classify clauses from contract using hybrid approach
345
 
@@ -349,44 +258,45 @@ class ClauseExtractor:
349
  3. Legal-BERT classification
350
  4. Deduplicate and rank by confidence
351
 
352
- Args:
353
- contract_text: Full contract text
354
- max_clauses: Maximum number of clauses to return
 
 
355
 
356
  Returns:
357
- List of ExtractedClause objects sorted by confidence
 
358
  """
359
 
360
  log_info("Starting clause extraction",
361
- text_length=len(contract_text),
362
- contract_category=self.contract_category,
363
- max_clauses=max_clauses)
 
364
 
365
- # Step 1: Extract using structural patterns
366
  structural_clauses = self._extract_structural_clauses(contract_text)
367
  log_info(f"Extracted {len(structural_clauses)} structural clauses")
368
 
369
- # Step 2: Semantic chunking for unstructured parts
370
- semantic_chunks = self._semantic_chunking(contract_text, structural_clauses)
371
  log_info(f"Created {len(semantic_chunks)} semantic chunks")
372
 
373
- # Step 3: Combine all candidates
374
- all_candidates = structural_clauses + semantic_chunks
375
  log_info(f"Total candidates: {len(all_candidates)}")
376
 
377
- # Step 4: Classify with Legal-BERT
378
  classified_clauses = self._classify_clauses_with_legal_bert(all_candidates)
379
  log_info(f"Classified {len(classified_clauses)} clauses")
380
 
381
- # Step 5: Deduplicate and rank
382
- final_clauses = self._deduplicate_and_rank(classified_clauses, max_clauses)
383
  log_info(f"Final output: {len(final_clauses)} clauses")
384
 
385
  return final_clauses
386
 
387
- # =========================================================================
388
- # STEP 1: STRUCTURAL EXTRACTION
389
- # =========================================================================
390
 
391
  def _extract_structural_clauses(self, text: str) -> List[Dict]:
392
  """
@@ -398,27 +308,22 @@ class ClauseExtractor:
398
  - "Article III. Text"
399
  - "Clause 11. Text"
400
  """
401
- candidates = []
402
 
403
  # Clean text
404
- text = re.sub(r'\s+', ' ', text)
405
 
406
  # Patterns for legal numbering
407
- patterns = [
408
- # Match: "1.1. Text" or "1.1 Text"
409
- (r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=\d+\.\d+(?:\.\d+)*\.|$)', 'numbered'),
410
- # Match: "Article 1.1. Text" or "Article III. Text"
411
- (r'(Article\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+))\.\s*([^\n]{30,800}?)(?=Article\s+(?:\d+|[IVXLCDM]+)|$)', 'article'),
412
- # Match: "Section 1.1. Text"
413
- (r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
414
- # Match: "Clause 1.1. Text"
415
- (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
416
- # Match: "(a) Text", "(i) Text" - sub-clauses
417
- (r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\([a-z]|[ivxlcdm]+\)|\n\n|$)', 'subclause')
418
- ]
419
 
420
  for pattern, ref_type in patterns:
421
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
 
422
  for match in matches:
423
  clause_text = match.group(2).strip()
424
 
@@ -426,69 +331,105 @@ class ClauseExtractor:
426
  if not self._is_boilerplate(clause_text):
427
  # Check for meaningful content
428
  if self._has_meaningful_content(clause_text):
429
- candidates.append({
430
- 'text': clause_text,
431
- 'reference': match.group(1).strip(),
432
- 'start': match.start(),
433
- 'end': match.end(),
434
- 'type': 'structural',
435
- 'ref_type': ref_type
436
- })
437
 
438
  # Remove overlapping clauses
439
  candidates = self._remove_overlapping(candidates)
440
 
441
  return candidates
 
442
 
443
  def _is_boilerplate(self, text: str) -> bool:
444
- """Check if text is boilerplate/definitional rather than substantive"""
445
- boilerplate_indicators = [
446
- 'shall mean', 'means and includes', 'defined as', 'definition of',
447
- 'hereinafter referred to', 'for purposes of this', 'interpretation of',
448
- 'as used in this', 'the term', 'shall include', 'includes but not limited'
449
- ]
450
-
451
- text_lower = text.lower()
 
 
452
  # Must have at least one strong indicator AND be definition-heavy
453
- has_indicator = any(indicator in text_lower for indicator in boilerplate_indicators)
454
- is_short_definition = len(text.split()) < 50 and '"' in text
455
 
456
  return has_indicator or is_short_definition
457
 
 
458
  def _has_meaningful_content(self, text: str) -> bool:
459
- """Check if text has meaningful legal content"""
 
 
460
  # Must have minimum length
461
- if len(text.split()) < 15:
462
  return False
463
 
464
  # Check for legal action verbs
465
- action_verbs = [
466
- 'shall', 'must', 'will', 'may', 'agrees', 'undertakes',
467
- 'covenants', 'warrants', 'represents', 'acknowledges',
468
- 'certifies', 'indemnifies', 'waives', 'terminates'
469
- ]
470
-
471
- text_lower = text.lower()
472
- has_action = any(verb in text_lower for verb in action_verbs)
 
 
473
 
474
  # Check for legal subjects
475
- legal_subjects = [
476
- 'party', 'parties', 'employee', 'employer', 'company',
477
- 'contractor', 'consultant', 'client', 'vendor', 'buyer',
478
- 'seller', 'landlord', 'tenant', 'licensor', 'licensee'
479
- ]
480
-
481
- has_subject = any(subj in text_lower for subj in legal_subjects)
 
 
482
 
483
  return has_action or has_subject
484
 
 
485
  def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
486
- """Remove overlapping clause extractions"""
 
 
487
  if not candidates:
488
  return []
489
 
490
  # Sort by start position
491
- candidates.sort(key=lambda x: x['start'])
492
 
493
  non_overlapping = [candidates[0]]
494
 
@@ -496,41 +437,35 @@ class ClauseExtractor:
496
  last = non_overlapping[-1]
497
 
498
  # Check if overlaps
499
- if candidate['start'] >= last['end']:
500
  non_overlapping.append(candidate)
501
- elif len(candidate['text']) > len(last['text']):
 
502
  # Keep longer clause if overlapping
503
  non_overlapping[-1] = candidate
504
 
505
  return non_overlapping
506
 
507
- # =========================================================================
508
- # STEP 2: SEMANTIC CHUNKING
509
- # =========================================================================
510
 
511
- def _semantic_chunking(self, text: str,
512
- structural_clauses: List[Dict],
513
- chunk_size: int = 200) -> List[Dict]:
514
  """
515
- Chunk unstructured text semantically
516
- Uses sentence boundaries to find natural clause boundaries
517
  """
518
-
519
  # Get covered ranges from structural clauses
520
  covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
521
 
522
  # Split into sentences
523
- sentences = self.text_processor.extract_sentences(text)
524
 
525
- chunks = []
526
- current_chunk = []
527
  current_length = 0
528
- current_start = 0
529
 
530
  for sentence in sentences:
531
  # Check if sentence is already covered by structural extraction
532
  sentence_start = text.find(sentence, current_start)
533
- if sentence_start == -1:
534
  continue
535
 
536
  if self._is_in_range(sentence_start, covered_ranges):
@@ -541,21 +476,20 @@ class ClauseExtractor:
541
  current_length += len(sentence.split())
542
 
543
  # Create chunk when reaching size limit
544
- if current_length >= chunk_size:
545
  chunk_text = ' '.join(current_chunk).strip()
546
 
547
- if len(chunk_text) >= 50 and not self._is_boilerplate(chunk_text):
548
  if self._has_meaningful_content(chunk_text):
549
- chunks.append({
550
- 'text': chunk_text,
551
- 'reference': f'Semantic-{len(chunks)+1}',
552
- 'start': sentence_start,
553
- 'end': sentence_start + len(chunk_text),
554
- 'type': 'semantic',
555
- 'ref_type': 'semantic'
556
- })
557
 
558
- current_chunk = []
559
  current_length = 0
560
 
561
  current_start = sentence_start + len(sentence)
@@ -563,145 +497,145 @@ class ClauseExtractor:
563
  # Add final chunk if exists
564
  if current_chunk:
565
  chunk_text = ' '.join(current_chunk).strip()
566
- if len(chunk_text) >= 50 and not self._is_boilerplate(chunk_text):
 
567
  if self._has_meaningful_content(chunk_text):
568
  sentence_start = text.find(current_chunk[0])
569
- chunks.append({
570
- 'text': chunk_text,
571
- 'reference': f'Semantic-{len(chunks)+1}',
572
- 'start': sentence_start,
573
- 'end': sentence_start + len(chunk_text),
574
- 'type': 'semantic',
575
- 'ref_type': 'semantic'
576
- })
577
 
578
  return chunks
579
 
 
580
  def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
581
- """Check if position is within any of the ranges"""
 
 
582
  return any(start <= position <= end for start, end in ranges)
583
 
584
- # =========================================================================
585
- # STEP 3: LEGAL-BERT CLASSIFICATION
586
- # =========================================================================
587
 
588
  def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
589
  """
590
  Classify clauses using Legal-BERT embeddings + keyword matching
591
  """
592
- classified = []
593
 
594
  for candidate in candidates:
595
  # Get Legal-BERT embedding for clause
596
- clause_embedding = self._get_legal_bert_embedding(candidate['text'])
597
 
598
  # Classify using hybrid approach
599
- category, confidence, legal_bert_score = self._classify_single_clause(
600
- candidate['text'],
601
- clause_embedding
602
- )
603
 
604
  # Extract risk indicators
605
- risk_indicators = self._extract_risk_indicators(candidate['text'])
606
 
607
  # Extract sub-clauses if any
608
- subclauses = self._extract_subclauses(candidate['text'])
609
 
610
- classified.append(ExtractedClause(
611
- text=candidate['text'],
612
- reference=candidate['reference'],
613
- category=category,
614
- confidence=confidence,
615
- start_pos=candidate['start'],
616
- end_pos=candidate['end'],
617
- extraction_method=candidate['type'],
618
- risk_indicators=risk_indicators,
619
- embeddings=clause_embedding,
620
- subclauses=subclauses,
621
- legal_bert_score=legal_bert_score
622
- ))
623
 
624
  return classified
625
 
626
- def _classify_single_clause(self, text: str,
627
- clause_embedding: np.ndarray) -> Tuple[str, float, float]:
628
  """
629
  Classify single clause using Legal-BERT + keyword matching
630
 
631
  Returns:
632
- (category, confidence, legal_bert_score)
 
633
  """
634
- text_lower = text.lower()
635
 
636
- # Method 1: Keyword matching
637
- keyword_scores = {}
 
638
  for category, config in self.CLAUSE_CATEGORIES.items():
639
- keywords = config['keywords']
640
- weight = config['weight']
641
 
642
- keyword_count = sum(1 for kw in keywords if kw in text_lower)
643
  keyword_scores[category] = (keyword_count / len(keywords)) * weight
644
 
645
- # Method 2: Legal-BERT semantic similarity
646
- semantic_scores = {}
647
  clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
648
 
649
  for category, cat_embedding in self.category_embeddings.items():
650
- cat_embedding_tensor = torch.tensor(cat_embedding).unsqueeze(0)
651
- similarity = torch.nn.functional.cosine_similarity(
652
- clause_embedding_tensor,
653
- cat_embedding_tensor
654
- ).item()
655
  semantic_scores[category] = similarity
656
 
657
  # Combine scores (70% semantic, 30% keyword)
658
- combined_scores = {}
 
659
  for category in self.CLAUSE_CATEGORIES.keys():
660
- combined = (
661
- semantic_scores.get(category, 0) * 0.70 +
662
- keyword_scores.get(category, 0) * 0.30
663
- )
664
  combined_scores[category] = combined
665
 
666
  # Get best category
667
- best_category = max(combined_scores, key=combined_scores.get)
668
- confidence = combined_scores[best_category]
669
  legal_bert_score = semantic_scores[best_category]
670
 
671
  return best_category, confidence, legal_bert_score
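A worked example of the 70/30 blend (illustrative numbers):

    semantic, keyword = 0.62, 0.40
    combined = semantic * 0.70 + keyword * 0.30   # 0.434 + 0.120 = 0.554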
672
 
 
673
  def _extract_risk_indicators(self, text: str) -> List[str]:
674
- """Extract risk indicator keywords from clause text"""
675
- text_lower = text.lower()
676
- found_indicators = []
 
 
677
 
678
  for severity, indicators in self.RISK_INDICATORS.items():
679
  for indicator in indicators:
680
  if indicator in text_lower:
681
  found_indicators.append(indicator)
682
 
683
- return found_indicators[:5] # Top 5 risk indicators
 
684
 
 
685
  def _extract_subclauses(self, text: str) -> List[str]:
686
- """Extract sub-clauses from main clause (e.g., (a), (b), (i), (ii))"""
 
 
687
  # Pattern for sub-clauses: (a), (i), etc.
688
  subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\([a-z]|[ivxlcdm]+\)|$)'
689
- matches = re.findall(subclause_pattern, text, re.IGNORECASE)
690
 
691
- subclauses = []
 
692
  for ref, subtext in matches:
693
  clean_text = subtext.strip()
694
- if len(clean_text) >= 20:
 
695
  subclauses.append(f"({ref}) {clean_text}")
696
 
697
- return subclauses[:5] # Max 5 sub-clauses
 
698
 
699
- # =========================================================================
700
- # STEP 4: DEDUPLICATION AND RANKING
701
- # =========================================================================
702
 
703
- def _deduplicate_and_rank(self, clauses: List[ExtractedClause],
704
- max_clauses: int) -> List[ExtractedClause]:
705
  """
706
  Remove duplicates and rank by confidence + legal_bert_score
707
  """
@@ -709,24 +643,22 @@ class ClauseExtractor:
709
  return []
710
 
711
  # Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
712
- clauses.sort(
713
- key=lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4),
714
- reverse=True
715
- )
716
 
717
  # Deduplicate by text similarity
718
- unique_clauses = []
719
- seen_texts = set()
720
 
721
  for clause in clauses:
722
  # Simple deduplication by first 100 chars
723
- text_key = clause.text[:100].lower().strip()
724
 
725
  # Also check similarity to already added clauses
726
  is_duplicate = False
 
727
  for existing in unique_clauses:
728
  similarity = self._text_similarity(clause.text, existing.text)
729
- if similarity > 0.85:
730
  is_duplicate = True
731
  break
732
 
@@ -734,28 +666,31 @@ class ClauseExtractor:
734
  unique_clauses.append(clause)
735
  seen_texts.add(text_key)
736
 
737
- if len(unique_clauses) >= max_clauses:
738
  break
739
 
740
  return unique_clauses
741
 
 
742
  def _text_similarity(self, text1: str, text2: str) -> float:
743
- """Calculate text similarity (simple Jaccard similarity)"""
744
- words1 = set(text1.lower().split())
745
- words2 = set(text2.lower().split())
 
 
746
 
747
  intersection = len(words1 & words2)
748
- union = len(words1 | words2)
749
 
750
  return intersection / union if union > 0 else 0.0
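A worked example of this Jaccard similarity (hypothetical strings):

    w1 = set("employee shall not compete".split())
    w2 = set("employee shall not solicit".split())
    print(len(w1 & w2) / len(w1 | w2))   # intersection 3, union 5 -> 0.6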
751
 
752
- # =========================================================================
753
- # UTILITY METHODS
754
- # =========================================================================
755
 
756
  def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
757
- """Get distribution of clause categories"""
 
 
758
  distribution = defaultdict(int)
 
759
  for clause in clauses:
760
  distribution[clause.category] += 1
761
 
@@ -763,9 +698,15 @@ class ClauseExtractor:
763
 
764
  return dict(distribution)
765
 
 
766
  def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
767
- """Get clauses with risk indicators"""
 
 
768
  risky = [c for c in clauses if c.risk_indicators]
769
- risky.sort(key=lambda x: len(x.risk_indicators), reverse=True)
 
770
 
771
- log
 
 
 
1
+ # DEPENDENCIES
2
  import re
3
+ import sys
4
+ import torch
 
5
  import numpy as np
6
+ from typing import Any
7
+ from typing import List
8
+ from typing import Dict
9
+ from typing import Tuple
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ from dataclasses import field
13
+ from dataclasses import dataclass
14
+ from collections import defaultdict
15
  from sentence_transformers import util
16
 
17
  # Import utilities
 
 
18
  sys.path.append(str(Path(__file__).parent.parent))
19
 
20
+ from utils.logger import log_info
21
+ from utils.logger import log_error
22
  from utils.text_processor import TextProcessor
23
+ from utils.logger import ContractAnalyzerLogger
+ from model_manager.model_loader import ModelLoader  # needed for the ModelLoader type hint in __init__ below
24
 
25
 
26
  @dataclass
 
28
  """
29
  Extracted clause with comprehensive metadata
30
  """
31
+ text : str
32
+ reference : str # e.g., "Section 5.2", "Clause 11.1"
33
+ category : str # e.g., "termination", "compensation", "indemnification"
34
+ confidence : float # 0.0-1.0
35
+ start_pos : int
36
+ end_pos : int
37
+ extraction_method : str # "structural", "semantic", "hybrid"
38
+ risk_indicators : List[str] = field(default_factory = list)
39
+ embeddings : Optional[np.ndarray] = None
40
+ subclauses : List[str] = field(default_factory = list)
41
+ legal_bert_score : float = 0.0
42
+
43
 
44
  def to_dict(self) -> Dict[str, Any]:
45
+ """
46
+ Convert to dictionary for serialization
47
+ """
48
+ return {"text" : self.text,
49
+ "reference" : self.reference,
50
+ "category" : self.category,
51
+ "confidence" : round(self.confidence, 3),
52
+ "start_pos" : self.start_pos,
53
+ "end_pos" : self.end_pos,
54
+ "extraction_method" : self.extraction_method,
55
+ "risk_indicators" : self.risk_indicators,
56
+ "subclauses" : self.subclauses,
57
+ "legal_bert_score" : round(self.legal_bert_score, 3),
58
+ }
59
+
60
 
61
 
62
  class ClauseExtractor:
63
  """
64
+ Clause extraction using Legal-BERT + structural patterns
65
 
66
  Process:
67
  1. Structural extraction (numbered sections like "5.2", "Article III")
 
70
  4. Category classification using Legal-BERT + keyword matching
71
  5. Deduplication and ranking
72
  """
 
 
73
  # CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
74
+ CLAUSE_CATEGORIES = {'compensation' : {'keywords' : ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus', 'commission', 'remuneration', 'fee', 'rate', 'benefits'],
75
+ 'representative_text' : ("The Employee shall receive an annual base salary of One Hundred Thousand Dollars payable in accordance with the Company's standard payroll practices. Additional compensation may include performance bonuses and stock options."),
76
+ 'weight' : 1.0,
77
+ },
78
+ 'termination' : {'keywords' : ['termination', 'terminate', 'notice period', 'resignation', 'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
79
+ 'representative_text' : ("Either party may terminate this Agreement upon thirty days written notice. The Company may terminate for cause immediately upon written notice to Employee. Upon termination, Employee shall receive severance compensation."),
80
+ 'weight' : 1.2,
81
+ },
82
+ 'non_compete' : {'keywords' : ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant', 'competitive', 'competition', 'competing business', 'competitive activities'],
83
+ 'representative_text' : ("Employee agrees not to engage in any competitive business activities for a period of twelve months following termination within a fifty-mile radius. Employee shall not solicit Company clients or employees during this period."),
84
+ 'weight' : 1.5,
85
+ },
86
+ 'confidentiality' : {'keywords' : ['confidential', 'proprietary', 'trade secret', 'disclosure', 'confidentiality', 'secret', 'private', 'non-disclosure'],
87
+ 'representative_text' : ("Employee shall maintain the confidentiality of all proprietary information and trade secrets of the Company. Confidential Information includes business plans, customer lists, and technical data. These obligations survive termination."),
88
+ 'weight' : 1.1,
89
+ },
90
+ 'indemnification' : {'keywords' : ['indemnify', 'indemnification', 'hold harmless', 'defend', 'liability', 'claims', 'losses', 'damages'],
91
+ 'representative_text' : ("Party A shall indemnify and hold harmless Party B from any claims, losses, or damages arising from Party A's breach or negligence. This indemnification includes reasonable attorneys' fees and costs of defense."),
92
+ 'weight' : 1.3,
93
+ },
94
+ 'intellectual_property' : {'keywords' : ['intellectual property', 'ip', 'copyright', 'patent', 'trademark', 'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
95
+ 'representative_text' : ("All work product and inventions created by Employee during employment shall be the exclusive property of the Company. Employee assigns all intellectual property rights including patents, copyrights, and trade secrets to the Company."),
96
+ 'weight' : 1.2,
97
+ },
98
+ 'liability' : {'keywords' : ['liable', 'liability', 'damages', 'limitation', 'consequential', 'indirect', 'punitive', 'cap', 'limited liability'],
99
+ 'representative_text' : ("In no event shall either party be liable for indirect, incidental, or consequential damages. Total liability under this Agreement shall not exceed the amounts paid in the twelve months preceding the claim."),
100
+ 'weight' : 1.2,
101
+ },
102
+ 'warranty' : {'keywords' : ['warranty', 'warrant', 'representation', 'guarantee', 'assurance', 'promise', 'warranties'],
103
+ 'representative_text' : ("Company warrants that the Services will be performed in a professional manner. EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."),
104
+ 'weight' : 0.9,
105
+ },
106
+ 'dispute_resolution' : {'keywords' : ['arbitration', 'mediation', 'dispute', 'jurisdiction', 'governing law', 'venue', 'forum', 'resolution'],
107
+ 'representative_text' : ("Any disputes arising under this Agreement shall be resolved through binding arbitration in accordance with the rules of the American Arbitration Association. This Agreement shall be governed by the laws of the State of California."),
108
+ 'weight' : 0.9,
109
+ },
110
+ 'insurance' : {'keywords' : ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
111
+ 'representative_text' : ("Contractor shall maintain general liability insurance with minimum coverage of one million dollars per occurrence. Proof of insurance shall be provided to Client. Company shall be named as additional insured on all policies."),
112
+ 'weight' : 0.8,
113
+ },
114
+ 'assignment' : {'keywords' : ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
115
+ 'representative_text' : ("This Agreement may not be assigned by either party without the prior written consent of the other party. This Agreement shall be binding upon and inure to the benefit of the parties' successors and permitted assigns."),
116
+ 'weight' : 0.8,
117
+ },
118
+ 'amendment' : {'keywords' : ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
119
+ 'representative_text' : ("This Agreement may not be amended or modified except by written instrument signed by both parties. No waiver of any provision shall be effective unless in writing. All modifications must be mutually agreed upon."),
120
+ 'weight' : 0.7,
121
+ },
122
+ 'force_majeure' : {'keywords' : ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
123
+ 'representative_text' : ("Neither party shall be liable for failure to perform due to causes beyond its reasonable control including acts of God, war, strikes, or natural disasters. Performance shall be suspended during the force majeure event."),
124
+ 'weight' : 0.7,
125
+ },
126
+ 'entire_agreement' : {'keywords' : ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
127
+ 'representative_text' : ("This Agreement constitutes the entire agreement between the parties and supersedes all prior agreements, whether written or oral. No other representations or warranties shall be binding unless incorporated herein."),
128
+ 'weight' : 0.6,
129
+ },
130
+ 'general' : {'keywords' : ['provision', 'term', 'condition', 'obligation', 'requirement'],
131
+ 'representative_text' : ("The parties agree to the following terms and conditions governing their relationship. Each party shall perform its obligations in good faith and in accordance with industry standards and applicable law."),
132
+ 'weight' : 0.5,
133
+ }
134
+ }
135
 
 
 
136
  # RISK INDICATOR PATTERNS
137
+ RISK_INDICATORS = {'critical' : ['unlimited liability', 'perpetual', 'irrevocable', 'forfeit', 'liquidated damages', 'wage withholding', 'joint and several'],
138
+ 'high' : ['non-compete', 'non-solicit', 'penalty', 'without cause', 'sole discretion', 'immediate termination', 'at-will'],
139
+ 'medium' : ['indemnify', 'hold harmless', 'confidential', 'proprietary', 'exclusive', 'terminate', 'default', 'breach'],
140
+ }
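A sketch of how these lists match a clause, mirroring the substring check in `_extract_risk_indicators` (the clause text is illustrative):

    text_lower = "employer may terminate at its sole discretion without cause"
    hits = [kw for kws in RISK_INDICATORS.values() for kw in kws if kw in text_lower]
    # -> ['without cause', 'sole discretion', 'terminate']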
 
 
141
 
142
+
143
  # INITIALIZATION
144
+ def __init__(self, model_loader: ModelLoader, contract_category: Optional[str] = None):
 
 
145
  """
146
  Initialize clause extractor with Legal-BERT
147
 
148
+ Arguments:
149
+ ----------
150
+ model_loader { ModelLoader } : ModelLoader instance for accessing Legal-BERT
151
+
152
+ contract_category { str } : Optional contract category for context-aware extraction
153
  """
154
+ self.model_loader = model_loader
155
+ self.contract_category = contract_category
156
 
157
  # Models (lazy loaded)
158
+ self.legal_bert_model = None
159
  self.legal_bert_tokenizer = None
160
+ self.embedding_model = None
161
+ self.device = None
162
 
163
  # Category embeddings (computed from representative texts)
164
+ self.category_embeddings = dict()
165
 
166
  # Text processor
167
+ self.text_processor = TextProcessor(use_spacy = False)
168
 
169
  # Logger
170
+ self.logger = ContractAnalyzerLogger.get_logger()
171
 
172
  # Lazy load
173
  self._lazy_load()
174
 
175
+
176
  def _lazy_load(self):
177
+ """
178
+ Lazy load Legal-BERT and embedding models
179
+ """
180
  if self.legal_bert_model is None:
181
  try:
182
  log_info("Loading Legal-BERT for clause extraction...")
183
 
184
  # Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
185
  self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
186
+ self.device = self.model_loader.device
187
 
188
  # Load sentence transformer for embeddings
189
+ self.embedding_model = self.model_loader.load_embedding_model()
190
 
191
  # Prepare category embeddings using Legal-BERT
192
  self._prepare_category_embeddings()
 
194
  log_info("Clause extractor models loaded successfully")
195
 
196
  except Exception as e:
197
+ log_error(e, context = {"component": "ClauseExtractor", "operation": "model_loading"})
198
  raise
199
+
200
 
201
  def _prepare_category_embeddings(self):
202
  """
203
  Pre-compute Legal-BERT embeddings for category representative texts
204
+
205
  This enables semantic similarity matching for clause classification
206
  """
207
  log_info("Computing Legal-BERT embeddings for clause categories...")
208
 
209
  for category, config in self.CLAUSE_CATEGORIES.items():
210
+ representative_text = config['representative_text']
211
 
212
  # Get Legal-BERT embedding (using [CLS] token)
213
+ embedding = self._get_legal_bert_embedding(representative_text)
214
+
215
  self.category_embeddings[category] = embedding
216
 
217
  log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
218
 
219
+
220
  def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
221
  """
222
  Get Legal-BERT embedding for text using [CLS] token
223
 
224
+ Arguments:
225
+ ----------
226
+ text { str } : Input text
227
 
228
  Returns:
229
+ --------
230
+ { np.ndarray } : Embedding vector as numpy array
231
  """
232
  # Tokenize
233
+ inputs = self.legal_bert_tokenizer(text,
234
+ return_tensors = "pt",
235
+ padding = True,
236
+ truncation = True,
237
+ max_length = 512,
238
+ ).to(self.device)
 
239
 
240
  # Get embeddings
241
  with torch.no_grad():
242
+ outputs = self.legal_bert_model(**inputs)
243
  # Use [CLS] token embedding (first token)
244
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
245
 
246
  return cls_embedding
247
 
 
 
 
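The [CLS] pooling used above is one common strategy for BERT-style encoders; mean-pooling over all token states is a frequent alternative. A minimal sketch of that variant, assuming the same `outputs` object from `_get_legal_bert_embedding` (illustrative only, not part of this commit):

    # Hypothetical alternative pooling: average all token embeddings
    # outputs.last_hidden_state has shape [batch, seq_len, hidden];
    # a production version would also mask out padding tokens before averaging
    mean_embedding = outputs.last_hidden_state.mean(dim = 1).cpu().numpy()[0]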
248
 
249
+
250
  @ContractAnalyzerLogger.log_execution_time("extract_clauses")
251
+ def extract_clauses(self, contract_text: str, max_clauses: int = 15) -> List[ExtractedClause]:
 
252
  """
253
  Extract and classify clauses from contract using hybrid approach
254
 
 
258
  3. Legal-BERT classification
259
  4. Deduplicate and rank by confidence
260
 
261
+ Arguments:
262
+ ----------
263
+ contract_text { str } : Full contract text
264
+
265
+ max_clauses { int } : Maximum number of clauses to return
266
 
267
  Returns:
268
+ --------
269
+ { list } : List of ExtractedClause objects sorted by confidence
270
  """
271
 
272
  log_info("Starting clause extraction",
273
+ text_length = len(contract_text),
274
+ contract_category = self.contract_category,
275
+ max_clauses = max_clauses,
276
+ )
277
 
278
+ # Extract using structural patterns
279
  structural_clauses = self._extract_structural_clauses(contract_text)
280
  log_info(f"Extracted {len(structural_clauses)} structural clauses")
281
 
282
+ # Semantic chunking for unstructured parts
283
+ semantic_chunks = self._semantic_chunking(contract_text, structural_clauses)
284
  log_info(f"Created {len(semantic_chunks)} semantic chunks")
285
 
286
+ # Combine all candidates
287
+ all_candidates = structural_clauses + semantic_chunks
288
  log_info(f"Total candidates: {len(all_candidates)}")
289
 
290
+ # Classify with Legal-BERT
291
  classified_clauses = self._classify_clauses_with_legal_bert(all_candidates)
292
  log_info(f"Classified {len(classified_clauses)} clauses")
293
 
294
+ # Deduplicate and rank
295
+ final_clauses = self._deduplicate_and_rank(classified_clauses, max_clauses)
296
  log_info(f"Final output: {len(final_clauses)} clauses")
297
 
298
  return final_clauses
299
 
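A minimal usage sketch of this extraction pipeline; `loader` is assumed to be an already-initialized ModelLoader, and the file name and variable names are illustrative, not part of this commit:

    # Hypothetical driver for the extractor defined in this file
    from pathlib import Path

    extractor = ClauseExtractor(model_loader = loader, contract_category = "employment")
    contract_text = Path("contract.txt").read_text()   # assumed sample input
    for clause in extractor.extract_clauses(contract_text, max_clauses = 10):
        print(clause.reference, clause.category, round(clause.confidence, 2))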
 
 
 
300
 
301
  def _extract_structural_clauses(self, text: str) -> List[Dict]:
302
  """
 
308
  - "Article III. Text"
309
  - "Clause 11. Text"
310
  """
311
+ candidates = list()
312
 
313
  # Clean text
314
+ text = re.sub(r'\s+', ' ', text)
315
 
316
  # Patterns for legal numbering
317
+ patterns = [(r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=\d+\.\d+(?:\.\d+)*\.|$)', 'numbered'),
318
+ (r'(Article\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+))\.\s*([^\n]{30,800}?)(?=Article\s+(?:\d+|[IVXLCDM]+)|$)', 'article'),
319
+ (r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
320
+ (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
321
+ (r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\((?:[a-z]|[ivxlcdm]+)\)|\n\n|$)', 'subclause'),
322
+ ]
 
 
 
 
 
 
323
 
324
  for pattern, ref_type in patterns:
325
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
326
+
327
  for match in matches:
328
  clause_text = match.group(2).strip()
329
 
 
331
  if not self._is_boilerplate(clause_text):
332
  # Check for meaningful content
333
  if self._has_meaningful_content(clause_text):
334
+ candidates.append({'text' : clause_text,
335
+ 'reference' : match.group(1).strip(),
336
+ 'start' : match.start(),
337
+ 'end' : match.end(),
338
+ 'type' : 'structural',
339
+ 'ref_type' : ref_type,
340
+ })
 
341
 
342
  # Remove overlapping clauses
343
  candidates = self._remove_overlapping(candidates)
344
 
345
  return candidates
346
+
347
 
348
  def _is_boilerplate(self, text: str) -> bool:
349
+ """
350
+ Check if text is boilerplate/definitional rather than substantive
351
+ """
352
+ boilerplate_indicators = ['shall mean',
353
+ 'means and includes',
354
+ 'defined as',
355
+ 'definition of',
356
+ 'hereinafter referred to',
357
+ 'for purposes of this',
358
+ 'interpretation of',
359
+ 'as used in this',
360
+ 'the term',
361
+ 'shall include',
362
+ 'includes but not limited',
363
+ ]
364
+
365
+ text_lower = text.lower()
366
  # Treat text as boilerplate if it contains a strong definitional indicator OR reads like a short quoted definition
367
+ has_indicator = any(indicator in text_lower for indicator in boilerplate_indicators)
368
+ is_short_definition = len(text.split()) < 50 and '"' in text
369
 
370
  return has_indicator or is_short_definition
371
 
372
+
373
  def _has_meaningful_content(self, text: str) -> bool:
374
+ """
375
+ Check if text has meaningful legal content
376
+ """
377
  # Must have minimum length
378
+ if (len(text.split()) < 15):
379
  return False
380
 
381
  # Check for legal action verbs
382
+ action_verbs = ['shall',
383
+ 'must',
384
+ 'will',
385
+ 'may',
386
+ 'agrees',
387
+ 'undertakes',
388
+ 'covenants',
389
+ 'warrants',
390
+ 'represents',
391
+ 'acknowledges',
392
+ 'certifies',
393
+ 'indemnifies',
394
+ 'waives',
395
+ 'terminates',
396
+ ]
397
+
398
+ text_lower = text.lower()
399
+ has_action = any(verb in text_lower for verb in action_verbs)
400
 
401
  # Check for legal subjects
402
+ legal_subjects = ['party',
403
+ 'parties',
404
+ 'employee',
405
+ 'employer',
406
+ 'company',
407
+ 'contractor',
408
+ 'consultant',
409
+ 'client',
410
+ 'vendor',
411
+ 'buyer',
412
+ 'seller',
413
+ 'landlord',
414
+ 'tenant',
415
+ 'licensor',
416
+ 'licensee',
417
+ ]
418
+
419
+ has_subject = any(subj in text_lower for subj in legal_subjects)
420
 
421
  return has_action or has_subject
422
 
423
+
424
  def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
425
+ """
426
+ Remove overlapping clause extractions
427
+ """
428
  if not candidates:
429
  return []
430
 
431
  # Sort by start position
432
+ candidates.sort(key = lambda x: x['start'])
433
 
434
  non_overlapping = [candidates[0]]
435
 
 
437
  last = non_overlapping[-1]
438
 
439
  # Check if overlaps
440
+ if (candidate['start'] >= last['end']):
441
  non_overlapping.append(candidate)
442
+
443
+ elif (len(candidate['text']) > len(last['text'])):
444
  # Keep longer clause if overlapping
445
  non_overlapping[-1] = candidate
446
 
447
  return non_overlapping
448
 
 
 
 
449
 
450
+ def _semantic_chunking(self, text: str, structural_clauses: List[Dict], chunk_size: int = 200) -> List[Dict]:
 
 
451
  """
452
+ Semantically chunk text not covered by structural extraction, using sentence boundaries to find natural clause breaks
 
453
  """
 
454
  # Get covered ranges from structural clauses
455
  covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
456
 
457
  # Split into sentences
458
+ sentences = self.text_processor.extract_sentences(text)
459
 
460
+ chunks = list()
461
+ current_chunk = list()
462
  current_length = 0
463
+ current_start = 0
464
 
465
  for sentence in sentences:
466
  # Check if sentence is already covered by structural extraction
467
  sentence_start = text.find(sentence, current_start)
468
+ if (sentence_start == -1):
469
  continue
470
 
471
  if self._is_in_range(sentence_start, covered_ranges):
 
476
  current_length += len(sentence.split())
477
 
478
  # Create chunk when reaching size limit
479
+ if (current_length >= chunk_size):
480
  chunk_text = ' '.join(current_chunk).strip()
481
 
482
+ if (len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text)):
483
  if self._has_meaningful_content(chunk_text):
484
+ chunks.append({'text' : chunk_text,
485
+ 'reference' : f'Semantic-{len(chunks)+1}',
486
+ 'start' : sentence_start,
487
+ 'end' : sentence_start + len(chunk_text),
488
+ 'type' : 'semantic',
489
+ 'ref_type' : 'semantic',
490
+ })
 
491
 
492
+ current_chunk = list()
493
  current_length = 0
494
 
495
  current_start = sentence_start + len(sentence)
 
497
  # Add final chunk if exists
498
  if current_chunk:
499
  chunk_text = ' '.join(current_chunk).strip()
500
+
501
+ if ((len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text))):
502
  if self._has_meaningful_content(chunk_text):
503
  sentence_start = text.find(current_chunk[0])
504
+ chunks.append({'text' : chunk_text,
505
+ 'reference' : f'Semantic-{len(chunks)+1}',
506
+ 'start' : sentence_start,
507
+ 'end' : sentence_start + len(chunk_text),
508
+ 'type' : 'semantic',
509
+ 'ref_type' : 'semantic',
510
+ })
 
511
 
512
  return chunks
513
 
514
+
515
  def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
516
+ """
517
+ Check if position is within any of the ranges
518
+ """
519
  return any(start <= position <= end for start, end in ranges)
520
 
 
 
 
521
 
522
  def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
523
  """
524
  Classify clauses using Legal-BERT embeddings + keyword matching
525
  """
526
+ classified = list()
527
 
528
  for candidate in candidates:
529
  # Get Legal-BERT embedding for clause
530
+ clause_embedding = self._get_legal_bert_embedding(candidate['text'])
531
 
532
  # Classify using hybrid approach
533
+ category, confidence, legal_bert_score = self._classify_single_clause(candidate['text'], clause_embedding)
 
 
 
534
 
535
  # Extract risk indicators
536
+ risk_indicators = self._extract_risk_indicators(candidate['text'])
537
 
538
  # Extract sub-clauses if any
539
+ subclauses = self._extract_subclauses(candidate['text'])
540
 
541
+ classified.append(ExtractedClause(text = candidate['text'],
542
+ reference = candidate['reference'],
543
+ category = category,
544
+ confidence = confidence,
545
+ start_pos = candidate['start'],
546
+ end_pos = candidate['end'],
547
+ extraction_method = candidate['type'],
548
+ risk_indicators = risk_indicators,
549
+ embeddings = clause_embedding,
550
+ subclauses = subclauses,
551
+ legal_bert_score = legal_bert_score,
552
+ )
553
+ )
554
 
555
  return classified
556
 
557
+
558
+ def _classify_single_clause(self, text: str, clause_embedding: np.ndarray) -> Tuple[str, float, float]:
559
  """
560
  Classify single clause using Legal-BERT + keyword matching
561
 
562
  Returns:
563
+ --------
564
+ { tuple } : (category, confidence, legal_bert_score)
565
  """
566
+ text_lower = text.lower()
567
 
568
+ # Keyword matching
569
+ keyword_scores = dict()
570
+
571
  for category, config in self.CLAUSE_CATEGORIES.items():
572
+ keywords = config['keywords']
573
+ weight = config['weight']
574
 
575
+ keyword_count = sum(1 for kw in keywords if kw in text_lower)
576
  keyword_scores[category] = (keyword_count / len(keywords)) * weight
577
 
578
+ # Legal-BERT semantic similarity
579
+ semantic_scores = dict()
580
  clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
581
 
582
  for category, cat_embedding in self.category_embeddings.items():
583
+ cat_embedding_tensor = torch.tensor(cat_embedding).unsqueeze(0)
584
+ similarity = torch.nn.functional.cosine_similarity(clause_embedding_tensor, cat_embedding_tensor).item()
 
 
 
585
  semantic_scores[category] = similarity
586
 
587
  # Combine scores (70% semantic, 30% keyword)
588
+ combined_scores = dict()
589
+
590
  for category in self.CLAUSE_CATEGORIES.keys():
591
+ combined = (semantic_scores.get(category, 0) * 0.70 + keyword_scores.get(category, 0) * 0.30)
 
 
 
592
  combined_scores[category] = combined
593
 
594
  # Get best category
595
+ best_category = max(combined_scores, key = combined_scores.get)
596
+ confidence = combined_scores[best_category]
597
  legal_bert_score = semantic_scores[best_category]
598
 
599
  return best_category, confidence, legal_bert_score
600
 
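A quick sanity check of the 70/30 blend above: a clause with Legal-BERT similarity 0.62 and keyword score 0.40 combines to 0.70 × 0.62 + 0.30 × 0.40 = 0.554, so the semantic signal dominates the reported confidence.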
601
+
602
  def _extract_risk_indicators(self, text: str) -> List[str]:
603
+ """
604
+ Extract risk indicator keywords from clause text
605
+ """
606
+ text_lower = text.lower()
607
+ found_indicators = list()
608
 
609
  for severity, indicators in self.RISK_INDICATORS.items():
610
  for indicator in indicators:
611
  if indicator in text_lower:
612
  found_indicators.append(indicator)
613
 
614
+ # Top 25 risk indicators
615
+ return found_indicators[:25]
616
 
617
+
618
  def _extract_subclauses(self, text: str) -> List[str]:
619
+ """
620
+ Extract sub-clauses from main clause (e.g., (a), (b), (i), (ii))
621
+ """
622
  # Pattern for sub-clauses: (a), (i), etc.
623
  subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\((?:[a-z]|[ivxlcdm]+)\)|$)'
624
+ matches = re.findall(subclause_pattern, text, re.IGNORECASE)
625
 
626
+ subclauses = list()
627
+
628
  for ref, subtext in matches:
629
  clean_text = subtext.strip()
630
+
631
+ if (len(clean_text) >= 20):
632
  subclauses.append(f"({ref}) {clean_text}")
633
 
634
+ # Max 25 sub-clauses
635
+ return subclauses[:25]
636
 
 
 
 
637
 
638
+ def _deduplicate_and_rank(self, clauses: List[ExtractedClause], max_clauses: int) -> List[ExtractedClause]:
 
639
  """
640
  Remove duplicates and rank by confidence + legal_bert_score
641
  """
 
643
  return []
644
 
645
  # Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
646
+ clauses.sort(key = lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4), reverse = True)
 
 
 
647
 
648
  # Deduplicate by text similarity
649
+ unique_clauses = list()
650
+ seen_texts = set()
651
 
652
  for clause in clauses:
653
  # Simple deduplication by first 100 chars
654
+ text_key = clause.text[:100].lower().strip()
655
 
656
  # Also check similarity to already added clauses
657
  is_duplicate = False
658
+
659
  for existing in unique_clauses:
660
  similarity = self._text_similarity(clause.text, existing.text)
661
+ if (similarity > 0.85):
662
  is_duplicate = True
663
  break
664
 
 
666
  unique_clauses.append(clause)
667
  seen_texts.add(text_key)
668
 
669
+ if (len(unique_clauses) >= max_clauses):
670
  break
671
 
672
  return unique_clauses
673
 
674
+
675
  def _text_similarity(self, text1: str, text2: str) -> float:
676
+ """
677
+ Calculate text similarity (simple Jaccard similarity)
678
+ """
679
+ words1 = set(text1.lower().split())
680
+ words2 = set(text2.lower().split())
681
 
682
  intersection = len(words1 & words2)
683
+ union = len(words1 | words2)
684
 
685
  return intersection / union if union > 0 else 0.0
686
 
 
 
 
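As a worked example of this Jaccard measure: "the employee shall indemnify the company" versus "the employee shall defend the company" share 4 of 6 unique words, giving a similarity of 4/6 ≈ 0.67, safely below the 0.85 deduplication threshold used in _deduplicate_and_rank.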
687
 
688
  def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
689
+ """
690
+ Get distribution of clause categories
691
+ """
692
  distribution = defaultdict(int)
693
+
694
  for clause in clauses:
695
  distribution[clause.category] += 1
696
 
 
698
 
699
  return dict(distribution)
700
 
701
+
702
  def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
703
+ """
704
+ Get clauses with risk indicators
705
+ """
706
  risky = [c for c in clauses if c.risk_indicators]
707
+
708
+ risky.sort(key = lambda x: len(x.risk_indicators), reverse = True)
709
 
710
+ top_25_risky_clauses = risky[:25]
711
+
712
+ return top_25_risky_clauses
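The two reporting helpers above can be chained on the extractor's output; a small illustrative sketch, assuming `clauses` was returned by extract_clauses (not part of this commit):

    # Hypothetical reporting step over previously extracted clauses
    distribution = extractor.get_category_distribution(clauses)   # e.g. {'termination': 3, 'general': 2}
    for clause in extractor.get_high_risk_clauses(clauses)[:5]:
        print(clause.reference, clause.risk_indicators)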
services/contract_classifier.py CHANGED
@@ -232,7 +232,7 @@ class ContractClassifier:
232
  Arguments:
233
  ----------
234
  model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
235
- """
236
  self.model_loader = model_loader
237
  self.embedding_model = None
238
  self.legal_bert_model = None
@@ -294,7 +294,7 @@ class ContractClassifier:
294
 
295
  log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
296
 
297
-
298
  # MAIN CLASSIFICATION METHOD
299
  @ContractAnalyzerLogger.log_execution_time("classify_contract")
300
  def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
@@ -325,103 +325,99 @@ class ContractClassifier:
325
  raise ValueError("Contract text too short for classification")
326
 
327
  # Preprocess text (use first 3000 chars for efficiency)
328
- text_excerpt = contract_text[:3000]
329
 
330
  log_info("Starting contract classification",
331
- text_length=len(contract_text),
332
- excerpt_length=len(text_excerpt))
 
333
 
334
  # Step 1: Keyword scoring
335
- keyword_scores = self._score_keywords(contract_text.lower())
336
 
337
  # Step 2: Semantic similarity
338
- semantic_scores = self._semantic_similarity(text_excerpt)
339
 
340
  # Step 3: Legal-BERT enhanced (optional - can be expensive)
341
- # legal_bert_scores = self._legal_bert_classification(text_excerpt)
342
 
343
  # Step 4: Combine scores (weighted average)
344
- combined_scores = self._combine_scores(
345
- keyword_scores=keyword_scores,
346
- semantic_scores=semantic_scores,
347
- # legal_bert_scores=legal_bert_scores # Uncomment if using Legal-BERT
348
- )
349
 
350
  # Step 5: Get primary category
351
  if not combined_scores:
352
  log_info("No categories detected, defaulting to 'general'")
353
- return ContractCategory(
354
- category="general",
355
- subcategory=None,
356
- confidence=0.5,
357
- reasoning=["Unable to determine specific contract type"],
358
- detected_keywords=[]
359
- )
360
 
361
- primary_category = max(combined_scores, key=combined_scores.get)
362
- confidence = combined_scores[primary_category]
363
 
364
  # Step 6: Detect subcategory
365
- subcategory = self._detect_subcategory(contract_text, primary_category)
366
 
367
  # Step 7: Generate reasoning
368
- reasoning = self._generate_reasoning(
369
- contract_text=contract_text,
370
- primary_category=primary_category,
371
- subcategory=subcategory,
372
- keyword_scores=keyword_scores,
373
- semantic_scores=semantic_scores,
374
- combined_scores=combined_scores
375
- )
376
 
377
  # Step 8: Extract detected keywords
378
- detected_keywords = self._extract_detected_keywords(contract_text, primary_category)
379
-
380
- # Step 9: Get alternative categories
381
- alternative_categories = sorted(
382
- [(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
383
- key=lambda x: x[1],
384
- reverse=True
385
- )[:3] # Top 3 alternatives
386
-
387
- result = ContractCategory(
388
- category=primary_category,
389
- subcategory=subcategory,
390
- confidence=confidence,
391
- reasoning=reasoning,
392
- detected_keywords=detected_keywords,
393
- alternative_categories=alternative_categories
394
- )
395
 
396
  log_info("Contract classified successfully",
397
- category=primary_category,
398
- subcategory=subcategory,
399
- confidence=confidence)
 
400
 
401
  return result
402
 
403
- # =========================================================================
404
- # SCORING METHODS
405
- # =========================================================================
406
 
407
  def _score_keywords(self, text_lower: str) -> Dict[str, float]:
408
  """
409
  Score each category based on keyword presence
410
 
411
- Args:
412
- text_lower: Lowercase contract text
 
413
 
414
  Returns:
415
- Dictionary of {category: score}
 
416
  """
417
- scores = {}
418
 
419
  for category, config in self.CATEGORY_HIERARCHY.items():
420
- keywords = config['keywords']
421
- weight = config['weight']
422
 
423
  # Count keyword matches
424
- keyword_count = sum(1 for keyword in keywords if keyword in text_lower)
425
 
426
  # Normalize by number of keywords and apply weight
427
  normalized_score = (keyword_count / len(keywords)) * weight
@@ -430,91 +426,92 @@ class ContractClassifier:
430
 
431
  return scores
432
 
 
433
  def _semantic_similarity(self, text: str) -> Dict[str, float]:
434
  """
435
  Calculate semantic similarity to category templates using embeddings
436
 
437
- Args:
438
- text: Contract text excerpt
 
439
 
440
  Returns:
441
- Dictionary of {category: similarity_score}
 
442
  """
443
  # Encode contract text
444
- text_embedding = self.embedding_model.encode(text, convert_to_tensor=True)
445
 
446
  # Calculate similarity to each category
447
- similarities = {}
 
448
  for category, cat_embedding in self.category_embeddings.items():
449
- similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
450
  similarities[category] = similarity
451
 
452
  return similarities
 
453
 
454
  def _legal_bert_classification(self, text: str) -> Dict[str, float]:
455
  """
456
  Use Legal-BERT for classification (optional - computationally expensive)
457
 
458
- Args:
459
- text: Contract text excerpt
 
460
 
461
  Returns:
462
- Dictionary of {category: score}
 
463
  """
464
- # This is a placeholder for Legal-BERT classification
465
- # In production, you'd fine-tune Legal-BERT on labeled contract data
466
-
467
  # Tokenize
468
- inputs = self.legal_bert_tokenizer(
469
- text,
470
- return_tensors="pt",
471
- padding=True,
472
- truncation=True,
473
- max_length=512
474
- ).to(self.device)
475
 
476
  # Get embeddings
477
  with torch.no_grad():
478
- outputs = self.legal_bert_model(**inputs)
479
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
480
 
481
- # For now, return uniform scores (placeholder)
482
- # In production, you'd use a trained classifier head
483
  return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
484
 
485
- def _combine_scores(self, keyword_scores: Dict[str, float],
486
- semantic_scores: Dict[str, float],
487
- legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
488
  """
489
  Combine scores from different methods (weighted average)
490
 
491
- Args:
492
- keyword_scores: Keyword-based scores
493
- semantic_scores: Semantic similarity scores
494
- legal_bert_scores: Legal-BERT scores (optional)
 
 
 
495
 
496
  Returns:
497
- Combined scores dictionary
 
498
  """
499
- combined = {}
500
 
501
  # Weights for each method
502
- keyword_weight = 0.40
503
- semantic_weight = 0.60
504
  legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
505
 
506
  if legal_bert_scores:
507
  # Normalize weights
508
- total_weight = keyword_weight + semantic_weight + legal_bert_weight
509
- keyword_weight /= total_weight
510
- semantic_weight /= total_weight
511
  legal_bert_weight /= total_weight
512
 
513
  for category in self.CATEGORY_HIERARCHY.keys():
514
- score = (
515
- keyword_scores.get(category, 0) * keyword_weight +
516
- semantic_scores.get(category, 0) * semantic_weight
517
- )
518
 
519
  if legal_bert_scores:
520
  score += legal_bert_scores.get(category, 0) * legal_bert_weight
@@ -523,202 +520,204 @@ class ContractClassifier:
523
 
524
  return combined
525
 
526
- # =========================================================================
527
- # SUBCATEGORY DETECTION
528
- # =========================================================================
529
 
530
  def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
531
  """
532
  Detect specific subcategory within primary category
533
 
534
- Args:
535
- text: Full contract text
536
- primary_category: Detected primary category
 
 
537
 
538
  Returns:
539
- Subcategory name or None
 
540
  """
541
- text_lower = text.lower()
542
 
543
  # Get subcategories for this category
544
  subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
545
 
546
  # Score each subcategory
547
- subcat_scores = {}
 
548
  for subcat in subcategories:
549
  if subcat in self.SUBCATEGORY_PATTERNS:
550
- patterns = self.SUBCATEGORY_PATTERNS[subcat]
551
- score = sum(1 for pattern in patterns if pattern in text_lower)
552
  subcat_scores[subcat] = score
553
 
554
  # Return best match if any
555
- if subcat_scores and max(subcat_scores.values()) > 0:
556
- best_subcat = max(subcat_scores, key=subcat_scores.get)
557
  log_info(f"Detected subcategory: {best_subcat}",
558
- category=primary_category,
559
- score=subcat_scores[best_subcat])
 
 
560
  return best_subcat
561
 
562
  return None
563
 
564
- # =========================================================================
565
- # REASONING & EXPLANATION
566
- # =========================================================================
567
-
568
- def _generate_reasoning(self, contract_text: str, primary_category: str,
569
- subcategory: Optional[str],
570
- keyword_scores: Dict[str, float],
571
- semantic_scores: Dict[str, float],
572
- combined_scores: Dict[str, float]) -> List[str]:
573
  """
574
  Generate human-readable reasoning for classification
575
 
576
  Returns:
577
- List of reasoning statements
 
578
  """
579
- reasoning = []
580
 
581
  # Primary category reasoning
582
- keyword_match = keyword_scores.get(primary_category, 0)
583
  semantic_match = semantic_scores.get(primary_category, 0)
584
 
585
- if keyword_match > 0.5:
586
- reasoning.append(
587
- f"Strong keyword indicators for {primary_category.replace('_', ' ')} category "
588
- f"({int(keyword_match * 100)}% keyword match)"
589
- )
590
- elif keyword_match > 0.3:
591
- reasoning.append(
592
- f"Moderate keyword presence for {primary_category.replace('_', ' ')} "
593
- f"({int(keyword_match * 100)}% keyword match)"
594
- )
595
-
596
- if semantic_match > 0.65:
597
- reasoning.append(
598
- f"Contract language semantically similar to {primary_category.replace('_', ' ')} agreements "
599
- f"(similarity: {semantic_match:.2f})"
600
- )
601
- elif semantic_match > 0.50:
602
- reasoning.append(
603
- f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts "
604
- f"(similarity: {semantic_match:.2f})"
605
- )
606
 
607
  # Subcategory reasoning
608
  if subcategory:
609
- reasoning.append(
610
- f"Specific subcategory identified: {subcategory.replace('_', ' ')}"
611
- )
612
 
613
  # Alternative categories (if close)
614
- sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
615
- if len(sorted_scores) > 1 and sorted_scores[1][1] > 0.40:
 
616
  alt_category, alt_score = sorted_scores[1]
617
- reasoning.append(
618
- f"Also contains elements of {alt_category.replace('_', ' ')} "
619
- f"(secondary match: {alt_score:.2f})"
620
- )
621
 
622
  # If no strong reasoning
623
  if not reasoning:
624
  reasoning.append("Classification based on general contract structure and terminology")
625
 
626
  return reasoning
 
627
 
628
  def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
629
  """
630
  Extract which specific keywords were found
631
 
632
- Args:
633
- text: Contract text
634
- category: Detected category
 
 
635
 
636
  Returns:
637
- List of detected keywords
 
638
  """
639
  text_lower = text.lower()
640
- keywords = self.CATEGORY_HIERARCHY[category]['keywords']
641
 
642
- detected = [kw for kw in keywords if kw in text_lower]
643
- return detected[:10] # Top 10 keywords
 
 
644
 
645
- # =========================================================================
646
- # MULTI-LABEL CLASSIFICATION
647
- # =========================================================================
648
 
649
  @ContractAnalyzerLogger.log_execution_time("classify_multi_label")
650
- def classify_multi_label(self, text: str,
651
- threshold: float = 0.45) -> List[ContractCategory]:
652
  """
653
- Classify as multiple categories if applicable
654
- (e.g., Employment + NDA, Consulting + IP Assignment)
655
 
656
- Args:
657
- text: Contract text
658
- threshold: Minimum confidence threshold for multi-label
 
 
659
 
660
  Returns:
661
- List of ContractCategory objects (sorted by confidence)
 
662
  """
663
- log_info("Starting multi-label classification", threshold=threshold)
664
 
665
  # Get scores
666
- keyword_scores = self._score_keywords(text.lower())
667
- semantic_scores = self._semantic_similarity(text[:3000])
668
  combined_scores = self._combine_scores(keyword_scores, semantic_scores)
669
 
670
  # Get all categories above threshold
671
- matches = []
 
672
  for category, score in combined_scores.items():
673
- if score >= threshold:
674
  subcategory = self._detect_subcategory(text, category)
675
- reasoning = self._generate_reasoning(
676
- text, category, subcategory,
677
- keyword_scores, semantic_scores, combined_scores
678
- )
679
- keywords = self._extract_detected_keywords(text, category)
680
 
681
- matches.append(ContractCategory(
682
- category=category,
683
- subcategory=subcategory,
684
- confidence=score,
685
- reasoning=reasoning,
686
- detected_keywords=keywords
687
- ))
688
 
689
  # Sort by confidence
690
- matches.sort(key=lambda x: x.confidence, reverse=True)
691
 
692
  log_info(f"Multi-label classification found {len(matches)} categories")
693
 
694
  return matches if matches else [self.classify_contract(text)]
695
 
696
- # =========================================================================
697
- # UTILITY METHODS
698
- # =========================================================================
699
-
700
  def get_category_description(self, category: str) -> str:
701
- """Get human-readable description of a category"""
702
- descriptions = {
703
- 'employment': 'Employment agreements governing employer-employee relationships',
704
- 'consulting': 'Consulting and independent contractor agreements',
705
- 'nda': 'Non-disclosure and confidentiality agreements',
706
- 'technology': 'Software licensing and technology service agreements',
707
- 'intellectual_property': 'IP assignment, licensing, and protection agreements',
708
- 'real_estate': 'Property lease, rental, and purchase agreements',
709
- 'financial': 'Loan, credit, and financial service agreements',
710
- 'business': 'Partnership, joint venture, and corporate agreements',
711
- 'sales': 'Sales, purchase, and distribution agreements',
712
- 'service_agreement': 'Professional service and maintenance agreements',
713
- 'vendor': 'Vendor, supplier, and procurement agreements',
714
- 'agency': 'Agency and representation agreements'
715
- }
 
 
716
  return descriptions.get(category, 'General contract agreement')
 
717
 
718
  def get_all_categories(self) -> List[str]:
719
- """Get list of all supported categories"""
 
 
720
  return list(self.CATEGORY_HIERARCHY.keys())
721
 
 
722
  def get_subcategories(self, category: str) -> List[str]:
723
- """Get subcategories for a specific category"""
724
- return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])
 
 
 
232
  Arguments:
233
  ----------
234
  model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
235
+ """
236
  self.model_loader = model_loader
237
  self.embedding_model = None
238
  self.legal_bert_model = None
 
294
 
295
  log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
296
 
297
+
298
  # MAIN CLASSIFICATION METHOD
299
  @ContractAnalyzerLogger.log_execution_time("classify_contract")
300
  def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
 
325
  raise ValueError("Contract text too short for classification")
326
 
327
  # Classify against the full contract text (no excerpt truncation)
328
+ text_excerpt = contract_text
329
 
330
  log_info("Starting contract classification",
331
+ text_length = len(contract_text),
332
+ excerpt_length = len(text_excerpt),
333
+ )
334
 
335
  # Step 1: Keyword scoring
336
+ keyword_scores = self._score_keywords(contract_text.lower())
337
 
338
  # Step 2: Semantic similarity
339
+ semantic_scores = self._semantic_similarity(text_excerpt)
340
 
341
  # Step 3: Legal-BERT enhanced (optional - can be expensive)
342
+ legal_bert_scores = self._legal_bert_classification(text_excerpt)
343
 
344
  # Step 4: Combine scores (weighted average)
345
+ combined_scores = self._combine_scores(keyword_scores = keyword_scores,
346
+ semantic_scores = semantic_scores,
347
+ legal_bert_scores = legal_bert_scores,
348
+ )
 
349
 
350
  # Step 5: Get primary category
351
  if not combined_scores:
352
  log_info("No categories detected, defaulting to 'general'")
353
+ return ContractCategory(category = "general",
354
+ subcategory = None,
355
+ confidence = 0.5,
356
+ reasoning = ["Unable to determine specific contract type"],
357
+ detected_keywords = [],
358
+ )
 
359
 
360
+ primary_category = max(combined_scores, key = combined_scores.get)
361
+ confidence = combined_scores[primary_category]
362
 
363
  # Step 6: Detect subcategory
364
+ subcategory = self._detect_subcategory(contract_text, primary_category)
365
 
366
  # Step 7: Generate reasoning
367
+ reasoning = self._generate_reasoning(contract_text = contract_text,
368
+ primary_category = primary_category,
369
+ subcategory = subcategory,
370
+ keyword_scores = keyword_scores,
371
+ semantic_scores = semantic_scores,
372
+ combined_scores = combined_scores,
373
+ )
 
374
 
375
  # Step 8: Extract detected keywords
376
+ detected_keywords = self._extract_detected_keywords(contract_text, primary_category)
377
+
378
+ # Step 9: Get the top 3 alternative categories
379
+ alternative_categories = sorted([(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
380
+ key = lambda x: x[1],
381
+ reverse = True,
382
+ )[:3]
383
+
384
+ result = ContractCategory(category = primary_category,
385
+ subcategory = subcategory,
386
+ confidence = confidence,
387
+ reasoning = reasoning,
388
+ detected_keywords = detected_keywords,
389
+ alternative_categories = alternative_categories,
390
+ )
 
 
391
 
392
  log_info("Contract classified successfully",
393
+ category = primary_category,
394
+ subcategory = subcategory,
395
+ confidence = confidence,
396
+ )
397
 
398
  return result
399
 
 
 
 
400
 
401
  def _score_keywords(self, text_lower: str) -> Dict[str, float]:
402
  """
403
  Score each category based on keyword presence
404
 
405
+ Arguments:
406
+ ----------
407
+ text_lower { str } : Lowercase contract text
408
 
409
  Returns:
410
+ --------
411
+ { dict } : Dictionary of {category: score}
412
  """
413
+ scores = dict()
414
 
415
  for category, config in self.CATEGORY_HIERARCHY.items():
416
+ keywords = config['keywords']
417
+ weight = config['weight']
418
 
419
  # Count keyword matches
420
+ keyword_count = sum(1 for keyword in keywords if keyword in text_lower)
421
 
422
  # Normalize by number of keywords and apply weight
423
  normalized_score = (keyword_count / len(keywords)) * weight
 
426
 
427
  return scores
428
 
429
+
430
  def _semantic_similarity(self, text: str) -> Dict[str, float]:
431
  """
432
  Calculate semantic similarity to category templates using embeddings
433
 
434
+ Arguments:
435
+ ----------
436
+ text { str } : Contract text excerpt
437
 
438
  Returns:
439
+ --------
440
+ { dict } : Dictionary of {category: similarity_score}
441
  """
442
  # Encode contract text
443
+ text_embedding = self.embedding_model.encode(text, convert_to_tensor = True)
444
 
445
  # Calculate similarity to each category
446
+ similarities = dict()
447
+
448
  for category, cat_embedding in self.category_embeddings.items():
449
+ similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
450
  similarities[category] = similarity
451
 
452
  return similarities
453
+
454
 
455
  def _legal_bert_classification(self, text: str) -> Dict[str, float]:
456
  """
457
  Use Legal-BERT for classification (optional - computationally expensive)
458
 
459
+ Arguments:
460
+ ----------
461
+ text { str } : Contract text excerpt
462
 
463
  Returns:
464
+ --------
465
+ { dict } : Dictionary of {category: score}
466
  """
 
 
 
467
  # Tokenize
468
+ inputs = self.legal_bert_tokenizer(text,
469
+ return_tensors = "pt",
470
+ padding = True,
471
+ truncation = True,
472
+ max_length = 512,
473
+ ).to(self.device)
 
474
 
475
  # Get embeddings
476
  with torch.no_grad():
477
+ outputs = self.legal_bert_model(**inputs)
478
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
479
 
 
 
480
  # Placeholder scoring: uniform 0.5 per category until a trained classifier head is available
  return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
481
 
482
+
483
+ def _combine_scores(self, keyword_scores: Dict[str, float], semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
 
484
  """
485
  Combine scores from different methods (weighted average)
486
 
487
+ Arguments:
488
+ ----------
489
+ keyword_scores { dict } : Keyword-based scores
490
+
491
+ semantic_scores { dict } : Semantic similarity scores
492
+
493
+ legal_bert_scores { dict } : Legal-BERT scores (optional)
494
 
495
  Returns:
496
+ --------
497
+ { dict } : Combined scores dictionary
498
  """
499
+ combined = dict()
500
 
501
  # Weights for each method
502
+ keyword_weight = 0.40
503
+ semantic_weight = 0.60
504
  legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
505
 
506
  if legal_bert_scores:
507
  # Normalize weights
508
+ total_weight = keyword_weight + semantic_weight + legal_bert_weight
509
+ keyword_weight /= total_weight
510
+ semantic_weight /= total_weight
511
  legal_bert_weight /= total_weight
512
 
513
  for category in self.CATEGORY_HIERARCHY.keys():
514
+ score = (keyword_scores.get(category, 0) * keyword_weight + semantic_scores.get(category, 0) * semantic_weight)
 
 
 
515
 
516
  if legal_bert_scores:
517
  score += legal_bert_scores.get(category, 0) * legal_bert_weight
 
520
 
521
  return combined
522
 
 
 
 
523
 
524
  def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
525
  """
526
  Detect specific subcategory within primary category
527
 
528
+ Arguments:
529
+ ----------
530
+ text { str } : Full contract text
531
+
532
+ primary_category { str } : Detected primary category
533
 
534
  Returns:
535
+ --------
536
+ { str } : Subcategory name or None
537
  """
538
+ text_lower = text.lower()
539
 
540
  # Get subcategories for this category
541
  subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
542
 
543
  # Score each subcategory
544
+ subcat_scores = dict()
545
+
546
  for subcat in subcategories:
547
  if subcat in self.SUBCATEGORY_PATTERNS:
548
+ patterns = self.SUBCATEGORY_PATTERNS[subcat]
549
+ score = sum(1 for pattern in patterns if pattern in text_lower)
550
  subcat_scores[subcat] = score
551
 
552
  # Return best match if any
553
+ if (subcat_scores and (max(subcat_scores.values()) > 0)):
554
+ best_subcat = max(subcat_scores, key = subcat_scores.get)
555
  log_info(f"Detected subcategory: {best_subcat}",
556
+ category = primary_category,
557
+ score = subcat_scores[best_subcat],
558
+ )
559
+
560
  return best_subcat
561
 
562
  return None
563
 
564
+
565
+ def _generate_reasoning(self, contract_text: str, primary_category: str, subcategory: Optional[str], keyword_scores: Dict[str, float], semantic_scores: Dict[str, float],
566
+ combined_scores: Dict[str, float]) -> List[str]:
 
 
 
 
 
 
567
  """
568
  Generate human-readable reasoning for classification
569
 
570
  Returns:
571
+ --------
572
+ { list } : List of reasoning statements
573
  """
574
+ reasoning = list()
575
 
576
  # Primary category reasoning
577
+ keyword_match = keyword_scores.get(primary_category, 0)
578
  semantic_match = semantic_scores.get(primary_category, 0)
579
 
580
+ if (keyword_match > 0.5):
581
+ reasoning.append(f"Strong keyword indicators for {primary_category.replace('_', ' ')} category "
582
+ f"({int(keyword_match * 100)}% keyword match)"
583
+ )
584
+
585
+ elif (keyword_match > 0.3):
586
+ reasoning.append(f"Moderate keyword presence for {primary_category.replace('_', ' ')} "
587
+ f"({int(keyword_match * 100)}% keyword match)"
588
+ )
589
+
590
+ if (semantic_match > 0.65):
591
+ reasoning.append(f"Contract language semantically similar to {primary_category.replace('_', ' ')} agreements "
592
+ f"(similarity: {semantic_match:.2f})"
593
+ )
594
+
595
+ elif (semantic_match > 0.50):
596
+ reasoning.append(f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts "
597
+ f"(similarity: {semantic_match:.2f})"
598
+ )
 
 
599
 
600
  # Subcategory reasoning
601
  if subcategory:
602
+ reasoning.append(f"Specific subcategory identified: {subcategory.replace('_', ' ')}")
 
 
603
 
604
  # Alternative categories (if close)
605
+ sorted_scores = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)
606
+
607
+ if ((len(sorted_scores) > 1) and (sorted_scores[1][1] > 0.40)):
608
  alt_category, alt_score = sorted_scores[1]
609
+
610
+ reasoning.append(f"Also contains elements of {alt_category.replace('_', ' ')} "
611
+ f"(secondary match: {alt_score:.2f})"
612
+ )
613
 
614
  # If no strong reasoning
615
  if not reasoning:
616
  reasoning.append("Classification based on general contract structure and terminology")
617
 
618
  return reasoning
619
+
620
 
621
  def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
622
  """
623
  Extract which specific keywords were found
624
 
625
+ Arguments:
626
+ ----------
627
+ text { str } : Contract text
628
+
629
+ category { str } : Detected category
630
 
631
  Returns:
632
+ --------
633
+ { list } : List of detected keywords
634
  """
635
  text_lower = text.lower()
636
+ keywords = self.CATEGORY_HIERARCHY[category]['keywords']
637
 
638
+ detected = [kw for kw in keywords if kw in text_lower]
639
+
640
+ # Top 10 keywords
641
+ return detected[:10]
642
 
 
 
 
643
 
644
  @ContractAnalyzerLogger.log_execution_time("classify_multi_label")
645
+ def classify_multi_label(self, text: str, threshold: float = 0.45) -> List[ContractCategory]:
 
646
  """
647
+ Classify as multiple categories if applicable (e.g., Employment + NDA, Consulting + IP Assignment)
 
648
 
649
+ Arguments:
650
+ ----------
651
+ text { str } : Contract text
652
+
653
+ threshold { float } : Minimum confidence threshold for multi-label
654
 
655
  Returns:
656
+ --------
657
+ { list } : List of ContractCategory objects (sorted by confidence)
658
  """
659
+ log_info("Starting multi-label classification", threshold = threshold)
660
 
661
  # Get scores
662
+ keyword_scores = self._score_keywords(text.lower())
663
+ semantic_scores = self._semantic_similarity(text)
664
  combined_scores = self._combine_scores(keyword_scores, semantic_scores)
665
 
666
  # Get all categories above threshold
667
+ matches = list()
668
+
669
  for category, score in combined_scores.items():
670
+ if (score >= threshold):
671
  subcategory = self._detect_subcategory(text, category)
672
+ reasoning = self._generate_reasoning(text, category, subcategory, keyword_scores, semantic_scores, combined_scores)
673
+ keywords = self._extract_detected_keywords(text, category)
 
 
 
674
 
675
+ matches.append(ContractCategory(category = category,
676
+ subcategory = subcategory,
677
+ confidence = score,
678
+ reasoning = reasoning,
679
+ detected_keywords = keywords,
680
+ )
681
+ )
682
 
683
  # Sort by confidence
684
+ matches.sort(key = lambda x: x.confidence, reverse = True)
685
 
686
  log_info(f"Multi-label classification found {len(matches)} categories")
687
 
688
  return matches if matches else [self.classify_contract(text)]
689
 
690
+
 
 
 
691
  def get_category_description(self, category: str) -> str:
692
+ """
693
+ Get human-readable description of a category
694
+ """
695
+ descriptions = {'employment' : 'Employment agreements governing employer-employee relationships',
696
+ 'consulting' : 'Consulting and independent contractor agreements',
697
+ 'nda' : 'Non-disclosure and confidentiality agreements',
698
+ 'technology' : 'Software licensing and technology service agreements',
699
+ 'intellectual_property' : 'IP assignment, licensing, and protection agreements',
700
+ 'real_estate' : 'Property lease, rental, and purchase agreements',
701
+ 'financial' : 'Loan, credit, and financial service agreements',
702
+ 'business' : 'Partnership, joint venture, and corporate agreements',
703
+ 'sales' : 'Sales, purchase, and distribution agreements',
704
+ 'service_agreement' : 'Professional service and maintenance agreements',
705
+ 'vendor' : 'Vendor, supplier, and procurement agreements',
706
+ 'agency' : 'Agency and representation agreements',
707
+ }
708
+
709
  return descriptions.get(category, 'General contract agreement')
710
+
711
 
712
  def get_all_categories(self) -> List[str]:
713
+ """
714
+ Get list of all supported categories
715
+ """
716
  return list(self.CATEGORY_HIERARCHY.keys())
717
 
718
+
719
  def get_subcategories(self, category: str) -> List[str]:
720
+ """
721
+ Get subcategories for a specific category
722
+ """
723
+ return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])
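As a worked example of the weighted blend in _combine_scores: with keyword_weight = 0.40 and semantic_weight = 0.60, a category scoring 0.30 on keywords and 0.70 on semantic similarity combines to 0.40 × 0.30 + 0.60 × 0.70 = 0.54, clearing both the 0.50 default min_confidence and the 0.45 multi-label threshold. A minimal, illustrative call sequence (variable names assumed, not part of this commit):

    # Hypothetical usage of the classifier refactored above
    classifier = ContractClassifier(model_loader = loader)   # loader: an initialized ModelLoader
    result = classifier.classify_contract(contract_text)
    print(result.category, result.subcategory, f"{result.confidence:.2f}")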
static/app.js DELETED
File without changes
static/index.html CHANGED
@@ -0,0 +1,1404 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AI Contract Risk Analyzer - Legal Intelligence Platform</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
16
+ background: #ffffff;
17
+ color: #333;
18
+ line-height: 1.6;
19
+ }
20
+
21
+ /* Header */
22
+ .header {
23
+ background: white;
24
+ border-bottom: 1px solid #e5e5e5;
25
+ padding: 1rem 2rem;
26
+ display: flex;
27
+ justify-content: space-between;
28
+ align-items: center;
29
+ position: fixed;
30
+ width: 100%;
31
+ top: 0;
32
+ z-index: 1000;
33
+ }
34
+
35
+ .logo {
36
+ display: flex;
37
+ align-items: center;
38
+ gap: 0.5rem;
39
+ font-size: 1.25rem;
40
+ font-weight: 600;
41
+ }
42
+
43
+ .logo-icon {
44
+ width: 28px;
45
+ height: 28px;
46
+ background: #4169e1;
47
+ border-radius: 6px;
48
+ display: flex;
49
+ align-items: center;
50
+ justify-content: center;
51
+ color: white;
52
+ font-size: 18px;
53
+ }
54
+
55
+ .subtitle {
56
+ color: #666;
57
+ font-size: 0.9rem;
58
+ font-weight: 400;
59
+ }
60
+
61
+ .container {
62
+ max-width: 1200px;
63
+ margin: 0 auto;
64
+ padding: 0 2rem;
65
+ }
66
+
67
+ /* Landing Page Styles - Updated to match screenshot */
68
+ .landing-screen {
69
+ padding-top: 80px;
70
+ }
71
+
72
+ .hero-section {
73
+ text-align: center;
74
+ padding: 6rem 0 4rem;
75
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
76
+ color: white;
77
+ margin-bottom: 4rem;
78
+ }
79
+
80
+ .hero-title {
81
+ font-size: 3rem;
82
+ font-weight: 700;
83
+ margin-bottom: 1.5rem;
84
+ line-height: 1.2;
85
+ }
86
+
87
+ .hero-subtitle {
88
+ font-size: 1.3rem;
89
+ margin-bottom: 2.5rem;
90
+ opacity: 0.95;
91
+ max-width: 600px;
92
+ margin-left: auto;
93
+ margin-right: auto;
94
+ }
95
+
96
+ .cta-button {
97
+ background: white;
98
+ color: #4169e1;
99
+ border: none;
100
+ padding: 1rem 3rem;
101
+ border-radius: 50px;
102
+ font-size: 1.1rem;
103
+ font-weight: 600;
104
+ cursor: pointer;
105
+ transition: all 0.3s ease;
106
+ box-shadow: 0 4px 15px rgba(0,0,0,0.2);
107
+ }
108
+
109
+ .cta-button:hover {
110
+ transform: translateY(-2px);
111
+ box-shadow: 0 8px 25px rgba(0,0,0,0.3);
112
+ }
113
+
114
+ .section {
115
+ padding: 4rem 0;
116
+ text-align: center;
117
+ }
118
+
119
+ .section-title {
120
+ font-size: 2.2rem;
121
+ font-weight: 600;
122
+ margin-bottom: 3rem;
123
+ color: #333;
124
+ }
125
+
126
+ .section-subtitle {
127
+ font-size: 1.2rem;
128
+ color: #666;
129
+ margin-bottom: 3rem;
130
+ max-width: 800px;
131
+ margin-left: auto;
132
+ margin-right: auto;
133
+ line-height: 1.8;
134
+ }
135
+
136
+ .features-grid {
137
+ display: grid;
138
+ grid-template-columns: repeat(3, 1fr);
139
+ gap: 3rem;
140
+ margin-bottom: 4rem;
141
+ }
142
+
143
+ .feature-card {
144
+ text-align: center;
145
+ padding: 2rem;
146
+ }
147
+
148
+ .feature-icon {
149
+ font-size: 3rem;
150
+ margin-bottom: 1.5rem;
151
+ }
152
+
153
+ .feature-title {
154
+ font-size: 1.4rem;
155
+ font-weight: 600;
156
+ margin-bottom: 1rem;
157
+ color: #333;
158
+ }
159
+
160
+ .feature-description {
161
+ color: #666;
162
+ line-height: 1.7;
163
+ font-size: 1rem;
164
+ }
165
+
166
+ .steps-section {
167
+ background: #f8f9fa;
168
+ padding: 5rem 0;
169
+ }
170
+
171
+ .steps-grid {
172
+ display: grid;
173
+ grid-template-columns: repeat(3, 1fr);
174
+ gap: 3rem;
175
+ margin-top: 3rem;
176
+ }
177
+
178
+ .step-card {
179
+ text-align: center;
180
+ padding: 2rem;
181
+ }
182
+
183
+ .step-number {
184
+ width: 60px;
185
+ height: 60px;
186
+ background: #4169e1;
187
+ color: white;
188
+ border-radius: 50%;
189
+ display: flex;
190
+ align-items: center;
191
+ justify-content: center;
192
+ font-size: 1.5rem;
193
+ font-weight: 700;
194
+ margin: 0 auto 1.5rem;
195
+ }
196
+
197
+ .step-title {
198
+ font-size: 1.3rem;
199
+ font-weight: 600;
200
+ margin-bottom: 1rem;
201
+ color: #333;
202
+ }
203
+
204
+ .step-description {
205
+ color: #666;
206
+ line-height: 1.7;
207
+ }
208
+
209
+ .footer {
210
+ text-align: center;
211
+ padding: 3rem 2rem;
212
+ color: #999;
213
+ font-size: 0.9rem;
214
+ border-top: 1px solid #e5e5e5;
215
+ background: #f8f9fa;
216
+ }
217
+
218
+ /* Analyzer Styles */
219
+ .analyzer-screen {
220
+ display: none;
221
+ padding-top: 80px;
222
+ }
223
+
224
+ .hero-section-analyzer {
225
+ text-align: center;
226
+ margin-bottom: 3rem;
227
+ padding: 2rem 0;
228
+ }
229
+
230
+ .hero-title-analyzer {
231
+ font-size: 2.5rem;
232
+ font-weight: 700;
233
+ margin-bottom: 1rem;
234
+ color: #1a1a1a;
235
+ }
236
+
237
+ .hero-description {
238
+ font-size: 1.1rem;
239
+ color: #666;
240
+ margin-bottom: 2rem;
241
+ }
242
+
243
+ .upload-card {
244
+ background: white;
245
+ border-radius: 12px;
246
+ padding: 2.5rem;
247
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
248
+ max-width: 700px;
249
+ margin: 0 auto;
250
+ position: relative;
251
+ }
252
+
253
+ .tabs {
254
+ display: flex;
255
+ gap: 1rem;
256
+ border-bottom: 2px solid #e5e5e5;
257
+ margin-bottom: 2rem;
258
+ }
259
+
260
+ .tab {
261
+ padding: 0.75rem 1.5rem;
262
+ background: none;
263
+ border: none;
264
+ font-size: 1rem;
265
+ color: #666;
266
+ cursor: pointer;
267
+ border-bottom: 3px solid transparent;
268
+ margin-bottom: -2px;
269
+ transition: all 0.2s;
270
+ }
271
+
272
+ .tab.active {
273
+ color: #4169e1;
274
+ border-bottom-color: #4169e1;
275
+ font-weight: 500;
276
+ }
277
+
278
+ .tab-content {
279
+ display: none;
280
+ }
281
+
282
+ .tab-content.active {
283
+ display: block;
284
+ }
285
+
286
+ .textarea {
287
+ width: 100%;
288
+ min-height: 250px;
289
+ padding: 1rem;
290
+ border: 2px solid #e5e5e5;
291
+ border-radius: 8px;
292
+ font-size: 0.95rem;
293
+ font-family: inherit;
294
+ resize: vertical;
295
+ transition: border-color 0.2s;
296
+ }
297
+
298
+ .textarea:focus {
299
+ outline: none;
300
+ border-color: #4169e1;
301
+ }
302
+
303
+ .textarea::placeholder {
304
+ color: #999;
305
+ }
306
+
307
+ .file-upload-area {
308
+ border: 2px dashed #d0d0d0;
309
+ border-radius: 8px;
310
+ padding: 3rem 2rem;
311
+ text-align: center;
312
+ cursor: pointer;
313
+ transition: all 0.2s;
314
+ }
315
+
316
+ .file-upload-area:hover {
317
+ border-color: #4169e1;
318
+ background: #f8f9ff;
319
+ }
320
+
321
+ .file-upload-area.dragover {
322
+ border-color: #4169e1;
323
+ background: #f0f4ff;
324
+ }
325
+
326
+ .file-input {
327
+ display: none;
328
+ }
329
+
330
+ .upload-icon {
331
+ font-size: 3rem;
332
+ color: #999;
333
+ margin-bottom: 1rem;
334
+ }
335
+
336
+ .upload-text {
337
+ font-size: 1rem;
338
+ color: #666;
339
+ margin-bottom: 0.5rem;
340
+ }
341
+
342
+ .upload-hint {
343
+ font-size: 0.875rem;
344
+ color: #999;
345
+ }
346
+
347
+ .selected-file {
348
+ display: flex;
349
+ align-items: center;
350
+ gap: 1rem;
351
+ padding: 1rem;
352
+ background: #f8f9ff;
353
+ border-radius: 8px;
354
+ margin-top: 1rem;
355
+ }
356
+
357
+ .file-icon {
358
+ font-size: 2rem;
359
+ }
360
+
361
+ .file-info {
362
+ flex: 1;
363
+ }
364
+
365
+ .file-name {
366
+ font-weight: 500;
367
+ margin-bottom: 0.25rem;
368
+ }
369
+
370
+ .file-size {
371
+ font-size: 0.875rem;
372
+ color: #666;
373
+ }
374
+
375
+ .remove-file {
376
+ background: none;
377
+ border: none;
378
+ color: #999;
379
+ cursor: pointer;
380
+ font-size: 1.5rem;
381
+ padding: 0.25rem;
382
+ }
383
+
384
+ .analyze-btn-container {
385
+ display: flex;
386
+ justify-content: center;
387
+ margin-top: 2rem;
388
+ width: 100%;
389
+ }
390
+
391
+ .analyze-btn {
392
+ background: #4169e1;
393
+ color: white;
394
+ border: none;
395
+ padding: 1rem 3rem;
396
+ border-radius: 8px;
397
+ font-size: 1.1rem;
398
+ font-weight: 600;
399
+ cursor: pointer;
400
+ display: flex;
401
+ align-items: center;
402
+ gap: 0.5rem;
403
+ transition: all 0.3s ease;
404
+ min-width: 200px;
405
+ justify-content: center;
406
+ }
407
+
408
+ .analyze-btn:hover {
409
+ background: #3154c5;
410
+ transform: translateY(-2px);
411
+ box-shadow: 0 4px 12px rgba(49, 84, 197, 0.3);
412
+ }
413
+
414
+ .loading-screen {
415
+ display: none;
416
+ text-align: center;
417
+ padding: 4rem 2rem;
418
+ }
419
+
420
+ .loading-screen.active {
421
+ display: block;
422
+ }
423
+
424
+ .spinner {
425
+ width: 80px;
426
+ height: 80px;
427
+ border: 6px solid #e5e5e5;
428
+ border-top-color: #4169e1;
429
+ border-radius: 50%;
430
+ animation: spin 1s linear infinite;
431
+ margin: 0 auto 2rem;
432
+ }
433
+
434
+ @keyframes spin {
435
+ to { transform: rotate(360deg); }
436
+ }
437
+
438
+ .loading-title {
439
+ font-size: 1.5rem;
440
+ font-weight: 600;
441
+ margin-bottom: 0.5rem;
442
+ }
443
+
444
+ .loading-text {
445
+ color: #666;
446
+ font-size: 1rem;
447
+ }
448
+
449
+ .results-screen {
450
+ display: none;
451
+ }
452
+
453
+ .results-screen.active {
454
+ display: block;
455
+ }
456
+
457
+ .back-to-landing {
458
+ background: none;
459
+ border: none;
460
+ color: #4169e1;
461
+ cursor: pointer;
462
+ font-size: 1rem;
463
+ display: flex;
464
+ align-items: center;
465
+ gap: 0.5rem;
466
+ margin-bottom: 2rem;
467
+ padding: 0.5rem 1rem;
468
+ border-radius: 6px;
469
+ transition: background 0.2s;
470
+ }
471
+
472
+ .back-to-landing:hover {
473
+ background: #f8f9ff;
474
+ }
475
+
476
+ .api-status {
477
+ text-align: center;
478
+ margin: 1rem 0;
479
+ padding: 1rem;
480
+ border-radius: 8px;
481
+ font-size: 0.9rem;
482
+ }
483
+
484
+ .api-status.connected {
485
+ background: #dcfce7;
486
+ color: #16a34a;
487
+ border: 1px solid #bbf7d0;
488
+ }
489
+
490
+ .api-status.disconnected {
491
+ background: #fee;
492
+ color: #dc2626;
493
+ border: 1px solid #fecaca;
494
+ }
495
+
496
+ /* Results screen styles */
497
+ .results-header {
498
+ display: flex;
499
+ justify-content: space-between;
500
+ align-items: center;
501
+ margin-bottom: 2rem;
502
+ }
503
+
504
+ .results-title {
505
+ font-size: 2rem;
506
+ font-weight: 700;
507
+ }
508
+
509
+ .results-actions {
510
+ display: flex;
511
+ gap: 1rem;
512
+ }
513
+
514
+ .btn {
515
+ padding: 0.75rem 1.5rem;
516
+ border-radius: 8px;
517
+ font-size: 0.95rem;
518
+ font-weight: 500;
519
+ cursor: pointer;
520
+ border: none;
521
+ transition: all 0.2s;
522
+ }
523
+
524
+ .btn-primary {
525
+ background: #4169e1;
526
+ color: white;
527
+ }
528
+
529
+ .btn-primary:hover {
530
+ background: #3154c5;
531
+ }
532
+
533
+ .btn-secondary {
534
+ background: white;
535
+ color: #4169e1;
536
+ border: 2px solid #4169e1;
537
+ }
538
+
539
+ .btn-secondary:hover {
540
+ background: #f8f9ff;
541
+ }
542
+
543
+ .results-grid {
544
+ display: grid;
545
+ grid-template-columns: 1fr 2fr;
546
+ gap: 1.5rem;
547
+ margin-bottom: 2rem;
548
+ }
549
+
550
+ .card {
551
+ background: white;
552
+ border-radius: 12px;
553
+ padding: 2rem;
554
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
555
+ }
556
+
557
+ .card-title {
558
+ font-size: 1.25rem;
559
+ font-weight: 600;
560
+ margin-bottom: 1.5rem;
561
+ }
562
+
563
+ .risk-score-container {
564
+ text-align: center;
565
+ }
566
+
567
+ .risk-circle {
568
+ width: 200px;
569
+ height: 200px;
570
+ margin: 0 auto 1rem;
571
+ position: relative;
572
+ }
573
+
574
+ .risk-circle svg {
575
+ transform: rotate(-90deg);
576
+ }
577
+
578
+ .risk-score-value {
579
+ position: absolute;
580
+ top: 50%;
581
+ left: 50%;
582
+ transform: translate(-50%, -50%);
583
+ font-size: 3rem;
584
+ font-weight: 700;
585
+ color: #dc2626;
586
+ }
587
+
588
+ .risk-level {
589
+ display: inline-block;
590
+ padding: 0.5rem 1rem;
591
+ border-radius: 6px;
592
+ font-weight: 600;
593
+ font-size: 0.9rem;
594
+ margin-top: 1rem;
595
+ }
596
+
597
+ .risk-critical {
598
+ background: #fee;
599
+ color: #dc2626;
600
+ }
601
+
602
+ .risk-high {
603
+ background: #fff4e6;
604
+ color: #f97316;
605
+ }
606
+
607
+ .risk-medium {
608
+ background: #fef9c3;
609
+ color: #ca8a04;
610
+ }
611
+
612
+ .risk-low {
613
+ background: #dcfce7;
614
+ color: #16a34a;
615
+ }
616
+
617
+ .executive-summary {
618
+ font-size: 1rem;
619
+ line-height: 1.8;
620
+ color: #444;
621
+ }
622
+
623
+ .three-column-grid {
624
+ display: grid;
625
+ grid-template-columns: repeat(3, 1fr);
626
+ gap: 1.5rem;
627
+ margin-bottom: 2rem;
628
+ }
629
+
630
+ .card-icon {
631
+ font-size: 1.5rem;
632
+ margin-bottom: 0.5rem;
633
+ }
634
+
635
+ .icon-warning { color: #f97316; }
636
+ .icon-shield { color: #dc2626; }
637
+ .icon-book { color: #4169e1; }
638
+
639
+ .item-list {
640
+ list-style: none;
641
+ }
642
+
643
+ .item-list li {
644
+ padding: 0.75rem 0;
645
+ border-bottom: 1px solid #f0f0f0;
646
+ display: flex;
647
+ align-items: flex-start;
648
+ gap: 0.5rem;
649
+ }
650
+
651
+ .item-list li:last-child {
652
+ border-bottom: none;
653
+ }
654
+
655
+ .item-icon {
656
+ color: #4169e1;
657
+ margin-top: 0.25rem;
658
+ }
659
+
660
+ .item-text {
661
+ flex: 1;
662
+ font-size: 0.95rem;
663
+ }
664
+
665
+ .category-breakdown {
666
+ margin-top: 2rem;
667
+ }
668
+
669
+ .category-item {
670
+ margin-bottom: 2rem;
671
+ }
672
+
673
+ .category-header {
674
+ display: flex;
675
+ justify-content: space-between;
676
+ align-items: center;
677
+ margin-bottom: 0.75rem;
678
+ }
679
+
680
+ .category-name {
681
+ font-weight: 600;
682
+ font-size: 1rem;
683
+ }
684
+
685
+ .category-score {
686
+ font-weight: 700;
687
+ font-size: 1.1rem;
688
+ }
689
+
690
+ .score-critical { color: #dc2626; }
691
+ .score-high { color: #f97316; }
692
+ .score-medium { color: #ca8a04; }
693
+ .score-low { color: #16a34a; }
694
+
695
+ .progress-bar {
696
+ height: 8px;
697
+ background: #f0f0f0;
698
+ border-radius: 4px;
699
+ overflow: hidden;
700
+ margin-bottom: 0.5rem;
701
+ }
702
+
703
+ .progress-fill {
704
+ height: 100%;
705
+ transition: width 0.5s ease;
706
+ }
707
+
708
+ .progress-critical { background: #dc2626; }
709
+ .progress-high { background: #f97316; }
710
+ .progress-medium { background: #ca8a04; }
711
+ .progress-low { background: #16a34a; }
712
+
713
+ .category-description {
714
+ font-size: 0.9rem;
715
+ color: #666;
716
+ line-height: 1.6;
717
+ }
718
+
719
+ .clause-analysis {
720
+ margin-top: 2rem;
721
+ }
722
+
723
+ .clause-item {
724
+ border: 1px solid #e5e5e5;
725
+ border-left: 4px solid #dc2626;
726
+ border-radius: 8px;
727
+ padding: 1.5rem;
728
+ margin-bottom: 1rem;
729
+ background: white;
730
+ }
731
+
732
+ .clause-item.high {
733
+ border-left-color: #f97316;
734
+ }
735
+
736
+ .clause-item.medium {
737
+ border-left-color: #ca8a04;
738
+ }
739
+
740
+ .clause-header {
741
+ display: flex;
742
+ justify-content: space-between;
743
+ align-items: flex-start;
744
+ margin-bottom: 1rem;
745
+ }
746
+
747
+ .clause-label {
748
+ font-size: 0.75rem;
749
+ text-transform: uppercase;
750
+ font-weight: 600;
751
+ color: #999;
752
+ margin-bottom: 0.5rem;
753
+ }
754
+
755
+ .clause-text {
756
+ font-size: 0.95rem;
757
+ font-weight: 500;
758
+ color: #333;
759
+ line-height: 1.6;
760
+ }
761
+
762
+ .severity-badge {
763
+ padding: 0.375rem 0.875rem;
764
+ border-radius: 6px;
765
+ font-size: 0.8rem;
766
+ font-weight: 600;
767
+ }
768
+
769
+ .badge-critical {
770
+ background: #fee;
771
+ color: #dc2626;
772
+ }
773
+
774
+ .badge-high {
775
+ background: #fff4e6;
776
+ color: #f97316;
777
+ }
778
+
779
+ .badge-medium {
780
+ background: #fef9c3;
781
+ color: #ca8a04;
782
+ }
783
+
784
+ .clause-section {
785
+ margin-top: 1rem;
786
+ }
787
+
788
+ .clause-section-title {
789
+ font-weight: 600;
790
+ font-size: 0.9rem;
791
+ margin-bottom: 0.5rem;
792
+ color: #333;
793
+ }
794
+
795
+ .clause-section-text {
796
+ font-size: 0.9rem;
797
+ color: #555;
798
+ line-height: 1.7;
799
+ }
800
+
801
+ @media (max-width: 1024px) {
802
+ .features-grid,
803
+ .steps-grid {
804
+ grid-template-columns: 1fr;
805
+ gap: 2rem;
806
+ }
807
+
808
+ .results-grid {
809
+ grid-template-columns: 1fr;
810
+ }
811
+
812
+ .three-column-grid {
813
+ grid-template-columns: 1fr;
814
+ }
815
+ }
816
+
817
+ @media (max-width: 768px) {
818
+ .hero-title {
819
+ font-size: 2.2rem;
820
+ }
821
+
822
+ .hero-title-analyzer {
823
+ font-size: 2rem;
824
+ }
825
+
826
+ .section-title {
827
+ font-size: 1.8rem;
828
+ }
829
+
830
+ .results-header {
831
+ flex-direction: column;
832
+ align-items: flex-start;
833
+ gap: 1rem;
834
+ }
835
+
836
+ .results-actions {
837
+ width: 100%;
838
+ flex-direction: column;
839
+ }
840
+
841
+ .btn {
842
+ width: 100%;
843
+ }
844
+
845
+ .analyze-btn {
846
+ width: 100%;
847
+ padding: 1rem 2rem;
848
+ }
849
+ }
850
+ </style>
851
+ </head>
852
+ <body>
853
+ <!-- Header -->
854
+ <header class="header">
855
+ <div class="logo">
856
+ <div class="logo-icon">✓</div>
857
+ <span>AI Contract Risk Analyzer</span>
858
+ </div>
859
+ <div class="subtitle">Legal Intelligence Platform</div>
860
+ </header>
861
+
862
+ <!-- Landing Screen -->
863
+ <div id="landingScreen" class="landing-screen">
864
+ <!-- Hero Section -->
865
+ <section class="hero-section">
866
+ <div class="container">
867
+ <h1 class="hero-title">Unlock Legal Intelligence<br>Analyze Contracts with AI</h1>
868
+ <p class="hero-subtitle">
869
+ Instantly identify risks, uncover unfavorable terms, and gain actionable negotiation points.
870
+ Our AI-powered platform gives you the clarity and confidence to sign better contracts.
871
+ </p>
872
+ <button class="cta-button" id="getStartedBtn">Try Now for Free</button>
873
+ </div>
874
+ </section>
875
+
876
+ <!-- Main Content Section -->
877
+ <section class="section">
878
+ <div class="container">
879
+ <h2 class="section-title">A Smarter Way to Review Legal Documents</h2>
880
+ <p class="section-subtitle">
881
+ Our platform goes beyond simple keyword searches to provide a deep, contextual understanding of your contracts.
882
+ </p>
883
+
884
+ <div class="features-grid">
885
+ <div class="feature-card">
886
+ <div class="feature-icon">🔍</div>
887
+ <h3 class="feature-title">In-Depth Analysis</h3>
888
+ <p class="feature-description">
889
+ Our AI performs a comprehensive, clause-by-clause review, assessing risk levels and explaining complex legal jargon in plain English.
890
+ </p>
891
+ </div>
892
+
893
+ <div class="feature-card">
894
+ <div class="feature-icon">💡</div>
895
+ <h3 class="feature-title">Actionable Insights</h3>
896
+ <p class="feature-description">
897
+ Receive a prioritized list of negotiation points, suggestions for missing clauses, and clear recommendations to strengthen your position.
898
+ </p>
899
+ </div>
900
+
901
+ <div class="feature-card">
902
+ <div class="feature-icon">🔒</div>
903
+ <h3 class="feature-title">Secure & Confidential</h3>
904
+ <p class="feature-description">
905
+ Your documents are encrypted and processed with the utmost privacy. We never store your contract data after analysis.
906
+ </p>
907
+ </div>
908
+ </div>
909
+ </div>
910
+ </section>
911
+
912
+ <!-- Steps Section -->
913
+ <section class="steps-section">
914
+ <div class="container">
915
+ <h2 class="section-title">Get Your Analysis in 3 Simple Steps</h2>
916
+
917
+ <div class="steps-grid">
918
+ <div class="step-card">
919
+ <div class="step-number">1</div>
920
+ <h3 class="step-title">Upload or Paste</h3>
921
+ <p class="step-description">
922
+ Securely provide your contract by pasting the text or uploading a DOCX/PDF file.
923
+ </p>
924
+ </div>
925
+
926
+ <div class="step-card">
927
+ <div class="step-number">2</div>
928
+ <h3 class="step-title">AI Analyzes</h3>
929
+ <p class="step-description">
930
+ Our intelligent engine scrutinizes every detail of your document in seconds.
931
+ </p>
932
+ </div>
933
+
934
+ <div class="step-card">
935
+ <div class="step-number">3</div>
936
+ <h3 class="step-title">Get Your Report</h3>
937
+ <p class="step-description">
938
+ Receive a comprehensive, easy-to-understand report with your risk score and key findings.
939
+ </p>
940
+ </div>
941
+ </div>
942
+ </div>
943
+ </section>
944
+
945
+ <footer class="footer">
946
+ © 2025 AI Contract Risk Analyzer. For informational purposes only. Not legal advice.
947
+ </footer>
948
+ </div>
949
+
950
+ <!-- Analyzer Screen -->
951
+ <div id="analyzerScreen" class="analyzer-screen">
952
+ <div class="container">
953
+ <button class="back-to-landing" id="backToLandingBtn">
954
+ ← Back to Overview
955
+ </button>
956
+
957
+ <div class="hero-section-analyzer">
958
+ <h1 class="hero-title-analyzer">Analyze Your Contract in Seconds</h1>
959
+ <p class="hero-description">Paste your contract or upload a file to get an instant, AI-powered risk assessment.</p>
960
+ </div>
961
+
962
+ <!-- API Status Indicator -->
963
+ <div id="apiStatus" class="api-status" style="display: none;">
964
+ Checking backend connection...
965
+ </div>
966
+
967
+ <div class="upload-card">
968
+ <div class="tabs">
969
+ <button class="tab active" data-tab="paste">Paste Text</button>
970
+ <button class="tab" data-tab="upload">Upload File</button>
971
+ </div>
972
+
973
+ <div id="pasteTab" class="tab-content active">
974
+ <textarea class="textarea" id="contractText" placeholder="Paste your full contract text here..."></textarea>
975
+ </div>
976
+
977
+ <div id="uploadTab" class="tab-content">
978
+ <div class="file-upload-area" id="fileUploadArea">
979
+ <input type="file" id="fileInput" class="file-input" accept=".pdf,.docx,.txt">
980
+ <div class="upload-icon">📄</div>
981
+ <div class="upload-text">Click to upload or drag and drop</div>
982
+ <div class="upload-hint">PDF, DOCX, or TXT files (Max 10MB)</div>
983
+ </div>
984
+ <div id="selectedFile" class="selected-file" style="display: none;">
985
+ <div class="file-icon">📄</div>
986
+ <div class="file-info">
987
+ <div class="file-name" id="fileName"></div>
988
+ <div class="file-size" id="fileSize"></div>
989
+ </div>
990
+ <button class="remove-file" id="removeFile">×</button>
991
+ </div>
992
+ </div>
993
+
994
+ <div class="analyze-btn-container">
995
+ <button class="analyze-btn" id="analyzeBtn">
996
+ <span>🔍</span>
997
+ <span>Analyze Contract</span>
998
+ </button>
999
+ </div>
1000
+ </div>
1001
+
1002
+ <!-- Loading Screen -->
1003
+ <div id="loadingScreen" class="loading-screen">
1004
+ <div class="spinner"></div>
1005
+ <h2 class="loading-title">Performing in-depth analysis...</h2>
1006
+ <p class="loading-text">This may take a moment for large documents.</p>
1007
+ </div>
1008
+
1009
+ <!-- Results Screen -->
1010
+ <div id="resultsScreen" class="results-screen">
1011
+ <div class="results-header">
1012
+ <h1 class="results-title">Analysis Report</h1>
1013
+ <div class="results-actions">
1014
+ <button class="btn btn-primary" id="downloadBtn">📥 Download PDF Report</button>
1015
+ <button class="btn btn-secondary" id="analyzeAnotherBtn">Analyze Another Contract</button>
1016
+ </div>
1017
+ </div>
1018
+
1019
+ <div class="results-grid">
1020
+ <div class="card">
1021
+ <h2 class="card-title">Overall Risk Score</h2>
1022
+ <div class="risk-score-container">
1023
+ <div class="risk-circle">
1024
+ <svg width="200" height="200">
1025
+ <circle cx="100" cy="100" r="85" fill="none" stroke="#f0f0f0" stroke-width="20"/>
1026
+ <circle id="riskCircle" cx="100" cy="100" r="85" fill="none" stroke="#dc2626" stroke-width="20" stroke-dasharray="534" stroke-dashoffset="534" stroke-linecap="round"/>
1027
+ </svg>
1028
+ <div class="risk-score-value" id="riskScoreValue">0</div>
1029
+ </div>
1030
+ <div class="risk-level" id="riskLevel">NO RISK</div>
1031
+ </div>
1032
+ </div>
1033
+
1034
+ <div class="card">
1035
+ <h2 class="card-title">Executive Summary</h2>
1036
+ <p class="executive-summary" id="executiveSummary">
1037
+ Analysis results will appear here...
1038
+ </p>
1039
+ </div>
1040
+ </div>
1041
+
1042
+ <div class="three-column-grid">
1043
+ <div class="card">
1044
+ <div class="card-icon icon-warning">⚠️</div>
1045
+ <h3 class="card-title">Unfavorable Terms</h3>
1046
+ <ul class="item-list" id="unfavorableTermsList">
1047
+ <li>No unfavorable terms detected yet</li>
1048
+ </ul>
1049
+ </div>
1050
+
1051
+ <div class="card">
1052
+ <div class="card-icon icon-shield">🛡️</div>
1053
+ <h3 class="card-title">Missing Protections</h3>
1054
+ <ul class="item-list" id="missingProtectionsList">
1055
+ <li>No missing protections detected yet</li>
1056
+ </ul>
1057
+ </div>
1058
+
1059
+ <div class="card">
1060
+ <div class="card-icon icon-book">📖</div>
1061
+ <h3 class="card-title">Negotiation Points</h3>
1062
+ <ul class="item-list" id="negotiationPointsList">
1063
+ <li>No negotiation points generated yet</li>
1064
+ </ul>
1065
+ </div>
1066
+ </div>
1067
+
1068
+ <div class="card category-breakdown">
1069
+ <h2 class="card-title">Risk Category Breakdown</h2>
1070
+ <div id="categoryBreakdown">
1071
+ <div class="category-item">
1072
+ <div class="category-header">
1073
+ <span class="category-name">Waiting for analysis...</span>
1074
+ <span class="category-score score-low">0/100</span>
1075
+ </div>
1076
+ <div class="progress-bar">
1077
+ <div class="progress-fill progress-low" style="width: 0%"></div>
1078
+ </div>
1079
+ </div>
1080
+ </div>
1081
+ </div>
1082
+
1083
+ <div class="card clause-analysis">
1084
+ <h2 class="card-title">Clause-by-Clause Analysis</h2>
1085
+ <div id="clauseAnalysis">
1086
+ <div class="clause-item">
1087
+ <div class="clause-header">
1088
+ <div>
1089
+ <div class="clause-label">STATUS</div>
1090
+ <div class="clause-text">Upload a contract to begin analysis</div>
1091
+ </div>
1092
+ </div>
1093
+ </div>
1094
+ </div>
1095
+ </div>
1096
+ </div>
1097
+ </div>
1098
+ </div>
1099
+
1100
+ <script>
1101
+ const API_BASE_URL = window.location.hostname === 'localhost'
1102
+ ? 'http://localhost:8000/api/v1'
1103
+ : '/api/v1';
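+ // Absolute localhost URL during local development; a same-origin relative
+ // path ('/api/v1') when the page is served by the API itself.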
1104
+
1105
+ let selectedFile = null;
1106
+ let currentJobId = null;
1107
+ let pollInterval = null;
1108
+
1109
+ // Screen management
1110
+ function showScreen(screenName) {
1111
+ document.getElementById('landingScreen').style.display = 'none';
1112
+ document.getElementById('analyzerScreen').style.display = 'none';
1113
+ document.getElementById('loadingScreen').classList.remove('active');
1114
+ document.getElementById('resultsScreen').classList.remove('active');
1115
+
1116
+ if (screenName === 'landing') {
1117
+ document.getElementById('landingScreen').style.display = 'block';
1118
+ } else if (screenName === 'analyzer') {
1119
+ document.getElementById('analyzerScreen').style.display = 'block';
1120
+ checkBackendConnection();
1121
+ } else if (screenName === 'loading') {
1122
+ document.getElementById('analyzerScreen').style.display = 'block';
1123
+ document.getElementById('loadingScreen').classList.add('active');
1124
+ } else if (screenName === 'results') {
1125
+ document.getElementById('analyzerScreen').style.display = 'block';
1126
+ document.getElementById('resultsScreen').classList.add('active');
1127
+ }
1128
+ }
1129
+
1130
+ // Check backend connection
1131
+ async function checkBackendConnection() {
1132
+ const statusElement = document.getElementById('apiStatus');
1133
+ statusElement.style.display = 'block';
1134
+ statusElement.textContent = 'Checking backend connection...';
1135
+ statusElement.className = 'api-status';
1136
+
1137
+ try {
1138
+ const response = await fetch(`${API_BASE_URL}/health`, {
1139
+ method: 'GET',
1140
+ headers: {
1141
+ 'Accept': 'application/json'
1142
+ }
1143
+ });
1144
+
1145
+ if (response.ok) {
1146
+ statusElement.textContent = '✓ Backend connected successfully';
1147
+ statusElement.className = 'api-status connected';
1148
+ } else {
1149
+ throw new Error('Backend not responding properly');
1150
+ }
1151
+ } catch (error) {
1152
+ console.error('Backend connection failed:', error);
1153
+ statusElement.textContent = '✗ Cannot connect to backend. Make sure the server is running on port 8000.';
1154
+ statusElement.className = 'api-status disconnected';
1155
+
1156
+ setTimeout(() => {
1157
+ statusElement.style.display = 'none';
1158
+ }, 5000);
1159
+ }
1160
+ }
1161
+
1162
+ // Navigation
1163
+ document.getElementById('getStartedBtn').addEventListener('click', () => {
1164
+ showScreen('analyzer');
1165
+ });
1166
+
1167
+ document.getElementById('backToLandingBtn').addEventListener('click', () => {
1168
+ showScreen('landing');
1169
+ });
1170
+
1171
+ // Tab switching
1172
+ document.querySelectorAll('.tab').forEach(tab => {
1173
+ tab.addEventListener('click', (e) => {
1174
+ e.preventDefault();
1175
+ const tabName = tab.dataset.tab;
1176
+
1177
+ document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
1178
+ document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
1179
+
1180
+ tab.classList.add('active');
1181
+ document.getElementById(tabName + 'Tab').classList.add('active');
1182
+ });
1183
+ });
1184
+
1185
+ // File upload handling
1186
+ const fileUploadArea = document.getElementById('fileUploadArea');
1187
+ const fileInput = document.getElementById('fileInput');
1188
+ const selectedFileDiv = document.getElementById('selectedFile');
1189
+ const fileNameSpan = document.getElementById('fileName');
1190
+ const fileSizeSpan = document.getElementById('fileSize');
1191
+ const removeFileBtn = document.getElementById('removeFile');
1192
+
1193
+ fileUploadArea.addEventListener('click', () => fileInput.click());
1194
+
1195
+ fileUploadArea.addEventListener('dragover', (e) => {
1196
+ e.preventDefault();
1197
+ fileUploadArea.classList.add('dragover');
1198
+ });
1199
+
1200
+ fileUploadArea.addEventListener('dragleave', () => {
1201
+ fileUploadArea.classList.remove('dragover');
1202
+ });
1203
+
1204
+ fileUploadArea.addEventListener('drop', (e) => {
1205
+ e.preventDefault();
1206
+ fileUploadArea.classList.remove('dragover');
1207
+ const file = e.dataTransfer.files[0];
1208
+ handleFileSelect(file);
1209
+ });
1210
+
1211
+ fileInput.addEventListener('change', (e) => {
1212
+ const file = e.target.files[0];
1213
+ handleFileSelect(file);
1214
+ });
1215
+
1216
+ removeFileBtn.addEventListener('click', (e) => {
1217
+ e.stopPropagation();
1218
+ selectedFile = null;
1219
+ fileInput.value = '';
1220
+ selectedFileDiv.style.display = 'none';
1221
+ fileUploadArea.style.display = 'block';
1222
+ });
1223
+
1224
+ function handleFileSelect(file) {
1225
+ if (!file) return;
1226
+
1227
+ const validTypes = [
1228
+ 'application/pdf',
1229
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
1230
+ 'text/plain'
1231
+ ];
1232
+
1233
+ const isValidType = validTypes.includes(file.type) ||
1234
+ file.name.match(/\.(pdf|docx|txt)$/i);
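+ // Fall back to the file extension: some browsers/OSes report an empty or
+ // generic MIME type for .docx and .txt uploads.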
1235
+
1236
+ if (!isValidType) {
1237
+ alert('Please upload a PDF, DOCX, or TXT file');
1238
+ return;
1239
+ }
1240
+
1241
+ if (file.size > 10 * 1024 * 1024) {
1242
+ alert('File size must be 10 MB or smaller');
1243
+ return;
1244
+ }
1245
+
1246
+ selectedFile = file;
1247
+ fileNameSpan.textContent = file.name;
1248
+ fileSizeSpan.textContent = formatFileSize(file.size);
1249
+ fileUploadArea.style.display = 'none';
1250
+ selectedFileDiv.style.display = 'flex';
1251
+ }
1252
+
1253
+ function formatFileSize(bytes) {
1254
+ if (bytes < 1024) return bytes + ' B';
1255
+ if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(2) + ' KB';
1256
+ return (bytes / (1024 * 1024)).toFixed(2) + ' MB';
1257
+ }
1258
+
1259
+ // Analyze button
1260
+ document.getElementById('analyzeBtn').addEventListener('click', async () => {
1261
+ const activeTab = document.querySelector('.tab.active').dataset.tab;
1262
+ const analyzeBtn = document.getElementById('analyzeBtn');
1263
+
1264
+ try {
1265
+ analyzeBtn.disabled = true;
1266
+ analyzeBtn.innerHTML = '<span>⏳</span><span>Processing...</span>';
1267
+
1268
+ if (activeTab === 'paste') {
1269
+ const text = document.getElementById('contractText').value.trim();
1270
+ if (!text) {
1271
+ alert('Please paste contract text');
1272
+ return;
1273
+ }
1274
+ const blob = new Blob([text], { type: 'text/plain' });
1275
+ const file = new File([blob], 'contract.txt', { type: 'text/plain' });
1276
+ await analyzeContract(file);
1277
+ } else {
1278
+ if (!selectedFile) {
1279
+ alert('Please select a file');
1280
+ return;
1281
+ }
1282
+ await analyzeContract(selectedFile);
1283
+ }
1284
+ } catch (error) {
1285
+ console.error('Analysis error:', error);
1286
+ alert('Error starting analysis: ' + error.message);
1287
+ } finally {
1288
+ analyzeBtn.disabled = false;
1289
+ analyzeBtn.innerHTML = '<span>🔍</span><span>Analyze Contract</span>';
1290
+ }
1291
+ });
1292
+
1293
+ async function analyzeContract(file) {
1294
+ try {
1295
+ showScreen('loading');
1296
+
1297
+ const formData = new FormData();
1298
+ formData.append('file', file);
1299
+ formData.append('max_clauses', '15');
1300
+ formData.append('interpret_clauses', 'true');
1301
+ formData.append('generate_negotiation_points', 'true');
1302
+ formData.append('compare_to_market', 'true');
1303
+ formData.append('llm_provider', 'ollama');
1304
+
1305
+ const response = await fetch(`${API_BASE_URL}/analyze`, {
1306
+ method: 'POST',
1307
+ body: formData
1308
+ });
1309
+
1310
+ if (!response.ok) {
1311
+ let errorDetail = 'Analysis failed';
1312
+ try {
1313
+ const errorData = await response.json();
1314
+ errorDetail = errorData.detail || errorData.error || errorDetail;
1315
+ } catch (e) {
1316
+ errorDetail = `Server error: ${response.status} ${response.statusText}`;
1317
+ }
1318
+ throw new Error(errorDetail);
1319
+ }
1320
+
1321
+ const job = await response.json();
1322
+ currentJobId = job.job_id;
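+ 
+ // Poll the job status every 2 s; pollJobStatus() clears this interval once
+ // the job reports "completed" or "failed".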
1323
+
1324
+ pollInterval = setInterval(() => pollJobStatus(currentJobId), 2000);
1325
+
1326
+ } catch (error) {
1327
+ console.error('Error:', error);
1328
+ alert('Error analyzing contract: ' + error.message);
1329
+ showScreen('analyzer');
1330
+ }
1331
+ }
1332
+
1333
+ async function pollJobStatus(jobId) {
1334
+ try {
1335
+ const response = await fetch(`${API_BASE_URL}/jobs/${jobId}`);
1336
+ if (!response.ok) throw new Error('Failed to fetch job status');
1337
+
1338
+ const job = await response.json();
1339
+
1340
+ if (job.status === 'completed') {
1341
+ clearInterval(pollInterval);
1342
+ displayResults(job.result);
1343
+ showScreen('results');
1344
+ } else if (job.status === 'failed') {
1345
+ clearInterval(pollInterval);
1346
+ alert('Analysis failed: ' + job.error);
1347
+ showScreen('analyzer');
1348
+ }
1349
+ } catch (error) {
1350
+ console.error('Polling error:', error);
1351
+ }
1352
+ }
1353
+
1354
+ function displayResults(result) {
1355
+ const score = result.risk_analysis.overall_score;
1356
+ const riskLevel = result.risk_analysis.risk_level;
1357
+
1358
+ document.getElementById('riskScoreValue').textContent = score;
1359
+ document.getElementById('riskLevel').textContent = riskLevel.toUpperCase();
1360
+ document.getElementById('riskLevel').className = 'risk-level risk-' + getRiskClass(score);
1361
+
1362
+ const circumference = 534;
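+ // 534 ≈ 2π × 85, the circumference of the r=85 progress ring; reducing the
+ // dash offset from 534 toward 0 fills the ring in proportion to the score.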
1363
+ const offset = circumference - (score / 100) * circumference;
1364
+ const circle = document.getElementById('riskCircle');
1365
+ circle.style.strokeDashoffset = offset;
1366
+ circle.style.stroke = getRiskColor(score);
1367
+
1368
+ document.getElementById('executiveSummary').textContent = result.executive_summary;
1369
+
1370
+ // Populate the unfavorable terms list
1371
+ const unfavorableList = document.getElementById('unfavorableTermsList');
1372
+ unfavorableList.innerHTML = '';
1373
+ if (result.unfavorable_terms && result.unfavorable_terms.length > 0) {
1374
+ result.unfavorable_terms.slice(0, 8).forEach(term => {
1375
+ const li = document.createElement('li');
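+ // NOTE: term.term / term.explanation are injected into innerHTML verbatim;
+ // escape them first if the analyzer output is ever treated as untrusted.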
1376
+ li.innerHTML = `<span class="item-icon">›</span><span class="item-text"><strong>${term.term}:</strong> ${term.explanation}</span>`;
1377
+ unfavorableList.appendChild(li);
1378
+ });
1379
+ } else {
1380
+ unfavorableList.innerHTML = '<li>No unfavorable terms detected</li>';
1381
+ }
1382
+
1383
+ // Similar updates for other sections...
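+ // A minimal sketch for the remaining lists, assuming the response exposes
+ // `missing_protections` and `negotiation_points` arrays (field names are
+ // assumptions, mirrored from the request flags above):
+ //
+ // const fill = (id, items, empty) => {
+ //     const ul = document.getElementById(id);
+ //     ul.innerHTML = (items && items.length)
+ //         ? items.slice(0, 8).map(t => `<li><span class="item-icon">›</span><span class="item-text">${t}</span></li>`).join('')
+ //         : `<li>${empty}</li>`;
+ // };
+ // fill('missingProtectionsList', result.missing_protections, 'No missing protections detected');
+ // fill('negotiationPointsList', result.negotiation_points, 'No negotiation points generated');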
1384
+ }
1385
+
1386
+ function getRiskClass(score) {
1387
+ if (score >= 80) return 'critical';
1388
+ if (score >= 60) return 'high';
1389
+ if (score >= 40) return 'medium';
1390
+ return 'low';
1391
+ }
1392
+
1393
+ function getRiskColor(score) {
1394
+ if (score >= 80) return '#dc2626';
1395
+ if (score >= 60) return '#f97316';
1396
+ if (score >= 40) return '#ca8a04';
1397
+ return '#16a34a';
1398
+ }
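+ 
+ // The Download PDF and "Analyze Another" buttons above have no handlers yet;
+ // a minimal sketch, assuming a /jobs/{id}/report endpoint (endpoint path is
+ // an assumption):
+ //
+ // document.getElementById('analyzeAnotherBtn').addEventListener('click', () => showScreen('analyzer'));
+ // document.getElementById('downloadBtn').addEventListener('click', () => {
+ //     if (currentJobId) window.open(`${API_BASE_URL}/jobs/${currentJobId}/report`, '_blank');
+ // });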
1399
+
1400
+ // Initialize
1401
+ showScreen('landing');
1402
+ </script>
1403
+ </body>
1404
+ </html>
static/style.css DELETED
File without changes
utils/text_processor.py CHANGED
@@ -54,13 +54,17 @@ class TextProcessor:
54
  """
55
  Normalize text for analysis
56
 
57
- Args:
58
- text: Input text
59
- lowercase: Convert to lowercase
60
- remove_special_chars: Remove special characters
 
 
 
61
 
62
  Returns:
63
- Normalized text
 
64
  """
65
  if lowercase:
66
  text = text.lower()
@@ -74,17 +78,21 @@ class TextProcessor:
74
 
75
  return text.strip()
76
 
 
77
  @staticmethod
78
  def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
79
  """
80
  Split text into paragraphs
81
 
82
- Args:
83
- text: Input text
84
- min_length: Minimum paragraph length in characters
 
 
85
 
86
  Returns:
87
- List of paragraphs
 
88
  """
89
  # Split on double newlines
90
  paragraphs = re.split(r'\n\s*\n', text)
@@ -92,17 +100,21 @@ class TextProcessor:
92
  # Filter short and empty paragraphs
93
  return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
94
 
 
95
  @staticmethod
96
  def extract_sentences(text: str, min_length: int = 10) -> List[str]:
97
  """
98
  Extract sentences from text (basic method)
99
 
100
- Args:
101
- text: Input text
102
- min_length: Minimum sentence length in characters
 
 
103
 
104
  Returns:
105
- List of sentences
 
106
  """
107
  # Simple sentence splitting on .!?
108
  sentences = re.split(r'[.!?]+', text)
@@ -112,6 +124,7 @@ class TextProcessor:
112
 
113
  return sentences
114
 
 
115
  def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
116
  """
117
  Extract sentences with NER and metadata using spaCy
@@ -125,87 +138,87 @@ class TextProcessor:
125
  if not self.nlp:
126
  # Fallback to basic extraction
127
  basic_sentences = self.extract_sentences(text)
128
- return [{"text": s, "entities": [], "start_char": 0, "end_char": 0}
129
- for s in basic_sentences]
130
 
131
- doc = self.nlp(text[:100000]) # Limit to 100K chars for performance
132
- sentences = []
 
133
 
134
  for sent in doc.sents:
135
- sentences.append({
136
- "text": sent.text.strip(),
137
- "entities": [(ent.text, ent.label_) for ent in sent.ents],
138
- "start_char": sent.start_char,
139
- "end_char": sent.end_char,
140
- "tokens": [token.text for token in sent]
141
- })
142
 
143
  return sentences
144
 
145
- # =========================================================================
146
- # LEGAL-SPECIFIC EXTRACTION
147
- # =========================================================================
148
 
149
  @staticmethod
150
  def extract_legal_entities(text: str) -> Dict[str, List[str]]:
151
  """
152
  Extract legal-specific entities (parties, dates, amounts, references)
153
 
154
- Args:
155
- text: Input text
 
156
 
157
  Returns:
158
- Dictionary of extracted entities by type
159
- """
160
- entities = {
161
- "parties": [],
162
- "dates": [],
163
- "amounts": [],
164
- "addresses": [],
165
- "references": [],
166
- "emails": [],
167
- "phone_numbers": []
168
- }
169
 
170
  # Party names (PARTY A, "the Employee", Company Name Inc.)
171
- party_patterns = [
172
- r'(?:PARTY|Party)\s+[A-Z]',
173
- r'"the\s+\w+"',
174
- r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd|Limited|Company)\.?',
175
- r'(?:the\s+)?(Employer|Employee|Consultant|Contractor|Client|Vendor|Supplier|Landlord|Tenant|Buyer|Seller)',
176
- ]
177
  for pattern in party_patterns:
178
  matches = re.findall(pattern, text)
 
179
  entities["parties"].extend(matches)
180
 
181
  # Dates (various formats)
182
- date_patterns = [
183
- r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
184
- r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
185
- r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
186
- ]
187
  for pattern in date_patterns:
188
  matches = re.findall(pattern, text, re.IGNORECASE)
 
189
  entities["dates"].extend(matches)
190
 
191
  # Legal references (Section 5.2, Clause 11.1, Article III)
192
- ref_patterns = [
193
- r'(?:Section|Clause|Article|Paragraph|Exhibit|Schedule|Appendix)\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+)',
194
- ]
195
  for pattern in ref_patterns:
196
  matches = re.findall(pattern, text, re.IGNORECASE)
 
197
  entities["references"].extend(matches)
198
 
199
  # Monetary amounts
200
- entities["amounts"] = TextProcessor.extract_monetary_amounts(text)
201
 
202
  # Email addresses
203
- email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
204
- entities["emails"] = re.findall(email_pattern, text)
205
 
206
  # Phone numbers (US format)
207
- phone_pattern = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
208
- phone_matches = re.findall(phone_pattern, text)
209
  entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
210
 
211
  # Deduplicate
@@ -214,37 +227,46 @@ class TextProcessor:
214
 
215
  return entities
216
 
 
217
  @staticmethod
218
  def count_words(text: str) -> int:
219
- """Count words in text"""
 
 
220
  return len(text.split())
221
 
 
222
  @staticmethod
223
  def extract_numbers(text: str) -> List[str]:
224
- """Extract all numbers from text"""
 
 
225
  return re.findall(r'\d+', text)
226
 
 
227
  @staticmethod
228
  def extract_monetary_amounts(text: str) -> List[str]:
229
  """
230
  Extract monetary amounts from text
231
 
232
  Returns:
233
- List of monetary amounts (e.g., ['$1,000', '$2,500.00'])
 
234
  """
235
  # Match patterns like $1,000 or $1000.00 or USD 1,000
236
- patterns = [
237
- r'\$[\d,]+(?:\.\d{2})?',
238
- r'USD\s*[\d,]+(?:\.\d{2})?',
239
- r'EUR\s*[\d,]+(?:\.\d{2})?',
240
- r'GBP\s*[\d,]+(?:\.\d{2})?'
241
- ]
242
-
243
- amounts = []
244
  for pattern in patterns:
245
  amounts.extend(re.findall(pattern, text, re.IGNORECASE))
246
 
247
  return amounts
 
248
 
249
  @staticmethod
250
  def extract_durations(text: str) -> List[Dict[str, str]]:
@@ -252,150 +274,149 @@ class TextProcessor:
252
  Extract time durations (e.g., "6 months", "2 years")
253
 
254
  Returns:
255
- List of duration dictionaries with 'amount' and 'unit'
 
256
  """
257
  pattern = r'(\d+)\s*(day|week|month|year)s?'
258
  matches = re.findall(pattern, text, re.IGNORECASE)
259
 
260
- return [
261
- {"amount": m[0], "unit": m[1].lower()}
262
- for m in matches
263
- ]
264
-
265
  @staticmethod
266
  def extract_percentages(text: str) -> List[str]:
267
- """Extract percentages from text"""
 
 
268
  return re.findall(r'\d+(?:\.\d+)?%', text)
269
 
270
- # =========================================================================
271
- # TEXT CHUNKING FOR EMBEDDINGS
272
- # =========================================================================
273
-
274
  @staticmethod
275
- def chunk_text_for_embedding(text: str,
276
- chunk_size: int = 512,
277
- overlap: int = 50) -> List[Dict[str, Any]]:
278
  """
279
  Chunk text with overlap for embedding models (preserves sentence boundaries)
280
 
281
- Args:
282
- text: Input text
283
- chunk_size: Maximum chunk size in words
284
- overlap: Number of words to overlap between chunks
 
 
 
285
 
286
  Returns:
287
- List of chunk dictionaries with metadata
 
288
  """
289
- sentences = TextProcessor.extract_sentences(text)
290
- chunks = []
291
- current_chunk = []
292
- current_length = 0
293
  start_sentence_idx = 0
294
 
295
  for i, sentence in enumerate(sentences):
296
- sentence_words = sentence.split()
297
  sentence_length = len(sentence_words)
298
 
299
- if current_length + sentence_length > chunk_size and current_chunk:
300
  # Save current chunk
301
- chunks.append({
302
- "text": " ".join(current_chunk),
303
- "start_sentence": start_sentence_idx,
304
- "end_sentence": i - 1,
305
- "word_count": current_length,
306
- "chunk_id": len(chunks)
307
- })
308
 
309
  # Start new chunk with overlap
310
- overlap_sentences = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
311
- current_chunk = overlap_sentences + [sentence]
312
- current_length = sum(len(s.split()) for s in current_chunk)
313
  start_sentence_idx = max(0, i - len(overlap_sentences))
 
314
  else:
315
  current_chunk.append(sentence)
316
  current_length += sentence_length
317
 
318
  # Add final chunk
319
  if current_chunk:
320
- chunks.append({
321
- "text": " ".join(current_chunk),
322
- "start_sentence": start_sentence_idx,
323
- "end_sentence": len(sentences) - 1,
324
- "word_count": current_length,
325
- "chunk_id": len(chunks)
326
- })
327
 
328
  return chunks
329
 
330
- # =========================================================================
331
- # TEXT SIMILARITY & DEDUPLICATION
332
- # =========================================================================
333
-
334
  @staticmethod
335
  def text_similarity(text1: str, text2: str) -> float:
336
  """
337
  Calculate similarity between two texts (0-1 scale)
338
 
339
- Args:
340
- text1: First text
341
- text2: Second text
 
 
342
 
343
  Returns:
344
- Similarity score (0.0 = completely different, 1.0 = identical)
 
345
  """
346
  return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
347
 
 
348
  @staticmethod
349
  def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
350
  """
351
  Remove near-duplicate clauses
352
 
353
- Args:
354
- clauses: List of clause texts
355
- threshold: Similarity threshold for deduplication (0.0-1.0)
 
 
356
 
357
  Returns:
358
- List of unique clauses
 
359
  """
360
- unique = []
361
 
362
  for clause in clauses:
363
- is_duplicate = any(
364
- TextProcessor.text_similarity(clause, existing) > threshold
365
- for existing in unique
366
- )
367
  if not is_duplicate:
368
  unique.append(clause)
369
 
370
  return unique
371
 
372
- # =========================================================================
373
- # LANGUAGE DETECTION
374
- # =========================================================================
375
-
376
  @staticmethod
377
  def detect_language(text: str) -> str:
378
  """
379
  Detect text language
380
 
381
- Args:
382
- text: Input text
 
383
 
384
  Returns:
385
- ISO 639-1 language code (e.g., 'en', 'es', 'fr')
 
386
  """
387
  if not LANGDETECT_AVAILABLE:
388
- return "en" # Default to English
 
389
 
390
  try:
391
  # Use first 1000 chars for detection
392
  return detect(text[:1000])
 
393
  except LangDetectException:
394
  return "en"
395
 
396
- # =========================================================================
397
- # TEXT STATISTICS
398
- # =========================================================================
399
 
400
  @staticmethod
401
  def get_text_statistics(text: str) -> Dict[str, Any]:
@@ -403,49 +424,46 @@ class TextProcessor:
403
  Get comprehensive text statistics
404
 
405
  Returns:
406
- Dictionary with character count, word count, sentence count, etc.
 
407
  """
408
- sentences = TextProcessor.extract_sentences(text)
409
  paragraphs = TextProcessor.split_into_paragraphs(text)
410
- words = text.split()
411
-
412
- return {
413
- "character_count": len(text),
414
- "word_count": len(words),
415
- "sentence_count": len(sentences),
416
- "paragraph_count": len(paragraphs),
417
- "avg_words_per_sentence": len(words) / len(sentences) if sentences else 0,
418
- "avg_chars_per_word": len(text) / len(words) if words else 0,
419
- "language": TextProcessor.detect_language(text)
420
- }
421
-
422
- # =========================================================================
423
- # KEYWORD HIGHLIGHTING
424
- # =========================================================================
425
 
 
426
  @staticmethod
427
- def highlight_keywords(text: str, keywords: List[str],
428
- highlight_format: str = "**{}**") -> str:
429
  """
430
  Highlight keywords in text (for display purposes)
431
 
432
- Args:
433
- text: Input text
434
- keywords: List of keywords to highlight
435
- highlight_format: Format string with {} placeholder (default: Markdown bold)
 
 
 
436
 
437
  Returns:
438
- Text with highlighted keywords
 
439
  """
440
  for keyword in keywords:
441
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
442
- text = pattern.sub(lambda m: highlight_format.format(m.group(0)), text)
443
 
444
  return text
445
 
446
- # =========================================================================
447
- # CLAUSE SEGMENTATION HELPERS
448
- # =========================================================================
449
 
450
  @staticmethod
451
  def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
@@ -453,57 +471,61 @@ class TextProcessor:
453
  Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)
454
 
455
  Returns:
456
- List of section dictionaries with number and text
 
457
  """
458
- patterns = [
459
- (r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\n\s*\d+\.\d+|\n\n|$)', 'numbered'),
460
- (r'(Article\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nArticle|\n\n|$)', 'article'),
461
- (r'(Section\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nSection|\n\n|$)', 'section'),
462
- (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\nClause|\n\n|$)', 'clause'),
463
- ]
464
 
465
- sections = []
 
466
  for pattern, section_type in patterns:
467
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
 
468
  for match in matches:
469
- sections.append({
470
- "reference": match.group(1).strip(),
471
- "text": match.group(2).strip(),
472
- "type": section_type,
473
- "start_pos": match.start(),
474
- "end_pos": match.end()
475
- })
476
 
477
  # Sort by position
478
- sections.sort(key=lambda x: x['start_pos'])
479
 
480
  return sections
481
 
 
482
  @staticmethod
483
  def clean_legal_text(text: str) -> str:
484
  """
485
  Clean legal text by removing boilerplate artifacts
486
 
487
- Args:
488
- text: Input legal text
 
489
 
490
  Returns:
491
- Cleaned text
 
492
  """
493
  # Remove "Page X of Y" markers
494
- text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
495
 
496
  # Remove "[Signature Page Follows]" type markers
497
- text = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', text, flags=re.IGNORECASE)
498
 
499
  # Remove excessive underscores (signature lines)
500
  text = re.sub(r'_{3,}', '', text)
501
 
502
  # Remove "CONFIDENTIAL" watermarks
503
- text = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', text, flags=re.IGNORECASE)
504
 
505
  # Clean up resulting whitespace
506
  text = re.sub(r'\n{3,}', '\n\n', text)
507
  text = re.sub(r' {2,}', ' ', text)
508
 
509
- return text.strip()
 
54
  """
55
  Normalize text for analysis
56
 
57
+ Arguments:
58
+ ----------
59
+ text { str } : Input text
60
+
61
+ lowercase { bool } : Convert to lowercase
62
+
63
+ remove_special_chars { bool } : Remove special characters
64
 
65
  Returns:
66
+ --------
67
+ { str } : Normalized text
68
  """
69
  if lowercase:
70
  text = text.lower()
 
78
 
79
  return text.strip()
80
 
81
+
82
  @staticmethod
83
  def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
84
  """
85
  Split text into paragraphs
86
 
87
+ Arguments:
88
+ ----------
89
+ text { str } : Input text
90
+
91
+ min_length { int } : Minimum paragraph length in characters
92
 
93
  Returns:
94
+ --------
95
+ { list } : List of paragraphs
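+ 
+ Example (illustrative): a text of three blocks separated by blank lines,
+ where the middle block is just the two characters "OK", yields only the
+ first and last blocks, since "OK" falls below the default min_length of 20.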
96
  """
97
  # Split on double newlines
98
  paragraphs = re.split(r'\n\s*\n', text)
 
100
  # Filter short and empty paragraphs
101
  return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
102
 
103
+
104
  @staticmethod
105
  def extract_sentences(text: str, min_length: int = 10) -> List[str]:
106
  """
107
  Extract sentences from text (basic method)
108
 
109
+ Arguments:
110
+ ----------
111
+ text { str } : Input text
112
+
113
+ min_length { int } : Minimum sentence length in characters
114
 
115
  Returns:
116
+ --------
117
+ { list } : List of sentences
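+ 
+ Example (illustrative): "The term is 12 months. Renewal is automatic. OK."
+ yields the first two sentences; the trailing "OK" fragment is dropped as
+ shorter than min_length.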
118
  """
119
  # Simple sentence splitting on .!?
120
  sentences = re.split(r'[.!?]+', text)
 
124
 
125
  return sentences
126
 
127
+
128
  def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
129
  """
130
  Extract sentences with NER and metadata using spaCy
 
138
  if not self.nlp:
139
  # Fallback to basic extraction
140
  basic_sentences = self.extract_sentences(text)
141
+
142
+ return [{"text" : s, "entities" : [], "start_char" : 0, "end_char" : 0} for s in basic_sentences]
143
 
144
+ # Limit to 100K chars for performance
145
+ doc = self.nlp(text[:100000])
146
+ sentences = list()
147
 
148
  for sent in doc.sents:
149
+ sentences.append({"text" : sent.text.strip(),
150
+ "entities" : [(ent.text, ent.label_) for ent in sent.ents],
151
+ "start_char" : sent.start_char,
152
+ "end_char" : sent.end_char,
153
+ "tokens" : [token.text for token in sent],
154
+ })
 
155
 
156
  return sentences
157
 
 
 
 
158
 
159
  @staticmethod
160
  def extract_legal_entities(text: str) -> Dict[str, List[str]]:
161
  """
162
  Extract legal-specific entities (parties, dates, amounts, references)
163
 
164
+ Arguments:
165
+ ----------
166
+ text { str } : Input text
167
 
168
  Returns:
169
+ --------
170
+ { dict } : Dictionary of extracted entities by type
171
+ """
172
+ entities = {"parties" : [],
173
+ "dates" : [],
174
+ "amounts" : [],
175
+ "addresses" : [],
176
+ "references" : [],
177
+ "emails" : [],
178
+ "phone_numbers" : [],
179
+ }
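+ 
+ # NOTE: "addresses" is initialised above, but no extractor below populates it yet.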
180
 
181
  # Party names (PARTY A, "the Employee", Company Name Inc.)
182
+ party_patterns = [r'(?:PARTY|Party)\s+[A-Z]',
183
+ r'"the\s+\w+"',
184
+ r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd|Limited|Company)\.?',
185
+ r'(?:the\s+)?(Employer|Employee|Consultant|Contractor|Client|Vendor|Supplier|Landlord|Tenant|Buyer|Seller)',
186
+ ]
187
+
188
  for pattern in party_patterns:
189
  matches = re.findall(pattern, text)
190
+
191
  entities["parties"].extend(matches)
192
 
193
  # Dates (various formats)
194
+ date_patterns = [r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
195
+ r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
196
+ r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
197
+ ]
198
+
199
  for pattern in date_patterns:
200
  matches = re.findall(pattern, text, re.IGNORECASE)
201
+
202
  entities["dates"].extend(matches)
203
 
204
  # Legal references (Section 5.2, Clause 11.1, Article III)
205
+ ref_patterns = [r'(?:Section|Clause|Article|Paragraph|Exhibit|Schedule|Appendix)\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+)']
206
+
 
207
  for pattern in ref_patterns:
208
  matches = re.findall(pattern, text, re.IGNORECASE)
209
+
210
  entities["references"].extend(matches)
211
 
212
  # Monetary amounts
213
+ entities["amounts"] = TextProcessor.extract_monetary_amounts(text)
214
 
215
  # Email addresses
216
+ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
217
+ entities["emails"] = re.findall(email_pattern, text)
218
 
219
  # Phone numbers (US format)
220
+ phone_pattern = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
221
+ phone_matches = re.findall(phone_pattern, text)
222
  entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
223
 
224
  # Deduplicate
 
227
 
228
  return entities
229
 
230
+
231
  @staticmethod
232
  def count_words(text: str) -> int:
233
+ """
234
+ Count words in text
235
+ """
236
  return len(text.split())
237
 
238
+
239
  @staticmethod
240
  def extract_numbers(text: str) -> List[str]:
241
+ """
242
+ Extract all numbers from text
243
+ """
244
  return re.findall(r'\d+', text)
245
 
246
+
247
  @staticmethod
248
  def extract_monetary_amounts(text: str) -> List[str]:
249
  """
250
  Extract monetary amounts from text
251
 
252
  Returns:
253
+ --------
254
+ { list } : List of monetary amounts (e.g., ['$1,000', '$2,500.00'])
255
  """
256
  # Match patterns like $1,000 or $1000.00 or USD 1,000
257
+ patterns = [r'\$[\d,]+(?:\.\d{2})?',
258
+ r'USD\s*[\d,]+(?:\.\d{2})?',
259
+ r'EUR\s*[\d,]+(?:\.\d{2})?',
260
+ r'GBP\s*[\d,]+(?:\.\d{2})?'
261
+ ]
262
+
263
+ amounts = list()
264
+
265
  for pattern in patterns:
266
  amounts.extend(re.findall(pattern, text, re.IGNORECASE))
267
 
268
  return amounts
269
+
270
 
271
  @staticmethod
272
  def extract_durations(text: str) -> List[Dict[str, str]]:
 
274
  Extract time durations (e.g., "6 months", "2 years")
275
 
276
  Returns:
277
+ --------
278
+ { list } : List of duration dictionaries with 'amount' and 'unit'
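+ 
+ Example (illustrative):
+ >>> TextProcessor.extract_durations("terminable on 6 months notice")
+ [{'amount': '6', 'unit': 'month'}]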
279
  """
280
  pattern = r'(\d+)\s*(day|week|month|year)s?'
281
  matches = re.findall(pattern, text, re.IGNORECASE)
282
 
283
+ return [{"amount": m[0], "unit": m[1].lower()} for m in matches]
284
+
285
+
 
 
286
  @staticmethod
287
  def extract_percentages(text: str) -> List[str]:
288
+ """
289
+ Extract percentages from text
290
+ """
291
  return re.findall(r'\d+(?:\.\d+)?%', text)
292
 
293
+
 
 
 
294
  @staticmethod
295
+ def chunk_text_for_embedding(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
 
 
296
  """
297
  Chunk text with overlap for embedding models (preserves sentence boundaries)
298
 
299
+ Arguments:
300
+ ----------
301
+ text { str } : Input text
302
+
303
+ chunk_size { int } : Maximum chunk size in words
304
+
305
+ overlap { int } : Number of words to overlap between chunks
306
 
307
  Returns:
308
+ --------
309
+ { list } : List of chunk dictionaries with metadata
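+ 
+ Note: the carried-over overlap is realised as the last two sentences of the
+ previous chunk, so `overlap` is a nominal word target rather than an exact count.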
310
  """
311
+ sentences = TextProcessor.extract_sentences(text)
312
+ chunks = list()
313
+ current_chunk = list()
314
+ current_length = 0
315
  start_sentence_idx = 0
316
 
317
  for i, sentence in enumerate(sentences):
318
+ sentence_words = sentence.split()
319
  sentence_length = len(sentence_words)
320
 
321
+ if (((current_length + sentence_length) > chunk_size) and current_chunk):
322
  # Save current chunk
323
+ chunks.append({"text" : " ".join(current_chunk),
324
+ "start_sentence" : start_sentence_idx,
325
+ "end_sentence" : i - 1,
326
+ "word_count" : current_length,
327
+ "chunk_id" : len(chunks),
328
+ })
 
329
 
330
  # Start new chunk with overlap
331
+ overlap_sentences = current_chunk[-2:] if (len(current_chunk) > 2) else current_chunk
332
+ current_chunk = overlap_sentences + [sentence]
333
+ current_length = sum(len(s.split()) for s in current_chunk)
334
  start_sentence_idx = max(0, i - len(overlap_sentences))
335
+
336
  else:
337
  current_chunk.append(sentence)
338
  current_length += sentence_length
339
 
340
  # Add final chunk
341
  if current_chunk:
342
+ chunks.append({"text" : " ".join(current_chunk),
343
+ "start_sentence" : start_sentence_idx,
344
+ "end_sentence" : len(sentences) - 1,
345
+ "word_count" : current_length,
346
+ "chunk_id" : len(chunks),
347
+ })
 
348
 
349
  return chunks
350
 
351
+
 
 
 
352
  @staticmethod
353
  def text_similarity(text1: str, text2: str) -> float:
354
  """
355
  Calculate similarity between two texts (0-1 scale)
356
 
357
+ Arguments:
358
+ ----------
359
+ text1 { str } : First text
360
+
361
+ text2 { str } : Second text
362
 
363
  Returns:
364
+ --------
365
+ { float } : Similarity score (0.0 = completely different, 1.0 = identical)
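+ 
+ Example (illustrative):
+ >>> TextProcessor.text_similarity("Net 30 payment terms", "net 30 payment terms")
+ 1.0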
366
  """
367
  return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
368
 
369
+
370
  @staticmethod
371
  def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
372
  """
373
  Remove near-duplicate clauses
374
 
375
+ Arguments:
376
+ ----------
377
+ clauses { list } : List of clause texts
378
+
379
+ threshold { float } : Similarity threshold for deduplication (0.0-1.0)
380
 
381
  Returns:
382
+ --------
383
+ { list } : List of unique clauses
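+ 
+ Example (illustrative):
+ >>> TextProcessor.deduplicate_clauses(["Payment is due in 30 days.", "Payment is due in 30 days!"])
+ ['Payment is due in 30 days.']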
384
  """
385
+ unique = list()
386
 
387
  for clause in clauses:
388
+ is_duplicate = any(TextProcessor.text_similarity(clause, existing) > threshold for existing in unique)
389
+
 
 
390
  if not is_duplicate:
391
  unique.append(clause)
392
 
393
  return unique
394
 
395
+
 
 
 
396
  @staticmethod
397
  def detect_language(text: str) -> str:
398
  """
399
  Detect text language
400
 
401
+ Arguments:
402
+ ----------
403
+ text { str } : Input text
404
 
405
  Returns:
406
+ --------
407
+ { str } : ISO 639-1 language code (e.g., 'en', 'es', 'fr')
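+ 
+ Example (illustrative, requires langdetect):
+ >>> TextProcessor.detect_language("This agreement is governed by Delaware law.")
+ 'en'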
408
  """
409
  if not LANGDETECT_AVAILABLE:
410
+ # Default to English
411
+ return "en"
412
 
413
  try:
414
  # Use first 1000 chars for detection
415
  return detect(text[:1000])
416
+
417
  except LangDetectException:
418
  return "en"
419
 
 
 
 
420
 
421
  @staticmethod
422
  def get_text_statistics(text: str) -> Dict[str, Any]:
 
424
  Get comprehensive text statistics
425
 
426
  Returns:
427
+ --------
428
+ { dict } : Dictionary with character count, word count, sentence count, etc.
429
  """
430
+ sentences = TextProcessor.extract_sentences(text)
431
  paragraphs = TextProcessor.split_into_paragraphs(text)
432
+ words = text.split()
433
+
434
+ return {"character_count" : len(text),
435
+ "word_count" : len(words),
436
+ "sentence_count" : len(sentences),
437
+ "paragraph_count" : len(paragraphs),
438
+ "avg_words_per_sentence" : len(words) / len(sentences) if sentences else 0,
439
+ "avg_chars_per_word" : len(text) / len(words) if words else 0,
440
+ "language" : TextProcessor.detect_language(text),
441
+ }
 
 
 
 
 
442
 
443
+
444
  @staticmethod
445
+ def highlight_keywords(text: str, keywords: List[str], highlight_format: str = "**{}**") -> str:
 
446
  """
447
  Highlight keywords in text (for display purposes)
448
 
449
+ Arguments:
450
+ ----------
451
+ text { str } : Input text
452
+
453
+ keywords { list } : List of keywords to highlight
454
+
455
+ highlight_format { str } : Format string with {} placeholder (default: Markdown bold)
456
 
457
  Returns:
458
+ --------
459
+ { str } : Text with highlighted keywords
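+ 
+ Example (illustrative):
+ >>> TextProcessor.highlight_keywords("Indemnification survives termination", ["indemnification"])
+ '**Indemnification** survives termination'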
460
  """
461
  for keyword in keywords:
462
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
463
+ text = pattern.sub(lambda m: highlight_format.format(m.group(0)), text)
464
 
465
  return text
466
 
 
 
 
467
 
468
  @staticmethod
469
  def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
 
471
  Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)
472
 
473
  Returns:
474
+ --------
475
+ { list } : List of section dictionaries with number and text
476
  """
477
+ patterns = [(r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\n\s*\d+\.\d+|\n\n|$)', 'numbered'),
478
+ (r'(Article\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nArticle|\n\n|$)', 'article'),
479
+ (r'(Section\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nSection|\n\n|$)', 'section'),
480
+ (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\nClause|\n\n|$)', 'clause'),
481
+ ]
 
482
 
483
+ sections = list()
484
+
485
  for pattern, section_type in patterns:
486
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
487
+
488
  for match in matches:
489
+ sections.append({"reference" : match.group(1).strip(),
490
+ "text" : match.group(2).strip(),
491
+ "type" : section_type,
492
+ "start_pos" : match.start(),
493
+ "end_pos" : match.end(),
494
+ })
 
495
 
496
  # Sort by position
497
+ sections.sort(key = lambda x: x['start_pos'])
498
 
499
  return sections
500
 
501
+
502
  @staticmethod
503
  def clean_legal_text(text: str) -> str:
504
  """
505
  Clean legal text by removing boilerplate artifacts
506
 
507
+ Arguments:
508
+ ----------
509
+ text { str } : Input legal text
510
 
511
  Returns:
512
+ --------
513
+ { str } : Cleaned text
514
  """
515
  # Remove "Page X of Y" markers
516
+ text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags = re.IGNORECASE)
517
 
518
  # Remove "[Signature Page Follows]" type markers
519
+ text = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', text, flags = re.IGNORECASE)
520
 
521
  # Remove excessive underscores (signature lines)
522
  text = re.sub(r'_{3,}', '', text)
523
 
524
  # Remove "CONFIDENTIAL" watermarks
525
+ text = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', text, flags = re.IGNORECASE)
526
 
527
  # Clean up resulting whitespace
528
  text = re.sub(r'\n{3,}', '\n\n', text)
529
  text = re.sub(r' {2,}', ' ', text)
530
 
531
+ return text.strip()