satyaki-mitra committed
Commit d15efc9 · 1 Parent(s): 6fd8649

code refactor
api/__init__.py DELETED
File without changes
api/routes.py DELETED
File without changes
api/schemas.py DELETED
File without changes
app.py CHANGED
@@ -0,0 +1,733 @@
+"""
+FastAPI Application for AI Contract Risk Analyzer
+Complete pre-loading approach: All models loaded at startup
+Direct synchronous flow: Upload → Analyze → Return Results + PDF
+"""
+from fastapi.responses import JSONResponse, FileResponse, Response
+from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+import uuid
+import os
+from datetime import datetime
+from pathlib import Path
+import sys
+import tempfile
+import io
+
+# Add parent directory to path
+sys.path.append(str(Path(__file__).parent))
+
+# Import all services
+from config.settings import settings
+from config.risk_rules import ContractType
+from model_manager.model_loader import ModelLoader
+from utils.document_reader import DocumentReader
+from utils.validators import ContractValidator
+from utils.text_processor import TextProcessor
+from utils.logger import ContractAnalyzerLogger, log_info, log_error
+
+from services.contract_classifier import ContractClassifier
+from services.clause_extractor import ClauseExtractor
+from services.risk_analyzer import MultiFactorRiskAnalyzer
+from services.term_analyzer import TermAnalyzer
+from services.protection_checker import ProtectionChecker
+from services.llm_interpreter import LLMClauseInterpreter
+from services.negotiation_engine import NegotiationEngine
+from services.market_comparator import MarketComparator
+
+# Import PDF generator
+from reporter.pdf_generator import generate_pdf_report
+
+# Initialize logger
+ContractAnalyzerLogger.setup(log_dir="logs", app_name="contract_analyzer")
+logger = ContractAnalyzerLogger.get_logger()
+
+# ============================================================================
+# PYDANTIC SCHEMAS
+# ============================================================================
+
+class HealthResponse(BaseModel):
+    """Health check response"""
+    status: str
+    version: str
+    timestamp: str
+    models_loaded: int
+    services_loaded: int
+    memory_usage_mb: float
+
+class AnalysisOptions(BaseModel):
+    """Analysis options"""
+    max_clauses: int = Field(default=15, ge=5, le=30)
+    interpret_clauses: bool = Field(default=True)
+    generate_negotiation_points: bool = Field(default=True)
+    compare_to_market: bool = Field(default=True)
+
+class AnalysisResult(BaseModel):
+    """Complete analysis result"""
+    analysis_id: str
+    timestamp: str
+    classification: Dict[str, Any]
+    clauses: List[Dict[str, Any]]
+    risk_analysis: Dict[str, Any]
+    unfavorable_terms: List[Dict[str, Any]]
+    missing_protections: List[Dict[str, Any]]
+    clause_interpretations: Optional[List[Dict[str, Any]]] = None
+    negotiation_points: Optional[List[Dict[str, Any]]] = None
+    market_comparisons: Optional[List[Dict[str, Any]]] = None
+    executive_summary: str
+    metadata: Dict[str, Any]
+    pdf_available: bool = True
+
+class ErrorResponse(BaseModel):
+    """Error response"""
+    error: str
+    detail: str
+    timestamp: str
+
+# ============================================================================
+# SERVICE INITIALIZATION WITH FULL PRE-LOADING
+# ============================================================================
+
+class PreloadedAnalysisService:
+    """Analysis service with complete pre-loading of all models"""
+
+    def __init__(self):
+        self.model_loader = ModelLoader()
+        self.services = {}
+        self.service_status = {}
+        self.memory_usage_mb = 0
+        self._preload_all_services()
+
+    def _preload_all_services(self):
+        """Pre-load ALL services and models at initialization"""
+        log_info("PRE-LOADING ALL AI MODELS AND SERVICES")
+
+        try:
+            # Track memory usage
+            initial_memory = self._get_memory_usage()
+
+            # 1. Pre-load core classifier
+            log_info("🔄 Pre-loading Contract Classifier...")
+            self.services["classifier"] = ContractClassifier(self.model_loader)
+            self.service_status["classifier"] = "loaded"
+            log_info("✅ Contract Classifier loaded")
+
+            # 2. Pre-load Term Analyzer
+            log_info("🔄 Pre-loading Term Analyzer...")
+            self.services["term_analyzer"] = TermAnalyzer()
+            self.service_status["term_analyzer"] = "loaded"
+            log_info("✅ Term Analyzer loaded")
+
+            # 3. Pre-load Protection Checker
+            log_info("🔄 Pre-loading Protection Checker...")
+            self.services["protection_checker"] = ProtectionChecker()
+            self.service_status["protection_checker"] = "loaded"
+            log_info("✅ Protection Checker loaded")
+
+            # 4. Pre-load Market Comparator
+            log_info("🔄 Pre-loading Market Comparator...")
+            self.services["market_comparator"] = MarketComparator(self.model_loader)
+            self.service_status["market_comparator"] = "loaded"
+            log_info("✅ Market Comparator loaded")
+
+            # 5. Pre-load Clause Extractors for all major contract types
+            log_info("🔄 Pre-loading Clause Extractors...")
+            self.services["extractors"] = {}
+            major_categories = ["employment", "consulting", "nda", "software", "service", "partnership"]
+
+            for category in major_categories:
+                try:
+                    self.services["extractors"][category] = ClauseExtractor(
+                        self.model_loader, contract_category=category
+                    )
+                    log_info(f" ✅ Clause Extractor for {category} loaded")
+                except Exception as e:
+                    log_error(f"Failed to load extractor for {category}: {e}")
+                    self.services["extractors"][category] = None
+
+            self.service_status["extractors"] = f"loaded for {len(major_categories)} categories"
+            log_info("✅ All Clause Extractors loaded")
+
+            # 6. Pre-load Risk Analyzers for all contract types
+            log_info("🔄 Pre-loading Risk Analyzers...")
+            self.services["risk_analyzers"] = {}
+            contract_types = [
+                ContractType.EMPLOYMENT, ContractType.CONSULTING, ContractType.NDA,
+                ContractType.SOFTWARE, ContractType.SERVICE, ContractType.PARTNERSHIP,
+                ContractType.LEASE, ContractType.PURCHASE, ContractType.GENERAL
+            ]
+
+            for contract_type in contract_types:
+                try:
+                    self.services["risk_analyzers"][contract_type.value] = MultiFactorRiskAnalyzer(
+                        contract_type=contract_type
+                    )
+                    log_info(f" ✅ Risk Analyzer for {contract_type.value} loaded")
+                except Exception as e:
+                    log_error(f"Failed to load risk analyzer for {contract_type.value}: {e}")
+                    self.services["risk_analyzers"][contract_type.value] = None
+
+            self.service_status["risk_analyzers"] = f"loaded for {len(contract_types)} types"
+            log_info("✅ All Risk Analyzers loaded")
+
+            # 7. Pre-load LLM Interpreter (if available)
+            log_info("🔄 Pre-loading LLM Interpreter...")
+            try:
+                self.services["interpreter"] = LLMClauseInterpreter()
+                self.service_status["interpreter"] = "loaded"
+                log_info("✅ LLM Interpreter loaded")
+            except Exception as e:
+                self.services["interpreter"] = None
+                self.service_status["interpreter"] = f"failed: {str(e)}"
+                log_info("⚠️ LLM Interpreter not available (will skip interpretation)")
+
+            # 8. Pre-load Negotiation Engine (if available)
+            log_info("🔄 Pre-loading Negotiation Engine...")
+            try:
+                self.services["negotiation_engine"] = NegotiationEngine()
+                self.service_status["negotiation_engine"] = "loaded"
+                log_info("✅ Negotiation Engine loaded")
+            except Exception as e:
+                self.services["negotiation_engine"] = None
+                self.service_status["negotiation_engine"] = f"failed: {str(e)}"
+                log_info("⚠️ Negotiation Engine not available (will skip negotiation points)")
+
+            # Calculate memory usage
+            final_memory = self._get_memory_usage()
+            self.memory_usage_mb = final_memory - initial_memory
+
+            log_info("🎉 ALL SERVICES PRE-LOADED SUCCESSFULLY!")
+            log_info(f"📊 Memory Usage: {self.memory_usage_mb:.2f} MB")
+            log_info(f"🔧 Services Loaded: {len(self.service_status)}")
+
+        except Exception as e:
+            log_error(f"CRITICAL: Failed to pre-load services: {e}")
+            raise
+
+    def _get_memory_usage(self) -> float:
+        """Get current memory usage in MB"""
+        try:
+            import psutil
+            process = psutil.Process()
+            return process.memory_info().rss / 1024 / 1024
+        except ImportError:
+            return 0.0
+
+    def get_service_status(self) -> Dict[str, Any]:
+        """Get detailed service status"""
+        model_stats = self.model_loader.get_registry_stats()
+        return {
+            "services": self.service_status,
+            "models": model_stats,
+            "memory_usage_mb": self.memory_usage_mb,
+            "total_services_loaded": len([s for s in self.service_status.values() if "loaded" in str(s)]),
+            "total_models_loaded": model_stats.get("loaded_models", 0)
+        }
+
+    def analyze_contract(self, contract_text: str, options: AnalysisOptions) -> Dict[str, Any]:
+        """Synchronous contract analysis using pre-loaded services"""
+        try:
+            log_info("Starting contract analysis with pre-loaded services...")
+
+            # Step 1: Classify contract
+            classification = self.services["classifier"].classify_contract(contract_text)
+            classification_dict = classification.to_dict()
+            actual_category = classification.category
+
+            log_info(f"Contract classified as: {actual_category}")
+
+            # Step 2: Get appropriate extractor
+            extractor = self.services["extractors"].get(actual_category)
+            if not extractor:
+                # Fallback to first available extractor or create new one
+                available_categories = [cat for cat, ext in self.services["extractors"].items() if ext is not None]
+                if available_categories:
+                    fallback_category = available_categories[0]
+                    extractor = self.services["extractors"][fallback_category]
+                    log_info(f"Using fallback extractor for: {fallback_category}")
+                else:
+                    # Create new extractor for this category
+                    extractor = ClauseExtractor(self.model_loader, contract_category=actual_category)
+                    self.services["extractors"][actual_category] = extractor
+
+            # Extract clauses
+            clauses = extractor.extract_clauses(contract_text, options.max_clauses)
+            clauses_dict = [clause.to_dict() for clause in clauses]
+            log_info(f"Extracted {len(clauses)} clauses")
+
+            # Step 3: Map to ContractType and get appropriate risk analyzer
+            contract_type_mapping = {
+                'employment': ContractType.EMPLOYMENT,
+                'consulting': ContractType.CONSULTING,
+                'nda': ContractType.NDA,
+                'technology': ContractType.SOFTWARE,
+                'software': ContractType.SOFTWARE,
+                'service_agreement': ContractType.SERVICE,
+                'business': ContractType.PARTNERSHIP,
+                'real_estate': ContractType.LEASE,
+                'sales': ContractType.PURCHASE,
+            }
+            contract_type = contract_type_mapping.get(actual_category, ContractType.GENERAL)
+
+            risk_analyzer = self.services["risk_analyzers"].get(contract_type.value)
+            if not risk_analyzer:
+                # Fallback to general analyzer
+                risk_analyzer = self.services["risk_analyzers"]["general"]
+
+            # Analyze risk
+            risk_score = risk_analyzer.analyze_risk(contract_text, clauses)
+            risk_dict = risk_score.to_dict()
+            log_info(f"Risk analysis completed: {risk_dict['overall_score']}/100")
+
+            # Step 4: Find unfavorable terms
+            unfavorable_terms = self.services["term_analyzer"].analyze_unfavorable_terms(contract_text, clauses)
+            unfavorable_dict = [term.to_dict() for term in unfavorable_terms]
+            log_info(f"Found {len(unfavorable_terms)} unfavorable terms")
+
+            # Step 5: Check missing protections
+            missing_protections = self.services["protection_checker"].check_missing_protections(contract_text, clauses)
+            missing_dict = [prot.to_dict() for prot in missing_protections]
+            log_info(f"Found {len(missing_protections)} missing protections")
+
+            # Optional steps
+            interpretations_dict = None
+            negotiation_dict = None
+            market_dict = None
+
+            if options.interpret_clauses and self.services["interpreter"]:
+                try:
+                    interpretations = self.services["interpreter"].interpret_clauses(
+                        clauses, min(10, options.max_clauses)
+                    )
+                    interpretations_dict = [interp.to_dict() for interp in interpretations]
+                    log_info(f"Interpreted {len(interpretations)} clauses")
+                except Exception as e:
+                    log_error(f"Clause interpretation failed: {e}")
+                    interpretations_dict = []
+
+            if options.generate_negotiation_points and self.services["negotiation_engine"]:
+                try:
+                    negotiation_points = self.services["negotiation_engine"].generate_negotiation_points(
+                        risk_score, unfavorable_terms, missing_protections, clauses, 7
+                    )
+                    negotiation_dict = [point.to_dict() for point in negotiation_points]
+                    log_info(f"Generated {len(negotiation_points)} negotiation points")
+                except Exception as e:
+                    log_error(f"Negotiation points generation failed: {e}")
+                    negotiation_dict = []
+
+            if options.compare_to_market:
+                try:
+                    market_comparisons = self.services["market_comparator"].compare_to_market(clauses)
+                    market_dict = [comp.to_dict() for comp in market_comparisons]
+                    log_info(f"Compared {len(market_comparisons)} clauses to market")
+                except Exception as e:
+                    log_error(f"Market comparison failed: {e}")
+                    market_dict = []
+
+            # Generate executive summary
+            executive_summary = self._generate_executive_summary(
+                classification_dict, risk_dict, unfavorable_dict, missing_dict
+            )
+
+            # Build result
+            result = {
+                "analysis_id": str(uuid.uuid4()),
+                "timestamp": datetime.now().isoformat(),
+                "classification": classification_dict,
+                "clauses": clauses_dict,
+                "risk_analysis": risk_dict,
+                "unfavorable_terms": unfavorable_dict,
+                "missing_protections": missing_dict,
+                "clause_interpretations": interpretations_dict,
+                "negotiation_points": negotiation_dict,
+                "market_comparisons": market_dict,
+                "executive_summary": executive_summary,
+                "metadata": {
+                    "text_length": len(contract_text),
+                    "word_count": len(contract_text.split()),
+                    "num_clauses": len(clauses),
+                    "contract_type": contract_type.value,
+                    "actual_category": actual_category,
+                    "options": options.dict()
+                },
+                "pdf_available": True
+            }
+
+            log_info("Contract analysis completed successfully")
+            return result
+
+        except Exception as e:
+            log_error(f"Contract analysis failed: {e}")
+            raise
+
+    def _generate_executive_summary(self, classification: Dict, risk_score: Dict,
+                                    unfavorable_terms: List, missing_protections: List) -> str:
+        """Generate executive summary"""
+        category = classification.get("category", "Unknown")
+        score = risk_score.get("overall_score", 0)
+        risk_level = risk_score.get("risk_level", "UNKNOWN")
+
+        critical_terms = sum(1 for t in unfavorable_terms if t.get('severity') == 'critical')
+        critical_protections = sum(1 for p in missing_protections if p.get('importance') == 'critical')
+
+        if score >= 80:
+            risk_msg = "CRITICAL ATTENTION REQUIRED"
+        elif score >= 60:
+            risk_msg = "SIGNIFICANT CONCERNS"
+        elif score >= 40:
+            risk_msg = "MODERATE RISK"
+        else:
+            risk_msg = "LOW RISK"
+
+        return f"This {category} contract scored {score}/100 ({risk_level.upper()} risk). {risk_msg}. Found {len(unfavorable_terms)} unfavorable terms ({critical_terms} critical) and {len(missing_protections)} missing protections ({critical_protections} critical). Review detailed analysis below."
+
+# ============================================================================
+# FASTAPI APP
+# ============================================================================
+
+app = FastAPI(
+    title=settings.APP_NAME,
+    version=settings.APP_VERSION,
+    description="AI-powered contract risk analysis with complete model pre-loading",
+    docs_url="/api/docs",
+    redoc_url="/api/redoc"
+)
+
+# Serve static files
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.CORS_ORIGINS,
+    allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
+    allow_methods=settings.CORS_ALLOW_METHODS,
+    allow_headers=settings.CORS_ALLOW_HEADERS
+)
+
+# Initialize pre-loaded analysis service
+analysis_service = PreloadedAnalysisService()
+
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+
+def validate_file(file: UploadFile) -> tuple[bool, str]:
+    """File validation using settings from config"""
+    file_ext = os.path.splitext(file.filename)[1].lower()
+    if file_ext not in settings.ALLOWED_EXTENSIONS:
+        return False, f"Invalid file type. Allowed: {', '.join(settings.ALLOWED_EXTENSIONS)}"
+
+    file.file.seek(0, 2)
+    size = file.file.tell()
+    file.file.seek(0)
+
+    if size > settings.MAX_UPLOAD_SIZE:
+        return False, f"File too large. Max size: {settings.MAX_UPLOAD_SIZE / (1024*1024)}MB"
+
+    if size == 0:
+        return False, "File is empty"
+
+    return True, "OK"
+
+def read_contract_file(file: UploadFile) -> str:
+    """Read contract text from file using DocumentReader"""
+    file_ext = os.path.splitext(file.filename)[1].lower()
+    file_type = "pdf" if file_ext == ".pdf" else "docx" if file_ext == ".docx" else "txt"
+
+    reader = DocumentReader()
+    file_contents = reader.read_file(file.file, file_type)
+
+    # Handle both string and dict return types from DocumentReader
+    if isinstance(file_contents, dict):
+        return file_contents.get('text', '') or file_contents.get('content', '')
+    else:
+        return str(file_contents)
+
+def validate_contract_text(text: str) -> tuple[bool, str]:
+    """Validate contract text using settings"""
+    if not text or not text.strip():
+        return False, "Contract text is empty"
+
+    if len(text) < settings.MIN_CONTRACT_LENGTH:
+        return False, f"Contract text too short. Minimum {settings.MIN_CONTRACT_LENGTH} characters required."
+
+    if len(text) > settings.MAX_CONTRACT_LENGTH:
+        return False, f"Contract text too long. Maximum {settings.MAX_CONTRACT_LENGTH} characters allowed."
+
+    return True, "OK"
+
+# ============================================================================
+# API ROUTES
+# ============================================================================
+
+@app.get("/")
+async def serve_frontend():
+    """Serve the frontend"""
+    return FileResponse("static/index.html")
+
+@app.get("/api/v1/health", response_model=HealthResponse)
+async def health_check():
+    """Health check endpoint with service status"""
+    service_status = analysis_service.get_service_status()
+
+    return HealthResponse(
+        status="healthy",
+        version=settings.APP_VERSION,
+        timestamp=datetime.now().isoformat(),
+        models_loaded=service_status["total_models_loaded"],
+        services_loaded=service_status["total_services_loaded"],
+        memory_usage_mb=service_status["memory_usage_mb"]
+    )
+
+@app.get("/api/v1/status")
+async def get_detailed_status():
+    """Get detailed service status"""
+    return analysis_service.get_service_status()
+
+@app.post("/api/v1/analyze/file", response_model=AnalysisResult)
+async def analyze_contract_file(
+    file: UploadFile = File(...),
+    max_clauses: int = Form(15),
+    interpret_clauses: bool = Form(True),
+    generate_negotiation_points: bool = Form(True),
+    compare_to_market: bool = Form(True)
+):
+    """Analyze uploaded contract file - DIRECT SYNC FLOW"""
+    try:
+        # Validate file
+        is_valid, message = validate_file(file)
+        if not is_valid:
+            raise HTTPException(status_code=400, detail=message)
+
+        # Read contract text
+        contract_text = read_contract_file(file)
+
+        # Validate contract text
+        is_valid_text, text_message = validate_contract_text(contract_text)
+        if not is_valid_text:
+            raise HTTPException(status_code=400, detail=text_message)
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)
+
+        if not is_valid_contract:
+            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")
+
+        # Create analysis options
+        options = AnalysisOptions(
+            max_clauses=min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE),
+            interpret_clauses=interpret_clauses,
+            generate_negotiation_points=generate_negotiation_points,
+            compare_to_market=compare_to_market
+        )
+
+        # Perform analysis (SYNCHRONOUS with pre-loaded services)
+        result = analysis_service.analyze_contract(contract_text, options)
+
+        log_info(f"File analysis completed",
+                 filename=file.filename,
+                 analysis_id=result["analysis_id"],
+                 risk_score=result["risk_analysis"]["overall_score"])
+
+        return AnalysisResult(**result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        log_error(f"File analysis failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+@app.post("/api/v1/analyze/text", response_model=AnalysisResult)
+async def analyze_contract_text(
+    contract_text: str = Form(..., description="Contract text to analyze"),
+    max_clauses: int = Form(15),
+    interpret_clauses: bool = Form(True),
+    generate_negotiation_points: bool = Form(True),
+    compare_to_market: bool = Form(True)
+):
+    """Analyze pasted contract text - DIRECT SYNC FLOW"""
+    try:
+        # Validate contract text
+        is_valid, message = validate_contract_text(contract_text)
+        if not is_valid:
+            raise HTTPException(status_code=400, detail=message)
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        is_valid_contract, contract_type, confidence = validator.is_valid_contract(contract_text)
+
+        if not is_valid_contract:
+            raise HTTPException(status_code=400, detail=f"Invalid contract: {confidence}")
+
+        # Create analysis options
+        options = AnalysisOptions(
+            max_clauses=min(max_clauses, settings.MAX_CLAUSES_TO_ANALYZE),
+            interpret_clauses=interpret_clauses,
+            generate_negotiation_points=generate_negotiation_points,
+            compare_to_market=compare_to_market
+        )
+
+        # Perform analysis (SYNCHRONOUS with pre-loaded services)
+        result = analysis_service.analyze_contract(contract_text, options)
+
+        log_info(f"Text analysis completed",
+                 analysis_id=result["analysis_id"],
+                 risk_score=result["risk_analysis"]["overall_score"])
+
+        return AnalysisResult(**result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        log_error(f"Text analysis failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+@app.post("/api/v1/generate-pdf")
+async def generate_pdf_from_analysis(analysis_result: Dict[str, Any]):
+    """Generate PDF from analysis results"""
+    try:
+        pdf_buffer = generate_pdf_report(analysis_result)
+
+        analysis_id = analysis_result.get('analysis_id', 'report')
+        return Response(
+            content=pdf_buffer.getvalue(),
+            media_type="application/pdf",
+            headers={
+                "Content-Disposition": f"attachment; filename=contract_analysis_{analysis_id}.pdf"
+            }
+        )
+    except Exception as e:
+        log_error(f"PDF generation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to generate PDF: {str(e)}")
+
+@app.get("/api/v1/categories")
+async def get_contract_categories():
+    """Get list of supported contract categories"""
+    try:
+        categories = analysis_service.services["classifier"].get_all_categories()
+        return {"categories": categories}
+    except Exception as e:
+        log_error(f"Categories fetch failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to get categories: {str(e)}")
+
+@app.post("/api/v1/validate/file")
+async def validate_contract_file(file: UploadFile = File(...)):
+    """Quick validation endpoint"""
+    try:
+        is_valid, message = validate_file(file)
+        if not is_valid:
+            return {"valid": False, "message": message}
+
+        contract_text = read_contract_file(file)
+
+        # Validate text length
+        is_valid_text, text_message = validate_contract_text(contract_text)
+        if not is_valid_text:
+            return {"valid": False, "message": text_message}
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        report = validator.get_validation_report(contract_text)
+
+        return {
+            "valid": report["scores"]["total"] > 50 and is_valid_text,
+            "message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
+            "confidence": report["scores"]["total"],
+            "report": report
+        }
+
+    except Exception as e:
+        log_error(f"File validation failed: {e}")
+        raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
+
+@app.post("/api/v1/validate/text")
+async def validate_contract_text_endpoint(contract_text: str = Form(...)):
+    """Validate pasted contract text"""
+    try:
+        # Validate text length
+        is_valid, message = validate_contract_text(contract_text)
+        if not is_valid:
+            return {"valid": False, "message": message}
+
+        # Validate contract structure using ContractValidator
+        validator = ContractValidator()
+        report = validator.get_validation_report(contract_text)
+
+        return {
+            "valid": report["scores"]["total"] > 50 and is_valid,
+            "message": "Contract appears valid" if report["scores"]["total"] > 50 else "May not be a valid contract",
+            "confidence": report["scores"]["total"],
+            "report": report
+        }
+
+    except Exception as e:
+        log_error(f"Text validation failed: {e}")
+        raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
+
+# ============================================================================
+# ERROR HANDLERS
+# ============================================================================
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request, exc):
+    """Handle HTTP exceptions"""
+    return JSONResponse(
+        status_code=exc.status_code,
+        content=ErrorResponse(
+            error=exc.detail,
+            detail=str(exc.detail),
+            timestamp=datetime.now().isoformat()
+        ).dict()
+    )
+
+@app.exception_handler(Exception)
+async def general_exception_handler(request, exc):
+    """Handle general exceptions"""
+    log_error(f"Unhandled exception: {exc}")
+    return JSONResponse(
+        status_code=500,
+        content=ErrorResponse(
+            error="Internal server error",
+            detail=str(exc),
+            timestamp=datetime.now().isoformat()
+        ).dict()
+    )
+
+# ============================================================================
+# STARTUP & SHUTDOWN
+# ============================================================================
+
+@app.on_event("startup")
+async def startup_event():
+    """Startup event - Services are already pre-loaded"""
+    log_info(f"🚀 {settings.APP_NAME} v{settings.APP_VERSION} STARTED")
+    log_info(f"📍 Server: {settings.HOST}:{settings.PORT}")
+    log_info(f"🔧 All models and services pre-loaded")
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Shutdown event"""
+    log_info("🛑 Shutting down server...")
+    log_info("✅ Server shutdown complete")
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(
+        "app:app",
+        host=settings.HOST,
+        port=settings.PORT,
+        reload=settings.RELOAD,
+        workers=1,  # Single worker for synchronous flow
+        log_level=settings.LOG_LEVEL.lower()
+    )
config/model_config.py CHANGED
@@ -4,40 +4,91 @@ from pathlib import Path
 
 class ModelConfig:
     """
-    Central configuration for all models
+    Model-specific configurations - FOR AI MODEL SETTINGS ONLY
     """
-    # Base directories
-    BASE_DIR = Path(__file__).parent.parent
-    MODEL_DIR = BASE_DIR / "models"
-    CACHE_DIR = BASE_DIR / "cache"
+    # Model Architecture Settings
+    LEGAL_BERT = {"model_name"      : "nlpaueb/legal-bert-base-uncased",
+                  "task"            : "clause-extraction",
+                  "max_length"      : 512,
+                  "batch_size"      : 16,
+                  "hidden_dim"      : 768,
+                  "num_layers"      : 12,
+                  "attention_heads" : 12,
+                  }
 
-    # Legal-BERT Configuration (for clause extraction)
-    LEGAL_BERT = {"model_name" : "nlpaueb/legal-bert-base-uncased",
-                  "task"       : "clause-extraction",
-                  "max_length" : 512,
-                  "batch_size" : 16,
-                  "local_path" : MODEL_DIR / "legal-bert",
-                  }
+    # Embedding Model Settings
+    EMBEDDING_MODEL = {"model_name"           : "sentence-transformers/all-MiniLM-L6-v2",
+                       "dimension"            : 384,
+                       "pooling"              : "mean",
+                       "normalize"            : True,
+                       "similarity_threshold" : 0.7,
+                       }
 
-    # Embedding Model for Semantic Search
-    EMBEDDING_MODEL = {"model_name" : "sentence-transformers/all-MiniLM-L6-v2",
-                       "dimension"  : 384,
-                       "local_path" : MODEL_DIR / "embeddings",
-                       }
+    # Classification Model Settings
+    CLASSIFIER_MODEL = {"embedding_dim"  : 384,
+                        "hidden_dim"     : 256,
+                        "num_categories" : 12,
+                        "dropout_rate"   : 0.1,
+                        "learning_rate"  : 2e-5,
+                        "max_seq_length" : 512,
+                        }
 
-    # LLM for Analysis (Ollama)
-    LLM_CONFIG = {"base_url"    : "http://localhost:11434",
-                  "model"       : "mistral:7b",
-                  "temperature" : 0.1,
-                  "max_tokens"  : 5000,
-                  "timeout"     : 120,
-                  }
+    # Clause Extraction Settings
+    CLAUSE_EXTRACTION = {"min_clause_length"    : 50,
+                         "max_clause_length"    : 2000,
+                         "confidence_threshold" : 0.7,
+                         "overlap_threshold"    : 0.3,
+                         "max_clauses_per_doc"  : 50,
+                         }
+
+    # Risk Analysis Settings
+    RISK_ANALYSIS = {"score_ranges" : {"low"      : (0, 40),
+                                       "medium"   : (40, 60),
+                                       "high"     : (60, 80),
+                                       "critical" : (80, 100),
+                                       },
+                     "weight_decay"     : 0.1,
+                     "smoothing_factor" : 0.5,
+                     }
+
+    # Market Comparison Settings
+    MARKET_COMPARISON = {"similarity_threshold" : 0.75,
+                         "min_matches_required" : 3,
+                         "max_comparisons"      : 20,
+                         "embedding_cache_size" : 1000,
+                         }
+
+    # LLM Generation Settings
+    LLM_GENERATION = {"max_tokens"        : 5000,
+                      "temperature"       : 0.1,
+                      "top_p"             : 0.9,
+                      "frequency_penalty" : 0.1,
+                      "presence_penalty"  : 0.1,
+                      "stop_sequences"    : ["\n\n", "###", "---"],
+                      }
+
+    # Text Processing Settings
+    TEXT_PROCESSING = {"chunk_size"          : 512,
+                       "chunk_overlap"       : 50,
+                       "min_sentence_length" : 10,
+                       "max_sentence_length" : 200,
+                       "entity_confidence"   : 0.8,
+                       }
 
 
     @classmethod
-    def ensure_directories(cls):
+    def get_model_config(cls, model_type: str) -> dict:
         """
-        Create necessary directories
+        Get configuration for specific model type
         """
-        cls.MODEL_DIR.mkdir(parents = True, exist_ok = True)
-        cls.CACHE_DIR.mkdir(parents = True, exist_ok = True)
+        config_map = {"legal_bert"        : cls.LEGAL_BERT,
+                      "embedding"         : cls.EMBEDDING_MODEL,
+                      "classifier"        : cls.CLASSIFIER_MODEL,
+                      "clause_extraction" : cls.CLAUSE_EXTRACTION,
+                      "risk_analysis"     : cls.RISK_ANALYSIS,
+                      "market_comparison" : cls.MARKET_COMPARISON,
+                      "llm_generation"    : cls.LLM_GENERATION,
+                      "text_processing"   : cls.TEXT_PROCESSING,
+                      }
+
+        return config_map.get(model_type, {})
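A quick sketch of how the new accessor is meant to be consumed (the lookup keys come from config_map above; the printed values are the defaults defined in this file):

    from config.model_config import ModelConfig

    bert_cfg = ModelConfig.get_model_config("legal_bert")
    print(bert_cfg["model_name"])   # nlpaueb/legal-bert-base-uncased
    print(bert_cfg["max_length"])   # 512

    # Unknown model types fall back to an empty dict instead of raising
    assert ModelConfig.get_model_config("nonexistent") == {}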
config/risk_rules.py CHANGED
@@ -33,71 +33,91 @@ class RiskRules:
     }
 
     # Contract-specific weight adjustments
-    CONTRACT_TYPE_ADJUSTMENTS = {ContractType.EMPLOYMENT : {"restrictive_covenants" : 1.3, "compensation_benefits": 1.4, "termination_rights": 1.2},
-                                 ContractType.SOFTWARE   : {"intellectual_property" : 1.5, "penalties_liability" : 1.3},
-                                 ContractType.NDA        : {"restrictive_covenants" : 1.8, "penalties_liability" : 1.2},
-                                 }
+    CONTRACT_TYPE_ADJUSTMENTS = {
+        ContractType.EMPLOYMENT : {
+            "restrictive_covenants" : 1.3,
+            "compensation_benefits" : 1.4,
+            "termination_rights"    : 1.2,
+        },
+        ContractType.SOFTWARE : {
+            "intellectual_property" : 1.5,
+            "penalties_liability"   : 1.3,
+        },
+        ContractType.NDA : {
+            "restrictive_covenants" : 1.8,
+            "penalties_liability"   : 1.2,
+        },
+        ContractType.CONSULTING : {
+            "compensation_benefits" : 1.3,
+            "termination_rights"    : 1.1,
+        },
+    }
 
     # KEYWORD SEVERITY SCORING (Multi-tier system)
     # Critical keywords (Tier 1: 20-25 points each)
-    CRITICAL_KEYWORDS = {"non-compete"         : 25,
-                         "non-solicit"         : 23,
-                         "non-solicitation"    : 23,
-                         "forfeit"             : 25,
-                         "liquidated damages"  : 24,
-                         "wage withholding"    : 25,
-                         "unlimited liability" : 25,
-                         "joint and several"   : 23,
-                         "perpetual"           : 22,
-                         "irrevocable"         : 20,
-                         }
+    CRITICAL_KEYWORDS = {
+        "non-compete"         : 25,
+        "non-solicit"         : 23,
+        "non-solicitation"    : 23,
+        "forfeit"             : 25,
+        "liquidated damages"  : 24,
+        "wage withholding"    : 25,
+        "unlimited liability" : 25,
+        "joint and several"   : 23,
+        "perpetual"           : 22,
+        "irrevocable"         : 20,
+    }
 
     # High-risk keywords (Tier 2: 12-18 points)
-    HIGH_RISK_KEYWORDS = {"indemnify"             : 18,
-                          "indemnification"       : 18,
-                          "hold harmless"         : 17,
-                          "penalty"               : 18,
-                          "damages"               : 15,
-                          "breach"                : 15,
-                          "default"               : 14,
-                          "immediate termination" : 16,
-                          "without cause"         : 15,
-                          "sole discretion"       : 17,
-                          "at-will"               : 14,
-                          "waive"                 : 16,
-                          "release"               : 15,
-                          }
+    HIGH_RISK_KEYWORDS = {
+        "indemnify"             : 18,
+        "indemnification"       : 18,
+        "hold harmless"         : 17,
+        "penalty"               : 18,
+        "damages"               : 15,
+        "breach"                : 15,
+        "default"               : 14,
+        "immediate termination" : 16,
+        "without cause"         : 15,
+        "sole discretion"       : 17,
+        "at-will"               : 14,
+        "waive"                 : 16,
+        "release"               : 15,
+    }
 
     # Medium-risk keywords (Tier 3: 6-10 points)
-    MEDIUM_RISK_KEYWORDS = {"confidential"   : 8,
-                            "proprietary"    : 8,
-                            "trade secret"   : 10,
-                            "terminate"      : 7,
-                            "termination"    : 7,
-                            "assignment"     : 6,
-                            "exclusive"      : 9,
-                            "warranty"       : 8,
-                            "representation" : 7,
-                            "covenant"       : 8,
-                            "jurisdiction"   : 6,
-                            "governing law"  : 6,
-                            }
+    MEDIUM_RISK_KEYWORDS = {
+        "confidential"   : 8,
+        "proprietary"    : 8,
+        "trade secret"   : 10,
+        "terminate"      : 7,
+        "termination"    : 7,
+        "assignment"     : 6,
+        "exclusive"      : 9,
+        "warranty"       : 8,
+        "representation" : 7,
+        "covenant"       : 8,
+        "jurisdiction"   : 6,
+        "governing law"  : 6,
+    }
 
     # STRUCTURAL PATTERN ANALYSIS (Pattern-based risk detection)
-    RISKY_PATTERNS = [(r'\d+\s*(year|yr|month|mo)s?\s*(non-compete|non-solicit)', 20, "Long duration restrictive covenant"),
-                      (r'(entire|all|worldwide|global)\s*(industry|market|territory)', 18, "Overly broad geographic/industry scope"),
-                      (r'notice\s+period.*\d+\s*days.*employee.*\d+\s*days.*employer', 15, "Unequal notice periods"),
-                      (r'(may|can|shall)\s+(withhold|deduct|retain).*compensation', 22, "Wage withholding clause"),
-                      (r'(unlimited|no\s+limit|without\s+limitation).*liability', 25, "Unlimited liability exposure"),
-                      (r'(sole|absolute|unfettered)\s+discretion', 18, "One-sided discretionary power"),
-                      (r'penalty.*(?:equal\s+to|of|amount).*\$?\d+', 16, "Specific penalty amount"),
-                      (r'(automatically|immediately)\s+(renew|extend)', 12, "Auto-renewal clause"),
-                      (r'waive.*right.*arbitration', 20, "Arbitration rights waiver"),
-                      (r'(all|any).*intellectual\s+property.*created', 17, "Broad IP assignment"),
-                      ]
+    RISKY_PATTERNS = [
+        (r'\d+\s*(year|yr|month|mo)s?\s*(non-compete|non-solicit)', 20, "Long duration restrictive covenant"),
+        (r'(entire|all|worldwide|global)\s*(industry|market|territory)', 18, "Overly broad geographic/industry scope"),
+        (r'notice\s+period.*\d+\s*days.*employee.*\d+\s*days.*employer', 15, "Unequal notice periods"),
+        (r'(may|can|shall)\s+(withhold|deduct|retain).*compensation', 22, "Wage withholding clause"),
+        (r'(unlimited|no\s+limit|without\s+limitation).*liability', 25, "Unlimited liability exposure"),
+        (r'(sole|absolute|unfettered)\s+discretion', 18, "One-sided discretionary power"),
+        (r'penalty.*(?:equal\s+to|of|amount).*\$?\d+', 16, "Specific penalty amount"),
+        (r'(automatically|immediately)\s+(renew|extend)', 12, "Auto-renewal clause"),
+        (r'waive.*right.*arbitration', 20, "Arbitration rights waiver"),
+        (r'(all|any).*intellectual\s+property.*created', 17, "Broad IP assignment"),
+    ]
 
     # CLAUSE-LEVEL RISK FACTORS (Detailed clause analysis)
-    CLAUSE_RISK_FACTORS = {"non-compete": {
+    CLAUSE_RISK_FACTORS = {
+        "non-compete": {
             "base_risk": 70,
             "duration_check": {
                 # months: risk_adjustment
@@ -189,40 +209,6 @@ class RiskRules:
                 "work for hire limited": -10
             }
         },
-
-        "liability": {
-            "base_risk": 65,
-            "red_flags": {
-                "unlimited": +30,
-                "consequential damages": +15,
-                "indirect damages": +12,
-                "punitive damages": +18,
-                "no cap": +25
-            },
-            "protections": {
-                "liability cap": -20,
-                "mutual cap": -15,
-                "limited to fees paid": -18
-            }
-        },
-
-        "confidentiality": {
-            "base_risk": 45,
-            "red_flags": {
-                "perpetual": +20,
-                "forever": +20,
-                "indefinite": +18,
-                "all information": +15,
-                "any information": +15
-            },
-            "reasonable_terms": {
-                "3 years": -5,
-                "5 years": 0,
-                "7 years": +5,
-                "marked confidential": -8,
-                "reasonably necessary": -10
-            }
-        }
     }
 
     # =========================================================================
@@ -249,13 +235,6 @@ class RiskRules:
            "consulting": {"generous": 3, "standard": 1, "restrictive": 0.5},
            "general": {"generous": 12, "standard": 6, "restrictive": 1}
         },
-
-        "ip_assignment_scope": {
-            "tech": "work_product_only",           # Standard
-            "creative": "commissioned_work_only",  # Standard
-            "consulting": "deliverables_only",     # Standard
-            "general": "work_for_hire"             # Standard
-        }
     }
 
     # =========================================================================
@@ -298,11 +277,6 @@ class RiskRules:
            "risk_if_missing": 15,
            "categories": ["general"]
         },
-        "change_control_process": {
-            "importance": "medium",
-            "risk_if_missing": 10,
-            "categories": ["general"]
-        }
     }
 
     # =========================================================================
@@ -329,5 +303,4 @@ class RiskRules:
 
         # Normalize to sum to 100
         total = sum(adjusted.values())
-        return {k: (v / total) * 100 for k, v in adjusted.items()}
-
+        return {k: (v / total) * 100 for k, v in adjusted.items()}
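A worked example of the adjust-then-normalize pattern that closes this file (the base category weights here are illustrative; only the multiplier table and the final normalization line are visible in the diff):

    # Equal base weights across five risk categories (hypothetical)
    base = {"restrictive_covenants": 20, "compensation_benefits": 20,
            "termination_rights": 20, "intellectual_property": 20,
            "penalties_liability": 20}

    # NDA row from CONTRACT_TYPE_ADJUSTMENTS above
    multipliers = {"restrictive_covenants": 1.8, "penalties_liability": 1.2}

    adjusted = {k: v * multipliers.get(k, 1.0) for k, v in base.items()}
    total = sum(adjusted.values())
    normalized = {k: (v / total) * 100 for k, v in adjusted.items()}

    assert abs(sum(normalized.values()) - 100) < 1e-9
    print(round(normalized["restrictive_covenants"], 1))  # 30.0, up from 20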
config/settings.py CHANGED
@@ -7,12 +7,12 @@ from pydantic_settings import BaseSettings
 
 class Settings(BaseSettings):
     """
-    Application-wide settings
+    Application-wide settings: primary configuration source
     """
     # Application Info
     APP_NAME    : str = "AI Contract Risk Analyzer"
     APP_VERSION : str = "1.0.0"
-    API_PREFIX  : str = "/api/"
+    API_PREFIX  : str = "/api/v1/"
 
     # Server Configuration
     HOST : str = "0.0.0.0"
@@ -28,43 +28,48 @@ class Settings(BaseSettings):
 
     # File Upload Settings
     MAX_UPLOAD_SIZE    : int = 10 * 1024 * 1024   # 10 MB
-    ALLOWED_EXTENSIONS : list = [".pdf", ".docx"]
+    ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
     UPLOAD_DIR         : Path = Path("uploads")
 
-    # Model Settings
-    MODEL_CACHE_SIZE       : int = 3      # Number of models to keep in memory
-    MODEL_DOWNLOAD_TIMEOUT : int = 300    # 5 minutes
-    USE_GPU                : bool = True  # Automatically detect and use GPU if available
+    # Model Management Settings
+    MODEL_CACHE_SIZE       : int = 3      # Number of models to keep in memory
+    MODEL_DOWNLOAD_TIMEOUT : int = 1800   # 30 minutes
+    USE_GPU                : bool = True  # Automatically detect and use GPU if available
 
-    # Ollama Settings
-    OLLAMA_BASE_URL    : str = "http://localhost:11434"
-    OLLAMA_MODEL       : str = "llama3:8b"
-    OLLAMA_TIMEOUT     : int = 120
-    OLLAMA_TEMPERATURE : float = 0.1
+    # External API Settings
+    OLLAMA_BASE_URL    : str = "http://localhost:11434"
+    OLLAMA_MODEL       : str = "llama3:8b"
+    OLLAMA_TIMEOUT     : int = 300
+    OLLAMA_TEMPERATURE : float = 0.1
 
-    # Analysis Settings
-    MIN_CONTRACT_LENGTH    : int = 300     # Minimum characters for valid contract
-    MAX_CONTRACT_LENGTH    : int = 500000  # Maximum characters (500KB text)
-    MAX_CLAUSES_TO_ANALYZE : int = 15
+    # External API Keys
+    OPENAI_API_KEY    : Optional[str] = None
+    ANTHROPIC_API_KEY : Optional[str] = None
 
-    # Logging
-    LOG_LEVEL  : str = "INFO"
-    LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    # Analysis Limits
+    MIN_CONTRACT_LENGTH    : int = 300     # Minimum characters for valid contract
+    MAX_CONTRACT_LENGTH    : int = 500000  # Maximum characters (500KB text)
+    MAX_CLAUSES_TO_ANALYZE : int = 15
+
+    # Logging Settings
+    LOG_LEVEL  : str = "INFO"
+    LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
     LOG_FILE   : Optional[Path] = Path("logs/app.log")
 
     # Cache Settings
-    ENABLE_CACHE : bool = True
-    CACHE_TTL    : int = 3600   # 1 hour
-    CACHE_DIR    : Path = Path("cache")
+    ENABLE_CACHE : bool = True
+    CACHE_TTL    : int = 3600   # 1 hour
+    CACHE_DIR    : Path = Path("cache")
 
-    # Rate Limiting
-    RATE_LIMIT_ENABLED  : bool = True
-    RATE_LIMIT_REQUESTS : int = 10
-    RATE_LIMIT_PERIOD   : int = 60   # seconds
+    # Rate Limiting Settings
+    RATE_LIMIT_ENABLED  : bool = True
+    RATE_LIMIT_REQUESTS : int = 10
+    RATE_LIMIT_PERIOD   : int = 60   # seconds
 
     # PDF Report Settings
-    PDF_FONT_SIZE : int = 10
-    PDF_MARGIN    : float = 0.5   # inches
+    PDF_FONT_SIZE : int = 10
+    PDF_MARGIN    : float = 0.5   # inches
+    PDF_PAGE_SIZE : str = "letter"
 
 
     class Config:
@@ -84,4 +89,4 @@ class Settings(BaseSettings):
 
 
 # Global settings instance
-settings = Settings()
+settings = Settings()
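Because Settings extends pydantic's BaseSettings, every field above can be overridden from the environment without touching the code. A small sketch, with the override values chosen arbitrarily:

    import os

    # Environment variables matching field names take precedence over defaults
    os.environ["OLLAMA_MODEL"] = "mistral:7b"
    os.environ["MAX_CLAUSES_TO_ANALYZE"] = "20"

    from config.settings import Settings

    s = Settings()
    print(s.OLLAMA_MODEL)            # mistral:7b
    print(s.MAX_CLAUSES_TO_ANALYZE)  # 20, parsed from the env string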
launch.py CHANGED
@@ -0,0 +1,145 @@
+"""
+Launch script for AI Contract Risk Analyzer
+Starts both API and frontend (if available)
+"""
+
+import subprocess
+import sys
+import time
+import requests
+from pathlib import Path
+
+def check_ollama():
+    """Check if Ollama is running"""
+    try:
+        response = requests.get("http://localhost:11434/api/tags", timeout=5)
+        if response.status_code == 200:
+            print("✓ Ollama is running")
+            return True
+    except:
+        pass
+
+    print("✗ Ollama not running. Start with: ollama serve")
+    return False
+
+def check_models():
+    """Check if required models are available"""
+    try:
+        response = requests.get("http://localhost:11434/api/tags", timeout=5)
+        models = response.json().get('models', [])
+        model_names = [m['name'] for m in models]
+
+        required = "llama3:8b"
+        if any(required in name for name in model_names):
+            print(f"✓ Model {required} available")
+            return True
+        else:
+            print(f"✗ Model {required} not found. Pull with: ollama pull llama3:8b")
+            return False
+    except:
+        return False
+
+def start_api():
+    """Start FastAPI server"""
+    print("\n" + "="*60)
+    print("Starting FastAPI Server...")
+    print("="*60)
+
+    subprocess.Popen([
+        sys.executable, "-m", "uvicorn",
+        "app:app",
+        "--host", "0.0.0.0",
+        "--port", "8000",
+        "--reload"
+    ])
+
+    # Wait for server to start
+    time.sleep(3)
+
+    try:
+        response = requests.get("http://localhost:8000/api/v1/health", timeout=5)
+        if response.status_code == 200:
+            print("✓ API Server running at: http://localhost:8000")
+            print("✓ Documentation at: http://localhost:8000/api/docs")
+            return True
+    except:
+        pass
+
+    print("✗ Failed to start API server")
+    return False
+
+def start_frontend():
+    """Start frontend server (if available)"""
+    if not Path("static/index.html").exists():
+        print("\n✗ Frontend not found at static/index.html")
+        return False
+
+    print("\n" + "="*60)
+    print("Starting Frontend Server...")
+    print("="*60)
+
+    subprocess.Popen([
+        sys.executable, "-m", "http.server", "3000",
+        "--directory", "static"
+    ])
+
+    time.sleep(2)
+
+    try:
+        response = requests.get("http://localhost:3000", timeout=5)
+        if response.status_code == 200:
+            print("✓ Frontend running at: http://localhost:3000")
+            return True
+    except:
+        pass
+
+    print("✗ Failed to start frontend server")
+    return False
+
+def main():
+    """Main launch function"""
+    print("="*60)
+    print("AI Contract Risk Analyzer - Launch Script")
+    print("="*60)
+
+    # Pre-flight checks
+    print("\nPre-flight checks:")
+    print("-"*60)
+
+    ollama_ok = check_ollama()
+    models_ok = check_models() if ollama_ok else False
+
+    if not ollama_ok:
+        print("\n⚠️ Warning: Ollama not running. Some features may not work.")
+        response = input("Continue anyway? (y/n): ")
+        if response.lower() != 'y':
+            return
+
+    # Start services
+    api_ok = start_api()
+
+    if not api_ok:
+        print("\n✗ Failed to start API. Exiting.")
+        return
+
+    frontend_ok = start_frontend()
+
+    # Summary
+    print("\n" + "="*60)
+    print("Launch Complete!")
+    print("="*60)
+    print(f"API Server: {'✓' if api_ok else '✗'} http://localhost:8000")
+    print(f"API Docs:   {'✓' if api_ok else '✗'} http://localhost:8000/api/docs")
+    print(f"Frontend:   {'✓' if frontend_ok else '✗'} http://localhost:3000")
+    print("\nPress Ctrl+C to stop all services")
+    print("="*60)
+
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("\n\nShutting down...")
+        sys.exit(0)
+
+if __name__ == "__main__":
+    main()
model_manager/llm_manager.py CHANGED
@@ -141,7 +141,7 @@ class LLMManager:
         Check if Ollama server is available
         """
         try:
-            response = requests.get(f"{self.ollama_base_url}/api/tags", timeout=5)
+            response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
             available = (response.status_code == 200)
 
             if available:
model_manager/model_cache.py CHANGED
@@ -27,6 +27,7 @@ class ModelCache:
     def __init__(self, cache_dir: Path, ttl_seconds: int = 3600):
         self.cache_dir = Path(cache_dir)
         self.cache_dir.mkdir(parents = True, exist_ok = True)
+
         self.ttl_seconds = ttl_seconds
         self.logger = ContractAnalyzerLogger.get_logger()
 
model_manager/model_loader.py CHANGED
@@ -58,7 +58,11 @@ class ModelLoader:
             return info.model, info.tokenizer
 
         # Mark as loading
-        self.registry.register(ModelType.LEGAL_BERT,ModelInfo(name = "legal-bert", type = ModelType.LEGAL_BERT, status = ModelStatus.LOADING))
+        self.registry.register(ModelType.LEGAL_BERT, ModelInfo(name = "legal-bert",
+                                                               type = ModelType.LEGAL_BERT,
+                                                               status = ModelStatus.LOADING,
+                                                               )
+                               )
 
         try:
             config = self.config.LEGAL_BERT
model_manager/model_registry.py CHANGED
@@ -54,6 +54,7 @@ class ModelInfo:
     last_accessed : Optional[datetime] = None
     metadata      : Dict[str, Any] = field(default_factory = dict)
 
+
     def mark_accessed(self):
         """
         Update access statistics
@@ -83,7 +84,7 @@ class ModelRegistry:
         if cls._instance is None:
             with cls._lock:
                 if cls._instance is None:
-                    cls._instance = super().__new__(cls)
+                    cls._instance = super().__new__(cls)
                     cls._instance._initialized = False
 
         return cls._instance
@@ -123,6 +124,7 @@ class ModelRegistry:
         """
         with self._model_lock:
             info = self._registry.get(model_type)
+
            if info:
                info.mark_accessed()
                log_info(f"Model accessed: {model_type.value}",
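The double-checked locking restored above makes ModelRegistry a process-wide singleton; a quick illustrative check (assuming the module imports cleanly on its own):

    from model_manager.model_registry import ModelRegistry

    a = ModelRegistry()
    b = ModelRegistry()
    assert a is b   # every caller shares one registry instance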
reporter/pdf_generator.py CHANGED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DEPENDENCIES
2
+ import os
3
+ from typing import Any
4
+ from io import BytesIO
5
+ from typing import Dict
6
+ from typing import List
7
+ from typing import Optional
8
+ from datetime import datetime
9
+ from reportlab.lib import colors
10
+ from reportlab.pdfgen import canvas
11
+ from reportlab.platypus import Image
12
+ from reportlab.platypus import Table
13
+ from reportlab.lib.units import inch
14
+ from reportlab.platypus import Spacer
15
+ from reportlab.lib.enums import TA_LEFT
16
+ from reportlab.platypus import Paragraph
17
+ from reportlab.platypus import PageBreak
18
+ from reportlab.graphics import renderPDF
19
+ from reportlab.platypus import TableStyle
20
+ from reportlab.lib.enums import TA_CENTER
21
+ from reportlab.lib.enums import TA_JUSTIFY
22
+ from reportlab.lib.pagesizes import letter
23
+ from reportlab.platypus import KeepTogether
24
+ from reportlab.graphics.shapes import Circle
25
+ from reportlab.graphics.shapes import String
26
+ from reportlab.graphics.shapes import Drawing
27
+ from reportlab.lib.styles import ParagraphStyle
28
+ from reportlab.platypus import SimpleDocTemplate
29
+ from reportlab.lib.styles import getSampleStyleSheet
30
+
31
+
32
+
33
+ class PDFReportGenerator:
34
+ """
35
+ Generate professional PDF reports matching the sample style
36
+ """
37
+ def __init__(self):
38
+ self.styles = getSampleStyleSheet()
39
+ self._setup_custom_styles()
40
+
41
+
42
+ def _setup_custom_styles(self):
43
+ """
44
+ Set up custom paragraph styles
45
+ """
46
+ # Title style
47
+ self.styles.add(ParagraphStyle(name = 'ReportTitle',
48
+ parent = self.styles['Heading1'],
49
+ fontSize = 24,
50
+ textColor = colors.HexColor('#1a1a1a'),
51
+ spaceAfter = 20,
52
+ alignment = TA_LEFT,
53
+ fontName = 'Helvetica-Bold',
54
+ )
55
+ )
56
+
57
+ # Section heading
58
+ self.styles.add(ParagraphStyle(name = 'SectionHeading',
59
+ parent = self.styles['Heading2'],
60
+ fontSize = 16,
61
+ textColor = colors.HexColor('#1a1a1a'),
62
+ spaceAfter = 12,
63
+ spaceBefore = 20,
64
+ fontName = 'Helvetica-Bold',
65
+ )
66
+ )
67
+
68
+ # Body text
69
+ self.styles.add(ParagraphStyle(
70
+ name='ReportBody',  # renamed: getSampleStyleSheet() already defines 'BodyText', and StyleSheet1.add() raises KeyError on duplicate names
71
+ parent=self.styles['Normal'],
72
+ fontSize=10,
73
+ leading=14,
74
+ textColor=colors.HexColor('#333333'),
75
+ alignment=TA_JUSTIFY,
76
+ fontName='Helvetica'
77
+ ))
78
+
79
+ # Bullet point
80
+ self.styles.add(ParagraphStyle(
81
+ name='BulletPoint',
82
+ parent=self.styles['Normal'],
83
+ fontSize=10,
84
+ leading=14,
85
+ textColor=colors.HexColor('#333333'),
86
+ leftIndent=20,
87
+ bulletIndent=10,
88
+ fontName='Helvetica'
89
+ ))
90
+
91
+ # Table header
92
+ self.styles.add(ParagraphStyle(
93
+ name='TableHeader',
94
+ parent=self.styles['Normal'],
95
+ fontSize=10,
96
+ textColor=colors.HexColor('#1a1a1a'),
97
+ fontName='Helvetica-Bold'
98
+ ))
99
+
100
+ # Footer
101
+ self.styles.add(ParagraphStyle(
102
+ name='Footer',
103
+ parent=self.styles['Normal'],
104
+ fontSize=8,
105
+ textColor=colors.HexColor('#666666'),
106
+ alignment=TA_CENTER,
107
+ fontName='Helvetica'
108
+ ))
109
+
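Note that `getSampleStyleSheet()` ships with predefined names such as 'Normal' and 'BodyText', and `StyleSheet1.add()` raises `KeyError` on a duplicate name (hence the 'ReportBody' rename above). A defensive sketch using the stylesheet's `byName` mapping (`_add_style` is a hypothetical helper, not part of this commit):

    def _add_style(self, style):
        # register only styles whose names the sample stylesheet does not already define
        if style.name not in self.styles.byName:
            self.styles.add(style)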
110
+ def _draw_risk_score_circle(self, score: int) -> Drawing:
111
+ """Draw the risk score circle graphic"""
112
+ d = Drawing(150, 150)
113
+
114
+ # Determine color based on score
115
+ if score >= 80:
116
+ color = colors.HexColor('#dc2626')
117
+ elif score >= 60:
118
+ color = colors.HexColor('#f97316')
119
+ elif score >= 40:
120
+ color = colors.HexColor('#ca8a04')
121
+ else:
122
+ color = colors.HexColor('#16a34a')
123
+
124
+ # Background circle
125
+ bg_circle = Circle(75, 75, 60)
126
+ bg_circle.fillColor = colors.HexColor('#f0f0f0')
127
+ bg_circle.strokeColor = None
128
+ d.add(bg_circle)
129
+
130
+ # Score circle
131
+ score_circle = Circle(75, 75, 55)
132
+ score_circle.fillColor = color
133
+ score_circle.strokeColor = None
134
+ d.add(score_circle)
135
+
136
+ # Inner white circle
137
+ inner_circle = Circle(75, 75, 45)
138
+ inner_circle.fillColor = colors.white
139
+ inner_circle.strokeColor = None
140
+ d.add(inner_circle)
141
+
142
+ # Score text
143
+ score_text = String(75, 70, str(score), textAnchor='middle')
144
+ score_text.fontSize = 36
145
+ score_text.fontName = 'Helvetica-Bold'
146
+ score_text.fillColor = color
147
+ d.add(score_text)
148
+
149
+ return d
150
+
151
+ def _get_risk_color(self, score: int) -> colors.Color:
152
+ """Get color based on risk score"""
153
+ if score >= 80:
154
+ return colors.HexColor('#dc2626')
155
+ elif score >= 60:
156
+ return colors.HexColor('#f97316')
157
+ elif score >= 40:
158
+ return colors.HexColor('#ca8a04')
159
+ else:
160
+ return colors.HexColor('#16a34a')
161
+
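The two color methods above share the same four bands; a worked mapping (illustrative scores only):

    # >= 80 -> red #dc2626, >= 60 -> orange #f97316, >= 40 -> yellow #ca8a04, else green #16a34a
    bands = [(80, 'red'), (60, 'orange'), (40, 'yellow'), (0, 'green')]
    def band(score):
        return next(label for floor, label in bands if score >= floor)
    print([band(s) for s in (85, 72, 45, 20)])  # ['red', 'orange', 'yellow', 'green']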
162
+ def _create_header_footer(self, canvas, doc):
163
+ """Add header and footer to each page"""
164
+ canvas.saveState()
165
+
166
+ # Header
167
+ canvas.setFont('Helvetica-Bold', 12)
168
+ canvas.drawString(0.75 * inch, letter[1] - 0.5 * inch,
169
+ "AI Contract Risk Analysis Report")
170
+
171
+ # Footer
172
+ canvas.setFont('Helvetica', 8)
173
+ canvas.setFillColor(colors.HexColor('#666666'))
174
+
175
+ # Page number
176
+ page_num = f"Page {doc.page}"  # total page count is not known during a single-pass build
177
+ canvas.drawString(7 * inch, 0.5 * inch, page_num)
178
+
179
+ # Legal disclaimer
180
+ disclaimer = "For informational purposes only. Not legal advice."
181
+ canvas.drawCentredString(letter[0] / 2, 0.5 * inch, disclaimer)
182
+
183
+ canvas.restoreState()
184
+
185
+ def generate_report(self, analysis_result: Dict[str, Any],
186
+ output_path: Optional[str] = None) -> BytesIO:
187
+ """
188
+ Generate PDF report from analysis results
189
+
190
+ Args:
191
+ analysis_result: Analysis result dictionary from the API
192
+ output_path: Optional file path to save PDF
193
+
194
+ Returns:
195
+ BytesIO buffer containing the PDF
196
+ """
197
+ # Create buffer
198
+ buffer = BytesIO()
199
+
200
+ # Create document
201
+ doc = SimpleDocTemplate(
202
+ buffer if not output_path else output_path,
203
+ pagesize=letter,
204
+ rightMargin=0.75*inch,
205
+ leftMargin=0.75*inch,
206
+ topMargin=1*inch,
207
+ bottomMargin=1*inch
208
+ )
209
+
210
+ # Build story
211
+ story = []
212
+
213
+ # Title and Risk Score (Page 1)
214
+ story.extend(self._build_page_1(analysis_result))
215
+ story.append(PageBreak())
216
+
217
+ # Negotiation Points (Page 2)
218
+ story.extend(self._build_page_2(analysis_result))
219
+ story.append(PageBreak())
220
+
221
+ # Risk Category Breakdown (Page 3)
222
+ story.extend(self._build_page_3(analysis_result))
223
+
224
+ # Clause-by-Clause Analysis (Page 4+)
225
+ story.append(PageBreak())
226
+ story.extend(self._build_clause_analysis(analysis_result))
227
+
228
+ # Build PDF
229
+ doc.build(story, onFirstPage=self._create_header_footer,
230
+ onLaterPages=self._create_header_footer)
231
+
232
+ # Rewind and return the in-memory buffer; when output_path is set the PDF goes to disk and the returned buffer stays empty
233
+ if not output_path:
234
+ buffer.seek(0)
235
+ return buffer
236
+
237
+ return buffer
238
+
239
+ def _build_page_1(self, result: Dict) -> List:
240
+ """Build page 1 content: Title, Risk Score, Executive Summary, Key Items"""
241
+ elements = []
242
+
243
+ # Title
244
+ elements.append(Paragraph("AI Contract Risk Analysis Report",
245
+ self.styles['ReportTitle']))
246
+ elements.append(Spacer(1, 0.1*inch))
247
+
248
+ # Risk Score Circle
249
+ risk_score = result['risk_analysis']['overall_score']
250
+ elements.append(self._draw_risk_score_circle(risk_score))
251
+ elements.append(Spacer(1, 0.2*inch))
252
+
253
+ # Executive Summary
254
+ elements.append(Paragraph("Executive Summary",
255
+ self.styles['SectionHeading']))
256
+ elements.append(Paragraph(result['executive_summary'],
257
+ self.styles['ReportBody']))
258
+ elements.append(Spacer(1, 0.2*inch))
259
+
260
+ # Unfavorable Terms
261
+ elements.append(Paragraph("Unfavorable Terms",
262
+ self.styles['SectionHeading']))
263
+
264
+ for term in result['unfavorable_terms'][:8]: # Limit to 8 items
265
+ bullet_text = f"<bullet>•</bullet> <b>{term.get('clause_reference', term['term'])}:</b> {term['explanation']}"
266
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
267
+ elements.append(Spacer(1, 0.05*inch))
268
+
269
+ elements.append(Spacer(1, 0.2*inch))
270
+
271
+ # Missing Protections
272
+ elements.append(Paragraph("Missing Protections",
273
+ self.styles['SectionHeading']))
274
+
275
+ for protection in result['missing_protections'][:6]: # Limit to 6 items
276
+ bullet_text = f"<bullet>•</bullet> <b>{protection['protection']}:</b> {protection['explanation']}"
277
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
278
+ elements.append(Spacer(1, 0.05*inch))
279
+
280
+ return elements
281
+
282
+ def _build_page_2(self, result: Dict) -> List:
283
+ """Build page 2 content: Negotiation Points"""
284
+ elements = []
285
+
286
+ elements.append(Paragraph("Negotiation Points",
287
+ self.styles['SectionHeading']))
288
+ elements.append(Spacer(1, 0.1*inch))
289
+
290
+ negotiation_points = result.get('negotiation_points', [])
291
+
292
+ if negotiation_points:
293
+ for point in negotiation_points[:7]: # Limit to 7 points
294
+ bullet_text = f"<bullet>•</bullet> {point['issue']}: {point['rationale']}"
295
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
296
+ elements.append(Spacer(1, 0.1*inch))
297
+ else:
298
+ # Fallback to unfavorable terms if negotiation points not available
299
+ for term in result['unfavorable_terms'][:7]:
300
+ if term.get('suggested_fix'):
301
+ bullet_text = f"<bullet>•</bullet> {term['term']}: {term['suggested_fix']}"
302
+ elements.append(Paragraph(bullet_text, self.styles['BulletPoint']))
303
+ elements.append(Spacer(1, 0.1*inch))
304
+
305
+ return elements
306
+
307
+ def _build_page_3(self, result: Dict) -> List:
308
+ """Build page 3 content: Risk Category Breakdown"""
309
+ elements = []
310
+
311
+ elements.append(Paragraph("Risk Category Breakdown",
312
+ self.styles['SectionHeading']))
313
+ elements.append(Spacer(1, 0.15*inch))
314
+
315
+ # Create table data
316
+ table_data = [
317
+ [
318
+ Paragraph('<b>Category</b>', self.styles['TableHeader']),
319
+ Paragraph('<b>Score</b>', self.styles['TableHeader']),
320
+ Paragraph('<b>Summary</b>', self.styles['TableHeader'])
321
+ ]
322
+ ]
323
+
324
+ risk_breakdown = result['risk_analysis'].get('risk_breakdown', [])
325
+
326
+ for category in risk_breakdown:
327
+ score_color = self._get_risk_color(category['score'])
328
+
329
+ category_cell = Paragraph(category['category'], self.styles['ReportBody'])
330
+ score_cell = Paragraph(
331
+ f'<font color="{score_color.hexval()}"><b>{category["score"]}</b></font>',
332
+ self.styles['TableHeader']
333
+ )
334
+ summary_cell = Paragraph(category['summary'], self.styles['ReportBody'])
335
+
336
+ table_data.append([category_cell, score_cell, summary_cell])
337
+
338
+ # Create table
339
+ table = Table(table_data, colWidths=[1.8*inch, 0.7*inch, 4*inch])
340
+ table.setStyle(TableStyle([
341
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
342
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
343
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
344
+ ('ALIGN', (1, 0), (1, -1), 'CENTER'),
345
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
346
+ ('FONTSIZE', (0, 0), (-1, -1), 10),
347
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
348
+ ('TOPPADDING', (0, 1), (-1, -1), 10),
349
+ ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
350
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
351
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
352
+ ]))
353
+
354
+ elements.append(table)
355
+
356
+ return elements
357
+
358
+ def _build_clause_analysis(self, result: Dict) -> List:
359
+ """Build clause-by-clause analysis section"""
360
+ elements = []
361
+
362
+ elements.append(Paragraph("Clause-by-Clause Analysis",
363
+ self.styles['SectionHeading']))
364
+ elements.append(Spacer(1, 0.15*inch))
365
+
366
+ # Create table data
367
+ table_data = [
368
+ [
369
+ Paragraph('<b>Clause</b>', self.styles['TableHeader']),
370
+ Paragraph('<b>Risk Level</b>', self.styles['TableHeader']),
371
+ Paragraph('<b>Analysis</b>', self.styles['TableHeader']),
372
+ Paragraph('<b>Recommendation</b>', self.styles['TableHeader'])
373
+ ]
374
+ ]
375
+
376
+ # Get unfavorable terms and interpretations
377
+ unfavorable_terms = result.get('unfavorable_terms', [])
378
+ interpretations = result.get('clause_interpretations', [])
379
+
380
+ # Combine and process
381
+ processed_clauses = []
382
+
383
+ for term in unfavorable_terms[:10]: # Limit to 10 clauses
384
+ clause_ref = term.get('clause_reference', term['term'])
385
+
386
+ # Find matching interpretation if available
387
+ analysis_text = term['explanation']
388
+ recommendation_text = term.get('suggested_fix', 'Negotiate or seek legal advice.')
389
+
390
+ # Determine risk level
391
+ severity = term.get('severity', 'high')
392
+ if severity == 'critical':
393
+ risk_level = 'Critical'
394
+ risk_color = colors.HexColor('#dc2626')
395
+ elif severity == 'high':
396
+ risk_level = 'High'
397
+ risk_color = colors.HexColor('#f97316')
398
+ else:
399
+ risk_level = 'Medium'
400
+ risk_color = colors.HexColor('#ca8a04')
401
+
402
+ clause_cell = Paragraph(clause_ref, self.styles['ReportBody'])
403
+ risk_cell = Paragraph(
404
+ f'<font color="{risk_color.hexval()}"><b>{risk_level}</b></font>',
405
+ self.styles['TableHeader']
406
+ )
407
+ analysis_cell = Paragraph(analysis_text, self.styles['ReportBody'])
408
+ recommendation_cell = Paragraph(recommendation_text, self.styles['ReportBody'])
409
+
410
+ table_data.append([clause_cell, risk_cell, analysis_cell, recommendation_cell])
411
+
412
+ # Create table
413
+ table = Table(table_data, colWidths=[1.5*inch, 0.8*inch, 2.2*inch, 2*inch])
414
+ table.setStyle(TableStyle([
415
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f5f5f5')),
416
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a1a1a')),
417
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
418
+ ('ALIGN', (1, 0), (1, -1), 'CENTER'),
419
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
420
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
421
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
422
+ ('TOPPADDING', (0, 1), (-1, -1), 10),
423
+ ('BOTTOMPADDING', (0, 1), (-1, -1), 10),
424
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e5e5e5')),
425
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
426
+ ]))
427
+
428
+ elements.append(table)
429
+
430
+ return elements
431
+
432
+
433
+ def generate_pdf_report(analysis_result: Dict[str, Any],
434
+ output_path: Optional[str] = None) -> BytesIO:
435
+ """
436
+ Convenience function to generate PDF report
437
+
438
+ Args:
439
+ analysis_result: Complete analysis result from the API
440
+ output_path: Optional file path to save PDF
441
+
442
+ Returns:
443
+ BytesIO buffer containing the PDF
444
+ """
445
+ generator = PDFReportGenerator()
446
+ return generator.generate_report(analysis_result, output_path)
447
+
448
+
449
+ if __name__ == "__main__":
450
+ # Test with sample data
451
+ sample_result = {
452
+ "analysis_id": "test-123",
453
+ "timestamp": datetime.now().isoformat(),
454
+ "risk_analysis": {
455
+ "overall_score": 85,
456
+ "risk_level": "CRITICAL",
457
+ "risk_breakdown": [
458
+ {
459
+ "category": "Restrictive Covenants",
460
+ "score": 95,
461
+ "summary": "The agreement contains exceptionally broad and long-lasting non-compete (24 months) and non-solicitation (5 years) clauses."
462
+ },
463
+ {
464
+ "category": "Penalties & Termination",
465
+ "score": 90,
466
+ "summary": "The contract includes severe penalties for breach, including forfeiture of earned salary."
467
+ }
468
+ ]
469
+ },
470
+ "executive_summary": "This employment agreement is heavily skewed in favor of the Employer, presenting a very high risk.",
471
+ "unfavorable_terms": [
472
+ {
473
+ "term": "Undefined Post-Probation Salary",
474
+ "clause_reference": "Clause 8.2",
475
+ "severity": "critical",
476
+ "explanation": "Post-probation salary is undefined ('as discussed').",
477
+ "suggested_fix": "Insist that the exact salary be explicitly stated."
478
+ }
479
+ ],
480
+ "missing_protections": [
481
+ {
482
+ "protection": "Defined Post-Probation Salary",
483
+ "importance": "critical",
484
+ "explanation": "The contract lacks a specific, written salary commitment."
485
+ }
486
+ ],
487
+ "negotiation_points": [
488
+ {
489
+ "issue": "Post-probation salary",
490
+ "rationale": "Must be explicitly defined in writing before signing."
491
+ }
492
+ ]
493
+ }
494
+
495
+ buffer = generate_pdf_report(sample_result, "test_report.pdf")
496
+ print("Test PDF generated successfully!")
requirements.txt CHANGED
@@ -1,15 +1,32 @@
1
  # LLM Providers
2
- openai>=1.0.0 # OpenAI API
3
- anthropic>=0.18.0 # Anthropic Claude API
 
4
  ollama  # Ollama Python client (pip package name is lowercase)
5
 
6
- requests # For Ollama
7
- transformers # For Legal-BERT
8
- sentence-transformers # For embeddings
9
- torch # PyTorch
10
- pypdf2
11
- python_docx
12
- PyMuPDF
13
- spacy
14
- pydantic-settings
15
- pydantic
 
1
+ # FastAPI & Server
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ python-multipart==0.0.6
5
+
6
+ # ML & NLP
7
+ transformers==4.35.2
8
+ torch==2.1.1
9
+ sentence-transformers==2.2.2
10
+ spacy
11
+
12
+ # Document Processing
13
+ PyPDF2==3.0.1
14
+ PyMuPDF==1.23.8
15
+ python-docx==1.1.0
16
+
17
+
18
  # LLM Providers
19
+ openai>=1.3.0
20
+ anthropic>=0.18.0
21
+ requests==2.31.0
22
  ollama  # Ollama Python client (pip package name is lowercase)
23
 
24
+ # Data & Validation
25
+ pydantic==2.5.0
26
+ pydantic-settings==2.1.0
27
+
28
+ # Utilities
29
+ python-dotenv==1.0.0
30
+
31
+ # PDF report generation
32
+ reportlab>=4.0.0
 
services/clause_extractor.py CHANGED
@@ -1,23 +1,26 @@
1
- """
2
- Advanced Clause Extractor using Legal-BERT + Structural Patterns
3
- Uses nlpaueb/legal-bert-base-uncased for semantic clause understanding
4
- """
5
-
6
- import torch
7
  import re
8
- from typing import List, Dict, Tuple, Optional, Any
9
- from dataclasses import dataclass, field
10
- from collections import defaultdict
11
  import numpy as np
12
  from sentence_transformers import util
13
 
14
  # Import utilities
15
- import sys
16
- from pathlib import Path
17
  sys.path.append(str(Path(__file__).parent.parent))
18
 
19
- from utils.logger import ContractAnalyzerLogger, log_info, log_error
 
20
  from utils.text_processor import TextProcessor
 
21
 
22
 
23
  @dataclass
@@ -25,37 +28,40 @@ class ExtractedClause:
25
  """
26
  Extracted clause with comprehensive metadata
27
  """
28
- text: str
29
- reference: str # e.g., "Section 5.2", "Clause 11.1"
30
- category: str # e.g., "termination", "compensation", "indemnification"
31
- confidence: float # 0.0-1.0
32
- start_pos: int
33
- end_pos: int
34
- extraction_method: str # "structural", "semantic", "hybrid"
35
- risk_indicators: List[str] = field(default_factory=list)
36
- embeddings: Optional[np.ndarray] = None
37
- subclauses: List[str] = field(default_factory=list)
38
- legal_bert_score: float = 0.0
 
39
 
40
  def to_dict(self) -> Dict[str, Any]:
41
- """Convert to dictionary for serialization"""
42
- return {
43
- "text": self.text,
44
- "reference": self.reference,
45
- "category": self.category,
46
- "confidence": round(self.confidence, 3),
47
- "start_pos": self.start_pos,
48
- "end_pos": self.end_pos,
49
- "extraction_method": self.extraction_method,
50
- "risk_indicators": self.risk_indicators,
51
- "subclauses": self.subclauses,
52
- "legal_bert_score": round(self.legal_bert_score, 3)
53
- }
 
 
54
 
55
 
56
  class ClauseExtractor:
57
  """
58
- Advanced clause extraction using Legal-BERT + structural patterns
59
 
60
  Process:
61
  1. Structural extraction (numbered sections like "5.2", "Article III")
@@ -64,222 +70,123 @@ class ClauseExtractor:
64
  4. Category classification using Legal-BERT + keyword matching
65
  5. Deduplication and ranking
66
  """
67
-
68
- # =========================================================================
69
  # CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
70
- # =========================================================================
 
 
71
 
72
- CLAUSE_CATEGORIES = {
73
- 'compensation': {
74
- 'keywords': ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus',
75
- 'commission', 'remuneration', 'fee', 'rate', 'benefits'],
76
- 'representative_text': (
77
- "The Employee shall receive an annual base salary of One Hundred Thousand Dollars "
78
- "payable in accordance with the Company's standard payroll practices. "
79
- "Additional compensation may include performance bonuses and stock options."
80
- ),
81
- 'weight': 1.0
82
- },
83
- 'termination': {
84
- 'keywords': ['termination', 'terminate', 'notice period', 'resignation',
85
- 'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
86
- 'representative_text': (
87
- "Either party may terminate this Agreement upon thirty days written notice. "
88
- "The Company may terminate for cause immediately upon written notice to Employee. "
89
- "Upon termination, Employee shall receive severance compensation."
90
- ),
91
- 'weight': 1.2
92
- },
93
- 'non_compete': {
94
- 'keywords': ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant',
95
- 'competitive', 'competition', 'competing business', 'competitive activities'],
96
- 'representative_text': (
97
- "Employee agrees not to engage in any competitive business activities for a period "
98
- "of twelve months following termination within a fifty-mile radius. "
99
- "Employee shall not solicit Company clients or employees during this period."
100
- ),
101
- 'weight': 1.5
102
- },
103
- 'confidentiality': {
104
- 'keywords': ['confidential', 'proprietary', 'trade secret', 'disclosure',
105
- 'confidentiality', 'secret', 'private', 'non-disclosure'],
106
- 'representative_text': (
107
- "Employee shall maintain the confidentiality of all proprietary information "
108
- "and trade secrets of the Company. Confidential Information includes business plans, "
109
- "customer lists, and technical data. These obligations survive termination."
110
- ),
111
- 'weight': 1.1
112
- },
113
- 'indemnification': {
114
- 'keywords': ['indemnify', 'indemnification', 'hold harmless', 'defend',
115
- 'liability', 'claims', 'losses', 'damages'],
116
- 'representative_text': (
117
- "Party A shall indemnify and hold harmless Party B from any claims, losses, "
118
- "or damages arising from Party A's breach or negligence. This indemnification "
119
- "includes reasonable attorneys' fees and costs of defense."
120
- ),
121
- 'weight': 1.3
122
- },
123
- 'intellectual_property': {
124
- 'keywords': ['intellectual property', 'ip', 'copyright', 'patent', 'trademark',
125
- 'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
126
- 'representative_text': (
127
- "All work product and inventions created by Employee during employment shall be "
128
- "the exclusive property of the Company. Employee assigns all intellectual property "
129
- "rights including patents, copyrights, and trade secrets to the Company."
130
- ),
131
- 'weight': 1.2
132
- },
133
- 'liability': {
134
- 'keywords': ['liable', 'liability', 'damages', 'limitation', 'consequential',
135
- 'indirect', 'punitive', 'cap', 'limited liability'],
136
- 'representative_text': (
137
- "In no event shall either party be liable for indirect, incidental, or consequential "
138
- "damages. Total liability under this Agreement shall not exceed the amounts paid "
139
- "in the twelve months preceding the claim."
140
- ),
141
- 'weight': 1.2
142
- },
143
- 'warranty': {
144
- 'keywords': ['warranty', 'warrant', 'representation', 'guarantee',
145
- 'assurance', 'promise', 'warranties'],
146
- 'representative_text': (
147
- "Company warrants that the Services will be performed in a professional manner. "
148
- "EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, "
149
- "INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."
150
- ),
151
- 'weight': 0.9
152
- },
153
- 'dispute_resolution': {
154
- 'keywords': ['arbitration', 'mediation', 'dispute', 'jurisdiction',
155
- 'governing law', 'venue', 'forum', 'resolution'],
156
- 'representative_text': (
157
- "Any disputes arising under this Agreement shall be resolved through binding arbitration "
158
- "in accordance with the rules of the American Arbitration Association. "
159
- "This Agreement shall be governed by the laws of the State of California."
160
- ),
161
- 'weight': 0.9
162
- },
163
- 'insurance': {
164
- 'keywords': ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
165
- 'representative_text': (
166
- "Contractor shall maintain general liability insurance with minimum coverage of "
167
- "one million dollars per occurrence. Proof of insurance shall be provided to Client. "
168
- "Company shall be named as additional insured on all policies."
169
- ),
170
- 'weight': 0.8
171
- },
172
- 'assignment': {
173
- 'keywords': ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
174
- 'representative_text': (
175
- "This Agreement may not be assigned by either party without the prior written consent "
176
- "of the other party. This Agreement shall be binding upon and inure to the benefit "
177
- "of the parties' successors and permitted assigns."
178
- ),
179
- 'weight': 0.8
180
- },
181
- 'amendment': {
182
- 'keywords': ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
183
- 'representative_text': (
184
- "This Agreement may not be amended or modified except by written instrument signed "
185
- "by both parties. No waiver of any provision shall be effective unless in writing. "
186
- "All modifications must be mutually agreed upon."
187
- ),
188
- 'weight': 0.7
189
- },
190
- 'force_majeure': {
191
- 'keywords': ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
192
- 'representative_text': (
193
- "Neither party shall be liable for failure to perform due to causes beyond its reasonable "
194
- "control including acts of God, war, strikes, or natural disasters. "
195
- "Performance shall be suspended during the force majeure event."
196
- ),
197
- 'weight': 0.7
198
- },
199
- 'entire_agreement': {
200
- 'keywords': ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
201
- 'representative_text': (
202
- "This Agreement constitutes the entire agreement between the parties and supersedes "
203
- "all prior agreements, whether written or oral. No other representations or warranties "
204
- "shall be binding unless incorporated herein."
205
- ),
206
- 'weight': 0.6
207
- },
208
- 'general': {
209
- 'keywords': ['provision', 'term', 'condition', 'obligation', 'requirement'],
210
- 'representative_text': (
211
- "The parties agree to the following terms and conditions governing their relationship. "
212
- "Each party shall perform its obligations in good faith and in accordance with "
213
- "industry standards and applicable law."
214
- ),
215
- 'weight': 0.5
216
- }
217
- }
218
-
219
- # =========================================================================
220
  # RISK INDICATOR PATTERNS
221
- # =========================================================================
222
-
223
- RISK_INDICATORS = {
224
- 'critical': [
225
- 'unlimited liability', 'perpetual', 'irrevocable', 'forfeit',
226
- 'liquidated damages', 'wage withholding', 'joint and several'
227
- ],
228
- 'high': [
229
- 'non-compete', 'non-solicit', 'penalty', 'without cause',
230
- 'sole discretion', 'immediate termination', 'at-will'
231
- ],
232
- 'medium': [
233
- 'indemnify', 'hold harmless', 'confidential', 'proprietary',
234
- 'exclusive', 'terminate', 'default', 'breach'
235
- ]
236
- }
237
 
238
- # =========================================================================
239
  # INITIALIZATION
240
- # =========================================================================
241
-
242
- def __init__(self, model_loader, contract_category: Optional[str] = None):
243
  """
244
  Initialize clause extractor with Legal-BERT
245
 
246
- Args:
247
- model_loader: ModelLoader instance for accessing Legal-BERT
248
- contract_category: Optional contract category for context-aware extraction
 
 
249
  """
250
- self.model_loader = model_loader
251
- self.contract_category = contract_category
252
 
253
  # Models (lazy loaded)
254
- self.legal_bert_model = None
255
  self.legal_bert_tokenizer = None
256
- self.embedding_model = None
257
- self.device = None
258
 
259
  # Category embeddings (computed from representative texts)
260
- self.category_embeddings = {}
261
 
262
  # Text processor
263
- self.text_processor = TextProcessor(use_spacy=False)
264
 
265
  # Logger
266
- self.logger = ContractAnalyzerLogger.get_logger()
267
 
268
  # Lazy load
269
  self._lazy_load()
270
 
 
271
  def _lazy_load(self):
272
- """Lazy load Legal-BERT and embedding models"""
 
 
273
  if self.legal_bert_model is None:
274
  try:
275
  log_info("Loading Legal-BERT for clause extraction...")
276
 
277
  # Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
278
  self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
279
- self.device = self.model_loader.device
280
 
281
  # Load sentence transformer for embeddings
282
- self.embedding_model = self.model_loader.load_embedding_model()
283
 
284
  # Prepare category embeddings using Legal-BERT
285
  self._prepare_category_embeddings()
@@ -287,59 +194,61 @@ class ClauseExtractor:
287
  log_info("Clause extractor models loaded successfully")
288
 
289
  except Exception as e:
290
- log_error(e, context={"component": "ClauseExtractor", "operation": "model_loading"})
291
  raise
 
292
 
293
  def _prepare_category_embeddings(self):
294
  """
295
  Pre-compute Legal-BERT embeddings for category representative texts
 
296
  This enables semantic similarity matching for clause classification
297
  """
298
  log_info("Computing Legal-BERT embeddings for clause categories...")
299
 
300
  for category, config in self.CLAUSE_CATEGORIES.items():
301
- representative_text = config['representative_text']
302
 
303
  # Get Legal-BERT embedding (using [CLS] token)
304
- embedding = self._get_legal_bert_embedding(representative_text)
 
305
  self.category_embeddings[category] = embedding
306
 
307
  log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
308
 
 
309
  def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
310
  """
311
  Get Legal-BERT embedding for text using [CLS] token
312
 
313
- Args:
314
- text: Input text
 
315
 
316
  Returns:
317
- Embedding vector as numpy array
 
318
  """
319
  # Tokenize
320
- inputs = self.legal_bert_tokenizer(
321
- text,
322
- return_tensors="pt",
323
- padding=True,
324
- truncation=True,
325
- max_length=512
326
- ).to(self.device)
327
 
328
  # Get embeddings
329
  with torch.no_grad():
330
- outputs = self.legal_bert_model(**inputs)
331
  # Use [CLS] token embedding (first token)
332
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
333
 
334
  return cls_embedding
335
 
336
- # =========================================================================
337
- # MAIN EXTRACTION METHOD
338
- # =========================================================================
339
 
 
340
  @ContractAnalyzerLogger.log_execution_time("extract_clauses")
341
- def extract_clauses(self, contract_text: str,
342
- max_clauses: int = 15) -> List[ExtractedClause]:
343
  """
344
  Extract and classify clauses from contract using hybrid approach
345
 
@@ -349,44 +258,45 @@ class ClauseExtractor:
349
  3. Legal-BERT classification
350
  4. Deduplicate and rank by confidence
351
 
352
- Args:
353
- contract_text: Full contract text
354
- max_clauses: Maximum number of clauses to return
 
 
355
 
356
  Returns:
357
- List of ExtractedClause objects sorted by confidence
 
358
  """
359
 
360
  log_info("Starting clause extraction",
361
- text_length=len(contract_text),
362
- contract_category=self.contract_category,
363
- max_clauses=max_clauses)
 
364
 
365
- # Step 1: Extract using structural patterns
366
  structural_clauses = self._extract_structural_clauses(contract_text)
367
  log_info(f"Extracted {len(structural_clauses)} structural clauses")
368
 
369
- # Step 2: Semantic chunking for unstructured parts
370
- semantic_chunks = self._semantic_chunking(contract_text, structural_clauses)
371
  log_info(f"Created {len(semantic_chunks)} semantic chunks")
372
 
373
- # Step 3: Combine all candidates
374
- all_candidates = structural_clauses + semantic_chunks
375
  log_info(f"Total candidates: {len(all_candidates)}")
376
 
377
- # Step 4: Classify with Legal-BERT
378
  classified_clauses = self._classify_clauses_with_legal_bert(all_candidates)
379
  log_info(f"Classified {len(classified_clauses)} clauses")
380
 
381
- # Step 5: Deduplicate and rank
382
- final_clauses = self._deduplicate_and_rank(classified_clauses, max_clauses)
383
  log_info(f"Final output: {len(final_clauses)} clauses")
384
 
385
  return final_clauses
386
 
387
- # =========================================================================
388
- # STEP 1: STRUCTURAL EXTRACTION
389
- # =========================================================================
390
 
391
  def _extract_structural_clauses(self, text: str) -> List[Dict]:
392
  """
@@ -398,27 +308,22 @@ class ClauseExtractor:
398
  - "Article III. Text"
399
  - "Clause 11. Text"
400
  """
401
- candidates = []
402
 
403
  # Clean text
404
- text = re.sub(r'\s+', ' ', text)
405
 
406
  # Patterns for legal numbering
407
- patterns = [
408
- # Match: "1.1. Text" or "1.1 Text"
409
- (r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=\d+\.\d+(?:\.\d+)*\.|$)', 'numbered'),
410
- # Match: "Article 1.1. Text" or "Article III. Text"
411
- (r'(Article\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+))\.\s*([^\n]{30,800}?)(?=Article\s+(?:\d+|[IVXLCDM]+)|$)', 'article'),
412
- # Match: "Section 1.1. Text"
413
- (r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
414
- # Match: "Clause 1.1. Text"
415
- (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
416
- # Match: "(a) Text", "(i) Text" - sub-clauses
417
- (r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\([a-z]|[ivxlcdm]+\)|\n\n|$)', 'subclause')
418
- ]
419
 
420
  for pattern, ref_type in patterns:
421
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
 
422
  for match in matches:
423
  clause_text = match.group(2).strip()
424
 
@@ -426,69 +331,105 @@ class ClauseExtractor:
426
  if not self._is_boilerplate(clause_text):
427
  # Check for meaningful content
428
  if self._has_meaningful_content(clause_text):
429
- candidates.append({
430
- 'text': clause_text,
431
- 'reference': match.group(1).strip(),
432
- 'start': match.start(),
433
- 'end': match.end(),
434
- 'type': 'structural',
435
- 'ref_type': ref_type
436
- })
437
 
438
  # Remove overlapping clauses
439
  candidates = self._remove_overlapping(candidates)
440
 
441
  return candidates
 
442
 
443
  def _is_boilerplate(self, text: str) -> bool:
444
- """Check if text is boilerplate/definitional rather than substantive"""
445
- boilerplate_indicators = [
446
- 'shall mean', 'means and includes', 'defined as', 'definition of',
447
- 'hereinafter referred to', 'for purposes of this', 'interpretation of',
448
- 'as used in this', 'the term', 'shall include', 'includes but not limited'
449
- ]
450
-
451
- text_lower = text.lower()
 
 
452
  # Must have at least one strong indicator AND be definition-heavy
453
- has_indicator = any(indicator in text_lower for indicator in boilerplate_indicators)
454
- is_short_definition = len(text.split()) < 50 and '"' in text
455
 
456
  return has_indicator or is_short_definition
457
 
 
458
  def _has_meaningful_content(self, text: str) -> bool:
459
- """Check if text has meaningful legal content"""
 
 
460
  # Must have minimum length
461
- if len(text.split()) < 15:
462
  return False
463
 
464
  # Check for legal action verbs
465
- action_verbs = [
466
- 'shall', 'must', 'will', 'may', 'agrees', 'undertakes',
467
- 'covenants', 'warrants', 'represents', 'acknowledges',
468
- 'certifies', 'indemnifies', 'waives', 'terminates'
469
- ]
470
-
471
- text_lower = text.lower()
472
- has_action = any(verb in text_lower for verb in action_verbs)
 
 
473
 
474
  # Check for legal subjects
475
- legal_subjects = [
476
- 'party', 'parties', 'employee', 'employer', 'company',
477
- 'contractor', 'consultant', 'client', 'vendor', 'buyer',
478
- 'seller', 'landlord', 'tenant', 'licensor', 'licensee'
479
- ]
480
-
481
- has_subject = any(subj in text_lower for subj in legal_subjects)
 
 
482
 
483
  return has_action or has_subject
484
 
 
485
  def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
486
- """Remove overlapping clause extractions"""
 
 
487
  if not candidates:
488
  return []
489
 
490
  # Sort by start position
491
- candidates.sort(key=lambda x: x['start'])
492
 
493
  non_overlapping = [candidates[0]]
494
 
@@ -496,41 +437,35 @@ class ClauseExtractor:
496
  last = non_overlapping[-1]
497
 
498
  # Check if overlaps
499
- if candidate['start'] >= last['end']:
500
  non_overlapping.append(candidate)
501
- elif len(candidate['text']) > len(last['text']):
 
502
  # Keep longer clause if overlapping
503
  non_overlapping[-1] = candidate
504
 
505
  return non_overlapping
506
 
507
- # =========================================================================
508
- # STEP 2: SEMANTIC CHUNKING
509
- # =========================================================================
510
 
511
- def _semantic_chunking(self, text: str,
512
- structural_clauses: List[Dict],
513
- chunk_size: int = 200) -> List[Dict]:
514
  """
515
- Chunk unstructured text semantically
516
- Uses sentence boundaries to find natural clause boundaries
517
  """
518
-
519
  # Get covered ranges from structural clauses
520
  covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
521
 
522
  # Split into sentences
523
- sentences = self.text_processor.extract_sentences(text)
524
 
525
- chunks = []
526
- current_chunk = []
527
  current_length = 0
528
- current_start = 0
529
 
530
  for sentence in sentences:
531
  # Check if sentence is already covered by structural extraction
532
  sentence_start = text.find(sentence, current_start)
533
- if sentence_start == -1:
534
  continue
535
 
536
  if self._is_in_range(sentence_start, covered_ranges):
@@ -541,21 +476,20 @@ class ClauseExtractor:
541
  current_length += len(sentence.split())
542
 
543
  # Create chunk when reaching size limit
544
- if current_length >= chunk_size:
545
  chunk_text = ' '.join(current_chunk).strip()
546
 
547
- if len(chunk_text) >= 50 and not self._is_boilerplate(chunk_text):
548
  if self._has_meaningful_content(chunk_text):
549
- chunks.append({
550
- 'text': chunk_text,
551
- 'reference': f'Semantic-{len(chunks)+1}',
552
- 'start': sentence_start,
553
- 'end': sentence_start + len(chunk_text),
554
- 'type': 'semantic',
555
- 'ref_type': 'semantic'
556
- })
557
 
558
- current_chunk = []
559
  current_length = 0
560
 
561
  current_start = sentence_start + len(sentence)
@@ -563,145 +497,145 @@ class ClauseExtractor:
563
  # Add final chunk if exists
564
  if current_chunk:
565
  chunk_text = ' '.join(current_chunk).strip()
566
- if len(chunk_text) >= 50 and not self._is_boilerplate(chunk_text):
 
567
  if self._has_meaningful_content(chunk_text):
568
  sentence_start = text.find(current_chunk[0])
569
- chunks.append({
570
- 'text': chunk_text,
571
- 'reference': f'Semantic-{len(chunks)+1}',
572
- 'start': sentence_start,
573
- 'end': sentence_start + len(chunk_text),
574
- 'type': 'semantic',
575
- 'ref_type': 'semantic'
576
- })
577
 
578
  return chunks
579
 
 
580
  def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
581
- """Check if position is within any of the ranges"""
 
 
582
  return any(start <= position <= end for start, end in ranges)
583
 
584
- # =========================================================================
585
- # STEP 3: LEGAL-BERT CLASSIFICATION
586
- # =========================================================================
587
 
588
  def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
589
  """
590
  Classify clauses using Legal-BERT embeddings + keyword matching
591
  """
592
- classified = []
593
 
594
  for candidate in candidates:
595
  # Get Legal-BERT embedding for clause
596
- clause_embedding = self._get_legal_bert_embedding(candidate['text'])
597
 
598
  # Classify using hybrid approach
599
- category, confidence, legal_bert_score = self._classify_single_clause(
600
- candidate['text'],
601
- clause_embedding
602
- )
603
 
604
  # Extract risk indicators
605
- risk_indicators = self._extract_risk_indicators(candidate['text'])
606
 
607
  # Extract sub-clauses if any
608
- subclauses = self._extract_subclauses(candidate['text'])
609
 
610
- classified.append(ExtractedClause(
611
- text=candidate['text'],
612
- reference=candidate['reference'],
613
- category=category,
614
- confidence=confidence,
615
- start_pos=candidate['start'],
616
- end_pos=candidate['end'],
617
- extraction_method=candidate['type'],
618
- risk_indicators=risk_indicators,
619
- embeddings=clause_embedding,
620
- subclauses=subclauses,
621
- legal_bert_score=legal_bert_score
622
- ))
623
 
624
  return classified
625
 
626
- def _classify_single_clause(self, text: str,
627
- clause_embedding: np.ndarray) -> Tuple[str, float, float]:
628
  """
629
  Classify single clause using Legal-BERT + keyword matching
630
 
631
  Returns:
632
- (category, confidence, legal_bert_score)
 
633
  """
634
- text_lower = text.lower()
635
 
636
- # Method 1: Keyword matching
637
- keyword_scores = {}
 
638
  for category, config in self.CLAUSE_CATEGORIES.items():
639
- keywords = config['keywords']
640
- weight = config['weight']
641
 
642
- keyword_count = sum(1 for kw in keywords if kw in text_lower)
643
  keyword_scores[category] = (keyword_count / len(keywords)) * weight
644
 
645
- # Method 2: Legal-BERT semantic similarity
646
- semantic_scores = {}
647
  clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
648
 
649
  for category, cat_embedding in self.category_embeddings.items():
650
- cat_embedding_tensor = torch.tensor(cat_embedding).unsqueeze(0)
651
- similarity = torch.nn.functional.cosine_similarity(
652
- clause_embedding_tensor,
653
- cat_embedding_tensor
654
- ).item()
655
  semantic_scores[category] = similarity
656
 
657
  # Combine scores (70% semantic, 30% keyword)
658
- combined_scores = {}
 
659
  for category in self.CLAUSE_CATEGORIES.keys():
660
- combined = (
661
- semantic_scores.get(category, 0) * 0.70 +
662
- keyword_scores.get(category, 0) * 0.30
663
- )
664
  combined_scores[category] = combined
665
 
666
  # Get best category
667
- best_category = max(combined_scores, key=combined_scores.get)
668
- confidence = combined_scores[best_category]
669
  legal_bert_score = semantic_scores[best_category]
670
 
671
  return best_category, confidence, legal_bert_score
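A worked example of the 70/30 blend (illustrative numbers):

    semantic, keyword = 0.62, 0.40
    combined = semantic * 0.70 + keyword * 0.30   # 0.434 + 0.120 = 0.554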
672
 
 
673
  def _extract_risk_indicators(self, text: str) -> List[str]:
674
- """Extract risk indicator keywords from clause text"""
675
- text_lower = text.lower()
676
- found_indicators = []
 
 
677
 
678
  for severity, indicators in self.RISK_INDICATORS.items():
679
  for indicator in indicators:
680
  if indicator in text_lower:
681
  found_indicators.append(indicator)
682
 
683
- return found_indicators[:5] # Top 5 risk indicators
 
684
 
 
685
  def _extract_subclauses(self, text: str) -> List[str]:
686
- """Extract sub-clauses from main clause (e.g., (a), (b), (i), (ii))"""
 
 
687
  # Pattern for sub-clauses: (a), (i), etc.
688
  subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\([a-z]|[ivxlcdm]+\)|$)'
689
- matches = re.findall(subclause_pattern, text, re.IGNORECASE)
690
 
691
- subclauses = []
 
692
  for ref, subtext in matches:
693
  clean_text = subtext.strip()
694
- if len(clean_text) >= 20:
 
695
  subclauses.append(f"({ref}) {clean_text}")
696
 
697
- return subclauses[:5] # Max 5 sub-clauses
 
698
 
699
- # =========================================================================
700
- # STEP 4: DEDUPLICATION AND RANKING
701
- # =========================================================================
702
 
703
- def _deduplicate_and_rank(self, clauses: List[ExtractedClause],
704
- max_clauses: int) -> List[ExtractedClause]:
705
  """
706
  Remove duplicates and rank by confidence + legal_bert_score
707
  """
@@ -709,24 +643,22 @@ class ClauseExtractor:
709
  return []
710
 
711
  # Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
712
- clauses.sort(
713
- key=lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4),
714
- reverse=True
715
- )
716
 
717
  # Deduplicate by text similarity
718
- unique_clauses = []
719
- seen_texts = set()
720
 
721
  for clause in clauses:
722
  # Simple deduplication by first 100 chars
723
- text_key = clause.text[:100].lower().strip()
724
 
725
  # Also check similarity to already added clauses
726
  is_duplicate = False
 
727
  for existing in unique_clauses:
728
  similarity = self._text_similarity(clause.text, existing.text)
729
- if similarity > 0.85:
730
  is_duplicate = True
731
  break
732
 
@@ -734,28 +666,31 @@ class ClauseExtractor:
734
  unique_clauses.append(clause)
735
  seen_texts.add(text_key)
736
 
737
- if len(unique_clauses) >= max_clauses:
738
  break
739
 
740
  return unique_clauses
741
 
 
742
  def _text_similarity(self, text1: str, text2: str) -> float:
743
- """Calculate text similarity (simple Jaccard similarity)"""
744
- words1 = set(text1.lower().split())
745
- words2 = set(text2.lower().split())
 
 
746
 
747
  intersection = len(words1 & words2)
748
- union = len(words1 | words2)
749
 
750
  return intersection / union if union > 0 else 0.0
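A worked example of this Jaccard similarity (hypothetical strings):

    w1 = set("employee shall not compete".split())
    w2 = set("employee shall not solicit".split())
    print(len(w1 & w2) / len(w1 | w2))   # intersection 3, union 5 -> 0.6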
751
 
752
- # =========================================================================
753
- # UTILITY METHODS
754
- # =========================================================================
755
 
756
  def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
757
- """Get distribution of clause categories"""
 
 
758
  distribution = defaultdict(int)
 
759
  for clause in clauses:
760
  distribution[clause.category] += 1
761
 
@@ -763,9 +698,15 @@ class ClauseExtractor:
763
 
764
  return dict(distribution)
765
 
 
766
  def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
767
- """Get clauses with risk indicators"""
 
 
768
  risky = [c for c in clauses if c.risk_indicators]
769
- risky.sort(key=lambda x: len(x.risk_indicators), reverse=True)
 
770
 
771
- log
 
 
 
1
+ # DEPENDENCIES
2
  import re
3
+ import sys
4
+ import torch
 
5
  import numpy as np
6
+ from typing import Any
7
+ from typing import List
8
+ from typing import Dict
9
+ from typing import Tuple
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ from dataclasses import field
13
+ from dataclasses import dataclass
14
+ from collections import defaultdict
15
  from sentence_transformers import util
16
 
17
  # Import utilities
 
 
18
  sys.path.append(str(Path(__file__).parent.parent))
19
 
20
+ from utils.logger import log_info
21
+ from utils.logger import log_error
22
  from utils.text_processor import TextProcessor
23
+ from utils.logger import ContractAnalyzerLogger
+ from model_manager.model_loader import ModelLoader  # needed for the ModelLoader type hint in __init__ below
24
 
25
 
26
  @dataclass
 
28
  """
29
  Extracted clause with comprehensive metadata
30
  """
31
+ text : str
32
+ reference : str # e.g., "Section 5.2", "Clause 11.1"
33
+ category : str # e.g., "termination", "compensation", "indemnification"
34
+ confidence : float # 0.0-1.0
35
+ start_pos : int
36
+ end_pos : int
37
+ extraction_method : str # "structural", "semantic", "hybrid"
38
+ risk_indicators : List[str] = field(default_factory = list)
39
+ embeddings : Optional[np.ndarray] = None
40
+ subclauses : List[str] = field(default_factory = list)
41
+ legal_bert_score : float = 0.0
42
+
43
 
44
  def to_dict(self) -> Dict[str, Any]:
45
+ """
46
+ Convert to dictionary for serialization
47
+ """
48
+ return {"text" : self.text,
49
+ "reference" : self.reference,
50
+ "category" : self.category,
51
+ "confidence" : round(self.confidence, 3),
52
+ "start_pos" : self.start_pos,
53
+ "end_pos" : self.end_pos,
54
+ "extraction_method" : self.extraction_method,
55
+ "risk_indicators" : self.risk_indicators,
56
+ "subclauses" : self.subclauses,
57
+ "legal_bert_score" : round(self.legal_bert_score, 3),
58
+ }
59
+
60
 
61
 
62
  class ClauseExtractor:
63
  """
64
+ Clause extraction using Legal-BERT + structural patterns
65
 
66
  Process:
67
  1. Structural extraction (numbered sections like "5.2", "Article III")
 
70
  4. Category classification using Legal-BERT + keyword matching
71
  5. Deduplication and ranking
72
  """
 
 
73
  # CLAUSE CATEGORY DEFINITIONS WITH REPRESENTATIVE TEXTS
74
+ CLAUSE_CATEGORIES = {'compensation' : {'keywords' : ['salary', 'wage', 'compensation', 'pay', 'payment', 'bonus', 'commission', 'remuneration', 'fee', 'rate', 'benefits'],
75
+ 'representative_text' : ("The Employee shall receive an annual base salary of One Hundred Thousand Dollars payable in accordance with the Company's standard payroll practices. Additional compensation may include performance bonuses and stock options."),
76
+ 'weight' : 1.0,
77
+ },
78
+ 'termination' : {'keywords' : ['termination', 'terminate', 'notice period', 'resignation', 'dismissal', 'severance', 'end of employment', 'cessation', 'notice'],
79
+ 'representative_text' : ("Either party may terminate this Agreement upon thirty days written notice. The Company may terminate for cause immediately upon written notice to Employee. Upon termination, Employee shall receive severance compensation."),
80
+ 'weight' : 1.2,
81
+ },
82
+ 'non_compete' : {'keywords' : ['non-compete', 'non-solicit', 'non-solicitation', 'restrictive covenant', 'competitive', 'competition', 'competing business', 'competitive activities'],
83
+ 'representative_text' : ("Employee agrees not to engage in any competitive business activities for a period of twelve months following termination within a fifty-mile radius. Employee shall not solicit Company clients or employees during this period."),
84
+ 'weight' : 1.5,
85
+ },
86
+ 'confidentiality' : {'keywords' : ['confidential', 'proprietary', 'trade secret', 'disclosure', 'confidentiality', 'secret', 'private', 'non-disclosure'],
87
+ 'representative_text' : ("Employee shall maintain the confidentiality of all proprietary information and trade secrets of the Company. Confidential Information includes business plans, customer lists, and technical data. These obligations survive termination."),
88
+ 'weight' : 1.1,
89
+ },
90
+ 'indemnification' : {'keywords' : ['indemnify', 'indemnification', 'hold harmless', 'defend', 'liability', 'claims', 'losses', 'damages'],
91
+ 'representative_text' : ("Party A shall indemnify and hold harmless Party B from any claims, losses, or damages arising from Party A's breach or negligence. This indemnification includes reasonable attorneys' fees and costs of defense."),
92
+ 'weight' : 1.3,
93
+ },
94
+ 'intellectual_property' : {'keywords' : ['intellectual property', 'ip', 'copyright', 'patent', 'trademark', 'work product', 'inventions', 'creation', 'ownership', 'ip rights'],
95
+ 'representative_text' : ("All work product and inventions created by Employee during employment shall be the exclusive property of the Company. Employee assigns all intellectual property rights including patents, copyrights, and trade secrets to the Company."),
96
+ 'weight' : 1.2,
97
+ },
98
+ 'liability' : {'keywords' : ['liable', 'liability', 'damages', 'limitation', 'consequential', 'indirect', 'punitive', 'cap', 'limited liability'],
99
+ 'representative_text' : ("In no event shall either party be liable for indirect, incidental, or consequential damages. Total liability under this Agreement shall not exceed the amounts paid in the twelve months preceding the claim."),
100
+ 'weight' : 1.2,
101
+ },
102
+ 'warranty' : {'keywords' : ['warranty', 'warrant', 'representation', 'guarantee', 'assurance', 'promise', 'warranties'],
103
+ 'representative_text' : ("Company warrants that the Services will be performed in a professional manner. EXCEPT AS EXPRESSLY PROVIDED, COMPANY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."),
104
+ 'weight' : 0.9,
105
+ },
106
+ 'dispute_resolution' : {'keywords' : ['arbitration', 'mediation', 'dispute', 'jurisdiction', 'governing law', 'venue', 'forum', 'resolution'],
107
+ 'representative_text' : ("Any disputes arising under this Agreement shall be resolved through binding arbitration in accordance with the rules of the American Arbitration Association. This Agreement shall be governed by the laws of the State of California."),
108
+ 'weight' : 0.9,
109
+ },
110
+ 'insurance' : {'keywords' : ['insurance', 'coverage', 'insured', 'policy', 'premium', 'insurer'],
111
+ 'representative_text' : ("Contractor shall maintain general liability insurance with minimum coverage of one million dollars per occurrence. Proof of insurance shall be provided to Client. Company shall be named as additional insured on all policies."),
112
+ 'weight' : 0.8,
113
+ },
114
+ 'assignment' : {'keywords' : ['assignment', 'assign', 'transfer', 'successor', 'binding', 'assignee'],
115
+ 'representative_text' : ("This Agreement may not be assigned by either party without the prior written consent of the other party. This Agreement shall be binding upon and inure to the benefit of the parties' successors and permitted assigns."),
116
+ 'weight' : 0.8,
117
+ },
118
+ 'amendment' : {'keywords' : ['amendment', 'modify', 'modification', 'change', 'alteration', 'waiver'],
119
+ 'representative_text' : ("This Agreement may not be amended or modified except by written instrument signed by both parties. No waiver of any provision shall be effective unless in writing. All modifications must be mutually agreed upon."),
120
+ 'weight' : 0.7,
121
+ },
122
+ 'force_majeure' : {'keywords' : ['force majeure', 'act of god', 'unforeseeable', 'beyond control', 'natural disaster'],
123
+ 'representative_text' : ("Neither party shall be liable for failure to perform due to causes beyond its reasonable control including acts of God, war, strikes, or natural disasters. Performance shall be suspended during the force majeure event."),
124
+ 'weight' : 0.7,
125
+ },
126
+ 'entire_agreement' : {'keywords' : ['entire agreement', 'integration', 'supersedes', 'prior agreements', 'complete agreement'],
127
+ 'representative_text' : ("This Agreement constitutes the entire agreement between the parties and supersedes all prior agreements, whether written or oral. No other representations or warranties shall be binding unless incorporated herein."),
128
+ 'weight' : 0.6,
129
+ },
130
+ 'general' : {'keywords' : ['provision', 'term', 'condition', 'obligation', 'requirement'],
131
+ 'representative_text' : ("The parties agree to the following terms and conditions governing their relationship. Each party shall perform its obligations in good faith and in accordance with industry standards and applicable law."),
132
+ 'weight' : 0.5,
133
+ }
134
+ }
135
 
 
 
136
  # RISK INDICATOR PATTERNS
137
+ RISK_INDICATORS = {'critical' : ['unlimited liability', 'perpetual', 'irrevocable', 'forfeit', 'liquidated damages', 'wage withholding', 'joint and several'],
138
+ 'high' : ['non-compete', 'non-solicit', 'penalty', 'without cause', 'sole discretion', 'immediate termination', 'at-will'],
139
+ 'medium' : ['indemnify', 'hold harmless', 'confidential', 'proprietary', 'exclusive', 'terminate', 'default', 'breach'],
140
+ }
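A sketch of how these lists match a clause, mirroring the substring check in `_extract_risk_indicators` (the clause text is illustrative):

    text_lower = "employer may terminate at its sole discretion without cause"
    hits = [kw for kws in RISK_INDICATORS.values() for kw in kws if kw in text_lower]
    # -> ['without cause', 'sole discretion', 'terminate']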
 
 
141
 
142
+
143
  # INITIALIZATION
144
+ def __init__(self, model_loader: ModelLoader, contract_category: Optional[str] = None):
 
 
145
  """
146
  Initialize clause extractor with Legal-BERT
147
 
148
+ Arguments:
149
+ ----------
150
+ model_loader { ModelLoader } : ModelLoader instance for accessing Legal-BERT
151
+
152
+ contract_category { str } : Optional contract category for context-aware extraction
153
  """
154
+ self.model_loader = model_loader
155
+ self.contract_category = contract_category
156
 
157
  # Models (lazy loaded)
158
+ self.legal_bert_model = None
159
  self.legal_bert_tokenizer = None
160
+ self.embedding_model = None
161
+ self.device = None
162
 
163
  # Category embeddings (computed from representative texts)
164
+ self.category_embeddings = dict()
165
 
166
  # Text processor
167
+ self.text_processor = TextProcessor(use_spacy = False)
168
 
169
  # Logger
170
+ self.logger = ContractAnalyzerLogger.get_logger()
171
 
172
  # Lazy load
173
  self._lazy_load()
174
 
175
+
176
  def _lazy_load(self):
177
+ """
178
+ Lazy load Legal-BERT and embedding models
179
+ """
180
  if self.legal_bert_model is None:
181
  try:
182
  log_info("Loading Legal-BERT for clause extraction...")
183
 
184
  # Load Legal-BERT (nlpaueb/legal-bert-base-uncased)
185
  self.legal_bert_model, self.legal_bert_tokenizer = self.model_loader.load_legal_bert()
186
+ self.device = self.model_loader.device
187
 
188
  # Load sentence transformer for embeddings
189
+ self.embedding_model = self.model_loader.load_embedding_model()
190
 
191
  # Prepare category embeddings using Legal-BERT
192
  self._prepare_category_embeddings()
 
194
  log_info("Clause extractor models loaded successfully")
195
 
196
  except Exception as e:
197
+ log_error(e, context = {"component": "ClauseExtractor", "operation": "model_loading"})
198
  raise
199
+
200
 
201
  def _prepare_category_embeddings(self):
202
  """
203
  Pre-compute Legal-BERT embeddings for category representative texts
204
+
205
  This enables semantic similarity matching for clause classification
206
  """
207
  log_info("Computing Legal-BERT embeddings for clause categories...")
208
 
209
  for category, config in self.CLAUSE_CATEGORIES.items():
210
+ representative_text = config['representative_text']
211
 
212
  # Get Legal-BERT embedding (using [CLS] token)
213
+ embedding = self._get_legal_bert_embedding(representative_text)
214
+
215
  self.category_embeddings[category] = embedding
216
 
217
  log_info(f"Prepared Legal-BERT embeddings for {len(self.category_embeddings)} categories")
218
 
219
+
220
  def _get_legal_bert_embedding(self, text: str) -> np.ndarray:
221
  """
222
  Get Legal-BERT embedding for text using [CLS] token
223
 
224
+ Arguments:
225
+ ----------
226
+ text { str } : Input text
227
 
228
  Returns:
229
+ --------
230
+ { np.ndarray } : Embedding vector as numpy array
231
  """
232
  # Tokenize
233
+ inputs = self.legal_bert_tokenizer(text,
234
+ return_tensors = "pt",
235
+ padding = True,
236
+ truncation = True,
237
+ max_length = 512,
238
+ ).to(self.device)
 
239
 
240
  # Get embeddings
241
  with torch.no_grad():
242
+ outputs = self.legal_bert_model(**inputs)
243
  # Use [CLS] token embedding (first token)
244
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
245
 
246
  return cls_embedding
247
 
 
 
 
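The [CLS] pooling used above is one common strategy for BERT-style encoders; mean-pooling over all token states is a frequent alternative. A minimal sketch of that variant, assuming the same `outputs` object from `_get_legal_bert_embedding` (illustrative only, not part of this commit):

    # Hypothetical alternative pooling: average all token embeddings
    # outputs.last_hidden_state has shape [batch, seq_len, hidden];
    # a production version would also mask out padding tokens before averaging
    mean_embedding = outputs.last_hidden_state.mean(dim = 1).cpu().numpy()[0]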
248
 
249
+
250
  @ContractAnalyzerLogger.log_execution_time("extract_clauses")
251
+ def extract_clauses(self, contract_text: str, max_clauses: int = 15) -> List[ExtractedClause]:
 
252
  """
253
  Extract and classify clauses from contract using hybrid approach
254
 
 
258
  3. Legal-BERT classification
259
  4. Deduplicate and rank by confidence
260
 
261
+ Arguments:
262
+ ----------
263
+ contract_text { str } : Full contract text
264
+
265
+ max_clauses { int } : Maximum number of clauses to return
266
 
267
  Returns:
268
+ --------
269
+ { list } : List of ExtractedClause objects sorted by confidence
270
  """
271
 
272
  log_info("Starting clause extraction",
273
+ text_length = len(contract_text),
274
+ contract_category = self.contract_category,
275
+ max_clauses = max_clauses,
276
+ )
277
 
278
+ # Extract using structural patterns
279
  structural_clauses = self._extract_structural_clauses(contract_text)
280
  log_info(f"Extracted {len(structural_clauses)} structural clauses")
281
 
282
+ # Semantic chunking for unstructured parts
283
+ semantic_chunks = self._semantic_chunking(contract_text, structural_clauses)
284
  log_info(f"Created {len(semantic_chunks)} semantic chunks")
285
 
286
+ # Combine all candidates
287
+ all_candidates = structural_clauses + semantic_chunks
288
  log_info(f"Total candidates: {len(all_candidates)}")
289
 
290
+ # Classify with Legal-BERT
291
  classified_clauses = self._classify_clauses_with_legal_bert(all_candidates)
292
  log_info(f"Classified {len(classified_clauses)} clauses")
293
 
294
+ # Deduplicate and rank
295
+ final_clauses = self._deduplicate_and_rank(classified_clauses, max_clauses)
296
  log_info(f"Final output: {len(final_clauses)} clauses")
297
 
298
  return final_clauses
299
 
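A minimal usage sketch of this extraction pipeline; `loader` is assumed to be an already-initialized ModelLoader, and the file name and variable names are illustrative, not part of this commit:

    # Hypothetical driver for the extractor defined in this file
    from pathlib import Path

    extractor = ClauseExtractor(model_loader = loader, contract_category = "employment")
    contract_text = Path("contract.txt").read_text()   # assumed sample input
    for clause in extractor.extract_clauses(contract_text, max_clauses = 10):
        print(clause.reference, clause.category, round(clause.confidence, 2))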
 
 
 
300
 
301
  def _extract_structural_clauses(self, text: str) -> List[Dict]:
302
  """
 
308
  - "Article III. Text"
309
  - "Clause 11. Text"
310
  """
311
+ candidates = list()
312
 
313
  # Clean text
314
+ text = re.sub(r'\s+', ' ', text)
315
 
316
  # Patterns for legal numbering
317
+ patterns = [(r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=\d+\.\d+(?:\.\d+)*\.|$)', 'numbered'),
318
+ (r'(Article\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+))\.\s*([^\n]{30,800}?)(?=Article\s+(?:\d+|[IVXLCDM]+)|$)', 'article'),
319
+ (r'(Section\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Section\s+\d+|$)', 'section'),
320
+ (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{30,800}?)(?=Clause\s+\d+|$)', 'clause'),
321
+ (r'\(([a-z]|[ivxlcdm]+)\)\s*([^\n]{30,500}?)(?=\((?:[a-z]|[ivxlcdm]+)\)|\n\n|$)', 'subclause'),
322
+ ]
 
 
 
 
 
 
323
 
324
  for pattern, ref_type in patterns:
325
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
326
+
327
  for match in matches:
328
  clause_text = match.group(2).strip()
329
 
 
331
  if not self._is_boilerplate(clause_text):
332
  # Check for meaningful content
333
  if self._has_meaningful_content(clause_text):
334
+ candidates.append({'text' : clause_text,
335
+ 'reference' : match.group(1).strip(),
336
+ 'start' : match.start(),
337
+ 'end' : match.end(),
338
+ 'type' : 'structural',
339
+ 'ref_type' : ref_type,
340
+ })
 
341
 
342
  # Remove overlapping clauses
343
  candidates = self._remove_overlapping(candidates)
344
 
345
  return candidates
346
+
347
 
348
  def _is_boilerplate(self, text: str) -> bool:
349
+ """
350
+ Check if text is boilerplate/definitional rather than substantive
351
+ """
352
+ boilerplate_indicators = ['shall mean',
353
+ 'means and includes',
354
+ 'defined as',
355
+ 'definition of',
356
+ 'hereinafter referred to',
357
+ 'for purposes of this',
358
+ 'interpretation of',
359
+ 'as used in this',
360
+ 'the term',
361
+ 'shall include',
362
+ 'includes but not limited',
363
+ ]
364
+
365
+ text_lower = text.lower()
366
  # Treat text as boilerplate if it contains a strong definitional indicator OR reads like a short quoted definition
367
+ has_indicator = any(indicator in text_lower for indicator in boilerplate_indicators)
368
+ is_short_definition = len(text.split()) < 50 and '"' in text
369
 
370
  return has_indicator or is_short_definition
371
 
372
+
373
  def _has_meaningful_content(self, text: str) -> bool:
374
+ """
375
+ Check if text has meaningful legal content
376
+ """
377
  # Must have minimum length
378
+ if (len(text.split()) < 15):
379
  return False
380
 
381
  # Check for legal action verbs
382
+ action_verbs = ['shall',
383
+ 'must',
384
+ 'will',
385
+ 'may',
386
+ 'agrees',
387
+ 'undertakes',
388
+ 'covenants',
389
+ 'warrants',
390
+ 'represents',
391
+ 'acknowledges',
392
+ 'certifies',
393
+ 'indemnifies',
394
+ 'waives',
395
+ 'terminates',
396
+ ]
397
+
398
+ text_lower = text.lower()
399
+ has_action = any(verb in text_lower for verb in action_verbs)
400
 
401
  # Check for legal subjects
402
+ legal_subjects = ['party',
403
+ 'parties',
404
+ 'employee',
405
+ 'employer',
406
+ 'company',
407
+ 'contractor',
408
+ 'consultant',
409
+ 'client',
410
+ 'vendor',
411
+ 'buyer',
412
+ 'seller',
413
+ 'landlord',
414
+ 'tenant',
415
+ 'licensor',
416
+ 'licensee',
417
+ ]
418
+
419
+ has_subject = any(subj in text_lower for subj in legal_subjects)
420
 
421
  return has_action or has_subject
422
 
423
+
424
  def _remove_overlapping(self, candidates: List[Dict]) -> List[Dict]:
425
+ """
426
+ Remove overlapping clause extractions
427
+ """
428
  if not candidates:
429
  return []
430
 
431
  # Sort by start position
432
+ candidates.sort(key = lambda x: x['start'])
433
 
434
  non_overlapping = [candidates[0]]
435
 
 
437
  last = non_overlapping[-1]
438
 
439
  # Check if overlaps
440
+ if (candidate['start'] >= last['end']):
441
  non_overlapping.append(candidate)
442
+
443
+ elif (len(candidate['text']) > len(last['text'])):
444
  # Keep longer clause if overlapping
445
  non_overlapping[-1] = candidate
446
 
447
  return non_overlapping
448
 
 
 
 
449
 
450
+ def _semantic_chunking(self, text: str, structural_clauses: List[Dict], chunk_size: int = 200) -> List[Dict]:
 
 
451
  """
452
+ Semantically chunk text not covered by structural extraction, using sentence boundaries to find natural clause breaks
 
453
  """
 
454
  # Get covered ranges from structural clauses
455
  covered_ranges = [(c['start'], c['end']) for c in structural_clauses]
456
 
457
  # Split into sentences
458
+ sentences = self.text_processor.extract_sentences(text)
459
 
460
+ chunks = list()
461
+ current_chunk = list()
462
  current_length = 0
463
+ current_start = 0
464
 
465
  for sentence in sentences:
466
  # Check if sentence is already covered by structural extraction
467
  sentence_start = text.find(sentence, current_start)
468
+ if (sentence_start == -1):
469
  continue
470
 
471
  if self._is_in_range(sentence_start, covered_ranges):
 
476
  current_length += len(sentence.split())
477
 
478
  # Create chunk when reaching size limit
479
+ if (current_length >= chunk_size):
480
  chunk_text = ' '.join(current_chunk).strip()
481
 
482
+ if (len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text)):
483
  if self._has_meaningful_content(chunk_text):
484
+ chunks.append({'text' : chunk_text,
485
+ 'reference' : f'Semantic-{len(chunks)+1}',
486
+ 'start' : sentence_start,
487
+ 'end' : sentence_start + len(chunk_text),
488
+ 'type' : 'semantic',
489
+ 'ref_type' : 'semantic',
490
+ })
 
491
 
492
+ current_chunk = list()
493
  current_length = 0
494
 
495
  current_start = sentence_start + len(sentence)
 
497
  # Add final chunk if exists
498
  if current_chunk:
499
  chunk_text = ' '.join(current_chunk).strip()
500
+
501
+ if ((len(chunk_text) >= 50) and (not self._is_boilerplate(chunk_text))):
502
  if self._has_meaningful_content(chunk_text):
503
  sentence_start = text.find(current_chunk[0])
504
+ chunks.append({'text' : chunk_text,
505
+ 'reference' : f'Semantic-{len(chunks)+1}',
506
+ 'start' : sentence_start,
507
+ 'end' : sentence_start + len(chunk_text),
508
+ 'type' : 'semantic',
509
+ 'ref_type' : 'semantic',
510
+ })
 
511
 
512
  return chunks
513
 
514
+
515
  def _is_in_range(self, position: int, ranges: List[Tuple[int, int]]) -> bool:
516
+ """
517
+ Check if position is within any of the ranges
518
+ """
519
  return any(start <= position <= end for start, end in ranges)
520
 
 
 
 
521
 
522
  def _classify_clauses_with_legal_bert(self, candidates: List[Dict]) -> List[ExtractedClause]:
523
  """
524
  Classify clauses using Legal-BERT embeddings + keyword matching
525
  """
526
+ classified = list()
527
 
528
  for candidate in candidates:
529
  # Get Legal-BERT embedding for clause
530
+ clause_embedding = self._get_legal_bert_embedding(candidate['text'])
531
 
532
  # Classify using hybrid approach
533
+ category, confidence, legal_bert_score = self._classify_single_clause(candidate['text'], clause_embedding)
 
 
 
534
 
535
  # Extract risk indicators
536
+ risk_indicators = self._extract_risk_indicators(candidate['text'])
537
 
538
  # Extract sub-clauses if any
539
+ subclauses = self._extract_subclauses(candidate['text'])
540
 
541
+ classified.append(ExtractedClause(text = candidate['text'],
542
+ reference = candidate['reference'],
543
+ category = category,
544
+ confidence = confidence,
545
+ start_pos = candidate['start'],
546
+ end_pos = candidate['end'],
547
+ extraction_method = candidate['type'],
548
+ risk_indicators = risk_indicators,
549
+ embeddings = clause_embedding,
550
+ subclauses = subclauses,
551
+ legal_bert_score = legal_bert_score,
552
+ )
553
+ )
554
 
555
  return classified
556
 
557
+
558
+ def _classify_single_clause(self, text: str, clause_embedding: np.ndarray) -> Tuple[str, float, float]:
559
  """
560
  Classify single clause using Legal-BERT + keyword matching
561
 
562
  Returns:
563
+ --------
564
+ { tuple } : (category, confidence, legal_bert_score)
565
  """
566
+ text_lower = text.lower()
567
 
568
+ # Keyword matching
569
+ keyword_scores = dict()
570
+
571
  for category, config in self.CLAUSE_CATEGORIES.items():
572
+ keywords = config['keywords']
573
+ weight = config['weight']
574
 
575
+ keyword_count = sum(1 for kw in keywords if kw in text_lower)
576
  keyword_scores[category] = (keyword_count / len(keywords)) * weight
577
 
578
+ # Legal-BERT semantic similarity
579
+ semantic_scores = dict()
580
  clause_embedding_tensor = torch.tensor(clause_embedding).unsqueeze(0)
581
 
582
  for category, cat_embedding in self.category_embeddings.items():
583
+ cat_embedding_tensor = torch.tensor(cat_embedding).unsqueeze(0)
584
+ similarity = torch.nn.functional.cosine_similarity(clause_embedding_tensor, cat_embedding_tensor).item()
 
 
 
585
  semantic_scores[category] = similarity
586
 
587
  # Combine scores (70% semantic, 30% keyword)
588
+ combined_scores = dict()
589
+
590
  for category in self.CLAUSE_CATEGORIES.keys():
591
+ combined = (semantic_scores.get(category, 0) * 0.70 + keyword_scores.get(category, 0) * 0.30)
 
 
 
592
  combined_scores[category] = combined
593
 
594
  # Get best category
595
+ best_category = max(combined_scores, key = combined_scores.get)
596
+ confidence = combined_scores[best_category]
597
  legal_bert_score = semantic_scores[best_category]
598
 
599
  return best_category, confidence, legal_bert_score
600
 
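A quick sanity check of the 70/30 blend above: a clause with Legal-BERT similarity 0.62 and keyword score 0.40 combines to 0.70 × 0.62 + 0.30 × 0.40 = 0.554, so the semantic signal dominates the reported confidence.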
601
+
602
  def _extract_risk_indicators(self, text: str) -> List[str]:
603
+ """
604
+ Extract risk indicator keywords from clause text
605
+ """
606
+ text_lower = text.lower()
607
+ found_indicators = list()
608
 
609
  for severity, indicators in self.RISK_INDICATORS.items():
610
  for indicator in indicators:
611
  if indicator in text_lower:
612
  found_indicators.append(indicator)
613
 
614
+ # Top 25 risk indicators
615
+ return found_indicators[:25]
616
 
617
+
618
  def _extract_subclauses(self, text: str) -> List[str]:
619
+ """
620
+ Extract sub-clauses from main clause (e.g., (a), (b), (i), (ii))
621
+ """
622
  # Pattern for sub-clauses: (a), (i), etc.
623
  subclause_pattern = r'\(([a-z]|[ivxlcdm]+)\)\s*([^()]{20,200}?)(?=\((?:[a-z]|[ivxlcdm]+)\)|$)'
624
+ matches = re.findall(subclause_pattern, text, re.IGNORECASE)
625
 
626
+ subclauses = list()
627
+
628
  for ref, subtext in matches:
629
  clean_text = subtext.strip()
630
+
631
+ if (len(clean_text) >= 20):
632
  subclauses.append(f"({ref}) {clean_text}")
633
 
634
+ # Max 25 sub-clauses
635
+ return subclauses[:25]
636
 
 
 
 
637
 
638
+ def _deduplicate_and_rank(self, clauses: List[ExtractedClause], max_clauses: int) -> List[ExtractedClause]:
 
639
  """
640
  Remove duplicates and rank by confidence + legal_bert_score
641
  """
 
643
  return []
644
 
645
  # Sort by combined score (confidence * 0.6 + legal_bert_score * 0.4)
646
+ clauses.sort(key = lambda x: (x.confidence * 0.6 + x.legal_bert_score * 0.4), reverse = True)
 
 
 
647
 
648
  # Deduplicate by text similarity
649
+ unique_clauses = list()
650
+ seen_texts = set()
651
 
652
  for clause in clauses:
653
  # Simple deduplication by first 100 chars
654
+ text_key = clause.text[:100].lower().strip()
655
 
656
  # Also check similarity to already added clauses
657
  is_duplicate = False
658
+
659
  for existing in unique_clauses:
660
  similarity = self._text_similarity(clause.text, existing.text)
661
+ if (similarity > 0.85):
662
  is_duplicate = True
663
  break
664
 
 
666
  unique_clauses.append(clause)
667
  seen_texts.add(text_key)
668
 
669
+ if (len(unique_clauses) >= max_clauses):
670
  break
671
 
672
  return unique_clauses
673
 
674
+
675
  def _text_similarity(self, text1: str, text2: str) -> float:
676
+ """
677
+ Calculate text similarity (simple Jaccard similarity)
678
+ """
679
+ words1 = set(text1.lower().split())
680
+ words2 = set(text2.lower().split())
681
 
682
  intersection = len(words1 & words2)
683
+ union = len(words1 | words2)
684
 
685
  return intersection / union if union > 0 else 0.0
686
 
 
 
 
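As a worked example of this Jaccard measure: "the employee shall indemnify the company" versus "the employee shall defend the company" share 4 of 6 unique words, giving a similarity of 4/6 ≈ 0.67, safely below the 0.85 deduplication threshold used in _deduplicate_and_rank.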
687
 
688
  def get_category_distribution(self, clauses: List[ExtractedClause]) -> Dict[str, int]:
689
+ """
690
+ Get distribution of clause categories
691
+ """
692
  distribution = defaultdict(int)
693
+
694
  for clause in clauses:
695
  distribution[clause.category] += 1
696
 
 
698
 
699
  return dict(distribution)
700
 
701
+
702
  def get_high_risk_clauses(self, clauses: List[ExtractedClause]) -> List[ExtractedClause]:
703
+ """
704
+ Get clauses with risk indicators
705
+ """
706
  risky = [c for c in clauses if c.risk_indicators]
707
+
708
+ risky.sort(key = lambda x: len(x.risk_indicators), reverse = True)
709
 
710
+ top_25_risky_clauses = risky[:25]
711
+
712
+ return top_25_risky_clauses
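The two reporting helpers above can be chained on the extractor's output; a small illustrative sketch, assuming `clauses` was returned by extract_clauses (not part of this commit):

    # Hypothetical reporting step over previously extracted clauses
    distribution = extractor.get_category_distribution(clauses)   # e.g. {'termination': 3, 'general': 2}
    for clause in extractor.get_high_risk_clauses(clauses)[:5]:
        print(clause.reference, clause.risk_indicators)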
services/contract_classifier.py CHANGED
@@ -232,7 +232,7 @@ class ContractClassifier:
232
  Arguments:
233
  ----------
234
  model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
235
- """
236
  self.model_loader = model_loader
237
  self.embedding_model = None
238
  self.legal_bert_model = None
@@ -294,7 +294,7 @@ class ContractClassifier:
294
 
295
  log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
296
 
297
-
298
  # MAIN CLASSIFICATION METHOD
299
  @ContractAnalyzerLogger.log_execution_time("classify_contract")
300
  def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
@@ -325,103 +325,99 @@ class ContractClassifier:
325
  raise ValueError("Contract text too short for classification")
326
 
327
  # Preprocess text (use first 3000 chars for efficiency)
328
- text_excerpt = contract_text[:3000]
329
 
330
  log_info("Starting contract classification",
331
- text_length=len(contract_text),
332
- excerpt_length=len(text_excerpt))
 
333
 
334
  # Step 1: Keyword scoring
335
- keyword_scores = self._score_keywords(contract_text.lower())
336
 
337
  # Step 2: Semantic similarity
338
- semantic_scores = self._semantic_similarity(text_excerpt)
339
 
340
  # Step 3: Legal-BERT enhanced (optional - can be expensive)
341
- # legal_bert_scores = self._legal_bert_classification(text_excerpt)
342
 
343
  # Step 4: Combine scores (weighted average)
344
- combined_scores = self._combine_scores(
345
- keyword_scores=keyword_scores,
346
- semantic_scores=semantic_scores,
347
- # legal_bert_scores=legal_bert_scores # Uncomment if using Legal-BERT
348
- )
349
 
350
  # Step 5: Get primary category
351
  if not combined_scores:
352
  log_info("No categories detected, defaulting to 'general'")
353
- return ContractCategory(
354
- category="general",
355
- subcategory=None,
356
- confidence=0.5,
357
- reasoning=["Unable to determine specific contract type"],
358
- detected_keywords=[]
359
- )
360
 
361
- primary_category = max(combined_scores, key=combined_scores.get)
362
- confidence = combined_scores[primary_category]
363
 
364
  # Step 6: Detect subcategory
365
- subcategory = self._detect_subcategory(contract_text, primary_category)
366
 
367
  # Step 7: Generate reasoning
368
- reasoning = self._generate_reasoning(
369
- contract_text=contract_text,
370
- primary_category=primary_category,
371
- subcategory=subcategory,
372
- keyword_scores=keyword_scores,
373
- semantic_scores=semantic_scores,
374
- combined_scores=combined_scores
375
- )
376
 
377
  # Step 8: Extract detected keywords
378
- detected_keywords = self._extract_detected_keywords(contract_text, primary_category)
379
-
380
- # Step 9: Get alternative categories
381
- alternative_categories = sorted(
382
- [(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
383
- key=lambda x: x[1],
384
- reverse=True
385
- )[:3] # Top 3 alternatives
386
-
387
- result = ContractCategory(
388
- category=primary_category,
389
- subcategory=subcategory,
390
- confidence=confidence,
391
- reasoning=reasoning,
392
- detected_keywords=detected_keywords,
393
- alternative_categories=alternative_categories
394
- )
395
 
396
  log_info("Contract classified successfully",
397
- category=primary_category,
398
- subcategory=subcategory,
399
- confidence=confidence)
 
400
 
401
  return result
402
 
403
- # =========================================================================
404
- # SCORING METHODS
405
- # =========================================================================
406
 
407
  def _score_keywords(self, text_lower: str) -> Dict[str, float]:
408
  """
409
  Score each category based on keyword presence
410
 
411
- Args:
412
- text_lower: Lowercase contract text
 
413
 
414
  Returns:
415
- Dictionary of {category: score}
 
416
  """
417
- scores = {}
418
 
419
  for category, config in self.CATEGORY_HIERARCHY.items():
420
- keywords = config['keywords']
421
- weight = config['weight']
422
 
423
  # Count keyword matches
424
- keyword_count = sum(1 for keyword in keywords if keyword in text_lower)
425
 
426
  # Normalize by number of keywords and apply weight
427
  normalized_score = (keyword_count / len(keywords)) * weight
@@ -430,91 +426,92 @@ class ContractClassifier:
430
 
431
  return scores
432
 
 
433
  def _semantic_similarity(self, text: str) -> Dict[str, float]:
434
  """
435
  Calculate semantic similarity to category templates using embeddings
436
 
437
- Args:
438
- text: Contract text excerpt
 
439
 
440
  Returns:
441
- Dictionary of {category: similarity_score}
 
442
  """
443
  # Encode contract text
444
- text_embedding = self.embedding_model.encode(text, convert_to_tensor=True)
445
 
446
  # Calculate similarity to each category
447
- similarities = {}
 
448
  for category, cat_embedding in self.category_embeddings.items():
449
- similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
450
  similarities[category] = similarity
451
 
452
  return similarities
 
453
 
454
  def _legal_bert_classification(self, text: str) -> Dict[str, float]:
455
  """
456
  Use Legal-BERT for classification (optional - computationally expensive)
457
 
458
- Args:
459
- text: Contract text excerpt
 
460
 
461
  Returns:
462
- Dictionary of {category: score}
 
463
  """
464
- # This is a placeholder for Legal-BERT classification
465
- # In production, you'd fine-tune Legal-BERT on labeled contract data
466
-
467
  # Tokenize
468
- inputs = self.legal_bert_tokenizer(
469
- text,
470
- return_tensors="pt",
471
- padding=True,
472
- truncation=True,
473
- max_length=512
474
- ).to(self.device)
475
 
476
  # Get embeddings
477
  with torch.no_grad():
478
- outputs = self.legal_bert_model(**inputs)
479
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
480
 
481
- # For now, return uniform scores (placeholder)
482
- # In production, you'd use a trained classifier head
483
  return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
484
 
485
- def _combine_scores(self, keyword_scores: Dict[str, float],
486
- semantic_scores: Dict[str, float],
487
- legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
488
  """
489
  Combine scores from different methods (weighted average)
490
 
491
- Args:
492
- keyword_scores: Keyword-based scores
493
- semantic_scores: Semantic similarity scores
494
- legal_bert_scores: Legal-BERT scores (optional)
 
 
 
495
 
496
  Returns:
497
- Combined scores dictionary
 
498
  """
499
- combined = {}
500
 
501
  # Weights for each method
502
- keyword_weight = 0.40
503
- semantic_weight = 0.60
504
  legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
505
 
506
  if legal_bert_scores:
507
  # Normalize weights
508
- total_weight = keyword_weight + semantic_weight + legal_bert_weight
509
- keyword_weight /= total_weight
510
- semantic_weight /= total_weight
511
  legal_bert_weight /= total_weight
512
 
513
  for category in self.CATEGORY_HIERARCHY.keys():
514
- score = (
515
- keyword_scores.get(category, 0) * keyword_weight +
516
- semantic_scores.get(category, 0) * semantic_weight
517
- )
518
 
519
  if legal_bert_scores:
520
  score += legal_bert_scores.get(category, 0) * legal_bert_weight
@@ -523,202 +520,204 @@ class ContractClassifier:
523
 
524
  return combined
525
 
526
- # =========================================================================
527
- # SUBCATEGORY DETECTION
528
- # =========================================================================
529
 
530
  def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
531
  """
532
  Detect specific subcategory within primary category
533
 
534
- Args:
535
- text: Full contract text
536
- primary_category: Detected primary category
 
 
537
 
538
  Returns:
539
- Subcategory name or None
 
540
  """
541
- text_lower = text.lower()
542
 
543
  # Get subcategories for this category
544
  subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
545
 
546
  # Score each subcategory
547
- subcat_scores = {}
 
548
  for subcat in subcategories:
549
  if subcat in self.SUBCATEGORY_PATTERNS:
550
- patterns = self.SUBCATEGORY_PATTERNS[subcat]
551
- score = sum(1 for pattern in patterns if pattern in text_lower)
552
  subcat_scores[subcat] = score
553
 
554
  # Return best match if any
555
- if subcat_scores and max(subcat_scores.values()) > 0:
556
- best_subcat = max(subcat_scores, key=subcat_scores.get)
557
  log_info(f"Detected subcategory: {best_subcat}",
558
- category=primary_category,
559
- score=subcat_scores[best_subcat])
 
 
560
  return best_subcat
561
 
562
  return None
563
 
564
- # =========================================================================
565
- # REASONING & EXPLANATION
566
- # =========================================================================
567
-
568
- def _generate_reasoning(self, contract_text: str, primary_category: str,
569
- subcategory: Optional[str],
570
- keyword_scores: Dict[str, float],
571
- semantic_scores: Dict[str, float],
572
- combined_scores: Dict[str, float]) -> List[str]:
573
  """
574
  Generate human-readable reasoning for classification
575
 
576
  Returns:
577
- List of reasoning statements
 
578
  """
579
- reasoning = []
580
 
581
  # Primary category reasoning
582
- keyword_match = keyword_scores.get(primary_category, 0)
583
  semantic_match = semantic_scores.get(primary_category, 0)
584
 
585
- if keyword_match > 0.5:
586
- reasoning.append(
587
- f"Strong keyword indicators for {primary_category.replace('_', ' ')} category "
588
- f"({int(keyword_match * 100)}% keyword match)"
589
- )
590
- elif keyword_match > 0.3:
591
- reasoning.append(
592
- f"Moderate keyword presence for {primary_category.replace('_', ' ')} "
593
- f"({int(keyword_match * 100)}% keyword match)"
594
- )
595
-
596
- if semantic_match > 0.65:
597
- reasoning.append(
598
- f"Contract language semantically similar to {primary_category.replace('_', ' ')} agreements "
599
- f"(similarity: {semantic_match:.2f})"
600
- )
601
- elif semantic_match > 0.50:
602
- reasoning.append(
603
- f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts "
604
- f"(similarity: {semantic_match:.2f})"
605
- )
606
 
607
  # Subcategory reasoning
608
  if subcategory:
609
- reasoning.append(
610
- f"Specific subcategory identified: {subcategory.replace('_', ' ')}"
611
- )
612
 
613
  # Alternative categories (if close)
614
- sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
615
- if len(sorted_scores) > 1 and sorted_scores[1][1] > 0.40:
 
616
  alt_category, alt_score = sorted_scores[1]
617
- reasoning.append(
618
- f"Also contains elements of {alt_category.replace('_', ' ')} "
619
- f"(secondary match: {alt_score:.2f})"
620
- )
621
 
622
  # If no strong reasoning
623
  if not reasoning:
624
  reasoning.append("Classification based on general contract structure and terminology")
625
 
626
  return reasoning
 
627
 
628
  def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
629
  """
630
  Extract which specific keywords were found
631
 
632
- Args:
633
- text: Contract text
634
- category: Detected category
 
 
635
 
636
  Returns:
637
- List of detected keywords
 
638
  """
639
  text_lower = text.lower()
640
- keywords = self.CATEGORY_HIERARCHY[category]['keywords']
641
 
642
- detected = [kw for kw in keywords if kw in text_lower]
643
- return detected[:10] # Top 10 keywords
 
 
644
 
645
- # =========================================================================
646
- # MULTI-LABEL CLASSIFICATION
647
- # =========================================================================
648
 
649
  @ContractAnalyzerLogger.log_execution_time("classify_multi_label")
650
- def classify_multi_label(self, text: str,
651
- threshold: float = 0.45) -> List[ContractCategory]:
652
  """
653
- Classify as multiple categories if applicable
654
- (e.g., Employment + NDA, Consulting + IP Assignment)
655
 
656
- Args:
657
- text: Contract text
658
- threshold: Minimum confidence threshold for multi-label
 
 
659
 
660
  Returns:
661
- List of ContractCategory objects (sorted by confidence)
 
662
  """
663
- log_info("Starting multi-label classification", threshold=threshold)
664
 
665
  # Get scores
666
- keyword_scores = self._score_keywords(text.lower())
667
- semantic_scores = self._semantic_similarity(text[:3000])
668
  combined_scores = self._combine_scores(keyword_scores, semantic_scores)
669
 
670
  # Get all categories above threshold
671
- matches = []
 
672
  for category, score in combined_scores.items():
673
- if score >= threshold:
674
  subcategory = self._detect_subcategory(text, category)
675
- reasoning = self._generate_reasoning(
676
- text, category, subcategory,
677
- keyword_scores, semantic_scores, combined_scores
678
- )
679
- keywords = self._extract_detected_keywords(text, category)
680
 
681
- matches.append(ContractCategory(
682
- category=category,
683
- subcategory=subcategory,
684
- confidence=score,
685
- reasoning=reasoning,
686
- detected_keywords=keywords
687
- ))
688
 
689
  # Sort by confidence
690
- matches.sort(key=lambda x: x.confidence, reverse=True)
691
 
692
  log_info(f"Multi-label classification found {len(matches)} categories")
693
 
694
  return matches if matches else [self.classify_contract(text)]
695
 
696
- # =========================================================================
697
- # UTILITY METHODS
698
- # =========================================================================
699
-
700
  def get_category_description(self, category: str) -> str:
701
- """Get human-readable description of a category"""
702
- descriptions = {
703
- 'employment': 'Employment agreements governing employer-employee relationships',
704
- 'consulting': 'Consulting and independent contractor agreements',
705
- 'nda': 'Non-disclosure and confidentiality agreements',
706
- 'technology': 'Software licensing and technology service agreements',
707
- 'intellectual_property': 'IP assignment, licensing, and protection agreements',
708
- 'real_estate': 'Property lease, rental, and purchase agreements',
709
- 'financial': 'Loan, credit, and financial service agreements',
710
- 'business': 'Partnership, joint venture, and corporate agreements',
711
- 'sales': 'Sales, purchase, and distribution agreements',
712
- 'service_agreement': 'Professional service and maintenance agreements',
713
- 'vendor': 'Vendor, supplier, and procurement agreements',
714
- 'agency': 'Agency and representation agreements'
715
- }
 
 
716
  return descriptions.get(category, 'General contract agreement')
 
717
 
718
  def get_all_categories(self) -> List[str]:
719
- """Get list of all supported categories"""
 
 
720
  return list(self.CATEGORY_HIERARCHY.keys())
721
 
 
722
  def get_subcategories(self, category: str) -> List[str]:
723
- """Get subcategories for a specific category"""
724
- return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])
 
 
 
232
  Arguments:
233
  ----------
234
  model_loader : ModelLoader instance for accessing Legal-BERT and embeddings
235
+ """
236
  self.model_loader = model_loader
237
  self.embedding_model = None
238
  self.legal_bert_model = None
 
294
 
295
  log_info(f"Prepared embeddings for {len(self.category_embeddings)} categories")
296
 
297
+
298
  # MAIN CLASSIFICATION METHOD
299
  @ContractAnalyzerLogger.log_execution_time("classify_contract")
300
  def classify_contract(self, contract_text: str, min_confidence: float = 0.50) -> ContractCategory:
 
325
  raise ValueError("Contract text too short for classification")
326
 
327
  # Classify against the full contract text (no excerpt truncation)
328
+ text_excerpt = contract_text
329
 
330
  log_info("Starting contract classification",
331
+ text_length = len(contract_text),
332
+ excerpt_length = len(text_excerpt),
333
+ )
334
 
335
  # Step 1: Keyword scoring
336
+ keyword_scores = self._score_keywords(contract_text.lower())
337
 
338
  # Step 2: Semantic similarity
339
+ semantic_scores = self._semantic_similarity(text_excerpt)
340
 
341
  # Step 3: Legal-BERT enhanced (optional - can be expensive)
342
+ legal_bert_scores = self._legal_bert_classification(text_excerpt)
343
 
344
  # Step 4: Combine scores (weighted average)
345
+ combined_scores = self._combine_scores(keyword_scores = keyword_scores,
346
+ semantic_scores = semantic_scores,
347
+ legal_bert_scores = legal_bert_scores,
348
+ )
 
349
 
350
  # Step 5: Get primary category
351
  if not combined_scores:
352
  log_info("No categories detected, defaulting to 'general'")
353
+ return ContractCategory(category = "general",
354
+ subcategory = None,
355
+ confidence = 0.5,
356
+ reasoning = ["Unable to determine specific contract type"],
357
+ detected_keywords = [],
358
+ )
 
359
 
360
+ primary_category = max(combined_scores, key = combined_scores.get)
361
+ confidence = combined_scores[primary_category]
362
 
363
  # Step 6: Detect subcategory
364
+ subcategory = self._detect_subcategory(contract_text, primary_category)
365
 
366
  # Step 7: Generate reasoning
367
+ reasoning = self._generate_reasoning(contract_text = contract_text,
368
+ primary_category = primary_category,
369
+ subcategory = subcategory,
370
+ keyword_scores = keyword_scores,
371
+ semantic_scores = semantic_scores,
372
+ combined_scores = combined_scores,
373
+ )
 
374
 
375
  # Step 8: Extract detected keywords
376
+ detected_keywords = self._extract_detected_keywords(contract_text, primary_category)
377
+
378
+ # Step 9: Get the top 3 alternative categories
379
+ alternative_categories = sorted([(cat, score) for cat, score in combined_scores.items() if cat != primary_category],
380
+ key = lambda x: x[1],
381
+ reverse = True,
382
+ )[:3]
383
+
384
+ result = ContractCategory(category = primary_category,
385
+ subcategory = subcategory,
386
+ confidence = confidence,
387
+ reasoning = reasoning,
388
+ detected_keywords = detected_keywords,
389
+ alternative_categories = alternative_categories,
390
+ )
 
 
391
 
392
  log_info("Contract classified successfully",
393
+ category = primary_category,
394
+ subcategory = subcategory,
395
+ confidence = confidence,
396
+ )
397
 
398
  return result
399
 
 
 
 
400
 
401
  def _score_keywords(self, text_lower: str) -> Dict[str, float]:
402
  """
403
  Score each category based on keyword presence
404
 
405
+ Arguments:
406
+ ----------
407
+ text_lower { str } : Lowercase contract text
408
 
409
  Returns:
410
+ --------
411
+ { dict } : Dictionary of {category: score}
412
  """
413
+ scores = dict()
414
 
415
  for category, config in self.CATEGORY_HIERARCHY.items():
416
+ keywords = config['keywords']
417
+ weight = config['weight']
418
 
419
  # Count keyword matches
420
+ keyword_count = sum(1 for keyword in keywords if keyword in text_lower)
421
 
422
  # Normalize by number of keywords and apply weight
423
  normalized_score = (keyword_count / len(keywords)) * weight
 
426
 
427
  return scores
428
 
429
+
430
  def _semantic_similarity(self, text: str) -> Dict[str, float]:
431
  """
432
  Calculate semantic similarity to category templates using embeddings
433
 
434
+ Arguments:
435
+ ----------
436
+ text { str } : Contract text excerpt
437
 
438
  Returns:
439
+ --------
440
+ { dict } : Dictionary of {category: similarity_score}
441
  """
442
  # Encode contract text
443
+ text_embedding = self.embedding_model.encode(text, convert_to_tensor = True)
444
 
445
  # Calculate similarity to each category
446
+ similarities = dict()
447
+
448
  for category, cat_embedding in self.category_embeddings.items():
449
+ similarity = util.cos_sim(text_embedding, cat_embedding)[0][0].item()
450
  similarities[category] = similarity
451
 
452
  return similarities
453
+
454
 
455
  def _legal_bert_classification(self, text: str) -> Dict[str, float]:
456
  """
457
  Use Legal-BERT for classification (optional - computationally expensive)
458
 
459
+ Arguments:
460
+ ----------
461
+ text { str } : Contract text excerpt
462
 
463
  Returns:
464
+ --------
465
+ { dict } : Dictionary of {category: score}
466
  """
 
 
 
467
  # Tokenize
468
+ inputs = self.legal_bert_tokenizer(text,
469
+ return_tensors = "pt",
470
+ padding = True,
471
+ truncation = True,
472
+ max_length = 512,
473
+ ).to(self.device)
 
474
 
475
  # Get embeddings
476
  with torch.no_grad():
477
+ outputs = self.legal_bert_model(**inputs)
478
  cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
479
 
 
 
480
  # Placeholder scoring: uniform 0.5 per category until a trained classifier head is available
  return {cat: 0.5 for cat in self.CATEGORY_HIERARCHY.keys()}
481
 
482
+
483
+ def _combine_scores(self, keyword_scores: Dict[str, float], semantic_scores: Dict[str, float], legal_bert_scores: Dict[str, float] = None) -> Dict[str, float]:
 
484
  """
485
  Combine scores from different methods (weighted average)
486
 
487
+ Arguments:
488
+ ----------
489
+ keyword_scores { dict } : Keyword-based scores
490
+
491
+ semantic_scores { dict } : Semantic similarity scores
492
+
493
+ legal_bert_scores { dict } : Legal-BERT scores (optional)
494
 
495
  Returns:
496
+ --------
497
+ { dict } : Combined scores dictionary
498
  """
499
+ combined = dict()
500
 
501
  # Weights for each method
502
+ keyword_weight = 0.40
503
+ semantic_weight = 0.60
504
  legal_bert_weight = 0.00 # Set to 0 if not using Legal-BERT
505
 
506
  if legal_bert_scores:
507
  # Normalize weights
508
+ total_weight = keyword_weight + semantic_weight + legal_bert_weight
509
+ keyword_weight /= total_weight
510
+ semantic_weight /= total_weight
511
  legal_bert_weight /= total_weight
512
 
513
  for category in self.CATEGORY_HIERARCHY.keys():
514
+ score = (keyword_scores.get(category, 0) * keyword_weight + semantic_scores.get(category, 0) * semantic_weight)
 
 
 
515
 
516
  if legal_bert_scores:
517
  score += legal_bert_scores.get(category, 0) * legal_bert_weight
 
520
 
521
  return combined
522
 
 
 
 
523
 
524
  def _detect_subcategory(self, text: str, primary_category: str) -> Optional[str]:
525
  """
526
  Detect specific subcategory within primary category
527
 
528
+ Arguments:
529
+ ----------
530
+ text { str } : Full contract text
531
+
532
+ primary_category { str } : Detected primary category
533
 
534
  Returns:
535
+ --------
536
+ { str } : Subcategory name or None
537
  """
538
+ text_lower = text.lower()
539
 
540
  # Get subcategories for this category
541
  subcategories = self.CATEGORY_HIERARCHY[primary_category]['subcategories']
542
 
543
  # Score each subcategory
544
+ subcat_scores = dict()
545
+
546
  for subcat in subcategories:
547
  if subcat in self.SUBCATEGORY_PATTERNS:
548
+ patterns = self.SUBCATEGORY_PATTERNS[subcat]
549
+ score = sum(1 for pattern in patterns if pattern in text_lower)
550
  subcat_scores[subcat] = score
551
 
552
  # Return best match if any
553
+ if (subcat_scores and (max(subcat_scores.values()) > 0)):
554
+ best_subcat = max(subcat_scores, key = subcat_scores.get)
555
  log_info(f"Detected subcategory: {best_subcat}",
556
+ category = primary_category,
557
+ score = subcat_scores[best_subcat],
558
+ )
559
+
560
  return best_subcat
561
 
562
  return None
563
 
564
+
565
+ def _generate_reasoning(self, contract_text: str, primary_category: str, subcategory: Optional[str], keyword_scores: Dict[str, float], semantic_scores: Dict[str, float],
566
+ combined_scores: Dict[str, float]) -> List[str]:
 
 
 
 
 
 
567
  """
568
  Generate human-readable reasoning for classification
569
 
570
  Returns:
571
+ --------
572
+ { list } : List of reasoning statements
573
  """
574
+ reasoning = list()
575
 
576
  # Primary category reasoning
577
+ keyword_match = keyword_scores.get(primary_category, 0)
578
  semantic_match = semantic_scores.get(primary_category, 0)
579
 
580
+ if (keyword_match > 0.5):
581
+ reasoning.append(f"Strong keyword indicators for {primary_category.replace('_', ' ')} category "
582
+ f"({int(keyword_match * 100)}% keyword match)"
583
+ )
584
+
585
+ elif (keyword_match > 0.3):
586
+ reasoning.append(f"Moderate keyword presence for {primary_category.replace('_', ' ')} "
587
+ f"({int(keyword_match * 100)}% keyword match)"
588
+ )
589
+
590
+ if (semantic_match > 0.65):
591
+ reasoning.append(f"Contract language semantically similar to {primary_category.replace('_', ' ')} agreements "
592
+ f"(similarity: {semantic_match:.2f})"
593
+ )
594
+
595
+ elif (semantic_match > 0.50):
596
+ reasoning.append(f"Moderate semantic similarity to {primary_category.replace('_', ' ')} contracts "
597
+ f"(similarity: {semantic_match:.2f})"
598
+ )
 
 
599
 
600
  # Subcategory reasoning
601
  if subcategory:
602
+ reasoning.append(f"Specific subcategory identified: {subcategory.replace('_', ' ')}")
 
 
603
 
604
  # Alternative categories (if close)
605
+ sorted_scores = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)
606
+
607
+ if ((len(sorted_scores) > 1) and (sorted_scores[1][1] > 0.40)):
608
  alt_category, alt_score = sorted_scores[1]
609
+
610
+ reasoning.append(f"Also contains elements of {alt_category.replace('_', ' ')} "
611
+ f"(secondary match: {alt_score:.2f})"
612
+ )
613
 
614
  # If no strong reasoning
615
  if not reasoning:
616
  reasoning.append("Classification based on general contract structure and terminology")
617
 
618
  return reasoning
619
+
620
 
621
  def _extract_detected_keywords(self, text: str, category: str) -> List[str]:
622
  """
623
  Extract which specific keywords were found
624
 
625
+ Arguments:
626
+ ----------
627
+ text { str } : Contract text
628
+
629
+ category { str } : Detected category
630
 
631
  Returns:
632
+ --------
633
+ { list } : List of detected keywords
634
  """
635
  text_lower = text.lower()
636
+ keywords = self.CATEGORY_HIERARCHY[category]['keywords']
637
 
638
+ detected = [kw for kw in keywords if kw in text_lower]
639
+
640
+ # Top 10 keywords
641
+ return detected[:10]
642
 
 
 
 
643
 
644
  @ContractAnalyzerLogger.log_execution_time("classify_multi_label")
645
+ def classify_multi_label(self, text: str, threshold: float = 0.45) -> List[ContractCategory]:
 
646
  """
647
+ Classify as multiple categories if applicable (e.g., Employment + NDA, Consulting + IP Assignment)
 
648
 
649
+ Arguments:
650
+ ----------
651
+ text { str } : Contract text
652
+
653
+ threshold { float } : Minimum confidence threshold for multi-label
654
 
655
  Returns:
656
+ --------
657
+ { list } : List of ContractCategory objects (sorted by confidence)
658
  """
659
+ log_info("Starting multi-label classification", threshold = threshold)
660
 
661
  # Get scores
662
+ keyword_scores = self._score_keywords(text.lower())
663
+ semantic_scores = self._semantic_similarity(text)
664
  combined_scores = self._combine_scores(keyword_scores, semantic_scores)
665
 
666
  # Get all categories above threshold
667
+ matches = list()
668
+
669
  for category, score in combined_scores.items():
670
+ if (score >= threshold):
671
  subcategory = self._detect_subcategory(text, category)
672
+ reasoning = self._generate_reasoning(text, category, subcategory, keyword_scores, semantic_scores, combined_scores)
673
+ keywords = self._extract_detected_keywords(text, category)
 
 
 
674
 
675
+ matches.append(ContractCategory(category = category,
676
+ subcategory = subcategory,
677
+ confidence = score,
678
+ reasoning = reasoning,
679
+ detected_keywords = keywords,
680
+ )
681
+ )
682
 
683
  # Sort by confidence
684
+ matches.sort(key = lambda x: x.confidence, reverse = True)
685
 
686
  log_info(f"Multi-label classification found {len(matches)} categories")
687
 
688
  return matches if matches else [self.classify_contract(text)]
689
 
690
+
 
 
 
691
  def get_category_description(self, category: str) -> str:
692
+ """
693
+ Get human-readable description of a category
694
+ """
695
+ descriptions = {'employment' : 'Employment agreements governing employer-employee relationships',
696
+ 'consulting' : 'Consulting and independent contractor agreements',
697
+ 'nda' : 'Non-disclosure and confidentiality agreements',
698
+ 'technology' : 'Software licensing and technology service agreements',
699
+ 'intellectual_property' : 'IP assignment, licensing, and protection agreements',
700
+ 'real_estate' : 'Property lease, rental, and purchase agreements',
701
+ 'financial' : 'Loan, credit, and financial service agreements',
702
+ 'business' : 'Partnership, joint venture, and corporate agreements',
703
+ 'sales' : 'Sales, purchase, and distribution agreements',
704
+ 'service_agreement' : 'Professional service and maintenance agreements',
705
+ 'vendor' : 'Vendor, supplier, and procurement agreements',
706
+ 'agency' : 'Agency and representation agreements',
707
+ }
708
+
709
  return descriptions.get(category, 'General contract agreement')
710
+
711
 
712
  def get_all_categories(self) -> List[str]:
713
+ """
714
+ Get list of all supported categories
715
+ """
716
  return list(self.CATEGORY_HIERARCHY.keys())
717
 
718
+
719
  def get_subcategories(self, category: str) -> List[str]:
720
+ """
721
+ Get subcategories for a specific category
722
+ """
723
+ return self.CATEGORY_HIERARCHY.get(category, {}).get('subcategories', [])
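As a worked example of the weighted blend in _combine_scores: with keyword_weight = 0.40 and semantic_weight = 0.60, a category scoring 0.30 on keywords and 0.70 on semantic similarity combines to 0.40 × 0.30 + 0.60 × 0.70 = 0.54, clearing both the 0.50 default min_confidence and the 0.45 multi-label threshold. A minimal, illustrative call sequence (variable names assumed, not part of this commit):

    # Hypothetical usage of the classifier refactored above
    classifier = ContractClassifier(model_loader = loader)   # loader: an initialized ModelLoader
    result = classifier.classify_contract(contract_text)
    print(result.category, result.subcategory, f"{result.confidence:.2f}")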
static/app.js DELETED
File without changes
static/index.html CHANGED
@@ -0,0 +1,1404 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AI Contract Risk Analyzer - Legal Intelligence Platform</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
16
+ background: #ffffff;
17
+ color: #333;
18
+ line-height: 1.6;
19
+ }
20
+
21
+ /* Header */
22
+ .header {
23
+ background: white;
24
+ border-bottom: 1px solid #e5e5e5;
25
+ padding: 1rem 2rem;
26
+ display: flex;
27
+ justify-content: space-between;
28
+ align-items: center;
29
+ position: fixed;
30
+ width: 100%;
31
+ top: 0;
32
+ z-index: 1000;
33
+ }
34
+
35
+ .logo {
36
+ display: flex;
37
+ align-items: center;
38
+ gap: 0.5rem;
39
+ font-size: 1.25rem;
40
+ font-weight: 600;
41
+ }
42
+
43
+ .logo-icon {
44
+ width: 28px;
45
+ height: 28px;
46
+ background: #4169e1;
47
+ border-radius: 6px;
48
+ display: flex;
49
+ align-items: center;
50
+ justify-content: center;
51
+ color: white;
52
+ font-size: 18px;
53
+ }
54
+
55
+ .subtitle {
56
+ color: #666;
57
+ font-size: 0.9rem;
58
+ font-weight: 400;
59
+ }
60
+
61
+ .container {
62
+ max-width: 1200px;
63
+ margin: 0 auto;
64
+ padding: 0 2rem;
65
+ }
66
+
67
+ /* Landing Page Styles - Updated to match screenshot */
68
+ .landing-screen {
69
+ padding-top: 80px;
70
+ }
71
+
72
+ .hero-section {
73
+ text-align: center;
74
+ padding: 6rem 0 4rem;
75
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
76
+ color: white;
77
+ margin-bottom: 4rem;
78
+ }
79
+
80
+ .hero-title {
81
+ font-size: 3rem;
82
+ font-weight: 700;
83
+ margin-bottom: 1.5rem;
84
+ line-height: 1.2;
85
+ }
86
+
87
+ .hero-subtitle {
88
+ font-size: 1.3rem;
89
+ margin-bottom: 2.5rem;
90
+ opacity: 0.95;
91
+ max-width: 600px;
92
+ margin-left: auto;
93
+ margin-right: auto;
94
+ }
95
+
96
+ .cta-button {
97
+ background: white;
98
+ color: #4169e1;
99
+ border: none;
100
+ padding: 1rem 3rem;
101
+ border-radius: 50px;
102
+ font-size: 1.1rem;
103
+ font-weight: 600;
104
+ cursor: pointer;
105
+ transition: all 0.3s ease;
106
+ box-shadow: 0 4px 15px rgba(0,0,0,0.2);
107
+ }
108
+
109
+ .cta-button:hover {
110
+ transform: translateY(-2px);
111
+ box-shadow: 0 8px 25px rgba(0,0,0,0.3);
112
+ }
113
+
114
+ .section {
115
+ padding: 4rem 0;
116
+ text-align: center;
117
+ }
118
+
119
+ .section-title {
120
+ font-size: 2.2rem;
121
+ font-weight: 600;
122
+ margin-bottom: 3rem;
123
+ color: #333;
124
+ }
125
+
126
+ .section-subtitle {
127
+ font-size: 1.2rem;
128
+ color: #666;
129
+ margin-bottom: 3rem;
130
+ max-width: 800px;
131
+ margin-left: auto;
132
+ margin-right: auto;
133
+ line-height: 1.8;
134
+ }
135
+
136
+ .features-grid {
137
+ display: grid;
138
+ grid-template-columns: repeat(3, 1fr);
139
+ gap: 3rem;
140
+ margin-bottom: 4rem;
141
+ }
142
+
143
+ .feature-card {
144
+ text-align: center;
145
+ padding: 2rem;
146
+ }
147
+
148
+ .feature-icon {
149
+ font-size: 3rem;
150
+ margin-bottom: 1.5rem;
151
+ }
152
+
153
+ .feature-title {
154
+ font-size: 1.4rem;
155
+ font-weight: 600;
156
+ margin-bottom: 1rem;
157
+ color: #333;
158
+ }
159
+
160
+ .feature-description {
161
+ color: #666;
162
+ line-height: 1.7;
163
+ font-size: 1rem;
164
+ }
165
+
166
+ .steps-section {
167
+ background: #f8f9fa;
168
+ padding: 5rem 0;
169
+ }
170
+
171
+ .steps-grid {
172
+ display: grid;
173
+ grid-template-columns: repeat(3, 1fr);
174
+ gap: 3rem;
175
+ margin-top: 3rem;
176
+ }
177
+
178
+ .step-card {
179
+ text-align: center;
180
+ padding: 2rem;
181
+ }
182
+
183
+ .step-number {
184
+ width: 60px;
185
+ height: 60px;
186
+ background: #4169e1;
187
+ color: white;
188
+ border-radius: 50%;
189
+ display: flex;
190
+ align-items: center;
191
+ justify-content: center;
192
+ font-size: 1.5rem;
193
+ font-weight: 700;
194
+ margin: 0 auto 1.5rem;
195
+ }
196
+
197
+ .step-title {
198
+ font-size: 1.3rem;
199
+ font-weight: 600;
200
+ margin-bottom: 1rem;
201
+ color: #333;
202
+ }
203
+
204
+ .step-description {
205
+ color: #666;
206
+ line-height: 1.7;
207
+ }
208
+
209
+ .footer {
210
+ text-align: center;
211
+ padding: 3rem 2rem;
212
+ color: #999;
213
+ font-size: 0.9rem;
214
+ border-top: 1px solid #e5e5e5;
215
+ background: #f8f9fa;
216
+ }
217
+
218
+ /* Analyzer Styles */
219
+ .analyzer-screen {
220
+ display: none;
221
+ padding-top: 80px;
222
+ }
223
+
224
+ .hero-section-analyzer {
225
+ text-align: center;
226
+ margin-bottom: 3rem;
227
+ padding: 2rem 0;
228
+ }
229
+
230
+ .hero-title-analyzer {
231
+ font-size: 2.5rem;
232
+ font-weight: 700;
233
+ margin-bottom: 1rem;
234
+ color: #1a1a1a;
235
+ }
236
+
237
+ .hero-description {
238
+ font-size: 1.1rem;
239
+ color: #666;
240
+ margin-bottom: 2rem;
241
+ }
242
+
243
+ .upload-card {
244
+ background: white;
245
+ border-radius: 12px;
246
+ padding: 2.5rem;
247
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
248
+ max-width: 700px;
249
+ margin: 0 auto;
250
+ position: relative;
251
+ }
252
+
253
+ .tabs {
254
+ display: flex;
255
+ gap: 1rem;
256
+ border-bottom: 2px solid #e5e5e5;
257
+ margin-bottom: 2rem;
258
+ }
259
+
260
+ .tab {
261
+ padding: 0.75rem 1.5rem;
262
+ background: none;
263
+ border: none;
264
+ font-size: 1rem;
265
+ color: #666;
266
+ cursor: pointer;
267
+ border-bottom: 3px solid transparent;
268
+ margin-bottom: -2px;
269
+ transition: all 0.2s;
270
+ }
271
+
272
+ .tab.active {
273
+ color: #4169e1;
274
+ border-bottom-color: #4169e1;
275
+ font-weight: 500;
276
+ }
277
+
278
+ .tab-content {
279
+ display: none;
280
+ }
281
+
282
+ .tab-content.active {
283
+ display: block;
284
+ }
285
+
286
+ .textarea {
287
+ width: 100%;
288
+ min-height: 250px;
289
+ padding: 1rem;
290
+ border: 2px solid #e5e5e5;
291
+ border-radius: 8px;
292
+ font-size: 0.95rem;
293
+ font-family: inherit;
294
+ resize: vertical;
295
+ transition: border-color 0.2s;
296
+ }
297
+
298
+ .textarea:focus {
299
+ outline: none;
300
+ border-color: #4169e1;
301
+ }
302
+
303
+ .textarea::placeholder {
304
+ color: #999;
305
+ }
306
+
307
+ .file-upload-area {
308
+ border: 2px dashed #d0d0d0;
309
+ border-radius: 8px;
310
+ padding: 3rem 2rem;
311
+ text-align: center;
312
+ cursor: pointer;
313
+ transition: all 0.2s;
314
+ }
315
+
316
+ .file-upload-area:hover {
317
+ border-color: #4169e1;
318
+ background: #f8f9ff;
319
+ }
320
+
321
+ .file-upload-area.dragover {
322
+ border-color: #4169e1;
323
+ background: #f0f4ff;
324
+ }
325
+
326
+ .file-input {
327
+ display: none;
328
+ }
329
+
330
+ .upload-icon {
331
+ font-size: 3rem;
332
+ color: #999;
333
+ margin-bottom: 1rem;
334
+ }
335
+
336
+ .upload-text {
337
+ font-size: 1rem;
338
+ color: #666;
339
+ margin-bottom: 0.5rem;
340
+ }
341
+
342
+ .upload-hint {
343
+ font-size: 0.875rem;
344
+ color: #999;
345
+ }
346
+
347
+ .selected-file {
348
+ display: flex;
349
+ align-items: center;
350
+ gap: 1rem;
351
+ padding: 1rem;
352
+ background: #f8f9ff;
353
+ border-radius: 8px;
354
+ margin-top: 1rem;
355
+ }
356
+
357
+ .file-icon {
358
+ font-size: 2rem;
359
+ }
360
+
361
+ .file-info {
362
+ flex: 1;
363
+ }
364
+
365
+ .file-name {
366
+ font-weight: 500;
367
+ margin-bottom: 0.25rem;
368
+ }
369
+
370
+ .file-size {
371
+ font-size: 0.875rem;
372
+ color: #666;
373
+ }
374
+
375
+ .remove-file {
376
+ background: none;
377
+ border: none;
378
+ color: #999;
379
+ cursor: pointer;
380
+ font-size: 1.5rem;
381
+ padding: 0.25rem;
382
+ }
383
+
384
+ .analyze-btn-container {
385
+ display: flex;
386
+ justify-content: center;
387
+ margin-top: 2rem;
388
+ width: 100%;
389
+ }
390
+
391
+ .analyze-btn {
392
+ background: #4169e1;
393
+ color: white;
394
+ border: none;
395
+ padding: 1rem 3rem;
396
+ border-radius: 8px;
397
+ font-size: 1.1rem;
398
+ font-weight: 600;
399
+ cursor: pointer;
400
+ display: flex;
401
+ align-items: center;
402
+ gap: 0.5rem;
403
+ transition: all 0.3s ease;
404
+ min-width: 200px;
405
+ justify-content: center;
406
+ }
407
+
408
+ .analyze-btn:hover {
409
+ background: #3154c5;
410
+ transform: translateY(-2px);
411
+ box-shadow: 0 4px 12px rgba(49, 84, 197, 0.3);
412
+ }
413
+
414
+ .loading-screen {
415
+ display: none;
416
+ text-align: center;
417
+ padding: 4rem 2rem;
418
+ }
419
+
420
+ .loading-screen.active {
421
+ display: block;
422
+ }
423
+
424
+ .spinner {
425
+ width: 80px;
426
+ height: 80px;
427
+ border: 6px solid #e5e5e5;
428
+ border-top-color: #4169e1;
429
+ border-radius: 50%;
430
+ animation: spin 1s linear infinite;
431
+ margin: 0 auto 2rem;
432
+ }
433
+
434
+ @keyframes spin {
435
+ to { transform: rotate(360deg); }
436
+ }
437
+
438
+ .loading-title {
439
+ font-size: 1.5rem;
440
+ font-weight: 600;
441
+ margin-bottom: 0.5rem;
442
+ }
443
+
444
+ .loading-text {
445
+ color: #666;
446
+ font-size: 1rem;
447
+ }
448
+
449
+ .results-screen {
450
+ display: none;
451
+ }
452
+
453
+ .results-screen.active {
454
+ display: block;
455
+ }
456
+
457
+ .back-to-landing {
458
+ background: none;
459
+ border: none;
460
+ color: #4169e1;
461
+ cursor: pointer;
462
+ font-size: 1rem;
463
+ display: flex;
464
+ align-items: center;
465
+ gap: 0.5rem;
466
+ margin-bottom: 2rem;
467
+ padding: 0.5rem 1rem;
468
+ border-radius: 6px;
469
+ transition: background 0.2s;
470
+ }
471
+
472
+ .back-to-landing:hover {
473
+ background: #f8f9ff;
474
+ }
475
+
476
+ .api-status {
477
+ text-align: center;
478
+ margin: 1rem 0;
479
+ padding: 1rem;
480
+ border-radius: 8px;
481
+ font-size: 0.9rem;
482
+ }
483
+
484
+ .api-status.connected {
485
+ background: #dcfce7;
486
+ color: #16a34a;
487
+ border: 1px solid #bbf7d0;
488
+ }
489
+
490
+ .api-status.disconnected {
491
+ background: #fee;
492
+ color: #dc2626;
493
+ border: 1px solid #fecaca;
494
+ }
495
+
496
+ /* Results screen styles */
497
+ .results-header {
498
+ display: flex;
499
+ justify-content: space-between;
500
+ align-items: center;
501
+ margin-bottom: 2rem;
502
+ }
503
+
504
+ .results-title {
505
+ font-size: 2rem;
506
+ font-weight: 700;
507
+ }
508
+
509
+ .results-actions {
510
+ display: flex;
511
+ gap: 1rem;
512
+ }
513
+
514
+ .btn {
515
+ padding: 0.75rem 1.5rem;
516
+ border-radius: 8px;
517
+ font-size: 0.95rem;
518
+ font-weight: 500;
519
+ cursor: pointer;
520
+ border: none;
521
+ transition: all 0.2s;
522
+ }
523
+
524
+ .btn-primary {
525
+ background: #4169e1;
526
+ color: white;
527
+ }
528
+
529
+ .btn-primary:hover {
530
+ background: #3154c5;
531
+ }
532
+
533
+ .btn-secondary {
534
+ background: white;
535
+ color: #4169e1;
536
+ border: 2px solid #4169e1;
537
+ }
538
+
539
+ .btn-secondary:hover {
540
+ background: #f8f9ff;
541
+ }
542
+
543
+ .results-grid {
544
+ display: grid;
545
+ grid-template-columns: 1fr 2fr;
546
+ gap: 1.5rem;
547
+ margin-bottom: 2rem;
548
+ }
549
+
550
+ .card {
551
+ background: white;
552
+ border-radius: 12px;
553
+ padding: 2rem;
554
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
555
+ }
556
+
557
+ .card-title {
558
+ font-size: 1.25rem;
559
+ font-weight: 600;
560
+ margin-bottom: 1.5rem;
561
+ }
562
+
563
+ .risk-score-container {
564
+ text-align: center;
565
+ }
566
+
567
+ .risk-circle {
568
+ width: 200px;
569
+ height: 200px;
570
+ margin: 0 auto 1rem;
571
+ position: relative;
572
+ }
573
+
574
+ .risk-circle svg {
575
+ transform: rotate(-90deg);
576
+ }
577
+
578
+ .risk-score-value {
579
+ position: absolute;
580
+ top: 50%;
581
+ left: 50%;
582
+ transform: translate(-50%, -50%);
583
+ font-size: 3rem;
584
+ font-weight: 700;
585
+ color: #dc2626;
586
+ }
587
+
588
+ .risk-level {
589
+ display: inline-block;
590
+ padding: 0.5rem 1rem;
591
+ border-radius: 6px;
592
+ font-weight: 600;
593
+ font-size: 0.9rem;
594
+ margin-top: 1rem;
595
+ }
596
+
597
+ .risk-critical {
598
+ background: #fee;
599
+ color: #dc2626;
600
+ }
601
+
602
+ .risk-high {
603
+ background: #fff4e6;
604
+ color: #f97316;
605
+ }
606
+
607
+ .risk-medium {
608
+ background: #fef9c3;
609
+ color: #ca8a04;
610
+ }
611
+
612
+ .risk-low {
613
+ background: #dcfce7;
614
+ color: #16a34a;
615
+ }
616
+
617
+ .executive-summary {
618
+ font-size: 1rem;
619
+ line-height: 1.8;
620
+ color: #444;
621
+ }
622
+
623
+ .three-column-grid {
624
+ display: grid;
625
+ grid-template-columns: repeat(3, 1fr);
626
+ gap: 1.5rem;
627
+ margin-bottom: 2rem;
628
+ }
629
+
630
+ .card-icon {
631
+ font-size: 1.5rem;
632
+ margin-bottom: 0.5rem;
633
+ }
634
+
635
+ .icon-warning { color: #f97316; }
636
+ .icon-shield { color: #dc2626; }
637
+ .icon-book { color: #4169e1; }
638
+
639
+ .item-list {
640
+ list-style: none;
641
+ }
642
+
643
+ .item-list li {
644
+ padding: 0.75rem 0;
645
+ border-bottom: 1px solid #f0f0f0;
646
+ display: flex;
647
+ align-items: flex-start;
648
+ gap: 0.5rem;
649
+ }
650
+
651
+ .item-list li:last-child {
652
+ border-bottom: none;
653
+ }
654
+
655
+ .item-icon {
656
+ color: #4169e1;
657
+ margin-top: 0.25rem;
658
+ }
659
+
660
+ .item-text {
661
+ flex: 1;
662
+ font-size: 0.95rem;
663
+ }
664
+
665
+ .category-breakdown {
666
+ margin-top: 2rem;
667
+ }
668
+
669
+ .category-item {
670
+ margin-bottom: 2rem;
671
+ }
672
+
673
+ .category-header {
674
+ display: flex;
675
+ justify-content: space-between;
676
+ align-items: center;
677
+ margin-bottom: 0.75rem;
678
+ }
679
+
680
+ .category-name {
681
+ font-weight: 600;
682
+ font-size: 1rem;
683
+ }
684
+
685
+ .category-score {
686
+ font-weight: 700;
687
+ font-size: 1.1rem;
688
+ }
689
+
690
+ .score-critical { color: #dc2626; }
691
+ .score-high { color: #f97316; }
692
+ .score-medium { color: #ca8a04; }
693
+ .score-low { color: #16a34a; }
694
+
695
+ .progress-bar {
696
+ height: 8px;
697
+ background: #f0f0f0;
698
+ border-radius: 4px;
699
+ overflow: hidden;
700
+ margin-bottom: 0.5rem;
701
+ }
702
+
703
+ .progress-fill {
704
+ height: 100%;
705
+ transition: width 0.5s ease;
706
+ }
707
+
708
+ .progress-critical { background: #dc2626; }
709
+ .progress-high { background: #f97316; }
710
+ .progress-medium { background: #ca8a04; }
711
+ .progress-low { background: #16a34a; }
712
+
713
+ .category-description {
714
+ font-size: 0.9rem;
715
+ color: #666;
716
+ line-height: 1.6;
717
+ }
718
+
719
+ .clause-analysis {
720
+ margin-top: 2rem;
721
+ }
722
+
723
+ .clause-item {
724
+ border: 1px solid #e5e5e5;
725
+ border-left: 4px solid #dc2626;
726
+ border-radius: 8px;
727
+ padding: 1.5rem;
728
+ margin-bottom: 1rem;
729
+ background: white;
730
+ }
731
+
732
+ .clause-item.high {
733
+ border-left-color: #f97316;
734
+ }
735
+
736
+ .clause-item.medium {
737
+ border-left-color: #ca8a04;
738
+ }
739
+
740
+ .clause-header {
741
+ display: flex;
742
+ justify-content: space-between;
743
+ align-items: flex-start;
744
+ margin-bottom: 1rem;
745
+ }
746
+
747
+ .clause-label {
748
+ font-size: 0.75rem;
749
+ text-transform: uppercase;
750
+ font-weight: 600;
751
+ color: #999;
752
+ margin-bottom: 0.5rem;
753
+ }
754
+
755
+ .clause-text {
756
+ font-size: 0.95rem;
757
+ font-weight: 500;
758
+ color: #333;
759
+ line-height: 1.6;
760
+ }
761
+
762
+ .severity-badge {
763
+ padding: 0.375rem 0.875rem;
764
+ border-radius: 6px;
765
+ font-size: 0.8rem;
766
+ font-weight: 600;
767
+ }
768
+
769
+ .badge-critical {
770
+ background: #fee;
771
+ color: #dc2626;
772
+ }
773
+
774
+ .badge-high {
775
+ background: #fff4e6;
776
+ color: #f97316;
777
+ }
778
+
779
+ .badge-medium {
780
+ background: #fef9c3;
781
+ color: #ca8a04;
782
+ }
783
+
784
+ .clause-section {
785
+ margin-top: 1rem;
786
+ }
787
+
788
+ .clause-section-title {
789
+ font-weight: 600;
790
+ font-size: 0.9rem;
791
+ margin-bottom: 0.5rem;
792
+ color: #333;
793
+ }
794
+
795
+ .clause-section-text {
796
+ font-size: 0.9rem;
797
+ color: #555;
798
+ line-height: 1.7;
799
+ }
800
+
801
+ @media (max-width: 1024px) {
802
+ .features-grid,
803
+ .steps-grid {
804
+ grid-template-columns: 1fr;
805
+ gap: 2rem;
806
+ }
807
+
808
+ .results-grid {
809
+ grid-template-columns: 1fr;
810
+ }
811
+
812
+ .three-column-grid {
813
+ grid-template-columns: 1fr;
814
+ }
815
+ }
816
+
817
+ @media (max-width: 768px) {
818
+ .hero-title {
819
+ font-size: 2.2rem;
820
+ }
821
+
822
+ .hero-title-analyzer {
823
+ font-size: 2rem;
824
+ }
825
+
826
+ .section-title {
827
+ font-size: 1.8rem;
828
+ }
829
+
830
+ .results-header {
831
+ flex-direction: column;
832
+ align-items: flex-start;
833
+ gap: 1rem;
834
+ }
835
+
836
+ .results-actions {
837
+ width: 100%;
838
+ flex-direction: column;
839
+ }
840
+
841
+ .btn {
842
+ width: 100%;
843
+ }
844
+
845
+ .analyze-btn {
846
+ width: 100%;
847
+ padding: 1rem 2rem;
848
+ }
849
+ }
850
+ </style>
851
+ </head>
852
+ <body>
853
+ <!-- Header -->
854
+ <header class="header">
855
+ <div class="logo">
856
+ <div class="logo-icon">✓</div>
857
+ <span>AI Contract Risk Analyzer</span>
858
+ </div>
859
+ <div class="subtitle">Legal Intelligence Platform</div>
860
+ </header>
861
+
862
+ <!-- Landing Screen -->
863
+ <div id="landingScreen" class="landing-screen">
864
+ <!-- Hero Section -->
865
+ <section class="hero-section">
866
+ <div class="container">
867
+ <h1 class="hero-title">Unlock Legal Intelligence<br>Analyze Contracts with AI</h1>
868
+ <p class="hero-subtitle">
869
+ Instantly identify risks, uncover unfavorable terms, and gain actionable negotiation points.
870
+ Our AI-powered platform gives you the clarity and confidence to sign better contracts.
871
+ </p>
872
+ <button class="cta-button" id="getStartedBtn">Try Now for Free</button>
873
+ </div>
874
+ </section>
875
+
876
+ <!-- Main Content Section -->
877
+ <section class="section">
878
+ <div class="container">
879
+ <h2 class="section-title">A Smarter Way to Review Legal Documents</h2>
880
+ <p class="section-subtitle">
881
+ Our platform goes beyond simple keyword searches to provide a deep, contextual understanding of your contracts.
882
+ </p>
883
+
884
+ <div class="features-grid">
885
+ <div class="feature-card">
886
+ <div class="feature-icon">🔍</div>
887
+ <h3 class="feature-title">In-Depth Analysis</h3>
888
+ <p class="feature-description">
889
+ Our AI performs a comprehensive, clause-by-clause review, assessing risk levels and explaining complex legal jargon in plain English.
890
+ </p>
891
+ </div>
892
+
893
+ <div class="feature-card">
894
+ <div class="feature-icon">💡</div>
895
+ <h3 class="feature-title">Actionable Insights</h3>
896
+ <p class="feature-description">
897
+ Receive a prioritized list of negotiation points, suggestions for missing clauses, and clear recommendations to strengthen your position.
898
+ </p>
899
+ </div>
900
+
901
+ <div class="feature-card">
902
+ <div class="feature-icon">🔒</div>
903
+ <h3 class="feature-title">Secure & Confidential</h3>
904
+ <p class="feature-description">
905
+ Your documents are encrypted and processed with the utmost privacy. We never store your contract data after analysis.
906
+ </p>
907
+ </div>
908
+ </div>
909
+ </div>
910
+ </section>
911
+
912
+ <!-- Steps Section -->
913
+ <section class="steps-section">
914
+ <div class="container">
915
+ <h2 class="section-title">Get Your Analysis in 3 Simple Steps</h2>
916
+
917
+ <div class="steps-grid">
918
+ <div class="step-card">
919
+ <div class="step-number">1</div>
920
+ <h3 class="step-title">Upload or Paste</h3>
921
+ <p class="step-description">
922
+ Securely provide your contract by pasting the text or uploading a DOCX/PDF file.
923
+ </p>
924
+ </div>
925
+
926
+ <div class="step-card">
927
+ <div class="step-number">2</div>
928
+ <h3 class="step-title">AI Analyzes</h3>
929
+ <p class="step-description">
930
+ Our intelligent engine scrutinizes every detail of your document in seconds.
931
+ </p>
932
+ </div>
933
+
934
+ <div class="step-card">
935
+ <div class="step-number">3</div>
936
+ <h3 class="step-title">Get Your Report</h3>
937
+ <p class="step-description">
938
+ Receive a comprehensive, easy-to-understand report with your risk score and key findings.
939
+ </p>
940
+ </div>
941
+ </div>
942
+ </div>
943
+ </section>
944
+
945
+ <footer class="footer">
946
+ © 2025 AI Contract Risk Analyzer. For informational purposes only. Not legal advice.
947
+ </footer>
948
+ </div>
949
+
950
+ <!-- Analyzer Screen -->
951
+ <div id="analyzerScreen" class="analyzer-screen">
952
+ <div class="container">
953
+ <button class="back-to-landing" id="backToLandingBtn">
954
+ ← Back to Overview
955
+ </button>
956
+
957
+ <div class="hero-section-analyzer">
958
+ <h1 class="hero-title-analyzer">Analyze Your Contract in Seconds</h1>
959
+ <p class="hero-description">Paste your contract or upload a file to get an instant, AI-powered risk assessment.</p>
960
+ </div>
961
+
962
+ <!-- API Status Indicator -->
963
+ <div id="apiStatus" class="api-status" style="display: none;">
964
+ Checking backend connection...
965
+ </div>
966
+
967
+ <div class="upload-card">
968
+ <div class="tabs">
969
+ <button class="tab active" data-tab="paste">Paste Text</button>
970
+ <button class="tab" data-tab="upload">Upload File</button>
971
+ </div>
972
+
973
+ <div id="pasteTab" class="tab-content active">
974
+ <textarea class="textarea" id="contractText" placeholder="Paste your full contract text here..."></textarea>
975
+ </div>
976
+
977
+ <div id="uploadTab" class="tab-content">
978
+ <div class="file-upload-area" id="fileUploadArea">
979
+ <input type="file" id="fileInput" class="file-input" accept=".pdf,.docx,.txt">
980
+ <div class="upload-icon">📄</div>
981
+ <div class="upload-text">Click to upload or drag and drop</div>
982
+ <div class="upload-hint">PDF, DOCX, or TXT files (Max 10MB)</div>
983
+ </div>
984
+ <div id="selectedFile" class="selected-file" style="display: none;">
985
+ <div class="file-icon">📄</div>
986
+ <div class="file-info">
987
+ <div class="file-name" id="fileName"></div>
988
+ <div class="file-size" id="fileSize"></div>
989
+ </div>
990
+ <button class="remove-file" id="removeFile">×</button>
991
+ </div>
992
+ </div>
993
+
994
+ <div class="analyze-btn-container">
995
+ <button class="analyze-btn" id="analyzeBtn">
996
+ <span>🔍</span>
997
+ <span>Analyze Contract</span>
998
+ </button>
999
+ </div>
1000
+ </div>
1001
+
1002
+ <!-- Loading Screen -->
1003
+ <div id="loadingScreen" class="loading-screen">
1004
+ <div class="spinner"></div>
1005
+ <h2 class="loading-title">Performing in-depth analysis...</h2>
1006
+ <p class="loading-text">This may take a moment for large documents.</p>
1007
+ </div>
1008
+
1009
+ <!-- Results Screen -->
1010
+ <div id="resultsScreen" class="results-screen">
1011
+ <div class="results-header">
1012
+ <h1 class="results-title">Analysis Report</h1>
1013
+ <div class="results-actions">
1014
+ <button class="btn btn-primary" id="downloadBtn">📥 Download PDF Report</button>
1015
+ <button class="btn btn-secondary" id="analyzeAnotherBtn">Analyze Another Contract</button>
1016
+ </div>
1017
+ </div>
1018
+
1019
+ <div class="results-grid">
1020
+ <div class="card">
1021
+ <h2 class="card-title">Overall Risk Score</h2>
1022
+ <div class="risk-score-container">
1023
+ <div class="risk-circle">
1024
+ <svg width="200" height="200">
1025
+ <circle cx="100" cy="100" r="85" fill="none" stroke="#f0f0f0" stroke-width="20"/>
1026
+ <circle id="riskCircle" cx="100" cy="100" r="85" fill="none" stroke="#dc2626" stroke-width="20" stroke-dasharray="534" stroke-dashoffset="534" stroke-linecap="round"/>
1027
+ </svg>
1028
+ <div class="risk-score-value" id="riskScoreValue">0</div>
1029
+ </div>
1030
+ <div class="risk-level" id="riskLevel">NO RISK</div>
1031
+ </div>
1032
+ </div>
1033
+
1034
+ <div class="card">
1035
+ <h2 class="card-title">Executive Summary</h2>
1036
+ <p class="executive-summary" id="executiveSummary">
1037
+ Analysis results will appear here...
1038
+ </p>
1039
+ </div>
1040
+ </div>
1041
+
1042
+ <div class="three-column-grid">
1043
+ <div class="card">
1044
+ <div class="card-icon icon-warning">⚠️</div>
1045
+ <h3 class="card-title">Unfavorable Terms</h3>
1046
+ <ul class="item-list" id="unfavorableTermsList">
1047
+ <li>No unfavorable terms detected yet</li>
1048
+ </ul>
1049
+ </div>
1050
+
1051
+ <div class="card">
1052
+ <div class="card-icon icon-shield">🛡️</div>
1053
+ <h3 class="card-title">Missing Protections</h3>
1054
+ <ul class="item-list" id="missingProtectionsList">
1055
+ <li>No missing protections detected yet</li>
1056
+ </ul>
1057
+ </div>
1058
+
1059
+ <div class="card">
1060
+ <div class="card-icon icon-book">📖</div>
1061
+ <h3 class="card-title">Negotiation Points</h3>
1062
+ <ul class="item-list" id="negotiationPointsList">
1063
+ <li>No negotiation points generated yet</li>
1064
+ </ul>
1065
+ </div>
1066
+ </div>
1067
+
1068
+ <div class="card category-breakdown">
1069
+ <h2 class="card-title">Risk Category Breakdown</h2>
1070
+ <div id="categoryBreakdown">
1071
+ <div class="category-item">
1072
+ <div class="category-header">
1073
+ <span class="category-name">Waiting for analysis...</span>
1074
+ <span class="category-score score-low">0/100</span>
1075
+ </div>
1076
+ <div class="progress-bar">
1077
+ <div class="progress-fill progress-low" style="width: 0%"></div>
1078
+ </div>
1079
+ </div>
1080
+ </div>
1081
+ </div>
1082
+
1083
+ <div class="card clause-analysis">
1084
+ <h2 class="card-title">Clause-by-Clause Analysis</h2>
1085
+ <div id="clauseAnalysis">
1086
+ <div class="clause-item">
1087
+ <div class="clause-header">
1088
+ <div>
1089
+ <div class="clause-label">STATUS</div>
1090
+ <div class="clause-text">Upload a contract to begin analysis</div>
1091
+ </div>
1092
+ </div>
1093
+ </div>
1094
+ </div>
1095
+ </div>
1096
+ </div>
1097
+ </div>
1098
+ </div>
1099
+
1100
+ <script>
1101
+ const API_BASE_URL = window.location.hostname === 'localhost'
1102
+ ? 'http://localhost:8000/api/v1'
1103
+ : '/api/v1';
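+ // Absolute localhost URL during local development; a same-origin relative
+ // path ('/api/v1') when the page is served by the API itself.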
1104
+
1105
+ let selectedFile = null;
1106
+ let currentJobId = null;
1107
+ let pollInterval = null;
1108
+
1109
+ // Screen management
1110
+ function showScreen(screenName) {
1111
+ document.getElementById('landingScreen').style.display = 'none';
1112
+ document.getElementById('analyzerScreen').style.display = 'none';
1113
+ document.getElementById('loadingScreen').classList.remove('active');
1114
+ document.getElementById('resultsScreen').classList.remove('active');
1115
+
1116
+ if (screenName === 'landing') {
1117
+ document.getElementById('landingScreen').style.display = 'block';
1118
+ } else if (screenName === 'analyzer') {
1119
+ document.getElementById('analyzerScreen').style.display = 'block';
1120
+ checkBackendConnection();
1121
+ } else if (screenName === 'loading') {
1122
+ document.getElementById('analyzerScreen').style.display = 'block';
1123
+ document.getElementById('loadingScreen').classList.add('active');
1124
+ } else if (screenName === 'results') {
1125
+ document.getElementById('analyzerScreen').style.display = 'block';
1126
+ document.getElementById('resultsScreen').classList.add('active');
1127
+ }
1128
+ }
1129
+
1130
+ // Check backend connection
1131
+ async function checkBackendConnection() {
1132
+ const statusElement = document.getElementById('apiStatus');
1133
+ statusElement.style.display = 'block';
1134
+ statusElement.textContent = 'Checking backend connection...';
1135
+ statusElement.className = 'api-status';
1136
+
1137
+ try {
1138
+ const response = await fetch(`${API_BASE_URL}/health`, {
1139
+ method: 'GET',
1140
+ headers: {
1141
+ 'Accept': 'application/json'
1142
+ }
1143
+ });
1144
+
1145
+ if (response.ok) {
1146
+ statusElement.textContent = '✓ Backend connected successfully';
1147
+ statusElement.className = 'api-status connected';
1148
+ } else {
1149
+ throw new Error('Backend not responding properly');
1150
+ }
1151
+ } catch (error) {
1152
+ console.error('Backend connection failed:', error);
1153
+ statusElement.textContent = '✗ Cannot connect to backend. Make sure the server is running on port 8000.';
1154
+ statusElement.className = 'api-status disconnected';
1155
+
1156
+ setTimeout(() => {
1157
+ statusElement.style.display = 'none';
1158
+ }, 5000);
1159
+ }
1160
+ }
1161
+
1162
+ // Navigation
1163
+ document.getElementById('getStartedBtn').addEventListener('click', () => {
1164
+ showScreen('analyzer');
1165
+ });
1166
+
1167
+ document.getElementById('backToLandingBtn').addEventListener('click', () => {
1168
+ showScreen('landing');
1169
+ });
1170
+
1171
+ // Tab switching
1172
+ document.querySelectorAll('.tab').forEach(tab => {
1173
+ tab.addEventListener('click', (e) => {
1174
+ e.preventDefault();
1175
+ const tabName = tab.dataset.tab;
1176
+
1177
+ document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
1178
+ document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
1179
+
1180
+ tab.classList.add('active');
1181
+ document.getElementById(tabName + 'Tab').classList.add('active');
1182
+ });
1183
+ });
1184
+
1185
+ // File upload handling
1186
+ const fileUploadArea = document.getElementById('fileUploadArea');
1187
+ const fileInput = document.getElementById('fileInput');
1188
+ const selectedFileDiv = document.getElementById('selectedFile');
1189
+ const fileNameSpan = document.getElementById('fileName');
1190
+ const fileSizeSpan = document.getElementById('fileSize');
1191
+ const removeFileBtn = document.getElementById('removeFile');
1192
+
1193
+ fileUploadArea.addEventListener('click', () => fileInput.click());
1194
+
1195
+ fileUploadArea.addEventListener('dragover', (e) => {
1196
+ e.preventDefault();
1197
+ fileUploadArea.classList.add('dragover');
1198
+ });
1199
+
1200
+ fileUploadArea.addEventListener('dragleave', () => {
1201
+ fileUploadArea.classList.remove('dragover');
1202
+ });
1203
+
1204
+ fileUploadArea.addEventListener('drop', (e) => {
1205
+ e.preventDefault();
1206
+ fileUploadArea.classList.remove('dragover');
1207
+ const file = e.dataTransfer.files[0];
1208
+ handleFileSelect(file);
1209
+ });
1210
+
1211
+ fileInput.addEventListener('change', (e) => {
1212
+ const file = e.target.files[0];
1213
+ handleFileSelect(file);
1214
+ });
1215
+
1216
+ removeFileBtn.addEventListener('click', (e) => {
1217
+ e.stopPropagation();
1218
+ selectedFile = null;
1219
+ fileInput.value = '';
1220
+ selectedFileDiv.style.display = 'none';
1221
+ fileUploadArea.style.display = 'block';
1222
+ });
1223
+
1224
+ function handleFileSelect(file) {
1225
+ if (!file) return;
1226
+
1227
+ const validTypes = [
1228
+ 'application/pdf',
1229
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
1230
+ 'text/plain'
1231
+ ];
1232
+
1233
+ const isValidType = validTypes.includes(file.type) ||
1234
+ file.name.match(/\.(pdf|docx|txt)$/i);
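+ // Fall back to the file extension: some browsers/OSes report an empty or
+ // generic MIME type for .docx and .txt uploads.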
1235
+
1236
+ if (!isValidType) {
1237
+ alert('Please upload a PDF, DOCX, or TXT file');
1238
+ return;
1239
+ }
1240
+
1241
+ if (file.size > 10 * 1024 * 1024) {
1242
+ alert('File size must be 10 MB or smaller');
1243
+ return;
1244
+ }
1245
+
1246
+ selectedFile = file;
1247
+ fileNameSpan.textContent = file.name;
1248
+ fileSizeSpan.textContent = formatFileSize(file.size);
1249
+ fileUploadArea.style.display = 'none';
1250
+ selectedFileDiv.style.display = 'flex';
1251
+ }
1252
+
1253
+ function formatFileSize(bytes) {
1254
+ if (bytes < 1024) return bytes + ' B';
1255
+ if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(2) + ' KB';
1256
+ return (bytes / (1024 * 1024)).toFixed(2) + ' MB';
1257
+ }
1258
+
1259
+ // Analyze button
1260
+ document.getElementById('analyzeBtn').addEventListener('click', async () => {
1261
+ const activeTab = document.querySelector('.tab.active').dataset.tab;
1262
+ const analyzeBtn = document.getElementById('analyzeBtn');
1263
+
1264
+ try {
1265
+ analyzeBtn.disabled = true;
1266
+ analyzeBtn.innerHTML = '<span>⏳</span><span>Processing...</span>';
1267
+
1268
+ if (activeTab === 'paste') {
1269
+ const text = document.getElementById('contractText').value.trim();
1270
+ if (!text) {
1271
+ alert('Please paste contract text');
1272
+ return;
1273
+ }
1274
+ const blob = new Blob([text], { type: 'text/plain' });
1275
+ const file = new File([blob], 'contract.txt', { type: 'text/plain' });
1276
+ await analyzeContract(file);
1277
+ } else {
1278
+ if (!selectedFile) {
1279
+ alert('Please select a file');
1280
+ return;
1281
+ }
1282
+ await analyzeContract(selectedFile);
1283
+ }
1284
+ } catch (error) {
1285
+ console.error('Analysis error:', error);
1286
+ alert('Error starting analysis: ' + error.message);
1287
+ } finally {
1288
+ analyzeBtn.disabled = false;
1289
+ analyzeBtn.innerHTML = '<span>🔍</span><span>Analyze Contract</span>';
1290
+ }
1291
+ });
1292
+
1293
+ async function analyzeContract(file) {
1294
+ try {
1295
+ showScreen('loading');
1296
+
1297
+ const formData = new FormData();
1298
+ formData.append('file', file);
1299
+ formData.append('max_clauses', '15');
1300
+ formData.append('interpret_clauses', 'true');
1301
+ formData.append('generate_negotiation_points', 'true');
1302
+ formData.append('compare_to_market', 'true');
1303
+ formData.append('llm_provider', 'ollama');
1304
+
1305
+ const response = await fetch(`${API_BASE_URL}/analyze`, {
1306
+ method: 'POST',
1307
+ body: formData
1308
+ });
1309
+
1310
+ if (!response.ok) {
1311
+ let errorDetail = 'Analysis failed';
1312
+ try {
1313
+ const errorData = await response.json();
1314
+ errorDetail = errorData.detail || errorData.error || errorDetail;
1315
+ } catch (e) {
1316
+ errorDetail = `Server error: ${response.status} ${response.statusText}`;
1317
+ }
1318
+ throw new Error(errorDetail);
1319
+ }
1320
+
1321
+ const job = await response.json();
1322
+ currentJobId = job.job_id;
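+ 
+ // Poll the job status every 2 s; pollJobStatus() clears this interval once
+ // the job reports "completed" or "failed".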
1323
+
1324
+ pollInterval = setInterval(() => pollJobStatus(currentJobId), 2000);
1325
+
1326
+ } catch (error) {
1327
+ console.error('Error:', error);
1328
+ alert('Error analyzing contract: ' + error.message);
1329
+ showScreen('analyzer');
1330
+ }
1331
+ }
1332
+
1333
+ async function pollJobStatus(jobId) {
1334
+ try {
1335
+ const response = await fetch(`${API_BASE_URL}/jobs/${jobId}`);
1336
+ if (!response.ok) throw new Error('Failed to fetch job status');
1337
+
1338
+ const job = await response.json();
1339
+
1340
+ if (job.status === 'completed') {
1341
+ clearInterval(pollInterval);
1342
+ displayResults(job.result);
1343
+ showScreen('results');
1344
+ } else if (job.status === 'failed') {
1345
+ clearInterval(pollInterval);
1346
+ alert('Analysis failed: ' + job.error);
1347
+ showScreen('analyzer');
1348
+ }
1349
+ } catch (error) {
1350
+ console.error('Polling error:', error);
1351
+ }
1352
+ }
1353
+
1354
+ function displayResults(result) {
1355
+ const score = result.risk_analysis.overall_score;
1356
+ const riskLevel = result.risk_analysis.risk_level;
1357
+
1358
+ document.getElementById('riskScoreValue').textContent = score;
1359
+ document.getElementById('riskLevel').textContent = riskLevel.toUpperCase();
1360
+ document.getElementById('riskLevel').className = 'risk-level risk-' + getRiskClass(score);
1361
+
1362
+ const circumference = 534;
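+ // 534 ≈ 2π × 85, the circumference of the r=85 progress ring; reducing the
+ // dash offset from 534 toward 0 fills the ring in proportion to the score.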
1363
+ const offset = circumference - (score / 100) * circumference;
1364
+ const circle = document.getElementById('riskCircle');
1365
+ circle.style.strokeDashoffset = offset;
1366
+ circle.style.stroke = getRiskColor(score);
1367
+
1368
+ document.getElementById('executiveSummary').textContent = result.executive_summary;
1369
+
1370
+ // Populate the unfavorable terms list
1371
+ const unfavorableList = document.getElementById('unfavorableTermsList');
1372
+ unfavorableList.innerHTML = '';
1373
+ if (result.unfavorable_terms && result.unfavorable_terms.length > 0) {
1374
+ result.unfavorable_terms.slice(0, 8).forEach(term => {
1375
+ const li = document.createElement('li');
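+ // NOTE: term.term / term.explanation are injected into innerHTML verbatim;
+ // escape them first if the analyzer output is ever treated as untrusted.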
1376
+ li.innerHTML = `<span class="item-icon">›</span><span class="item-text"><strong>${term.term}:</strong> ${term.explanation}</span>`;
1377
+ unfavorableList.appendChild(li);
1378
+ });
1379
+ } else {
1380
+ unfavorableList.innerHTML = '<li>No unfavorable terms detected</li>';
1381
+ }
1382
+
1383
+ // Similar updates for other sections...
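+ // A minimal sketch for the remaining lists, assuming the response exposes
+ // `missing_protections` and `negotiation_points` arrays (field names are
+ // assumptions, mirrored from the request flags above):
+ //
+ // const fill = (id, items, empty) => {
+ //     const ul = document.getElementById(id);
+ //     ul.innerHTML = (items && items.length)
+ //         ? items.slice(0, 8).map(t => `<li><span class="item-icon">›</span><span class="item-text">${t}</span></li>`).join('')
+ //         : `<li>${empty}</li>`;
+ // };
+ // fill('missingProtectionsList', result.missing_protections, 'No missing protections detected');
+ // fill('negotiationPointsList', result.negotiation_points, 'No negotiation points generated');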
1384
+ }
1385
+
1386
+ function getRiskClass(score) {
1387
+ if (score >= 80) return 'critical';
1388
+ if (score >= 60) return 'high';
1389
+ if (score >= 40) return 'medium';
1390
+ return 'low';
1391
+ }
1392
+
1393
+ function getRiskColor(score) {
1394
+ if (score >= 80) return '#dc2626';
1395
+ if (score >= 60) return '#f97316';
1396
+ if (score >= 40) return '#ca8a04';
1397
+ return '#16a34a';
1398
+ }
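+ 
+ // The Download PDF and "Analyze Another" buttons above have no handlers yet;
+ // a minimal sketch, assuming a /jobs/{id}/report endpoint (endpoint path is
+ // an assumption):
+ //
+ // document.getElementById('analyzeAnotherBtn').addEventListener('click', () => showScreen('analyzer'));
+ // document.getElementById('downloadBtn').addEventListener('click', () => {
+ //     if (currentJobId) window.open(`${API_BASE_URL}/jobs/${currentJobId}/report`, '_blank');
+ // });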
1399
+
1400
+ // Initialize
1401
+ showScreen('landing');
1402
+ </script>
1403
+ </body>
1404
+ </html>
static/style.css DELETED
File without changes
utils/text_processor.py CHANGED
@@ -54,13 +54,17 @@ class TextProcessor:
54
  """
55
  Normalize text for analysis
56
 
57
- Args:
58
- text: Input text
59
- lowercase: Convert to lowercase
60
- remove_special_chars: Remove special characters
 
 
 
61
 
62
  Returns:
63
- Normalized text
 
64
  """
65
  if lowercase:
66
  text = text.lower()
@@ -74,17 +78,21 @@ class TextProcessor:
74
 
75
  return text.strip()
76
 
 
77
  @staticmethod
78
  def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
79
  """
80
  Split text into paragraphs
81
 
82
- Args:
83
- text: Input text
84
- min_length: Minimum paragraph length in characters
 
 
85
 
86
  Returns:
87
- List of paragraphs
 
88
  """
89
  # Split on double newlines
90
  paragraphs = re.split(r'\n\s*\n', text)
@@ -92,17 +100,21 @@ class TextProcessor:
92
  # Filter short and empty paragraphs
93
  return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
94
 
 
95
  @staticmethod
96
  def extract_sentences(text: str, min_length: int = 10) -> List[str]:
97
  """
98
  Extract sentences from text (basic method)
99
 
100
- Args:
101
- text: Input text
102
- min_length: Minimum sentence length in characters
 
 
103
 
104
  Returns:
105
- List of sentences
 
106
  """
107
  # Simple sentence splitting on .!?
108
  sentences = re.split(r'[.!?]+', text)
@@ -112,6 +124,7 @@ class TextProcessor:
112
 
113
  return sentences
114
 
 
115
  def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
116
  """
117
  Extract sentences with NER and metadata using spaCy
@@ -125,87 +138,87 @@ class TextProcessor:
125
  if not self.nlp:
126
  # Fallback to basic extraction
127
  basic_sentences = self.extract_sentences(text)
128
- return [{"text": s, "entities": [], "start_char": 0, "end_char": 0}
129
- for s in basic_sentences]
130
 
131
- doc = self.nlp(text[:100000]) # Limit to 100K chars for performance
132
- sentences = []
 
133
 
134
  for sent in doc.sents:
135
- sentences.append({
136
- "text": sent.text.strip(),
137
- "entities": [(ent.text, ent.label_) for ent in sent.ents],
138
- "start_char": sent.start_char,
139
- "end_char": sent.end_char,
140
- "tokens": [token.text for token in sent]
141
- })
142
 
143
  return sentences
144
 
145
- # =========================================================================
146
- # LEGAL-SPECIFIC EXTRACTION
147
- # =========================================================================
148
 
149
  @staticmethod
150
  def extract_legal_entities(text: str) -> Dict[str, List[str]]:
151
  """
152
  Extract legal-specific entities (parties, dates, amounts, references)
153
 
154
- Args:
155
- text: Input text
 
156
 
157
  Returns:
158
- Dictionary of extracted entities by type
159
- """
160
- entities = {
161
- "parties": [],
162
- "dates": [],
163
- "amounts": [],
164
- "addresses": [],
165
- "references": [],
166
- "emails": [],
167
- "phone_numbers": []
168
- }
169
 
170
  # Party names (PARTY A, "the Employee", Company Name Inc.)
171
- party_patterns = [
172
- r'(?:PARTY|Party)\s+[A-Z]',
173
- r'"the\s+\w+"',
174
- r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd|Limited|Company)\.?',
175
- r'(?:the\s+)?(Employer|Employee|Consultant|Contractor|Client|Vendor|Supplier|Landlord|Tenant|Buyer|Seller)',
176
- ]
177
  for pattern in party_patterns:
178
  matches = re.findall(pattern, text)
 
179
  entities["parties"].extend(matches)
180
 
181
  # Dates (various formats)
182
- date_patterns = [
183
- r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
184
- r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
185
- r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
186
- ]
187
  for pattern in date_patterns:
188
  matches = re.findall(pattern, text, re.IGNORECASE)
 
189
  entities["dates"].extend(matches)
190
 
191
  # Legal references (Section 5.2, Clause 11.1, Article III)
192
- ref_patterns = [
193
- r'(?:Section|Clause|Article|Paragraph|Exhibit|Schedule|Appendix)\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+)',
194
- ]
195
  for pattern in ref_patterns:
196
  matches = re.findall(pattern, text, re.IGNORECASE)
 
197
  entities["references"].extend(matches)
198
 
199
  # Monetary amounts
200
- entities["amounts"] = TextProcessor.extract_monetary_amounts(text)
201
 
202
  # Email addresses
203
- email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
204
- entities["emails"] = re.findall(email_pattern, text)
205
 
206
  # Phone numbers (US format)
207
- phone_pattern = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
208
- phone_matches = re.findall(phone_pattern, text)
209
  entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
210
 
211
  # Deduplicate
@@ -214,37 +227,46 @@ class TextProcessor:
214
 
215
  return entities
216
 
 
217
  @staticmethod
218
  def count_words(text: str) -> int:
219
- """Count words in text"""
 
 
220
  return len(text.split())
221
 
 
222
  @staticmethod
223
  def extract_numbers(text: str) -> List[str]:
224
- """Extract all numbers from text"""
 
 
225
  return re.findall(r'\d+', text)
226
 
 
227
  @staticmethod
228
  def extract_monetary_amounts(text: str) -> List[str]:
229
  """
230
  Extract monetary amounts from text
231
 
232
  Returns:
233
- List of monetary amounts (e.g., ['$1,000', '$2,500.00'])
 
234
  """
235
  # Match patterns like $1,000 or $1000.00 or USD 1,000
236
- patterns = [
237
- r'\$[\d,]+(?:\.\d{2})?',
238
- r'USD\s*[\d,]+(?:\.\d{2})?',
239
- r'EUR\s*[\d,]+(?:\.\d{2})?',
240
- r'GBP\s*[\d,]+(?:\.\d{2})?'
241
- ]
242
-
243
- amounts = []
244
  for pattern in patterns:
245
  amounts.extend(re.findall(pattern, text, re.IGNORECASE))
246
 
247
  return amounts
 
248
 
249
  @staticmethod
250
  def extract_durations(text: str) -> List[Dict[str, str]]:
@@ -252,150 +274,149 @@ class TextProcessor:
252
  Extract time durations (e.g., "6 months", "2 years")
253
 
254
  Returns:
255
- List of duration dictionaries with 'amount' and 'unit'
 
256
  """
257
  pattern = r'(\d+)\s*(day|week|month|year)s?'
258
  matches = re.findall(pattern, text, re.IGNORECASE)
259
 
260
- return [
261
- {"amount": m[0], "unit": m[1].lower()}
262
- for m in matches
263
- ]
264
-
265
  @staticmethod
266
  def extract_percentages(text: str) -> List[str]:
267
- """Extract percentages from text"""
 
 
268
  return re.findall(r'\d+(?:\.\d+)?%', text)
269
 
270
- # =========================================================================
271
- # TEXT CHUNKING FOR EMBEDDINGS
272
- # =========================================================================
273
-
274
  @staticmethod
275
- def chunk_text_for_embedding(text: str,
276
- chunk_size: int = 512,
277
- overlap: int = 50) -> List[Dict[str, Any]]:
278
  """
279
  Chunk text with overlap for embedding models (preserves sentence boundaries)
280
 
281
- Args:
282
- text: Input text
283
- chunk_size: Maximum chunk size in words
284
- overlap: Number of words to overlap between chunks
 
 
 
285
 
286
  Returns:
287
- List of chunk dictionaries with metadata
 
288
  """
289
- sentences = TextProcessor.extract_sentences(text)
290
- chunks = []
291
- current_chunk = []
292
- current_length = 0
293
  start_sentence_idx = 0
294
 
295
  for i, sentence in enumerate(sentences):
296
- sentence_words = sentence.split()
297
  sentence_length = len(sentence_words)
298
 
299
- if current_length + sentence_length > chunk_size and current_chunk:
300
  # Save current chunk
301
- chunks.append({
302
- "text": " ".join(current_chunk),
303
- "start_sentence": start_sentence_idx,
304
- "end_sentence": i - 1,
305
- "word_count": current_length,
306
- "chunk_id": len(chunks)
307
- })
308
 
309
  # Start new chunk with overlap
310
- overlap_sentences = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
311
- current_chunk = overlap_sentences + [sentence]
312
- current_length = sum(len(s.split()) for s in current_chunk)
313
  start_sentence_idx = max(0, i - len(overlap_sentences))
 
314
  else:
315
  current_chunk.append(sentence)
316
  current_length += sentence_length
317
 
318
  # Add final chunk
319
  if current_chunk:
320
- chunks.append({
321
- "text": " ".join(current_chunk),
322
- "start_sentence": start_sentence_idx,
323
- "end_sentence": len(sentences) - 1,
324
- "word_count": current_length,
325
- "chunk_id": len(chunks)
326
- })
327
 
328
  return chunks
329
 
330
- # =========================================================================
331
- # TEXT SIMILARITY & DEDUPLICATION
332
- # =========================================================================
333
-
334
  @staticmethod
335
  def text_similarity(text1: str, text2: str) -> float:
336
  """
337
  Calculate similarity between two texts (0-1 scale)
338
 
339
- Args:
340
- text1: First text
341
- text2: Second text
 
 
342
 
343
  Returns:
344
- Similarity score (0.0 = completely different, 1.0 = identical)
 
345
  """
346
  return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
347
 
 
348
  @staticmethod
349
  def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
350
  """
351
  Remove near-duplicate clauses
352
 
353
- Args:
354
- clauses: List of clause texts
355
- threshold: Similarity threshold for deduplication (0.0-1.0)
 
 
356
 
357
  Returns:
358
- List of unique clauses
 
359
  """
360
- unique = []
361
 
362
  for clause in clauses:
363
- is_duplicate = any(
364
- TextProcessor.text_similarity(clause, existing) > threshold
365
- for existing in unique
366
- )
367
  if not is_duplicate:
368
  unique.append(clause)
369
 
370
  return unique
371
 
372
- # =========================================================================
373
- # LANGUAGE DETECTION
374
- # =========================================================================
375
-
376
  @staticmethod
377
  def detect_language(text: str) -> str:
378
  """
379
  Detect text language
380
 
381
- Args:
382
- text: Input text
 
383
 
384
  Returns:
385
- ISO 639-1 language code (e.g., 'en', 'es', 'fr')
 
386
  """
387
  if not LANGDETECT_AVAILABLE:
388
- return "en" # Default to English
 
389
 
390
  try:
391
  # Use first 1000 chars for detection
392
  return detect(text[:1000])
 
393
  except LangDetectException:
394
  return "en"
395
 
396
- # =========================================================================
397
- # TEXT STATISTICS
398
- # =========================================================================
399
 
400
  @staticmethod
401
  def get_text_statistics(text: str) -> Dict[str, Any]:
@@ -403,49 +424,46 @@ class TextProcessor:
403
  Get comprehensive text statistics
404
 
405
  Returns:
406
- Dictionary with character count, word count, sentence count, etc.
 
407
  """
408
- sentences = TextProcessor.extract_sentences(text)
409
  paragraphs = TextProcessor.split_into_paragraphs(text)
410
- words = text.split()
411
-
412
- return {
413
- "character_count": len(text),
414
- "word_count": len(words),
415
- "sentence_count": len(sentences),
416
- "paragraph_count": len(paragraphs),
417
- "avg_words_per_sentence": len(words) / len(sentences) if sentences else 0,
418
- "avg_chars_per_word": len(text) / len(words) if words else 0,
419
- "language": TextProcessor.detect_language(text)
420
- }
421
-
422
- # =========================================================================
423
- # KEYWORD HIGHLIGHTING
424
- # =========================================================================
425
 
 
426
  @staticmethod
427
- def highlight_keywords(text: str, keywords: List[str],
428
- highlight_format: str = "**{}**") -> str:
429
  """
430
  Highlight keywords in text (for display purposes)
431
 
432
- Args:
433
- text: Input text
434
- keywords: List of keywords to highlight
435
- highlight_format: Format string with {} placeholder (default: Markdown bold)
 
 
 
436
 
437
  Returns:
438
- Text with highlighted keywords
 
439
  """
440
  for keyword in keywords:
441
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
442
- text = pattern.sub(lambda m: highlight_format.format(m.group(0)), text)
443
 
444
  return text
445
 
446
- # =========================================================================
447
- # CLAUSE SEGMENTATION HELPERS
448
- # =========================================================================
449
 
450
  @staticmethod
451
  def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
@@ -453,57 +471,61 @@ class TextProcessor:
453
  Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)
454
 
455
  Returns:
456
- List of section dictionaries with number and text
 
457
  """
458
- patterns = [
459
- (r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\n\s*\d+\.\d+|\n\n|$)', 'numbered'),
460
- (r'(Article\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nArticle|\n\n|$)', 'article'),
461
- (r'(Section\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nSection|\n\n|$)', 'section'),
462
- (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\nClause|\n\n|$)', 'clause'),
463
- ]
464
 
465
- sections = []
 
466
  for pattern, section_type in patterns:
467
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
 
468
  for match in matches:
469
- sections.append({
470
- "reference": match.group(1).strip(),
471
- "text": match.group(2).strip(),
472
- "type": section_type,
473
- "start_pos": match.start(),
474
- "end_pos": match.end()
475
- })
476
 
477
  # Sort by position
478
- sections.sort(key=lambda x: x['start_pos'])
479
 
480
  return sections
481
 
 
482
  @staticmethod
483
  def clean_legal_text(text: str) -> str:
484
  """
485
  Clean legal text by removing boilerplate artifacts
486
 
487
- Args:
488
- text: Input legal text
 
489
 
490
  Returns:
491
- Cleaned text
 
492
  """
493
  # Remove "Page X of Y" markers
494
- text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
495
 
496
  # Remove "[Signature Page Follows]" type markers
497
- text = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', text, flags=re.IGNORECASE)
498
 
499
  # Remove excessive underscores (signature lines)
500
  text = re.sub(r'_{3,}', '', text)
501
 
502
  # Remove "CONFIDENTIAL" watermarks
503
- text = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', text, flags=re.IGNORECASE)
504
 
505
  # Clean up resulting whitespace
506
  text = re.sub(r'\n{3,}', '\n\n', text)
507
  text = re.sub(r' {2,}', ' ', text)
508
 
509
- return text.strip()
 
54
  """
55
  Normalize text for analysis
56
 
57
+ Arguments:
58
+ ----------
59
+ text { str } : Input text
60
+
61
+ lowercase { bool } : Convert to lowercase
62
+
63
+ remove_special_chars { bool } : Remove special characters
64
 
65
  Returns:
66
+ --------
67
+ { str } : Normalized text
68
  """
69
  if lowercase:
70
  text = text.lower()
 
78
 
79
  return text.strip()
80
 
81
+
82
  @staticmethod
83
  def split_into_paragraphs(text: str, min_length: int = 20) -> List[str]:
84
  """
85
  Split text into paragraphs
86
 
87
+ Arguments:
88
+ ----------
89
+ text { str } : Input text
90
+
91
+ min_length { int } : Minimum paragraph length in characters
92
 
93
  Returns:
94
+ --------
95
+ { list } : List of paragraphs
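+ 
+ Example (illustrative): a text of three blocks separated by blank lines,
+ where the middle block is just the two characters "OK", yields only the
+ first and last blocks, since "OK" falls below the default min_length of 20.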
96
  """
97
  # Split on double newlines
98
  paragraphs = re.split(r'\n\s*\n', text)
 
100
  # Filter short and empty paragraphs
101
  return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
102
 
103
+
104
  @staticmethod
105
  def extract_sentences(text: str, min_length: int = 10) -> List[str]:
106
  """
107
  Extract sentences from text (basic method)
108
 
109
+ Arguments:
110
+ ----------
111
+ text { str } : Input text
112
+
113
+ min_length { int } : Minimum sentence length in characters
114
 
115
  Returns:
116
+ --------
117
+ { list } : List of sentences
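+ 
+ Example (illustrative): "The term is 12 months. Renewal is automatic. OK."
+ yields the first two sentences; the trailing "OK" fragment is dropped as
+ shorter than min_length.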
118
  """
119
  # Simple sentence splitting on .!?
120
  sentences = re.split(r'[.!?]+', text)
 
124
 
125
  return sentences
126
 
127
+
128
  def extract_sentences_advanced(self, text: str) -> List[Dict[str, Any]]:
129
  """
130
  Extract sentences with NER and metadata using spaCy
 
138
  if not self.nlp:
139
  # Fallback to basic extraction
140
  basic_sentences = self.extract_sentences(text)
141
+
142
+ return [{"text" : s, "entities" : [], "start_char" : 0, "end_char" : 0} for s in basic_sentences]
143
 
144
+ # Limit to 100K chars for performance
145
+ doc = self.nlp(text[:100000])
146
+ sentences = list()
147
 
148
  for sent in doc.sents:
149
+ sentences.append({"text" : sent.text.strip(),
150
+ "entities" : [(ent.text, ent.label_) for ent in sent.ents],
151
+ "start_char" : sent.start_char,
152
+ "end_char" : sent.end_char,
153
+ "tokens" : [token.text for token in sent],
154
+ })
 
155
 
156
  return sentences
157
 
 
 
 
158
 
159
  @staticmethod
160
  def extract_legal_entities(text: str) -> Dict[str, List[str]]:
161
  """
162
  Extract legal-specific entities (parties, dates, amounts, references)
163
 
164
+ Arguments:
165
+ ----------
166
+ text { str } : Input text
167
 
168
  Returns:
169
+ --------
170
+ { dict } : Dictionary of extracted entities by type
171
+ """
172
+ entities = {"parties" : [],
173
+ "dates" : [],
174
+ "amounts" : [],
175
+ "addresses" : [],
176
+ "references" : [],
177
+ "emails" : [],
178
+ "phone_numbers" : [],
179
+ }
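+ 
+ # NOTE: "addresses" is initialised above, but no extractor below populates it yet.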
180
 
181
  # Party names (PARTY A, "the Employee", Company Name Inc.)
182
+ party_patterns = [r'(?:PARTY|Party)\s+[A-Z]',
183
+ r'"the\s+\w+"',
184
+ r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd|Limited|Company)\.?',
185
+ r'(?:the\s+)?(Employer|Employee|Consultant|Contractor|Client|Vendor|Supplier|Landlord|Tenant|Buyer|Seller)',
186
+ ]
187
+
188
  for pattern in party_patterns:
189
  matches = re.findall(pattern, text)
190
+
191
  entities["parties"].extend(matches)
192
 
193
  # Dates (various formats)
194
+ date_patterns = [r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
195
+ r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
196
+ r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
197
+ ]
198
+
199
  for pattern in date_patterns:
200
  matches = re.findall(pattern, text, re.IGNORECASE)
201
+
202
  entities["dates"].extend(matches)
203
 
204
  # Legal references (Section 5.2, Clause 11.1, Article III)
205
+ ref_patterns = [r'(?:Section|Clause|Article|Paragraph|Exhibit|Schedule|Appendix)\s+(?:\d+(?:\.\d+)*|[IVXLCDM]+)']
206
+
 
207
  for pattern in ref_patterns:
208
  matches = re.findall(pattern, text, re.IGNORECASE)
209
+
210
  entities["references"].extend(matches)
211
 
212
  # Monetary amounts
213
+ entities["amounts"] = TextProcessor.extract_monetary_amounts(text)
214
 
215
  # Email addresses
216
+ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
217
+ entities["emails"] = re.findall(email_pattern, text)
218
 
219
  # Phone numbers (US format)
220
+ phone_pattern = r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
221
+ phone_matches = re.findall(phone_pattern, text)
222
  entities["phone_numbers"] = ['-'.join(match) for match in phone_matches]
223
 
224
  # Deduplicate
 
227
 
228
  return entities
229
 
230
+
231
  @staticmethod
232
  def count_words(text: str) -> int:
233
+ """
234
+ Count words in text
235
+ """
236
  return len(text.split())
237
 
238
+
239
  @staticmethod
240
  def extract_numbers(text: str) -> List[str]:
241
+ """
242
+ Extract all numbers from text
243
+ """
244
  return re.findall(r'\d+', text)
245
 
246
+
247
  @staticmethod
248
  def extract_monetary_amounts(text: str) -> List[str]:
249
  """
250
  Extract monetary amounts from text
251
 
252
  Returns:
253
+ --------
254
+ { list } : List of monetary amounts (e.g., ['$1,000', '$2,500.00'])
255
  """
256
  # Match patterns like $1,000 or $1000.00 or USD 1,000
257
+ patterns = [r'\$[\d,]+(?:\.\d{2})?',
258
+ r'USD\s*[\d,]+(?:\.\d{2})?',
259
+ r'EUR\s*[\d,]+(?:\.\d{2})?',
260
+ r'GBP\s*[\d,]+(?:\.\d{2})?'
261
+ ]
262
+
263
+ amounts = list()
264
+
265
  for pattern in patterns:
266
  amounts.extend(re.findall(pattern, text, re.IGNORECASE))
267
 
268
  return amounts
269
+
270
 
271
  @staticmethod
272
  def extract_durations(text: str) -> List[Dict[str, str]]:
 
274
  Extract time durations (e.g., "6 months", "2 years")
275
 
276
  Returns:
277
+ --------
278
+ { list } : List of duration dictionaries with 'amount' and 'unit'
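+ 
+ Example (illustrative):
+ >>> TextProcessor.extract_durations("terminable on 6 months notice")
+ [{'amount': '6', 'unit': 'month'}]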
279
  """
280
  pattern = r'(\d+)\s*(day|week|month|year)s?'
281
  matches = re.findall(pattern, text, re.IGNORECASE)
282
 
283
+ return [{"amount": m[0], "unit": m[1].lower()} for m in matches]
284
+
285
+
 
 
286
  @staticmethod
287
  def extract_percentages(text: str) -> List[str]:
288
+ """
289
+ Extract percentages from text
290
+ """
291
  return re.findall(r'\d+(?:\.\d+)?%', text)
292
 
293
+
 
 
 
294
  @staticmethod
295
+ def chunk_text_for_embedding(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
 
 
296
  """
297
  Chunk text with overlap for embedding models (preserves sentence boundaries)
298
 
299
+ Arguments:
300
+ ----------
301
+ text { str } : Input text
302
+
303
+ chunk_size { int } : Maximum chunk size in words
304
+
305
+ overlap { int } : Number of words to overlap between chunks
306
 
307
  Returns:
308
+ --------
309
+ { list } : List of chunk dictionaries with metadata
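+ 
+ Note: the carried-over overlap is realised as the last two sentences of the
+ previous chunk, so `overlap` is a nominal word target rather than an exact count.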
310
  """
311
+ sentences = TextProcessor.extract_sentences(text)
312
+ chunks = list()
313
+ current_chunk = list()
314
+ current_length = 0
315
  start_sentence_idx = 0
316
 
317
  for i, sentence in enumerate(sentences):
318
+ sentence_words = sentence.split()
319
  sentence_length = len(sentence_words)
320
 
321
+ if (((current_length + sentence_length) > chunk_size) and current_chunk):
322
  # Save current chunk
323
+ chunks.append({"text" : " ".join(current_chunk),
324
+ "start_sentence" : start_sentence_idx,
325
+ "end_sentence" : i - 1,
326
+ "word_count" : current_length,
327
+ "chunk_id" : len(chunks),
328
+ })
 
329
 
330
  # Start new chunk with overlap
331
+ overlap_sentences = current_chunk[-2:] if (len(current_chunk) > 2) else current_chunk
332
+ current_chunk = overlap_sentences + [sentence]
333
+ current_length = sum(len(s.split()) for s in current_chunk)
334
  start_sentence_idx = max(0, i - len(overlap_sentences))
335
+
336
  else:
337
  current_chunk.append(sentence)
338
  current_length += sentence_length
339
 
340
  # Add final chunk
341
  if current_chunk:
342
+ chunks.append({"text" : " ".join(current_chunk),
343
+ "start_sentence" : start_sentence_idx,
344
+ "end_sentence" : len(sentences) - 1,
345
+ "word_count" : current_length,
346
+ "chunk_id" : len(chunks),
347
+ })
 
348
 
349
  return chunks
350
 
351
+
 
 
 
352
  @staticmethod
353
  def text_similarity(text1: str, text2: str) -> float:
354
  """
355
  Calculate similarity between two texts (0-1 scale)
356
 
357
+ Arguments:
358
+ ----------
359
+ text1 { str } : First text
360
+
361
+ text2 { str } : Second text
362
 
363
  Returns:
364
+ --------
365
+ { float } : Similarity score (0.0 = completely different, 1.0 = identical)
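+ 
+ Example (illustrative):
+ >>> TextProcessor.text_similarity("Net 30 payment terms", "net 30 payment terms")
+ 1.0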
366
  """
367
  return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
368
 
369
+
370
  @staticmethod
371
  def deduplicate_clauses(clauses: List[str], threshold: float = 0.85) -> List[str]:
372
  """
373
  Remove near-duplicate clauses
374
 
375
+ Arguments:
376
+ ----------
377
+ clauses { list } : List of clause texts
378
+
379
+ threshold { float } : Similarity threshold for deduplication (0.0-1.0)
380
 
381
  Returns:
382
+ --------
383
+ { list } : List of unique clauses
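+ 
+ Example (illustrative):
+ >>> TextProcessor.deduplicate_clauses(["Payment is due in 30 days.", "Payment is due in 30 days!"])
+ ['Payment is due in 30 days.']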
384
  """
385
+ unique = list()
386
 
387
  for clause in clauses:
388
+ is_duplicate = any(TextProcessor.text_similarity(clause, existing) > threshold for existing in unique)
389
+
 
 
390
  if not is_duplicate:
391
  unique.append(clause)
392
 
393
  return unique
394
 
395
+
 
 
 
396
  @staticmethod
397
  def detect_language(text: str) -> str:
398
  """
399
  Detect text language
400
 
401
+ Arguments:
402
+ ----------
403
+ text { str } : Input text
404
 
405
  Returns:
406
+ --------
407
+ { str } : ISO 639-1 language code (e.g., 'en', 'es', 'fr')
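+ 
+ Example (illustrative, requires langdetect):
+ >>> TextProcessor.detect_language("This agreement is governed by Delaware law.")
+ 'en'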
408
  """
409
  if not LANGDETECT_AVAILABLE:
410
+ # Default to English
411
+ return "en"
412
 
413
  try:
414
  # Use first 1000 chars for detection
415
  return detect(text[:1000])
416
+
417
  except LangDetectException:
418
  return "en"
419
 
 
 
 
420
 
421
  @staticmethod
422
  def get_text_statistics(text: str) -> Dict[str, Any]:
 
424
  Get comprehensive text statistics
425
 
426
  Returns:
427
+ --------
428
+ { dict } : Dictionary with character count, word count, sentence count, etc.
429
  """
430
+ sentences = TextProcessor.extract_sentences(text)
431
  paragraphs = TextProcessor.split_into_paragraphs(text)
432
+ words = text.split()
433
+
434
+ return {"character_count" : len(text),
435
+ "word_count" : len(words),
436
+ "sentence_count" : len(sentences),
437
+ "paragraph_count" : len(paragraphs),
438
+ "avg_words_per_sentence" : len(words) / len(sentences) if sentences else 0,
439
+ "avg_chars_per_word" : len(text) / len(words) if words else 0,
440
+ "language" : TextProcessor.detect_language(text),
441
+ }
 
 
 
 
 
442
 
443
+
444
  @staticmethod
445
+ def highlight_keywords(text: str, keywords: List[str], highlight_format: str = "**{}**") -> str:
 
446
  """
447
  Highlight keywords in text (for display purposes)
448
 
449
+ Arguments:
450
+ ----------
451
+ text { str } : Input text
452
+
453
+ keywords { list } : List of keywords to highlight
454
+
455
+ highlight_format { str } : Format string with {} placeholder (default: Markdown bold)
456
 
457
  Returns:
458
+ --------
459
+ { str } : Text with highlighted keywords
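+ 
+ Example (illustrative):
+ >>> TextProcessor.highlight_keywords("Indemnification survives termination", ["indemnification"])
+ '**Indemnification** survives termination'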
460
  """
461
  for keyword in keywords:
462
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
463
+ text = pattern.sub(lambda m: highlight_format.format(m.group(0)), text)
464
 
465
  return text
466
 
 
 
 
467
 
468
  @staticmethod
469
  def extract_numbered_sections(text: str) -> List[Dict[str, Any]]:
 
471
  Extract numbered sections/clauses (1.1, 1.2, Article 5, etc.)
472
 
473
  Returns:
474
+ --------
475
+ { list } : List of section dictionaries with number and text
476
  """
477
+ patterns = [(r'(\d+\.\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\n\s*\d+\.\d+|\n\n|$)', 'numbered'),
478
+ (r'(Article\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nArticle|\n\n|$)', 'article'),
479
+ (r'(Section\s+(?:\d+|[IVXLCDM]+))\.\s*([^\n]{20,}?)(?=\nSection|\n\n|$)', 'section'),
480
+ (r'(Clause\s+\d+(?:\.\d+)*)\.\s*([^\n]{20,}?)(?=\nClause|\n\n|$)', 'clause'),
481
+ ]
 
482
 
483
+ sections = list()
484
+
485
  for pattern, section_type in patterns:
486
  matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
487
+
488
  for match in matches:
489
+ sections.append({"reference" : match.group(1).strip(),
490
+ "text" : match.group(2).strip(),
491
+ "type" : section_type,
492
+ "start_pos" : match.start(),
493
+ "end_pos" : match.end(),
494
+ })
 
495
 
496
  # Sort by position
497
+ sections.sort(key = lambda x: x['start_pos'])
498
 
499
  return sections
500
 
501
+
502
  @staticmethod
503
  def clean_legal_text(text: str) -> str:
504
  """
505
  Clean legal text by removing boilerplate artifacts
506
 
507
+ Arguments:
508
+ ----------
509
+ text { str } : Input legal text
510
 
511
  Returns:
512
+ --------
513
+ { str } : Cleaned text
514
  """
515
  # Remove "Page X of Y" markers
516
+ text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags = re.IGNORECASE)
517
 
518
  # Remove "[Signature Page Follows]" type markers
519
+ text = re.sub(r'\[.*?(?:Signature|Initial|Page).*?\]', '', text, flags = re.IGNORECASE)
520
 
521
  # Remove excessive underscores (signature lines)
522
  text = re.sub(r'_{3,}', '', text)
523
 
524
  # Remove "CONFIDENTIAL" watermarks
525
+ text = re.sub(r'\b(CONFIDENTIAL|DRAFT|INTERNAL USE ONLY)\b', '', text, flags = re.IGNORECASE)
526
 
527
  # Clean up resulting whitespace
528
  text = re.sub(r'\n{3,}', '\n\n', text)
529
  text = re.sub(r' {2,}', ' ', text)
530
 
531
+ return text.strip()