contract-iq-api / app /services /ml_service.py
kshitij230's picture
Upload 36 files
308e240 verified
import logging
import torch
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MLService:
def __init__(self):
self.model_name = 'all-MiniLM-L6-v2'
self.model = None
self.risk_patterns = {
"Termination": [
"terminate at any time without notice",
"immediate cancellation by client",
"termination for convenience",
"no cure period for breaches"
],
"Liability": [
"unlimited liability for contractor",
"indemnify client for all losses",
"waive all rights to sue",
"contractor bears all risk"
],
"Payment Terms": [
"payment within 90 days",
"client may withhold payment for any reason",
"non-negotiable fees",
"late payment penalties for contractor only"
],
"Intellectual Property": [
"client owns all preexisting work",
"contractor waives all moral rights",
"assignment of all past and future inventions",
"perpetual exclusive license to all tools"
]
}
self.pattern_embeddings = {}
self.enabled = False
self._initialize_model()
def _initialize_model(self):
try:
import numpy as np
from sentence_transformers import SentenceTransformer
logger.info(f"Loading ML model: {self.model_name}...")
# Use CPU by default for stability
device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = SentenceTransformer(self.model_name, device=device)
# Pre-calculate embeddings for risk patterns
for category, patterns in self.risk_patterns.items():
self.pattern_embeddings[category] = self.model.encode(patterns)
self.enabled = True
logger.info("✅ ML Model loaded successfully")
except ImportError as e:
logger.error(f"⚠️ ML dependencies missing: {e}. Semantic analysis disabled.")
self.enabled = False
except Exception as e:
logger.error(f"❌ Failed to load ML model: {e}")
self.enabled = False
def analyze_clause_semantic(self, text: str, threshold: float = 0.45):
"""
Uses semantic similarity to detect risks that might not use exact keywords.
"""
if not self.enabled or self.model is None:
return []
try:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Split text into sentences/clauses (rough split)
sentences = [s.strip() for s in text.replace('\n', '. ').split('.') if len(s.strip()) > 10]
if not sentences:
return []
sentence_embeddings = self.model.encode(sentences)
semantic_risks = []
for i, s_emb in enumerate(sentence_embeddings):
for category, p_embs in self.pattern_embeddings.items():
similarities = cosine_similarity([s_emb], p_embs)[0]
max_sim = np.max(similarities)
if max_sim > threshold:
semantic_risks.append({
"category": category,
"finding": sentences[i],
"severity": "High" if max_sim > 0.65 else "Medium",
"confidence": float(max_sim)
})
# De-duplicate: Keep highest confidence per sentence
refined_risks = []
seen_sentences = set()
for risk in sorted(semantic_risks, key=lambda x: x['confidence'], reverse=True):
if risk['finding'] not in seen_sentences:
refined_risks.append(risk)
seen_sentences.add(risk['finding'])
return refined_risks
except Exception as e:
logger.error(f"Error during ML analysis: {e}")
return []
ml_service = MLService()