File size: 1,416 Bytes
7d9d8b5
a238aa1
7d9d8b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a238aa1
 
 
 
7d9d8b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a238aa1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for both the tokenizer and the sequence-classification model.
# NOTE(review): this looks like the base (not fine-tuned) DistilBERT
# checkpoint -- confirm it ships a trained classification head and reports the
# "POSITIVE" label that classify_clauses() compares against; checkpoints
# without an id2label mapping typically report "LABEL_0" / "LABEL_1".
model_name = "distilbert-base-uncased"

# Explicitly load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Build the shared pipeline once at import time. `truncation=True` together
# with `max_length=512` caps every input at 512 tokens during tokenization.
# The deprecated `return_all_scores=False` argument is intentionally omitted:
# returning only the top label is already the pipeline default, so the result
# for a single string is still `[{"label": ..., "score": ...}]`.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)

# A risk score strictly above a cut-off falls into that band.
_HIGH_RISK_THRESHOLD = 0.7
_MEDIUM_RISK_THRESHOLD = 0.4
# Maximum number of characters of each clause echoed back in a result entry.
_CLAUSE_DISPLAY_LIMIT = 300


def _display_text(clause):
    """Return *clause* as text, cut to the display limit (never raises on non-str)."""
    text = clause if isinstance(clause, str) else str(clause)
    return text[:_CLAUSE_DISPLAY_LIMIT]


def _risk_level(risk_score):
    """Map a numeric risk score onto the "High" / "Medium" / "Low" bands."""
    if risk_score > _HIGH_RISK_THRESHOLD:
        return "High"
    if risk_score > _MEDIUM_RISK_THRESHOLD:
        return "Medium"
    return "Low"


def classify_clauses(clauses):
    """Score each clause with the module-level ``classifier`` pipeline.

    Args:
        clauses: Iterable of clause texts. ``None`` is treated as empty.

    Returns:
        A list with one dict per input clause, in input order. Each dict has:

        * ``"clause"``: the clause text, truncated to 300 characters for
          display only (the full clause is what gets classified);
        * ``"risk_score"``: the classifier score when the label is
          ``'POSITIVE'``, otherwise ``1 - score``;
        * ``"risk_level"``: ``"High"`` (> 0.7), ``"Medium"`` (> 0.4) or
          ``"Low"``.

        If classifying a clause fails, its entry instead carries
        ``"risk_score": 0``, ``"risk_level": "Unknown"`` and an ``"error"``
        message, and processing continues with the next clause.
    """
    if clauses is None:
        return []

    results = []
    for clause in clauses:
        # Computed up front so the error path below can never fail on a
        # non-string clause (slicing e.g. None would raise inside `except`).
        shown = _display_text(clause)
        try:
            top = classifier(clause)[0]
            score = top['score']
            # NOTE(review): assumes the loaded model reports the label
            # 'POSITIVE'; any other label name takes the `1 - score` branch.
            risk_score = score if top['label'] == 'POSITIVE' else 1 - score
            entry = {
                "clause": shown,
                "risk_score": risk_score,
                "risk_level": _risk_level(risk_score),
            }
        except Exception as exc:
            # Deliberate best-effort: one bad clause must not abort the batch.
            entry = {
                "clause": shown,
                "risk_score": 0,
                "risk_level": "Unknown",
                "error": str(exc),
            }
        results.append(entry)

    return results