# ABDALLAH31's picture
# Create model.py
# a58ecfc verified
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint name shared by the tokenizer and the model loaded below.
# NOTE(review): the name looks like a base (not fine-tuned) DistilBERT
# checkpoint; presumably its classification head is untrained and its labels
# are generic rather than 'POSITIVE'/'NEGATIVE' -- confirm that it really emits
# the 'POSITIVE' label that classify_clauses compares against.
model_name = "distilbert-base-uncased"
# Explicitly load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Build the module-level text-classification pipeline used by
# classify_clauses. Truncation and max length are set here, at construction.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,  # truncate over-long inputs instead of failing on them
    max_length=512,  # token limit enforced together with truncation=True
    # Presumably yields only the top label per input; classify_clauses reads
    # result[0]['label'] / result[0]['score'] -- verify before changing.
    return_all_scores=False
)
def classify_clauses(clauses):
    """Assign a risk score and risk level to each clause.

    Every clause is passed to the module-level ``classifier`` pipeline. The
    top prediction's ``score`` is used as the risk score when its ``label`` is
    ``'POSITIVE'``; for any other label the risk score is ``1 - score``.
    The score is then bucketed: above 0.7 is "High", above 0.4 is "Medium",
    anything else is "Low".

    NOTE(review): this assumes the configured model emits a ``'POSITIVE'``
    label -- confirm against the checkpoint actually loaded.

    Args:
        clauses: Iterable of clause texts to classify.

    Returns:
        A list with one dict per input clause, in input order. Each dict has
        ``"clause"`` (the text truncated to 300 characters, for display only),
        ``"risk_score"`` and ``"risk_level"``. If classifying a clause fails,
        its entry has ``risk_score`` 0, ``risk_level`` "Unknown" and an extra
        ``"error"`` key holding the exception message; the remaining clauses
        are still processed.
    """
    results = []
    for clause in clauses:
        # Display-only preview. Non-string items are stringified first so that
        # building the preview can never raise; previously ``clause[:300]``
        # ran inside the ``except`` handler and a non-subscriptable item
        # (e.g. None) crashed the whole batch instead of being reported.
        if isinstance(clause, str):
            preview = clause[:300]
        else:
            preview = str(clause)[:300]

        try:
            prediction = classifier(clause)[0]
            score = prediction['score']
            label = prediction['label']
            risk_score = score if label == 'POSITIVE' else 1 - score
        except Exception as e:
            # Deliberate best-effort: record the failure for this clause and
            # keep going with the rest of the batch.
            results.append({
                "clause": preview,
                "risk_score": 0,
                "risk_level": "Unknown",
                "error": str(e),
            })
        else:
            if risk_score > 0.7:
                risk_level = "High"
            elif risk_score > 0.4:
                risk_level = "Medium"
            else:
                risk_level = "Low"
            results.append({
                "clause": preview,
                "risk_score": risk_score,
                "risk_level": risk_level,
            })
    return results