from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

# NOTE(review): "distilbert-base-uncased" looks like the plain pre-trained
# checkpoint rather than one fine-tuned for classification. If so, the
# sequence-classification head is freshly initialised and the pipeline will
# report generic labels (presumably "LABEL_0"/"LABEL_1"), never "POSITIVE".
# Confirm the intended checkpoint, or pass matching `positive_labels` to
# `classify_clauses`, before trusting the risk scores.
model_name = "distilbert-base-uncased"

# Load the tokenizer and model explicitly so the pipeline uses this exact pair.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Truncate every input to at most 512 tokens.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
    return_all_scores=False,
)

# Maximum number of characters of a clause echoed back in each result.
_PREVIEW_CHARS = 300


def _clause_preview(clause):
    """Return a display-only preview of *clause*, at most 300 characters.

    Non-string values are converted with ``str`` so that building the preview
    can never raise while an error record is being assembled.
    """
    text = clause if isinstance(clause, str) else str(clause)
    return text[:_PREVIEW_CHARS]


def _risk_level(risk_score, high_threshold, medium_threshold):
    """Map a numeric risk score onto "High", "Medium" or "Low"."""
    if risk_score > high_threshold:
        return "High"
    if risk_score > medium_threshold:
        return "Medium"
    return "Low"


def classify_clauses(
    clauses,
    *,
    positive_labels=("POSITIVE",),
    high_threshold=0.7,
    medium_threshold=0.4,
):
    """Score each clause with the module-level ``classifier`` pipeline.

    The pipeline's top prediction is turned into a risk score: the prediction
    score itself when the predicted label is one of *positive_labels*,
    otherwise ``1 - score``.

    Args:
        clauses: Iterable of clause texts.
        positive_labels: Label name (or iterable of label names) whose score
            is used directly as the risk score.
        high_threshold: Risk scores strictly above this are "High".
        medium_threshold: Risk scores strictly above this (and not "High")
            are "Medium"; everything else is "Low".

    Returns:
        A list with one dict per clause, in input order. Each dict has
        "clause" (preview truncated to 300 characters), "risk_score" and
        "risk_level". When scoring a clause fails, its dict has
        ``risk_score`` 0, ``risk_level`` "Unknown" and an extra "error" key
        holding the exception message; remaining clauses are still processed.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(positive_labels, str):
        positive_labels = (positive_labels,)
    positive = frozenset(positive_labels)

    results = []
    for clause in clauses:
        # Computed up front so the error path below cannot itself fail.
        preview = _clause_preview(clause)
        try:
            prediction = classifier(clause)[0]
            score = prediction["score"]
            label = prediction["label"]
            risk_score = score if label in positive else 1 - score
            level = _risk_level(risk_score, high_threshold, medium_threshold)
        except Exception as exc:
            # Best effort: one bad clause must not abort the whole batch.
            results.append({
                "clause": preview,
                "risk_score": 0,
                "risk_level": "Unknown",
                "error": str(exc),
            })
            continue

        results.append({
            "clause": preview,
            "risk_score": risk_score,
            "risk_level": level,
        })
    return results