Spaces:
Sleeping
Sleeping
Update src/validation.py
Browse files
- src/validation.py (+20 -9)
src/validation.py
CHANGED
|
@@ -101,11 +101,17 @@ def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
|
|
| 101 |
if duplicate_predictions > len(predictions) * 0.5: # More than 50%
|
| 102 |
warnings.append(f"{duplicate_predictions} duplicate prediction texts")
|
| 103 |
|
| 104 |
-
# Check for non-text content
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
return {
|
| 111 |
'has_issues': len(issues) > 0,
|
|
@@ -166,8 +172,7 @@ def generate_validation_report(
|
|
| 166 |
report = []
|
| 167 |
|
| 168 |
# Header
|
| 169 |
-
report.append(f"
|
| 170 |
-
report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 171 |
report.append("")
|
| 172 |
|
| 173 |
# File format validation
|
|
@@ -244,7 +249,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
|
|
| 244 |
return {
|
| 245 |
'valid': False,
|
| 246 |
'report': generate_validation_report(format_result, {}, {}, model_name),
|
| 247 |
-
'predictions': None
|
|
|
|
|
|
|
| 248 |
}
|
| 249 |
|
| 250 |
predictions = format_result['dataframe']
|
|
@@ -270,5 +277,9 @@ def validate_submission_complete(file_content: bytes, filename: str, test_set: p
|
|
| 270 |
'coverage': test_set_result['overall_coverage'],
|
| 271 |
'report': report,
|
| 272 |
'predictions': predictions,
|
| 273 |
-
'pair_coverage': test_set_result['pair_coverage']
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
}
|
|
|
|
| 101 |
if duplicate_predictions > len(predictions) * 0.5: # More than 50%
|
| 102 |
warnings.append(f"{duplicate_predictions} duplicate prediction texts")
|
| 103 |
|
| 104 |
+
# Check for non-text content (more permissive regex for multiple languages)
|
| 105 |
+
# Allow Unicode characters for non-English languages
|
| 106 |
+
non_text_pattern = r'^[\w\s\'".,!?;:()\-àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+$'
|
| 107 |
+
try:
|
| 108 |
+
non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
|
| 109 |
+
unusual_char_count = non_text_predictions.sum()
|
| 110 |
+
if unusual_char_count > len(predictions) * 0.2: # More than 20%
|
| 111 |
+
warnings.append(f"{unusual_char_count} predictions may contain special characters")
|
| 112 |
+
except:
|
| 113 |
+
# Skip this check if regex fails
|
| 114 |
+
pass
|
| 115 |
|
| 116 |
return {
|
| 117 |
'has_issues': len(issues) > 0,
|
|
|
|
| 172 |
report = []
|
| 173 |
|
| 174 |
# Header
|
| 175 |
+
report.append(f"## Validation Report: {model_name or 'Submission'}")
|
|
|
|
| 176 |
report.append("")
|
| 177 |
|
| 178 |
# File format validation
|
|
|
|
| 249 |
return {
|
| 250 |
'valid': False,
|
| 251 |
'report': generate_validation_report(format_result, {}, {}, model_name),
|
| 252 |
+
'predictions': None,
|
| 253 |
+
'coverage': 0.0,
|
| 254 |
+
'pair_coverage': {}
|
| 255 |
}
|
| 256 |
|
| 257 |
predictions = format_result['dataframe']
|
|
|
|
| 277 |
'coverage': test_set_result['overall_coverage'],
|
| 278 |
'report': report,
|
| 279 |
'predictions': predictions,
|
| 280 |
+
'pair_coverage': test_set_result['pair_coverage'],
|
| 281 |
+
'quality_score': content_result.get('quality_score', 0.8),
|
| 282 |
+
'warnings': content_result.get('warnings', []),
|
| 283 |
+
'matching_count': test_set_result['matching_count'],
|
| 284 |
+
'missing_count': test_set_result['missing_count']
|
| 285 |
}
|