"""
Directional Tests for Skill Classification Model

These tests verify that specific changes to the input lead to PREDICTABLE changes
in the model's predictions. For example:

- Adding skill-specific keywords should increase confidence in related skills
- Removing domain-specific terms should decrease confidence in those domains
- Adding context about a technology should add related skill predictions

Based on Ribeiro et al. (2020), "Beyond Accuracy: Behavioral Testing of NLP Models
with CheckList"
"""
import numpy as np
import pytest
| class TestDirectional: | |
| """Test suite for directional expectations of the model.""" | |
| def test_adding_language_keyword(self, predict_with_labels, predict_text): | |
| """ | |
| Test that adding programming language keywords increases language-related predictions. | |
| Adding "Java" or "Python" should make language skills more likely. | |
| """ | |
| base = "Fixed bug in authentication system" | |
| with_java = "Fixed bug in Java authentication system" | |
| with_python = "Fixed bug in Python authentication system" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_java = set(predict_with_labels(with_java)) | |
| pred_python = set(predict_with_labels(with_python)) | |
| # Check if language-related labels appear (depends on your label schema) | |
| # Note: Adjust these checks based on actual labels in your dataset | |
| print(f"\nBase predictions: {pred_base}") | |
| print(f"With Java: {pred_java}") | |
| print(f"With Python: {pred_python}") | |
| # At minimum, predictions should not become drastically worse | |
| # It's acceptable if predictions stay the same (model might already predict Language) | |
| assert len(pred_java) >= len(pred_base) * 0.5, ( | |
| "Adding Java should not drastically reduce predictions" | |
| ) | |
| assert len(pred_python) >= len(pred_base) * 0.5, ( | |
| "Adding Python should not drastically reduce predictions" | |
| ) | |
| def test_adding_data_structure_keyword(self, predict_with_labels): | |
| """ | |
| Test that adding data structure keywords increases data structure predictions. | |
| """ | |
| base = "Implemented search functionality" | |
| with_hashmap = "Implemented search functionality using HashMap" | |
| with_tree = "Implemented search functionality using binary tree" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_hashmap = set(predict_with_labels(with_hashmap)) | |
| pred_tree = set(predict_with_labels(with_tree)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With HashMap: {pred_hashmap}") | |
| print(f"With Tree: {pred_tree}") | |
| # Adding data structures should increase related predictions | |
| # pred_hashmap and pred_tree should have more or different labels than base | |
| assert len(pred_hashmap) >= len(pred_base) * 0.8, ( | |
| "Adding HashMap should not drastically reduce predictions" | |
| ) | |
| assert len(pred_tree) >= len(pred_base) * 0.8, ( | |
| "Adding tree should not drastically reduce predictions" | |
| ) | |
| def test_adding_error_handling_context(self, predict_with_labels): | |
| """ | |
| Test that adding error handling keywords increases error handling predictions. | |
| """ | |
| base = "Updated user login flow" | |
| with_exception = "Updated user login flow with exception handling" | |
| with_try_catch = "Updated user login flow with try-catch blocks" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_exception = set(predict_with_labels(with_exception)) | |
| pred_try_catch = set(predict_with_labels(with_try_catch)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With exception: {pred_exception}") | |
| print(f"With try-catch: {pred_try_catch}") | |
| # Error handling keywords should not drastically reduce predictions | |
| # Check if "Error Handling" is in predictions (likely already there) | |
| has_error_handling = any("Error" in label or "Exception" in label | |
| for label in pred_exception | pred_try_catch) | |
| assert len(pred_exception) >= len(pred_base) * 0.5, ( | |
| "Adding error handling context should not drastically reduce predictions" | |
| ) | |
| # At least one prediction should contain error-related terms | |
| print(f"Has error handling related labels: {has_error_handling}") | |
| def test_removing_specific_technology(self, predict_text): | |
| """ | |
| Test that removing technology-specific keywords reduces related predictions. | |
| """ | |
| with_tech = "Fixed database connection pooling issue in PostgreSQL" | |
| without_tech = "Fixed database connection pooling issue" | |
| pred_with = predict_text(with_tech) | |
| pred_without = predict_text(without_tech) | |
| # Predictions should differ when removing specific technology | |
| # The version with specific tech should generally have same or more predictions | |
| assert len(pred_with) >= len(pred_without) * 0.7, ( | |
| "Removing technology specifics should not drastically increase predictions" | |
| ) | |
| def test_adding_api_context(self, predict_with_labels): | |
| """ | |
| Test that adding API-related keywords increases API/web service predictions. | |
| """ | |
| base = "Fixed user authentication" | |
| with_api = "Fixed user authentication REST API endpoint" | |
| with_graphql = "Fixed user authentication GraphQL endpoint" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_api = set(predict_with_labels(with_api)) | |
| pred_graphql = set(predict_with_labels(with_graphql)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With REST API: {pred_api}") | |
| print(f"With GraphQL: {pred_graphql}") | |
| # API keywords should not drastically reduce predictions | |
| assert len(pred_api) >= len(pred_base) * 0.5, ( | |
| "Adding REST API should not drastically reduce predictions" | |
| ) | |
| assert len(pred_graphql) >= len(pred_base) * 0.5, ( | |
| "Adding GraphQL should not drastically reduce predictions" | |
| ) | |
| def test_adding_testing_keywords(self, predict_with_labels): | |
| """ | |
| Test that adding testing-related keywords increases testing skill predictions. | |
| """ | |
| base = "Implemented new feature for user management" | |
| with_tests = "Implemented new feature for user management with unit tests" | |
| with_integration = "Implemented new feature for user management with integration tests" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_unit = set(predict_with_labels(with_tests)) | |
| pred_integration = set(predict_with_labels(with_integration)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With unit tests: {pred_unit}") | |
| print(f"With integration tests: {pred_integration}") | |
| # Testing keywords should not drastically reduce predictions | |
| # Check if testing-related labels are present | |
| has_testing = any("Test" in label or "Automated" in label | |
| for label in pred_unit | pred_integration) | |
| assert len(pred_unit) >= len(pred_base) * 0.5, ( | |
| "Adding testing keywords should not drastically reduce predictions" | |
| ) | |
| print(f"Has testing related labels: {has_testing}") | |
| def test_adding_performance_keywords(self, predict_with_labels): | |
| """ | |
| Test that adding performance-related keywords affects predictions. | |
| """ | |
| base = "Optimized search algorithm" | |
| with_perf = "Optimized search algorithm for better performance and reduced memory usage" | |
| with_cache = "Optimized search algorithm with caching" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_perf = set(predict_with_labels(with_perf)) | |
| pred_cache = set(predict_with_labels(with_cache)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With performance: {pred_perf}") | |
| print(f"With caching: {pred_cache}") | |
| # Performance keywords should affect predictions | |
| # More specific descriptions should generally maintain or add labels | |
| assert len(pred_perf) >= len(pred_base) * 0.7, ( | |
| "Adding performance context should not drastically reduce predictions" | |
| ) | |
| def test_adding_security_context(self, predict_with_labels): | |
| """ | |
| Test that adding security keywords increases security-related predictions. | |
| """ | |
| base = "Updated authentication system" | |
| with_security = "Updated authentication system with OAuth2 security" | |
| with_encryption = "Updated authentication system with password encryption" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_oauth = set(predict_with_labels(with_security)) | |
| pred_encryption = set(predict_with_labels(with_encryption)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With OAuth: {pred_oauth}") | |
| print(f"With encryption: {pred_encryption}") | |
| # Security keywords should not drastically reduce predictions | |
| # Authentication is already security-related, so predictions should be stable | |
| assert len(pred_oauth) >= len(pred_base) * 0.5, ( | |
| "Adding OAuth2 should not drastically reduce predictions" | |
| ) | |
| assert len(pred_encryption) >= len(pred_base) * 0.5, ( | |
| "Adding encryption should not drastically reduce predictions" | |
| ) | |
| def test_adding_devops_keywords(self, predict_with_labels): | |
| """ | |
| Test that adding DevOps keywords increases DevOps-related predictions. | |
| """ | |
| base = "Deployed new version" | |
| with_docker = "Deployed new version using Docker containers" | |
| with_ci = "Deployed new version through CI/CD pipeline" | |
| pred_base = set(predict_with_labels(base)) | |
| pred_docker = set(predict_with_labels(with_docker)) | |
| pred_ci = set(predict_with_labels(with_ci)) | |
| print(f"\nBase: {pred_base}") | |
| print(f"With Docker: {pred_docker}") | |
| print(f"With CI/CD: {pred_ci}") | |
| # DevOps keywords should not drastically reduce predictions | |
| # Check if DevOps-related labels are present | |
| has_devops = any("DevOps" in label or "Operations" in label or "Deployment" in label | |
| for label in pred_docker | pred_ci | pred_base) | |
| assert len(pred_docker) >= len(pred_base) * 0.5, ( | |
| "Adding Docker should not drastically reduce predictions" | |
| ) | |
| print(f"Has DevOps related labels: {has_devops}") | |
| def test_increasing_technical_detail(self, predict_text): | |
| """ | |
| Test that adding more technical detail generally increases or maintains predictions. | |
| More specific descriptions should not drastically reduce the number of relevant skills. | |
| """ | |
| vague = "Fixed bug" | |
| specific = "Fixed null pointer exception in user service layer" | |
| very_specific = "Fixed null pointer exception in UserService.getUserById() method when handling deleted users" | |
| pred_vague = predict_text(vague) | |
| pred_specific = predict_text(specific) | |
| pred_very_specific = predict_text(very_specific) | |
| print(f"\nVague ({len(pred_vague)} labels): {pred_vague}") | |
| print(f"Specific ({len(pred_specific)} labels): {pred_specific}") | |
| print(f"Very specific ({len(pred_very_specific)} labels): {pred_very_specific}") | |
| # More detail should generally add relevant skills, not remove them drastically | |
| # Allow some variance since very specific text might lose some general predictions | |
| assert len(pred_specific) >= len(pred_vague) * 0.5, ( | |
| "Adding technical detail should not reduce predictions drastically" | |
| ) | |