# (Removed stray paste artifacts: "Spaces:" / "Runtime error" — output from an
#  online runner, not part of this module.)
# test_data_quality.py
import os

import pandas as pd
import pytest
# --- Configuration ---

# Output of normalize_psych_data.py; every test below validates this file.
PROCESSED_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"

# Columns each row of the normalized dataset must carry.
EXPECTED_COLUMNS = ['question', 'answer', 'source', 'licence']

# Known upstream datasets; any other value in the 'source' column is a bug.
EXPECTED_SOURCES = [
    'BoltMonkey/psychology-question-answer',
    'Gragroo/psychology-question-answer_psygpt_with_validation',
    'PsychoLexQA',
    'MMLU/professional_psychology',
    'MMLU/high_school_psychology',
]

# Plausibility bounds (in characters) for question text.
MIN_QUESTION_LENGTH = 10    # A reasonable minimum length for a question
MAX_QUESTION_LENGTH = 1500  # A reasonable maximum
@pytest.fixture(scope="module")
def data():
    """A pytest fixture to load the main dataset once for all tests.

    BUG FIX: the ``@pytest.fixture`` decorator was missing, so every test
    requesting the ``data`` parameter would error with "fixture 'data' not
    found". ``scope="module"`` makes the parquet file load a single time for
    the whole module, matching the documented "once for all tests" intent.

    Returns:
        pd.DataFrame: the normalized psychology QA dataset.
    """
    # Fail fast with an actionable message rather than an obscure read error.
    if not os.path.exists(PROCESSED_DATA_FILE):
        pytest.fail(f"FATAL: Processed data file not found at {PROCESSED_DATA_FILE}. Run normalize_psych_data.py first.")
    return pd.read_parquet(PROCESSED_DATA_FILE)
# --- Test Cases ---

def test_file_exists():
    """Test 1: Ensures the processed data file was actually created."""
    file_is_present = os.path.exists(PROCESSED_DATA_FILE)
    assert file_is_present, "The final processed parquet file is missing."
def test_schema_is_correct(data):
    """Test 2: Validates that all expected columns are present."""
    actual_columns = set(data.columns)
    for expected in EXPECTED_COLUMNS:
        assert expected in actual_columns, f"Missing expected column: '{expected}'"
| def test_no_missing_critical_data(data): | |
| """Test 3: Ensures there are no nulls in the core 'question' and 'answer' fields.""" | |
| assert data['question'].isnull().sum() == 0, "There are missing values in the 'question' column." | |
| assert data['answer'].isnull().sum() == 0, "There are missing values in the 'answer' column." | |
def test_content_plausibility(data):
    """Test 4: Checks if the data content is reasonable (e.g., not too short)."""
    # Character lengths of every question, computed once for both bounds.
    question_lengths = data['question'].str.len()
    shortest_question = question_lengths.min()
    longest_question = question_lengths.max()
    assert shortest_question >= MIN_QUESTION_LENGTH, f"Found a question with length {shortest_question}, which is shorter than the minimum threshold of {MIN_QUESTION_LENGTH}."
    assert longest_question <= MAX_QUESTION_LENGTH, f"Found a question with length {longest_question}, which is longer than the maximum threshold of {MAX_QUESTION_LENGTH}."
def test_source_column_is_valid(data):
    """Test 5: Checks if the 'source' column contains only known, expected values."""
    known_sources = set(EXPECTED_SOURCES)
    observed_sources = set(data['source'].unique())
    unexpected_sources = observed_sources - known_sources
    assert not unexpected_sources, f"Found unexpected data sources: {unexpected_sources}"