# psychology-tutor-engine / test_data_quality.py
# Initial commit: Psychology tutor engine and data pipelines (1da14e1, author: adfras)
# test_data_quality.py
import pytest
import pandas as pd
import os
# --- Configuration ---
# Output of the normalization pipeline (normalize_psych_data.py); every test
# in this module reads this single file via the module-scoped `data` fixture.
PROCESSED_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
# Columns the normalized schema must contain (checked by test 2).
EXPECTED_COLUMNS = ['question', 'answer', 'source', 'licence']
# Known upstream datasets; any other value in the 'source' column fails test 5.
EXPECTED_SOURCES = [
'BoltMonkey/psychology-question-answer',
'Gragroo/psychology-question-answer_psygpt_with_validation',
'PsychoLexQA',
'MMLU/professional_psychology',
'MMLU/high_school_psychology'
]
# Plausibility bounds on question character length (checked by test 4).
MIN_QUESTION_LENGTH = 10 # A reasonable minimum length for a question
MAX_QUESTION_LENGTH = 1500 # A reasonable maximum
@pytest.fixture(scope="module")
def data():
    """Load the normalized dataset once per module and share it across all tests.

    Fails the whole module up front with an actionable message when the
    pipeline output has not been generated yet.
    """
    if os.path.exists(PROCESSED_DATA_FILE):
        return pd.read_parquet(PROCESSED_DATA_FILE)
    pytest.fail(f"FATAL: Processed data file not found at {PROCESSED_DATA_FILE}. Run normalize_psych_data.py first.")
# --- Test Cases ---
def test_file_exists():
    """Test 1: the normalization pipeline must have produced the parquet output."""
    file_is_present = os.path.exists(PROCESSED_DATA_FILE)
    assert file_is_present, "The final processed parquet file is missing."
def test_schema_is_correct(data):
    """Test 2: Validates that all expected columns are present.

    Collects every absent column before asserting, so one failure reports
    the complete set of missing columns instead of only the first one
    (the original per-column assert loop stopped at the first miss).
    """
    missing = [col for col in EXPECTED_COLUMNS if col not in data.columns]
    assert not missing, f"Missing expected columns: {missing}"
def test_no_missing_critical_data(data):
"""Test 3: Ensures there are no nulls in the core 'question' and 'answer' fields."""
assert data['question'].isnull().sum() == 0, "There are missing values in the 'question' column."
assert data['answer'].isnull().sum() == 0, "There are missing values in the 'answer' column."
def test_content_plausibility(data):
    """Test 4: question lengths must fall within the configured plausibility bounds."""
    # Compute the per-question character lengths once and reuse for both bounds.
    lengths = data['question'].str.len()
    shortest_question = lengths.min()
    longest_question = lengths.max()
    assert shortest_question >= MIN_QUESTION_LENGTH, f"Found a question with length {shortest_question}, which is shorter than the minimum threshold of {MIN_QUESTION_LENGTH}."
    assert longest_question <= MAX_QUESTION_LENGTH, f"Found a question with length {longest_question}, which is longer than the maximum threshold of {MAX_QUESTION_LENGTH}."
def test_source_column_is_valid(data):
    """Test 5: every value in 'source' must be one of the known upstream datasets."""
    observed = set(data['source'].unique())
    allowed = set(EXPECTED_SOURCES)
    unexpected_sources = observed - allowed
    assert not unexpected_sources, f"Found unexpected data sources: {unexpected_sources}"