# psychology-tutor-engine / test_data_quality.py
# Initial commit: Psychology tutor engine and data pipelines (1da14e1, author: adfras)
# test_data_quality.py
import pytest
import pandas as pd
import os
# --- Configuration ---
# Output of the normalization pipeline (normalize_psych_data.py); every test
# in this module reads this single file via the module-scoped `data` fixture.
PROCESSED_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
# Columns the normalized schema must contain (checked by test 2).
EXPECTED_COLUMNS = ['question', 'answer', 'source', 'licence']
# Known upstream datasets; any other value in the 'source' column fails test 5.
EXPECTED_SOURCES = [
'BoltMonkey/psychology-question-answer',
'Gragroo/psychology-question-answer_psygpt_with_validation',
'PsychoLexQA',
'MMLU/professional_psychology',
'MMLU/high_school_psychology'
]
# Plausibility bounds on question character length (checked by test 4).
MIN_QUESTION_LENGTH = 10 # A reasonable minimum length for a question
MAX_QUESTION_LENGTH = 1500 # A reasonable maximum
@pytest.fixture(scope="module")
def data():
    """Load the normalized dataset once per module and share it across all tests.

    Fails the whole module up front with an actionable message when the
    pipeline output has not been generated yet.
    """
    if os.path.exists(PROCESSED_DATA_FILE):
        return pd.read_parquet(PROCESSED_DATA_FILE)
    pytest.fail(f"FATAL: Processed data file not found at {PROCESSED_DATA_FILE}. Run normalize_psych_data.py first.")
# --- Test Cases ---
def test_file_exists():
    """Test 1: the normalization pipeline must have produced the parquet output."""
    file_is_present = os.path.exists(PROCESSED_DATA_FILE)
    assert file_is_present, "The final processed parquet file is missing."
def test_schema_is_correct(data):
    """Test 2: Validates that all expected columns are present.

    Collects every absent column before asserting, so one failure reports
    the complete set of missing columns instead of only the first one
    (the original per-column assert loop stopped at the first miss).
    """
    missing = [col for col in EXPECTED_COLUMNS if col not in data.columns]
    assert not missing, f"Missing expected columns: {missing}"
def test_no_missing_critical_data(data):
"""Test 3: Ensures there are no nulls in the core 'question' and 'answer' fields."""
assert data['question'].isnull().sum() == 0, "There are missing values in the 'question' column."
assert data['answer'].isnull().sum() == 0, "There are missing values in the 'answer' column."
def test_content_plausibility(data):
    """Test 4: question lengths must fall within the configured plausibility bounds."""
    # Compute the per-question character lengths once and reuse for both bounds.
    lengths = data['question'].str.len()
    shortest_question = lengths.min()
    longest_question = lengths.max()
    assert shortest_question >= MIN_QUESTION_LENGTH, f"Found a question with length {shortest_question}, which is shorter than the minimum threshold of {MIN_QUESTION_LENGTH}."
    assert longest_question <= MAX_QUESTION_LENGTH, f"Found a question with length {longest_question}, which is longer than the maximum threshold of {MAX_QUESTION_LENGTH}."
def test_source_column_is_valid(data):
    """Test 5: every value in 'source' must be one of the known upstream datasets."""
    observed = set(data['source'].unique())
    allowed = set(EXPECTED_SOURCES)
    unexpected_sources = observed - allowed
    assert not unexpected_sources, f"Found unexpected data sources: {unexpected_sources}"