UserSyncUI

Paused

App Files Files Community

UserSyncUI / docs /api /tinytroupe /validation /simulation_validator.html

harvesthealth

Upload folder using huggingface_hub

f6686e1 verified 2 months ago

raw

history blame contribute delete

108 kB

	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
	<meta name="generator" content="pdoc 0.10.0" />
	<title>tinytroupe.validation.simulation_validator API documentation</title>
	<meta name="description" content="Simulation experiment empirical validation mechanisms for TinyTroupe …" />
	<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
	<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
	<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
	<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > :last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
	<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
	<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
	<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
	<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
	</head>
	<body>
	<main>
	<article id="content">
	<header>
	<h1 class="title">Module <code>tinytroupe.validation.simulation_validator</code></h1>
	</header>
	<section id="section-intro">
	<p>Simulation experiment empirical validation mechanisms for TinyTroupe.</p>
	<p>This module provides tools to validate simulation experiment results against empirical control data,
	supporting both statistical hypothesis testing and semantic validation approaches.
	This is distinct from LLM-based evaluations, focusing on data-driven validation
	against known empirical benchmarks.</p>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">"""
	Simulation experiment empirical validation mechanisms for TinyTroupe.

	This module provides tools to validate simulation experiment results against empirical control data,
	supporting both statistical hypothesis testing and semantic validation approaches.
	This is distinct from LLM-based evaluations, focusing on data-driven validation
	against known empirical benchmarks.
	"""

	from typing import Dict, List, Optional, Union, Any
	import json
	from datetime import datetime
	from pydantic import BaseModel, Field

	from tinytroupe.experimentation.statistical_tests import StatisticalTester
	from tinytroupe.utils.semantics import compute_semantic_proximity

	# TODO Work-in-Progress below

	class SimulationExperimentDataset(BaseModel):
	"""
	Represents a dataset from a simulation experiment or empirical study.

	This contains data that can be used for validation, including quantitative metrics
	and qualitative agent justifications from simulation experiments or empirical studies.

	Attributes:
	name: Optional name for the dataset
	description: Optional description of the dataset
	key_results: Map from result names to their values (numbers, proportions, booleans, etc.)
	result_types: Map indicating whether each result is "aggregate" or "per_agent"
	agent_names: Optional list of agent names (can be referenced by index in results)
	agent_justifications: List of justifications (with optional agent references)
	justification_summary: Optional summary of all agent justifications
	"""
	name: Optional[str] = None
	description: Optional[str] = None
	key_results: Dict[str, Union[float, int, bool, List[Union[float, int, bool, None]], None]] = Field(default_factory=dict)
	result_types: Dict[str, str] = Field(default_factory=dict, description="Map from result name to 'aggregate' or 'per_agent'")
	agent_names: Optional[List[Optional[str]]] = Field(None, description="Optional list of agent names for reference (can contain None for unnamed agents)")
	agent_justifications: List[Union[str, Dict[str, Union[str, int]]]] = Field(
	default_factory=list,
	description="List of justifications as strings or dicts with optional 'agent_name'/'agent_index' and 'justification'"
	)
	justification_summary: Optional[str] = None

	class Config:
	"""Pydantic configuration."""
	extra = "forbid" # Prevent accidental extra fields
	validate_assignment = True # Validate on assignment after creation

	def get_agent_name(self, index: int) -> Optional[str]:
	    """
	    Look up the agent name stored at position ``index``.

	    Returns None when no agent names are recorded, when ``index`` lies
	    outside the list (negative indices are rejected), or when the stored
	    entry is itself None.
	    """
	    names = self.agent_names
	    if not names:
	        return None
	    if index < 0 or index >= len(names):
	        return None
	    return names[index]

	def get_agent_data(self, metric_name: str, agent_index: int) -> Optional[Union[float, int, bool]]:
	    """
	    Fetch one agent's value for ``metric_name``.

	    A value is returned only when the metric exists, is marked "per_agent"
	    in ``result_types``, is stored as a list, and ``agent_index`` is within
	    that list. The stored value may itself be None (missing data); every
	    other case yields None as well.
	    """
	    if metric_name not in self.key_results:
	        return None

	    values = self.key_results[metric_name]
	    marked_per_agent = self.result_types.get(metric_name) == "per_agent"
	    if not (marked_per_agent and isinstance(values, list)):
	        return None

	    within_bounds = 0 <= agent_index < len(values)
	    return values[agent_index] if within_bounds else None

	def get_all_agent_data(self, metric_name: str) -> Dict[str, Union[float, int, bool]]:
	    """
	    Return a metric's values keyed by agent.

	    For a "per_agent" list metric, each non-None value is keyed by the
	    agent's name, falling back to "Agent_<index>" when no name is available.
	    For an "aggregate" metric the single value is stored under "aggregate".
	    Unknown metrics and any other combination produce an empty dict.
	    """
	    if metric_name not in self.key_results:
	        return {}

	    values = self.key_results[metric_name]
	    kind = self.result_types.get(metric_name)

	    if kind == "per_agent" and isinstance(values, list):
	        # Missing (None) entries are left out of the mapping entirely.
	        return {
	            (self.get_agent_name(position) or f"Agent_{position}"): value
	            for position, value in enumerate(values)
	            if value is not None
	        }

	    if kind == "aggregate":
	        return {"aggregate": values}

	    return {}

	def get_valid_agent_data(self, metric_name: str) -> List[Union[float, int, bool]]:
	    """
	    List the non-None values of a per-agent metric, in stored order.

	    Returns an empty list when the metric is unknown, is not marked
	    "per_agent", or is not stored as a list.
	    """
	    if metric_name not in self.key_results:
	        return []

	    values = self.key_results[metric_name]
	    if self.result_types.get(metric_name) != "per_agent" or not isinstance(values, list):
	        return []

	    return list(filter(lambda item: item is not None, values))

	def validate_data_consistency(self) -> List[str]:
	    """
	    Check per-agent metrics and agent names for internal consistency.

	    Returns a list of messages: errors first, followed by advisory
	    messages prefixed with "WARNING: ". An empty list means nothing
	    was flagged.
	    """
	    problems = []
	    notices = []

	    # Length of every metric declared "per_agent" that really is a list,
	    # keyed by metric name in result_types order.
	    lengths_by_metric = {}
	    for metric_name, declared_type in self.result_types.items():
	        if declared_type != "per_agent" or metric_name not in self.key_results:
	            continue
	        values = self.key_results[metric_name]
	        if isinstance(values, list):
	            lengths_by_metric[metric_name] = len(values)
	        else:
	            problems.append(f"Metric '{metric_name}' marked as per_agent but is not a list")

	    # Every per-agent list must have the same number of entries.
	    if len(set(lengths_by_metric.values())) > 1:
	        problems.append(f"Per-agent metrics have inconsistent lengths: {lengths_by_metric}")

	    # agent_names is compared against the first per-agent metric only.
	    if self.agent_names and lengths_by_metric:
	        agent_count = len(self.agent_names)
	        data_length = next(iter(lengths_by_metric.values()))
	        if agent_count != data_length:
	            problems.append(f"agent_names length ({agent_count}) doesn't match per-agent data length ({data_length})")

	    # Unnamed agents are allowed, but reported.
	    if self.agent_names:
	        none_indices = [i for i, name in enumerate(self.agent_names) if name is None]
	        if none_indices:
	            notices.append(f"agent_names contains None values at indices: {none_indices}")

	    # Missing per-agent values are allowed, but reported.
	    for metric_name in lengths_by_metric:
	        none_indices = [i for i, value in enumerate(self.key_results[metric_name]) if value is None]
	        if none_indices:
	            notices.append(f"Metric '{metric_name}' has missing data (None) at indices: {none_indices}")

	    return problems + [f"WARNING: {warning}" for warning in notices]

	def get_justification_text(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> str:
	    """
	    Pull the justification text out of an entry.

	    A plain string is returned unchanged; a dict yields its "justification"
	    value (empty string when the key is absent); anything else yields "".
	    """
	    if isinstance(justification_item, dict):
	        return justification_item.get("justification", "")
	    return justification_item if isinstance(justification_item, str) else ""

	def get_justification_agent_reference(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> Optional[str]:
	    """
	    Resolve which agent a justification entry refers to.

	    Only dict entries carry a reference. An "agent_name" key wins and its
	    value is returned as-is; otherwise an "agent_index" key is resolved
	    through ``get_agent_name``. Returns None when no reference is present.
	    """
	    if not isinstance(justification_item, dict):
	        return None

	    if "agent_name" in justification_item:
	        return justification_item["agent_name"]

	    if "agent_index" in justification_item:
	        return self.get_agent_name(justification_item["agent_index"])

	    return None


	class SimulationExperimentEmpiricalValidationResult(BaseModel):
	    """
	    Outcome of validating a simulation experiment dataset against empirical data.

	    Holds the results of the statistical and/or semantic comparison
	    between a control (empirical) dataset and a treatment (simulation) dataset,
	    together with an aggregate score and a textual summary.
	    """
	    # Type of validation performed.
	    validation_type: str
	    # Name of the control/empirical dataset.
	    control_name: str
	    # Name of the treatment/simulation experiment dataset.
	    treatment_name: str
	    # Results from statistical tests (if performed).
	    statistical_results: Optional[Dict[str, Any]] = None
	    # Results from semantic proximity analysis (if performed).
	    semantic_results: Optional[Dict[str, Any]] = None
	    # Overall validation score, constrained to the range 0.0 to 1.0.
	    overall_score: Optional[float] = Field(None, ge=0.0, le=1.0, description="Overall validation score between 0.0 and 1.0")
	    # Summary of validation findings.
	    summary: str = ""
	    # ISO-format time at which the result object was created.
	    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())

	    class Config:
	        """Pydantic configuration."""
	        extra = "forbid"
	        validate_assignment = True


	class SimulationExperimentEmpiricalValidator:
	"""
	A validator for comparing simulation experiment data against empirical control data.

	This validator performs data-driven validation using statistical hypothesis testing
	and semantic proximity analysis of agent justifications. It is designed to validate
	simulation experiment results against known empirical benchmarks, distinct from LLM-based evaluations.
	"""

	def __init__(self):
	"""Initialize the simulation experiment empirical validator."""
	pass

	def validate(self,
	             control: SimulationExperimentDataset,
	             treatment: SimulationExperimentDataset,
	             validation_types: List[str] = ["statistical", "semantic"],
	             significance_level: float = 0.05,
	             output_format: str = "values") -> Union[SimulationExperimentEmpiricalValidationResult, str]:
	    """
	    Compare a simulation dataset with an empirical control dataset.

	    Args:
	        control: The control/empirical reference dataset
	        treatment: The treatment/simulation experiment dataset to validate
	        validation_types: Which validations to run ("statistical", "semantic")
	        significance_level: Significance level passed to the statistical tests
	        output_format: "report" returns a markdown string; any other value
	            returns the SimulationExperimentEmpiricalValidationResult object

	    Returns:
	        SimulationExperimentEmpiricalValidationResult object or markdown report string
	    """
	    outcome = SimulationExperimentEmpiricalValidationResult(
	        validation_type=", ".join(validation_types),
	        control_name=control.name or "Control",
	        treatment_name=treatment.name or "Treatment"
	    )

	    run_statistical = "statistical" in validation_types
	    run_semantic = "semantic" in validation_types

	    if run_statistical:
	        outcome.statistical_results = self._perform_statistical_validation(
	            control, treatment, significance_level
	        )

	    if run_semantic:
	        outcome.semantic_results = self._perform_semantic_validation(control, treatment)

	    # The score must be set before the summary, which reports it.
	    outcome.overall_score = self._calculate_overall_score(outcome)
	    outcome.summary = self._generate_summary(outcome)

	    if output_format != "report":
	        return outcome
	    return self._generate_markdown_report(outcome)

	def _perform_statistical_validation(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset,
	significance_level: float) -> Dict[str, Any]:
	"""Perform statistical hypothesis testing on simulation experiment key results."""
	if not control.key_results or not treatment.key_results:
	return {"error": "No key results available for statistical testing"}

	try:
	# Prepare data for StatisticalTester
	control_data = {"control": {}}
	treatment_data = {"treatment": {}}

	# Convert single values to lists if needed and find common metrics
	common_metrics = set(control.key_results.keys()) & set(treatment.key_results.keys())

	for metric in common_metrics:
	control_value = control.key_results[metric]
	treatment_value = treatment.key_results[metric]

	# Convert single values to lists and filter out None values
	if not isinstance(control_value, list):
	control_value = [control_value] if control_value is not None else []
	else:
	control_value = [v for v in control_value if v is not None]

	if not isinstance(treatment_value, list):
	treatment_value = [treatment_value] if treatment_value is not None else []
	else:
	treatment_value = [v for v in treatment_value if v is not None]

	# Only include metrics that have valid data points
	if len(control_value) > 0 and len(treatment_value) > 0:
	control_data["control"][metric] = control_value
	treatment_data["treatment"][metric] = treatment_value

	if not common_metrics:
	return {"error": "No common metrics found between control and treatment"}

	# Run statistical tests
	tester = StatisticalTester(control_data, treatment_data)
	test_results = tester.run_test(
	test_type="welch_t_test",
	alpha=significance_level
	)

	return {
	"common_metrics": list(common_metrics),
	"test_results": test_results,
	"significance_level": significance_level
	}

	except Exception as e:
	return {"error": f"Statistical testing failed: {str(e)}"}

	def _perform_semantic_validation(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset) -> Dict[str, Any]:
	"""Perform semantic proximity analysis on simulation experiment agent justifications."""
	results = {
	"individual_comparisons": [],
	"summary_comparison": None,
	"average_proximity": None
	}

	# Compare individual justifications if available
	if control.agent_justifications and treatment.agent_justifications:
	proximities = []

	for i, control_just in enumerate(control.agent_justifications):
	for j, treatment_just in enumerate(treatment.agent_justifications):
	control_text = control.get_justification_text(control_just)
	treatment_text = treatment.get_justification_text(treatment_just)

	if control_text and treatment_text:
	proximity_result = compute_semantic_proximity(
	control_text,
	treatment_text,
	context="Comparing agent justifications from simulation experiments"
	)

	# Get agent references (names or indices)
	control_agent_ref = control.get_justification_agent_reference(control_just) or f"Agent_{i}"
	treatment_agent_ref = treatment.get_justification_agent_reference(treatment_just) or f"Agent_{j}"

	comparison = {
	"control_agent": control_agent_ref,
	"treatment_agent": treatment_agent_ref,
	"proximity_score": proximity_result["proximity_score"],
	"justification": proximity_result["justification"]
	}

	results["individual_comparisons"].append(comparison)
	proximities.append(proximity_result["proximity_score"])

	if proximities:
	results["average_proximity"] = sum(proximities) / len(proximities)

	# Compare summary justifications if available
	if control.justification_summary and treatment.justification_summary:
	summary_proximity = compute_semantic_proximity(
	control.justification_summary,
	treatment.justification_summary,
	context="Comparing summary justifications from simulation experiments"
	)
	results["summary_comparison"] = summary_proximity

	return results

	def _calculate_overall_score(self, result: SimulationExperimentEmpiricalValidationResult) -> float:
	"""Calculate an overall simulation experiment empirical validation score based on statistical and semantic results."""
	scores = []

	# Statistical component based on effect sizes
	if result.statistical_results and "test_results" in result.statistical_results:
	test_results = result.statistical_results["test_results"]
	effect_sizes = []

	for treatment_name, treatment_results in test_results.items():
	for metric, metric_result in treatment_results.items():
	# Extract effect size based on test type
	effect_size = self._extract_effect_size(metric_result)
	if effect_size is not None:
	effect_sizes.append(effect_size)

	if effect_sizes:
	# Convert effect sizes to similarity scores (closer to 0 = more similar)
	# Use inverse transformation: similarity = 1 / (1 + \|effect_size\|)
	similarity_scores = [1.0 / (1.0 + abs(es)) for es in effect_sizes]
	statistical_score = sum(similarity_scores) / len(similarity_scores)
	scores.append(statistical_score)

	# Semantic component
	if result.semantic_results:
	semantic_scores = []

	# Average proximity from individual comparisons
	if result.semantic_results.get("average_proximity") is not None:
	semantic_scores.append(result.semantic_results["average_proximity"])

	# Summary proximity
	if result.semantic_results.get("summary_comparison"):
	semantic_scores.append(result.semantic_results["summary_comparison"]["proximity_score"])

	if semantic_scores:
	scores.append(sum(semantic_scores) / len(semantic_scores))

	return sum(scores) / len(scores) if scores else 0.0

	def _generate_summary(self, result: SimulationExperimentEmpiricalValidationResult) -> str:
	"""Generate a text summary of the simulation experiment empirical validation results."""
	summary_parts = []

	if result.statistical_results:
	if "error" in result.statistical_results:
	summary_parts.append(f"Statistical validation: {result.statistical_results['error']}")
	else:
	test_results = result.statistical_results.get("test_results", {})
	effect_sizes = []
	significant_tests = 0
	total_tests = 0

	for treatment_results in test_results.values():
	for metric_result in treatment_results.values():
	total_tests += 1
	if metric_result.get("significant", False):
	significant_tests += 1

	# Collect effect sizes
	effect_size = self._extract_effect_size(metric_result)
	if effect_size is not None:
	effect_sizes.append(abs(effect_size))

	if effect_sizes:
	avg_effect_size = sum(effect_sizes) / len(effect_sizes)
	summary_parts.append(
	f"Statistical validation: {significant_tests}/{total_tests} tests significant, "
	f"average effect size: {avg_effect_size:.3f}"
	)
	else:
	summary_parts.append(
	f"Statistical validation: {significant_tests}/{total_tests} tests showed significant differences"
	)

	if result.semantic_results:
	avg_proximity = result.semantic_results.get("average_proximity")
	if avg_proximity is not None:
	summary_parts.append(
	f"Semantic validation: Average proximity score of {avg_proximity:.3f}"
	)

	summary_comparison = result.semantic_results.get("summary_comparison")
	if summary_comparison:
	summary_parts.append(
	f"Summary proximity: {summary_comparison['proximity_score']:.3f}"
	)

	if result.overall_score is not None:
	summary_parts.append(f"Overall validation score: {result.overall_score:.3f}")

	return "; ".join(summary_parts) if summary_parts else "No validation results available"

	def _generate_markdown_report(self, result: SimulationExperimentEmpiricalValidationResult) -> str:
	"""Generate a comprehensive markdown report for simulation experiment empirical validation."""
	overall_score_str = f"{result.overall_score:.3f}" if result.overall_score is not None else "N/A"

	report = f"""# Simulation Experiment Empirical Validation Report

	Validation Type: {result.validation_type}
	Control/Empirical: {result.control_name}
	Treatment/Simulation: {result.treatment_name}
	Timestamp: {result.timestamp}
	Overall Score: {overall_score_str}

	## Summary

	{result.summary}

	"""

	# Statistical Results Section
	if result.statistical_results:
	report += "## Statistical Validation\n\n"

	if "error" in result.statistical_results:
	report += f"Error: {result.statistical_results['error']}\n\n"
	else:
	stats = result.statistical_results
	report += f"Common Metrics: {', '.join(stats.get('common_metrics', []))}\n\n"
	report += f"Significance Level: {stats.get('significance_level', 'N/A')}\n\n"

	test_results = stats.get("test_results", {})
	if test_results:
	report += "### Test Results\n\n"

	for treatment_name, treatment_results in test_results.items():
	report += f"#### {treatment_name}\n\n"

	for metric, metric_result in treatment_results.items():
	report += f"{metric}:\n\n"

	significant = metric_result.get("significant", False)
	p_value = metric_result.get("p_value", "N/A")
	test_type = metric_result.get("test_type", "N/A")
	effect_size = self._extract_effect_size(metric_result)

	# Get the appropriate statistic based on test type
	statistic = "N/A"
	if "t_statistic" in metric_result:
	statistic = metric_result["t_statistic"]
	elif "u_statistic" in metric_result:
	statistic = metric_result["u_statistic"]
	elif "f_statistic" in metric_result:
	statistic = metric_result["f_statistic"]
	elif "chi2_statistic" in metric_result:
	statistic = metric_result["chi2_statistic"]

	status = "✅ Significant" if significant else "❌ Not Significant"

	report += f"- {test_type}: {status}\n"
	report += f" - p-value: {p_value}\n"
	report += f" - statistic: {statistic}\n"
	if effect_size is not None:
	effect_interpretation = self._interpret_effect_size(abs(effect_size))
	report += f" - effect size: {effect_size:.3f} ({effect_interpretation})\n"

	report += "\n"

	# Semantic Results Section
	if result.semantic_results:
	report += "## Semantic Validation\n\n"

	semantic = result.semantic_results

	# Individual comparisons
	individual_comps = semantic.get("individual_comparisons", [])
	if individual_comps:
	report += "### Individual Agent Comparisons\n\n"

	for comp in individual_comps:
	score = comp["proximity_score"]
	control_agent = comp["control_agent"]
	treatment_agent = comp["treatment_agent"]
	justification = comp["justification"]

	report += f"{control_agent} vs {treatment_agent}: {score:.3f}\n\n"
	report += f"{justification}\n\n"

	avg_proximity = semantic.get("average_proximity")
	if avg_proximity:
	report += f"Average Proximity Score: {avg_proximity:.3f}\n\n"

	# Summary comparison
	summary_comp = semantic.get("summary_comparison")
	if summary_comp:
	report += "### Summary Comparison\n\n"
	report += f"Proximity Score: {summary_comp['proximity_score']:.3f}\n\n"
	report += f"Justification: {summary_comp['justification']}\n\n"

	return report

	def _extract_effect_size(self, metric_result: Dict[str, Any]) -> Optional[float]:
	"""Extract effect size from statistical test result, regardless of test type."""
	# Cohen's d for t-tests (most common)
	if "effect_size" in metric_result:
	return metric_result["effect_size"]

	# For tests that don't provide Cohen's d, calculate standardized effect size
	test_type = metric_result.get("test_type", "").lower()

	if "t-test" in test_type:
	# For t-tests, effect_size should be Cohen's d
	return metric_result.get("effect_size", 0.0)

	elif "mann-whitney" in test_type:
	# For Mann-Whitney, use Common Language Effect Size (CLES)
	# Convert CLES to Cohen's d equivalent: d ≈ 2 * Φ^(-1)(CLES)
	cles = metric_result.get("effect_size", 0.5)
	# Simple approximation: convert CLES to d-like measure
	# CLES of 0.5 = no effect, CLES of 0.71 ≈ small effect (d=0.2)
	return 2 * (cles - 0.5)

	elif "anova" in test_type:
	# For ANOVA, use eta-squared and convert to Cohen's d equivalent
	eta_squared = metric_result.get("effect_size", 0.0)
	# Convert eta-squared to Cohen's d: d = 2 * sqrt(eta^2 / (1 - eta^2))
	if eta_squared > 0 and eta_squared < 1:
	return 2 * (eta_squared / (1 - eta_squared)) ** 0.5
	return 0.0

	elif "chi-square" in test_type:
	# For Chi-square, use Cramer's V and convert to Cohen's d equivalent
	cramers_v = metric_result.get("effect_size", 0.0)
	# Rough conversion: d ≈ 2 * Cramer's V
	return 2 * cramers_v

	# Fallback: try to calculate from means and standard deviations
	if all(k in metric_result for k in ["control_mean", "treatment_mean", "control_std", "treatment_std"]):
	control_mean = metric_result["control_mean"]
	treatment_mean = metric_result["treatment_mean"]
	control_std = metric_result["control_std"]
	treatment_std = metric_result["treatment_std"]

	# Calculate pooled standard deviation
	pooled_std = ((control_std 2 + treatment_std 2) / 2) ** 0.5
	if pooled_std > 0:
	return abs(treatment_mean - control_mean) / pooled_std

	# If all else fails, return 0 (no effect)
	return 0.0

	def _interpret_effect_size(self, effect_size: float) -> str:
	"""Provide interpretation of effect size magnitude (Cohen's conventions)."""
	if effect_size < 0.2:
	return "negligible"
	elif effect_size < 0.5:
	return "small"
	elif effect_size < 0.8:
	return "medium"
	else:
	return "large"


	def validate_simulation_experiment_empirically(control_data: Dict[str, Any],
	treatment_data: Dict[str, Any],
	validation_types: List[str] = ["statistical", "semantic"],
	significance_level: float = 0.05,
	output_format: str = "values") -> Union[SimulationExperimentEmpiricalValidationResult, str]:
	"""
	Convenience function to validate simulation experiment data against empirical control data.

	This performs data-driven validation using statistical and semantic methods,
	distinct from LLM-based evaluations. Both dictionaries are parsed into
	SimulationExperimentDataset objects and handed to a freshly created
	SimulationExperimentEmpiricalValidator.

	Args:
	control_data: Dictionary containing control/empirical data
	treatment_data: Dictionary containing treatment/simulation experiment data
	validation_types: List of validation types to perform
	significance_level: Significance level for statistical tests
	output_format: "values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report

	Returns:
	SimulationExperimentEmpiricalValidationResult object or markdown report string
	"""
	# NOTE(review): the list default of validation_types is a single shared object;
	# this function only forwards it and never mutates it.
	# Use Pydantic's built-in parsing instead of from_dict
	# NOTE(review): presumably parse_obj raises a pydantic validation error for
	# malformed dictionaries - confirm against the pydantic version in use.
	control_dataset = SimulationExperimentDataset.parse_obj(control_data)
	treatment_dataset = SimulationExperimentDataset.parse_obj(treatment_data)

	# All remaining arguments are forwarded unchanged to the validator.
	validator = SimulationExperimentEmpiricalValidator()
	return validator.validate(
	control_dataset,
	treatment_dataset,
	validation_types=validation_types,
	significance_level=significance_level,
	output_format=output_format
	)</code></pre>
	</details>
	</section>
	<section>
	</section>
	<section>
	</section>
	<section>
	<h2 class="section-title" id="header-functions">Functions</h2>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.validate_simulation_experiment_empirically"><code class="name flex">
	<span>def <span class="ident">validate_simulation_experiment_empirically</span></span>(<span>control_data: Dict[str, Any], treatment_data: Dict[str, Any], validation_types: List[str] = ['statistical', 'semantic'], significance_level: float = 0.05, output_format: str = 'values') ‑> Union[<a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult">SimulationExperimentEmpiricalValidationResult</a>, str]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Convenience function to validate simulation experiment data against empirical control data.</p>
	<p>This performs data-driven validation using statistical and semantic methods,
	distinct from LLM-based evaluations.</p>
	<h2 id="args">Args</h2>
	<dl>
	<dt><strong><code>control_data</code></strong></dt>
	<dd>Dictionary containing control/empirical data</dd>
	<dt><strong><code>treatment_data</code></strong></dt>
	<dd>Dictionary containing treatment/simulation experiment data</dd>
	<dt><strong><code>validation_types</code></strong></dt>
	<dd>List of validation types to perform</dd>
	<dt><strong><code>significance_level</code></strong></dt>
	<dd>Significance level for statistical tests</dd>
	<dt><strong><code>output_format</code></strong></dt>
	<dd>"values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report</dd>
	</dl>
	<h2 id="returns">Returns</h2>
	<p>SimulationExperimentEmpiricalValidationResult object or markdown report string</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def validate_simulation_experiment_empirically(control_data: Dict[str, Any],
	treatment_data: Dict[str, Any],
	validation_types: List[str] = ["statistical", "semantic"],
	significance_level: float = 0.05,
	output_format: str = "values") -> Union[SimulationExperimentEmpiricalValidationResult, str]:
	"""
	Convenience function to validate simulation experiment data against empirical control data.

	This performs data-driven validation using statistical and semantic methods,
	distinct from LLM-based evaluations.

	Args:
	control_data: Dictionary containing control/empirical data
	treatment_data: Dictionary containing treatment/simulation experiment data
	validation_types: List of validation types to perform
	significance_level: Significance level for statistical tests
	output_format: "values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report

	Returns:
	SimulationExperimentEmpiricalValidationResult object or markdown report string
	"""
	# Use Pydantic's built-in parsing instead of from_dict
	control_dataset = SimulationExperimentDataset.parse_obj(control_data)
	treatment_dataset = SimulationExperimentDataset.parse_obj(treatment_data)

	validator = SimulationExperimentEmpiricalValidator()
	return validator.validate(
	control_dataset,
	treatment_dataset,
	validation_types=validation_types,
	significance_level=significance_level,
	output_format=output_format
	)</code></pre>
	</details>
	</dd>
	</dl>
	</section>
	<section>
	<h2 class="section-title" id="header-classes">Classes</h2>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset"><code class="flex name class">
	<span>class <span class="ident">SimulationExperimentDataset</span></span>
	<span>(</span><span>**data: Any)</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Represents a dataset from a simulation experiment or empirical study.</p>
	<p>This contains data that can be used for validation, including quantitative metrics
	and qualitative agent justifications from simulation experiments or empirical studies.</p>
	<h2 id="attributes">Attributes</h2>
	<dl>
	<dt><strong><code>name</code></strong></dt>
	<dd>Optional name for the dataset</dd>
	<dt><strong><code>description</code></strong></dt>
	<dd>Optional description of the dataset</dd>
	<dt><strong><code>key_results</code></strong></dt>
	<dd>Map from result names to their values (numbers, proportions, booleans, etc.)</dd>
	<dt><strong><code>result_types</code></strong></dt>
	<dd>Map indicating whether each result is "aggregate" or "per_agent"</dd>
	<dt><strong><code>agent_names</code></strong></dt>
	<dd>Optional list of agent names (can be referenced by index in results)</dd>
	<dt><strong><code>agent_justifications</code></strong></dt>
	<dd>List of justifications (with optional agent references)</dd>
	<dt><strong><code>justification_summary</code></strong></dt>
	<dd>Optional summary of all agent justifications</dd>
	</dl>
	<p>Create a new model by parsing and validating input data from keyword arguments.</p>
	<p>Raises <code>pydantic_core.ValidationError</code> if the input data cannot be
	validated to form a valid model.</p>
	<p><code>__init__</code> uses <code>__pydantic_self__</code> instead of the more common <code>self</code> for the first arg to
	allow <code>self</code> as a field name.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">class SimulationExperimentDataset(BaseModel):
	"""
	Represents a dataset from a simulation experiment or empirical study.

	This contains data that can be used for validation, including quantitative metrics
	and qualitative agent justifications from simulation experiments or empirical studies.

	Attributes:
	name: Optional name for the dataset
	description: Optional description of the dataset
	key_results: Map from result names to their values (numbers, proportions, booleans, etc.)
	result_types: Map indicating whether each result is "aggregate" or "per_agent"
	agent_names: Optional list of agent names (can be referenced by index in results)
	agent_justifications: List of justifications (with optional agent references)
	justification_summary: Optional summary of all agent justifications
	"""
	name: Optional[str] = None
	description: Optional[str] = None
	key_results: Dict[str, Union[float, int, bool, List[Union[float, int, bool, None]], None]] = Field(default_factory=dict)
	result_types: Dict[str, str] = Field(default_factory=dict, description="Map from result name to 'aggregate' or 'per_agent'")
	agent_names: Optional[List[Optional[str]]] = Field(None, description="Optional list of agent names for reference (can contain None for unnamed agents)")
	agent_justifications: List[Union[str, Dict[str, Union[str, int]]]] = Field(
	default_factory=list,
	description="List of justifications as strings or dicts with optional 'agent_name'/'agent_index' and 'justification'"
	)
	justification_summary: Optional[str] = None

	class Config:
	"""Pydantic configuration."""
	extra = "forbid" # Prevent accidental extra fields
	validate_assignment = True # Validate on assignment after creation

	def get_agent_name(self, index: int) -> Optional[str]:
	"""Get agent name by index, if available."""
	if self.agent_names and 0 <= index < len(self.agent_names):
	agent_name = self.agent_names[index]
	return agent_name if agent_name is not None else None
	return None

	def get_agent_data(self, metric_name: str, agent_index: int) -> Optional[Union[float, int, bool]]:
	"""Get a specific agent's data for a given metric. Returns None for missing data."""
	if metric_name not in self.key_results:
	return None

	metric_data = self.key_results[metric_name]

	# Check if it's per-agent data
	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	if 0 <= agent_index < len(metric_data):
	return metric_data[agent_index] # This can be None for missing data

	return None

	def get_all_agent_data(self, metric_name: str) -> Dict[str, Union[float, int, bool]]:
	"""Get all agents' data for a given metric as a dictionary mapping agent names/indices to values."""
	if metric_name not in self.key_results:
	return {}

	metric_data = self.key_results[metric_name]
	result = {}

	# For per-agent data, create mapping
	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	for i, value in enumerate(metric_data):
	agent_name = self.get_agent_name(i) or f"Agent_{i}"
	# Only include non-None values in the result
	if value is not None:
	result[agent_name] = value

	# For aggregate data, return single value
	elif self.result_types.get(metric_name) == "aggregate":
	result["aggregate"] = metric_data

	return result

	def get_valid_agent_data(self, metric_name: str) -> List[Union[float, int, bool]]:
	"""Get only valid (non-None) values for a per-agent metric."""
	if metric_name not in self.key_results:
	return []

	metric_data = self.key_results[metric_name]

	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	return [value for value in metric_data if value is not None]

	return []

	def validate_data_consistency(self) -> List[str]:
	"""Validate that per-agent data is consistent across metrics and with agent names."""
	errors = []
	warnings = []

	# Check per-agent metrics have consistent lengths
	per_agent_lengths = []
	per_agent_metrics = []

	for metric_name, result_type in self.result_types.items():
	if result_type == "per_agent" and metric_name in self.key_results:
	metric_data = self.key_results[metric_name]
	if isinstance(metric_data, list):
	per_agent_lengths.append(len(metric_data))
	per_agent_metrics.append(metric_name)
	else:
	errors.append(f"Metric '{metric_name}' marked as per_agent but is not a list")

	# Check all per-agent metrics have same length
	if per_agent_lengths and len(set(per_agent_lengths)) > 1:
	errors.append(f"Per-agent metrics have inconsistent lengths: {dict(zip(per_agent_metrics, per_agent_lengths))}")

	# Check agent_names length matches per-agent data length
	if self.agent_names and per_agent_lengths:
	agent_count = len(self.agent_names)
	data_length = per_agent_lengths[0] if per_agent_lengths else 0
	if agent_count != data_length:
	errors.append(f"agent_names length ({agent_count}) doesn't match per-agent data length ({data_length})")

	# Check for None values in agent_names and provide warnings
	if self.agent_names:
	none_indices = [i for i, name in enumerate(self.agent_names) if name is None]
	if none_indices:
	warnings.append(f"agent_names contains None values at indices: {none_indices}")

	# Check for None values in per-agent data and provide info
	for metric_name in per_agent_metrics:
	if metric_name in self.key_results:
	metric_data = self.key_results[metric_name]
	none_indices = [i for i, value in enumerate(metric_data) if value is None]
	if none_indices:
	warnings.append(f"Metric '{metric_name}' has missing data (None) at indices: {none_indices}")

	# Return errors and warnings combined
	return errors + [f"WARNING: {warning}" for warning in warnings]

	def get_justification_text(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> str:
	"""Extract justification text from various formats."""
	if isinstance(justification_item, str):
	return justification_item
	elif isinstance(justification_item, dict):
	return justification_item.get("justification", "")
	return ""

	def get_justification_agent_reference(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> Optional[str]:
	"""Get agent reference from justification, returning name if available."""
	if isinstance(justification_item, dict):
	# Direct agent name
	if "agent_name" in justification_item:
	return justification_item["agent_name"]
	# Agent index reference
	elif "agent_index" in justification_item:
	return self.get_agent_name(justification_item["agent_index"])
	return None</code></pre>
	</details>
	<h3>Ancestors</h3>
	<ul class="hlist">
	<li>pydantic.main.BaseModel</li>
	</ul>
	<h3>Class variables</h3>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.Config"><code class="name">var <span class="ident">Config</span></code></dt>
	<dd>
	<div class="desc"><p>Pydantic configuration.</p></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_justifications"><code class="name">var <span class="ident">agent_justifications</span> : List[Union[str, Dict[str, Union[str, int]]]]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_names"><code class="name">var <span class="ident">agent_names</span> : Optional[List[Optional[str]]]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.description"><code class="name">var <span class="ident">description</span> : Optional[str]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.justification_summary"><code class="name">var <span class="ident">justification_summary</span> : Optional[str]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.key_results"><code class="name">var <span class="ident">key_results</span> : Dict[str, Union[float, int, bool, List[Union[float, int, bool, None]], None]]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_config"><code class="name">var <span class="ident">model_config</span></code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_fields"><code class="name">var <span class="ident">model_fields</span></code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.name"><code class="name">var <span class="ident">name</span> : Optional[str]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.result_types"><code class="name">var <span class="ident">result_types</span> : Dict[str, str]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	</dl>
	<h3>Methods</h3>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_data"><code class="name flex">
	<span>def <span class="ident">get_agent_data</span></span>(<span>self, metric_name: str, agent_index: int) ‑> Optional[Union[float, int, bool]]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Get a specific agent's data for a given metric. Returns None for missing data.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_agent_data(self, metric_name: str, agent_index: int) -> Optional[Union[float, int, bool]]:
	"""Get a specific agent's data for a given metric. Returns None for missing data."""
	if metric_name not in self.key_results:
	return None

	metric_data = self.key_results[metric_name]

	# Check if it's per-agent data
	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	if 0 <= agent_index < len(metric_data):
	return metric_data[agent_index] # This can be None for missing data

	return None</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_name"><code class="name flex">
	<span>def <span class="ident">get_agent_name</span></span>(<span>self, index: int) ‑> Optional[str]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Get agent name by index, if available.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_agent_name(self, index: int) -> Optional[str]:
	"""Get agent name by index, if available."""
	if self.agent_names and 0 <= index < len(self.agent_names):
	agent_name = self.agent_names[index]
	return agent_name if agent_name is not None else None
	return None</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_all_agent_data"><code class="name flex">
	<span>def <span class="ident">get_all_agent_data</span></span>(<span>self, metric_name: str) ‑> Dict[str, Union[float, int, bool]]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Get all agents' data for a given metric as a dictionary mapping agent names/indices to values.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_all_agent_data(self, metric_name: str) -> Dict[str, Union[float, int, bool]]:
	"""Get all agents' data for a given metric as a dictionary mapping agent names/indices to values."""
	if metric_name not in self.key_results:
	return {}

	metric_data = self.key_results[metric_name]
	result = {}

	# For per-agent data, create mapping
	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	for i, value in enumerate(metric_data):
	agent_name = self.get_agent_name(i) or f"Agent_{i}"
	# Only include non-None values in the result
	if value is not None:
	result[agent_name] = value

	# For aggregate data, return single value
	elif self.result_types.get(metric_name) == "aggregate":
	result["aggregate"] = metric_data

	return result</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_agent_reference"><code class="name flex">
	<span>def <span class="ident">get_justification_agent_reference</span></span>(<span>self, justification_item: Union[str, Dict[str, Union[str, int]]]) ‑> Optional[str]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Get agent reference from justification, returning name if available.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_justification_agent_reference(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> Optional[str]:
	"""Get agent reference from justification, returning name if available."""
	if isinstance(justification_item, dict):
	# Direct agent name
	if "agent_name" in justification_item:
	return justification_item["agent_name"]
	# Agent index reference
	elif "agent_index" in justification_item:
	return self.get_agent_name(justification_item["agent_index"])
	return None</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_text"><code class="name flex">
	<span>def <span class="ident">get_justification_text</span></span>(<span>self, justification_item: Union[str, Dict[str, Union[str, int]]]) ‑> str</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Extract justification text from various formats.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_justification_text(self, justification_item: Union[str, Dict[str, Union[str, int]]]) -> str:
	"""Extract justification text from various formats."""
	if isinstance(justification_item, str):
	return justification_item
	elif isinstance(justification_item, dict):
	return justification_item.get("justification", "")
	return ""</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_valid_agent_data"><code class="name flex">
	<span>def <span class="ident">get_valid_agent_data</span></span>(<span>self, metric_name: str) ‑> List[Union[float, int, bool]]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Get only valid (non-None) values for a per-agent metric.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def get_valid_agent_data(self, metric_name: str) -> List[Union[float, int, bool]]:
	"""Get only valid (non-None) values for a per-agent metric."""
	if metric_name not in self.key_results:
	return []

	metric_data = self.key_results[metric_name]

	if self.result_types.get(metric_name) == "per_agent" and isinstance(metric_data, list):
	return [value for value in metric_data if value is not None]

	return []</code></pre>
	</details>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.validate_data_consistency"><code class="name flex">
	<span>def <span class="ident">validate_data_consistency</span></span>(<span>self) ‑> List[str]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Validate that per-agent data is consistent across metrics and with agent names.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def validate_data_consistency(self) -> List[str]:
	"""Validate that per-agent data is consistent across metrics and with agent names."""
	errors = []
	warnings = []

	# Check per-agent metrics have consistent lengths
	per_agent_lengths = []
	per_agent_metrics = []

	for metric_name, result_type in self.result_types.items():
	if result_type == "per_agent" and metric_name in self.key_results:
	metric_data = self.key_results[metric_name]
	if isinstance(metric_data, list):
	per_agent_lengths.append(len(metric_data))
	per_agent_metrics.append(metric_name)
	else:
	errors.append(f"Metric '{metric_name}' marked as per_agent but is not a list")

	# Check all per-agent metrics have same length
	if per_agent_lengths and len(set(per_agent_lengths)) > 1:
	errors.append(f"Per-agent metrics have inconsistent lengths: {dict(zip(per_agent_metrics, per_agent_lengths))}")

	# Check agent_names length matches per-agent data length
	if self.agent_names and per_agent_lengths:
	agent_count = len(self.agent_names)
	data_length = per_agent_lengths[0] if per_agent_lengths else 0
	if agent_count != data_length:
	errors.append(f"agent_names length ({agent_count}) doesn't match per-agent data length ({data_length})")

	# Check for None values in agent_names and provide warnings
	if self.agent_names:
	none_indices = [i for i, name in enumerate(self.agent_names) if name is None]
	if none_indices:
	warnings.append(f"agent_names contains None values at indices: {none_indices}")

	# Check for None values in per-agent data and provide info
	for metric_name in per_agent_metrics:
	if metric_name in self.key_results:
	metric_data = self.key_results[metric_name]
	none_indices = [i for i, value in enumerate(metric_data) if value is None]
	if none_indices:
	warnings.append(f"Metric '{metric_name}' has missing data (None) at indices: {none_indices}")

	# Return errors and warnings combined
	return errors + [f"WARNING: {warning}" for warning in warnings]</code></pre>
	</details>
	</dd>
	</dl>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult"><code class="flex name class">
	<span>class <span class="ident">SimulationExperimentEmpiricalValidationResult</span></span>
	<span>(</span><span>**data: Any)</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Contains the results of a simulation experiment validation against empirical data.</p>
	<p>This represents the outcome of validating simulation experiment data
	against empirical benchmarks, using statistical and semantic methods.</p>
	<h2 id="validation-result-attributes">Attributes</h2>
	<dl>
	<dt><strong><code>validation_type</code></strong></dt>
	<dd>Type of validation performed</dd>
	<dt><strong><code>control_name</code></strong></dt>
	<dd>Name of the control/empirical dataset</dd>
	<dt><strong><code>treatment_name</code></strong></dt>
	<dd>Name of the treatment/simulation experiment dataset</dd>
	<dt><strong><code>statistical_results</code></strong></dt>
	<dd>Results from statistical tests (if performed)</dd>
	<dt><strong><code>semantic_results</code></strong></dt>
	<dd>Results from semantic proximity analysis (if performed)</dd>
	<dt><strong><code>overall_score</code></strong></dt>
	<dd>Overall validation score (0.0 to 1.0)</dd>
	<dt><strong><code>summary</code></strong></dt>
	<dd>Summary of validation findings</dd>
	<dt><strong><code>timestamp</code></strong></dt>
	<dd>When the validation was performed</dd>
	</dl>
	<p>Create a new model by parsing and validating input data from keyword arguments.</p>
	<p>Raises <code>pydantic_core.ValidationError</code> if the input data cannot be
	validated to form a valid model.</p>
	<p><code>__init__</code> uses <code>__pydantic_self__</code> instead of the more common <code>self</code> for the first arg to
	allow <code>self</code> as a field name.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">class SimulationExperimentEmpiricalValidationResult(BaseModel):
	"""
	Contains the results of a simulation experiment validation against empirical data.

	This represents the outcome of validating simulation experiment data
	against empirical benchmarks, using statistical and semantic methods.

	Attributes:
	validation_type: Type of validation performed
	control_name: Name of the control/empirical dataset
	treatment_name: Name of the treatment/simulation experiment dataset
	statistical_results: Results from statistical tests (if performed)
	semantic_results: Results from semantic proximity analysis (if performed)
	overall_score: Overall validation score (0.0 to 1.0)
	summary: Summary of validation findings
	timestamp: When the validation was performed
	"""
	validation_type: str
	control_name: str
	treatment_name: str
	statistical_results: Optional[Dict[str, Any]] = None
	semantic_results: Optional[Dict[str, Any]] = None
	overall_score: Optional[float] = Field(None, ge=0.0, le=1.0, description="Overall validation score between 0.0 and 1.0")
	summary: str = ""
	timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())

	class Config:
	"""Pydantic configuration."""
	extra = "forbid"
	validate_assignment = True</code></pre>
	</details>
	<h3>Ancestors</h3>
	<ul class="hlist">
	<li>pydantic.main.BaseModel</li>
	</ul>
	<h3>Class variables</h3>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.Config"><code class="name">var <span class="ident">Config</span></code></dt>
	<dd>
	<div class="desc"><p>Pydantic configuration.</p></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.control_name"><code class="name">var <span class="ident">control_name</span> : str</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_config"><code class="name">var <span class="ident">model_config</span></code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_fields"><code class="name">var <span class="ident">model_fields</span></code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.overall_score"><code class="name">var <span class="ident">overall_score</span> : Optional[float]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.semantic_results"><code class="name">var <span class="ident">semantic_results</span> : Optional[Dict[str, Any]]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.statistical_results"><code class="name">var <span class="ident">statistical_results</span> : Optional[Dict[str, Any]]</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.summary"><code class="name">var <span class="ident">summary</span> : str</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.timestamp"><code class="name">var <span class="ident">timestamp</span> : str</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.treatment_name"><code class="name">var <span class="ident">treatment_name</span> : str</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.validation_type"><code class="name">var <span class="ident">validation_type</span> : str</code></dt>
	<dd>
	<div class="desc"></div>
	</dd>
	</dl>
	</dd>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator"><code class="flex name class">
	<span>class <span class="ident">SimulationExperimentEmpiricalValidator</span></span>
	</code></dt>
	<dd>
	<div class="desc"><p>A validator for comparing simulation experiment data against empirical control data.</p>
	<p>This validator performs data-driven validation using statistical hypothesis testing
	and semantic proximity analysis of agent justifications. It is designed to validate
	simulation experiment results against known empirical benchmarks, distinct from LLM-based evaluations.</p>
	<p>Initialize the simulation experiment empirical validator.</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">class SimulationExperimentEmpiricalValidator:
	"""
	A validator for comparing simulation experiment data against empirical control data.

	This validator performs data-driven validation using statistical hypothesis testing
	and semantic proximity analysis of agent justifications. It is designed to validate
	simulation experiment results against known empirical benchmarks, distinct from LLM-based evaluations.
	"""

	def __init__(self):
	"""Initialize the simulation experiment empirical validator."""
	pass

	def validate(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset,
	validation_types: List[str] = ["statistical", "semantic"],
	significance_level: float = 0.05,
	output_format: str = "values") -> Union[SimulationExperimentEmpiricalValidationResult, str]:
	"""
	Validate a simulation experiment dataset against an empirical control dataset.

	Args:
	control: The control/empirical reference dataset
	treatment: The treatment/simulation experiment dataset to validate
	validation_types: List of validation types to perform ("statistical", "semantic")
	significance_level: Significance level for statistical tests
	output_format: "values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report

	Returns:
	SimulationExperimentEmpiricalValidationResult object or markdown report string
	"""
	result = SimulationExperimentEmpiricalValidationResult(
	validation_type=", ".join(validation_types),
	control_name=control.name or "Control",
	treatment_name=treatment.name or "Treatment"
	)

	# Perform statistical validation
	if "statistical" in validation_types:
	result.statistical_results = self._perform_statistical_validation(
	control, treatment, significance_level
	)

	# Perform semantic validation
	if "semantic" in validation_types:
	result.semantic_results = self._perform_semantic_validation(
	control, treatment
	)

	# Calculate overall score and summary
	result.overall_score = self._calculate_overall_score(result)
	result.summary = self._generate_summary(result)

	if output_format == "report":
	return self._generate_markdown_report(result)
	else:
	return result

	def _perform_statistical_validation(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset,
	significance_level: float) -> Dict[str, Any]:
	"""Perform statistical hypothesis testing on simulation experiment key results."""
	if not control.key_results or not treatment.key_results:
	return {"error": "No key results available for statistical testing"}

	try:
	# Prepare data for StatisticalTester
	control_data = {"control": {}}
	treatment_data = {"treatment": {}}

	# Convert single values to lists if needed and find common metrics
	common_metrics = set(control.key_results.keys()) & set(treatment.key_results.keys())

	for metric in common_metrics:
	control_value = control.key_results[metric]
	treatment_value = treatment.key_results[metric]

	# Convert single values to lists and filter out None values
	if not isinstance(control_value, list):
	control_value = [control_value] if control_value is not None else []
	else:
	control_value = [v for v in control_value if v is not None]

	if not isinstance(treatment_value, list):
	treatment_value = [treatment_value] if treatment_value is not None else []
	else:
	treatment_value = [v for v in treatment_value if v is not None]

	# Only include metrics that have valid data points
	if len(control_value) > 0 and len(treatment_value) > 0:
	control_data["control"][metric] = control_value
	treatment_data["treatment"][metric] = treatment_value

	if not common_metrics:
	return {"error": "No common metrics found between control and treatment"}

	# Run statistical tests
	tester = StatisticalTester(control_data, treatment_data)
	test_results = tester.run_test(
	test_type="welch_t_test",
	alpha=significance_level
	)

	return {
	"common_metrics": list(common_metrics),
	"test_results": test_results,
	"significance_level": significance_level
	}

	except Exception as e:
	return {"error": f"Statistical testing failed: {str(e)}"}

	def _perform_semantic_validation(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset) -> Dict[str, Any]:
	"""Perform semantic proximity analysis on simulation experiment agent justifications."""
	results = {
	"individual_comparisons": [],
	"summary_comparison": None,
	"average_proximity": None
	}

	# Compare individual justifications if available
	if control.agent_justifications and treatment.agent_justifications:
	proximities = []

	for i, control_just in enumerate(control.agent_justifications):
	for j, treatment_just in enumerate(treatment.agent_justifications):
	control_text = control.get_justification_text(control_just)
	treatment_text = treatment.get_justification_text(treatment_just)

	if control_text and treatment_text:
	proximity_result = compute_semantic_proximity(
	control_text,
	treatment_text,
	context="Comparing agent justifications from simulation experiments"
	)

	# Get agent references (names or indices)
	control_agent_ref = control.get_justification_agent_reference(control_just) or f"Agent_{i}"
	treatment_agent_ref = treatment.get_justification_agent_reference(treatment_just) or f"Agent_{j}"

	comparison = {
	"control_agent": control_agent_ref,
	"treatment_agent": treatment_agent_ref,
	"proximity_score": proximity_result["proximity_score"],
	"justification": proximity_result["justification"]
	}

	results["individual_comparisons"].append(comparison)
	proximities.append(proximity_result["proximity_score"])

	if proximities:
	results["average_proximity"] = sum(proximities) / len(proximities)

	# Compare summary justifications if available
	if control.justification_summary and treatment.justification_summary:
	summary_proximity = compute_semantic_proximity(
	control.justification_summary,
	treatment.justification_summary,
	context="Comparing summary justifications from simulation experiments"
	)
	results["summary_comparison"] = summary_proximity

	return results

	def _calculate_overall_score(self, result: SimulationExperimentEmpiricalValidationResult) -> float:
	"""Calculate an overall simulation experiment empirical validation score based on statistical and semantic results."""
	scores = []

	# Statistical component based on effect sizes
	if result.statistical_results and "test_results" in result.statistical_results:
	test_results = result.statistical_results["test_results"]
	effect_sizes = []

	for treatment_name, treatment_results in test_results.items():
	for metric, metric_result in treatment_results.items():
	# Extract effect size based on test type
	effect_size = self._extract_effect_size(metric_result)
	if effect_size is not None:
	effect_sizes.append(effect_size)

	if effect_sizes:
	# Convert effect sizes to similarity scores (closer to 0 = more similar)
	# Use inverse transformation: similarity = 1 / (1 + \|effect_size\|)
	similarity_scores = [1.0 / (1.0 + abs(es)) for es in effect_sizes]
	statistical_score = sum(similarity_scores) / len(similarity_scores)
	scores.append(statistical_score)

	# Semantic component
	if result.semantic_results:
	semantic_scores = []

	# Average proximity from individual comparisons
	if result.semantic_results.get("average_proximity") is not None:
	semantic_scores.append(result.semantic_results["average_proximity"])

	# Summary proximity
	if result.semantic_results.get("summary_comparison"):
	semantic_scores.append(result.semantic_results["summary_comparison"]["proximity_score"])

	if semantic_scores:
	scores.append(sum(semantic_scores) / len(semantic_scores))

	return sum(scores) / len(scores) if scores else 0.0

	def _generate_summary(self, result: SimulationExperimentEmpiricalValidationResult) -> str:
	"""Generate a text summary of the simulation experiment empirical validation results."""
	summary_parts = []

	if result.statistical_results:
	if "error" in result.statistical_results:
	summary_parts.append(f"Statistical validation: {result.statistical_results['error']}")
	else:
	test_results = result.statistical_results.get("test_results", {})
	effect_sizes = []
	significant_tests = 0
	total_tests = 0

	for treatment_results in test_results.values():
	for metric_result in treatment_results.values():
	total_tests += 1
	if metric_result.get("significant", False):
	significant_tests += 1

	# Collect effect sizes
	effect_size = self._extract_effect_size(metric_result)
	if effect_size is not None:
	effect_sizes.append(abs(effect_size))

	if effect_sizes:
	avg_effect_size = sum(effect_sizes) / len(effect_sizes)
	summary_parts.append(
	f"Statistical validation: {significant_tests}/{total_tests} tests significant, "
	f"average effect size: {avg_effect_size:.3f}"
	)
	else:
	summary_parts.append(
	f"Statistical validation: {significant_tests}/{total_tests} tests showed significant differences"
	)

	if result.semantic_results:
	avg_proximity = result.semantic_results.get("average_proximity")
	if avg_proximity is not None:
	summary_parts.append(
	f"Semantic validation: Average proximity score of {avg_proximity:.3f}"
	)

	summary_comparison = result.semantic_results.get("summary_comparison")
	if summary_comparison:
	summary_parts.append(
	f"Summary proximity: {summary_comparison['proximity_score']:.3f}"
	)

	if result.overall_score is not None:
	summary_parts.append(f"Overall validation score: {result.overall_score:.3f}")

	return "; ".join(summary_parts) if summary_parts else "No validation results available"

	def _generate_markdown_report(self, result: SimulationExperimentEmpiricalValidationResult) -> str:
	"""Generate a comprehensive markdown report for simulation experiment empirical validation."""
	overall_score_str = f"{result.overall_score:.3f}" if result.overall_score is not None else "N/A"

	report = f"""# Simulation Experiment Empirical Validation Report

	Validation Type: {result.validation_type}
	Control/Empirical: {result.control_name}
	Treatment/Simulation: {result.treatment_name}
	Timestamp: {result.timestamp}
	Overall Score: {overall_score_str}

	## Summary

	{result.summary}

	"""

	# Statistical Results Section
	if result.statistical_results:
	report += "## Statistical Validation\n\n"

	if "error" in result.statistical_results:
	report += f"Error: {result.statistical_results['error']}\n\n"
	else:
	stats = result.statistical_results
	report += f"Common Metrics: {', '.join(stats.get('common_metrics', []))}\n\n"
	report += f"Significance Level: {stats.get('significance_level', 'N/A')}\n\n"

	test_results = stats.get("test_results", {})
	if test_results:
	report += "### Test Results\n\n"

	for treatment_name, treatment_results in test_results.items():
	report += f"#### {treatment_name}\n\n"

	for metric, metric_result in treatment_results.items():
	report += f"{metric}:\n\n"

	significant = metric_result.get("significant", False)
	p_value = metric_result.get("p_value", "N/A")
	test_type = metric_result.get("test_type", "N/A")
	effect_size = self._extract_effect_size(metric_result)

	# Get the appropriate statistic based on test type
	statistic = "N/A"
	if "t_statistic" in metric_result:
	statistic = metric_result["t_statistic"]
	elif "u_statistic" in metric_result:
	statistic = metric_result["u_statistic"]
	elif "f_statistic" in metric_result:
	statistic = metric_result["f_statistic"]
	elif "chi2_statistic" in metric_result:
	statistic = metric_result["chi2_statistic"]

	status = "✅ Significant" if significant else "❌ Not Significant"

	report += f"- {test_type}: {status}\n"
	report += f" - p-value: {p_value}\n"
	report += f" - statistic: {statistic}\n"
	if effect_size is not None:
	effect_interpretation = self._interpret_effect_size(abs(effect_size))
	report += f" - effect size: {effect_size:.3f} ({effect_interpretation})\n"

	report += "\n"

	# Semantic Results Section
	if result.semantic_results:
	report += "## Semantic Validation\n\n"

	semantic = result.semantic_results

	# Individual comparisons
	individual_comps = semantic.get("individual_comparisons", [])
	if individual_comps:
	report += "### Individual Agent Comparisons\n\n"

	for comp in individual_comps:
	score = comp["proximity_score"]
	control_agent = comp["control_agent"]
	treatment_agent = comp["treatment_agent"]
	justification = comp["justification"]

	report += f"{control_agent} vs {treatment_agent}: {score:.3f}\n\n"
	report += f"{justification}\n\n"

	avg_proximity = semantic.get("average_proximity")
	if avg_proximity:
	report += f"Average Proximity Score: {avg_proximity:.3f}\n\n"

	# Summary comparison
	summary_comp = semantic.get("summary_comparison")
	if summary_comp:
	report += "### Summary Comparison\n\n"
	report += f"Proximity Score: {summary_comp['proximity_score']:.3f}\n\n"
	report += f"Justification: {summary_comp['justification']}\n\n"

	return report

	def _extract_effect_size(self, metric_result: Dict[str, Any]) -> Optional[float]:
	"""Extract effect size from statistical test result, regardless of test type."""
	# Cohen's d for t-tests (most common)
	if "effect_size" in metric_result:
	return metric_result["effect_size"]

	# For tests that don't provide Cohen's d, calculate standardized effect size
	test_type = metric_result.get("test_type", "").lower()

	if "t-test" in test_type:
	# For t-tests, effect_size should be Cohen's d
	return metric_result.get("effect_size", 0.0)

	elif "mann-whitney" in test_type:
	# For Mann-Whitney, use Common Language Effect Size (CLES)
	# Convert CLES to Cohen's d equivalent: d ≈ 2 * Φ^(-1)(CLES)
	cles = metric_result.get("effect_size", 0.5)
	# Simple approximation: convert CLES to d-like measure
	# CLES of 0.5 = no effect, CLES of 0.71 ≈ small effect (d=0.2)
	return 2 * (cles - 0.5)

	elif "anova" in test_type:
	# For ANOVA, use eta-squared and convert to Cohen's d equivalent
	eta_squared = metric_result.get("effect_size", 0.0)
	# Convert eta-squared to Cohen's d: d = 2 * sqrt(eta^2 / (1 - eta^2))
	if eta_squared > 0 and eta_squared < 1:
	return 2 * (eta_squared / (1 - eta_squared)) ** 0.5
	return 0.0

	elif "chi-square" in test_type:
	# For Chi-square, use Cramer's V and convert to Cohen's d equivalent
	cramers_v = metric_result.get("effect_size", 0.0)
	# Rough conversion: d ≈ 2 * Cramer's V
	return 2 * cramers_v

	# Fallback: try to calculate from means and standard deviations
	if all(k in metric_result for k in ["control_mean", "treatment_mean", "control_std", "treatment_std"]):
	control_mean = metric_result["control_mean"]
	treatment_mean = metric_result["treatment_mean"]
	control_std = metric_result["control_std"]
	treatment_std = metric_result["treatment_std"]

	# Calculate pooled standard deviation
	pooled_std = ((control_std 2 + treatment_std 2) / 2) ** 0.5
	if pooled_std > 0:
	return abs(treatment_mean - control_mean) / pooled_std

	# If all else fails, return 0 (no effect)
	return 0.0

	def _interpret_effect_size(self, effect_size: float) -> str:
	"""Provide interpretation of effect size magnitude (Cohen's conventions)."""
	# The value is compared as given (no abs() is applied here), so any negative
	# input falls into the first branch and is labelled "negligible".
	if effect_size < 0.2:
	return "negligible"
	elif effect_size < 0.5:
	# 0.2 <= effect_size < 0.5
	return "small"
	elif effect_size < 0.8:
	# 0.5 <= effect_size < 0.8
	return "medium"
	else:
	# effect_size >= 0.8
	return "large"</code></pre>
	</details>
	<h3>Methods</h3>
	<dl>
	<dt id="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator.validate"><code class="name flex">
	<span>def <span class="ident">validate</span></span>(<span>self, control: <a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset">SimulationExperimentDataset</a>, treatment: <a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset">SimulationExperimentDataset</a>, validation_types: List[str] = ['statistical', 'semantic'], significance_level: float = 0.05, output_format: str = 'values') ‑> Union[<a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult">SimulationExperimentEmpiricalValidationResult</a>, str]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Validate a simulation experiment dataset against an empirical control dataset.</p>
	<h2 id="args">Args</h2>
	<dl>
	<dt><strong><code>control</code></strong></dt>
	<dd>The control/empirical reference dataset</dd>
	<dt><strong><code>treatment</code></strong></dt>
	<dd>The treatment/simulation experiment dataset to validate</dd>
	<dt><strong><code>validation_types</code></strong></dt>
	<dd>List of validation types to perform ("statistical", "semantic")</dd>
	<dt><strong><code>significance_level</code></strong></dt>
	<dd>Significance level for statistical tests</dd>
	<dt><strong><code>output_format</code></strong></dt>
	<dd>"values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report</dd>
	</dl>
	<h2 id="returns">Returns</h2>
	<p>SimulationExperimentEmpiricalValidationResult object or markdown report string</p></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def validate(self,
	control: SimulationExperimentDataset,
	treatment: SimulationExperimentDataset,
	validation_types: List[str] = ["statistical", "semantic"],
	significance_level: float = 0.05,
	output_format: str = "values") -> Union[SimulationExperimentEmpiricalValidationResult, str]:
	"""
	Validate a simulation experiment dataset against an empirical control dataset.

	Args:
	control: The control/empirical reference dataset
	treatment: The treatment/simulation experiment dataset to validate
	validation_types: List of validation types to perform ("statistical", "semantic")
	significance_level: Significance level for statistical tests
	output_format: "values" for SimulationExperimentEmpiricalValidationResult object, "report" for markdown report

	Returns:
	SimulationExperimentEmpiricalValidationResult object or markdown report string
	"""
	# NOTE(review): the default validation_types list is a single shared object; it is
	# only read in this method (join and membership tests), never mutated.
	# The result records the requested types verbatim and falls back to the generic
	# labels "Control" / "Treatment" when a dataset has no name.
	result = SimulationExperimentEmpiricalValidationResult(
	validation_type=", ".join(validation_types),
	control_name=control.name or "Control",
	treatment_name=treatment.name or "Treatment"
	)

	# Perform statistical validation
	# (runs only when "statistical" was requested; other names are not checked here)
	if "statistical" in validation_types:
	result.statistical_results = self._perform_statistical_validation(
	control, treatment, significance_level
	)

	# Perform semantic validation
	if "semantic" in validation_types:
	result.semantic_results = self._perform_semantic_validation(
	control, treatment
	)

	# Calculate overall score and summary
	# The score is assigned first, then the summary is generated from the same result object.
	result.overall_score = self._calculate_overall_score(result)
	result.summary = self._generate_summary(result)

	# Only the exact string "report" selects the markdown output; any other value
	# returns the result object.
	if output_format == "report":
	return self._generate_markdown_report(result)
	else:
	return result</code></pre>
	</details>
	</dd>
	</dl>
	</dd>
	</dl>
	</section>
	</article>
	<nav id="sidebar">
	<h1>Index</h1>
	<div class="toc">
	<ul></ul>
	</div>
	<ul id="index">
	<li><h3>Super-module</h3>
	<ul>
	<li><code><a title="tinytroupe.validation" href="index.html">tinytroupe.validation</a></code></li>
	</ul>
	</li>
	<li><h3><a href="#header-functions">Functions</a></h3>
	<ul class="">
	<li><code><a title="tinytroupe.validation.simulation_validator.validate_simulation_experiment_empirically" href="#tinytroupe.validation.simulation_validator.validate_simulation_experiment_empirically">validate_simulation_experiment_empirically</a></code></li>
	</ul>
	</li>
	<li><h3><a href="#header-classes">Classes</a></h3>
	<ul>
	<li>
	<h4><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset">SimulationExperimentDataset</a></code></h4>
	<ul class="">
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.Config" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.Config">Config</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_justifications" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_justifications">agent_justifications</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_names" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.agent_names">agent_names</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.description" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.description">description</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_data" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_data">get_agent_data</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_name" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_agent_name">get_agent_name</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_all_agent_data" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_all_agent_data">get_all_agent_data</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_agent_reference" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_agent_reference">get_justification_agent_reference</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_text" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_justification_text">get_justification_text</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_valid_agent_data" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.get_valid_agent_data">get_valid_agent_data</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.justification_summary" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.justification_summary">justification_summary</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.key_results" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.key_results">key_results</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_config" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_config">model_config</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_fields" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.model_fields">model_fields</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.name" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.name">name</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.result_types" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.result_types">result_types</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentDataset.validate_data_consistency" href="#tinytroupe.validation.simulation_validator.SimulationExperimentDataset.validate_data_consistency">validate_data_consistency</a></code></li>
	</ul>
	</li>
	<li>
	<h4><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult">SimulationExperimentEmpiricalValidationResult</a></code></h4>
	<ul class="two-column">
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.Config" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.Config">Config</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.control_name" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.control_name">control_name</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_config" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_config">model_config</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_fields" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.model_fields">model_fields</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.overall_score" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.overall_score">overall_score</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.semantic_results" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.semantic_results">semantic_results</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.statistical_results" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.statistical_results">statistical_results</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.summary" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.summary">summary</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.timestamp" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.timestamp">timestamp</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.treatment_name" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.treatment_name">treatment_name</a></code></li>
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.validation_type" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidationResult.validation_type">validation_type</a></code></li>
	</ul>
	</li>
	<li>
	<h4><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator">SimulationExperimentEmpiricalValidator</a></code></h4>
	<ul class="">
	<li><code><a title="tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator.validate" href="#tinytroupe.validation.simulation_validator.SimulationExperimentEmpiricalValidator.validate">validate</a></code></li>
	</ul>
	</li>
	</ul>
	</li>
	</ul>
	</nav>
	</main>
	<footer id="footer">
	<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
	</footer>
	</body>
	</html>