# Mango-Metrics-NLM
# feat: Phi-3.5-MoE multi-agent model repository
# commit: c8b77b5
"""
Reasoning Module Synthetic Data Generator for MangoMAS Local
This module generates synthetic training data for the reasoning capability.
"""
import json
import logging
import random
from pathlib import Path
from typing import Any, Dict, List
from ..synthetic_data_generator import (SyntheticDataGenerator,
SyntheticDataGeneratorRegistry)
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# Templates for reasoning scenarios
# Reasoning categories used to label examples and select variable sets.
REASONING_DOMAINS = [
"logical deduction",
"causal analysis",
"statistical reasoning",
"comparative analysis",
"analogical reasoning",
"counterfactual reasoning",
"inductive reasoning",
"abductive reasoning",
"conditional reasoning",
"syllogistic reasoning",
]
# str.format templates for the "premise" section; placeholders are filled
# from the context dict built in ReasoningDataGenerator.generate_example.
PREMISE_TEMPLATES = [
"In a study of {topic}, researchers observed that {observation}.",
"The data shows that {observation} when {condition}.",
"Historical records indicate that {observation} throughout {period}.",
"Given that {fact_1} and {fact_2}, we need to determine {question}.",
"In the context of {domain}, we observe that {observation}.",
"When analyzing {topic}, we can see that {observation} correlates with {factor}.",
"System logs reveal that {observation} occurs whenever {condition}.",
"The problem states that {observation} under conditions where {condition}.",
"In the experiment, {observation} was measured when {condition}.",
"The scenario presents a situation where {observation} after {event}.",
]
# str.format templates for the multi-step "reasoning" section.
REASONING_TEMPLATES = [
"First, we need to identify the key variables: {variables}. Looking at the relationship between {var1} and {var2}, we can see that {relationship}. This suggests that {inference}. Furthermore, considering {var3}, we can deduce that {deduction}.",
"Let's analyze this step by step. If {fact_1}, then {consequence_1}. Given that {fact_2}, we can infer {consequence_2}. Combining these insights, {combined_inference}. Additionally, {extra_insight} further supports this reasoning.",
"Breaking this down systematically: 1) {step_1}, 2) {step_2}, 3) {step_3}, 4) {step_4}. The logical connection between steps 2 and 3 shows that {connection}, which leads us to {inference}.",
"The key insight here is that {key_insight}. This means that when {condition}, the result is {result}. We can verify this by examining {evidence}, which confirms {confirmation}.",
"To solve this, I'll use {method}. Starting with {starting_point}, I can determine that {determination}. This leads to {intermediate_conclusion}, and finally to {final_step}.",
]
# str.format templates wrapping the generated {conclusion} sentence.
CONCLUSION_TEMPLATES = [
"Therefore, we can conclude that {conclusion}.",
"Based on this analysis, the answer is {conclusion}.",
"The evidence strongly suggests that {conclusion}.",
"This reasoning leads to the conclusion that {conclusion}.",
"We can definitively state that {conclusion} based on the above analysis.",
]
# Topics and facts to fill in templates
# Subject-matter areas substituted for the {topic} placeholder.
TOPICS = [
"climate patterns",
"market trends",
"neural network performance",
"population demographics",
"traffic flow optimization",
"disease spread",
"economic indicators",
"algorithm efficiency",
"material properties",
"social network dynamics",
"cognitive biases",
"language acquisition",
"genetic inheritance",
"planetary motion",
"quantum phenomena",
]
# Statements substituted for {fact_1}/{fact_2}; generate_example always
# picks two distinct entries, so this list must keep at least two items.
FACTS = [
"energy consumption increases with temperature",
"user engagement drops after 3 minutes",
"performance plateaus with more than 8 layers",
"error rates increase exponentially with load",
"response time correlates with system memory",
"conversion rates are highest on Tuesdays",
"signal strength decreases with distance squared",
"learning rate affects convergence time",
"failure probability doubles every 5 years",
"growth rate is proportional to nutrient concentration",
]
class ReasoningDataGenerator(SyntheticDataGenerator):
    """Generator for synthetic reasoning data.

    Builds premise/reasoning/conclusion training examples by filling
    randomly-chosen sentence templates with randomly-chosen content
    fragments. All randomness goes through the ``random`` module, so
    seeding ``random`` makes output reproducible.
    """

    def _load_templates(self) -> Dict[str, List[str]]:
        """Load reasoning templates, optionally merged with custom ones.

        Returns:
            A dict mapping template category ("domains", "premises",
            "reasoning", "conclusions", "topics", "facts") to a list of
            template strings.

        Note:
            The module-level constant lists are copied before being
            stored. The original code put the constants themselves into
            the dict, so ``extend()``-ing them with custom templates
            mutated the shared module constants for every generator
            instance — a cross-instance data leak.
        """
        templates: Dict[str, List[str]] = {
            "domains": list(REASONING_DOMAINS),
            "premises": list(PREMISE_TEMPLATES),
            "reasoning": list(REASONING_TEMPLATES),
            "conclusions": list(CONCLUSION_TEMPLATES),
            "topics": list(TOPICS),
            "facts": list(FACTS),
        }
        # Additional domain-specific templates could be loaded from files
        template_path = self.config.get("template_path")
        if template_path and Path(template_path).exists():
            try:
                with open(template_path, "r", encoding="utf-8") as f:
                    custom_templates = json.load(f)
                # Only merge recognized categories whose value is a list;
                # anything else in the custom file is silently ignored.
                for key, values in custom_templates.items():
                    if key in templates and isinstance(values, list):
                        templates[key].extend(values)
            except Exception as e:
                # Best-effort load: fall back to built-in templates only.
                logger.warning(f"Failed to load custom templates: {e}")
        return templates

    def generate_example(self) -> Dict[str, Any]:
        """Generate a single reasoning example.

        Returns:
            A dict with keys ``premise``, ``reasoning``, ``conclusion``,
            ``domain``, ``complexity`` (one of "low"/"medium"/"high"),
            and ``metadata`` (variables, topic, reasoning_type).
        """
        # Select one template per example section.
        domain = random.choice(self.templates["domains"])
        premise_template = random.choice(self.templates["premises"])
        reasoning_template = random.choice(self.templates["reasoning"])
        conclusion_template = random.choice(self.templates["conclusions"])
        # Pick a topic and two *distinct* facts. random.sample gives the
        # same distinct-uniform distribution as the original
        # choice-and-reject loop, without the risk of spinning forever
        # should the fact pool ever shrink to a single entry.
        topic = random.choice(self.templates["topics"])
        fact_1, fact_2 = random.sample(self.templates["facts"], 2)
        # Generate dynamic content for the placeholders.
        observation = self._generate_observation(topic)
        condition = self._generate_condition(topic)
        variables = self._generate_variables(domain)
        var1, var2, var3 = variables[:3]
        relationship = self._generate_relationship(var1, var2)
        inference = self._generate_inference(relationship)
        deduction = self._generate_deduction(var3)
        # Context supplies every placeholder used by any template, so any
        # premise/reasoning/conclusion template can be formatted from it.
        context = {
            "domain": domain,
            "topic": topic,
            "observation": observation,
            "condition": condition,
            "fact_1": fact_1,
            "fact_2": fact_2,
            "variables": ", ".join(variables),
            "var1": var1,
            "var2": var2,
            "var3": var3,
            "relationship": relationship,
            "inference": inference,
            "deduction": deduction,
            "question": self._generate_question(topic),
            "period": self._generate_period(),
            "factor": self._generate_factor(topic),
            "event": self._generate_event(),
            "consequence_1": self._generate_consequence(fact_1),
            "consequence_2": self._generate_consequence(fact_2),
            "combined_inference": self._generate_combined_inference(fact_1, fact_2),
            "extra_insight": self._generate_extra_insight(topic),
            "step_1": self._generate_step(1, domain),
            "step_2": self._generate_step(2, domain),
            "step_3": self._generate_step(3, domain),
            "step_4": self._generate_step(4, domain),
            "connection": self._generate_connection(),
            "key_insight": self._generate_key_insight(domain),
            "result": self._generate_result(domain),
            "evidence": self._generate_evidence(domain),
            "confirmation": self._generate_confirmation(domain),
            "method": self._generate_method(domain),
            "starting_point": self._generate_starting_point(domain),
            "determination": self._generate_determination(domain),
            "intermediate_conclusion": self._generate_intermediate_conclusion(domain),
            "final_step": self._generate_final_step(domain),
        }
        # Format strings with context
        premise = premise_template.format(**context)
        reasoning = reasoning_template.format(**context)
        # Generate conclusion based on the premise and reasoning
        conclusion = self._generate_conclusion(premise, reasoning)
        context["conclusion"] = conclusion
        conclusion_text = conclusion_template.format(**context)
        # Return the structured example
        return {
            "premise": premise,
            "reasoning": reasoning,
            "conclusion": conclusion_text,
            "domain": domain,
            "complexity": random.choice(["low", "medium", "high"]),
            "metadata": {
                "variables": variables,
                "topic": topic,
                "reasoning_type": domain,
            },
        }

    # Helper methods to generate dynamic content.
    # NOTE: several helpers accept a topic/domain argument they do not
    # currently use; the parameter is kept so topic-aware variants can be
    # introduced later without changing call sites.

    def _generate_observation(self, topic: str) -> str:
        """Return a random observed-pattern sentence fragment."""
        observations = [
            "the rate of change increases over time",
            "there is a strong correlation between input and output variables",
            "performance degrades under specific conditions",
            "the system exhibits unexpected behavior when stressed",
            "outliers significantly impact the overall trend",
            "recurring patterns emerge after sufficient iterations",
            "the distribution follows a power law rather than normal distribution",
            "feedback loops amplify small initial differences",
            "thresholds exist beyond which behavior changes dramatically",
            "cyclical patterns emerge with a period of varying length",
        ]
        return random.choice(observations)

    def _generate_condition(self, topic: str) -> str:
        """Return a random condition clause for a premise/reasoning slot."""
        conditions = [
            "the system is under heavy load",
            "external factors remain constant",
            "all variables are optimized simultaneously",
            "specific constraints are applied",
            "the environment changes unexpectedly",
            "resource limitations come into play",
            "feedback mechanisms are activated",
            "multiple agents interact simultaneously",
            "time delays exceed a critical threshold",
            "boundary conditions are enforced",
        ]
        return random.choice(conditions)

    def _generate_variables(self, domain: str) -> List[str]:
        """Return a shuffled list of variable names for ``domain``.

        Falls back to generic factor names for domains without a
        dedicated variable set. Callers rely on at least 3 entries
        (``var1``/``var2``/``var3``); every set here has >= 4.
        """
        variable_sets = {
            "logical deduction": [
                "premise validity",
                "logical consistency",
                "conclusion strength",
                "assumption bias",
            ],
            "causal analysis": [
                "cause magnitude",
                "effect delay",
                "confounding factors",
                "intervention efficacy",
            ],
            "statistical reasoning": [
                "sample size",
                "confidence interval",
                "p-value",
                "effect size",
                "statistical power",
            ],
            "comparative analysis": [
                "baseline performance",
                "improvement margin",
                "relative efficiency",
                "comparison fairness",
            ],
            "analogical reasoning": [
                "source similarity",
                "target applicability",
                "mapping strength",
                "inference validity",
            ],
        }
        # Get domain-specific variables or use generic ones. The dict is
        # rebuilt per call, so the in-place shuffle below is safe.
        variables = variable_sets.get(
            domain,
            [
                "factor A",
                "factor B",
                "factor C",
                "response variable",
                "control variable",
            ],
        )
        random.shuffle(variables)
        return variables

    def _generate_relationship(self, var1: str, var2: str) -> str:
        """Return a random sentence relating ``var1`` and ``var2``."""
        relationships = [
            f"an increase in {var1} leads to a proportional increase in {var2}",
            f"{var1} and {var2} have an inverse relationship",
            f"changes in {var1} precede changes in {var2} by a consistent time interval",
            f"{var1} influences {var2} only after exceeding a critical threshold",
            f"the relationship between {var1} and {var2} is non-linear and follows a power law",
            f"{var1} and {var2} are conditionally independent given certain conditions",
            f"extreme values of {var1} have a disproportionate effect on {var2}",
            f"the correlation between {var1} and {var2} changes direction over time",
        ]
        return random.choice(relationships)

    def _generate_inference(self, relationship: str) -> str:
        """Return a random inference sentence (independent of input)."""
        inferences = [
            "we should focus our optimization efforts on the most sensitive parameters",
            "the system will likely reach equilibrium after sufficient time",
            "interventions should target root causes rather than symptoms",
            "small changes can potentially lead to significant improvements",
            "we need to account for interaction effects between variables",
            "the observed behavior is likely part of a larger pattern",
            "we should implement fail-safes for extreme conditions",
            "multiple pathways may lead to the same outcome",
        ]
        return random.choice(inferences)

    def _generate_deduction(self, var: str) -> str:
        """Return a random deduction sentence mentioning ``var``."""
        deductions = [
            f"optimizing {var} alone will not solve the underlying problem",
            f"changes in {var} represent a leading indicator for system performance",
            f"the role of {var} has been previously underestimated",
            f"{var} acts as a moderating variable in this context",
            f"the impact of {var} follows a diminishing returns pattern",
            f"{var} exhibits threshold effects that must be accounted for",
            f"historical data on {var} supports this conclusion",
            f"contrary to conventional wisdom, {var} is not the limiting factor",
        ]
        return random.choice(deductions)

    def _generate_question(self, topic: str) -> str:
        """Return a random question clause for the {question} slot."""
        questions = [
            "how to optimize performance under these conditions",
            "whether the observed pattern will continue in the future",
            "which factors contribute most significantly to the outcome",
            "how to mitigate negative effects while preserving benefits",
            "what intervention would produce the most efficient solution",
            "how robust the system is to unexpected disturbances",
            "whether the findings can be generalized to other contexts",
            "how to distinguish correlation from causation in this case",
        ]
        return random.choice(questions)

    def _generate_period(self) -> str:
        """Return a random time-period phrase for the {period} slot."""
        periods = [
            "the past decade",
            "periods of economic volatility",
            "the system's entire operational history",
            "multiple successive iterations",
            "both growth and decline phases",
            "controlled experimental conditions",
            "repeated stress-test cycles",
            "varying environmental conditions",
        ]
        return random.choice(periods)

    def _generate_factor(self, topic: str) -> str:
        """Return a random correlated-factor phrase for the {factor} slot."""
        factors = [
            "resource utilization",
            "system complexity",
            "external pressure",
            "user engagement",
            "adaptation rate",
            "failure frequency",
            "communication efficiency",
            "innovation adoption",
        ]
        return random.choice(factors)

    def _generate_event(self) -> str:
        """Return a random triggering-event phrase for the {event} slot."""
        events = [
            "system initialization",
            "critical resource depletion",
            "unexpected environmental change",
            "crossing a performance threshold",
            "implementing a major upgrade",
            "integrating new components",
            "encountering novel inputs",
            "recovering from failure",
        ]
        return random.choice(events)

    def _generate_consequence(self, fact: str) -> str:
        """Return the (currently fixed) consequence sentence for a fact."""
        return "the system will adapt by adjusting its parameters accordingly"

    def _generate_combined_inference(self, fact1: str, fact2: str) -> str:
        """Return the (currently fixed) combined-inference sentence."""
        return "we can establish a clear causal relationship between the observed phenomena"

    def _generate_extra_insight(self, topic: str) -> str:
        """Return a random supporting-insight sentence fragment."""
        insights = [
            "temporal patterns reveal cyclical behavior",
            "boundary conditions significantly affect outcomes",
            "network effects amplify individual contributions",
            "emergent properties cannot be predicted from components alone",
            "system resilience depends on redundant pathways",
            "optimization often involves trade-offs between competing goals",
            "adaptation requires continuous feedback and adjustment",
            "complex systems often exhibit counterintuitive behavior",
        ]
        return random.choice(insights)

    def _generate_step(self, step_num: int, domain: str) -> str:
        """Return a random step description for position ``step_num``.

        Steps 1-3 have dedicated pools; any other value (the templates
        use 4) draws from the concluding-step pool.
        """
        if step_num == 1:
            steps = [
                "Identify the key variables and their relationships",
                "Establish the initial conditions and constraints",
                "Define the problem space and boundaries",
                "Gather relevant data and observations",
                "Frame the question in precise terms",
            ]
        elif step_num == 2:
            steps = [
                "Analyze the patterns and correlations in the data",
                "Apply appropriate analytical methods",
                "Consider alternative explanations",
                "Map the causal relationships between factors",
                "Identify potential confounding variables",
            ]
        elif step_num == 3:
            steps = [
                "Evaluate the strength of evidence for each possibility",
                "Synthesize insights from multiple perspectives",
                "Test hypotheses against available data",
                "Assess the logical consistency of arguments",
                "Consider edge cases and exceptions",
            ]
        else:
            steps = [
                "Draw conclusions based on the strongest evidence",
                "Formulate actionable recommendations",
                "Identify remaining uncertainties",
                "Propose methods to validate findings",
                "Connect conclusions to the original question",
            ]
        return random.choice(steps)

    def _generate_connection(self) -> str:
        """Return a random logical-connection phrase for {connection}."""
        connections = [
            "there's a causal relationship rather than mere correlation",
            "feedback mechanisms create self-reinforcing patterns",
            "threshold effects trigger qualitative changes in behavior",
            "multiple factors interact in non-linear ways",
            "temporal sequences reveal important dependencies",
            "structural constraints limit possible outcomes",
            "probabilistic influences accumulate deterministically",
            "conditional dependencies reveal deeper patterns",
        ]
        return random.choice(connections)

    def _generate_key_insight(self, domain: str) -> str:
        """Return a random key-insight sentence for {key_insight}."""
        insights = [
            "optimizing for average cases often fails at the extremes",
            "emergent properties cannot be reduced to component behaviors",
            "apparent contradictions point to incomplete models",
            "historical patterns constrain future possibilities",
            "local optimizations can lead to global suboptimality",
            "precision must be balanced with generalizability",
            "second-order effects often dominate in the long run",
            "robust systems prioritize adaptation over optimization",
        ]
        return random.choice(insights)

    def _generate_result(self, domain: str) -> str:
        """Return a random outcome phrase for the {result} slot."""
        results = [
            "performance improves non-linearly",
            "stability increases at the cost of responsiveness",
            "resource utilization becomes more efficient",
            "adaptability improves in novel situations",
            "resilience to disturbances increases",
            "error rates decrease systematically",
            "learning accelerates with experience",
            "coordination emerges without central control",
        ]
        return random.choice(results)

    def _generate_evidence(self, domain: str) -> str:
        """Return a random evidence-source phrase for {evidence}."""
        evidence = [
            "historical performance data",
            "controlled experimental results",
            "comparative case studies",
            "simulation outcomes under varied conditions",
            "natural experiments from system perturbations",
            "user feedback and behavioral patterns",
            "statistical analysis of large datasets",
            "theoretical models with empirical validation",
        ]
        return random.choice(evidence)

    def _generate_confirmation(self, domain: str) -> str:
        """Return a random confirmation phrase for {confirmation}."""
        confirmations = [
            "the hypothesized mechanism actually operates as expected",
            "predicted outcomes match observed results",
            "alternative explanations can be ruled out",
            "the pattern holds across different contexts",
            "edge cases follow the same principles",
            "the model successfully predicts future behavior",
            "interventions produce expected effects",
            "independent measures converge on the same conclusion",
        ]
        return random.choice(confirmations)

    def _generate_method(self, domain: str) -> str:
        """Return a random analysis-method phrase for {method}."""
        methods = [
            "systematic decomposition into components",
            "counterfactual analysis",
            "process tracing through causal chains",
            "comparative analysis of similar cases",
            "statistical inference from patterns",
            "first principles reasoning",
            "model-based simulation",
            "abductive inference to the best explanation",
        ]
        return random.choice(methods)

    def _generate_starting_point(self, domain: str) -> str:
        """Return a random starting-point phrase for {starting_point}."""
        starting_points = [
            "the fundamental constraints of the system",
            "established principles in this domain",
            "patterns observed in similar situations",
            "the minimal necessary conditions",
            "key defining relationships",
            "initial boundary conditions",
            "critical assumptions that must hold",
            "invariant properties across contexts",
        ]
        return random.choice(starting_points)

    def _generate_determination(self, domain: str) -> str:
        """Return a random determination phrase for {determination}."""
        determinations = [
            "certain factors exert disproportionate influence",
            "system behavior follows predictable patterns under specific conditions",
            "apparent anomalies actually confirm deeper principles",
            "constraints channel possible outcomes in specific directions",
            "dynamic equilibria emerge from competing forces",
            "feedback loops stabilize or amplify depending on parameters",
            "path dependencies limit future possibilities",
            "critical thresholds separate qualitatively different regimes",
        ]
        return random.choice(determinations)

    def _generate_intermediate_conclusion(self, domain: str) -> str:
        """Return a random phrase for {intermediate_conclusion}."""
        conclusions = [
            "we need to reconsider fundamental assumptions",
            "the system exhibits emergent properties not predictable from components",
            "apparent contradictions resolve at a higher level of analysis",
            "complex interactions require a more nuanced approach",
            "optimal solutions balance multiple competing objectives",
            "robustness comes at the cost of peak performance",
            "adaptability requires maintaining strategic flexibility",
            "precision must be traded off against generalizability",
        ]
        return random.choice(conclusions)

    def _generate_final_step(self, domain: str) -> str:
        """Return a random closing-step phrase for {final_step}."""
        final_steps = [
            "we can formulate a general principle that applies broadly",
            "we can predict system behavior under novel conditions",
            "we can design interventions that leverage key mechanisms",
            "we can identify early warning signals for critical transitions",
            "we can optimize for robust performance across scenarios",
            "we can balance competing objectives through targeted trade-offs",
            "we can establish boundaries of applicability for our conclusions",
            "we can translate insights into actionable recommendations",
        ]
        return random.choice(final_steps)

    def _generate_conclusion(self, premise: str, reasoning: str) -> str:
        """Generate a coherent conclusion based on premise and reasoning.

        NOTE(review): the premise/reasoning arguments are not yet used —
        the conclusion is drawn from a fixed pool; parameters are kept
        for future content-aware generation.
        """
        conclusions = [
            "the observed patterns indicate a fundamental relationship between key variables",
            "we should prioritize interventions that address root causes rather than symptoms",
            "the system's behavior can be predicted with reasonable accuracy under specified conditions",
            "optimizing for extreme cases provides more robust performance than optimizing for average cases",
            "adaptation mechanisms are essential for maintaining performance in changing environments",
            "complex interactions between components create emergent properties at the system level",
            "resource allocation should follow a dynamic rather than static strategy",
            "feedback loops must be carefully managed to prevent unintended consequences",
        ]
        return random.choice(conclusions)
# Register the generator
# Import-time side effect: exposes this generator under the "reasoning"
# key in the project-wide generator registry.
SyntheticDataGeneratorRegistry.register("reasoning", ReasoningDataGenerator)