| """ |
| Reasoning Module Synthetic Data Generator for MangoMAS Local |
| |
| This module generates synthetic training data for the reasoning capability. |
| """ |
|
|
| import json |
| import logging |
| import random |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| from ..synthetic_data_generator import (SyntheticDataGenerator, |
| SyntheticDataGeneratorRegistry) |
|
|
# Module-level logger; used to report custom template files that fail to load.
logger = logging.getLogger(__name__)
|
|
| |
# Reasoning styles an example can be labelled with. ReasoningDataGenerator
# exposes this list as its "domains" template pool and draws one entry per
# generated example.
REASONING_DOMAINS = [
    "logical deduction",
    "causal analysis",
    "statistical reasoning",
    "comparative analysis",
    "analogical reasoning",
    "counterfactual reasoning",
    "inductive reasoning",
    "abductive reasoning",
    "conditional reasoning",
    "syllogistic reasoning",
]
|
|
# Sentence skeletons for the "premise" part of an example. The {placeholders}
# are filled with str.format() from the context dict that
# ReasoningDataGenerator.generate_example builds, so every placeholder name
# used here must be a key of that dict.
PREMISE_TEMPLATES = [
    "In a study of {topic}, researchers observed that {observation}.",
    "The data shows that {observation} when {condition}.",
    "Historical records indicate that {observation} throughout {period}.",
    "Given that {fact_1} and {fact_2}, we need to determine {question}.",
    "In the context of {domain}, we observe that {observation}.",
    "When analyzing {topic}, we can see that {observation} correlates with {factor}.",
    "System logs reveal that {observation} occurs whenever {condition}.",
    "The problem states that {observation} under conditions where {condition}.",
    "In the experiment, {observation} was measured when {condition}.",
    "The scenario presents a situation where {observation} after {event}.",
]
|
|
# Multi-sentence skeletons for the "reasoning" part of an example. Each entry
# is a single string (adjacent literals are concatenated by the parser); the
# {placeholders} are filled with str.format() from the context dict built in
# ReasoningDataGenerator.generate_example.
REASONING_TEMPLATES = [
    (
        "First, we need to identify the key variables: {variables}. "
        "Looking at the relationship between {var1} and {var2}, "
        "we can see that {relationship}. "
        "This suggests that {inference}. "
        "Furthermore, considering {var3}, we can deduce that {deduction}."
    ),
    (
        "Let's analyze this step by step. "
        "If {fact_1}, then {consequence_1}. "
        "Given that {fact_2}, we can infer {consequence_2}. "
        "Combining these insights, {combined_inference}. "
        "Additionally, {extra_insight} further supports this reasoning."
    ),
    (
        "Breaking this down systematically: "
        "1) {step_1}, 2) {step_2}, 3) {step_3}, 4) {step_4}. "
        "The logical connection between steps 2 and 3 shows that {connection}, "
        "which leads us to {inference}."
    ),
    (
        "The key insight here is that {key_insight}. "
        "This means that when {condition}, the result is {result}. "
        "We can verify this by examining {evidence}, which confirms {confirmation}."
    ),
    (
        "To solve this, I'll use {method}. "
        "Starting with {starting_point}, I can determine that {determination}. "
        "This leads to {intermediate_conclusion}, and finally to {final_step}."
    ),
]
|
|
# Sentence skeletons for the closing statement of an example. The
# {conclusion} placeholder is filled with str.format() in
# ReasoningDataGenerator.generate_example.
CONCLUSION_TEMPLATES = [
    "Therefore, we can conclude that {conclusion}.",
    "Based on this analysis, the answer is {conclusion}.",
    "The evidence strongly suggests that {conclusion}.",
    "This reasoning leads to the conclusion that {conclusion}.",
    "We can definitively state that {conclusion} based on the above analysis.",
]
|
|
| |
# Subject areas substituted for the {topic} placeholder; exposed by
# ReasoningDataGenerator as its "topics" template pool.
TOPICS = [
    "climate patterns",
    "market trends",
    "neural network performance",
    "population demographics",
    "traffic flow optimization",
    "disease spread",
    "economic indicators",
    "algorithm efficiency",
    "material properties",
    "social network dynamics",
    "cognitive biases",
    "language acquisition",
    "genetic inheritance",
    "planetary motion",
    "quantum phenomena",
]
|
|
# Stand-alone factual statements substituted for the {fact_1} / {fact_2}
# placeholders; exposed by ReasoningDataGenerator as its "facts" template
# pool. generate_example draws two different entries per example.
FACTS = [
    "energy consumption increases with temperature",
    "user engagement drops after 3 minutes",
    "performance plateaus with more than 8 layers",
    "error rates increase exponentially with load",
    "response time correlates with system memory",
    "conversion rates are highest on Tuesdays",
    "signal strength decreases with distance squared",
    "learning rate affects convergence time",
    "failure probability doubles every 5 years",
    "growth rate is proportional to nutrient concentration",
]
|
|
|
|
class ReasoningDataGenerator(SyntheticDataGenerator):
    """Generator for synthetic reasoning data.

    Each example is a premise / reasoning / conclusion triple assembled by
    filling randomly chosen string templates with randomly chosen phrases.
    All randomness comes from the global ``random`` module state.
    """

    def _load_templates(self) -> Dict[str, List[str]]:
        """Build the template pools used by :meth:`generate_example`.

        The pools start as copies of the module-level defaults. If
        ``self.config["template_path"]`` names an existing JSON file, every
        top-level key that matches a known pool and holds a list is appended
        to that pool. Loading is best effort: a file that cannot be read or
        parsed is logged as a warning and the defaults are returned.

        Returns:
            Mapping of pool name (``"domains"``, ``"premises"``,
            ``"reasoning"``, ``"conclusions"``, ``"topics"``, ``"facts"``)
            to a list of entries.
        """
        # Copy every default list: extending the module-level constants in
        # place would leak custom entries into every other generator instance
        # and duplicate them each time the same file is loaded again.
        templates: Dict[str, List[str]] = {
            "domains": list(REASONING_DOMAINS),
            "premises": list(PREMISE_TEMPLATES),
            "reasoning": list(REASONING_TEMPLATES),
            "conclusions": list(CONCLUSION_TEMPLATES),
            "topics": list(TOPICS),
            "facts": list(FACTS),
        }

        template_path = self.config.get("template_path")
        if not template_path or not Path(template_path).exists():
            return templates

        try:
            with open(template_path, "r", encoding="utf-8") as f:
                custom_templates = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Failed to load custom templates: %s", e)
            return templates

        if not isinstance(custom_templates, dict):
            logger.warning(
                "Failed to load custom templates: expected a JSON object, got %s",
                type(custom_templates).__name__,
            )
            return templates

        # Unknown keys and non-list values are ignored silently.
        for key, values in custom_templates.items():
            if key in templates and isinstance(values, list):
                templates[key].extend(values)

        return templates

    def generate_example(self) -> Dict[str, Any]:
        """Generate a single reasoning example.

        Reads the pools from ``self.templates`` (set outside this class;
        presumably the result of :meth:`_load_templates` - confirm against the
        base class).

        Returns:
            A dict with the formatted ``premise``, ``reasoning`` and
            ``conclusion`` strings, the chosen ``domain``, a randomly chosen
            ``complexity`` label (``"low"``, ``"medium"`` or ``"high"``) and
            a ``metadata`` dict holding ``variables``, ``topic`` and
            ``reasoning_type``.
        """
        pools = self.templates

        # Pick the skeleton sentences.
        domain = random.choice(pools["domains"])
        premise_template = random.choice(pools["premises"])
        reasoning_template = random.choice(pools["reasoning"])
        conclusion_template = random.choice(pools["conclusions"])

        # Pick the topic and two different facts. Choosing the second fact
        # from the remaining entries cannot spin forever when the pool has
        # fewer than two distinct values (the first fact is then reused).
        topic = random.choice(pools["topics"])
        facts = pools["facts"]
        fact_1 = random.choice(facts)
        other_facts = [fact for fact in facts if fact != fact_1]
        fact_2 = random.choice(other_facts) if other_facts else fact_1

        # Values that depend on each other are generated up front.
        observation = self._generate_observation(topic)
        condition = self._generate_condition(topic)
        variables = self._generate_variables(domain)
        var1, var2, var3 = variables[:3]
        relationship = self._generate_relationship(var1, var2)
        inference = self._generate_inference(relationship)
        deduction = self._generate_deduction(var3)

        # Every placeholder any built-in template may reference.
        context = {
            "domain": domain,
            "topic": topic,
            "observation": observation,
            "condition": condition,
            "fact_1": fact_1,
            "fact_2": fact_2,
            "variables": ", ".join(variables),
            "var1": var1,
            "var2": var2,
            "var3": var3,
            "relationship": relationship,
            "inference": inference,
            "deduction": deduction,
            "question": self._generate_question(topic),
            "period": self._generate_period(),
            "factor": self._generate_factor(topic),
            "event": self._generate_event(),
            "consequence_1": self._generate_consequence(fact_1),
            "consequence_2": self._generate_consequence(fact_2),
            "combined_inference": self._generate_combined_inference(fact_1, fact_2),
            "extra_insight": self._generate_extra_insight(topic),
            "step_1": self._generate_step(1, domain),
            "step_2": self._generate_step(2, domain),
            "step_3": self._generate_step(3, domain),
            "step_4": self._generate_step(4, domain),
            "connection": self._generate_connection(),
            "key_insight": self._generate_key_insight(domain),
            "result": self._generate_result(domain),
            "evidence": self._generate_evidence(domain),
            "confirmation": self._generate_confirmation(domain),
            "method": self._generate_method(domain),
            "starting_point": self._generate_starting_point(domain),
            "determination": self._generate_determination(domain),
            "intermediate_conclusion": self._generate_intermediate_conclusion(domain),
            "final_step": self._generate_final_step(domain),
        }

        premise = premise_template.format(**context)
        reasoning = reasoning_template.format(**context)

        # The conclusion is produced after the premise and reasoning so the
        # helper can receive both, then added to the context for formatting.
        conclusion = self._generate_conclusion(premise, reasoning)
        context["conclusion"] = conclusion
        conclusion_text = conclusion_template.format(**context)

        return {
            "premise": premise,
            "reasoning": reasoning,
            "conclusion": conclusion_text,
            "domain": domain,
            "complexity": random.choice(["low", "medium", "high"]),
            "metadata": {
                "variables": variables,
                "topic": topic,
                "reasoning_type": domain,
            },
        }

    # ------------------------------------------------------------------
    # Phrase helpers: each returns one phrase for a template placeholder.
    # ------------------------------------------------------------------

    def _generate_observation(self, topic: str) -> str:
        """Return a random observation phrase (``topic`` is currently unused)."""
        observations = [
            "the rate of change increases over time",
            "there is a strong correlation between input and output variables",
            "performance degrades under specific conditions",
            "the system exhibits unexpected behavior when stressed",
            "outliers significantly impact the overall trend",
            "recurring patterns emerge after sufficient iterations",
            "the distribution follows a power law rather than normal distribution",
            "feedback loops amplify small initial differences",
            "thresholds exist beyond which behavior changes dramatically",
            "cyclical patterns emerge with a period of varying length",
        ]
        return random.choice(observations)

    def _generate_condition(self, topic: str) -> str:
        """Return a random condition phrase (``topic`` is currently unused)."""
        conditions = [
            "the system is under heavy load",
            "external factors remain constant",
            "all variables are optimized simultaneously",
            "specific constraints are applied",
            "the environment changes unexpectedly",
            "resource limitations come into play",
            "feedback mechanisms are activated",
            "multiple agents interact simultaneously",
            "time delays exceed a critical threshold",
            "boundary conditions are enforced",
        ]
        return random.choice(conditions)

    def _generate_variables(self, domain: str) -> List[str]:
        """Return the variable names for ``domain`` in random order.

        Domains without a dedicated set fall back to a generic five-name
        list. Every set holds at least four names, so callers may unpack the
        first three.
        """
        variable_sets = {
            "logical deduction": [
                "premise validity",
                "logical consistency",
                "conclusion strength",
                "assumption bias",
            ],
            "causal analysis": [
                "cause magnitude",
                "effect delay",
                "confounding factors",
                "intervention efficacy",
            ],
            "statistical reasoning": [
                "sample size",
                "confidence interval",
                "p-value",
                "effect size",
                "statistical power",
            ],
            "comparative analysis": [
                "baseline performance",
                "improvement margin",
                "relative efficiency",
                "comparison fairness",
            ],
            "analogical reasoning": [
                "source similarity",
                "target applicability",
                "mapping strength",
                "inference validity",
            ],
        }

        variables = variable_sets.get(
            domain,
            [
                "factor A",
                "factor B",
                "factor C",
                "response variable",
                "control variable",
            ],
        )

        # The lists above are rebuilt on every call, so shuffling in place
        # does not affect later calls.
        random.shuffle(variables)
        return variables

    def _generate_relationship(self, var1: str, var2: str) -> str:
        """Return a random phrase relating ``var1`` to ``var2``."""
        relationships = [
            f"an increase in {var1} leads to a proportional increase in {var2}",
            f"{var1} and {var2} have an inverse relationship",
            f"changes in {var1} precede changes in {var2} by a consistent time interval",
            f"{var1} influences {var2} only after exceeding a critical threshold",
            f"the relationship between {var1} and {var2} is non-linear and follows a power law",
            f"{var1} and {var2} are conditionally independent given certain conditions",
            f"extreme values of {var1} have a disproportionate effect on {var2}",
            f"the correlation between {var1} and {var2} changes direction over time",
        ]
        return random.choice(relationships)

    def _generate_inference(self, relationship: str) -> str:
        """Return a random inference phrase (``relationship`` is currently unused)."""
        inferences = [
            "we should focus our optimization efforts on the most sensitive parameters",
            "the system will likely reach equilibrium after sufficient time",
            "interventions should target root causes rather than symptoms",
            "small changes can potentially lead to significant improvements",
            "we need to account for interaction effects between variables",
            "the observed behavior is likely part of a larger pattern",
            "we should implement fail-safes for extreme conditions",
            "multiple pathways may lead to the same outcome",
        ]
        return random.choice(inferences)

    def _generate_deduction(self, var: str) -> str:
        """Return a random deduction phrase that mentions ``var``."""
        deductions = [
            f"optimizing {var} alone will not solve the underlying problem",
            f"changes in {var} represent a leading indicator for system performance",
            f"the role of {var} has been previously underestimated",
            f"{var} acts as a moderating variable in this context",
            f"the impact of {var} follows a diminishing returns pattern",
            f"{var} exhibits threshold effects that must be accounted for",
            f"historical data on {var} supports this conclusion",
            f"contrary to conventional wisdom, {var} is not the limiting factor",
        ]
        return random.choice(deductions)

    def _generate_question(self, topic: str) -> str:
        """Return a random question phrase (``topic`` is currently unused)."""
        questions = [
            "how to optimize performance under these conditions",
            "whether the observed pattern will continue in the future",
            "which factors contribute most significantly to the outcome",
            "how to mitigate negative effects while preserving benefits",
            "what intervention would produce the most efficient solution",
            "how robust the system is to unexpected disturbances",
            "whether the findings can be generalized to other contexts",
            "how to distinguish correlation from causation in this case",
        ]
        return random.choice(questions)

    def _generate_period(self) -> str:
        """Return a random time-period phrase."""
        periods = [
            "the past decade",
            "periods of economic volatility",
            "the system's entire operational history",
            "multiple successive iterations",
            "both growth and decline phases",
            "controlled experimental conditions",
            "repeated stress-test cycles",
            "varying environmental conditions",
        ]
        return random.choice(periods)

    def _generate_factor(self, topic: str) -> str:
        """Return a random correlating-factor phrase (``topic`` is currently unused)."""
        factors = [
            "resource utilization",
            "system complexity",
            "external pressure",
            "user engagement",
            "adaptation rate",
            "failure frequency",
            "communication efficiency",
            "innovation adoption",
        ]
        return random.choice(factors)

    def _generate_event(self) -> str:
        """Return a random triggering-event phrase."""
        events = [
            "system initialization",
            "critical resource depletion",
            "unexpected environmental change",
            "crossing a performance threshold",
            "implementing a major upgrade",
            "integrating new components",
            "encountering novel inputs",
            "recovering from failure",
        ]
        return random.choice(events)

    def _generate_consequence(self, fact: str) -> str:
        """Return the consequence phrase; fixed text, ``fact`` is currently unused."""
        return "the system will adapt by adjusting its parameters accordingly"

    def _generate_combined_inference(self, fact1: str, fact2: str) -> str:
        """Return the combined-inference phrase; fixed text, both facts are unused."""
        return "we can establish a clear causal relationship between the observed phenomena"

    def _generate_extra_insight(self, topic: str) -> str:
        """Return a random supporting-insight phrase (``topic`` is currently unused)."""
        insights = [
            "temporal patterns reveal cyclical behavior",
            "boundary conditions significantly affect outcomes",
            "network effects amplify individual contributions",
            "emergent properties cannot be predicted from components alone",
            "system resilience depends on redundant pathways",
            "optimization often involves trade-offs between competing goals",
            "adaptation requires continuous feedback and adjustment",
            "complex systems often exhibit counterintuitive behavior",
        ]
        return random.choice(insights)

    def _generate_step(self, step_num: int, domain: str) -> str:
        """Return a random phrase for step ``step_num`` of a four-step analysis.

        Steps 1-3 have their own phrase lists; any other ``step_num`` uses
        the final-step list. ``domain`` is currently unused.
        """
        if step_num == 1:
            steps = [
                "Identify the key variables and their relationships",
                "Establish the initial conditions and constraints",
                "Define the problem space and boundaries",
                "Gather relevant data and observations",
                "Frame the question in precise terms",
            ]
        elif step_num == 2:
            steps = [
                "Analyze the patterns and correlations in the data",
                "Apply appropriate analytical methods",
                "Consider alternative explanations",
                "Map the causal relationships between factors",
                "Identify potential confounding variables",
            ]
        elif step_num == 3:
            steps = [
                "Evaluate the strength of evidence for each possibility",
                "Synthesize insights from multiple perspectives",
                "Test hypotheses against available data",
                "Assess the logical consistency of arguments",
                "Consider edge cases and exceptions",
            ]
        else:
            steps = [
                "Draw conclusions based on the strongest evidence",
                "Formulate actionable recommendations",
                "Identify remaining uncertainties",
                "Propose methods to validate findings",
                "Connect conclusions to the original question",
            ]
        return random.choice(steps)

    def _generate_connection(self) -> str:
        """Return a random phrase describing the link between two steps."""
        connections = [
            "there's a causal relationship rather than mere correlation",
            "feedback mechanisms create self-reinforcing patterns",
            "threshold effects trigger qualitative changes in behavior",
            "multiple factors interact in non-linear ways",
            "temporal sequences reveal important dependencies",
            "structural constraints limit possible outcomes",
            "probabilistic influences accumulate deterministically",
            "conditional dependencies reveal deeper patterns",
        ]
        return random.choice(connections)

    def _generate_key_insight(self, domain: str) -> str:
        """Return a random key-insight phrase (``domain`` is currently unused)."""
        insights = [
            "optimizing for average cases often fails at the extremes",
            "emergent properties cannot be reduced to component behaviors",
            "apparent contradictions point to incomplete models",
            "historical patterns constrain future possibilities",
            "local optimizations can lead to global suboptimality",
            "precision must be balanced with generalizability",
            "second-order effects often dominate in the long run",
            "robust systems prioritize adaptation over optimization",
        ]
        return random.choice(insights)

    def _generate_result(self, domain: str) -> str:
        """Return a random result phrase (``domain`` is currently unused)."""
        results = [
            "performance improves non-linearly",
            "stability increases at the cost of responsiveness",
            "resource utilization becomes more efficient",
            "adaptability improves in novel situations",
            "resilience to disturbances increases",
            "error rates decrease systematically",
            "learning accelerates with experience",
            "coordination emerges without central control",
        ]
        return random.choice(results)

    def _generate_evidence(self, domain: str) -> str:
        """Return a random evidence-source phrase (``domain`` is currently unused)."""
        evidence = [
            "historical performance data",
            "controlled experimental results",
            "comparative case studies",
            "simulation outcomes under varied conditions",
            "natural experiments from system perturbations",
            "user feedback and behavioral patterns",
            "statistical analysis of large datasets",
            "theoretical models with empirical validation",
        ]
        return random.choice(evidence)

    def _generate_confirmation(self, domain: str) -> str:
        """Return a random confirmation phrase (``domain`` is currently unused)."""
        confirmations = [
            "the hypothesized mechanism actually operates as expected",
            "predicted outcomes match observed results",
            "alternative explanations can be ruled out",
            "the pattern holds across different contexts",
            "edge cases follow the same principles",
            "the model successfully predicts future behavior",
            "interventions produce expected effects",
            "independent measures converge on the same conclusion",
        ]
        return random.choice(confirmations)

    def _generate_method(self, domain: str) -> str:
        """Return a random analysis-method phrase (``domain`` is currently unused)."""
        methods = [
            "systematic decomposition into components",
            "counterfactual analysis",
            "process tracing through causal chains",
            "comparative analysis of similar cases",
            "statistical inference from patterns",
            "first principles reasoning",
            "model-based simulation",
            "abductive inference to the best explanation",
        ]
        return random.choice(methods)

    def _generate_starting_point(self, domain: str) -> str:
        """Return a random starting-point phrase (``domain`` is currently unused)."""
        starting_points = [
            "the fundamental constraints of the system",
            "established principles in this domain",
            "patterns observed in similar situations",
            "the minimal necessary conditions",
            "key defining relationships",
            "initial boundary conditions",
            "critical assumptions that must hold",
            "invariant properties across contexts",
        ]
        return random.choice(starting_points)

    def _generate_determination(self, domain: str) -> str:
        """Return a random determination phrase (``domain`` is currently unused)."""
        determinations = [
            "certain factors exert disproportionate influence",
            "system behavior follows predictable patterns under specific conditions",
            "apparent anomalies actually confirm deeper principles",
            "constraints channel possible outcomes in specific directions",
            "dynamic equilibria emerge from competing forces",
            "feedback loops stabilize or amplify depending on parameters",
            "path dependencies limit future possibilities",
            "critical thresholds separate qualitatively different regimes",
        ]
        return random.choice(determinations)

    def _generate_intermediate_conclusion(self, domain: str) -> str:
        """Return a random intermediate-conclusion phrase (``domain`` is currently unused)."""
        conclusions = [
            "we need to reconsider fundamental assumptions",
            "the system exhibits emergent properties not predictable from components",
            "apparent contradictions resolve at a higher level of analysis",
            "complex interactions require a more nuanced approach",
            "optimal solutions balance multiple competing objectives",
            "robustness comes at the cost of peak performance",
            "adaptability requires maintaining strategic flexibility",
            "precision must be traded off against generalizability",
        ]
        return random.choice(conclusions)

    def _generate_final_step(self, domain: str) -> str:
        """Return a random final-step phrase (``domain`` is currently unused)."""
        final_steps = [
            "we can formulate a general principle that applies broadly",
            "we can predict system behavior under novel conditions",
            "we can design interventions that leverage key mechanisms",
            "we can identify early warning signals for critical transitions",
            "we can optimize for robust performance across scenarios",
            "we can balance competing objectives through targeted trade-offs",
            "we can establish boundaries of applicability for our conclusions",
            "we can translate insights into actionable recommendations",
        ]
        return random.choice(final_steps)

    def _generate_conclusion(self, premise: str, reasoning: str) -> str:
        """Return a random conclusion phrase.

        ``premise`` and ``reasoning`` are accepted but not inspected; the
        phrase is drawn independently of both.
        """
        conclusions = [
            "the observed patterns indicate a fundamental relationship between key variables",
            "we should prioritize interventions that address root causes rather than symptoms",
            "the system's behavior can be predicted with reasonable accuracy under specified conditions",
            "optimizing for extreme cases provides more robust performance than optimizing for average cases",
            "adaptation mechanisms are essential for maintaining performance in changing environments",
            "complex interactions between components create emergent properties at the system level",
            "resource allocation should follow a dynamic rather than static strategy",
            "feedback loops must be carefully managed to prevent unintended consequences",
        ]
        return random.choice(conclusions)
|
|
|
|
| |
# Import-time side effect: hand the generator class to the registry under the
# name "reasoning" (presumably so it can be looked up by that key - confirm
# against SyntheticDataGeneratorRegistry).
SyntheticDataGeneratorRegistry.register("reasoning", ReasoningDataGenerator)
|
|