# Action generation with LLM-based quality assurance for TinyTroupe agents.
| import json | |
| import statistics # Add this import | |
| import tinytroupe.utils as utils | |
| from tinytroupe.control import transactional, current_simulation | |
| import tinytroupe.openai_utils as openai_utils | |
| from tinytroupe.validation import propositions | |
| from tinytroupe.utils import JsonSerializableRegistry | |
| from tinytroupe.experimentation import Proposition | |
class ActionGenerator(JsonSerializableRegistry):
    """
    Generates candidate actions for agents and, optionally, vets each candidate
    through a set of LLM-based quality checks (persona adherence, self-consistency,
    fluency, suitability) plus a lexical similarity check. Candidates that fail the
    checks can be regenerated by the agent or corrected directly, up to
    `max_attempts` times per strategy.
    """

    def __init__(self, max_attempts=2,
                 enable_quality_checks=True,
                 enable_regeneration=True,
                 enable_direct_correction=False,  # TODO enable_direct_correction not working very well yet
                 enable_quality_check_for_persona_adherence=True,
                 enable_quality_check_for_selfconsistency=True,
                 enable_quality_check_for_fluency=True,
                 enable_quality_check_for_suitability=False,
                 enable_quality_check_for_similarity=False,
                 continue_on_failure=True,
                 quality_threshold=7,
                 max_action_similarity=0.6,
                 enable_reasoning_step=False):  # TODO enable_reasoning_step not working very well yet
        """
        Initializes the ActionGenerator.

        Args:
            max_attempts (int): The maximum number of attempts to generate an action.
            enable_quality_checks (bool): Whether to perform quality checks on the generated action. If False, the first action generated
                is returned without any checks.
            enable_regeneration (bool): Whether to try to make the agent regenerate the action if the first attempt fails.
            enable_direct_correction (bool): Whether to directly correct the action if the first attempt fails, without asking the agent to regenerate it.
            enable_quality_check_for_persona_adherence (bool): Whether to check the action for persona adherence.
            enable_quality_check_for_selfconsistency (bool): Whether to check the action for self-consistency.
            enable_quality_check_for_fluency (bool): Whether to check the action for fluency.
            enable_quality_check_for_suitability (bool): Whether to check the action for suitability.
            enable_quality_check_for_similarity (bool): Whether to check the action for excessive similarity with the agent's previous action.
            continue_on_failure (bool): Whether to return the best-scoring tentative action, even if it fails to pass quality checks.
                If False, a PoorQualityActionException is raised when all attempts fail.
            quality_threshold (int): The minimum score for each quality check for the action to be considered good quality.
            max_action_similarity (float): Similarity threshold in [0.0, 1.0]; a tentative action whose similarity to the previous
                action is at or above this value fails the similarity check.
            enable_reasoning_step (bool): Whether to enable reasoning step in the action generation process. This IS NOT the use of "reasoning models" (e.g., o1, o3),
                but rather the use of an additional reasoning step in the regular text completion.
        """
        self.max_attempts = max_attempts

        # counters for retry attempts, aggregated later by get_statistics()
        self.regeneration_attempts = 0
        self.direct_correction_attempts = 0

        self.enable_quality_checks = enable_quality_checks
        self.enable_regeneration = enable_regeneration
        self.enable_direct_correction = enable_direct_correction

        # per-check enable flags
        self.enable_quality_check_for_persona_adherence = enable_quality_check_for_persona_adherence
        self.enable_quality_check_for_selfconsistency = enable_quality_check_for_selfconsistency
        self.enable_quality_check_for_fluency = enable_quality_check_for_fluency
        self.enable_quality_check_for_suitability = enable_quality_check_for_suitability
        self.enable_quality_check_for_similarity = enable_quality_check_for_similarity

        self.continue_on_failure = continue_on_failure
        self.quality_threshold = quality_threshold
        self.max_action_similarity = max_action_similarity
        self.enable_reasoning_step = enable_reasoning_step

        # This generator has its own copies of the propositions, in order to be able to isolate them
        # from other agents, particularly when running the simulation in parallel.
        self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
        self.action_self_consistency = propositions.action_self_consistency.copy()
        self.action_fluency = propositions.action_fluency.copy()
        self.action_suitability = propositions.action_suitability.copy()

        # initialize statistics (reported by get_statistics())
        self.regeneration_failures = 0
        self.direct_correction_failures = 0
        self.regeneration_scores = []
        self.direct_correction_scores = []
        self.total_actions_produced = 0
        self.total_original_actions_succeeded = 0
    def generate_next_action(self, agent, current_messages:list):
        """
        Generates the next action for `agent` given the conversation in `current_messages`.

        If quality checks are enabled, the first candidate is scored; on failure the
        method retries by asking the agent to regenerate (when enable_regeneration)
        and then by direct correction (when enable_direct_correction), returning the
        first candidate that passes all checks. If every attempt fails, either the
        best-scoring candidate is returned (continue_on_failure=True) or
        PoorQualityActionException is raised.

        Returns:
            tuple: (action, role, content, negative_feedbacks) where `action` is the action
            dict, `role`/`content` are the raw LLM message parts, and `negative_feedbacks`
            lists the quality feedback of every failed attempt (empty when checks are disabled).
        """
        from tinytroupe.agent import logger  # import here to avoid circular import issues

        # clean up (remove unnecessary elements) and copy the list of current messages to avoid modifying the original ones
        current_messages = [
            {"role": msg["role"], "content": json.dumps(msg["content"])}
            for msg in current_messages
        ]

        # starts with no feedback
        cur_feedback = None
        all_negative_feedbacks = []

        # best-scoring candidate seen so far, across all attempts
        best_action = None
        best_role = None
        best_content = None
        best_score = float('-inf')
        original_score = None

        def update_best(tentative_action, role, content, total_score):
            # Remember the highest-scoring candidate so it can be returned if every attempt fails.
            nonlocal best_action, best_role, best_content, best_score
            if total_score > best_score:
                best_action = tentative_action
                best_role = role
                best_content = content
                best_score = total_score

        def finish_return(tentative_action, role, content, final_score):
            # Common exit path: log improvement over the first attempt and normalize result types.
            if original_score is not None and final_score > original_score:
                logger.warning(f"[{agent.name}] improved total quality from {original_score} to {final_score}")

            # ensure that tentative_action and content are dicts
            if isinstance(tentative_action, str):
                tentative_action = json.loads(tentative_action)
            if isinstance(content, str):
                content = json.loads(content)

            return tentative_action, role, content, all_negative_feedbacks

        # First attempt to generate an action
        tentative_action, role, content = self._generate_tentative_action(agent, current_messages,
                                                                          feedback_from_previous_attempt=cur_feedback,
                                                                          previous_tentative_action=None,
                                                                          previous_llm_role=None, previous_llm_content=None)

        if self.enable_quality_checks:
            # First quality check
            good_quality, total_score, cur_feedback = self._check_action_quality("Original Action", agent, tentative_action=tentative_action)
            update_best(tentative_action, role, content, total_score)
            if original_score is None:
                original_score = total_score

            if good_quality:
                self.total_original_actions_succeeded += 1
                # Found a good action, let's return it now
                return finish_return(tentative_action, role, content, total_score)
            else:
                logger.warning(f"[{agent.name}] Original action did not pass quality checks: {cur_feedback}")
                all_negative_feedbacks.append(cur_feedback)

            # GENERATE AND REGENERATE the action by the agent
            #
            # We first try to make the agent generate (via the current_messages passed) or regenerate the
            # action based on feedback.
            if self.enable_regeneration:
                for attempt in range(self.max_attempts):
                    # Generate tentative action, feeding back the failure report from the previous attempt.
                    tentative_action, role, content = self._generate_tentative_action(agent, current_messages,
                                                                                      feedback_from_previous_attempt=cur_feedback,
                                                                                      previous_tentative_action=tentative_action,
                                                                                      previous_llm_role=role, previous_llm_content=content)
                    logger.debug(f"[{agent.name}] Tentative action: {tentative_action}")
                    self.regeneration_attempts += 1

                    good_quality, total_score, cur_feedback = self._check_action_quality(f"Action Regeneration ({attempt})", agent, tentative_action=tentative_action)
                    update_best(tentative_action, role, content, total_score)

                    if good_quality:
                        # Found a good action, let's return it now
                        return finish_return(tentative_action, role, content, total_score)
                    else:
                        self.regeneration_failures += 1
                        self.regeneration_scores.append(total_score)  # total score of the failed regeneration attempt
                        all_negative_feedbacks.append(cur_feedback)

            # CORRECT OR REPHRASE the action directly
            #
            # If we got here, it means the agent was not able to directly generate an action
            # of sufficient quality, so we'll try to rephrase it correctly directly now.
            if self.enable_direct_correction:
                for attempt in range(self.max_attempts):
                    tentative_action, role, content = self._correct_action(tentative_action, feedback=cur_feedback, llm_role=role, llm_content=content)
                    logger.warning(f"[{agent.name}] Rephrased the action directly as: {tentative_action}")
                    self.direct_correction_attempts += 1

                    good_quality, total_score, cur_feedback = self._check_action_quality(f"Direct Action Correction or Rephrasing ({attempt})", agent, tentative_action=tentative_action)
                    update_best(tentative_action, role, content, total_score)

                    if good_quality:
                        # Found a good action, let's return it now
                        return finish_return(tentative_action, role, content, total_score)
                    else:
                        self.direct_correction_failures += 1
                        self.direct_correction_scores.append(total_score)  # total score of the failed correction attempt
                        all_negative_feedbacks.append(cur_feedback)

            # If we got here, all attempts to generate a good action failed
            if self.continue_on_failure:
                logger.warning(f"[{agent.name}] All attempts to generate a good action failed. Returning the best one.")
                return finish_return(best_action, best_role, best_content, best_score)
            else:
                raise PoorQualityActionException()

        else:
            # If we got here, it means that the action was generated without quality checks
            # and we are not doing any regeneration or direct correction, so we can return it now.
            return tentative_action, role, content, []
| def _generate_tentative_action(self, agent, current_messages, feedback_from_previous_attempt=None, | |
| previous_tentative_action=None, | |
| previous_llm_role=None, previous_llm_content=None): | |
| from tinytroupe.agent import logger, CognitiveActionModel, CognitiveActionModelWithReasoning # import here to avoid circular import issues | |
| self.total_actions_produced += 1 | |
| # shallow clone current_messages | |
| current_messages_context = current_messages.copy() | |
| logger.debug(f"[{agent.name}] Sending messages to OpenAI API") | |
| logger.debug(f"[{agent.name}] Last interaction: {current_messages[-1]}") | |
| if feedback_from_previous_attempt: | |
| #current_messages_copy.append({"role": previous_llm_role, | |
| # "content": "TENTATIVE ACTION:" + json.dumps(previous_llm_content)}) | |
| current_messages_context.append({"role": "user", | |
| "content": \ | |
| f""" | |
| WARNING! TENTATIVE ACTION GENERATION FAILED IN QUALITY CHECKS! | |
| You were about to produce the following action, as a sequence for the previous actions or feedbacks (if any): | |
| ``` | |
| {previous_tentative_action} | |
| ``` | |
| However, it failed to pass the quality checks (as described in the quality feedback below), and therefore it was aborted and not added | |
| to the simulation trajectory. | |
| Now you **must** try again to generate a **BETTER** action, such that the quality issues mentioned in the feedback are addressed, | |
| or instead issue a DONE action and stop for this turn if it is unclear how to improve quality. | |
| Your objective is to **PASS** the quality checks this time if possible. | |
| You can choose either to FIX somehow the action you were about to produce, or to generate something COMPLETELY NEW and DIFFERENT. | |
| Each time your tentative action fail a quality check, you should be MORE RADICAL in your changes, and try to produce | |
| something that is **very** different from the previous attempts. | |
| If it is unclear how to produce a better action, you can choose to issue a DONE action instead. | |
| **It is better to stop acting than to act poorly.** | |
| In general, desireable properties of the action are: | |
| - The action is consistent with the agent's persona, it is what one would expect from the agent given its persona. | |
| - The action is self-consistent, it does contradict the agent's previous actions. | |
| - The action is fluent and natural, and does not repeat itself or use overly formulaic language. | |
| {feedback_from_previous_attempt} | |
| """}) | |
| current_messages_context.append({"role": "system", | |
| "content": "Now generate a better action based on the above feedback, or issue a DONE action if it is unclear how to improve quality."}) | |
| # TODO: remind the model of some key rules to follow? | |
| # | |
| # | |
| #current_messages_context.append({"role": "user", | |
| # "content": """ | |
| # Now you must generate a sequence of actions following the directives in your agent specification, | |
| # complying with **all** instructions and contraints related to the action you use. | |
| # In particular, to ensure the quality of your actions: | |
| # - **DO NOT** generate similar content in a row! We want human-like, natural and fluent behavior, and thus avoid#repeatitive behavior. | |
| # - THINK before taking further actions. | |
| # - Avoid thinking for too long, and actually take some concrete action before being done, particularly if you are expected to provide some action. | |
| # - Intercalate thinking with other actions. | |
| # - The new sequence of actions must be coherent and consistent with the previous actions and stimuli. For example, do not assume an expected or | |
| # desireable action already happened if that's not registered in the simulation history. | |
| # - If you received any quality feedback, you **MUST** take it into account and improve your performance. Your next actions | |
| # **must** be better than your previous ones if possible. | |
| # | |
| # If you can't produce a very good action, you may just issue a DONE action instead and remain silent. Rules to follow in #this case: | |
| # - It is better to remain silent than repeating similar actions or making other mistakes. | |
| # - Avoid remaining silent for too long (i.e., more than 3 times in a row), as this looks robotic and unnatural. If #necessary, you | |
| # can communicate your difficulties in coming up with a proper action, or just say something like "I don't know what to say". | |
| # - In case your thoughts or goals insistenly require you to **not** being quiet or silent, then you avoid just issuing #DONE if possible, | |
| # and try to produce a new action. In this case, the new action might refer to the difficulties you are having in #coming up with | |
| # a proper action in the first place. | |
| # | |
| # All of these actions **MUST** be rendered following the JSON specification perfectly, including all required keys (even #if their value is empty), **ALWAYS**. | |
| # """ | |
| # }) | |
| # | |
| current_messages_context.append({"role": "system", | |
| "content": "Remember: the action you will now generate **MUST** be a **well-formatted** and **valid** JSON object. No extra text, no extra brackets, commas, or other syntax errors."}) | |
| if not self.enable_reasoning_step: | |
| logger.debug(f"[{agent.name}] Reasoning step disabled.") | |
| next_message = openai_utils.client().send_message(current_messages_context, response_format=CognitiveActionModel) | |
| else: | |
| logger.debug(f"[{agent.name}] Reasoning step enabled.") | |
| # If the reasoning step is enabled, we add a system message to the context asking it to think step-by-step | |
| # | |
| # | |
| #current_messages_context.append({"role": "system", | |
| # "content": "In your response, you first use the \"reasoning\" field to think step-by-step about what is the next action and cognitive state that you are going to generate. To do so, you carefully consider: the agent specification given initially; additional instructions given later; and the history of stimuli and actions present in the simulation trajectory." + | |
| # "Then, you generate the action in the \"action\" field, and generate cognitive state in the \"cognitive_state\" field." }) | |
| current_messages_context.append({"role": "system", | |
| "content": "Use the \"reasoning\" field to add any reasoning process you might wish to use before generating the next action and cognitive state. "}) | |
| next_message = openai_utils.client().send_message(current_messages_context, response_format=CognitiveActionModelWithReasoning) | |
| logger.debug(f"[{agent.name}] Received message: {next_message}") | |
| role, content = next_message["role"], utils.extract_json(next_message["content"]) | |
| action = content['action'] | |
| logger.debug(f"{agent.name}'s action: {action}") | |
| return action, role, content | |
| ############################################################################################### | |
| # Quality evaluation methods | |
| ############################################################################################### | |
| def _check_action_quality(self, stage, agent, tentative_action): | |
| from tinytroupe.agent import logger # import here to avoid circular import issues | |
| # | |
| # Compute various propositions about the action | |
| # | |
| persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \ | |
| self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence) | |
| selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = \ | |
| self._check_proposition(agent, self.action_self_consistency, tentative_action, minimum_required_qty_of_actions=1, enable_proposition_check=self.enable_quality_check_for_selfconsistency) | |
| fluency_passed, fluency_passed_score, fluency_feedback = \ | |
| self._check_proposition(agent, self.action_fluency, tentative_action, enable_proposition_check=self.enable_quality_check_for_fluency) | |
| suitability_passed, suitability_score, suitability_feedback = \ | |
| self._check_proposition(agent, self.action_suitability, tentative_action, enable_proposition_check=self.enable_quality_check_for_suitability) | |
| similarity_passed, similarity_score, similarity_feedback = \ | |
| self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity) | |
| # put the results together | |
| good_quality = persona_adherence_passed and selfconsistency_passed and fluency_passed and suitability_passed and similarity_passed | |
| total_score = persona_adherence_score + selfconsistency_score + fluency_passed_score + suitability_score + (similarity_score * Proposition.MAX_SCORE) | |
| combined_feedback = utils.combine_texts( | |
| persona_adherence_feedback, selfconsistency_feedback, fluency_feedback, suitability_feedback, similarity_feedback | |
| ) | |
| # give verdict | |
| if good_quality: | |
| return True, total_score, combined_feedback | |
| else: | |
| failure_feedback = \ | |
| f""" | |
| # Quality feedback | |
| This is the action that was about to be generated by the agent: | |
| {tentative_action} | |
| Unfortunately, the action failed to pass the quality checks, and therefore was aborted and not added to the similation trajectory. | |
| The following problems were detected. | |
| """ | |
| if not persona_adherence_passed: | |
| failure_feedback += f""" | |
| ## Problem: The action does not adhere to the persona specification. | |
| {persona_adherence_feedback} | |
| ### RECOMMENDATIONS FOR IMPROVEMENT | |
| Please follow the recommendations below when trying to generate this action again. | |
| {self.action_persona_adherence.recommendations_for_improvement()} | |
| """ | |
| if not selfconsistency_passed: | |
| failure_feedback += f""" | |
| ## Problem: The action is not self-consistent. | |
| {selfconsistency_feedback} | |
| ### RECOMMENDATIONS FOR IMPROVEMENT | |
| Please follow the recommendations below when trying to generate this action again. | |
| {self.action_self_consistency.recommendations_for_improvement()} | |
| """ | |
| if not fluency_passed: | |
| failure_feedback += f""" | |
| ## Problem: The action is not fluent. | |
| {fluency_feedback} | |
| ### RECOMMENDATIONS FOR IMPROVEMENT | |
| Please follow the recommendations below when trying to generate this action again. | |
| {self.action_fluency.recommendations_for_improvement()} | |
| """ | |
| if not suitability_passed: | |
| failure_feedback += f""" | |
| ## Problem: The action is not suitable to the situation or task. | |
| {suitability_feedback} | |
| ### RECOMMENDATIONS FOR IMPROVEMENT | |
| Please follow the recommendations below when trying to generate this action again. | |
| {self.action_suitability.recommendations_for_improvement()} | |
| """ | |
| if not similarity_passed: | |
| failure_feedback += f""" | |
| ## Problem: The action is too similar to the previous one. | |
| {similarity_feedback} | |
| """ | |
| logger.warning(f"[{agent.name}][{stage}] failed to pass quality checks: {failure_feedback}") | |
| return False, total_score, failure_feedback | |
| def _check_proposition(self, agent, proposition, tentative_action, minimum_required_qty_of_actions=0, enable_proposition_check=True): | |
| if enable_proposition_check: | |
| if agent.actions_count >= minimum_required_qty_of_actions: | |
| result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True) | |
| value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}" | |
| if result["value"] >= self.quality_threshold: | |
| return True, result["value"], value_with_justification | |
| else: | |
| return False, result["value"], value_with_justification | |
| else: | |
| return True, Proposition.MAX_SCORE, f"The proposition is trivially true due to the lack of enough actions for comparison." | |
| else: | |
| # If the proposition check is disabled, we assume it passed | |
| return True, Proposition.MAX_SCORE, f"The proposition check is disabled, so it is assumed to have passed." | |
| def _check_next_action_similarity(self, agent, proposed_next_action, threshold, enable_similarity_check=True): | |
| """ | |
| Checks the similarity between the agent's current action and a proposed next action. | |
| High similarity indicates that the proposed action is too similar to the current one, and this | |
| check fails. | |
| """ | |
| from tinytroupe.agent import logger # import here to avoid circular import issues | |
| if enable_similarity_check: | |
| similarity = utils.next_action_jaccard_similarity(agent, proposed_next_action) | |
| logger.debug(f"[{agent.name}] Next-action Jaccard similarity: {similarity}") | |
| if similarity >= threshold: | |
| logger.warning(f"[{agent.name}] Next-action Jaccard similarity is above the threshold ({threshold}).") | |
| return False, similarity, f"Similarity = {similarity} (range: 0.0 to 1.0). The action is too similar to the previous one." | |
| else: | |
| logger.debug(f"[{agent.name}] Next-action Jaccard similarity is below the threshold ({threshold}).") | |
| return True, similarity, f"Similarity = {similarity} (range: 0.0 to 1.0). The action is sufficiently different from the previous one." | |
| else: | |
| # If the similarity check is disabled, we assume it passed | |
| return True, 0.0, f"The similarity check is disabled, so it is assumed to have passed." | |
| ################################################################################################ | |
| # Action correction methods | |
| ################################################################################################ | |
| def _correct_action(self, action:dict, feedback, llm_role, llm_content): | |
| situation = \ | |
| f""" | |
| The following action by an agent was observed: | |
| {action} | |
| However, it does not conform to expectations about this agent behavior, | |
| due to the following reasons. | |
| {feedback} | |
| """ | |
| #restructured_situation =\ | |
| # utils.restructure_as_observed_vs_expected(\ | |
| # """) | |
| #rule = utils.formulate_corrective_rule(restructured_situation) | |
| rules = utils.extract_observed_vs_expected_rules(situation) | |
| rephrased_action_content = utils.correct_according_to_rule(action["content"], rules) | |
| # copy action | |
| rephrased_action = action.copy() | |
| # update content | |
| rephrased_action["content"] = rephrased_action_content | |
| # replace in the 'action' key in the original llm content message | |
| llm_content["action"] = rephrased_action | |
| return rephrased_action, llm_role, llm_content | |
| def get_statistics(self): | |
| regeneration_failure_rate = self.regeneration_failures / self.regeneration_attempts if self.regeneration_attempts else 0 | |
| direct_correction_failure_rate = self.direct_correction_failures / self.direct_correction_attempts if self.direct_correction_attempts else 0 | |
| regeneration_mean_score = statistics.mean(self.regeneration_scores) if self.regeneration_scores else 0 | |
| regeneration_sd_score = statistics.stdev(self.regeneration_scores) if len(self.regeneration_scores) > 1 else 0 | |
| direct_correction_mean_score = statistics.mean(self.direct_correction_scores) if self.direct_correction_scores else 0 | |
| direct_correction_sd_score = statistics.stdev(self.direct_correction_scores) if len(self.direct_correction_scores) > 1 else 0 | |
| original_success_rate = self.total_original_actions_succeeded / self.total_actions_produced if self.total_actions_produced else 0 | |
| return { | |
| "regeneration_failure_rate": regeneration_failure_rate, | |
| "direct_correction_failure_rate": direct_correction_failure_rate, | |
| "regeneration_mean_score": regeneration_mean_score, | |
| "regeneration_sd_score": regeneration_sd_score, | |
| "direct_correction_mean_score": direct_correction_mean_score, | |
| "direct_correction_sd_score": direct_correction_sd_score, | |
| "total_actions_produced": self.total_actions_produced, | |
| "total_original_actions_succeeded": self.total_original_actions_succeeded, | |
| "original_success_rate": original_success_rate, | |
| "regeneration_success_rate": 1 - regeneration_failure_rate, | |
| "direct_correction_success_rate": 1 - direct_correction_failure_rate | |
| } | |
class PoorQualityActionException(Exception):
    """
    Raised when no generated action passes the quality checks and
    continuing on failure is disabled.
    """

    def __init__(self, message="The generated action is of poor quality"):
        super().__init__(message)
        # Also expose the message as an attribute for convenient programmatic access.
        self.message = message