harvesthealth committed on
Commit
3886ded
·
verified ·
1 Parent(s): 81753f0

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -103,11 +103,11 @@ def find_best_persona(criteria):
103
  personas = load_persona_base()
104
  if not personas:
105
  return {"error": "Persona base is empty. Generate some personas first!"}
106
-
107
  try:
108
  # select_best_persona uses LLM to find the best index
109
  idx = select_best_persona(criteria=criteria, personas=personas)
110
-
111
  try:
112
  idx = int(idx)
113
  except (ValueError, TypeError):
@@ -137,7 +137,7 @@ with gr.Blocks() as demo:
137
  )
138
 
139
  generate_button = gr.Button("Generate Personas")
140
-
141
  gr.Markdown("---")
142
  gr.Markdown("<h3>Search Tresor</h3>")
143
  criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
 
103
  personas = load_persona_base()
104
  if not personas:
105
  return {"error": "Persona base is empty. Generate some personas first!"}
106
+
107
  try:
108
  # select_best_persona uses LLM to find the best index
109
  idx = select_best_persona(criteria=criteria, personas=personas)
110
+
111
  try:
112
  idx = int(idx)
113
  except (ValueError, TypeError):
 
137
  )
138
 
139
  generate_button = gr.Button("Generate Personas")
140
+
141
  gr.Markdown("---")
142
  gr.Markdown("<h3>Search Tresor</h3>")
143
  criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
development/development_plan.txt ADDED
The diff for this file is too large to render. See raw diff
 
tinytroupe/agent/action_generator.py CHANGED
@@ -11,13 +11,13 @@ from tinytroupe.experimentation import Proposition
11
 
12
  class ActionGenerator(JsonSerializableRegistry):
13
 
14
- def __init__(self, max_attempts=2,
15
  enable_quality_checks=True,
16
  enable_regeneration=True,
17
  enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
18
  enable_quality_check_for_persona_adherence=True,
19
  enable_quality_check_for_selfconsistency=True,
20
- enable_quality_check_for_fluency=True,
21
  enable_quality_check_for_suitability=False,
22
  enable_quality_check_for_similarity=False,
23
  continue_on_failure=True,
@@ -67,10 +67,14 @@ class ActionGenerator(JsonSerializableRegistry):
67
  # This generator has its own copies of the propositions, in order to be able to isolate them
68
  # from other agents, particularly when running the simulation in parallel.
69
  self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
 
 
70
  self.action_self_consistency = propositions.action_self_consistency.copy()
71
  self.action_fluency = propositions.action_fluency.copy()
72
  self.action_suitability = propositions.action_suitability.copy()
73
 
 
 
74
  # initialize statistics
75
  self.regeneration_failures = 0
76
  self.direct_correction_failures = 0
@@ -79,6 +83,9 @@ class ActionGenerator(JsonSerializableRegistry):
79
  self.total_actions_produced = 0
80
  self.total_original_actions_succeeded = 0
81
 
 
 
 
82
  def generate_next_action(self, agent, current_messages:list):
83
 
84
  from tinytroupe.agent import logger # import here to avoid circular import issues
@@ -317,25 +324,68 @@ class ActionGenerator(JsonSerializableRegistry):
317
  # Quality evaluation methods
318
  ###############################################################################################
319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  def _check_action_quality(self, stage, agent, tentative_action):
321
 
322
  from tinytroupe.agent import logger # import here to avoid circular import issues
 
323
 
324
  #
325
- # Compute various propositions about the action
 
 
 
 
 
 
 
326
  #
327
  persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
328
  self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
329
 
330
- selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = \
331
- self._check_proposition(agent, self.action_self_consistency, tentative_action, minimum_required_qty_of_actions=1, enable_proposition_check=self.enable_quality_check_for_selfconsistency)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- fluency_passed, fluency_passed_score, fluency_feedback = \
334
- self._check_proposition(agent, self.action_fluency, tentative_action, enable_proposition_check=self.enable_quality_check_for_fluency)
335
 
336
- suitability_passed, suitability_score, suitability_feedback = \
337
- self._check_proposition(agent, self.action_suitability, tentative_action, enable_proposition_check=self.enable_quality_check_for_suitability)
 
338
 
 
339
  similarity_passed, similarity_score, similarity_feedback = \
340
  self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
341
 
@@ -427,7 +477,13 @@ class ActionGenerator(JsonSerializableRegistry):
427
 
428
  if enable_proposition_check:
429
  if agent.actions_count >= minimum_required_qty_of_actions:
430
- result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True)
 
 
 
 
 
 
431
 
432
  value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
433
 
 
11
 
12
  class ActionGenerator(JsonSerializableRegistry):
13
 
14
+ def __init__(self, max_attempts=1,
15
  enable_quality_checks=True,
16
  enable_regeneration=True,
17
  enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
18
  enable_quality_check_for_persona_adherence=True,
19
  enable_quality_check_for_selfconsistency=True,
20
+ enable_quality_check_for_fluency=False,
21
  enable_quality_check_for_suitability=False,
22
  enable_quality_check_for_similarity=False,
23
  continue_on_failure=True,
 
67
  # This generator has its own copies of the propositions, in order to be able to isolate them
68
  # from other agents, particularly when running the simulation in parallel.
69
  self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
70
+ self.action_persona_adherence.model = "alias-large" # Critical check uses a larger model
71
+
72
  self.action_self_consistency = propositions.action_self_consistency.copy()
73
  self.action_fluency = propositions.action_fluency.copy()
74
  self.action_suitability = propositions.action_suitability.copy()
75
 
76
+ # Non-critical checks use the default model (assumed to be faster)
77
+
78
  # initialize statistics
79
  self.regeneration_failures = 0
80
  self.direct_correction_failures = 0
 
83
  self.total_actions_produced = 0
84
  self.total_original_actions_succeeded = 0
85
 
86
+ # initialize evaluation cache
87
+ self.evaluation_cache = {}
88
+
89
  def generate_next_action(self, agent, current_messages:list):
90
 
91
  from tinytroupe.agent import logger # import here to avoid circular import issues
 
324
  # Quality evaluation methods
325
  ###############################################################################################
326
 
327
+ def _pre_filter_action(self, action):
328
+ """
329
+ Quick rule-based checks before LLM evaluation.
330
+ """
331
+ content = action.get("content", "")
332
+ if not isinstance(content, str):
333
+ content = str(content)
334
+
335
+ # Check for obvious violations without LLM
336
+ if len(content) < 5:
337
+ return False, 0, "Action too short - rule-based filter"
338
+
339
+ # Check for prohibited content patterns
340
+ prohibited_patterns = ["I cannot", "I'm sorry", "As an AI"]
341
+ if any(pattern in content for pattern in prohibited_patterns):
342
+ return False, 0, "Prohibited content pattern detected"
343
+
344
+ return True, Proposition.MAX_SCORE, "Passed pre-filter"
345
+
346
  def _check_action_quality(self, stage, agent, tentative_action):
347
 
348
  from tinytroupe.agent import logger # import here to avoid circular import issues
349
+ from tinytroupe.utils.parallel import parallel_map
350
 
351
  #
352
+ # Pre-filter check
353
+ #
354
+ pre_filter_passed, pre_filter_score, pre_filter_feedback = self._pre_filter_action(tentative_action)
355
+ if not pre_filter_passed:
356
+ return False, pre_filter_score, pre_filter_feedback
357
+
358
+ #
359
+ # Critical Check: Persona Adherence (Sequential because it's critical and common to fail)
360
  #
361
  persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
362
  self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
363
 
364
+ # Early exit if persona adherence fails
365
+ if not persona_adherence_passed:
366
+ return False, persona_adherence_score, persona_adherence_feedback
367
+
368
+ #
369
+ # Parallel Quality Checks for the rest
370
+ #
371
+ def run_check(check_info):
372
+ name, prop, min_actions, enabled = check_info
373
+ return name, self._check_proposition(agent, prop, tentative_action, minimum_required_qty_of_actions=min_actions, enable_proposition_check=enabled)
374
+
375
+ other_checks = [
376
+ ("self_consistency", self.action_self_consistency, 1, self.enable_quality_check_for_selfconsistency),
377
+ ("fluency", self.action_fluency, 0, self.enable_quality_check_for_fluency),
378
+ ("suitability", self.action_suitability, 0, self.enable_quality_check_for_suitability)
379
+ ]
380
 
381
+ parallel_results = parallel_map(other_checks, run_check)
382
+ results_dict = dict(parallel_results)
383
 
384
+ selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = results_dict["self_consistency"]
385
+ fluency_passed, fluency_passed_score, fluency_feedback = results_dict["fluency"]
386
+ suitability_passed, suitability_score, suitability_feedback = results_dict["suitability"]
387
 
388
+ # Similarity check (local, so no need to parallelize)
389
  similarity_passed, similarity_score, similarity_feedback = \
390
  self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
391
 
 
477
 
478
  if enable_proposition_check:
479
  if agent.actions_count >= minimum_required_qty_of_actions:
480
+ # Cache check
481
+ cache_key = (id(proposition), agent.name, json.dumps(tentative_action, sort_keys=True))
482
+ if cache_key in self.evaluation_cache:
483
+ result = self.evaluation_cache[cache_key]
484
+ else:
485
+ result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True)
486
+ self.evaluation_cache[cache_key] = result
487
 
488
  value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
489
 
tinytroupe/experimentation/proposition.py CHANGED
@@ -15,7 +15,7 @@ class Proposition:
15
  MAX_SCORE = 9
16
 
17
  def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
18
- double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None):
19
  """
20
  Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
21
  The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
@@ -55,6 +55,8 @@ class Proposition:
55
 
56
  self.precondition_function = precondition_function
57
 
 
 
58
  # the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
59
  # the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
60
  self.llm_chat = None
@@ -79,7 +81,8 @@ class Proposition:
79
  last_n=self.last_n,
80
  double_check=self.double_check,
81
  use_reasoning_model=self.use_reasoning_model,
82
- precondition_function=self.precondition_function
 
83
  )
84
  return new_prop
85
 
@@ -368,6 +371,9 @@ class Proposition:
368
  return recommendation
369
 
370
  def _model(self, use_reasoning_model):
 
 
 
371
  if use_reasoning_model:
372
  return default["reasoning_model"]
373
  else:
 
15
  MAX_SCORE = 9
16
 
17
  def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
18
+ double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None, model:str=None):
19
  """
20
  Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
21
  The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
 
55
 
56
  self.precondition_function = precondition_function
57
 
58
+ self.model = model
59
+
60
  # the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
61
  # the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
62
  self.llm_chat = None
 
81
  last_n=self.last_n,
82
  double_check=self.double_check,
83
  use_reasoning_model=self.use_reasoning_model,
84
+ precondition_function=self.precondition_function,
85
+ model=self.model
86
  )
87
  return new_prop
88
 
 
371
  return recommendation
372
 
373
  def _model(self, use_reasoning_model):
374
+ if self.model:
375
+ return self.model
376
+
377
  if use_reasoning_model:
378
  return default["reasoning_model"]
379
  else:
tinytroupe/factory/tiny_person_factory.py CHANGED
@@ -557,7 +557,7 @@ class TinyPersonFactory(TinyFactory):
557
  # This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
558
  if len(self.remaining_characteristics_sample) != n:
559
  logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
560
-
561
  # If we got more samples than requested, we truncate them to avoid generating too many names or personas.
562
  if len(self.remaining_characteristics_sample) > n:
563
  logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
 
557
  # This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
558
  if len(self.remaining_characteristics_sample) != n:
559
  logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
560
+
561
  # If we got more samples than requested, we truncate them to avoid generating too many names or personas.
562
  if len(self.remaining_characteristics_sample) > n:
563
  logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
tinytroupe/openai_utils.py CHANGED
@@ -179,7 +179,7 @@ class OpenAIClient:
179
  else:
180
  current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
181
  current_wait_time = 60
182
-
183
  chat_api_params["model"] = current_model
184
 
185
  try:
@@ -217,15 +217,15 @@ class OpenAIClient:
217
  logger.error(f"[{i}] Invalid request error, won't retry: {e}")
218
  return None
219
 
220
- except (openai.RateLimitError,
221
- openai.APITimeoutError,
222
- openai.APIConnectionError,
223
- openai.InternalServerError,
224
  NonTerminalError,
225
  Exception) as e:
226
  msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
227
  logger.warning(msg)
228
-
229
  time.sleep(current_wait_time)
230
  continue
231
 
@@ -250,12 +250,12 @@ class OpenAIClient:
250
  chat_api_params["reasoning_effort"] = default["reasoning_effort"]
251
 
252
 
253
- # To make the log cleaner, we remove the messages from the logged parameters,
254
  # unless we are in debug mode
255
  if logger.getEffectiveLevel() <= logging.DEBUG:
256
  logged_params = chat_api_params
257
  else:
258
- logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
259
 
260
  if "response_format" in chat_api_params:
261
  # to enforce the response format via pydantic, we need to use a different method
 
179
  else:
180
  current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
181
  current_wait_time = 60
182
+
183
  chat_api_params["model"] = current_model
184
 
185
  try:
 
217
  logger.error(f"[{i}] Invalid request error, won't retry: {e}")
218
  return None
219
 
220
+ except (openai.RateLimitError,
221
+ openai.APITimeoutError,
222
+ openai.APIConnectionError,
223
+ openai.InternalServerError,
224
  NonTerminalError,
225
  Exception) as e:
226
  msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
227
  logger.warning(msg)
228
+
229
  time.sleep(current_wait_time)
230
  continue
231
 
 
250
  chat_api_params["reasoning_effort"] = default["reasoning_effort"]
251
 
252
 
253
+ # To make the log cleaner, we remove the messages from the logged parameters,
254
  # unless we are in debug mode
255
  if logger.getEffectiveLevel() <= logging.DEBUG:
256
  logged_params = chat_api_params
257
  else:
258
+ logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
259
 
260
  if "response_format" in chat_api_params:
261
  # to enforce the response format via pydantic, we need to use a different method
tinytroupe/utils/semantics.py CHANGED
@@ -268,7 +268,7 @@ def compute_semantic_proximity(text1: str, text2: str, context: str = None) -> f
268
  @llm()
269
  def select_best_persona(criteria: str, personas: list) -> int:
270
  """
271
- Given a set of criteria and a list of personas (each a dictionary),
272
  select the index of the persona that best matches the criteria.
273
  If no persona matches at all, return -1.
274
 
@@ -286,4 +286,3 @@ def select_best_persona(criteria: str, personas: list) -> int:
286
  int: The index of the best matching persona, or -1 if none match.
287
  """
288
  # llm decorator will handle the body of this function
289
-
 
268
  @llm()
269
  def select_best_persona(criteria: str, personas: list) -> int:
270
  """
271
+ Given a set of criteria and a list of personas (each a dictionary),
272
  select the index of the persona that best matches the criteria.
273
  If no persona matches at all, return -1.
274
 
 
286
  int: The index of the best matching persona, or -1 if none match.
287
  """
288
  # llm decorator will handle the body of this function