harvesthealth committed on
Commit
3886ded
·
verified ·
1 Parent(s): 81753f0

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -103,11 +103,11 @@ def find_best_persona(criteria):
103
  personas = load_persona_base()
104
  if not personas:
105
  return {"error": "Persona base is empty. Generate some personas first!"}
106
-
107
  try:
108
  # select_best_persona uses LLM to find the best index
109
  idx = select_best_persona(criteria=criteria, personas=personas)
110
-
111
  try:
112
  idx = int(idx)
113
  except (ValueError, TypeError):
@@ -137,7 +137,7 @@ with gr.Blocks() as demo:
137
  )
138
 
139
  generate_button = gr.Button("Generate Personas")
140
-
141
  gr.Markdown("---")
142
  gr.Markdown("<h3>Search Tresor</h3>")
143
  criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
 
103
  personas = load_persona_base()
104
  if not personas:
105
  return {"error": "Persona base is empty. Generate some personas first!"}
106
+
107
  try:
108
  # select_best_persona uses LLM to find the best index
109
  idx = select_best_persona(criteria=criteria, personas=personas)
110
+
111
  try:
112
  idx = int(idx)
113
  except (ValueError, TypeError):
 
137
  )
138
 
139
  generate_button = gr.Button("Generate Personas")
140
+
141
  gr.Markdown("---")
142
  gr.Markdown("<h3>Search Tresor</h3>")
143
  criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
development/development_plan.txt ADDED
The diff for this file is too large to render. See raw diff
 
tinytroupe/agent/action_generator.py CHANGED
@@ -11,13 +11,13 @@ from tinytroupe.experimentation import Proposition
11
 
12
  class ActionGenerator(JsonSerializableRegistry):
13
 
14
- def __init__(self, max_attempts=2,
15
  enable_quality_checks=True,
16
  enable_regeneration=True,
17
  enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
18
  enable_quality_check_for_persona_adherence=True,
19
  enable_quality_check_for_selfconsistency=True,
20
- enable_quality_check_for_fluency=True,
21
  enable_quality_check_for_suitability=False,
22
  enable_quality_check_for_similarity=False,
23
  continue_on_failure=True,
@@ -67,10 +67,14 @@ class ActionGenerator(JsonSerializableRegistry):
67
  # This generator has its own copies of the propositions, in order to be able to isolate them
68
  # from other agents, particularly when running the simulation in parallel.
69
  self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
 
 
70
  self.action_self_consistency = propositions.action_self_consistency.copy()
71
  self.action_fluency = propositions.action_fluency.copy()
72
  self.action_suitability = propositions.action_suitability.copy()
73
 
 
 
74
  # initialize statistics
75
  self.regeneration_failures = 0
76
  self.direct_correction_failures = 0
@@ -79,6 +83,9 @@ class ActionGenerator(JsonSerializableRegistry):
79
  self.total_actions_produced = 0
80
  self.total_original_actions_succeeded = 0
81
 
 
 
 
82
  def generate_next_action(self, agent, current_messages:list):
83
 
84
  from tinytroupe.agent import logger # import here to avoid circular import issues
@@ -317,25 +324,68 @@ class ActionGenerator(JsonSerializableRegistry):
317
  # Quality evaluation methods
318
  ###############################################################################################
319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  def _check_action_quality(self, stage, agent, tentative_action):
321
 
322
  from tinytroupe.agent import logger # import here to avoid circular import issues
 
323
 
324
  #
325
- # Compute various propositions about the action
 
 
 
 
 
 
 
326
  #
327
  persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
328
  self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
329
 
330
- selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = \
331
- self._check_proposition(agent, self.action_self_consistency, tentative_action, minimum_required_qty_of_actions=1, enable_proposition_check=self.enable_quality_check_for_selfconsistency)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- fluency_passed, fluency_passed_score, fluency_feedback = \
334
- self._check_proposition(agent, self.action_fluency, tentative_action, enable_proposition_check=self.enable_quality_check_for_fluency)
335
 
336
- suitability_passed, suitability_score, suitability_feedback = \
337
- self._check_proposition(agent, self.action_suitability, tentative_action, enable_proposition_check=self.enable_quality_check_for_suitability)
 
338
 
 
339
  similarity_passed, similarity_score, similarity_feedback = \
340
  self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
341
 
@@ -427,7 +477,13 @@ class ActionGenerator(JsonSerializableRegistry):
427
 
428
  if enable_proposition_check:
429
  if agent.actions_count >= minimum_required_qty_of_actions:
430
- result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True)
 
 
 
 
 
 
431
 
432
  value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
433
 
 
11
 
12
  class ActionGenerator(JsonSerializableRegistry):
13
 
14
+ def __init__(self, max_attempts=1,
15
  enable_quality_checks=True,
16
  enable_regeneration=True,
17
  enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
18
  enable_quality_check_for_persona_adherence=True,
19
  enable_quality_check_for_selfconsistency=True,
20
+ enable_quality_check_for_fluency=False,
21
  enable_quality_check_for_suitability=False,
22
  enable_quality_check_for_similarity=False,
23
  continue_on_failure=True,
 
67
  # This generator has its own copies of the propositions, in order to be able to isolate them
68
  # from other agents, particularly when running the simulation in parallel.
69
  self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
70
+ self.action_persona_adherence.model = "alias-large" # Critical check uses a larger model
71
+
72
  self.action_self_consistency = propositions.action_self_consistency.copy()
73
  self.action_fluency = propositions.action_fluency.copy()
74
  self.action_suitability = propositions.action_suitability.copy()
75
 
76
+ # Non-critical checks use the default model (assumed to be faster)
77
+
78
  # initialize statistics
79
  self.regeneration_failures = 0
80
  self.direct_correction_failures = 0
 
83
  self.total_actions_produced = 0
84
  self.total_original_actions_succeeded = 0
85
 
86
+ # initialize evaluation cache
87
+ self.evaluation_cache = {}
88
+
89
  def generate_next_action(self, agent, current_messages:list):
90
 
91
  from tinytroupe.agent import logger # import here to avoid circular import issues
 
324
  # Quality evaluation methods
325
  ###############################################################################################
326
 
327
+ def _pre_filter_action(self, action):
328
+ """
329
+ Quick rule-based checks before LLM evaluation.
330
+ """
331
+ content = action.get("content", "")
332
+ if not isinstance(content, str):
333
+ content = str(content)
334
+
335
+ # Check for obvious violations without LLM
336
+ if len(content) < 5:
337
+ return False, 0, "Action too short - rule-based filter"
338
+
339
+ # Check for prohibited content patterns
340
+ prohibited_patterns = ["I cannot", "I'm sorry", "As an AI"]
341
+ if any(pattern in content for pattern in prohibited_patterns):
342
+ return False, 0, "Prohibited content pattern detected"
343
+
344
+ return True, Proposition.MAX_SCORE, "Passed pre-filter"
345
+
346
  def _check_action_quality(self, stage, agent, tentative_action):
347
 
348
  from tinytroupe.agent import logger # import here to avoid circular import issues
349
+ from tinytroupe.utils.parallel import parallel_map
350
 
351
  #
352
+ # Pre-filter check
353
+ #
354
+ pre_filter_passed, pre_filter_score, pre_filter_feedback = self._pre_filter_action(tentative_action)
355
+ if not pre_filter_passed:
356
+ return False, pre_filter_score, pre_filter_feedback
357
+
358
+ #
359
+ # Critical Check: Persona Adherence (Sequential because it's critical and common to fail)
360
  #
361
  persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
362
  self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
363
 
364
+ # Early exit if persona adherence fails
365
+ if not persona_adherence_passed:
366
+ return False, persona_adherence_score, persona_adherence_feedback
367
+
368
+ #
369
+ # Parallel Quality Checks for the rest
370
+ #
371
+ def run_check(check_info):
372
+ name, prop, min_actions, enabled = check_info
373
+ return name, self._check_proposition(agent, prop, tentative_action, minimum_required_qty_of_actions=min_actions, enable_proposition_check=enabled)
374
+
375
+ other_checks = [
376
+ ("self_consistency", self.action_self_consistency, 1, self.enable_quality_check_for_selfconsistency),
377
+ ("fluency", self.action_fluency, 0, self.enable_quality_check_for_fluency),
378
+ ("suitability", self.action_suitability, 0, self.enable_quality_check_for_suitability)
379
+ ]
380
 
381
+ parallel_results = parallel_map(other_checks, run_check)
382
+ results_dict = dict(parallel_results)
383
 
384
+ selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = results_dict["self_consistency"]
385
+ fluency_passed, fluency_passed_score, fluency_feedback = results_dict["fluency"]
386
+ suitability_passed, suitability_score, suitability_feedback = results_dict["suitability"]
387
 
388
+ # Similarity check (local, so no need to parallelize)
389
  similarity_passed, similarity_score, similarity_feedback = \
390
  self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
391
 
 
477
 
478
  if enable_proposition_check:
479
  if agent.actions_count >= minimum_required_qty_of_actions:
480
+ # Cache check
481
+ cache_key = (id(proposition), agent.name, json.dumps(tentative_action, sort_keys=True))
482
+ if cache_key in self.evaluation_cache:
483
+ result = self.evaluation_cache[cache_key]
484
+ else:
485
+ result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True)
486
+ self.evaluation_cache[cache_key] = result
487
 
488
  value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
489
 
tinytroupe/experimentation/proposition.py CHANGED
@@ -15,7 +15,7 @@ class Proposition:
15
  MAX_SCORE = 9
16
 
17
  def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
18
- double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None):
19
  """
20
  Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
21
  The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
@@ -55,6 +55,8 @@ class Proposition:
55
 
56
  self.precondition_function = precondition_function
57
 
 
 
58
  # the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
59
  # the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
60
  self.llm_chat = None
@@ -79,7 +81,8 @@ class Proposition:
79
  last_n=self.last_n,
80
  double_check=self.double_check,
81
  use_reasoning_model=self.use_reasoning_model,
82
- precondition_function=self.precondition_function
 
83
  )
84
  return new_prop
85
 
@@ -368,6 +371,9 @@ class Proposition:
368
  return recommendation
369
 
370
  def _model(self, use_reasoning_model):
 
 
 
371
  if use_reasoning_model:
372
  return default["reasoning_model"]
373
  else:
 
15
  MAX_SCORE = 9
16
 
17
  def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
18
+ double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None, model:str=None):
19
  """
20
  Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
21
  The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
 
55
 
56
  self.precondition_function = precondition_function
57
 
58
+ self.model = model
59
+
60
  # the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
61
  # the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
62
  self.llm_chat = None
 
81
  last_n=self.last_n,
82
  double_check=self.double_check,
83
  use_reasoning_model=self.use_reasoning_model,
84
+ precondition_function=self.precondition_function,
85
+ model=self.model
86
  )
87
  return new_prop
88
 
 
371
  return recommendation
372
 
373
  def _model(self, use_reasoning_model):
374
+ if self.model:
375
+ return self.model
376
+
377
  if use_reasoning_model:
378
  return default["reasoning_model"]
379
  else:
tinytroupe/factory/tiny_person_factory.py CHANGED
@@ -557,7 +557,7 @@ class TinyPersonFactory(TinyFactory):
557
  # This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
558
  if len(self.remaining_characteristics_sample) != n:
559
  logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
560
-
561
  # If we got more samples than requested, we truncate them to avoid generating too many names or personas.
562
  if len(self.remaining_characteristics_sample) > n:
563
  logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
 
557
  # This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
558
  if len(self.remaining_characteristics_sample) != n:
559
  logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
560
+
561
  # If we got more samples than requested, we truncate them to avoid generating too many names or personas.
562
  if len(self.remaining_characteristics_sample) > n:
563
  logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
tinytroupe/openai_utils.py CHANGED
@@ -179,7 +179,7 @@ class OpenAIClient:
179
  else:
180
  current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
181
  current_wait_time = 60
182
-
183
  chat_api_params["model"] = current_model
184
 
185
  try:
@@ -217,15 +217,15 @@ class OpenAIClient:
217
  logger.error(f"[{i}] Invalid request error, won't retry: {e}")
218
  return None
219
 
220
- except (openai.RateLimitError,
221
- openai.APITimeoutError,
222
- openai.APIConnectionError,
223
- openai.InternalServerError,
224
  NonTerminalError,
225
  Exception) as e:
226
  msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
227
  logger.warning(msg)
228
-
229
  time.sleep(current_wait_time)
230
  continue
231
 
@@ -250,12 +250,12 @@ class OpenAIClient:
250
  chat_api_params["reasoning_effort"] = default["reasoning_effort"]
251
 
252
 
253
- # To make the log cleaner, we remove the messages from the logged parameters,
254
  # unless we are in debug mode
255
  if logger.getEffectiveLevel() <= logging.DEBUG:
256
  logged_params = chat_api_params
257
  else:
258
- logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
259
 
260
  if "response_format" in chat_api_params:
261
  # to enforce the response format via pydantic, we need to use a different method
 
179
  else:
180
  current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
181
  current_wait_time = 60
182
+
183
  chat_api_params["model"] = current_model
184
 
185
  try:
 
217
  logger.error(f"[{i}] Invalid request error, won't retry: {e}")
218
  return None
219
 
220
+ except (openai.RateLimitError,
221
+ openai.APITimeoutError,
222
+ openai.APIConnectionError,
223
+ openai.InternalServerError,
224
  NonTerminalError,
225
  Exception) as e:
226
  msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
227
  logger.warning(msg)
228
+
229
  time.sleep(current_wait_time)
230
  continue
231
 
 
250
  chat_api_params["reasoning_effort"] = default["reasoning_effort"]
251
 
252
 
253
+ # To make the log cleaner, we remove the messages from the logged parameters,
254
  # unless we are in debug mode
255
  if logger.getEffectiveLevel() <= logging.DEBUG:
256
  logged_params = chat_api_params
257
  else:
258
+ logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
259
 
260
  if "response_format" in chat_api_params:
261
  # to enforce the response format via pydantic, we need to use a different method
tinytroupe/utils/semantics.py CHANGED
@@ -268,7 +268,7 @@ def compute_semantic_proximity(text1: str, text2: str, context: str = None) -> f
268
  @llm()
269
  def select_best_persona(criteria: str, personas: list) -> int:
270
  """
271
- Given a set of criteria and a list of personas (each a dictionary),
272
  select the index of the persona that best matches the criteria.
273
  If no persona matches at all, return -1.
274
 
@@ -286,4 +286,3 @@ def select_best_persona(criteria: str, personas: list) -> int:
286
  int: The index of the best matching persona, or -1 if none match.
287
  """
288
  # llm decorator will handle the body of this function
289
-
 
268
  @llm()
269
  def select_best_persona(criteria: str, personas: list) -> int:
270
  """
271
+ Given a set of criteria and a list of personas (each a dictionary),
272
  select the index of the persona that best matches the criteria.
273
  If no persona matches at all, return -1.
274
 
 
286
  int: The index of the best matching persona, or -1 if none match.
287
  """
288
  # llm decorator will handle the body of this function