Spaces:
Paused
Paused
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -103,11 +103,11 @@ def find_best_persona(criteria):
|
|
| 103 |
personas = load_persona_base()
|
| 104 |
if not personas:
|
| 105 |
return {"error": "Persona base is empty. Generate some personas first!"}
|
| 106 |
-
|
| 107 |
try:
|
| 108 |
# select_best_persona uses LLM to find the best index
|
| 109 |
idx = select_best_persona(criteria=criteria, personas=personas)
|
| 110 |
-
|
| 111 |
try:
|
| 112 |
idx = int(idx)
|
| 113 |
except (ValueError, TypeError):
|
|
@@ -137,7 +137,7 @@ with gr.Blocks() as demo:
|
|
| 137 |
)
|
| 138 |
|
| 139 |
generate_button = gr.Button("Generate Personas")
|
| 140 |
-
|
| 141 |
gr.Markdown("---")
|
| 142 |
gr.Markdown("<h3>Search Tresor</h3>")
|
| 143 |
criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
|
|
|
|
| 103 |
personas = load_persona_base()
|
| 104 |
if not personas:
|
| 105 |
return {"error": "Persona base is empty. Generate some personas first!"}
|
| 106 |
+
|
| 107 |
try:
|
| 108 |
# select_best_persona uses LLM to find the best index
|
| 109 |
idx = select_best_persona(criteria=criteria, personas=personas)
|
| 110 |
+
|
| 111 |
try:
|
| 112 |
idx = int(idx)
|
| 113 |
except (ValueError, TypeError):
|
|
|
|
| 137 |
)
|
| 138 |
|
| 139 |
generate_button = gr.Button("Generate Personas")
|
| 140 |
+
|
| 141 |
gr.Markdown("---")
|
| 142 |
gr.Markdown("<h3>Search Tresor</h3>")
|
| 143 |
criteria_input = gr.Textbox(label="Criteria to find best matching persona", lines=2)
|
development/development_plan.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tinytroupe/agent/action_generator.py
CHANGED
|
@@ -11,13 +11,13 @@ from tinytroupe.experimentation import Proposition
|
|
| 11 |
|
| 12 |
class ActionGenerator(JsonSerializableRegistry):
|
| 13 |
|
| 14 |
-
def __init__(self, max_attempts=
|
| 15 |
enable_quality_checks=True,
|
| 16 |
enable_regeneration=True,
|
| 17 |
enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
|
| 18 |
enable_quality_check_for_persona_adherence=True,
|
| 19 |
enable_quality_check_for_selfconsistency=True,
|
| 20 |
-
enable_quality_check_for_fluency=
|
| 21 |
enable_quality_check_for_suitability=False,
|
| 22 |
enable_quality_check_for_similarity=False,
|
| 23 |
continue_on_failure=True,
|
|
@@ -67,10 +67,14 @@ class ActionGenerator(JsonSerializableRegistry):
|
|
| 67 |
# This generator has its own copies of the propositions, in order to be able to isolate them
|
| 68 |
# from other agents, particularly when running the simulation in parallel.
|
| 69 |
self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
|
|
|
|
|
|
|
| 70 |
self.action_self_consistency = propositions.action_self_consistency.copy()
|
| 71 |
self.action_fluency = propositions.action_fluency.copy()
|
| 72 |
self.action_suitability = propositions.action_suitability.copy()
|
| 73 |
|
|
|
|
|
|
|
| 74 |
# initialize statistics
|
| 75 |
self.regeneration_failures = 0
|
| 76 |
self.direct_correction_failures = 0
|
|
@@ -79,6 +83,9 @@ class ActionGenerator(JsonSerializableRegistry):
|
|
| 79 |
self.total_actions_produced = 0
|
| 80 |
self.total_original_actions_succeeded = 0
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 82 |
def generate_next_action(self, agent, current_messages:list):
|
| 83 |
|
| 84 |
from tinytroupe.agent import logger # import here to avoid circular import issues
|
|
@@ -317,25 +324,68 @@ class ActionGenerator(JsonSerializableRegistry):
|
|
| 317 |
# Quality evaluation methods
|
| 318 |
###############################################################################################
|
| 319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
def _check_action_quality(self, stage, agent, tentative_action):
|
| 321 |
|
| 322 |
from tinytroupe.agent import logger # import here to avoid circular import issues
|
|
|
|
| 323 |
|
| 324 |
#
|
| 325 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
#
|
| 327 |
persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
|
| 328 |
self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
|
| 329 |
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
|
| 336 |
-
|
| 337 |
-
|
|
|
|
| 338 |
|
|
|
|
| 339 |
similarity_passed, similarity_score, similarity_feedback = \
|
| 340 |
self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
|
| 341 |
|
|
@@ -427,7 +477,13 @@ class ActionGenerator(JsonSerializableRegistry):
|
|
| 427 |
|
| 428 |
if enable_proposition_check:
|
| 429 |
if agent.actions_count >= minimum_required_qty_of_actions:
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
|
| 433 |
|
|
|
|
| 11 |
|
| 12 |
class ActionGenerator(JsonSerializableRegistry):
|
| 13 |
|
| 14 |
+
def __init__(self, max_attempts=1,
|
| 15 |
enable_quality_checks=True,
|
| 16 |
enable_regeneration=True,
|
| 17 |
enable_direct_correction=False, # TODO enable_direct_correction not working very well yet
|
| 18 |
enable_quality_check_for_persona_adherence=True,
|
| 19 |
enable_quality_check_for_selfconsistency=True,
|
| 20 |
+
enable_quality_check_for_fluency=False,
|
| 21 |
enable_quality_check_for_suitability=False,
|
| 22 |
enable_quality_check_for_similarity=False,
|
| 23 |
continue_on_failure=True,
|
|
|
|
| 67 |
# This generator has its own copies of the propositions, in order to be able to isolate them
|
| 68 |
# from other agents, particularly when running the simulation in parallel.
|
| 69 |
self.action_persona_adherence = propositions.hard_action_persona_adherence.copy()
|
| 70 |
+
self.action_persona_adherence.model = "alias-large" # Critical check uses a larger model
|
| 71 |
+
|
| 72 |
self.action_self_consistency = propositions.action_self_consistency.copy()
|
| 73 |
self.action_fluency = propositions.action_fluency.copy()
|
| 74 |
self.action_suitability = propositions.action_suitability.copy()
|
| 75 |
|
| 76 |
+
# Non-critical checks use the default model (assumed to be faster)
|
| 77 |
+
|
| 78 |
# initialize statistics
|
| 79 |
self.regeneration_failures = 0
|
| 80 |
self.direct_correction_failures = 0
|
|
|
|
| 83 |
self.total_actions_produced = 0
|
| 84 |
self.total_original_actions_succeeded = 0
|
| 85 |
|
| 86 |
+
# initialize evaluation cache
|
| 87 |
+
self.evaluation_cache = {}
|
| 88 |
+
|
| 89 |
def generate_next_action(self, agent, current_messages:list):
|
| 90 |
|
| 91 |
from tinytroupe.agent import logger # import here to avoid circular import issues
|
|
|
|
| 324 |
# Quality evaluation methods
|
| 325 |
###############################################################################################
|
| 326 |
|
| 327 |
+
def _pre_filter_action(self, action):
|
| 328 |
+
"""
|
| 329 |
+
Quick rule-based checks before LLM evaluation.
|
| 330 |
+
"""
|
| 331 |
+
content = action.get("content", "")
|
| 332 |
+
if not isinstance(content, str):
|
| 333 |
+
content = str(content)
|
| 334 |
+
|
| 335 |
+
# Check for obvious violations without LLM
|
| 336 |
+
if len(content) < 5:
|
| 337 |
+
return False, 0, "Action too short - rule-based filter"
|
| 338 |
+
|
| 339 |
+
# Check for prohibited content patterns
|
| 340 |
+
prohibited_patterns = ["I cannot", "I'm sorry", "As an AI"]
|
| 341 |
+
if any(pattern in content for pattern in prohibited_patterns):
|
| 342 |
+
return False, 0, "Prohibited content pattern detected"
|
| 343 |
+
|
| 344 |
+
return True, Proposition.MAX_SCORE, "Passed pre-filter"
|
| 345 |
+
|
| 346 |
def _check_action_quality(self, stage, agent, tentative_action):
|
| 347 |
|
| 348 |
from tinytroupe.agent import logger # import here to avoid circular import issues
|
| 349 |
+
from tinytroupe.utils.parallel import parallel_map
|
| 350 |
|
| 351 |
#
|
| 352 |
+
# Pre-filter check
|
| 353 |
+
#
|
| 354 |
+
pre_filter_passed, pre_filter_score, pre_filter_feedback = self._pre_filter_action(tentative_action)
|
| 355 |
+
if not pre_filter_passed:
|
| 356 |
+
return False, pre_filter_score, pre_filter_feedback
|
| 357 |
+
|
| 358 |
+
#
|
| 359 |
+
# Critical Check: Persona Adherence (Sequential because it's critical and common to fail)
|
| 360 |
#
|
| 361 |
persona_adherence_passed, persona_adherence_score, persona_adherence_feedback = \
|
| 362 |
self._check_proposition(agent, self.action_persona_adherence, tentative_action, enable_proposition_check=self.enable_quality_check_for_persona_adherence)
|
| 363 |
|
| 364 |
+
# Early exit if persona adherence fails
|
| 365 |
+
if not persona_adherence_passed:
|
| 366 |
+
return False, persona_adherence_score, persona_adherence_feedback
|
| 367 |
+
|
| 368 |
+
#
|
| 369 |
+
# Parallel Quality Checks for the rest
|
| 370 |
+
#
|
| 371 |
+
def run_check(check_info):
|
| 372 |
+
name, prop, min_actions, enabled = check_info
|
| 373 |
+
return name, self._check_proposition(agent, prop, tentative_action, minimum_required_qty_of_actions=min_actions, enable_proposition_check=enabled)
|
| 374 |
+
|
| 375 |
+
other_checks = [
|
| 376 |
+
("self_consistency", self.action_self_consistency, 1, self.enable_quality_check_for_selfconsistency),
|
| 377 |
+
("fluency", self.action_fluency, 0, self.enable_quality_check_for_fluency),
|
| 378 |
+
("suitability", self.action_suitability, 0, self.enable_quality_check_for_suitability)
|
| 379 |
+
]
|
| 380 |
|
| 381 |
+
parallel_results = parallel_map(other_checks, run_check)
|
| 382 |
+
results_dict = dict(parallel_results)
|
| 383 |
|
| 384 |
+
selfconsistency_passed, selfconsistency_score, selfconsistency_feedback = results_dict["self_consistency"]
|
| 385 |
+
fluency_passed, fluency_passed_score, fluency_feedback = results_dict["fluency"]
|
| 386 |
+
suitability_passed, suitability_score, suitability_feedback = results_dict["suitability"]
|
| 387 |
|
| 388 |
+
# Similarity check (local, so no need to parallelize)
|
| 389 |
similarity_passed, similarity_score, similarity_feedback = \
|
| 390 |
self._check_next_action_similarity(agent, tentative_action, threshold=self.max_action_similarity, enable_similarity_check=self.enable_quality_check_for_similarity)
|
| 391 |
|
|
|
|
| 477 |
|
| 478 |
if enable_proposition_check:
|
| 479 |
if agent.actions_count >= minimum_required_qty_of_actions:
|
| 480 |
+
# Cache check
|
| 481 |
+
cache_key = (id(proposition), agent.name, json.dumps(tentative_action, sort_keys=True))
|
| 482 |
+
if cache_key in self.evaluation_cache:
|
| 483 |
+
result = self.evaluation_cache[cache_key]
|
| 484 |
+
else:
|
| 485 |
+
result = proposition.score(target=agent, claim_variables={"action": tentative_action}, return_full_response=True)
|
| 486 |
+
self.evaluation_cache[cache_key] = result
|
| 487 |
|
| 488 |
value_with_justification = f"Score = {result['value']} (out of {Proposition.MAX_SCORE}). Justification = {result['justification']}"
|
| 489 |
|
tinytroupe/experimentation/proposition.py
CHANGED
|
@@ -15,7 +15,7 @@ class Proposition:
|
|
| 15 |
MAX_SCORE = 9
|
| 16 |
|
| 17 |
def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
|
| 18 |
-
double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None):
|
| 19 |
"""
|
| 20 |
Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
|
| 21 |
The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
|
|
@@ -55,6 +55,8 @@ class Proposition:
|
|
| 55 |
|
| 56 |
self.precondition_function = precondition_function
|
| 57 |
|
|
|
|
|
|
|
| 58 |
# the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
|
| 59 |
# the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
|
| 60 |
self.llm_chat = None
|
|
@@ -79,7 +81,8 @@ class Proposition:
|
|
| 79 |
last_n=self.last_n,
|
| 80 |
double_check=self.double_check,
|
| 81 |
use_reasoning_model=self.use_reasoning_model,
|
| 82 |
-
precondition_function=self.precondition_function
|
|
|
|
| 83 |
)
|
| 84 |
return new_prop
|
| 85 |
|
|
@@ -368,6 +371,9 @@ class Proposition:
|
|
| 368 |
return recommendation
|
| 369 |
|
| 370 |
def _model(self, use_reasoning_model):
|
|
|
|
|
|
|
|
|
|
| 371 |
if use_reasoning_model:
|
| 372 |
return default["reasoning_model"]
|
| 373 |
else:
|
|
|
|
| 15 |
MAX_SCORE = 9
|
| 16 |
|
| 17 |
def __init__(self, claim:str, target=None, include_personas:bool=False, first_n:int=None, last_n:int=None,
|
| 18 |
+
double_check:bool=False, use_reasoning_model:bool=False, precondition_function=None, model:str=None):
|
| 19 |
"""
|
| 20 |
Define a proposition as a (textual) claim about a target, which can be a TinyWorld, a TinyPerson or several of any.
|
| 21 |
The proposition's truth value can then either be checked as a boolean or computed as an integer score denoting the degree of truth.
|
|
|
|
| 55 |
|
| 56 |
self.precondition_function = precondition_function
|
| 57 |
|
| 58 |
+
self.model = model
|
| 59 |
+
|
| 60 |
# the chat with the LLM is preserved until the proposition is re-evaluated. While it is available,
|
| 61 |
# the chat can be used to follow up on the proposition, e.g., to ask for more details about the evaluation.
|
| 62 |
self.llm_chat = None
|
|
|
|
| 81 |
last_n=self.last_n,
|
| 82 |
double_check=self.double_check,
|
| 83 |
use_reasoning_model=self.use_reasoning_model,
|
| 84 |
+
precondition_function=self.precondition_function,
|
| 85 |
+
model=self.model
|
| 86 |
)
|
| 87 |
return new_prop
|
| 88 |
|
|
|
|
| 371 |
return recommendation
|
| 372 |
|
| 373 |
def _model(self, use_reasoning_model):
|
| 374 |
+
if self.model:
|
| 375 |
+
return self.model
|
| 376 |
+
|
| 377 |
if use_reasoning_model:
|
| 378 |
return default["reasoning_model"]
|
| 379 |
else:
|
tinytroupe/factory/tiny_person_factory.py
CHANGED
|
@@ -557,7 +557,7 @@ class TinyPersonFactory(TinyFactory):
|
|
| 557 |
# This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
|
| 558 |
if len(self.remaining_characteristics_sample) != n:
|
| 559 |
logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
|
| 560 |
-
|
| 561 |
# If we got more samples than requested, we truncate them to avoid generating too many names or personas.
|
| 562 |
if len(self.remaining_characteristics_sample) > n:
|
| 563 |
logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
|
|
|
|
| 557 |
# This is not a problem, as the sampling space is still valid and can be used, though it may not be as rich as expected.
|
| 558 |
if len(self.remaining_characteristics_sample) != n:
|
| 559 |
logger.warning(f"Expected {n} samples, but got {len(self.remaining_characteristics_sample)} samples. The LLM may have failed to sum up the quantities in the sampling plan correctly.")
|
| 560 |
+
|
| 561 |
# If we got more samples than requested, we truncate them to avoid generating too many names or personas.
|
| 562 |
if len(self.remaining_characteristics_sample) > n:
|
| 563 |
logger.info(f"Truncating {len(self.remaining_characteristics_sample)} samples to the requested {n} samples.")
|
tinytroupe/openai_utils.py
CHANGED
|
@@ -179,7 +179,7 @@ class OpenAIClient:
|
|
| 179 |
else:
|
| 180 |
current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
|
| 181 |
current_wait_time = 60
|
| 182 |
-
|
| 183 |
chat_api_params["model"] = current_model
|
| 184 |
|
| 185 |
try:
|
|
@@ -217,15 +217,15 @@ class OpenAIClient:
|
|
| 217 |
logger.error(f"[{i}] Invalid request error, won't retry: {e}")
|
| 218 |
return None
|
| 219 |
|
| 220 |
-
except (openai.RateLimitError,
|
| 221 |
-
openai.APITimeoutError,
|
| 222 |
-
openai.APIConnectionError,
|
| 223 |
-
openai.InternalServerError,
|
| 224 |
NonTerminalError,
|
| 225 |
Exception) as e:
|
| 226 |
msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
|
| 227 |
logger.warning(msg)
|
| 228 |
-
|
| 229 |
time.sleep(current_wait_time)
|
| 230 |
continue
|
| 231 |
|
|
@@ -250,12 +250,12 @@ class OpenAIClient:
|
|
| 250 |
chat_api_params["reasoning_effort"] = default["reasoning_effort"]
|
| 251 |
|
| 252 |
|
| 253 |
-
# To make the log cleaner, we remove the messages from the logged parameters,
|
| 254 |
# unless we are in debug mode
|
| 255 |
if logger.getEffectiveLevel() <= logging.DEBUG:
|
| 256 |
logged_params = chat_api_params
|
| 257 |
else:
|
| 258 |
-
logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
|
| 259 |
|
| 260 |
if "response_format" in chat_api_params:
|
| 261 |
# to enforce the response format via pydantic, we need to use a different method
|
|
|
|
| 179 |
else:
|
| 180 |
current_model = config["OpenAI"].get("FALLBACK_MODEL_HUGE", "alias-huge")
|
| 181 |
current_wait_time = 60
|
| 182 |
+
|
| 183 |
chat_api_params["model"] = current_model
|
| 184 |
|
| 185 |
try:
|
|
|
|
| 217 |
logger.error(f"[{i}] Invalid request error, won't retry: {e}")
|
| 218 |
return None
|
| 219 |
|
| 220 |
+
except (openai.RateLimitError,
|
| 221 |
+
openai.APITimeoutError,
|
| 222 |
+
openai.APIConnectionError,
|
| 223 |
+
openai.InternalServerError,
|
| 224 |
NonTerminalError,
|
| 225 |
Exception) as e:
|
| 226 |
msg = f"[{i}] {type(e).__name__} Error with {current_model}: {e}. Waiting {current_wait_time} seconds before next attempt..."
|
| 227 |
logger.warning(msg)
|
| 228 |
+
|
| 229 |
time.sleep(current_wait_time)
|
| 230 |
continue
|
| 231 |
|
|
|
|
| 250 |
chat_api_params["reasoning_effort"] = default["reasoning_effort"]
|
| 251 |
|
| 252 |
|
| 253 |
+
# To make the log cleaner, we remove the messages from the logged parameters,
|
| 254 |
# unless we are in debug mode
|
| 255 |
if logger.getEffectiveLevel() <= logging.DEBUG:
|
| 256 |
logged_params = chat_api_params
|
| 257 |
else:
|
| 258 |
+
logged_params = {k: v for k, v in chat_api_params.items() if k != "messages"}
|
| 259 |
|
| 260 |
if "response_format" in chat_api_params:
|
| 261 |
# to enforce the response format via pydantic, we need to use a different method
|
tinytroupe/utils/semantics.py
CHANGED
|
@@ -268,7 +268,7 @@ def compute_semantic_proximity(text1: str, text2: str, context: str = None) -> f
|
|
| 268 |
@llm()
|
| 269 |
def select_best_persona(criteria: str, personas: list) -> int:
|
| 270 |
"""
|
| 271 |
-
Given a set of criteria and a list of personas (each a dictionary),
|
| 272 |
select the index of the persona that best matches the criteria.
|
| 273 |
If no persona matches at all, return -1.
|
| 274 |
|
|
@@ -286,4 +286,3 @@ def select_best_persona(criteria: str, personas: list) -> int:
|
|
| 286 |
int: The index of the best matching persona, or -1 if none match.
|
| 287 |
"""
|
| 288 |
# llm decorator will handle the body of this function
|
| 289 |
-
|
|
|
|
| 268 |
@llm()
|
| 269 |
def select_best_persona(criteria: str, personas: list) -> int:
|
| 270 |
"""
|
| 271 |
+
Given a set of criteria and a list of personas (each a dictionary),
|
| 272 |
select the index of the persona that best matches the criteria.
|
| 273 |
If no persona matches at all, return -1.
|
| 274 |
|
|
|
|
| 286 |
int: The index of the best matching persona, or -1 if none match.
|
| 287 |
"""
|
| 288 |
# llm decorator will handle the body of this function
|
|
|