Spaces:
Runtime error
Upped max steps to 20, updated context management
app.py CHANGED
@@ -463,26 +463,72 @@ def manage_context(prompt, max_allowed_tokens=30000):
 
 # Now update the try_model_call_with_fallbacks function to use this context management
 def try_model_call_with_fallbacks(prompt):
-    """Try to use the primary model first …
-    # First …
+    """Try to use the primary model first with aggressive context management."""
+    # First, ALWAYS apply context management, but more aggressively
     try:
-        # …
-        …
-        …
+        # Get a rough token count estimate
+        estimated_tokens = len(prompt.split())
+        print(f"Estimated input tokens: {estimated_tokens}")
+
+        # Start with 25000 as the maximum (leaving ~6K tokens buffer for the model limits)
+        managed_prompt = manage_context(prompt, max_allowed_tokens=25000)
+
+        # If still potentially too large, reduce further
+        if len(managed_prompt.split()) > 24000:
+            print("First context reduction still too large, reducing further...")
+            managed_prompt = manage_context(managed_prompt, max_allowed_tokens=22000)
+
+        # Final emergency truncation if needed
+        if len(managed_prompt.split()) > 22000:
+            print("Emergency truncation required")
+            words = managed_prompt.split()
+            # Keep first 5000 and last 15000 words with a note in between
+            managed_prompt = " ".join(words[:5000]) + "\n\n[CONTEXT SEVERELY TRUNCATED]\n\n" + " ".join(words[-15000:])
+
+        print(f"Final managed prompt size: {len(managed_prompt.split())} estimated tokens")
+
+        # Temporarily reduce output tokens even further if the prompt is large
+        temp_max_tokens = model.max_tokens
+        if len(managed_prompt.split()) > 20000:
+            print("Large prompt detected, temporarily reducing output tokens")
+            model.max_tokens = 750 # Temporarily reduce to 750 for this call
 
+        try:
+            result = original_call(managed_prompt)
+            model.max_tokens = temp_max_tokens # Restore original setting
+            return result
+        except Exception as call_error:
+            # Restore original setting before handling the error
+            model.max_tokens = temp_max_tokens
+            raise call_error
+
     except Exception as primary_error:
-        # If …
+        # If we still get a token limit error, try even more aggressive reduction
         if "Input validation error: inputs tokens + max_new_tokens" in str(primary_error):
             try:
-                print("Token limit exceeded …
-                …
-                …
+                print("Critical: Token limit exceeded despite context management. Implementing emergency measures...")
+                # Take a more drastic approach - keep only system instructions and last part
+                lines = prompt.strip().split('\n')
+                # Keep first 50 lines and last 100 lines only
+                emergency_prompt = "\n".join(lines[:50] + ["\n[MAJORITY OF CONTEXT REMOVED DUE TO TOKEN LIMITS]\n"] + lines[-100:])
+
+                # Reduce output tokens drastically
+                temp_max_tokens = model.max_tokens
+                model.max_tokens = 500
+                try:
+                    result = original_call(emergency_prompt)
+                    model.max_tokens = temp_max_tokens
+                    return result
+                except Exception:
+                    model.max_tokens = temp_max_tokens
+                    print("Emergency measures failed. Trying fallback models...")
             except Exception:
-                print(" …
+                print("Emergency context management failed. Proceeding to fallback models...")
 
         print(f"Primary model call failed: {str(primary_error)}")
         print("Trying fallback models...")
 
+        # Rest of fallback logic remains the same...
         # List of fallback models
         fallbacks = [
             {
@@ -502,16 +548,20 @@ def try_model_call_with_fallbacks(prompt):
         if not api_key:
             raise ValueError("No Hugging Face API key found in environment variables")
 
-        # Try each fallback model in sequence
+        # Try each fallback model in sequence with highly aggressive context management
        for fallback in fallbacks:
             try:
                 print(f"Trying fallback model: {fallback['display_name']}")
                 client = InferenceClient(provider=fallback["provider"], api_key=api_key)
-                …
+
+                # Apply even more aggressive context management for fallbacks
+                emergency_prompt = manage_context(prompt, max_allowed_tokens=15000)
+                messages = [{"role": "user", "content": emergency_prompt}]
+
                 completion = client.chat.completions.create(
                     model=fallback["model_name"],
                     messages=messages,
-                    max_tokens=…
+                    max_tokens=1000, # Reduced tokens for output
                     temperature=0.5
                 )
                 print(f"Successfully used fallback model: {fallback['display_name']}")
@@ -520,13 +570,16 @@ def try_model_call_with_fallbacks(prompt):
                 print(f"Fallback model {fallback['display_name']} failed: {str(e)}")
                 continue
 
-        # If all fallbacks fail, …
-        …
+        # If all fallbacks fail, provide a useful error message
+        return "ERROR: Unable to process request due to context size limitations. Please break your request into smaller parts or simplify your query."
 
 # Monkey patch the model's __call__ method to use our fallback logic
 original_call = model.__call__
 model.__call__ = try_model_call_with_fallbacks
 
+# Reduce the model's output tokens immediately to improve chances of success
+model.max_tokens = 1000 # Reduce from 2096 to 1000 to stay under token limits
+
 # Import tool from Hub
 image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
 
@@ -546,7 +599,7 @@ agent = CodeAgent(
         Check_Dataset_Validity,
         visit_webpage_tool, # This is correctly initialized as VisitWebpageTool()
     ],
-    max_steps=…
+    max_steps=20,
     verbosity_level=1,
     grammar=None,
     planning_interval=3,

(Deleted lines marked … could not be recovered from the page; the old max_tokens and max_steps values are not shown.)
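A note on manage_context: only its signature appears in the hunk header, so its body is not shown above. For orientation, a minimal sketch of a plausible shape, assuming it uses the same word-count heuristic as the rest of the diff; the actual implementation in app.py may differ:

    def manage_context(prompt, max_allowed_tokens=30000):
        # Word count as a rough token proxy, matching the heuristic used in the diff
        words = prompt.split()
        if len(words) <= max_allowed_tokens:
            return prompt
        # Keep the head (instructions) and the tail (recent turns), drop the middle
        head = max_allowed_tokens // 5
        tail = max_allowed_tokens - head
        return " ".join(words[:head]) + "\n\n[CONTEXT TRUNCATED]\n\n" + " ".join(words[-tail:])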
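The diff estimates tokens with len(prompt.split()), which undercounts for code-heavy prompts since tokenizers often emit several tokens per word. A sketch of a tighter estimate, assuming the transformers library is available; the model id below is a placeholder, not necessarily the one the Space uses:

    from transformers import AutoTokenizer

    # Placeholder model id - substitute the model actually configured in app.py
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")

    def estimated_token_count(text):
        # encode() returns token ids, so its length is the real token count
        return len(tokenizer.encode(text))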
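The save/restore dance around model.max_tokens is repeated in three places, each needing its own except branch to undo the change. A sketch of a context manager that centralizes the restore and makes it exception-safe (temporary_max_tokens is an illustrative name, not part of the commit):

    from contextlib import contextmanager

    @contextmanager
    def temporary_max_tokens(model, limit):
        saved = model.max_tokens
        model.max_tokens = limit
        try:
            yield
        finally:
            model.max_tokens = saved  # restored even if the call raises

Each try/except pair in try_model_call_with_fallbacks would then collapse to:

    with temporary_max_tokens(model, 750):
        result = original_call(managed_prompt)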
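One caveat on the monkey patch: assigning model.__call__ on the instance only intercepts explicit model.__call__(prompt) calls, because the model(prompt) syntax looks __call__ up on the type. Whether the patch takes effect therefore depends on how the agent framework invokes the model. A sketch of a class-level patch that covers both call styles (install_fallback is an illustrative name):

    def install_fallback(model):
        def patched(self, prompt, *args, **kwargs):
            # Route every invocation through the fallback-aware wrapper
            return try_model_call_with_fallbacks(prompt)
        # Special-method lookup happens on the type, so patch the class;
        # note this affects every instance of the class, not just this model.
        type(model).__call__ = patched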