Implemented a context management method that monitors the token count and summarizes or trims parts of the context when it gets too large
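The fix relies on a crude whitespace-based token estimate and a default threshold of 30,000 tokens, leaving headroom under the backend's 32,768-token hard limit (a real tokenizer typically produces more tokens than a whitespace split). A minimal sketch of that estimation idea, with a made-up prompt; the tokenizer comparison is illustrative, not part of this commit:

# Illustrative only -- not part of this commit.
def estimate_tokens(text):
    # Same crude approximation used in the patch: split on whitespace.
    return len(text.split())

prompt = "Summarize the following conversation in three bullet points."
print(estimate_tokens(prompt))  # 8 whitespace-delimited words

# A real subword tokenizer would usually count more tokens, e.g.:
# from transformers import AutoTokenizer
# tok = AutoTokenizer.from_pretrained("gpt2")
# print(len(tok.encode(prompt)))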
app.py CHANGED
@@ -359,12 +359,127 @@ model = HfApiModel(
 )
 
 # Add fallback logic that only activates if the primary model fails
+def manage_context(prompt, max_allowed_tokens=30000):
+    """Manages large contexts by summarizing or trimming when they get too big.
+
+    This helps avoid the 'inputs tokens + max_new_tokens must be <= 32768' error
+    by keeping the context size under control.
+
+    Args:
+        prompt: The full context/prompt that might be too large
+        max_allowed_tokens: Maximum number of tokens to allow before trimming
+
+    Returns:
+        A potentially shortened/summarized version of the prompt
+    """
+    # Rough token estimation (splitting on spaces is a crude approximation)
+    estimated_tokens = len(prompt.split())
+
+    # If below threshold, return as is
+    if estimated_tokens <= max_allowed_tokens:
+        return prompt
+
+    print(f"WARNING: Context size ({estimated_tokens} estimated tokens) exceeds limit ({max_allowed_tokens})")
+
+    # For extremely large prompts, we need more aggressive handling
+    if estimated_tokens > max_allowed_tokens * 1.5:
+        print("Performing aggressive context management")
+
+        # Approach 1: Keep only the most recent parts of the conversation
+        lines = prompt.strip().split('\n')
+
+        # Identify structural elements to keep
+        instruction_idx = -1
+        for i, line in enumerate(lines):
+            if "You are a" in line or "I want you to" in line:
+                instruction_idx = i
+
+        # Always keep the first part with instructions (system prompt)
+        keep_beginning = lines[:instruction_idx + 20] if instruction_idx >= 0 else lines[:50]
+
+        # Keep the most recent lines (assumes roughly 15 tokens per line)
+        keep_end = lines[-int(max_allowed_tokens / 15):]
+
+        # Add a note about trimming
+        middle_note = [
+            "",
+            "...",
+            "[Context has been trimmed to fit token limits]",
+            "...",
+            ""
+        ]
+
+        # Combine parts
+        shortened_prompt = "\n".join(keep_beginning + middle_note + keep_end)
+        print(f"Context reduced from ~{estimated_tokens} to ~{len(shortened_prompt.split())} estimated tokens")
+        return shortened_prompt
+
+    # Moderate size reduction for moderately oversized prompts
+    else:
+        print("Performing moderate context management")
+
+        # Split into paragraph sections for easier processing
+        sections = prompt.split("\n\n")
+
+        # Keep important sections like system instructions and recent content;
+        # identify which sections to keep or trim
+        keep_sections = []
+        trim_sections = []
+
+        # Process each section
+        for i, section in enumerate(sections):
+            # Always keep the first few sections (likely instructions)
+            if i < 3:
+                keep_sections.append(section)
+            # Keep the last several sections (most recent and relevant)
+            elif i > len(sections) - 8:
+                keep_sections.append(section)
+            # For code blocks, we should generally keep them
+            elif "```" in section:
+                keep_sections.append(section)
+            # For very short sections, keep them
+            elif len(section.split()) < 30:
+                keep_sections.append(section)
+            # For sections with likely important content, keep them
+            elif any(marker in section.lower() for marker in ["important", "key", "critical", "necessary", "must"]):
+                keep_sections.append(section)
+            # Otherwise, candidate for trimming
+            else:
+                trim_sections.append(section)
+
+        # If the kept sections alone are near the limit, drop most of the trim candidates
+        if len(" ".join(keep_sections).split()) > max_allowed_tokens * 0.8:
+            # Keep only a portion of the trim_sections
+            trim_to_keep = int(len(trim_sections) * 0.3)  # Keep 30%
+            trim_sections = trim_sections[:trim_to_keep]
+
+        # Build final prompt with a note about trimming
+        final_sections = keep_sections + ["[Some context has been summarized to fit token limits]"] + trim_sections
+        final_prompt = "\n\n".join(final_sections)
+
+        print(f"Context reduced from ~{estimated_tokens} to ~{len(final_prompt.split())} estimated tokens")
+        return final_prompt
+
+
+# Now update the try_model_call_with_fallbacks function to use this context management
 def try_model_call_with_fallbacks(prompt):
     """Try to use the primary model first, fall back to alternatives if it fails."""
     # First attempt with primary model
     try:
-        return original_call(prompt)
+        # Apply context management
+        managed_prompt = manage_context(prompt)
+
+        return original_call(managed_prompt)
     except Exception as primary_error:
+        # If it's a token limit error, try more aggressive management
+        if "Input validation error: inputs tokens + max_new_tokens" in str(primary_error):
+            try:
+                print("Token limit exceeded. Trying more aggressive context management...")
+                more_managed_prompt = manage_context(prompt, max_allowed_tokens=20000)
+                return original_call(more_managed_prompt)
+            except Exception:
+                print("Token reduction failed. Proceeding to fallback models...")
+
         print(f"Primary model call failed: {str(primary_error)}")
         print("Trying fallback models...")
 

@@ -392,11 +507,11 @@ def try_model_call_with_fallbacks(prompt):
         try:
             print(f"Trying fallback model: {fallback['display_name']}")
             client = InferenceClient(provider=fallback["provider"], api_key=api_key)
-            messages = [{"role": "user", "content": prompt}]
+            messages = [{"role": "user", "content": manage_context(prompt, 25000)}]  # Apply context management for fallbacks too
             completion = client.chat.completions.create(
                 model=fallback["model_name"],
                 messages=messages,
-                max_tokens=
+                max_tokens=1800,
                 temperature=0.5
             )
             print(f"Successfully used fallback model: {fallback['display_name']}")
|