SolshineMisfit committed on
Commit 24432a6 · verified · 1 Parent(s): 6aae1a3

Switched to R1-1776 model for its huge context window - I hope it can code well

Files changed (1)
  1. app.py +80 -231
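In practice, the switch routes chat completions through huggingface_hub's InferenceClient instead of a dedicated inference endpoint. A minimal sketch of the call pattern the new code uses (provider and model id as in the diff below; assumes an HF API key is set in the environment):

import os
from huggingface_hub import InferenceClient

# Sketch of the call pattern this commit introduces (see diff below).
client = InferenceClient(provider="fireworks-ai", api_key=os.environ["HF_API_KEY"])
completion = client.chat.completions.create(
    model="perplexity-ai/r1-1776",
    messages=[{"role": "user", "content": "Write a haiku about long context windows."}],
    max_tokens=1500,
    temperature=0.5,
)
print(completion.choices[0].message.content)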
app.py CHANGED
@@ -350,265 +350,114 @@ def get_current_time_in_timezone(timezone: str) -> str:
 
 final_answer = FinalAnswerTool()
 
-# Remove the huggingface_api_key parameter - it's not supported
-model = HfApiModel(
-    max_tokens=2096,
-    temperature=0.5,
-    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # Using the backup endpoint
-    custom_role_conversions=None
-)
 
-# Add fallback logic that only activates if the primary model fails
-def manage_context(prompt, max_allowed_tokens=30000):
-    """Manages large contexts by summarizing or trimming when they get too big.
-
-    This helps avoid the 'inputs tokens + max_new_tokens must be <= 32768' error
-    by keeping the context size under control.
-
-    Args:
-        prompt: The full context/prompt that might be too large
-        max_allowed_tokens: Maximum number of tokens to allow before trimming
-
-    Returns:
-        A potentially shortened/summarized version of the prompt
-    """
-    # Rough token estimation (splitting on spaces is a crude approximation)
-    estimated_tokens = len(prompt.split())
-
-    # If below threshold, return as is
-    if estimated_tokens <= max_allowed_tokens:
-        return prompt
-
-    print(f"WARNING: Context size ({estimated_tokens} estimated tokens) exceeds limit ({max_allowed_tokens})")
-
-    # For extremely large prompts, we need more aggressive handling
-    if estimated_tokens > max_allowed_tokens * 1.5:
-        print("Performing aggressive context management")
-
-        # Approach 1: Keep only the most recent parts of the conversation
-        lines = prompt.strip().split('\n')
-
-        # Identify structural elements to keep
-        instruction_idx = -1
-        for i, line in enumerate(lines):
-            if "You are a" in line or "I want you to" in line:
-                instruction_idx = i
-
-        # Always keep the first part with instructions (system prompt)
-        keep_beginning = lines[:instruction_idx + 20] if instruction_idx >= 0 else lines[:50]
-
-        # Keep the most recent content (approximately half of the max tokens)
-        keep_end = lines[-int(max_allowed_tokens / 15):]
-
-        # Add a note about trimming
-        middle_note = [
-            "",
-            "...",
-            "[Context has been trimmed to fit token limits]",
-            "...",
-            ""
-        ]
-
-        # Combine parts
-        shortened_prompt = "\n".join(keep_beginning + middle_note + keep_end)
-        print(f"Context reduced from ~{estimated_tokens} to ~{len(shortened_prompt.split())} estimated tokens")
-        return shortened_prompt
-
-    # Moderate size reduction for moderately oversized prompts
-    else:
-        print("Performing moderate context management")
-
-        # Split into sections for easier processing
-        sections = prompt.split("\n\n")
-
-        # Keep important sections like system instructions and recent content
-        # Identify which sections to keep or trim
-        keep_sections = []
-        trim_sections = []
-
-        # Process each section
-        for i, section in enumerate(sections):
-            # Always keep the first few sections (likely instructions)
-            if i < 3:
-                keep_sections.append(section)
-            # Keep the last several sections (most recent and relevant)
-            elif i > len(sections) - 8:
-                keep_sections.append(section)
-            # For code blocks, we should generally keep them
-            elif "```" in section:
-                keep_sections.append(section)
-            # For very short sections, keep them
-            elif len(section.split()) < 30:
-                keep_sections.append(section)
-            # For sections with likely important content, keep them
-            elif any(marker in section.lower() for marker in ["important", "key", "critical", "necessary", "must"]):
-                keep_sections.append(section)
-            # Otherwise, candidate for trimming
-            else:
-                trim_sections.append(section)
-
-        # If we still need to trim more, start removing some of the trim_sections
-        if len(" ".join(keep_sections).split()) > max_allowed_tokens * 0.8:
-            # Keep only a portion of the trim_sections
-            trim_to_keep = int(len(trim_sections) * 0.3)  # Keep 30%
-            trim_sections = trim_sections[:trim_to_keep]
-
-        # Build final prompt with a note about trimming
-        final_sections = keep_sections + ["[Some context has been summarized to fit token limits]"] + trim_sections
-        final_prompt = "\n\n".join(final_sections)
-
-        print(f"Context reduced from ~{estimated_tokens} to ~{len(final_prompt.split())} estimated tokens")
-        return final_prompt
 
-# Now update the try_model_call_with_fallbacks function to use this context management
-def try_model_call_with_fallbacks(prompt):
-    """Try to use the primary model first with aggressive context management."""
-    # First, ALWAYS apply context management, but more aggressively
-    try:
-        # Get a rough token count estimate
-        estimated_tokens = len(prompt.split())
-        print(f"Estimated input tokens: {estimated_tokens}")
-
-        # Start with 25000 as the maximum (leaving ~6K tokens buffer for the model limits)
-        managed_prompt = manage_context(prompt, max_allowed_tokens=25000)
-
-        # If still potentially too large, reduce further
-        if len(managed_prompt.split()) > 24000:
-            print("First context reduction still too large, reducing further...")
-            managed_prompt = manage_context(managed_prompt, max_allowed_tokens=22000)
-
-        # Final emergency truncation if needed
-        if len(managed_prompt.split()) > 22000:
-            print("Emergency truncation required")
-            words = managed_prompt.split()
-            # Keep first 5000 and last 15000 words with a note in between
-            managed_prompt = " ".join(words[:5000]) + "\n\n[CONTEXT SEVERELY TRUNCATED]\n\n" + " ".join(words[-15000:])
-
-        print(f"Final managed prompt size: {len(managed_prompt.split())} estimated tokens")
-
-        # Temporarily reduce output tokens even further if the prompt is large
-        temp_max_tokens = model.max_tokens
-        if len(managed_prompt.split()) > 20000:
-            print("Large prompt detected, temporarily reducing output tokens")
-            model.max_tokens = 750  # Temporarily reduce to 750 for this call
-
         try:
-            result = original_call(managed_prompt)
-            model.max_tokens = temp_max_tokens  # Restore original setting
-            return result
-        except Exception as call_error:
-            # Restore original setting before handling the error
-            model.max_tokens = temp_max_tokens
-            raise call_error
-
-    except Exception as primary_error:
-        # If we still get a token limit error, try even more aggressive reduction
-        if "Input validation error: inputs tokens + max_new_tokens" in str(primary_error):
-            try:
-                print("Critical: Token limit exceeded despite context management. Implementing emergency measures...")
-                # Take a more drastic approach - keep only system instructions and the last part
-                lines = prompt.strip().split('\n')
-                # Keep the first 50 lines and last 100 lines only
-                emergency_prompt = "\n".join(lines[:50] + ["\n[MAJORITY OF CONTEXT REMOVED DUE TO TOKEN LIMITS]\n"] + lines[-100:])
-
-                # Reduce output tokens drastically
-                temp_max_tokens = model.max_tokens
-                model.max_tokens = 500
                 try:
-                    result = original_call(emergency_prompt)
-                    model.max_tokens = temp_max_tokens
-                    return result
-                except Exception:
-                    model.max_tokens = temp_max_tokens
-                    print("Emergency measures failed. Trying fallback models...")
-            except Exception:
-                print("Emergency context management failed. Proceeding to fallback models...")
-
-        print(f"Primary model call failed: {str(primary_error)}")
-        print("Trying fallback models...")
-
-        # Rest of the fallback logic remains the same...
-        # List of fallback models
-        fallbacks = [
-            {
-                "provider": "sambanova",
-                "model_name": "Qwen/Qwen2.5-Coder-32B-Instruct",
-                "display_name": "Qwen 2.5 Coder 32B"
-            },
-            {
-                "provider": "hf-inference",
-                "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
-                "display_name": "DeepSeek R1 Distill Qwen 32B"
-            }
-        ]
-
-        # Get the API key
-        api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
-        if not api_key:
-            raise ValueError("No Hugging Face API key found in environment variables")
-
-        # Try each fallback model in sequence with highly aggressive context management
-        for fallback in fallbacks:
-            try:
-                print(f"Trying fallback model: {fallback['display_name']}")
-                client = InferenceClient(provider=fallback["provider"], api_key=api_key)
-
-                # Apply even more aggressive context management for fallbacks
-                emergency_prompt = manage_context(prompt, max_allowed_tokens=15000)
-                messages = [{"role": "user", "content": emergency_prompt}]
-
-                completion = client.chat.completions.create(
-                    model=fallback["model_name"],
-                    messages=messages,
-                    max_tokens=1000,  # Reduced tokens for output
-                    temperature=0.5
-                )
-                print(f"Successfully used fallback model: {fallback['display_name']}")
-                return completion.choices[0].message.content
-            except Exception as e:
-                print(f"Fallback model {fallback['display_name']} failed: {str(e)}")
-                continue
-
-        # If all fallbacks fail, provide a useful error message
-        return "ERROR: Unable to process request due to context size limitations. Please break your request into smaller parts or simplify your query."
-
-# Monkey patch the model's __call__ method to use our fallback logic
-original_call = model.__call__
-model.__call__ = try_model_call_with_fallbacks
-
-# Reduce the model's output tokens immediately to improve chances of success
-model.max_tokens = 750  # Reduce from 2096 to 750 for all outputs by default
 
-# Import tool from Hub
-image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
 
-with open("prompts.yaml", 'r') as stream:
-    prompt_templates = yaml.safe_load(stream)
 
-# Update the agent to use more conservative settings
 agent = CodeAgent(
     model=model,
     tools=[
         final_answer,
         Sonar_Web_Search_Tool,
-        primary_search_tool,  # This is already set to either DuckDuckGo, Google, or fallback
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity,
-        visit_webpage_tool,  # This is correctly initialized as VisitWebpageTool()
     ],
-    max_steps=8,  # Reduce from 12 to 8
-    verbosity_level=0,  # Reduce from 1 to 0 to minimize internal conversation
     grammar=None,
     planning_interval=2,
     name="Research Assistant",
     description="""An AI assistant that can search the web, create datasets, and answer questions.
-    This assistant automatically manages token limits for better stability.""",
     prompt_templates=prompt_templates
 )
 
 # Add informative message about which search tool is being used
 print(f"Agent initialized with {search_tool_name} as primary search tool")
 print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}, get_current_time_in_timezone, image_generation_tool, Dataset_Creator_Tool, Check_Dataset_Validity, visit_webpage_tool")
@@ -622,7 +471,7 @@ print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}
 # To fix the TypeError in Gradio_UI.py, you would need to modify that file
 # For now, we'll just use the agent directly
 try:
-    GradioUI(agent).launch()
 except TypeError as e:
     if "unsupported operand type(s) for +=" in str(e):
         print("Error: Token counting issue in Gradio UI")
 
 
 final_answer = FinalAnswerTool()
 
+# Replace the current model with Perplexity AI R1-1776 (128K context window)
+
+# Import additional necessary modules
+from huggingface_hub import InferenceClient
+
+# Keep the original model definition but don't use it
+original_model = model
+
+# Create a new model implementation that uses the larger-context model through InferenceClient
+class PerplexityR1Model:
+    def __init__(self, temperature=0.5, max_tokens=1500):
+        """Initialize the Perplexity R1-1776 model with a 128K context window."""
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.model_name = "perplexity-ai/r1-1776"
+        self.provider = "fireworks-ai"
+        self.last_input_token_count = 0
+        # Get the API key
+        self.api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
+        if not self.api_key:
+            raise ValueError("No Hugging Face API key found in environment variables")
+        # Create the inference client
+        self.client = InferenceClient(provider=self.provider, api_key=self.api_key)
+        print("Initialized Perplexity R1-1776 model with 128K context window")
+
+    def __call__(self, prompt):
+        """Call the model with the prompt."""
+        # Simple token count estimation
+        self.last_input_token_count = len(prompt.split())
+        print(f"Sending approximately {self.last_input_token_count} tokens to Perplexity R1-1776")
+
+        # Convert the string prompt to messages format
+        messages = [{"role": "user", "content": prompt}]
+
         try:
+            # Call the model
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens
+            )
+
+            # Return just the content string to match HfApiModel's behavior
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"Error calling Perplexity R1-1776: {str(e)}")
+            # If we get an error with the large-context model, try our aggressive context trimming
+            if "context length" in str(e).lower() or "token limit" in str(e).lower():
+                print("Context length error with R1-1776 - trimming context and retrying")
+                # Use our existing context management function
+                trimmed_prompt = manage_context(prompt, max_allowed_tokens=90000)  # 90K to be safe
+                messages = [{"role": "user", "content": trimmed_prompt}]
                 try:
+                    completion = self.client.chat.completions.create(
+                        model=self.model_name,
+                        messages=messages,
+                        temperature=self.temperature,
+                        max_tokens=self.max_tokens
+                    )
+                    return completion.choices[0].message.content
+                except Exception as retry_error:
+                    print(f"Error on retry: {str(retry_error)}")
+                    # Fall back to an error message
+                    return f"ERROR: Model call failed even with reduced context. Please try a shorter query. Error: {str(retry_error)}"
+            else:
+                # For non-context errors, return the error message
+                return f"ERROR: {str(e)}"
+
+# Replace the model with our new implementation
+model = PerplexityR1Model(temperature=0.5, max_tokens=1500)
+
+# No need for complex context management or fallbacks now with the large context window,
+# but keep the functions in place in case they're needed as fallbacks
+
+# Update the agent with the new model and more steps
 agent = CodeAgent(
     model=model,
     tools=[
         final_answer,
         Sonar_Web_Search_Tool,
+        primary_search_tool,
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity,
+        visit_webpage_tool,
     ],
+    max_steps=12,  # Increase back to 12 since we have a large context window
+    verbosity_level=1,  # Increase to 1 since we have room
     grammar=None,
     planning_interval=2,
     name="Research Assistant",
     description="""An AI assistant that can search the web, create datasets, and answer questions.
+    Using the Perplexity R1-1776 model with a 128K token context window.""",
     prompt_templates=prompt_templates
 )
 
+# Add an informative message about the model
+print("Using Perplexity R1-1776 model with 128K token context window")
+
+# Import tool from Hub
+image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
+
+with open("prompts.yaml", 'r') as stream:
+    prompt_templates = yaml.safe_load(stream)
+
 # Add informative message about which search tool is being used
 print(f"Agent initialized with {search_tool_name} as primary search tool")
 print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}, get_current_time_in_timezone, image_generation_tool, Dataset_Creator_Tool, Check_Dataset_Validity, visit_webpage_tool")

 # To fix the TypeError in Gradio_UI.py, you would need to modify that file
 # For now, we'll just use the agent directly
 try:
+    GradioUI(agent).launch(share=True)
 except TypeError as e:
     if "unsupported operand type(s) for +=" in str(e):
         print("Error: Token counting issue in Gradio UI")