SolshineMisfit committed on
Commit 24432a6 · verified · 1 Parent(s): 6aae1a3

Switched to R1-1776 model for its huge context window - I hope it can code well

Files changed (1)
  1. app.py +80 -231
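In practice, the switch routes chat completions through huggingface_hub's InferenceClient instead of a dedicated inference endpoint. A minimal sketch of the call pattern the new code uses (provider and model id as in the diff below; assumes an HF API key is set in the environment):

import os
from huggingface_hub import InferenceClient

# Sketch of the call pattern this commit introduces (see diff below).
client = InferenceClient(provider="fireworks-ai", api_key=os.environ["HF_API_KEY"])
completion = client.chat.completions.create(
    model="perplexity-ai/r1-1776",
    messages=[{"role": "user", "content": "Write a haiku about long context windows."}],
    max_tokens=1500,
    temperature=0.5,
)
print(completion.choices[0].message.content)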
app.py CHANGED
@@ -350,265 +350,114 @@ def get_current_time_in_timezone(timezone: str) -> str:
 
 final_answer = FinalAnswerTool()
 
-# Remove the huggingface_api_key parameter - it's not supported
-model = HfApiModel(
-    max_tokens=2096,
-    temperature=0.5,
-    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # Using the backup endpoint
-    custom_role_conversions=None
-)
 
-# Add fallback logic that only activates if the primary model fails
-def manage_context(prompt, max_allowed_tokens=30000):
-    """Manages large contexts by summarizing or trimming when they get too big.
-
-    This helps avoid the 'inputs tokens + max_new_tokens must be <= 32768' error
-    by keeping the context size under control.
-
-    Args:
-        prompt: The full context/prompt that might be too large
-        max_allowed_tokens: Maximum number of tokens to allow before trimming
-
-    Returns:
-        A potentially shortened/summarized version of the prompt
-    """
-    # Rough token estimation (splitting on spaces is a crude approximation)
-    estimated_tokens = len(prompt.split())
-
-    # If below threshold, return as is
-    if estimated_tokens <= max_allowed_tokens:
-        return prompt
-
-    print(f"WARNING: Context size ({estimated_tokens} estimated tokens) exceeds limit ({max_allowed_tokens})")
-
-    # For extremely large prompts, we need more aggressive handling
-    if estimated_tokens > max_allowed_tokens * 1.5:
-        print("Performing aggressive context management")
-
-        # Approach 1: Keep only the most recent parts of the conversation
-        lines = prompt.strip().split('\n')
-
-        # Identify structural elements to keep
-        instruction_idx = -1
-        for i, line in enumerate(lines):
-            if "You are a" in line or "I want you to" in line:
-                instruction_idx = i
-
-        # Always keep the first part with instructions (system prompt)
-        keep_beginning = lines[:instruction_idx + 20] if instruction_idx >= 0 else lines[:50]
-
-        # Keep the most recent content (approximately half of the max tokens)
-        keep_end = lines[-int(max_allowed_tokens / 15):]
-
-        # Add a note about trimming
-        middle_note = [
-            "",
-            "...",
-            "[Context has been trimmed to fit token limits]",
-            "...",
-            ""
-        ]
-
-        # Combine parts
-        shortened_prompt = "\n".join(keep_beginning + middle_note + keep_end)
-        print(f"Context reduced from ~{estimated_tokens} to ~{len(shortened_prompt.split())} estimated tokens")
-        return shortened_prompt
-
-    # Moderate size reduction for moderately oversized prompts
-    else:
-        print("Performing moderate context management")
-
-        # Split into sections for easier processing
-        sections = prompt.split("\n\n")
-
-        # Keep important sections like system instructions and recent content
-        # Identify which sections to keep or trim
-        keep_sections = []
-        trim_sections = []
-
-        # Process each section
-        for i, section in enumerate(sections):
-            # Always keep the first few sections (likely instructions)
-            if i < 3:
-                keep_sections.append(section)
-            # Keep the last several sections (most recent and relevant)
-            elif i > len(sections) - 8:
-                keep_sections.append(section)
-            # For code blocks, we should generally keep them
-            elif "```" in section:
-                keep_sections.append(section)
-            # For very short sections, keep them
-            elif len(section.split()) < 30:
-                keep_sections.append(section)
-            # For sections with likely important content, keep them
-            elif any(marker in section.lower() for marker in ["important", "key", "critical", "necessary", "must"]):
-                keep_sections.append(section)
-            # Otherwise, candidate for trimming
-            else:
-                trim_sections.append(section)
-
-        # If we still need to trim more, start removing some of the trim_sections
-        if len(" ".join(keep_sections).split()) > max_allowed_tokens * 0.8:
-            # Keep only a portion of the trim_sections
-            trim_to_keep = int(len(trim_sections) * 0.3)  # Keep 30%
-            trim_sections = trim_sections[:trim_to_keep]
-
-        # Build final prompt with a note about trimming
-        final_sections = keep_sections + ["[Some context has been summarized to fit token limits]"] + trim_sections
-        final_prompt = "\n\n".join(final_sections)
-
-        print(f"Context reduced from ~{estimated_tokens} to ~{len(final_prompt.split())} estimated tokens")
-        return final_prompt
 
-# Now update the try_model_call_with_fallbacks function to use this context management
-def try_model_call_with_fallbacks(prompt):
-    """Try to use the primary model first with aggressive context management."""
-    # First, ALWAYS apply context management, but more aggressively
-    try:
-        # Get a rough token count estimate
-        estimated_tokens = len(prompt.split())
-        print(f"Estimated input tokens: {estimated_tokens}")
-
-        # Start with 25000 as the maximum (leaving ~6K tokens buffer for the model limits)
-        managed_prompt = manage_context(prompt, max_allowed_tokens=25000)
-
-        # If still potentially too large, reduce further
-        if len(managed_prompt.split()) > 24000:
-            print("First context reduction still too large, reducing further...")
-            managed_prompt = manage_context(managed_prompt, max_allowed_tokens=22000)
-
-        # Final emergency truncation if needed
-        if len(managed_prompt.split()) > 22000:
-            print("Emergency truncation required")
-            words = managed_prompt.split()
-            # Keep first 5000 and last 15000 words with a note in between
-            managed_prompt = " ".join(words[:5000]) + "\n\n[CONTEXT SEVERELY TRUNCATED]\n\n" + " ".join(words[-15000:])
-
-        print(f"Final managed prompt size: {len(managed_prompt.split())} estimated tokens")
-
-        # Temporarily reduce output tokens even further if the prompt is large
-        temp_max_tokens = model.max_tokens
-        if len(managed_prompt.split()) > 20000:
-            print("Large prompt detected, temporarily reducing output tokens")
-            model.max_tokens = 750  # Temporarily reduce to 750 for this call
-
         try:
-            result = original_call(managed_prompt)
-            model.max_tokens = temp_max_tokens  # Restore original setting
-            return result
-        except Exception as call_error:
-            # Restore original setting before handling the error
-            model.max_tokens = temp_max_tokens
-            raise call_error
-
-    except Exception as primary_error:
-        # If we still get a token limit error, try even more aggressive reduction
-        if "Input validation error: inputs tokens + max_new_tokens" in str(primary_error):
-            try:
-                print("Critical: Token limit exceeded despite context management. Implementing emergency measures...")
-                # Take a more drastic approach - keep only system instructions and the last part
-                lines = prompt.strip().split('\n')
-                # Keep the first 50 lines and last 100 lines only
-                emergency_prompt = "\n".join(lines[:50] + ["\n[MAJORITY OF CONTEXT REMOVED DUE TO TOKEN LIMITS]\n"] + lines[-100:])
-
-                # Reduce output tokens drastically
-                temp_max_tokens = model.max_tokens
-                model.max_tokens = 500
                 try:
-                    result = original_call(emergency_prompt)
-                    model.max_tokens = temp_max_tokens
-                    return result
-                except Exception:
-                    model.max_tokens = temp_max_tokens
-                    print("Emergency measures failed. Trying fallback models...")
-            except Exception:
-                print("Emergency context management failed. Proceeding to fallback models...")
-
-        print(f"Primary model call failed: {str(primary_error)}")
-        print("Trying fallback models...")
-
-        # Rest of the fallback logic remains the same...
-        # List of fallback models
-        fallbacks = [
-            {
-                "provider": "sambanova",
-                "model_name": "Qwen/Qwen2.5-Coder-32B-Instruct",
-                "display_name": "Qwen 2.5 Coder 32B"
-            },
-            {
-                "provider": "hf-inference",
-                "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
-                "display_name": "DeepSeek R1 Distill Qwen 32B"
-            }
-        ]
-
-        # Get the API key
-        api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
-        if not api_key:
-            raise ValueError("No Hugging Face API key found in environment variables")
-
-        # Try each fallback model in sequence with highly aggressive context management
-        for fallback in fallbacks:
-            try:
-                print(f"Trying fallback model: {fallback['display_name']}")
-                client = InferenceClient(provider=fallback["provider"], api_key=api_key)
-
-                # Apply even more aggressive context management for fallbacks
-                emergency_prompt = manage_context(prompt, max_allowed_tokens=15000)
-                messages = [{"role": "user", "content": emergency_prompt}]
-
-                completion = client.chat.completions.create(
-                    model=fallback["model_name"],
-                    messages=messages,
-                    max_tokens=1000,  # Reduced tokens for output
-                    temperature=0.5
-                )
-                print(f"Successfully used fallback model: {fallback['display_name']}")
-                return completion.choices[0].message.content
-            except Exception as e:
-                print(f"Fallback model {fallback['display_name']} failed: {str(e)}")
-                continue
-
-        # If all fallbacks fail, provide a useful error message
-        return "ERROR: Unable to process request due to context size limitations. Please break your request into smaller parts or simplify your query."
-
-# Monkey patch the model's __call__ method to use our fallback logic
-original_call = model.__call__
-model.__call__ = try_model_call_with_fallbacks
-
-# Reduce the model's output tokens immediately to improve chances of success
-model.max_tokens = 750  # Reduce from 2096 to 750 for all outputs by default
 
-# Import tool from Hub
-image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
 
-with open("prompts.yaml", 'r') as stream:
-    prompt_templates = yaml.safe_load(stream)
 
-# Update the agent to use more conservative settings
 agent = CodeAgent(
     model=model,
     tools=[
         final_answer,
         Sonar_Web_Search_Tool,
-        primary_search_tool,  # This is already set to either DuckDuckGo, Google, or fallback
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity,
-        visit_webpage_tool,  # This is correctly initialized as VisitWebpageTool()
     ],
-    max_steps=8,  # Reduce from 12 to 8
-    verbosity_level=0,  # Reduce from 1 to 0 to minimize internal conversation
     grammar=None,
     planning_interval=2,
     name="Research Assistant",
     description="""An AI assistant that can search the web, create datasets, and answer questions.
-    This assistant automatically manages token limits for better stability.""",
     prompt_templates=prompt_templates
 )
 
 # Add informative message about which search tool is being used
 print(f"Agent initialized with {search_tool_name} as primary search tool")
 print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}, get_current_time_in_timezone, image_generation_tool, Dataset_Creator_Tool, Check_Dataset_Validity, visit_webpage_tool")
@@ -622,7 +471,7 @@ print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}
 # To fix the TypeError in Gradio_UI.py, you would need to modify that file
 # For now, we'll just use the agent directly
 try:
-    GradioUI(agent).launch()
 except TypeError as e:
     if "unsupported operand type(s) for +=" in str(e):
         print("Error: Token counting issue in Gradio UI")
 
 
 final_answer = FinalAnswerTool()
 
+# Replace the current model with Perplexity AI R1-1776 (128K context window)
+
+# Import additional necessary modules
+from huggingface_hub import InferenceClient
+
+# Keep the original model definition but don't use it
+original_model = model
+
+# Create a new model implementation that uses the larger-context model through InferenceClient
+class PerplexityR1Model:
+    def __init__(self, temperature=0.5, max_tokens=1500):
+        """Initialize the Perplexity R1-1776 model with a 128K context window."""
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.model_name = "perplexity-ai/r1-1776"
+        self.provider = "fireworks-ai"
+        self.last_input_token_count = 0
+        # Get the API key
+        self.api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
+        if not self.api_key:
+            raise ValueError("No Hugging Face API key found in environment variables")
+        # Create the inference client
+        self.client = InferenceClient(provider=self.provider, api_key=self.api_key)
+        print("Initialized Perplexity R1-1776 model with 128K context window")
+
+    def __call__(self, prompt):
+        """Call the model with the prompt."""
+        # Simple token count estimation
+        self.last_input_token_count = len(prompt.split())
+        print(f"Sending approximately {self.last_input_token_count} tokens to Perplexity R1-1776")
+
+        # Convert the string prompt to messages format
+        messages = [{"role": "user", "content": prompt}]
+
         try:
+            # Call the model
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens
+            )
+
+            # Return just the content string to match HfApiModel's behavior
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"Error calling Perplexity R1-1776: {str(e)}")
+            # If we get an error with the large-context model, try our aggressive context trimming
+            if "context length" in str(e).lower() or "token limit" in str(e).lower():
+                print("Context length error with R1-1776 - trimming context and retrying")
+                # Use our existing context management function
+                trimmed_prompt = manage_context(prompt, max_allowed_tokens=90000)  # 90K to be safe
+                messages = [{"role": "user", "content": trimmed_prompt}]
                 try:
+                    completion = self.client.chat.completions.create(
+                        model=self.model_name,
+                        messages=messages,
+                        temperature=self.temperature,
+                        max_tokens=self.max_tokens
+                    )
+                    return completion.choices[0].message.content
+                except Exception as retry_error:
+                    print(f"Error on retry: {str(retry_error)}")
+                    # Fall back to an error message
+                    return f"ERROR: Model call failed even with reduced context. Please try a shorter query. Error: {str(retry_error)}"
+            else:
+                # For non-context errors, return the error message
+                return f"ERROR: {str(e)}"
+
+# Replace the model with our new implementation
+model = PerplexityR1Model(temperature=0.5, max_tokens=1500)
+
+# No need for complex context management or fallbacks now with the large context window,
+# but keep the functions in place in case they're needed as fallbacks
+
+# Update the agent with the new model and more steps
 agent = CodeAgent(
     model=model,
     tools=[
         final_answer,
         Sonar_Web_Search_Tool,
+        primary_search_tool,
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity,
+        visit_webpage_tool,
     ],
+    max_steps=12,  # Increase back to 12 since we have a large context window
+    verbosity_level=1,  # Increase to 1 since we have room
     grammar=None,
     planning_interval=2,
     name="Research Assistant",
     description="""An AI assistant that can search the web, create datasets, and answer questions.
+    Using the Perplexity R1-1776 model with a 128K token context window.""",
     prompt_templates=prompt_templates
 )
 
+# Add an informative message about the model
+print("Using Perplexity R1-1776 model with 128K token context window")
+
+# Import tool from Hub
+image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
+
+with open("prompts.yaml", 'r') as stream:
+    prompt_templates = yaml.safe_load(stream)
+
 # Add informative message about which search tool is being used
 print(f"Agent initialized with {search_tool_name} as primary search tool")
 print(f"Available tools: final_answer, Sonar_Web_Search_Tool, {search_tool_name}, get_current_time_in_timezone, image_generation_tool, Dataset_Creator_Tool, Check_Dataset_Validity, visit_webpage_tool")

 # To fix the TypeError in Gradio_UI.py, you would need to modify that file
 # For now, we'll just use the agent directly
 try:
+    GradioUI(agent).launch(share=True)
 except TypeError as e:
     if "unsupported operand type(s) for +=" in str(e):
         print("Error: Token counting issue in Gradio UI")