walidsobhie-code Claude Opus 4.6 committed on
Commit
35682cb
·
1 Parent(s): f4fd1fb

feat: Add evaluation and pattern management to CLI

Browse files

- Add run_eval method for MBPP, HumanEval, GSM8K benchmarks
- Add run_patterns method for pattern management
- Add CLI args: --eval, --patterns, --train

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. stack_cli/cli.py +119 -3
stack_cli/cli.py CHANGED
@@ -338,7 +338,74 @@ class StackCLI:
338
  result = self.command_mode.execute_tools(tools, out_file)
339
  print(result)
340
  return result
341
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  def run_voice(self):
343
  """Run voice mode loop."""
344
  if not self.voice.available:
@@ -417,13 +484,62 @@ Examples:
417
  default="/Users/walidsobhi/.openclaw/workspace",
418
  help="Workspace path"
419
  )
420
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  args = parser.parse_args()
422
 
423
  try:
424
  # Create CLI with custom workspace if provided
425
  cli = StackCLI()
426
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  if args.voice:
428
  cli.run_voice()
429
  elif args.tools:
 
338
  result = self.command_mode.execute_tools(tools, out_file)
339
  print(result)
340
  return result
341
+
342
def run_eval(self, benchmark: str, provider: str = 'ollama', model: str = None):
    """Run an evaluation benchmark and print its results.

    Args:
        benchmark: One of 'mbpp', 'human_eval', 'gsm8k', or 'all' to run
            the three benchmarks one after another.
        provider: Forwarded to the benchmark class as ``model_provider``.
        model: Forwarded to the benchmark class as ``model_name``; may be
            None.

    Returns:
        None. Results are printed. Exceptions raised while importing or
        running a benchmark are caught and printed as an error line
        instead of propagating.
    """
    import importlib
    import sys
    from pathlib import Path

    print_colored(f"\n=== Running {benchmark} benchmark ===", "blue")

    # benchmark key -> (class name, module that defines it)
    benchmarks = {
        'mbpp': ('MBPP', 'benchmarks.mbpp'),
        'human_eval': ('HumanEval', 'benchmarks.human_eval'),
        'gsm8k': ('GSM8K', 'benchmarks.gsm8k'),
    }

    # Reject unknown names up front; otherwise the only symptom would be an
    # opaque unbound-variable error reported by the handler below.
    if benchmark != 'all' and benchmark not in benchmarks:
        print_colored(f"Error: unknown benchmark '{benchmark}'", "red")
        return

    # The benchmark package lives in a sibling directory that is not a
    # regular package, so it is made importable via sys.path. Guard against
    # inserting the same entry again on repeated calls.
    eval_dir = Path(__file__).parent.parent / "stack-2.9-eval"
    if eval_dir.exists() and str(eval_dir) not in sys.path:
        sys.path.insert(0, str(eval_dir))

    def load(key):
        """Import the benchmark class for *key* and instantiate it."""
        class_name, module_name = benchmarks[key]
        benchmark_cls = getattr(importlib.import_module(module_name), class_name)
        return benchmark_cls(model_provider=provider, model_name=model)

    try:
        if benchmark == 'all':
            # Summary mode: only the accuracy of each benchmark is shown.
            for key, (label, _) in benchmarks.items():
                print_colored(f"\n--- {label} ---", "yellow")
                results = load(key).evaluate()
                print(f" Accuracy: {results['accuracy']*100:.1f}%")
            return

        results = load(benchmark).evaluate()
        print_colored("\nResults:", "green")
        print(f" Accuracy: {results['accuracy']*100:.1f}%")
        print(f" Passed: {results['pass_at_1']}/{results['total_cases']}")
        print(f" Model: {results['model']}")
    except Exception as e:
        # Top-level CLI boundary: report the failure rather than crash.
        print_colored(f"Error: {e}", "red")
380
+
381
def run_patterns(self, action: str):
    """Inspect the learned patterns stored by the pattern miner.

    Args:
        action: 'list' prints up to 20 stored patterns (type plus the
            first 50 characters of each snippet); 'stats' prints feedback
            and pattern counts with the success rate. Any other value is
            reported as unsupported.

    Returns:
        None. Output is printed. Exceptions raised while importing or
        querying the miner are caught and printed as an error line.
    """
    import sys
    from pathlib import Path

    print_colored(f"\n=== Pattern Management ===", "blue")

    # The command line also offers 'clear', but there is no handler for it
    # here; say so explicitly instead of silently doing nothing.
    if action not in ('list', 'stats'):
        print_colored(f"Error: pattern action '{action}' is not supported", "red")
        return

    # pattern_miner lives in a sibling directory that is made importable
    # via sys.path; avoid inserting the same entry on repeated calls.
    train_dir = Path(__file__).parent.parent / "stack-2.9-training"
    if train_dir.exists() and str(train_dir) not in sys.path:
        sys.path.insert(0, str(train_dir))

    try:
        from pattern_miner import PatternMiner
        miner = PatternMiner()

        if action == 'list':
            patterns = miner.get_relevant_patterns(limit=20)
            print_colored(f"\nStored Patterns:", "yellow")
            for p in patterns:
                print(f" [{p.pattern_type}] {p.code_snippet[:50]}...")
        else:  # action == 'stats'
            stats = miner.get_statistics()
            print_colored(f"\nStatistics:", "yellow")
            print(f" Total Feedback: {stats['total_feedback']}")
            print(f" Success Rate: {stats['success_rate']:.1%}")
            print(f" Total Patterns: {stats['total_patterns']}")
    except Exception as e:
        # Top-level CLI boundary: report the failure rather than crash.
        print_colored(f"Error: {e}", "red")
408
+
409
  def run_voice(self):
410
  """Run voice mode loop."""
411
  if not self.voice.available:
 
484
  default="/Users/walidsobhi/.openclaw/workspace",
485
  help="Workspace path"
486
  )
487
+
488
+ # Evaluation options
489
+ parser.add_argument(
490
+ '-e', '--eval',
491
+ choices=['mbpp', 'human_eval', 'gsm8k', 'all'],
492
+ help="Run evaluation benchmark"
493
+ )
494
+
495
+ parser.add_argument(
496
+ '--eval-provider',
497
+ default='ollama',
498
+ choices=['ollama', 'openai', 'anthropic'],
499
+ help="Model provider for evaluation"
500
+ )
501
+
502
+ parser.add_argument(
503
+ '--eval-model',
504
+ type=str,
505
+ help="Model name for evaluation"
506
+ )
507
+
508
+ # Pattern management
509
+ parser.add_argument(
510
+ '--patterns',
511
+ choices=['list', 'stats', 'clear'],
512
+ help="Manage patterns for self-evolution"
513
+ )
514
+
515
+ # Training
516
+ parser.add_argument(
517
+ '--train',
518
+ action='store_true',
519
+ help="Run LoRA training"
520
+ )
521
+
522
  args = parser.parse_args()
523
 
524
  try:
525
  # Create CLI with custom workspace if provided
526
  cli = StackCLI()
527
+
528
+ # Handle evaluation
529
+ if args.eval:
530
+ cli.run_eval(args.eval, args.eval_provider, args.eval_model)
531
+ return
532
+
533
+ # Handle pattern management
534
+ if args.patterns:
535
+ cli.run_patterns(args.patterns)
536
+ return
537
+
538
+ # Handle training
539
+ if args.train:
540
+ cli.run_train()
541
+ return
542
+
543
  if args.voice:
544
  cli.run_voice()
545
  elif args.tools: