import gradio as gr
import requests
import json
import time
from datetime import datetime
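
# OllamaSAAPBenchmark below wraps Ollama's local HTTP API; the equivalent raw
# request (a sketch, assuming the default endpoint and an already pulled model):
#   curl http://localhost:11434/api/generate \
#     -d '{"model": "llama3", "prompt": "Hallo", "stream": false}'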

class OllamaSAAPBenchmark:
    """Benchmark client that sends agent-style prompts to a local Ollama server."""

    def __init__(self, base_url="http://localhost:11434"):
        self.base_url = base_url

    def test_agent_response(self, prompt, model, agent_role="General"):
        """Run an agent-role-specific prompt against an Ollama model and time it."""
        
        # SAAP-specific prompt prefixes depending on the agent role
        saap_prompts = {
            "Jane": f"Als KI-Architektin: {prompt}",
            "John": f"Als Entwickler: {prompt}", 
            "Justus": f"Als Rechtsexperte: {prompt}",
            "General": prompt
        }
        
        final_prompt = saap_prompts.get(agent_role, prompt)
        start_time = time.time()
        
        try:
            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": model,
                    "prompt": final_prompt,
                    "stream": False,
                    "options": {"temperature": 0.7, "num_predict": 256}
                },
                timeout=60
            )
            
            end_time = time.time()
            
            if response.status_code == 200:
                result = response.json()
                return {
                    "response": result.get("response", ""),
                    "time": f"{end_time - start_time:.2f}s",
                    "model": model,
                    "agent_role": agent_role,
                    # Word count as a rough proxy; not the model's true token count
                    "tokens": len(result.get("response", "").split()),
                    "status": "✅ Success"
                }
            else:
                return {"status": f"❌ Error {response.status_code}", "time": f"{end_time - start_time:.2f}s"}
                
        except Exception as e:
            return {"status": f"❌ Connection Error: {str(e)[:50]}...", "time": f"{time.time() - start_time:.2f}s"}

    def list_models(self):
        """Return the names of all models available on the local Ollama server."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get("models", [])
                return [model["name"] for model in models]
            return ["Connection failed - check if Ollama is running"]
        except requests.RequestException:
            return ["❌ Cannot connect to Ollama"]

# Initialize benchmark system
benchmark = OllamaSAAPBenchmark()
available_models = benchmark.list_models()
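
# Note: the model list is queried once at startup; restart the app after pulling
# new models with `ollama pull`.
#
# Direct use outside the Gradio UI (illustrative sketch; "llama3" is a placeholder,
# substitute a name actually returned by list_models()):
#   bench = OllamaSAAPBenchmark()
#   print(bench.list_models())
#   print(bench.test_agent_response("Explain on-premise LLM deployment.", "llama3", "Jane"))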

# SAAP Benchmark Interface
def run_saap_benchmark(prompt, selected_models, agent_role):
    """Run the selected models against one prompt and return a Markdown report."""
    if not prompt.strip():
        return "⚠️ Bitte geben Sie einen Test-Prompt ein."
    if not selected_models:
        return "⚠️ Bitte wählen Sie mindestens ein Modell aus."

    results = []
    results.append("# 🚀 SAAP Multi-Agent Performance Benchmark")
    results.append(f"**Agent Role:** {agent_role}")
    results.append(f"**Test Prompt:** {prompt}")
    results.append(f"**Models:** {', '.join(selected_models)}")
    results.append(f"**Timestamp:** {datetime.now().strftime('%H:%M:%S')}")
    results.append("---")
    
    total_time = 0
    tested_models = 0
    for model in selected_models:
        if model in available_models:
            tested_models += 1
            result = benchmark.test_agent_response(prompt, model, agent_role)
            
            results.append(f"## 🤖 {model.upper()} ({agent_role})")
            results.append(f"**Status:** {result.get('status', '❌ Error')}")
            results.append(f"**Response Time:** {result.get('time', 'N/A')}")
            results.append(f"**Tokens Generated:** {result.get('tokens', 0)}")
            
            if 'response' in result and result['response']:
                preview = result['response'][:100].replace('\n', ' ')
                results.append(f"**Response Preview:** {preview}...")
            
            results.append("---")
            
            # Add to total time for averages
            try:
                time_val = float(result.get('time', '0').rstrip('s'))
                total_time += time_val
            except ValueError:
                pass
    
    # Performance Summary
    if tested_models:
        avg_time = total_time / tested_models
        results.append("## 📊 Performance Summary")
        results.append(f"**Average Response Time:** {avg_time:.2f}s")
        results.append(f"**Total Models Tested:** {tested_models}")

        # SAAP Performance Assessment
        if avg_time < 2.0:
            results.append("**SAAP Assessment:** ✅ Excellent for real-time multi-agent coordination")
        elif avg_time < 5.0:
            results.append("**SAAP Assessment:** ⚠️ Acceptable for batch processing")
        else:
            results.append("**SAAP Assessment:** ❌ Too slow for interactive agents")
    
    return "\n".join(results)
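
# The string returned above is rendered as Markdown in the UI; abridged shape of a
# typical report (all numbers purely illustrative):
#   # 🚀 SAAP Multi-Agent Performance Benchmark
#   ## 🤖 <MODEL> (Jane)
#   **Response Time:** 1.84s
#   ## 📊 Performance Summary
#   **Average Response Time:** 1.84s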

# Gradio Interface
with gr.Blocks(title="SAAP Performance Benchmark", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 SAAP - satware AI Agent Platform Benchmark")
    gr.Markdown("**Master Thesis:** Hanan Wandji Danga | **Hochschule Worms** | **satware AG**")
    
    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(
                label="SAAP Test Prompt",
                placeholder="Beispiel: Entwickle eine Systemarchitektur für Multi-Agent Koordination",
                lines=3,
                value="Erkläre die Vorteile einer On-Premise Multi-Agent-Plattform gegenüber Cloud-Lösungen."
            )
            
            agent_role = gr.Dropdown(
                choices=["General", "Jane", "John", "Justus"],
                label="Agent Role Simulation",
                value="General"
            )
            
        with gr.Column(scale=1):
            model_selection = gr.CheckboxGroup(
                choices=available_models,
                label="Models to Benchmark",
                value=available_models[:2]
            )
            
            benchmark_btn = gr.Button("🚀 Run SAAP Benchmark", variant="primary", size="lg")
    
    # Results
    results_output = gr.Markdown(label="Benchmark Results")
    
    # Benchmark function
    benchmark_btn.click(
        run_saap_benchmark,
        inputs=[prompt_input, model_selection, agent_role],
        outputs=results_output
    )
    
    # System Info
    with gr.Accordion("ℹ️ System Information", open=False):
        gr.Markdown(f"""
        ### 📋 SAAP Test Environment
        - **Available Models:** {len(available_models)}
        - **Models:** {', '.join(available_models)}
        - **Ollama Server:** {benchmark.base_url}
        
        ### 🎯 SAAP Performance Targets
        - **Real-time Coordination:** < 2s per response
        - **Batch Processing:** < 5s per response
        - **Multi-Agent Sync:** < 10s for complex workflows
        
        ### 🎓 Master Thesis Context
        **Projekt:** SAAP - satware AI Autonomous Agent Platform  
        **Student:** Hanan Wandji Danga  
        **Universität:** Hochschule Worms  
        **Betreuung:** Michael Wegener  
        **Ziel:** On-Premise Multi-Agent-Plattform mit lokalen LLMs
        """)

if __name__ == "__main__":
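    # For access from other machines, launch() accepts explicit bind options, e.g.
    # demo.launch(server_name="0.0.0.0", server_port=7860); host and port here are
    # assumptions, adjust to the deployment environment.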
    demo.launch()