AION Protocol Development committed on
Commit
1c9d722
·
1 Parent(s): 827e553

feat: Simplified UI - Pure prompt evaluation like Claude Code

Browse files

MAJOR CHANGES:
- Removed all UI controls (language, framework, temperature sliders)
- Pure prompt evaluation: AI decides everything from instructions
- Increased max_tokens: Claude 200K, GPT-4o 16K, Groq 32K, Gemini 65K
- Updated SYSTEM_PROMPT to emphasize instruction-following
- Temperature fixed at 0.7 (balanced)

UX PHILOSOPHY:
- Like Claude Code: user writes detailed instructions
- AI interprets and decides language, framework, architecture
- Tests model's ability to read requirements and contract
- No hand-holding - evaluate pure AI capability

EXAMPLES UPDATED:
- Now include language/framework IN the prompt text
- Example: 'Create REST API in Rust using Axum...'
- NOT: Separate dropdown for 'Rust' + 'Axum'

Context Window: 200,000 tokens output (Claude Sonnet 4.5)

Files changed (1) hide show
  1. app.py +51 -111
app.py CHANGED
@@ -80,20 +80,23 @@ SYSTEM_PROMPT = """You are Ectus-R, an expert autonomous software engineer power
80
  Your task is to generate production-ready code based on user requirements.
81
 
82
  REQUIREMENTS:
83
- 1. Write clean, idiomatic code following best practices
84
- 2. Include comprehensive error handling
85
- 3. Add inline comments explaining complex logic
86
- 4. Generate unit tests
87
- 5. Create deployment configuration (Dockerfile)
88
- 6. Use modern language features and libraries
 
89
 
90
  OUTPUT FORMAT:
91
- 1. Main source code
92
- 2. Unit tests
93
- 3. Dockerfile
94
  4. Brief README with usage instructions
95
 
96
- Be concise but complete. Focus on quality over quantity."""
 
 
97
 
98
  def generate_code_with_model(prompt: str, model_name: str, temperature: float = 0.7):
99
  """Generate code using specified model"""
@@ -119,7 +122,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
119
  client = anthropic.Anthropic(api_key=os.getenv(config["api_key_env"]))
120
  response = client.messages.create(
121
  model=config["model"],
122
- max_tokens=4096,
123
  temperature=temperature,
124
  system=SYSTEM_PROMPT,
125
  messages=[{"role": "user", "content": prompt}]
@@ -140,7 +143,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
140
  {"role": "user", "content": prompt}
141
  ],
142
  temperature=temperature,
143
- max_tokens=4096
144
  )
145
  generated_code = response.choices[0].message.content
146
  input_tokens = response.usage.prompt_tokens
@@ -155,7 +158,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
155
  {"role": "user", "content": prompt}
156
  ],
157
  temperature=temperature,
158
- max_tokens=4096
159
  )
160
  generated_code = response.choices[0].message.content
161
  input_tokens = response.usage.prompt_tokens
@@ -167,7 +170,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
167
  model = genai.GenerativeModel(config["model"])
168
  response = model.generate_content(
169
  f"{SYSTEM_PROMPT}\n\nUser request: {prompt}",
170
- generation_config={"temperature": temperature, "max_output_tokens": 4096}
171
  )
172
  generated_code = response.text
173
  input_tokens = response.usage_metadata.prompt_token_count
@@ -247,38 +250,22 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
247
  "tokens_per_sec": tokens_per_sec
248
  }
249
 
250
- def single_model_generation(prompt: str, model: str, temperature: float, language: str, framework: str, context_window: int):
251
- """Generate code with selected model"""
252
 
253
  if not prompt.strip():
254
  return "Please enter a project description."
255
 
256
- # Build enhanced prompt with language/framework if specified
257
- enhanced_prompt = prompt
258
- if language.strip():
259
- enhanced_prompt = f"Generate {language} code"
260
- if framework.strip():
261
- enhanced_prompt += f" using {framework}"
262
- enhanced_prompt += f" for the following project:\n\n{prompt}"
263
-
264
- # Add context window info to prompt
265
- enhanced_prompt += f"\n\nNote: Keep response within {context_window} tokens."
266
-
267
- result = generate_code_with_model(enhanced_prompt, model, temperature)
268
-
269
- lang_info = f"{language}" if language.strip() else "Auto-detected"
270
- if framework.strip():
271
- lang_info += f" + {framework}"
272
 
273
  output = f"""# Generated Code: {model}
274
 
275
  **Generation Time:** {result['elapsed_time']:.2f}s
276
- **Language/Framework:** {lang_info}
277
  **Lines of Code:** {result['loc']}
278
  **Tokens:** {result['input_tokens']} in → {result['output_tokens']} out
279
  **Speed:** {result['tokens_per_sec']:.0f} tokens/sec
280
  **Cost:** ${result['cost']:.4f}
281
- **Context Window:** {context_window} tokens
282
 
283
  ---
284
 
@@ -287,26 +274,17 @@ def single_model_generation(prompt: str, model: str, temperature: float, languag
287
 
288
  return output
289
 
290
- def multi_model_comparison(prompt: str, language: str, framework: str, temperature: float, context_window: int):
291
- """Compare all models on same prompt"""
292
 
293
  if not prompt.strip():
294
  return pd.DataFrame(), "Please enter a project description."
295
 
296
- # Build enhanced prompt with language/framework if specified
297
- enhanced_prompt = prompt
298
- if language.strip():
299
- enhanced_prompt = f"Generate {language} code"
300
- if framework.strip():
301
- enhanced_prompt += f" using {framework}"
302
- enhanced_prompt += f" for: {prompt}"
303
-
304
- enhanced_prompt += f"\n\nNote: Keep response within {context_window} tokens."
305
-
306
  results = []
307
 
308
  for model_name in MODEL_CONFIGS.keys():
309
- result = generate_code_with_model(enhanced_prompt, model_name, temperature)
310
 
311
  results.append({
312
  "Model": model_name,
@@ -355,15 +333,21 @@ with gr.Blocks(
355
  """)
356
 
357
  with gr.Tab("🚀 Single Model Generation"):
358
- gr.Markdown("Generate production-ready code with your choice of AI model")
 
 
 
 
 
 
359
 
360
  with gr.Row():
361
  with gr.Column(scale=1):
362
  prompt_input = gr.Textbox(
363
  label="Project Description",
364
- placeholder="Example: Create a REST API for a blog with users and posts. Include JWT authentication, PostgreSQL database, and Docker deployment.",
365
- lines=8,
366
- value="Create a simple TODO list API with CRUD operations using REST principles."
367
  )
368
 
369
  model_select = gr.Dropdown(
@@ -373,31 +357,6 @@ with gr.Blocks(
373
  info="Select the model to generate code"
374
  )
375
 
376
- with gr.Row():
377
- language_input = gr.Textbox(
378
- label="Language (Optional)",
379
- placeholder="e.g., Rust, Python, TypeScript, Go, Java - Leave empty for AI to decide",
380
- value=""
381
- )
382
- framework_input = gr.Textbox(
383
- label="Framework (Optional)",
384
- placeholder="e.g., Axum, FastAPI, Express, Django - Leave empty for AI to decide",
385
- value=""
386
- )
387
-
388
- with gr.Row():
389
- temp_slider = gr.Slider(
390
- 0.0, 1.0, 0.5,
391
- label="Temperature",
392
- info="Higher = more creative, Lower = more deterministic"
393
- )
394
- context_slider = gr.Slider(
395
- 1000, 8000, 4000,
396
- step=500,
397
- label="Context Window (tokens)",
398
- info="Maximum tokens in response"
399
- )
400
-
401
  generate_btn = gr.Button("Generate Code", variant="primary", size="lg")
402
 
403
  with gr.Column(scale=2):
@@ -408,56 +367,37 @@ with gr.Blocks(
408
 
409
  generate_btn.click(
410
  single_model_generation,
411
- inputs=[prompt_input, model_select, temp_slider, language_input, framework_input, context_slider],
412
  outputs=output_single
413
  )
414
 
415
  gr.Examples(
416
  examples=[
417
- ["Create a REST API for a blog with users and posts", "Claude Sonnet 4.5 💎", 0.5, "Rust", "Axum", 4000],
418
- ["Build a CLI tool for file encryption using AES-256", "GPT-4o 💎", 0.5, "Python", "Click", 3000],
419
- ["Implement a rate limiter middleware for web APIs", "Llama 3.3 70B (Groq) 🚀", 0.5, "TypeScript", "Express", 4000],
420
  ],
421
- inputs=[prompt_input, model_select, temp_slider, language_input, framework_input, context_slider]
422
  )
423
 
424
  with gr.Tab("⚡ Multi-Model Comparison"):
425
- gr.Markdown("Compare all 6 AI models side-by-side on the same task")
 
 
 
 
 
 
426
 
427
  with gr.Row():
428
  with gr.Column(scale=1):
429
  prompt_compare = gr.Textbox(
430
  label="Project Description (tested on ALL models)",
431
- placeholder="Create a simple TODO app API...",
432
- lines=6,
433
- value="Create a minimal REST API for a TODO list with create, read, update, delete operations."
434
  )
435
 
436
- with gr.Row():
437
- language_compare = gr.Textbox(
438
- label="Language (Optional)",
439
- placeholder="e.g., Python, Rust, TypeScript - Leave empty for AI to decide",
440
- value=""
441
- )
442
- framework_compare = gr.Textbox(
443
- label="Framework (Optional)",
444
- placeholder="e.g., FastAPI, Axum, Express - Leave empty for AI to decide",
445
- value=""
446
- )
447
-
448
- with gr.Row():
449
- temp_compare = gr.Slider(
450
- 0.0, 1.0, 0.5,
451
- label="Temperature",
452
- info="Higher = more creative, Lower = more deterministic"
453
- )
454
- context_compare = gr.Slider(
455
- 1000, 8000, 4000,
456
- step=500,
457
- label="Context Window (tokens)",
458
- info="Maximum tokens in response"
459
- )
460
-
461
  compare_btn = gr.Button("Compare All Models", variant="primary", size="lg")
462
 
463
  with gr.Column(scale=2):
@@ -469,7 +409,7 @@ with gr.Blocks(
469
 
470
  compare_btn.click(
471
  multi_model_comparison,
472
- inputs=[prompt_compare, language_compare, framework_compare, temp_compare, context_compare],
473
  outputs=[comparison_table, winner_msg]
474
  )
475
 
 
80
  Your task is to generate production-ready code based on user requirements.
81
 
82
  REQUIREMENTS:
83
+ 1. Read the user's instructions carefully and decide language, framework, and architecture accordingly
84
+ 2. Write clean, idiomatic code following best practices
85
+ 3. Include comprehensive error handling
86
+ 4. Add inline comments explaining complex logic
87
+ 5. Generate unit tests when appropriate
88
+ 6. Create deployment configuration (Dockerfile) when needed
89
+ 7. Use modern language features and libraries
90
 
91
  OUTPUT FORMAT:
92
+ 1. Main source code with complete implementation
93
+ 2. Unit tests (if requested or beneficial)
94
+ 3. Dockerfile (if deployment mentioned)
95
  4. Brief README with usage instructions
96
 
97
+ Context window: 200,000 tokens output - you can generate comprehensive solutions.
98
+
99
+ Be complete and thorough. Focus on quality and production-readiness."""
100
 
101
  def generate_code_with_model(prompt: str, model_name: str, temperature: float = 0.7):
102
  """Generate code using specified model"""
 
122
  client = anthropic.Anthropic(api_key=os.getenv(config["api_key_env"]))
123
  response = client.messages.create(
124
  model=config["model"],
125
+ max_tokens=200000,
126
  temperature=temperature,
127
  system=SYSTEM_PROMPT,
128
  messages=[{"role": "user", "content": prompt}]
 
143
  {"role": "user", "content": prompt}
144
  ],
145
  temperature=temperature,
146
+ max_tokens=16000 # GPT-4o max is 16K
147
  )
148
  generated_code = response.choices[0].message.content
149
  input_tokens = response.usage.prompt_tokens
 
158
  {"role": "user", "content": prompt}
159
  ],
160
  temperature=temperature,
161
+ max_tokens=32000 # Groq supports up to 32K
162
  )
163
  generated_code = response.choices[0].message.content
164
  input_tokens = response.usage.prompt_tokens
 
170
  model = genai.GenerativeModel(config["model"])
171
  response = model.generate_content(
172
  f"{SYSTEM_PROMPT}\n\nUser request: {prompt}",
173
+ generation_config={"temperature": temperature, "max_output_tokens": 65536}  # NOTE(review): 65536 is the SDK field maximum, but Gemini 2.0 Flash's documented output limit is 8,192 tokens — confirm against current model docs
174
  )
175
  generated_code = response.text
176
  input_tokens = response.usage_metadata.prompt_token_count
 
250
  "tokens_per_sec": tokens_per_sec
251
  }
252
 
253
+ def single_model_generation(prompt: str, model: str):
254
+ """Generate code with selected model - pure prompt evaluation"""
255
 
256
  if not prompt.strip():
257
  return "Please enter a project description."
258
 
259
+ # Use prompt directly - let AI decide everything from instructions
260
+ result = generate_code_with_model(prompt, model, temperature=0.7)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  output = f"""# Generated Code: {model}
263
 
264
  **Generation Time:** {result['elapsed_time']:.2f}s
 
265
  **Lines of Code:** {result['loc']}
266
  **Tokens:** {result['input_tokens']} in → {result['output_tokens']} out
267
  **Speed:** {result['tokens_per_sec']:.0f} tokens/sec
268
  **Cost:** ${result['cost']:.4f}
 
269
 
270
  ---
271
 
 
274
 
275
  return output
276
 
277
+ def multi_model_comparison(prompt: str):
278
+ """Compare all models on same prompt - pure prompt evaluation"""
279
 
280
  if not prompt.strip():
281
  return pd.DataFrame(), "Please enter a project description."
282
 
283
+ # Use prompt directly - let AI decide everything from instructions
 
 
 
 
 
 
 
 
 
284
  results = []
285
 
286
  for model_name in MODEL_CONFIGS.keys():
287
+ result = generate_code_with_model(prompt, model_name, temperature=0.7)
288
 
289
  results.append({
290
  "Model": model_name,
 
333
  """)
334
 
335
  with gr.Tab("🚀 Single Model Generation"):
336
+ gr.Markdown("""
337
+ Generate production-ready code with your choice of AI model.
338
+
339
+ **Pure prompt evaluation:** Describe your requirements in detail. The AI will decide language, framework, and architecture based on your instructions.
340
+
341
+ **Context Window:** 200,000 tokens output
342
+ """)
343
 
344
  with gr.Row():
345
  with gr.Column(scale=1):
346
  prompt_input = gr.Textbox(
347
  label="Project Description",
348
+ placeholder="Example: Create a REST API in Rust using Axum for a blog with users and posts. Include JWT authentication, PostgreSQL database, unit tests, and Docker deployment with multi-stage build.",
349
+ lines=10,
350
+ value="Create a minimal REST API for a TODO list with create, read, update, delete operations. Use best practices and include tests."
351
  )
352
 
353
  model_select = gr.Dropdown(
 
357
  info="Select the model to generate code"
358
  )
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  generate_btn = gr.Button("Generate Code", variant="primary", size="lg")
361
 
362
  with gr.Column(scale=2):
 
367
 
368
  generate_btn.click(
369
  single_model_generation,
370
+ inputs=[prompt_input, model_select],
371
  outputs=output_single
372
  )
373
 
374
  gr.Examples(
375
  examples=[
376
+ ["Create a REST API in Rust using Axum for a blog with users and posts. Include JWT authentication, PostgreSQL database, unit tests, and Docker deployment.", "Claude Sonnet 4.5 💎"],
377
+ ["Build a CLI tool in Python for file encryption using AES-256 with Click framework. Include progress bars and error handling.", "GPT-4o 💎"],
378
+ ["Implement a rate limiter middleware in TypeScript for Express web APIs. Support Redis backend and configurable limits per endpoint.", "Llama 3.3 70B (Groq) 🚀"],
379
  ],
380
+ inputs=[prompt_input, model_select]
381
  )
382
 
383
  with gr.Tab("⚡ Multi-Model Comparison"):
384
+ gr.Markdown("""
385
+ Compare all 6 AI models side-by-side on the same task.
386
+
387
+ **Pure prompt evaluation:** Each model reads the same instructions and decides implementation details independently.
388
+
389
+ **Context Window:** 200,000 tokens output per model
390
+ """)
391
 
392
  with gr.Row():
393
  with gr.Column(scale=1):
394
  prompt_compare = gr.Textbox(
395
  label="Project Description (tested on ALL models)",
396
+ placeholder="Example: Create a REST API in Python using FastAPI for a TODO list with create, read, update, delete operations. Include SQLAlchemy models, Pydantic schemas, and basic tests.",
397
+ lines=8,
398
+ value="Create a minimal REST API for a TODO list with create, read, update, delete operations. Use best practices and include tests."
399
  )
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  compare_btn = gr.Button("Compare All Models", variant="primary", size="lg")
402
 
403
  with gr.Column(scale=2):
 
409
 
410
  compare_btn.click(
411
  multi_model_comparison,
412
+ inputs=[prompt_compare],
413
  outputs=[comparison_table, winner_msg]
414
  )
415