# n8n-workflow-evaluator / llm_evaluator.py
# Author: suwankim — commit 1c1e7bc ("Deploy n8n workflow evaluator")
"""
LLM์„ ์ด์šฉํ•œ ํ‰๊ฐ€ ์œ ํ‹ธ๋ฆฌํ‹ฐ
"""
import json
import os
from pathlib import Path
from typing import Dict, Any, Optional
from openai import OpenAI
from dotenv import load_dotenv
# .env ํŒŒ์ผ ๋กœ๋“œ (ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ ๊ธฐ์ค€์œผ๋กœ ๋ช…์‹œ์ ์œผ๋กœ ๋กœ๋“œ)
# - ์‹คํ–‰ ์œ„์น˜(cwd)๊ฐ€ ๋‹ฌ๋ผ๋„ ๋™์ž‘ํ•˜๋„๋ก ํ•จ
# - ๊ถŒํ•œ/ํ™˜๊ฒฝ ์ œ์•ฝ์ด ์žˆ๋Š” ๊ฒฝ์šฐ์—๋„ import ์ž์ฒด๊ฐ€ ์ฃฝ์ง€ ์•Š๋„๋ก ๋ฐฉ์–ด
try:
project_root = Path(__file__).resolve().parent
dotenv_path = project_root / ".env"
# env var๊ฐ€ ์ด๋ฏธ ์„ค์ •๋˜์–ด ์žˆ์–ด๋„(.e.g. test_key) .env๊ฐ€ ์šฐ์„ ํ•˜๋„๋ก override=True
load_dotenv(dotenv_path=dotenv_path, override=True)
except (PermissionError, OSError):
# ํ™˜๊ฒฝ์— ๋”ฐ๋ผ .env ์ ‘๊ทผ์ด ๋ง‰ํž ์ˆ˜ ์žˆ์Œ (์˜ˆ: ์ƒŒ๋“œ๋ฐ•์Šค)
# ์ด ๊ฒฝ์šฐ์—๋„ OPENAI_API_KEY๊ฐ€ ํ™˜๊ฒฝ๋ณ€์ˆ˜๋กœ ์ฃผ์–ด์ง€๋ฉด ์ •์ƒ ๋™์ž‘ ๊ฐ€๋Šฅ
pass
class LLMEvaluator:
    """Evaluator backed by OpenAI GPT-4o.

    Sends the data to be graded together with a scoring rubric to the
    chat-completions API and returns the model's JSON verdict.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: OpenAI API key. When None, the OPENAI_API_KEY
                environment variable is used instead.

        Raises:
            ValueError: if no API key is available from either source.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError(
                "OPENAI_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. "
                "ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ์˜ .env ํŒŒ์ผ ๋˜๋Š” ํ™˜๊ฒฝ๋ณ€์ˆ˜ OPENAI_API_KEY๋ฅผ ํ™•์ธํ•˜์„ธ์š”."
            )
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o"

    def evaluate(
        self,
        data: str,
        criteria: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Evaluate the given data against the supplied criteria.

        Args:
            data: Data to evaluate (JSON string or plain text).
            criteria: Scoring rubric the model must apply.
            max_retries: Maximum attempts when the response is not valid JSON.

        Returns:
            The parsed evaluation result, or an error dictionary containing
            a zero total score ("์ด์ ") when the call or parsing fails.
        """
        prompt = f"{criteria}\n\n## ํ‰๊ฐ€ ๋Œ€์ƒ ๋ฐ์ดํ„ฐ:\n{data}\n\n์œ„ ๋ฐ์ดํ„ฐ๋ฅผ ์‹ฌ์‚ฌ ๊ธฐ์ค€์— ๋”ฐ๋ผ **์—„๊ฒฉํ•˜๊ฒŒ** ํ‰๊ฐ€ํ•˜๊ณ , ๋ฐ˜๋“œ์‹œ JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ตํ•˜์‹ญ์‹œ์˜ค. ์ ์ˆ˜๋ฅผ ํ›„ํ•˜๊ฒŒ ์ฃผ์ง€ ๋งˆ์‹œ๊ณ , ๊ธฐ์ค€์„ ์ •ํ™•ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ๊ฐ์ ํ•˜์‹ญ์‹œ์˜ค."
        # Pre-bind so the final-failure dict can always reference it.
        result = ""
        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": """๋‹น์‹ ์€ n8n ์›Œํฌํ”Œ๋กœ์šฐ ๋ฐ ํ”„๋กœ์ ํŠธ๋ฅผ ํ‰๊ฐ€ํ•˜๋Š” ์ „๋ฌธ ์‹ฌ์‚ฌ์œ„์›์ž…๋‹ˆ๋‹ค.
โš ๏ธ **์—„๊ฒฉํ•œ ํ‰๊ฐ€ ์›์น™์„ ๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜ํ•˜์‹ญ์‹œ์˜ค:**
- ์ ์ˆ˜๋Š” ๋งค์šฐ ์—„๊ฒฉํ•˜๊ฒŒ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค. ๊ธฐ์ค€์„ ์ •ํ™•ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ๊ฐ์ ํ•˜์‹ญ์‹œ์˜ค.
- "๊ฑฐ์˜ ์™„์„ฑ" ๋˜๋Š” "๋Œ€๋ถ€๋ถ„ ์ข‹์Œ"์€ ์ถฉ๋ถ„ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ์™„๋ฒฝํ•˜๊ฒŒ ์ถฉ์กฑํ•ด์•ผ๋งŒ ๋งŒ์ ์„ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค.
- ๋ถˆํ™•์‹คํ•˜๊ฑฐ๋‚˜ ๋ช…ํ™•ํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ๋Š” ๋ณด์ˆ˜์ ์œผ๋กœ ๋‚ฎ์€ ์ ์ˆ˜๋ฅผ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค.
- ๊ธฐ์ค€์„ ์™„์ „ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ํ•ด๋‹น ํ•ญ๋ชฉ์— ๋Œ€ํ•œ ์ ์ˆ˜๋ฅผ ๋ถ€์—ฌํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค.
- ์ ์ˆ˜๋ฅผ ํ›„ํ•˜๊ฒŒ ์ฃผ์ง€ ๋งˆ์‹ญ์‹œ์˜ค. ์—„๊ฒฉํ•˜๊ณ  ๊ณต์ •ํ•˜๊ฒŒ ํ‰๊ฐ€ํ•˜์‹ญ์‹œ์˜ค.
ํ•ญ์ƒ JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ตํ•˜์‹ญ์‹œ์˜ค."""
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=0.0,  # low temperature for consistent scoring
                    response_format={"type": "json_object"}  # force a JSON reply
                )
                # The SDK types message.content as Optional[str]; coerce a
                # missing body to "" so it fails as a JSON parse error (and
                # is retried) instead of json.loads(None) raising TypeError
                # into the generic API-failure handler below.
                result = response.choices[0].message.content or ""
                return json.loads(result)
            except json.JSONDecodeError as e:
                if attempt < max_retries - 1:
                    print(f"JSON ํŒŒ์‹ฑ ์‹คํŒจ (์‹œ๋„ {attempt + 1}/{max_retries}), ์žฌ์‹œ๋„ ์ค‘...")
                    continue
                print(f"JSON ํŒŒ์‹ฑ ์ตœ์ข… ์‹คํŒจ: {e}")
                return {
                    "error": "JSON ํŒŒ์‹ฑ ์‹คํŒจ",
                    "raw_response": result,
                    "์ด์ ": 0
                }
            except Exception as e:
                # API/network failures are not retried; surface immediately.
                print(f"API ํ˜ธ์ถœ ์‹คํŒจ: {e}")
                return {
                    "error": str(e),
                    "์ด์ ": 0
                }
        # Reached only when max_retries <= 0.
        return {"error": "ํ‰๊ฐ€ ์‹คํŒจ", "์ด์ ": 0}

    def review_evaluation(
        self,
        data: str,
        criteria: str,
        initial_score: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Re-review an earlier evaluation result.

        Args:
            data: Original data that was evaluated.
            criteria: Scoring rubric used for the evaluation.
            initial_score: Result dictionary produced by evaluate().

        Returns:
            The reviewed result, or a fallback dictionary that keeps the
            initial score when the review call fails.
        """
        # Local import to avoid a module-level dependency cycle with the
        # criteria module — presumably; confirm against project layout.
        from evaluation_criteria import REVIEW_CRITERIA
        review_prompt = REVIEW_CRITERIA.format(
            data=data,
            criteria=criteria,
            initial_score=json.dumps(initial_score, ensure_ascii=False, indent=2)
        )
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": """
You are an expert workflow evaluator specializing in n8n JSON workflows, LLM-based automation, and Upstage API utilization analysis.
Your job is to strictly evaluate the submitted workflow and project description using the provided scoring rubrics.
You must never hallucinate, never assume the existence of missing nodes, and evaluate solely based on the JSON/text that is explicitly given.
โš ๏ธ **CRITICAL: Strict Scoring Principles**
- Score very strictly. Do not give points unless criteria are perfectly met.
- "Almost complete" or "mostly good" is NOT sufficient. Only award points when criteria are fully satisfied.
- When uncertain or unclear, give conservative low scores.
- Do not award points for items that do not fully meet the criteria.
- Do NOT be generous with scores. Evaluate strictly and fairly.
- Be conservative and rigorous in your evaluation.
"""
                    },
                    {
                        "role": "user",
                        "content": review_prompt
                    }
                ],
                temperature=0.2,
                response_format={"type": "json_object"}
            )
            # Same Optional[str] guard as in evaluate(): an empty body
            # becomes a JSONDecodeError handled by the fallback below.
            result = response.choices[0].message.content or ""
            return json.loads(result)
        except Exception as e:
            print(f"์žฌ๊ฒ€ํ†  ์‹คํŒจ: {e}")
            return {
                "์žฌ๊ฒ€ํ† _๊ฒฐ๊ณผ": "์œ ์ง€",
                "์ตœ์ข…_์ ์ˆ˜": initial_score,
                "์žฌ๊ฒ€ํ† _์˜๊ฒฌ": f"์žฌ๊ฒ€ํ†  ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
            }
def safe_json_load(file_path: str) -> Optional[Dict[str, Any]]:
    """Load a JSON file without raising.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON object, or None when the file is missing,
        unreadable, or does not contain valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fp:
            parsed = json.load(fp)
    except json.JSONDecodeError:
        # Malformed JSON is an expected case: fail quietly.
        return None
    except Exception as e:
        # I/O problems (missing file, permissions, ...) are reported.
        print(f"ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
        return None
    return parsed