# n8n-workflow-evaluator / llm_evaluator.py
# Author: suwankim — commit 1c1e7bc ("Deploy n8n workflow evaluator")
"""
LLM์„ ์ด์šฉํ•œ ํ‰๊ฐ€ ์œ ํ‹ธ๋ฆฌํ‹ฐ
"""
import json
import os
from pathlib import Path
from typing import Dict, Any, Optional
from openai import OpenAI
from dotenv import load_dotenv
# .env ํŒŒ์ผ ๋กœ๋“œ (ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ ๊ธฐ์ค€์œผ๋กœ ๋ช…์‹œ์ ์œผ๋กœ ๋กœ๋“œ)
# - ์‹คํ–‰ ์œ„์น˜(cwd)๊ฐ€ ๋‹ฌ๋ผ๋„ ๋™์ž‘ํ•˜๋„๋ก ํ•จ
# - ๊ถŒํ•œ/ํ™˜๊ฒฝ ์ œ์•ฝ์ด ์žˆ๋Š” ๊ฒฝ์šฐ์—๋„ import ์ž์ฒด๊ฐ€ ์ฃฝ์ง€ ์•Š๋„๋ก ๋ฐฉ์–ด
try:
project_root = Path(__file__).resolve().parent
dotenv_path = project_root / ".env"
# env var๊ฐ€ ์ด๋ฏธ ์„ค์ •๋˜์–ด ์žˆ์–ด๋„(.e.g. test_key) .env๊ฐ€ ์šฐ์„ ํ•˜๋„๋ก override=True
load_dotenv(dotenv_path=dotenv_path, override=True)
except (PermissionError, OSError):
# ํ™˜๊ฒฝ์— ๋”ฐ๋ผ .env ์ ‘๊ทผ์ด ๋ง‰ํž ์ˆ˜ ์žˆ์Œ (์˜ˆ: ์ƒŒ๋“œ๋ฐ•์Šค)
# ์ด ๊ฒฝ์šฐ์—๋„ OPENAI_API_KEY๊ฐ€ ํ™˜๊ฒฝ๋ณ€์ˆ˜๋กœ ์ฃผ์–ด์ง€๋ฉด ์ •์ƒ ๋™์ž‘ ๊ฐ€๋Šฅ
pass
class LLMEvaluator:
    """Evaluator backed by OpenAI GPT-4o.

    Sends the data to be graded together with a scoring rubric to the
    chat-completions API and returns the model's JSON verdict.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: OpenAI API key. When None, the OPENAI_API_KEY
                environment variable is used instead.

        Raises:
            ValueError: if no API key is available from either source.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError(
                "OPENAI_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. "
                "ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ์˜ .env ํŒŒ์ผ ๋˜๋Š” ํ™˜๊ฒฝ๋ณ€์ˆ˜ OPENAI_API_KEY๋ฅผ ํ™•์ธํ•˜์„ธ์š”."
            )
        self.client = OpenAI(api_key=self.api_key)
        self.model = "gpt-4o"

    def evaluate(
        self,
        data: str,
        criteria: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Evaluate the given data against the supplied criteria.

        Args:
            data: Data to evaluate (JSON string or plain text).
            criteria: Scoring rubric the model must apply.
            max_retries: Maximum attempts when the response is not valid JSON.

        Returns:
            The parsed evaluation result, or an error dictionary containing
            a zero total score ("์ด์ ") when the call or parsing fails.
        """
        prompt = f"{criteria}\n\n## ํ‰๊ฐ€ ๋Œ€์ƒ ๋ฐ์ดํ„ฐ:\n{data}\n\n์œ„ ๋ฐ์ดํ„ฐ๋ฅผ ์‹ฌ์‚ฌ ๊ธฐ์ค€์— ๋”ฐ๋ผ **์—„๊ฒฉํ•˜๊ฒŒ** ํ‰๊ฐ€ํ•˜๊ณ , ๋ฐ˜๋“œ์‹œ JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ตํ•˜์‹ญ์‹œ์˜ค. ์ ์ˆ˜๋ฅผ ํ›„ํ•˜๊ฒŒ ์ฃผ์ง€ ๋งˆ์‹œ๊ณ , ๊ธฐ์ค€์„ ์ •ํ™•ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ๊ฐ์ ํ•˜์‹ญ์‹œ์˜ค."
        # Pre-bind so the final-failure dict can always reference it.
        result = ""
        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": """๋‹น์‹ ์€ n8n ์›Œํฌํ”Œ๋กœ์šฐ ๋ฐ ํ”„๋กœ์ ํŠธ๋ฅผ ํ‰๊ฐ€ํ•˜๋Š” ์ „๋ฌธ ์‹ฌ์‚ฌ์œ„์›์ž…๋‹ˆ๋‹ค.
โš ๏ธ **์—„๊ฒฉํ•œ ํ‰๊ฐ€ ์›์น™์„ ๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜ํ•˜์‹ญ์‹œ์˜ค:**
- ์ ์ˆ˜๋Š” ๋งค์šฐ ์—„๊ฒฉํ•˜๊ฒŒ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค. ๊ธฐ์ค€์„ ์ •ํ™•ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ๊ฐ์ ํ•˜์‹ญ์‹œ์˜ค.
- "๊ฑฐ์˜ ์™„์„ฑ" ๋˜๋Š” "๋Œ€๋ถ€๋ถ„ ์ข‹์Œ"์€ ์ถฉ๋ถ„ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ์™„๋ฒฝํ•˜๊ฒŒ ์ถฉ์กฑํ•ด์•ผ๋งŒ ๋งŒ์ ์„ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค.
- ๋ถˆํ™•์‹คํ•˜๊ฑฐ๋‚˜ ๋ช…ํ™•ํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ๋Š” ๋ณด์ˆ˜์ ์œผ๋กœ ๋‚ฎ์€ ์ ์ˆ˜๋ฅผ ๋ถ€์—ฌํ•˜์‹ญ์‹œ์˜ค.
- ๊ธฐ์ค€์„ ์™„์ „ํžˆ ์ถฉ์กฑํ•˜์ง€ ์•Š์œผ๋ฉด ํ•ด๋‹น ํ•ญ๋ชฉ์— ๋Œ€ํ•œ ์ ์ˆ˜๋ฅผ ๋ถ€์—ฌํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค.
- ์ ์ˆ˜๋ฅผ ํ›„ํ•˜๊ฒŒ ์ฃผ์ง€ ๋งˆ์‹ญ์‹œ์˜ค. ์—„๊ฒฉํ•˜๊ณ  ๊ณต์ •ํ•˜๊ฒŒ ํ‰๊ฐ€ํ•˜์‹ญ์‹œ์˜ค.
ํ•ญ์ƒ JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ตํ•˜์‹ญ์‹œ์˜ค."""
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=0.0,  # low temperature for consistent scoring
                    response_format={"type": "json_object"}  # force a JSON reply
                )
                # The SDK types message.content as Optional[str]; coerce a
                # missing body to "" so it fails as a JSON parse error (and
                # is retried) instead of json.loads(None) raising TypeError
                # into the generic API-failure handler below.
                result = response.choices[0].message.content or ""
                return json.loads(result)
            except json.JSONDecodeError as e:
                if attempt < max_retries - 1:
                    print(f"JSON ํŒŒ์‹ฑ ์‹คํŒจ (์‹œ๋„ {attempt + 1}/{max_retries}), ์žฌ์‹œ๋„ ์ค‘...")
                    continue
                print(f"JSON ํŒŒ์‹ฑ ์ตœ์ข… ์‹คํŒจ: {e}")
                return {
                    "error": "JSON ํŒŒ์‹ฑ ์‹คํŒจ",
                    "raw_response": result,
                    "์ด์ ": 0
                }
            except Exception as e:
                # API/network failures are not retried; surface immediately.
                print(f"API ํ˜ธ์ถœ ์‹คํŒจ: {e}")
                return {
                    "error": str(e),
                    "์ด์ ": 0
                }
        # Reached only when max_retries <= 0.
        return {"error": "ํ‰๊ฐ€ ์‹คํŒจ", "์ด์ ": 0}

    def review_evaluation(
        self,
        data: str,
        criteria: str,
        initial_score: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Re-review an earlier evaluation result.

        Args:
            data: Original data that was evaluated.
            criteria: Scoring rubric used for the evaluation.
            initial_score: Result dictionary produced by evaluate().

        Returns:
            The reviewed result, or a fallback dictionary that keeps the
            initial score when the review call fails.
        """
        # Local import to avoid a module-level dependency cycle with the
        # criteria module — presumably; confirm against project layout.
        from evaluation_criteria import REVIEW_CRITERIA
        review_prompt = REVIEW_CRITERIA.format(
            data=data,
            criteria=criteria,
            initial_score=json.dumps(initial_score, ensure_ascii=False, indent=2)
        )
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": """
You are an expert workflow evaluator specializing in n8n JSON workflows, LLM-based automation, and Upstage API utilization analysis.
Your job is to strictly evaluate the submitted workflow and project description using the provided scoring rubrics.
You must never hallucinate, never assume the existence of missing nodes, and evaluate solely based on the JSON/text that is explicitly given.
โš ๏ธ **CRITICAL: Strict Scoring Principles**
- Score very strictly. Do not give points unless criteria are perfectly met.
- "Almost complete" or "mostly good" is NOT sufficient. Only award points when criteria are fully satisfied.
- When uncertain or unclear, give conservative low scores.
- Do not award points for items that do not fully meet the criteria.
- Do NOT be generous with scores. Evaluate strictly and fairly.
- Be conservative and rigorous in your evaluation.
"""
                    },
                    {
                        "role": "user",
                        "content": review_prompt
                    }
                ],
                temperature=0.2,
                response_format={"type": "json_object"}
            )
            # Same Optional[str] guard as in evaluate(): an empty body
            # becomes a JSONDecodeError handled by the fallback below.
            result = response.choices[0].message.content or ""
            return json.loads(result)
        except Exception as e:
            print(f"์žฌ๊ฒ€ํ†  ์‹คํŒจ: {e}")
            return {
                "์žฌ๊ฒ€ํ† _๊ฒฐ๊ณผ": "์œ ์ง€",
                "์ตœ์ข…_์ ์ˆ˜": initial_score,
                "์žฌ๊ฒ€ํ† _์˜๊ฒฌ": f"์žฌ๊ฒ€ํ†  ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
            }
def safe_json_load(file_path: str) -> Optional[Dict[str, Any]]:
    """Load a JSON file without raising.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON object, or None when the file is missing,
        unreadable, or does not contain valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fp:
            parsed = json.load(fp)
    except json.JSONDecodeError:
        # Malformed JSON is an expected case: fail quietly.
        return None
    except Exception as e:
        # I/O problems (missing file, permissions, ...) are reported.
        print(f"ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
        return None
    return parsed