# QLoRA_RAG_test / src / create_eval_dataset.py
# Author: Dongjin1203
# Initial commit - RFPilot experiment
# Commit: 9630ae8
"""
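Evaluation dataset creation tool.

Builds question-answer pairs from real RFP documents and produces an
evaluation dataset that carries ground-truth answers.

Usage:
    python create_eval_dataset.py --mode csv --input data/rag_chunks_final.csv --output data/eval_dataset.json

Note: --mode defaults to "template", in which case --input is ignored.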
"""
import json
import csv
import argparse
from pathlib import Path
from typing import List, Dict, Any
class EvalDatasetCreator:
    """Builder for an evaluation dataset with in/out-of-distribution samples.

    The dataset is held in ``self.dataset``, a plain dict with three keys:
    ``metadata`` (static descriptive fields), ``in_distribution`` and
    ``out_distribution`` (lists of sample dicts appended by the ``add_*``
    methods).
    """

    def __init__(self):
        self.dataset = {
            "metadata": {
                "version": "1.0",
                "description": "RFPilot 평가 데이터셋",
                "created_by": "manual_annotation"
            },
            "in_distribution": [],
            "out_distribution": []
        }

    def add_in_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        source_doc: str = None,
        metadata: Dict[str, Any] = None
    ):
        """Append one in-distribution sample.

        Args:
            query: Question text.
            expected_answer: Ground-truth answer.
            category: Free-form category label.
            source_doc: Name of the document the answer comes from, if any.
            metadata: Extra fields; a falsy value is stored as ``{}``.
        """
        sample = {
            "query": query,
            "expected_answer": expected_answer,
            "category": category,
            "expected_type": "document",
            "source_doc": source_doc,
            "metadata": metadata or {}
        }
        self.dataset["in_distribution"].append(sample)

    def add_out_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        metadata: Dict[str, Any] = None
    ):
        """Append one out-of-distribution sample.

        Args:
            query: Question text.
            expected_answer: Ground-truth answer.
            category: Free-form category label.
            metadata: Extra fields; a falsy value is stored as ``{}``.
        """
        sample = {
            "query": query,
            "expected_answer": expected_answer,
            "category": category,
            "expected_type": "out_of_scope",
            "metadata": metadata or {}
        }
        self.dataset["out_distribution"].append(sample)

    def create_template_dataset(self):
        """Populate the dataset with hard-coded example samples.

        The samples are placeholders meant to be edited by hand afterwards.
        """
        print("πŸ“ ν…œν”Œλ¦Ώ 데이터셋 생성 쀑...")

        # In-distribution templates
        in_dist_templates = [
            {
                "query": "사업 μ œμ•ˆμ„œ 제좜 λ§ˆκ°μΌμ€ μ–Έμ œμΈκ°€μš”?",
                "expected_answer": "2024λ…„ 3μ›” 15μΌκΉŒμ§€μž…λ‹ˆλ‹€.",  # extracted from the actual document
                "category": "deadline",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "easy"}
            },
            {
                "query": "μ œμ•ˆ μš”μ²­μ„œμ˜ 제좜 μ„œλ₯˜λŠ” λ¬΄μ—‡μΈκ°€μš”?",
                "expected_answer": "κΈ°μˆ μ œμ•ˆμ„œ, κ°€κ²©μ œμ•ˆμ„œ, μ‚¬μ—…μžλ“±λ‘μ¦, νšŒμ‚¬μ†Œκ°œμ„œκ°€ ν•„μš”ν•©λ‹ˆλ‹€.",
                "category": "requirements",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "medium"}
            },
            {
                "query": "사업 μ˜ˆμ‚° 규λͺ¨λŠ” μ–Όλ§ˆμΈκ°€μš”?",
                "expected_answer": "총 5μ–΅μ›μž…λ‹ˆλ‹€.",
                "category": "budget",
                "source_doc": "RFP_2024_002.hwp",
                "metadata": {"difficulty": "easy"}
            },
        ]

        # Out-of-distribution templates
        out_dist_templates = [
            {
                "query": "ν•œκ΅­μ˜ μˆ˜λ„λŠ” μ–΄λ””μΈκ°€μš”?",
                "expected_answer": "μ„œμšΈμž…λ‹ˆλ‹€.",
                "category": "general_knowledge",
                "metadata": {"difficulty": "easy"}
            },
            {
                "query": "νŒŒμ΄μ¬μ—μ„œ λ¦¬μŠ€νŠΈμ™€ νŠœν”Œμ˜ μ°¨μ΄λŠ” λ¬΄μ—‡μΈκ°€μš”?",
                "expected_answer": "λ¦¬μŠ€νŠΈλŠ” κ°€λ³€(mutable)이고, νŠœν”Œμ€ λΆˆλ³€(immutable)μž…λ‹ˆλ‹€.",
                "category": "programming",
                "metadata": {"difficulty": "medium"}
            },
        ]

        # Add to the dataset
        for sample in in_dist_templates:
            self.add_in_distribution_sample(**sample)
        for sample in out_dist_templates:
            self.add_out_distribution_sample(**sample)

        print(f"βœ… ν…œν”Œλ¦Ώ 생성 μ™„λ£Œ")
        print(f" - In-Distribution: {len(in_dist_templates)}개")
        print(f" - Out-Distribution: {len(out_dist_templates)}개")
        print(f"\n⚠️ 이 ν…œν”Œλ¦Ώμ„ μˆ˜μ •ν•˜μ—¬ μ‹€μ œ 데이터λ₯Ό μ±„μ›Œμ£Όμ„Έμš”!")

    @staticmethod
    def _parse_metadata(raw):
        """Decode a CSV metadata cell into a Python object.

        A missing cell (``None``, which ``csv.DictReader`` uses for short
        rows) or a blank cell yields ``{}`` instead of raising.

        Raises:
            json.JSONDecodeError: if the cell is non-blank but not valid JSON.
        """
        if raw is None or not raw.strip():
            return {}
        return json.loads(raw)

    def load_from_csv(self, csv_path: str):
        """Load samples from a CSV file into the dataset.

        Required columns: ``query``, ``expected_answer``, ``category``.
        Optional columns: ``distribution``, ``source_doc``, ``metadata``
        (a JSON string).

        A row whose ``distribution`` is absent or blank is treated as
        ``in_distribution``; any other value that is not exactly
        ``in_distribution`` is stored as an out-of-distribution sample.

        Args:
            csv_path: Path of the CSV file to read.

        Raises:
            KeyError: if a required column is missing from the header.
            json.JSONDecodeError: if a non-blank metadata cell is not JSON.
        """
        print(f"πŸ“₯ CSV λ‘œλ“œ 쀑: {csv_path}")

        # 'utf-8-sig' strips a leading BOM if present (and reads plain UTF-8
        # unchanged); without it the first header would be '\ufeffdistribution'
        # and the column would never be found. newline='' is what the csv
        # module expects so quoted fields with embedded newlines parse right.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Blank or missing cell falls back to the documented default.
                distribution = (row.get('distribution') or '').strip() or 'in_distribution'
                metadata = self._parse_metadata(row.get('metadata'))

                if distribution == 'in_distribution':
                    self.add_in_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        source_doc=row.get('source_doc'),
                        metadata=metadata
                    )
                else:
                    self.add_out_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        metadata=metadata
                    )

        print(f"βœ… CSV λ‘œλ“œ μ™„λ£Œ")

    def save_json(self, output_path: str):
        """Write the dataset to ``output_path`` as indented UTF-8 JSON.

        Parent directories are created if they do not exist.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(self.dataset, f, ensure_ascii=False, indent=2)

        print(f"πŸ’Ύ μ €μž₯ μ™„λ£Œ: {output_path}")

    def save_csv_template(self, output_path: str):
        """Write a two-row example CSV intended for manual editing.

        The file has a header plus one in-distribution and one
        out-of-distribution example row. Parent directories are created if
        they do not exist.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'distribution', 'query', 'expected_answer',
                'category', 'source_doc', 'metadata'
            ])
            writer.writeheader()

            # In-distribution example
            writer.writerow({
                'distribution': 'in_distribution',
                'query': '사업 μ œμ•ˆμ„œ 제좜 λ§ˆκ°μΌμ€ μ–Έμ œμΈκ°€μš”?',
                'expected_answer': '2024λ…„ 3μ›” 15μΌκΉŒμ§€μž…λ‹ˆλ‹€.',
                'category': 'deadline',
                'source_doc': 'RFP_2024_001.hwp',
                'metadata': '{"difficulty": "easy"}'
            })

            # Out-of-distribution example
            writer.writerow({
                'distribution': 'out_distribution',
                'query': 'ν•œκ΅­μ˜ μˆ˜λ„λŠ” μ–΄λ””μΈκ°€μš”?',
                'expected_answer': 'μ„œμšΈμž…λ‹ˆλ‹€.',
                'category': 'general_knowledge',
                'source_doc': '',
                'metadata': '{"difficulty": "easy"}'
            })

        print(f"πŸ“„ CSV ν…œν”Œλ¦Ώ μ €μž₯: {output_path}")
        print(f" β†’ 이 νŒŒμΌμ„ μˆ˜μ •ν•˜μ—¬ μ‹€μ œ 데이터λ₯Ό μ±„μ›Œμ£Όμ„Έμš”!")

    def print_summary(self):
        """Print the per-split and total sample counts to stdout."""
        print("\n" + "="*60)
        print("데이터셋 μš”μ•½")
        print("="*60)
        print(f"In-Distribution: {len(self.dataset['in_distribution'])}개")
        print(f"Out-Distribution: {len(self.dataset['out_distribution'])}개")
        print(f"총 μƒ˜ν”Œ: {len(self.dataset['in_distribution']) + len(self.dataset['out_distribution'])}개")
        print("="*60 + "\n")
def main():
    """Command-line entry point.

    Modes (``--mode``):
        template: build the hard-coded example dataset, save it as JSON to
            ``--output`` and write an editable CSV template to
            ``--csv-template``.
        csv: load samples from ``--input`` and save them as JSON to
            ``--output``.

    In csv mode ``--input`` is mandatory; if it is missing the parser
    reports the error on stderr and the process exits with status 2.
    """
    parser = argparse.ArgumentParser(description='평가 데이터셋 생성')
    parser.add_argument('--mode', choices=['template', 'csv'], default='template',
                        help='생성 λͺ¨λ“œ: template (ν…œν”Œλ¦Ώ 생성) λ˜λŠ” csv (CSVμ—μ„œ λ‘œλ“œ)')
    parser.add_argument('--input', type=str, help='μž…λ ₯ CSV 파일 경둜')
    parser.add_argument('--output', type=str, default='data/eval_dataset.json',
                        help='좜λ ₯ JSON 파일 경둜')
    parser.add_argument('--csv-template', type=str, default='data/eval_template.csv',
                        help='CSV ν…œν”Œλ¦Ώ μ €μž₯ 경둜')
    args = parser.parse_args()

    # Validate up front, before any work is done. parser.error() prints the
    # usage line plus the message to stderr and exits with status 2, so a
    # calling shell script can detect the failure (a bare return exits 0).
    if args.mode == 'csv' and not args.input:
        parser.error("❌ CSV λͺ¨λ“œμ—μ„œλŠ” --input μ˜΅μ…˜μ΄ ν•„μš”ν•©λ‹ˆλ‹€.")

    creator = EvalDatasetCreator()

    if args.mode == 'template':
        print("πŸ“ ν…œν”Œλ¦Ώ λͺ¨λ“œ")
        creator.create_template_dataset()
        creator.save_json(args.output)
        creator.save_csv_template(args.csv_template)
    elif args.mode == 'csv':
        print("πŸ“₯ CSV λͺ¨λ“œ")
        creator.load_from_csv(args.input)
        creator.save_json(args.output)

    creator.print_summary()

    print("\nβœ… μ™„λ£Œ!")
    print(f"\nλ‹€μŒ 단계:")
    print(f"1. {args.csv_template} νŒŒμΌμ„ μ—΄μ–΄μ„œ μ‹€μ œ 데이터 μž‘μ„±")
    print(f"2. python create_eval_dataset.py --mode csv --input {args.csv_template} --output {args.output}")
    print(f"3. μƒμ„±λœ {args.output}을 μ‹€ν—˜μ— μ‚¬μš©")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()