# NOTE(review): removed non-source residue left by a web file viewer
# (page header, "File size" line, commit hash, and line-number gutter).
"""
νκ° λ°μ΄ν°μ
μμ± λꡬ
μ€μ RFP λ¬Έμμμ μ§λ¬Έ-λ΅λ³ μμ λ§λ€μ΄
Ground Truthκ° μλ νκ° λ°μ΄ν°μ
μ μμ±ν©λλ€.
μ¬μ©λ²:
python create_eval_dataset.py --input data/rag_chunks_final.csv --output data/eval_dataset.json
"""
import argparse
import csv
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
class EvalDatasetCreator:
    """Builder for a ground-truth evaluation dataset.

    Samples are kept in two groups:
      * ``in_distribution``  -- questions answerable from the indexed RFP
        documents (``expected_type`` == "document").
      * ``out_distribution`` -- out-of-scope questions the system should
        deflect (``expected_type`` == "out_of_scope").

    NOTE(review): the Korean user-facing strings in this class were mojibake
    in the reviewed copy (UTF-8 rendered through a wrong single-byte codec);
    they have been decoded back to readable Korean.
    """

    def __init__(self):
        # Dataset skeleton; samples are appended by the add_* methods.
        self.dataset = {
            "metadata": {
                "version": "1.0",
                "description": "RFPilot 평가 데이터셋",
                "created_by": "manual_annotation"
            },
            "in_distribution": [],
            "out_distribution": []
        }

    def add_in_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        source_doc: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """Append an In-Distribution (answerable) sample.

        Args:
            query: Question text.
            expected_answer: Ground-truth answer text.
            category: Sample category, e.g. "deadline" or "budget".
            source_doc: Filename of the source document, if known.
            metadata: Optional extra annotations, e.g. {"difficulty": "easy"}.
        """
        sample = {
            "query": query,
            "expected_answer": expected_answer,
            "category": category,
            "expected_type": "document",
            "source_doc": source_doc,
            "metadata": metadata or {}
        }
        self.dataset["in_distribution"].append(sample)

    def add_out_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """Append an Out-of-Distribution (out-of-scope) sample.

        Args:
            query: Question text.
            expected_answer: Reference answer text.
            category: Sample category, e.g. "general_knowledge".
            metadata: Optional extra annotations.
        """
        sample = {
            "query": query,
            "expected_answer": expected_answer,
            "category": category,
            "expected_type": "out_of_scope",
            "metadata": metadata or {}
        }
        self.dataset["out_distribution"].append(sample)

    def create_template_dataset(self):
        """Fill the dataset with placeholder samples meant for manual editing."""
        print("📝 템플릿 데이터셋 생성 중...")

        # In-Distribution templates. The answers are placeholders and are
        # meant to be replaced with values extracted from real documents.
        in_dist_templates = [
            {
                "query": "사업 제안서 제출 마감일은 언제인가요?",
                "expected_answer": "2024년 3월 15일까지입니다.",  # extracted from the actual document
                "category": "deadline",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "easy"}
            },
            {
                "query": "제안요청서의 제출 서류는 무엇인가요?",
                "expected_answer": "기술제안서, 가격제안서, 사업자등록증, 회사소개서가 필요합니다.",
                "category": "requirements",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "medium"}
            },
            {
                "query": "사업 예산 규모는 얼마인가요?",
                "expected_answer": "총 5억원입니다.",
                "category": "budget",
                "source_doc": "RFP_2024_002.hwp",
                "metadata": {"difficulty": "easy"}
            },
        ]

        # Out-of-Distribution templates.
        out_dist_templates = [
            {
                "query": "한국의 수도는 어디인가요?",
                "expected_answer": "서울입니다.",
                "category": "general_knowledge",
                "metadata": {"difficulty": "easy"}
            },
            {
                "query": "파이썬에서 리스트와 튜플의 차이는 무엇인가요?",
                "expected_answer": "리스트는 가변(mutable)이고, 튜플은 불변(immutable)입니다.",
                "category": "programming",
                "metadata": {"difficulty": "medium"}
            },
        ]

        # Register the templates in the dataset.
        for sample in in_dist_templates:
            self.add_in_distribution_sample(**sample)
        for sample in out_dist_templates:
            self.add_out_distribution_sample(**sample)

        print("✅ 템플릿 생성 완료")
        print(f"   - In-Distribution: {len(in_dist_templates)}개")
        print(f"   - Out-Distribution: {len(out_dist_templates)}개")
        print("\n⚠️ 이 템플릿을 수정하여 실제 데이터를 채워주세요!")

    def load_from_csv(self, csv_path: str):
        """Load samples from a CSV file.

        Expected columns: distribution, query, expected_answer, category,
        source_doc, metadata (JSON string). Any row whose ``distribution``
        is not "in_distribution" is treated as out-of-distribution.
        """
        print(f"📥 CSV 로드 중: {csv_path}")

        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                distribution = row.get('distribution', 'in_distribution')
                # An empty metadata cell would crash json.loads(''); fall back to {}.
                metadata = json.loads(row.get('metadata') or '{}')
                if distribution == 'in_distribution':
                    self.add_in_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        # Normalize an empty source_doc cell to None.
                        source_doc=row.get('source_doc') or None,
                        metadata=metadata
                    )
                else:
                    self.add_out_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        metadata=metadata
                    )

        print("✅ CSV 로드 완료")

    def save_json(self, output_path: str):
        """Write the dataset to *output_path* as UTF-8 JSON (dirs created as needed)."""
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Korean text human-readable.
            json.dump(self.dataset, f, ensure_ascii=False, indent=2)

        print(f"💾 저장 완료: {output_path}")

    def save_csv_template(self, output_path: str):
        """Write an example CSV (one row per distribution) for manual annotation."""
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'distribution', 'query', 'expected_answer',
                'category', 'source_doc', 'metadata'
            ])
            writer.writeheader()

            # In-Distribution example row.
            writer.writerow({
                'distribution': 'in_distribution',
                'query': '사업 제안서 제출 마감일은 언제인가요?',
                'expected_answer': '2024년 3월 15일까지입니다.',
                'category': 'deadline',
                'source_doc': 'RFP_2024_001.hwp',
                'metadata': '{"difficulty": "easy"}'
            })

            # Out-of-Distribution example row.
            writer.writerow({
                'distribution': 'out_distribution',
                'query': '한국의 수도는 어디인가요?',
                'expected_answer': '서울입니다.',
                'category': 'general_knowledge',
                'source_doc': '',
                'metadata': '{"difficulty": "easy"}'
            })

        print(f"📄 CSV 템플릿 저장: {output_path}")
        print("   → 이 파일을 수정하여 실제 데이터를 채워주세요!")

    def print_summary(self):
        """Print per-group and total sample counts."""
        print("\n" + "=" * 60)
        print("데이터셋 요약")
        print("=" * 60)
        print(f"In-Distribution: {len(self.dataset['in_distribution'])}개")
        print(f"Out-Distribution: {len(self.dataset['out_distribution'])}개")
        print(f"총 샘플: {len(self.dataset['in_distribution']) + len(self.dataset['out_distribution'])}개")
        print("=" * 60 + "\n")
def main():
    """CLI entry point: build a template dataset or load one from a CSV file.

    NOTE(review): the Korean CLI strings were mojibake in the reviewed copy
    and have been decoded back to readable Korean.
    """
    parser = argparse.ArgumentParser(description='평가 데이터셋 생성')
    parser.add_argument('--mode', choices=['template', 'csv'], default='template',
                        help='생성 모드: template (템플릿 생성) 또는 csv (CSV에서 로드)')
    parser.add_argument('--input', type=str, help='입력 CSV 파일 경로')
    parser.add_argument('--output', type=str, default='data/eval_dataset.json',
                        help='출력 JSON 파일 경로')
    parser.add_argument('--csv-template', type=str, default='data/eval_template.csv',
                        help='CSV 템플릿 저장 경로')
    args = parser.parse_args()

    creator = EvalDatasetCreator()

    if args.mode == 'template':
        print("📝 템플릿 모드")
        creator.create_template_dataset()
        creator.save_json(args.output)
        creator.save_csv_template(args.csv_template)
    elif args.mode == 'csv':
        if not args.input:
            # argparse cannot express "--input required only in csv mode",
            # so validate it here and bail out early.
            print("❌ CSV 모드에서는 --input 옵션이 필요합니다.")
            return
        print("📥 CSV 모드")
        creator.load_from_csv(args.input)
        creator.save_json(args.output)

    creator.print_summary()

    print("\n✅ 완료!")
    print("\n다음 단계:")
    print(f"1. {args.csv_template} 파일을 열어서 실제 데이터 작성")
    print(f"2. python create_eval_dataset.py --mode csv --input {args.csv_template} --output {args.output}")
    print(f"3. 생성된 {args.output}을 실험에 사용")
if __name__ == "__main__":
main() |