Spaces:
Sleeping
Sleeping
"""
Evaluation dataset creation tool.

Builds question-answer pairs from real RFP documents to produce an
evaluation dataset that has ground truth.

Usage:
    python create_eval_dataset.py --mode csv --input data/rag_chunks_final.csv --output data/eval_dataset.json

Note: --mode defaults to "template", which ignores --input; pass
--mode csv to load samples from a CSV file.
"""
import argparse
import csv
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
class EvalDatasetCreator:
    """Collect question/answer samples and export them as an evaluation dataset.

    Samples live in ``self.dataset`` under two lists: ``"in_distribution"``
    (tagged ``expected_type == "document"``) and ``"out_distribution"``
    (tagged ``expected_type == "out_of_scope"``). The dataset can be seeded
    with placeholder samples, loaded from a CSV file, and saved as JSON.
    """

    def __init__(self) -> None:
        """Start with empty sample lists and fixed descriptive metadata."""
        self.dataset: Dict[str, Any] = {
            "metadata": {
                "version": "1.0",
                "description": "RFPilot νκ° λ°μ΄ν°μ ",
                "created_by": "manual_annotation",
            },
            "in_distribution": [],
            "out_distribution": [],
        }

    def add_in_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        source_doc: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Append one in-distribution sample.

        Args:
            query: Question text.
            expected_answer: Ground-truth answer for the question.
            category: Free-form category label (e.g. ``"deadline"``).
            source_doc: Name of the document the answer comes from, if known.
            metadata: Extra attributes; a falsy value is stored as ``{}``.
        """
        self.dataset["in_distribution"].append(
            {
                "query": query,
                "expected_answer": expected_answer,
                "category": category,
                "expected_type": "document",
                "source_doc": source_doc,
                "metadata": metadata or {},
            }
        )

    def add_out_distribution_sample(
        self,
        query: str,
        expected_answer: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Append one out-of-distribution sample.

        Args:
            query: Question text.
            expected_answer: Reference answer for the question.
            category: Free-form category label (e.g. ``"programming"``).
            metadata: Extra attributes; a falsy value is stored as ``{}``.
        """
        self.dataset["out_distribution"].append(
            {
                "query": query,
                "expected_answer": expected_answer,
                "category": category,
                "expected_type": "out_of_scope",
                "metadata": metadata or {},
            }
        )

    def create_template_dataset(self) -> None:
        """Seed the dataset with placeholder samples meant to be edited by hand."""
        print("π ν νλ¦Ώ λ°μ΄ν°μ μμ± μ€...")

        # In-distribution placeholders; the answers are meant to be replaced
        # with text extracted from the real documents.
        in_dist_templates = [
            {
                "query": "μ¬μ μ μμ μ μΆ λ§κ°μΌμ μΈμ μΈκ°μ?",
                "expected_answer": "2024λ 3μ 15μΌκΉμ§μ λλ€.",
                "category": "deadline",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "easy"},
            },
            {
                "query": "μ μ μμ²μμ μ μΆ μλ₯λ 무μμΈκ°μ?",
                "expected_answer": "κΈ°μ μ μμ, κ°κ²©μ μμ, μ¬μ μλ±λ‘μ¦, νμ¬μκ°μκ° νμν©λλ€.",
                "category": "requirements",
                "source_doc": "RFP_2024_001.hwp",
                "metadata": {"difficulty": "medium"},
            },
            {
                "query": "μ¬μ μμ° κ·λͺ¨λ μΌλ§μΈκ°μ?",
                "expected_answer": "μ΄ 5μ΅μμ λλ€.",
                "category": "budget",
                "source_doc": "RFP_2024_002.hwp",
                "metadata": {"difficulty": "easy"},
            },
        ]

        # Out-of-distribution placeholders.
        out_dist_templates = [
            {
                "query": "νκ΅μ μλλ μ΄λμΈκ°μ?",
                "expected_answer": "μμΈμ λλ€.",
                "category": "general_knowledge",
                "metadata": {"difficulty": "easy"},
            },
            {
                "query": "νμ΄μ¬μμ 리μ€νΈμ ννμ μ°¨μ΄λ 무μμΈκ°μ?",
                "expected_answer": "리μ€νΈλ κ°λ³(mutable)μ΄κ³ , ννμ λΆλ³(immutable)μ λλ€.",
                "category": "programming",
                "metadata": {"difficulty": "medium"},
            },
        ]

        # Register every placeholder in the dataset.
        for sample in in_dist_templates:
            self.add_in_distribution_sample(**sample)
        for sample in out_dist_templates:
            self.add_out_distribution_sample(**sample)

        print("β ν νλ¦Ώ μμ± μλ£")
        print(f" - In-Distribution: {len(in_dist_templates)}κ°")
        print(f" - Out-Distribution: {len(out_dist_templates)}κ°")
        print("\nβ οΈ μ΄ ν νλ¦Ώμ μμ νμ¬ μ€μ λ°μ΄ν°λ₯Ό μ±μμ£ΌμΈμ!")

    @staticmethod
    def _parse_metadata(raw: Optional[str]) -> Any:
        """Decode a CSV ``metadata`` cell, treating blank or missing as ``{}``.

        ``csv.DictReader`` yields ``''`` for an empty cell and ``None`` for a
        cell missing from a short row; ``json.loads`` rejects both, so they
        are mapped to an empty dict here. Any other text is decoded as JSON
        and returned as-is.

        Raises:
            json.JSONDecodeError: If a non-blank cell is not valid JSON.
        """
        if raw is None or not raw.strip():
            return {}
        return json.loads(raw)

    def load_from_csv(self, csv_path: str) -> None:
        """Append the samples described by a CSV file to the dataset.

        Required columns: ``query``, ``expected_answer``, ``category``.
        Optional columns: ``distribution``, ``source_doc``, ``metadata``
        (a JSON string). A missing or blank ``distribution`` means
        ``in_distribution``; any other label that is not ``in_distribution``
        is filed as out-of-distribution.

        Args:
            csv_path: Path of the CSV file to read.

        Raises:
            KeyError: If a required column is absent.
            json.JSONDecodeError: If a non-blank ``metadata`` cell is not
                valid JSON.
        """
        print(f"π₯ CSV λ‘λ μ€: {csv_path}")

        # utf-8-sig strips the BOM that spreadsheet tools prepend; without it
        # the first header becomes '\ufeffdistribution' and the column is
        # silently ignored. newline='' is what the csv module expects.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                label = (row.get('distribution') or '').strip() or 'in_distribution'
                metadata = self._parse_metadata(row.get('metadata'))

                if label == 'in_distribution':
                    self.add_in_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        source_doc=row.get('source_doc'),
                        metadata=metadata,
                    )
                else:
                    self.add_out_distribution_sample(
                        query=row['query'],
                        expected_answer=row['expected_answer'],
                        category=row['category'],
                        metadata=metadata,
                    )

        print("β CSV λ‘λ μλ£")

    def save_json(self, output_path: str) -> None:
        """Write the dataset as indented UTF-8 JSON, creating parent directories.

        Args:
            output_path: Destination file path.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(self.dataset, f, ensure_ascii=False, indent=2)

        print(f"πΎ μ μ₯ μλ£: {output_path}")

    def save_csv_template(self, output_path: str) -> None:
        """Write a two-row example CSV for authoring samples by hand.

        The file has the columns ``load_from_csv`` reads and contains one
        in-distribution and one out-of-distribution example row.

        Args:
            output_path: Destination file path; parent directories are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'distribution', 'query', 'expected_answer',
                'category', 'source_doc', 'metadata',
            ])
            writer.writeheader()

            # In-distribution example row.
            writer.writerow({
                'distribution': 'in_distribution',
                'query': 'μ¬μ μ μμ μ μΆ λ§κ°μΌμ μΈμ μΈκ°μ?',
                'expected_answer': '2024λ 3μ 15μΌκΉμ§μ λλ€.',
                'category': 'deadline',
                'source_doc': 'RFP_2024_001.hwp',
                'metadata': '{"difficulty": "easy"}',
            })

            # Out-of-distribution example row.
            writer.writerow({
                'distribution': 'out_distribution',
                'query': 'νκ΅μ μλλ μ΄λμΈκ°μ?',
                'expected_answer': 'μμΈμ λλ€.',
                'category': 'general_knowledge',
                'source_doc': '',
                'metadata': '{"difficulty": "easy"}',
            })

        print(f"π CSV ν νλ¦Ώ μ μ₯: {output_path}")
        print(" β μ΄ νμΌμ μμ νμ¬ μ€μ λ°μ΄ν°λ₯Ό μ±μμ£ΌμΈμ!")

    def print_summary(self) -> None:
        """Print the number of samples in each split and the overall total."""
        in_count = len(self.dataset['in_distribution'])
        out_count = len(self.dataset['out_distribution'])

        print("\n" + "="*60)
        print("λ°μ΄ν°μ μμ½")
        print("="*60)
        print(f"In-Distribution: {in_count}κ°")
        print(f"Out-Distribution: {out_count}κ°")
        print(f"μ΄ μν: {in_count + out_count}κ°")
        print("="*60 + "\n")
def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface.

    In ``template`` mode (the default) a placeholder dataset is written as
    JSON together with a CSV template. In ``csv`` mode samples are loaded
    from ``--input`` and written as JSON.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when ``--mode csv`` is given without
        ``--input``.
    """
    import sys  # local import: the file does not import sys at module level

    parser = argparse.ArgumentParser(description='νκ° λ°μ΄ν°μ μμ±')
    parser.add_argument('--mode', choices=['template', 'csv'], default='template',
                        help='μμ± λͺ¨λ: template (ν νλ¦Ώ μμ±) λλ csv (CSVμμ λ‘λ)')
    parser.add_argument('--input', type=str, help='μ λ ₯ CSV νμΌ κ²½λ‘')
    parser.add_argument('--output', type=str, default='data/eval_dataset.json',
                        help='μΆλ ₯ JSON νμΌ κ²½λ‘')
    parser.add_argument('--csv-template', type=str, default='data/eval_template.csv',
                        help='CSV ν νλ¦Ώ μ μ₯ κ²½λ‘')
    args = parser.parse_args(argv)

    # Validate before doing any work, and report failure through the exit
    # status so shells and CI can detect it.
    if args.mode == 'csv' and not args.input:
        print("β CSV λͺ¨λμμλ --input μ΅μ μ΄ νμν©λλ€.", file=sys.stderr)
        return 1

    creator = EvalDatasetCreator()

    if args.mode == 'template':
        print("π ν νλ¦Ώ λͺ¨λ")
        creator.create_template_dataset()
        creator.save_json(args.output)
        creator.save_csv_template(args.csv_template)
    elif args.mode == 'csv':
        print("π₯ CSV λͺ¨λ")
        creator.load_from_csv(args.input)
        creator.save_json(args.output)

    creator.print_summary()

    print("\nβ μλ£!")
    print("\nλ€μ λ¨κ³:")
    print(f"1. {args.csv_template} νμΌμ μ΄μ΄μ μ€μ λ°μ΄ν° μμ±")
    print(f"2. python create_eval_dataset.py --mode csv --input {args.csv_template} --output {args.output}")
    print(f"3. μμ±λ {args.output}μ μ€νμ μ¬μ©")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())