Spaces:
Paused
Paused
"""AI Hub dataset parser module.

Each parser parses a single JSON file and returns a list of training records.

Record format:
    {
        "question": str,
        "answer": str,
        "source": str,
        "category": str,
        "metadata": dict,
    }
"""
| from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Any | |
def _load_json(filepath: Path) -> Any:
    """Read *filepath* and return its decoded JSON content.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    skipped so that BOM-prefixed files do not make ``json.load`` fail.

    Args:
        filepath: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM.
    with open(filepath, encoding="utf-8-sig") as f:
        return json.load(f)
class GukripParser:
    """Parser for one of the 71852 consulting-transcript sources.

    The answer is assembled from the lines of ``consulting_content`` that
    start with the agent prefix, and the question is read from
    ``instructions[0]["data"][0]["instruction"]``.
    """

    def parse(self, filepath: Path) -> list[dict]:
        """Load *filepath* and return every record parsed from it.

        A top-level JSON list is treated as a sequence of items whose records
        are concatenated in order; any other payload is parsed as one item.
        """
        payload = _load_json(filepath)
        if not isinstance(payload, list):
            return self._parse_item(payload)
        return [record for entry in payload for record in self._parse_item(entry)]

    def _parse_item(self, item: dict) -> list[dict]:
        """Turn one item into zero or one record.

        Returns an empty list when the item has no non-empty agent line, no
        instructions, no instruction data, or an empty first instruction.
        """
        # NOTE(review): this literal looks mis-encoded (possibly UTF-8 Korean
        # decoded with a Thai code page) - confirm against the upstream file.
        agent_prefix = "์๋ด์:"

        transcript: str = item.get("consulting_content", "")
        origin_id: str = item.get("source_id", "")
        consulted_on: str = item.get("consulting_date", "")
        topic: str = item.get("consulting_category", "")

        # Keep only what the agent said, dropping lines that are empty once
        # the prefix is removed.
        trimmed_lines = (raw.strip() for raw in transcript.split("\n"))
        utterances = [
            ln[len(agent_prefix):].strip()
            for ln in trimmed_lines
            if ln.startswith(agent_prefix)
        ]
        agent_turns = [text for text in utterances if text]
        if not agent_turns:
            return []
        answer = " ".join(agent_turns)

        # The question is the first instruction of the first instruction group.
        instruction_groups = item.get("instructions", [])
        if not instruction_groups:
            return []
        first_group_data = instruction_groups[0].get("data", [])
        if not first_group_data:
            return []
        question = first_group_data[0].get("instruction", "").strip()
        if not question:
            return []

        record = {
            "question": question,
            "answer": answer,
            "source": "71852_๊ตญ๋ฆฝ์์์๋ฌธํ์ ๋น",
            "category": topic,
            "metadata": {
                "source_id": origin_id,
                "consulting_date": consulted_on,
            },
        }
        return [record]
class GovQAParser:
    """Parse Q/A-formatted consulting items of dataset 71852 into records.

    ``consulting_content`` is split at the first answer marker (a newline
    followed by ``A:`` or ``A :``). The text after the marker is the answer
    and the text before it is searched for the question. Each non-empty
    ``instruction`` under ``instructions[0]["data"]`` that differs from the
    main question yields an additional record sharing the same answer,
    tagged with ``metadata["question_type"] == "auxiliary"``.
    """

    # Answer separator: "\nA :" or "\nA:".
    _A_SEP = re.compile(r"\nA\s*:")
    # Question marker at the very start of the text or after a newline; the
    # question runs up to the next blank line or the end of the block.
    _Q_BLOCK = re.compile(r"(?:^|\n)Q\s*:(.*?)(?=\n\n|\Z)", re.DOTALL)
    # Fallback title marker; captures the remainder of that line.
    # NOTE(review): this literal looks mis-encoded - confirm against upstream.
    _TITLE = re.compile(r"์ ๋ชฉ\s*:\s*(.+)")

    def parse(self, filepath: Path) -> list[dict]:
        """Load *filepath* and return every record parsed from it.

        A top-level JSON list is parsed item by item and the results are
        concatenated; any other payload is parsed as a single item.
        """
        data = _load_json(filepath)
        if isinstance(data, list):
            records = []
            for item in data:
                records.extend(self._parse_item(item))
            return records
        return self._parse_item(data)

    def _parse_item(self, item: dict) -> list[dict]:
        """Turn one item into a main record plus optional auxiliary records.

        Returns an empty list when no answer marker is found, the answer is
        empty, or no question text can be extracted.
        """
        content: str = item.get("consulting_content", "")
        category: str = item.get("consulting_category", "")
        base_meta = {
            "source_id": item.get("source_id", ""),
            "consulting_date": item.get("consulting_date", ""),
            "org": item.get("source", ""),
        }

        # Split once: everything after the first answer marker is the answer.
        parts = self._A_SEP.split(content, maxsplit=1)
        if len(parts) < 2:
            return []
        q_part, a_part = parts[0], parts[1].strip()
        if not a_part:
            return []

        question = self._extract_question(q_part)
        if not question:
            return []

        records = [self._make_record(question, a_part, category, dict(base_meta))]

        # Auxiliary questions reuse the same answer as the main record.
        instructions = item.get("instructions", [])
        if instructions:
            for instr_item in instructions[0].get("data", []):
                sub_q = instr_item.get("instruction", "").strip()
                if sub_q and sub_q != question:
                    aux_meta = {**base_meta, "question_type": "auxiliary"}
                    records.append(
                        self._make_record(sub_q, a_part, category, aux_meta)
                    )
        return records

    @staticmethod
    def _make_record(
        question: str, answer: str, category: str, metadata: dict
    ) -> dict:
        """Assemble one output record with this parser's source label."""
        return {
            "question": question,
            "answer": answer,
            "source": "71852_์ค์ํ์ ๊ธฐ๊ด",
            "category": category,
            "metadata": metadata,
        }

    @staticmethod
    def _extract_question(q_part: str) -> str:
        """Return the question text found in the pre-answer block.

        Declared as a static method so that ``self._extract_question(q_part)``
        passes exactly one argument. Tries, in order: the text after a
        ``Q:`` / ``Q :`` marker up to the next blank line, the rest of the
        line after the title marker, and finally the whole block stripped.
        """
        q_match = GovQAParser._Q_BLOCK.search(q_part)
        if q_match:
            return q_match.group(1).strip()
        title_match = GovQAParser._TITLE.search(q_part)
        if title_match:
            return title_match.group(1).strip()
        return q_part.strip()
class GovQALocalParser(GovQAParser):
    """Variant of ``GovQAParser`` that only changes the ``source`` label.

    Parsing is delegated to the parent class; every record it returns is
    then relabelled in place.
    """

    def _parse_item(self, item: dict) -> list[dict]:
        """Parse *item* with the parent logic and overwrite each ``source``."""
        relabelled = super()._parse_item(item)
        for record in relabelled:
            record.update({"source": "71852_์ง๋ฐฉํ์ ๊ธฐ๊ด"})
        return relabelled
class AdminLawParser:
    """Parser for the 71847 question/answer items.

    ``label["input"]`` becomes the question and ``label["output"]`` the
    answer. The ``source`` written to every record is configurable through
    the constructor.
    """

    def __init__(self, source_label: str = "71847_๊ฒฐ์ ๋ก"):
        """Remember the label written to each record's ``source`` field."""
        self.source_label = source_label

    def parse(self, filepath: Path) -> list[dict]:
        """Load *filepath* and return every record parsed from it.

        A top-level JSON list is parsed item by item and the results are
        concatenated; any other payload is parsed as a single item.
        """
        payload = _load_json(filepath)
        if not isinstance(payload, list):
            return self._parse_item(payload)
        return [record for entry in payload for record in self._parse_item(entry)]

    def _parse_item(self, item: dict) -> list[dict]:
        """Turn one item into zero or one record.

        Returns an empty list when either the question or the answer is
        empty after stripping whitespace.
        """
        qa_pair = item.get("label", {})
        question = qa_pair.get("input", "").strip()
        answer = qa_pair.get("output", "").strip()
        if not (question and answer):
            return []

        details = item.get("info", {})
        # "caseName" wins over "title", and "ministry" over "caseCode",
        # whenever the preferred key is present.
        if "caseName" in details:
            case_name = details["caseName"]
        else:
            case_name = details.get("title", "")
        if "ministry" in details:
            category = details["ministry"]
        else:
            category = details.get("caseCode", "")

        record = {
            "question": question,
            "answer": answer,
            "source": self.source_label,
            "category": category,
            "metadata": {
                "case_name": case_name,
                "law_class": details.get("lawClass", ""),
            },
        }
        return [record]