가이드 source_url 추가 + 헌재·PIPC 원문 deep-link 수정
Browse files
코드:
- GuideChunk에 source_url 필드 추가
- guides.sqlite 스키마/INSERT/SELECT에 source_url 반영
- retriever: 가이드 카드 metadata.url에 source_url 주입
- retriever 헌재 deep-link: /LSW/detcInfoP.do?detcSeq={ID}&mode=0
(기존: SDK detail_url이 OPEN API endpoint라 localhost 상대경로로 깨짐)
- retriever PIPC deep-link: /LSW/ppcInfoP.do?ppcSeq={ID}&mode=8
(기존: 봇 차단 우려로 url 빈 값 — 실제 ID로 200 응답 검증 완료)
데이터:
- guides.sqlite: source_url 컬럼 추가, 4개 가이드의 전체 청크(457건)에 PIPC 게시판 직링크 부여
- cases.sqlite: detail_url 현행 camelCase 포맷(?nttId=…&nttNo=…)으로 일관화
- guide/chunks/*.jsonl: source_url 필드 4개 jsonl 모두 추가
- data/guide/chunks/개인정보_질의응답_모음집(2025.12.).jsonl +0 -0
- data/guide/chunks/고정형 영상정보처리기기_설치_운영_안내서(2024.12).jsonl +0 -0
- data/guide/chunks/분야별_개인정보_보호_안내서(2024.12).jsonl +0 -0
- data/guide/chunks/소상공인을_위한_개인정보 보호_핸드북(2024.12).jsonl +0 -0
- data/guides.sqlite +2 -2
- data/guides_meta.json +1 -1
- src/kpaa/guides/index.py +9 -4
- src/kpaa/guides/models.py +1 -0
- src/kpaa/retrieval/retriever.py +16 -7
data/guide/chunks/개인정보_질의응답_모음집(2025.12.).jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/guide/chunks/고정형 영상정보처리기기_설치_운영_안내서(2024.12).jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/guide/chunks/분야별_개인정보_보호_안내서(2024.12).jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/guide/chunks/소상공인을_위한_개인정보 보호_핸드북(2024.12).jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/guides.sqlite
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28eb30381d62f57a2fef36c91ad24ced518bae84da78e0689158da2d32892157
|
| 3 |
+
size 4562944
|
data/guides_meta.json
CHANGED
|
@@ -23,6 +23,6 @@
|
|
| 23 |
"source_pdf": "소상공인을 위한 개인정보 보호 핸드북(2024.12).pdf",
|
| 24 |
"chunks_count": 41
|
| 25 |
},
|
| 26 |
-
"_built_at": "2026-05-06
|
| 27 |
"_total_chunks": 457
|
| 28 |
}
|
|
|
|
| 23 |
"source_pdf": "소상공인을 위한 개인정보 보호 핸드북(2024.12).pdf",
|
| 24 |
"chunks_count": 41
|
| 25 |
},
|
| 26 |
+
"_built_at": "2026-05-06 15:24:29",
|
| 27 |
"_total_chunks": 457
|
| 28 |
}
|
src/kpaa/guides/index.py
CHANGED
|
@@ -26,7 +26,8 @@ CREATE TABLE IF NOT EXISTS guides (
|
|
| 26 |
body TEXT,
|
| 27 |
pages TEXT,
|
| 28 |
source_pdf TEXT,
|
| 29 |
-
chunk_context TEXT
|
|
|
|
| 30 |
);
|
| 31 |
|
| 32 |
CREATE VIRTUAL TABLE IF NOT EXISTS guides_fts USING fts5(
|
|
@@ -99,12 +100,13 @@ def build_index(chunks: Iterable[GuideChunk], *, db_path: Path) -> None:
|
|
| 99 |
conn.execute(
|
| 100 |
"""INSERT INTO guides
|
| 101 |
(chunk_id, doc_id, doc_title, doc_date,
|
| 102 |
-
section, chunk_no, body, pages, source_pdf, chunk_context
|
| 103 |
-
|
|
|
|
| 104 |
(
|
| 105 |
c.chunk_id, c.doc_id, c.doc_title, c.doc_date,
|
| 106 |
c.section, c.chunk_no, c.body, c.pages, c.source_pdf,
|
| 107 |
-
c.chunk_context,
|
| 108 |
),
|
| 109 |
)
|
| 110 |
conn.execute(
|
|
@@ -154,6 +156,7 @@ def search(
|
|
| 154 |
body=r["body"], pages=r["pages"] or "",
|
| 155 |
source_pdf=r["source_pdf"],
|
| 156 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
|
|
|
| 157 |
)
|
| 158 |
for r in rows
|
| 159 |
]
|
|
@@ -188,6 +191,7 @@ def get_chunks(
|
|
| 188 |
body=r["body"], pages=r["pages"] or "",
|
| 189 |
source_pdf=r["source_pdf"],
|
| 190 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
|
|
|
| 191 |
)
|
| 192 |
for r in rows
|
| 193 |
}
|
|
@@ -223,6 +227,7 @@ def list_all_chunks(*, db_path: Path | None = None) -> list[GuideChunk]:
|
|
| 223 |
body=r["body"], pages=r["pages"] or "",
|
| 224 |
source_pdf=r["source_pdf"],
|
| 225 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
|
|
|
| 226 |
)
|
| 227 |
for r in rows
|
| 228 |
]
|
|
|
|
| 26 |
body TEXT,
|
| 27 |
pages TEXT,
|
| 28 |
source_pdf TEXT,
|
| 29 |
+
chunk_context TEXT,
|
| 30 |
+
source_url TEXT DEFAULT ''
|
| 31 |
);
|
| 32 |
|
| 33 |
CREATE VIRTUAL TABLE IF NOT EXISTS guides_fts USING fts5(
|
|
|
|
| 100 |
conn.execute(
|
| 101 |
"""INSERT INTO guides
|
| 102 |
(chunk_id, doc_id, doc_title, doc_date,
|
| 103 |
+
section, chunk_no, body, pages, source_pdf, chunk_context,
|
| 104 |
+
source_url)
|
| 105 |
+
VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
|
| 106 |
(
|
| 107 |
c.chunk_id, c.doc_id, c.doc_title, c.doc_date,
|
| 108 |
c.section, c.chunk_no, c.body, c.pages, c.source_pdf,
|
| 109 |
+
c.chunk_context, c.source_url,
|
| 110 |
),
|
| 111 |
)
|
| 112 |
conn.execute(
|
|
|
|
| 156 |
body=r["body"], pages=r["pages"] or "",
|
| 157 |
source_pdf=r["source_pdf"],
|
| 158 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
| 159 |
+
source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
|
| 160 |
)
|
| 161 |
for r in rows
|
| 162 |
]
|
|
|
|
| 191 |
body=r["body"], pages=r["pages"] or "",
|
| 192 |
source_pdf=r["source_pdf"],
|
| 193 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
| 194 |
+
source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
|
| 195 |
)
|
| 196 |
for r in rows
|
| 197 |
}
|
|
|
|
| 227 |
body=r["body"], pages=r["pages"] or "",
|
| 228 |
source_pdf=r["source_pdf"],
|
| 229 |
chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
|
| 230 |
+
source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
|
| 231 |
)
|
| 232 |
for r in rows
|
| 233 |
]
|
src/kpaa/guides/models.py
CHANGED
|
@@ -22,6 +22,7 @@ class GuideChunk(BaseModel):
|
|
| 22 |
pages: str = "" # 원본 PDF 페이지 범위 (예: "p.8" 또는 "p.8-9")
|
| 23 |
source_pdf: str # 원본 PDF 파일명
|
| 24 |
chunk_context: str = "" # Anthropic Contextual Retrieval prefix — 인덱싱 시 body 앞에 prepend (답변 출력은 body만)
|
|
|
|
| 25 |
|
| 26 |
def citation(self) -> str:
|
| 27 |
"""답변에 박을 인용 태그.
|
|
|
|
| 22 |
pages: str = "" # 원본 PDF 페이지 범위 (예: "p.8" 또는 "p.8-9")
|
| 23 |
source_pdf: str # 원본 PDF 파일명
|
| 24 |
chunk_context: str = "" # Anthropic Contextual Retrieval prefix — 인덱싱 시 body 앞에 prepend (답변 출력은 body만)
|
| 25 |
+
source_url: str = "" # 원본 가이드 게시물 URL (PIPC 자료실 bbsView.do 직링크) — UI "원문 가이드 열기" 버튼용
|
| 26 |
|
| 27 |
def citation(self) -> str:
|
| 28 |
"""답변에 박을 인용 태그.
|
src/kpaa/retrieval/retriever.py
CHANGED
|
@@ -412,7 +412,7 @@ async def _fetch_guides(
|
|
| 412 |
"doc_title": h.doc_title,
|
| 413 |
"pages": h.pages,
|
| 414 |
"source_pdf": h.source_pdf,
|
| 415 |
-
"url": "",
|
| 416 |
},
|
| 417 |
sort_priority=1, # case(0) < guide(1) < law(1) < pipc(2)...
|
| 418 |
recency_score=_recency_score(year),
|
|
@@ -565,10 +565,13 @@ async def _fetch_pipc(
|
|
| 565 |
"decision_no": body.decision_no,
|
| 566 |
"decision_date": body.decision_date,
|
| 567 |
"agency": body.agency,
|
| 568 |
-
# 법제처 사람용 페이지
|
| 569 |
-
#
|
| 570 |
-
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
| 572 |
**_fallback_meta(used_kw, queue),
|
| 573 |
},
|
| 574 |
sort_priority=2,
|
|
@@ -1140,8 +1143,14 @@ async def _fetch_constitutional(
|
|
| 1140 |
"decision_date": body.decision_date,
|
| 1141 |
"petitioner": body.petitioner,
|
| 1142 |
"respondent": body.respondent,
|
| 1143 |
-
|
| 1144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1145 |
**_fallback_meta(used_kw, queue),
|
| 1146 |
},
|
| 1147 |
sort_priority=3, # 판례·해석례 사이 (4=판례, 3=해석례)
|
|
|
|
| 412 |
"doc_title": h.doc_title,
|
| 413 |
"pages": h.pages,
|
| 414 |
"source_pdf": h.source_pdf,
|
| 415 |
+
"url": h.source_url or "",
|
| 416 |
},
|
| 417 |
sort_priority=1, # case(0) < guide(1) < law(1) < pipc(2)...
|
| 418 |
recency_score=_recency_score(year),
|
|
|
|
| 565 |
"decision_no": body.decision_no,
|
| 566 |
"decision_date": body.decision_date,
|
| 567 |
"agency": body.agency,
|
| 568 |
+
# 법제처 사람용 결정문 페이지 deep-link.
|
| 569 |
+
# 패턴: /LSW/ppcInfoP.do?ppcSeq={ID}&mode=8
|
| 570 |
+
"url": (
|
| 571 |
+
f"https://www.law.go.kr/LSW/ppcInfoP.do?ppcSeq={body.decision_id}&mode=8"
|
| 572 |
+
if body.decision_id
|
| 573 |
+
else ""
|
| 574 |
+
),
|
| 575 |
**_fallback_meta(used_kw, queue),
|
| 576 |
},
|
| 577 |
sort_priority=2,
|
|
|
|
| 1143 |
"decision_date": body.decision_date,
|
| 1144 |
"petitioner": body.petitioner,
|
| 1145 |
"respondent": body.respondent,
|
| 1146 |
+
# SDK의 hit.detail_url은 OPEN API endpoint(/DRF/lawService.do?...)
|
| 1147 |
+
# 라 브라우저용이 아니므로 사용 X. 공개 deep-link 사용.
|
| 1148 |
+
# 패턴: /LSW/detcInfoP.do?detcSeq={ID}&mode=0
|
| 1149 |
+
"url": (
|
| 1150 |
+
f"https://www.law.go.kr/LSW/detcInfoP.do?detcSeq={body.decision_id}&mode=0"
|
| 1151 |
+
if body.decision_id
|
| 1152 |
+
else ""
|
| 1153 |
+
),
|
| 1154 |
**_fallback_meta(used_kw, queue),
|
| 1155 |
},
|
| 1156 |
sort_priority=3, # 판례·해석례 사이 (4=판례, 3=해석례)
|