scvcoder committed on
Commit
004a8f3
·
verified ·
1 Parent(s): c3ed3a0

가이드 source_url 추가 + 헌재·PIPC 원문 deep-link 수정

Browse files

코드:
- GuideChunk에 source_url 필드 추가
- guides.sqlite 스키마/INSERT/SELECT에 source_url 반영
- retriever: 가이드 카드 metadata.url에 source_url 주입
- retriever 헌재 deep-link: /LSW/detcInfoP.do?detcSeq={ID}&mode=0
(기존: SDK detail_url이 OPEN API endpoint라 localhost 상대경로로 깨짐)
- retriever PIPC deep-link: /LSW/ppcInfoP.do?ppcSeq={ID}&mode=8
(기존: 봇 차단 우려로 url 빈 값 — 실제 ID로 200 응답 검증 완료)

데이터:
- guides.sqlite: source_url 컬럼 추가, 4개 가이드 PIPC 게시판 직링크 부여 (457건)
- cases.sqlite: detail_url 현행 camelCase 포맷(?nttId=…&nttNo=…)으로 일관화
- guide/chunks/*.jsonl: source_url 필드 4개 jsonl 모두 추가

data/guide/chunks/개인정보_질의응답_모음집(2025.12.).jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
data/guide/chunks/고정형 영상정보처리기기_설치_운영_안내서(2024.12).jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
data/guide/chunks/분야별_개인정보_보호_안내서(2024.12).jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
data/guide/chunks/소상공인을_위한_개인정보 보호_핸드북(2024.12).jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
data/guides.sqlite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ef7969e411190926e1b9069aa9208abc01d993be83b2657da37def0382f23da
3
- size 4505600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28eb30381d62f57a2fef36c91ad24ced518bae84da78e0689158da2d32892157
3
+ size 4562944
data/guides_meta.json CHANGED
@@ -23,6 +23,6 @@
23
  "source_pdf": "소상공인을 위한 개인정보 보호 핸드북(2024.12).pdf",
24
  "chunks_count": 41
25
  },
26
- "_built_at": "2026-05-06 01:09:21",
27
  "_total_chunks": 457
28
  }
 
23
  "source_pdf": "소상공인을 위한 개인정보 보호 핸드북(2024.12).pdf",
24
  "chunks_count": 41
25
  },
26
+ "_built_at": "2026-05-06 15:24:29",
27
  "_total_chunks": 457
28
  }
src/kpaa/guides/index.py CHANGED
@@ -26,7 +26,8 @@ CREATE TABLE IF NOT EXISTS guides (
26
  body TEXT,
27
  pages TEXT,
28
  source_pdf TEXT,
29
- chunk_context TEXT
 
30
  );
31
 
32
  CREATE VIRTUAL TABLE IF NOT EXISTS guides_fts USING fts5(
@@ -99,12 +100,13 @@ def build_index(chunks: Iterable[GuideChunk], *, db_path: Path) -> None:
99
  conn.execute(
100
  """INSERT INTO guides
101
  (chunk_id, doc_id, doc_title, doc_date,
102
- section, chunk_no, body, pages, source_pdf, chunk_context)
103
- VALUES (?,?,?,?,?,?,?,?,?,?)""",
 
104
  (
105
  c.chunk_id, c.doc_id, c.doc_title, c.doc_date,
106
  c.section, c.chunk_no, c.body, c.pages, c.source_pdf,
107
- c.chunk_context,
108
  ),
109
  )
110
  conn.execute(
@@ -154,6 +156,7 @@ def search(
154
  body=r["body"], pages=r["pages"] or "",
155
  source_pdf=r["source_pdf"],
156
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
 
157
  )
158
  for r in rows
159
  ]
@@ -188,6 +191,7 @@ def get_chunks(
188
  body=r["body"], pages=r["pages"] or "",
189
  source_pdf=r["source_pdf"],
190
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
 
191
  )
192
  for r in rows
193
  }
@@ -223,6 +227,7 @@ def list_all_chunks(*, db_path: Path | None = None) -> list[GuideChunk]:
223
  body=r["body"], pages=r["pages"] or "",
224
  source_pdf=r["source_pdf"],
225
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
 
226
  )
227
  for r in rows
228
  ]
 
26
  body TEXT,
27
  pages TEXT,
28
  source_pdf TEXT,
29
+ chunk_context TEXT,
30
+ source_url TEXT DEFAULT ''
31
  );
32
 
33
  CREATE VIRTUAL TABLE IF NOT EXISTS guides_fts USING fts5(
 
100
  conn.execute(
101
  """INSERT INTO guides
102
  (chunk_id, doc_id, doc_title, doc_date,
103
+ section, chunk_no, body, pages, source_pdf, chunk_context,
104
+ source_url)
105
+ VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
106
  (
107
  c.chunk_id, c.doc_id, c.doc_title, c.doc_date,
108
  c.section, c.chunk_no, c.body, c.pages, c.source_pdf,
109
+ c.chunk_context, c.source_url,
110
  ),
111
  )
112
  conn.execute(
 
156
  body=r["body"], pages=r["pages"] or "",
157
  source_pdf=r["source_pdf"],
158
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
159
+ source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
160
  )
161
  for r in rows
162
  ]
 
191
  body=r["body"], pages=r["pages"] or "",
192
  source_pdf=r["source_pdf"],
193
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
194
+ source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
195
  )
196
  for r in rows
197
  }
 
227
  body=r["body"], pages=r["pages"] or "",
228
  source_pdf=r["source_pdf"],
229
  chunk_context=(r["chunk_context"] if "chunk_context" in r.keys() else "") or "",
230
+ source_url=(r["source_url"] if "source_url" in r.keys() else "") or "",
231
  )
232
  for r in rows
233
  ]
src/kpaa/guides/models.py CHANGED
@@ -22,6 +22,7 @@ class GuideChunk(BaseModel):
22
  pages: str = "" # 원본 PDF 페이지 범위 (예: "p.8" 또는 "p.8-9")
23
  source_pdf: str # 원본 PDF 파일명
24
  chunk_context: str = "" # Anthropic Contextual Retrieval prefix — 인덱싱 시 body 앞에 prepend (답변 출력은 body만)
 
25
 
26
  def citation(self) -> str:
27
  """답변에 박을 인용 태그.
 
22
  pages: str = "" # 원본 PDF 페이지 범위 (예: "p.8" 또는 "p.8-9")
23
  source_pdf: str # 원본 PDF 파일명
24
  chunk_context: str = "" # Anthropic Contextual Retrieval prefix — 인덱싱 시 body 앞에 prepend (답변 출력은 body만)
25
+ source_url: str = "" # 원본 가이드 게시물 URL (PIPC 자료실 bbsView.do 직링크) — UI "원문 가이드 열기" 버튼용
26
 
27
  def citation(self) -> str:
28
  """답변에 박을 인용 태그.
src/kpaa/retrieval/retriever.py CHANGED
@@ -412,7 +412,7 @@ async def _fetch_guides(
412
  "doc_title": h.doc_title,
413
  "pages": h.pages,
414
  "source_pdf": h.source_pdf,
415
- "url": "",
416
  },
417
  sort_priority=1, # case(0) < guide(1) < law(1) < pipc(2)...
418
  recency_score=_recency_score(year),
@@ -565,10 +565,13 @@ async def _fetch_pipc(
565
  "decision_no": body.decision_no,
566
  "decision_date": body.decision_date,
567
  "agency": body.agency,
568
- # 법제처 사람용 페이지(`/LSW/ppcInfoP.do?ppcSeq=...&mode=8`)
569
- # *간헐적 봇 차단* 페이지를 반환하는 사례가 있어 url 부착
570
- # 보류. 안정 URL 확인 후 채울 예정.
571
- "url": "",
 
 
 
572
  **_fallback_meta(used_kw, queue),
573
  },
574
  sort_priority=2,
@@ -1140,8 +1143,14 @@ async def _fetch_constitutional(
1140
  "decision_date": body.decision_date,
1141
  "petitioner": body.petitioner,
1142
  "respondent": body.respondent,
1143
- "url": hit.detail_url
1144
- or f"https://www.law.go.kr/detcInfoP.do?detcSeq={body.decision_id}",
 
 
 
 
 
 
1145
  **_fallback_meta(used_kw, queue),
1146
  },
1147
  sort_priority=3, # 판례·해석례 사이 (4=판례, 3=해석례)
 
412
  "doc_title": h.doc_title,
413
  "pages": h.pages,
414
  "source_pdf": h.source_pdf,
415
+ "url": h.source_url or "",
416
  },
417
  sort_priority=1, # case(0) < guide(1) < law(1) < pipc(2)...
418
  recency_score=_recency_score(year),
 
565
  "decision_no": body.decision_no,
566
  "decision_date": body.decision_date,
567
  "agency": body.agency,
568
+ # 법제처 사람용 결정문 페이지 deep-link.
569
+ # 패턴: /LSW/ppcInfoP.do?ppcSeq={ID}&mode=8
570
+ "url": (
571
+ f"https://www.law.go.kr/LSW/ppcInfoP.do?ppcSeq={body.decision_id}&mode=8"
572
+ if body.decision_id
573
+ else ""
574
+ ),
575
  **_fallback_meta(used_kw, queue),
576
  },
577
  sort_priority=2,
 
1143
  "decision_date": body.decision_date,
1144
  "petitioner": body.petitioner,
1145
  "respondent": body.respondent,
1146
+ # SDK의 hit.detail_url은 OPEN API endpoint(/DRF/lawService.do?...)
1147
+ # 라 브라우저용이 아니므로 사용 X. 공개 deep-link 사용.
1148
+ # 패턴: /LSW/detcInfoP.do?detcSeq={ID}&mode=0
1149
+ "url": (
1150
+ f"https://www.law.go.kr/LSW/detcInfoP.do?detcSeq={body.decision_id}&mode=0"
1151
+ if body.decision_id
1152
+ else ""
1153
+ ),
1154
  **_fallback_meta(used_kw, queue),
1155
  },
1156
  sort_priority=3, # 판례·해석례 사이 (4=판례, 3=해석례)