gooookim committed on
Commit
9bf8481
·
verified ·
1 Parent(s): 48bb067

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -352
app.py CHANGED
@@ -4,30 +4,16 @@
4
  # HF Spaces Secrets μ„€μ •:
5
  # NAVER_CLIENT_ID = λ°œκΈ‰λ°›μ€ Client ID
6
  # NAVER_CLIENT_SECRET = λ°œκΈ‰λ°›μ€ Client Secret
7
- #
8
- # (선택) μž„λ² λ”© λͺ¨λΈ λ³€κ²½:
9
- # EMBEDDING_MODEL = sentence-transformers ν˜Έν™˜ λͺ¨λΈλͺ…
10
- # 예) jhgan/ko-sroberta-multitask (기본)
11
- #
12
- # 둜컬 μ‹€ν–‰ μ‹œ(선택):
13
- # export NAVER_CLIENT_ID="..."
14
- # export NAVER_CLIENT_SECRET="..."
15
- # export EMBEDDING_MODEL="jhgan/ko-sroberta-multitask"
16
 
17
  import os
18
  import html
19
  import re
20
  from datetime import datetime
21
- from typing import Dict, Any, List, Tuple, Optional
22
- from functools import lru_cache
23
 
24
  import requests
25
  import gradio as gr
26
 
27
- # μž„λ² λ”©
28
- import numpy as np
29
- from sentence_transformers import SentenceTransformer
30
-
31
 
32
  NAVER_NEWS_ENDPOINT = "https://openapi.naver.com/v1/search/news.json"
33
 
@@ -122,10 +108,9 @@ def render_results(data: Dict[str, Any], max_items: int = 10) -> str:
122
  origin = it.get("originallink", "")
123
  pub = _format_pubdate(it.get("pubDate", ""))
124
 
125
 - # ✅ ordered list 문법을 κΉ¨μ§€ μ•Šλ„λ‘ μ΄μŠ€μΌ€μ΄ν”„ 제거
126
  lines.append(f"{i}. **{title}**")
127
 
128
 - # ✅ ν•˜μœ„ ν•­λͺ©μ€ 4μΉΈ λ“€μ—¬μ“°κΈ°(ordered list ν•˜μœ„ 리슀트 인식 μ•ˆμ •ν™”)
129
  if pub:
130
  lines.append(f" - λ°œν–‰: {pub}")
131
  if origin:
@@ -141,299 +126,6 @@ def render_results(data: Dict[str, Any], max_items: int = 10) -> str:
141
  return "\n".join(lines).strip()
142
 
143
 
144
- # ─────────────────────────────────────────────────────────────────────────────
145
- # Sentence μž…λ ₯ -> (κ·œμΉ™/톡계) 후보 생성 -> (μž„λ² λ”©) ν‚€μ›Œλ“œ 선별/ν™•μž₯ -> 쿼리 생성
146
- # ─────────────────────────────────────────────────────────────────────────────
147
-
148
- EMBEDDING_MODEL_NAME = _get_env("EMBEDDING_MODEL") or "jhgan/ko-sroberta-multitask"
149
-
150
-
151
- @lru_cache(maxsize=1)
152
- def _get_embedder() -> SentenceTransformer:
153
- # HF Spacesμ—μ„œ 졜초 λ‘œλ“œ μ‹œκ°„μ΄ μžˆμ„ 수 μžˆμŠ΅λ‹ˆλ‹€.
154
- return SentenceTransformer(EMBEDDING_MODEL_NAME)
155
-
156
-
157
- # 간단 λΆˆμš©μ–΄(ν•„μš” μ‹œ ν™•μž₯)
158
- STOPWORDS = {
159
- "그리고", "λ˜λŠ”", "및", "κ΄€λ ¨", "λŒ€ν•œ", "μ—μ„œ", "으둜", "ν•˜λŠ”", "ν•©λ‹ˆλ‹€", "ν•΄μ£Όμ„Έμš”",
160
- "μ•Œλ €", "μ•Œλ €μ€˜", "λ‰΄μŠ€", "기사", "졜근", "μš”μ¦˜", "이번", "였늘", "μ–΄μ œ", "내일",
161
- "정리", "뢄석", "동ν–₯", "ν˜„ν™©", "이슈", "λ‚΄μš©", "정보", "보기", "보고", "μ‹Άμ–΄",
162
- "μ‹ΆμŠ΅λ‹ˆλ‹€", "ν•©λ‹ˆλ‹€", "ν•΄μ€˜", "ν•΄μ£Όμ„Έμš”", "μ–΄λ–»κ²Œ", "κ°€λŠ₯", "κ°€λŠ₯ν•œ", "ν˜•νƒœ",
163
- }
164
-
165
- # μ œμ™Έ μ˜λ„ 트리거(λ¬Έμž₯에 ν¬ν•¨λ˜λ©΄ μ œμ™Έμ–΄λ₯Ό κ°•ν™”)
166
- NEGATION_TRIGGERS = [
167
- "μ œμ™Έ", "λΉΌκ³ ", "빼쀘", "말고", "μ•„λ‹Œ", "μ›μΉ˜", "μ‹«", "배제", "제거",
168
- ]
169
-
170
- # κΈ°λ³Έ μ œμ™Έ 후보(λ¬Έμ„œ/홍보성 작음 λ°©μ§€ λͺ©μ : ν•„μš” μ‹œ μ‘°μ •)
171
- DEFAULT_EXCLUDE_CANDIDATES = [
172
- "λ³΄λ„μžλ£Œ", "홍보", "PR", "ν”„λ‘œλͺ¨μ…˜", "κ΄‘κ³ ", "ν˜‘μ°¬",
173
- ]
174
-
175
-
176
- # λ™μ˜μ–΄/ν‘œκΈ° ν™•μž₯(μž‘κ²Œ μ‹œμž‘ν•΄μ„œ μš΄μ˜ν•˜λ©΄μ„œ λŠ˜λ¦¬λŠ” 것을 ꢌμž₯)
177
- SYNONYM_GROUPS = [
178
- ["AI", "인곡지λŠ₯", "A.I."],
179
- ["LLM", "κ±°λŒ€μ–Έμ–΄λͺ¨λΈ", "λŒ€κ·œλͺ¨μ–Έμ–΄λͺ¨λΈ", "μƒμ„±ν˜•AI", "μƒμ„±ν˜• AI"],
180
- ["감사원", "감사원(BAI)", "Board of Audit and Inspection"],
181
- ["λ°©μœ„μ‚¬μ—…μ²­", "방사청", "DAPA"],
182
- ]
183
-
184
-
185
- def _tokenize_korean_like(text: str) -> List[str]:
186
- """
187
- MVP용 ν† ν¬λ‚˜μ΄μ €:
188
- - ν•œκΈ€/영문/숫자 연속 토큰을 μΆ”μΆœ
189
- - μ§€λ‚˜μΉ˜κ²Œ 짧은 토큰(1자)은 제거
190
- """
191
- if not text:
192
- return []
193
- tokens = re.findall(r"[κ°€-힣A-Za-z0-9]+", text)
194
- tokens = [t.strip() for t in tokens if len(t.strip()) >= 2]
195
- return tokens
196
-
197
-
198
- def _generate_ngrams(tokens: List[str], n: int) -> List[str]:
199
- if n <= 1:
200
- return tokens[:]
201
- out = []
202
- for i in range(len(tokens) - n + 1):
203
- out.append(" ".join(tokens[i:i+n]))
204
- return out
205
-
206
-
207
- def extract_candidates(sentence: str, max_candidates: int = 60) -> List[str]:
208
- """
209
- κ·œμΉ™/톡계 기반 후보 생성:
210
- - 토큰(2자 이상) + 2-gram을 ν›„λ³΄λ‘œ 생성
211
- - λ‹¨μˆœ λΉˆλ„ 기반 점수둜 μƒμœ„ ν›„λ³΄λ§Œ λ°˜ν™˜
212
- """
213
- tokens = _tokenize_korean_like(sentence)
214
- # λΆˆμš©μ–΄ 제거(토큰 λ‹¨μœ„)
215
- tokens = [t for t in tokens if t not in STOPWORDS]
216
-
217
- unigrams = tokens
218
- bigrams = _generate_ngrams(tokens, 2)
219
-
220
- # 톡계(λΉˆλ„) 기반 μŠ€μ½”μ–΄λ§: bigram에 μ•½κ°„ κ°€μ€‘μΉ˜
221
- freq: Dict[str, float] = {}
222
-
223
- for t in unigrams:
224
- freq[t] = freq.get(t, 0.0) + 1.0
225
- for bg in bigrams:
226
- freq[bg] = freq.get(bg, 0.0) + 1.5
227
-
228
- # λ„ˆλ¬΄ κΈ΄ ν›„λ³΄λŠ” μ œμ™Έ(검색식 과도 λ³΅μž‘ν™” λ°©μ§€)
229
- def _ok(c: str) -> bool:
230
- if len(c) > 25:
231
- return False
232
- # 숫자만으둜 된 ν›„λ³΄λŠ” μ œμ™Έ
233
- if re.fullmatch(r"\d+", c):
234
- return False
235
- return True
236
-
237
- ranked = sorted(
238
- [(c, s) for c, s in freq.items() if _ok(c)],
239
- key=lambda x: x[1],
240
- reverse=True,
241
- )
242
- return [c for c, _ in ranked[:max_candidates]]
243
-
244
-
245
- def _embed_texts(texts: List[str]) -> np.ndarray:
246
- model = _get_embedder()
247
- emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
248
- return np.asarray(emb, dtype=np.float32)
249
-
250
-
251
- def select_keywords_by_embedding(sentence: str, candidates: List[str], top_n: int = 10) -> List[str]:
252
- """
253
- μž„λ² λ”©μœΌλ‘œ 후보 ν‚€μ›Œλ“œ 선별:
254
- - μž…λ ₯ λ¬Έμž₯κ³Ό 후보(짧은 ꡬ/단어)λ₯Ό μž„λ² λ”© μœ μ‚¬λ„λ‘œ μ μˆ˜ν™”
255
- """
256
- if not sentence.strip() or not candidates:
257
- return []
258
-
259
- # 후보가 λ„ˆλ¬΄ 많으면 속도 μ €ν•˜ β†’ μƒν•œ
260
- candidates = candidates[:80]
261
-
262
- sent_emb = _embed_texts([sentence])[0]
263
- cand_emb = _embed_texts(candidates)
264
- sims = cand_emb @ sent_emb # normalize_embeddings=True μ΄λ―€λ‘œ 내적=cosine
265
-
266
- idx = np.argsort(sims)[::-1][:max(1, top_n)]
267
- selected = [candidates[i] for i in idx]
268
-
269
- # 쀑볡/포함관계 정리(짧은 토큰이 κΈ΄ 후보에 ν¬ν•¨λ˜λ©΄ κΈ΄ 후보 μš°μ„ )
270
- dedup: List[str] = []
271
- for s in selected:
272
- if any(s != x and s in x for x in selected):
273
- continue
274
- if s not in dedup:
275
- dedup.append(s)
276
- return dedup[:top_n]
277
-
278
-
279
- def detect_excludes(sentence: str) -> List[str]:
280
- """
281
- μ œμ™Έμ–΄ μΆ”μΆœ:
282
- - λ¬Έμž₯에 μ œμ™Έ μ˜λ„ νŠΈλ¦¬κ±°κ°€ 있으면 κΈ°λ³Έ μ œμ™Έ 후보λ₯Ό ν™œμ„±ν™”
283
- - λ¬Έμž₯ λ‚΄μ—μ„œ "X μ œμ™Έ/λΉΌκ³ /말고" νŒ¨ν„΄λ„ λ‹¨μˆœ μΆ”μΆœ
284
- """
285
- s = sentence.strip()
286
- if not s:
287
- return []
288
-
289
- excludes: List[str] = []
290
-
291
- # 1) μ œμ™Έ μ˜λ„ 감지 μ‹œ κΈ°λ³Έ μ œμ™Έμ–΄ μΆ”κ°€
292
- if any(t in s for t in NEGATION_TRIGGERS):
293
- excludes.extend(DEFAULT_EXCLUDE_CANDIDATES)
294
-
295
- # 2) "OO μ œμ™Έ", "OO λΉΌκ³ " λ“±μ˜ λ‹¨μˆœ νŒ¨ν„΄ μΆ”μΆœ
296
- # λ„ˆλ¬΄ 곡격적으둜 λ½‘μœΌλ©΄ μ˜€νƒμ΄ λŠ˜μ–΄ MVPμ—μ„œλŠ” 보수적으둜(2자 이상 토큰)
297
- for m in re.findall(r"([κ°€-힣A-Za-z0-9]{2,})\s*(μ œμ™Έ|λΉΌκ³ |말고|배제|제거)", s):
298
- token = m[0].strip()
299
- if token and token not in excludes:
300
- excludes.append(token)
301
-
302
- # 정리
303
- excludes = [e for e in excludes if e not in STOPWORDS]
304
- # 과도 ν™•μž₯ λ°©μ§€
305
- return excludes[:8]
306
-
307
-
308
- def expand_synonyms(keywords: List[str]) -> List[List[str]]:
309
- """
310
- ν‚€μ›Œλ“œκ°€ λ™μ˜μ–΄ κ·Έλ£Ή ν•­λͺ©κ³Ό 'μ™„μ „ 일치'ν•˜μ§€ μ•Šμ•„λ„,
311
- κ·Έλ£Ή ν•­λͺ©μ΄ ν‚€μ›Œλ“œ(문ꡬ) μ•ˆμ— ν¬ν•¨λ˜λ©΄ μΉ˜ν™˜ ν™•μž₯을 λ§Œλ“€μ–΄ OR κ·Έλ£Ή 후보λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
312
-
313
- 예)
314
- "AI 기본법" -> ["AI 기본법", "인곡지λŠ₯ 기본법", "A.I. 기본법"]
315
- """
316
- groups: List[List[str]] = []
317
-
318
- for k in keywords:
319
- k_str = (k or "").strip()
320
- if not k_str:
321
- groups.append([k_str])
322
- continue
323
-
324
- expanded = [k_str]
325
- matched = False
326
-
327
- for g in SYNONYM_GROUPS:
328
- for term in g:
329
- # λΆ€λΆ„ 포함 λ§€μΉ­(λŒ€μ†Œλ¬Έμž λ¬΄μ‹œ)
330
- if term.lower() in k_str.lower():
331
- matched = True
332
- for alt in g:
333
- # ν¬ν•¨λœ term 뢀뢄을 alt둜 μΉ˜ν™˜
334
- cand = re.sub(re.escape(term), alt, k_str, flags=re.IGNORECASE).strip()
335
- if cand and cand not in expanded:
336
- expanded.append(cand)
337
- break
338
- if matched:
339
- break # 첫 λ§€μΉ­ 그룹만 적용(ν™•μž₯ 폭발 λ°©μ§€)
340
-
341
- # 폭발 λ°©μ§€: μ΅œλŒ€ 3κ°œκΉŒμ§€λ§Œ
342
- groups.append(expanded[:3])
343
-
344
- # 쀑볡 κ·Έλ£Ή 병합(λŒ€μ†Œλ¬Έμž λ¬΄μ‹œ)
345
- merged: List[List[str]] = []
346
- seen = set()
347
- for g in groups:
348
- key = tuple(sorted([x.lower() for x in g]))
349
- if key in seen:
350
- continue
351
- seen.add(key)
352
- merged.append(g)
353
-
354
- return merged
355
-
356
-
357
- def _dedup_keywords_preserve_order(keywords: List[str]) -> List[str]:
358
- """
359
- 검색어 μ •κ·œν™” + 의미 쀑볡 제거:
360
- - 양끝 곡백 제거, λ‚΄λΆ€ 연속 곡백 1개둜 μΆ•μ†Œ
361
- - 동일(μ •κ·œν™” κΈ°μ€€) 쀑볡 제거
362
- - 짧은 토큰이 κΈ΄ 토큰(μ •κ·œν™” κΈ°μ€€)에 ν¬ν•¨λ˜λ©΄ 제거
363
- - μˆœμ„œ μœ μ§€
364
- """
365
- def norm(s: str) -> str:
366
- s = (s or "").strip()
367
- s = re.sub(r"\s+", " ", s) # 연속 곡백 정리
368
- return s
369
-
370
- # 1) μ •κ·œν™” + 동일 쀑볡 제거(μˆœμ„œ μœ μ§€)
371
- out: List[str] = []
372
- seen = set()
373
- normalized = [norm(k) for k in keywords if norm(k)]
374
-
375
- for k in normalized:
376
- if k in seen:
377
- continue
378
- seen.add(k)
379
- out.append(k)
380
-
381
- # 2) 포함관계 제거(짧은 토큰이 κΈ΄ 토큰에 ν¬ν•¨λ˜λ©΄ 제거)
382
- final: List[str] = []
383
- for k in out:
384
- if any(k != x and k in x for x in out):
385
- continue
386
- final.append(k)
387
-
388
- return final
389
-
390
-
391
- def build_queries(
392
- sentence: str,
393
- selected_keywords: List[str],
394
- excludes: List[str], # (ν˜Έν™˜ μœ μ§€: μΈμžλŠ” 남겨둠)
395
- max_queries: int = 6, # (ν˜Έν™˜ μœ μ§€: μΈμžλŠ” 남겨둠)
396
- ) -> List[str]:
397
- """
398
- 졜적 μ „λž΅:
399
- - Q1: 기본 AND 쿼리 1개
400
- - Q2: λ™μ˜μ–΄/ν‘œκΈ° μΉ˜ν™˜μ΄ λͺ…ν™•ν•  λ•Œλ§Œ 1개 생성
401
- - Q3 이상 μƒμ„±ν•˜μ§€ μ•ŠμŒ
402
- - μ œμ™Έ(-)λŠ” μ‚¬μš©ν•˜μ§€ μ•ŠμŒ
403
- """
404
- if not selected_keywords:
405
- selected_keywords = extract_candidates(sentence, max_candidates=10)[:4]
406
-
407
- selected_keywords = _dedup_keywords_preserve_order(selected_keywords) # gk
408
-
409
- # Q1: κΈ°λ³Έ AND
410
- q1 = " ".join(selected_keywords).strip()
411
- queries: List[str] = [q1] if q1 else []
412
-
413
- # Q2: λ™μ˜μ–΄/ν‘œκΈ° μΉ˜ν™˜μ΄ 'μ‹€μ œλ‘œ λ°œμƒ'ν•œ κ²½μš°μ—λ§Œ 1개 생성
414
- groups = expand_synonyms(selected_keywords)
415
-
416
- # μ–΄λ–€ ν‚€μ›Œλ“œλΌλ„ ν™•μž₯(μΉ˜ν™˜) 후보가 2개 이상 있으면 "λͺ…ν™•"ν•˜λ‹€κ³  보고 Q2 생성 μ‹œλ„
417
- has_clear_substitution = any(len(g) >= 2 for g in groups)
418
-
419
- if has_clear_substitution:
420
- # Q2λŠ” 각 κ·Έλ£Ήμ—μ„œ "λŒ€μ²΄ 후보"λ₯Ό ν•˜λ‚˜μ”© 골라 Q1κ³Ό λ‹€λ₯Έ 쑰합이 되게 λ§Œλ“¦
421
- combo = []
422
- for g in groups:
423
- # g[0]은 원문 μœ μ§€, g[1]이 있으면 μΉ˜ν™˜λœ 후보λ₯Ό μš°μ„  μ‚¬μš©
424
- combo.append(g[1] if len(g) >= 2 else g[0])
425
-
426
- combo = _dedup_keywords_preserve_order(combo) # gk
427
- q2 = " ".join(combo).strip()
428
-
429
- # Q2κ°€ Q1κ³Ό λ‹€λ₯΄κ³ , λΉ„μ–΄μžˆμ§€ μ•ŠμœΌλ©΄ μΆ”κ°€
430
- if q2 and (not queries or q2 != queries[0]):
431
- queries.append(q2)
432
-
433
- # μ΅œλŒ€ 2개(Q1, Q2)만 λ°˜ν™˜
434
- return queries[:2]
435
-
436
-
437
  def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
438
  """
439
  κ²°κ³Ό 쀑볡 제거:
@@ -457,60 +149,36 @@ def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
457
  return out
458
 
459
 
460
- def rerank_items_by_embedding(sentence: str, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
461
- """
462
- μž…λ ₯ λ¬Έμž₯κ³Ό (title+description)의 μž„λ² λ”© μœ μ‚¬λ„λ‘œ μž¬μ •λ ¬
463
- """
464
- if not sentence.strip() or not items:
465
- return items
466
-
467
- texts = []
468
- for it in items:
469
- title = _strip_tags(it.get("title", ""))
470
- desc = _strip_tags(it.get("description", ""))
471
- texts.append((title + " " + desc).strip())
472
-
473
- sent_emb = _embed_texts([sentence])[0]
474
- doc_emb = _embed_texts(texts)
475
- sims = doc_emb @ sent_emb
476
-
477
- order = np.argsort(sims)[::-1]
478
- reranked = [items[i] for i in order.tolist()]
479
- return reranked
480
-
481
-
482
  def aggregate_search(
483
  sentence: str,
484
  display: int,
485
  sort: str,
486
- ) -> Tuple[List[str], List[Dict[str, Any]]]:
487
  """
488
- λ¬Έμž₯ μž…λ ₯ -> (ν˜„μž¬λŠ”) μ‚¬μš©μž μž…λ ₯ λ¬Έμž₯을 κ·ΈλŒ€λ‘œ query둜 μ‚¬μš©ν•˜μ—¬ API 호좜
489
- λ°˜ν™˜: (μƒμ„±λœ 쿼리 λͺ©λ‘, μ΅œμ’… μ•„μ΄ν…œ λͺ©λ‘)
490
  """
491
-
492
 - # ✅ λ³€κ²½λœ 핡심: μ‚¬μš©μž μž…λ ₯ λ¬Έμž₯을 κ·ΈλŒ€λ‘œ query둜 μ‚¬μš©
493
  queries = [sentence]
494
 
495
  all_items: List[Dict[str, Any]] = []
496
- for q in queries:
497
- data = naver_news_search(query=q, display=int(display), sort=sort, start=1)
498
- all_items.extend(data.get("items", []))
499
 
500
- # 톡합/쀑볡 제거(단일 쿼리라도 μœ μ§€)
501
- merged = dedup_items(all_items)
 
 
502
 
503
- # μž„λ² λ”© μž¬λž­ν‚Ή(κΈ°μ‘΄ λ™μž‘ μœ μ§€)
504
- reranked = rerank_items_by_embedding(sentence, merged)
505
 
506
  # μ΅œμ’… 개수 μ ˆλ‹¨
507
- final_items = reranked[:display]
508
- return queries, final_items
509
 
510
 
511
  def render_results_from_items(items: List[Dict[str, Any]]) -> str:
512
  """
513
- 톡합/μž¬λž­ν‚Ήλœ items 리슀트λ₯Ό 동일 μŠ€νƒ€μΌλ‘œ 좜λ ₯
514
  """
515
  lines: List[str] = []
516
  lines.append(f"- μ΅œμ’… λ°˜ν™˜ 개수: {len(items)}건")
@@ -523,10 +191,8 @@ def render_results_from_items(items: List[Dict[str, Any]]) -> str:
523
  origin = it.get("originallink", "")
524
  pub = _format_pubdate(it.get("pubDate", ""))
525
 
526
 - # ✅ ordered list 문법 μœ μ§€
527
  lines.append(f"{i}. **{title}**")
528
 
529
 - # ✅ ν•˜μœ„ ν•­λͺ© 4μΉΈ λ“€μ—¬μ“°κΈ°
530
  if pub:
531
  lines.append(f" - λ°œν–‰: {pub}")
532
  if origin:
@@ -541,7 +207,6 @@ def render_results_from_items(items: List[Dict[str, Any]]) -> str:
541
  return "\n".join(lines).strip()
542
 
543
 
544
-
545
  def handle_search(
546
  user_query: str,
547
  chat_history: List[Dict[str, str]],
@@ -556,14 +221,23 @@ def handle_search(
556
  chat_history = chat_history + [{"role": "user", "content": q}]
557
 
558
  try:
559
- queries, items = aggregate_search(sentence=q, display=int(display), sort=sort)
 
 
 
560
 
561
- lines = []
 
 
 
562
  lines.append("")
 
 
563
  lines.append("API ν˜ΈμΆœμ— μ‚¬μš©λœ 검색어(query)λŠ” λ‹€μŒκ³Ό κ°™μŠ΅λ‹ˆλ‹€:")
564
  for i, qq in enumerate(queries, start=1):
565
  lines.append(f"- Q{i}: `{qq}`")
566
  lines.append("")
 
567
  lines.append(render_results_from_items(items))
568
 
569
  assistant_text = "\n".join(lines).strip()
 
4
  # HF Spaces Secrets μ„€μ •:
5
  # NAVER_CLIENT_ID = λ°œκΈ‰λ°›μ€ Client ID
6
  # NAVER_CLIENT_SECRET = λ°œκΈ‰λ°›μ€ Client Secret
 
 
 
 
 
 
 
 
 
7
 
8
  import os
9
  import html
10
  import re
11
  from datetime import datetime
12
+ from typing import Dict, Any, List, Tuple
 
13
 
14
  import requests
15
  import gradio as gr
16
 
 
 
 
 
17
 
18
  NAVER_NEWS_ENDPOINT = "https://openapi.naver.com/v1/search/news.json"
19
 
 
108
  origin = it.get("originallink", "")
109
  pub = _format_pubdate(it.get("pubDate", ""))
110
 
 
111
  lines.append(f"{i}. **{title}**")
112
 
113
+ # ν•˜μœ„ ν•­λͺ© 4μΉΈ λ“€μ—¬μ“°κΈ°
114
  if pub:
115
  lines.append(f" - λ°œν–‰: {pub}")
116
  if origin:
 
126
  return "\n".join(lines).strip()
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
130
  """
131
  κ²°κ³Ό 쀑볡 제거:
 
149
  return out
150
 
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def aggregate_search(
153
  sentence: str,
154
  display: int,
155
  sort: str,
156
+ ) -> Tuple[List[str], List[Dict[str, Any]], int]:
157
  """
158
+ μ‚¬μš©μž μž…λ ₯ λ¬Έμž₯을 κ·ΈλŒ€λ‘œ query둜 μ‚¬μš©ν•˜μ—¬ API 호좜
159
+ λ°˜ν™˜: (μ‚¬μš©λœ 쿼리 λͺ©λ‘, μ΅œμ’… μ•„μ΄ν…œ λͺ©λ‘, total)
160
  """
 
 
161
  queries = [sentence]
162
 
163
  all_items: List[Dict[str, Any]] = []
164
+ total: int = 0
 
 
165
 
166
+ # 단일 쿼리 호좜
167
+ data = naver_news_search(query=sentence, display=int(display), sort=sort, start=1)
168
+ total = int(data.get("total", 0) or 0)
169
+ all_items.extend(data.get("items", []))
170
 
171
+ # 쀑볡 제거(단일 쿼리라도 μœ μ§€)
172
+ merged = dedup_items(all_items)
173
 
174
  # μ΅œμ’… 개수 μ ˆλ‹¨
175
+ final_items = merged[:display]
176
+ return queries, final_items, total
177
 
178
 
179
  def render_results_from_items(items: List[Dict[str, Any]]) -> str:
180
  """
181
+ items 리슀트λ₯Ό 동일 μŠ€νƒ€μΌλ‘œ 좜λ ₯
182
  """
183
  lines: List[str] = []
184
  lines.append(f"- μ΅œμ’… λ°˜ν™˜ 개수: {len(items)}건")
 
191
  origin = it.get("originallink", "")
192
  pub = _format_pubdate(it.get("pubDate", ""))
193
 
 
194
  lines.append(f"{i}. **{title}**")
195
 
 
196
  if pub:
197
  lines.append(f" - λ°œν–‰: {pub}")
198
  if origin:
 
207
  return "\n".join(lines).strip()
208
 
209
 
 
210
  def handle_search(
211
  user_query: str,
212
  chat_history: List[Dict[str, str]],
 
221
  chat_history = chat_history + [{"role": "user", "content": q}]
222
 
223
  try:
224
+ queries, items, total = aggregate_search(sentence=q, display=int(display), sort=sort)
225
+
226
+ # total이 0μ΄κ±°λ‚˜ 없을 λ•ŒλŠ” items 개수둜 λŒ€μ²΄
227
+ total_to_show = total if total > 0 else len(items)
228
 
229
+ lines: List[str] = []
230
+
231
 + # ✅ μš”μ²­ 문ꡬ둜 λ³€κ²½
232
+ lines.append(f"\"{q}\"에 λŒ€ν•œ 검색 κ²°κ³ΌλŠ” {total_to_show}건 이며 λ‚΄μš©μ€ λ‹€μŒκ³Ό κ°™μŠ΅λ‹ˆλ‹€.")
233
  lines.append("")
234
+
235
+ # (κΈ°μ‘΄ 이λ ₯ 좜λ ₯ μœ μ§€)
236
  lines.append("API ν˜ΈμΆœμ— μ‚¬μš©λœ 검색어(query)λŠ” λ‹€μŒκ³Ό κ°™μŠ΅λ‹ˆλ‹€:")
237
  for i, qq in enumerate(queries, start=1):
238
  lines.append(f"- Q{i}: `{qq}`")
239
  lines.append("")
240
+
241
  lines.append(render_results_from_items(items))
242
 
243
  assistant_text = "\n".join(lines).strip()