Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,30 +4,16 @@
|
|
| 4 |
# HF Spaces Secrets 설정:
|
| 5 |
# NAVER_CLIENT_ID = 발급받은 Client ID
|
| 6 |
# NAVER_CLIENT_SECRET = 발급받은 Client Secret
|
| 7 |
-
#
|
| 8 |
-
# (선택) 임베딩 모델 변경:
|
| 9 |
-
# EMBEDDING_MODEL = sentence-transformers 호환 모델명
|
| 10 |
-
# 예) jhgan/ko-sroberta-multitask (기본)
|
| 11 |
-
#
|
| 12 |
-
# 로컬 실행 시(선택):
|
| 13 |
-
# export NAVER_CLIENT_ID="..."
|
| 14 |
-
# export NAVER_CLIENT_SECRET="..."
|
| 15 |
-
# export EMBEDDING_MODEL="jhgan/ko-sroberta-multitask"
|
| 16 |
|
| 17 |
import os
|
| 18 |
import html
|
| 19 |
import re
|
| 20 |
from datetime import datetime
|
| 21 |
-
from typing import Dict, Any, List, Tuple
|
| 22 |
-
from functools import lru_cache
|
| 23 |
|
| 24 |
import requests
|
| 25 |
import gradio as gr
|
| 26 |
|
| 27 |
-
# μλ² λ©
|
| 28 |
-
import numpy as np
|
| 29 |
-
from sentence_transformers import SentenceTransformer
|
| 30 |
-
|
| 31 |
|
| 32 |
NAVER_NEWS_ENDPOINT = "https://openapi.naver.com/v1/search/news.json"
|
| 33 |
|
|
@@ -122,10 +108,9 @@ def render_results(data: Dict[str, Any], max_items: int = 10) -> str:
|
|
| 122 |
origin = it.get("originallink", "")
|
| 123 |
pub = _format_pubdate(it.get("pubDate", ""))
|
| 124 |
|
| 125 |
-
# β
ordered list λ¬Έλ²μ κΉ¨μ§ μλλ‘ μ΄μ€μΌμ΄ν μ κ±°
|
| 126 |
lines.append(f"{i}. **{title}**")
|
| 127 |
|
| 128 |
-
#
|
| 129 |
if pub:
|
| 130 |
lines.append(f" - λ°ν: {pub}")
|
| 131 |
if origin:
|
|
@@ -141,299 +126,6 @@ def render_results(data: Dict[str, Any], max_items: int = 10) -> str:
|
|
| 141 |
return "\n".join(lines).strip()
|
| 142 |
|
| 143 |
|
| 144 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 145 |
-
# Sentence μ
λ ₯ -> (κ·μΉ/ν΅κ³) ν보 μμ± -> (μλ² λ©) ν€μλ μ λ³/νμ₯ -> 쿼리 μμ±
|
| 146 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 147 |
-
|
| 148 |
-
EMBEDDING_MODEL_NAME = _get_env("EMBEDDING_MODEL") or "jhgan/ko-sroberta-multitask"
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
@lru_cache(maxsize=1)
|
| 152 |
-
def _get_embedder() -> SentenceTransformer:
|
| 153 |
-
# HF Spacesμμ μ΅μ΄ λ‘λ μκ°μ΄ μμ μ μμ΅λλ€.
|
| 154 |
-
return SentenceTransformer(EMBEDDING_MODEL_NAME)
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
# κ°λ¨ λΆμ©μ΄(νμ μ νμ₯)
|
| 158 |
-
STOPWORDS = {
|
| 159 |
-
"κ·Έλ¦¬κ³ ", "λλ", "λ°", "κ΄λ ¨", "λν", "μμ", "μΌλ‘", "νλ", "ν©λλ€", "ν΄μ£ΌμΈμ",
|
| 160 |
-
"μλ €", "μλ €μ€", "λ΄μ€", "κΈ°μ¬", "μ΅κ·Ό", "μμ¦", "μ΄λ²", "μ€λ", "μ΄μ ", "λ΄μΌ",
|
| 161 |
-
"μ 리", "λΆμ", "λν₯", "νν©", "μ΄μ", "λ΄μ©", "μ 보", "보기", "λ³΄κ³ ", "μΆμ΄",
|
| 162 |
-
"μΆμ΅λλ€", "ν©λλ€", "ν΄μ€", "ν΄μ£ΌμΈμ", "μ΄λ»κ²", "κ°λ₯", "κ°λ₯ν", "νν",
|
| 163 |
-
}
|
| 164 |
-
|
| 165 |
-
# μ μΈ μλ νΈλ¦¬κ±°(λ¬Έμ₯μ ν¬ν¨λλ©΄ μ μΈμ΄λ₯Ό κ°ν)
|
| 166 |
-
NEGATION_TRIGGERS = [
|
| 167 |
-
"μ μΈ", "λΉΌκ³ ", "λΉΌμ€", "λ§κ³ ", "μλ", "μμΉ", "μ«", "λ°°μ ", "μ κ±°",
|
| 168 |
-
]
|
| 169 |
-
|
| 170 |
-
# κΈ°λ³Έ μ μΈ ν보(λ¬Έμ/νλ³΄μ± μ‘μ λ°©μ§ λͺ©μ : νμ μ μ‘°μ )
|
| 171 |
-
DEFAULT_EXCLUDE_CANDIDATES = [
|
| 172 |
-
"보λμλ£", "ν보", "PR", "νλ‘λͺ¨μ
", "κ΄κ³ ", "νμ°¬",
|
| 173 |
-
]
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# λμμ΄/νκΈ° νμ₯(μκ² μμν΄μ μ΄μνλ©΄μ λ리λ κ²μ κΆμ₯)
|
| 177 |
-
SYNONYM_GROUPS = [
|
| 178 |
-
["AI", "μΈκ³΅μ§λ₯", "A.I."],
|
| 179 |
-
["LLM", "κ±°λμΈμ΄λͺ¨λΈ", "λκ·λͺ¨μΈμ΄λͺ¨λΈ", "μμ±νAI", "μμ±ν AI"],
|
| 180 |
-
["κ°μ¬μ", "κ°μ¬μ(BAI)", "Board of Audit and Inspection"],
|
| 181 |
-
["λ°©μμ¬μ
μ²", "λ°©μ¬μ²", "DAPA"],
|
| 182 |
-
]
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
def _tokenize_korean_like(text: str) -> List[str]:
|
| 186 |
-
"""
|
| 187 |
-
MVPμ© ν ν¬λμ΄μ :
|
| 188 |
-
- νκΈ/μλ¬Έ/μ«μ μ°μ ν ν°μ μΆμΆ
|
| 189 |
-
- μ§λμΉκ² μ§§μ ν ν°(1μ)μ μ κ±°
|
| 190 |
-
"""
|
| 191 |
-
if not text:
|
| 192 |
-
return []
|
| 193 |
-
tokens = re.findall(r"[가-힣A-Za-z0-9]+", text)
|
| 194 |
-
tokens = [t.strip() for t in tokens if len(t.strip()) >= 2]
|
| 195 |
-
return tokens
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
def _generate_ngrams(tokens: List[str], n: int) -> List[str]:
|
| 199 |
-
if n <= 1:
|
| 200 |
-
return tokens[:]
|
| 201 |
-
out = []
|
| 202 |
-
for i in range(len(tokens) - n + 1):
|
| 203 |
-
out.append(" ".join(tokens[i:i+n]))
|
| 204 |
-
return out
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
def extract_candidates(sentence: str, max_candidates: int = 60) -> List[str]:
|
| 208 |
-
"""
|
| 209 |
-
κ·μΉ/ν΅κ³ κΈ°λ° ν보 μμ±:
|
| 210 |
-
- ν ν°(2μ μ΄μ) + 2-gramμ νλ³΄λ‘ μμ±
|
| 211 |
-
- λ¨μ λΉλ κΈ°λ° μ μλ‘ μμ νλ³΄λ§ λ°ν
|
| 212 |
-
"""
|
| 213 |
-
tokens = _tokenize_korean_like(sentence)
|
| 214 |
-
# λΆμ©μ΄ μ κ±°(ν ν° λ¨μ)
|
| 215 |
-
tokens = [t for t in tokens if t not in STOPWORDS]
|
| 216 |
-
|
| 217 |
-
unigrams = tokens
|
| 218 |
-
bigrams = _generate_ngrams(tokens, 2)
|
| 219 |
-
|
| 220 |
-
# ν΅κ³(λΉλ) κΈ°λ° μ€μ½μ΄λ§: bigramμ μ½κ° κ°μ€μΉ
|
| 221 |
-
freq: Dict[str, float] = {}
|
| 222 |
-
|
| 223 |
-
for t in unigrams:
|
| 224 |
-
freq[t] = freq.get(t, 0.0) + 1.0
|
| 225 |
-
for bg in bigrams:
|
| 226 |
-
freq[bg] = freq.get(bg, 0.0) + 1.5
|
| 227 |
-
|
| 228 |
-
# λ무 κΈ΄ ν보λ μ μΈ(κ²μμ κ³Όλ 볡μ‘ν λ°©μ§)
|
| 229 |
-
def _ok(c: str) -> bool:
|
| 230 |
-
if len(c) > 25:
|
| 231 |
-
return False
|
| 232 |
-
# μ«μλ§μΌλ‘ λ ν보λ μ μΈ
|
| 233 |
-
if re.fullmatch(r"\d+", c):
|
| 234 |
-
return False
|
| 235 |
-
return True
|
| 236 |
-
|
| 237 |
-
ranked = sorted(
|
| 238 |
-
[(c, s) for c, s in freq.items() if _ok(c)],
|
| 239 |
-
key=lambda x: x[1],
|
| 240 |
-
reverse=True,
|
| 241 |
-
)
|
| 242 |
-
return [c for c, _ in ranked[:max_candidates]]
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
def _embed_texts(texts: List[str]) -> np.ndarray:
|
| 246 |
-
model = _get_embedder()
|
| 247 |
-
emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
|
| 248 |
-
return np.asarray(emb, dtype=np.float32)
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
def select_keywords_by_embedding(sentence: str, candidates: List[str], top_n: int = 10) -> List[str]:
|
| 252 |
-
"""
|
| 253 |
-
μλ² λ©μΌλ‘ ν보 ν€μλ μ λ³:
|
| 254 |
-
- μ
λ ₯ λ¬Έμ₯κ³Ό ν보(μ§§μ ꡬ/λ¨μ΄)λ₯Ό μλ² λ© μ μ¬λλ‘ μ μν
|
| 255 |
-
"""
|
| 256 |
-
if not sentence.strip() or not candidates:
|
| 257 |
-
return []
|
| 258 |
-
|
| 259 |
-
# νλ³΄κ° λ무 λ§μΌλ©΄ μλ μ ν β μν
|
| 260 |
-
candidates = candidates[:80]
|
| 261 |
-
|
| 262 |
-
sent_emb = _embed_texts([sentence])[0]
|
| 263 |
-
cand_emb = _embed_texts(candidates)
|
| 264 |
-
sims = cand_emb @ sent_emb # normalize_embeddings=True μ΄λ―λ‘ λ΄μ =cosine
|
| 265 |
-
|
| 266 |
-
idx = np.argsort(sims)[::-1][:max(1, top_n)]
|
| 267 |
-
selected = [candidates[i] for i in idx]
|
| 268 |
-
|
| 269 |
-
# μ€λ³΅/ν¬ν¨κ΄κ³ μ 리(μ§§μ ν ν°μ΄ κΈ΄ ν보μ ν¬ν¨λλ©΄ κΈ΄ ν보 μ°μ )
|
| 270 |
-
dedup: List[str] = []
|
| 271 |
-
for s in selected:
|
| 272 |
-
if any(s != x and s in x for x in selected):
|
| 273 |
-
continue
|
| 274 |
-
if s not in dedup:
|
| 275 |
-
dedup.append(s)
|
| 276 |
-
return dedup[:top_n]
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
def detect_excludes(sentence: str) -> List[str]:
|
| 280 |
-
"""
|
| 281 |
-
μ μΈμ΄ μΆμΆ:
|
| 282 |
-
- λ¬Έμ₯μ μ μΈ μλ νΈλ¦¬κ±°κ° μμΌλ©΄ κΈ°λ³Έ μ μΈ ν보λ₯Ό νμ±ν
|
| 283 |
-
- λ¬Έμ₯ λ΄μμ "X μ μΈ/λΉΌκ³ /λ§κ³ " ν¨ν΄λ λ¨μ μΆμΆ
|
| 284 |
-
"""
|
| 285 |
-
s = sentence.strip()
|
| 286 |
-
if not s:
|
| 287 |
-
return []
|
| 288 |
-
|
| 289 |
-
excludes: List[str] = []
|
| 290 |
-
|
| 291 |
-
# 1) μ μΈ μλ κ°μ§ μ κΈ°λ³Έ μ μΈμ΄ μΆκ°
|
| 292 |
-
if any(t in s for t in NEGATION_TRIGGERS):
|
| 293 |
-
excludes.extend(DEFAULT_EXCLUDE_CANDIDATES)
|
| 294 |
-
|
| 295 |
-
# 2) "OO μ μΈ", "OO λΉΌκ³ " λ±μ λ¨μ ν¨ν΄ μΆμΆ
|
| 296 |
-
# λ무 곡격μ μΌλ‘ λ½μΌλ©΄ μ€νμ΄ λμ΄ MVPμμλ 보μμ μΌλ‘(2μ μ΄μ ν ν°)
|
| 297 |
-
for m in re.findall(r"([κ°-ν£A-Za-z0-9]{2,})\s*(μ μΈ|λΉΌκ³ |λ§κ³ |λ°°μ |μ κ±°)", s):
|
| 298 |
-
token = m[0].strip()
|
| 299 |
-
if token and token not in excludes:
|
| 300 |
-
excludes.append(token)
|
| 301 |
-
|
| 302 |
-
# μ 리
|
| 303 |
-
excludes = [e for e in excludes if e not in STOPWORDS]
|
| 304 |
-
# κ³Όλ νμ₯ λ°©μ§
|
| 305 |
-
return excludes[:8]
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
def expand_synonyms(keywords: List[str]) -> List[List[str]]:
|
| 309 |
-
"""
|
| 310 |
-
ν€μλκ° λμμ΄ κ·Έλ£Ή νλͺ©κ³Ό 'μμ μΌμΉ'νμ§ μμλ,
|
| 311 |
-
κ·Έλ£Ή νλͺ©μ΄ ν€μλ(문ꡬ) μμ ν¬ν¨λλ©΄ μΉν νμ₯μ λ§λ€μ΄ OR κ·Έλ£Ή ν보λ₯Ό μμ±ν©λλ€.
|
| 312 |
-
|
| 313 |
-
μ)
|
| 314 |
-
"AI κΈ°λ³Έλ²" -> ["AI κΈ°λ³Έλ²", "μΈκ³΅μ§λ₯ κΈ°λ³Έλ²", "A.I. κΈ°λ³Έλ²"]
|
| 315 |
-
"""
|
| 316 |
-
groups: List[List[str]] = []
|
| 317 |
-
|
| 318 |
-
for k in keywords:
|
| 319 |
-
k_str = (k or "").strip()
|
| 320 |
-
if not k_str:
|
| 321 |
-
groups.append([k_str])
|
| 322 |
-
continue
|
| 323 |
-
|
| 324 |
-
expanded = [k_str]
|
| 325 |
-
matched = False
|
| 326 |
-
|
| 327 |
-
for g in SYNONYM_GROUPS:
|
| 328 |
-
for term in g:
|
| 329 |
-
# λΆλΆ ν¬ν¨ λ§€μΉ(λμλ¬Έμ 무μ)
|
| 330 |
-
if term.lower() in k_str.lower():
|
| 331 |
-
matched = True
|
| 332 |
-
for alt in g:
|
| 333 |
-
# ν¬ν¨λ term λΆλΆμ altλ‘ μΉν
|
| 334 |
-
cand = re.sub(re.escape(term), alt, k_str, flags=re.IGNORECASE).strip()
|
| 335 |
-
if cand and cand not in expanded:
|
| 336 |
-
expanded.append(cand)
|
| 337 |
-
break
|
| 338 |
-
if matched:
|
| 339 |
-
break # 첫 λ§€μΉ κ·Έλ£Ήλ§ μ μ©(νμ₯ νλ° λ°©μ§)
|
| 340 |
-
|
| 341 |
-
# νλ° λ°©μ§: μ΅λ 3κ°κΉμ§λ§
|
| 342 |
-
groups.append(expanded[:3])
|
| 343 |
-
|
| 344 |
-
# μ€λ³΅ κ·Έλ£Ή λ³ν©(λμλ¬Έμ 무μ)
|
| 345 |
-
merged: List[List[str]] = []
|
| 346 |
-
seen = set()
|
| 347 |
-
for g in groups:
|
| 348 |
-
key = tuple(sorted([x.lower() for x in g]))
|
| 349 |
-
if key in seen:
|
| 350 |
-
continue
|
| 351 |
-
seen.add(key)
|
| 352 |
-
merged.append(g)
|
| 353 |
-
|
| 354 |
-
return merged
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
def _dedup_keywords_preserve_order(keywords: List[str]) -> List[str]:
|
| 358 |
-
"""
|
| 359 |
-
κ²μμ΄ μ κ·ν + μλ―Έ μ€λ³΅ μ κ±°:
|
| 360 |
-
- μλ 곡백 μ κ±°, λ΄λΆ μ°μ 곡백 1κ°λ‘ μΆμ
|
| 361 |
-
- λμΌ(μ κ·ν κΈ°μ€) μ€λ³΅ μ κ±°
|
| 362 |
-
- μ§§μ ν ν°μ΄ κΈ΄ ν ν°(μ κ·ν κΈ°μ€)μ ν¬ν¨λλ©΄ μ κ±°
|
| 363 |
-
- μμ μ μ§
|
| 364 |
-
"""
|
| 365 |
-
def norm(s: str) -> str:
|
| 366 |
-
s = (s or "").strip()
|
| 367 |
-
s = re.sub(r"\s+", " ", s) # μ°μ 곡백 μ 리
|
| 368 |
-
return s
|
| 369 |
-
|
| 370 |
-
# 1) μ κ·ν + λμΌ μ€λ³΅ μ κ±°(μμ μ μ§)
|
| 371 |
-
out: List[str] = []
|
| 372 |
-
seen = set()
|
| 373 |
-
normalized = [norm(k) for k in keywords if norm(k)]
|
| 374 |
-
|
| 375 |
-
for k in normalized:
|
| 376 |
-
if k in seen:
|
| 377 |
-
continue
|
| 378 |
-
seen.add(k)
|
| 379 |
-
out.append(k)
|
| 380 |
-
|
| 381 |
-
# 2) ν¬ν¨κ΄κ³ μ κ±°(μ§§μ ν ν°μ΄ κΈ΄ ν ν°μ ν¬ν¨λλ©΄ μ κ±°)
|
| 382 |
-
final: List[str] = []
|
| 383 |
-
for k in out:
|
| 384 |
-
if any(k != x and k in x for x in out):
|
| 385 |
-
continue
|
| 386 |
-
final.append(k)
|
| 387 |
-
|
| 388 |
-
return final
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
def build_queries(
|
| 392 |
-
sentence: str,
|
| 393 |
-
selected_keywords: List[str],
|
| 394 |
-
excludes: List[str], # (νΈν μ μ§: μΈμλ λ¨κ²¨λ )
|
| 395 |
-
max_queries: int = 6, # (νΈν μ μ§: μΈμλ λ¨κ²¨λ )
|
| 396 |
-
) -> List[str]:
|
| 397 |
-
"""
|
| 398 |
-
μ΅μ μ λ΅:
|
| 399 |
-
- Q1: κΈ°λ³Έ AND 쿼리 1κ°
|
| 400 |
-
- Q2: λμμ΄/νκΈ° μΉνμ΄ λͺ
νν λλ§ 1κ° μμ±
|
| 401 |
-
- Q3 μ΄μ μμ±νμ§ μμ
|
| 402 |
-
- μ μΈ(-)λ μ¬μ©νμ§ μμ
|
| 403 |
-
"""
|
| 404 |
-
if not selected_keywords:
|
| 405 |
-
selected_keywords = extract_candidates(sentence, max_candidates=10)[:4]
|
| 406 |
-
|
| 407 |
-
selected_keywords = _dedup_keywords_preserve_order(selected_keywords) # gk
|
| 408 |
-
|
| 409 |
-
# Q1: κΈ°λ³Έ AND
|
| 410 |
-
q1 = " ".join(selected_keywords).strip()
|
| 411 |
-
queries: List[str] = [q1] if q1 else []
|
| 412 |
-
|
| 413 |
-
# Q2: λμμ΄/νκΈ° μΉνμ΄ 'μ€μ λ‘ λ°μ'ν κ²½μ°μλ§ 1κ° μμ±
|
| 414 |
-
groups = expand_synonyms(selected_keywords)
|
| 415 |
-
|
| 416 |
-
# μ΄λ€ ν€μλλΌλ νμ₯(μΉν) νλ³΄κ° 2κ° μ΄μ μμΌλ©΄ "λͺ
ν"νλ€κ³ λ³΄κ³ Q2 μμ± μλ
|
| 417 |
-
has_clear_substitution = any(len(g) >= 2 for g in groups)
|
| 418 |
-
|
| 419 |
-
if has_clear_substitution:
|
| 420 |
-
# Q2λ κ° κ·Έλ£Ήμμ "λ체 ν보"λ₯Ό νλμ© κ³¨λΌ Q1κ³Ό λ€λ₯Έ μ‘°ν©μ΄ λκ² λ§λ¦
|
| 421 |
-
combo = []
|
| 422 |
-
for g in groups:
|
| 423 |
-
# g[0]μ μλ¬Έ μ μ§, g[1]μ΄ μμΌλ©΄ μΉνλ ν보λ₯Ό μ°μ μ¬μ©
|
| 424 |
-
combo.append(g[1] if len(g) >= 2 else g[0])
|
| 425 |
-
|
| 426 |
-
combo = _dedup_keywords_preserve_order(combo) # gk
|
| 427 |
-
q2 = " ".join(combo).strip()
|
| 428 |
-
|
| 429 |
-
# Q2κ° Q1κ³Ό λ€λ₯΄κ³ , λΉμ΄μμ§ μμΌλ©΄ μΆκ°
|
| 430 |
-
if q2 and (not queries or q2 != queries[0]):
|
| 431 |
-
queries.append(q2)
|
| 432 |
-
|
| 433 |
-
# μ΅λ 2κ°(Q1, Q2)λ§ λ°ν
|
| 434 |
-
return queries[:2]
|
| 435 |
-
|
| 436 |
-
|
| 437 |
def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 438 |
"""
|
| 439 |
κ²°κ³Ό μ€λ³΅ μ κ±°:
|
|
@@ -457,60 +149,36 @@ def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
| 457 |
return out
|
| 458 |
|
| 459 |
|
| 460 |
-
def rerank_items_by_embedding(sentence: str, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 461 |
-
"""
|
| 462 |
-
μ
λ ₯ λ¬Έμ₯κ³Ό (title+description)μ μλ² λ© μ μ¬λλ‘ μ¬μ λ ¬
|
| 463 |
-
"""
|
| 464 |
-
if not sentence.strip() or not items:
|
| 465 |
-
return items
|
| 466 |
-
|
| 467 |
-
texts = []
|
| 468 |
-
for it in items:
|
| 469 |
-
title = _strip_tags(it.get("title", ""))
|
| 470 |
-
desc = _strip_tags(it.get("description", ""))
|
| 471 |
-
texts.append((title + " " + desc).strip())
|
| 472 |
-
|
| 473 |
-
sent_emb = _embed_texts([sentence])[0]
|
| 474 |
-
doc_emb = _embed_texts(texts)
|
| 475 |
-
sims = doc_emb @ sent_emb
|
| 476 |
-
|
| 477 |
-
order = np.argsort(sims)[::-1]
|
| 478 |
-
reranked = [items[i] for i in order.tolist()]
|
| 479 |
-
return reranked
|
| 480 |
-
|
| 481 |
-
|
| 482 |
def aggregate_search(
|
| 483 |
sentence: str,
|
| 484 |
display: int,
|
| 485 |
sort: str,
|
| 486 |
-
) -> Tuple[List[str], List[Dict[str, Any]]]:
|
| 487 |
"""
|
| 488 |
-
|
| 489 |
-
λ°ν: (
|
| 490 |
"""
|
| 491 |
-
|
| 492 |
-
# β
λ³κ²½λ ν΅μ¬: μ¬μ©μ μ
λ ₯ λ¬Έμ₯μ κ·Έλλ‘ queryλ‘ μ¬μ©
|
| 493 |
queries = [sentence]
|
| 494 |
|
| 495 |
all_items: List[Dict[str, Any]] = []
|
| 496 |
-
|
| 497 |
-
data = naver_news_search(query=q, display=int(display), sort=sort, start=1)
|
| 498 |
-
all_items.extend(data.get("items", []))
|
| 499 |
|
| 500 |
-
#
|
| 501 |
-
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
#
|
| 504 |
-
|
| 505 |
|
| 506 |
# μ΅μ’
κ°μ μ λ¨
|
| 507 |
-
final_items =
|
| 508 |
-
return queries, final_items
|
| 509 |
|
| 510 |
|
| 511 |
def render_results_from_items(items: List[Dict[str, Any]]) -> str:
|
| 512 |
"""
|
| 513 |
-
|
| 514 |
"""
|
| 515 |
lines: List[str] = []
|
| 516 |
lines.append(f"- μ΅μ’
λ°ν κ°μ: {len(items)}건")
|
|
@@ -523,10 +191,8 @@ def render_results_from_items(items: List[Dict[str, Any]]) -> str:
|
|
| 523 |
origin = it.get("originallink", "")
|
| 524 |
pub = _format_pubdate(it.get("pubDate", ""))
|
| 525 |
|
| 526 |
-
# β
ordered list λ¬Έλ² μ μ§
|
| 527 |
lines.append(f"{i}. **{title}**")
|
| 528 |
|
| 529 |
-
# β
νμ νλͺ© 4μΉΈ λ€μ¬μ°κΈ°
|
| 530 |
if pub:
|
| 531 |
lines.append(f" - λ°ν: {pub}")
|
| 532 |
if origin:
|
|
@@ -541,7 +207,6 @@ def render_results_from_items(items: List[Dict[str, Any]]) -> str:
|
|
| 541 |
return "\n".join(lines).strip()
|
| 542 |
|
| 543 |
|
| 544 |
-
|
| 545 |
def handle_search(
|
| 546 |
user_query: str,
|
| 547 |
chat_history: List[Dict[str, str]],
|
|
@@ -556,14 +221,23 @@ def handle_search(
|
|
| 556 |
chat_history = chat_history + [{"role": "user", "content": q}]
|
| 557 |
|
| 558 |
try:
|
| 559 |
-
queries, items = aggregate_search(sentence=q, display=int(display), sort=sort)
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
-
lines = []
|
|
|
|
|
|
|
|
|
|
| 562 |
lines.append("")
|
|
|
|
|
|
|
| 563 |
lines.append("API νΈμΆμ μ¬μ©λ κ²μμ΄(query)λ λ€μκ³Ό κ°μ΅λλ€:")
|
| 564 |
for i, qq in enumerate(queries, start=1):
|
| 565 |
lines.append(f"- Q{i}: `{qq}`")
|
| 566 |
lines.append("")
|
|
|
|
| 567 |
lines.append(render_results_from_items(items))
|
| 568 |
|
| 569 |
assistant_text = "\n".join(lines).strip()
|
|
|
|
| 4 |
# HF Spaces Secrets 설정:
|
| 5 |
# NAVER_CLIENT_ID = 발급받은 Client ID
|
| 6 |
# NAVER_CLIENT_SECRET = 발급받은 Client Secret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
import os
|
| 9 |
import html
|
| 10 |
import re
|
| 11 |
from datetime import datetime
|
| 12 |
+
from typing import Dict, Any, List, Tuple
|
|
|
|
| 13 |
|
| 14 |
import requests
|
| 15 |
import gradio as gr
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
NAVER_NEWS_ENDPOINT = "https://openapi.naver.com/v1/search/news.json"
|
| 19 |
|
|
|
|
| 108 |
origin = it.get("originallink", "")
|
| 109 |
pub = _format_pubdate(it.get("pubDate", ""))
|
| 110 |
|
|
|
|
| 111 |
lines.append(f"{i}. **{title}**")
|
| 112 |
|
| 113 |
+
# 하위 항목 4칸 들여쓰기
|
| 114 |
if pub:
|
| 115 |
lines.append(f" - 발행: {pub}")
|
| 116 |
if origin:
|
|
|
|
| 126 |
return "\n".join(lines).strip()
|
| 127 |
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
def dedup_items(all_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 130 |
"""
|
| 131 |
결과 중복 제거:
|
|
|
|
| 149 |
return out
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
def aggregate_search(
|
| 153 |
sentence: str,
|
| 154 |
display: int,
|
| 155 |
sort: str,
|
| 156 |
+
) -> Tuple[List[str], List[Dict[str, Any]], int]:
|
| 157 |
"""
|
| 158 |
+
사용자 입력 문장을 그대로 query로 사용하여 API 호출
|
| 159 |
+
반환: (사용된 쿼리 목록, 최종 아이템 목록, total)
|
| 160 |
"""
|
|
|
|
|
|
|
| 161 |
queries = [sentence]
|
| 162 |
|
| 163 |
all_items: List[Dict[str, Any]] = []
|
| 164 |
+
total: int = 0
|
|
|
|
|
|
|
| 165 |
|
| 166 |
+
# 단일 쿼리 호출
|
| 167 |
+
data = naver_news_search(query=sentence, display=int(display), sort=sort, start=1)
|
| 168 |
+
total = int(data.get("total", 0) or 0)
|
| 169 |
+
all_items.extend(data.get("items", []))
|
| 170 |
|
| 171 |
+
# 중복 제거(단일 쿼리라도 유지)
|
| 172 |
+
merged = dedup_items(all_items)
|
| 173 |
|
| 174 |
# 최종 개수 절단
|
| 175 |
+
final_items = merged[:display]
|
| 176 |
+
return queries, final_items, total
|
| 177 |
|
| 178 |
|
| 179 |
def render_results_from_items(items: List[Dict[str, Any]]) -> str:
|
| 180 |
"""
|
| 181 |
+
items 리스트를 동일 스타일로 출력
|
| 182 |
"""
|
| 183 |
lines: List[str] = []
|
| 184 |
lines.append(f"- 최종 반환 개수: {len(items)}건")
|
|
|
|
| 191 |
origin = it.get("originallink", "")
|
| 192 |
pub = _format_pubdate(it.get("pubDate", ""))
|
| 193 |
|
|
|
|
| 194 |
lines.append(f"{i}. **{title}**")
|
| 195 |
|
|
|
|
| 196 |
if pub:
|
| 197 |
lines.append(f" - 발행: {pub}")
|
| 198 |
if origin:
|
|
|
|
| 207 |
return "\n".join(lines).strip()
|
| 208 |
|
| 209 |
|
|
|
|
| 210 |
def handle_search(
|
| 211 |
user_query: str,
|
| 212 |
chat_history: List[Dict[str, str]],
|
|
|
|
| 221 |
chat_history = chat_history + [{"role": "user", "content": q}]
|
| 222 |
|
| 223 |
try:
|
| 224 |
+
queries, items, total = aggregate_search(sentence=q, display=int(display), sort=sort)
|
| 225 |
+
|
| 226 |
+
# total이 0이거나 없을 때는 items 개수로 대체
|
| 227 |
+
total_to_show = total if total > 0 else len(items)
|
| 228 |
|
| 229 |
+
lines: List[str] = []
|
| 230 |
+
|
| 231 |
+
# ✅ 요청 문구로 변경
|
| 232 |
+
lines.append(f"\"{q}\"에 대한 검색 결과는 {total_to_show}건 이며 내용은 다음과 같습니다.")
|
| 233 |
lines.append("")
|
| 234 |
+
|
| 235 |
+
# (기존 이력 출력 유지)
|
| 236 |
lines.append("API 호출에 사용된 검색어(query)는 다음과 같습니다:")
|
| 237 |
for i, qq in enumerate(queries, start=1):
|
| 238 |
lines.append(f"- Q{i}: `{qq}`")
|
| 239 |
lines.append("")
|
| 240 |
+
|
| 241 |
lines.append(render_results_from_items(items))
|
| 242 |
|
| 243 |
assistant_text = "\n".join(lines).strip()
|