MANOJSEQ committed on
Commit
e7e59bd
verified
1 Parent(s): 1374986

Upload main.py

Files changed (1)
  1. main.py +467 -769
main.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from fastapi import FastAPI, Query, HTTPException, Body
2
  from typing import Optional, List, Dict, Any, Tuple, Set
3
  import os
@@ -13,7 +15,6 @@ import math
13
  import nltk
14
  from nltk.sentiment import SentimentIntensityAnalyzer
15
  from geopy.geocoders import Nominatim
16
- from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
17
  from fastapi.middleware.cors import CORSMiddleware
18
  from countryinfo import CountryInfo
19
  from sentence_transformers import SentenceTransformer, util
@@ -31,148 +32,64 @@ from transformers import pipeline as hf_pipeline
31
  os.environ.setdefault("OMP_NUM_THREADS", "1")
32
  from fastapi.responses import PlainTextResponse, JSONResponse
33
 
34
-
35
- # ----------------- FastAPI -----------------
36
- app = FastAPI()
37
- app.add_middleware(
38
- CORSMiddleware,
39
- allow_origins=["*"],
40
- allow_credentials=False,
41
- allow_methods=["*"],
42
- allow_headers=["*"],
43
- )
44
-
45
- app.add_middleware(GZipMiddleware, minimum_size=500)
46
-
47
- @app.api_route("/", methods=["GET", "HEAD"], include_in_schema=False)
48
- def root():
49
- # For HEAD, Starlette ignores the body.
50
- return JSONResponse({"ok": True, "service": "newsglobe-backend"})
51
-
52
- @app.api_route("/healthz", methods=["GET", "HEAD"], include_in_schema=False)
53
- def healthz():
54
- return PlainTextResponse("OK", status_code=200)
55
-
56
- @app.get("/favicon.ico", include_in_schema=False)
57
- def favicon():
58
- return PlainTextResponse("", status_code=204)
59
-
60
-
61
-
62
  import torch
63
  torch.set_num_threads(2)
64
 
65
- # Optional runtime check for local OPUS tokenizers
66
  try:
67
- import sentencepiece as _spm # noqa: F401
68
  _HAS_SENTENCEPIECE = True
69
  except Exception:
70
  _HAS_SENTENCEPIECE = False
71
 
72
-
73
  from enum import Enum
74
  class Speed(str, Enum):
75
  fast = "fast"
76
  balanced = "balanced"
77
  max = "max"
78
 
 
79
  _local_pipes = {}
80
  _news_clf = None
81
  _sbert = None
82
 
83
-
84
- # set a writable cache for tldextract and avoid network PSL fetches
85
  _TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
86
  try:
87
- # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
88
  _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
89
  except Exception:
90
- # safe fallback: still parses domains without PSL refresh
91
  _tld = tldextract.extract
92
 
93
-
94
- # --- Translation runtime flags / caches ---
95
- ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
96
  _hf_bad_models: Set[str] = set()
97
 
98
 
99
- def _translate_local(text: str, src: str, tgt: str) -> Optional[str]:
100
- if not _HAS_SENTENCEPIECE:
101
- # Avoid attempting to download/instantiate Marian tokenizers without sentencepiece
102
- return None
103
- model_id = opus_model_for(src, tgt)
104
- if not model_id:
105
- return None
106
- key = model_id
107
- try:
108
- if key not in _local_pipes:
109
- _local_pipes[key] = hf_pipeline("translation", model=model_id)
110
- out = _local_pipes[key](text, max_length=512)
111
- return out[0]["translation_text"]
112
- except Exception as e:
113
- log.warning("Local translate failed for %s: %s", model_id, e)
114
- return None
115
-
116
-
117
- def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", category=None, speed: Speed = Speed.balanced):
118
- # If user forced a language, honor it (but add a small English boost for coverage)
119
- if language:
120
- primary = fetch_gdelt_articles(limit=limit, query=query, language=language, timespan=timespan, category=category)
121
- # tiny English booster to catch global wires
122
- booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en", timespan=timespan, category=category)
123
- return primary + booster
124
-
125
- # Otherwise rotate across multiple languages
126
- if speed == Speed.fast:
127
- langs = LANG_ROTATION[:3] # quicker
128
- timespan = "24h"
129
- elif speed == Speed.balanced:
130
- langs = LANG_ROTATION[:8] # good mix
131
- timespan = "48h"
132
- else:
133
- langs = LANG_ROTATION # max coverage
134
- timespan = "3d"
135
-
136
- per_lang = max(8, math.ceil(limit / len(langs)))
137
- out = []
138
- for lg in langs:
139
- out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg, timespan=timespan, category=category))
140
-
141
- # Optional: add a few English pulls biased to different source countries (broadens outlets)
142
- if speed != Speed.fast:
143
- per_cc = max(4, limit // 30) if speed == Speed.max else max(2, limit // 40)
144
- for cc in COUNTRY_SEEDS[: (8 if speed == Speed.balanced else 16)]:
145
- out.extend(
146
- fetch_gdelt_articles(
147
- limit=per_cc,
148
- query=query,
149
- language="en",
150
- timespan=timespan,
151
- category=category,
152
- extra_tokens=[f"sourcecountry:{cc}"]
153
- )
154
- )
155
-
156
- return out
157
-
158
 
159
- def get_news_clf():
160
- global _news_clf
161
- if _news_clf is None:
162
- _news_clf = hf_pipeline(
163
- "text-classification",
164
- model="cardiffnlp/tweet-topic-21-multi",
165
- top_k=1,
166
- )
167
- return _news_clf
168
 
169
- def get_sbert():
170
- global _sbert
171
- if _sbert is None:
172
- _sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
173
- return _sbert
174
 
175
- # globals
176
  SESSION = requests.Session()
177
  ADAPTER = requests.adapters.HTTPAdapter(pool_connections=64, pool_maxsize=64, max_retries=2)
178
  SESSION.mount("http://", ADAPTER)
@@ -183,6 +100,7 @@ def _session_get(url, **kwargs):
183
  headers.setdefault("User-Agent", "Mozilla/5.0 (compatible; NewsGlobe/1.0)")
184
  return SESSION.get(url, headers=headers, timeout=kwargs.pop("timeout", 12), **kwargs)
185
 
 
186
  def _try_jina_reader(url: str, timeout: int) -> Optional[str]:
187
  try:
188
  u = url.strip()
@@ -199,10 +117,7 @@ def _try_jina_reader(url: str, timeout: int) -> Optional[str]:
199
  pass
200
  return None
201
 
202
-
203
-
204
- # --- description cleanup helpers ---
205
-
206
  BOILER_DESC = re.compile(
207
  r"(subscribe|sign in|sign up|enable javascript|cookies? (policy|settings)|"
208
  r"privacy (policy|notice)|continue reading|read more|click here|"
@@ -211,57 +126,13 @@ BOILER_DESC = re.compile(
211
  )
212
 
213
  def _split_sentences(text: str) -> List[str]:
214
- # light-weight splitter good enough for news blurbs
215
  parts = re.split(r"(?<=[\.\?\!])\s+(?=[A-Z0-9])", (text or "").strip())
216
- # also break on " • " and long dashes if present
217
  out = []
218
  for p in parts:
219
  out.extend(re.split(r"\s+[•–—]\s+", p))
220
  return [p.strip() for p in out if p and len(p.strip()) >= 2]
221
 
222
- def _tidy_description(title: str, desc: str, source_name: str, max_chars: int = 240) -> str:
223
- if not desc:
224
- return ""
225
-
226
- # remove repeated title
227
- desc = _dedupe_title_from_desc(title, desc)
228
-
229
- # strip obvious boilerplate
230
- desc = BOILER_DESC.sub("", desc)
231
- desc = re.sub(r"\s+", " ", desc).strip(" -–:•|")
232
-
233
- # choose first 1–2 sentences that look like summary
234
- sents = _split_sentences(desc)
235
- if not sents:
236
- sents = [desc]
237
-
238
- best = " ".join(sents[:2]).strip()
239
-
240
- # soft truncate at sentence boundary
241
- if len(best) > max_chars:
242
- # try only first sentence
243
- if len(sents[0]) <= max_chars * 0.9:
244
- best = sents[0]
245
- else:
246
- best = best[:max_chars].rsplit(" ", 1)[0].rstrip(",;:-–—")
247
-
248
- # avoid parroting the headline
249
- if _too_similar(title, best):
250
- # try next sentence if we have it
251
- for alt in sents[1:3]:
252
- if not _too_similar(title, alt):
253
- best = alt
254
- break
255
-
256
- # ensure it ends neatly
257
- if best and best[-1] not in ".!?":
258
- best += "."
259
- return best
260
-
261
-
262
-
263
  def _too_similar(a: str, b: str, thresh: float = 0.92) -> bool:
264
- """Return True if strings are near-duplicates (or one contains the other)."""
265
  a = (a or "").strip()
266
  b = (b or "").strip()
267
  if not a or not b:
@@ -274,25 +145,48 @@ def _too_similar(a: str, b: str, thresh: float = 0.92) -> bool:
274
  return ratio >= thresh
275
 
276
  def _dedupe_title_from_desc(title: str, desc: str) -> str:
277
- """If the description contains the title, strip it and tidy up."""
278
  t = (title or "").strip()
279
  d = (desc or "").strip()
280
  if not t or not d:
281
  return d
282
- # Remove exact leading title
283
  if d.lower().startswith(t.lower()):
284
  d = d[len(t):].lstrip(" -–:•|")
285
- # Remove inner repeats
286
  d = d.replace(t, "").strip(" -–:•|")
287
  d = _clean_text(d)
288
  return d
289
 
290
 
291
- # Prevent duplicate upstream fetches when identical requests arrive together
292
  _inflight_locks: Dict[Tuple, threading.Lock] = {}
293
  _inflight_global_lock = threading.Lock()
294
-
295
-
296
  def _get_inflight_lock(key: Tuple) -> threading.Lock:
297
  with _inflight_global_lock:
298
  lk = _inflight_locks.get(key)
@@ -301,40 +195,29 @@ def _get_inflight_lock(key: Tuple) -> threading.Lock:
301
  _inflight_locks[key] = lk
302
  return lk
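# Illustrative sketch (assumed helper names, not part of this commit) of the
# in-flight locking pattern above: one lock per request key so only the first
# concurrent caller performs the upstream work and later callers reuse it.
import threading
from typing import Any, Callable, Dict, Tuple

_sketch_locks: Dict[Tuple, threading.Lock] = {}
_sketch_guard = threading.Lock()
_sketch_results: Dict[Tuple, Any] = {}

def _sketch_lock_for(key: Tuple) -> threading.Lock:
    # Serialize access to the lock map itself, then hand back the per-key lock.
    with _sketch_guard:
        return _sketch_locks.setdefault(key, threading.Lock())

def fetch_once(key: Tuple, build: Callable[[], Any]) -> Any:
    # Only the first caller holding the per-key lock runs build(); the rest wait
    # and read the cached result.
    with _sketch_lock_for(key):
        if key not in _sketch_results:
            _sketch_results[key] = build()
        return _sketch_results[key]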
303
 
304
-
305
  DESC_CACHE_LOCK = threading.Lock()
306
-
307
  try:
308
- from bs4 import BeautifulSoup # optional but nice to have
309
  except Exception:
310
  BeautifulSoup = None
311
 
312
-
313
- # -------- Description fetching config --------
314
- DESC_FETCH_TIMEOUT = 3 # seconds per URL
315
- DESC_MIN_LEN = 100 # consider shorter text as "weak" and try to upgrade
316
- DESC_CACHE_TTL = 24 * 3600 # 24h
317
- MAX_DESC_FETCHES = 24 # cap number of fetches per request
318
- DESC_WORKERS = 12 # parallel workers
319
-
320
- # url -> {"text": str, "t": monotonic()}
321
  DESC_CACHE: Dict[str, Dict[str, Any]] = {}
322
 
323
-
324
  def _now_mono():
 
325
  try:
326
  return monotonic()
327
  except Exception:
328
  return time.time()
329
 
330
-
331
- def _clean_text(s: str) -> str:
332
- s = unescape(s or "")
333
- s = re.sub(r"\s+", " ", s).strip()
334
- return s
335
-
336
-
337
  def _extract_desc_from_ld_json(html: str) -> Optional[str]:
 
338
  if not html or not BeautifulSoup:
339
  return None
340
  try:
@@ -342,11 +225,9 @@ def _extract_desc_from_ld_json(html: str) -> Optional[str]:
342
  for tag in soup.find_all("script", {"type": "application/ld+json"}):
343
  try:
344
  import json
345
-
346
  data = json.loads(tag.string or "")
347
  except Exception:
348
  continue
349
-
350
  def find_desc(obj):
351
  if not isinstance(obj, (dict, list)):
352
  return None
@@ -356,21 +237,18 @@ def _extract_desc_from_ld_json(html: str) -> Optional[str]:
356
  if v:
357
  return v
358
  return None
359
- # dict
360
  for key in ("description", "abstract", "articleBody"):
361
  val = obj.get(key)
362
  if isinstance(val, str):
363
  txt = _clean_text(val)
364
  if len(txt) >= 40:
365
  return txt
366
- # nested
367
  for k, v in obj.items():
368
  if isinstance(v, (dict, list)):
369
  got = find_desc(v)
370
  if got:
371
  return got
372
  return None
373
-
374
  d = find_desc(data)
375
  if d and len(d) >= 40:
376
  return d
@@ -378,32 +256,25 @@ def _extract_desc_from_ld_json(html: str) -> Optional[str]:
378
  pass
379
  return None
380
 
381
-
382
  CONSENT_HINTS = re.compile(r"(consent|gdpr|privacy choices|before you continue|we value your privacy)", re.I)
383
 
384
-
385
  def _looks_like_consent_wall(html: str) -> bool:
386
  if not html:
387
  return False
388
- if "consent.yahoo.com" in html.lower(): # common interstitial
389
  return True
390
- # generic phrasing
391
  return bool(CONSENT_HINTS.search(html))
392
 
393
-
394
  def _extract_desc_from_html(html: str) -> Optional[str]:
395
  html = html or ""
396
  if BeautifulSoup:
397
  soup = BeautifulSoup(html, "html.parser")
398
-
399
- # ✅ JSON-LD early
400
  ld = _extract_desc_from_ld_json(html)
401
  if ld:
402
  txt = _clean_text(ld)
403
  if 40 <= len(txt) <= 480:
404
  return txt
405
-
406
-
407
  for sel, attr in [
408
  ('meta[property="og:description"]', "content"),
409
  ('meta[name="twitter:description"]', "content"),
@@ -414,13 +285,11 @@ def _extract_desc_from_html(html: str) -> Optional[str]:
414
  txt = _clean_text(tag.get(attr, ""))
415
  if len(txt) >= 40:
416
  return txt
417
- # Fallback: first meaningful <p>
418
  for p in soup.find_all("p"):
419
  txt = _clean_text(p.get_text(" "))
420
  if len(txt) >= 80:
421
  return txt
422
  else:
423
- # regex fallbacks (as you had)
424
  for pat in [
425
  r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']',
426
  r'<meta[^>]+name=["\']twitter:description["\'][^>]+content=["\']([^"\']+)["\']',
@@ -436,10 +305,8 @@ def _extract_desc_from_html(html: str) -> Optional[str]:
436
  txt = _clean_text(re.sub("<[^>]+>", " ", m.group(1)))
437
  if len(txt) >= 80:
438
  return txt
439
- # JSON-LD as last regex-free fallback not available w/o bs4
440
  return None
441
 
442
-
443
  def _desc_cache_get(url: str) -> Optional[str]:
444
  if not url:
445
  return None
@@ -451,14 +318,13 @@ def _desc_cache_get(url: str) -> Optional[str]:
451
  return None
452
  return entry["text"]
453
 
454
-
455
  def _desc_cache_put(url: str, text: str):
456
  if url and text:
457
  with DESC_CACHE_LOCK:
458
  DESC_CACHE[url] = {"text": text, "t": _now_mono()}
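# Minimal sketch of the TTL pattern DESC_CACHE uses: each entry carries a
# monotonic timestamp and is treated as stale after ttl seconds. TTLCache is a
# hypothetical name used only for illustration.
import threading
from time import monotonic
from typing import Any, Dict, Optional

class TTLCache:
    def __init__(self, ttl: float):
        self.ttl = ttl
        self._data: Dict[str, Dict[str, Any]] = {}
        self._lock = threading.Lock()

    def get(self, key: str) -> Optional[str]:
        with self._lock:
            entry = self._data.get(key)
            if not entry or monotonic() - entry["t"] > self.ttl:
                self._data.pop(key, None)  # drop expired entries lazily
                return None
            return entry["text"]

    def put(self, key: str, text: str) -> None:
        with self._lock:
            self._data[key] = {"text": text, "t": monotonic()}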
459
 
460
-
461
  def _attempt_fetch(url: str, timeout: int) -> Optional[str]:
 
462
  headers = {
463
  "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:you@yourdomain.com)",
464
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
@@ -473,7 +339,6 @@ def _attempt_fetch(url: str, timeout: int) -> Optional[str]:
473
  if "html" not in ct and "<html" not in txt.lower():
474
  return None
475
  if _looks_like_consent_wall(txt):
476
- # jump straight to Jina if a consent wall is detected
477
  jd = _try_jina_reader(url, timeout)
478
  if jd:
479
  return jd
@@ -482,39 +347,28 @@ def _attempt_fetch(url: str, timeout: int) -> Optional[str]:
482
  if desc and 40 <= len(desc) <= 480:
483
  return desc
484
  except Exception:
485
- # fall through to Jina
486
  pass
487
-
488
- # Last-resort: Jina reader
489
  jd = _try_jina_reader(url, timeout)
490
  if jd and 40 <= len(jd) <= 480:
491
  return jd
492
  return None
493
 
494
-
495
-
496
  def fetch_page_description(url: str) -> Optional[str]:
497
- """Fetch and cache a best-effort article description from the page (incl. AMP retries)."""
498
  if not url:
499
  return None
500
  cached = _desc_cache_get(url)
501
  if cached:
502
  return cached
503
-
504
- # Try the original URL first
505
  desc = _attempt_fetch(url, DESC_FETCH_TIMEOUT)
506
  if not desc:
507
- # Try common AMP variants
508
  amp_candidates = []
509
  try:
510
  p = urlparse(url)
511
- # /amp path
512
  if not p.path.endswith("/amp"):
513
  amp_candidates.append(urlunparse(p._replace(path=(p.path.rstrip("/") + "/amp"))))
514
- # ?amp
515
  q = p.query
516
  amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "amp=1"))))
517
- # ?outputType=amp (CNN, some US sites)
518
  amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "outputType=amp"))))
519
  except Exception:
520
  pass
@@ -522,14 +376,13 @@ def fetch_page_description(url: str) -> Optional[str]:
522
  desc = _attempt_fetch(amp_url, DESC_FETCH_TIMEOUT)
523
  if desc:
524
  break
525
-
526
  if desc:
527
  _desc_cache_put(url, desc)
528
  return desc
529
  return None
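# Sketch of the AMP-variant URLs tried above, using the same urlparse approach;
# amp_variants() is a hypothetical helper, not defined elsewhere in this file.
from urllib.parse import urlparse, urlunparse
from typing import List

def amp_variants(url: str) -> List[str]:
    p = urlparse(url)
    q = p.query
    out: List[str] = []
    if not p.path.endswith("/amp"):
        # e.g. https://example.com/story -> https://example.com/story/amp
        out.append(urlunparse(p._replace(path=p.path.rstrip("/") + "/amp")))
    # query-string variants: ?amp=1 and ?outputType=amp
    out.append(urlunparse(p._replace(query=q + ("&" if q else "") + "amp=1")))
    out.append(urlunparse(p._replace(query=q + ("&" if q else "") + "outputType=amp")))
    return out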
530
 
531
-
532
  def _needs_desc_upgrade(a: Dict[str, Any]) -> bool:
 
533
  url = a.get("url") or ""
534
  if not url:
535
  return False
@@ -539,38 +392,37 @@ def _needs_desc_upgrade(a: Dict[str, Any]) -> bool:
539
  return True
540
  if len(desc) < DESC_MIN_LEN:
541
  return True
542
- # NEW: if desc ≈ title, trigger upgrade
543
  if _too_similar(title, desc):
544
  return True
545
  return False
546
 
547
-
548
  def prefetch_descriptions(raw_articles: List[Dict[str, Any]], speed: Speed = Speed.balanced):
 
549
  candidates, seen = [], set()
550
  max_fetches = 6 if speed == Speed.fast else 8 if speed == Speed.balanced else 16
551
  timeout = 1 if speed == Speed.fast else 2
552
  workers = 3 if speed == Speed.fast else 4 if speed == Speed.balanced else 8
553
-
554
-
555
  for a in raw_articles:
556
- url = a.get("url");
557
- if not url or url in seen: continue
 
558
  seen.add(url)
559
  if _needs_desc_upgrade(a) and not _desc_cache_get(url):
560
  candidates.append(url)
561
- if len(candidates) >= max_fetches: break
562
-
563
- if not candidates: return
 
564
  with ThreadPoolExecutor(max_workers=workers) as ex:
565
  futs = [ex.submit(fetch_page_description, u) for u in candidates]
566
- for _ in as_completed(futs): pass
567
-
568
 
569
  def prefetch_descriptions_async(raw_articles, speed: Speed = Speed.balanced):
570
  threading.Thread(target=prefetch_descriptions, args=(raw_articles, speed), daemon=True).start()
571
 
572
- # news_clf = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
573
- DetectorFactory.seed = 0 # deterministic
574
 
575
  SECTION_HINTS = {
576
  "sports": "sports",
@@ -611,20 +463,29 @@ KEYWORDS = {
611
  "politics": r"\b(president|parliament|congress|minister|policy|campaign|election)\b",
612
  }
613
 
 
 
 
 
 
 
 
 
 
 
614
 
615
  def _infer_category_from_url_path(url_path: str) -> Optional[str]:
 
616
  parts = [p for p in url_path.lower().split("/") if p]
617
  for p in parts:
618
  if p in SECTION_HINTS:
619
  return SECTION_HINTS[p]
620
- # also try hyphenated tokens like 'us-business' or 'tech-news'
621
  for p in parts:
622
  for tok in re.split(r"[-_]", p):
623
  if tok in SECTION_HINTS:
624
  return SECTION_HINTS[tok]
625
  return None
626
 
627
-
628
  def _infer_category_from_text(text: str) -> Optional[str]:
629
  if not text:
630
  return None
@@ -633,13 +494,11 @@ def _infer_category_from_text(text: str) -> Optional[str]:
633
  return cat
634
  return None
635
 
636
-
637
  def infer_category(article_url, title, description, provided):
638
  if provided:
639
  p = provided.strip().lower()
640
  if p:
641
  return p
642
- # url rules
643
  try:
644
  p = urlparse(article_url).path or ""
645
  cat = _infer_category_from_url_path(p)
@@ -647,13 +506,12 @@ def infer_category(article_url, title, description, provided):
647
  return cat
648
  except Exception:
649
  pass
650
- # keyword rules
651
  text = f"{title or ''} {description or ''}".strip()
652
  cat = _infer_category_from_text(text)
653
  if cat:
654
  return cat
655
  try:
656
- preds = get_news_clf()(text[:512]) # lazy-loaded
657
  if isinstance(preds[0], list):
658
  label = preds[0][0]["label"]
659
  else:
@@ -663,193 +521,35 @@ def infer_category(article_url, title, description, provided):
663
  log.warning(f"ML category failed: {e}")
664
  return "general"
665
 
666
-
667
-
668
- BOILER = re.compile(r"\b(live updates|breaking|what we know|in pictures|opinion)\b", re.I)
669
-
670
-
671
- def _norm_text(s: str) -> str:
672
- s = (s or "").strip()
673
- s = re.sub(r"\s+", " ", s)
674
- return s
675
-
676
-
677
- def _cluster_text(a):
678
- base = f"{a.get('orig_title') or a.get('title') or ''} {a.get('orig_description') or a.get('description') or ''}"
679
- base = BOILER.sub("", base)
680
- base = re.sub(r"\b(\d{1,2}:\d{2}\s?(AM|PM))|\b(\d{1,2}\s\w+\s\d{4})", "", base, flags=re.I)
681
- return _norm_text(base)
682
-
683
-
684
- def _canonical_url(u: str) -> str:
685
- if not u:
686
- return u
687
- p = urlparse(u)
688
- # drop tracking params
689
- qs = [(k, v) for (k, v) in parse_qsl(p.query, keep_blank_values=False) if not k.lower().startswith(("utm_", "fbclid", "gclid"))]
690
- clean = p._replace(query="&".join([f"{k}={v}" for k, v in qs]), fragment="")
691
- # some sites add trailing slashes inconsistently
692
- path = clean.path.rstrip("/") or "/"
693
- clean = clean._replace(path=path)
694
- return urlunparse(clean)
695
-
696
-
697
  def detect_lang(text: str) -> Optional[str]:
698
  try:
699
- return detect(text) # returns 'en','fr','de',...
700
  except Exception:
701
  return None
702
 
 
 
 
 
 
703
 
704
  def _embed_texts(texts: List[str]):
705
  embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
706
  return embs
707
 
708
-
709
- # ---- cache helpers ----
710
- CACHE_TTL_SECS = 900
711
- SIM_THRESHOLD = 0.6
712
- _events_cache: Dict[Tuple, Dict[str, Any]] = {}
713
-
714
-
715
- def cache_key_for(q, category, language, limit_each, translate=False, target_lang=None, speed=Speed.balanced):
716
- return (q or "", category or "", language or "", int(limit_each or 50),
717
- bool(translate), (target_lang or "").lower(), speed.value)
718
-
719
-
720
- _first_real_build = True # module-global
721
-
722
- def get_or_build_events_cache(q, category, language, translate, target_lang, limit_each, speed=Speed.balanced):
723
- global _first_real_build
724
- key = cache_key_for(q, category, language, limit_each, translate, target_lang, speed)
725
- now = monotonic()
726
-
727
- if speed == Speed.fast:
728
- use_timespan, use_limit = "24h", min(limit_each, 20)
729
- elif speed == Speed.balanced:
730
- use_timespan, use_limit = "48h", min(limit_each, 150)
731
- else: # max
732
- use_timespan, use_limit = "3d", limit_each
733
-
734
- entry = _events_cache.get(key)
735
- if entry and now - entry["t"] < CACHE_TTL_SECS:
736
- log.info(f"CACHE HIT for {key}")
737
- return key, entry["enriched"], entry["clusters"]
738
-
739
- lock = _get_inflight_lock(key)
740
- with lock:
741
- entry = _events_cache.get(key)
742
- if entry and now - entry["t"] < CACHE_TTL_SECS:
743
- log.info(f"CACHE HIT (post-lock) for {key}")
744
- return key, entry["enriched"], entry["clusters"]
745
-
746
- if _first_real_build:
747
- use_timespan = "24h" if use_timespan != "24h" else use_timespan
748
- use_limit = min(use_limit, 100)
749
-
750
- log.info(f"CACHE MISS for {key} — fetching (timespan={use_timespan}, limit_each={use_limit})")
751
-
752
- raw = combine_raw_articles(
753
- category=category, # providers may use it; inference ignores it
754
- query=q,
755
- language=language,
756
- limit_each=use_limit,
757
- timespan=use_timespan,
758
- speed=speed,
759
- )
760
- prefetch_descriptions_async(raw, speed)
761
-
762
- enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
763
-
764
- if category:
765
- cat_norm = (category or "").strip().lower()
766
- enriched = [e for e in enriched_all if (e.get("category") or "").lower() == cat_norm]
767
- else:
768
- enriched = enriched_all
769
-
770
- clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD, speed=speed)
771
-
772
- _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
773
- _first_real_build = False
774
- return key, enriched, clusters
775
-
776
-
777
- # Which languages to rotate when user didn't restrict language
778
- LANG_ROTATION = ["en", "es", "fr", "de", "ar", "ru", "pt", "zh", "hi", "ja", "ko"]
779
-
780
- # A few sourcecountry seeds for English to diversify outlets (optional)
781
- COUNTRY_SEEDS = ["US", "GB", "IN", "CA", "AU", "ZA", "SG", "NG", "DE", "FR", "BR", "MX", "ES", "RU", "JP", "KR", "CN"]
782
-
783
-
784
- # ----------------- Config / Keys -----------------
785
- USE_GNEWS_API = False
786
- USE_NEWSDATA_API = False
787
- USE_GDELT_API = True
788
- USE_NEWSAPI = False
789
-
790
- NEWSAPI_KEY = os.getenv("NEWSAPI_KEY", "ea734c66dc4044fa8e4501ad7b90e753")
791
- GNEWS_API_KEY = os.getenv("GNEWS_API_KEY", "5419897c95e8a4b21074e0d3fe95a3dd")
792
- NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY", "pub_1feb49a71a844719af68d0844fb43a61")
793
- HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
794
-
795
- logging.basicConfig(
796
- level=logging.WARNING,
797
- format="%(levelname)s:%(name)s:%(message)s",
798
- )
799
-
800
- log = logging.getLogger("newsglobe")
801
- log.setLevel(logging.WARNING)
802
-
803
- fetch_log = logging.getLogger("newsglobe.fetch_summary")
804
- fetch_log.setLevel(logging.INFO)
805
- _fetch_handler = logging.StreamHandler()
806
- _fetch_handler.setLevel(logging.INFO)
807
- _fetch_handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
808
- fetch_log.addHandler(_fetch_handler)
809
- fetch_log.propagate = False # don't pass to root (which is WARNING)
810
-
811
- logging.getLogger("urllib3").setLevel(logging.WARNING)
812
- logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
813
- logging.getLogger("requests.packages.urllib3").setLevel(logging.WARNING)
814
- logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
815
- logging.getLogger("transformers").setLevel(logging.WARNING)
816
-
817
- for name in ("urllib3", "urllib3.connectionpool", "requests.packages.urllib3"):
818
- lg = logging.getLogger(name)
819
- lg.setLevel(logging.ERROR)
820
- lg.propagate = False
821
-
822
-
823
- def _newsapi_enabled() -> bool:
824
- if not NEWSAPI_KEY:
825
- log.warning("NewsAPI disabled: missing NEWSAPI_KEY env var")
826
- return False
827
- return True
828
-
829
-
830
- def cluster_id(cluster, enriched_articles):
831
- urls = sorted([(enriched_articles[i].get("url") or "") for i in cluster["indices"] if enriched_articles[i].get("url")])
832
- base = "|".join(urls) if urls else "empty"
833
- return hashlib.md5(base.encode("utf-8")).hexdigest()[:10]
834
-
835
-
836
- # ----------------- NLTK / VADER -----------------
837
  NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
838
-
839
- # Make sure NLTK looks in the baked, writable dir first
840
  if NLTK_DATA_DIR not in nltk.data.path:
841
  nltk.data.path.insert(0, NLTK_DATA_DIR)
842
-
843
  try:
844
  nltk.data.find("sentiment/vader_lexicon")
845
  except LookupError:
846
- # As a fallback, try downloading into the writable dir (won't run if already baked)
847
  try:
848
  os.makedirs(NLTK_DATA_DIR, exist_ok=True)
849
  nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
850
  except Exception:
851
- pass # don't crash if download is blocked
852
-
853
  try:
854
  _vader = SentimentIntensityAnalyzer()
855
  except Exception:
@@ -864,8 +564,7 @@ def classify_sentiment(text: str) -> str:
864
  c = scores["compound"]
865
  return "positive" if c >= 0.2 else "negative" if c <= -0.2 else "neutral"
866
 
867
-
868
- # ----------------- Geocode helpers -----------------
869
  def get_country_centroid(country_name):
870
  if not country_name or country_name == "Unknown":
871
  return {"lat": 0, "lon": 0, "country": "Unknown"}
@@ -877,7 +576,6 @@ def get_country_centroid(country_name):
877
  log.info(f"Could not get centroid for {country_name}: {e}")
878
  return {"lat": 0, "lon": 0, "country": country_name or "Unknown"}
879
 
880
-
881
  def resolve_domain_to_ip(domain):
882
  if not domain:
883
  return None
@@ -886,7 +584,6 @@ def resolve_domain_to_ip(domain):
886
  except socket.gaierror:
887
  return None
888
 
889
-
890
  def geolocate_ip(ip):
891
  try:
892
  r = _session_get(f"https://ipwho.is/{ip}?fields=success,country,latitude,longitude", timeout=8)
@@ -897,7 +594,7 @@ def geolocate_ip(ip):
897
  pass
898
  return None
899
 
900
-
901
  geolocator = Nominatim(user_agent="newsglobe-app (contact: you@example.com)")
902
  domain_geo_cache: Dict[str, Dict[str, Any]] = {}
903
 
@@ -915,32 +612,23 @@ MAJOR_OUTLETS = {
915
  "lefigaro.fr": "France",
916
  "kyodonews.net": "Japan",
917
  "straitstimes.com": "Singapore",
918
- "thesun.my": "Malaysia", # <-- add this
919
  }
920
 
921
-
922
  def geocode_source(source_text: str, domain: str = "", do_network: bool = False):
923
  cache_key = f"{source_text}|{domain}"
924
  if cache_key in domain_geo_cache:
925
  return domain_geo_cache[cache_key]
926
-
927
  ext = _tld(domain or "")
928
  fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
929
-
930
- # 0) Major outlets / domain map
931
  if fqdn in MAJOR_OUTLETS:
932
  coords = get_country_centroid(MAJOR_OUTLETS[fqdn]); domain_geo_cache[cache_key] = coords; return coords
933
  if ext.domain in domain_country_map:
934
  coords = get_country_centroid(domain_country_map[ext.domain]); domain_geo_cache[cache_key] = coords; return coords
935
-
936
- # 1) Suffix fallback (instant)
937
  coords = get_country_centroid(_suffix_country(ext.suffix))
938
  domain_geo_cache[cache_key] = coords
939
-
940
- # 2) Optional async refinement (never block hot path)
941
  if do_network:
942
  threading.Thread(target=_refine_geo_async, args=(cache_key, source_text, fqdn), daemon=True).start()
943
-
944
  return coords
945
 
946
  def _suffix_country(suffix: Optional[str]) -> str:
@@ -955,18 +643,14 @@ def _suffix_country(suffix: Optional[str]) -> str:
955
  }
956
  return m.get(s, "United States" if s in ("com","org","net") else "Unknown")
957
 
958
-
959
-
960
  def _refine_geo_async(cache_key, source_text, fqdn):
961
  try:
962
- # Try IP geo (cheap)
963
  ip = resolve_domain_to_ip(fqdn) if fqdn else None
964
  if ip:
965
  coords = geolocate_ip(ip)
966
  if coords:
967
  domain_geo_cache[cache_key] = coords
968
  return
969
- # Try Nominatim FAST (lower timeout)
970
  location = geolocator.geocode(f"{source_text} News Headquarters", timeout=2)
971
  if location and hasattr(location, "raw"):
972
  coords = {
@@ -978,10 +662,9 @@ def _refine_geo_async(cache_key, source_text, fqdn):
978
  except Exception:
979
  pass
980
 
981
- # ----------------- HuggingFace translate (optional) -----------------
982
- HF_MODEL_PRIMARY = None # disable NLLB remote (avoids 404 spam); use OPUS + pivot/LibreTranslate
983
 
984
- # 2-letter ISO -> NLLB codes
985
  NLLB_CODES = {
986
  "en": "eng_Latn",
987
  "es": "spa_Latn",
@@ -997,8 +680,6 @@ NLLB_CODES = {
997
  "ko": "kor_Hang",
998
  }
999
 
1000
-
1001
- # OPUS-MT model map for common pairs (expand as needed)
1002
  def opus_model_for(src2: str, tgt2: str) -> Optional[str]:
1003
  pairs = {
1004
  ("es", "en"): "Helsinki-NLP/opus-mt-es-en",
@@ -1026,10 +707,8 @@ def opus_model_for(src2: str, tgt2: str) -> Optional[str]:
1026
  }
1027
  return pairs.get((src2, tgt2))
1028
 
1029
-
1030
  SUPPORTED = {"en", "fr", "de", "es", "it", "hi", "ar", "ru", "ja", "ko", "pt", "zh"}
1031
-
1032
- LIBRETRANSLATE_URL = os.getenv("LIBRETRANSLATE_URL") # e.g., http://127.0.0.1:5000
1033
 
1034
  def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
1035
  url = LIBRETRANSLATE_URL
@@ -1051,14 +730,11 @@ def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
1051
  log.warning("LibreTranslate failed: %s", e)
1052
  return None
1053
 
1054
-
1055
  def _hf_call(model_id: str, payload: dict) -> Optional[str]:
1056
- # require both a token and explicit opt-in
1057
  if not (HUGGINGFACE_API_TOKEN and ALLOW_HF_REMOTE):
1058
  return None
1059
  if model_id in _hf_bad_models:
1060
  return None
1061
-
1062
  url = f"https://api-inference.huggingface.co/models/{model_id}"
1063
  headers = {
1064
  "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",
@@ -1079,7 +755,6 @@ def _hf_call(model_id: str, payload: dict) -> Optional[str]:
1079
  except Exception as e:
1080
  log.warning("HF request failed: %s", e)
1081
  return None
1082
-
1083
  if isinstance(j, list) and j and isinstance(j[0], dict):
1084
  if "generated_text" in j[0]:
1085
  return j[0]["generated_text"]
@@ -1095,20 +770,14 @@ def _hf_call(model_id: str, payload: dict) -> Optional[str]:
1095
  def _translate_cached(text: str, src: str, tgt: str) -> str:
1096
  if not text or src == tgt:
1097
  return text
1098
-
1099
- # 0) Local LibreTranslate (fast & free, if running)
1100
  out = _translate_via_libre(text, src, tgt)
1101
  if out:
1102
  return out
1103
-
1104
- # 1) OPUS serverless (direct pair) – try this first
1105
  opus_model = opus_model_for(src, tgt)
1106
  if opus_model:
1107
  out = _hf_call(opus_model, {"inputs": text})
1108
  if out:
1109
  return out
1110
-
1111
- # 2) NLLB serverless (optional; disabled if HF_MODEL_PRIMARY is None)
1112
  try:
1113
  if HF_MODEL_PRIMARY and (src in NLLB_CODES) and (tgt in NLLB_CODES):
1114
  out = _hf_call(
@@ -1123,26 +792,18 @@ def _translate_cached(text: str, src: str, tgt: str) -> str:
1123
  return out
1124
  except Exception:
1125
  pass
1126
-
1127
- # 3) Two-hop pivot via English for non-English↔non-English
1128
  if src != "en" and tgt != "en":
1129
  step_en = _translate_cached(text, src, "en")
1130
  if step_en and step_en != text:
1131
  out = _translate_cached(step_en, "en", tgt)
1132
  if out:
1133
  return out
1134
-
1135
- # 4) Local OPUS fallback (direct pair with local pipeline)
1136
  out = _translate_local(text, src, tgt)
1137
  if out:
1138
  return out
1139
-
1140
  log.warning("All translate paths failed (%s->%s); returning original.", src, tgt)
1141
  return text
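# Sketch of the two-hop pivot used above: if no direct src->tgt path succeeds,
# translate src->en and then en->tgt. `direct` stands in for any single-pair
# translator (LibreTranslate, OPUS, local pipeline); it is a hypothetical callable.
from typing import Callable, Optional

def pivot_translate(text: str, src: str, tgt: str,
                    direct: Callable[[str, str, str], Optional[str]]) -> Optional[str]:
    out = direct(text, src, tgt)
    if out:
        return out
    if src != "en" and tgt != "en":
        step_en = direct(text, src, "en")
        if step_en and step_en != text:
            return direct(step_en, "en", tgt)
    return None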
1142
 
1143
-
1144
-
1145
-
1146
  def translate_text(text: str, target_lang: Optional[str], fallback_src: Optional[str] = None) -> str:
1147
  if not text or not target_lang:
1148
  return text
@@ -1163,12 +824,25 @@ def translate_text(text: str, target_lang: Optional[str], fallback_src: Optional
1163
  src = "en"
1164
  return _translate_cached(text, src, tgt)
1165
 
1166
 
1167
-
1168
-
1169
- # === Warm config ===
1170
- WARM_LIMIT_EACH = 20 # smaller bite to prime caches
1171
- WARM_TIMESPAN = "24h" # narrower GDELT window for faster first fetch
1172
  WARM_PREFETCH_DESCRIPTIONS = False
1173
 
1174
  def _fmt_mmss(ms: float) -> str:
@@ -1180,40 +854,27 @@ def _warm_once():
1180
  try:
1181
  log.info("WARM: starting background warm-up (limit_each=%d, timespan=%s)", WARM_LIMIT_EACH, WARM_TIMESPAN)
1182
  t0 = time.perf_counter()
1183
-
1184
- # models (you already call these in startup, but keep them here too)
1185
  get_sbert()
1186
  get_news_clf()
1187
-
1188
- # fetch a small set with shorter timespan
1189
  t1 = time.perf_counter()
1190
  raw = combine_raw_articles(
1191
  category=None, query=None, language="en",
1192
  limit_each=WARM_LIMIT_EACH, timespan=WARM_TIMESPAN,
1193
- log_summary=False # ← silence warm-up summary
1194
  )
1195
  t_fetch = (time.perf_counter() - t1) * 1000
1196
-
1197
- # optional: skip description prefetch during warm to save time
1198
  if WARM_PREFETCH_DESCRIPTIONS:
1199
  prefetch_descriptions_async(raw)
1200
-
1201
- # enrich + cluster once (no translation on warm)
1202
  t2 = time.perf_counter()
1203
  enriched = [enrich_article(a, language="en", translate=False, target_lang=None) for a in raw]
1204
  t_enrich = (time.perf_counter() - t2) * 1000
1205
-
1206
  t3 = time.perf_counter()
1207
  clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD)
1208
  t_cluster = (time.perf_counter() - t3) * 1000
1209
-
1210
- # stash in cache under the common default key so /news and /events hit warm data
1211
  key = cache_key_for(q=None, category=None, language="en",
1212
  limit_each=WARM_LIMIT_EACH, translate=False, target_lang=None,
1213
- speed=Speed.balanced) # 👈 add speed
1214
-
1215
  _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
1216
-
1217
  t_total = (time.perf_counter() - t0) * 1000
1218
  log.info(
1219
  "WARM: fetch=%s, enrich=%s, cluster=%s, total=%s (raw=%d, enriched=%d, clusters=%d)",
@@ -1225,14 +886,11 @@ def _warm_once():
1225
 
1226
  @app.on_event("startup")
1227
  def warm():
1228
- # keep your existing model warms
1229
  get_sbert()
1230
  get_news_clf()
1231
- # fire-and-forget warm in a background thread so startup stays snappy
1232
  threading.Thread(target=_warm_once, daemon=True).start()
1233
 
1234
- # ----------------- Providers -----------------
1235
- # ISO -> GDELT 'sourcelang:' names (keep yours)
1236
  _GDELT_LANG = {
1237
  "en": "english",
1238
  "es": "spanish",
@@ -1248,7 +906,6 @@ _GDELT_LANG = {
1248
  "zh": "chinese",
1249
  }
1250
 
1251
-
1252
  def _gdelt_safe_query(user_q, language):
1253
  parts = []
1254
  if user_q:
@@ -1259,11 +916,10 @@ def _gdelt_safe_query(user_q, language):
1259
  if language and (lg := _GDELT_LANG.get(language.lower())):
1260
  parts.append(f"sourcelang:{lg}")
1261
  if not parts:
1262
- # rotate or randomly choose one to diversify
1263
  parts.append("sourcelang:english")
1264
  return " ".join(parts)
1265
 
1266
-
1267
  def fetch_gdelt_articles(
1268
  limit=50,
1269
  query=None,
@@ -1304,7 +960,6 @@ def fetch_gdelt_articles(
1304
 
1305
  data = _do_request(params)
1306
  if data is None:
1307
- # Retry narrower and smaller if needed
1308
  p2 = {**params, "timespan": "24h", "maxrecords": min(100, params["maxrecords"])}
1309
  data = _do_request(p2)
1310
  if not data:
@@ -1330,144 +985,112 @@ def fetch_gdelt_articles(
1330
  "publishedAt": a.get("seendate"),
1331
  "api_source": "gdelt",
1332
  "gdelt_sourcecountry": a.get("sourcecountry"),
1333
- # Keep the user's chosen category only for debugging/reference; do NOT use for inference.
1334
  "requested_category": category,
1335
  }
1336
  )
1337
  log.info(f"GDELT returned {len(results)}")
1338
  return results
1339
 
 
1340
 
1341
 
1342
- def fetch_newsdata_articles(category=None, limit=20, query=None, language=None):
1343
- base_url = "https://newsdata.io/api/1/news"
1344
- allowed = [
1345
- "business",
1346
- "entertainment",
1347
- "environment",
1348
- "food",
1349
- "health",
1350
- "politics",
1351
- "science",
1352
- "sports",
1353
- "technology",
1354
- "top",
1355
- "world",
1356
- ]
1357
- params = {"apikey": NEWSDATA_API_KEY, "language": (language or "en")}
1358
- if category and category in allowed:
1359
- params["category"] = category
1360
- if query:
1361
- params["q"] = query
1362
-
1363
- all_articles, next_page = [], None
1364
- while len(all_articles) < limit:
1365
- if next_page:
1366
- params["page"] = next_page
1367
- resp = _session_get(base_url, params=params, timeout=12)
1368
- if resp.status_code != 200:
1369
- break
1370
- data = resp.json()
1371
- articles = data.get("results", [])
1372
- for a in articles:
1373
- a["api_source"] = "newsdata"
1374
- all_articles.extend(articles)
1375
- next_page = data.get("nextPage")
1376
- if not next_page:
1377
- break
1378
-
1379
- # normalize timestamps if available
1380
- for a in all_articles:
1381
- a["publishedAt"] = a.get("pubDate")
1382
- return all_articles[:limit]
1383
 
 
 
 
 
1384
 
1385
- def fetch_gnews_articles(limit=20, query=None, language=None):
1386
- url = f"https://gnews.io/api/v4/top-headlines?lang={(language or 'en')}&max={limit}&token={GNEWS_API_KEY}"
1387
- if query:
1388
- url += f"&q={requests.utils.quote(query)}"
1389
- try:
1390
- r = _session_get(url, timeout=12)
1391
- if r.status_code != 200:
1392
- return []
1393
- arts = r.json().get("articles", [])
1394
- for a in arts:
1395
- a["api_source"] = "gnews"
1396
- return arts
1397
- except Exception:
1398
- return []
1399
-
1400
-
1401
- NEWSAPI_COUNTRIES = ["us", "gb", "ca", "au", "in", "za", "sg", "ie", "nz"]
1402
-
1403
 
1404
- def fetch_newsapi_headlines_multi(limit=50, language=None):
1405
- if not _newsapi_enabled():
1406
- return []
1407
- all_ = []
1408
- per = max(1, math.ceil(limit / max(1, len(NEWSAPI_COUNTRIES))))
1409
- per = min(per, 100) # NewsAPI pageSize cap
1410
- for c in NEWSAPI_COUNTRIES:
1411
- url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per}&apiKey={NEWSAPI_KEY}"
1412
- r = _session_get(url, timeout=12)
1413
- if r.status_code != 200:
1414
- log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1415
- continue
1416
- arts = r.json().get("articles", [])
1417
- for a in arts:
1418
- a["api_source"] = "newsapi"
1419
- all_.extend(arts)
1420
- time.sleep(0.2)
1421
- return all_[:limit] # ✅ enforce exact limit
1422
 
 
 
 
 
 
1423
 
1424
- def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
1425
- if not _newsapi_enabled():
1426
- return []
 
 
1427
 
1428
- # If a query is provided, use /everything (language allowed here)
1429
- if query:
1430
- url = f"https://newsapi.org/v2/everything?pageSize={limit}&apiKey={NEWSAPI_KEY}&q={requests.utils.quote(query)}"
1431
- if language:
1432
- url += f"&language={language}"
1433
- try:
1434
- r = _session_get(url, timeout=12)
1435
- if r.status_code != 200:
1436
- log.warning(f"NewsAPI /everything HTTP {r.status_code}: {r.text[:200]}")
1437
- return []
1438
- arts = r.json().get("articles", [])
1439
- for a in arts:
1440
- a["api_source"] = "newsapi"
1441
- # DO NOT stamp category here; we infer later
1442
- return arts[:limit]
1443
- except Exception as e:
1444
- log.warning(f"NewsAPI /everything request failed: {e}")
1445
- return []
1446
 
1447
- # Otherwise, rotate /top-headlines by country (no language param)
1448
- results = []
1449
- per_country = max(5, limit // len(NEWSAPI_COUNTRIES))
1450
- for c in NEWSAPI_COUNTRIES:
1451
- url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per_country}&apiKey={NEWSAPI_KEY}"
1452
- if category:
1453
- url += f"&category={category}"
1454
- try:
1455
- r = _session_get(url, timeout=12)
1456
- if r.status_code != 200:
1457
- log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1458
- continue
1459
- arts = r.json().get("articles", [])
1460
- for a in arts:
1461
- a["api_source"] = "newsapi"
1462
- # DO NOT stamp category here; we infer later
1463
- results.extend(arts)
1464
- except Exception as e:
1465
- log.warning(f"NewsAPI top-headlines {c} failed: {e}")
1466
- time.sleep(0.2)
1467
- return results[:limit]
1468
 
 
 
 
 
 
1469
 
 
 
 
 
 
 
 
 
 
1470
 
 
1471
  def normalize_newsdata_article(article):
1472
  return {
1473
  "title": article.get("title"),
@@ -1479,10 +1102,7 @@ def normalize_newsdata_article(article):
1479
  "category": ((article.get("category") or [None])[0] if isinstance(article.get("category"), list) else article.get("category")),
1480
  }
1481
 
1482
-
1483
- # ----------------- Enrichment -----------------
1484
  def enrich_article(a, language=None, translate=False, target_lang=None):
1485
- # Normalize source name
1486
  source_name = (a.get("source", {}) or {}).get("name", "").strip() or "Unknown"
1487
  s_lower = source_name.lower()
1488
  if "newsapi" in s_lower:
@@ -1491,16 +1111,12 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
1491
  source_name = "GNews"
1492
  elif "newsdata" in s_lower:
1493
  source_name = "NewsData.io"
1494
-
1495
- # Canonicalize URL & derive domain
1496
  article_url = _canonical_url(a.get("url") or "")
1497
  try:
1498
  ext = _tld(article_url)
1499
  domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
1500
  except Exception:
1501
  domain = ""
1502
-
1503
- # Country guess (GDELT provides ISO2)
1504
  country_guess = None
1505
  if a.get("api_source") == "gdelt":
1506
  sc = a.get("gdelt_sourcecountry")
@@ -1516,17 +1132,11 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
1516
  "PH": "Philippines", "ID": "Indonesia", "NZ": "New Zealand",
1517
  }
1518
  country_guess = iso2map.get(str(sc).upper(), sc if len(str(sc)) > 2 else None)
1519
-
1520
  coords = get_country_centroid(country_guess) if country_guess else geocode_source(source_name, domain, do_network=False)
1521
-
1522
- # Title / description (raw)
1523
  title = (a.get("title") or "").strip() or "(untitled)"
1524
  description = (a.get("description") or "").strip()
1525
-
1526
  if description.lower().startswith("no description"):
1527
  description = ""
1528
-
1529
- # Prefer cached page summary when weak/title-like
1530
  cached_desc = _desc_cache_get(article_url)
1531
  need_upgrade = (
1532
  (not description)
@@ -1536,26 +1146,18 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
1536
  )
1537
  if need_upgrade and cached_desc:
1538
  description = cached_desc
1539
-
1540
  if description:
1541
  description = _tidy_description(title, description, source_name)
1542
  if (not description) or _too_similar(title, description):
1543
  description = f"Quick take: {title.rstrip('.')}."
1544
-
1545
- # Save originals for categorization and debug
1546
  orig_title = title
1547
  orig_description = description
1548
-
1549
- # Language detection / sentiment
1550
  detected_lang = (detect_lang(f"{title} {description}") or "").lower()
1551
  ml_text = f"{orig_title}. {orig_description}".strip()
1552
  sentiment = classify_sentiment(f"{orig_title} {orig_description}")
1553
-
1554
- # Stable id & category (ALWAYS infer; ignore provider/requested categories)
1555
  seed = f"{source_name}|{article_url}|{title}"
1556
  uid = hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]
1557
  cat = infer_category(article_url, orig_title, orig_description, None)
1558
-
1559
  return {
1560
  "id": uid,
1561
  "title": title,
@@ -1576,25 +1178,18 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
1576
  "category": cat,
1577
  }
1578
 
1579
-
1580
-
1581
-
1582
- # ----------------- Clustering into Events -----------------
1583
- # sbert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
1584
-
1585
- # cluster_articles()
1586
- def cluster_articles(articles: List[Dict[str, Any]], sim_threshold=SIM_THRESHOLD, speed=Speed.balanced):
1587
  if speed == Speed.fast:
1588
- articles = articles[:150] # early cap
1589
  sim_threshold = max(sim_threshold, 0.64)
1590
  elif speed == Speed.balanced:
1591
  articles = articles[:]
1592
  sim_threshold = max(sim_threshold, 0.62)
1593
  texts = [_cluster_text(a) for a in articles]
1594
  embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
1595
- clusters = [] # [{indices:[...], centroid:tensor}]
1596
  centroids = []
1597
-
1598
  for i, emb in enumerate(embs):
1599
  best_idx, best_sim = -1, -1.0
1600
  for ci, c_emb in enumerate(centroids):
@@ -1609,21 +1204,14 @@ def cluster_articles(articles: List[Dict[str, Any]], sim_threshold=SIM_THRESHOLD
1609
  centroids[best_idx] = new_c
1610
  clusters[best_idx]["centroid"] = new_c
1611
  else:
1612
- # use texts[i] now (titles[] no longer exists)
1613
  event_id = hashlib.md5(texts[i].encode("utf-8")).hexdigest()[:10]
1614
  clusters.append({"id": event_id, "indices": [i], "centroid": emb})
1615
  centroids.append(emb)
1616
-
1617
- # second-pass merge to reduce fragmenting
1618
  merged = _merge_close_clusters(clusters, embs, threshold=0.70)
1619
-
1620
- # keep ids stable: recompute with URLs of member articles
1621
  for c in merged:
1622
  c["id"] = cluster_id(c, articles)
1623
-
1624
  return merged
1625
 
1626
-
1627
  def event_payload_from_cluster(cluster, enriched_articles):
1628
  idxs = cluster["indices"]
1629
  arts = [enriched_articles[i] for i in idxs]
@@ -1634,7 +1222,7 @@ def event_payload_from_cluster(cluster, enriched_articles):
1634
  countries = {a["country"] for a in arts if a["country"] and a["country"] != "Unknown"}
1635
  ts = [a.get("publishedAt") for a in arts if a.get("publishedAt")]
1636
  return {
1637
- "event_id": cluster_id(cluster, enriched_articles), # <-- stable id
1638
  "title": canonical_title,
1639
  "keywords": keywords,
1640
  "article_count": len(arts),
@@ -1644,30 +1232,24 @@ def event_payload_from_cluster(cluster, enriched_articles):
1644
  "sample_urls": [a["url"] for a in arts[:3] if a.get("url")],
1645
  }
1646
 
1647
-
1648
  def aggregate_event_by_country(cluster, enriched_articles):
1649
  idxs = cluster["indices"]
1650
  arts = [enriched_articles[i] for i in idxs]
1651
  by_country: Dict[str, Dict[str, Any]] = {}
1652
-
1653
  for a in arts:
1654
  c = a.get("country") or "Unknown"
1655
  if c not in by_country:
1656
  coords = get_country_centroid(c)
1657
  by_country[c] = {"country": c, "lat": coords["lat"], "lon": coords["lon"], "articles": []}
1658
  by_country[c]["articles"].append(a)
1659
-
1660
- # summarize per country
1661
  results = []
1662
  for c, block in by_country.items():
1663
  arr = block["articles"]
1664
- # avg sentiment mapped to -1/0/+1
1665
  to_num = {"negative": -1, "neutral": 0, "positive": 1}
1666
  vals = [to_num.get(a["sentiment"], 0) for a in arr]
1667
  avg = sum(vals) / max(len(vals), 1)
1668
  avg_sent = "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"
1669
  top_sources = [s for s, _ in Counter([a["source"] for a in arr]).most_common(3)]
1670
- # tiny extractive summary: top 2 headlines
1671
  summary = " • ".join([a["title"] for a in arr[:2]])
1672
  results.append(
1673
  {
@@ -1682,7 +1264,7 @@ def aggregate_event_by_country(cluster, enriched_articles):
1682
  {
1683
  "title": a["title"],
1684
  "orig_title": a.get("orig_title"),
1685
- "orig_description": a.get("orig_description"), # 👈 add this
1686
  "url": a["url"],
1687
  "source": a["source"],
1688
  "sentiment": a["sentiment"],
@@ -1694,9 +1276,7 @@ def aggregate_event_by_country(cluster, enriched_articles):
1694
  )
1695
  return results
1696
 
1697
-
1698
  def _merge_close_clusters(clusters, embs, threshold=0.68):
1699
- # clusters: [{"indices":[...], "centroid":tensor}, ...] – add centroid in your first pass
1700
  merged = []
1701
  used = set()
1702
  for i in range(len(clusters)):
@@ -1710,23 +1290,242 @@ def _merge_close_clusters(clusters, embs, threshold=0.68):
1710
  sim = util.cos_sim(base["centroid"], clusters[j]["centroid"]).item()
1711
  if sim >= threshold:
1712
  group.append(j)
1713
- # merge those groups
1714
  all_idx = []
1715
  cents = []
1716
  for g in group:
1717
  used.add(g)
1718
  all_idx.extend(clusters[g]["indices"])
1719
  cents.append(clusters[g]["centroid"])
1720
- # new centroid
1721
  newc = torch.stack(cents, dim=0).mean(dim=0)
1722
  newc = newc / newc.norm()
1723
  merged.append({"indices": sorted(set(all_idx)), "centroid": newc})
1724
  return merged
1725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1726
 
1727
- # ----------------- Endpoints -----------------
1728
- prefetch = False
1729
 
1730
  @app.get("/events")
1731
  def get_events(
1732
  q: Optional[str] = Query(None),
@@ -1740,13 +1539,9 @@ def get_events(
1740
  min_articles: int = Query(2, ge=1, le=200),
1741
  speed: Speed = Query(Speed.balanced),
1742
  ):
1743
-
1744
- # always build cache on untranslated data
1745
  cache_key, enriched, clusters = get_or_build_events_cache(
1746
  q, category, language, False, None, limit_each, speed=speed
1747
  )
1748
-
1749
- # optional post-translate view (does not mutate cache)
1750
  view = enriched
1751
  if translate and target_lang:
1752
  view = [dict(i) for i in enriched]
@@ -1755,13 +1550,12 @@ def get_events(
1755
  i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1756
  i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1757
  i["translated"] = True
1758
-
1759
  events = [event_payload_from_cluster(c, view) for c in clusters]
1760
  events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
1761
  events.sort(key=lambda e: e["article_count"], reverse=True)
1762
-
1763
  return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}
1764
 
 
1765
  @app.get("/event/{event_id}")
1766
  def get_event_details(
1767
  event_id: str,
@@ -1773,14 +1567,13 @@ def get_event_details(
1773
  target_lang: Optional[str] = Query(None),
1774
  limit_each: int = Query(150, ge=5, le=250),
1775
  ):
1776
- # /event/{event_id}
1777
  if cache_key:
1778
  parts = cache_key.split("|")
1779
  if len(parts) != 7:
1780
  raise HTTPException(status_code=400, detail="Bad cache_key")
1781
  speed_str = parts[6]
1782
  try:
1783
- speed_obj = Speed(speed_str) # "fast" | "balanced" | "max"
1784
  except ValueError:
1785
  speed_obj = Speed.balanced
1786
  key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
@@ -1788,17 +1581,13 @@ def get_event_details(
1788
  else:
1789
  speed_obj = Speed.balanced
1790
  key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang, speed=speed_obj)
1791
-
1792
  entry = _events_cache.get(key_tuple)
1793
  if not entry:
1794
- # always build untranslated
1795
  _, enriched, clusters = get_or_build_events_cache(
1796
  q, category, language, False, None, limit_each, speed=speed_obj
1797
  )
1798
  else:
1799
  enriched, clusters = entry["enriched"], entry["clusters"]
1800
-
1801
- # optional post-translate view (do not mutate cache)
1802
  eview = enriched
1803
  if translate and target_lang:
1804
  eview = [dict(i) for i in enriched]
@@ -1807,17 +1596,15 @@ def get_event_details(
1807
  i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1808
  i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1809
  i["translated"] = True
1810
-
1811
  cluster = next((c for c in clusters if cluster_id(c, enriched) == event_id), None)
1812
  if not cluster:
1813
  raise HTTPException(status_code=404, detail="Event not found with current filters")
1814
-
1815
  payload = event_payload_from_cluster(cluster, eview)
1816
  countries = aggregate_event_by_country(cluster, eview)
1817
  payload["articles_in_event"] = sum(c["count"] for c in countries)
1818
  return {"event": payload, "countries": countries}
1819
 
1820
-
1821
  @app.get("/news")
1822
  def get_news(
1823
  cache_key: Optional[str] = Query(None),
@@ -1834,25 +1621,21 @@ def get_news(
1834
  page_size: int = Query(120, ge=5, le=300),
1835
  ):
1836
  enriched: List[Dict[str, Any]] = []
1837
-
1838
- # Pull from cache if provided
1839
  if cache_key:
1840
  parts = cache_key.split("|")
1841
  if len(parts) == 7:
1842
  key_tuple = (
1843
- parts[0], # q
1844
- parts[1], # category
1845
- parts[2], # language
1846
- int(parts[3]), # limit_each
1847
- parts[4] == "True", # translate
1848
- parts[5].lower(), # target_lang
1849
- parts[6], # speed
1850
  )
1851
  entry = _events_cache.get(key_tuple)
1852
  if entry:
1853
  enriched = entry["enriched"]
1854
-
1855
- # Fetch fresh if no cached items
1856
  if not enriched:
1857
  raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each, speed=speed)
1858
  prefetch_descriptions_async(raw, speed)
@@ -1863,12 +1646,9 @@ def get_news(
1863
  else:
1864
  enriched = enriched_all
1865
  else:
1866
- # If we got cached items but want to ensure the selected category is enforced:
1867
  if category:
1868
  cat_norm = (category or "").strip().lower()
1869
  enriched = [e for e in enriched if (e.get("category") or "").lower() == cat_norm]
1870
-
1871
- # Translation (optional)
1872
  if translate and target_lang:
1873
  enriched = [dict(i) for i in enriched]
1874
  for i in enriched:
@@ -1880,25 +1660,18 @@ def get_news(
1880
  i["translated"] = True
1881
  i["translated_from"] = (src_hint or "").lower()
1882
  i["translated_to"] = target_lang.lower()
1883
-
1884
- # Optional sentiment filter
1885
  if sentiment:
1886
  s = sentiment.strip().lower()
1887
  enriched = [i for i in enriched if i.get("sentiment", "").lower() == s]
1888
-
1889
- # Pagination
1890
  total = len(enriched)
1891
  start = (page - 1) * page_size
1892
  end = start + page_size
1893
  items = [dict(i) for i in enriched[start:end]]
1894
-
1895
- # Trim debug fields
1896
  if lite:
1897
  drop = {"_ml_text"}
1898
  for i in items:
1899
  for k in drop:
1900
  i.pop(k, None)
1901
-
1902
  return {
1903
  "items": items,
1904
  "total": total,
@@ -1906,74 +1679,7 @@ def get_news(
1906
  "page_size": page_size
1907
  }
1908
 
1909
-
1910
-
1911
-
1912
-
1913
- def combine_raw_articles(category=None, query=None, language=None, limit_each=30,
1914
- timespan="3d", speed=Speed.balanced, log_summary: bool = True):
1915
- if speed == Speed.fast:
1916
- timespan = "24h"
1917
- limit_each = min(limit_each, 20)
1918
- elif speed == Speed.balanced:
1919
- timespan = "48h"
1920
- limit_each = min(limit_each, 150)
1921
-
1922
- a1 = []
1923
- if USE_NEWSAPI:
1924
- if not query:
1925
- a1 = fetch_newsapi_headlines_multi(limit=limit_each, language=language)
1926
- else:
1927
- a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query, language=language)
1928
-
1929
- a2 = []
1930
- if USE_NEWSDATA_API:
1931
- a2 = [
1932
- normalize_newsdata_article(a)
1933
- for a in fetch_newsdata_articles(category=category, limit=limit_each, query=query, language=language)
1934
- if a.get("link")
1935
- ]
1936
-
1937
- a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
1938
- # a4 = fetch_gdelt_articles(
1939
- # limit=min(100, limit_each * 2),
1940
- # query=query,
1941
- # language=language,
1942
- # timespan=timespan,
1943
- # category=category
1944
- # )
1945
- gdelt_limit = limit_each
1946
- a4 = fetch_gdelt_multi(
1947
- limit=gdelt_limit,
1948
- query=query,
1949
- language=language, # if provided, we honor it (with small EN boost)
1950
- timespan=timespan,
1951
- category=category,
1952
- speed=speed,
1953
- )
1954
-
1955
- # Dedup by canonical URL (maintain source precedence)
1956
- seen, merged = set(), []
1957
- for a in a1 + a3 + a2 + a4:
1958
- if a.get("url"):
1959
- a["url"] = _canonical_url(a["url"])
1960
- url = a["url"]
1961
- if url not in seen:
1962
- seen.add(url)
1963
- merged.append(a)
1964
-
1965
- if log_summary:
1966
- fetch_log.info("----- Article Fetch Summary -----")
1967
- fetch_log.info(f"📊 NewsAPI returned: {len(a1)} articles")
1968
- fetch_log.info(f"📊 NewsData.io returned: {len(a2)} articles")
1969
- fetch_log.info(f"📊 GNews returned: {len(a3)} articles")
1970
- fetch_log.info(f"📊 GDELT returned: {len(a4)} articles")
1971
- fetch_log.info(f"✅ Total merged articles after deduplication: {len(merged)}")
1972
- fetch_log.info("---------------------------------")
1973
-
1974
- return merged
1975
-
1976
-
1977
  @app.get("/related")
1978
  def related_articles(
1979
  id: Optional[str] = Query(None, description="article id from /news"),
@@ -1985,13 +1691,10 @@ def related_articles(
1985
  limit_each: int = Query(50, ge=5, le=100),
1986
  k: int = Query(10, ge=1, le=50),
1987
  ):
1988
- # ensure we have a working article list (enriched) to search over
1989
  raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each)
1990
  enriched = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
1991
  if not enriched:
1992
  return {"items": []}
1993
-
1994
- # pick the query vector
1995
  if id:
1996
  base = next((a for a in enriched if a.get("id") == id), None)
1997
  if not base:
@@ -2002,15 +1705,10 @@ def related_articles(
2002
  if not text:
2003
  raise HTTPException(400, "provide either id or title/description")
2004
  query_text = text
2005
-
2006
  corpus_texts = [a["_ml_text"] for a in enriched]
2007
  corpus_embs = _embed_texts(corpus_texts)
2008
  query_emb = _embed_texts([query_text])[0]
2009
-
2010
- # cosine similarities
2011
  sims = util.cos_sim(query_emb, corpus_embs).cpu().numpy().flatten()
2012
-
2013
- # take top-k excluding the query itself (if id provided)
2014
  idxs = sims.argsort()[::-1]
2015
  items = []
2016
  for idx in idxs:
@@ -2020,9 +1718,9 @@ def related_articles(
2020
  items.append({**a, "similarity": float(sims[idx])})
2021
  if len(items) >= k:
2022
  break
2023
-
2024
  return {"items": items}
2025
 
 
2026
  @app.middleware("http")
2027
  async def timing_middleware(request, call_next):
2028
  start = time.perf_counter()
@@ -2032,21 +1730,21 @@ async def timing_middleware(request, call_next):
2032
  return response
2033
  finally:
2034
  dur_ms = (time.perf_counter() - start) * 1000
2035
- # log.info(f"{request.method} {request.url.path} -> {dur_ms:.1f} ms ({_fmt_mmss(dur_ms)})")
2036
  if response is not None:
2037
  try:
2038
  response.headers["X-Process-Time-ms"] = f"{dur_ms:.1f}"
2039
  except Exception:
2040
  pass
2041
 
 
2042
  @app.post("/client-metric")
2043
  def client_metric(payload: Dict[str, Any] = Body(...)):
2044
  name = (payload.get("name") or "").strip()
2045
- # Drop redraw spam if it ever slips through again
2046
  if name in {"Load all article markers on globe", "Load event country markers on globe"}:
2047
  return {"ok": True}
2048
  return {"ok": True}
2049
 
 
2050
  @app.get("/diag/translate")
2051
  def diag_translate():
2052
  remote = _hf_call("Helsinki-NLP/opus-mt-es-en", {"inputs":"Hola mundo"})
 
1
+ # ----------------- Imports (Stdlib + Typing) -----------------
3
  from fastapi import FastAPI, Query, HTTPException, Body
4
  from typing import Optional, List, Dict, Any, Tuple, Set
5
  import os
 
15
  import nltk
16
  from nltk.sentiment import SentimentIntensityAnalyzer
17
  from geopy.geocoders import Nominatim
 
18
  from fastapi.middleware.cors import CORSMiddleware
19
  from countryinfo import CountryInfo
20
  from sentence_transformers import SentenceTransformer, util
 
32
  os.environ.setdefault("OMP_NUM_THREADS", "1")
33
  from fastapi.responses import PlainTextResponse, JSONResponse
34
 
35
+ # ----------------- Torch Runtime Settings -----------------
 
 
 
 
 
36
  import torch
37
  torch.set_num_threads(2)
38
 
39
+ # ----------------- Optional Local Tokenizers -----------------
40
  try:
41
+ import sentencepiece as _spm
42
  _HAS_SENTENCEPIECE = True
43
  except Exception:
44
  _HAS_SENTENCEPIECE = False
45
 
46
+ # ----------------- Runtime Modes / Speed Enum -----------------
47
  from enum import Enum
48
  class Speed(str, Enum):
49
  fast = "fast"
50
  balanced = "balanced"
51
  max = "max"
52
 
53
+ # ----------------- Global Model Handles / Pipelines -----------------
54
  _local_pipes = {}
55
  _news_clf = None
56
  _sbert = None
57
 
58
+ # ----------------- tldextract (PSL-cached) -----------------
 
59
  _TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
60
  try:
 
61
  _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
62
  except Exception:
 
63
  _tld = tldextract.extract
64
 
65
+ # ----------------- Translation Runtime Flags -----------------
66
+ ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1"
 
67
  _hf_bad_models: Set[str] = set()
68
 
69
+ # ----------------- FastAPI App + Middleware -----------------
70
+ app = FastAPI()
71
+ app.add_middleware(
72
+ CORSMiddleware,
73
+ allow_origins=["*"],
74
+ allow_credentials=False,
75
+ allow_methods=["*"],
76
+ allow_headers=["*"],
77
+ )
78
+ app.add_middleware(GZipMiddleware, minimum_size=500)
79
 
80
+ @app.api_route("/", methods=["GET", "HEAD"], include_in_schema=False)
81
+ def root():
82
+ return JSONResponse({"ok": True, "service": "newsglobe-backend"})
 
 
 
 
83
 
84
+ @app.api_route("/healthz", methods=["GET", "HEAD"], include_in_schema=False)
85
+ def healthz():
86
+ return PlainTextResponse("OK", status_code=200)
 
 
87
 
88
+ @app.get("/favicon.ico", include_in_schema=False)
89
+ def favicon():
90
+ return PlainTextResponse("", status_code=204)
 
 
91
 
92
+ # ----------------- HTTP Session (connection pooling) -----------------
93
  SESSION = requests.Session()
94
  ADAPTER = requests.adapters.HTTPAdapter(pool_connections=64, pool_maxsize=64, max_retries=2)
95
  SESSION.mount("http://", ADAPTER)
 
100
  headers.setdefault("User-Agent", "Mozilla/5.0 (compatible; NewsGlobe/1.0)")
101
  return SESSION.get(url, headers=headers, timeout=kwargs.pop("timeout", 12), **kwargs)
102
 
103
+ # ----------------- Lightweight Reader Fallback (Jina) -----------------
104
  def _try_jina_reader(url: str, timeout: int) -> Optional[str]:
105
  try:
106
  u = url.strip()
 
117
  pass
118
  return None
119
 
120
+ # ----------------- Description Cleanup Helpers -----------------
 
 
 
121
  BOILER_DESC = re.compile(
122
  r"(subscribe|sign in|sign up|enable javascript|cookies? (policy|settings)|"
123
  r"privacy (policy|notice)|continue reading|read more|click here|"
 
126
  )
127
 
128
  def _split_sentences(text: str) -> List[str]:
 
129
  parts = re.split(r"(?<=[\.\?\!])\s+(?=[A-Z0-9])", (text or "").strip())
 
130
  out = []
131
  for p in parts:
132
  out.extend(re.split(r"\s+[•–—]\s+", p))
133
  return [p.strip() for p in out if p and len(p.strip()) >= 2]
134
 
 
 
 
 
135
  def _too_similar(a: str, b: str, thresh: float = 0.92) -> bool:
 
136
  a = (a or "").strip()
137
  b = (b or "").strip()
138
  if not a or not b:
 
145
  return ratio >= thresh
146
 
147
  def _dedupe_title_from_desc(title: str, desc: str) -> str:
 
148
  t = (title or "").strip()
149
  d = (desc or "").strip()
150
  if not t or not d:
151
  return d
 
152
  if d.lower().startswith(t.lower()):
153
  d = d[len(t):].lstrip(" -–:•|")
 
154
  d = d.replace(t, "").strip(" -–:•|")
155
  d = _clean_text(d)
156
  return d
157
 
158
+ def _clean_text(s: str) -> str:
159
+ s = unescape(s or "")
160
+ s = re.sub(r"\s+", " ", s).strip()
161
+ return s
162
+
163
+ def _tidy_description(title: str, desc: str, source_name: str, max_chars: int = 240) -> str:
164
+ if not desc:
165
+ return ""
166
+ desc = _dedupe_title_from_desc(title, desc)
167
+ desc = BOILER_DESC.sub("", desc)
168
+ desc = re.sub(r"\s+", " ", desc).strip(" -–:•|")
169
+ sents = _split_sentences(desc)
170
+ if not sents:
171
+ sents = [desc]
172
+ best = " ".join(sents[:2]).strip()
173
+ if len(best) > max_chars:
174
+ if len(sents[0]) <= max_chars * 0.9:
175
+ best = sents[0]
176
+ else:
177
+ best = best[:max_chars].rsplit(" ", 1)[0].rstrip(",;:-–—")
178
+ if _too_similar(title, best):
179
+ for alt in sents[1:3]:
180
+ if not _too_similar(title, alt):
181
+ best = alt
182
+ break
183
+ if best and best[-1] not in ".!?":
184
+ best += "."
185
+ return best
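+ # Illustrative behaviour (hypothetical strings): when a feed repeats the title at the
+ # start of the description, the helper strips it and keeps at most two sentences, e.g.
+ #   _tidy_description("Storm hits coast",
+ #                      "Storm hits coast - Heavy rain flooded roads. More at 10.",
+ #                      "Example Wire")
+ #   -> "Heavy rain flooded roads. More at 10."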
186
 
187
+ # ----------------- Inflight Request Coalescing -----------------
188
  _inflight_locks: Dict[Tuple, threading.Lock] = {}
189
  _inflight_global_lock = threading.Lock()
 
 
190
  def _get_inflight_lock(key: Tuple) -> threading.Lock:
191
  with _inflight_global_lock:
192
  lk = _inflight_locks.get(key)
 
195
  _inflight_locks[key] = lk
196
  return lk
197
 
198
+ # ----------------- Description Fetching (Cache + Extract) -----------------
199
  DESC_CACHE_LOCK = threading.Lock()
 
200
  try:
201
+ from bs4 import BeautifulSoup
202
  except Exception:
203
  BeautifulSoup = None
204
 
205
+ DESC_FETCH_TIMEOUT = 3
206
+ DESC_MIN_LEN = 100
207
+ DESC_CACHE_TTL = 24 * 3600
208
+ MAX_DESC_FETCHES = 24
209
+ DESC_WORKERS = 12
 
 
 
 
210
  DESC_CACHE: Dict[str, Dict[str, Any]] = {}
211
 
 
212
  def _now_mono():
213
+ # Monotonic for TTL calculations
214
  try:
215
  return monotonic()
216
  except Exception:
217
  return time.time()
218
 
 
 
219
  def _extract_desc_from_ld_json(html: str) -> Optional[str]:
220
+ # Prefer LD-JSON when present (often cleaner summaries)
221
  if not html or not BeautifulSoup:
222
  return None
223
  try:
 
225
  for tag in soup.find_all("script", {"type": "application/ld+json"}):
226
  try:
227
  import json
 
228
  data = json.loads(tag.string or "")
229
  except Exception:
230
  continue
 
231
  def find_desc(obj):
232
  if not isinstance(obj, (dict, list)):
233
  return None
 
237
  if v:
238
  return v
239
  return None
 
240
  for key in ("description", "abstract", "articleBody"):
241
  val = obj.get(key)
242
  if isinstance(val, str):
243
  txt = _clean_text(val)
244
  if len(txt) >= 40:
245
  return txt
 
246
  for k, v in obj.items():
247
  if isinstance(v, (dict, list)):
248
  got = find_desc(v)
249
  if got:
250
  return got
251
  return None
 
252
  d = find_desc(data)
253
  if d and len(d) >= 40:
254
  return d
 
256
  pass
257
  return None
258
 
259
+ # Heuristic to detect consent walls and jump to reader fallback
260
  CONSENT_HINTS = re.compile(r"(consent|gdpr|privacy choices|before you continue|we value your privacy)", re.I)
261
 
 
262
  def _looks_like_consent_wall(html: str) -> bool:
263
  if not html:
264
  return False
265
+ if "consent.yahoo.com" in html.lower():
266
  return True
 
267
  return bool(CONSENT_HINTS.search(html))
268
 
 
269
  def _extract_desc_from_html(html: str) -> Optional[str]:
270
  html = html or ""
271
  if BeautifulSoup:
272
  soup = BeautifulSoup(html, "html.parser")
 
 
273
  ld = _extract_desc_from_ld_json(html)
274
  if ld:
275
  txt = _clean_text(ld)
276
  if 40 <= len(txt) <= 480:
277
  return txt
 
 
278
  for sel, attr in [
279
  ('meta[property="og:description"]', "content"),
280
  ('meta[name="twitter:description"]', "content"),
 
285
  txt = _clean_text(tag.get(attr, ""))
286
  if len(txt) >= 40:
287
  return txt
 
288
  for p in soup.find_all("p"):
289
  txt = _clean_text(p.get_text(" "))
290
  if len(txt) >= 80:
291
  return txt
292
  else:
 
293
  for pat in [
294
  r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']',
295
  r'<meta[^>]+name=["\']twitter:description["\'][^>]+content=["\']([^"\']+)["\']',
 
305
  txt = _clean_text(re.sub("<[^>]+>", " ", m.group(1)))
306
  if len(txt) >= 80:
307
  return txt
 
308
  return None
309
 
 
310
  def _desc_cache_get(url: str) -> Optional[str]:
311
  if not url:
312
  return None
 
318
  return None
319
  return entry["text"]
320
 
 
321
  def _desc_cache_put(url: str, text: str):
322
  if url and text:
323
  with DESC_CACHE_LOCK:
324
  DESC_CACHE[url] = {"text": text, "t": _now_mono()}
325
 
 
326
  def _attempt_fetch(url: str, timeout: int) -> Optional[str]:
327
+ # Fetch page and extract description; fallback to reader if needed
328
  headers = {
329
  "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:you@yourdomain.com)",
330
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 
339
  if "html" not in ct and "<html" not in txt.lower():
340
  return None
341
  if _looks_like_consent_wall(txt):
 
342
  jd = _try_jina_reader(url, timeout)
343
  if jd:
344
  return jd
 
347
  if desc and 40 <= len(desc) <= 480:
348
  return desc
349
  except Exception:
 
350
  pass
 
 
351
  jd = _try_jina_reader(url, timeout)
352
  if jd and 40 <= len(jd) <= 480:
353
  return jd
354
  return None
355
 
 
 
356
  def fetch_page_description(url: str) -> Optional[str]:
357
+ # Public entry: consult cache -> fetch -> AMP variants -> cache
358
  if not url:
359
  return None
360
  cached = _desc_cache_get(url)
361
  if cached:
362
  return cached
 
 
363
  desc = _attempt_fetch(url, DESC_FETCH_TIMEOUT)
364
  if not desc:
 
365
  amp_candidates = []
366
  try:
367
  p = urlparse(url)
 
368
  if not p.path.endswith("/amp"):
369
  amp_candidates.append(urlunparse(p._replace(path=(p.path.rstrip("/") + "/amp"))))
 
370
  q = p.query
371
  amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "amp=1"))))
 
372
  amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "outputType=amp"))))
373
  except Exception:
374
  pass
 
376
  desc = _attempt_fetch(amp_url, DESC_FETCH_TIMEOUT)
377
  if desc:
378
  break
 
379
  if desc:
380
  _desc_cache_put(url, desc)
381
  return desc
382
  return None
383
 
 
384
  def _needs_desc_upgrade(a: Dict[str, Any]) -> bool:
385
+ # Decide if we should try to refetch a better description
386
  url = a.get("url") or ""
387
  if not url:
388
  return False
 
392
  return True
393
  if len(desc) < DESC_MIN_LEN:
394
  return True
 
395
  if _too_similar(title, desc):
396
  return True
397
  return False
398
 
 
399
  def prefetch_descriptions(raw_articles: List[Dict[str, Any]], speed: Speed = Speed.balanced):
400
+ # Parallel prefetch for weak descriptions (bounded to avoid stampedes)
401
  candidates, seen = [], set()
402
  max_fetches = 6 if speed == Speed.fast else 8 if speed == Speed.balanced else 16
403
  timeout = 1 if speed == Speed.fast else 2
404
  workers = 3 if speed == Speed.fast else 4 if speed == Speed.balanced else 8
 
 
405
  for a in raw_articles:
406
+ url = a.get("url")
407
+ if not url or url in seen:
408
+ continue
409
  seen.add(url)
410
  if _needs_desc_upgrade(a) and not _desc_cache_get(url):
411
  candidates.append(url)
412
+ if len(candidates) >= max_fetches:
413
+ break
414
+ if not candidates:
415
+ return
416
  with ThreadPoolExecutor(max_workers=workers) as ex:
417
  futs = [ex.submit(fetch_page_description, u) for u in candidates]
418
+ for _ in as_completed(futs):
419
+ pass
420
 
421
  def prefetch_descriptions_async(raw_articles, speed: Speed = Speed.balanced):
422
  threading.Thread(target=prefetch_descriptions, args=(raw_articles, speed), daemon=True).start()
423
 
424
+ # ----------------- Category / Keyword Heuristics -----------------
425
+ DetectorFactory.seed = 0
426
 
427
  SECTION_HINTS = {
428
  "sports": "sports",
 
463
  "politics": r"\b(president|parliament|congress|minister|policy|campaign|election)\b",
464
  }
465
 
466
+ def get_news_clf():
467
+ # Lazy-init topic classifier
468
+ global _news_clf
469
+ if _news_clf is None:
470
+ _news_clf = hf_pipeline(
471
+ "text-classification",
472
+ model="cardiffnlp/tweet-topic-21-multi",
473
+ top_k=1,
474
+ )
475
+ return _news_clf
476
 
477
  def _infer_category_from_url_path(url_path: str) -> Optional[str]:
478
+ # URL-path step of category inference (full chain: provided -> URL path -> keyword -> ML fallback)
479
  parts = [p for p in url_path.lower().split("/") if p]
480
  for p in parts:
481
  if p in SECTION_HINTS:
482
  return SECTION_HINTS[p]
 
483
  for p in parts:
484
  for tok in re.split(r"[-_]", p):
485
  if tok in SECTION_HINTS:
486
  return SECTION_HINTS[tok]
487
  return None
488
 
 
489
  def _infer_category_from_text(text: str) -> Optional[str]:
490
  if not text:
491
  return None
 
494
  return cat
495
  return None
496
 
 
497
  def infer_category(article_url, title, description, provided):
498
  if provided:
499
  p = provided.strip().lower()
500
  if p:
501
  return p
 
502
  try:
503
  p = urlparse(article_url).path or ""
504
  cat = _infer_category_from_url_path(p)
 
506
  return cat
507
  except Exception:
508
  pass
 
509
  text = f"{title or ''} {description or ''}".strip()
510
  cat = _infer_category_from_text(text)
511
  if cat:
512
  return cat
513
  try:
514
+ preds = get_news_clf()(text[:512])
515
  if isinstance(preds[0], list):
516
  label = preds[0][0]["label"]
517
  else:
 
521
  log.warning(f"ML category failed: {e}")
522
  return "general"
523
 
524
+ # ----------------- Language Detection / Embeddings -----------------
 
 
 
 
525
  def detect_lang(text: str) -> Optional[str]:
526
  try:
527
+ return detect(text)
528
  except Exception:
529
  return None
530
 
531
+ def get_sbert():
532
+ global _sbert
533
+ if _sbert is None:
534
+ _sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
535
+ return _sbert
536
 
537
  def _embed_texts(texts: List[str]):
538
  embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
539
  return embs
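+ # Note: normalize_embeddings=True returns unit-length vectors, so the util.cos_sim
+ # scores used for clustering and /related reduce to plain dot products.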
540
 
541
+ # ----------------- NLTK / VADER Sentiment -----------------
 
 
 
 
542
  NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
 
 
543
  if NLTK_DATA_DIR not in nltk.data.path:
544
  nltk.data.path.insert(0, NLTK_DATA_DIR)
 
545
  try:
546
  nltk.data.find("sentiment/vader_lexicon")
547
  except LookupError:
 
548
  try:
549
  os.makedirs(NLTK_DATA_DIR, exist_ok=True)
550
  nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
551
  except Exception:
552
+ pass
 
553
  try:
554
  _vader = SentimentIntensityAnalyzer()
555
  except Exception:
 
564
  c = scores["compound"]
565
  return "positive" if c >= 0.2 else "negative" if c <= -0.2 else "neutral"
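+ # Illustrative thresholds: a VADER compound score of 0.35 maps to "positive",
+ # -0.05 to "neutral", and -0.40 to "negative".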
566
 
567
+ # ----------------- Geocoding / Domain → Country -----------------
 
568
  def get_country_centroid(country_name):
569
  if not country_name or country_name == "Unknown":
570
  return {"lat": 0, "lon": 0, "country": "Unknown"}
 
576
  log.info(f"Could not get centroid for {country_name}: {e}")
577
  return {"lat": 0, "lon": 0, "country": country_name or "Unknown"}
578
 
 
579
  def resolve_domain_to_ip(domain):
580
  if not domain:
581
  return None
 
584
  except socket.gaierror:
585
  return None
586
 
 
587
  def geolocate_ip(ip):
588
  try:
589
  r = _session_get(f"https://ipwho.is/{ip}?fields=success,country,latitude,longitude", timeout=8)
 
594
  pass
595
  return None
596
 
597
+ # Nominatim for a light refinement pass (async)
598
  geolocator = Nominatim(user_agent="newsglobe-app (contact: you@example.com)")
599
  domain_geo_cache: Dict[str, Dict[str, Any]] = {}
600
 
 
612
  "lefigaro.fr": "France",
613
  "kyodonews.net": "Japan",
614
  "straitstimes.com": "Singapore",
615
+ "thesun.my": "Malaysia",
616
  }
617
 
 
618
  def geocode_source(source_text: str, domain: str = "", do_network: bool = False):
619
  cache_key = f"{source_text}|{domain}"
620
  if cache_key in domain_geo_cache:
621
  return domain_geo_cache[cache_key]
 
622
  ext = _tld(domain or "")
623
  fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
 
 
624
  if fqdn in MAJOR_OUTLETS:
625
  coords = get_country_centroid(MAJOR_OUTLETS[fqdn]); domain_geo_cache[cache_key] = coords; return coords
626
  if ext.domain in domain_country_map:
627
  coords = get_country_centroid(domain_country_map[ext.domain]); domain_geo_cache[cache_key] = coords; return coords
 
 
628
  coords = get_country_centroid(_suffix_country(ext.suffix))
629
  domain_geo_cache[cache_key] = coords
 
 
630
  if do_network:
631
  threading.Thread(target=_refine_geo_async, args=(cache_key, source_text, fqdn), daemon=True).start()
 
632
  return coords
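+ # Resolution order used above: known outlet FQDN -> hand-maintained domain_country_map
+ # -> TLD-suffix fallback, with an optional async IP/Nominatim refinement that
+ # overwrites the cached entry when do_network=True.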
633
 
634
  def _suffix_country(suffix: Optional[str]) -> str:
 
643
  }
644
  return m.get(s, "United States" if s in ("com","org","net") else "Unknown")
645
 
 
 
646
  def _refine_geo_async(cache_key, source_text, fqdn):
647
  try:
 
648
  ip = resolve_domain_to_ip(fqdn) if fqdn else None
649
  if ip:
650
  coords = geolocate_ip(ip)
651
  if coords:
652
  domain_geo_cache[cache_key] = coords
653
  return
 
654
  location = geolocator.geocode(f"{source_text} News Headquarters", timeout=2)
655
  if location and hasattr(location, "raw"):
656
  coords = {
 
662
  except Exception:
663
  pass
664
 
665
+ # ----------------- Translation (HF / Libre / Local) -----------------
666
+ HF_MODEL_PRIMARY = None
667
 
 
668
  NLLB_CODES = {
669
  "en": "eng_Latn",
670
  "es": "spa_Latn",
 
680
  "ko": "kor_Hang",
681
  }
682
 
 
 
683
  def opus_model_for(src2: str, tgt2: str) -> Optional[str]:
684
  pairs = {
685
  ("es", "en"): "Helsinki-NLP/opus-mt-es-en",
 
707
  }
708
  return pairs.get((src2, tgt2))
709
 
 
710
  SUPPORTED = {"en", "fr", "de", "es", "it", "hi", "ar", "ru", "ja", "ko", "pt", "zh"}
711
+ LIBRETRANSLATE_URL = os.getenv("LIBRETRANSLATE_URL")
 
712
 
713
  def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
714
  url = LIBRETRANSLATE_URL
 
730
  log.warning("LibreTranslate failed: %s", e)
731
  return None
732
 
 
733
  def _hf_call(model_id: str, payload: dict) -> Optional[str]:
 
734
  if not (HUGGINGFACE_API_TOKEN and ALLOW_HF_REMOTE):
735
  return None
736
  if model_id in _hf_bad_models:
737
  return None
 
738
  url = f"https://api-inference.huggingface.co/models/{model_id}"
739
  headers = {
740
  "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",
 
755
  except Exception as e:
756
  log.warning("HF request failed: %s", e)
757
  return None
 
758
  if isinstance(j, list) and j and isinstance(j[0], dict):
759
  if "generated_text" in j[0]:
760
  return j[0]["generated_text"]
 
770
  def _translate_cached(text: str, src: str, tgt: str) -> str:
771
  if not text or src == tgt:
772
  return text
 
 
773
  out = _translate_via_libre(text, src, tgt)
774
  if out:
775
  return out
 
 
776
  opus_model = opus_model_for(src, tgt)
777
  if opus_model:
778
  out = _hf_call(opus_model, {"inputs": text})
779
  if out:
780
  return out
 
 
781
  try:
782
  if HF_MODEL_PRIMARY and (src in NLLB_CODES) and (tgt in NLLB_CODES):
783
  out = _hf_call(
 
792
  return out
793
  except Exception:
794
  pass
 
 
795
  if src != "en" and tgt != "en":
796
  step_en = _translate_cached(text, src, "en")
797
  if step_en and step_en != text:
798
  out = _translate_cached(step_en, "en", tgt)
799
  if out:
800
  return out
 
 
801
  out = _translate_local(text, src, tgt)
802
  if out:
803
  return out
 
804
  log.warning("All translate paths failed (%s->%s); returning original.", src, tgt)
805
  return text
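+ # Fallback chain, as implemented above: LibreTranslate -> remote OPUS model via the HF
+ # Inference API -> NLLB (only if HF_MODEL_PRIMARY is set) -> pivot through English ->
+ # local OPUS pipeline -> give up and return the original text.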
806
 
 
 
 
807
  def translate_text(text: str, target_lang: Optional[str], fallback_src: Optional[str] = None) -> str:
808
  if not text or not target_lang:
809
  return text
 
824
  src = "en"
825
  return _translate_cached(text, src, tgt)
826
 
827
+ def _translate_local(text: str, src: str, tgt: str) -> Optional[str]:
828
+ if not _HAS_SENTENCEPIECE:
829
+ return None
830
+ model_id = opus_model_for(src, tgt)
831
+ if not model_id:
832
+ return None
833
+ key = model_id
834
+ try:
835
+ if key not in _local_pipes:
836
+ _local_pipes[key] = hf_pipeline("translation", model=model_id)
837
+ out = _local_pipes[key](text, max_length=512)
838
+ return out[0]["translation_text"]
839
+ except Exception as e:
840
+ log.warning("Local translate failed for %s: %s", model_id, e)
841
+ return None
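+ # Illustrative call (assumes sentencepiece is installed and the model can be loaded):
+ #   _translate_local("Hola mundo", "es", "en")  # resolves to "Helsinki-NLP/opus-mt-es-en"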
842
 
843
+ # ----------------- Warmup Settings & Routine -----------------
844
+ WARM_LIMIT_EACH = 20
845
+ WARM_TIMESPAN = "24h"
 
 
846
  WARM_PREFETCH_DESCRIPTIONS = False
847
 
848
  def _fmt_mmss(ms: float) -> str:
 
854
  try:
855
  log.info("WARM: starting background warm-up (limit_each=%d, timespan=%s)", WARM_LIMIT_EACH, WARM_TIMESPAN)
856
  t0 = time.perf_counter()
 
 
857
  get_sbert()
858
  get_news_clf()
 
 
859
  t1 = time.perf_counter()
860
  raw = combine_raw_articles(
861
  category=None, query=None, language="en",
862
  limit_each=WARM_LIMIT_EACH, timespan=WARM_TIMESPAN,
863
+ log_summary=False
864
  )
865
  t_fetch = (time.perf_counter() - t1) * 1000
 
 
866
  if WARM_PREFETCH_DESCRIPTIONS:
867
  prefetch_descriptions_async(raw)
 
 
868
  t2 = time.perf_counter()
869
  enriched = [enrich_article(a, language="en", translate=False, target_lang=None) for a in raw]
870
  t_enrich = (time.perf_counter() - t2) * 1000
 
871
  t3 = time.perf_counter()
872
  clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD)
873
  t_cluster = (time.perf_counter() - t3) * 1000
 
 
874
  key = cache_key_for(q=None, category=None, language="en",
875
  limit_each=WARM_LIMIT_EACH, translate=False, target_lang=None,
876
+ speed=Speed.balanced)
 
877
  _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
 
878
  t_total = (time.perf_counter() - t0) * 1000
879
  log.info(
880
  "WARM: fetch=%s, enrich=%s, cluster=%s, total=%s (raw=%d, enriched=%d, clusters=%d)",
 
886
 
887
  @app.on_event("startup")
888
  def warm():
 
889
  get_sbert()
890
  get_news_clf()
 
891
  threading.Thread(target=_warm_once, daemon=True).start()
892
 
893
+ # ----------------- GDELT Query Helpers -----------------
 
894
  _GDELT_LANG = {
895
  "en": "english",
896
  "es": "spanish",
 
906
  "zh": "chinese",
907
  }
908
 
 
909
  def _gdelt_safe_query(user_q, language):
910
  parts = []
911
  if user_q:
 
916
  if language and (lg := _GDELT_LANG.get(language.lower())):
917
  parts.append(f"sourcelang:{lg}")
918
  if not parts:
 
919
  parts.append("sourcelang:english")
920
  return " ".join(parts)
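+ # Illustrative outputs: _gdelt_safe_query(None, "es") -> "sourcelang:spanish";
+ # with no query and no recognised language it falls back to "sourcelang:english".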
921
 
922
+ # ----------------- GDELT Fetchers -----------------
923
  def fetch_gdelt_articles(
924
  limit=50,
925
  query=None,
 
960
 
961
  data = _do_request(params)
962
  if data is None:
 
963
  p2 = {**params, "timespan": "24h", "maxrecords": min(100, params["maxrecords"])}
964
  data = _do_request(p2)
965
  if not data:
 
985
  "publishedAt": a.get("seendate"),
986
  "api_source": "gdelt",
987
  "gdelt_sourcecountry": a.get("sourcecountry"),
 
988
  "requested_category": category,
989
  }
990
  )
991
  log.info(f"GDELT returned {len(results)}")
992
  return results
993
 
994
+ def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", category=None, speed: Speed = Speed.balanced):
995
+ if language:
996
+ primary = fetch_gdelt_articles(limit=limit, query=query, language=language, timespan=timespan, category=category)
997
+ booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en", timespan=timespan, category=category)
998
+ return primary + booster
999
+ if speed == Speed.fast:
1000
+ langs = LANG_ROTATION[:3]
1001
+ timespan = "24h"
1002
+ elif speed == Speed.balanced:
1003
+ langs = LANG_ROTATION[:8]
1004
+ timespan = "48h"
1005
+ else:
1006
+ langs = LANG_ROTATION
1007
+ timespan = "3d"
1008
+ per_lang = max(8, math.ceil(limit / len(langs)))
1009
+ out = []
1010
+ for lg in langs:
1011
+ out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg, timespan=timespan, category=category))
1012
+ if speed != Speed.fast:
1013
+ per_cc = max(4, limit // 30) if speed == Speed.max else max(2, limit // 40)
1014
+ for cc in COUNTRY_SEEDS[: (8 if speed == Speed.balanced else 16)]:
1015
+ out.extend(
1016
+ fetch_gdelt_articles(
1017
+ limit=per_cc,
1018
+ query=query,
1019
+ language="en",
1020
+ timespan=timespan,
1021
+ category=category,
1022
+ extra_tokens=[f"sourcecountry:{cc}"]
1023
+ )
1024
+ )
1025
+ return out
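+ # Worked example for the default call (limit=120, balanced speed): 8 languages are
+ # queried with per_lang = max(8, ceil(120 / 8)) = 15, then 8 country seeds get an
+ # English boost of per_cc = max(2, 120 // 40) = 3 articles each.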
1026
 
1027
 
1028
+ # ----------------- Provider Flags / Keys / Logging -----------------
1029
+ USE_GNEWS_API = False
1030
+ USE_NEWSDATA_API = False
1031
+ USE_GDELT_API = True
1032
+ USE_NEWSAPI = False
 
 
 
 
1033
 
1034
+ NEWSAPI_KEY = os.getenv("NEWSAPI_KEY", "ea734c66dc4044fa8e4501ad7b90e753")
1035
+ GNEWS_API_KEY = os.getenv("GNEWS_API_KEY", "5419897c95e8a4b21074e0d3fe95a3dd")
1036
+ NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY", "pub_1feb49a71a844719af68d0844fb43a61")
1037
+ HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
1038
 
1039
+ logging.basicConfig(
1040
+ level=logging.WARNING,
1041
+ format="%(levelname)s:%(name)s:%(message)s",
1042
+ )
1043
+ log = logging.getLogger("newsglobe")
1044
+ log.setLevel(logging.WARNING)
1045
+ fetch_log = logging.getLogger("newsglobe.fetch_summary")
1046
+ fetch_log.setLevel(logging.INFO)
1047
+ _fetch_handler = logging.StreamHandler()
1048
+ _fetch_handler.setLevel(logging.INFO)
1049
+ _fetch_handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
1050
+ fetch_log.addHandler(_fetch_handler)
1051
+ fetch_log.propagate = False
 
 
1052
 
1053
+ for name in ("urllib3", "urllib3.connectionpool", "requests.packages.urllib3"):
1054
+ lg = logging.getLogger(name)
1055
+ lg.setLevel(logging.ERROR)
1056
+ lg.propagate = False
 
 
 
 
1057
 
1058
+ def _newsapi_enabled() -> bool:
1059
+ if not NEWSAPI_KEY:
1060
+ log.warning("NewsAPI disabled: missing NEWSAPI_KEY env var")
1061
+ return False
1062
+ return True
1063
 
1064
+ # ----------------- Clustering Helpers -----------------
1065
+ def cluster_id(cluster, enriched_articles):
1066
+ urls = sorted([(enriched_articles[i].get("url") or "") for i in cluster["indices"] if enriched_articles[i].get("url")])
1067
+ base = "|".join(urls) if urls else "empty"
1068
+ return hashlib.md5(base.encode("utf-8")).hexdigest()[:10]
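+ # Illustrative (hypothetical URLs): the id depends only on the sorted member URLs, so
+ # re-clustering the same articles reproduces the same event_id:
+ #   cluster_id({"indices": [0, 1]}, [{"url": "https://a.example/x"}, {"url": "https://b.example/y"}])
+ #   == hashlib.md5(b"https://a.example/x|https://b.example/y").hexdigest()[:10]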
1069
 
1070
+ BOILER = re.compile(r"\b(live updates|breaking|what we know|in pictures|opinion)\b", re.I)
 
 
 
 
1071
 
1072
+ def _norm_text(s: str) -> str:
1073
+ s = (s or "").strip()
1074
+ s = re.sub(r"\s+", " ", s)
1075
+ return s
 
 
 
 
1076
 
1077
+ def _cluster_text(a):
1078
+ base = f"{a.get('orig_title') or a.get('title') or ''} {a.get('orig_description') or a.get('description') or ''}"
1079
+ base = BOILER.sub("", base)
1080
+ base = re.sub(r"\b(\d{1,2}:\d{2}\s?(AM|PM))|\b(\d{1,2}\s\w+\s\d{4})", "", base, flags=re.I)
1081
+ return _norm_text(base)
1082
 
1083
+ def _canonical_url(u: str) -> str:
1084
+ if not u:
1085
+ return u
1086
+ p = urlparse(u)
1087
+ qs = [(k, v) for (k, v) in parse_qsl(p.query, keep_blank_values=False) if not k.lower().startswith(("utm_", "fbclid", "gclid"))]
1088
+ clean = p._replace(query="&".join([f"{k}={v}" for k, v in qs]), fragment="")
1089
+ path = clean.path.rstrip("/") or "/"
1090
+ clean = clean._replace(path=path)
1091
+ return urlunparse(clean)
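+ # Illustrative (hypothetical URL): tracking params, fragments and trailing slashes are dropped:
+ #   _canonical_url("https://example.com/story/?utm_source=x&id=7#top")
+ #   -> "https://example.com/story?id=7"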
1092
 
1093
+ # ----------------- Normalizers / Enrichment -----------------
1094
  def normalize_newsdata_article(article):
1095
  return {
1096
  "title": article.get("title"),
 
1102
  "category": ((article.get("category") or [None])[0] if isinstance(article.get("category"), list) else article.get("category")),
1103
  }
1104
 
 
 
1105
  def enrich_article(a, language=None, translate=False, target_lang=None):
 
1106
  source_name = (a.get("source", {}) or {}).get("name", "").strip() or "Unknown"
1107
  s_lower = source_name.lower()
1108
  if "newsapi" in s_lower:
 
1111
  source_name = "GNews"
1112
  elif "newsdata" in s_lower:
1113
  source_name = "NewsData.io"
 
 
1114
  article_url = _canonical_url(a.get("url") or "")
1115
  try:
1116
  ext = _tld(article_url)
1117
  domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
1118
  except Exception:
1119
  domain = ""
 
 
1120
  country_guess = None
1121
  if a.get("api_source") == "gdelt":
1122
  sc = a.get("gdelt_sourcecountry")
 
1132
  "PH": "Philippines", "ID": "Indonesia", "NZ": "New Zealand",
1133
  }
1134
  country_guess = iso2map.get(str(sc).upper(), sc if len(str(sc)) > 2 else None)
 
1135
  coords = get_country_centroid(country_guess) if country_guess else geocode_source(source_name, domain, do_network=False)
 
 
1136
  title = (a.get("title") or "").strip() or "(untitled)"
1137
  description = (a.get("description") or "").strip()
 
1138
  if description.lower().startswith("no description"):
1139
  description = ""
 
 
1140
  cached_desc = _desc_cache_get(article_url)
1141
  need_upgrade = (
1142
  (not description)
 
1146
  )
1147
  if need_upgrade and cached_desc:
1148
  description = cached_desc
 
1149
  if description:
1150
  description = _tidy_description(title, description, source_name)
1151
  if (not description) or _too_similar(title, description):
1152
  description = f"Quick take: {title.rstrip('.')}."
 
 
1153
  orig_title = title
1154
  orig_description = description
 
 
1155
  detected_lang = (detect_lang(f"{title} {description}") or "").lower()
1156
  ml_text = f"{orig_title}. {orig_description}".strip()
1157
  sentiment = classify_sentiment(f"{orig_title} {orig_description}")
 
 
1158
  seed = f"{source_name}|{article_url}|{title}"
1159
  uid = hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]
1160
  cat = infer_category(article_url, orig_title, orig_description, None)
 
1161
  return {
1162
  "id": uid,
1163
  "title": title,
 
1178
  "category": cat,
1179
  }
1180
 
1181
+ # ----------------- Clustering (Semantic, single-pass + merge) -----------------
1182
+ def cluster_articles(articles: List[Dict[str, Any]], sim_threshold=0.6, speed: Speed = Speed.balanced):
 
 
 
 
 
 
1183
  if speed == Speed.fast:
1184
+ articles = articles[:150]
1185
  sim_threshold = max(sim_threshold, 0.64)
1186
  elif speed == Speed.balanced:
1187
  articles = articles[:]
1188
  sim_threshold = max(sim_threshold, 0.62)
1189
  texts = [_cluster_text(a) for a in articles]
1190
  embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
1191
+ clusters = []
1192
  centroids = []
 
1193
  for i, emb in enumerate(embs):
1194
  best_idx, best_sim = -1, -1.0
1195
  for ci, c_emb in enumerate(centroids):
 
1204
  centroids[best_idx] = new_c
1205
  clusters[best_idx]["centroid"] = new_c
1206
  else:
 
1207
  event_id = hashlib.md5(texts[i].encode("utf-8")).hexdigest()[:10]
1208
  clusters.append({"id": event_id, "indices": [i], "centroid": emb})
1209
  centroids.append(emb)
 
 
1210
  merged = _merge_close_clusters(clusters, embs, threshold=0.70)
 
 
1211
  for c in merged:
1212
  c["id"] = cluster_id(c, articles)
 
1213
  return merged
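+ # Sketch of the pass above: each article joins the best-matching centroid once cosine
+ # similarity clears sim_threshold (0.64 fast / 0.62 balanced), centroids are re-averaged,
+ # and a final pass merges clusters whose centroids are at least 0.70 similar.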
1214
 
 
1215
  def event_payload_from_cluster(cluster, enriched_articles):
1216
  idxs = cluster["indices"]
1217
  arts = [enriched_articles[i] for i in idxs]
 
1222
  countries = {a["country"] for a in arts if a["country"] and a["country"] != "Unknown"}
1223
  ts = [a.get("publishedAt") for a in arts if a.get("publishedAt")]
1224
  return {
1225
+ "event_id": cluster_id(cluster, enriched_articles),
1226
  "title": canonical_title,
1227
  "keywords": keywords,
1228
  "article_count": len(arts),
 
1232
  "sample_urls": [a["url"] for a in arts[:3] if a.get("url")],
1233
  }
1234
 
 
1235
  def aggregate_event_by_country(cluster, enriched_articles):
1236
  idxs = cluster["indices"]
1237
  arts = [enriched_articles[i] for i in idxs]
1238
  by_country: Dict[str, Dict[str, Any]] = {}
 
1239
  for a in arts:
1240
  c = a.get("country") or "Unknown"
1241
  if c not in by_country:
1242
  coords = get_country_centroid(c)
1243
  by_country[c] = {"country": c, "lat": coords["lat"], "lon": coords["lon"], "articles": []}
1244
  by_country[c]["articles"].append(a)
 
 
1245
  results = []
1246
  for c, block in by_country.items():
1247
  arr = block["articles"]
 
1248
  to_num = {"negative": -1, "neutral": 0, "positive": 1}
1249
  vals = [to_num.get(a["sentiment"], 0) for a in arr]
1250
  avg = sum(vals) / max(len(vals), 1)
1251
  avg_sent = "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"
1252
  top_sources = [s for s, _ in Counter([a["source"] for a in arr]).most_common(3)]
 
1253
  summary = " • ".join([a["title"] for a in arr[:2]])
1254
  results.append(
1255
  {
 
1264
  {
1265
  "title": a["title"],
1266
  "orig_title": a.get("orig_title"),
1267
+ "orig_description": a.get("orig_description"),
1268
  "url": a["url"],
1269
  "source": a["source"],
1270
  "sentiment": a["sentiment"],
 
1276
  )
1277
  return results
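+ # Worked example of the averaging above: sentiments ["positive", "positive", "neutral"]
+ # map to (1 + 1 + 0) / 3 ≈ 0.67 > 0.15, so the country bubble is labelled "positive".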
1278
 
 
1279
  def _merge_close_clusters(clusters, embs, threshold=0.68):
 
1280
  merged = []
1281
  used = set()
1282
  for i in range(len(clusters)):
 
1290
  sim = util.cos_sim(base["centroid"], clusters[j]["centroid"]).item()
1291
  if sim >= threshold:
1292
  group.append(j)
 
1293
  all_idx = []
1294
  cents = []
1295
  for g in group:
1296
  used.add(g)
1297
  all_idx.extend(clusters[g]["indices"])
1298
  cents.append(clusters[g]["centroid"])
 
1299
  newc = torch.stack(cents, dim=0).mean(dim=0)
1300
  newc = newc / newc.norm()
1301
  merged.append({"indices": sorted(set(all_idx)), "centroid": newc})
1302
  return merged
1303
 
1304
+ # ----------------- Event Cache / Keys -----------------
1305
+ CACHE_TTL_SECS = 900
1306
+ SIM_THRESHOLD = 0.6
1307
+ _events_cache: Dict[Tuple, Dict[str, Any]] = {}
1308
+
1309
+ def cache_key_for(q, category, language, limit_each, translate=False, target_lang=None, speed=Speed.balanced):
1310
+ return (q or "", category or "", language or "", int(limit_each or 50),
1311
+ bool(translate), (target_lang or "").lower(), speed.value)
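+ # Illustrative key (hypothetical filters): cache_key_for(None, "sports", "en", 50,
+ # translate=True, target_lang="FR", speed=Speed.fast) returns
+ # ("", "sports", "en", 50, True, "fr", "fast"); joined with "|" this becomes the
+ # cache_key string handed to clients: "|sports|en|50|True|fr|fast".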
1312
+
1313
+ _first_real_build = True
1314
+
1315
+ def get_or_build_events_cache(q, category, language, translate, target_lang, limit_each, speed=Speed.balanced):
1316
+ global _first_real_build
1317
+ key = cache_key_for(q, category, language, limit_each, translate, target_lang, speed)
1318
+ now = monotonic()
1319
+ if speed == Speed.fast:
1320
+ use_timespan, use_limit = "24h", min(limit_each, 20)
1321
+ elif speed == Speed.balanced:
1322
+ use_timespan, use_limit = "48h", min(limit_each, 150)
1323
+ else:
1324
+ use_timespan, use_limit = "3d", limit_each
1325
+ entry = _events_cache.get(key)
1326
+ if entry and now - entry["t"] < CACHE_TTL_SECS:
1327
+ log.info(f"CACHE HIT for {key}")
1328
+ return key, entry["enriched"], entry["clusters"]
1329
+ lock = _get_inflight_lock(key)
1330
+ with lock:
1331
+ entry = _events_cache.get(key)
1332
+ if entry and now - entry["t"] < CACHE_TTL_SECS:
1333
+ log.info(f"CACHE HIT (post-lock) for {key}")
1334
+ return key, entry["enriched"], entry["clusters"]
1335
+ if _first_real_build:
1336
+ use_timespan = "24h"  # first real build after startup: keep the initial window short
1337
+ use_limit = min(use_limit, 100)
1338
+ log.info(f"CACHE MISS for {key} — fetching (timespan={use_timespan}, limit_each={use_limit})")
1339
+ raw = combine_raw_articles(
1340
+ category=category,
1341
+ query=q,
1342
+ language=language,
1343
+ limit_each=use_limit,
1344
+ timespan=use_timespan,
1345
+ speed=speed,
1346
+ )
1347
+ prefetch_descriptions_async(raw, speed)
1348
+ enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
1349
+ if category:
1350
+ cat_norm = (category or "").strip().lower()
1351
+ enriched = [e for e in enriched_all if (e.get("category") or "").lower() == cat_norm]
1352
+ else:
1353
+ enriched = enriched_all
1354
+ clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD, speed=speed)
1355
+ _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
1356
+ _first_real_build = False
1357
+ return key, enriched, clusters
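+ # Usage sketch (mirrors the /events handler below): a repeat call with identical
+ # arguments within CACHE_TTL_SECS (900 s) returns the cached enriched/clusters pair,
+ # and the in-flight lock keeps concurrent misses from fetching the same key twice:
+ #   key, enriched, clusters = get_or_build_events_cache(None, None, "en", False, None, 50)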
1358
 
1359
+ # ----------------- Language Rotation / Seeds -----------------
1360
+ LANG_ROTATION = ["en", "es", "fr", "de", "ar", "ru", "pt", "zh", "hi", "ja", "ko"]
1361
+ COUNTRY_SEEDS = ["US", "GB", "IN", "CA", "AU", "ZA", "SG", "NG", "DE", "FR", "BR", "MX", "ES", "RU", "JP", "KR", "CN"]
1362
 
1363
+ # ----------------- Other Providers (NewsData/GNews/NewsAPI) -----------------
1364
+ def fetch_newsdata_articles(category=None, limit=20, query=None, language=None):
1365
+ base_url = "https://newsdata.io/api/1/news"
1366
+ allowed = [
1367
+ "business",
1368
+ "entertainment",
1369
+ "environment",
1370
+ "food",
1371
+ "health",
1372
+ "politics",
1373
+ "science",
1374
+ "sports",
1375
+ "technology",
1376
+ "top",
1377
+ "world",
1378
+ ]
1379
+ params = {"apikey": NEWSDATA_API_KEY, "language": (language or "en")}
1380
+ if category and category in allowed:
1381
+ params["category"] = category
1382
+ if query:
1383
+ params["q"] = query
1384
+ all_articles, next_page = [], None
1385
+ while len(all_articles) < limit:
1386
+ if next_page:
1387
+ params["page"] = next_page
1388
+ resp = _session_get(base_url, params=params, timeout=12)
1389
+ if resp.status_code != 200:
1390
+ break
1391
+ data = resp.json()
1392
+ articles = data.get("results", [])
1393
+ for a in articles:
1394
+ a["api_source"] = "newsdata"
1395
+ all_articles.extend(articles)
1396
+ next_page = data.get("nextPage")
1397
+ if not next_page:
1398
+ break
1399
+ for a in all_articles:
1400
+ a["publishedAt"] = a.get("pubDate")
1401
+ return all_articles[:limit]
1402
+
1403
+ def fetch_gnews_articles(limit=20, query=None, language=None):
1404
+ url = f"https://gnews.io/api/v4/top-headlines?lang={(language or 'en')}&max={limit}&token={GNEWS_API_KEY}"
1405
+ if query:
1406
+ url += f"&q={requests.utils.quote(query)}"
1407
+ try:
1408
+ r = _session_get(url, timeout=12)
1409
+ if r.status_code != 200:
1410
+ return []
1411
+ arts = r.json().get("articles", [])
1412
+ for a in arts:
1413
+ a["api_source"] = "gnews"
1414
+ return arts
1415
+ except Exception:
1416
+ return []
1417
+
1418
+ NEWSAPI_COUNTRIES = ["us", "gb", "ca", "au", "in", "za", "sg", "ie", "nz"]
1419
+
1420
+ def fetch_newsapi_headlines_multi(limit=50, language=None):
1421
+ if not _newsapi_enabled():
1422
+ return []
1423
+ all_ = []
1424
+ per = max(1, math.ceil(limit / max(1, len(NEWSAPI_COUNTRIES))))
1425
+ per = min(per, 100)
1426
+ for c in NEWSAPI_COUNTRIES:
1427
+ url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per}&apiKey={NEWSAPI_KEY}"
1428
+ r = _session_get(url, timeout=12)
1429
+ if r.status_code != 200:
1430
+ log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1431
+ continue
1432
+ arts = r.json().get("articles", [])
1433
+ for a in arts:
1434
+ a["api_source"] = "newsapi"
1435
+ all_.extend(arts)
1436
+ time.sleep(0.2)
1437
+ return all_[:limit]
1438
+
1439
+ def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
1440
+ if not _newsapi_enabled():
1441
+ return []
1442
+ if query:
1443
+ url = f"https://newsapi.org/v2/everything?pageSize={limit}&apiKey={NEWSAPI_KEY}&q={requests.utils.quote(query)}"
1444
+ if language:
1445
+ url += f"&language={language}"
1446
+ try:
1447
+ r = _session_get(url, timeout=12)
1448
+ if r.status_code != 200:
1449
+ log.warning(f"NewsAPI /everything HTTP {r.status_code}: {r.text[:200]}")
1450
+ return []
1451
+ arts = r.json().get("articles", [])
1452
+ for a in arts:
1453
+ a["api_source"] = "newsapi"
1454
+ return arts[:limit]
1455
+ except Exception as e:
1456
+ log.warning(f"NewsAPI /everything request failed: {e}")
1457
+ return []
1458
+ results = []
1459
+ per_country = max(5, limit // len(NEWSAPI_COUNTRIES))
1460
+ for c in NEWSAPI_COUNTRIES:
1461
+ url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per_country}&apiKey={NEWSAPI_KEY}"
1462
+ if category:
1463
+ url += f"&category={category}"
1464
+ try:
1465
+ r = _session_get(url, timeout=12)
1466
+ if r.status_code != 200:
1467
+ log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1468
+ continue
1469
+ arts = r.json().get("articles", [])
1470
+ for a in arts:
1471
+ a["api_source"] = "newsapi"
1472
+ results.extend(arts)
1473
+ except Exception as e:
1474
+ log.warning(f"NewsAPI top-headlines {c} failed: {e}")
1475
+ time.sleep(0.2)
1476
+ return results[:limit]
1477
+
1478
+ # ----------------- Provider Combiner / Dedup -----------------
1479
+ def combine_raw_articles(category=None, query=None, language=None, limit_each=30,
1480
+ timespan="3d", speed=Speed.balanced, log_summary: bool = True):
1481
+ if speed == Speed.fast:
1482
+ timespan = "24h"
1483
+ limit_each = min(limit_each, 20)
1484
+ elif speed == Speed.balanced:
1485
+ timespan = "48h"
1486
+ limit_each = min(limit_each, 150)
1487
+ a1 = []
1488
+ if USE_NEWSAPI:
1489
+ if not query:
1490
+ a1 = fetch_newsapi_headlines_multi(limit=limit_each, language=language)
1491
+ else:
1492
+ a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query, language=language)
1493
+ a2 = []
1494
+ if USE_NEWSDATA_API:
1495
+ a2 = [
1496
+ normalize_newsdata_article(a)
1497
+ for a in fetch_newsdata_articles(category=category, limit=limit_each, query=query, language=language)
1498
+ if a.get("link")
1499
+ ]
1500
+ a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
1501
+ gdelt_limit = limit_each
1502
+ a4 = fetch_gdelt_multi(
1503
+ limit=gdelt_limit,
1504
+ query=query,
1505
+ language=language,
1506
+ timespan=timespan,
1507
+ category=category,
1508
+ speed=speed,
1509
+ )
1510
+ seen, merged = set(), []
1511
+ for a in a1 + a3 + a2 + a4:
1512
+ if a.get("url"):
1513
+ a["url"] = _canonical_url(a["url"])
1514
+ url = a["url"]
1515
+ if url not in seen:
1516
+ seen.add(url)
1517
+ merged.append(a)
1518
+ if log_summary:
1519
+ fetch_log.info("----- Article Fetch Summary -----")
1520
+ fetch_log.info(f"📊 NewsAPI returned: {len(a1)} articles")
1521
+ fetch_log.info(f"📊 NewsData.io returned: {len(a2)} articles")
1522
+ fetch_log.info(f"📊 GNews returned: {len(a3)} articles")
1523
+ fetch_log.info(f"📊 GDELT returned: {len(a4)} articles")
1524
+ fetch_log.info(f"✅ Total merged articles after deduplication: {len(merged)}")
1525
+ fetch_log.info("---------------------------------")
1526
+ return merged
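+ # Note on precedence: sources are merged in the order NewsAPI, GNews, NewsData.io,
+ # GDELT, and the first occurrence of each canonical URL wins deduplication.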
1527
+
1528
+ # ----------------- API: /events -----------------
1529
  @app.get("/events")
1530
  def get_events(
1531
  q: Optional[str] = Query(None),
 
1539
  min_articles: int = Query(2, ge=1, le=200),
1540
  speed: Speed = Query(Speed.balanced),
1541
  ):
 
 
1542
  cache_key, enriched, clusters = get_or_build_events_cache(
1543
  q, category, language, False, None, limit_each, speed=speed
1544
  )
 
 
1545
  view = enriched
1546
  if translate and target_lang:
1547
  view = [dict(i) for i in enriched]
 
1550
  i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1551
  i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1552
  i["translated"] = True
 
1553
  events = [event_payload_from_cluster(c, view) for c in clusters]
1554
  events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
1555
  events.sort(key=lambda e: e["article_count"], reverse=True)
 
1556
  return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}
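+ # Illustrative flow: GET /events?language=en&speed=fast returns the filtered event list
+ # plus a cache_key string; passing that cache_key back to /event/{event_id} or /news
+ # reuses the cached snapshot instead of refetching.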
1557
 
1558
+ # ----------------- API: /event/{event_id} -----------------
1559
  @app.get("/event/{event_id}")
1560
  def get_event_details(
1561
  event_id: str,
 
1567
  target_lang: Optional[str] = Query(None),
1568
  limit_each: int = Query(150, ge=5, le=250),
1569
  ):
 
1570
  if cache_key:
1571
  parts = cache_key.split("|")
1572
  if len(parts) != 7:
1573
  raise HTTPException(status_code=400, detail="Bad cache_key")
1574
  speed_str = parts[6]
1575
  try:
1576
+ speed_obj = Speed(speed_str)
1577
  except ValueError:
1578
  speed_obj = Speed.balanced
1579
  key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
 
1581
  else:
1582
  speed_obj = Speed.balanced
1583
  key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang, speed=speed_obj)
 
1584
  entry = _events_cache.get(key_tuple)
1585
  if not entry:
 
1586
  _, enriched, clusters = get_or_build_events_cache(
1587
  q, category, language, False, None, limit_each, speed=speed_obj
1588
  )
1589
  else:
1590
  enriched, clusters = entry["enriched"], entry["clusters"]
 
 
1591
  eview = enriched
1592
  if translate and target_lang:
1593
  eview = [dict(i) for i in enriched]
 
1596
  i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1597
  i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1598
  i["translated"] = True
 
1599
  cluster = next((c for c in clusters if cluster_id(c, enriched) == event_id), None)
1600
  if not cluster:
1601
  raise HTTPException(status_code=404, detail="Event not found with current filters")
 
1602
  payload = event_payload_from_cluster(cluster, eview)
1603
  countries = aggregate_event_by_country(cluster, eview)
1604
  payload["articles_in_event"] = sum(c["count"] for c in countries)
1605
  return {"event": payload, "countries": countries}
1606
 
1607
+ # ----------------- API: /news -----------------
1608
  @app.get("/news")
1609
  def get_news(
1610
  cache_key: Optional[str] = Query(None),
 
1621
  page_size: int = Query(120, ge=5, le=300),
1622
  ):
1623
  enriched: List[Dict[str, Any]] = []
 
 
1624
  if cache_key:
1625
  parts = cache_key.split("|")
1626
  if len(parts) == 7:
1627
  key_tuple = (
1628
+ parts[0],
1629
+ parts[1],
1630
+ parts[2],
1631
+ int(parts[3]),
1632
+ parts[4] == "True",
1633
+ parts[5].lower(),
1634
+ parts[6],
1635
  )
1636
  entry = _events_cache.get(key_tuple)
1637
  if entry:
1638
  enriched = entry["enriched"]
 
 
1639
  if not enriched:
1640
  raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each, speed=speed)
1641
  prefetch_descriptions_async(raw, speed)
 
1646
  else:
1647
  enriched = enriched_all
1648
  else:
 
1649
  if category:
1650
  cat_norm = (category or "").strip().lower()
1651
  enriched = [e for e in enriched if (e.get("category") or "").lower() == cat_norm]
 
 
1652
  if translate and target_lang:
1653
  enriched = [dict(i) for i in enriched]
1654
  for i in enriched:
 
1660
  i["translated"] = True
1661
  i["translated_from"] = (src_hint or "").lower()
1662
  i["translated_to"] = target_lang.lower()
 
 
1663
  if sentiment:
1664
  s = sentiment.strip().lower()
1665
  enriched = [i for i in enriched if i.get("sentiment", "").lower() == s]
 
 
1666
  total = len(enriched)
1667
  start = (page - 1) * page_size
1668
  end = start + page_size
1669
  items = [dict(i) for i in enriched[start:end]]
 
 
1670
  if lite:
1671
  drop = {"_ml_text"}
1672
  for i in items:
1673
  for k in drop:
1674
  i.pop(k, None)
 
1675
  return {
1676
  "items": items,
1677
  "total": total,
 
1679
  "page_size": page_size
1680
  }
1681
 
1682
+ # ----------------- API: /related -----------------
 
 
 
 
1683
  @app.get("/related")
1684
  def related_articles(
1685
  id: Optional[str] = Query(None, description="article id from /news"),
 
1691
  limit_each: int = Query(50, ge=5, le=100),
1692
  k: int = Query(10, ge=1, le=50),
1693
  ):
 
1694
  raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each)
1695
  enriched = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
1696
  if not enriched:
1697
  return {"items": []}
 
 
1698
  if id:
1699
  base = next((a for a in enriched if a.get("id") == id), None)
1700
  if not base:
 
1705
  if not text:
1706
  raise HTTPException(400, "provide either id or title/description")
1707
  query_text = text
 
1708
  corpus_texts = [a["_ml_text"] for a in enriched]
1709
  corpus_embs = _embed_texts(corpus_texts)
1710
  query_emb = _embed_texts([query_text])[0]
 
 
1711
  sims = util.cos_sim(query_emb, corpus_embs).cpu().numpy().flatten()
 
 
1712
  idxs = sims.argsort()[::-1]
1713
  items = []
1714
  for idx in idxs:
 
1718
  items.append({**a, "similarity": float(sims[idx])})
1719
  if len(items) >= k:
1720
  break
 
1721
  return {"items": items}
1722
 
1723
+ # ----------------- Middleware: Request Timing -----------------
1724
  @app.middleware("http")
1725
  async def timing_middleware(request, call_next):
1726
  start = time.perf_counter()
 
1730
  return response
1731
  finally:
1732
  dur_ms = (time.perf_counter() - start) * 1000
 
1733
  if response is not None:
1734
  try:
1735
  response.headers["X-Process-Time-ms"] = f"{dur_ms:.1f}"
1736
  except Exception:
1737
  pass
1738
 
1739
+ # ----------------- Misc: Client Metrics -----------------
1740
  @app.post("/client-metric")
1741
  def client_metric(payload: Dict[str, Any] = Body(...)):
1742
  name = (payload.get("name") or "").strip()
 
1743
  if name in {"Load all article markers on globe", "Load event country markers on globe"}:
1744
  return {"ok": True}
1745
  return {"ok": True}
1746
 
1747
+ # ----------------- Diagnostics: Translation Health -----------------
1748
  @app.get("/diag/translate")
1749
  def diag_translate():
1750
  remote = _hf_call("Helsinki-NLP/opus-mt-es-en", {"inputs":"Hola mundo"})