VNEWS

Running

App Files Files Community

bep40 committed 5 days ago

Commit

62d9dcf

verified ·

1 Parent(s): ce9dd6f

Improve topic filtering: require keyword relevance inside article bodies

Browse files

Files changed (1) hide show

main.py +20 -14

main.py CHANGED Viewed

@@ -820,35 +820,41 @@ def _topic_articles(topic,limit=5):
     return items
 def _topic_article_context(topic):
-    """Filter our readable article sources by topic, then summarize from actual article bodies."""
-    keys=[k.lower() for k in re.findall(r"[\wÀ-ỹ]+",topic) if len(k)>2]
     candidates=[];seen=set()
     def add_items(items):
         for a in items or []:
             link=a.get("link","");title=a.get("title","")
             if not link or link in seen:continue
-            low=title.lower()
-            score=sum(1 for k in keys if k in low)
-            if score>0 or not candidates:
-                seen.add(link);candidates.append((score,a))
     try:add_items(scrape_genk_ai())
     except:pass
     try:add_items(scrape_dantri_congnghe())
     except:pass
-    # Add broad but readable local sources for non-tech topics
     try:add_items(scrape_ttvh_worldcup())
     except:pass
-    candidates=sorted(candidates,key=lambda x:x[0],reverse=True)[:8]
-    chunks=[];img=""
-    for score,a in candidates:
         data=_article_by_url(a.get("link",""))
         if not data or not data.get("body"):continue
-        if not img:img=data.get("og_image") or a.get("img","")
         title=data.get("title") or a.get("title","")
         ps=[b.get("text","") for b in data.get("body",[]) if b.get("type")=="p" and len(b.get("text",""))>40]
-        excerpt=" ".join(ps)[:1600] or data.get("summary","")
-        if excerpt:chunks.append("BÀI: "+title+"\nURL: "+a.get("link","")+"\nNỘI DUNG LỌC: "+excerpt)
-        if len(chunks)>=5:break
     if chunks:return "\n\n".join(chunks),img
     return _web_context(topic),""

     return items
 def _topic_article_context(topic):
+    """Filter readable article sources by topic, then summarize actual article bodies."""
+    raw_keys=[k.lower() for k in re.findall(r"[\wÀ-ỹ]+",topic) if len(k)>2]
+    # Drop ultra-generic tokens; keep domain words such as giáo/dục, bóng/đá, world/cup.
+    stop={"trong","năm","the","and","của","cho","với","một","các","những","hiện","nay"}
+    keys=[k for k in raw_keys if k not in stop]
     candidates=[];seen=set()
     def add_items(items):
         for a in items or []:
             link=a.get("link","");title=a.get("title","")
             if not link or link in seen:continue
+            seen.add(link);candidates.append(a)
     try:add_items(scrape_genk_ai())
     except:pass
     try:add_items(scrape_dantri_congnghe())
     except:pass
     try:add_items(scrape_ttvh_worldcup())
     except:pass
+    scored=[];img=""
+    for a in candidates[:40]:
         data=_article_by_url(a.get("link",""))
         if not data or not data.get("body"):continue
         title=data.get("title") or a.get("title","")
         ps=[b.get("text","") for b in data.get("body",[]) if b.get("type")=="p" and len(b.get("text",""))>40]
+        excerpt=" ".join(ps)[:1800] or data.get("summary","")
+        hay=(title+" "+excerpt).lower()
+        score=sum(1 for k in keys if k in hay)
+        # Require topic relevance when we have meaningful keys.
+        if keys and score==0:continue
+        if len(keys)>=2 and score<2 and not any(" ".join(keys[i:i+2]) in hay for i in range(len(keys)-1)):continue
+        scored.append((score,title,a.get("link",""),excerpt,data.get("og_image") or a.get("img","") or ""))
+    scored=sorted(scored,key=lambda x:x[0],reverse=True)[:5]
+    chunks=[]
+    for score,title,link,excerpt,im in scored:
+        if not img and im:img=im
+        chunks.append("BÀI: "+title+"\nURL: "+link+"\nNỘI DUNG LỌC: "+excerpt)
     if chunks:return "\n\n".join(chunks),img
     return _web_context(topic),""