Improve topic filtering: require keyword relevance inside article bodies
Browse files
main.py
CHANGED
|
@@ -820,35 +820,41 @@ def _topic_articles(topic,limit=5):
|
|
| 820 |
return items
|
| 821 |
|
| 822 |
def _topic_article_context(topic):
|
| 823 |
-
"""Filter
|
| 824 |
-
|
|
|
|
|
|
|
|
|
|
| 825 |
candidates=[];seen=set()
|
| 826 |
def add_items(items):
|
| 827 |
for a in items or []:
|
| 828 |
link=a.get("link","");title=a.get("title","")
|
| 829 |
if not link or link in seen:continue
|
| 830 |
-
|
| 831 |
-
score=sum(1 for k in keys if k in low)
|
| 832 |
-
if score>0 or not candidates:
|
| 833 |
-
seen.add(link);candidates.append((score,a))
|
| 834 |
try:add_items(scrape_genk_ai())
|
| 835 |
except:pass
|
| 836 |
try:add_items(scrape_dantri_congnghe())
|
| 837 |
except:pass
|
| 838 |
-
# Add broad but readable local sources for non-tech topics
|
| 839 |
try:add_items(scrape_ttvh_worldcup())
|
| 840 |
except:pass
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
for score,a in candidates:
|
| 844 |
data=_article_by_url(a.get("link",""))
|
| 845 |
if not data or not data.get("body"):continue
|
| 846 |
-
if not img:img=data.get("og_image") or a.get("img","")
|
| 847 |
title=data.get("title") or a.get("title","")
|
| 848 |
ps=[b.get("text","") for b in data.get("body",[]) if b.get("type")=="p" and len(b.get("text",""))>40]
|
| 849 |
-
excerpt=" ".join(ps)[:
|
| 850 |
-
|
| 851 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
if chunks:return "\n\n".join(chunks),img
|
| 853 |
return _web_context(topic),""
|
| 854 |
|
|
|
|
| 820 |
return items
|
| 821 |
|
| 822 |
def _topic_article_context(topic):
    """Filter readable article sources by topic, then summarize actual article bodies.

    Candidate links are collected from the local scrapers, each article is
    fetched with ``_article_by_url``, and an article is kept only when its
    title/body excerpt mentions the topic's keywords.

    Args:
        topic: Free-text topic string; words longer than two characters that
            are not generic stop words become the matching keywords.

    Returns:
        A ``(context, image)`` tuple. ``context`` joins up to five
        "BÀI / URL / NỘI DUNG LỌC" chunks, best-scoring first; ``image`` is
        the first non-empty image URL among those articles (``""`` if none).
        When no article qualifies, returns ``(_web_context(topic), "")``.
    """
    max_candidates = 40   # cap on article pages fetched per call
    max_excerpt = 1800    # characters of body text kept per article
    max_chunks = 5        # articles included in the final context

    raw_keys = [k.lower() for k in re.findall(r"[\wÀ-ỹ]+", topic) if len(k) > 2]
    # Drop ultra-generic tokens; keep domain words such as giáo/dục, bóng/đá, world/cup.
    stop = {"trong", "năm", "the", "and", "của", "cho", "với", "một", "các", "những", "hiện", "nay"}
    # De-duplicate (order preserved) so a word repeated in the topic cannot
    # count twice and slip past the "two distinct keywords" rule below.
    keys = list(dict.fromkeys(k for k in raw_keys if k not in stop))

    candidates = []
    seen = set()

    def add_items(items):
        # Collect each article dict once, keyed by its link.
        for a in items or []:
            link = a.get("link", "")
            if not link or link in seen:
                continue
            seen.add(link)
            candidates.append(a)

    # Scrapers are best-effort: a failing source must not block the others.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        add_items(scrape_genk_ai())
    except Exception:
        pass
    try:
        add_items(scrape_dantri_congnghe())
    except Exception:
        pass
    try:
        add_items(scrape_ttvh_worldcup())
    except Exception:
        pass

    scored = []
    for a in candidates[:max_candidates]:
        data = _article_by_url(a.get("link", ""))
        if not data or not data.get("body"):
            continue
        title = data.get("title") or a.get("title") or ""
        # Only substantial paragraphs (> 40 chars) feed the excerpt.
        ps = [
            b.get("text", "")
            for b in data.get("body", [])
            if b.get("type") == "p" and len(b.get("text", "")) > 40
        ]
        excerpt = " ".join(ps)[:max_excerpt] or data.get("summary") or ""
        hay = (title + " " + excerpt).lower()
        score = sum(1 for k in keys if k in hay)
        # Require topic relevance when we have meaningful keys.
        if keys and score == 0:
            continue
        # With two or more keywords, demand at least two distinct matches.
        # (The former adjacent-pair check was redundant: a text containing
        # "a b" necessarily contains both "a" and "b", i.e. score >= 2.)
        if len(keys) >= 2 and score < 2:
            continue
        scored.append(
            (score, title, a.get("link", ""), excerpt, data.get("og_image") or a.get("img", "") or "")
        )

    # Stable sort: ties keep their scrape order.
    scored = sorted(scored, key=lambda x: x[0], reverse=True)[:max_chunks]

    chunks = []
    img = ""
    for score, title, link, excerpt, im in scored:
        if not img and im:
            img = im
        chunks.append("BÀI: " + title + "\nURL: " + link + "\nNỘI DUNG LỌC: " + excerpt)
    if chunks:
        return "\n\n".join(chunks), img
    return _web_context(topic), ""
|
| 860 |
|