bep40 committed on
Commit
62d9dcf
·
verified ·
1 Parent(s): ce9dd6f

Improve topic filtering: require keyword relevance inside article bodies

Browse files
Files changed (1) hide show
  1. main.py +20 -14
main.py CHANGED
@@ -820,35 +820,41 @@ def _topic_articles(topic,limit=5):
820
  return items
821
 
822
  def _topic_article_context(topic):
823
- """Filter our readable article sources by topic, then summarize from actual article bodies."""
824
- keys=[k.lower() for k in re.findall(r"[\wÀ-ỹ]+",topic) if len(k)>2]
 
 
 
825
  candidates=[];seen=set()
826
  def add_items(items):
827
  for a in items or []:
828
  link=a.get("link","");title=a.get("title","")
829
  if not link or link in seen:continue
830
- low=title.lower()
831
- score=sum(1 for k in keys if k in low)
832
- if score>0 or not candidates:
833
- seen.add(link);candidates.append((score,a))
834
  try:add_items(scrape_genk_ai())
835
  except:pass
836
  try:add_items(scrape_dantri_congnghe())
837
  except:pass
838
- # Add broad but readable local sources for non-tech topics
839
  try:add_items(scrape_ttvh_worldcup())
840
  except:pass
841
- candidates=sorted(candidates,key=lambda x:x[0],reverse=True)[:8]
842
- chunks=[];img=""
843
- for score,a in candidates:
844
  data=_article_by_url(a.get("link",""))
845
  if not data or not data.get("body"):continue
846
- if not img:img=data.get("og_image") or a.get("img","")
847
  title=data.get("title") or a.get("title","")
848
  ps=[b.get("text","") for b in data.get("body",[]) if b.get("type")=="p" and len(b.get("text",""))>40]
849
- excerpt=" ".join(ps)[:1600] or data.get("summary","")
850
- if excerpt:chunks.append("BÀI: "+title+"\nURL: "+a.get("link","")+"\nNỘI DUNG LỌC: "+excerpt)
851
- if len(chunks)>=5:break
 
 
 
 
 
 
 
 
 
852
  if chunks:return "\n\n".join(chunks),img
853
  return _web_context(topic),""
854
 
 
820
  return items
821
 
822
  def _topic_article_context(topic):
823
+ """Filter readable article sources by topic, then summarize actual article bodies."""
824
+ raw_keys=[k.lower() for k in re.findall(r"[\wÀ-ỹ]+",topic) if len(k)>2]
825
+ # Drop ultra-generic tokens; keep domain words such as giáo/dục, bóng/đá, world/cup.
826
+ stop={"trong","năm","the","and","của","cho","với","một","các","những","hiện","nay"}
827
+ keys=[k for k in raw_keys if k not in stop]
828
  candidates=[];seen=set()
829
  def add_items(items):
830
  for a in items or []:
831
  link=a.get("link","");title=a.get("title","")
832
  if not link or link in seen:continue
833
+ seen.add(link);candidates.append(a)
 
 
 
834
  try:add_items(scrape_genk_ai())
835
  except:pass
836
  try:add_items(scrape_dantri_congnghe())
837
  except:pass
 
838
  try:add_items(scrape_ttvh_worldcup())
839
  except:pass
840
+ scored=[];img=""
841
+ for a in candidates[:40]:
 
842
  data=_article_by_url(a.get("link",""))
843
  if not data or not data.get("body"):continue
 
844
  title=data.get("title") or a.get("title","")
845
  ps=[b.get("text","") for b in data.get("body",[]) if b.get("type")=="p" and len(b.get("text",""))>40]
846
+ excerpt=" ".join(ps)[:1800] or data.get("summary","")
847
+ hay=(title+" "+excerpt).lower()
848
+ score=sum(1 for k in keys if k in hay)
849
+ # Require topic relevance when we have meaningful keys.
850
+ if keys and score==0:continue
851
+ if len(keys)>=2 and score<2 and not any(" ".join(keys[i:i+2]) in hay for i in range(len(keys)-1)):continue
852
+ scored.append((score,title,a.get("link",""),excerpt,data.get("og_image") or a.get("img","") or ""))
853
+ scored=sorted(scored,key=lambda x:x[0],reverse=True)[:5]
854
+ chunks=[]
855
+ for score,title,link,excerpt,im in scored:
856
+ if not img and im:img=im
857
+ chunks.append("BÀI: "+title+"\nURL: "+link+"\nNỘI DUNG LỌC: "+excerpt)
858
  if chunks:return "\n\n".join(chunks),img
859
  return _web_context(topic),""
860