File size: 7,426 Bytes
0e22a86 5b38e69 0e22a86 5b38e69 0e22a86 5b38e69 0e22a86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 | """
Query Builder Utility (Phase 20 — Dynamic Round-Robin Query Builder)
=====================================================================
PURPOSE
-------
When we ask a news API for articles, we send a "query string" — a list of
keywords that tells the API what topics we want. Our Phase 19 taxonomy can
have up to 28 keywords per category. Stuffing all 28 into one API call would:
1. Get the request rejected with an HTTP 400 "query too long" error.
2. Return the same broad results every single hour — wasting our paid quota.
SOLUTION: The Anchor + Round-Robin Strategy
-------------------------------------------
For every category we split the keyword list into two parts:
ANCHORS — The first 3 keywords (2 for "newsdata"). These are the most
important, core terms
(e.g. "artificial intelligence", "machine learning", "openai").
They are ALWAYS included in every query, every hour.
This guarantees we never miss breaking news on a core topic.
ROTATORS — The remaining keywords (e.g. "anthropic", "mistral", "llama"...).
These are divided into chunks of 4 (chunks of 2 for "newsdata").
Each hour of the day, one chunk is added to the anchors.
So over 24 hours, we cycle through all chunks (as long as a category has
at most 24 of them), covering every
niche keyword without ever exceeding the URL character limit.
CLOCK MATH (Stateless & Restart-Safe)
--------------------------------------
chunk_index = datetime.now(UTC).hour % number_of_chunks
- Uses UTC so the rotation is identical everywhere — Hugging Face, local,
AWS — regardless of which timezone the server is in.
- No Redis, no database, no file. Just Python's clock. If the server
restarts, the correct chunk for the current hour is immediately selected.
SINGLE SOURCE OF TRUTH
-----------------------
We IMPORT CATEGORY_KEYWORDS from data_validation.py. We never copy it here.
One dict, one place. Phase 21 expansions will automatically be picked up.
SUPPORTED API TYPES
-------------------
"newsapi" — Multi-word phrases quoted, terms joined with " OR "
Example: '"artificial intelligence" OR openai OR llm'
"gnews" — All terms joined with a single space
Example: 'artificial intelligence openai llm'
"newsdata" — All terms joined with a comma
Example: 'artificial intelligence,openai,llm'
Any other api_type value is formatted like "newsapi".
"""
from datetime import datetime, timezone
from typing import List
# ββ Single Source of Truth ββββββββββββββββββββββββββββββββββββββββββββββββββββ
# We import from data_validation.py rather than duplicating the dictionary here.
# This means any keyword added in a future phase is automatically picked up
# by all API queries with zero additional work.
from app.utils.data_validation import CATEGORY_KEYWORDS
# ββ Tuning Constants ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
_ANCHOR_COUNT = 3 # How many keywords are always included (anchors)
_CHUNK_SIZE = 4 # How many rotator keywords are added per hour
def _chunk_list(items: List[str], size: int) -> List[List[str]]:
"""
Splits a flat list into groups of `size`.
Example:
_chunk_list(['a','b','c','d','e','f'], 3)
β [['a','b','c'], ['d','e','f']]
If the list divides unevenly, the last chunk is shorter β that is fine.
"""
return [items[i : i + size] for i in range(0, len(items), size)]
def _format_for_api(keywords: List[str], api_type: str) -> str:
"""
Converts a list of keywords into the query string format a specific API expects.
Rules by api_type:
"newsapi" β Wrap any keyword that contains a space in double-quotes so
the API treats it as an exact phrase. Then join with " OR ".
Example output: '"artificial intelligence" OR openai OR llm'
"gnews" β Just join everything with spaces. GNews search is tolerant
of natural language.
Example output: 'artificial intelligence openai llm'
"newsdata" β Join with commas. NewsData.io uses comma-separated terms.
Example output: 'artificial intelligence,openai,llm'
Any unknown api_type falls back to the newsapi format (safest default).
"""
if not keywords:
return ""
if api_type == "newsapi":
# Phrases with spaces need quotes so the API treats them as a unit.
# Single words can go bare (no quotes needed, saves character budget).
formatted = [
f'"{kw}"' if ' ' in kw else kw
for kw in keywords
]
return " OR ".join(formatted)
elif api_type == "gnews":
# GNews accepts plain space-separated words.
return " ".join(keywords)
elif api_type == "newsdata":
# NewsData.io accepts comma-separated keywords.
return ",".join(keywords)
else:
# Unknown API type β fall back to NewsAPI format (most common).
formatted = [f'"{kw}"' if ' ' in kw else kw for kw in keywords]
return " OR ".join(formatted)
def build_dynamic_query(category: str, api_type: str = "newsapi") -> str:
    """
    Return this hour's query string for `category`, formatted for `api_type`.

    The category's keyword list (looked up in CATEGORY_KEYWORDS) is split
    into leading "anchor" keywords, which are always sent, and the remaining
    "rotator" keywords, which are grouped into fixed-size chunks. Exactly one
    chunk is appended to the anchors, selected as
    `current UTC hour % number_of_chunks`, so the choice depends only on the
    clock and needs no stored state.

    Args:
        category: Key into CATEGORY_KEYWORDS, e.g. "ai", "cloud-aws",
            "data-engineering".
        api_type: One of "newsapi", "gnews", "newsdata"; passed through to
            _format_for_api to choose the output syntax.

    Returns:
        The formatted query string. If the category is missing from
        CATEGORY_KEYWORDS or has no keywords, `category` itself is returned
        unchanged so the caller still has something to search for.
    """
    keywords = CATEGORY_KEYWORDS.get(category)
    if not keywords:
        return category

    # "newsdata" gets a smaller budget (2 anchors + 2 rotators). The original
    # author's note says NewsData limits a query to at most 5 keywords.
    if api_type == "newsdata":
        anchor_count, chunk_size = 2, 2
    else:
        anchor_count, chunk_size = _ANCHOR_COUNT, _CHUNK_SIZE

    # Leading keywords are always sent; the rest take turns hour by hour.
    anchors = keywords[:anchor_count]
    rotator_chunks = _chunk_list(keywords[anchor_count:], chunk_size)

    # Timezone-aware UTC keeps the rotation the same on every host.
    utc_hour = datetime.now(timezone.utc).hour
    # A category with no rotators (only anchors) contributes nothing extra.
    rotating = rotator_chunks[utc_hour % len(rotator_chunks)] if rotator_chunks else []

    return _format_for_api(anchors + rotating, api_type)
|