File size: 14,045 Bytes
433f3f1
 
 
 
 
 
 
0b170f9
433f3f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
from typing import List, Dict, Any
from datetime import datetime, timedelta

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings

from config import HF_EMBEDDING_MODEL, GNEWS_KEY
from db3_utils import query_llm, translate_term, fetch_search
from db6_utils import fetch_full_article

from supabase_ie import save_faiss_to_supabase, download_faiss_from_supabase
import urllib.parse
import time

# Module-level embedding model, created once at import time and reused by
# every FAISS index that this module builds or loads.
# `normalize_embeddings=True` is forwarded to the encoder via encode_kwargs
# (presumably yielding unit-length vectors — confirm against the
# sentence-transformers documentation).
embeddings = HuggingFaceEmbeddings(
    model_name=HF_EMBEDDING_MODEL,
    encode_kwargs={"normalize_embeddings": True},
)

# # --- Step 1: Extract keywords ---
# def extract_keywords_from_query(query: str, max_terms: int = 3) -> List[str]:
#     """
#     Use LLM to extract keywords from a free-form query.
#     """
#     prompt = (
#         f"Extract up to {max_terms} concise keywords or short phrases from this query for a news search:\n\n{query}\n\n"
#         "Return as a comma-separated list."
#     )
#     result = query_llm(prompt)
#     return [kw.strip() for kw in result.split(",") if kw.strip()]

# --- Step 2: Translate keywords ---
# def translate_keywords(keywords: List[str], langs: List[str]) -> Dict[str, Dict[str, str]]:
#     """
#     Map keywords into different languages.
#     Returns {keyword: {lang: translated_term}}
#     """
#     translated_map: Dict[str, Dict[str, str]] = {}
#     for kw in keywords:
#         translated_map[kw] = {}
#         for lang in langs:
#             if lang == "en":
#                 translated_map[kw][lang] = kw
#             else:
#                 translated_map[kw][lang] = translate_term(kw, lang)
#     return translated_map

# --- Step 3: Fetch articles from GNews ---

# def debug_fetch():
#     """
#     Temporary debug function to test GNews queries with different parameter tweaks.
#     """
#     test_variants = [
#         {"label": "original", "q": "Sinner AND Beijing ATP tournament AND performance", "use_dates": True, "country": None},
#         {"label": "no_dates", "q": "Sinner AND Beijing ATP tournament AND performance", "use_dates": False, "country": None},
#         {"label": "simple_keyword", "q": "sport", "use_dates": True, "country": None},
#         {"label": "with_country", "q": "sport", "use_dates": True, "country": "us"},
#         {"label": "sort_relevance", "q": "sport", "use_dates": True, "country": None},
#     ]

#     now = datetime.utcnow().replace(microsecond=0)
#     date_from = (now - timedelta(days=3)).isoformat() + "Z"
#     date_to = now.isoformat() + "Z"

#     results = {}

    # for variant in test_variants:
    #     print(f"\n[DEBUG FETCH] Running variant: {variant['label']}")
    #     try:
    #         rows = fetch_search(
    #             api_key=None,
    #             bucket="db7",
    #             q=variant["q"],
    #             lang="en",
    #             country=variant["country"],
    #             max_items=10,
    #             #pages=1,
    #             #sort_by=variant["sort_by"],
    #             date_from=None, #date_from if variant["use_dates"] else None,
    #             date_to=None, #date_to if variant["use_dates"] else None,
    #         )
    #         print(f"   ➡️ Found {len(rows)} articles")
    #         results[variant["label"]] = len(rows)
    #     except Exception as e:
    #         print(f"⚠️ Error in {variant['label']} fetch: {e}")
    #         results[variant["label"]] = "error"

    # return results

# temporary change
# def fetch_articles_for_keywords(
#     news_topic: Dict[str, Dict[str, str]],
#     lookback_days: int = 3,
#     max_items: int = 50,
#     min_results: int = 5
# ) -> List[Dict[str, Any]]:
#     """
#     Fetch articles for each keyword/lang pair using GNews.
#     Strategy:
#       1. Try an AND query (all keywords together).
#       2. If too few results, fall back to OR query.
#     Debug:
#       - Print final date_from/date_to strings.
#       - Print queries before calling fetch_search.
#       - Print full URL params if request fails.
#     """
#     results = []

#     now = datetime.utcnow().replace(microsecond=0)
#     date_from = (now - timedelta(days=lookback_days)).isoformat() + "Z"
#     date_to = now.isoformat() + "Z"

#     print(f"[DB7][DEBUG] date_from={date_from}, date_to={date_to}")

#     keywords = list(news_topic.keys())

#     for lang in set(l for langs in news_topic.values() for l in langs.keys()):
#         terms = [news_topic[kw].get(lang, kw) for kw in keywords if news_topic[kw].get(lang)]
#         if not terms:
#             continue

#         # --- AND query ---
#         and_query = " AND ".join(terms)
#         print(f"[DB7][DEBUG] Trying AND query for lang={lang}: {and_query}")
#         try:
#             rows = fetch_search(
#                 api_key=GNEWS_KEY,
#                 bucket="db7",
#                 q=and_query,
#                 lang=lang,
#                 country=None,
#                 max_items=max_items,
#                 pages=1,
#                 sort_by="popularity",
#                 date_from=date_from,
#                 date_to=date_to,
#             )
#             print(f"   ➡️ Found {len(rows)} articles (AND)")
#             results.extend(rows)
#             time.sleep(1)
#         except Exception as e:
#             print(f"⚠️ Error in AND fetch for lang={lang}, query='{and_query}'")
#             print(f"   date_from={date_from}, date_to={date_to}")
#             print(f"   Exception: {e}")
#             rows = []

#         # --- OR fallback ---
#         if len(rows) < min_results:
#             or_query = " OR ".join(terms)
#             print(f"[DB7][DEBUG] Trying OR query for lang={lang}: {or_query}")
#             try:
#                 rows_or = fetch_search(
#                     api_key=GNEWS_KEY,
#                     bucket="db7",
#                     q=or_query,
#                     lang=lang,
#                     country=None,
#                     max_items=max_items,
#                     pages=1,
#                     sort_by="relevance",
#                     date_from=date_from,
#                     date_to=date_to,
#                 )
#                 print(f"   ➡️ Found {len(rows_or)} articles (OR fallback)")
#                 results.extend(rows_or)
#                 time.sleep(1)
#             except Exception as e:
#                 print(f"⚠️ Error in OR fetch for lang={lang}, query='{or_query}'")
#                 print(f"   date_from={date_from}, date_to={date_to}")
#                 print(f"   Exception: {e}")

#     return results

def fetch_articles_for_keywords(
    news_topic: list[str],
    lookback_days: int = 3,
    max_items: int = 50,
    min_results: int = 5,
    user_countries: list[str] | None = None,
    user_lang: list[str] | None = None
) -> List[Dict[str, Any]]:
    """
    Fetch articles for a list of keywords in multiple languages and countries.
    Steps:
      1. Translate news_topic into each user_lang (if not "en").
      2. Perform fetches for each (topic_lang, country, lang).
      3. Always also fetch global English (country=None, lang="en").
      4. Translate found titles+descriptions back into English.
    """
    results: List[Dict[str, Any]] = []

    now = datetime.utcnow().replace(microsecond=0)
    date_from = (now - timedelta(days=lookback_days)).isoformat() + "Z"
    date_to = now.isoformat() + "Z"

    print(f"[DB7][DEBUG] date_from={date_from}, date_to={date_to}")

    if not news_topic:
        print("⚠️ No keywords provided")
        return results

    # Ensure we always include English
    langs = set(user_lang or [])
    langs.add("en")

    # --- Loop over each user language + country ---
    for lang in langs:
        # Translate topic terms if needed
        if lang == "en":
            translated_terms = news_topic
        else:
            translated_terms = [translate_term(term, lang) for term in news_topic]

        # Build AND and OR queries
        and_query = " AND ".join(translated_terms)
        or_query = " OR ".join(translated_terms)

        countries_to_check = user_countries if user_countries else [None]

        for country in countries_to_check:
            # --- AND query first ---
            try:
                print(f"[DB7][DEBUG] Trying AND query lang={lang}, country={country}: {and_query}")
                rows = fetch_search(
                    api_key=GNEWS_KEY,
                    bucket="db7",
                    q=and_query,
                    lang=lang,
                    country=country,
                    max_items=max_items,
                    pages=1,
                    sort_by="relevance",
                    date_from=date_from,
                    date_to=date_to,
                )
                print(f"   ➡️ Found {len(rows)} articles (AND)")
                results.extend(rows)
                time.sleep(1)
            except Exception as e:
                print(f"⚠️ Error in AND fetch for lang={lang}, country={country}: {e}")
                rows = []

            # --- OR fallback ---
            if len(rows) < min_results:
                try:
                    print(f"[DB7][DEBUG] Trying OR query lang={lang}, country={country}: {or_query}")
                    rows_or = fetch_search(
                        api_key=GNEWS_KEY,
                        bucket="db7",
                        q=or_query,
                        lang=lang,
                        country=country,
                        max_items=max_items,
                        pages=1,
                        sort_by="relevance",
                        date_from=date_from,
                        date_to=date_to,
                    )
                    print(f"   ➡️ Found {len(rows_or)} articles (OR fallback)")
                    results.extend(rows_or)
                    time.sleep(1)
                except Exception as e:
                    print(f"⚠️ Error in OR fetch for lang={lang}, country={country}: {e}")

    # --- Translate fetched articles into English ---
    translated_results = []
    for a in results:
        try:
            title_en = query_llm(f"Translate into English (short, precise):\n\n{a.get('title','')}")
            desc_en = query_llm(f"Translate into English (short, precise):\n\n{a.get('description','')}")
        except Exception as e:
            print(f"⚠️ Translation failed for {a.get('url')}: {e}")
            title_en, desc_en = a.get("title",""), a.get("description","")

        a["title_en"] = title_en
        a["description_en"] = desc_en
        translated_results.append(a)

    return translated_results


# --- Step 4: Embed + rank ---
def embed_and_rank_articles(articles: List[Dict[str, Any]], query: str):
    """
    Embed article title+description and rank all articles against `query`.

    Args:
        articles: Article dicts; the keys read are "title", "description",
            "url", "published_at" and "source" (a dict with a "name" key,
            or a plain string).
        query: Free-text query the articles are ranked against.

    Returns:
        A `(vectorstore, ranked)` tuple. `vectorstore` is the in-memory FAISS
        index built from the articles; `ranked` is a list of dicts with the
        keys "title", "url", "date", "source", "snippet" (first 300 chars of
        the embedded text) and "score", in the order FAISS returned them.
        For an empty `articles` list the result is `(None, [])`, because an
        index cannot be built from zero documents.

    NOTE(review): "score" is whatever `similarity_search_with_score` returns;
    with LangChain's default FAISS setup that is presumably a distance
    (lower = closer) — confirm before thresholding on it.
    NOTE(review): the text that is embedded is the original title/description,
    not the `title_en` / `description_en` fields — confirm this is intended.
    """
    if not articles:
        return None, []

    docs = []
    for a in articles:
        content = (a.get("title") or "") + " " + (a.get("description") or "")

        # "source" may be a {"name": ...} dict, a plain string, or missing.
        source = a.get("source")
        if isinstance(source, dict):
            source_name = source.get("name") or ""
        else:
            source_name = str(source) if source else ""

        docs.append(Document(
            page_content=content,
            metadata={
                "url": a.get("url"),
                "title": a.get("title"),
                "date": a.get("published_at"),
                "source": source_name,
            }
        ))

    vectorstore = FAISS.from_documents(docs, embeddings)
    # k=len(docs): retrieve every article so the full list gets ranked.
    docs_and_scores = vectorstore.similarity_search_with_score(query, k=len(docs))

    ranked = []
    for d, score in docs_and_scores:
        ranked.append({
            # `or ""` because the metadata value itself may be None.
            "title": d.metadata.get("title") or "",
            "url": d.metadata.get("url") or "",
            "date": d.metadata.get("date") or "",
            "source": d.metadata.get("source") or "",
            "snippet": d.page_content[:300],
            "score": float(score),
        })
    return vectorstore, ranked

# --- Step 5: Fetch top full articles ---
def fetch_top_full_articles(ranked: List[Dict[str, Any]], top_n: int = 2):
    """
    Download the full text of the best-ranked articles.

    Walks `ranked` in order and tries at most `top_n` distinct URLs.
    Entries without a URL, or whose URL was already tried, are skipped
    without using up one of the `top_n` attempts.

    Args:
        ranked: Ranked article dicts with the keys "title", "url", "date",
            "source" and "score".
        top_n: Maximum number of distinct URLs to fetch; values <= 0
            select nothing.

    Returns:
        A list of dicts with the keys "title", "url", "date", "source",
        "full_text" and "score" — one per article whose full text could be
        fetched. It may hold fewer than `top_n` items. Fetch errors are
        printed and skipped, not raised.
    """
    selected = []
    attempted_urls = set()
    for r in ranked:
        if len(attempted_urls) >= top_n:
            break

        url = r.get("url")
        if not url or url in attempted_urls:
            continue
        attempted_urls.add(url)

        try:
            full_text = fetch_full_article(url)
        except Exception as e:
            # Best-effort: one unreachable page must not drop the others.
            print(f"⚠️ Full-article fetch failed for {url}: {e}")
            continue
        if not full_text:
            continue

        selected.append({
            "title": r.get("title", ""),
            "url": url,
            "date": r.get("date", ""),
            "source": r.get("source", ""),
            "full_text": full_text,
            "score": r.get("score"),
        })
    return selected

# Build a FAISS index from the full-text (db7) articles; saving is handled by merge_db7_into_db6.
def build_faiss_from_full_articles(full_articles: List[Dict[str, Any]]) -> FAISS:
    """
    Create an in-memory FAISS index from db7 full-text articles.

    Every article becomes one Document: its "full_text" is the page content
    and its "url", "title", "date", "source" and "score" values (None when
    absent) are carried along as metadata. The index is embedded with the
    module-level `embeddings` model.
    """
    metadata_keys = ("url", "title", "date", "source", "score")
    documents = [
        Document(
            page_content=article["full_text"],
            metadata={key: article.get(key) for key in metadata_keys},
        )
        for article in full_articles
    ]
    return FAISS.from_documents(documents, embeddings)

# Merge the db7 full-text index into the db6 index stored in Supabase.
def merge_db7_into_db6(full_articles: List[Dict[str, Any]], username: str):
    """
    Merge FAISS index built from db7 full text articles into db6 FAISS in Supabase.

    Args:
        full_articles: Full-text article dicts as accepted by
            `build_faiss_from_full_articles`.
        username: Passed to the Supabase download/save helpers to select
            the user's db6 index.

    Returns:
        The merged db6 index when an existing db6 was found, the newly built
        index when db6 did not exist yet (it is then saved as db6), or None
        when `full_articles` is empty (nothing is downloaded or saved).

    Raises:
        Any error from the merge or from saving propagates. Only a
        FileNotFoundError while downloading/loading db6 is treated as
        "db6 does not exist yet".
    """
    if not full_articles:
        # An index cannot be built from zero documents; leave db6 untouched.
        print("⚠️ No db7 full-text articles to merge into db6")
        return None

    # 1. Build FAISS from the new full articles
    new_db = build_faiss_from_full_articles(full_articles)

    # 2. Download existing db6 FAISS from Supabase
    try:
        tmp_dir = download_faiss_from_supabase("db6", username=username)
        # NOTE(review): allow_dangerous_deserialization=True lets load_local
        # unpickle the stored index data; that is only safe while the db6
        # files in Supabase cannot be written by untrusted parties — confirm.
        existing_db = FAISS.load_local(tmp_dir, embeddings, allow_dangerous_deserialization=True)
    except FileNotFoundError:
        # If db6 doesn’t exist yet, just create it
        save_faiss_to_supabase(new_db, db_key="db6", username=username)
        print(f"✅ Created new db6 FAISS from {len(full_articles)} db7 full-text articles")
        return new_db

    # 3. Merge and persist. Kept outside the try block so that a failure here
    #    can never fall into the "create" branch and overwrite db6 with only
    #    the new articles.
    existing_db.merge_from(new_db)
    save_faiss_to_supabase(existing_db, db_key="db6", username=username)
    print(f"✅ Merged {len(full_articles)} db7 full-text articles into db6 FAISS")
    return existing_db