import os
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List

import requests
from dateutil import parser
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from openai import OpenAI
from supabase import create_client

from supabase_ie import upload_text, download_faiss_from_supabase, save_faiss_to_supabase
from config import (
    SUPABASE_URL,
    SUPABASE_SERVICE_KEY,
    OPENAI_CLASSIFIER_MODEL,
    GNEWS_KEY,
    HF_EMBEDDING_MODEL,
    MY_SUPABASE_UUID,
)

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
SEARCH_URL = "https://gnews.io/api/v4/search"
MODEL = OPENAI_CLASSIFIER_MODEL
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
gnews_key = GNEWS_KEY
DEFAULT_TIMEOUT = 25
SIMILARITY_THRESHOLD = 0.6
UA = {"User-Agent": "Genesis-NewsBot/1.0 (+internal-use)"}

embeddings = HuggingFaceEmbeddings(
    model_name=HF_EMBEDDING_MODEL,
    encode_kwargs={"normalize_embeddings": True},
)
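
# With normalize_embeddings=True the embedding vectors are unit-length, so
# FAISS inner-product / L2 scores behave like cosine similarity. We assume
# SIMILARITY_THRESHOLD above (0.6) is calibrated against that cosine scale.
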
COUNTRY_NAME_TO_ISO2 = {
    # Anglosphere & Europe (core)
    "australia": "au",
    "new zealand": "nz",
    "united kingdom": "gb",
    "england": "gb",
    "scotland": "gb",
    "wales": "gb",
    "northern ireland": "gb",
    "ireland": "ie",
    "france": "fr",
    "germany": "de",
    "italy": "it",
    "spain": "es",
    "portugal": "pt",
    "netherlands": "nl",
    "belgium": "be",
    "switzerland": "ch",
    "austria": "at",
    "denmark": "dk",
    "sweden": "se",
    "norway": "no",
    "finland": "fi",
    "iceland": "is",
    "poland": "pl",
    "czech republic": "cz",
    "czechia": "cz",
    "slovakia": "sk",
    "hungary": "hu",
    "greece": "gr",
    "romania": "ro",
    "bulgaria": "bg",
    "croatia": "hr",
    "serbia": "rs",
    "slovenia": "si",
    "albania": "al",
    "bosnia and herzegovina": "ba",
    "north macedonia": "mk",
    "montenegro": "me",
    "estonia": "ee",
    "latvia": "lv",
    "lithuania": "lt",
    "luxembourg": "lu",
    "malta": "mt",
    "monaco": "mc",
    "andorra": "ad",
    "san marino": "sm",
    "vatican": "va",

    # Americas
    "united states": "us",
    "usa": "us",
    "canada": "ca",
    "mexico": "mx",
    "argentina": "ar",
    "brazil": "br",
    "chile": "cl",
    "colombia": "co",
    "peru": "pe",
    "uruguay": "uy",
    "paraguay": "py",
    "bolivia": "bo",
    "ecuador": "ec",
    "venezuela": "ve",
    "costa rica": "cr",
    "panama": "pa",
    "guatemala": "gt",
    "honduras": "hn",
    "el salvador": "sv",
    "nicaragua": "ni",
    "dominican republic": "do",
    "cuba": "cu",
    "jamaica": "jm",
    "trinidad and tobago": "tt",

    # Middle East & North Africa
    "turkey": "tr",
    "cyprus": "cy",
    "israel": "il",
    "palestine": "ps",
    "lebanon": "lb",
    "jordan": "jo",
    "saudi arabia": "sa",
    "united arab emirates": "ae",
    "uae": "ae",
    "qatar": "qa",
    "kuwait": "kw",
    "oman": "om",
    "bahrain": "bh",
    "iran": "ir",
    "iraq": "iq",
    "egypt": "eg",
    "morocco": "ma",
    "algeria": "dz",
    "tunisia": "tn",

    # Sub-Saharan Africa
    "south africa": "za",
    "nigeria": "ng",
    "ghana": "gh",
    "kenya": "ke",
    "ethiopia": "et",
    "tanzania": "tz",
    "uganda": "ug",
    "rwanda": "rw",
    "senegal": "sn",
    "ivory coast": "ci",
    "cote d'ivoire": "ci",
    "cameroon": "cm",
    "angola": "ao",
    "zambia": "zm",
    "zimbabwe": "zw",
    "botswana": "bw",
    "namibia": "na",
    "dr congo": "cd",
    "democratic republic of the congo": "cd",

    # Asia
    "china": "cn",
    "hong kong": "hk",
    "taiwan": "tw",
    "japan": "jp",
    "south korea": "kr",
    "korea, south": "kr",
    "north korea": "kp",
    "india": "in",
    "pakistan": "pk",
    "bangladesh": "bd",
    "sri lanka": "lk",
    "nepal": "np",
    "bhutan": "bt",
    "maldives": "mv",
    "indonesia": "id",
    "malaysia": "my",
    "singapore": "sg",
    "thailand": "th",
    "vietnam": "vn",
    "cambodia": "kh",
    "laos": "la",
    "myanmar": "mm",
    "philippines": "ph",
    "mongolia": "mn",
    "kazakhstan": "kz",
    "uzbekistan": "uz",

    # Oceania & Pacific
    "papua new guinea": "pg",
    "fiji": "fj",
    "samoa": "ws",
    "tonga": "to",
    "vanuatu": "vu",
    "solomon islands": "sb",

    # Eastern Europe
    "russia": "ru",
    "ukraine": "ua",
}


KEYWORDS = ["Culture", "Sport", "Soccer", "Football", "Science", "Politics", "News", "War"]
COUNTRY_LANG_MAP = {
    # EUROPE
    "at": ["de"],                    # Austria
    "be": ["fr", "nl", "de"],        # Belgium
    "bg": ["bg"],                    # Bulgaria
    "ch": ["de", "fr", "it"],        # Switzerland
    "cz": ["cs"],                    # Czech Republic
    "de": ["de"],                    # Germany
    "dk": ["da"],                    # Denmark
    "ee": ["et"],                    # Estonia
    "es": ["es"],                    # Spain
    "fi": ["fi", "sv"],              # Finland
    "fr": ["fr"],                    # France
    "gr": ["el"],                    # Greece
    "hu": ["hu"],                    # Hungary
    "it": ["it"],                    # Italy
    "lt": ["lt"],                    # Lithuania
    "lv": ["lv"],                    # Latvia
    "nl": ["nl"],                    # Netherlands
    "no": ["no"],                    # Norway
    "pl": ["pl"],                    # Poland
    "pt": ["pt"],                    # Portugal
    "ro": ["ro"],                    # Romania
    "ru": ["ru"],                    # Russia
    "se": ["sv"],                    # Sweden
    "sk": ["sk"],                    # Slovakia
    "sl": ["sl"],                    # Slovenia
    "tr": ["tr"],                    # Turkey
    "ua": ["uk"],                    # Ukraine

    # AMERICAS (Spanish/Portuguese)
    "ar": ["es"],                    # Argentina
    "bo": ["es"],                    # Bolivia
    "br": ["pt"],                    # Brazil
    "ca": ["fr"],                    # Canada (keep FR only; EN is implicit)
    "cl": ["es"],                    # Chile
    "co": ["es"],                    # Colombia
    "cr": ["es"],                    # Costa Rica
    "cu": ["es"],                    # Cuba
    "do": ["es"],                    # Dominican Republic
    "ec": ["es"],                    # Ecuador
    "gt": ["es"],                    # Guatemala
    "hn": ["es"],                    # Honduras
    "mx": ["es"],                    # Mexico
    "ni": ["es"],                    # Nicaragua
    "pa": ["es"],                    # Panama
    "pe": ["es"],                    # Peru
    "py": ["es"],                    # Paraguay
    "sv": ["es"],                    # El Salvador
    "uy": ["es"],                    # Uruguay
    "ve": ["es"],                    # Venezuela

    # MIDDLE EAST & NORTH AFRICA
    "ae": ["ar"],                    # UAE
    "dz": ["ar", "fr"],              # Algeria
    "eg": ["ar"],                    # Egypt
    "il": ["he", "ar"],              # Israel
    "iq": ["ar"],                    # Iraq
    "jo": ["ar"],                    # Jordan
    "kw": ["ar"],                    # Kuwait
    "lb": ["ar"],                    # Lebanon
    "ma": ["ar", "fr"],              # Morocco
    "om": ["ar"],                    # Oman
    "ps": ["ar"],                    # Palestine
    "qa": ["ar"],                    # Qatar
    "sa": ["ar"],                    # Saudi Arabia
    "tn": ["ar", "fr"],              # Tunisia

    # SUB-SAHARAN AFRICA (only supported non-EN)
    "ao": ["pt"],                    # Angola
    "cd": ["fr"],                    # DR Congo
    "cm": ["fr"],                    # Cameroon
    "ci": ["fr"],                    # Côte d’Ivoire
    "mz": ["pt"],                    # Mozambique
    "sn": ["fr"],                    # Senegal

    # ASIA
    "cn": ["zh"],                    # China
    "hk": ["zh"],                    # Hong Kong
    "id": ["id"],                    # Indonesia
    "in": ["hi"],                    # India
    "jp": ["ja"],                    # Japan
    "kr": ["ko"],                    # South Korea
    "sg": ["zh"],                    # Singapore (non-EN: Chinese)
    "th": ["th"],                    # Thailand
    "tw": ["zh"],                    # Taiwan
    "vn": ["vi"],                    # Vietnam
}
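
# How the two maps compose (values shown are what the current tables return):
#   iso2 = COUNTRY_NAME_TO_ISO2["italy"]   # "it"
#   COUNTRY_LANG_MAP.get(iso2, ["en"])     # ["it"]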


def query_llm(prompt: str, model: str = MODEL) -> str:
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content.strip()
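
# Example (illustrative prompt/response, not from a real run):
#   query_llm("Answer with one word, Sport or Politics: 'Cup final goes to penalties'")
#   -> "Sport"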


def fetch_gnews_for_country(country: str, lookback_days: int = 3, max_items: int = 5) -> List[Dict[str, Any]]:
    if not gnews_key:
        print("⚠️ GNEWS_KEY not set, skipping fetch")
        return []

    # Language resolution
    native_langs = COUNTRY_LANG_MAP.get(country, ["en"])
    main_lang = native_langs[0] if isinstance(native_langs, list) else native_langs

    # Allow only GNews-supported languages; include 'el' for Greece
    supported_langs = ["en","it","fr","de","es","pt","ru","zh","ja","nl","no","sv","ro","uk","ar","hi","el"]
    if main_lang not in supported_langs:
        print(f"⚠️ Language '{main_lang}' not supported by GNews, falling back to English")
        main_lang = "en"

    now = datetime.utcnow()
    date_from = (now - timedelta(days=lookback_days)).isoformat() + "Z"
    date_to = now.isoformat() + "Z"

    results: List[Dict[str, Any]] = []
    for kw in KEYWORDS:
        try:
            if main_lang == "en":
                # 1) English with country
                rows = fetch_search(
                    api_key=gnews_key, bucket=kw, q=kw, lang="en", country=country,
                    max_items=max_items, pages=1, sort_by="relevance",
                    date_from=date_from, date_to=date_to
                )
                results.extend(rows)
                time.sleep(1)

                # 2) Global English (no country)
                rows = fetch_search(
                    api_key=gnews_key, bucket=kw, q=kw, lang="en", country=None,
                    max_items=max_items, pages=1, sort_by="relevance",
                    date_from=date_from, date_to=date_to
                )
                results.extend(rows)
                time.sleep(1)
            else:
                # 1) Native with country
                rows = fetch_search(
                    api_key=gnews_key, bucket=kw, q=translate_term(kw, main_lang), lang=main_lang, country=country,
                    max_items=max_items, pages=1, sort_by="relevance",
                    date_from=date_from, date_to=date_to
                )
                results.extend(rows)
                time.sleep(1)

                # 2) Global English
                rows = fetch_search(
                    api_key=gnews_key, bucket=kw, q=kw, lang="en", country=None,
                    max_items=max_items, pages=1, sort_by="relevance",
                    date_from=date_from, date_to=date_to
                )
                results.extend(rows)
                time.sleep(1)
        except Exception as e:
            # Never crash on per-keyword errors
            print(f"⚠️ Fetch error for {country}/{kw}: {e}")
            continue

    # Dedupe by 'id' (the URL)
    seen = set()
    deduped = []
    for r in results:
        rid = r.get("id")
        if rid and rid not in seen:
            seen.add(rid)
            deduped.append(r)
    return deduped
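
# Minimal usage sketch (assumes GNEWS_KEY is configured):
#   articles = fetch_gnews_for_country("it", lookback_days=2, max_items=5)
#   # -> deduped dicts with id/title/description/url/published_at/source/...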

def translate_term(term: str, target_lang: str) -> str:
    if target_lang.lower() in ["en", ""]:
        return term
    prompt = f"Translate the following word into {target_lang}. Only give the translation:\n\n{term}"
    return query_llm(prompt, model=MODEL)

def translate_articles_to_english(articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Translate article titles and descriptions into English.
    Keeps both native and English fields in each article dict.
    """
    out = []
    for a in articles:
        # Translate title
        if a.get("title"):
            a["title_en"] = query_llm(
                f"Translate this into English (short and precise, keep meaning):\n\n{a['title']}"
            )
        else:
            a["title_en"] = ""

        # Translate description (summary)
        if a.get("description"):
            a["description_en"] = query_llm(
                f"Translate this into English (short and precise, keep meaning):\n\n{a['description']}"
            )
        else:
            a["description_en"] = ""

        out.append(a)
    return out
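
# Rough cost arithmetic: each article triggers up to two LLM calls (title and
# description), so a batch of 50 articles means at most 100 calls.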


def save_articles_to_txt(articles: List[Dict[str, Any]]):
    """
    Save all fetched articles to a text file in Supabase.
    """
    if not articles:
        print("⚠️ No articles to upload to Supabase (db3 summary)")
        return

    lines = []
    for a in articles:
        lines.append(f"{a.get('title_en') or a.get('title')}")
        lines.append(f"{a.get('description_en') or a.get('description')}")
        lines.append(f"URL: {a.get('url')}")
        lines.append(f"Date: {a.get('published_at','')}")
        lines.append(f"Source: {(a.get('source') or {}).get('name','')}")
        lines.append("-" * 80)

    content = "\n".join(lines)

    # Example bucket structure: shared/db3
    bucket = "Databases"
    path = "shared/db3/news_summary.txt"
    upload_text(bucket=bucket, path=path, text=content)

    print(f"✅ Saved {len(articles)} articles to Supabase bucket {bucket}/{path}")


def fetch_search(
    api_key: str,
    *,
    bucket: str,
    q: str,
    lang: str,
    country: str | None,
    max_items: int,
    pages: int,
    sort_by: str,
    date_from: str,
    date_to: str,
):
    params = {
        "q": q,
        "max": max_items,
        "page": pages,  # GNews takes a single page number here
        "sortby": sort_by,
        "lang": lang,
        "from": date_from,
        "to": date_to,
        "apikey": api_key,
    }
    # Only send 'country' when one is requested; omitting it returns global results.
    if country:
        params["country"] = country

    data = gnews_get(SEARCH_URL, params)
    if not data or "articles" not in data:
        return []  # graceful failure

    rows = []
    for a in data.get("articles", []):
        rows.append({
            "id": a.get("url"),  # use url as stable id for dedupe
            "title": a.get("title"),
            "description": a.get("description"),
            "url": a.get("url"),
            "published_at": a.get("publishedAt"),
            "source": a.get("source", {}),
            "lang": lang,
            "bucket": bucket,
            "country": country or "",
        })
    return rows
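
# Shape of one returned row (placeholder values for illustration):
#   {"id": "<url>", "title": "...", "description": "...", "url": "<url>",
#    "published_at": "2025-01-01T08:00:00Z", "source": {"name": "..."},
#    "lang": "en", "bucket": "Sport", "country": "au"}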


def dedupe_by_id(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    out = []
    for it in items:
        if it["id"] in seen:
            continue
        seen.add(it["id"])
        out.append(it)
    return out
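
# Example (illustrative):
#   dedupe_by_id([{"id": "u1"}, {"id": "u1"}, {"id": "u2"}])
#   -> [{"id": "u1"}, {"id": "u2"}]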

def gnews_get(url: str, params: dict, max_retries: int = 3, timeout: int = 15):
    """
    Safe GET with exponential backoff:
    - retries 429/5xx with 1s, 2s, 4s sleeps
    - returns parsed JSON or None on final failure
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code in (429, 500, 502, 503, 504):
                # transient: backoff then retry
                time.sleep(2 ** attempt)
                continue
            r.raise_for_status()
            return r.json()
        except requests.RequestException as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            print(f"⚠️ GNews request failed: {e}")
            return None

    # All retries hit transient 429/5xx responses; give up gracefully.
    print("⚠️ GNews request failed: retries exhausted on transient errors")
    return None

def insert_new_articles_to_faiss(articles: List[Dict[str, Any]]):
    """
    Insert new (existing=False) articles from unmatched_articles into FAISS (db3) in Supabase.
    """
    docs = []
    for a in articles:
        page_content = (a.get("title_en") or "") + "\n" + (a.get("description_en") or "")
        date_str = a.get("published_at")
        try:
            date_val = parser.parse(date_str).astimezone().isoformat() if date_str else None
        except Exception:
            date_val = None
        docs.append(Document(
            page_content=page_content,
            metadata={
                "title_native": a.get("title_native"),
                "title_en": a.get("title_en"),
                "summary_native": a.get("description_native"),
                "summary_en": a.get("description_en"),
                "url": a.get("url"),
                "date": date_val,
                "source": a.get("source", ""),
                "lang": a.get("lang", "")
            }
        ))

    if not docs:
        print("⚠️ No new documents to insert into FAISS.")
        return

    new_db = FAISS.from_documents(docs, embeddings)

    try:
        # Download existing FAISS from Supabase (if present)
        tmp_dir = download_faiss_from_supabase("db3", username="None")
        existing = FAISS.load_local(tmp_dir, embeddings, allow_dangerous_deserialization=True)
        existing.merge_from(new_db)
        save_faiss_to_supabase(existing, db_key="db3", username="None")
        print(f"✅ Merged {len(docs)} new articles into FAISS (db3)")
    except FileNotFoundError:
        # If no FAISS exists, just upload the new one
        save_faiss_to_supabase(new_db, db_key="db3", username="None")
        print(f"✅ Created new FAISS (db3) with {len(docs)} articles")


def mark_all_existing():
    """Mark all rows as existing before inserting new ones."""
    supabase.table("unmatched_articles").update({"existing": True}).not_.is_("url", None).execute()

CHUNK = 500  # adjust if you batch very large lists

def upsert_unmatched_articles(articles: list[dict]) -> int:
    """
    Insert/update articles into public.unmatched_articles keyed by URL.
    - New URL  -> inserted.
    - Existing URL -> updated with the new fields.
    After upsert, all batch URLs are set to existing=False (deterministic).
    
    REQUIREMENT (run once in DB):
        CREATE UNIQUE INDEX IF NOT EXISTS unmatched_articles_url_uidx
        ON public.unmatched_articles (url);
    """
    # Build rows to write (ONLY include fields you want to overwrite)
    rows = []
    for a in articles:
        url = a.get("url")
        if not url:
            continue
        rows.append({
            "user_id": MY_SUPABASE_UUID,
            "url": url,
            "title_native": a.get("title"),
            "title_en": a.get("title_en"),
            "description_native": a.get("description"),
            "description_en": a.get("description_en"),
            "published_at": a.get("published_at"),  # ensure ISO string compatible with your column type
            "source": (a.get("source") or {}).get("name", ""),
            "lang": a.get("lang"),
            "existing": False,  # current batch should be marked as "new"
            "updated_at": datetime.utcnow().isoformat() + "Z",  # if column is timestamptz
        })

    if not rows:
        return 0
    
    deduped = {}
    for r in rows:
        u = r["url"].strip()
        if u:
            r["url"] = u
            deduped[u] = r
    rows = list(deduped.values())

    # 1) True UPSERT by URL (insert new, update existing)
    #    on_conflict="url" requires a UNIQUE index or PK on url
    resp = supabase.table("unmatched_articles").upsert(
        rows,
        on_conflict="url"  # <- key difference from naive insert+catch
        # , returning="minimal"   # uncomment to reduce payload if supported
    ).execute()
    if getattr(resp, "error", None):
        raise RuntimeError(f"Upsert failed: {resp.error}")

    # 2) Deterministically set the whole batch to existing=False (covers the UPDATE-on-conflict path)
    urls = [r["url"] for r in rows]
    for i in range(0, len(urls), CHUNK):
        r = supabase.table("unmatched_articles") \
            .update({"existing": False}) \
            .in_("url", urls[i:i+CHUNK]) \
            .execute()
        if getattr(r, "error", None):
            raise RuntimeError(f"Force existing=False failed: {r.error}")

    return len(rows)
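
# End-to-end refresh sketch (illustrative ordering, using the helpers in this
# module; the actual pipeline may differ):
#   mark_all_existing()                      # flag current rows existing=True
#   batch = fetch_gnews_for_country("au")
#   batch = translate_articles_to_english(batch)
#   upsert_unmatched_articles(batch)         # re-flags this batch existing=False
#   new_rows = get_new_articles()            # only rows with existing=False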


def get_new_articles():
    """Get new articles (existing=False)."""
    res = supabase.table("unmatched_articles").select("*").eq("existing", False).execute()
    return res.data if hasattr(res, "data") and res.data else []

def _read_countries() -> list[str]:
    """
    Read countries across ALL users from user_profiles table.
    Return a unique list of ISO2 country codes.
    """
    countries = set()
    try:
        res = supabase.table("user_profiles").select("living_country, origin_country").execute()
        if not res.data:
            return []

        for row in res.data:
            for field in ["living_country", "origin_country"]:
                c = row.get(field)
                if not c:
                    continue
                iso2 = COUNTRY_NAME_TO_ISO2.get(c.strip().lower())
                if iso2:
                    countries.add(iso2)
                else:
                    print(f"⚠️ No ISO2 mapping for '{c}'")
    except Exception as e:
        print(f"⚠️ Failed to fetch user_profiles: {e}")
    return list(countries)

# Provides countries and languages for a single user; used by the db6 and db7
# pipelines. It lives in db6_utils.py because its inputs (the maps above) are defined here.
def get_user_countries_and_languages() -> tuple[list[str], list[str]]:
    """
    For each call:
    - Fetch living_country and origin_country from all users.
    - Convert them into ISO2 codes (using COUNTRY_NAME_TO_ISO2).
    - Expand into languages (using COUNTRY_LANG_MAP).
    Always include English ("en").

    Returns:
      user_countries: list of ISO2 codes (e.g., ["au","it"])
      user_languages: list of language codes (e.g., ["en","it"])
    """
    user_countries: list[str] = []
    user_languages: list[str] = ["en"]  # English is always included

    try:
        res = supabase.table("user_profiles").select("living_country, origin_country").execute()
        if not res.data:
            return [], ["en"]

        row = res.data[0]

        for field in ["living_country", "origin_country"]:
            c = row.get(field)
            if not c:
                continue
            iso2 = COUNTRY_NAME_TO_ISO2.get(c.strip().lower())
            if not iso2:
                print(f"⚠️ No ISO2 mapping for '{c}'")
                continue
            if iso2 not in user_countries:
                user_countries.append(iso2)
                # add languages for this country if available
                for lang in COUNTRY_LANG_MAP.get(iso2, []):
                    if lang not in user_languages:
                        user_languages.append(lang)

    except Exception as e:
        print(f"⚠️ Failed to fetch countries/langs for user: {e}")

    return user_countries, user_languages
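
# Example (illustrative, matching the docstring): a profile with
# living_country="Australia" and origin_country="Italy" yields
# (["au", "it"], ["en", "it"]); "au" has no COUNTRY_LANG_MAP entry, so only
# the default "en" plus Italian come back.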