prz4587 commited on
Commit
e925eab
·
verified ·
1 Parent(s): 338be64

Upload folder using huggingface_hub

Browse files
0-data-prepare.ipynb ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "7384109c-4507-4895-ac34-8400e2978021",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import ujson as json"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "de47a236-bb5d-4cb9-89e4-a8eb3f66abaa",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "art_en_crime.json 22116 dict_keys(['id', 'embedding', 'themes', 'language', 'title', 'content', 'cluster_id'])\n",
24
+ "fincrime_train.json 27548 dict_keys(['content', 'themes', 'embedding', 'llm_themes', 'cl_themes'])\n",
25
+ "train_multilang_2.json 119583 dict_keys(['language', 'content', 'id', 'lang', 'embedding', 'cluster_id', 'themes', 'llm_themes'])\n",
26
+ "other_lang_train.json 149000 dict_keys(['embedding', 'themes', 'id', 'content'])\n"
27
+ ]
28
+ },
29
+ {
30
+ "data": {
31
+ "text/plain": [
32
+ "318247"
33
+ ]
34
+ },
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "output_type": "execute_result"
38
+ }
39
+ ],
40
+ "source": [
41
+ "train_files = ['art_en_crime.json', 'fincrime_train.json', 'train_multilang_2.json', 'other_lang_train.json']\n",
42
+ "# 'en_train.json'\n",
43
+ "train_data = []\n",
44
+ "for train_file in train_files:\n",
45
+ " tm = json.load(open('topic_data_new_sorted/'+train_file))\n",
46
+ " print(train_file, len(tm), tm[0].keys())\n",
47
+ " train_data.extend(tm)\n",
48
+ "len(train_data)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 3,
54
+ "id": "97e48648-4a9f-40b9-8b95-2740a93b4762",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "for i in train_data:\n",
59
+ " del i['embedding']"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 4,
65
+ "id": "b3b5f700-5467-4d8b-963d-060a1962ab00",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "final_data = []\n",
70
+ "contents = set()\n",
71
+ "for i in train_data:\n",
72
+ " if not i.get('content'):\n",
73
+ " continue\n",
74
+ " th = hash(i['content'])\n",
75
+ " if th not in contents:\n",
76
+ " final_data.append(i)\n",
77
+ " contents.add(th)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 5,
83
+ "id": "3527a7a2-b58c-406b-890e-99002ef050df",
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "data": {
88
+ "text/plain": [
89
+ "(22116, 245203, 318247)"
90
+ ]
91
+ },
92
+ "execution_count": 5,
93
+ "metadata": {},
94
+ "output_type": "execute_result"
95
+ }
96
+ ],
97
+ "source": [
98
+ "tc, cc = 0, 0\n",
99
+ "for i in train_data:\n",
100
+ " if i.get('title'):\n",
101
+ " tc += 1\n",
102
+ " if i.get('content'):\n",
103
+ " cc += 1\n",
104
+ "tc, cc, len(train_data)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 6,
110
+ "id": "460312d0-236c-418d-a53e-1357fdc586bc",
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "144496"
117
+ ]
118
+ },
119
+ "execution_count": 6,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "len(final_data)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 7,
131
+ "id": "631689b1-c18d-427c-baea-8d2d3d4b9781",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "{'Politics': 44642,\n",
138
+ " 'Crime': 58051,\n",
139
+ " 'Financial Crime': 24289,\n",
140
+ " 'Business': 25247,\n",
141
+ " 'Entertainment': 18221,\n",
142
+ " 'Finance': 6166,\n",
143
+ " 'Economics': 1772,\n",
144
+ " 'Sports': 15184,\n",
145
+ " 'Tech': 4107,\n",
146
+ " 'Automotive': 2116,\n",
147
+ " 'Health': 7829,\n",
148
+ " 'Lifestyle': 368,\n",
149
+ " 'Science': 3481,\n",
150
+ " 'Travel': 914,\n",
151
+ " 'Weather': 1070,\n",
152
+ " 'General': 8972,\n",
153
+ " 'Types of life insurance fraud': 1,\n",
154
+ " 'Consequences of insurance fraud': 1,\n",
155
+ " 'How to prevent life insurance fraud': 1,\n",
156
+ " 'Front Running': 1,\n",
157
+ " 'fraud': 1}"
158
+ ]
159
+ },
160
+ "execution_count": 7,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "themes = {}\n",
167
+ "for i in final_data:\n",
168
+ " for t in i['themes']:\n",
169
+ " themes[t] = themes.get(t, 0) + 1\n",
170
+ "themes"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 8,
176
+ "id": "a7bccdc9-de94-4e7b-b686-0fa062f5e781",
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "name": "stderr",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
184
+ " from .autonotebook import tqdm as notebook_tqdm\n"
185
+ ]
186
+ },
187
+ {
188
+ "data": {
189
+ "text/plain": [
190
+ "'tr'"
191
+ ]
192
+ },
193
+ "execution_count": 8,
194
+ "metadata": {},
195
+ "output_type": "execute_result"
196
+ }
197
+ ],
198
+ "source": [
199
+ "from fast_langdetect import detect\n",
200
+ "result = detect(text=\"Bugün hava çok güzel\", model='full', k=1)\n",
201
+ "result[0]['lang']"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 9,
207
+ "id": "ed34643a-974c-4995-b349-6ce194397985",
208
+ "metadata": {},
209
+ "outputs": [
210
+ {
211
+ "data": {
212
+ "text/plain": [
213
+ "{'en': 49682,\n",
214
+ " 'hi': 541,\n",
215
+ " 'mt': 1,\n",
216
+ " 'fr': 11193,\n",
217
+ " 'de': 12987,\n",
218
+ " 'pt': 1586,\n",
219
+ " 'it': 1682,\n",
220
+ " 'ko': 455,\n",
221
+ " 'es': 4034,\n",
222
+ " 'zh': 5,\n",
223
+ " 'ar': 35732,\n",
224
+ " 'bn': 420,\n",
225
+ " 'pl': 1387,\n",
226
+ " 'nl': 824,\n",
227
+ " 'lv': 379,\n",
228
+ " 'hk': 345,\n",
229
+ " 'hr': 511,\n",
230
+ " 'ja': 591,\n",
231
+ " 'cy': 182,\n",
232
+ " 'sv': 852,\n",
233
+ " 'da': 2120,\n",
234
+ " 'el': 462,\n",
235
+ " 'tr': 507,\n",
236
+ " 'ro': 546,\n",
237
+ " 'ur': 723,\n",
238
+ " 'mr': 344,\n",
239
+ " 'so': 338,\n",
240
+ " 'fa': 729,\n",
241
+ " 'mk': 451,\n",
242
+ " 'gu': 385,\n",
243
+ " 'th': 617,\n",
244
+ " 'lt': 444,\n",
245
+ " 'tw': 246,\n",
246
+ " 'sl': 568,\n",
247
+ " 'ml': 414,\n",
248
+ " 'te': 344,\n",
249
+ " 'he': 526,\n",
250
+ " 'cs': 671,\n",
251
+ " 'et': 660,\n",
252
+ " 'ta': 463,\n",
253
+ " 'gl': 9,\n",
254
+ " 'id': 538,\n",
255
+ " 'ca': 506,\n",
256
+ " 'ast': 1,\n",
257
+ " 'eu': 1,\n",
258
+ " 'sk': 455,\n",
259
+ " 'sq': 424,\n",
260
+ " 'ne': 429,\n",
261
+ " 'fi': 652,\n",
262
+ " 'sw': 394,\n",
263
+ " 'bg': 507,\n",
264
+ " 'ru': 653,\n",
265
+ " 'hu': 564,\n",
266
+ " 'cn': 261,\n",
267
+ " 'vi': 535,\n",
268
+ " 'pa': 343,\n",
269
+ " 'no': 1156,\n",
270
+ " 'tl': 559,\n",
271
+ " 'uk': 500,\n",
272
+ " 'kn': 374,\n",
273
+ " 'af': 611,\n",
274
+ " 'arz': 58,\n",
275
+ " 'nn': 6,\n",
276
+ " 'dv': 10,\n",
277
+ " 'azb': 1,\n",
278
+ " 'sd': 1,\n",
279
+ " 'ckb': 1}"
280
+ ]
281
+ },
282
+ "execution_count": 9,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "langs = {}\n",
289
+ "for i in final_data:\n",
290
+ " l = i.get('language')\n",
291
+ " if not l:\n",
292
+ " l = i.get('language')\n",
293
+ " if not l:\n",
294
+ " l = detect(text=i['content'], model='full', k=1)[0]['lang']\n",
295
+ " langs[l] = langs.get(l, 0) + 1\n",
296
+ "langs"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 12,
302
+ "id": "dcdc4560-2bcb-4d90-a64d-436b85f23bc6",
303
+ "metadata": {},
304
+ "outputs": [],
305
+ "source": [
306
+ "del train_data"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 2,
312
+ "id": "770489b9-78c2-4231-9c8b-ba000342fe6d",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
+ "final_data = json.load(open('train1.json'))"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 7,
322
+ "id": "a69be4e4-08b3-4830-9b4d-b8ea67f15656",
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "import requests\n",
327
+ "from requests.adapters import HTTPAdapter\n",
328
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
329
+ "from typing import List, Dict, Any, Optional, Tuple\n",
330
+ "\n",
331
+ "def _make_session(pool_size: int) -> requests.Session:\n",
332
+ " s = requests.Session()\n",
333
+ " adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size, max_retries=0)\n",
334
+ " s.mount(\"http://\", adapter)\n",
335
+ " s.mount(\"https://\", adapter)\n",
336
+ " return s\n",
337
+ "\n",
338
+ "\n",
339
+ "def _embed_batch(\n",
340
+ " session: requests.Session,\n",
341
+ " base_url: str,\n",
342
+ " text: str,\n",
343
+ " timeout_s: float,\n",
344
+ " extra_headers: Optional[Dict[str, str]] = None,\n",
345
+ ") -> Any:\n",
346
+ " \"\"\"\n",
347
+ " Native TEI endpoint: POST {base_url}/embed\n",
348
+ " Payload: {\"inputs\": [..texts..]}\n",
349
+ " \"\"\"\n",
350
+ " url = base_url.rstrip(\"/\") + \"/embed\"\n",
351
+ " headers = {\"Content-Type\": \"application/json\"}\n",
352
+ " if extra_headers:\n",
353
+ " headers.update(extra_headers)\n",
354
+ " r = session.post(url, json={\"inputs\": text}, headers=headers, timeout=timeout_s)\n",
355
+ " r.raise_for_status()\n",
356
+ " return r.json()\n",
357
+ "\n",
358
+ "def one_call(art):\n",
359
+ " if \"embedding\" in art:\n",
360
+ " return\n",
361
+ " text = []\n",
362
+ " if art.get(\"title\"):\n",
363
+ " text.append(art[\"title\"])\n",
364
+ " if art.get(\"content\"):\n",
365
+ " text.append(art[\"content\"])\n",
366
+ " text = \"\\n\\n\".join(text)\n",
367
+ " res = _embed_batch(session, base_url, text, timeout_s=30)\n",
368
+ " art[\"embedding\"] = res[0]\n",
369
+ "\n",
370
+ "WORKERS = 64\n",
371
+ "session = _make_session(pool_size=WORKERS)\n",
372
+ "base_url = \"http://172.83.12.123:8080\""
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 6,
378
+ "id": "e8d7127f-af28-4029-aa16-35b52269317a",
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "with ThreadPoolExecutor(max_workers=WORKERS) as ex:\n",
383
+ " _ = [ex.submit(one_call, art) for art in final_data]"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 10,
389
+ "id": "1ba25f4b-7bec-40a6-9b78-d014a64674c4",
390
+ "metadata": {},
391
+ "outputs": [
392
+ {
393
+ "data": {
394
+ "text/plain": [
395
+ "(144496, 144496)"
396
+ ]
397
+ },
398
+ "execution_count": 10,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "te = 0\n",
405
+ "for i in final_data:\n",
406
+ " if \"embedding\" in i and len(i[\"embedding\"]) == 1024:\n",
407
+ " te += 1\n",
408
+ "te, len(final_data)"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 9,
414
+ "id": "1e821b09-cfce-4ae6-82c6-1d786efcc676",
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "json.dump(final_data, open('train1.json', 'w'))"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": null,
424
+ "id": "19a5069a-6a0b-4ed2-ae4c-173899b4d632",
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": []
428
+ },
429
+ {
430
+ "cell_type": "code",
431
+ "execution_count": null,
432
+ "id": "4b02dcf6-2d64-4dab-a417-da3d2c411254",
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": []
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 9,
440
+ "id": "f427d890-ce22-4a04-9157-04528db80369",
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stdout",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "en_test.json 750 750 750\n",
448
+ "other_lang_test.json 1000 981 981\n",
449
+ "spanish_tagged.json 1199 1197 1197\n",
450
+ "fincrime_test.json 623 623 623\n"
451
+ ]
452
+ }
453
+ ],
454
+ "source": [
455
+ "files = [\"en_test.json\", \"other_lang_test.json\", \"spanish_tagged.json\", \"fincrime_test.json\"]\n",
456
+ "for file in files:\n",
457
+ " tm = json.load(open('topic_data_new_sorted/'+file))\n",
458
+ " tf = []\n",
459
+ " cc, em = 0, 0\n",
460
+ " contents = set()\n",
461
+ " for i in tm:\n",
462
+ " if i['content'].lower() in contents:\n",
463
+ " continue\n",
464
+ " if i.get('content'):\n",
465
+ " cc += 1\n",
466
+ " contents.add(i['content'].lower())\n",
467
+ " if \"embedding\" in i:\n",
468
+ " del i[\"embedding\"]\n",
469
+ " tf.append(i)\n",
470
+ " with ThreadPoolExecutor(max_workers=WORKERS) as ex:\n",
471
+ " _ = [ex.submit(one_call, art) for art in tf]\n",
472
+ " for i in tf:\n",
473
+ " if \"embedding\" in i and len(i[\"embedding\"]) == 1024:\n",
474
+ " em += 1\n",
475
+ " json.dump(tf, open('test/' + file, 'w'))\n",
476
+ " print(file, len(tm), cc, em)"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": null,
482
+ "id": "7e42ec21-99df-4b62-b678-86bf24bb259e",
483
+ "metadata": {},
484
+ "outputs": [],
485
+ "source": []
486
+ }
487
+ ],
488
+ "metadata": {
489
+ "kernelspec": {
490
+ "display_name": "Python 3 (ipykernel)",
491
+ "language": "python",
492
+ "name": "python3"
493
+ },
494
+ "language_info": {
495
+ "codemirror_mode": {
496
+ "name": "ipython",
497
+ "version": 3
498
+ },
499
+ "file_extension": ".py",
500
+ "mimetype": "text/x-python",
501
+ "name": "python",
502
+ "nbconvert_exporter": "python",
503
+ "pygments_lexer": "ipython3",
504
+ "version": "3.10.12"
505
+ }
506
+ },
507
+ "nbformat": 4,
508
+ "nbformat_minor": 5
509
+ }
1-training.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
2-prepare-data-2.ipynb ADDED
@@ -0,0 +1,1933 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a184875c-343b-4a39-ac51-2a80f8f661a5",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import ujson as json\n",
11
+ "import random\n",
12
+ "import os"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "a2f82047-027a-4950-8a17-6c8437d2f2c9",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "1. prepare dataset by lang + theme\n",
23
+ "2. choose MMR candidates (take atleast 15K each theme and 100 per each lang, en=7K, other=8K)\n",
24
+ "3. calculate embedding of qwen\n",
25
+ "4. again choose MMR based on qwen embedding also make list of llm verify candidate when qwen emb not matching with given themes (5K each)\n",
26
+ "5. confirm with LLM for selected candidates\n",
27
+ "6. split train & test also pre-train"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "markdown",
32
+ "id": "e1679138-5178-4262-a2f1-5589dd80c7e9",
33
+ "metadata": {},
34
+ "source": [
35
+ "# 1. prepare dataset by lang + theme"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "c70cc644-b406-4105-8bef-5dad0552d354",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "files = ['art_en.json', 'art_en2.json', 'art_non_en.json', 'art_non_en2.json']\n",
46
+ "for file in files:\n",
47
+ " fo = open('raw_articles/' + file)\n",
48
+ " tt = {}\n",
49
+ " for line in fo:\n",
50
+ " d = json.loads(line)\n",
51
+ " if not d['_source'].get('nlp'):\n",
52
+ " continue\n",
53
+ " for t in d['_source']['nlp']['theme']:\n",
54
+ " tt[t] = tt.get(t, 0) + 1\n",
55
+ " tt = dict(sorted(tt.items(), key=lambda item: item[1]))\n",
56
+ " print(file, tt)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "id": "94a9c39a-e060-4b87-8edd-e65a3c480c86",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "files = ['art_en2_specific.json', 'art_non_en_specific.json']\n",
67
+ "for file in files:\n",
68
+ " fo = open('raw_articles/' + file)\n",
69
+ " tt = {}\n",
70
+ " for line in fo:\n",
71
+ " d = json.loads(line)\n",
72
+ " if not d['_source'].get('nlp'):\n",
73
+ " continue\n",
74
+ " for t in d['_source']['nlp']['theme']:\n",
75
+ " tt[t] = tt.get(t, 0) + 1\n",
76
+ " tt = dict(sorted(tt.items(), key=lambda item: item[1]))\n",
77
+ " print(file, tt)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "b2301a8d-aef2-4776-9d02-c35aae042267",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": []
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "e28c0287-b649-4dea-a6f3-1e83a2669b69",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "files = ['art_en.json', 'art_en2.json', 'art_non_en.json', 'art_non_en2.json', 'art_en2_specific.json', 'art_non_en_specific.json']\n",
96
+ "tt = {}\n",
97
+ "for file in files:\n",
98
+ " fo = open('raw_articles/' + file)\n",
99
+ " for line in fo:\n",
100
+ " d = json.loads(line)\n",
101
+ " if not d['_source'].get('nlp'):\n",
102
+ " continue\n",
103
+ " for t in d['_source']['nlp']['theme']:\n",
104
+ " tt[t] = tt.get(t, 0) + 1\n",
105
+ " tt = dict(sorted(tt.items(), key=lambda item: item[1]))\n",
106
+ "print(tt)"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "id": "9b735036-3a34-4d20-bae9-58f8b5bc7100",
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "priority_order = [\n",
117
+ " \"Economics\",\n",
118
+ " \"Financial Crime\",\n",
119
+ " \"Finance\",\n",
120
+ " \"Lifestyle\",\n",
121
+ " \"Automotive\",\n",
122
+ " \"Science\",\n",
123
+ " \"Tech\",\n",
124
+ " \"Travel\",\n",
125
+ " \"Weather\",\n",
126
+ " \"Health\",\n",
127
+ " \"Crime\",\n",
128
+ " \"Sports\",\n",
129
+ " \"General\",\n",
130
+ " \"Business\",\n",
131
+ " \"Politics\",\n",
132
+ " \"Entertainment\",\n",
133
+ " ]\n",
134
+ "\n",
135
+ "def sort_themes_by_priority(themes):\n",
136
+ " return sorted(\n",
137
+ " themes,\n",
138
+ " key=lambda theme: priority_order.index(theme)\n",
139
+ " if theme in priority_order else float(\"inf\")\n",
140
+ " )"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "e71c50e1-c159-4767-b366-f23153199e96",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "en_ds = {}\n",
151
+ "\n",
152
+ "files = ['art_en.json', 'art_en2.json', 'art_en2_specific.json']\n",
153
+ "ts = set()\n",
154
+ "for file in files:\n",
155
+ " fo = open('raw_articles/' + file)\n",
156
+ " for line in fo:\n",
157
+ " d = json.loads(line)\n",
158
+ " art = {\"id\": d[\"_id\"], **d['_source']}\n",
159
+ " if not art.get('nlp'):\n",
160
+ " continue\n",
161
+ " if not art['nlp'].get('new_embedding') or not art['nlp'].get('theme'):\n",
162
+ " continue\n",
163
+ " th = hash(art['title'].lower())\n",
164
+ " if th in ts:\n",
165
+ " continue\n",
166
+ " ts.add(th)\n",
167
+ " sts = art['nlp']['theme']\n",
168
+ " sts = sort_themes_by_priority(sts)\n",
169
+ " for st in sts:\n",
170
+ " if st not in en_ds:\n",
171
+ " en_ds[st] = []\n",
172
+ " if len(en_ds[st]) >= 80000:\n",
173
+ " continue\n",
174
+ " en_ds[st].append(art)\n",
175
+ " break"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "id": "71b6a2b2-20c1-4fb7-a10d-2c4a5eb1d1e4",
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "for st in en_ds:\n",
186
+ " print(st, '==>', len(en_ds[st]))"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "e8ffff0a-94ac-4d50-8ca1-179d163c458f",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "json.dump(en_ds, open('filtered/en_ds.json', 'w'))"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": null,
202
+ "id": "0cdd6962-d7f8-4c1b-aea5-979b0f622308",
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "from fast_langdetect import detect"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "cdb31010-0d34-47dd-8843-2fd417cea31f",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "ml_ds = {}\n",
217
+ "\n",
218
+ "files = ['art_non_en.json', 'art_non_en2.json', 'art_non_en_specific.json']\n",
219
+ "ts = set()\n",
220
+ "for file in files:\n",
221
+ " fo = open('raw_articles/' + file)\n",
222
+ " for line in fo:\n",
223
+ " d = json.loads(line)\n",
224
+ " art = {\"id\": d[\"_id\"], **d['_source']}\n",
225
+ " if not art.get('nlp'):\n",
226
+ " continue\n",
227
+ " if not art['nlp'].get('new_embedding') or not art['nlp'].get('theme'):\n",
228
+ " continue\n",
229
+ " th = hash(art['title'].lower())\n",
230
+ " if th in ts:\n",
231
+ " continue\n",
232
+ " ts.add(th)\n",
233
+ "\n",
234
+ " lang = detect(text=art['content'], model='full', k=1)[0]['lang']\n",
235
+ " if lang not in ml_ds:\n",
236
+ " ml_ds[lang] = {}\n",
237
+ "\n",
238
+ " sts = art['nlp']['theme']\n",
239
+ " sts = sort_themes_by_priority(sts)\n",
240
+ " for st in sts:\n",
241
+ " if st not in ml_ds[lang]:\n",
242
+ " ml_ds[lang][st] = []\n",
243
+ " if len(ml_ds[lang][st]) >= 5000:\n",
244
+ " continue\n",
245
+ " ml_ds[lang][st].append(art)\n",
246
+ " break"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "0dbe9913-f48f-4f08-9c25-89e9b4e9dc71",
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": []
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "222f0b88-0808-426b-8ace-f57727265f79",
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "for lang in ml_ds:\n",
265
+ " xx = {t: len(ml_ds[lang][t]) for t in ml_ds[lang]}\n",
266
+ " print(lang, '==>', xx)"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "id": "127ce293-80b4-4f9d-973f-8293df8865ad",
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "for lang in ml_ds:\n",
277
+ " tot = sum(len(ml_ds[lang][t]) for t in ml_ds[lang])\n",
278
+ " if tot < 1000:\n",
279
+ " continue\n",
280
+ " json.dump(ml_ds[lang], open('filtered/' + lang + '_ds.json', 'w'))"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "markdown",
285
+ "id": "44328527-b38f-4a69-90cd-92fbaa1e1fd3",
286
+ "metadata": {},
287
+ "source": [
288
+ "# 2. choose MMR candidates (take atleast 15K each theme and 100 per each lang, en=7K, other=8K)"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": null,
294
+ "id": "9ac418a0-0efd-4c77-ae86-4d8c8eb107c5",
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": [
298
+ "#!/usr/bin/env python3\n",
299
+ "from __future__ import annotations\n",
300
+ "\n",
301
+ "from typing import Any, Dict, List, Optional\n",
302
+ "import numpy as np\n",
303
+ "import torch\n",
304
+ "import faiss\n",
305
+ "import faiss.contrib.torch_utils # enables passing torch tensors to faiss on GPU\n",
306
+ "\n",
307
+ "\n",
308
+ "def torch_l2_normalize_(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:\n",
309
+ " # in-place row-wise normalize\n",
310
+ " norms = torch.linalg.norm(x, dim=1, keepdim=True).clamp_min(eps)\n",
311
+ " x.div_(norms)\n",
312
+ " return x\n",
313
+ "\n",
314
+ "\n",
315
+ "class FaissGpuDiverseSelectorTorch:\n",
316
+ " \"\"\"\n",
317
+ " GPU-only pipeline:\n",
318
+ " - build embeddings tensor on GPU (torch)\n",
319
+ " - normalize on GPU\n",
320
+ " - build GpuIndexFlatIP (cosine via normalization)\n",
321
+ " - diversity selection via GPU spherical clustering (on a SAMPLE) + GPU NN mapping\n",
322
+ " \"\"\"\n",
323
+ "\n",
324
+ " def __init__(\n",
325
+ " self,\n",
326
+ " embedding_key: str = \"new_embedding\",\n",
327
+ " gpu_id: int = 0,\n",
328
+ " seed: int = 123,\n",
329
+ " temp_mem_gb: float = 6.0,\n",
330
+ " use_float16_storage: bool = True,\n",
331
+ " build_batch_size: int = 16384, # batching reduces peak CPU RAM + helps conversion\n",
332
+ " ):\n",
333
+ " self.embedding_key = embedding_key\n",
334
+ " self.gpu_id = int(gpu_id)\n",
335
+ " self.seed = int(seed)\n",
336
+ " self.temp_mem_bytes = int(temp_mem_gb * 1024**3)\n",
337
+ " self.use_float16_storage = bool(use_float16_storage)\n",
338
+ " self.build_batch_size = int(build_batch_size)\n",
339
+ "\n",
340
+ " self.items: Optional[List[Dict[str, Any]]] = None\n",
341
+ " self.xb: Optional[torch.Tensor] = None # (N,d) on GPU, float32 normalized\n",
342
+ " self.res: Optional[faiss.StandardGpuResources] = None\n",
343
+ " self.index: Optional[faiss.GpuIndexFlatIP] = None\n",
344
+ " self.d: Optional[int] = None\n",
345
+ "\n",
346
+ " def fit(self, items: List[Dict[str, Any]]) -> \"FaissGpuDiverseSelectorTorch\":\n",
347
+ " ngpu = faiss.get_num_gpus()\n",
348
+ " if ngpu <= 0:\n",
349
+ " raise RuntimeError(\"faiss.get_num_gpus()==0. You are not using faiss-gpu / CUDA not visible.\")\n",
350
+ " if self.gpu_id >= ngpu:\n",
351
+ " raise RuntimeError(f\"gpu_id={self.gpu_id} out of range; FAISS sees {ngpu} GPUs.\")\n",
352
+ "\n",
353
+ " self.items = items\n",
354
+ " n = len(items)\n",
355
+ " if n == 0:\n",
356
+ " raise ValueError(\"Empty items\")\n",
357
+ "\n",
358
+ " # Infer dim from first embedding\n",
359
+ " first = items[0][self.embedding_key]\n",
360
+ " d = len(first)\n",
361
+ " self.d = d\n",
362
+ "\n",
363
+ " # Build GPU tensor in batches (still CPU->GPU copy, but avoids extra NumPy overhead)\n",
364
+ " dev = torch.device(f\"cuda:{self.gpu_id}\")\n",
365
+ " chunks: List[torch.Tensor] = []\n",
366
+ "\n",
367
+ " bs = self.build_batch_size\n",
368
+ " for start in range(0, n, bs):\n",
369
+ " end = min(start + bs, n)\n",
370
+ " # NOTE: this Python loop is unavoidable with list-of-dicts.\n",
371
+ " # If you can store embeddings as a single array/tensor upstream, do that instead.\n",
372
+ " batch = np.asarray([items[i][self.embedding_key] for i in range(start, end)], dtype=np.float32)\n",
373
+ " t = torch.from_numpy(batch).to(device=dev, non_blocking=False)\n",
374
+ " chunks.append(t)\n",
375
+ "\n",
376
+ " xb = torch.cat(chunks, dim=0) # (N,d) float32 on GPU\n",
377
+ "\n",
378
+ " # Normalize on GPU\n",
379
+ " torch_l2_normalize_(xb)\n",
380
+ "\n",
381
+ " # Create and keep GPU resources + big temp memory\n",
382
+ " res = faiss.StandardGpuResources()\n",
383
+ " res.setTempMemory(self.temp_mem_bytes)\n",
384
+ " self.res = res\n",
385
+ "\n",
386
+ " # GPU IP index (cosine via normalization)\n",
387
+ " cfg = faiss.GpuIndexFlatConfig()\n",
388
+ " cfg.device = self.gpu_id\n",
389
+ " cfg.useFloat16 = self.use_float16_storage\n",
390
+ "\n",
391
+ " index = faiss.GpuIndexFlatIP(res, d, cfg)\n",
392
+ "\n",
393
+ " # Add directly from GPU torch tensor (faiss.contrib.torch_utils)\n",
394
+ " index.add(xb)\n",
395
+ "\n",
396
+ " self.xb = xb\n",
397
+ " self.index = index\n",
398
+ " return self\n",
399
+ "\n",
400
+ " def select(\n",
401
+ " self,\n",
402
+ " num_select: int,\n",
403
+ " train_per_centroid: int = 500, # IMPORTANT: keep this modest to avoid “forever”\n",
404
+ " niter: int = 6,\n",
405
+ " nredo: int = 1,\n",
406
+ " centroid_search_k: int = 128,\n",
407
+ " ) -> List[Dict[str, Any]]:\n",
408
+ " if self.items is None or self.xb is None or self.index is None or self.res is None or self.d is None:\n",
409
+ " raise RuntimeError(\"Call fit() first.\")\n",
410
+ "\n",
411
+ " items = self.items\n",
412
+ " xb = self.xb\n",
413
+ " index = self.index\n",
414
+ " res = self.res\n",
415
+ " d = self.d\n",
416
+ "\n",
417
+ " N = xb.shape[0]\n",
418
+ " k = min(int(num_select), int(N))\n",
419
+ " if k <= 0:\n",
420
+ " return []\n",
421
+ "\n",
422
+ " # Sample training data ON GPU\n",
423
+ " train_sz = min(int(N), k * int(train_per_centroid))\n",
424
+ " g = torch.Generator(device=xb.device)\n",
425
+ " g.manual_seed(self.seed)\n",
426
+ "\n",
427
+ " if train_sz < N:\n",
428
+ " perm = torch.randperm(int(N), generator=g, device=xb.device)\n",
429
+ " train_idx = perm[:train_sz]\n",
430
+ " xtrain = xb.index_select(0, train_idx)\n",
431
+ " else:\n",
432
+ " xtrain = xb\n",
433
+ "\n",
434
+ " # FAISS GPU spherical clustering\n",
435
+ " clus = faiss.Clustering(d, k)\n",
436
+ " clus.seed = self.seed\n",
437
+ " clus.niter = int(niter)\n",
438
+ " clus.nredo = int(nredo)\n",
439
+ " clus.spherical = True\n",
440
+ " clus.verbose = False\n",
441
+ "\n",
442
+ " cfg = faiss.GpuIndexFlatConfig()\n",
443
+ " cfg.device = self.gpu_id\n",
444
+ " cfg.useFloat16 = self.use_float16_storage\n",
445
+ " assign_index = faiss.GpuIndexFlatIP(res, d, cfg)\n",
446
+ "\n",
447
+ " clus.train(xtrain.cpu().numpy(), assign_index)\n",
448
+ "\n",
449
+ " centroids = faiss.vector_to_array(clus.centroids).reshape(k, d).astype(np.float32, copy=False)\n",
450
+ " # Move centroids to GPU torch, normalize, then search on GPU\n",
451
+ " C = torch.from_numpy(centroids).to(device=xb.device)\n",
452
+ " torch_l2_normalize_(C)\n",
453
+ "\n",
454
+ " centroid_search_k = max(int(centroid_search_k), 1)\n",
455
+ " _, I = index.search(C, centroid_search_k) # I is a torch tensor (thanks to torch_utils)\n",
456
+ "\n",
457
+ " chosen: List[int] = []\n",
458
+ " used = set()\n",
459
+ "\n",
460
+ " I_cpu = I.to(\"cpu\").numpy() # small: (k, centroid_search_k)\n",
461
+ " for r in range(k):\n",
462
+ " pick = None\n",
463
+ " for j in range(centroid_search_k):\n",
464
+ " idx = int(I_cpu[r, j])\n",
465
+ " if idx >= 0 and idx not in used:\n",
466
+ " pick = idx\n",
467
+ " break\n",
468
+ " if pick is not None:\n",
469
+ " used.add(pick)\n",
470
+ " chosen.append(pick)\n",
471
+ "\n",
472
+ " # Fill if duplicates reduced count\n",
473
+ " if len(chosen) < k:\n",
474
+ " for r in range(k):\n",
475
+ " if len(chosen) >= k:\n",
476
+ " break\n",
477
+ " for j in range(centroid_search_k):\n",
478
+ " idx = int(I_cpu[r, j])\n",
479
+ " if idx >= 0 and idx not in used:\n",
480
+ " used.add(idx)\n",
481
+ " chosen.append(idx)\n",
482
+ " if len(chosen) >= k:\n",
483
+ " break\n",
484
+ "\n",
485
+ " return [items[i] for i in chosen[:k]]"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": null,
491
+ "id": "2ab2368e-7ec0-4870-b092-243b6c45bd70",
492
+ "metadata": {},
493
+ "outputs": [],
494
+ "source": []
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": null,
499
+ "id": "d5fd1433-1ffd-489f-8bc7-bca2d76a22d4",
500
+ "metadata": {},
501
+ "outputs": [],
502
+ "source": []
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "id": "c4a937d7-e6bf-4f98-bbcd-ed9fb746b731",
508
+ "metadata": {},
509
+ "outputs": [],
510
+ "source": [
511
+ "files = os.listdir('filtered')\n",
512
+ "files = [i for i in files if i != 'en_ds.json' and i.endswith('.json')]"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": null,
518
+ "id": "c3c93e4b-5f2e-4ad2-9b39-59942539bce5",
519
+ "metadata": {
520
+ "scrolled": true
521
+ },
522
+ "outputs": [],
523
+ "source": [
524
+ "non_en = {}\n",
525
+ "for file in files:\n",
526
+ " data = json.load(open('filtered/'+file))\n",
527
+ " for theme in data:\n",
528
+ " data[theme] = [{\"id\": i[\"id\"], \"title\": i[\"title\"], \"content\": i[\"content\"], \"language\": i[\"language\"], **i[\"nlp\"]} for i in data[theme]]\n",
529
+ " mmr_cands = len(data[theme])//5\n",
530
+ " if mmr_cands <= 2:\n",
531
+ " continue\n",
532
+ " selector = FaissGpuDiverseSelectorTorch(\n",
533
+ " embedding_key=\"new_embedding\",\n",
534
+ " gpu_id=0,\n",
535
+ " temp_mem_gb=6.0,\n",
536
+ " use_float16_storage=True,\n",
537
+ " build_batch_size=8192,\n",
538
+ " ).fit(data[theme])\n",
539
+ " picked = selector.select(\n",
540
+ " num_select=mmr_cands,\n",
541
+ " train_per_centroid=500,\n",
542
+ " niter=100,\n",
543
+ " centroid_search_k=512,\n",
544
+ " )\n",
545
+ " if theme not in non_en:\n",
546
+ " non_en[theme] = []\n",
547
+ " print(file, theme, len(data[theme]), len(picked), mmr_cands)\n",
548
+ " non_en[theme].extend(picked)\n"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": null,
554
+ "id": "499a2017-1bdc-44ea-b886-6a55bf7f8a36",
555
+ "metadata": {
556
+ "scrolled": true
557
+ },
558
+ "outputs": [],
559
+ "source": [
560
+ "final_ds = {}\n",
561
+ "for theme in non_en:\n",
562
+ " selector = FaissGpuDiverseSelectorTorch(\n",
563
+ " embedding_key=\"new_embedding\",\n",
564
+ " gpu_id=0,\n",
565
+ " temp_mem_gb=6.0,\n",
566
+ " use_float16_storage=True,\n",
567
+ " build_batch_size=8192,\n",
568
+ " ).fit(non_en[theme])\n",
569
+ " picked = selector.select(\n",
570
+ " num_select=7000,\n",
571
+ " train_per_centroid=500,\n",
572
+ " niter=100,\n",
573
+ " centroid_search_k=512,\n",
574
+ " )\n",
575
+ " if theme not in final_ds:\n",
576
+ " final_ds[theme] = []\n",
577
+ " print(theme, len(non_en[theme]), len(picked))\n",
578
+ " for i in picked:\n",
579
+ " del i['new_embedding']\n",
580
+ " final_ds[theme].extend(picked)\n",
581
+ "\n",
582
+ "json.dump(final_ds, open('filtered2/ml_step1.json', 'w'))"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": null,
588
+ "id": "7161b895-b229-47a6-9082-b493fbd7428a",
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": [
592
+ "en = json.load(open('filtered/en_ds.json'))"
593
+ ]
594
+ },
595
+ {
596
+ "cell_type": "code",
597
+ "execution_count": null,
598
+ "id": "ce6213a5-40bb-454e-950b-b4b23e045e13",
599
+ "metadata": {
600
+ "scrolled": true
601
+ },
602
+ "outputs": [],
603
+ "source": [
604
+ "final_ds = {}\n",
605
+ "for theme in en:\n",
606
+ " en[theme] = [{\"id\": i[\"id\"], \"title\": i[\"title\"], \"content\": i[\"content\"], \"language\": i[\"language\"], **i[\"nlp\"]} for i in en[theme]]\n",
607
+ " selector = FaissGpuDiverseSelectorTorch(\n",
608
+ " embedding_key=\"new_embedding\",\n",
609
+ " gpu_id=0,\n",
610
+ " temp_mem_gb=6.0,\n",
611
+ " use_float16_storage=True,\n",
612
+ " build_batch_size=8192,\n",
613
+ " ).fit(en[theme])\n",
614
+ " picked = selector.select(\n",
615
+ " num_select=8000,\n",
616
+ " train_per_centroid=500,\n",
617
+ " niter=100,\n",
618
+ " centroid_search_k=512,\n",
619
+ " )\n",
620
+ " if theme not in final_ds:\n",
621
+ " final_ds[theme] = []\n",
622
+ " print(theme, len(en[theme]), len(picked))\n",
623
+ " for i in picked:\n",
624
+ " del i['new_embedding']\n",
625
+ " final_ds[theme].extend(picked)\n",
626
+ "\n",
627
+ "json.dump(final_ds, open('filtered2/en_step1.json', 'w'))"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "markdown",
632
+ "id": "842369c9-6624-404d-808b-1e39a4a6a1b0",
633
+ "metadata": {},
634
+ "source": [
635
+ "# 3.calculate embedding of qwen"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "code",
640
+ "execution_count": null,
641
+ "id": "bfce123b-6d47-4353-b36c-3db708274742",
642
+ "metadata": {},
643
+ "outputs": [],
644
+ "source": [
645
+ "import requests\n",
646
+ "from requests.adapters import HTTPAdapter\n",
647
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
648
+ "from typing import List, Dict, Any, Optional, Tuple\n",
649
+ "\n",
650
+ "def _make_session(pool_size: int) -> requests.Session:\n",
651
+ " s = requests.Session()\n",
652
+ " adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size, max_retries=0)\n",
653
+ " s.mount(\"http://\", adapter)\n",
654
+ " s.mount(\"https://\", adapter)\n",
655
+ " return s\n",
656
+ "\n",
657
+ "\n",
658
+ "def _embed_batch(\n",
659
+ " session: requests.Session,\n",
660
+ " base_url: str,\n",
661
+ " text: str,\n",
662
+ " timeout_s: float,\n",
663
+ " extra_headers = {'x-api-token': '***'},\n",
664
+ ") -> Any:\n",
665
+ " \"\"\"\n",
666
+ " Native TEI endpoint: POST {base_url}/embed\n",
667
+ " Payload: {\"inputs\": [..texts..]}\n",
668
+ " \"\"\"\n",
669
+ " url = base_url.rstrip(\"/\") + \"/qwen\"\n",
670
+ " headers = {\"Content-Type\": \"application/json\"}\n",
671
+ " if not text.startswith('query: '):\n",
672
+ " text = 'query: ' + text\n",
673
+ " if extra_headers:\n",
674
+ " headers.update(extra_headers)\n",
675
+ " r = session.post(url, json={\"inputs\": text}, headers=headers, timeout=timeout_s)\n",
676
+ " r.raise_for_status()\n",
677
+ " return r.json()\n",
678
+ "\n",
679
+ "def one_call(art):\n",
680
+ " if \"embedding\" in art:\n",
681
+ " return\n",
682
+ " text = []\n",
683
+ " if art.get(\"title\"):\n",
684
+ " text.append(art[\"title\"])\n",
685
+ " if art.get(\"content\"):\n",
686
+ " text.append(art[\"content\"])\n",
687
+ " text = \"\\n\\n\".join(text)\n",
688
+ "    try:\n",
689
+ "        res = _embed_batch(session, base_url, text, timeout_s=90)\n",
690
+ "        art[\"embedding\"] = res[0]\n",
691
+ "    except Exception as e:\n",
692
+ "        print(e)\n",
693
+ "\n",
694
+ "WORKERS = 512\n",
695
+ "session = _make_session(pool_size=WORKERS)\n",
696
+ "base_url = \"http://65.19.132.154:9001\""
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": null,
702
+ "id": "e5e64eac-dc40-41ec-a497-45ce9cb2f637",
703
+ "metadata": {},
704
+ "outputs": [],
705
+ "source": []
706
+ },
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": null,
710
+ "id": "7304ec1b-2a7a-42f7-a0fb-f6e5e6c4eb94",
711
+ "metadata": {},
712
+ "outputs": [],
713
+ "source": [
714
+ "for file in ['en_step1.json', 'ml_step1.json']:\n",
715
+ " data = json.load(open('filtered2/' + file))\n",
716
+ " for theme in data:\n",
717
+ " with ThreadPoolExecutor(max_workers=WORKERS) as ex:\n",
718
+ " _ = [ex.submit(one_call, art) for art in data[theme]]\n",
719
+ " json.dump(data, open('filtered2/' + file, 'w'))"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "markdown",
724
+ "id": "59b381bb-d4ca-4414-b299-3da7531faed3",
725
+ "metadata": {
726
+ "jp-MarkdownHeadingCollapsed": true
727
+ },
728
+ "source": [
729
+ "# 4. again choose MMR based on qwen embedding also make list of llm verify candidate when qwen emb not matching with given themes (5K each)"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": null,
735
+ "id": "80e76f11-f5de-4c3e-b130-5646b4bf5273",
736
+ "metadata": {},
737
+ "outputs": [],
738
+ "source": [
739
+ "from fast_langdetect import detect\n",
740
+ "\n",
741
+ "en_data, ml_data = [], []\n",
742
+ "data = json.load(open('filtered2/en_step1.json'))\n",
743
+ "for theme in data:\n",
744
+ " en_data.extend(data[theme])\n",
745
+ "\n",
746
+ "data = json.load(open('filtered2/ml_step1.json'))\n",
747
+ "for theme in data:\n",
748
+ " ml_data.extend(data[theme])\n",
749
+ "\n",
750
+ "for file in ['train1.json', 'spanish_tagged.json', 'train_multilang_2.json']:\n",
751
+ " data = json.load(open('filtered2/' + file))\n",
752
+ " for i in data:\n",
753
+ " if not i.get('content'):\n",
754
+ " continue\n",
755
+ " l = i.get('language')\n",
756
+ " if not l:\n",
757
+ "            l = i.get('lang')\n",
758
+ " if not l:\n",
759
+ " l = detect(text=i['content'], model='full', k=1)[0]['lang']\n",
760
+ " t = {**i, \"language\": l, \"file\": file}\n",
761
+ " if l == 'en':\n",
762
+ " en_data.append(t)\n",
763
+ " else:\n",
764
+ " ml_data.append(t)"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "code",
769
+ "execution_count": null,
770
+ "id": "1a435d53-1cf9-46e3-809a-208bb0a76122",
771
+ "metadata": {},
772
+ "outputs": [],
773
+ "source": [
774
+ "# for i in ml_data:\n",
775
+ "# if \"embedding\" in i:\n",
776
+ "# del i[\"embedding\"]"
777
+ ]
778
+ },
779
+ {
780
+ "cell_type": "code",
781
+ "execution_count": null,
782
+ "id": "b5979e1a-4e94-45ee-937d-102550918ef7",
783
+ "metadata": {},
784
+ "outputs": [],
785
+ "source": [
786
+ "with ThreadPoolExecutor(max_workers=WORKERS) as ex:\n",
787
+ " _ = [ex.submit(one_call, art) for art in en_data]"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "id": "63e62230-bc41-442c-a612-08b22abdd0b3",
794
+ "metadata": {},
795
+ "outputs": [],
796
+ "source": [
797
+ "with ThreadPoolExecutor(max_workers=WORKERS) as ex:\n",
798
+ " _ = [ex.submit(one_call, art) for art in ml_data]"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": null,
804
+ "id": "59d5c0cf-37ab-406c-ad53-042c7af451cd",
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": [
808
+ "len(en_data), len(ml_data)"
809
+ ]
810
+ },
811
+ {
812
+ "cell_type": "code",
813
+ "execution_count": null,
814
+ "id": "3e5543ef-53d6-4e97-923d-203a0382e878",
815
+ "metadata": {},
816
+ "outputs": [],
817
+ "source": [
818
+ "json.dump(en_data, open('filtered/en_step2.json', 'w'))"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": null,
824
+ "id": "b8cedffd-36f8-4684-8126-2b6b78d67aec",
825
+ "metadata": {},
826
+ "outputs": [],
827
+ "source": [
828
+ "json.dump(ml_data, open('filtered/ml_step2.json', 'w'))"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "b5c6d123-12a2-4275-a4b0-1d43951ed922",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "# easy to confirm with theme embedding first and find mismatches\n",
839
+ "themes = [\n",
840
+ " \"Economics\",\n",
841
+ " \"Financial Crime\",\n",
842
+ " \"Finance\",\n",
843
+ " \"Lifestyle\",\n",
844
+ " \"Automotive\",\n",
845
+ " \"Science\",\n",
846
+ " \"Tech\",\n",
847
+ " \"Travel\",\n",
848
+ " \"Weather\",\n",
849
+ " \"Health\",\n",
850
+ " \"Crime\",\n",
851
+ " \"Sports\",\n",
852
+ " \"General\",\n",
853
+ " \"Business\",\n",
854
+ " \"Politics\",\n",
855
+ " \"Entertainment\",\n",
856
+ " ]\n",
857
+ "theme_emb = {i: _embed_batch(session, base_url, i, timeout_s=90)[0] for i in themes}"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "code",
862
+ "execution_count": null,
863
+ "id": "d041e14a-cb28-4714-afbe-582ccb64fcbb",
864
+ "metadata": {},
865
+ "outputs": [],
866
+ "source": [
867
+ "import numpy as np\n",
868
+ "\n",
869
+ "def l2_normalize(x, axis=-1, eps=1e-12):\n",
870
+ " x = np.asarray(x, dtype=np.float32)\n",
871
+ " return x / (np.linalg.norm(x, axis=axis, keepdims=True) + eps)\n",
872
+ "\n",
873
+ "def prepare_theme_matrix(theme_to_emb: dict):\n",
874
+ " theme_names = list(theme_to_emb.keys())\n",
875
+ " theme_mat = np.vstack([theme_to_emb[t] for t in theme_names]).astype(np.float32)\n",
876
+ " theme_mat = l2_normalize(theme_mat, axis=1) # (T, D)\n",
877
+ " return theme_names, theme_mat\n",
878
+ "\n",
879
+ "def similarities(article_mat, theme_mat):\n",
880
+ " # article_mat: (N, D) theme_mat: (T, D)\n",
881
+ " article_mat = l2_normalize(article_mat, axis=1)\n",
882
+ " return article_mat @ theme_mat.T # (N, T) cosine similarities\n",
883
+ "\n",
884
+ "def pick_by_gap_row(scores, gap=0.10, min_score=0.25, max_k=3):\n",
885
+ " # scores: (T,) raw similarity\n",
886
+ " idx = np.argsort(scores)[::-1]\n",
887
+ " s = scores[idx]\n",
888
+ "\n",
889
+ " # keep while above floor\n",
890
+ " keep = np.where(s >= min_score)[0]\n",
891
+ " if keep.size == 0:\n",
892
+ " return idx[:0] # none\n",
893
+ " last = keep[-1]\n",
894
+ "\n",
895
+ " # stop at first big drop\n",
896
+ " drops = s[:-1] - s[1:]\n",
897
+ " cut_points = np.where(drops >= gap)[0]\n",
898
+ " if cut_points.size > 0:\n",
899
+ " last = min(last, cut_points[0])\n",
900
+ "\n",
901
+ " last = min(last, max_k - 1) # cap number of themes\n",
902
+ " return idx[: last + 1]\n",
903
+ "\n",
904
+ "def assign_themes(article_mat, theme_to_emb, gap=0.08, min_score=0.2, max_k=5):\n",
905
+ " theme_names, theme_mat = prepare_theme_matrix(theme_to_emb)\n",
906
+ " sim = similarities(article_mat, theme_mat) # (N, T)\n",
907
+ "\n",
908
+ " results = []\n",
909
+ " for i in range(sim.shape[0]):\n",
910
+ " chosen_idx = pick_by_gap_row(sim[i], gap=gap, min_score=min_score, max_k=max_k)\n",
911
+ " results.append([theme_names[j] for j in chosen_idx])\n",
912
+ " return results\n",
913
+ "\n",
914
+ "def is_prediction_on_top(my_prediction, emb_prediction):\n",
915
+ " \"\"\"\n",
916
+ " Returns True if:\n",
917
+ " - len(my_prediction)==1: the single predicted theme is the top-1 embedding theme\n",
918
+ " - len(my_prediction)>1: the top-K embedding themes (K=len(my_prediction)) match my_prediction as a set\n",
919
+ " (order-insensitive), meaning all predicted themes are \"on top\" together.\n",
920
+ " \"\"\"\n",
921
+ " if not my_prediction or not emb_prediction:\n",
922
+ " return False\n",
923
+ "\n",
924
+ " k = len(my_prediction)\n",
925
+ " topk = emb_prediction[:k]\n",
926
+ "\n",
927
+ " if k == 1:\n",
928
+ " return my_prediction[0] == emb_prediction[0]\n",
929
+ "\n",
930
+ " return set(my_prediction) == set(topk)"
931
+ ]
932
+ },
933
+ {
934
+ "cell_type": "code",
935
+ "execution_count": null,
936
+ "id": "7c72387e-4343-47f0-88fd-ef964a03fbcd",
937
+ "metadata": {},
938
+ "outputs": [],
939
+ "source": [
940
+ "is_prediction_on_top(en_data[500]['themes'], assign_themes([en_data[500]['embedding']], theme_emb)[0])"
941
+ ]
942
+ },
943
+ {
944
+ "cell_type": "code",
945
+ "execution_count": null,
946
+ "id": "61121189-f649-4660-a4d0-973cae54de6e",
947
+ "metadata": {},
948
+ "outputs": [],
949
+ "source": [
950
+ "results = assign_themes([i['embedding'] for i in en_data], theme_emb)\n",
951
+ "nb = 0\n",
952
+ "for i, j in zip(en_data, results):\n",
953
+ " if is_prediction_on_top(i['themes'], j):\n",
954
+ " i['need_to_validate'] = False\n",
955
+ " nb += 1\n",
956
+ " else:\n",
957
+ " i['need_to_validate'] = True\n",
958
+ "nb, len(en_data)"
959
+ ]
960
+ },
961
+ {
962
+ "cell_type": "code",
963
+ "execution_count": null,
964
+ "id": "2895b09b-130c-4c57-b0e3-4b8bcbc41f0c",
965
+ "metadata": {},
966
+ "outputs": [],
967
+ "source": [
968
+ "results = assign_themes([i['embedding'] for i in ml_data], theme_emb)\n",
969
+ "nb = 0\n",
970
+ "for i, j in zip(ml_data, results):\n",
971
+ " if is_prediction_on_top(i['themes'], j):\n",
972
+ " i['need_to_validate'] = False\n",
973
+ " nb += 1\n",
974
+ " else:\n",
975
+ " i['need_to_validate'] = True\n",
976
+ "nb, len(ml_data)"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": null,
982
+ "id": "446c19a3-afab-4100-964a-093969034e91",
983
+ "metadata": {},
984
+ "outputs": [],
985
+ "source": [
986
+ "import uuid\n",
987
+ "str(uuid.uuid4())"
988
+ ]
989
+ },
990
+ {
991
+ "cell_type": "code",
992
+ "execution_count": null,
993
+ "id": "93dc427c-cebd-4376-a565-ff9e8283077f",
994
+ "metadata": {},
995
+ "outputs": [],
996
+ "source": [
997
+ "nid, nt = 0, 0\n",
998
+ "for i in en_data + ml_data:\n",
999
+ " if not i.get('id'):\n",
1000
+ " i['id'] = str(uuid.uuid4())\n",
1001
+ " nid += 1\n",
1002
+ " if not i.get('title'):\n",
1003
+ " nt += 1\n",
1004
+ "len(en_data + ml_data), nid, nt"
1005
+ ]
1006
+ },
1007
+ {
1008
+ "cell_type": "code",
1009
+ "execution_count": null,
1010
+ "id": "edfe0095-a239-4df6-9c1c-8d8f6033892b",
1011
+ "metadata": {},
1012
+ "outputs": [],
1013
+ "source": [
1014
+ "json.dump(en_data, open('filtered2/en_step2.json', 'w'))"
1015
+ ]
1016
+ },
1017
+ {
1018
+ "cell_type": "code",
1019
+ "execution_count": null,
1020
+ "id": "6ed59f14-c070-41a5-bf81-2f21fb89e797",
1021
+ "metadata": {},
1022
+ "outputs": [],
1023
+ "source": [
1024
+ "json.dump(ml_data, open('filtered2/ml_step2.json', 'w'))"
1025
+ ]
1026
+ },
1027
+ {
1028
+ "cell_type": "code",
1029
+ "execution_count": null,
1030
+ "id": "67d594c0-6eab-4191-855e-b98e1d490649",
1031
+ "metadata": {},
1032
+ "outputs": [],
1033
+ "source": []
1034
+ },
1035
+ {
1036
+ "cell_type": "code",
1037
+ "execution_count": null,
1038
+ "id": "7d612263-349d-4ae6-81f3-baf643452c1c",
1039
+ "metadata": {
1040
+ "scrolled": true
1041
+ },
1042
+ "outputs": [],
1043
+ "source": [
1044
+ "en_themewise = {}\n",
1045
+ "for i in en_data:\n",
1046
+ " sts = sort_themes_by_priority(i['themes'])\n",
1047
+ " for st in sts:\n",
1048
+ " if st not in en_themewise:\n",
1049
+ " en_themewise[st] = []\n",
1050
+ " en_themewise[st].append(i)\n",
1051
+ " break\n",
1052
+ "for i in en_themewise:\n",
1053
+ " tcc = len(en_themewise[i])\n",
1054
+ " selector = FaissGpuDiverseSelectorTorch(\n",
1055
+ " embedding_key=\"embedding\",\n",
1056
+ " gpu_id=0,\n",
1057
+ " temp_mem_gb=6.0,\n",
1058
+ " use_float16_storage=True,\n",
1059
+ " build_batch_size=8192,\n",
1060
+ " ).fit(en_themewise[i])\n",
1061
+ " picked = selector.select(\n",
1062
+ " num_select=5000,\n",
1063
+ " train_per_centroid=500,\n",
1064
+ " niter=100,\n",
1065
+ " centroid_search_k=512,\n",
1066
+ " )\n",
1067
+ " en_themewise[i] = picked\n",
1068
+ " print(i, '==>', tcc, len(picked))\n",
1069
+ "en_data = []\n",
1070
+ "for theme in en_themewise:\n",
1071
+ " en_data.extend(en_themewise[theme])\n",
1072
+ "json.dump(en_data, open('filtered2/en_step3.json', 'w'))"
1073
+ ]
1074
+ },
1075
+ {
1076
+ "cell_type": "code",
1077
+ "execution_count": null,
1078
+ "id": "f5d4d3c4-9248-46b2-ba52-51dd4e22838d",
1079
+ "metadata": {
1080
+ "scrolled": true
1081
+ },
1082
+ "outputs": [],
1083
+ "source": [
1084
+ "ml_themewise = {}\n",
1085
+ "for i in ml_data:\n",
1086
+ " sts = sort_themes_by_priority(i['themes'])\n",
1087
+ " for st in sts:\n",
1088
+ " if st not in ml_themewise:\n",
1089
+ " ml_themewise[st] = []\n",
1090
+ " ml_themewise[st].append(i)\n",
1091
+ " break\n",
1092
+ "for i in ml_themewise:\n",
1093
+ " print(i, '==>', len(ml_themewise[i]))\n",
1094
+ " tcc = len(ml_themewise[i])\n",
1095
+ " selector = FaissGpuDiverseSelectorTorch(\n",
1096
+ " embedding_key=\"embedding\",\n",
1097
+ " gpu_id=0,\n",
1098
+ " temp_mem_gb=6.0,\n",
1099
+ " use_float16_storage=True,\n",
1100
+ " build_batch_size=8192,\n",
1101
+ " ).fit(ml_themewise[i])\n",
1102
+ " picked = selector.select(\n",
1103
+ " num_select=5000,\n",
1104
+ " train_per_centroid=500,\n",
1105
+ " niter=100,\n",
1106
+ " centroid_search_k=512,\n",
1107
+ " )\n",
1108
+ " ml_themewise[i] = picked\n",
1109
+ " print(i, '==>', tcc, len(picked))\n",
1110
+ "ml_data = []\n",
1111
+ "for theme in ml_themewise:\n",
1112
+ " ml_data.extend(ml_themewise[theme])\n",
1113
+ "json.dump(ml_data, open('filtered2/ml_step3.json', 'w'))\n",
1114
+ "len(ml_data)"
1115
+ ]
1116
+ },
1117
+ {
1118
+ "cell_type": "markdown",
1119
+ "id": "17b1492a-0b40-43b3-9a46-3fea7fd4a7f6",
1120
+ "metadata": {},
1121
+ "source": [
1122
+ "# 5. confirm with LLM for selected candidates"
1123
+ ]
1124
+ },
1125
+ {
1126
+ "cell_type": "code",
1127
+ "execution_count": null,
1128
+ "id": "033d00e6-9068-4705-a8a5-7a0aa4828d52",
1129
+ "metadata": {},
1130
+ "outputs": [],
1131
+ "source": [
1132
+ "nb = 0\n",
1133
+ "for i in en_data + ml_data:\n",
1134
+ " if i.get('need_to_validate'):\n",
1135
+ " nb += 1\n",
1136
+ "nb, len(en_data + ml_data)"
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "execution_count": 2,
1142
+ "id": "6701aa6e-2797-4083-9496-1308d71ffbde",
1143
+ "metadata": {},
1144
+ "outputs": [],
1145
+ "source": [
1146
+ "import json\n",
1147
+ "import logging\n",
1148
+ "import time\n",
1149
+ "import random\n",
1150
+ "\n",
1151
+ "import openai\n",
1152
+ "\n",
1153
+ "OPENAI_KEY = '***'\n",
1154
+ "MAX_OPENAI_RETRIES = 5\n",
1155
+ "\n",
1156
+ "openai.api_key = OPENAI_KEY\n",
1157
+ "\n",
1158
+ "\n",
1159
+ "def extract_function_data(\n",
1160
+ " function: dict,\n",
1161
+ " content: str,\n",
1162
+ " target_model: str = 'gpt-4o-mini',\n",
1163
+ " role: str = \"You are a zero-shot classification model.\",\n",
1164
+ " retries: int = 0,\n",
1165
+ " extra_tags: dict = None\n",
1166
+ "):\n",
1167
+ " retries += 1\n",
1168
+ " if extra_tags:\n",
1169
+ " openpipe_tags.update(extra_tags)\n",
1170
+ " if retries > MAX_OPENAI_RETRIES:\n",
1171
+ " logging.error(\"Failed to Extract Event Data\", extra={\n",
1172
+ " \"content_length\": len(content.split(\" \")),\n",
1173
+ " \"error_message\": f\"Failed to extract function data after {MAX_OPENAI_RETRIES} retries\"\n",
1174
+ " })\n",
1175
+ " return None\n",
1176
+ " try:\n",
1177
+ " response = openai.chat.completions.create(\n",
1178
+ " model=target_model,\n",
1179
+ " messages=[{\"role\": \"system\", \"content\": role},\n",
1180
+ " {\"role\": \"user\", \"content\": content}],\n",
1181
+ " functions=[function],\n",
1182
+ " function_call={\"name\": function[\"name\"]},\n",
1183
+ " temperature=0\n",
1184
+ " )\n",
1185
+ "\n",
1186
+ " # Except rate limit error\n",
1187
+ " except openai.RateLimitError as e:\n",
1188
+ " # Ask the question again with 3/4 of the content\n",
1189
+ " logging.error(\"Retrying OpenAI Request\", extra={\n",
1190
+ " \"error\": str(e),\n",
1191
+ " \"error_message\": \"API limit reached, try again after 5 seconds\",\n",
1192
+ " })\n",
1193
+ " # sleep for random time between 5 to 15 seconds\n",
1194
+ " time.sleep(random.randint(5, 15))\n",
1195
+ "        return extract_function_data(function, content, target_model, retries=retries)\n",
1196
+ "\n",
1197
+ " # Except all other errors\n",
1198
+ " except Exception as e:\n",
1199
+ " logging.error(\"Retrying OpenAI Request\", extra={\n",
1200
+ " \"error\": str(e),\n",
1201
+ " \"error_message\": \"Unkown Error, Retrying with 3/4 of the content\",\n",
1202
+ " })\n",
1203
+ " logging.error(e)\n",
1204
+ " content_length = len(content.split(\" \"))\n",
1205
+ " updated_content = \" \".join(content.split(\" \")[:int(content_length * 0.75)])\n",
1206
+ "\n",
1207
+ "        return extract_function_data(function, updated_content, target_model,\n",
1208
+ "                                     retries=retries)\n",
1209
+ "\n",
1210
+ " try:\n",
1211
+ " result = json.loads(response.choices[0].message.function_call.arguments)\n",
1212
+ " ## Remove Keys with empty string value because OpenAI is returning empty strings as keys sometimes\n",
1213
+ " result = {k: v for k, v in result.items() if v not in [\"\", [], None]}\n",
1214
+ "\n",
1215
+ " logging.info(\"Successfully extracted Event Data\", extra={\n",
1216
+ " \"error_message\": \"Content Classified Successfully\",\n",
1217
+ " \"content\": str(result)\n",
1218
+ " })\n",
1219
+ " return result\n",
1220
+ " except Exception as e:\n",
1221
+ " logging.error(\"Failed to Parse JSON from OpenAI in Extraction\", extra={\n",
1222
+ " \"error\": str(e),\n",
1223
+ " \"openai_response\": str(response),\n",
1224
+ " })\n",
1225
+ " return None"
1226
+ ]
1227
+ },
1228
+ {
1229
+ "cell_type": "code",
1230
+ "execution_count": null,
1231
+ "id": "d2992602-ab1b-43ad-a2c2-ede79671affc",
1232
+ "metadata": {},
1233
+ "outputs": [],
1234
+ "source": [
1235
+ "int_theme_classification_function = {\n",
1236
+ " \"name\": \"theme_classification\",\n",
1237
+ " \"description\": \"please classify below news article into any of these categories, ['Health', 'Entertainment', 'Business', 'Science', 'Politics', 'Finance', 'Economics', 'Tech', 'Crime', 'Sports', 'Lifestyle', 'Automotive', 'Travel', 'Weather', 'General', 'Financial Crime'], You should return \\\"General\\\" only if article doesnt fall into any of these, Please classigy it carefully. Don't tell me bad result before confirming. also don't return all categories, return general if it not fits in any of above\",\n",
1238
+ " \"parameters\": {\n",
1239
+ " \"type\": \"object\",\n",
1240
+ " \"properties\": {'Health': {'type': 'boolean'},\n",
1241
+ " 'Entertainment': {'type': 'boolean'},\n",
1242
+ " 'Business': {'type': 'boolean'}, 'Science': {'type': 'boolean'},\n",
1243
+ " 'Politics': {'type': 'boolean'}, 'Finance': {'type': 'boolean'},\n",
1244
+ " 'Economics': {'type': 'boolean'}, 'Tech': {'type': 'boolean'},\n",
1245
+ " 'Crime': {'type': 'boolean'}, 'Sports': {'type': 'boolean'},\n",
1246
+ " 'Lifestyle': {'type': 'boolean'},\n",
1247
+ " 'Automotive': {'type': 'boolean'}, 'Travel': {'type': 'boolean'},\n",
1248
+ " 'Weather': {'type': 'boolean'}, 'General': {'type': 'boolean'}, 'Financial Crime': {'type': 'boolean'}},\n",
1249
+ " \"required\": []\n",
1250
+ " }\n",
1251
+ "}"
1252
+ ]
1253
+ },
1254
+ {
1255
+ "cell_type": "code",
1256
+ "execution_count": null,
1257
+ "id": "245b1afc-7926-47fb-9f4b-fdae08edb4f0",
1258
+ "metadata": {},
1259
+ "outputs": [],
1260
+ "source": [
1261
+ "for i in en_data[::-1]:\n",
1262
+ " if i['need_to_validate']:\n",
1263
+ " break"
1264
+ ]
1265
+ },
1266
+ {
1267
+ "cell_type": "code",
1268
+ "execution_count": null,
1269
+ "id": "05314580-6e86-4fb6-a554-2d357deb21b1",
1270
+ "metadata": {},
1271
+ "outputs": [],
1272
+ "source": [
1273
+ "themes = extract_function_data(int_theme_classification_function, \"Title:\" + i['title'] + '\\n\\nContent:' + i['content'], target_model='gpt-4.1-mini')\n",
1274
+ "themes = [i for i in themes if themes[i]]\n",
1275
+ "themes"
1276
+ ]
1277
+ },
1278
+ {
1279
+ "cell_type": "code",
1280
+ "execution_count": null,
1281
+ "id": "fc20aeb3-8552-4873-bda2-37e498c04d49",
1282
+ "metadata": {},
1283
+ "outputs": [],
1284
+ "source": [
1285
+ "def validate(art):\n",
1286
+ " if not art.get('need_to_validate'):\n",
1287
+ " return\n",
1288
+ " if \"prev_themes\" in art:\n",
1289
+ " return\n",
1290
+ " content = \"\"\n",
1291
+ " if art.get(\"title\"):\n",
1292
+ " content = \"Title:\" + art['title'] + '\\n\\nContent:' + art['content']\n",
1293
+ " else:\n",
1294
+ " content = art['content']\n",
1295
+ " themes = extract_function_data(int_theme_classification_function, content, target_model='gpt-4.1-mini')\n",
1296
+ " themes = [i for i in themes if themes[i]]\n",
1297
+ " if themes:\n",
1298
+ " art['prev_themes'] = art.pop('themes')\n",
1299
+ " art['themes'] = themes"
1300
+ ]
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "execution_count": null,
1305
+ "id": "11ca3cca-df82-4c92-aac8-58c6722d5ed7",
1306
+ "metadata": {},
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "with ThreadPoolExecutor(max_workers=128) as ex:\n",
1310
+ " _ = [ex.submit(validate, art) for art in en_data]"
1311
+ ]
1312
+ },
1313
+ {
1314
+ "cell_type": "code",
1315
+ "execution_count": null,
1316
+ "id": "b2541763-4a46-4195-b2e9-48df2e992add",
1317
+ "metadata": {},
1318
+ "outputs": [],
1319
+ "source": [
1320
+ "with ThreadPoolExecutor(max_workers=128) as ex:\n",
1321
+ " _ = [ex.submit(validate, art) for art in ml_data]"
1322
+ ]
1323
+ },
1324
+ {
1325
+ "cell_type": "code",
1326
+ "execution_count": null,
1327
+ "id": "07227c4d-6cdb-41dc-87d3-b67c73f15b6f",
1328
+ "metadata": {},
1329
+ "outputs": [],
1330
+ "source": [
1331
+ "nb = 0\n",
1332
+ "for i in en_data + ml_data:\n",
1333
+ " if i['need_to_validate'] and not i.get('prev_themes'):\n",
1334
+ " nb += 1\n",
1335
+ " print(i.get('title'), i.get('content'))\n",
1336
+ "nb, len(en_data + ml_data)"
1337
+ ]
1338
+ },
1339
+ {
1340
+ "cell_type": "code",
1341
+ "execution_count": null,
1342
+ "id": "24004f74-9ca5-4c70-88c1-f1ec3fbb13c0",
1343
+ "metadata": {},
1344
+ "outputs": [],
1345
+ "source": [
1346
+ "json.dump(en_data, open('filtered2/en_step4.json', 'w'))"
1347
+ ]
1348
+ },
1349
+ {
1350
+ "cell_type": "code",
1351
+ "execution_count": null,
1352
+ "id": "67a33aa0-e8e7-44c3-89d6-2ca097679988",
1353
+ "metadata": {},
1354
+ "outputs": [],
1355
+ "source": [
1356
+ "json.dump(ml_data, open('filtered2/ml_step4.json', 'w'))"
1357
+ ]
1358
+ },
1359
+ {
1360
+ "cell_type": "markdown",
1361
+ "id": "1ac2b2bd-7419-4f67-a023-59bac45d57b4",
1362
+ "metadata": {},
1363
+ "source": [
1364
+ "# 6. split train & test also pre-train"
1365
+ ]
1366
+ },
1367
+ {
1368
+ "cell_type": "code",
1369
+ "execution_count": null,
1370
+ "id": "afab9235-4e4c-4a28-b6d1-e6071efd056a",
1371
+ "metadata": {},
1372
+ "outputs": [],
1373
+ "source": [
1374
+ "random.shuffle(en_data)\n",
1375
+ "random.shuffle(ml_data)"
1376
+ ]
1377
+ },
1378
+ {
1379
+ "cell_type": "code",
1380
+ "execution_count": null,
1381
+ "id": "d57dc5c4-1756-497c-9be3-9e3c4086f667",
1382
+ "metadata": {},
1383
+ "outputs": [],
1384
+ "source": [
1385
+ "train = {}\n",
1386
+ "test = {}\n",
1387
+ "\n",
1388
+ "for i in en_data:\n",
1389
+ " sts = sort_themes_by_priority(i['themes'])\n",
1390
+ " if 'Education' in sts:\n",
1391
+ " sts.remove('Education')\n",
1392
+ " for st in sts:\n",
1393
+ " if st not in train:\n",
1394
+ " train[st] = []\n",
1395
+ " test[st] = []\n",
1396
+ " if len(test[st]) < 500 and i['need_to_validate']:\n",
1397
+ " test[st].append(i)\n",
1398
+ " break\n",
1399
+ " else:\n",
1400
+ " train[st].append(i)\n",
1401
+ " break"
1402
+ ]
1403
+ },
1404
+ {
1405
+ "cell_type": "code",
1406
+ "execution_count": null,
1407
+ "id": "aa27ad3e-71a5-4aed-bc2f-32771feda321",
1408
+ "metadata": {},
1409
+ "outputs": [],
1410
+ "source": [
1411
+ "for i in ml_data:\n",
1412
+ " sts = sort_themes_by_priority(i['themes'])\n",
1413
+ " if 'Education' in sts:\n",
1414
+ " sts.remove('Education')\n",
1415
+ " for st in sts:\n",
1416
+ " if st not in train:\n",
1417
+ " train[st] = []\n",
1418
+ " test[st] = []\n",
1419
+ " if len(test[st]) < 1000 and i['need_to_validate']:\n",
1420
+ " test[st].append(i)\n",
1421
+ " break\n",
1422
+ " else:\n",
1423
+ " train[st].append(i)\n",
1424
+ " break"
1425
+ ]
1426
+ },
1427
+ {
1428
+ "cell_type": "code",
1429
+ "execution_count": null,
1430
+ "id": "b9cd3ece-301f-4fee-b8fd-a1c514287ce5",
1431
+ "metadata": {},
1432
+ "outputs": [],
1433
+ "source": [
1434
+ "for i in test:\n",
1435
+ " print(i, len(test[i]), len(train[i]))"
1436
+ ]
1437
+ },
1438
+ {
1439
+ "cell_type": "code",
1440
+ "execution_count": null,
1441
+ "id": "f6ac7932-ca64-4d06-8a12-e1235a0ee501",
1442
+ "metadata": {},
1443
+ "outputs": [],
1444
+ "source": [
1445
+ "json.dump(sum([train[i] for i in train], []), open('filtered2/train.json', 'w'))"
1446
+ ]
1447
+ },
1448
+ {
1449
+ "cell_type": "code",
1450
+ "execution_count": null,
1451
+ "id": "ae639b65-1b28-4079-899c-8ae5dd86eb4d",
1452
+ "metadata": {},
1453
+ "outputs": [],
1454
+ "source": [
1455
+ "json.dump(sum([test[i] for i in test], []), open('filtered2/test.json', 'w'))"
1456
+ ]
1457
+ },
1458
+ {
1459
+ "cell_type": "code",
1460
+ "execution_count": null,
1461
+ "id": "fc0fc992-1c32-42ed-95c8-b1b77d878b93",
1462
+ "metadata": {},
1463
+ "outputs": [],
1464
+ "source": [
1465
+ "tests_cont = set([i['content'].lower().strip() for i in sum([test[i] for i in test], [])])\n",
1466
+ "len(tests_cont)"
1467
+ ]
1468
+ },
1469
+ {
1470
+ "cell_type": "code",
1471
+ "execution_count": null,
1472
+ "id": "2d7ce8f4-00a4-419a-8b8d-6c06cf058aed",
1473
+ "metadata": {},
1474
+ "outputs": [],
1475
+ "source": [
1476
+ "pre_train = json.load(open('filtered2/en_step2.json')) + json.load(open('filtered2/ml_step2.json'))\n",
1477
+ "len(pre_train)"
1478
+ ]
1479
+ },
1480
+ {
1481
+ "cell_type": "code",
1482
+ "execution_count": null,
1483
+ "id": "ba3972ee-ac8c-408b-b314-cb89424ff46d",
1484
+ "metadata": {},
1485
+ "outputs": [],
1486
+ "source": [
1487
+ "pre_train = [i for i in pre_train if i['content'].lower().strip() not in tests_cont]\n",
1488
+ "len(pre_train)"
1489
+ ]
1490
+ },
1491
+ {
1492
+ "cell_type": "code",
1493
+ "execution_count": null,
1494
+ "id": "25594c3a-f5f1-4d77-a297-a33229630ac5",
1495
+ "metadata": {},
1496
+ "outputs": [],
1497
+ "source": [
1498
+ "json.dump(pre_train, open('filtered2/pre-train.json', 'w'))"
1499
+ ]
1500
+ },
1501
+ {
1502
+ "cell_type": "markdown",
1503
+ "id": "751f2c18-80b4-4f18-aa51-8d4da2e26e01",
1504
+ "metadata": {},
1505
+ "source": [
1506
+ "# split again val, train and test"
1507
+ ]
1508
+ },
1509
+ {
1510
+ "cell_type": "code",
1511
+ "execution_count": null,
1512
+ "id": "0991b233-ec51-481e-90b3-208f5c2303ec",
1513
+ "metadata": {},
1514
+ "outputs": [],
1515
+ "source": [
1516
+ "train = json.load(open('filtered2/train.json'))\n",
1517
+ "pre_train = json.load(open('filtered2/pre-train.json'))\n",
1518
+ "test = json.load(open('filtered2/test.json'))"
1519
+ ]
1520
+ },
1521
+ {
1522
+ "cell_type": "code",
1523
+ "execution_count": null,
1524
+ "id": "ac1fb5ff-9702-4605-899c-0b63f22fd478",
1525
+ "metadata": {},
1526
+ "outputs": [],
1527
+ "source": [
1528
+ "def validate_train(art):\n",
1529
+ " if \"prev_themes\" in art:\n",
1530
+ " return\n",
1531
+ " content = \"\"\n",
1532
+ " if art.get(\"title\"):\n",
1533
+ " content = \"Title:\" + art['title'] + '\\n\\nContent:' + art['content']\n",
1534
+ " else:\n",
1535
+ " content = art['content']\n",
1536
+ " themes = extract_function_data(int_theme_classification_function, content, target_model='gpt-4.1-mini')\n",
1537
+ " themes = [i for i in themes if themes[i]]\n",
1538
+ " if themes:\n",
1539
+ " art['prev_themes'] = art.pop('themes')\n",
1540
+ " art['themes'] = themes"
1541
+ ]
1542
+ },
1543
+ {
1544
+ "cell_type": "code",
1545
+ "execution_count": null,
1546
+ "id": "9ab0726e-a993-4400-b444-014537253fed",
1547
+ "metadata": {},
1548
+ "outputs": [],
1549
+ "source": [
1550
+ "from requests.adapters import HTTPAdapter\n",
1551
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
1552
+ "with ThreadPoolExecutor(max_workers=512) as ex:\n",
1553
+ " _ = [ex.submit(validate_train, art) for art in train]"
1554
+ ]
1555
+ },
1556
+ {
1557
+ "cell_type": "code",
1558
+ "execution_count": null,
1559
+ "id": "549b4492-97e6-4a26-b4da-fc8ad4c52073",
1560
+ "metadata": {},
1561
+ "outputs": [],
1562
+ "source": [
1563
+ "nb = 0\n",
1564
+ "for i in train:\n",
1565
+ " if \"prev_themes\" not in i:\n",
1566
+ " nb += 1\n",
1567
+ "nb, len(train)"
1568
+ ]
1569
+ },
1570
+ {
1571
+ "cell_type": "code",
1572
+ "execution_count": null,
1573
+ "id": "d011264f-f4af-4d82-bbfc-7d621d6f2215",
1574
+ "metadata": {},
1575
+ "outputs": [],
1576
+ "source": [
1577
+ "json.dump(train, open('filtered2/train.json', 'w'))"
1578
+ ]
1579
+ },
1580
+ {
1581
+ "cell_type": "code",
1582
+ "execution_count": null,
1583
+ "id": "054fd8df-9ea0-4d39-ac99-7e3e43c6c4d2",
1584
+ "metadata": {},
1585
+ "outputs": [],
1586
+ "source": [
1587
+ "ntrain = {}\n",
1588
+ "nval = {}\n",
1589
+ "\n",
1590
+ "for i in train:\n",
1591
+ " sts = sort_themes_by_priority(i['themes'])\n",
1592
+ " sts = [i for i in sts if i in priority_order]\n",
1593
+ " for st in sts:\n",
1594
+ " if st not in ntrain:\n",
1595
+ " ntrain[st] = []\n",
1596
+ " nval[st] = []\n",
1597
+ " if len(nval[st]) < 500 and i['need_to_validate']:\n",
1598
+ " nval[st].append(i)\n",
1599
+ " break\n",
1600
+ " else:\n",
1601
+ " ntrain[st].append(i)\n",
1602
+ " break\n",
1603
+ "\n",
1604
+ "ntrain = sum([ntrain[i] for i in ntrain], [])\n",
1605
+ "nval = sum([nval[i] for i in nval], [])"
1606
+ ]
1607
+ },
1608
+ {
1609
+ "cell_type": "code",
1610
+ "execution_count": null,
1611
+ "id": "7df359b3-b815-402a-a0b8-fd5d09cc7ee2",
1612
+ "metadata": {},
1613
+ "outputs": [],
1614
+ "source": [
1615
+ "len(ntrain), len(nval)"
1616
+ ]
1617
+ },
1618
+ {
1619
+ "cell_type": "code",
1620
+ "execution_count": null,
1621
+ "id": "b37878b1-4f2b-4a87-85ac-7a685e9fe00a",
1622
+ "metadata": {},
1623
+ "outputs": [],
1624
+ "source": []
1625
+ },
1626
+ {
1627
+ "cell_type": "code",
1628
+ "execution_count": null,
1629
+ "id": "36c43a30-4b0b-443f-a29a-af78848b271c",
1630
+ "metadata": {},
1631
+ "outputs": [],
1632
+ "source": [
1633
+ "npretrain = {}\n",
1634
+ "contset = {}\n",
1635
+ "\n",
1636
+ "for i in nval+test:\n",
1637
+ " contset[i['content'].lower().strip()] = i['themes']\n",
1638
+ "len(contset)"
1639
+ ]
1640
+ },
1641
+ {
1642
+ "cell_type": "code",
1643
+ "execution_count": null,
1644
+ "id": "7bc1d022-41c4-45f8-be2d-4246cd052802",
1645
+ "metadata": {},
1646
+ "outputs": [],
1647
+ "source": [
1648
+ "fin_rows = []\n",
1649
+ "for i in pre_train:\n",
1650
+ " if 'Financial Crime' not in i['themes']:\n",
1651
+ " continue\n",
1652
+ " if i['content'].lower().strip() not in contset:\n",
1653
+ " fin_rows.append(i)\n",
1654
+ "len(fin_rows)"
1655
+ ]
1656
+ },
1657
+ {
1658
+ "cell_type": "code",
1659
+ "execution_count": null,
1660
+ "id": "d65d514c-c7cb-4a81-be63-da1420eb89c0",
1661
+ "metadata": {},
1662
+ "outputs": [],
1663
+ "source": [
1664
+ "with ThreadPoolExecutor(max_workers=512) as ex:\n",
1665
+ " _ = [ex.submit(validate_train, art) for art in fin_rows]"
1666
+ ]
1667
+ },
1668
+ {
1669
+ "cell_type": "code",
1670
+ "execution_count": null,
1671
+ "id": "8b881fd9-3894-40ac-a032-2a81452f3d7b",
1672
+ "metadata": {},
1673
+ "outputs": [],
1674
+ "source": [
1675
+ "tt = {}\n",
1676
+ "for i in fin_rows[:10000]:\n",
1677
+ " for t in i['themes']:\n",
1678
+ " tt[t] = tt.get(t, 0) + 1\n",
1679
+ "tt"
1680
+ ]
1681
+ },
1682
+ {
1683
+ "cell_type": "code",
1684
+ "execution_count": null,
1685
+ "id": "9e9c7f98-befe-4eb6-9bad-28b839ce385b",
1686
+ "metadata": {},
1687
+ "outputs": [],
1688
+ "source": [
1689
+ "for i in fin_rows:\n",
1690
+ " if i['content'].lower().strip() not in contset:\n",
1691
+ " contset[i['content'].lower().strip()] = i['themes']"
1692
+ ]
1693
+ },
1694
+ {
1695
+ "cell_type": "code",
1696
+ "execution_count": null,
1697
+ "id": "fc3dfbef-9361-4d38-b0b3-c8cbc63d04ce",
1698
+ "metadata": {},
1699
+ "outputs": [],
1700
+ "source": [
1701
+ "ntrain = ntrain + fin_rows\n",
1702
+ "len(ntrain)"
1703
+ ]
1704
+ },
1705
+ {
1706
+ "cell_type": "code",
1707
+ "execution_count": null,
1708
+ "id": "cb8f2cf0-49cf-4306-ac1f-73d19b139ab4",
1709
+ "metadata": {},
1710
+ "outputs": [],
1711
+ "source": [
1712
+ "final_train = {}\n",
1713
+ "for i in ntrain:\n",
1714
+ " sts = sort_themes_by_priority(i['themes'])\n",
1715
+ " sts = [i for i in sts if i in priority_order]\n",
1716
+ " for st in sts:\n",
1717
+ " if st not in final_train:\n",
1718
+ " final_train[st] = []\n",
1719
+ " if len(final_train[st]) < 6000:\n",
1720
+ " final_train[st].append(i)\n",
1721
+ " break\n",
1722
+ "for i in final_train:\n",
1723
+ " print(i, len(final_train[i]))"
1724
+ ]
1725
+ },
1726
+ {
1727
+ "cell_type": "code",
1728
+ "execution_count": null,
1729
+ "id": "2d062629-4f0c-4e15-a36d-ff92ae463399",
1730
+ "metadata": {},
1731
+ "outputs": [],
1732
+ "source": [
1733
+ "final_train = sum([final_train[t] for t in final_train], [])\n",
1734
+ "len(final_train)"
1735
+ ]
1736
+ },
1737
+ {
1738
+ "cell_type": "code",
1739
+ "execution_count": null,
1740
+ "id": "b7d10a78-26c0-461b-800f-cae91b4b7e4a",
1741
+ "metadata": {},
1742
+ "outputs": [],
1743
+ "source": []
1744
+ },
1745
+ {
1746
+ "cell_type": "code",
1747
+ "execution_count": null,
1748
+ "id": "32b75f1c-0b21-46a8-a2cc-9dbe5cb17088",
1749
+ "metadata": {},
1750
+ "outputs": [],
1751
+ "source": [
1752
+ "for i in pre_train:\n",
1753
+ " if i['content'].lower().strip() in contset:\n",
1754
+ " i['themes'] = contset[i['content'].lower().strip()]"
1755
+ ]
1756
+ },
1757
+ {
1758
+ "cell_type": "code",
1759
+ "execution_count": 67,
1760
+ "id": "606aac46-b3ce-48a8-96d5-b179845dfae9",
1761
+ "metadata": {},
1762
+ "outputs": [],
1763
+ "source": [
1764
+ "valid_themes = set(priority_order)\n",
1765
+ "\n",
1766
+ "def dumpjson(rows, file_name):\n",
1767
+ " t = {}\n",
1768
+ " for row in rows:\n",
1769
+ " themes = row['themes']\n",
1770
+ " themes = [i for i in themes if i in valid_themes]\n",
1771
+ " row['themes'] = themes\n",
1772
+ " for th in themes:\n",
1773
+ " t[th] = t.get(th, 0) + 1\n",
1774
+ " print(len(rows), t)\n",
1775
+ " json.dump(rows, open(file_name, 'w'))"
1776
+ ]
1777
+ },
1778
+ {
1779
+ "cell_type": "code",
1780
+ "execution_count": 68,
1781
+ "id": "09f7e9ac-e6fd-47ce-823b-dabe74b27ba0",
1782
+ "metadata": {},
1783
+ "outputs": [
1784
+ {
1785
+ "name": "stdout",
1786
+ "output_type": "stream",
1787
+ "text": [
1788
+ "92245 {'Entertainment': 6854, 'General': 6302, 'Politics': 14207, 'Business': 14722, 'Lifestyle': 7827, 'Tech': 8823, 'Health': 7985, 'Crime': 9548, 'Economics': 7703, 'Sports': 6607, 'Travel': 6812, 'Science': 4773, 'Automotive': 4698, 'Weather': 6351, 'Finance': 8080, 'Financial Crime': 5218}\n"
1789
+ ]
1790
+ }
1791
+ ],
1792
+ "source": [
1793
+ "dumpjson(final_train, 'filtered2/train.json')"
1794
+ ]
1795
+ },
1796
+ {
1797
+ "cell_type": "code",
1798
+ "execution_count": 69,
1799
+ "id": "902ece36-0efa-4225-97c1-7a6b0f6082a6",
1800
+ "metadata": {},
1801
+ "outputs": [
1802
+ {
1803
+ "name": "stdout",
1804
+ "output_type": "stream",
1805
+ "text": [
1806
+ "8000 {'Entertainment': 588, 'General': 534, 'Politics': 1285, 'Lifestyle': 505, 'Health': 687, 'Sports': 570, 'Weather': 529, 'Business': 1465, 'Tech': 763, 'Crime': 676, 'Travel': 543, 'Science': 527, 'Automotive': 509, 'Economics': 500, 'Finance': 703, 'Financial Crime': 507}\n"
1807
+ ]
1808
+ }
1809
+ ],
1810
+ "source": [
1811
+ "dumpjson(nval, 'filtered2/val.json')"
1812
+ ]
1813
+ },
1814
+ {
1815
+ "cell_type": "code",
1816
+ "execution_count": 70,
1817
+ "id": "03a5343e-2fee-4051-be11-d37bab9a4c24",
1818
+ "metadata": {},
1819
+ "outputs": [
1820
+ {
1821
+ "name": "stdout",
1822
+ "output_type": "stream",
1823
+ "text": [
1824
+ "16000 {'Entertainment': 1161, 'Politics': 2414, 'Crime': 1337, 'Business': 2698, 'General': 1065, 'Sports': 1118, 'Financial Crime': 1009, 'Finance': 1371, 'Travel': 1143, 'Tech': 1476, 'Health': 1295, 'Automotive': 1014, 'Weather': 1060, 'Lifestyle': 1023, 'Economics': 1000, 'Science': 1056}\n"
1825
+ ]
1826
+ }
1827
+ ],
1828
+ "source": [
1829
+ "dumpjson(test, 'filtered2/test.json')"
1830
+ ]
1831
+ },
1832
+ {
1833
+ "cell_type": "code",
1834
+ "execution_count": null,
1835
+ "id": "6892ae88-b0ae-4318-82fe-868f0f260c1c",
1836
+ "metadata": {},
1837
+ "outputs": [],
1838
+ "source": [
1839
+ "for i in train:\n",
1840
+ " if i['content'].lower().strip() not in contset:\n",
1841
+ " contset[i['content'].lower().strip()] = i['themes']\n",
1842
+ "\n",
1843
+ "for i in pre_train:\n",
1844
+ " if i['content'].lower().strip() in contset:\n",
1845
+ " i['themes'] = contset[i['content'].lower().strip()]"
1846
+ ]
1847
+ },
1848
+ {
1849
+ "cell_type": "code",
1850
+ "execution_count": null,
1851
+ "id": "9cd446ac-0209-4937-a8da-045e8d17edb8",
1852
+ "metadata": {},
1853
+ "outputs": [],
1854
+ "source": [
1855
+ "len(pre_train)"
1856
+ ]
1857
+ },
1858
+ {
1859
+ "cell_type": "code",
1860
+ "execution_count": null,
1861
+ "id": "2ae5ec6d-ed70-4ba8-a188-a1c3dd561dbf",
1862
+ "metadata": {},
1863
+ "outputs": [],
1864
+ "source": [
1865
+ "to_ignore = set()\n",
1866
+ "for i in test + nval:\n",
1867
+ " to_ignore.add(i['content'].lower().strip())\n",
1868
+ "\n",
1869
+ "final_pre_train = []\n",
1870
+ "for i in pre_train:\n",
1871
+ " if i['content'].lower().strip() in to_ignore:\n",
1872
+ " continue\n",
1873
+ " final_pre_train.append(i)\n",
1874
+ "len(final_pre_train), len(to_ignore)"
1875
+ ]
1876
+ },
1877
+ {
1878
+ "cell_type": "code",
1879
+ "execution_count": 71,
1880
+ "id": "6f560a9a-cf70-47f3-97f3-ae45f520590f",
1881
+ "metadata": {},
1882
+ "outputs": [
1883
+ {
1884
+ "name": "stdout",
1885
+ "output_type": "stream",
1886
+ "text": [
1887
+ "397784 {'Sports': 37315, 'Lifestyle': 21900, 'Crime': 88859, 'Business': 60980, 'Entertainment': 40906, 'Politics': 78084, 'General': 25417, 'Tech': 25008, 'Health': 32249, 'Travel': 15170, 'Finance': 21892, 'Economics': 13128, 'Weather': 13586, 'Science': 14338, 'Automotive': 12176, 'Financial Crime': 5035}\n"
1888
+ ]
1889
+ }
1890
+ ],
1891
+ "source": [
1892
+ "dumpjson(final_pre_train, 'filtered2/pre-train.json')"
1893
+ ]
1894
+ },
1895
+ {
1896
+ "cell_type": "code",
1897
+ "execution_count": null,
1898
+ "id": "dd98be36-8ec4-444e-a7fa-d7196334a630",
1899
+ "metadata": {},
1900
+ "outputs": [],
1901
+ "source": []
1902
+ },
1903
+ {
1904
+ "cell_type": "code",
1905
+ "execution_count": null,
1906
+ "id": "1fbf2b53-45a6-4727-b862-bb7819e4562e",
1907
+ "metadata": {},
1908
+ "outputs": [],
1909
+ "source": []
1910
+ }
1911
+ ],
1912
+ "metadata": {
1913
+ "kernelspec": {
1914
+ "display_name": "Python 3 (ipykernel)",
1915
+ "language": "python",
1916
+ "name": "python3"
1917
+ },
1918
+ "language_info": {
1919
+ "codemirror_mode": {
1920
+ "name": "ipython",
1921
+ "version": 3
1922
+ },
1923
+ "file_extension": ".py",
1924
+ "mimetype": "text/x-python",
1925
+ "name": "python",
1926
+ "nbconvert_exporter": "python",
1927
+ "pygments_lexer": "ipython3",
1928
+ "version": "3.10.12"
1929
+ }
1930
+ },
1931
+ "nbformat": 4,
1932
+ "nbformat_minor": 5
1933
+ }
3-training.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mlp_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b352f602bdf62dc246c0d6509495469520d1c6e4caa856619beeae90d89c5f
3
+ size 9100341
pre-train.json.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:025bb8dd1a317149eb9ce207b992142cbc99a0e50efa0704b5222701bf69d2da
3
+ size 509109586
scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a50abf4348f226b4f3dbd0fda27ae284de7882851bcdcfda3e25a3bcaed0bbab
3
+ size 25191
test.json.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbd8a5492b857ef5f1c0b8eb58a0f9c0bcc85532bae026bde30e3c47269ca423
3
+ size 22253927
train.json.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5192ac529ad7c2c8083981325dd14d774a92f04ae768aac38f0b9d6669d75d6
3
+ size 129843701
val.json.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3de1a164f7512828b632939d5cc75b002f7f8564c0ba142b3329cb5e2e3a5199
3
+ size 11034512