import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# -------------------------------------------------------
# SETTINGS
# -------------------------------------------------------
# Embedding model optimized for semantic search.
# NOTE(review): nomic-embed models are trained with task prefixes
# ("search_document: " for corpus text, "search_query: " for queries);
# retrieval quality degrades without them — see the model card.
MODEL_NAME = "nomic-ai/modernbert-embed-base"
CSV_PATH = "../bm25/travel_blogs.csv"
OUTPUT_EMB_PATH = "./travel_blog_embeddings.pt"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------------------------------
# LOAD MODEL + TOKENIZER
# -------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # disable dropout etc. for deterministic inference

# -------------------------------------------------------
# LOAD DATA
# -------------------------------------------------------
df = pd.read_csv(CSV_PATH)

# We embed the "content" field; missing values become empty strings.
# An empty string still tokenizes to the special tokens, so the
# mean-pool denominator below is never zero.
texts = df["content"].fillna("").tolist()

# -------------------------------------------------------
# EMBEDDING FUNCTION (batch optimized)
# -------------------------------------------------------
def embed_texts(texts_batch, prefix=""):
    """Embed a batch of strings into unit-length mean-pooled vectors.

    Parameters
    ----------
    texts_batch : list[str]
        Texts to embed.
    prefix : str, optional
        Task prefix prepended to every text, e.g. "search_document: "
        for corpus passages or "search_query: " for queries, as the
        nomic-embed model card recommends. Defaults to "" so existing
        callers (and previously saved embeddings) are unaffected.

    Returns
    -------
    torch.Tensor
        Shape (len(texts_batch), hidden_size), on CPU, L2-normalized
        row-wise so dot product equals cosine similarity.
    """
    if prefix:
        texts_batch = [prefix + t for t in texts_batch]

    # Tokenize (handles a list of strings automatically).
    encoded = tokenizer(
        texts_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(DEVICE)

    # Forward pass without autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoded)

    # ModernBERT exposes token embeddings via last_hidden_state.
    last_hidden = outputs.last_hidden_state

    # Mean pooling over real (non-padding) tokens.
    # FIX: cast the mask to the hidden dtype up front — the original
    # clamped an *integer* mask-sum with min=1e-9, where the epsilon
    # is at best truncated away; doing the math in float makes the
    # divide-by-zero guard actually effective.
    attention_mask = encoded["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
    sum_embeddings = torch.sum(last_hidden * attention_mask, dim=1)
    sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
    embedding = sum_embeddings / sum_mask

    # Normalize embeddings to unit length.
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)

    return embedding.cpu()
# -------------------------------------------------------
# LOOP THROUGH BLOG CONTENT AND EMBED
# -------------------------------------------------------
embeddings = []
BATCH_SIZE = 32  # adjust based on VRAM (16, 32, 64)

# Process in batches. range(0, len(texts), BATCH_SIZE) never yields an
# empty slice, so no per-batch emptiness check is needed.
for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding travel blogs"):
    embeddings.append(embed_texts(texts[start : start + BATCH_SIZE]))

# FIX: torch.cat raises on an empty list — guard the empty-corpus case.
if embeddings:
    embeddings_tensor = torch.cat(embeddings)
else:
    embeddings_tensor = torch.empty((0, 0))

# -------------------------------------------------------
# SAVE EMBEDDINGS
# -------------------------------------------------------
torch.save({
    "embeddings": embeddings_tensor,
    "ids": df["id"].tolist(),
}, OUTPUT_EMB_PATH)

print("Saved embeddings to:", OUTPUT_EMB_PATH)
print("Embedding matrix shape:", embeddings_tensor.shape)

# -------------------------------------------------------
# BUILD FAISS INDEX
# -------------------------------------------------------
import faiss

data = torch.load("travel_blog_embeddings.pt", weights_only=True)
emb = data["embeddings"]  # shape (N, 768), unit-normalized float32

# Exact L2 index. Because the vectors are unit length,
# ||a - b||^2 = 2 - 2*cos(a, b), so L2 ranking is identical to
# cosine-similarity ranking.
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb.numpy())


def search_blogs(query, k=5):
    """Return the top-k blog rows most semantically similar to `query`.

    Parameters
    ----------
    query : str
        Free-text search query.
    k : int, optional
        Number of results to return (capped at the index size).

    Returns
    -------
    pandas.DataFrame
        Copy of the matching `df` rows with an added "distance" column
        (squared L2 distance; smaller is more similar).
    """
    # FIX: FAISS pads with index -1 when k exceeds ntotal, and
    # df.iloc[-1] would silently return the *last* row — cap k instead.
    k = min(k, index.ntotal)

    # embed_texts expects a list of strings.
    q_emb = embed_texts([query]).numpy()
    distances, idxs = index.search(q_emb, k)
    rows = df.iloc[idxs[0]].copy()
    rows["distance"] = distances[0]
    return rows
query = "mountains in europe"
results = search_blogs(query)

print(f"Search results for: '{query}'\n")

# Walk the ranked hits; iterrows() yields each result row as a Series.
for rank, (_, hit) in enumerate(results.iterrows(), start=1):
    print(f"Rank {rank} (Distance: {hit['distance']:.4f})")

    # Only print columns that actually exist in this dataframe.
    if 'page_title' in hit:
        print(f"Title: {hit['page_title']}")
    if 'location_name' in hit:
        print(f"Location: {hit['location_name']}")

    # Show a longer, single-line snippet of the content.
    snippet = str(hit['content'])[:300].replace('\n', ' ')
    print(f"Content Snippet: {snippet}...\n")
    print("-" * 80)
| \n", " | id | \n", "blog_url | \n", "page_url | \n", "page_title | \n", "page_description | \n", "page_author | \n", "location_name | \n", "latitude | \n", "longitude | \n", "content | \n", "distance | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1161 | \n", "1159 | \n", "https://markstraveljournal.wordpress.com/ | \n", "https://markstraveljournal.me/2019/11/14/5-rea... | \n", "5 Reasons For A Ski Trip To Europe! – Mark's T... | \n", "If you are a skier or snowboarder, there are a... | \n", "NaN | \n", "Europe | \n", "51.000000 | \n", "10.000000 | \n", "travel experience if you are a skier or snowbo... | \n", "0.733406 | \n", "
| 6749 | \n", "7918 | \n", "https://travelbetweenthepages.wordpress.com/ | \n", "https://travelbetweenthepages.com/2024/05/28/c... | \n", "\\nChill Out | Travel Between The Pages | \n", "Summer has arrived early in my little corner o... | \n", "NaN | \n", "the Western Hemisphere | \n", "45.419592 | \n", "-75.708378 | \n", "summer has arrived early in my little corner o... | \n", "0.757490 | \n", "
| 2029 | \n", "2043 | \n", "https://thewomenstravelgroup.wordpress.com/ | \n", "https://thewomenstravelgroup.com/seeing-the-wo... | \n", "Seeing the World High Up - The Women's Travel ... | \n", "Seeing the World High Up - cooking trips in Italy | \n", "NaN | \n", "Italy | \n", "42.638426 | \n", "12.674297 | \n", "homecooking trips in italyseeing the world hig... | \n", "0.765224 | \n", "
| 3702 | \n", "4179 | \n", "https://thriftytravelmama.wordpress.com/ | \n", "https://thriftytravelmama.wordpress.com/2013/0... | \n", "\\nKastelburg – Castle Ruins for Kids | Thrifty... | \n", "One of the things I love most about living in ... | \n", "NaN | \n", "Kastelburg | \n", "48.097558 | \n", "7.957541 | \n", "one of the things i love most about living in ... | \n", "0.765539 | \n", "
| 5609 | \n", "6477 | \n", "https://travelsthroughmylenstravel.wordpress.com/ | \n", "https://travelsthroughmylens.com/2023/03/02/gi... | \n", "Gibraltar, the Mighty Rock – Travels Through M... | \n", "Gibraltar, also known as the Rock, is one of f... | \n", "NaN | \n", "Mighty Rock | \n", "13.130837 | \n", "-59.633284 | \n", "travels through my lens sharing photos and sto... | \n", "0.774408 | \n", "