import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# -------------------------------------------------------
# SETTINGS
# -------------------------------------------------------
# Embedding model optimized for semantic search.
# NOTE(review): nomic-embed models are trained with task prefixes
# ("search_document: " for corpus text, "search_query: " for queries);
# retrieval quality degrades without them — see the model card.
MODEL_NAME = "nomic-ai/modernbert-embed-base"
CSV_PATH = "../bm25/travel_blogs.csv"
OUTPUT_EMB_PATH = "./travel_blog_embeddings.pt"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------------------------------
# LOAD MODEL + TOKENIZER
# -------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # disable dropout etc. for deterministic inference

# -------------------------------------------------------
# LOAD DATA
# -------------------------------------------------------
df = pd.read_csv(CSV_PATH)

# We embed the "content" field; missing values become empty strings.
# An empty string still tokenizes to the special tokens, so the
# mean-pool denominator below is never zero.
texts = df["content"].fillna("").tolist()

# -------------------------------------------------------
# EMBEDDING FUNCTION (batch optimized)
# -------------------------------------------------------
def embed_texts(texts_batch, prefix=""):
    """Embed a batch of strings into unit-length mean-pooled vectors.

    Parameters
    ----------
    texts_batch : list[str]
        Texts to embed.
    prefix : str, optional
        Task prefix prepended to every text, e.g. "search_document: "
        for corpus passages or "search_query: " for queries, as the
        nomic-embed model card recommends. Defaults to "" so existing
        callers (and previously saved embeddings) are unaffected.

    Returns
    -------
    torch.Tensor
        Shape (len(texts_batch), hidden_size), on CPU, L2-normalized
        row-wise so dot product equals cosine similarity.
    """
    if prefix:
        texts_batch = [prefix + t for t in texts_batch]

    # Tokenize (handles a list of strings automatically).
    encoded = tokenizer(
        texts_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(DEVICE)

    # Forward pass without autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoded)

    # ModernBERT exposes token embeddings via last_hidden_state.
    last_hidden = outputs.last_hidden_state

    # Mean pooling over real (non-padding) tokens.
    # FIX: cast the mask to the hidden dtype up front — the original
    # clamped an *integer* mask-sum with min=1e-9, where the epsilon
    # is at best truncated away; doing the math in float makes the
    # divide-by-zero guard actually effective.
    attention_mask = encoded["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
    sum_embeddings = torch.sum(last_hidden * attention_mask, dim=1)
    sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
    embedding = sum_embeddings / sum_mask

    # Normalize embeddings to unit length.
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)

    return embedding.cpu()
# -------------------------------------------------------
# LOOP THROUGH BLOG CONTENT AND EMBED
# -------------------------------------------------------
embeddings = []
BATCH_SIZE = 32  # adjust based on VRAM (16, 32, 64)

# Process in batches. range(0, len(texts), BATCH_SIZE) never yields an
# empty slice, so no per-batch emptiness check is needed.
for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding travel blogs"):
    embeddings.append(embed_texts(texts[start : start + BATCH_SIZE]))

# FIX: torch.cat raises on an empty list — guard the empty-corpus case.
if embeddings:
    embeddings_tensor = torch.cat(embeddings)
else:
    embeddings_tensor = torch.empty((0, 0))

# -------------------------------------------------------
# SAVE EMBEDDINGS
# -------------------------------------------------------
torch.save({
    "embeddings": embeddings_tensor,
    "ids": df["id"].tolist(),
}, OUTPUT_EMB_PATH)

print("Saved embeddings to:", OUTPUT_EMB_PATH)
print("Embedding matrix shape:", embeddings_tensor.shape)

# -------------------------------------------------------
# BUILD FAISS INDEX
# -------------------------------------------------------
import faiss

data = torch.load("travel_blog_embeddings.pt", weights_only=True)
emb = data["embeddings"]  # shape (N, 768), unit-normalized float32

# Exact L2 index. Because the vectors are unit length,
# ||a - b||^2 = 2 - 2*cos(a, b), so L2 ranking is identical to
# cosine-similarity ranking.
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb.numpy())


def search_blogs(query, k=5):
    """Return the top-k blog rows most semantically similar to `query`.

    Parameters
    ----------
    query : str
        Free-text search query.
    k : int, optional
        Number of results to return (capped at the index size).

    Returns
    -------
    pandas.DataFrame
        Copy of the matching `df` rows with an added "distance" column
        (squared L2 distance; smaller is more similar).
    """
    # FIX: FAISS pads with index -1 when k exceeds ntotal, and
    # df.iloc[-1] would silently return the *last* row — cap k instead.
    k = min(k, index.ntotal)

    # embed_texts expects a list of strings.
    q_emb = embed_texts([query]).numpy()
    distances, idxs = index.search(q_emb, k)
    rows = df.iloc[idxs[0]].copy()
    rows["distance"] = distances[0]
    return rows
query = "mountains in europe"
results = search_blogs(query)

print(f"Search results for: '{query}'\n")

# Walk the ranked hits; iterrows() yields each result row as a Series.
for rank, (_, hit) in enumerate(results.iterrows(), start=1):
    print(f"Rank {rank} (Distance: {hit['distance']:.4f})")

    # Only print columns that actually exist in this dataframe.
    if 'page_title' in hit:
        print(f"Title: {hit['page_title']}")
    if 'location_name' in hit:
        print(f"Location: {hit['location_name']}")

    # Show a longer, single-line snippet of the content.
    snippet = str(hit['content'])[:300].replace('\n', ' ')
    print(f"Content Snippet: {snippet}...\n")
    print("-" * 80)
| \n", " | id | \n", "blog_url | \n", "page_url | \n", "page_title | \n", "page_description | \n", "page_author | \n", "location_name | \n", "latitude | \n", "longitude | \n", "content | \n", "distance | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1161 | \n", "1159 | \n", "https://markstraveljournal.wordpress.com/ | \n", "https://markstraveljournal.me/2019/11/14/5-rea... | \n", "5 Reasons For A Ski Trip To Europe! – Mark's T... | \n", "If you are a skier or snowboarder, there are a... | \n", "NaN | \n", "Europe | \n", "51.000000 | \n", "10.000000 | \n", "travel experience if you are a skier or snowbo... | \n", "0.733406 | \n", "
| 6749 | \n", "7918 | \n", "https://travelbetweenthepages.wordpress.com/ | \n", "https://travelbetweenthepages.com/2024/05/28/c... | \n", "\\nChill Out | Travel Between The Pages | \n", "Summer has arrived early in my little corner o... | \n", "NaN | \n", "the Western Hemisphere | \n", "45.419592 | \n", "-75.708378 | \n", "summer has arrived early in my little corner o... | \n", "0.757490 | \n", "
| 2029 | \n", "2043 | \n", "https://thewomenstravelgroup.wordpress.com/ | \n", "https://thewomenstravelgroup.com/seeing-the-wo... | \n", "Seeing the World High Up - The Women's Travel ... | \n", "Seeing the World High Up - cooking trips in Italy | \n", "NaN | \n", "Italy | \n", "42.638426 | \n", "12.674297 | \n", "homecooking trips in italyseeing the world hig... | \n", "0.765224 | \n", "
| 3702 | \n", "4179 | \n", "https://thriftytravelmama.wordpress.com/ | \n", "https://thriftytravelmama.wordpress.com/2013/0... | \n", "\\nKastelburg – Castle Ruins for Kids | Thrifty... | \n", "One of the things I love most about living in ... | \n", "NaN | \n", "Kastelburg | \n", "48.097558 | \n", "7.957541 | \n", "one of the things i love most about living in ... | \n", "0.765539 | \n", "
| 5609 | \n", "6477 | \n", "https://travelsthroughmylenstravel.wordpress.com/ | \n", "https://travelsthroughmylens.com/2023/03/02/gi... | \n", "Gibraltar, the Mighty Rock – Travels Through M... | \n", "Gibraltar, also known as the Rock, is one of f... | \n", "NaN | \n", "Mighty Rock | \n", "13.130837 | \n", "-59.633284 | \n", "travels through my lens sharing photos and sto... | \n", "0.774408 | \n", "