datamatters24 committed on
Commit
caff04b
·
verified ·
1 Parent(s): fa8cc5c

Upload notebooks/01_exploration/13_embedding_explorer.ipynb with huggingface_hub

Browse files
notebooks/01_exploration/13_embedding_explorer.ipynb ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Embedding Explorer — UMAP Projection\n",
8
+ "\n",
9
+ "Visualize page embeddings in 2D using UMAP dimensionality reduction.\n",
10
+ "- Load a sample of 50K page embeddings from the database\n",
11
+ "- Reduce to 2D with UMAP\n",
12
+ "- Interactive Plotly scatter colored by collection (source_section)\n",
13
+ "- Save interactive HTML for sharing"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "metadata": {},
19
+ "source": [
20
+ "import numpy as np\n",
21
+ "import pandas as pd\n",
22
+ "import umap\n",
23
+ "import plotly.express as px\n",
24
+ "\n",
25
+ "from research_lib.db import fetch_df\n",
26
+ "from research_lib.plotting import set_style, COLLECTION_COLORS\n",
27
+ "\n",
28
+ "set_style()\n",
29
+ "\n",
30
+ "SAMPLE_SIZE = 50_000  # row cap used in the SQL LIMIT of the load cell\n",
31
+ "UMAP_N_NEIGHBORS = 15  # passed to umap.UMAP as n_neighbors\n",
32
+ "UMAP_MIN_DIST = 0.1  # passed to umap.UMAP as min_dist\n",
33
+ "UMAP_METRIC = \"cosine\"  # passed to umap.UMAP as metric\n",
34
+ "RANDOM_SEED = 42  # passed to umap.UMAP as random_state\n",
35
+ "\n",
36
+ "print(f\"Configuration: sample={SAMPLE_SIZE:,}, n_neighbors={UMAP_N_NEIGHBORS}, min_dist={UMAP_MIN_DIST}, metric={UMAP_METRIC}, seed={RANDOM_SEED}\")"
37
+ ],
38
+ "execution_count": null,
39
+ "outputs": []
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "metadata": {},
44
+ "source": [
45
+ "# Load sample of 50K page embeddings from DB\n",
46
+ "# Fetches page id, embedding vector, and source_section via join\n",
47
+ "df_emb = fetch_df(f\"\"\"\n",
48
+ " SELECT p.id AS page_id,\n",
49
+ " p.embedding::text AS embedding_text,\n",
50
+ " d.source_section,\n",
51
+ " d.id AS doc_id\n",
52
+ " FROM pages p\n",
53
+ " JOIN documents d ON d.id = p.document_id\n",
54
+ " WHERE p.embedding IS NOT NULL\n",
55
+ " ORDER BY RANDOM()\n",
56
+ " LIMIT {SAMPLE_SIZE}\n",
57
+ "\"\"\")\n",
58
+ "\n",
59
+ "print(f\"Loaded {len(df_emb):,} page embeddings\")\n",
60
+ "print(f\"Collections represented: {df_emb['source_section'].nunique()}\")\n",
61
+ "print()\n",
62
+ "print(df_emb[\"source_section\"].value_counts())\n",
63
+ "\n",
64
+ "# Parse embedding vectors from text representation\n",
65
+ "# pgvector returns embeddings as '[0.1,0.2,...]' strings\n",
66
+ "embeddings = np.array([\n",
67
+ " np.fromstring(s.strip(\"[]\"), sep=\",\")\n",
68
+ " for s in df_emb[\"embedding_text\"]\n",
69
+ "])\n",
70
+ "\n",
71
+ "print(f\"\\nEmbedding matrix shape: {embeddings.shape}\")"
72
+ ],
73
+ "execution_count": null,
74
+ "outputs": []
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "metadata": {},
79
+ "source": [
80
+ "# UMAP reduction to 2D\n",
81
+ "print(\"Running UMAP reduction (this may take a few minutes)...\")\n",
82
+ "\n",
83
+ "reducer = umap.UMAP(\n",
84
+ " n_components=2,\n",
85
+ " n_neighbors=UMAP_N_NEIGHBORS,\n",
86
+ " min_dist=UMAP_MIN_DIST,\n",
87
+ " metric=UMAP_METRIC,\n",
88
+ " random_state=RANDOM_SEED,\n",
89
+ " verbose=True,\n",
90
+ ")\n",
91
+ "\n",
92
+ "coords_2d = reducer.fit_transform(embeddings)\n",
93
+ "\n",
94
+ "df_emb[\"umap_x\"] = coords_2d[:, 0]\n",
95
+ "df_emb[\"umap_y\"] = coords_2d[:, 1]\n",
96
+ "\n",
97
+ "print(f\"\\nUMAP complete. Output shape: {coords_2d.shape}\")"
98
+ ],
99
+ "execution_count": null,
100
+ "outputs": []
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "metadata": {},
105
+ "source": [
106
+ "# Plotly interactive scatter colored by collection\n",
107
+ "color_map = {k: v for k, v in COLLECTION_COLORS.items() if k in df_emb[\"source_section\"].unique()}\n",
108
+ "\n",
109
+ "fig = px.scatter(\n",
110
+ " df_emb,\n",
111
+ " x=\"umap_x\",\n",
112
+ " y=\"umap_y\",\n",
113
+ " color=\"source_section\",\n",
114
+ " color_discrete_map=color_map,\n",
115
+ " hover_data=[\"page_id\", \"doc_id\", \"source_section\"],\n",
116
+ " title=f\"UMAP Projection of {len(df_emb):,} Page Embeddings\",\n",
117
+ " labels={\"umap_x\": \"UMAP 1\", \"umap_y\": \"UMAP 2\"},\n",
118
+ " opacity=0.4,\n",
119
+ " width=1200,\n",
120
+ " height=800,\n",
121
+ ")\n",
122
+ "\n",
123
+ "fig.update_traces(marker=dict(size=3))\n",
124
+ "fig.update_layout(\n",
125
+ " legend_title_text=\"Collection\",\n",
126
+ " legend=dict(itemsizing=\"constant\"),\n",
127
+ ")\n",
128
+ "\n",
129
+ "fig.show()"
130
+ ],
131
+ "execution_count": null,
132
+ "outputs": []
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "metadata": {},
137
+ "source": [
138
+ "# Save interactive HTML\n",
139
+ "from pathlib import Path\n",
140
+ "\n",
141
+ "output_dir = Path(\"/opt/epstein_env/research/outputs/figures\")\n",
142
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
143
+ "\n",
144
+ "html_path = output_dir / \"embedding_umap_explorer.html\"\n",
145
+ "fig.write_html(str(html_path), include_plotlyjs=\"cdn\")\n",
146
+ "print(f\"Interactive HTML saved to: {html_path}\")\n",
147
+ "print(f\"File size: {html_path.stat().st_size / (1024**2):.1f} MB\")"
148
+ ],
149
+ "execution_count": null,
150
+ "outputs": []
151
+ }
152
+ ],
153
+ "metadata": {
154
+ "kernelspec": {
155
+ "display_name": "Python 3",
156
+ "language": "python",
157
+ "name": "python3"
158
+ },
159
+ "language_info": {
160
+ "name": "python",
161
+ "version": "3.10.0"
162
+ }
163
+ },
164
+ "nbformat": 4,
165
+ "nbformat_minor": 5
166
+ }