Ory999 committed on
Commit
6586069
·
verified ·
1 Parent(s): bb2cec5

Upload 3 files

Browse files
assignment_2.ipynb ADDED
@@ -0,0 +1,1321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4312be94",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Part A"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 26,
14
+ "id": "499c83de",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "Data already exists in 'patents_data_raw'. Skipping download.\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "import os\n",
27
+ "from huggingface_hub import snapshot_download\n",
28
+ "\n",
29
+ "# Check if the folder already exists and is not empty\n",
30
+ "folder_name = \"patents_data_raw\"\n",
31
+ "\n",
32
+ "# Check if folder exists and if it has any files inside\n",
33
+ "if os.path.exists(folder_name) and any(os.scandir(folder_name)):\n",
34
+ " print(f\"Data already exists in '{folder_name}'. Skipping download.\")\n",
35
+ " local_folder = os.path.abspath(folder_name)\n",
36
+ "else:\n",
37
+ " #Download only if missing\n",
38
+ " print(f\"Downloading dataset files to '{folder_name}'... (This may take a few minutes)\")\n",
39
+ " try:\n",
40
+ " local_folder = snapshot_download(\n",
41
+ " repo_id=\"AI-Growth-Lab/patents_claims_1.5m_traim_test\", \n",
42
+ " repo_type=\"dataset\",\n",
43
+ " local_dir=folder_name,\n",
44
+ " ignore_patterns=[\"*.git*\"]\n",
45
+ " )\n",
46
+ " print(f\"Success! Files downloaded to: {local_folder}\")\n",
47
+ " except Exception as e:\n",
48
+ " print(f\"Download failed: {e}\")"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 27,
54
+ "id": "2a1e5f1b",
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Found existing processed file: patents_50k_green.parquet\n",
62
+ " Skipping filtering and merging to save time.\n"
63
+ ]
64
+ }
65
+ ],
66
+ "source": [
67
+ "from datasets import load_dataset, concatenate_datasets, disable_progress_bar\n",
68
+ "import pandas as pd\n",
69
+ "import datasets\n",
70
+ "\n",
71
+ "# Silence Hugging Face logs for cleaner output\n",
72
+ "disable_progress_bar()\n",
73
+ "datasets.utils.logging.set_verbosity_error()\n",
74
+ "\n",
75
+ "output_filename = \"patents_50k_green.parquet\"\n",
76
+ "\n",
77
+ "# Check if we already did this work\n",
78
+ "if os.path.exists(output_filename):\n",
79
+ " print(f\"Found existing processed file: {output_filename}\")\n",
80
+ " print(f\" Skipping filtering and merging to save time.\")\n",
81
+ "else:\n",
82
+ " print(\"1. Loading dataset from local cache...\")\n",
83
+ " # Point to the local folder\n",
84
+ " dataset_full = load_dataset(\"./patents_data_raw\", split=\"train\")\n",
85
+ "\n",
86
+ " print(f\" Dataset loaded. Total rows: {len(dataset_full):,}\")\n",
87
+ "\n",
88
+ " # Identify Green Columns\n",
89
+ " all_cols = dataset_full.column_names\n",
90
+ " y02_cols = [c for c in all_cols if c.startswith(\"Y02\")]\n",
91
+ " print(f\" Found {len(y02_cols)} Green Patent (Y02) indicator columns.\")\n",
92
+ "\n",
93
+ " # Filtering Logic\n",
94
+ " print(\"2. Filtering for 25,000 Green patents...\")\n",
95
+ " dataset_green = dataset_full.filter(\n",
96
+ " lambda x: any(x[col] == 1 for col in y02_cols),\n",
97
+ " num_proc=1\n",
98
+ " ).shuffle(seed=42).select(range(25000))\n",
99
+ "\n",
100
+ " print(\"3. Filtering for 25,000 Non-Green patents...\")\n",
101
+ " dataset_not_green = dataset_full.filter(\n",
102
+ " lambda x: all(x[col] == 0 for col in y02_cols),\n",
103
+ " num_proc=1\n",
104
+ " ).shuffle(seed=42).select(range(25000))\n",
105
+ "\n",
106
+ " # 4. Add \"is_green_silver\" Labels\n",
107
+ " print(\"4. Adding silver labels (0/1)...\")\n",
108
+ " dataset_green = dataset_green.map(lambda x: {\"is_green_silver\": 1})\n",
109
+ " dataset_not_green = dataset_not_green.map(lambda x: {\"is_green_silver\": 0})\n",
110
+ "\n",
111
+ " # 5. Combine and Save\n",
112
+ " print(\"5. Merging and saving final Parquet...\")\n",
113
+ " final_dataset = concatenate_datasets([dataset_green, dataset_not_green]).shuffle(seed=42)\n",
114
+ " final_dataset.to_parquet(output_filename)\n",
115
+ "\n",
116
+ " print(f\"Success! File saved: {output_filename}\")\n",
117
+ " print(f\"Total Balanced Rows: {len(final_dataset):,}\")"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 28,
123
+ "id": "784cf7cb",
124
+ "metadata": {},
125
+ "outputs": [
126
+ {
127
+ "name": "stdout",
128
+ "output_type": "stream",
129
+ "text": [
130
+ "Loading patents_50k_green.parquet...\n",
131
+ " Data Setup Complete\n",
132
+ " - train_silver: 2000 rows\n",
133
+ " - eval_silver: 5000 rows\n",
134
+ " - pool_unlabeled: 43000 rows\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "from sklearn.model_selection import train_test_split\n",
140
+ "\n",
141
+ "# 1. Loading the balanced 50k file\n",
142
+ "print(\"Loading patents_50k_green.parquet...\")\n",
143
+ "df = pd.read_parquet(\"patents_50k_green.parquet\")\n",
144
+ "\n",
145
+ "# Creating the Splits\n",
146
+ "# - train_silver: Small initial labeled set to train the baseline (e.g., 2,000 - 5,000 rows)\n",
147
+ "# - eval_silver: Validation set to test performance (e.g., 5,000 rows)\n",
148
+ "# - pool_unlabeled: The rest, which you will \"mine\" for high-risk examples.\n",
149
+ "\n",
150
+ "# Reserve 5,000 for evaluation\n",
151
+ "df_eval = df.sample(n=5000, random_state=42)\n",
152
+ "df_remaining = df.drop(df_eval.index)\n",
153
+ "\n",
154
+ "# Reserve 2,000 for the initial \"train_silver\"\n",
155
+ "df_train_silver = df_remaining.sample(n=2000, random_state=42)\n",
156
+ "df_pool_unlabeled = df_remaining.drop(df_train_silver.index)\n",
157
+ "\n",
158
+ "print(\" Data Setup Complete\")\n",
159
+ "print(f\" - train_silver: {len(df_train_silver)} rows\")\n",
160
+ "print(f\" - eval_silver: {len(df_eval)} rows\")\n",
161
+ "print(f\" - pool_unlabeled: {len(df_pool_unlabeled)} rows\")"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 29,
167
+ "id": "d56051b7",
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "Starting Part A: Baseline Model Training...\n",
175
+ " - Loading 50k dataset...\n",
176
+ " - Splits created: Train=2000, Eval=5000, Pool=43000\n",
177
+ " - Loading PatentSBERTa model...\n",
178
+ " - Generating Training embeddings...\n"
179
+ ]
180
+ },
181
+ {
182
+ "data": {
183
+ "application/vnd.jupyter.widget-view+json": {
184
+ "model_id": "2c18d825d370452ca1eadd3788063e0c",
185
+ "version_major": 2,
186
+ "version_minor": 0
187
+ },
188
+ "text/plain": [
189
+ " Encoding: 0%| | 0/63 [00:00<?, ?it/s]"
190
+ ]
191
+ },
192
+ "metadata": {},
193
+ "output_type": "display_data"
194
+ },
195
+ {
196
+ "name": "stdout",
197
+ "output_type": "stream",
198
+ "text": [
199
+ " - Generating Evaluation embeddings...\n"
200
+ ]
201
+ },
202
+ {
203
+ "data": {
204
+ "application/vnd.jupyter.widget-view+json": {
205
+ "model_id": "fd3be50181a348c8b8e8688ae3568157",
206
+ "version_major": 2,
207
+ "version_minor": 0
208
+ },
209
+ "text/plain": [
210
+ " Encoding: 0%| | 0/157 [00:00<?, ?it/s]"
211
+ ]
212
+ },
213
+ "metadata": {},
214
+ "output_type": "display_data"
215
+ },
216
+ {
217
+ "name": "stdout",
218
+ "output_type": "stream",
219
+ "text": [
220
+ " - Training Logistic Regression...\n",
221
+ "\n",
222
+ "=============================================\n",
223
+ "PART A RESULTS: BASELINE MODEL\n",
224
+ "=============================================\n",
225
+ " precision recall f1-score support\n",
226
+ "\n",
227
+ " Not Green 0.74 0.76 0.75 2493\n",
228
+ " Green 0.75 0.74 0.75 2507\n",
229
+ "\n",
230
+ " accuracy 0.75 5000\n",
231
+ " macro avg 0.75 0.75 0.75 5000\n",
232
+ "weighted avg 0.75 0.75 0.75 5000\n",
233
+ "\n",
234
+ "---------------------------------------------\n",
235
+ "Part A Baseline F1-Score: 0.7488\n",
236
+ "=============================================\n"
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "import numpy as np\n",
242
+ "import torch\n",
243
+ "from transformers import AutoTokenizer, AutoModel, logging\n",
244
+ "from sklearn.linear_model import LogisticRegression\n",
245
+ "from sklearn.metrics import classification_report\n",
246
+ "from tqdm.auto import tqdm\n",
247
+ "\n",
248
+ "# Silence logs\n",
249
+ "logging.set_verbosity_error()\n",
250
+ "\n",
251
+ "print(\"Starting Part A: Baseline Model Training...\")\n",
252
+ "\n",
253
+ "# Check for the source file\n",
254
+ "parquet_file = \"patents_50k_green.parquet\"\n",
255
+ "if not os.path.exists(parquet_file):\n",
256
+ " print(f\"Error: {parquet_file} not found. Please run the Filtering script first.\")\n",
257
+ "else:\n",
258
+ " # Load Data & Create Splits\n",
259
+ " print(\" - Loading 50k dataset...\")\n",
260
+ " df = pd.read_parquet(parquet_file)\n",
261
+ "\n",
262
+ " # Creating the Splits (train_silver, eval_silver, pool_unlabeled)\n",
263
+ " df_eval = df.sample(n=5000, random_state=42)\n",
264
+ " df_remaining = df.drop(df_eval.index)\n",
265
+ " df_train = df_remaining.sample(n=2000, random_state=42)\n",
266
+ " df_pool = df_remaining.drop(df_train.index)\n",
267
+ "\n",
268
+ " print(f\" - Splits created: Train={len(df_train)}, Eval={len(df_eval)}, Pool={len(df_pool)}\")\n",
269
+ "\n",
270
+ " # Load PatentSBERTa\n",
271
+ " print(\" - Loading PatentSBERTa model...\")\n",
272
+ " model_name = \"AI-Growth-Lab/PatentSBERTa\"\n",
273
+ " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
274
+ " model = AutoModel.from_pretrained(model_name)\n",
275
+ "\n",
276
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
277
+ " model.to(device); model.eval()\n",
278
+ "\n",
279
+ " # Helper function with clean progress tracking\n",
280
+ " def get_embeddings(text_list, batch_size=32):\n",
281
+ " all_embeddings = []\n",
282
+ " # We keep the tqdm bar small and clean\n",
283
+ " for i in tqdm(range(0, len(text_list), batch_size), desc=\" Encoding\", leave=False):\n",
284
+ " batch_texts = text_list[i:i+batch_size]\n",
285
+ " inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors=\"pt\").to(device)\n",
286
+ " with torch.no_grad():\n",
287
+ " outputs = model(**inputs)\n",
288
+ " embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()\n",
289
+ " all_embeddings.append(embeddings)\n",
290
+ " return np.vstack(all_embeddings)\n",
291
+ "\n",
292
+ " # Generate Embeddings\n",
293
+ " print(\" - Generating Training embeddings...\")\n",
294
+ " X_train = get_embeddings(df_train['text'].tolist())\n",
295
+ " y_train = df_train['is_green_silver'].values\n",
296
+ "\n",
297
+ " print(\" - Generating Evaluation embeddings...\")\n",
298
+ " X_eval = get_embeddings(df_eval['text'].tolist())\n",
299
+ " y_eval = df_eval['is_green_silver'].values\n",
300
+ "\n",
301
+ " # Train Baseline Classifier\n",
302
+ " print(\" - Training Logistic Regression...\")\n",
303
+ " clf = LogisticRegression(max_iter=1000, random_state=42)\n",
304
+ " clf.fit(X_train, y_train)\n",
305
+ "\n",
306
+ " # FINAL REPORT OUTPUT\n",
307
+ " print(\"\\n\" + \"=\"*45)\n",
308
+ " print(\"PART A RESULTS: BASELINE MODEL\")\n",
309
+ " print(\"=\"*45)\n",
310
+ " y_pred = clf.predict(X_eval)\n",
311
+ " report = classification_report(y_eval, y_pred, target_names=['Not Green', 'Green'])\n",
312
+ " print(report)\n",
313
+ " \n",
314
+ " # Store Macro F1 for Part D\n",
315
+ " report_dict = classification_report(y_eval, y_pred, output_dict=True)\n",
316
+ " macro_f1 = report_dict['macro avg']['f1-score']\n",
317
+ " \n",
318
+ " print(\"-\" * 45)\n",
319
+ " print(f\"Part A Baseline F1-Score: {macro_f1:.4f}\")\n",
320
+ " print(\"=\"*45)"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "markdown",
325
+ "id": "99b2f0f6",
326
+ "metadata": {},
327
+ "source": [
328
+ "# Part B"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": null,
334
+ "id": "9a46f788",
335
+ "metadata": {},
336
+ "outputs": [
337
+ {
338
+ "name": "stdout",
339
+ "output_type": "stream",
340
+ "text": [
341
+ "--- Starting Part B: Safe Reproduction of Outputs ---\n",
342
+ "Generating baseline training embeddings...\n"
343
+ ]
344
+ },
345
+ {
346
+ "data": {
347
+ "application/vnd.jupyter.widget-view+json": {
348
+ "model_id": "2fc0f419d2aa404ab6e86ca8e20ce7d0",
349
+ "version_major": 2,
350
+ "version_minor": 0
351
+ },
352
+ "text/plain": [
353
+ "Encoding: 0%| | 0/63 [00:00<?, ?it/s]"
354
+ ]
355
+ },
356
+ "metadata": {},
357
+ "output_type": "display_data"
358
+ },
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "Generating embeddings for 43000 unlabeled examples...\n"
364
+ ]
365
+ },
366
+ {
367
+ "data": {
368
+ "application/vnd.jupyter.widget-view+json": {
369
+ "model_id": "0d80d393db8542b8af12f15c1d92fddf",
370
+ "version_major": 2,
371
+ "version_minor": 0
372
+ },
373
+ "text/plain": [
374
+ "Encoding: 0%| | 0/1344 [00:00<?, ?it/s]"
375
+ ]
376
+ },
377
+ "metadata": {},
378
+ "output_type": "display_data"
379
+ },
380
+ {
381
+ "name": "stdout",
382
+ "output_type": "stream",
383
+ "text": [
384
+ "Calculating uncertainty scores...\n",
385
+ "\n",
386
+ "========================================\n",
387
+ "✅ Part B Complete! Outputs successfully shown.\n",
388
+ "File saved to: hitl_green_100_REPRODUCED.csv\n",
389
+ " - Min Uncertainty: 0.9959\n",
390
+ " - Max Uncertainty: 1.0000\n",
391
+ "========================================\n"
392
+ ]
393
+ }
394
+ ],
395
+ "source": [
396
+ "print(\"--- Starting Part B: Safe Reproduction of Outputs ---\")\n",
397
+ "\n",
398
+ "# Re-initialize Data & Model\n",
399
+ "df = pd.read_parquet(\"patents_50k_green.parquet\")\n",
400
+ "df_eval = df.sample(n=5000, random_state=42)\n",
401
+ "df_remaining = df.drop(df_eval.index)\n",
402
+ "df_train = df_remaining.sample(n=2000, random_state=42)\n",
403
+ "df_pool = df_remaining.drop(df_train.index) #unlabeled pool\n",
404
+ "\n",
405
+ "# Re-initialize PatentSBERTa\n",
406
+ "model_name = \"AI-Growth-Lab/PatentSBERTa\"\n",
407
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
408
+ "model = AutoModel.from_pretrained(model_name)\n",
409
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
410
+ "model.to(device); model.eval()\n",
411
+ "\n",
412
+ "def get_embeddings(text_list, batch_size=32):\n",
413
+ " all_embeddings = []\n",
414
+ " for i in tqdm(range(0, len(text_list), batch_size), desc=\"Encoding\"):\n",
415
+ " batch_texts = text_list[i:i+batch_size]\n",
416
+ " inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors=\"pt\").to(device)\n",
417
+ " with torch.no_grad():\n",
418
+ " outputs = model(**inputs)\n",
419
+ " all_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())\n",
420
+ " return np.vstack(all_embeddings)\n",
421
+ "\n",
422
+ "# Re-train the Baseline Classifier\n",
423
+ "print(\"Generating baseline training embeddings...\")\n",
424
+ "X_train = get_embeddings(df_train['text'].tolist())\n",
425
+ "clf = LogisticRegression(max_iter=1000, random_state=42)\n",
426
+ "clf.fit(X_train, df_train['is_green_silver'].values)\n",
427
+ "\n",
428
+ "# Generate Embeddings for the Unlabeled Pool\n",
429
+ "print(f\"Generating embeddings for {len(df_pool)} unlabeled examples...\")\n",
430
+ "X_pool = get_embeddings(df_pool['text'].tolist())\n",
431
+ "\n",
432
+ "# Predict Probabilities and Uncertainty\n",
433
+ "print(\"Calculating uncertainty scores...\")\n",
434
+ "probs = clf.predict_proba(X_pool)[:, 1]\n",
435
+ "uncertainty_scores = 1 - 2 * np.abs(probs - 0.5)\n",
436
+ "\n",
437
+ "df_pool['p_green'] = probs\n",
438
+ "df_pool['u'] = uncertainty_scores\n",
439
+ "\n",
440
+ "# Select Top 100\n",
441
+ "df_high_risk = df_pool.sort_values(by='u', ascending=False).head(100)\n",
442
+ "\n",
443
+ "# Format for Export (Changed filename to be safe)\n",
444
+ "if 'id' in df_high_risk.columns:\n",
445
+ " df_high_risk = df_high_risk.rename(columns={'id': 'doc_id'})\n",
446
+ "else:\n",
447
+ " df_high_risk['doc_id'] = df_high_risk.index\n",
448
+ "\n",
449
+ "for col in ['llm_green_suggested', 'llm_confidence', 'llm_rationale', 'is_green_human', 'notes']:\n",
450
+ " df_high_risk[col] = \"\"\n",
451
+ "\n",
452
+ "final_columns = ['doc_id', 'text', 'p_green', 'u', 'llm_green_suggested', 'llm_confidence', 'llm_rationale', 'is_green_human', 'notes']\n",
453
+ "\n",
454
+ "# Due to a kernel restart, this cell writes a second file that is not used in the next steps. It is named \"hitl_green_100_REPRODUCED.csv\" to indicate that it is a reproduction of the original \"hitl_green_100.csv\" file, and the different name prevents accidentally overwriting (or being confused with) the original file that may have been generated before the kernel restart.\n",
455
+ "safe_filename = \"hitl_green_100_REPRODUCED.csv\"\n",
456
+ "df_high_risk[final_columns].to_csv(safe_filename, index=False)\n",
457
+ "\n",
458
+ "print(\"\\n\" + \"=\"*40)\n",
459
+ "print(f\"Part B Complete! Outputs successfully shown.\")\n",
460
+ "print(f\"File saved to: {safe_filename}\")\n",
461
+ "print(f\" - Min Uncertainty: {df_high_risk['u'].min():.4f}\")\n",
462
+ "print(f\" - Max Uncertainty: {df_high_risk['u'].max():.4f}\")\n",
463
+ "print(\"=\"*40)"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "id": "6317d3a4",
469
+ "metadata": {},
470
+ "source": [
471
+ "# Part C"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": null,
477
+ "id": "8a7f1e12",
478
+ "metadata": {},
479
+ "outputs": [
480
+ {
481
+ "name": "stdout",
482
+ "output_type": "stream",
483
+ "text": [
484
+ "--- Part C: HITL Labeling ---\n",
485
+ "Remaining rows: 68\n",
486
+ "-----------------------------\n",
487
+ "\n",
488
+ "[Row 33] (Uncertainty: 0.9984)\n",
489
+ "CLAIM: 1. A method of increasing light extraction from a light-emitting diode (LED) device comprising; forming a first n-doped layer on a carrier substrate; forming a Si forming a second n-doped layer on the Si forming an active layer configured to emit light on the second n-doped layer; forming a p-doped ...\n",
490
+ "------------------------------------------------------------\n",
491
+ "LLM Says: 0 (High) | Rationale: The claim concerns LED light‑emission enhancement, not climate change mitigation.\n",
492
+ "Saved.\n",
493
+ "\n",
494
+ "[Row 34] (Uncertainty: 0.9984)\n",
495
+ "CLAIM: 1. A method comprising: identifying valuation data comprising a plurality of estimated asset values corresponding to one or more of location information and property type; identifying a group of two or more characteristics, wherein for each respective real estate investment trust of a plurality of r...\n",
496
+ "------------------------------------------------------------\n",
497
+ "LLM Says: 0 (High) | Rationale: The claim concerns financial weighting of real estate trusts, not climate mitigation.\n",
498
+ "Saved.\n",
499
+ "\n",
500
+ "[Row 35] (Uncertainty: 0.9983)\n",
501
+ "CLAIM: 1. A construction machine, comprising: a lower travel body; an upper slewing body mounted on the lower travel body and having an engine compartment; an engine compartment cover which covers the engine compartment of the upper slewing body; an air filter which collects dust included in outside air ta...\n",
502
+ "------------------------------------------------------------\n",
503
+ "LLM Says: 0 (High) | Rationale: The claim describes an air filter system for a construction machine, which does not relate to green or climate change mitigation.\n",
504
+ "Saved.\n",
505
+ "\n",
506
+ "[Row 36] (Uncertainty: 0.9983)\n",
507
+ "CLAIM: 1. A semiconductor device including a plurality of operation circuits executing operation in synchronization with a clock signal comprising: a control unit for outputting first operation control information and second operation control information for controlling operation executed by the plurality ...\n",
508
+ "------------------------------------------------------------\n",
509
+ "LLM Says: 0 (High) | Rationale: The claim describes a generic semiconductor device architecture without any reference to environmental impact or climate change mitigation.\n",
510
+ "Saved.\n",
511
+ "\n",
512
+ "[Row 37] (Uncertainty: 0.9983)\n",
513
+ "CLAIM: 1. A method of forming a layer over a substrate, the method comprising: receiving data identifying a desired thickness of the layer; using a processor to generate instructions for a printing mechanism to deposit droplets of ink onto the substrate according to the data, the ink carrying material to f...\n",
514
+ "------------------------------------------------------------\n",
515
+ "LLM Says: 0 (High) | Rationale: The claim describes a printing method for depositing ink layers, unrelated to climate change mitigation.\n",
516
+ "Saved.\n",
517
+ "\n",
518
+ "[Row 38] (Uncertainty: 0.9982)\n",
519
+ "CLAIM: 1. An insulated-gate bipolar transistor (IGBT) in a semiconductor substrate, said IGBT comprising: a collector at a bottom surface of said semiconductor substrate, a drift region having a first conductivity type situated over said collector, and a base layer having a second conductivity type opposit...\n",
520
+ "------------------------------------------------------------\n",
521
+ "LLM Says: 0 (High) | Rationale: The claim describes a semiconductor device structure, not a technology for greenhouse gas mitigation.\n",
522
+ "Saved.\n",
523
+ "\n",
524
+ "[Row 39] (Uncertainty: 0.9981)\n",
525
+ "CLAIM: 1. A method of joining two or more articles via slender nanomaterials embedded in a joining medium and interlinked together, the method involving: (i) dispersion of nanomaterials comprising at least one of carbon nanotubes and nanofibers within a solvent, with the weight percent of said nanomaterial...\n",
526
+ "------------------------------------------------------------\n",
527
+ "LLM Says: 0 (High) | Rationale: The claim describes a nanomaterial-based joining method, which is unrelated to greenhouse‑gas reduction or climate change mitigation.\n",
528
+ "Saved.\n",
529
+ "\n",
530
+ "[Row 40] (Uncertainty: 0.9981)\n",
531
+ "CLAIM: 1. A compound having a structure represented by a chemical formula described below:...\n",
532
+ "------------------------------------------------------------\n",
533
+ "Asking LM Studio...\n",
534
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"Low\",\"rationale\":\"The claim merely describes a chemical compound without indicating any environmental or climate‑related application.\"}\n",
535
+ "LLM Failed (Check settings or Label Manually)\n",
536
+ "Saved.\n",
537
+ "\n",
538
+ "[Row 41] (Uncertainty: 0.9981)\n",
539
+ "CLAIM: 1. A computer-implemented process comprising: executing, by a computer processor, at least two read threads to read a block of data from a database, each of the read threads having a first wait stat and a second wait stat, the read threads configured to compress data using a dynamic compression rati...\n",
540
+ "------------------------------------------------------------\n",
541
+ "LLM Says: 0 (High) | Rationale: The claim describes a data compression and backup process, unrelated to climate change mitigation.\n",
542
+ "Saved.\n",
543
+ "\n",
544
+ "[Row 42] (Uncertainty: 0.9980)\n",
545
+ "CLAIM: 1. A rotor for a Wankel engine comprising: two axially spaced apart end faces having a generally triangular profile with outwardly arched sides and three circumferentially spaced apex portions; a peripheral face extending between the end faces and defining three flanks, each flank extending between ...\n",
546
+ "------------------------------------------------------------\n",
547
+ "LLM Says: 0 (High) | Rationale: The claim describes a mechanical component for a Wankel engine and does not relate to greenhouse gas mitigation or climate change technologies.\n",
548
+ "Saved.\n",
549
+ "\n",
550
+ "[Row 43] (Uncertainty: 0.9980)\n",
551
+ "CLAIM: 1. A housing apparatus, comprising: a housing casing which surrounds a first cavity and which has multiple side surfaces; a volute housing arranged in an interior of the housing casing, said volute housing having a central through opening for accommodating a compressor wheel of a rotor and for suppl...\n",
552
+ "------------------------------------------------------------\n",
553
+ "LLM Says: 0 (Medium) | Rationale: The claim describes a housing apparatus for a compressor wheel, which is unrelated to green or climate change mitigation technologies.\n",
554
+ "Saved.\n",
555
+ "\n",
556
+ "[Row 44] (Uncertainty: 0.9979)\n",
557
+ "CLAIM: 1. A biodegradable container for a semi-solid composition, comprising: a tube portion comprising a first paper that defines first, second, and third plies forming an open end and a closed end, and a lumen containing the semi-solid composition, wherein the tube portion further comprises a continuous ...\n",
558
+ "------------------------------------------------------------\n",
559
+ "LLM Says: 1 (High) | Rationale: The claim describes a fully biodegradable container, indicating an environmental benefit aligned with Green/Climate Change mitigation.\n",
560
+ "Saved.\n",
561
+ "\n",
562
+ "[Row 45] (Uncertainty: 0.9979)\n",
563
+ "CLAIM: 1. An isolated green sulfur bacterium Chlorobaculum limnaeum strain RK-j-1 deposited at National Institute of Technology and Evaluation Patent Microorganisms Depositary (NPMD) as accession number NITE BP-1202....\n",
564
+ "------------------------------------------------------------\n",
565
+ "LLM Says: 0 (Medium) | Rationale: The claim merely describes isolation of a microorganism without any stated application to climate change mitigation.\n",
566
+ "Saved.\n",
567
+ "\n",
568
+ "[Row 46] (Uncertainty: 0.9979)\n",
569
+ "CLAIM: 1. A method for communicating over allocated resources, comprising: receiving a resource allocation comprising a portion of a resource block over a plurality of bundled transmission time intervals, wherein the portion of the resource block comprises a subset of subcarriers in the resource block with...\n",
570
+ "------------------------------------------------------------\n",
571
+ "LLM Says: 0 (High) | Rationale: The claim describes a telecommunications resource allocation method, unrelated to Green/Climate Change mitigation.\n",
572
+ "Saved.\n",
573
+ "\n",
574
+ "[Row 47] (Uncertainty: 0.9979)\n",
575
+ "CLAIM: 1. A switch system, comprising: a plurality of nodes, wherein each node includes a computational processor and an embedded switch; a plurality of links associated with each node, wherein the plurality of links are configured to connect nodes in the plurality of nodes to create a topology of a switch...\n",
576
+ "------------------------------------------------------------\n",
577
+ "LLM Says: 0 (High) | Rationale: The claim describes a network switch architecture, unrelated to climate change mitigation.\n",
578
+ "Saved.\n",
579
+ "\n",
580
+ "[Row 48] (Uncertainty: 0.9977)\n",
581
+ "CLAIM: 1. A seed of soybean cultivar S100323, wherein a representative of sample seed of said cultivar is deposited under ATCC Accession No. PTA-12317....\n",
582
+ "------------------------------------------------------------\n",
583
+ "LLM Says: 0 (High) | Rationale: The claim concerns a soybean seed deposit, not climate‑change mitigation.\n",
584
+ "Saved.\n",
585
+ "\n",
586
+ "[Row 49] (Uncertainty: 0.9976)\n",
587
+ "CLAIM: 1. A method to accelerate particles into a chamber, comprising: distributing a fluidic substance between electrodes configured at a location proximate a chamber, the electrodes comprising a low work function material; generating a current of ionized particles by applying an electric field between th...\n",
588
+ "------------------------------------------------------------\n",
589
+ "Asking LM Studio...\n",
590
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim describes a particle acceleration method unrelated to greenhouse gas reduction.\"}\n",
591
+ "LLM Failed (Check settings or Label Manually)\n",
592
+ "Saved.\n",
593
+ "\n",
594
+ "[Row 50] (Uncertainty: 0.9976)\n",
595
+ "CLAIM: 1. A biogenic flocculant composition for CEPT sludge conditioning comprising a) a first flocculant component which comprises at least one acidophilic auto-trophic iron-oxidizing bacterium and at least one species of acid tolerant organotrophic microbes which are grown in medium containing iron (II) ...\n",
596
+ "------------------------------------------------------------\n",
597
+ "LLM Says: 0 (Medium) | Rationale: The claim focuses on sludge conditioning using microbial flocculants, which is a wastewater treatment application rather than a direct climate‑change mitigation technology.\n",
598
+ "Saved.\n",
599
+ "\n",
600
+ "[Row 51] (Uncertainty: 0.9975)\n",
601
+ "CLAIM: 1. A nuclear reactor comprising: an elongated reactor vessel enclosed at a lower end and having an open upper end on which an annular flange is formed and a central axis extending, along an elongated dimension; a reactor vessel head having an annular portion on an underside of the bead that is machi...\n",
602
+ "------------------------------------------------------------\n",
603
+ "LLM Says: 0 (High) | Rationale: The claim describes a nuclear reactor component, not a climate‑change mitigation technology.\n",
604
+ "Saved.\n",
605
+ "\n",
606
+ "[Row 52] (Uncertainty: 0.9975)\n",
607
+ "CLAIM: 1. A steam reforming system comprising: a) a kiln, comprising a susceptor tube; a kiln inlet for receiving a feedstock; a conveyor for transporting said feedstock through said kiln; b) a steam reforming reactor comprising a reformer tube; a reactor inlet in fluid communication with said first kiln o...\n",
608
+ "------------------------------------------------------------\n",
609
+ "LLM Says: 0 (Medium) | Rationale: The claim describes a steam reforming system for gas production, which is a general chemical process and does not explicitly address greenhouse gas reduction or climate change mitigation.\n",
610
+ "Saved.\n",
611
+ "\n",
612
+ "[Row 53] (Uncertainty: 0.9975)\n",
613
+ "CLAIM: 1. A pest trap reporting system, comprising: a plurality of pest traps, wherein each pest trap encloses, retains or kills one or more non-human pests; a pest report database that includes pest activity information for the plurality of pest traps; a plurality of sensors, each of the plurality of sens...\n",
614
+ "------------------------------------------------------------\n",
615
+ "LLM Says: 0 (High) | Rationale: The claim describes a pest monitoring system, not related to greenhouse gas mitigation.\n",
616
+ "Saved.\n",
617
+ "\n",
618
+ "[Row 54] (Uncertainty: 0.9974)\n",
619
+ "CLAIM: 1. A motor vehicle comprising: a body; a wheel rotatably supported on the body; an occupant riding portion supported by the body for tilting relative to the body and mounted with an occupant; occupant attitude detection means for detecting an attitude of the occupant riding portion; body attitude de...\n",
620
+ "------------------------------------------------------------\n",
621
+ "LLM Says: 0 (High) | Rationale: The claim describes a vehicle tilt-control system unrelated to greenhouse‑gas reduction or climate mitigation.\n",
622
+ "Saved.\n",
623
+ "\n",
624
+ "[Row 55] (Uncertainty: 0.9973)\n",
625
+ "CLAIM: 1. A compound having Formula (III) or a therapeutically acceptable salt thereof, wherein...\n",
626
+ "------------------------------------------------------------\n",
627
+ "Asking LM Studio...\n",
628
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim describes a chemical compound for therapeutic use, not related to green or climate change mitigation.\"}\n",
629
+ "LLM Failed (Check settings or Label Manually)\n",
630
+ "Saved.\n",
631
+ "\n",
632
+ "[Row 56] (Uncertainty: 0.9973)\n",
633
+ "CLAIM: 1. A method for fabricating a semiconductor device, comprising: forming a conductive layer over first and second regions of a semiconductor substrate; forming a trench extended in the first region of the semiconductor substrate through the conductive layer; forming a first gate electrode in the tren...\n",
634
+ "------------------------------------------------------------\n",
635
+ "LLM Says: 0 (High) | Rationale: The claim describes semiconductor fabrication steps, unrelated to Green/Climate Change mitigation.\n",
636
+ "Saved.\n",
637
+ "\n",
638
+ "[Row 57] (Uncertainty: 0.9972)\n",
639
+ "CLAIM: 1. A method for installation of an offshore wind turbine, characterized in comprising the steps of: prefabrication of a foundation, including: fabricating the foundation which includes a plurality of tanks providing buoyant force and uprighting force to the foundation so as to keep the foundation up...\n",
640
+ "------------------------------------------------------------\n",
641
+ "LLM Says: 1 (High) | Rationale: The claim describes a method for installing an offshore wind turbine, which is a renewable energy technology that mitigates climate change.\n",
642
+ "Saved.\n",
643
+ "\n",
644
+ "[Row 58] (Uncertainty: 0.9972)\n",
645
+ "CLAIM: 1. An airfoil comprising: an airfoil body made of a first material with a leading edge, trailing edge, pressure side and suction side; a sheath with first and second flanks made of a second material; a first shim disposed between a portion of an end of the first flank and the airfoil body and extend...\n",
646
+ "------------------------------------------------------------\n",
647
+ "LLM Says: 0 (Medium) | Rationale: The claim describes structural components of an airfoil without reference to energy efficiency, emissions reduction, or other climate‑change mitigation measures.\n",
648
+ "Saved.\n",
649
+ "\n",
650
+ "[Row 59] (Uncertainty: 0.9972)\n",
651
+ "CLAIM: 1. An electric storage system comprising: a plurality of electric storage blocks connected in series, each of the plurality of electric storage blocks including a plurality of electric storage elements connected in parallel; a plurality of current breakers, each of the plurality of current breakers ...\n",
652
+ "------------------------------------------------------------\n",
653
+ "LLM Says: 0 (Medium) | Rationale: The claim describes a battery management system, not a direct green or climate change mitigation technology.\n",
654
+ "Saved.\n",
655
+ "\n",
656
+ "[Row 60] (Uncertainty: 0.9972)\n",
657
+ "CLAIM: 1. A method for performing operations on a stainer in a stainer network comprising: providing a robotic arm coupled to the stainer, the robotic arm having a reagent dispenser; establishing a network connection between a computer and a stainer in the stainer network; sending requests from the compute...\n",
658
+ "------------------------------------------------------------\n",
659
+ "LLM Says: 0 (High) | Rationale: The claim describes laboratory automation for sample processing, not a technology related to greenhouse gas mitigation or climate change.\n",
660
+ "Saved.\n",
661
+ "\n",
662
+ "[Row 61] (Uncertainty: 0.9971)\n",
663
+ "CLAIM: 1. A Group III nitride semiconductor light-emitting device, comprising: a conductive support; a p-electrode disposed on the support; a semiconductor layer disposed on the p-electrode, the semiconductor layer comprising at least a p-layer, a light-emitting layer, and an n-layer disposed in this order...\n",
664
+ "------------------------------------------------------------\n",
665
+ "LLM Says: 0 (High) | Rationale: The claim describes a semiconductor light-emitting device, which is unrelated to green or climate change mitigation.\n",
666
+ "Saved.\n",
667
+ "\n",
668
+ "[Row 62] (Uncertainty: 0.9971)\n",
669
+ "CLAIM: 1. A plate heat exchanger in a sealed design, with: a stacked arrangement comprising: a front-side and a rear-side end plate, wherein at least one end plate is constituted as a connection plate having at least one connection, heat exchanger plates which are arranged and stacked between the front-sid...\n",
670
+ "------------------------------------------------------------\n",
671
+ "LLM Says: 0 (High) | Rationale: The claim describes a mechanical design for a plate heat exchanger and does not address energy efficiency or climate‑change mitigation technologies.\n",
672
+ "Saved.\n",
673
+ "\n",
674
+ "[Row 63] (Uncertainty: 0.9971)\n",
675
+ "CLAIM: 1. A vehicle braking/driving force control system comprising: a braking/driving force generating mechanism that causes each wheel of a vehicle to generate driving force or braking force independently of one another; a suspension mechanism that couples each of the wheels that are not supported by spr...\n",
676
+ "------------------------------------------------------------\n",
677
+ "LLM Says: 0 (High) | Rationale: The claim describes a vehicle braking and driving force control system, which is unrelated to greenhouse gas mitigation or climate change technologies.\n",
678
+ "Saved.\n",
679
+ "\n",
680
+ "[Row 64] (Uncertainty: 0.9970)\n",
681
+ "CLAIM: 1. A detector apparatus configured to receive light and generate electrical signals, the detector apparatus comprising: a light sensor having a light incidence side, the light sensor including at least one photocathode; a cooling component, the cooling component being in direct contact, on the light...\n",
682
+ "------------------------------------------------------------\n",
683
+ "LLM Says: 0 (High) | Rationale: The claim describes a light detection device, not related to climate change mitigation.\n",
684
+ "Saved.\n",
685
+ "\n",
686
+ "[Row 65] (Uncertainty: 0.9970)\n",
687
+ "CLAIM: 1. A flexible display device comprising: a display panel configured to generate an image; and a window member on the display panel, the window member comprising: wherein a width of each of the second parts is smaller than a width of the first part at a bending area....\n",
688
+ "------------------------------------------------------------\n",
689
+ "Asking LM Studio...\n",
690
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim describes a flexible display device, which is unrelated to Green/Climate Change mitigation.\"}\n",
691
+ "LLM Failed (Check settings or Label Manually)\n",
692
+ "Saved.\n",
693
+ "\n",
694
+ "[Row 66] (Uncertainty: 0.9969)\n",
695
+ "CLAIM: 1. A conductive film comprising: a substrate; a transparent electrode layer provided on the substrate; and a conductive pattern layer provided on the transparent electrode layer, wherein the conductive pattern layer includes a metal nitride pattern layer including CuNx, x is a mass ratio of N with r...\n",
696
+ "------------------------------------------------------------\n",
697
+ "LLM Says: 0 (High) | Rationale: The claim describes a conductive film for electronic applications, with no indication of greenhouse gas reduction or climate mitigation.\n",
698
+ "Saved.\n",
699
+ "\n",
700
+ "[Row 67] (Uncertainty: 0.9969)\n",
701
+ "CLAIM: 1. A hydrolysable linker selected from a compound of formula V, VI, VII, and VIII: wherein: R′ and R″ are each independently a C each a is independently an integer from 0 to 6; each b is independently an integer from 1 to 6; each X is independently: each X each Y is independently: each Y each m, n, ...\n",
702
+ "------------------------------------------------------------\n",
703
+ "LLM Says: 0 (Low) | Rationale: The claim describes a chemical linker, with no indication of climate‑change mitigation.\n",
704
+ "Saved.\n",
705
+ "\n",
706
+ "[Row 68] (Uncertainty: 0.9968)\n",
707
+ "CLAIM: 1. A substrate bearing a stack of layers as the back contact in a molybdenum photovoltaic device, said back contact comprising in order from the substrate: a barrier layer comprising at least one of: Si a primer layer; a layer of ZnO; and a layer of molybdenum, wherein the molybdenum is deposited di...\n",
708
+ "------------------------------------------------------------\n",
709
+ "LLM Says: 1 (High) | Rationale: The claim describes a photovoltaic device structure, which is directly relevant to solar energy technology for climate change mitigation.\n",
710
+ "Saved.\n",
711
+ "\n",
712
+ "[Row 69] (Uncertainty: 0.9968)\n",
713
+ "CLAIM: 1. A system comprising: a foil including a leading inlet for fluid to enter, a forward chamber within the foil downstream of the leading inlet, a rearward chamber within the foil downstream of the forward chamber, and a constriction formed by the foil between the forward and rearward chambers; at le...\n",
714
+ "------------------------------------------------------------\n",
715
+ "LLM Says: 0 (High) | Rationale: The claim describes a fluid flow device without any explicit reference to environmental or climate‑change mitigation.\n",
716
+ "Saved.\n",
717
+ "\n",
718
+ "[Row 70] (Uncertainty: 0.9967)\n",
719
+ "CLAIM: 1. A method for passivating a surface of crystalline iron disulfide, comprising: sputtering iron disulfide to form a layer of crystalline iron disulfide on a substrate, wherein the layer has a surface comprising crystal surfaces; and depositing a capping layer of epitaxial zinc sulfide onto the surf...\n",
720
+ "------------------------------------------------------------\n",
721
+ "LLM Says: 0 (High) | Rationale: The claim concerns surface passivation of iron disulfide, not a technology for greenhouse gas mitigation or climate change.\n",
722
+ "Saved.\n",
723
+ "\n",
724
+ "[Row 71] (Uncertainty: 0.9967)\n",
725
+ "CLAIM: 1. A brake control apparatus, comprising: a frictional braking unit configured to generate frictional braking force by supplying operating fluid to a wheel cylinder provided on each wheel of a vehicle to press a frictional member against the wheel; a regenerative braking unit configured to generate ...\n",
726
+ "------------------------------------------------------------\n",
727
+ "LLM Says: 0 (High) | Rationale: The claim describes a brake control apparatus for vehicles, which is unrelated to Green/Climate Change mitigation.\n",
728
+ "Saved.\n",
729
+ "\n",
730
+ "[Row 72] (Uncertainty: 0.9966)\n",
731
+ "CLAIM: 1. A circuit for recording a magnitude of an electrostatic discharge (ESD) event during semiconductor assembly, the circuit comprising: a voltage divider connected between a first potential and a second potential, the voltage divider configured to provide a first node having a discrete voltage level...\n",
732
+ "------------------------------------------------------------\n",
733
+ "LLM Says: 0 (High) | Rationale: The claim describes an ESD recording circuit for semiconductor manufacturing, which does not pertain to greenhouse gas mitigation or climate change technologies.\n",
734
+ "Saved.\n",
735
+ "\n",
736
+ "[Row 73] (Uncertainty: 0.9966)\n",
737
+ "CLAIM: 1. A rectifier, comprising: a first rectification unit having an anode and a cathode, the anode being connected to a negative radio frequency (RF) port, and the cathode being connected to a positive direct current (DC) port; a second rectification unit having an anode and a cathode, the anode being ...\n",
738
+ "------------------------------------------------------------\n",
739
+ "LLM Says: 0 (High) | Rationale: The claim describes a rectifier circuit, unrelated to climate change mitigation.\n",
740
+ "Saved.\n",
741
+ "\n",
742
+ "[Row 74] (Uncertainty: 0.9966)\n",
743
+ "CLAIM: 1. An apparatus comprising: a first electronic device to communicate with a second electronic device, the first device comprising:...\n",
744
+ "------------------------------------------------------------\n",
745
+ "Asking LM Studio...\n",
746
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim describes a generic communication apparatus between electronic devices, with no reference to environmental or climate-related functions.\"}\n",
747
+ "LLM Failed (Check settings or Label Manually)\n",
748
+ "Saved.\n",
749
+ "\n",
750
+ "[Row 75] (Uncertainty: 0.9966)\n",
751
+ "CLAIM: 1. A process-based method of detecting a CO 2 gas leak in a deep geologic gas storage reservoir, the method comprising: constructing a gas sampling station in a vadose zone proximal to the deep geologic gas storage reservoir; measuring a CO measuring an O measuring a CH measuring a N determining a H...\n",
752
+ "------------------------------------------------------------\n",
753
+ "LLM Says: 1 (High) | Rationale: The method detects CO₂ leaks from a geological storage site, directly supporting carbon sequestration efforts.\n",
754
+ "Saved.\n",
755
+ "\n",
756
+ "[Row 76] (Uncertainty: 0.9965)\n",
757
+ "CLAIM: 1. A self-supporting reflector for a parabolic trough: (a) having a reflectance of at least 90%, based on the solar spectrum; (b) comprising at least one layer of a transparent plastic material facing a light source and having a layer thickness within a range of from 0.1 mm to 8 mm; and (c) at least...\n",
758
+ "------------------------------------------------------------\n",
759
+ "LLM Says: 1 (High) | Rationale: The claim describes a component for a parabolic trough solar collector, which is a renewable energy technology used to mitigate climate change.\n",
760
+ "Saved.\n",
761
+ "\n",
762
+ "[Row 77] (Uncertainty: 0.9965)\n",
763
+ "CLAIM: 1. A valve train system for an internal combustion engine having a combustion chamber with a piston which reciprocates therewithin between a top-dead-center position and a bottom-dead-center position, said valve train system comprising: an intake valve which moves between an intake closed position a...\n",
764
+ "------------------------------------------------------------\n",
765
+ "LLM Says: 0 (High) | Rationale: The claim describes a conventional valve train for an internal combustion engine, which is unrelated to Green/Climate Change mitigation.\n",
766
+ "Saved.\n",
767
+ "\n",
768
+ "[Row 78] (Uncertainty: 0.9965)\n",
769
+ "CLAIM: 1. An organic light-emitting diode (OLED) display, comprising: a first plastic layer; a first barrier layer formed over the first plastic layer; a first intermediate layer formed over the first barrier layer, wherein the first intermediate layer comprises amorphous silicon; a second plastic layer fo...\n",
770
+ "------------------------------------------------------------\n",
771
+ "LLM Says: 0 (High) | Rationale: The claim describes an OLED display structure, which does not relate to Green/Climate Change mitigation.\n",
772
+ "Saved.\n",
773
+ "\n",
774
+ "[Row 79] (Uncertainty: 0.9965)\n",
775
+ "CLAIM: 1. A wave activated power generating device, comprising: a support frame; a buoy vertically positioned to rise and fall relative to motion of waves impacting the buoy and the support frame, the buoy being formed with a hollow interior space; a rack and pinion structure operatively connected between ...\n",
776
+ "------------------------------------------------------------\n",
777
+ "LLM Says: 1 (High) | Rationale: The claim describes a wave‑powered generator that converts ocean wave motion into electricity, which is a renewable energy technology for climate change mitigation.\n",
778
+ "Saved.\n",
779
+ "\n",
780
+ "[Row 80] (Uncertainty: 0.9964)\n",
781
+ "CLAIM: 1. A method for reducing an amount of unwanted living organisms within an algae cultivation fluid, the algae cultivation fluid including wanted living algae of genus Nannochloropsis , the method comprising: subjecting the algae cultivation fluid, the algae cultivation fluid including the wanted livi...\n",
782
+ "------------------------------------------------------------\n",
783
+ "Asking LM Studio...\n",
784
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":1,\"confidence\":\"Medium\",\"rationale\":\"The\n",
785
+ "LLM Failed (Check settings or Label Manually)\n",
786
+ "Saved.\n",
787
+ "\n",
788
+ "[Row 81] (Uncertainty: 0.9964)\n",
789
+ "CLAIM: 1. A compound having Formula (I): wherein, R R b) —(CH R X is —O, NH or S; Y is a cleavable or non-cleavable linker group; and Z is an antigen derived from an infectious agent or a tumor antigen or a pharmaceutically acceptable salt thereof....\n",
790
+ "------------------------------------------------------------\n",
791
+ "Asking LM Studio...\n",
792
+ "LLM output invalid JSON: <|channel|>final <|constrain|>json<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim concerns a pharmaceutical compound for medical use, not related to climate change mitigation.\"}\n",
793
+ "LLM Failed (Check settings or Label Manually)\n",
794
+ "Saved.\n",
795
+ "\n",
796
+ "[Row 82] (Uncertainty: 0.9964)\n",
797
+ "CLAIM: 1. A memory controller comprising: driver circuitry to output a first timing signal to a memory device, the first timing signal to time transmission of a data signal from the memory device to the memory controller; control circuitry to enable oscillation of the first timing signal at a first frequen...\n",
798
+ "------------------------------------------------------------\n",
799
+ "LLM Says: 0 (High) | Rationale: The claim describes a memory controller’s timing and clock circuitry, unrelated to climate change mitigation.\n",
800
+ "Saved.\n",
801
+ "\n",
802
+ "[Row 83] (Uncertainty: 0.9964)\n",
803
+ "CLAIM: 1. A rubber composition, comprising, based on 100% by mass of a rubber component: 5 to 55% by mass of a copolymer (A) of an aromatic vinyl compound and a conjugated diene compound, the copolymer (A) having an aromatic vinyl compound content of 5-14% by mass and a vinyl bond content in the conjugated...\n",
804
+ "------------------------------------------------------------\n",
805
+ "LLM Says: 0 (High) | Rationale: The claim describes a rubber composition and silica filler, not a technology for greenhouse gas mitigation.\n",
806
+ "Saved.\n",
807
+ "\n",
808
+ "[Row 84] (Uncertainty: 0.9963)\n",
809
+ "CLAIM: 1. A method of preparing a metal suboxide, comprising: preparing a mixture including a metal suboxide precursor, an aromatic compound substituted with a hydroxy group, and a linking precursor including one selected from a C1 to C30 aldehyde, a C3 to C30 ketone, and a combination thereof; reacting th...\n",
810
+ "------------------------------------------------------------\n",
811
+ "LLM Says: 0 (Low) | Rationale: The claim describes a chemical synthesis method, not directly related to climate change mitigation.\n",
812
+ "Saved.\n",
813
+ "\n",
814
+ "[Row 85] (Uncertainty: 0.9963)\n",
815
+ "CLAIM: 1. A vehicle hydraulic control device including: an oil pump that is driven by a driving force source for wheels; and an oil passage that guides oil discharged from the oil pump to a rotating electrical machine that forms at least a part of the driving force source and a gear mechanism to which driv...\n",
816
+ "------------------------------------------------------------\n",
817
+ "LLM Says: 0 (Medium) | Rationale: The claim describes a hydraulic control system for a vehicle, which does not directly address greenhouse gas reduction or climate change mitigation.\n",
818
+ "Saved.\n",
819
+ "\n",
820
+ "[Row 86] (Uncertainty: 0.9963)\n",
821
+ "CLAIM: 1. A method for the production of a human or animal nutrition product comprising producing an adsorbate suitable for human or animal nutrition comprising applying a component to a carrier using at least one stabilizer such that the component is adsorbed to the carrier, wherein the carrier has a mean...\n",
822
+ "------------------------------------------------------------\n",
823
+ "LLM Says: 0 (High) | Rationale: The claim concerns a nutritional adsorbate production method, not a technology for greenhouse gas reduction or climate mitigation.\n",
824
+ "Saved.\n",
825
+ "\n",
826
+ "[Row 87] (Uncertainty: 0.9963)\n",
827
+ "CLAIM: 1. An automated driving system, comprising: one or more sensors disposed on an autonomous vehicle; and a computing device in communication with the one or more sensors, comprising:...\n",
828
+ "------------------------------------------------------------\n",
829
+ "LLM Says: 0 (Low) | Rationale: The claim describes a general automated driving system without explicit reference to environmental or climate mitigation.\n",
830
+ "Saved.\n",
831
+ "\n",
832
+ "[Row 88] (Uncertainty: 0.9963)\n",
833
+ "CLAIM: 1. A memory device, comprising: an array of memory cells, the memory cells in the array being programmable to at least two different charge levels; and a control logic unit coupled to the array of memory cells and configured to program the memory cells in each of a plurality of groups with a respect...\n",
834
+ "------------------------------------------------------------\n",
835
+ "LLM Says: 0 (High) | Rationale: The claim describes a memory device and programming logic, unrelated to climate change mitigation.\n",
836
+ "Saved.\n",
837
+ "\n",
838
+ "[Row 89] (Uncertainty: 0.9963)\n",
839
+ "CLAIM: 1. An adjustable solar panel mounting assembly comprising: a. a first clamp further comprising an upper and lower portion wherein the lower portion further comprises a cavity; b. a first mounting plate extending outward from the lower portion of the first clamp to an end; c. a first flange extending...\n",
840
+ "------------------------------------------------------------\n",
841
+ "LLM Says: 1 (High) | Rationale: The claim describes a solar panel mounting assembly, which supports renewable energy generation.\n",
842
+ "Saved.\n",
843
+ "\n",
844
+ "[Row 90] (Uncertainty: 0.9963)\n",
845
+ "CLAIM: 1. A method of planting or seeding multiple types of seed in a single planting pass during row-crop planting or seeding of an agricultural field with an agricultural implement, the method comprising: storing seeds of multiple types including at least a first type and a second type in multiple compar...\n",
846
+ "------------------------------------------------------------\n",
847
+ "LLM Says: 0 (Medium) | Rationale: The claim describes a multi‑seed planting method, which is an agricultural technique but does not directly address greenhouse gas reduction or climate change mitigation.\n",
848
+ "Saved.\n",
849
+ "\n",
850
+ "[Row 91] (Uncertainty: 0.9963)\n",
851
+ "CLAIM: 1. A method to update a cache in a multi-core processor, the method comprising: receiving a notification of a cache miss associated with a process or a thread running on a single core of the multi-core processor, the single core including: determining that an address associated with the cache miss c...\n",
852
+ "------------------------------------------------------------\n",
853
+ "LLM Says: 0 (High) | Rationale: The claim concerns processor cache management, unrelated to climate change mitigation.\n",
854
+ "Saved.\n",
855
+ "\n",
856
+ "[Row 92] (Uncertainty: 0.9962)\n",
857
+ "CLAIM: 1. A driving method of a liquid crystal display device comprising a liquid crystal element, the driving method comprising the steps of: applying a first voltage to the liquid crystal element in a first subframe period of a first frame period; making transmittance of the liquid crystal element at the...\n",
858
+ "------------------------------------------------------------\n",
859
+ "LLM Says: 0 (High) | Rationale: The claim describes a technical method for controlling liquid crystal display operation, with no reference to energy efficiency or climate‑change mitigation.\n",
860
+ "Saved.\n",
861
+ "\n",
862
+ "[Row 93] (Uncertainty: 0.9962)\n",
863
+ "CLAIM: 1. A compound of Formula IA: wherein X R R wherein each R each R where Z wherein the alkyl, alkenyl, alkynyl, cycloalkyl, aryl, heterocyclic, or heteroaryl groups of Z wherein Y indicates one or more optional double bonds; and n is 0, 1, 2, or 3; R each R wherein the alkyl, alkenyl, alkynyl, cycloal...\n",
864
+ "------------------------------------------------------------\n",
865
+ "LLM Says: 0 (High) | Rationale: The claim describes a generic chemical structure for a potential pharmaceutical compound with no reference to environmental or climate‑change mitigation.\n",
866
+ "Saved.\n",
867
+ "\n",
868
+ "[Row 94] (Uncertainty: 0.9962)\n",
869
+ "CLAIM: 1. A valve for a fuel cell comprising: a housing; a first pressure chamber and a second pressure chamber provided in the housing; two supply/discharge tubes connected to the housing, and supplying and discharging fluid to and from the first pressure chamber and the second pressure chamber, respectiv...\n",
870
+ "------------------------------------------------------------\n",
871
+ "LLM Says: 1 (Medium) | Rationale: The claim describes a valve for a fuel cell, which is a technology used in clean energy generation and thus relates to green/climate change mitigation.\n",
872
+ "Saved.\n",
873
+ "\n",
874
+ "[Row 95] (Uncertainty: 0.9962)\n",
875
+ "CLAIM: 1. A wing comprising: an airfoil section including a leading edge, a trailing edge, an upper surface and a lower surface, wherein a region within the airfoil section immediately adjacent the leading edge is ventilated via one or more vent openings which open in the upper surface to establish a sub-s...\n",
876
+ "------------------------------------------------------------\n",
877
+ "LLM Says: 0 (High) | Rationale: The claim describes a wing ventilation system for aerodynamic performance, not climate change mitigation.\n",
878
+ "Saved.\n",
879
+ "\n",
880
+ "[Row 96] (Uncertainty: 0.9962)\n",
881
+ "CLAIM: 1. An input device comprising: a touch panel that includes M (where M is a natural number of 5 or more) driving electrodes, and a plurality of detection electrodes forming capacitances between the respective driving electrodes, in which the M driving electrodes and the plurality of detection electro...\n",
882
+ "------------------------------------------------------------\n",
883
+ "LLM Says: 0 (High) | Rationale: The claim describes a touch panel input device, unrelated to green or climate‑change mitigation.\n",
884
+ "Saved.\n",
885
+ "\n",
886
+ "[Row 97] (Uncertainty: 0.9961)\n",
887
+ "CLAIM: 1. An information display system in a transportation apparatus, the information display system comprises: a liquid crystal display (LCD) screen that occupies at least a portion of a dashboard of the transportation apparatus, wherein the LCD screen is capable of graphically displaying multiple inform...\n",
888
+ "------------------------------------------------------------\n",
889
+ "Asking LM Studio...\n",
890
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"High\",\"rationale\":\"The claim describes an information display system for a vehicle dashboard, which does not directly address greenhouse gas reduction or climate change mitigation.\"}\n",
891
+ "LLM Failed (Check settings or Label Manually)\n",
892
+ "Saved.\n",
893
+ "\n",
894
+ "[Row 98] (Uncertainty: 0.9960)\n",
895
+ "CLAIM: 1. A photovoltaic (PV) device, comprising: at least one PV interband cascade (IC PV) stage having a conduction band and a valence band, comprising: wherein the absorption region is positioned between the intraband transport region and the interband tunneling region, wherein the interband tunneling r...\n",
896
+ "------------------------------------------------------------\n",
897
+ "LLM Says: 1 (High) | Rationale: The claim describes a photovoltaic device for generating electricity from light.\n",
898
+ "Saved.\n",
899
+ "\n",
900
+ "[Row 99] (Uncertainty: 0.9960)\n",
901
+ "CLAIM: 1. A rack system comprising: a plurality of trays configured to hold a respective plurality of battery-powered unmanned aerial vehicles; and a frame configured to support the plurality of trays in a vertical arrangement, wherein each tray of the plurality of trays comprises:...\n",
902
+ "------------------------------------------------------------\n",
903
+ "Asking LM Studio...\n",
904
+ "LLM output invalid JSON: <|channel|>final <|constrain|>JSON<|message|>{\"suggestion\":0,\"confidence\":\"Medium\",\"rationale\":\"The claim describes a storage rack for battery-powered UAVs, which does not directly address green or climate change mitigation technologies.\"}\n",
905
+ "LLM Failed (Check settings or Label Manually)\n",
906
+ "Saved.\n",
907
+ "\n",
908
+ "[Row 100] (Uncertainty: 0.9959)\n",
909
+ "CLAIM: 1. A DC electrical machine comprising: an armature having a non-integer number of winding slots per pole-pair of a magnetic field of a field means, each winding slot having a phase angle, wherein the phase angle is electrical and is a position of the winding slot in relation to a fundamental wavefor...\n",
910
+ "------------------------------------------------------------\n",
911
+ "LLM Says: 0 (High) | Rationale: The claim describes a technical improvement to a DC electrical machine, unrelated to climate change mitigation.\n",
912
+ "Saved.\n",
913
+ "\n",
914
+ "Done\n"
915
+ ]
916
+ }
917
+ ],
918
+ "source": [
919
+ "import requests\n",
920
+ "\n",
921
+ "# The LLM is hosted locally by LM Studio and reached over its HTTP API.\n",
922
+ "LM_STUDIO_URL = \"http://localhost:1234/v1/chat/completions\"\n",
923
+ "\n",
924
+ "# GPT-OSS-20B runs locally in LM Studio; MODEL_NAME is the model identifier sent in the request payload.\n",
925
+ "MODEL_NAME = \"local-model\" \n",
926
+ "\n",
927
+ "filename = \"hitl_green_100.csv\"\n",
928
+ "\n",
929
+ "def get_llm_response_lmstudio(claim_text):\n",
930
+ " \"\"\"Function to call LM Studio with Error Printing\"\"\"\n",
931
+ " \n",
932
+ " system_prompt = \"\"\"\n",
933
+ " You are a patent classification AI. You must respond in valid JSON format only.\n",
934
+ " Schema:\n",
935
+ " {\n",
936
+ " \"suggestion\": 0 or 1,\n",
937
+ " \"confidence\": \"Low\", \"Medium\", or \"High\",\n",
938
+ " \"rationale\": \"short sentence\"\n",
939
+ " }\n",
940
+ " \"\"\"\n",
941
+ " \n",
942
+ " user_prompt = f\"\"\"\n",
943
+ " Analyze this patent claim. Is it related to Green/Climate Change mitigation (Y02)?\n",
944
+ " Claim: \"{claim_text[:2000]}\"\n",
945
+ " \"\"\"\n",
946
+ "\n",
947
+ " payload = {\n",
948
+ " \"model\": MODEL_NAME,\n",
949
+ " \"messages\": [\n",
950
+ " {\"role\": \"system\", \"content\": system_prompt},\n",
951
+ " {\"role\": \"user\", \"content\": user_prompt}\n",
952
+ " ],\n",
953
+ " \"temperature\": 0.1, \n",
954
+ " \"max_tokens\": 150\n",
955
+ " }\n",
956
+ "\n",
957
+ " try:\n",
958
+ " response = requests.post(LM_STUDIO_URL, json=payload, headers={\"Content-Type\": \"application/json\"})\n",
959
+ " \n",
960
+ " # Debugging block: print server and parsing errors instead of failing silently\n",
961
+ " if response.status_code == 200:\n",
962
+ " result = response.json()\n",
963
+ " \n",
964
+ " # Check if the server sent an error instead of an answer\n",
965
+ " if 'choices' not in result:\n",
966
+ " print(f\"\\n LM STUDIO ERROR: {result}\")\n",
967
+ " return None\n",
968
+ " \n",
969
+ " content = result['choices'][0]['message']['content']\n",
970
+ " \n",
971
+ " # Clean up code blocks if present\n",
972
+ " if \"```\" in content:\n",
973
+ " content = content.replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n",
974
+ " \n",
975
+ " try:\n",
976
+ " return json.loads(content)\n",
977
+ " except json.JSONDecodeError:\n",
978
+ " print(f\"\\nLLM output invalid JSON: {content}\")\n",
979
+ " return None\n",
980
+ " else:\n",
981
+ " print(f\"Server Error {response.status_code}: {response.text}\")\n",
982
+ " return None\n",
983
+ " \n",
984
+ " except Exception as e:\n",
985
+ " print(f\"Connection Error: {e}\")\n",
986
+ " return None\n",
987
+ "\n",
988
+ "def labeling_loop():\n",
989
+ " if not os.path.exists(filename):\n",
990
+ " print(f\"Error: {filename} not found.\")\n",
991
+ " return\n",
992
+ " \n",
993
+ " df = pd.read_csv(filename)\n",
994
+ " \n",
995
+ " # Create columns if missing\n",
996
+ " for col in ['llm_green_suggested', 'llm_confidence', 'llm_rationale', 'is_green_human', 'notes']:\n",
997
+ " if col not in df.columns: df[col] = \"\"\n",
998
+ "\n",
999
+ " # Find rows that still lack a human label (NaN or empty string)\n",
1000
+ " remaining_indices = df[df['is_green_human'].isna() | (df['is_green_human'] == \"\")].index.tolist()\n",
1001
+ " \n",
1002
+ " print(f\"--- Part C: HITL Labeling ---\")\n",
1003
+ " print(f\"Remaining rows: {len(remaining_indices)}\")\n",
1004
+ " print(\"-----------------------------\\n\")\n",
1005
+ "\n",
1006
+ " for idx in remaining_indices:\n",
1007
+ " row = df.loc[idx]\n",
1008
+ " claim_text = str(row['text'])\n",
1009
+ " \n",
1010
+ " print(f\"[Row {idx+1}] (Uncertainty: {row['u']:.4f})\")\n",
1011
+ " print(f\"CLAIM: {claim_text[:300]}...\") \n",
1012
+ " print(\"-\" * 60)\n",
1013
+ "\n",
1014
+ " # 1. Ask LLM\n",
1015
+ " print(\"Asking LM Studio...\", end=\"\\r\")\n",
1016
+ " llm_result = get_llm_response_lmstudio(claim_text)\n",
1017
+ " \n",
1018
+ " suggestion = 0\n",
1019
+ " conf = \"Low\"\n",
1020
+ " rat = \"\"\n",
1021
+ " \n",
1022
+ " if llm_result:\n",
1023
+ " suggestion = llm_result.get('suggestion', 0)\n",
1024
+ " conf = llm_result.get('confidence', \"Low\")\n",
1025
+ " rat = llm_result.get('rationale', \"\")\n",
1026
+ " print(f\"LLM Says: {suggestion} ({conf}) | Rationale: {rat}\")\n",
1027
+ " else:\n",
1028
+ " print(\"LLM Failed (Check settings or Label Manually)\")\n",
1029
+ "\n",
1030
+ " # 2. Human Review\n",
1031
+ " while True:\n",
1032
+ " user_input = input(f\"Your Label (0/1) [Enter for {suggestion}]: \")\n",
1033
+ " if user_input.strip() == \"\":\n",
1034
+ " final_label = suggestion\n",
1035
+ " break\n",
1036
+ " if user_input.strip() in ['0', '1']:\n",
1037
+ " final_label = int(user_input)\n",
1038
+ " break\n",
1039
+ " print(\"Please enter 0 or 1.\")\n",
1040
+ "\n",
1041
+ " # 3. Save\n",
1042
+ " df.at[idx, 'llm_green_suggested'] = suggestion\n",
1043
+ " df.at[idx, 'llm_confidence'] = conf\n",
1044
+ " df.at[idx, 'llm_rationale'] = rat\n",
1045
+ " df.at[idx, 'is_green_human'] = final_label\n",
1046
+ " \n",
1047
+ " df.to_csv(filename, index=False)\n",
1048
+ " print(\"Saved.\\n\")\n",
1049
+ "\n",
1050
+ " print(\"Done\")\n",
1051
+ "\n",
1052
+ "labeling_loop()"
1053
+ ]
1054
+ },
1055
+ {
1056
+ "cell_type": "code",
1057
+ "execution_count": null,
1058
+ "id": "42e180f2",
1059
+ "metadata": {},
1060
+ "outputs": [
1061
+ {
1062
+ "name": "stdout",
1063
+ "output_type": "stream",
1064
+ "text": [
1065
+ "============================================================\n",
1066
+ "HITL ANALYSIS REPORT\n",
1067
+ "============================================================\n",
1068
+ "Total Claims Labeled: 100\n",
1069
+ "Human Overrides: 3\n",
1070
+ "Agreement Rate: 97.0%\n",
1071
+ "------------------------------------------------------------\n",
1072
+ "\n",
1073
+ " 3 EXAMPLES OF HUMAN OVERRIDES:\n",
1074
+ "\n",
1075
+ "Example #1:\n",
1076
+ " • Claim Snippet: \"1. An apparatus, comprising: a single, dilute solids phase reactor having a top, a central section, and a bottom section with an exit port, and a top ...\"\n",
1077
+ " • LLM Suggestion: 0 (Rationale: The claim describes a particle removal apparatus for exhaust gases, which addresses air pollution control rather than greenhouse gas mitigation.)\n",
1078
+ " • Human Label: 1\n",
1079
+ " • Your Notes: Manual override: This technology is classified as Green under CPC Y02.\n",
1080
+ "\n",
1081
+ "Example #2:\n",
1082
+ " • Claim Snippet: \"1. A biogenic flocculant composition for CEPT sludge conditioning comprising a) a first flocculant component which comprises at least one acidophilic ...\"\n",
1083
+ " • LLM Suggestion: 0 (Rationale: The claim focuses on sludge conditioning using microbial flocculants, which is a wastewater treatment application rather than a direct climate‑change mitigation technology.)\n",
1084
+ " • Human Label: 1\n",
1085
+ " • Your Notes: Manual override: This technology is classified as Green under CPC Y02.\n",
1086
+ "\n",
1087
+ "Example #3:\n",
1088
+ " • Claim Snippet: \"1. A nuclear reactor comprising: an elongated reactor vessel enclosed at a lower end and having an open upper end on which an annular flange is formed...\"\n",
1089
+ " • LLM Suggestion: 0 (Rationale: The claim describes a nuclear reactor component, not a climate‑change mitigation technology.)\n",
1090
+ " • Human Label: 1\n",
1091
+ " • Your Notes: Manual override: This technology is classified as Green under CPC Y02.\n"
1092
+ ]
1093
+ }
1094
+ ],
1095
+ "source": [
1096
+ "# Load the completed file\n",
1097
+ "filename = \"hitl_green_100.csv\"\n",
1098
+ "try:\n",
1099
+ " df = pd.read_csv(filename)\n",
1100
+ "except FileNotFoundError:\n",
1101
+ " print(f\"Error: Could not find {filename}. Make sure you saved your work!\")\n",
1102
+ " exit()\n",
1103
+ "\n",
1104
+ "# Find disagreements between LLM and human labels (0/1); missing or non-numeric values are coerced to -1\n",
1105
+ "df['llm_green_suggested'] = pd.to_numeric(df['llm_green_suggested'], errors='coerce').fillna(-1).astype(int)\n",
1106
+ "df['is_green_human'] = pd.to_numeric(df['is_green_human'], errors='coerce').fillna(-1).astype(int)\n",
1107
+ "\n",
1108
+ "overrides = df[df['llm_green_suggested'] != df['is_green_human']]\n",
1109
+ "total_count = len(df)\n",
1110
+ "override_count = len(overrides)\n",
1111
+ "\n",
1112
+ "# Print the report\n",
1113
+ "print(\"=\"*60)\n",
1114
+ "print(\"HITL ANALYSIS REPORT\")\n",
1115
+ "print(\"=\"*60)\n",
1116
+ "print(f\"Total Claims Labeled: {total_count}\")\n",
1117
+ "print(f\"Human Overrides: {override_count}\")\n",
1118
+ "print(f\"Agreement Rate: {((total_count - override_count)/total_count)*100:.1f}%\")\n",
1119
+ "print(\"-\" * 60)\n",
1120
+ "\n",
1121
+ "if override_count > 0:\n",
1122
+ " print(\"\\n 3 EXAMPLES OF HUMAN OVERRIDES:\")\n",
1123
+ " # Show at most the first 3 overrides\n",
1124
+ " examples = overrides.head(3)\n",
1125
+ " \n",
1126
+ " for i, (idx, row) in enumerate(examples.iterrows(), 1):\n",
1127
+ " print(f\"\\nExample #{i}:\")\n",
1128
+ " print(f\" • Claim Snippet: \\\"{str(row['text'])[:150]}...\\\"\")\n",
1129
+ " print(f\" • LLM Suggestion: {row['llm_green_suggested']} (Rationale: {row['llm_rationale']})\")\n",
1130
+ " print(f\" • Human Label: {row['is_green_human']}\")\n",
1131
+ " if row['notes']:\n",
1132
+ " print(f\" • Your Notes: {row['notes']}\")"
1133
+ ]
1134
+ },
1135
+ {
1136
+ "cell_type": "markdown",
1137
+ "id": "a4fefb37",
1138
+ "metadata": {},
1139
+ "source": [
1140
+ "# Part D"
1141
+ ]
1142
+ },
1143
+ {
1144
+ "cell_type": "code",
1145
+ "execution_count": null,
1146
+ "id": "0ad0f2e4",
1147
+ "metadata": {},
1148
+ "outputs": [
1149
+ {
1150
+ "name": "stdout",
1151
+ "output_type": "stream",
1152
+ "text": [
1153
+ "Starting Part D: Final Active Learning Evaluation...\n",
1154
+ " - Generating Base Training and Eval Embeddings...\n"
1155
+ ]
1156
+ },
1157
+ {
1158
+ "data": {
1159
+ "application/vnd.jupyter.widget-view+json": {
1160
+ "model_id": "213f76fffed1405582e4c06644c3a2cf",
1161
+ "version_major": 2,
1162
+ "version_minor": 0
1163
+ },
1164
+ "text/plain": [
1165
+ "Encoding: 0%| | 0/63 [00:00<?, ?it/s]"
1166
+ ]
1167
+ },
1168
+ "metadata": {},
1169
+ "output_type": "display_data"
1170
+ },
1171
+ {
1172
+ "data": {
1173
+ "application/vnd.jupyter.widget-view+json": {
1174
+ "model_id": "5c3e96ca30f94a0fa27f27b24368d491",
1175
+ "version_major": 2,
1176
+ "version_minor": 0
1177
+ },
1178
+ "text/plain": [
1179
+ "Encoding: 0%| | 0/157 [00:00<?, ?it/s]"
1180
+ ]
1181
+ },
1182
+ "metadata": {},
1183
+ "output_type": "display_data"
1184
+ },
1185
+ {
1186
+ "name": "stdout",
1187
+ "output_type": "stream",
1188
+ "text": [
1189
+ " - Loading 100 human-labeled examples...\n"
1190
+ ]
1191
+ },
1192
+ {
1193
+ "data": {
1194
+ "application/vnd.jupyter.widget-view+json": {
1195
+ "model_id": "21f8b1a47ab249c8bea9de13cd2b80e9",
1196
+ "version_major": 2,
1197
+ "version_minor": 0
1198
+ },
1199
+ "text/plain": [
1200
+ "Encoding: 0%| | 0/4 [00:00<?, ?it/s]"
1201
+ ]
1202
+ },
1203
+ "metadata": {},
1204
+ "output_type": "display_data"
1205
+ },
1206
+ {
1207
+ "name": "stdout",
1208
+ "output_type": "stream",
1209
+ "text": [
1210
+ "\n",
1211
+ "========================================\n",
1212
+ "FINAL PERFORMANCE COMPARISON\n",
1213
+ "========================================\n",
1214
+ "Metric | Baseline (Part A) | Active (Part D) \n",
1215
+ "------------------------------------------------------------\n",
1216
+ "Precision | 0.7489 | 0.7473 (-0.0015)\n",
1217
+ "Recall | 0.7488 | 0.7467 (-0.0021)\n",
1218
+ "F1-score | 0.7488 | 0.7465 (-0.0023)\n",
1219
+ "============================================================\n",
1220
+ "Assignment Complete! Copy the table above into your README.\n"
1221
+ ]
1222
+ }
1223
+ ],
1224
+ "source": [
1225
+ "print(\"Starting Part D: Final Active Learning Evaluation...\")\n",
1226
+ "\n",
1227
+ "# Setup Model & Data\n",
1228
+ "model_name = \"AI-Growth-Lab/PatentSBERTa\"\n",
1229
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
1230
+ "model = AutoModel.from_pretrained(model_name)\n",
1231
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1232
+ "model.to(device); model.eval()\n",
1233
+ "\n",
1234
+ "def get_embeddings(text_list, batch_size=32):\n",
1235
+ " all_embeddings = []\n",
1236
+ " for i in tqdm(range(0, len(text_list), batch_size), desc=\"Encoding\"):\n",
1237
+ " batch_texts = text_list[i:i+batch_size]\n",
1238
+ " inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors=\"pt\").to(device)\n",
1239
+ " with torch.no_grad():\n",
1240
+ " outputs = model(**inputs)\n",
1241
+ " all_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())\n",
1242
+ " return np.vstack(all_embeddings)\n",
1243
+ "\n",
1244
+ "# Re-create splits from Part A\n",
1245
+ "df = pd.read_parquet(\"patents_50k_green.parquet\")\n",
1246
+ "df_eval = df.sample(n=5000, random_state=42)\n",
1247
+ "df_remaining = df.drop(df_eval.index)\n",
1248
+ "df_train = df_remaining.sample(n=2000, random_state=42)\n",
1249
+ "\n",
1250
+ "print(\" - Generating Base Training and Eval Embeddings...\")\n",
1251
+ "X_train = get_embeddings(df_train['text'].tolist())\n",
1252
+ "y_train = df_train['is_green_silver'].values\n",
1253
+ "X_eval = get_embeddings(df_eval['text'].tolist())\n",
1254
+ "y_eval = df_eval['is_green_silver'].values\n",
1255
+ "\n",
1256
+ "# Train Baseline\n",
1257
+ "clf_base = LogisticRegression(max_iter=1000, random_state=42)\n",
1258
+ "clf_base.fit(X_train, y_train)\n",
1259
+ "base_report = classification_report(y_eval, clf_base.predict(X_eval), output_dict=True)\n",
1260
+ "\n",
1261
+ "# Load your HITL Gold Labels\n",
1262
+ "df_hitl = pd.read_csv(\"hitl_green_100.csv\")\n",
1263
+ "print(f\" - Loading {len(df_hitl)} human-labeled examples...\")\n",
1264
+ "X_hitl = get_embeddings(df_hitl['text'].tolist())\n",
1265
+ "y_hitl = df_hitl['is_green_human'].values\n",
1266
+ "\n",
1267
+ "# Active Learning: Combine Original Train + Human Gold Labels\n",
1268
+ "X_combined = np.vstack([X_train, X_hitl])\n",
1269
+ "y_combined = np.concatenate([y_train, y_hitl])\n",
1270
+ "\n",
1271
+ "# Train the Active Learning Model\n",
1272
+ "clf_active = LogisticRegression(max_iter=1000, random_state=42)\n",
1273
+ "clf_active.fit(X_combined, y_combined)\n",
1274
+ "active_report = classification_report(y_eval, clf_active.predict(X_eval), output_dict=True)\n",
1275
+ "\n",
1276
+ "# FINAL COMPARISON REPORT\n",
1277
+ "print(\"\\n\" + \"=\"*40)\n",
1278
+ "print(\"FINAL PERFORMANCE COMPARISON\")\n",
1279
+ "print(\"=\"*40)\n",
1280
+ "print(f\"{'Metric':<15} | {'Baseline (Part A)':<20} | {'Active (Part D)':<20}\")\n",
1281
+ "print(\"-\" * 60)\n",
1282
+ "for m in ['precision', 'recall', 'f1-score']:\n",
1283
+ " val_a = base_report['macro avg'][m]\n",
1284
+ " val_d = active_report['macro avg'][m]\n",
1285
+ " diff = val_d - val_a\n",
1286
+ " print(f\"{m.capitalize():<15} | {val_a:20.4f} | {val_d:20.4f} ({'+' if diff >=0 else ''}{diff:.4f})\")\n",
1287
+ "print(\"=\"*60)\n",
1288
+ "print(\"Assignment Complete! Copy the table above into your README.\")"
1289
+ ]
1290
+ },
1291
+ {
1292
+ "cell_type": "code",
1293
+ "execution_count": null,
1294
+ "id": "cf420b77",
1295
+ "metadata": {},
1296
+ "outputs": [],
1297
+ "source": []
1298
+ }
1299
+ ],
1300
+ "metadata": {
1301
+ "kernelspec": {
1302
+ "display_name": "Python 3",
1303
+ "language": "python",
1304
+ "name": "python3"
1305
+ },
1306
+ "language_info": {
1307
+ "codemirror_mode": {
1308
+ "name": "ipython",
1309
+ "version": 3
1310
+ },
1311
+ "file_extension": ".py",
1312
+ "mimetype": "text/x-python",
1313
+ "name": "python",
1314
+ "nbconvert_exporter": "python",
1315
+ "pygments_lexer": "ipython3",
1316
+ "version": "3.12.10"
1317
+ }
1318
+ },
1319
+ "nbformat": 4,
1320
+ "nbformat_minor": 5
1321
+ }
hitl_green_100.csv ADDED
The diff for this file is too large to render. See raw diff
 
patents_50k_green.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce03253c78337876a32fa166356cc6a5b87f66488f3aac55be4c02420fa1fb6
3
+ size 21754536