arbabarshad Claude Opus 4.6 committed on
Commit
c43f82f
·
1 Parent(s): 6dfcfaf

Update retrieval evaluation with k=1,2,4,8 and fix random seed

Browse files

- Change K values from [1,3,5] to [1,2,4,8] for finer-grained eval
- Add random.seed(42) for reproducible sampling
- Update README evaluation table with new metrics
- Regenerate evaluation results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

README.md CHANGED
@@ -114,12 +114,12 @@ python generate_usa_ipm_info.py --step parse # Create Excel sheet
114
  **Output:** Updates `species-organized/PestID Species - Organized.xlsx` with "USA" sheet containing 110 species present in the United States (pests + beneficials).
115
 
116
  ### Evaluation Filters (retrieval_evaluation.py)
117
- | Filter | P@5 | nDCG@5 |
118
- |--------|-----|--------|
119
- | No Filter | 0.82 | 0.72 |
120
- | Species Only | 0.99 | 0.89 |
121
- | Region Only | 0.83 | 0.73 |
122
- | Species + Region | **1.00** | **0.90** |
123
 
124
  ---
125
 
 
114
  **Output:** Updates `species-organized/PestID Species - Organized.xlsx` with "USA" sheet containing 110 species present in the United States (pests + beneficials).
115
 
116
  ### Evaluation Filters (retrieval_evaluation.py)
117
+ | Filter | P@1 | P@2 | P@4 | P@8 | nDCG@1 | nDCG@2 | nDCG@4 | nDCG@8 |
118
+ |--------|-----|-----|-----|-----|--------|--------|--------|--------|
119
+ | No Filter | 0.64 | 0.76 | 0.81 | 0.85 | 0.64 | 0.72 | 0.74 | 0.75 |
120
+ | Species Only | 0.68 | 0.84 | 0.93 | **1.00** | 0.68 | 0.78 | 0.82 | 0.85 |
121
+ | Region Only | 0.69 | 0.78 | 0.84 | 0.87 | 0.69 | 0.75 | 0.77 | 0.79 |
122
+ | Species + Region | **0.79** | **0.91** | **0.99** | **1.00** | **0.79** | **0.87** | **0.90** | **0.91** |
123
 
124
  ---
125
 
retrieval_evaluation.py CHANGED
@@ -228,6 +228,7 @@ def load_chunks_from_vectordb(persist_directory: str, sample_size: Optional[int]
228
  chunks.append(chunk_data)
229
 
230
  if sample_size and len(chunks) > sample_size:
 
231
  chunks = random.sample(chunks, sample_size)
232
 
233
  return chunks
@@ -238,7 +239,7 @@ def main():
238
  # Configuration
239
  VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
240
  SAMPLE_SIZE = 100 # Start with smaller sample for testing
241
- K_VALUES = [1, 3, 5]
242
  OUTPUT_FILE = 'retrieval_evaluation_results.json'
243
 
244
  print("Starting Retrieval Evaluation Pipeline")
 
228
  chunks.append(chunk_data)
229
 
230
  if sample_size and len(chunks) > sample_size:
231
+ random.seed(42)
232
  chunks = random.sample(chunks, sample_size)
233
 
234
  return chunks
 
239
  # Configuration
240
  VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
241
  SAMPLE_SIZE = 100 # Start with smaller sample for testing
242
+ K_VALUES = [1, 2, 4, 8]
243
  OUTPUT_FILE = 'retrieval_evaluation_results.json'
244
 
245
  print("Starting Retrieval Evaluation Pipeline")
retrieval_evaluation_results.json CHANGED
@@ -1,129 +1,169 @@
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
- "mean": 0.55,
5
- "std": 0.49749371855331,
6
  "count": 100
7
  },
8
- "precision@3": {
9
- "mean": 0.74,
10
- "std": 0.4386342439892262,
11
  "count": 100
12
  },
13
- "precision@5": {
14
- "mean": 0.8,
15
- "std": 0.4,
 
 
 
 
 
16
  "count": 100
17
  },
18
  "ndcg@1": {
19
- "mean": 0.55,
20
- "std": 0.49749371855331,
21
  "count": 100
22
  },
23
- "ndcg@3": {
24
- "mean": 0.6620208679642894,
25
- "std": 0.4224663020789173,
26
  "count": 100
27
  },
28
- "ndcg@5": {
29
- "mean": 0.6865467489235274,
30
- "std": 0.39428047037500696,
 
 
 
 
 
31
  "count": 100
32
  }
33
  },
34
  "species_only": {
35
  "precision@1": {
36
- "mean": 0.72,
37
- "std": 0.4489988864128729,
38
  "count": 100
39
  },
40
- "precision@3": {
41
- "mean": 1.0,
42
- "std": 0.0,
 
 
 
 
 
43
  "count": 100
44
  },
45
- "precision@5": {
46
  "mean": 1.0,
47
  "std": 0.0,
48
  "count": 100
49
  },
50
  "ndcg@1": {
51
- "mean": 0.72,
52
- "std": 0.4489988864128729,
 
 
 
 
 
53
  "count": 100
54
  },
55
- "ndcg@3": {
56
- "mean": 0.8861859507142915,
57
- "std": 0.18517270734359137,
58
  "count": 100
59
  },
60
- "ndcg@5": {
61
- "mean": 0.8861859507142915,
62
- "std": 0.18517270734359137,
63
  "count": 100
64
  }
65
  },
66
  "region_only": {
67
  "precision@1": {
68
- "mean": 0.56,
69
- "std": 0.4963869458396343,
70
  "count": 100
71
  },
72
- "precision@3": {
73
- "mean": 0.74,
74
- "std": 0.4386342439892262,
75
  "count": 100
76
  },
77
- "precision@5": {
78
- "mean": 0.82,
79
- "std": 0.38418745424597095,
 
 
 
 
 
80
  "count": 100
81
  },
82
  "ndcg@1": {
83
- "mean": 0.56,
84
- "std": 0.4963869458396343,
85
  "count": 100
86
  },
87
- "ndcg@3": {
88
- "mean": 0.6683301655000039,
89
- "std": 0.423160630771083,
90
  "count": 100
91
  },
92
- "ndcg@5": {
93
- "mean": 0.7010313401123213,
94
- "std": 0.38430545848027436,
 
 
 
 
 
95
  "count": 100
96
  }
97
  },
98
  "species_and_region": {
99
  "precision@1": {
100
- "mean": 0.74,
101
- "std": 0.4386342439892262,
102
  "count": 100
103
  },
104
- "precision@3": {
105
- "mean": 1.0,
106
- "std": 0.0,
 
 
 
 
 
107
  "count": 100
108
  },
109
- "precision@5": {
110
  "mean": 1.0,
111
  "std": 0.0,
112
  "count": 100
113
  },
114
  "ndcg@1": {
115
- "mean": 0.74,
116
- "std": 0.4386342439892262,
 
 
 
 
 
117
  "count": 100
118
  },
119
- "ndcg@3": {
120
- "mean": 0.8961859507142915,
121
- "std": 0.17738436382801476,
122
  "count": 100
123
  },
124
- "ndcg@5": {
125
- "mean": 0.8961859507142915,
126
- "std": 0.17738436382801476,
127
  "count": 100
128
  }
129
  }
 
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
+ "mean": 0.64,
5
+ "std": 0.48,
6
  "count": 100
7
  },
8
+ "precision@2": {
9
+ "mean": 0.76,
10
+ "std": 0.4270831300812524,
11
  "count": 100
12
  },
13
+ "precision@4": {
14
+ "mean": 0.81,
15
+ "std": 0.39230090491866054,
16
+ "count": 100
17
+ },
18
+ "precision@8": {
19
+ "mean": 0.85,
20
+ "std": 0.3570714214271425,
21
  "count": 100
22
  },
23
  "ndcg@1": {
24
+ "mean": 0.64,
25
+ "std": 0.48,
26
  "count": 100
27
  },
28
+ "ndcg@2": {
29
+ "mean": 0.7157115704285749,
30
+ "std": 0.41895779074707734,
31
  "count": 100
32
  },
33
+ "ndcg@4": {
34
+ "mean": 0.7400183360093088,
35
+ "std": 0.38986711559166154,
36
+ "count": 100
37
+ },
38
+ "ndcg@8": {
39
+ "mean": 0.754344341157148,
40
+ "std": 0.3684202134419992,
41
  "count": 100
42
  }
43
  },
44
  "species_only": {
45
  "precision@1": {
46
+ "mean": 0.68,
47
+ "std": 0.466476151587624,
48
  "count": 100
49
  },
50
+ "precision@2": {
51
+ "mean": 0.84,
52
+ "std": 0.3666060555964672,
53
+ "count": 100
54
+ },
55
+ "precision@4": {
56
+ "mean": 0.93,
57
+ "std": 0.25514701644346144,
58
  "count": 100
59
  },
60
+ "precision@8": {
61
  "mean": 1.0,
62
  "std": 0.0,
63
  "count": 100
64
  },
65
  "ndcg@1": {
66
+ "mean": 0.68,
67
+ "std": 0.466476151587624,
68
+ "count": 100
69
+ },
70
+ "ndcg@2": {
71
+ "mean": 0.7809487605714333,
72
+ "std": 0.36580132584863706,
73
  "count": 100
74
  },
75
+ "ndcg@4": {
76
+ "mean": 0.824562291732901,
77
+ "std": 0.29154459287790513,
78
  "count": 100
79
  },
80
+ "ndcg@8": {
81
+ "mean": 0.8508003372990416,
82
+ "std": 0.22561628946474954,
83
  "count": 100
84
  }
85
  },
86
  "region_only": {
87
  "precision@1": {
88
+ "mean": 0.69,
89
+ "std": 0.462493243193887,
90
  "count": 100
91
  },
92
+ "precision@2": {
93
+ "mean": 0.78,
94
+ "std": 0.4142463035441596,
95
  "count": 100
96
  },
97
+ "precision@4": {
98
+ "mean": 0.84,
99
+ "std": 0.36660605559646725,
100
+ "count": 100
101
+ },
102
+ "precision@8": {
103
+ "mean": 0.87,
104
+ "std": 0.33630343441600474,
105
  "count": 100
106
  },
107
  "ndcg@1": {
108
+ "mean": 0.69,
109
+ "std": 0.462493243193887,
110
  "count": 100
111
  },
112
+ "ndcg@2": {
113
+ "mean": 0.7467836778214312,
114
+ "std": 0.4100495706548825,
115
  "count": 100
116
  },
117
+ "ndcg@4": {
118
+ "mean": 0.7747039745636328,
119
+ "std": 0.37245769233603554,
120
+ "count": 100
121
+ },
122
+ "ndcg@8": {
123
+ "mean": 0.7857743640416571,
124
+ "std": 0.35433577663047267,
125
  "count": 100
126
  }
127
  },
128
  "species_and_region": {
129
  "precision@1": {
130
+ "mean": 0.79,
131
+ "std": 0.40730823708832603,
132
  "count": 100
133
  },
134
+ "precision@2": {
135
+ "mean": 0.91,
136
+ "std": 0.2861817604250837,
137
+ "count": 100
138
+ },
139
+ "precision@4": {
140
+ "mean": 0.99,
141
+ "std": 0.09949874371066199,
142
  "count": 100
143
  },
144
+ "precision@8": {
145
  "mean": 1.0,
146
  "std": 0.0,
147
  "count": 100
148
  },
149
  "ndcg@1": {
150
+ "mean": 0.79,
151
+ "std": 0.40730823708832603,
152
+ "count": 100
153
+ },
154
+ "ndcg@2": {
155
+ "mean": 0.8657115704285749,
156
+ "std": 0.2971736181074952,
157
  "count": 100
158
  },
159
+ "ndcg@4": {
160
+ "mean": 0.9036318671707767,
161
+ "std": 0.19819838555518796,
162
  "count": 100
163
  },
164
+ "ndcg@8": {
165
+ "mean": 0.907500395243122,
166
+ "std": 0.18377359707383636,
167
  "count": 100
168
  }
169
  }
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/27ac9297-abc2-406e-8919-7670a60055f1/length.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bb9e60f9d3b2f5f6a730ae39372f43f54d173ad0cfb5e463b21ab0794b67883
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f2fe12bccc27d05f521a0a00eb43912172a7113283d0a99b45af0b2da569582
3
  size 40000
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61d8f6141f09d3335edbde43ae25cdd9763b4328ae01daac0f4195563188d2fe
3
  size 10715136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08dce2645f4aa422368f3fc4e975e58a518875b335458e83f60a0205c46f27b8
3
  size 10715136