arbabarshad committed on
Commit
8302c34
·
1 Parent(s): 42f7194

fixed count of species

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ license: apache-2.0
14
 
15
  ### Environment
16
  ```bash
17
- source ~/miniconda3/etc/profile.d/conda.sh && conda activate agllm-env1-updates-1
18
  ```
19
 
20
  ### Key Commands
@@ -57,9 +57,14 @@ source ~/miniconda3/etc/profile.d/conda.sh && conda activate agllm-env1-updates-
57
  ```
58
 
59
  ### Database Build Flow
60
- 1. PDFs loaded from `agllm-data/` (insects + weeds)
61
- 2. Metadata read from `matched_species_results_v2.csv` files
62
- 3. Excel sheets (India, Africa) processed from `PestID Species.xlsx`
 
 
 
 
 
63
  4. Documents chunked (512 tokens, 10 overlap)
64
  5. Tagged with `matched_specie_X` + `region` metadata
65
  6. Stored in ChromaDB at `vector-databases-deployed/db5-*/`
 
14
 
15
  ### Environment
16
  ```bash
17
+ source ~/miniconda3/etc/profile.d/conda.sh && conda activate agllm-june-15
18
  ```
19
 
20
  ### Key Commands
 
57
  ```
58
 
59
  ### Database Build Flow
60
+ **US Data (80 species):**
61
+ 1. PDFs loaded from `agllm-data/.../raw-pdfs/` (content source)
62
+ 2. `matched_species_results_v2.csv` maps PDF filename → species name (metadata)
63
+
64
+ **Africa/India Data (35 + 11 species):**
65
+ 3. Excel `species-organized/PestID Species - Organized.xlsx` provides both content (IPM Info) and metadata
66
+
67
+ **All Data:**
68
  4. Documents chunked (512 tokens, 10 overlap)
69
  5. Tagged with `matched_specie_X` + `region` metadata
70
  6. Stored in ChromaDB at `vector-databases-deployed/db5-*/`
app_database_prep.py CHANGED
@@ -120,8 +120,8 @@ documents = insects_documents + weeds_documents
120
  metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
121
 
122
 
123
- ## Load Excel File Path (Define once)
124
- excel_file_path = "agllm-data/PestID Species.xlsx"
125
 
126
 
127
  ## Process PDF documents using CSV → PDF approach
 
120
  metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
121
 
122
 
123
+ ## Load Excel File Path (Define once) - Using Organized file as single source of truth
124
+ excel_file_path = "species-organized/PestID Species - Organized.xlsx"
125
 
126
 
127
  ## Process PDF documents using CSV → PDF approach
retrieval_evaluation_results.json CHANGED
@@ -1,109 +1,109 @@
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
- "mean": 0.59,
5
- "std": 0.49183330509431744,
6
  "count": 100
7
  },
8
  "precision@3": {
9
- "mean": 0.76,
10
- "std": 0.42708313008125254,
11
  "count": 100
12
  },
13
  "precision@5": {
14
- "mean": 0.82,
15
- "std": 0.38418745424597095,
16
  "count": 100
17
  },
18
  "ndcg@1": {
19
- "mean": 0.59,
20
- "std": 0.49183330509431744,
21
  "count": 100
22
  },
23
  "ndcg@3": {
24
- "mean": 0.6946394630357184,
25
- "std": 0.41495405707705707,
26
  "count": 100
27
  },
28
  "ndcg@5": {
29
- "mean": 0.7182888689781796,
30
- "std": 0.3848500116841757,
31
  "count": 100
32
  }
33
  },
34
  "species_only": {
35
  "precision@1": {
36
- "mean": 0.73,
37
- "std": 0.4439594576084623,
38
  "count": 100
39
  },
40
  "precision@3": {
41
- "mean": 0.98,
42
- "std": 0.13999999999999999,
43
  "count": 100
44
  },
45
  "precision@5": {
46
- "mean": 0.99,
47
- "std": 0.09949874371066199,
48
  "count": 100
49
  },
50
  "ndcg@1": {
51
- "mean": 0.73,
52
- "std": 0.4439594576084623,
53
  "count": 100
54
  },
55
  "ndcg@3": {
56
- "mean": 0.8811859507142915,
57
- "std": 0.2136019453378135,
58
  "count": 100
59
  },
60
  "ndcg@5": {
61
- "mean": 0.8850544787866369,
62
- "std": 0.2007226726424171,
63
  "count": 100
64
  }
65
  },
66
  "region_only": {
67
  "precision@1": {
68
- "mean": 0.61,
69
- "std": 0.4877499359302879,
70
  "count": 100
71
  },
72
  "precision@3": {
73
- "mean": 0.77,
74
- "std": 0.4208325082500163,
75
  "count": 100
76
  },
77
  "precision@5": {
78
- "mean": 0.83,
79
- "std": 0.375632799419859,
80
  "count": 100
81
  },
82
  "ndcg@1": {
83
- "mean": 0.61,
84
- "std": 0.4877499359302879,
85
  "count": 100
86
  },
87
  "ndcg@3": {
88
- "mean": 0.7083301655000039,
89
- "std": 0.4110942789611411,
90
  "count": 100
91
  },
92
  "ndcg@5": {
93
- "mean": 0.7319795714424648,
94
- "std": 0.37983366654728035,
95
  "count": 100
96
  }
97
  },
98
  "species_and_region": {
99
  "precision@1": {
100
- "mean": 0.75,
101
- "std": 0.4330127018922193,
102
  "count": 100
103
  },
104
  "precision@3": {
105
- "mean": 0.98,
106
- "std": 0.13999999999999999,
107
  "count": 100
108
  },
109
  "precision@5": {
@@ -112,18 +112,18 @@
112
  "count": 100
113
  },
114
  "ndcg@1": {
115
- "mean": 0.75,
116
- "std": 0.4330127018922193,
117
  "count": 100
118
  },
119
  "ndcg@3": {
120
- "mean": 0.8898766531785769,
121
- "std": 0.2091728695998248,
122
  "count": 100
123
  },
124
  "ndcg@5": {
125
- "mean": 0.8980519468316562,
126
- "std": 0.18024378480878206,
127
  "count": 100
128
  }
129
  }
 
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
+ "mean": 0.55,
5
+ "std": 0.49749371855331,
6
  "count": 100
7
  },
8
  "precision@3": {
9
+ "mean": 0.74,
10
+ "std": 0.4386342439892262,
11
  "count": 100
12
  },
13
  "precision@5": {
14
+ "mean": 0.8,
15
+ "std": 0.4,
16
  "count": 100
17
  },
18
  "ndcg@1": {
19
+ "mean": 0.55,
20
+ "std": 0.49749371855331,
21
  "count": 100
22
  },
23
  "ndcg@3": {
24
+ "mean": 0.6620208679642894,
25
+ "std": 0.4224663020789173,
26
  "count": 100
27
  },
28
  "ndcg@5": {
29
+ "mean": 0.6865467489235274,
30
+ "std": 0.39428047037500696,
31
  "count": 100
32
  }
33
  },
34
  "species_only": {
35
  "precision@1": {
36
+ "mean": 0.72,
37
+ "std": 0.4489988864128729,
38
  "count": 100
39
  },
40
  "precision@3": {
41
+ "mean": 1.0,
42
+ "std": 0.0,
43
  "count": 100
44
  },
45
  "precision@5": {
46
+ "mean": 1.0,
47
+ "std": 0.0,
48
  "count": 100
49
  },
50
  "ndcg@1": {
51
+ "mean": 0.72,
52
+ "std": 0.4489988864128729,
53
  "count": 100
54
  },
55
  "ndcg@3": {
56
+ "mean": 0.8861859507142915,
57
+ "std": 0.18517270734359137,
58
  "count": 100
59
  },
60
  "ndcg@5": {
61
+ "mean": 0.8861859507142915,
62
+ "std": 0.18517270734359137,
63
  "count": 100
64
  }
65
  },
66
  "region_only": {
67
  "precision@1": {
68
+ "mean": 0.56,
69
+ "std": 0.4963869458396343,
70
  "count": 100
71
  },
72
  "precision@3": {
73
+ "mean": 0.74,
74
+ "std": 0.4386342439892262,
75
  "count": 100
76
  },
77
  "precision@5": {
78
+ "mean": 0.82,
79
+ "std": 0.38418745424597095,
80
  "count": 100
81
  },
82
  "ndcg@1": {
83
+ "mean": 0.56,
84
+ "std": 0.4963869458396343,
85
  "count": 100
86
  },
87
  "ndcg@3": {
88
+ "mean": 0.6683301655000039,
89
+ "std": 0.423160630771083,
90
  "count": 100
91
  },
92
  "ndcg@5": {
93
+ "mean": 0.7010313401123213,
94
+ "std": 0.38430545848027436,
95
  "count": 100
96
  }
97
  },
98
  "species_and_region": {
99
  "precision@1": {
100
+ "mean": 0.74,
101
+ "std": 0.4386342439892262,
102
  "count": 100
103
  },
104
  "precision@3": {
105
+ "mean": 1.0,
106
+ "std": 0.0,
107
  "count": 100
108
  },
109
  "precision@5": {
 
112
  "count": 100
113
  },
114
  "ndcg@1": {
115
+ "mean": 0.74,
116
+ "std": 0.4386342439892262,
117
  "count": 100
118
  },
119
  "ndcg@3": {
120
+ "mean": 0.8961859507142915,
121
+ "std": 0.17738436382801476,
122
  "count": 100
123
  },
124
  "ndcg@5": {
125
+ "mean": 0.8961859507142915,
126
+ "std": 0.17738436382801476,
127
  "count": 100
128
  }
129
  }
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{0bdb47f3-00af-43ed-a2af-ae5a3eee5f98 → 8577d142-4583-4e84-bf32-088c2e98b3b6}/data_level0.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{0bdb47f3-00af-43ed-a2af-ae5a3eee5f98 → 8577d142-4583-4e84-bf32-088c2e98b3b6}/header.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{0bdb47f3-00af-43ed-a2af-ae5a3eee5f98 → 8577d142-4583-4e84-bf32-088c2e98b3b6}/length.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11491cf0eac47e805aa1b059bb8d72b895d20b41d24581b6a4383eff57db12f5
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fa22d78fd3b20a685b167ad464489ed4470048542d3ee1c780b316fdcb38211
3
  size 40000
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{0bdb47f3-00af-43ed-a2af-ae5a3eee5f98 → 8577d142-4583-4e84-bf32-088c2e98b3b6}/link_lists.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12653e79b55a19108699f56736a4d97a4ad00f3627d6504348862d911eaa1688
3
- size 5410816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97bfefa450a9f891951d6d1f06ec715f9e3b072afd569ba010315ee9034d2ec6
3
+ size 5423104