arbabarshad committed on
Commit
b9629f4
·
1 Parent(s): aac482c

starting oct 5

Browse files
app_database_prep.py CHANGED
@@ -101,7 +101,7 @@ def process_excel_sheet(
101
 
102
  # --- Main Script Logic ---
103
 
104
- # --- INSECTS DATA PROCESSING ---
105
  insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
106
  persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
107
  insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
@@ -124,30 +124,64 @@ metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_inde
124
  excel_file_path = "agllm-data/PestID Species.xlsx"
125
 
126
 
127
- ## Process PDF documents and add metadata
128
- print("--- Processing PDF Documents ---")
129
- pdf_documents_for_splitting = [] # Prepare list to hold docs with added metadata
130
- for doc in documents:
131
- # Add region for PDF docs
132
- doc.metadata["region"] = "United States"
133
-
134
- # Add species metadata (existing logic)
135
- file_name_associated_with_this_doc = doc.metadata["source"].split('/')[-1]
136
- matching_species_for_this_file_name = metadata_raw[metadata_raw["File Name"].str.lower() == file_name_associated_with_this_doc.lower()]["Species"]
137
- # Ensure matching_species_for_this_file_name is iterable and not empty
138
- if not matching_species_for_this_file_name.empty:
139
- for specie_index in range(len(matching_species_for_this_file_name)):
140
- # Check if specie_index is within bounds (although range should handle this)
141
- if specie_index < len(matching_species_for_this_file_name):
142
- specie_name = matching_species_for_this_file_name.iloc[specie_index]
143
- doc.metadata["matched_specie_" + str(specie_index)] = specie_name
144
- else:
145
- # This case should ideally not happen with range(len(...))
146
- print(f"Warning: Specie index {specie_index} out of bounds for file {file_name_associated_with_this_doc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  else:
148
- print(f"Warning: No matching species found in CSV for PDF: {file_name_associated_with_this_doc}")
 
149
 
150
- pdf_documents_for_splitting.append(doc) # Add modified doc to new list
 
 
 
 
151
 
152
 
153
  # Initialize Text Splitter
 
101
 
102
  # --- Main Script Logic ---
103
 
104
+ # --- INSECTS DATA PROCESSING --- #actually this includes both the weed and insects.
105
  insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
106
  persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
107
  insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
 
124
  excel_file_path = "agllm-data/PestID Species.xlsx"
125
 
126
 
127
+ ## Process PDF documents using CSV → PDF approach
128
+ print("--- Processing PDF Documents (CSV → PDF approach) ---")
129
+
130
+ # Function to find PDF file for a given filename
131
+ def find_pdf_file(filename, documents):
132
+ """Find a PDF document by filename in the loaded documents"""
133
+ for doc in documents:
134
+ doc_filename = doc.metadata["source"].split('/')[-1]
135
+ # Try exact match first
136
+ if doc_filename.lower() == filename.lower():
137
+ return doc
138
+ # Try without extension
139
+ if doc_filename.lower().replace('.pdf', '') == filename.lower().replace('.pdf', ''):
140
+ return doc
141
+ return None
142
+
143
+ pdf_documents_for_splitting = []
144
+ processed_files = set()
145
+ missing_pdfs = []
146
+
147
+ # Process CSV entries first, then find matching PDFs
148
+ print(f"Processing {len(metadata_raw)} CSV entries...")
149
+ for index, row in metadata_raw.iterrows():
150
+ filename = row['File Name']
151
+ species = row['Species']
152
+
153
+ # Find the corresponding PDF document
154
+ pdf_doc = find_pdf_file(filename, documents)
155
+
156
+ if pdf_doc is not None:
157
+ # Only process if we haven't already processed this file
158
+ doc_source = pdf_doc.metadata["source"]
159
+ if doc_source not in processed_files:
160
+ # Add region for PDF docs
161
+ pdf_doc.metadata["region"] = "United States"
162
+
163
+ # Add species metadata - guaranteed to exist since we're starting from CSV
164
+ pdf_doc.metadata["matched_specie_0"] = species
165
+
166
+ # Check if there are multiple species for the same file
167
+ same_file_species = metadata_raw[metadata_raw["File Name"].str.lower() == filename.lower()]["Species"]
168
+ for specie_index, specie_name in enumerate(same_file_species):
169
+ pdf_doc.metadata[f"matched_specie_{specie_index}"] = specie_name
170
+
171
+ pdf_documents_for_splitting.append(pdf_doc)
172
+ processed_files.add(doc_source)
173
+ print(f"✓ Processed: {filename} → {species}")
174
+ else:
175
+ print(f"⚠ Already processed: {filename}")
176
  else:
177
+ missing_pdfs.append(filename)
178
+ print(f"✗ PDF not found for CSV entry: {filename} → {species}")
179
 
180
+ print(f"Successfully processed: {len(pdf_documents_for_splitting)} PDFs")
181
+ print(f"Missing PDFs: {len(missing_pdfs)}")
182
+ if missing_pdfs:
183
+ print("Missing PDF files:", missing_pdfs[:10]) # Show first 10
184
+ print("---------------------------------------------------")
185
 
186
 
187
  # Initialize Text Splitter
retrieval_evaluation.py CHANGED
@@ -59,7 +59,7 @@ The answer to your question MUST be found in the provided chunk.
59
  Context: {context}
60
 
61
  Chunk Content:
62
- {chunk_content[:1500]} # Limit chunk size for prompt
63
 
64
  Generate a single, clear question (no explanations, just the question):"""
65
 
@@ -237,7 +237,7 @@ def main():
237
 
238
  # Configuration
239
  VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
240
- SAMPLE_SIZE = 20 # Start with smaller sample for testing
241
  K_VALUES = [1, 3, 5]
242
  OUTPUT_FILE = 'retrieval_evaluation_results.json'
243
 
@@ -274,7 +274,7 @@ def main():
274
  metadata = chunk['metadata']
275
  species = metadata.get('matched_specie_0', 'MISSING')
276
  region = metadata.get('region', 'MISSING')
277
- source = metadata.get('source', 'unknown')[:50] + "..." # Truncate for readability
278
  print(f"Chunk {i+1:2d}: Species='{species}' | Region='{region}' | Source={source}")
279
  print("##### END DEBUG #####\n")
280
 
 
59
  Context: {context}
60
 
61
  Chunk Content:
62
+ {chunk_content} # Limit chunk size for prompt
63
 
64
  Generate a single, clear question (no explanations, just the question):"""
65
 
 
237
 
238
  # Configuration
239
  VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
240
+ SAMPLE_SIZE = 100 # Start with smaller sample for testing
241
  K_VALUES = [1, 3, 5]
242
  OUTPUT_FILE = 'retrieval_evaluation_results.json'
243
 
 
274
  metadata = chunk['metadata']
275
  species = metadata.get('matched_specie_0', 'MISSING')
276
  region = metadata.get('region', 'MISSING')
277
+ source = metadata.get('source', 'unknown') + "..." # Truncate for readability
278
  print(f"Chunk {i+1:2d}: Species='{species}' | Region='{region}' | Source={source}")
279
  print("##### END DEBUG #####\n")
280
 
retrieval_evaluation_results.json CHANGED
@@ -1,130 +1,130 @@
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
- "mean": 0.55,
5
- "std": 0.49749371855331,
6
- "count": 20
7
  },
8
  "precision@3": {
9
- "mean": 0.85,
10
- "std": 0.3570714214271425,
11
- "count": 20
12
  },
13
  "precision@5": {
14
- "mean": 0.9,
15
- "std": 0.30000000000000004,
16
- "count": 20
17
  },
18
  "ndcg@1": {
19
- "mean": 0.55,
20
- "std": 0.49749371855331,
21
- "count": 20
22
  },
23
  "ndcg@3": {
24
- "mean": 0.7327324383928644,
25
- "std": 0.353724839687973,
26
- "count": 20
27
  },
28
  "ndcg@5": {
29
- "mean": 0.7542662662965341,
30
- "std": 0.319960314564507,
31
- "count": 20
32
  }
33
  },
34
  "species_only": {
35
  "precision@1": {
36
- "mean": 0.7692307692307693,
37
- "std": 0.4213250442347432,
38
- "count": 13
39
  },
40
  "precision@3": {
41
- "mean": 0.9230769230769231,
42
- "std": 0.26646935501059654,
43
- "count": 13
44
  },
45
  "precision@5": {
46
- "mean": 1.0,
47
- "std": 0.0,
48
- "count": 13
49
  },
50
  "ndcg@1": {
51
- "mean": 0.7692307692307693,
52
- "std": 0.4213250442347432,
53
- "count": 13
54
  },
55
  "ndcg@3": {
56
- "mean": 0.8662968851648396,
57
- "std": 0.28284691370224896,
58
- "count": 13
59
  },
60
  "ndcg@5": {
61
- "mean": 0.8960547934136506,
62
- "std": 0.19766235701592574,
63
- "count": 13
64
  }
65
  },
66
  "region_only": {
67
  "precision@1": {
68
- "mean": 0.6,
69
- "std": 0.48989794855663565,
70
- "count": 20
71
  },
72
  "precision@3": {
73
- "mean": 0.85,
74
- "std": 0.3570714214271425,
75
- "count": 20
76
  },
77
  "precision@5": {
78
- "mean": 0.9,
79
- "std": 0.30000000000000004,
80
- "count": 20
81
  },
82
  "ndcg@1": {
83
- "mean": 0.6,
84
- "std": 0.48989794855663565,
85
- "count": 20
86
  },
87
  "ndcg@3": {
88
- "mean": 0.7511859507142915,
89
- "std": 0.3575390024008766,
90
- "count": 20
91
  },
92
  "ndcg@5": {
93
- "mean": 0.7727197786179613,
94
- "std": 0.32294384868681797,
95
- "count": 20
96
  }
97
  },
98
  "species_and_region": {
99
  "precision@1": {
100
- "mean": 0.8461538461538461,
101
- "std": 0.36080121229410994,
102
- "count": 13
103
  },
104
  "precision@3": {
105
- "mean": 0.9230769230769231,
106
- "std": 0.26646935501059654,
107
- "count": 13
108
  },
109
  "precision@5": {
110
- "mean": 1.0,
111
- "std": 0.0,
112
- "count": 13
113
  },
114
  "ndcg@1": {
115
- "mean": 0.8461538461538461,
116
- "std": 0.36080121229410994,
117
- "count": 13
118
  },
119
  "ndcg@3": {
120
- "mean": 0.8946869041208814,
121
- "std": 0.27624290045474437,
122
- "count": 13
123
  },
124
  "ndcg@5": {
125
- "mean": 0.9244448123696922,
126
- "std": 0.18354431531186644,
127
- "count": 13
128
  }
129
  }
130
  }
 
1
  {
2
  "no_filter": {
3
  "precision@1": {
4
+ "mean": 0.61,
5
+ "std": 0.4877499359302879,
6
+ "count": 100
7
  },
8
  "precision@3": {
9
+ "mean": 0.82,
10
+ "std": 0.38418745424597095,
11
+ "count": 100
12
  },
13
  "precision@5": {
14
+ "mean": 0.84,
15
+ "std": 0.36660605559646725,
16
+ "count": 100
17
  },
18
  "ndcg@1": {
19
+ "mean": 0.61,
20
+ "std": 0.4877499359302879,
21
+ "count": 100
22
  },
23
  "ndcg@3": {
24
+ "mean": 0.7359487605714332,
25
+ "std": 0.38022493138147806,
26
+ "count": 100
27
  },
28
  "ndcg@5": {
29
+ "mean": 0.7441240542245126,
30
+ "std": 0.3685408287782305,
31
+ "count": 100
32
  }
33
  },
34
  "species_only": {
35
  "precision@1": {
36
+ "mean": 0.71,
37
+ "std": 0.4537620521815371,
38
+ "count": 100
39
  },
40
  "precision@3": {
41
+ "mean": 0.97,
42
+ "std": 0.17058722109231983,
43
+ "count": 100
44
  },
45
  "precision@5": {
46
+ "mean": 0.99,
47
+ "std": 0.09949874371066199,
48
+ "count": 100
49
  },
50
  "ndcg@1": {
51
+ "mean": 0.71,
52
+ "std": 0.4537620521815371,
53
+ "count": 100
54
  },
55
  "ndcg@3": {
56
+ "mean": 0.8661859507142915,
57
+ "std": 0.23310162928115066,
58
+ "count": 100
59
  },
60
  "ndcg@5": {
61
+ "mean": 0.8739230068589822,
62
+ "std": 0.2094424760171824,
63
+ "count": 100
64
  }
65
  },
66
  "region_only": {
67
  "precision@1": {
68
+ "mean": 0.62,
69
+ "std": 0.48538644398046393,
70
+ "count": 100
71
  },
72
  "precision@3": {
73
+ "mean": 0.83,
74
+ "std": 0.375632799419859,
75
+ "count": 100
76
  },
77
  "precision@5": {
78
+ "mean": 0.86,
79
+ "std": 0.34698703145794946,
80
+ "count": 100
81
  },
82
  "ndcg@1": {
83
+ "mean": 0.62,
84
+ "std": 0.48538644398046393,
85
+ "count": 100
86
  },
87
  "ndcg@3": {
88
+ "mean": 0.7459487605714332,
89
+ "std": 0.373834218916114,
90
+ "count": 100
91
  },
92
  "ndcg@5": {
93
+ "mean": 0.7584308198052464,
94
+ "std": 0.3552188974398061,
95
+ "count": 100
96
  }
97
  },
98
  "species_and_region": {
99
  "precision@1": {
100
+ "mean": 0.72,
101
+ "std": 0.4489988864128729,
102
+ "count": 100
103
  },
104
  "precision@3": {
105
+ "mean": 0.98,
106
+ "std": 0.13999999999999999,
107
+ "count": 100
108
  },
109
  "precision@5": {
110
+ "mean": 0.99,
111
+ "std": 0.09949874371066199,
112
+ "count": 100
113
  },
114
  "ndcg@1": {
115
+ "mean": 0.72,
116
+ "std": 0.4489988864128729,
117
+ "count": 100
118
  },
119
  "ndcg@3": {
120
+ "mean": 0.877495248250006,
121
+ "std": 0.21470277973614038,
122
+ "count": 100
123
  },
124
  "ndcg@5": {
125
+ "mean": 0.8813637763223514,
126
+ "std": 0.20196444998865976,
127
+ "count": 100
128
  }
129
  }
130
  }
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/data_level0.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/header.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/length.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e632323b84e2258a31c2401bbb859c7fc59cd994aa4f6b2217651488f3cf3be3
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b274da292d64f026adecde33133c35635f3faf9e38eee883d259dcf632c7729b
3
  size 40000
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/link_lists.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4942e0dbb09693a3162b420dd2471ef8fcfaa541f479979627fa6125d12f2af6
3
- size 9072640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:717b0646137d385b2777333886c81f41d57bae3261a881b66c728a21e465c29b
3
+ size 5414912