Spaces:

arbabarshad
/

agllm2-dev

Sleeping

App Files Files Community

arbabarshad committed on Oct 5, 2025

Commit

b9629f4

1 Parent(s): aac482c

starting oct 5

Browse files

Files changed (8) hide show

app_database_prep.py +57 -23
retrieval_evaluation.py +3 -3
retrieval_evaluation_results.json +72 -72
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/data_level0.bin +0 -0
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/header.bin +0 -0
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/length.bin +1 -1
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/link_lists.bin +0 -0
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 +2 -2

app_database_prep.py CHANGED Viewed

@@ -101,7 +101,7 @@ def process_excel_sheet(
 # --- Main Script Logic ---
-# --- INSECTS DATA PROCESSING ---
 insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
 persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
 insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
@@ -124,30 +124,64 @@ metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_inde
 excel_file_path = "agllm-data/PestID Species.xlsx"
-## Process PDF documents and add metadata
-print("--- Processing PDF Documents ---")
-pdf_documents_for_splitting = [] # Prepare list to hold docs with added metadata
-for doc in documents:
-    # Add region for PDF docs
-    doc.metadata["region"] = "United States"
-    # Add species metadata (existing logic)
-    file_name_associated_with_this_doc = doc.metadata["source"].split('/')[-1]
-    matching_species_for_this_file_name = metadata_raw[metadata_raw["File Name"].str.lower() == file_name_associated_with_this_doc.lower()]["Species"]
-    # Ensure matching_species_for_this_file_name is iterable and not empty
-    if not matching_species_for_this_file_name.empty:
-        for specie_index in range(len(matching_species_for_this_file_name)):
-             # Check if specie_index is within bounds (although range should handle this)
-            if specie_index < len(matching_species_for_this_file_name):
-                specie_name = matching_species_for_this_file_name.iloc[specie_index]
-                doc.metadata["matched_specie_" + str(specie_index)] = specie_name
-            else:
-                # This case should ideally not happen with range(len(...))
-                print(f"Warning: Specie index {specie_index} out of bounds for file {file_name_associated_with_this_doc}")
     else:
-         print(f"Warning: No matching species found in CSV for PDF: {file_name_associated_with_this_doc}")
-    pdf_documents_for_splitting.append(doc) # Add modified doc to new list
 # Initialize Text Splitter

 # --- Main Script Logic ---
+# --- INSECTS AND WEEDS DATA PROCESSING --- (despite the "insects" naming, this section covers both weeds and insects)
 insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
 persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
 insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
 excel_file_path = "agllm-data/PestID Species.xlsx"
+## Process PDF documents using CSV → PDF approach
+print("--- Processing PDF Documents (CSV → PDF approach) ---")
+# Function to find PDF file for a given filename
+def find_pdf_file(filename, documents):
+    """Return the first loaded document whose source file name matches *filename*.
+
+    Matching is case-insensitive. An exact file-name match anywhere in
+    *documents* takes priority; only when no exact match exists is the
+    comparison retried with a trailing ".pdf" ignored on both sides.
+
+    Args:
+        filename: File name taken from the metadata sheet. Non-string values
+            (for example NaN from an empty spreadsheet cell) and blank
+            strings never match.
+        documents: Iterable of loaded documents exposing ``metadata["source"]``.
+
+    Returns:
+        The first matching document, or ``None`` when nothing matches.
+    """
+    # An empty spreadsheet cell arrives as a float NaN, which has no .lower().
+    if not isinstance(filename, str) or not filename.strip():
+        return None
+
+    def _without_pdf_suffix(name):
+        # Drop only a trailing ".pdf"; str.replace would also remove ".pdf"
+        # from the middle of a name and produce false matches.
+        return name[:-4] if name.endswith('.pdf') else name
+
+    wanted = filename.strip().lower()
+    wanted_stem = _without_pdf_suffix(wanted)
+    stem_match = None
+    for doc in documents:
+        doc_filename = doc.metadata["source"].split('/')[-1].lower()
+        # An exact match wins immediately, wherever it sits in the list.
+        if doc_filename == wanted:
+            return doc
+        # Remember the first extension-less match as a fallback only.
+        if stem_match is None and _without_pdf_suffix(doc_filename) == wanted_stem:
+            stem_match = doc
+    return stem_match
+# Collect the documents that will be handed to the text splitter, driven by
+# the metadata sheet: each sheet row is resolved to a loaded PDF, and every
+# document belonging to that PDF is tagged with region and species metadata.
+pdf_documents_for_splitting = []
+processed_files = set()
+missing_pdfs = []
+# Group loaded documents by source path so that all documents of a matched PDF
+# are carried forward, not just the one returned by find_pdf_file.
+# NOTE(review): PyMuPDFLoader yields one Document per page by default; confirm
+# against how `documents` is built. With one Document per file this grouping
+# is a no-op.
+documents_by_source = {}
+for loaded_doc in documents:
+    documents_by_source.setdefault(loaded_doc.metadata["source"], []).append(loaded_doc)
+# Process CSV entries first, then find matching PDFs
+print(f"Processing {len(metadata_raw)} CSV entries...")
+for index, row in metadata_raw.iterrows():
+    filename = row['File Name']
+    species = row['Species']
+    # Find the corresponding PDF document
+    pdf_doc = find_pdf_file(filename, documents)
+    if pdf_doc is not None:
+        # Only process if we haven't already processed this file
+        doc_source = pdf_doc.metadata["source"]
+        if doc_source not in processed_files:
+            # All species listed for this file name; always includes the current row
+            same_file_species = metadata_raw[metadata_raw["File Name"].str.lower() == filename.lower()]["Species"]
+            for page_doc in documents_by_source[doc_source]:
+                # Add region for PDF docs
+                page_doc.metadata["region"] = "United States"
+                for specie_index, specie_name in enumerate(same_file_species):
+                    page_doc.metadata[f"matched_specie_{specie_index}"] = specie_name
+                pdf_documents_for_splitting.append(page_doc)
+            processed_files.add(doc_source)
+            print(f"✓ Processed: {filename} → {species}")
+        else:
+            print(f"⚠ Already processed: {filename}")
     else:
+        missing_pdfs.append(filename)
+        print(f"✗ PDF not found for CSV entry: {filename} → {species}")
+# Count distinct source files, since one PDF may contribute several documents.
+print(f"Successfully processed: {len(processed_files)} PDFs")
+print(f"Missing PDFs: {len(missing_pdfs)}")
+if missing_pdfs:
+    print("Missing PDF files:", missing_pdfs[:10])  # Show first 10
+print("---------------------------------------------------")
 # Initialize Text Splitter

retrieval_evaluation.py CHANGED Viewed

@@ -59,7 +59,7 @@ The answer to your question MUST be found in the provided chunk.
 Context: {context}
 Chunk Content:
-{chunk_content[:1500]}  # Limit chunk size for prompt
 Generate a single, clear question (no explanations, just the question):"""
@@ -237,7 +237,7 @@ def main():
     # Configuration
     VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
-    SAMPLE_SIZE = 20   # Start with smaller sample for testing
     K_VALUES = [1, 3, 5]
     OUTPUT_FILE = 'retrieval_evaluation_results.json'
@@ -274,7 +274,7 @@ def main():
         metadata = chunk['metadata']
         species = metadata.get('matched_specie_0', 'MISSING')
         region = metadata.get('region', 'MISSING')
-        source = metadata.get('source', 'unknown')[:50] + "..."  # Truncate for readability
         print(f"Chunk {i+1:2d}: Species='{species}' | Region='{region}' | Source={source}")
     print("##### END DEBUG #####\n")

 Context: {context}
 Chunk Content:
+{chunk_content}  # Limit chunk size for prompt
 Generate a single, clear question (no explanations, just the question):"""
     # Configuration
     VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
+    SAMPLE_SIZE = 100   # Evaluation sample size (raised from the initial 20 used for testing)
     K_VALUES = [1, 3, 5]
     OUTPUT_FILE = 'retrieval_evaluation_results.json'
         metadata = chunk['metadata']
         species = metadata.get('matched_specie_0', 'MISSING')
         region = metadata.get('region', 'MISSING')
+        source = metadata.get('source', 'unknown') + "..."  # Full source shown; NOTE: "..." is still appended although truncation was removed
         print(f"Chunk {i+1:2d}: Species='{species}' | Region='{region}' | Source={source}")
     print("##### END DEBUG #####\n")

retrieval_evaluation_results.json CHANGED Viewed

@@ -1,130 +1,130 @@
 {
   "no_filter": {
     "precision@1": {
-      "mean": 0.55,
-      "std": 0.49749371855331,
-      "count": 20
     },
     "precision@3": {
-      "mean": 0.85,
-      "std": 0.3570714214271425,
-      "count": 20
     },
     "precision@5": {
-      "mean": 0.9,
-      "std": 0.30000000000000004,
-      "count": 20
     },
     "ndcg@1": {
-      "mean": 0.55,
-      "std": 0.49749371855331,
-      "count": 20
     },
     "ndcg@3": {
-      "mean": 0.7327324383928644,
-      "std": 0.353724839687973,
-      "count": 20
     },
     "ndcg@5": {
-      "mean": 0.7542662662965341,
-      "std": 0.319960314564507,
-      "count": 20
     }
   },
   "species_only": {
     "precision@1": {
-      "mean": 0.7692307692307693,
-      "std": 0.4213250442347432,
-      "count": 13
     },
     "precision@3": {
-      "mean": 0.9230769230769231,
-      "std": 0.26646935501059654,
-      "count": 13
     },
     "precision@5": {
-      "mean": 1.0,
-      "std": 0.0,
-      "count": 13
     },
     "ndcg@1": {
-      "mean": 0.7692307692307693,
-      "std": 0.4213250442347432,
-      "count": 13
     },
     "ndcg@3": {
-      "mean": 0.8662968851648396,
-      "std": 0.28284691370224896,
-      "count": 13
     },
     "ndcg@5": {
-      "mean": 0.8960547934136506,
-      "std": 0.19766235701592574,
-      "count": 13
     }
   },
   "region_only": {
     "precision@1": {
-      "mean": 0.6,
-      "std": 0.48989794855663565,
-      "count": 20
     },
     "precision@3": {
-      "mean": 0.85,
-      "std": 0.3570714214271425,
-      "count": 20
     },
     "precision@5": {
-      "mean": 0.9,
-      "std": 0.30000000000000004,
-      "count": 20
     },
     "ndcg@1": {
-      "mean": 0.6,
-      "std": 0.48989794855663565,
-      "count": 20
     },
     "ndcg@3": {
-      "mean": 0.7511859507142915,
-      "std": 0.3575390024008766,
-      "count": 20
     },
     "ndcg@5": {
-      "mean": 0.7727197786179613,
-      "std": 0.32294384868681797,
-      "count": 20
     }
   },
   "species_and_region": {
     "precision@1": {
-      "mean": 0.8461538461538461,
-      "std": 0.36080121229410994,
-      "count": 13
     },
     "precision@3": {
-      "mean": 0.9230769230769231,
-      "std": 0.26646935501059654,
-      "count": 13
     },
     "precision@5": {
-      "mean": 1.0,
-      "std": 0.0,
-      "count": 13
     },
     "ndcg@1": {
-      "mean": 0.8461538461538461,
-      "std": 0.36080121229410994,
-      "count": 13
     },
     "ndcg@3": {
-      "mean": 0.8946869041208814,
-      "std": 0.27624290045474437,
-      "count": 13
     },
     "ndcg@5": {
-      "mean": 0.9244448123696922,
-      "std": 0.18354431531186644,
-      "count": 13
     }
   }
 }

 {
   "no_filter": {
     "precision@1": {
+      "mean": 0.61,
+      "std": 0.4877499359302879,
+      "count": 100
     },
     "precision@3": {
+      "mean": 0.82,
+      "std": 0.38418745424597095,
+      "count": 100
     },
     "precision@5": {
+      "mean": 0.84,
+      "std": 0.36660605559646725,
+      "count": 100
     },
     "ndcg@1": {
+      "mean": 0.61,
+      "std": 0.4877499359302879,
+      "count": 100
     },
     "ndcg@3": {
+      "mean": 0.7359487605714332,
+      "std": 0.38022493138147806,
+      "count": 100
     },
     "ndcg@5": {
+      "mean": 0.7441240542245126,
+      "std": 0.3685408287782305,
+      "count": 100
     }
   },
   "species_only": {
     "precision@1": {
+      "mean": 0.71,
+      "std": 0.4537620521815371,
+      "count": 100
     },
     "precision@3": {
+      "mean": 0.97,
+      "std": 0.17058722109231983,
+      "count": 100
     },
     "precision@5": {
+      "mean": 0.99,
+      "std": 0.09949874371066199,
+      "count": 100
     },
     "ndcg@1": {
+      "mean": 0.71,
+      "std": 0.4537620521815371,
+      "count": 100
     },
     "ndcg@3": {
+      "mean": 0.8661859507142915,
+      "std": 0.23310162928115066,
+      "count": 100
     },
     "ndcg@5": {
+      "mean": 0.8739230068589822,
+      "std": 0.2094424760171824,
+      "count": 100
     }
   },
   "region_only": {
     "precision@1": {
+      "mean": 0.62,
+      "std": 0.48538644398046393,
+      "count": 100
     },
     "precision@3": {
+      "mean": 0.83,
+      "std": 0.375632799419859,
+      "count": 100
     },
     "precision@5": {
+      "mean": 0.86,
+      "std": 0.34698703145794946,
+      "count": 100
     },
     "ndcg@1": {
+      "mean": 0.62,
+      "std": 0.48538644398046393,
+      "count": 100
     },
     "ndcg@3": {
+      "mean": 0.7459487605714332,
+      "std": 0.373834218916114,
+      "count": 100
     },
     "ndcg@5": {
+      "mean": 0.7584308198052464,
+      "std": 0.3552188974398061,
+      "count": 100
     }
   },
   "species_and_region": {
     "precision@1": {
+      "mean": 0.72,
+      "std": 0.4489988864128729,
+      "count": 100
     },
     "precision@3": {
+      "mean": 0.98,
+      "std": 0.13999999999999999,
+      "count": 100
     },
     "precision@5": {
+      "mean": 0.99,
+      "std": 0.09949874371066199,
+      "count": 100
     },
     "ndcg@1": {
+      "mean": 0.72,
+      "std": 0.4489988864128729,
+      "count": 100
     },
     "ndcg@3": {
+      "mean": 0.877495248250006,
+      "std": 0.21470277973614038,
+      "count": 100
     },
     "ndcg@5": {
+      "mean": 0.8813637763223514,
+      "std": 0.20196444998865976,
+      "count": 100
     }
   }
 }

vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/data_level0.bin RENAMED Viewed

File without changes

vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/header.bin RENAMED Viewed

File without changes

vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/length.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e632323b84e2258a31c2401bbb859c7fc59cd994aa4f6b2217651488f3cf3be3
 size 40000

 version https://git-lfs.github.com/spec/v1
+oid sha256:b274da292d64f026adecde33133c35635f3faf9e38eee883d259dcf632c7729b
 size 40000

vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{e82d58e5-16f1-41a6-9289-211464329861 → 8da9893a-19f6-48c6-bb16-8a169d9e166f}/link_lists.bin RENAMED Viewed

File without changes

vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4942e0dbb09693a3162b420dd2471ef8fcfaa541f479979627fa6125d12f2af6
-size 9072640

 version https://git-lfs.github.com/spec/v1
+oid sha256:717b0646137d385b2777333886c81f41d57bae3261a881b66c728a21e465c29b
+size 5414912