File size: 11,024 Bytes
7a56e2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9629f4
63c9deb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a56e2a
8302c34
 
7a56e2a
 
b9629f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a56e2a
b9629f4
 
 
 
 
 
 
 
7a56e2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42f7194
 
 
 
 
7a56e2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import os
import shutil

from langchain.document_loaders import PyPDFDirectoryLoader
import pandas as pd
import langchain
from queue import Queue
from typing import Any, List
from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import LLMResult
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts.prompt import PromptTemplate
from anyio.from_thread import start_blocking_portal #For model callback streaming

# Set the `debug` attribute on the imported `langchain` module for this run.
# TODO: double-check whether this should stay enabled.
langchain.debug=True

# System-role chat message; it is not referenced in the visible part of this script.
# TODO: double-check how this plays out later.
system_message = {"role": "system", "content": "You are a helpful assistant."}
import os
from dotenv import load_dotenv

import streamlit as st

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb


## added information in metadata: 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document


# Function to process a sheet from the Excel file
def process_excel_sheet(
    excel_path: str,
    sheet_name: str,
    region: str,
    splitter: "RecursiveCharacterTextSplitter",
) -> "List[Document]":
    """Turn one sheet of an Excel workbook into chunked Documents.

    Each usable row becomes a ``Document`` whose text is the row's
    ``IPM Info`` cell (empty string when the cell is blank) and whose
    metadata records the source location, common name, species and region.
    Every document is then split with ``splitter``; each chunk gets a
    ``#chunk<i>`` suffix (1-based, restarting for every row) appended to its
    ``source`` metadata.

    Args:
        excel_path: Path of the Excel workbook to read.
        sheet_name: Name of the sheet to load.
        region: Value stored under the ``region`` metadata key.
        splitter: Object providing ``split_documents(list_of_documents)``.

    Returns:
        The list of chunk documents. An empty list is returned when the
        sheet cannot be loaded, lacks a required column, or yields no
        documents.
    """
    required_columns = ("IPM Info", "Common Name", "Species")

    print(f"--- Processing Excel Sheet: {sheet_name} (Region: {region}) ---")
    try:
        df = pd.read_excel(excel_path, sheet_name=sheet_name)
    except Exception as e:
        # Best effort: an unreadable workbook/sheet yields no documents
        # instead of aborting the whole build.
        print(f"Error loading sheet '{sheet_name}' from {excel_path}: {e}")
        return []
    print(f"Excel Data Head ({sheet_name}):\n", df.head())

    # Fail the same soft way as a load error when the sheet layout is wrong,
    # rather than raising KeyError halfway through the row loop.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(
            f"Error loading sheet '{sheet_name}' from {excel_path}: "
            f"missing required column(s) {missing_columns}"
        )
        return []

    initial_documents = []
    for index, row in df.iterrows():
        # Rows without a common name or species cannot be looked up later.
        if pd.isna(row['Common Name']) or pd.isna(row['Species']):
            print(f"Skipping row {index+2} in sheet '{sheet_name}' due to missing essential data (Common Name or Species).")
            continue

        ipm_info = str(row['IPM Info']) if pd.notna(row['IPM Info']) else ""
        initial_documents.append(
            Document(
                page_content=ipm_info,
                metadata={
                    # index+2 is presumably the spreadsheet row number
                    # (header row plus 1-based counting) - assumes the
                    # default 0-based DataFrame index.
                    "source": f"{excel_path}#sheet={sheet_name}#row={index+2}",
                    "common_name": row['Common Name'],
                    "species": row['Species'],
                    "matched_specie_0": row['Species'],
                    "region": region,
                },
            )
        )

    if not initial_documents:
        print(f"No documents created from sheet: {sheet_name}")
        return []
    print(f"First Document from {sheet_name} (before splitting):\n", initial_documents[0])

    split_documents = []
    for doc in initial_documents:
        # Split one row at a time so chunk numbers restart for every row.
        for chunk_number, chunk in enumerate(splitter.split_documents([doc]), start=1):
            chunk.metadata = {
                **chunk.metadata,
                "source": f"{chunk.metadata['source']}#chunk{chunk_number}",
            }
            split_documents.append(chunk)

    if split_documents:
        print(f"First Document chunk from {sheet_name}:\n", split_documents[0])

    print(f"Finished processing sheet: {sheet_name}. Found {len(split_documents)} chunks.")
    print("---------------------------------------------------")
    return split_documents

# --- Main Script Logic ---

# --- INSECTS DATA PROCESSING ---
# (Original author's note: this actually includes both the weeds and the insects.)
def _load_domain(domain_identifier):
    """Create the PDF loader for one data domain, read its species CSV, then load its PDFs.

    Returns a ``(loader, csv_dataframe, loaded_documents)`` tuple.
    """
    domain_loader = DirectoryLoader(
        f'agllm-data/{domain_identifier}',
        glob='**/*.pdf',
        loader_cls=PyMuPDFLoader,
    )
    domain_metadata = pd.read_csv(
        f"./agllm-data/{domain_identifier}/matched_species_results_v2.csv"
    )
    return domain_loader, domain_metadata, domain_loader.load()


insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
# The vector database directory is named after the insects domain identifier.
persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
# Chunk size handed to the text splitter further down.
chunk_size_input = 512
insects_loader, insects_metadata_raw, insects_documents = _load_domain(
    insects_data_domain_identifier
)

# --- WEEDS DATA PROCESSING ---
weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
weeds_loader, weeds_metadata_raw, weeds_documents = _load_domain(
    weeds_data_domain_identifier
)

# Merge both domains (documents and CSV rows) before any further processing.
documents = insects_documents + weeds_documents
metadata_raw = pd.concat(
    [insects_metadata_raw, weeds_metadata_raw],
    ignore_index=True,
)


## Excel workbook path, defined once - the "Organized" file is the single source of truth.
excel_file_path = "species-organized/PestID Species - Organized.xlsx"


## Process PDF documents using CSV → PDF approach
print("--- Processing PDF Documents (CSV → PDF approach) ---")

# Function to find PDF file for a given filename
def find_pdf_file(filename, documents):
    """Return the first loaded document whose file name matches ``filename``.

    Matching ignores case and a trailing ``.pdf`` extension on either side.
    The document's file name is the last component of its ``source``
    metadata, with both ``/`` and ``\\`` accepted as path separators.

    Args:
        filename: File name to look for. Anything that is not a string
            (for example a NaN from an empty CSV cell) matches nothing.
        documents: Iterable of objects exposing a ``metadata`` dict with a
            ``source`` entry.

    Returns:
        The first matching document, or ``None`` when there is no match.
    """
    if not isinstance(filename, str):
        return None

    def _comparable(name):
        # Lower-case and drop only a trailing ".pdf"; a ".pdf" in the middle
        # of a name is part of the name and must stay.
        name = name.lower()
        return name[:-len('.pdf')] if name.endswith('.pdf') else name

    wanted = _comparable(filename)  # computed once, not per document
    for doc in documents:
        source = str(doc.metadata.get("source", ""))
        base_name = source.replace('\\', '/').rsplit('/', 1)[-1]
        if _comparable(base_name) == wanted:
            return doc
    return None

# Documents selected for splitting, the PDF sources already handled, and the
# CSV file names for which no PDF was found.
pdf_documents_for_splitting = []
processed_files = set()
missing_pdfs = []

# The PDF loader is expected to yield one Document per page (the splitting
# step reads a `page` metadata entry), so a single PDF shows up as several
# entries in `documents`. Group them by source once so that every page of a
# matched PDF is kept, not only the first one that the lookup returns.
pages_by_source = {}
for loaded_doc in documents:
    pages_by_source.setdefault(loaded_doc.metadata.get("source"), []).append(loaded_doc)

# Process CSV entries first, then find matching PDFs
print(f"Processing {len(metadata_raw)} CSV entries...")
for index, row in metadata_raw.iterrows():
    filename = row['File Name']
    species = row['Species']

    # An empty CSV cell arrives as NaN (a float); it cannot name a file.
    if not isinstance(filename, str):
        print(f"⚠ Skipping CSV row {index}: missing File Name")
        continue

    # Find the corresponding PDF document
    pdf_doc = find_pdf_file(filename, documents)
    if pdf_doc is None:
        missing_pdfs.append(filename)
        print(f"✗ PDF not found for CSV entry: {filename} -> {species}")
        continue

    # Only process each PDF once, even if several CSV rows point at it.
    doc_source = pdf_doc.metadata["source"]
    if doc_source in processed_files:
        print(f"⚠ Already processed: {filename}")
        continue

    # Metadata shared by every page of this PDF.
    shared_metadata = {"region": "United States"}
    if pd.notna(species):
        shared_metadata["matched_specie_0"] = species

    # All species listed in the CSV for this file name (case-insensitive),
    # numbered in CSV order; rows without a species are ignored.
    same_file_species = metadata_raw.loc[
        metadata_raw["File Name"].str.lower() == filename.lower(), "Species"
    ].dropna()
    for specie_index, specie_name in enumerate(same_file_species):
        shared_metadata[f"matched_specie_{specie_index}"] = specie_name

    file_pages = pages_by_source.get(doc_source, [pdf_doc])
    for page_doc in file_pages:
        page_doc.metadata.update(shared_metadata)

    pdf_documents_for_splitting.extend(file_pages)
    processed_files.add(doc_source)
    print(f"✓ Processed: {filename} -> {species}")

print(
    f"Successfully processed: {len(processed_files)} PDFs "
    f"({len(pdf_documents_for_splitting)} page documents)"
)
print(f"Missing PDFs: {len(missing_pdfs)}")
if missing_pdfs:
    print("Missing PDF files:", missing_pdfs[:10])  # Show first 10
print("---------------------------------------------------")


# Initialize Text Splitter (shared by the PDF and Excel processing).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size_input, chunk_overlap=10)


# Split PDF documents
pdf_splitted_documents = []
for doc in pdf_documents_for_splitting:  # Use the list with added metadata
    # Split one document at a time so chunk numbers restart for each of them.
    for chunk_number, chunk in enumerate(text_splitter.split_documents([doc]), start=1):
        chunk_metadata = chunk.metadata.copy()
        # Make every chunk's source unique: <source>#page<page>#chunk<n>.
        # The raw `page` entry is left in the metadata as well.
        source_base = chunk_metadata.get('source', 'unknown_source')
        page_num = chunk_metadata.get('page', 'unknown_page')
        chunk_metadata["source"] = f"{source_base}#page{page_num}#chunk{chunk_number}"
        chunk.metadata = chunk_metadata
        pdf_splitted_documents.append(chunk)

# "\n" (not "\\n") so the chunk is printed on its own line instead of after a
# literal backslash-n.
print("First PDF Document chunk:\n", pdf_splitted_documents[0] if pdf_splitted_documents else "No PDF documents processed")
print(f"Count after PDF processing: {len(pdf_splitted_documents)}")
print("---------------------------------------------------")


# Run the Excel helper for each regional sheet; the sheet name doubles as the
# region label. India is processed first, then Africa.
india_splitted_documents, africa_splitted_documents = [
    process_excel_sheet(
        excel_path=excel_file_path,
        sheet_name=sheet_and_region,
        region=sheet_and_region,
        splitter=text_splitter,
    )
    for sheet_and_region in ("India", "Africa")
]


# Merge the chunks from all three sources: PDFs first, then India, then Africa.
splitted_documents = [
    *pdf_splitted_documents,
    *india_splitted_documents,
    *africa_splitted_documents,
]


# Per-source chunk counts.
for source_label, source_chunks in (
    ("pdf_splitted_documents", pdf_splitted_documents),
    ("india_splitted_documents", india_splitted_documents),
    ("africa_splitted_documents", africa_splitted_documents),
):
    print(source_label, len(source_chunks))


# Overall summary.
print("=== Combined Processing Done ===")
total_chunk_count = len(splitted_documents)
print(f"Total documents after combining PDF, India, and Africa sources: {total_chunk_count}")
print("=============================")



# ONLY FOR THE FIRST TIME

# Ensure a fresh start: remove any vector database persisted by an earlier run.
if not os.path.exists(persist_directory):
    print(f"Vector database directory not found, creating a new one: {persist_directory}")
else:
    print(f"Deleting existing vector database directory: {persist_directory}")
    shutil.rmtree(persist_directory)
    print("Directory deleted.")

# Embed all chunks and build the Chroma store in the persist directory.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=splitted_documents,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Persist the db to disk, then drop the in-memory handle.
vectordb.persist()
vectordb = None


# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)


# Print everything returned by the store's get().
print(vectordb.get())

# Just a test script: query restricted to chunks tagged with one species in
# any of the first three matched_specie_* metadata slots.
specie_selector = "Aphis spiraecola"
# NOTE: `filter` shadows the builtin; the module-level name is kept unchanged.
filter = {
    "$or": [
        {f"matched_specie_{slot}": specie_selector}
        for slot in range(3)
    ]
}
retriever = vectordb.as_retriever(search_kwargs={'k': 10, 'filter': filter})
answer = retriever.get_relevant_documents("anything else.?")
print(answer)