Spaces:

arbabarshad
/

agllm2-dev

Sleeping

App Files Files Community

agllm2-dev / app_database_prep.py

arbabarshad

fixed count of species

8302c34 30 days ago

raw

history blame contribute delete

11 kB

	import os
	import shutil

	from langchain.document_loaders import PyPDFDirectoryLoader
	import pandas as pd
	import langchain
	from queue import Queue
	from typing import Any, List
	from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
	from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
	from langchain.schema import LLMResult
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.prompts.prompt import PromptTemplate
	from anyio.from_thread import start_blocking_portal #For model callback streaming

	langchain.debug=True # TODO: DOUBLE CHECK

	system_message = {"role": "system", "content": "You are a helpful assistant."} # TODO: double check how this plays out later.
	import os
	from dotenv import load_dotenv

	import streamlit as st

	from langchain.document_loaders import PyPDFLoader
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.chains.question_answering import load_qa_chain
	from langchain.chat_models import ChatOpenAI
	from langchain.vectorstores import Chroma
	import chromadb


	## added information in metadata:
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.llms import OpenAI
	from langchain.chains import RetrievalQA
	from langchain.document_loaders import TextLoader
	from langchain.document_loaders import DirectoryLoader
	from langchain_community.document_loaders import PyMuPDFLoader
	from langchain.schema import Document


	# Function to process a sheet from the Excel file
	def process_excel_sheet(
	excel_path: str,
	sheet_name: str,
	region: str,
	splitter: RecursiveCharacterTextSplitter
	) -> List[Document]:
	"""Loads data from an Excel sheet, creates Documents, splits them, and adds metadata."""
	print(f"--- Processing Excel Sheet: {sheet_name} (Region: {region}) ---")
	try:
	df = pd.read_excel(excel_path, sheet_name=sheet_name)
	print(f"Excel Data Head ({sheet_name}):\\n", df.head())
	except Exception as e:
	print(f"Error loading sheet '{sheet_name}' from {excel_path}: {e}")
	return []

	initial_documents = []
	for index, row in df.iterrows():
	ipm_info = str(row['IPM Info']) if pd.notna(row['IPM Info']) else ""
	# Check if essential columns exist and are not empty (removed accuracy check)
	if pd.isna(row['Common Name']) or pd.isna(row['Species']):
	print(f"Skipping row {index+2} in sheet '{sheet_name}' due to missing essential data (Common Name or Species).")
	continue

	doc = Document(
	page_content=ipm_info,
	metadata={
	"source": f"{excel_path}#sheet={sheet_name}#row={index+2}",
	"common_name": row['Common Name'],
	"species": row['Species'],
	"matched_specie_0": row['Species'],
	"region": region
	}
	)
	initial_documents.append(doc)

	if initial_documents:
	print(f"First Document from {sheet_name} (before splitting):\\n", initial_documents[0])
	else:
	print(f"No documents created from sheet: {sheet_name}")
	return [] # Return empty list if no documents were created

	split_documents = []
	for doc in initial_documents:
	splits = splitter.split_documents([doc])
	for i, split_doc in enumerate(splits, start=1):
	metadata = split_doc.metadata.copy()
	metadata["source"] = f"{metadata['source']}#chunk{i}"
	split_doc.metadata = metadata
	split_documents.append(split_doc)

	if split_documents:
	print(f"First Document chunk from {sheet_name}:\\n", split_documents[0])

	print(f"Finished processing sheet: {sheet_name}. Found {len(split_documents)} chunks.")
	print("---------------------------------------------------")
	return split_documents

	# --- Main Script Logic ---

	# --- INSECTS DATA PROCESSING --- #actually this includes both the weed and insects.
	insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
	persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
	insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='*/.pdf', loader_cls=PyMuPDFLoader)
	chunk_size_input = 512
	insects_metadata_raw = pd.read_csv(f"./agllm-data/{insects_data_domain_identifier}/matched_species_results_v2.csv")
	insects_documents = insects_loader.load()

	# --- WEEDS DATA PROCESSING ---
	weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
	weeds_loader = DirectoryLoader(f'agllm-data/{weeds_data_domain_identifier}', glob='*/.pdf', loader_cls=PyMuPDFLoader)
	weeds_metadata_raw = pd.read_csv(f"./agllm-data/{weeds_data_domain_identifier}/matched_species_results_v2.csv")
	weeds_documents = weeds_loader.load()

	# Combine documents from both sources before processing
	documents = insects_documents + weeds_documents
	metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)


	## Load Excel File Path (Define once) - Using Organized file as single source of truth
	excel_file_path = "species-organized/PestID Species - Organized.xlsx"


	## Process PDF documents using CSV → PDF approach
	print("--- Processing PDF Documents (CSV → PDF approach) ---")

	# Function to find PDF file for a given filename
	def find_pdf_file(filename, documents):
	"""Find a PDF document by filename in the loaded documents"""
	for doc in documents:
	doc_filename = doc.metadata["source"].split('/')[-1]
	# Try exact match first
	if doc_filename.lower() == filename.lower():
	return doc
	# Try without extension
	if doc_filename.lower().replace('.pdf', '') == filename.lower().replace('.pdf', ''):
	return doc
	return None

	pdf_documents_for_splitting = []
	processed_files = set()
	missing_pdfs = []

	# Process CSV entries first, then find matching PDFs
	print(f"Processing {len(metadata_raw)} CSV entries...")
	for index, row in metadata_raw.iterrows():
	filename = row['File Name']
	species = row['Species']

	# Find the corresponding PDF document
	pdf_doc = find_pdf_file(filename, documents)

	if pdf_doc is not None:
	# Only process if we haven't already processed this file
	doc_source = pdf_doc.metadata["source"]
	if doc_source not in processed_files:
	# Add region for PDF docs
	pdf_doc.metadata["region"] = "United States"

	# Add species metadata - guaranteed to exist since we're starting from CSV
	pdf_doc.metadata["matched_specie_0"] = species

	# Check if there are multiple species for the same file
	same_file_species = metadata_raw[metadata_raw["File Name"].str.lower() == filename.lower()]["Species"]
	for specie_index, specie_name in enumerate(same_file_species):
	pdf_doc.metadata[f"matched_specie_{specie_index}"] = specie_name

	pdf_documents_for_splitting.append(pdf_doc)
	processed_files.add(doc_source)
	print(f"✓ Processed: {filename} → {species}")
	else:
	print(f"⚠ Already processed: {filename}")
	else:
	missing_pdfs.append(filename)
	print(f"✗ PDF not found for CSV entry: {filename} → {species}")

	print(f"Successfully processed: {len(pdf_documents_for_splitting)} PDFs")
	print(f"Missing PDFs: {len(missing_pdfs)}")
	if missing_pdfs:
	print("Missing PDF files:", missing_pdfs[:10]) # Show first 10
	print("---------------------------------------------------")


	# Initialize Text Splitter
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size_input, chunk_overlap=10)


	# Split PDF documents
	pdf_splitted_documents = []
	for doc in pdf_documents_for_splitting: # Use the list with added metadata
	splits = text_splitter.split_documents([doc])
	for i, split_doc in enumerate(splits, start=1):
	metadata = split_doc.metadata.copy()
	# Update source for PDF chunks (existing logic)
	source_base = metadata.get('source', 'unknown_source')
	page_num = metadata.get('page', 'unknown_page')
	metadata["source"] = f"{source_base}#page{page_num}#chunk{i}"
	# Remove the raw page number if desired, as it's now in the source string
	# metadata.pop('page', None)
	split_doc.metadata = metadata
	pdf_splitted_documents.append(split_doc)

	print("First PDF Document chunk:\\n", pdf_splitted_documents[0] if pdf_splitted_documents else "No PDF documents processed")
	print(f"Count after PDF processing: {len(pdf_splitted_documents)}")
	print("---------------------------------------------------")


	# Process Excel Sheets using the function
	india_splitted_documents = process_excel_sheet(
	excel_path=excel_file_path,
	sheet_name="India",
	region="India",
	splitter=text_splitter
	)

	africa_splitted_documents = process_excel_sheet(
	excel_path=excel_file_path,
	sheet_name="Africa",
	region="Africa",
	splitter=text_splitter
	)


	# Combine lists from all sources
	splitted_documents = pdf_splitted_documents + india_splitted_documents + africa_splitted_documents


	print("pdf_splitted_documents", len(pdf_splitted_documents))
	print("india_splitted_documents", len(india_splitted_documents))
	print("africa_splitted_documents", len(africa_splitted_documents))


	# print(splitted_documents[0]) # Original print statement - commented out as we print chunks above
	print("=== Combined Processing Done ===") # Adjusted print statement
	print(f"Total documents after combining PDF, India, and Africa sources: {len(splitted_documents)}")
	print("=============================")



	# ONLY FOR THE FIRST TIME

	# Check if the persist directory exists and delete it to ensure a fresh start
	if os.path.exists(persist_directory):
	print(f"Deleting existing vector database directory: {persist_directory}")
	shutil.rmtree(persist_directory)
	print(f"Directory deleted.")
	else:
	print(f"Vector database directory not found, creating a new one: {persist_directory}")

	embedding = OpenAIEmbeddings()
	vectordb = Chroma.from_documents(documents=splitted_documents,
	embedding=embedding,
	persist_directory=persist_directory)

	# persiste the db to disk
	vectordb.persist()
	vectordb = None


	# Now we can load the persisted database from disk, and use it as normal.

	vectordb = Chroma(persist_directory=persist_directory,
	embedding_function=embedding)


	print(vectordb.get())

	#just a test script:

	specie_selector="Aphis spiraecola"
	filter = {
	"$or": [
	{"matched_specie_0": specie_selector},
	{"matched_specie_1": specie_selector},
	{"matched_specie_2": specie_selector},
	]
	}
	answer = vectordb.as_retriever(search_kwargs={'k':10, 'filter': filter}).get_relevant_documents(
	"anything else.?")
	print(answer)