Spaces:

AgentSearch
/

AgentBase-Platform

Running

App Files Files Community

AgentBase-Platform / retrieval /utils.py

Arastun

fix: pandas deprecated document handling

8189da6 9 days ago

Raw

History Blame Contribute Delete

1.76 kB

	import json
	from typing import Dict, List, Optional, Tuple

	import pandas as pd


	def agentbase_indexing(db_path: str) -> Tuple[pd.Series, List[str]]:
	    """
	    Build one text document per agent for the AgentBase dense models.

	    1. concatenate every column except agent_id (redundant) and misc
	       (dropped so documents do not go over max_seq_length)
	    2. skip missing/null values instead of rendering them as "nan"
	    3. put the high-priority fields first (agent_name, agent_description,
	       agent_category), followed by the remaining columns in file order

	    :param db_path: path to the agents CSV file; it must have an agent_id column
	    :returns: the agent_id column (a pandas Series) and the prepared documents,
	        aligned row by row
	    :raises KeyError: if the CSV has no agent_id column
	    """
	    agents_df = pd.read_csv(db_path)
	    agent_ids = agents_df["agent_id"]
	    # errors="ignore": a file without a misc column simply has nothing to drop.
	    features = agents_df.drop(columns=["agent_id", "misc"], errors="ignore")

	    high_priority_cols = ["agent_name", "agent_description", "agent_category"]
	    # Only lead with priority columns the file actually contains, so a missing
	    # optional field does not abort indexing.
	    leading = [col for col in high_priority_cols if col in features.columns]
	    trailing = [col for col in features.columns if col not in high_priority_cols]
	    ordered = features[leading + trailing]

	    documents = ordered.apply(
	        lambda row: " ".join(str(value) for value in row if pd.notna(value)),
	        axis=1,
	    ).tolist()
	    return agent_ids, documents


	def load_documents(db_path: str, columns: Optional[List[str]] = None) -> Tuple[pd.Series, List[str]]:
	    """
	    Loads documents (for sparse and dense models) by concatenating the selected
	    column fields with single spaces. Missing values become empty strings, so a
	    row with a missing field keeps the separating space.

	    :param db_path: path to the agents CSV file; it must have an agent_id column
	    :param columns: columns to concatenate, in order; defaults to
	        ["agent_name", "agent_description"]
	    :returns: ids and prepared documents
	    """
	    # Resolve the default here rather than in the signature to avoid a mutable
	    # default argument shared across calls.
	    if columns is None:
	        columns = ["agent_name", "agent_description"]
	    agents_df = pd.read_csv(db_path)
	    agent_ids = agents_df["agent_id"]  # keep agent IDs (mapping back after retrieval)
	    # list(...) lets callers pass any sequence (e.g. a tuple) of column names.
	    documents = agents_df[list(columns)].apply(
	        lambda row: " ".join(row.fillna("").astype(str)), axis=1
	    ).tolist()
	    return agent_ids, documents


	def tokenise(doc: str) -> List[str]:
	    """Lower-cases doc and splits it on runs of whitespace."""
	    lowered = doc.lower()
	    return lowered.split()


	def load_queries(queries_path: str) -> Dict[str, str]:
	    """
	    Loads the queries stored as JSON at queries_path.

	    :param queries_path: path to the JSON file
	    :returns: the parsed JSON content (annotated as a str -> str mapping;
	        the structure is not validated here)
	    """
	    # JSON is UTF-8 by specification; without an explicit encoding open() falls
	    # back to the platform locale and can mis-decode non-ASCII queries.
	    with open(queries_path, encoding="utf-8") as json_file:
	        return json.load(json_file)