Spaces:
Running
Running
File size: 1,762 Bytes
from typing import Dict, List, Tuple
import json
import pandas as pd
def agentbase_indexing(db_path: str) -> Tuple[pd.Series, List[str]]:
    """
    Another indexing configuration for AgentBase dense models.

    1. concatenate and embed all columns, except agent_id (redundant) and misc (not to go over max_seq_length)
    2. handle missing/null values (null cells are left out of the document)
    3. prioritise important fields: agent_name, agent_description and agent_category
       come first, the remaining columns follow in file order

    :param db_path: path of the CSV file to index; it must contain an ``agent_id`` column
    :returns: ids and prepared documents (one space-joined string per CSV row)
    :raises KeyError: if the CSV has no ``agent_id`` column
    """
    agents_df = pd.read_csv(db_path)
    agent_ids = agents_df["agent_id"]

    # errors="ignore": a CSV without the optional "misc" column is still indexable.
    content_df = agents_df.drop(columns=["agent_id", "misc"], errors="ignore")

    high_priority_cols = ["agent_name", "agent_description", "agent_category"]
    # Only keep the priority columns that are actually present, so a CSV that
    # lacks one of them does not fail with a KeyError on every row.
    leading = [col for col in high_priority_cols if col in content_df.columns]
    trailing = [col for col in content_df.columns if col not in high_priority_cols]
    ordered = leading + trailing

    # Explicit row iteration instead of DataFrame.apply(axis=1): on a frame with
    # no rows, apply() hands back a DataFrame, which has no .tolist().
    documents = [
        " ".join(str(row[col]) for col in ordered if pd.notna(row[col]))
        for _, row in content_df.iterrows()
    ]
    return agent_ids, documents
def load_documents(db_path: str, columns=None) -> Tuple[pd.Series, List[str]]:
    """
    Loads documents (for sparse and dense models) by concatenating the selected column fields

    :param db_path: path of the CSV file; it must contain an ``agent_id`` column
    :param columns: names of the columns to concatenate, in order; any sequence of
        column names is accepted. Defaults to ``["agent_name", "agent_description"]``.
    :returns: ids and prepared documents (one space-joined string per CSV row)
    :raises KeyError: if ``agent_id`` or one of the requested columns is missing
    """
    # None sentinel instead of a mutable list default; list() also lets callers
    # pass a tuple, which DataFrame indexing would otherwise treat as one key.
    selected = ["agent_name", "agent_description"] if columns is None else list(columns)

    agents_df = pd.read_csv(db_path)
    agent_ids = agents_df["agent_id"]  # keep agent IDs (mapping back after retrieval)

    # Null cells become empty strings, so the separating space is still emitted.
    documents = agents_df[selected].apply(
        lambda row: ' '.join(row.fillna('').astype(str)), axis=1
    ).tolist()
    return agent_ids, documents
def tokenise(doc: str) -> List[str]:
    """Lower-case ``doc`` and break it into tokens on runs of whitespace."""
    lowered = doc.lower()
    tokens = lowered.split()
    return tokens
def load_queries(queries_path: str) -> Dict[str, str]:
    """
    Loads the queries stored in a JSON file

    :param queries_path: path of the JSON file to read
    :returns: the parsed JSON content, unchanged
    """
    # JSON is exchanged as UTF-8; without an explicit encoding open() falls back
    # to the platform locale and can mis-decode non-ASCII queries.
    with open(queries_path, encoding="utf-8") as json_file:
        return json.load(json_file)
|