File size: 1,762 Bytes
277590a
98aa770
 
 
 
 
277590a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98aa770
 
8189da6
 
 
98aa770
 
 
 
 
 
 
277590a
98aa770
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
from typing import Dict, List, Optional, Sequence, Tuple

import pandas as pd


def agentbase_indexing(db_path: str) -> Tuple[pd.Series, List[str]]:
    """
    Another indexing configuration for AgentBase dense models.
        1. concatenate and embed all columns, except agent_id (redundant) and misc (not to go over max_seq_length)
        2. handle missing/null values (null fields are skipped instead of being rendered as "nan")
        3. prioritise important fields by placing them first in every document

    :param db_path: path to the agents CSV file; it must contain an ``agent_id`` column
    :returns: ids and prepared documents (one space-joined string per CSV row)
    :raises KeyError: if the CSV has no ``agent_id`` column
    """
    agents_df = pd.read_csv(db_path)
    agent_ids = agents_df["agent_id"]
    # errors="ignore": a CSV without a "misc" column is still a valid input.
    agents_df = agents_df.drop(columns=["agent_id", "misc"], errors="ignore")

    # A header-only CSV has no rows to join; a row-wise apply() on it would
    # hand back a DataFrame (which has no .tolist()), so return early.
    if len(agents_df.index) == 0:
        return agent_ids, []

    # Keep only the priority fields that are actually present, so a CSV that
    # lacks one of them does not fail with a KeyError while building documents.
    high_priority_cols = [
        col
        for col in ("agent_name", "agent_description", "agent_category")
        if col in agents_df.columns
    ]
    # Priority fields first, then every remaining column in file order.
    columns = high_priority_cols + [col for col in agents_df.columns if col not in high_priority_cols]
    documents = agents_df.apply(
        lambda row: ' '.join([f"{row[col]}" for col in columns if pd.notna(row[col])]), axis=1
    ).tolist()
    return agent_ids, documents


def load_documents(
    db_path: str, columns: Optional[Sequence[str]] = None
) -> Tuple[pd.Series, List[str]]:
    """
    Loads documents (for sparse and dense models) by concatenating the selected column fields

    :param db_path: path to the agents CSV file; it must contain an ``agent_id`` column
    :param columns: names of the columns to join, in order; defaults to
        ``["agent_name", "agent_description"]``. Any sequence of names is accepted,
        and a single column name may be given as a plain string.
    :returns: ids and prepared documents (one space-joined string per CSV row)
    :raises KeyError: if ``agent_id`` or any requested column is missing from the CSV
    """
    # None sentinel instead of a mutable list default shared across calls.
    if columns is None:
        columns = ["agent_name", "agent_description"]
    elif isinstance(columns, str):
        # A bare string would otherwise select a Series, breaking the row-wise apply.
        columns = [columns]
    else:
        # pandas treats a tuple as a single key, so normalise to a list.
        columns = list(columns)

    agents_df = pd.read_csv(db_path)
    agent_ids = agents_df["agent_id"] # keep agent IDs (mapping back after retrieval)
    # Missing values become empty strings, so their separating space is kept.
    documents = agents_df[columns].apply(
        lambda row: ' '.join(row.fillna('').astype(str)), axis=1
    ).tolist()
    return agent_ids, documents


def tokenise(doc: str) -> List[str]:
    """
    Lower-cases a document and splits it on runs of whitespace
    :returns: the list of lower-cased tokens (empty for a blank document)
    """
    lowered = doc.lower()
    tokens = lowered.split()
    return tokens


def load_queries(queries_path: str) -> Dict[str, str]:
    """
    Loads queries from a JSON file

    :param queries_path: path to the JSON file
    :returns: the parsed JSON content; annotated as a str-to-str mapping
        (presumably query id -> query text; the schema is not validated here)
    """
    # Read as UTF-8 explicitly: otherwise open() uses the platform's locale
    # encoding, which can mis-decode non-ASCII queries.
    with open(queries_path, encoding="utf-8") as json_file:
        return json.load(json_file)