| | import itertools
|
| | import re
|
| |
|
| | from numpy import array
|
| | from pandas import DataFrame
|
| |
|
| |
|
class SearchError(Exception):
    """Signal that a search returned misaligned results."""
|
| |
|
| |
|
| |
|
def search_columns(df: DataFrame,
                   patterns: list,
                   columns: list,
                   return_as: str = "indicator_column",
                   return_column: str = "indicator",
                   re_flags: int = re.I | re.X) -> DataFrame:
    """Search dataframe columns for regex patterns and flag or filter matching rows.

    A row counts as a match when at least one (pattern, column) pair matches.
    Pairs are formed in one of two ways:

    * ``patterns`` and ``columns`` have the same length: they are paired
      positionally (first pattern with first column, and so on).
    * ``patterns`` holds exactly one pattern: that pattern is searched in
      every listed column.

    Missing values and non-string cells never match.

    Args:
        df (DataFrame): Input data. It is not modified; a copy is returned.
        patterns (list): Regex pattern strings to search for.
        columns (list): Names of the columns to search.
        return_as (str, optional): "indicator_column" returns a copy of ``df``
            with a 0/1 column named ``return_column``; "filtered_df" returns
            only the matching rows. Defaults to "indicator_column".
        return_column (str, optional): Name of the 0/1 column written when
            ``return_as`` is "indicator_column". Defaults to "indicator".
        re_flags (optional): Regex flags forwarded to ``Series.str.contains``.
            Defaults to re.I | re.X. With re.X (verbose mode) unescaped
            whitespace inside a pattern is ignored. NOTE: the search also
            passes ``case=False``, so matching stays case-insensitive even
            when re.I is left out of ``re_flags``.

    Raises:
        TypeError: If ``patterns`` or ``columns`` is not a list.
        ValueError: If ``patterns`` or ``columns`` is empty, or their lengths
            neither match nor consist of a single pattern.
        ValueError: If ``return_as`` is not one of the supported values.

    Returns:
        DataFrame: Copy of ``df`` with the indicator column added, or a copy
        of the matching rows only.
    """
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    length_error = (
        "Length of inputs are incorrect. Lengths of 'patterns' and 'columns' "
        "can either match or a single pattern can map to multiple columns."
    )

    # Empty inputs would leave nothing to search and produce a scalar mask
    # instead of a per-row mask, so reject them together with bad lengths.
    if not patterns or not columns:
        raise ValueError(length_error)

    if len(patterns) == len(columns):
        pairs = zip(patterns, columns)
    elif len(patterns) == 1:
        pairs = itertools.product(patterns, columns)
    else:
        raise ValueError(length_error)

    # Fail fast on a bad mode before doing any (potentially expensive) search.
    if return_as not in ("indicator_column", "filtered_df"):
        raise ValueError("Incorrect input for 'return_as' parameter.")

    # na=False turns missing / non-string cells into False so every mask is a
    # plain boolean array of the same length as df.
    masks = [
        df[column].str.contains(
            pattern, regex=True, case=False, flags=re_flags, na=False
        ).to_numpy(dtype=bool)
        for pattern, column in pairs
    ]

    # Row-wise OR across all (pattern, column) searches.
    filter_bool = array(masks).any(axis=0)

    if return_as == "filtered_df":
        return df.loc[filter_bool, :].copy(deep=True)

    dfResults = df.copy(deep=True)
    dfResults.loc[:, return_column] = 0
    dfResults.loc[filter_bool, return_column] = 1
    return dfResults
|
| |
|