| import itertools |
| import re |
|
|
| from numpy import array |
| from pandas import DataFrame |
|
|
|
|
class SearchError(Exception):
    """Signal that a search produced results that do not line up.

    A plain ``Exception`` subclass with no extra state or behaviour; it
    exists so callers can catch this failure mode by name.
    """
|
|
|
|
| |
def search_columns(
    df: DataFrame,
    patterns: list,
    columns: list,
    return_as: str = "indicator_column",
    return_column: str = "indicator",
    re_flags=re.I | re.X,
) -> DataFrame:
    """Flag or filter the rows of ``df`` whose columns match regex patterns.

    A row counts as a hit when *any* (pattern, column) pair matches.  Pairs
    are built in one of two ways:

    * ``len(patterns) == len(columns)``: patterns and columns are paired
      positionally (first pattern with first column, and so on).
    * ``len(patterns) == 1``: the single pattern is searched in every column.

    If pairing yields no (pattern, column) pairs at all (e.g. both lists are
    empty), no row is considered a hit.

    Args:
        df (DataFrame): Input data. It is never modified; a deep copy is
            returned.
        patterns (list): Regular-expression strings to search for.
        columns (list): Names of the columns to search. Missing values and
            non-string cells never match.
        return_as (str, optional): ``"indicator_column"`` returns a copy of
            ``df`` with an added 0/1 column; ``"filtered_df"`` returns only
            the matching rows. Defaults to ``"indicator_column"``.
        return_column (str, optional): Name of the 0/1 column written when
            ``return_as == "indicator_column"``. An existing column of that
            name is overwritten in the returned copy. Defaults to
            ``"indicator"``.
        re_flags (optional): Regex flags forwarded to
            ``Series.str.contains``. Defaults to ``re.I | re.X``. Note that
            ``re.X`` (verbose mode) makes unescaped whitespace inside a
            pattern insignificant, and that the search is always
            case-insensitive because ``case=False`` is passed regardless of
            these flags.

    Raises:
        TypeError: If ``patterns`` or ``columns`` is not a list.
        ValueError: If the lengths of ``patterns`` and ``columns`` neither
            match nor consist of a single pattern for several columns.
        ValueError: If ``return_as`` is not one of the supported values.

    Returns:
        DataFrame: Deep copy of ``df`` with the indicator column added, or a
        deep copy restricted to the matching rows.
    """
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    if len(patterns) == len(columns):
        pairs = list(zip(patterns, columns))
    elif len(patterns) == 1:
        # Broadcast the lone pattern across every requested column.
        pairs = [(patterns[0], column) for column in columns]
    else:
        raise ValueError("Length of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match or a single pattern can map to multiple columns.")

    # Reject a bad mode before doing any (potentially expensive) searching.
    if return_as not in ("indicator_column", "filtered_df"):
        raise ValueError("Incorrect input for 'return_as' parameter.")

    # Start from "nothing matched" so an empty set of pairs still yields a
    # mask of the right length instead of a scalar.
    mask = array([False] * len(df), dtype=bool)
    for pattern, column in pairs:
        # na=False maps missing / non-string cells to False, so the result is
        # a clean boolean vector whatever the column's dtype.
        hits = df[column].str.contains(
            pattern, regex=True, case=False, flags=re_flags, na=False
        )
        mask |= hits.to_numpy(dtype=bool)

    if return_as == "filtered_df":
        return df.loc[mask, :].copy(deep=True)

    flagged = df.copy(deep=True)
    flagged.loc[:, return_column] = 0
    flagged.loc[mask, return_column] = 1
    return flagged
|
|