File size: 2,730 Bytes
2e164d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import pandas as pd
import re
from typing import Any
# Precompiled pattern for strings made up only of non-alphanumeric characters.
# ``\W`` is Unicode-aware for ``str`` patterns, so letters and digits from any
# script count as alphanumeric (an ASCII-only class would wrongly flag text such
# as "привет" as special-only). ``_`` is listed explicitly because ``\w`` treats
# it as a word character, yet it is still a special character for our purposes.
SPECIAL_ONLY_REGEX = re.compile(r'^[\W_]+$')


def is_numeric_or_special(s: Any) -> bool:
    """
    Check if the provided value is numeric or consists solely of special characters.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before it is examined. "Numeric" means anything ``float()`` accepts, which
    includes exponent notation ("1e5") and the literals "nan" / "inf".
    "Special-only" means a non-empty string without a single letter or digit
    (in any script).

    Parameters:
    s (Any): The input value to check.
    Returns:
    bool: True if the value is numeric or special-only, False otherwise.
    Null values (None, NaN, NaT) and empty / whitespace-only strings
    return False.
    """
    if pd.isnull(s):
        return False

    # Work on a trimmed string representation so non-string inputs are handled.
    text = str(s).strip()

    # Numeric check: rely on float() rather than a hand-written pattern.
    try:
        float(text)
    except ValueError:
        pass
    else:
        return True

    # Special-only check: no alphanumeric character anywhere in the string.
    return bool(SPECIAL_ONLY_REGEX.match(text))
def remove_numeric_or_special_responses(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """
    Remove rows from the DataFrame where the target column's value is either numeric or
    consists solely of special characters.

    The input DataFrame is not modified; a filtered copy with a fresh
    0..n-1 index is returned.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    target_col (str): The name of the column to filter.
    Returns:
    pd.DataFrame: A DataFrame with the undesired responses removed.
    Raises:
    KeyError: If target_col is not a column of df.
    """
    # Cast the mask to bool explicitly: for an empty column the result of
    # ``map`` is not guaranteed to have a boolean dtype, in which case ``~``
    # can fail or the mask can be misread as a list of column labels.
    unwanted = df[target_col].map(is_numeric_or_special).astype(bool)

    # ``.loc`` makes it unambiguous that this is row selection by boolean mask.
    return df.loc[~unwanted].reset_index(drop=True)
#####################
# DATE CONVERT
#####################
import pandas as pd
import datetime
from dateutil import parser
def robust_convert_date(date_series):
    """
    Convert a pandas Series containing dates in various formats to datetime objects.

    Each element is converted independently:
    1. Null values become pd.NaT.
    2. Values that are already pd.Timestamp / datetime.datetime are returned unchanged.
    3. pd.to_datetime() is tried with dayfirst=True and errors='coerce'.
    4. Anything still unparsed falls back to dateutil.parser.parse (dayfirst=True).

    Parameters:
    date_series (pd.Series): A pandas Series with date values (as strings, numbers, etc.)
    Returns:
    pd.Series: A Series of datetime objects (or pd.NaT if conversion fails)
    """
    def convert_single(x):
        # Missing values map straight to NaT.
        if pd.isnull(x):
            return pd.NaT
        # If the value is already a datetime, just return it.
        if isinstance(x, (pd.Timestamp, datetime.datetime)):
            return x
        # First, try pd.to_datetime with coercion. The `infer_datetime_format`
        # argument is intentionally not passed: it is deprecated since
        # pandas 2.0 (format inference is the default) and only triggered a
        # warning for every element.
        dt = pd.to_datetime(x, errors='coerce', dayfirst=True)
        if pd.notnull(dt):
            return dt
        # Fallback: use dateutil.parser to attempt parsing.
        try:
            return parser.parse(str(x), dayfirst=True)
        except Exception:
            # Deliberately broad: this is a best-effort conversion and the
            # documented contract is NaT for anything that cannot be parsed.
            return pd.NaT

    return date_series.apply(convert_single)
|