AI-Agent-Book / utils /preprocess.py
Cuong2004's picture
init project
ded29b0
import pandas as pd
import numpy as np
import logging
import sys
import io
# Send every log record to standard output (no file handlers are attached).
# Passing ``stream=`` makes basicConfig install a single StreamHandler on it.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Matches Windows (\r\n), old-Mac (\r) and Unix (\n) line breaks; \r\n is
# listed first so that a CRLF pair collapses into a single space.
_LINE_BREAK_PATTERN = r'\r\n|\r|\n'


def _clean_text_column(series):
    """Return *series* with line breaks replaced by spaces and text stripped.

    Only elements that are actual ``str`` instances are rewritten; missing
    values and any non-string objects are passed through unchanged instead of
    being turned into NaN by the ``.str`` accessor.
    """
    is_text = series.map(lambda value: isinstance(value, str)).astype(bool)
    if not is_text.any():
        # Nothing to clean (e.g. empty column or no string values at all).
        return series
    cleaned = (
        series.str.replace(_LINE_BREAK_PATTERN, ' ', regex=True).str.strip()
    )
    # Keep the original value wherever the element was not a string.
    return cleaned.where(is_text, series)


def preprocess_csv(input_data):
    """
    Preprocess a CSV file.

    Steps performed, in order:
    - Remove rows in which every value is missing
    - Replace line breaks (``\\r\\n``, ``\\r`` or ``\\n``) in text columns with
      a single space and strip leading/trailing whitespace
    - Fill the ``No.`` column with sequential numbers starting at 1
      (the column is created if the input does not have one)

    The row index of the returned frame is not reset, so it may contain
    gaps where empty rows were dropped.

    Args:
        input_data: File-like object or path to the input CSV file
            (anything accepted by ``pandas.read_csv``).

    Returns:
        DataFrame: The processed dataframe.

    Raises:
        Exception: Any error raised while reading or processing the data is
            logged and then re-raised unchanged.
    """
    try:
        logger.info("Reading input data")

        # Read CSV data (can handle both file path string or file-like object)
        df = pd.read_csv(input_data)
        logger.info("Original dataframe shape: %s", df.shape)

        # Remove completely empty rows
        df = df.dropna(how='all')
        logger.info("Shape after removing empty rows: %s", df.shape)

        # Handle line breaks in text columns. Both the classic object dtype
        # and the pandas string extension dtype can hold text.
        for column in df.columns:
            dtype = df[column].dtype
            holds_text = (
                pd.api.types.is_object_dtype(dtype)
                or isinstance(dtype, pd.StringDtype)
            )
            if holds_text:
                df[column] = _clean_text_column(df[column])

        # Fill No. column with sequential numbers (positional, 1-based)
        df['No.'] = np.arange(1, len(df) + 1)

        logger.info("Preprocessing complete:")
        logger.info("- Removed empty rows")
        logger.info("- Processed newline characters in string columns")
        logger.info("- Filled No. column with values from 1 to %s", len(df))

        return df
    except Exception as e:
        # Log at this boundary for visibility, then let the caller decide.
        logger.error("Error preprocessing CSV: %s", e)
        raise