import io
import logging
import sys

import numpy as np
import pandas as pd

# Configure logging to stdout only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)


def _clean_text_series(series):
    """Return *series* with newlines replaced and whitespace stripped in str cells.

    Only cells that are actual ``str`` instances are rewritten; every other
    value (NaN, booleans, numbers, ...) is passed through untouched. Going
    through the ``.str`` accessor on the whole column would raise for object
    columns that hold no strings (for example booleans with missing values)
    and would turn non-string cells into NaN in mixed columns.
    """
    text_mask = series.map(lambda value: isinstance(value, str)).astype(bool)
    if not text_mask.any():
        # Nothing textual in this column: leave it exactly as it was read.
        return series

    cleaned = series.copy()
    cleaned.loc[text_mask] = (
        series.loc[text_mask]
        # regex=False: '\n' is a literal, and the default has changed across
        # pandas releases, so be explicit.
        .str.replace('\n', ' ', regex=False)
        .str.strip()
    )
    return cleaned


def preprocess_csv(input_data) -> pd.DataFrame:
    """
    Preprocess a CSV file:
    - Remove empty rows
    - Handle newline characters in string columns
    - Fill No. column with sequential numbers

    Args:
        input_data: File-like object or path to the input CSV file

    Returns:
        DataFrame: The processed dataframe. Rows in which every cell is
        missing are dropped (the surviving rows keep their original index
        labels), each ``'\\n'`` inside string cells is replaced by a space
        and the cell is stripped of surrounding whitespace, and the ``No.``
        column is set (or created) to ``1..len(df)``.

    Raises:
        Exception: Any error raised while reading or processing the data is
        logged and then re-raised unchanged.
    """
    try:
        logger.info("Reading input data")

        # Read CSV data (can handle both file path string or file-like object)
        df = pd.read_csv(input_data)
        logger.info("Original dataframe shape: %s", df.shape)

        # Remove completely empty rows
        df = df.dropna(how='all')
        logger.info("Shape after removing empty rows: %s", df.shape)

        # Handle newline characters in string columns. Both plain object
        # columns and pandas' dedicated string dtype can hold text.
        for column in df.columns:
            dtype = df[column].dtype
            if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
                df[column] = _clean_text_series(df[column])

        # Fill No. column with sequential numbers
        df['No.'] = np.arange(1, len(df) + 1)

        logger.info("Preprocessing complete:")
        logger.info("- Removed empty rows")
        logger.info("- Processed newline characters in string columns")
        logger.info("- Filled No. column with values from 1 to %s", len(df))

        return df

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error preprocessing CSV: %s", e)
        raise