Spaces:
Sleeping
Sleeping
File size: 1,796 Bytes
ded29b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import pandas as pd
import numpy as np
import logging
import sys
import io
# Send every log record (INFO and above) to stdout and nowhere else.
# Passing ``stream`` makes basicConfig create the single StreamHandler itself.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used throughout this module, named after the module itself.
logger = logging.getLogger(__name__)
def _normalize_cell(value):
    """Collapse line breaks in one cell; pass non-string values through.

    Every line break (CRLF, lone CR or lone LF) becomes a single space and the
    result is stripped of leading/trailing whitespace. Anything that is not a
    ``str`` (NaN, numbers, ...) is returned unchanged so it is never lost.
    """
    if not isinstance(value, str):
        return value
    # Handle the two-character CRLF first so it yields one space, not two.
    return value.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ').strip()


def preprocess_csv(input_data) -> pd.DataFrame:
    """
    Preprocess a CSV file.

    Steps, in order:
    - Remove rows in which every column is empty
    - Replace line breaks inside text cells with a space and strip the
      surrounding whitespace
    - Fill the ``No.`` column with sequential numbers starting at 1
      (the assignment creates the column if the CSV does not have one)

    Args:
        input_data: File-like object or path to the input CSV file; it is
            passed straight to ``pandas.read_csv``.

    Returns:
        DataFrame: The processed dataframe. Its index is whatever remains
        after the empty rows were dropped (it is not renumbered).

    Raises:
        Exception: Any error raised while reading or processing is logged
            with its traceback and then re-raised unchanged.
    """
    try:
        logger.info("Reading input data")
        # read_csv accepts both a path string and a file-like object.
        df = pd.read_csv(input_data)
        logger.info("Original dataframe shape: %s", df.shape)

        # Drop only rows where *all* columns are missing.
        df = df.dropna(how='all')
        logger.info("Shape after removing empty rows: %s", df.shape)

        # Clean text columns. Both the classic object dtype and pandas'
        # dedicated string dtypes are treated as text, so the cleanup is not
        # skipped when columns are inferred as a string dtype.
        for column in df.columns:
            dtype = df[column].dtype
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                # Element-wise on purpose: the .str accessor would replace
                # non-string values in a mixed column with NaN.
                df[column] = df[column].map(_normalize_cell)

        # Number the surviving rows 1..len(df).
        df['No.'] = np.arange(1, len(df) + 1)

        logger.info("Preprocessing complete:")
        logger.info("- Removed empty rows")
        logger.info("- Processed newline characters in string columns")
        logger.info("- Filled No. column with values from 1 to %s", len(df))
        return df
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception("Error preprocessing CSV: %s", e)
        raise