Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| import sys | |
| import io | |
# Configure logging to stdout only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)


def _clean_text(value):
    """Return *value* with line breaks turned into spaces and edges trimmed.

    Non-string values (NaN, numbers, ...) are returned unchanged so that
    cells which are not text are never altered.
    """
    if not isinstance(value, str):
        return value
    # Replace CRLF first so a Windows line break becomes one space, not two.
    flattened = value.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
    return flattened.strip()


def preprocess_csv(input_data) -> pd.DataFrame:
    """Read a CSV and return a cleaned DataFrame.

    Steps, in order:
    - Remove rows in which every cell is empty.
    - In text columns, replace line breaks (LF, CR and CRLF) with a single
      space and strip leading/trailing whitespace.
    - Fill the ``No.`` column with sequential numbers starting at 1
      (the column is overwritten if present, otherwise added).

    Args:
        input_data: File-like object or path to the input CSV file; passed
            straight to ``pandas.read_csv``.

    Returns:
        DataFrame: The processed dataframe. Row labels of the surviving rows
        are left as produced by ``read_csv`` (they are not renumbered).

    Raises:
        Exception: Any error raised while reading or processing is logged
            and then re-raised unchanged.
    """
    try:
        logger.info("Reading input data")
        # Read CSV data (can handle both file path string or file-like object)
        df = pd.read_csv(input_data)
        logger.info("Original dataframe shape: %s", df.shape)

        # Remove completely empty rows
        df = df.dropna(how='all')
        logger.info("Shape after removing empty rows: %s", df.shape)

        # Handle newline characters in text columns. Both plain object columns
        # and pandas string-dtype columns are treated as text.
        for column in df.columns:
            dtype = df[column].dtype
            if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
                # Element-wise cleaning keeps non-string cells intact; the
                # cast restores the column's original dtype after map().
                df[column] = df[column].map(_clean_text).astype(dtype)

        # Fill No. column with sequential numbers (assigned by position)
        df['No.'] = np.arange(1, len(df) + 1)

        logger.info("Preprocessing complete:")
        logger.info("- Removed empty rows")
        logger.info("- Processed newline characters in string columns")
        logger.info("- Filled No. column with values from 1 to %s", len(df))
        return df
    except Exception as e:
        # Boundary handler: record the failure, then let the caller decide.
        logger.error("Error preprocessing CSV: %s", e)
        raise