Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import logging | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| from pydantic_settings import BaseSettings | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| class Settings(BaseSettings): | |
| """Settings for P&L CSV extraction, loaded from environment variables or .env file.""" | |
| excel_file_path: str = Field(default="In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="PNL_EXCEL_FILE_PATH") | |
| output_folder: str = Field(default="data/csv_notes_pnl", env="PNL_OUTPUT_FOLDER") | |
| note_16_23_sheet: str = Field(default="Note 16-23", env="PNL_NOTE_16_23_SHEET") | |
| skiprows: int = Field(default=3, env="PNL_SKIPROWS") | |
| settings = Settings() | |
| def get_xls(excel_file_path: str) -> pd.ExcelFile: | |
| try: | |
| xls = pd.ExcelFile(excel_file_path) | |
| logger.info(f"Loaded Excel file: {excel_file_path}") | |
| logger.info(f"Available sheets: {xls.sheet_names}") | |
| return xls | |
| except Exception as e: | |
| logger.error(f"Failed to load Excel file '{excel_file_path}': {e}") | |
| raise | |
| class NoteCSVInfo(BaseModel): | |
| name: str | |
| rows: int | |
| def clean_note(xls, sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame: | |
| """ | |
| Parse and clean a sheet from the Excel file. | |
| Drops empty rows and columns, resets index. | |
| """ | |
| df = xls.parse(sheet_name, skiprows=skiprows) | |
| df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) | |
| return df | |
| def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo: | |
| """ | |
| Export DataFrame to CSV and return info. | |
| """ | |
| # Always use absolute path for output folder | |
| abs_output_folder = os.path.abspath(output_folder) | |
| try: | |
| os.makedirs(abs_output_folder, exist_ok=True) | |
| logger.info(f"Output folder ensured: {abs_output_folder}") | |
| except Exception as e: | |
| logger.error(f"Failed to create output folder '{abs_output_folder}': {e}") | |
| raise | |
| output_path = os.path.join(abs_output_folder, filename) | |
| df.to_csv(output_path, index=False) | |
| logger.info(f"CSV file written to: {output_path}") | |
| return NoteCSVInfo(name=filename, rows=df.shape[0]) | |
| def main() -> None: | |
| """ | |
| Main function to extract P&L notes from Excel and export as CSV. | |
| """ | |
| import sys | |
| logger.info(f"Current working directory: {os.getcwd()}") | |
| excel_file_path = settings.excel_file_path | |
| if len(sys.argv) > 1: | |
| excel_file_path = sys.argv[1] | |
| logger.info(f"Excel file path from argument: {excel_file_path}") | |
| xls = get_xls(excel_file_path) | |
| if settings.note_16_23_sheet not in xls.sheet_names: | |
| logger.error(f"Sheet '{settings.note_16_23_sheet}' not found in Excel file. Available sheets: {xls.sheet_names}") | |
| return | |
| note_16_23_df = clean_note(xls, settings.note_16_23_sheet, settings.skiprows) | |
| logger.info(f"Loaded DataFrame shape: {note_16_23_df.shape}") | |
| logger.info(f"First few rows:\n{note_16_23_df.head()}\n") | |
| info_16_23 = export_note_to_csv(note_16_23_df, "Note_16_to_23_Full.csv", settings.output_folder) | |
| logger.info(f"Extracted rows: Note 16-23 = {info_16_23.rows} rows") | |
| abs_output_folder = os.path.abspath(settings.output_folder) | |
| logger.info(f"CSV output path: {os.path.join(abs_output_folder, 'Note_16_to_23_Full.csv')}") | |
| if __name__ == "__main__": | |
| main() | |