import os import pandas as pd import sys import logging from typing import Optional from pydantic import BaseModel, Field from pydantic_settings import BaseSettings # Ensure stdout encoding for Unicode sys.stdout.reconfigure(encoding='utf-8') # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class Settings(BaseSettings): """Settings for Balance Sheet CSV extraction, loaded from environment variables or .env file.""" excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="BS_EXCEL_FILE_PATH") output_folder: str = Field(default="data/csv_notes_bs", env="BS_OUTPUT_FOLDER") note_2_8_sheet: str = Field(default="Note 2 - 8", env="BS_NOTE_2_8_SHEET") note_9_sheet: str = Field(default="Note 9", env="BS_NOTE_9_SHEET") note_10_15_sheet: str = Field(default="Note 10-15", env="BS_NOTE_10_15_SHEET") skiprows: int = Field(default=3, env="BS_SKIPROWS") settings = Settings() class NoteCSVInfo(BaseModel): name: str rows: int def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame: """ Parse and clean a sheet from the Excel file. Drops empty rows and columns, resets index. """ df = xls.parse(sheet_name, skiprows=skiprows) df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) return df def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo: """ Export DataFrame to CSV and return info. """ output_path = os.path.join(output_folder, filename) df.to_csv(output_path, index=False) return NoteCSVInfo(name=filename, rows=df.shape[0]) def main() -> None: """ Main function to extract notes from Excel and export as CSVs. """ # Use command-line argument for Excel file path if provided excel_path = settings.excel_file_path if len(sys.argv) > 1: excel_path = sys.argv[1] logger.info(f"Excel file path from argument: {excel_path}") else: logger.info(f"Excel file path from settings: {excel_path}") global xls xls = pd.ExcelFile(excel_path) # Clean each sheet note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows) note_9_df = clean_note(settings.note_9_sheet, settings.skiprows) note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows) # Ensure output folder exists os.makedirs(settings.output_folder, exist_ok=True) # Export each as CSV in the folder info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder) info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder) info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder) # Log confirmation and row counts logger.info(f"Extracted rows: Note 2–8 = {info_2_8.rows} rows") logger.info(f"Extracted rows: Note 9 = {info_9.rows} rows") logger.info(f"Extracted rows: Note 10–15 = {info_10_15.rows} rows") if __name__ == "__main__": try: main() except Exception as e: logger.error(f"Balance sheet data extraction failed: {e}") sys.exit(1)