Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import sys | |
| import logging | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| from pydantic_settings import BaseSettings | |
| # Ensure stdout encoding for Unicode | |
| sys.stdout.reconfigure(encoding='utf-8') | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| class Settings(BaseSettings): | |
| """Settings for Balance Sheet CSV extraction, loaded from environment variables or .env file.""" | |
| excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="BS_EXCEL_FILE_PATH") | |
| output_folder: str = Field(default="data/csv_notes_bs", env="BS_OUTPUT_FOLDER") | |
| note_2_8_sheet: str = Field(default="Note 2 - 8", env="BS_NOTE_2_8_SHEET") | |
| note_9_sheet: str = Field(default="Note 9", env="BS_NOTE_9_SHEET") | |
| note_10_15_sheet: str = Field(default="Note 10-15", env="BS_NOTE_10_15_SHEET") | |
| skiprows: int = Field(default=3, env="BS_SKIPROWS") | |
| settings = Settings() | |
| class NoteCSVInfo(BaseModel): | |
| name: str | |
| rows: int | |
| def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame: | |
| """ | |
| Parse and clean a sheet from the Excel file. | |
| Drops empty rows and columns, resets index. | |
| """ | |
| df = xls.parse(sheet_name, skiprows=skiprows) | |
| df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) | |
| return df | |
| def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo: | |
| """ | |
| Export DataFrame to CSV and return info. | |
| """ | |
| output_path = os.path.join(output_folder, filename) | |
| df.to_csv(output_path, index=False) | |
| return NoteCSVInfo(name=filename, rows=df.shape[0]) | |
| def main() -> None: | |
| """ | |
| Main function to extract notes from Excel and export as CSVs. | |
| """ | |
| # Use command-line argument for Excel file path if provided | |
| excel_path = settings.excel_file_path | |
| if len(sys.argv) > 1: | |
| excel_path = sys.argv[1] | |
| logger.info(f"Excel file path from argument: {excel_path}") | |
| else: | |
| logger.info(f"Excel file path from settings: {excel_path}") | |
| global xls | |
| xls = pd.ExcelFile(excel_path) | |
| # Clean each sheet | |
| note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows) | |
| note_9_df = clean_note(settings.note_9_sheet, settings.skiprows) | |
| note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows) | |
| # Ensure output folder exists | |
| os.makedirs(settings.output_folder, exist_ok=True) | |
| # Export each as CSV in the folder | |
| info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder) | |
| info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder) | |
| info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder) | |
| # Log confirmation and row counts | |
| logger.info(f"Extracted rows: Note 2β8 = {info_2_8.rows} rows") | |
| logger.info(f"Extracted rows: Note 9 = {info_9.rows} rows") | |
| logger.info(f"Extracted rows: Note 10β15 = {info_10_15.rows} rows") | |
| if __name__ == "__main__": | |
| try: | |
| main() | |
| except Exception as e: | |
| logger.error(f"Balance sheet data extraction failed: {e}") | |
| sys.exit(1) | |