File size: 3,229 Bytes
c79824c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f39814a
 
c79824c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452e581
 
 
 
 
 
 
c79824c
452e581
c79824c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbdac03
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import pandas as pd
import sys
import logging
from typing import Optional
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings

# Ensure stdout encoding for Unicode
sys.stdout.reconfigure(encoding='utf-8')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Settings(BaseSettings):
    """Settings for Balance Sheet CSV extraction, loaded from environment variables or .env file."""
    excel_file_path: str = Field(default="data/input/In Lakhs  BS_FY 23-24 V5 - Final.xlsx", env="BS_EXCEL_FILE_PATH")
    output_folder: str = Field(default="data/csv_notes_bs", env="BS_OUTPUT_FOLDER")
    note_2_8_sheet: str = Field(default="Note 2 - 8", env="BS_NOTE_2_8_SHEET")
    note_9_sheet: str = Field(default="Note 9", env="BS_NOTE_9_SHEET")
    note_10_15_sheet: str = Field(default="Note 10-15", env="BS_NOTE_10_15_SHEET")
    skiprows: int = Field(default=3, env="BS_SKIPROWS")

settings = Settings()

class NoteCSVInfo(BaseModel):
    name: str
    rows: int

def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
    """
    Parse and clean a sheet from the Excel file.
    Drops empty rows and columns, resets index.
    """
    df = xls.parse(sheet_name, skiprows=skiprows)
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    return df

def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
    """
    Export DataFrame to CSV and return info.
    """
    output_path = os.path.join(output_folder, filename)
    df.to_csv(output_path, index=False)
    return NoteCSVInfo(name=filename, rows=df.shape[0])

def main() -> None:
    """
    Main function to extract notes from Excel and export as CSVs.
    """
    # Use command-line argument for Excel file path if provided
    excel_path = settings.excel_file_path
    if len(sys.argv) > 1:
        excel_path = sys.argv[1]
        logger.info(f"Excel file path from argument: {excel_path}")
    else:
        logger.info(f"Excel file path from settings: {excel_path}")
    global xls
    xls = pd.ExcelFile(excel_path)

    # Clean each sheet
    note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows)
    note_9_df = clean_note(settings.note_9_sheet, settings.skiprows)
    note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows)

    # Ensure output folder exists
    os.makedirs(settings.output_folder, exist_ok=True)

    # Export each as CSV in the folder
    info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder)
    info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder)
    info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder)

    # Log confirmation and row counts
    logger.info(f"Extracted rows: Note 2–8 = {info_2_8.rows} rows")
    logger.info(f"Extracted rows: Note 9   = {info_9.rows} rows")
    logger.info(f"Extracted rows: Note 10–15 = {info_10_15.rows} rows")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Balance sheet data extraction failed: {e}")
        sys.exit(1)