File size: 3,401 Bytes
c79824c
 
 
 
 
 
 
 
 
 
 
 
 
 
f39814a
c79824c
 
 
 
 
5fd0efa
 
 
 
 
 
 
 
 
 
c79824c
 
 
 
5fd0efa
c79824c
 
 
 
 
 
 
 
 
 
 
 
5fd0efa
 
 
 
 
 
 
 
 
c79824c
5fd0efa
c79824c
 
 
 
 
 
5fd0efa
 
 
 
 
 
 
 
 
 
 
 
 
c79824c
 
5fd0efa
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
import logging
from typing import Optional
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Settings(BaseSettings):
    """Settings for P&L CSV extraction, loaded from environment variables or .env file."""
    excel_file_path: str = Field(default="In Lakhs  BS_FY 23-24 V5 - Final.xlsx", env="PNL_EXCEL_FILE_PATH")
    output_folder: str = Field(default="data/csv_notes_pnl", env="PNL_OUTPUT_FOLDER")
    note_16_23_sheet: str = Field(default="Note 16-23", env="PNL_NOTE_16_23_SHEET")
    skiprows: int = Field(default=3, env="PNL_SKIPROWS")

settings = Settings()

def get_xls(excel_file_path: str) -> pd.ExcelFile:
    try:
        xls = pd.ExcelFile(excel_file_path)
        logger.info(f"Loaded Excel file: {excel_file_path}")
        logger.info(f"Available sheets: {xls.sheet_names}")
        return xls
    except Exception as e:
        logger.error(f"Failed to load Excel file '{excel_file_path}': {e}")
        raise

class NoteCSVInfo(BaseModel):
    name: str
    rows: int

def clean_note(xls, sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
    """
    Parse and clean a sheet from the Excel file.
    Drops empty rows and columns, resets index.
    """
    df = xls.parse(sheet_name, skiprows=skiprows)
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    return df

def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
    """
    Export DataFrame to CSV and return info.
    """
    # Always use absolute path for output folder
    abs_output_folder = os.path.abspath(output_folder)
    try:
        os.makedirs(abs_output_folder, exist_ok=True)
        logger.info(f"Output folder ensured: {abs_output_folder}")
    except Exception as e:
        logger.error(f"Failed to create output folder '{abs_output_folder}': {e}")
        raise
    output_path = os.path.join(abs_output_folder, filename)
    df.to_csv(output_path, index=False)
    logger.info(f"CSV file written to: {output_path}")
    return NoteCSVInfo(name=filename, rows=df.shape[0])

def main() -> None:
    """
    Main function to extract P&L notes from Excel and export as CSV.
    """
    import sys
    logger.info(f"Current working directory: {os.getcwd()}")
    excel_file_path = settings.excel_file_path
    if len(sys.argv) > 1:
        excel_file_path = sys.argv[1]
        logger.info(f"Excel file path from argument: {excel_file_path}")
    xls = get_xls(excel_file_path)
    if settings.note_16_23_sheet not in xls.sheet_names:
        logger.error(f"Sheet '{settings.note_16_23_sheet}' not found in Excel file. Available sheets: {xls.sheet_names}")
        return
    note_16_23_df = clean_note(xls, settings.note_16_23_sheet, settings.skiprows)
    logger.info(f"Loaded DataFrame shape: {note_16_23_df.shape}")
    logger.info(f"First few rows:\n{note_16_23_df.head()}\n")
    info_16_23 = export_note_to_csv(note_16_23_df, "Note_16_to_23_Full.csv", settings.output_folder)
    logger.info(f"Extracted rows: Note 16-23 = {info_16_23.rows} rows")
    abs_output_folder = os.path.abspath(settings.output_folder)
    logger.info(f"CSV output path: {os.path.join(abs_output_folder, 'Note_16_to_23_Full.csv')}")

if __name__ == "__main__":
    main()