File size: 9,156 Bytes
bc7f19f
 
 
 
e1b550c
bc7f19f
e1b550c
bc7f19f
 
 
 
 
 
 
 
 
e1b550c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7f19f
 
 
 
 
 
 
e1b550c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7f19f
 
 
e1b550c
 
 
bc7f19f
 
 
 
 
 
 
 
 
 
e1b550c
bc7f19f
 
 
 
 
e1b550c
bc7f19f
 
 
 
 
 
 
 
e1b550c
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7f19f
 
e1b550c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7f19f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import os
import pandas as pd
import sys
import logging
from typing import Optional, List
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

# Ensure stdout encoding for Unicode
sys.stdout.reconfigure(encoding='utf-8')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Settings(BaseSettings):
    """Settings for Cash Flow Statement CSV extraction"""
    
    model_config = SettingsConfigDict(
        env_prefix='CFS_',
        env_file='.env',
        extra='ignore'
    )
    
    excel_file_path: str = Field(default="data/input/In Lakhs  BS_FY 23-24 V5 - Final.xlsx")
    output_folder: str = Field(default="data/csv_notes_cfs")
    note_16_23_sheet: str = Field(default="Note 16-23")
    note_2_8_sheet: str = Field(default="Note 2 - 8")
    note_9_sheet: str = Field(default="Note 9")
    note_10_15_sheet: str = Field(default="Note 10-15")
    note_24_30_sheet: str = Field(default="Note 24-30")
    skiprows: int = Field(default=3)

settings = Settings()

class NoteCSVInfo(BaseModel):
    name: str
    rows: int

def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
    """Get list of available sheet names"""
    return xls.sheet_names

def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
    """Find the main trial balance sheet"""
    sheet_names = get_available_sheets(xls)
    
    # Common trial balance sheet name patterns
    trial_balance_patterns = [
        'trial balance',
        'tb',
        'trialbalance',
        'trial_balance',
        'sheet1',
        'sheet 1'
    ]
    
    for sheet in sheet_names:
        sheet_lower = sheet.lower()
        for pattern in trial_balance_patterns:
            if pattern in sheet_lower:
                logger.info(f"Found trial balance sheet: {sheet}")
                return sheet
    
    # If no match, return the first sheet
    logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}")
    return sheet_names[0] if sheet_names else None

def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
    """
    Extract data from a standard trial balance file (fallback method)
    """
    tb_sheet = find_trial_balance_sheet(xls)
    
    if not tb_sheet:
        raise ValueError("No suitable sheet found in Excel file")
    
    logger.info(f"Using sheet '{tb_sheet}' as trial balance")
    
    # Try different skiprows values to find the header
    for skiprows in [0, 1, 2, 3, 4, 5]:
        try:
            df = xls.parse(tb_sheet, skiprows=skiprows)
            df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
            
            # Check if we have reasonable data
            if len(df) > 5 and len(df.columns) >= 2:
                logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")
                
                # Save the entire trial balance
                os.makedirs(output_folder, exist_ok=True)
                output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
                df.to_csv(output_path, index=False)
                logger.info(f"Saved trial balance to: {output_path}")
                
                return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
        except Exception as e:
            logger.debug(f"Failed with skiprows={skiprows}: {e}")
            continue
    
    raise ValueError("Could not parse trial balance sheet with any skiprows configuration")

def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
    """
    Parse and clean a sheet from the Excel file.
    """
    if sheet_name not in xls.sheet_names:
        raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")
    
    df = xls.parse(sheet_name, skiprows=skiprows)
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    return df

def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
    """
    Export DataFrame to CSV and return info.
    """
    output_path = os.path.join(output_folder, filename)
    df.to_csv(output_path, index=False)
    logger.info(f"Exported: {output_path}")
    return NoteCSVInfo(name=filename, rows=df.shape[0])

def main() -> None:
    """
    Main function to extract notes from Excel and export as CSVs.
    Handles both structured note files and standard trial balance files.
    """
    # Use command-line argument for Excel file path if provided
    excel_path = settings.excel_file_path
    if len(sys.argv) > 1:
        excel_path = sys.argv[1]
        logger.info(f"Excel file path from argument: {excel_path}")
    else:
        logger.info(f"Excel file path from settings: {excel_path}")
    
    if not os.path.exists(excel_path):
        logger.error(f"Excel file not found: {excel_path}")
        sys.exit(1)
    
    try:
        xls = pd.ExcelFile(excel_path)
    except Exception as e:
        logger.error(f"Failed to open Excel file: {e}")
        sys.exit(1)
    
    available_sheets = get_available_sheets(xls)
    logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
    
    # Ensure output folder exists
    os.makedirs(settings.output_folder, exist_ok=True)
    
    # Try to extract from structured notes first
    expected_sheets = [
        settings.note_16_23_sheet,
        settings.note_2_8_sheet,
        settings.note_9_sheet,
        settings.note_10_15_sheet,
        settings.note_24_30_sheet
    ]
    
    # Check if this is a structured notes file
    has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
    
    if has_structured_notes:
        logger.info("="*60)
        logger.info("Detected structured notes file - extracting from individual note sheets")
        logger.info("="*60)
        
        try:
            info_16_23 = export_note_to_csv(
                clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
                "Note_16_to_23_Full.csv",
                settings.output_folder
            )
            info_2_8 = export_note_to_csv(
                clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
                "Note_2_to_8_Full.csv",
                settings.output_folder
            )
            info_9 = export_note_to_csv(
                clean_note(xls, settings.note_9_sheet, settings.skiprows),
                "Note_9_Full.csv",
                settings.output_folder
            )
            info_10_15 = export_note_to_csv(
                clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
                "Note_10_to_15_Full.csv",
                settings.output_folder
            )
            info_24_30 = export_note_to_csv(
                clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
                "Note_24_to_30_Full.csv",
                settings.output_folder
            )
            
            logger.info("="*60)
            logger.info("EXTRACTION SUMMARY")
            logger.info("="*60)
            logger.info(f"Note 16-23: {info_16_23.rows} rows")
            logger.info(f"Note 2-8:   {info_2_8.rows} rows")
            logger.info(f"Note 9:     {info_9.rows} rows")
            logger.info(f"Note 10-15: {info_10_15.rows} rows")
            logger.info(f"Note 24-30: {info_24_30.rows} rows")
            logger.info("="*60)
            
        except ValueError as e:
            logger.error(f"Error extracting from structured notes: {e}")
            sys.exit(1)
    else:
        logger.info("="*60)
        logger.info("Standard trial balance file detected - using fallback extraction")
        logger.info("="*60)
        logger.warning("⚠️  Cash Flow Statement generation may be limited with trial balance data only")
        logger.info("")
        logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
        logger.info("  • Note 16-23 (Revenue details)")
        logger.info("  • Note 2-8 (Share capital, reserves, liabilities)")
        logger.info("  • Note 9 (Fixed assets)")
        logger.info("  • Note 10-15 (Current assets, loans)")
        logger.info("  • Note 24-30 (Additional disclosures)")
        logger.info("="*60)
        
        try:
            csv_infos = extract_from_trial_balance(xls, settings.output_folder)
            
            logger.info("="*60)
            logger.info("EXTRACTION SUMMARY")
            logger.info("="*60)
            for info in csv_infos:
                logger.info(f"{info.name}: {info.rows} rows")
            logger.info("="*60)
            logger.info("✓ Trial balance data extracted successfully")
            logger.info("Note: Cash Flow generation will proceed with available data")
            
        except Exception as e:
            logger.error(f"Error extracting from trial balance: {e}")
            logger.error("Please check if the file contains valid financial data")
            sys.exit(1)

if __name__ == "__main__":
    main()