dipan004 committed on
Commit
e1b550c
·
verified ·
1 Parent(s): 37aca1b

Update cf/cash_flow_data_extractor.py

Browse files
Files changed (1) hide show
  1. cf/cash_flow_data_extractor.py +189 -37
cf/cash_flow_data_extractor.py CHANGED
@@ -2,9 +2,9 @@ import os
2
  import pandas as pd
3
  import sys
4
  import logging
5
- from typing import Optional
6
  from pydantic import BaseModel, Field
7
- from pydantic_settings import BaseSettings
8
 
9
  # Ensure stdout encoding for Unicode
10
  sys.stdout.reconfigure(encoding='utf-8')
@@ -14,15 +14,22 @@ logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
  class Settings(BaseSettings):
17
- """Settings for Cash Flow Statement CSV extraction, loaded from environment variables or .env file."""
18
- excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="CFS_EXCEL_FILE_PATH")
19
- output_folder: str = Field(default="data/csv_notes_cfs", env="CFS_OUTPUT_FOLDER")
20
- note_16_23_sheet: str = Field(default="Note 16-23", env="CFS_NOTE_16_23_SHEET")
21
- note_2_8_sheet: str = Field(default="Note 2 - 8", env="CFS_NOTE_2_8_SHEET")
22
- note_9_sheet: str = Field(default="Note 9", env="CFS_NOTE_9_SHEET")
23
- note_10_15_sheet: str = Field(default="Note 10-15", env="CFS_NOTE_10_15_SHEET")
24
- note_24_30_sheet: str = Field(default="Note 24-30", env="CFS_NOTE_24_30_SHEET")
25
- skiprows: int = Field(default=3, env="CFS_SKIPROWS")
 
 
 
 
 
 
 
26
 
27
  settings = Settings()
28
 
@@ -30,11 +37,76 @@ class NoteCSVInfo(BaseModel):
30
  name: str
31
  rows: int
32
 
33
- def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
  Parse and clean a sheet from the Excel file.
36
- Drops empty rows and columns, resets index.
37
  """
 
 
 
38
  df = xls.parse(sheet_name, skiprows=skiprows)
39
  df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
40
  return df
@@ -45,11 +117,13 @@ def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> N
45
  """
46
  output_path = os.path.join(output_folder, filename)
47
  df.to_csv(output_path, index=False)
 
48
  return NoteCSVInfo(name=filename, rows=df.shape[0])
49
 
50
  def main() -> None:
51
  """
52
  Main function to extract notes from Excel and export as CSVs.
 
53
  """
54
  # Use command-line argument for Excel file path if provided
55
  excel_path = settings.excel_file_path
@@ -58,32 +132,110 @@ def main() -> None:
58
  logger.info(f"Excel file path from argument: {excel_path}")
59
  else:
60
  logger.info(f"Excel file path from settings: {excel_path}")
61
- global xls
62
- xls = pd.ExcelFile(excel_path)
63
-
64
- # Clean each sheet
65
- note_16_23_df = clean_note(settings.note_16_23_sheet, settings.skiprows)
66
- note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows)
67
- note_9_df = clean_note(settings.note_9_sheet, settings.skiprows)
68
- note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows)
69
- note_24_30_df = clean_note(settings.note_24_30_sheet, settings.skiprows)
70
-
 
 
 
 
71
  # Ensure output folder exists
72
  os.makedirs(settings.output_folder, exist_ok=True)
73
-
74
- # Export each as CSV in the folder
75
- info_16_23 = export_note_to_csv(note_16_23_df, "Note_16_to_23_Full.csv", settings.output_folder)
76
- info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder)
77
- info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder)
78
- info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder)
79
- info_24_30 = export_note_to_csv(note_24_30_df, "Note_24_to_30_Full.csv", settings.output_folder)
80
-
81
- # Log confirmation and row counts
82
- logger.info(f"Extracted rows: Note 16–23 = {info_16_23.rows} rows")
83
- logger.info(f"Extracted rows: Note 2–8 = {info_2_8.rows} rows")
84
- logger.info(f"Extracted rows: Note 9 = {info_9.rows} rows")
85
- logger.info(f"Extracted rows: Note 10–15 = {info_10_15.rows} rows")
86
- logger.info(f"Extracted rows: Note 24–30 = {info_24_30.rows} rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  if __name__ == "__main__":
89
  main()
 
2
  import pandas as pd
3
  import sys
4
  import logging
5
+ from typing import Optional, List
6
  from pydantic import BaseModel, Field
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
 
9
  # Ensure stdout encoding for Unicode
10
  sys.stdout.reconfigure(encoding='utf-8')
 
14
  logger = logging.getLogger(__name__)
15
 
class Settings(BaseSettings):
    """Settings for Cash Flow Statement CSV extraction.

    Every field has a built-in default. The ``model_config`` below asks
    pydantic-settings to look for overrides in environment variables that
    carry the ``CFS_`` prefix and in a ``.env`` file, and to ignore entries
    that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_prefix='CFS_',
        env_file='.env',
        extra='ignore'
    )

    # Workbook that is read when no path is supplied elsewhere.
    excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx")
    # Directory that receives the exported CSV files.
    output_folder: str = Field(default="data/csv_notes_cfs")
    # Worksheet names of the individual note sheets.
    note_16_23_sheet: str = Field(default="Note 16-23")
    note_2_8_sheet: str = Field(default="Note 2 - 8")
    note_9_sheet: str = Field(default="Note 9")
    note_10_15_sheet: str = Field(default="Note 10-15")
    note_24_30_sheet: str = Field(default="Note 24-30")
    # Number of leading rows to skip when parsing a note sheet.
    skiprows: int = Field(default=3)
33
 
34
  settings = Settings()
35
 
 
37
  name: str
38
  rows: int
39
 
def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
    """Return the worksheet names of an opened workbook.

    Args:
        xls: Opened workbook exposing a ``sheet_names`` attribute.

    Returns:
        A new list holding the names in the order ``xls.sheet_names``
        provides them.
    """
    # Return a copy so callers cannot alter the list owned by the workbook.
    return list(xls.sheet_names)
43
+
def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
    """Pick the worksheet to treat as the trial balance.

    A sheet matches when its lower-cased name contains any of the known
    name patterns; the first matching sheet in workbook order is chosen.
    When nothing matches, the first sheet of the workbook is used.

    Args:
        xls: Opened workbook exposing a ``sheet_names`` attribute.

    Returns:
        The chosen sheet name, or ``None`` when the workbook lists no sheets.
    """
    sheet_names = list(xls.sheet_names)

    # Guard first: the fallback below indexes element 0, which would raise
    # IndexError on an empty list instead of yielding the intended None.
    if not sheet_names:
        return None

    # Common trial balance sheet name patterns (substring match, lower-case).
    trial_balance_patterns = (
        'trial balance',
        'tb',
        'trialbalance',
        'trial_balance',
        'sheet1',
        'sheet 1',
    )

    for sheet in sheet_names:
        sheet_lower = sheet.lower()
        if any(pattern in sheet_lower for pattern in trial_balance_patterns):
            logger.info(f"Found trial balance sheet: {sheet}")
            return sheet

    # No pattern matched anywhere: fall back to the first sheet.
    first_sheet = sheet_names[0]
    logger.info(f"No trial balance pattern matched, using first sheet: {first_sheet}")
    return first_sheet
68
+
def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
    """Export a plain trial balance sheet as one CSV (fallback method).

    The sheet is chosen by ``find_trial_balance_sheet``. Because the header
    row position is unknown, the sheet is parsed with ``skiprows`` values
    0 through 5 and the first result with more than 5 rows and at least
    2 columns (after dropping fully-empty rows and columns) is written to
    ``Trial_Balance_Full.csv`` inside ``output_folder``.

    Args:
        xls: Opened workbook.
        output_folder: Directory for the CSV; created if it does not exist.

    Returns:
        A single-element list describing the written CSV.

    Raises:
        ValueError: If no sheet can be selected, or no ``skiprows`` value
            yields a table of the required size.
        OSError: If the output folder or CSV file cannot be written.
    """
    tb_sheet = find_trial_balance_sheet(xls)

    if not tb_sheet:
        raise ValueError("No suitable sheet found in Excel file")

    logger.info(f"Using sheet '{tb_sheet}' as trial balance")

    # Try different skiprows values to find the header.
    for skiprows in range(6):
        # Only parsing/cleaning is retried. Export errors must not be
        # swallowed here, otherwise a write failure would be reported as
        # "could not parse" after pointlessly retrying every offset.
        try:
            df = xls.parse(tb_sheet, skiprows=skiprows)
            df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
        except Exception as e:
            logger.debug(f"Failed with skiprows={skiprows}: {e}")
            continue

        # Check if we have reasonable data; otherwise try the next offset.
        if len(df) <= 5 or len(df.columns) < 2:
            continue

        logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")

        # Save the entire trial balance.
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
        df.to_csv(output_path, index=False)
        logger.info(f"Saved trial balance to: {output_path}")

        return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]

    raise ValueError("Could not parse trial balance sheet with any skiprows configuration")
102
+
def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
    """Parse and clean a sheet from the Excel file.

    Args:
        xls: Opened workbook.
        sheet_name: Exact name of the worksheet to read.
        skiprows: Value forwarded to ``xls.parse`` as ``skiprows``.

    Returns:
        The parsed sheet with fully-empty rows removed, then fully-empty
        columns removed, and the index reset to start at 0.

    Raises:
        ValueError: If ``sheet_name`` is not listed in ``xls.sheet_names``.
    """
    # Fail early with the list of real sheet names rather than letting the
    # parser raise a less helpful error.
    if sheet_name not in xls.sheet_names:
        raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")

    df = xls.parse(sheet_name, skiprows=skiprows)
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    return df
 
117
  """
118
  output_path = os.path.join(output_folder, filename)
119
  df.to_csv(output_path, index=False)
120
+ logger.info(f"Exported: {output_path}")
121
  return NoteCSVInfo(name=filename, rows=df.shape[0])
122
 
123
  def main() -> None:
124
  """
125
  Main function to extract notes from Excel and export as CSVs.
126
+ Handles both structured note files and standard trial balance files.
127
  """
128
  # Use command-line argument for Excel file path if provided
129
  excel_path = settings.excel_file_path
 
132
  logger.info(f"Excel file path from argument: {excel_path}")
133
  else:
134
  logger.info(f"Excel file path from settings: {excel_path}")
135
+
136
+ if not os.path.exists(excel_path):
137
+ logger.error(f"Excel file not found: {excel_path}")
138
+ sys.exit(1)
139
+
140
+ try:
141
+ xls = pd.ExcelFile(excel_path)
142
+ except Exception as e:
143
+ logger.error(f"Failed to open Excel file: {e}")
144
+ sys.exit(1)
145
+
146
+ available_sheets = get_available_sheets(xls)
147
+ logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
148
+
149
  # Ensure output folder exists
150
  os.makedirs(settings.output_folder, exist_ok=True)
151
+
152
+ # Try to extract from structured notes first
153
+ expected_sheets = [
154
+ settings.note_16_23_sheet,
155
+ settings.note_2_8_sheet,
156
+ settings.note_9_sheet,
157
+ settings.note_10_15_sheet,
158
+ settings.note_24_30_sheet
159
+ ]
160
+
161
+ # Check if this is a structured notes file
162
+ has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
163
+
164
+ if has_structured_notes:
165
+ logger.info("="*60)
166
+ logger.info("Detected structured notes file - extracting from individual note sheets")
167
+ logger.info("="*60)
168
+
169
+ try:
170
+ info_16_23 = export_note_to_csv(
171
+ clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
172
+ "Note_16_to_23_Full.csv",
173
+ settings.output_folder
174
+ )
175
+ info_2_8 = export_note_to_csv(
176
+ clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
177
+ "Note_2_to_8_Full.csv",
178
+ settings.output_folder
179
+ )
180
+ info_9 = export_note_to_csv(
181
+ clean_note(xls, settings.note_9_sheet, settings.skiprows),
182
+ "Note_9_Full.csv",
183
+ settings.output_folder
184
+ )
185
+ info_10_15 = export_note_to_csv(
186
+ clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
187
+ "Note_10_to_15_Full.csv",
188
+ settings.output_folder
189
+ )
190
+ info_24_30 = export_note_to_csv(
191
+ clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
192
+ "Note_24_to_30_Full.csv",
193
+ settings.output_folder
194
+ )
195
+
196
+ logger.info("="*60)
197
+ logger.info("EXTRACTION SUMMARY")
198
+ logger.info("="*60)
199
+ logger.info(f"Note 16-23: {info_16_23.rows} rows")
200
+ logger.info(f"Note 2-8: {info_2_8.rows} rows")
201
+ logger.info(f"Note 9: {info_9.rows} rows")
202
+ logger.info(f"Note 10-15: {info_10_15.rows} rows")
203
+ logger.info(f"Note 24-30: {info_24_30.rows} rows")
204
+ logger.info("="*60)
205
+
206
+ except ValueError as e:
207
+ logger.error(f"Error extracting from structured notes: {e}")
208
+ sys.exit(1)
209
+ else:
210
+ logger.info("="*60)
211
+ logger.info("Standard trial balance file detected - using fallback extraction")
212
+ logger.info("="*60)
213
+ logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only")
214
+ logger.info("")
215
+ logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
216
+ logger.info(" • Note 16-23 (Revenue details)")
217
+ logger.info(" • Note 2-8 (Share capital, reserves, liabilities)")
218
+ logger.info(" • Note 9 (Fixed assets)")
219
+ logger.info(" • Note 10-15 (Current assets, loans)")
220
+ logger.info(" • Note 24-30 (Additional disclosures)")
221
+ logger.info("="*60)
222
+
223
+ try:
224
+ csv_infos = extract_from_trial_balance(xls, settings.output_folder)
225
+
226
+ logger.info("="*60)
227
+ logger.info("EXTRACTION SUMMARY")
228
+ logger.info("="*60)
229
+ for info in csv_infos:
230
+ logger.info(f"{info.name}: {info.rows} rows")
231
+ logger.info("="*60)
232
+ logger.info("✓ Trial balance data extracted successfully")
233
+ logger.info("Note: Cash Flow generation will proceed with available data")
234
+
235
+ except Exception as e:
236
+ logger.error(f"Error extracting from trial balance: {e}")
237
+ logger.error("Please check if the file contains valid financial data")
238
+ sys.exit(1)
239
 
240
  if __name__ == "__main__":
241
  main()