Spaces:
Sleeping
Sleeping
Update cf/cash_flow_data_extractor.py
Browse files- cf/cash_flow_data_extractor.py +189 -37
cf/cash_flow_data_extractor.py
CHANGED
|
@@ -2,9 +2,9 @@ import os
|
|
| 2 |
import pandas as pd
|
| 3 |
import sys
|
| 4 |
import logging
|
| 5 |
-
from typing import Optional
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
-
from pydantic_settings import BaseSettings
|
| 8 |
|
| 9 |
# Ensure stdout encoding for Unicode
|
| 10 |
sys.stdout.reconfigure(encoding='utf-8')
|
|
@@ -14,15 +14,22 @@ logging.basicConfig(level=logging.INFO)
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
class Settings(BaseSettings):
|
| 17 |
-
"""Settings for Cash Flow Statement CSV extraction
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
settings = Settings()
|
| 28 |
|
|
@@ -30,11 +37,76 @@ class NoteCSVInfo(BaseModel):
|
|
| 30 |
name: str
|
| 31 |
rows: int
|
| 32 |
|
| 33 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"""
|
| 35 |
Parse and clean a sheet from the Excel file.
|
| 36 |
-
Drops empty rows and columns, resets index.
|
| 37 |
"""
|
|
|
|
|
|
|
|
|
|
| 38 |
df = xls.parse(sheet_name, skiprows=skiprows)
|
| 39 |
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
|
| 40 |
return df
|
|
@@ -45,11 +117,13 @@ def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> N
|
|
| 45 |
"""
|
| 46 |
output_path = os.path.join(output_folder, filename)
|
| 47 |
df.to_csv(output_path, index=False)
|
|
|
|
| 48 |
return NoteCSVInfo(name=filename, rows=df.shape[0])
|
| 49 |
|
| 50 |
def main() -> None:
|
| 51 |
"""
|
| 52 |
Main function to extract notes from Excel and export as CSVs.
|
|
|
|
| 53 |
"""
|
| 54 |
# Use command-line argument for Excel file path if provided
|
| 55 |
excel_path = settings.excel_file_path
|
|
@@ -58,32 +132,110 @@ def main() -> None:
|
|
| 58 |
logger.info(f"Excel file path from argument: {excel_path}")
|
| 59 |
else:
|
| 60 |
logger.info(f"Excel file path from settings: {excel_path}")
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Ensure output folder exists
|
| 72 |
os.makedirs(settings.output_folder, exist_ok=True)
|
| 73 |
-
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
if __name__ == "__main__":
|
| 89 |
main()
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import sys
|
| 4 |
import logging
|
| 5 |
+
from typing import Optional, List
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 8 |
|
| 9 |
# Ensure stdout encoding for Unicode
|
| 10 |
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
class Settings(BaseSettings):
    """Configuration for the Cash Flow Statement CSV extraction.

    Each field may be overridden through a ``CFS_``-prefixed environment
    variable or an entry in the local ``.env`` file; unrecognised entries
    are ignored.
    """

    # Source workbook and destination folder for the exported CSV files.
    excel_file_path: str = Field("data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx")
    output_folder: str = Field("data/csv_notes_cfs")

    # Names of the note sheets looked up in a structured-notes workbook.
    note_16_23_sheet: str = Field("Note 16-23")
    note_2_8_sheet: str = Field("Note 2 - 8")
    note_9_sheet: str = Field("Note 9")
    note_10_15_sheet: str = Field("Note 10-15")
    note_24_30_sheet: str = Field("Note 24-30")

    # Number of leading rows skipped when a note sheet is parsed.
    skiprows: int = Field(3)

    model_config = SettingsConfigDict(
        extra='ignore',
        env_file='.env',
        env_prefix='CFS_',
    )
|
| 33 |
|
| 34 |
settings = Settings()
|
| 35 |
|
|
|
|
| 37 |
name: str
|
| 38 |
rows: int
|
| 39 |
|
| 40 |
+
def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
    """Return the names of the sheets contained in a workbook.

    Args:
        xls: Open workbook to inspect.

    Returns:
        The workbook's ``sheet_names`` value, passed through unchanged.
    """
    names = xls.sheet_names
    return names
|
| 43 |
+
|
| 44 |
+
def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
    """Pick the sheet most likely to hold the trial balance.

    Sheet names are compared case-insensitively against a set of known
    naming patterns. The first sheet, in workbook order, whose name contains
    any pattern is returned. When nothing matches, the first sheet of the
    workbook is used as a fallback.

    Args:
        xls: Open workbook whose sheet names are inspected.

    Returns:
        The chosen sheet name, or ``None`` when the workbook has no sheets.
    """
    sheet_names = get_available_sheets(xls)

    # Bail out before sheet_names[0] is evaluated: building the fallback log
    # message on an empty workbook would otherwise raise IndexError instead
    # of returning None as the signature promises.
    if not sheet_names:
        logger.warning("Workbook contains no sheets; cannot locate a trial balance")
        return None

    # Common trial balance sheet name patterns, matched as lowercase substrings.
    # NOTE(review): short patterns such as 'tb' also match unrelated names that
    # merely contain those letters - confirm this looseness is intended.
    trial_balance_patterns = (
        'trial balance',
        'tb',
        'trialbalance',
        'trial_balance',
        'sheet1',
        'sheet 1',
    )

    for sheet in sheet_names:
        sheet_lower = sheet.lower()
        if any(pattern in sheet_lower for pattern in trial_balance_patterns):
            logger.info(f"Found trial balance sheet: {sheet}")
            return sheet

    # If no match, fall back to the first sheet.
    first_sheet = sheet_names[0]
    logger.info(f"No trial balance pattern matched, using first sheet: {first_sheet}")
    return first_sheet
|
| 68 |
+
|
| 69 |
+
def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
    """Export a standard trial balance sheet as one CSV (fallback method).

    The sheet chosen by ``find_trial_balance_sheet`` is parsed with
    ``skiprows`` values 0 through 5 until a result with more than five rows
    and at least two columns is found. Fully empty rows and columns are
    dropped, and the result is written to ``Trial_Balance_Full.csv`` inside
    ``output_folder``.

    Args:
        xls: Open workbook to read from.
        output_folder: Directory receiving the CSV; created when missing.

    Returns:
        A single-element list describing the written CSV.

    Raises:
        ValueError: If no sheet is available, or no ``skiprows`` value
            produces a usable table.

    Errors raised while creating the folder or writing the CSV propagate
    unchanged rather than being reported as a parsing failure.
    """
    tb_sheet = find_trial_balance_sheet(xls)
    if not tb_sheet:
        raise ValueError("No suitable sheet found in Excel file")

    logger.info(f"Using sheet '{tb_sheet}' as trial balance")

    # Try different skiprows values to find the header. Only parsing and
    # cleaning sit inside the try block, so an I/O failure while writing is
    # never swallowed and retried as if it were a header-detection miss.
    last_error: Optional[Exception] = None
    for skiprows in range(6):
        try:
            df = xls.parse(tb_sheet, skiprows=skiprows)
            df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
        except Exception as e:
            logger.debug(f"Failed with skiprows={skiprows}: {e}")
            last_error = e
            continue

        # Check if we have reasonable data
        if len(df) > 5 and len(df.columns) >= 2:
            logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")
            break
    else:
        # No candidate passed the sanity check; keep the last parse error
        # (if any) attached as the cause for easier debugging.
        raise ValueError(
            "Could not parse trial balance sheet with any skiprows configuration"
        ) from last_error

    # Save the entire trial balance
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
    df.to_csv(output_path, index=False)
    logger.info(f"Saved trial balance to: {output_path}")

    return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
|
| 102 |
+
|
| 103 |
+
def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
    """Load one sheet of the workbook and strip its empty rows and columns.

    Args:
        xls: Open workbook to read from.
        sheet_name: Name of the sheet to load.
        skiprows: Number of leading rows passed to the parser to skip.

    Returns:
        The parsed sheet without fully empty rows or columns, re-indexed
        from zero.

    Raises:
        ValueError: If ``sheet_name`` is not among the workbook's sheets.
    """
    sheet_missing = sheet_name not in xls.sheet_names
    if sheet_missing:
        raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")

    raw = xls.parse(sheet_name, skiprows=skiprows)
    # Remove rows that are entirely empty first, then columns that are.
    without_empty_rows = raw.dropna(how='all')
    without_empty_cols = without_empty_rows.dropna(axis=1, how='all')
    return without_empty_cols.reset_index(drop=True)
|
|
|
|
| 117 |
"""
|
| 118 |
output_path = os.path.join(output_folder, filename)
|
| 119 |
df.to_csv(output_path, index=False)
|
| 120 |
+
logger.info(f"Exported: {output_path}")
|
| 121 |
return NoteCSVInfo(name=filename, rows=df.shape[0])
|
| 122 |
|
| 123 |
def main() -> None:
|
| 124 |
"""
|
| 125 |
Main function to extract notes from Excel and export as CSVs.
|
| 126 |
+
Handles both structured note files and standard trial balance files.
|
| 127 |
"""
|
| 128 |
# Use command-line argument for Excel file path if provided
|
| 129 |
excel_path = settings.excel_file_path
|
|
|
|
| 132 |
logger.info(f"Excel file path from argument: {excel_path}")
|
| 133 |
else:
|
| 134 |
logger.info(f"Excel file path from settings: {excel_path}")
|
| 135 |
+
|
| 136 |
+
if not os.path.exists(excel_path):
|
| 137 |
+
logger.error(f"Excel file not found: {excel_path}")
|
| 138 |
+
sys.exit(1)
|
| 139 |
+
|
| 140 |
+
try:
|
| 141 |
+
xls = pd.ExcelFile(excel_path)
|
| 142 |
+
except Exception as e:
|
| 143 |
+
logger.error(f"Failed to open Excel file: {e}")
|
| 144 |
+
sys.exit(1)
|
| 145 |
+
|
| 146 |
+
available_sheets = get_available_sheets(xls)
|
| 147 |
+
logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
|
| 148 |
+
|
| 149 |
# Ensure output folder exists
|
| 150 |
os.makedirs(settings.output_folder, exist_ok=True)
|
| 151 |
+
|
| 152 |
+
# Try to extract from structured notes first
|
| 153 |
+
expected_sheets = [
|
| 154 |
+
settings.note_16_23_sheet,
|
| 155 |
+
settings.note_2_8_sheet,
|
| 156 |
+
settings.note_9_sheet,
|
| 157 |
+
settings.note_10_15_sheet,
|
| 158 |
+
settings.note_24_30_sheet
|
| 159 |
+
]
|
| 160 |
+
|
| 161 |
+
# Check if this is a structured notes file
|
| 162 |
+
has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
|
| 163 |
+
|
| 164 |
+
if has_structured_notes:
|
| 165 |
+
logger.info("="*60)
|
| 166 |
+
logger.info("Detected structured notes file - extracting from individual note sheets")
|
| 167 |
+
logger.info("="*60)
|
| 168 |
+
|
| 169 |
+
try:
|
| 170 |
+
info_16_23 = export_note_to_csv(
|
| 171 |
+
clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
|
| 172 |
+
"Note_16_to_23_Full.csv",
|
| 173 |
+
settings.output_folder
|
| 174 |
+
)
|
| 175 |
+
info_2_8 = export_note_to_csv(
|
| 176 |
+
clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
|
| 177 |
+
"Note_2_to_8_Full.csv",
|
| 178 |
+
settings.output_folder
|
| 179 |
+
)
|
| 180 |
+
info_9 = export_note_to_csv(
|
| 181 |
+
clean_note(xls, settings.note_9_sheet, settings.skiprows),
|
| 182 |
+
"Note_9_Full.csv",
|
| 183 |
+
settings.output_folder
|
| 184 |
+
)
|
| 185 |
+
info_10_15 = export_note_to_csv(
|
| 186 |
+
clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
|
| 187 |
+
"Note_10_to_15_Full.csv",
|
| 188 |
+
settings.output_folder
|
| 189 |
+
)
|
| 190 |
+
info_24_30 = export_note_to_csv(
|
| 191 |
+
clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
|
| 192 |
+
"Note_24_to_30_Full.csv",
|
| 193 |
+
settings.output_folder
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
logger.info("="*60)
|
| 197 |
+
logger.info("EXTRACTION SUMMARY")
|
| 198 |
+
logger.info("="*60)
|
| 199 |
+
logger.info(f"Note 16-23: {info_16_23.rows} rows")
|
| 200 |
+
logger.info(f"Note 2-8: {info_2_8.rows} rows")
|
| 201 |
+
logger.info(f"Note 9: {info_9.rows} rows")
|
| 202 |
+
logger.info(f"Note 10-15: {info_10_15.rows} rows")
|
| 203 |
+
logger.info(f"Note 24-30: {info_24_30.rows} rows")
|
| 204 |
+
logger.info("="*60)
|
| 205 |
+
|
| 206 |
+
except ValueError as e:
|
| 207 |
+
logger.error(f"Error extracting from structured notes: {e}")
|
| 208 |
+
sys.exit(1)
|
| 209 |
+
else:
|
| 210 |
+
logger.info("="*60)
|
| 211 |
+
logger.info("Standard trial balance file detected - using fallback extraction")
|
| 212 |
+
logger.info("="*60)
|
| 213 |
+
logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only")
|
| 214 |
+
logger.info("")
|
| 215 |
+
logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
|
| 216 |
+
logger.info(" • Note 16-23 (Revenue details)")
|
| 217 |
+
logger.info(" • Note 2-8 (Share capital, reserves, liabilities)")
|
| 218 |
+
logger.info(" • Note 9 (Fixed assets)")
|
| 219 |
+
logger.info(" • Note 10-15 (Current assets, loans)")
|
| 220 |
+
logger.info(" • Note 24-30 (Additional disclosures)")
|
| 221 |
+
logger.info("="*60)
|
| 222 |
+
|
| 223 |
+
try:
|
| 224 |
+
csv_infos = extract_from_trial_balance(xls, settings.output_folder)
|
| 225 |
+
|
| 226 |
+
logger.info("="*60)
|
| 227 |
+
logger.info("EXTRACTION SUMMARY")
|
| 228 |
+
logger.info("="*60)
|
| 229 |
+
for info in csv_infos:
|
| 230 |
+
logger.info(f"{info.name}: {info.rows} rows")
|
| 231 |
+
logger.info("="*60)
|
| 232 |
+
logger.info("✓ Trial balance data extracted successfully")
|
| 233 |
+
logger.info("Note: Cash Flow generation will proceed with available data")
|
| 234 |
+
|
| 235 |
+
except Exception as e:
|
| 236 |
+
logger.error(f"Error extracting from trial balance: {e}")
|
| 237 |
+
logger.error("Please check if the file contains valid financial data")
|
| 238 |
+
sys.exit(1)
|
| 239 |
|
| 240 |
if __name__ == "__main__":
|
| 241 |
main()
|