Spaces:
Sleeping
Sleeping
File size: 9,156 Bytes
bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f e1b550c bc7f19f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
import os
import pandas as pd
import sys
import logging
from typing import Optional, List
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# Ensure stdout encoding for Unicode
sys.stdout.reconfigure(encoding='utf-8')
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Settings(BaseSettings):
"""Settings for Cash Flow Statement CSV extraction"""
model_config = SettingsConfigDict(
env_prefix='CFS_',
env_file='.env',
extra='ignore'
)
excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx")
output_folder: str = Field(default="data/csv_notes_cfs")
note_16_23_sheet: str = Field(default="Note 16-23")
note_2_8_sheet: str = Field(default="Note 2 - 8")
note_9_sheet: str = Field(default="Note 9")
note_10_15_sheet: str = Field(default="Note 10-15")
note_24_30_sheet: str = Field(default="Note 24-30")
skiprows: int = Field(default=3)
settings = Settings()
class NoteCSVInfo(BaseModel):
name: str
rows: int
def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
"""Get list of available sheet names"""
return xls.sheet_names
def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
"""Find the main trial balance sheet"""
sheet_names = get_available_sheets(xls)
# Common trial balance sheet name patterns
trial_balance_patterns = [
'trial balance',
'tb',
'trialbalance',
'trial_balance',
'sheet1',
'sheet 1'
]
for sheet in sheet_names:
sheet_lower = sheet.lower()
for pattern in trial_balance_patterns:
if pattern in sheet_lower:
logger.info(f"Found trial balance sheet: {sheet}")
return sheet
# If no match, return the first sheet
logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}")
return sheet_names[0] if sheet_names else None
def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
"""
Extract data from a standard trial balance file (fallback method)
"""
tb_sheet = find_trial_balance_sheet(xls)
if not tb_sheet:
raise ValueError("No suitable sheet found in Excel file")
logger.info(f"Using sheet '{tb_sheet}' as trial balance")
# Try different skiprows values to find the header
for skiprows in [0, 1, 2, 3, 4, 5]:
try:
df = xls.parse(tb_sheet, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
# Check if we have reasonable data
if len(df) > 5 and len(df.columns) >= 2:
logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")
# Save the entire trial balance
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
df.to_csv(output_path, index=False)
logger.info(f"Saved trial balance to: {output_path}")
return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
except Exception as e:
logger.debug(f"Failed with skiprows={skiprows}: {e}")
continue
raise ValueError("Could not parse trial balance sheet with any skiprows configuration")
def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
"""
Parse and clean a sheet from the Excel file.
"""
if sheet_name not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")
df = xls.parse(sheet_name, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
return df
def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
"""
Export DataFrame to CSV and return info.
"""
output_path = os.path.join(output_folder, filename)
df.to_csv(output_path, index=False)
logger.info(f"Exported: {output_path}")
return NoteCSVInfo(name=filename, rows=df.shape[0])
def main() -> None:
"""
Main function to extract notes from Excel and export as CSVs.
Handles both structured note files and standard trial balance files.
"""
# Use command-line argument for Excel file path if provided
excel_path = settings.excel_file_path
if len(sys.argv) > 1:
excel_path = sys.argv[1]
logger.info(f"Excel file path from argument: {excel_path}")
else:
logger.info(f"Excel file path from settings: {excel_path}")
if not os.path.exists(excel_path):
logger.error(f"Excel file not found: {excel_path}")
sys.exit(1)
try:
xls = pd.ExcelFile(excel_path)
except Exception as e:
logger.error(f"Failed to open Excel file: {e}")
sys.exit(1)
available_sheets = get_available_sheets(xls)
logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
# Ensure output folder exists
os.makedirs(settings.output_folder, exist_ok=True)
# Try to extract from structured notes first
expected_sheets = [
settings.note_16_23_sheet,
settings.note_2_8_sheet,
settings.note_9_sheet,
settings.note_10_15_sheet,
settings.note_24_30_sheet
]
# Check if this is a structured notes file
has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
if has_structured_notes:
logger.info("="*60)
logger.info("Detected structured notes file - extracting from individual note sheets")
logger.info("="*60)
try:
info_16_23 = export_note_to_csv(
clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
"Note_16_to_23_Full.csv",
settings.output_folder
)
info_2_8 = export_note_to_csv(
clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
"Note_2_to_8_Full.csv",
settings.output_folder
)
info_9 = export_note_to_csv(
clean_note(xls, settings.note_9_sheet, settings.skiprows),
"Note_9_Full.csv",
settings.output_folder
)
info_10_15 = export_note_to_csv(
clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
"Note_10_to_15_Full.csv",
settings.output_folder
)
info_24_30 = export_note_to_csv(
clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
"Note_24_to_30_Full.csv",
settings.output_folder
)
logger.info("="*60)
logger.info("EXTRACTION SUMMARY")
logger.info("="*60)
logger.info(f"Note 16-23: {info_16_23.rows} rows")
logger.info(f"Note 2-8: {info_2_8.rows} rows")
logger.info(f"Note 9: {info_9.rows} rows")
logger.info(f"Note 10-15: {info_10_15.rows} rows")
logger.info(f"Note 24-30: {info_24_30.rows} rows")
logger.info("="*60)
except ValueError as e:
logger.error(f"Error extracting from structured notes: {e}")
sys.exit(1)
else:
logger.info("="*60)
logger.info("Standard trial balance file detected - using fallback extraction")
logger.info("="*60)
logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only")
logger.info("")
logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
logger.info(" • Note 16-23 (Revenue details)")
logger.info(" • Note 2-8 (Share capital, reserves, liabilities)")
logger.info(" • Note 9 (Fixed assets)")
logger.info(" • Note 10-15 (Current assets, loans)")
logger.info(" • Note 24-30 (Additional disclosures)")
logger.info("="*60)
try:
csv_infos = extract_from_trial_balance(xls, settings.output_folder)
logger.info("="*60)
logger.info("EXTRACTION SUMMARY")
logger.info("="*60)
for info in csv_infos:
logger.info(f"{info.name}: {info.rows} rows")
logger.info("="*60)
logger.info("✓ Trial balance data extracted successfully")
logger.info("Note: Cash Flow generation will proceed with available data")
except Exception as e:
logger.error(f"Error extracting from trial balance: {e}")
logger.error("Please check if the file contains valid financial data")
sys.exit(1)
if __name__ == "__main__":
main() |