finryver-dev / cf /cash_flow_data_extractor.py
dipan004's picture
Update cf/cash_flow_data_extractor.py
e1b550c verified
import os
import pandas as pd
import sys
import logging
from typing import Optional, List
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# Ensure stdout encoding for Unicode
sys.stdout.reconfigure(encoding='utf-8')
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Settings(BaseSettings):
"""Settings for Cash Flow Statement CSV extraction"""
model_config = SettingsConfigDict(
env_prefix='CFS_',
env_file='.env',
extra='ignore'
)
excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx")
output_folder: str = Field(default="data/csv_notes_cfs")
note_16_23_sheet: str = Field(default="Note 16-23")
note_2_8_sheet: str = Field(default="Note 2 - 8")
note_9_sheet: str = Field(default="Note 9")
note_10_15_sheet: str = Field(default="Note 10-15")
note_24_30_sheet: str = Field(default="Note 24-30")
skiprows: int = Field(default=3)
settings = Settings()
class NoteCSVInfo(BaseModel):
name: str
rows: int
def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
"""Get list of available sheet names"""
return xls.sheet_names
def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
"""Find the main trial balance sheet"""
sheet_names = get_available_sheets(xls)
# Common trial balance sheet name patterns
trial_balance_patterns = [
'trial balance',
'tb',
'trialbalance',
'trial_balance',
'sheet1',
'sheet 1'
]
for sheet in sheet_names:
sheet_lower = sheet.lower()
for pattern in trial_balance_patterns:
if pattern in sheet_lower:
logger.info(f"Found trial balance sheet: {sheet}")
return sheet
# If no match, return the first sheet
logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}")
return sheet_names[0] if sheet_names else None
def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
"""
Extract data from a standard trial balance file (fallback method)
"""
tb_sheet = find_trial_balance_sheet(xls)
if not tb_sheet:
raise ValueError("No suitable sheet found in Excel file")
logger.info(f"Using sheet '{tb_sheet}' as trial balance")
# Try different skiprows values to find the header
for skiprows in [0, 1, 2, 3, 4, 5]:
try:
df = xls.parse(tb_sheet, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
# Check if we have reasonable data
if len(df) > 5 and len(df.columns) >= 2:
logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")
# Save the entire trial balance
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
df.to_csv(output_path, index=False)
logger.info(f"Saved trial balance to: {output_path}")
return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
except Exception as e:
logger.debug(f"Failed with skiprows={skiprows}: {e}")
continue
raise ValueError("Could not parse trial balance sheet with any skiprows configuration")
def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
"""
Parse and clean a sheet from the Excel file.
"""
if sheet_name not in xls.sheet_names:
raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")
df = xls.parse(sheet_name, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
return df
def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
"""
Export DataFrame to CSV and return info.
"""
output_path = os.path.join(output_folder, filename)
df.to_csv(output_path, index=False)
logger.info(f"Exported: {output_path}")
return NoteCSVInfo(name=filename, rows=df.shape[0])
def main() -> None:
"""
Main function to extract notes from Excel and export as CSVs.
Handles both structured note files and standard trial balance files.
"""
# Use command-line argument for Excel file path if provided
excel_path = settings.excel_file_path
if len(sys.argv) > 1:
excel_path = sys.argv[1]
logger.info(f"Excel file path from argument: {excel_path}")
else:
logger.info(f"Excel file path from settings: {excel_path}")
if not os.path.exists(excel_path):
logger.error(f"Excel file not found: {excel_path}")
sys.exit(1)
try:
xls = pd.ExcelFile(excel_path)
except Exception as e:
logger.error(f"Failed to open Excel file: {e}")
sys.exit(1)
available_sheets = get_available_sheets(xls)
logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
# Ensure output folder exists
os.makedirs(settings.output_folder, exist_ok=True)
# Try to extract from structured notes first
expected_sheets = [
settings.note_16_23_sheet,
settings.note_2_8_sheet,
settings.note_9_sheet,
settings.note_10_15_sheet,
settings.note_24_30_sheet
]
# Check if this is a structured notes file
has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
if has_structured_notes:
logger.info("="*60)
logger.info("Detected structured notes file - extracting from individual note sheets")
logger.info("="*60)
try:
info_16_23 = export_note_to_csv(
clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
"Note_16_to_23_Full.csv",
settings.output_folder
)
info_2_8 = export_note_to_csv(
clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
"Note_2_to_8_Full.csv",
settings.output_folder
)
info_9 = export_note_to_csv(
clean_note(xls, settings.note_9_sheet, settings.skiprows),
"Note_9_Full.csv",
settings.output_folder
)
info_10_15 = export_note_to_csv(
clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
"Note_10_to_15_Full.csv",
settings.output_folder
)
info_24_30 = export_note_to_csv(
clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
"Note_24_to_30_Full.csv",
settings.output_folder
)
logger.info("="*60)
logger.info("EXTRACTION SUMMARY")
logger.info("="*60)
logger.info(f"Note 16-23: {info_16_23.rows} rows")
logger.info(f"Note 2-8: {info_2_8.rows} rows")
logger.info(f"Note 9: {info_9.rows} rows")
logger.info(f"Note 10-15: {info_10_15.rows} rows")
logger.info(f"Note 24-30: {info_24_30.rows} rows")
logger.info("="*60)
except ValueError as e:
logger.error(f"Error extracting from structured notes: {e}")
sys.exit(1)
else:
logger.info("="*60)
logger.info("Standard trial balance file detected - using fallback extraction")
logger.info("="*60)
logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only")
logger.info("")
logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
logger.info(" • Note 16-23 (Revenue details)")
logger.info(" • Note 2-8 (Share capital, reserves, liabilities)")
logger.info(" • Note 9 (Fixed assets)")
logger.info(" • Note 10-15 (Current assets, loans)")
logger.info(" • Note 24-30 (Additional disclosures)")
logger.info("="*60)
try:
csv_infos = extract_from_trial_balance(xls, settings.output_folder)
logger.info("="*60)
logger.info("EXTRACTION SUMMARY")
logger.info("="*60)
for info in csv_infos:
logger.info(f"{info.name}: {info.rows} rows")
logger.info("="*60)
logger.info("✓ Trial balance data extracted successfully")
logger.info("Note: Cash Flow generation will proceed with available data")
except Exception as e:
logger.error(f"Error extracting from trial balance: {e}")
logger.error("Please check if the file contains valid financial data")
sys.exit(1)
if __name__ == "__main__":
main()