import os import pandas as pd import sys import logging from typing import Optional, List from pydantic import BaseModel, Field from pydantic_settings import BaseSettings, SettingsConfigDict # Ensure stdout encoding for Unicode sys.stdout.reconfigure(encoding='utf-8') # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class Settings(BaseSettings): """Settings for Cash Flow Statement CSV extraction""" model_config = SettingsConfigDict( env_prefix='CFS_', env_file='.env', extra='ignore' ) excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx") output_folder: str = Field(default="data/csv_notes_cfs") note_16_23_sheet: str = Field(default="Note 16-23") note_2_8_sheet: str = Field(default="Note 2 - 8") note_9_sheet: str = Field(default="Note 9") note_10_15_sheet: str = Field(default="Note 10-15") note_24_30_sheet: str = Field(default="Note 24-30") skiprows: int = Field(default=3) settings = Settings() class NoteCSVInfo(BaseModel): name: str rows: int def get_available_sheets(xls: pd.ExcelFile) -> List[str]: """Get list of available sheet names""" return xls.sheet_names def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]: """Find the main trial balance sheet""" sheet_names = get_available_sheets(xls) # Common trial balance sheet name patterns trial_balance_patterns = [ 'trial balance', 'tb', 'trialbalance', 'trial_balance', 'sheet1', 'sheet 1' ] for sheet in sheet_names: sheet_lower = sheet.lower() for pattern in trial_balance_patterns: if pattern in sheet_lower: logger.info(f"Found trial balance sheet: {sheet}") return sheet # If no match, return the first sheet logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}") return sheet_names[0] if sheet_names else None def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]: """ Extract data from a standard trial balance file (fallback method) """ tb_sheet = find_trial_balance_sheet(xls) if not tb_sheet: raise ValueError("No suitable sheet found in Excel file") logger.info(f"Using sheet '{tb_sheet}' as trial balance") # Try different skiprows values to find the header for skiprows in [0, 1, 2, 3, 4, 5]: try: df = xls.parse(tb_sheet, skiprows=skiprows) df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) # Check if we have reasonable data if len(df) > 5 and len(df.columns) >= 2: logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}") # Save the entire trial balance os.makedirs(output_folder, exist_ok=True) output_path = os.path.join(output_folder, "Trial_Balance_Full.csv") df.to_csv(output_path, index=False) logger.info(f"Saved trial balance to: {output_path}") return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])] except Exception as e: logger.debug(f"Failed with skiprows={skiprows}: {e}") continue raise ValueError("Could not parse trial balance sheet with any skiprows configuration") def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame: """ Parse and clean a sheet from the Excel file. """ if sheet_name not in xls.sheet_names: raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}") df = xls.parse(sheet_name, skiprows=skiprows) df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) return df def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo: """ Export DataFrame to CSV and return info. """ output_path = os.path.join(output_folder, filename) df.to_csv(output_path, index=False) logger.info(f"Exported: {output_path}") return NoteCSVInfo(name=filename, rows=df.shape[0]) def main() -> None: """ Main function to extract notes from Excel and export as CSVs. Handles both structured note files and standard trial balance files. """ # Use command-line argument for Excel file path if provided excel_path = settings.excel_file_path if len(sys.argv) > 1: excel_path = sys.argv[1] logger.info(f"Excel file path from argument: {excel_path}") else: logger.info(f"Excel file path from settings: {excel_path}") if not os.path.exists(excel_path): logger.error(f"Excel file not found: {excel_path}") sys.exit(1) try: xls = pd.ExcelFile(excel_path) except Exception as e: logger.error(f"Failed to open Excel file: {e}") sys.exit(1) available_sheets = get_available_sheets(xls) logger.info(f"Available sheets in file: {', '.join(available_sheets)}") # Ensure output folder exists os.makedirs(settings.output_folder, exist_ok=True) # Try to extract from structured notes first expected_sheets = [ settings.note_16_23_sheet, settings.note_2_8_sheet, settings.note_9_sheet, settings.note_10_15_sheet, settings.note_24_30_sheet ] # Check if this is a structured notes file has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets) if has_structured_notes: logger.info("="*60) logger.info("Detected structured notes file - extracting from individual note sheets") logger.info("="*60) try: info_16_23 = export_note_to_csv( clean_note(xls, settings.note_16_23_sheet, settings.skiprows), "Note_16_to_23_Full.csv", settings.output_folder ) info_2_8 = export_note_to_csv( clean_note(xls, settings.note_2_8_sheet, settings.skiprows), "Note_2_to_8_Full.csv", settings.output_folder ) info_9 = export_note_to_csv( clean_note(xls, settings.note_9_sheet, settings.skiprows), "Note_9_Full.csv", settings.output_folder ) info_10_15 = export_note_to_csv( clean_note(xls, settings.note_10_15_sheet, settings.skiprows), "Note_10_to_15_Full.csv", settings.output_folder ) info_24_30 = export_note_to_csv( clean_note(xls, settings.note_24_30_sheet, settings.skiprows), "Note_24_to_30_Full.csv", settings.output_folder ) logger.info("="*60) logger.info("EXTRACTION SUMMARY") logger.info("="*60) logger.info(f"Note 16-23: {info_16_23.rows} rows") logger.info(f"Note 2-8: {info_2_8.rows} rows") logger.info(f"Note 9: {info_9.rows} rows") logger.info(f"Note 10-15: {info_10_15.rows} rows") logger.info(f"Note 24-30: {info_24_30.rows} rows") logger.info("="*60) except ValueError as e: logger.error(f"Error extracting from structured notes: {e}") sys.exit(1) else: logger.info("="*60) logger.info("Standard trial balance file detected - using fallback extraction") logger.info("="*60) logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only") logger.info("") logger.info("For complete Cash Flow Statement, please provide a file with these sheets:") logger.info(" • Note 16-23 (Revenue details)") logger.info(" • Note 2-8 (Share capital, reserves, liabilities)") logger.info(" • Note 9 (Fixed assets)") logger.info(" • Note 10-15 (Current assets, loans)") logger.info(" • Note 24-30 (Additional disclosures)") logger.info("="*60) try: csv_infos = extract_from_trial_balance(xls, settings.output_folder) logger.info("="*60) logger.info("EXTRACTION SUMMARY") logger.info("="*60) for info in csv_infos: logger.info(f"{info.name}: {info.rows} rows") logger.info("="*60) logger.info("✓ Trial balance data extracted successfully") logger.info("Note: Cash Flow generation will proceed with available data") except Exception as e: logger.error(f"Error extracting from trial balance: {e}") logger.error("Please check if the file contains valid financial data") sys.exit(1) if __name__ == "__main__": main()