Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import sys | |
| import logging | |
| from typing import Optional, List | |
| from pydantic import BaseModel, Field | |
| from pydantic_settings import BaseSettings, SettingsConfigDict | |
| # Ensure stdout encoding for Unicode | |
| sys.stdout.reconfigure(encoding='utf-8') | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| class Settings(BaseSettings): | |
| """Settings for Cash Flow Statement CSV extraction""" | |
| model_config = SettingsConfigDict( | |
| env_prefix='CFS_', | |
| env_file='.env', | |
| extra='ignore' | |
| ) | |
| excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx") | |
| output_folder: str = Field(default="data/csv_notes_cfs") | |
| note_16_23_sheet: str = Field(default="Note 16-23") | |
| note_2_8_sheet: str = Field(default="Note 2 - 8") | |
| note_9_sheet: str = Field(default="Note 9") | |
| note_10_15_sheet: str = Field(default="Note 10-15") | |
| note_24_30_sheet: str = Field(default="Note 24-30") | |
| skiprows: int = Field(default=3) | |
| settings = Settings() | |
| class NoteCSVInfo(BaseModel): | |
| name: str | |
| rows: int | |
| def get_available_sheets(xls: pd.ExcelFile) -> List[str]: | |
| """Get list of available sheet names""" | |
| return xls.sheet_names | |
| def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]: | |
| """Find the main trial balance sheet""" | |
| sheet_names = get_available_sheets(xls) | |
| # Common trial balance sheet name patterns | |
| trial_balance_patterns = [ | |
| 'trial balance', | |
| 'tb', | |
| 'trialbalance', | |
| 'trial_balance', | |
| 'sheet1', | |
| 'sheet 1' | |
| ] | |
| for sheet in sheet_names: | |
| sheet_lower = sheet.lower() | |
| for pattern in trial_balance_patterns: | |
| if pattern in sheet_lower: | |
| logger.info(f"Found trial balance sheet: {sheet}") | |
| return sheet | |
| # If no match, return the first sheet | |
| logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}") | |
| return sheet_names[0] if sheet_names else None | |
| def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]: | |
| """ | |
| Extract data from a standard trial balance file (fallback method) | |
| """ | |
| tb_sheet = find_trial_balance_sheet(xls) | |
| if not tb_sheet: | |
| raise ValueError("No suitable sheet found in Excel file") | |
| logger.info(f"Using sheet '{tb_sheet}' as trial balance") | |
| # Try different skiprows values to find the header | |
| for skiprows in [0, 1, 2, 3, 4, 5]: | |
| try: | |
| df = xls.parse(tb_sheet, skiprows=skiprows) | |
| df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) | |
| # Check if we have reasonable data | |
| if len(df) > 5 and len(df.columns) >= 2: | |
| logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}") | |
| # Save the entire trial balance | |
| os.makedirs(output_folder, exist_ok=True) | |
| output_path = os.path.join(output_folder, "Trial_Balance_Full.csv") | |
| df.to_csv(output_path, index=False) | |
| logger.info(f"Saved trial balance to: {output_path}") | |
| return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])] | |
| except Exception as e: | |
| logger.debug(f"Failed with skiprows={skiprows}: {e}") | |
| continue | |
| raise ValueError("Could not parse trial balance sheet with any skiprows configuration") | |
| def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame: | |
| """ | |
| Parse and clean a sheet from the Excel file. | |
| """ | |
| if sheet_name not in xls.sheet_names: | |
| raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}") | |
| df = xls.parse(sheet_name, skiprows=skiprows) | |
| df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True) | |
| return df | |
| def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo: | |
| """ | |
| Export DataFrame to CSV and return info. | |
| """ | |
| output_path = os.path.join(output_folder, filename) | |
| df.to_csv(output_path, index=False) | |
| logger.info(f"Exported: {output_path}") | |
| return NoteCSVInfo(name=filename, rows=df.shape[0]) | |
| def main() -> None: | |
| """ | |
| Main function to extract notes from Excel and export as CSVs. | |
| Handles both structured note files and standard trial balance files. | |
| """ | |
| # Use command-line argument for Excel file path if provided | |
| excel_path = settings.excel_file_path | |
| if len(sys.argv) > 1: | |
| excel_path = sys.argv[1] | |
| logger.info(f"Excel file path from argument: {excel_path}") | |
| else: | |
| logger.info(f"Excel file path from settings: {excel_path}") | |
| if not os.path.exists(excel_path): | |
| logger.error(f"Excel file not found: {excel_path}") | |
| sys.exit(1) | |
| try: | |
| xls = pd.ExcelFile(excel_path) | |
| except Exception as e: | |
| logger.error(f"Failed to open Excel file: {e}") | |
| sys.exit(1) | |
| available_sheets = get_available_sheets(xls) | |
| logger.info(f"Available sheets in file: {', '.join(available_sheets)}") | |
| # Ensure output folder exists | |
| os.makedirs(settings.output_folder, exist_ok=True) | |
| # Try to extract from structured notes first | |
| expected_sheets = [ | |
| settings.note_16_23_sheet, | |
| settings.note_2_8_sheet, | |
| settings.note_9_sheet, | |
| settings.note_10_15_sheet, | |
| settings.note_24_30_sheet | |
| ] | |
| # Check if this is a structured notes file | |
| has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets) | |
| if has_structured_notes: | |
| logger.info("="*60) | |
| logger.info("Detected structured notes file - extracting from individual note sheets") | |
| logger.info("="*60) | |
| try: | |
| info_16_23 = export_note_to_csv( | |
| clean_note(xls, settings.note_16_23_sheet, settings.skiprows), | |
| "Note_16_to_23_Full.csv", | |
| settings.output_folder | |
| ) | |
| info_2_8 = export_note_to_csv( | |
| clean_note(xls, settings.note_2_8_sheet, settings.skiprows), | |
| "Note_2_to_8_Full.csv", | |
| settings.output_folder | |
| ) | |
| info_9 = export_note_to_csv( | |
| clean_note(xls, settings.note_9_sheet, settings.skiprows), | |
| "Note_9_Full.csv", | |
| settings.output_folder | |
| ) | |
| info_10_15 = export_note_to_csv( | |
| clean_note(xls, settings.note_10_15_sheet, settings.skiprows), | |
| "Note_10_to_15_Full.csv", | |
| settings.output_folder | |
| ) | |
| info_24_30 = export_note_to_csv( | |
| clean_note(xls, settings.note_24_30_sheet, settings.skiprows), | |
| "Note_24_to_30_Full.csv", | |
| settings.output_folder | |
| ) | |
| logger.info("="*60) | |
| logger.info("EXTRACTION SUMMARY") | |
| logger.info("="*60) | |
| logger.info(f"Note 16-23: {info_16_23.rows} rows") | |
| logger.info(f"Note 2-8: {info_2_8.rows} rows") | |
| logger.info(f"Note 9: {info_9.rows} rows") | |
| logger.info(f"Note 10-15: {info_10_15.rows} rows") | |
| logger.info(f"Note 24-30: {info_24_30.rows} rows") | |
| logger.info("="*60) | |
| except ValueError as e: | |
| logger.error(f"Error extracting from structured notes: {e}") | |
| sys.exit(1) | |
| else: | |
| logger.info("="*60) | |
| logger.info("Standard trial balance file detected - using fallback extraction") | |
| logger.info("="*60) | |
| logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only") | |
| logger.info("") | |
| logger.info("For complete Cash Flow Statement, please provide a file with these sheets:") | |
| logger.info(" • Note 16-23 (Revenue details)") | |
| logger.info(" • Note 2-8 (Share capital, reserves, liabilities)") | |
| logger.info(" • Note 9 (Fixed assets)") | |
| logger.info(" • Note 10-15 (Current assets, loans)") | |
| logger.info(" • Note 24-30 (Additional disclosures)") | |
| logger.info("="*60) | |
| try: | |
| csv_infos = extract_from_trial_balance(xls, settings.output_folder) | |
| logger.info("="*60) | |
| logger.info("EXTRACTION SUMMARY") | |
| logger.info("="*60) | |
| for info in csv_infos: | |
| logger.info(f"{info.name}: {info.rows} rows") | |
| logger.info("="*60) | |
| logger.info("✓ Trial balance data extracted successfully") | |
| logger.info("Note: Cash Flow generation will proceed with available data") | |
| except Exception as e: | |
| logger.error(f"Error extracting from trial balance: {e}") | |
| logger.error("Please check if the file contains valid financial data") | |
| sys.exit(1) | |
| if __name__ == "__main__": | |
| main() |