Spaces:

point9
/

finryver-dev

Runtime error

App Files Files Community

dipan004 commited on Dec 17, 2025

Commit

e1b550c

verified ·

1 Parent(s): 37aca1b

Update cf/cash_flow_data_extractor.py

Browse files

Files changed (1) hide show

cf/cash_flow_data_extractor.py +189 -37

cf/cash_flow_data_extractor.py CHANGED Viewed

@@ -2,9 +2,9 @@ import os
 import pandas as pd
 import sys
 import logging
-from typing import Optional
 from pydantic import BaseModel, Field
-from pydantic_settings import BaseSettings
 # Ensure stdout encoding for Unicode
 sys.stdout.reconfigure(encoding='utf-8')
@@ -14,15 +14,22 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class Settings(BaseSettings):
-    """Settings for Cash Flow Statement CSV extraction, loaded from environment variables or .env file."""
-    excel_file_path: str = Field(default="data/input/In Lakhs  BS_FY 23-24 V5 - Final.xlsx", env="CFS_EXCEL_FILE_PATH")
-    output_folder: str = Field(default="data/csv_notes_cfs", env="CFS_OUTPUT_FOLDER")
-    note_16_23_sheet: str = Field(default="Note 16-23", env="CFS_NOTE_16_23_SHEET")
-    note_2_8_sheet: str = Field(default="Note 2 - 8", env="CFS_NOTE_2_8_SHEET")
-    note_9_sheet: str = Field(default="Note 9", env="CFS_NOTE_9_SHEET")
-    note_10_15_sheet: str = Field(default="Note 10-15", env="CFS_NOTE_10_15_SHEET")
-    note_24_30_sheet: str = Field(default="Note 24-30", env="CFS_NOTE_24_30_SHEET")
-    skiprows: int = Field(default=3, env="CFS_SKIPROWS")
 settings = Settings()
@@ -30,11 +37,76 @@ class NoteCSVInfo(BaseModel):
     name: str
     rows: int
-def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
     """
     Parse and clean a sheet from the Excel file.
-    Drops empty rows and columns, resets index.
     """
     df = xls.parse(sheet_name, skiprows=skiprows)
     df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
     return df
@@ -45,11 +117,13 @@ def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> N
     """
     output_path = os.path.join(output_folder, filename)
     df.to_csv(output_path, index=False)
     return NoteCSVInfo(name=filename, rows=df.shape[0])
 def main() -> None:
     """
     Main function to extract notes from Excel and export as CSVs.
     """
     # Use command-line argument for Excel file path if provided
     excel_path = settings.excel_file_path
@@ -58,32 +132,110 @@ def main() -> None:
         logger.info(f"Excel file path from argument: {excel_path}")
     else:
         logger.info(f"Excel file path from settings: {excel_path}")
-    global xls
-    xls = pd.ExcelFile(excel_path)
-    # Clean each sheet
-    note_16_23_df = clean_note(settings.note_16_23_sheet, settings.skiprows)
-    note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows)
-    note_9_df = clean_note(settings.note_9_sheet, settings.skiprows)
-    note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows)
-    note_24_30_df = clean_note(settings.note_24_30_sheet, settings.skiprows)
     # Ensure output folder exists
     os.makedirs(settings.output_folder, exist_ok=True)
-    # Export each as CSV in the folder
-    info_16_23 = export_note_to_csv(note_16_23_df, "Note_16_to_23_Full.csv", settings.output_folder)
-    info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder)
-    info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder)
-    info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder)
-    info_24_30 = export_note_to_csv(note_24_30_df, "Note_24_to_30_Full.csv", settings.output_folder)
-    # Log confirmation and row counts
-    logger.info(f"Extracted rows: Note 16–23 = {info_16_23.rows} rows")
-    logger.info(f"Extracted rows: Note 2–8   = {info_2_8.rows} rows")
-    logger.info(f"Extracted rows: Note 9     = {info_9.rows} rows")
-    logger.info(f"Extracted rows: Note 10–15 = {info_10_15.rows} rows")
-    logger.info(f"Extracted rows: Note 24–30 = {info_24_30.rows} rows")
 if __name__ == "__main__":
     main()

 import pandas as pd
 import sys
 import logging
+from typing import Optional, List
 from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
 # Ensure stdout encoding for Unicode
 sys.stdout.reconfigure(encoding='utf-8')
 logger = logging.getLogger(__name__)
 class Settings(BaseSettings):
+    """Settings for Cash Flow Statement CSV extraction"""
+    model_config = SettingsConfigDict(
+        env_prefix='CFS_',
+        env_file='.env',
+        extra='ignore'
+    )
+    excel_file_path: str = Field(default="data/input/In Lakhs  BS_FY 23-24 V5 - Final.xlsx")
+    output_folder: str = Field(default="data/csv_notes_cfs")
+    note_16_23_sheet: str = Field(default="Note 16-23")
+    note_2_8_sheet: str = Field(default="Note 2 - 8")
+    note_9_sheet: str = Field(default="Note 9")
+    note_10_15_sheet: str = Field(default="Note 10-15")
+    note_24_30_sheet: str = Field(default="Note 24-30")
+    skiprows: int = Field(default=3)
 settings = Settings()
     name: str
     rows: int
+def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
+    """Get list of available sheet names"""
+    return xls.sheet_names
+def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
+    """Find the main trial balance sheet"""
+    sheet_names = get_available_sheets(xls)
+    # Common trial balance sheet name patterns
+    trial_balance_patterns = [
+        'trial balance',
+        'tb',
+        'trialbalance',
+        'trial_balance',
+        'sheet1',
+        'sheet 1'
+    ]
+    for sheet in sheet_names:
+        sheet_lower = sheet.lower()
+        for pattern in trial_balance_patterns:
+            if pattern in sheet_lower:
+                logger.info(f"Found trial balance sheet: {sheet}")
+                return sheet
+    # If no match, return the first sheet
+    logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}")
+    return sheet_names[0] if sheet_names else None
+def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
+    """
+    Extract data from a standard trial balance file (fallback method)
+    """
+    tb_sheet = find_trial_balance_sheet(xls)
+    if not tb_sheet:
+        raise ValueError("No suitable sheet found in Excel file")
+    logger.info(f"Using sheet '{tb_sheet}' as trial balance")
+    # Try different skiprows values to find the header
+    for skiprows in [0, 1, 2, 3, 4, 5]:
+        try:
+            df = xls.parse(tb_sheet, skiprows=skiprows)
+            df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
+            # Check if we have reasonable data
+            if len(df) > 5 and len(df.columns) >= 2:
+                logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")
+                # Save the entire trial balance
+                os.makedirs(output_folder, exist_ok=True)
+                output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
+                df.to_csv(output_path, index=False)
+                logger.info(f"Saved trial balance to: {output_path}")
+                return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
+        except Exception as e:
+            logger.debug(f"Failed with skiprows={skiprows}: {e}")
+            continue
+    raise ValueError("Could not parse trial balance sheet with any skiprows configuration")
+def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
     """
     Parse and clean a sheet from the Excel file.
     """
+    if sheet_name not in xls.sheet_names:
+        raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")
     df = xls.parse(sheet_name, skiprows=skiprows)
     df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
     return df
     """
     output_path = os.path.join(output_folder, filename)
     df.to_csv(output_path, index=False)
+    logger.info(f"Exported: {output_path}")
     return NoteCSVInfo(name=filename, rows=df.shape[0])
 def main() -> None:
     """
     Main function to extract notes from Excel and export as CSVs.
+    Handles both structured note files and standard trial balance files.
     """
     # Use command-line argument for Excel file path if provided
     excel_path = settings.excel_file_path
         logger.info(f"Excel file path from argument: {excel_path}")
     else:
         logger.info(f"Excel file path from settings: {excel_path}")
+    if not os.path.exists(excel_path):
+        logger.error(f"Excel file not found: {excel_path}")
+        sys.exit(1)
+    try:
+        xls = pd.ExcelFile(excel_path)
+    except Exception as e:
+        logger.error(f"Failed to open Excel file: {e}")
+        sys.exit(1)
+    available_sheets = get_available_sheets(xls)
+    logger.info(f"Available sheets in file: {', '.join(available_sheets)}")
     # Ensure output folder exists
     os.makedirs(settings.output_folder, exist_ok=True)
+    # Try to extract from structured notes first
+    expected_sheets = [
+        settings.note_16_23_sheet,
+        settings.note_2_8_sheet,
+        settings.note_9_sheet,
+        settings.note_10_15_sheet,
+        settings.note_24_30_sheet
+    ]
+    # Check if this is a structured notes file
+    has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)
+    if has_structured_notes:
+        logger.info("="*60)
+        logger.info("Detected structured notes file - extracting from individual note sheets")
+        logger.info("="*60)
+        try:
+            info_16_23 = export_note_to_csv(
+                clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
+                "Note_16_to_23_Full.csv",
+                settings.output_folder
+            )
+            info_2_8 = export_note_to_csv(
+                clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
+                "Note_2_to_8_Full.csv",
+                settings.output_folder
+            )
+            info_9 = export_note_to_csv(
+                clean_note(xls, settings.note_9_sheet, settings.skiprows),
+                "Note_9_Full.csv",
+                settings.output_folder
+            )
+            info_10_15 = export_note_to_csv(
+                clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
+                "Note_10_to_15_Full.csv",
+                settings.output_folder
+            )
+            info_24_30 = export_note_to_csv(
+                clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
+                "Note_24_to_30_Full.csv",
+                settings.output_folder
+            )
+            logger.info("="*60)
+            logger.info("EXTRACTION SUMMARY")
+            logger.info("="*60)
+            logger.info(f"Note 16-23: {info_16_23.rows} rows")
+            logger.info(f"Note 2-8:   {info_2_8.rows} rows")
+            logger.info(f"Note 9:     {info_9.rows} rows")
+            logger.info(f"Note 10-15: {info_10_15.rows} rows")
+            logger.info(f"Note 24-30: {info_24_30.rows} rows")
+            logger.info("="*60)
+        except ValueError as e:
+            logger.error(f"Error extracting from structured notes: {e}")
+            sys.exit(1)
+    else:
+        logger.info("="*60)
+        logger.info("Standard trial balance file detected - using fallback extraction")
+        logger.info("="*60)
+        logger.warning("⚠️  Cash Flow Statement generation may be limited with trial balance data only")
+        logger.info("")
+        logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
+        logger.info("  • Note 16-23 (Revenue details)")
+        logger.info("  • Note 2-8 (Share capital, reserves, liabilities)")
+        logger.info("  • Note 9 (Fixed assets)")
+        logger.info("  • Note 10-15 (Current assets, loans)")
+        logger.info("  • Note 24-30 (Additional disclosures)")
+        logger.info("="*60)
+        try:
+            csv_infos = extract_from_trial_balance(xls, settings.output_folder)
+            logger.info("="*60)
+            logger.info("EXTRACTION SUMMARY")
+            logger.info("="*60)
+            for info in csv_infos:
+                logger.info(f"{info.name}: {info.rows} rows")
+            logger.info("="*60)
+            logger.info("✓ Trial balance data extracted successfully")
+            logger.info("Note: Cash Flow generation will proceed with available data")
+        except Exception as e:
+            logger.error(f"Error extracting from trial balance: {e}")
+            logger.error("Please check if the file contains valid financial data")
+            sys.exit(1)
 if __name__ == "__main__":
     main()