Spaces:
Sleeping
Sleeping
File size: 3,229 Bytes
c79824c f39814a c79824c 452e581 c79824c 452e581 c79824c cbdac03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os
import pandas as pd
import sys
import logging
from typing import Optional
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
# Ensure stdout encoding for Unicode
sys.stdout.reconfigure(encoding='utf-8')
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Settings(BaseSettings):
"""Settings for Balance Sheet CSV extraction, loaded from environment variables or .env file."""
excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="BS_EXCEL_FILE_PATH")
output_folder: str = Field(default="data/csv_notes_bs", env="BS_OUTPUT_FOLDER")
note_2_8_sheet: str = Field(default="Note 2 - 8", env="BS_NOTE_2_8_SHEET")
note_9_sheet: str = Field(default="Note 9", env="BS_NOTE_9_SHEET")
note_10_15_sheet: str = Field(default="Note 10-15", env="BS_NOTE_10_15_SHEET")
skiprows: int = Field(default=3, env="BS_SKIPROWS")
settings = Settings()
class NoteCSVInfo(BaseModel):
name: str
rows: int
def clean_note(sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
"""
Parse and clean a sheet from the Excel file.
Drops empty rows and columns, resets index.
"""
df = xls.parse(sheet_name, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
return df
def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
"""
Export DataFrame to CSV and return info.
"""
output_path = os.path.join(output_folder, filename)
df.to_csv(output_path, index=False)
return NoteCSVInfo(name=filename, rows=df.shape[0])
def main() -> None:
"""
Main function to extract notes from Excel and export as CSVs.
"""
# Use command-line argument for Excel file path if provided
excel_path = settings.excel_file_path
if len(sys.argv) > 1:
excel_path = sys.argv[1]
logger.info(f"Excel file path from argument: {excel_path}")
else:
logger.info(f"Excel file path from settings: {excel_path}")
global xls
xls = pd.ExcelFile(excel_path)
# Clean each sheet
note_2_8_df = clean_note(settings.note_2_8_sheet, settings.skiprows)
note_9_df = clean_note(settings.note_9_sheet, settings.skiprows)
note_10_15_df = clean_note(settings.note_10_15_sheet, settings.skiprows)
# Ensure output folder exists
os.makedirs(settings.output_folder, exist_ok=True)
# Export each as CSV in the folder
info_2_8 = export_note_to_csv(note_2_8_df, "Note_2_to_8_Full.csv", settings.output_folder)
info_9 = export_note_to_csv(note_9_df, "Note_9_Full.csv", settings.output_folder)
info_10_15 = export_note_to_csv(note_10_15_df, "Note_10_to_15_Full.csv", settings.output_folder)
# Log confirmation and row counts
logger.info(f"Extracted rows: Note 2β8 = {info_2_8.rows} rows")
logger.info(f"Extracted rows: Note 9 = {info_9.rows} rows")
logger.info(f"Extracted rows: Note 10β15 = {info_10_15.rows} rows")
if __name__ == "__main__":
try:
main()
except Exception as e:
logger.error(f"Balance sheet data extraction failed: {e}")
sys.exit(1)
|