finryver-dev / pnl /profit_loss_data_extractor.py
Sahil Garg
agent added, files name changed
a9ec4f6
import os
import pandas as pd
import logging
from typing import Optional
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Settings(BaseSettings):
"""Settings for P&L CSV extraction, loaded from environment variables or .env file."""
excel_file_path: str = Field(default="In Lakhs BS_FY 23-24 V5 - Final.xlsx", env="PNL_EXCEL_FILE_PATH")
output_folder: str = Field(default="data/csv_notes_pnl", env="PNL_OUTPUT_FOLDER")
note_16_23_sheet: str = Field(default="Note 16-23", env="PNL_NOTE_16_23_SHEET")
skiprows: int = Field(default=3, env="PNL_SKIPROWS")
settings = Settings()
def get_xls(excel_file_path: str) -> pd.ExcelFile:
try:
xls = pd.ExcelFile(excel_file_path)
logger.info(f"Loaded Excel file: {excel_file_path}")
logger.info(f"Available sheets: {xls.sheet_names}")
return xls
except Exception as e:
logger.error(f"Failed to load Excel file '{excel_file_path}': {e}")
raise
class NoteCSVInfo(BaseModel):
name: str
rows: int
def clean_note(xls, sheet_name: str, skiprows: int = settings.skiprows) -> pd.DataFrame:
"""
Parse and clean a sheet from the Excel file.
Drops empty rows and columns, resets index.
"""
df = xls.parse(sheet_name, skiprows=skiprows)
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
return df
def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
"""
Export DataFrame to CSV and return info.
"""
# Always use absolute path for output folder
abs_output_folder = os.path.abspath(output_folder)
try:
os.makedirs(abs_output_folder, exist_ok=True)
logger.info(f"Output folder ensured: {abs_output_folder}")
except Exception as e:
logger.error(f"Failed to create output folder '{abs_output_folder}': {e}")
raise
output_path = os.path.join(abs_output_folder, filename)
df.to_csv(output_path, index=False)
logger.info(f"CSV file written to: {output_path}")
return NoteCSVInfo(name=filename, rows=df.shape[0])
def main() -> None:
"""
Main function to extract P&L notes from Excel and export as CSV.
"""
import sys
logger.info(f"Current working directory: {os.getcwd()}")
excel_file_path = settings.excel_file_path
if len(sys.argv) > 1:
excel_file_path = sys.argv[1]
logger.info(f"Excel file path from argument: {excel_file_path}")
xls = get_xls(excel_file_path)
if settings.note_16_23_sheet not in xls.sheet_names:
logger.error(f"Sheet '{settings.note_16_23_sheet}' not found in Excel file. Available sheets: {xls.sheet_names}")
return
note_16_23_df = clean_note(xls, settings.note_16_23_sheet, settings.skiprows)
logger.info(f"Loaded DataFrame shape: {note_16_23_df.shape}")
logger.info(f"First few rows:\n{note_16_23_df.head()}\n")
info_16_23 = export_note_to_csv(note_16_23_df, "Note_16_to_23_Full.csv", settings.output_folder)
logger.info(f"Extracted rows: Note 16-23 = {info_16_23.rows} rows")
abs_output_folder = os.path.abspath(settings.output_folder)
logger.info(f"CSV output path: {os.path.join(abs_output_folder, 'Note_16_to_23_Full.csv')}")
if __name__ == "__main__":
main()