Spaces:

point9
/

finryver-dev

Sleeping

App Files Files Community

finryver-dev / cf /cash_flow_data_extractor.py

dipan004

Update cf/cash_flow_data_extractor.py

e1b550c verified 14 days ago

raw

history blame contribute delete

9.16 kB

	import os
	import pandas as pd
	import sys
	import logging
	from typing import Optional, List
	from pydantic import BaseModel, Field
	from pydantic_settings import BaseSettings, SettingsConfigDict

	# Ensure stdout encoding for Unicode
	sys.stdout.reconfigure(encoding='utf-8')

	# Configure logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class Settings(BaseSettings):
	"""Settings for Cash Flow Statement CSV extraction"""

	model_config = SettingsConfigDict(
	env_prefix='CFS_',
	env_file='.env',
	extra='ignore'
	)

	excel_file_path: str = Field(default="data/input/In Lakhs BS_FY 23-24 V5 - Final.xlsx")
	output_folder: str = Field(default="data/csv_notes_cfs")
	note_16_23_sheet: str = Field(default="Note 16-23")
	note_2_8_sheet: str = Field(default="Note 2 - 8")
	note_9_sheet: str = Field(default="Note 9")
	note_10_15_sheet: str = Field(default="Note 10-15")
	note_24_30_sheet: str = Field(default="Note 24-30")
	skiprows: int = Field(default=3)

	settings = Settings()

	class NoteCSVInfo(BaseModel):
	name: str
	rows: int

	def get_available_sheets(xls: pd.ExcelFile) -> List[str]:
	"""Get list of available sheet names"""
	return xls.sheet_names

	def find_trial_balance_sheet(xls: pd.ExcelFile) -> Optional[str]:
	"""Find the main trial balance sheet"""
	sheet_names = get_available_sheets(xls)

	# Common trial balance sheet name patterns
	trial_balance_patterns = [
	'trial balance',
	'tb',
	'trialbalance',
	'trial_balance',
	'sheet1',
	'sheet 1'
	]

	for sheet in sheet_names:
	sheet_lower = sheet.lower()
	for pattern in trial_balance_patterns:
	if pattern in sheet_lower:
	logger.info(f"Found trial balance sheet: {sheet}")
	return sheet

	# If no match, return the first sheet
	logger.info(f"No trial balance pattern matched, using first sheet: {sheet_names[0]}")
	return sheet_names[0] if sheet_names else None

	def extract_from_trial_balance(xls: pd.ExcelFile, output_folder: str) -> List[NoteCSVInfo]:
	"""
	Extract data from a standard trial balance file (fallback method)
	"""
	tb_sheet = find_trial_balance_sheet(xls)

	if not tb_sheet:
	raise ValueError("No suitable sheet found in Excel file")

	logger.info(f"Using sheet '{tb_sheet}' as trial balance")

	# Try different skiprows values to find the header
	for skiprows in [0, 1, 2, 3, 4, 5]:
	try:
	df = xls.parse(tb_sheet, skiprows=skiprows)
	df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)

	# Check if we have reasonable data
	if len(df) > 5 and len(df.columns) >= 2:
	logger.info(f"Successfully parsed with skiprows={skiprows}, rows={len(df)}, cols={len(df.columns)}")

	# Save the entire trial balance
	os.makedirs(output_folder, exist_ok=True)
	output_path = os.path.join(output_folder, "Trial_Balance_Full.csv")
	df.to_csv(output_path, index=False)
	logger.info(f"Saved trial balance to: {output_path}")

	return [NoteCSVInfo(name="Trial_Balance_Full.csv", rows=df.shape[0])]
	except Exception as e:
	logger.debug(f"Failed with skiprows={skiprows}: {e}")
	continue

	raise ValueError("Could not parse trial balance sheet with any skiprows configuration")

	def clean_note(xls: pd.ExcelFile, sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
	"""
	Parse and clean a sheet from the Excel file.
	"""
	if sheet_name not in xls.sheet_names:
	raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {', '.join(xls.sheet_names)}")

	df = xls.parse(sheet_name, skiprows=skiprows)
	df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
	return df

	def export_note_to_csv(df: pd.DataFrame, filename: str, output_folder: str) -> NoteCSVInfo:
	"""
	Export DataFrame to CSV and return info.
	"""
	output_path = os.path.join(output_folder, filename)
	df.to_csv(output_path, index=False)
	logger.info(f"Exported: {output_path}")
	return NoteCSVInfo(name=filename, rows=df.shape[0])

	def main() -> None:
	"""
	Main function to extract notes from Excel and export as CSVs.
	Handles both structured note files and standard trial balance files.
	"""
	# Use command-line argument for Excel file path if provided
	excel_path = settings.excel_file_path
	if len(sys.argv) > 1:
	excel_path = sys.argv[1]
	logger.info(f"Excel file path from argument: {excel_path}")
	else:
	logger.info(f"Excel file path from settings: {excel_path}")

	if not os.path.exists(excel_path):
	logger.error(f"Excel file not found: {excel_path}")
	sys.exit(1)

	try:
	xls = pd.ExcelFile(excel_path)
	except Exception as e:
	logger.error(f"Failed to open Excel file: {e}")
	sys.exit(1)

	available_sheets = get_available_sheets(xls)
	logger.info(f"Available sheets in file: {', '.join(available_sheets)}")

	# Ensure output folder exists
	os.makedirs(settings.output_folder, exist_ok=True)

	# Try to extract from structured notes first
	expected_sheets = [
	settings.note_16_23_sheet,
	settings.note_2_8_sheet,
	settings.note_9_sheet,
	settings.note_10_15_sheet,
	settings.note_24_30_sheet
	]

	# Check if this is a structured notes file
	has_structured_notes = any(sheet in available_sheets for sheet in expected_sheets)

	if has_structured_notes:
	logger.info("="*60)
	logger.info("Detected structured notes file - extracting from individual note sheets")
	logger.info("="*60)

	try:
	info_16_23 = export_note_to_csv(
	clean_note(xls, settings.note_16_23_sheet, settings.skiprows),
	"Note_16_to_23_Full.csv",
	settings.output_folder
	)
	info_2_8 = export_note_to_csv(
	clean_note(xls, settings.note_2_8_sheet, settings.skiprows),
	"Note_2_to_8_Full.csv",
	settings.output_folder
	)
	info_9 = export_note_to_csv(
	clean_note(xls, settings.note_9_sheet, settings.skiprows),
	"Note_9_Full.csv",
	settings.output_folder
	)
	info_10_15 = export_note_to_csv(
	clean_note(xls, settings.note_10_15_sheet, settings.skiprows),
	"Note_10_to_15_Full.csv",
	settings.output_folder
	)
	info_24_30 = export_note_to_csv(
	clean_note(xls, settings.note_24_30_sheet, settings.skiprows),
	"Note_24_to_30_Full.csv",
	settings.output_folder
	)

	logger.info("="*60)
	logger.info("EXTRACTION SUMMARY")
	logger.info("="*60)
	logger.info(f"Note 16-23: {info_16_23.rows} rows")
	logger.info(f"Note 2-8: {info_2_8.rows} rows")
	logger.info(f"Note 9: {info_9.rows} rows")
	logger.info(f"Note 10-15: {info_10_15.rows} rows")
	logger.info(f"Note 24-30: {info_24_30.rows} rows")
	logger.info("="*60)

	except ValueError as e:
	logger.error(f"Error extracting from structured notes: {e}")
	sys.exit(1)
	else:
	logger.info("="*60)
	logger.info("Standard trial balance file detected - using fallback extraction")
	logger.info("="*60)
	logger.warning("⚠️ Cash Flow Statement generation may be limited with trial balance data only")
	logger.info("")
	logger.info("For complete Cash Flow Statement, please provide a file with these sheets:")
	logger.info(" • Note 16-23 (Revenue details)")
	logger.info(" • Note 2-8 (Share capital, reserves, liabilities)")
	logger.info(" • Note 9 (Fixed assets)")
	logger.info(" • Note 10-15 (Current assets, loans)")
	logger.info(" • Note 24-30 (Additional disclosures)")
	logger.info("="*60)

	try:
	csv_infos = extract_from_trial_balance(xls, settings.output_folder)

	logger.info("="*60)
	logger.info("EXTRACTION SUMMARY")
	logger.info("="*60)
	for info in csv_infos:
	logger.info(f"{info.name}: {info.rows} rows")
	logger.info("="*60)
	logger.info("✓ Trial balance data extracted successfully")
	logger.info("Note: Cash Flow generation will proceed with available data")

	except Exception as e:
	logger.error(f"Error extracting from trial balance: {e}")
	logger.error("Please check if the file contains valid financial data")
	sys.exit(1)

	if __name__ == "__main__":
	main()