FinRyver / notes /data_loader.py
Sahil Garg
module and file name changed according to community standards
c094882
raw
history blame
1.8 kB
import os
import json
import logging
import pandas as pd
from typing import Any
from pydantic import BaseModel, ValidationError
from pydantic_settings import BaseSettings
from utils.utils import clean_value
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Settings(BaseSettings):
"""Application settings loaded from environment variables or .env file."""
trial_balance_json: str = "data/output1/parsed_trial_balance.json"
settings = Settings()
class TrialBalanceRecord(BaseModel):
account_name: str
amount: float
group: str
def load_trial_balance() -> pd.DataFrame:
"""
Load trial balance data from a JSON file, validate with Pydantic, and return as a cleaned DataFrame.
Raises FileNotFoundError if the file does not exist.
"""
json_file = settings.trial_balance_json
if not os.path.exists(json_file):
logger.error(f"{json_file} not found! Please run the data extraction step first.")
raise FileNotFoundError(f"{json_file} not found! Please run the data extraction step first.")
with open(json_file, "r", encoding="utf-8") as f:
parsed_data = json.load(f)
# Determine the structure and load into DataFrame
if isinstance(parsed_data, list):
records = parsed_data
else:
records = parsed_data.get("trial_balance", parsed_data)
validated_records = []
for record in records:
try:
validated = TrialBalanceRecord(**record)
validated_dict = validated.dict()
except ValidationError as ve:
logger.warning(f"Validation error for record: {ve}")
validated_dict = record # fallback to raw dict
validated_records.append(validated_dict)
tb_df = pd.DataFrame(validated_records)
tb_df['amount'] = tb_df['amount'].apply(clean_value)
logger.info(f"Loaded trial balance with {len(tb_df)} records.")
return tb_df