import logging
import re
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ValidationError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Date-label patterns used by is_date_label, compiled once at import time.
# All twelve month names are listed so that January/February labels are
# recognised as dates too.
_LONG_DATE_RE = re.compile(
    r"^(January|February|March|April|May|June|July|August|September|October"
    r"|November|December)\s+\d{1,2},\s+\d{4}$"
)
_ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")


class NormalizedNote(BaseModel):
    """Schema used to validate the dict built by normalize_llm_note_json."""

    note_number: Optional[str]
    note_title: Optional[str]
    full_title: Optional[str]
    # Rows with "particulars", "current_year" and "previous_year" keys.
    table_data: List[Dict[str, Any]]
    breakdown: Dict[str, Any] = {}
    matched_accounts: List[Any] = []
    total_amount: Optional[float] = None
    total_amount_lakhs: Optional[float] = None
    matched_accounts_count: Optional[int] = None
    comparative_data: Dict[str, Any] = {}
    notes_and_disclosures: List[str] = []
    markdown_content: Optional[str] = ""


def is_date_label(label: str) -> bool:
    """Check if a label is a date string.

    Two formats are recognised: ``"<Month> <day>, <year>"`` with a full
    English month name (e.g. ``"March 31, 2024"``) and ISO-style
    ``"YYYY-MM-DD"``.

    Args:
        label: The text to test.

    Returns:
        True if ``label`` matches one of the two date formats. Non-string
        values (for example ``None``) are never dates and yield False.
    """
    if not isinstance(label, str):
        return False
    return bool(_LONG_DATE_RE.match(label) or _ISO_DATE_RE.match(label))


def _build_table_rows(structure: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten the "structure" items of an LLM note into table rows."""
    rows: List[Dict[str, Any]] = []
    for item in structure:
        for sub in item.get("subcategories") or []:
            label = sub.get("label", "")
            # Date-labelled subcategories are column headings, not data rows.
            if is_date_label(label):
                continue
            rows.append({
                "particulars": label,
                "current_year": sub.get("value", ""),
                "previous_year": sub.get("previous_value", "-"),
            })
        # A category carrying a total gets its own "Total <category>" row
        # after its subcategories.
        if "category" in item and ("total" in item or "previous_total" in item):
            rows.append({
                "particulars": f"Total {item.get('category', '')}",
                "current_year": item.get("total", ""),
                "previous_year": item.get("previous_total", "-"),
            })
    return rows


def normalize_llm_note_json(llm_json: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a single LLM-generated note JSON to standard format.

    Args:
        llm_json: The raw note. Keys read are "note_number", "metadata",
            "note_title", "title", "full_title", "structure" and
            "markdown_content"; all are optional.

    Returns:
        A dict compatible with NormalizedNote. The dict is validated against
        that model, but a validation failure is only logged as a warning;
        the dict is returned either way.
    """
    # "metadata" may be absent, None, or not a mapping; treat all of those
    # as "no metadata" instead of failing on .get().
    metadata = llm_json.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    note_number = llm_json.get("note_number") or metadata.get("note_number", "")
    note_title = llm_json.get("note_title") or llm_json.get("title", "")
    full_title = llm_json.get("full_title") or (
        f"{note_number}. {note_title}" if note_number else note_title
    )

    table_data = _build_table_rows(llm_json.get("structure") or [])

    # Optionally, add a header row
    # NOTE: the period labels are hard-coded and are not derived from the
    # input note.
    if table_data:
        table_data.insert(0, {
            "particulars": "Particulars",
            "current_year": "March 31, 2024",
            "previous_year": "March 31, 2023",
        })

    normalized = {
        "note_number": note_number,
        "note_title": note_title,
        "full_title": full_title,
        "table_data": table_data,
        "breakdown": {},
        "matched_accounts": [],
        "total_amount": None,
        "total_amount_lakhs": None,
        "matched_accounts_count": None,
        "comparative_data": {},
        "notes_and_disclosures": [],
        "markdown_content": llm_json.get("markdown_content", ""),
    }

    try:
        # Validate with Pydantic model
        NormalizedNote(**normalized)
    except ValidationError as ve:
        # Best effort: report the problem but still hand back the dict.
        logger.warning("Validation error in normalized note: %s", ve)

    return normalized


def normalize_llm_notes_json(llm_json: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize every note in an LLM response.

    Accepts {"notes": [ ... ]} and returns {"notes": [ ...normalized... ]}.
    A missing or None "notes" value produces an empty list.
    """
    notes = llm_json.get("notes") or []
    return {"notes": [normalize_llm_note_json(note) for note in notes]}