dipan004 committed on
Commit
2623d94
·
verified ·
1 Parent(s): b261ad9

Update notes/llm_notes_generator.py

Browse files
Files changed (1) hide show
  1. notes/llm_notes_generator.py +28 -107
notes/llm_notes_generator.py CHANGED
@@ -1,14 +1,11 @@
1
- # Minimal placeholder for FlexibleFinancialNoteGenerator
2
  class FlexibleFinancialNoteGenerator:
3
  def __init__(self):
4
  pass
5
 
6
  def generate_note(self, note_number, trial_balance_path=None):
7
- # Placeholder logic
8
  return True
9
 
10
  def generate_all_notes(self, trial_balance_path=None):
11
- # Placeholder logic
12
  return {"dummy": True}
13
 
14
  import json
@@ -27,15 +24,12 @@ from pydantic_settings import BaseSettings
27
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
28
  from utils.utils import convert_note_json_to_lakhs
29
 
30
- # Load environment variables
31
  load_dotenv(dotenv_path=Path(__file__).parent.parent / '.env')
32
 
33
- # Configure logging
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
37
  class Settings(BaseSettings):
38
- """Application settings loaded from environment variables or .env file."""
39
  openrouter_api_key: str = os.getenv('OPENROUTER_API_KEY', '')
40
  api_url: str = "https://openrouter.ai/api/v1/chat/completions"
41
  output_dir: str = "data/generated_notes"
@@ -51,7 +45,6 @@ class Account(BaseModel):
51
  class NoteTemplate(BaseModel):
52
  title: str
53
  full_title: str
54
- # Add other fields as needed for your template structure
55
 
56
  class GeneratedNote(BaseModel):
57
  note_number: str
@@ -59,14 +52,19 @@ class GeneratedNote(BaseModel):
59
  grand_total_lakhs: float
60
  generated_on: str
61
  assumptions: Optional[str] = None
62
- # Add other fields as needed
63
 
64
  class FlexibleFinancialNoteGenerator:
65
- def __init__(self):
66
- self.openrouter_api_key = settings.openrouter_api_key
67
- if not self.openrouter_api_key:
68
- logger.error("OPENROUTER_API_KEY not found in .env file")
69
- raise ValueError("OPENROUTER_API_KEY not found in .env file")
 
 
 
 
 
 
70
  self.api_url = settings.api_url
71
  self.headers = {
72
  "Authorization": f"Bearer {self.openrouter_api_key}",
@@ -75,18 +73,13 @@ class FlexibleFinancialNoteGenerator:
75
  "X-Title": "Financial Note Generator"
76
  }
77
  self.note_templates = self.load_note_templates()
78
- # Updated model list with DeepSeek as first choice
79
  self.recommended_models = [
80
-
81
- "deepseek/deepseek-r1",
82
- #"deepseek/deepseek-coder",
83
  "mistralai/mixtral-8x7b-instruct"
84
  ]
85
 
86
  def load_note_templates(self) -> Dict[str, Any]:
87
- """Load note templates from notes_template.py file."""
88
  try:
89
- # Add parent directory to path for imports when run as script
90
  if __name__ == "__main__":
91
  sys.path.append(str(Path(__file__).parent.parent))
92
 
@@ -100,7 +93,6 @@ class FlexibleFinancialNoteGenerator:
100
  return {}
101
 
102
  def load_trial_balance(self, file_path: str = settings.trial_balance_json) -> Optional[Dict[str, Any]]:
103
- """Load the complete trial balance from Excel or JSON."""
104
  try:
105
  if file_path.endswith('.json'):
106
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -130,14 +122,12 @@ class FlexibleFinancialNoteGenerator:
130
  return None
131
 
132
  def build_llm_prompt(self, note_number: str, trial_balance_data: Dict[str, Any]) -> Optional[str]:
133
- """Build comprehensive LLM prompt with strict JSON output requirements"""
134
  if note_number not in self.note_templates:
135
  return None
136
 
137
  template = self.note_templates[note_number]
138
  all_accounts = trial_balance_data.get("accounts", [])
139
 
140
- # Build context with full trial balance
141
  context = {
142
  "note_info": {
143
  "number": note_number,
@@ -152,18 +142,17 @@ class FlexibleFinancialNoteGenerator:
152
  "financial_year": "2023-24"
153
  }
154
 
155
- # Get note-specific classification guidance
156
  classification_guide = self._get_classification_guide(note_number)
157
 
158
  prompt = f"""You are a senior financial analyst and chartered accountant with expertise in Indian accounting standards and Schedule III of the Companies Act 2013.
159
 
160
- 🔴 CRITICAL INSTRUCTIONS - MUST FOLLOW EXACTLY:
161
  1. OUTPUT ONLY VALID JSON - NO MARKDOWN, NO EXPLANATIONS, NO TEXT OUTSIDE JSON
162
  2. START YOUR RESPONSE WITH {{ and END WITH }}
163
  3. DO NOT USE ```json``` CODE BLOCKS
164
  4. DO NOT ADD ANY COMMENTARY OR EXPLANATIONS
165
 
166
- 🔴 REQUIRED JSON STRUCTURE - ALL FIELDS MANDATORY:
167
  {{
168
  "title": "{template.get('title', '')}",
169
  "full_title": "{template.get('full_title', '')}",
@@ -191,7 +180,7 @@ class FlexibleFinancialNoteGenerator:
191
  "assumptions": "List any assumptions made during classification"
192
  }}
193
 
194
- 🔴 STRUCTURE ARRAY EXPLAINED:
195
  - First element: Header row with column labels (March 31, 2024, March 31, 2023)
196
  - Subsequent elements: Data categories with subcategories
197
  - Each data category must have:
@@ -200,15 +189,15 @@ class FlexibleFinancialNoteGenerator:
200
  * "total": Sum of current year values in subcategories
201
  * "previous_total": Sum of previous year values in subcategories
202
 
203
- 🔴 YOUR TASK:
204
  1. Analyze ALL trial balance accounts provided below
205
  2. Identify accounts that belong to "{template['full_title']}"
206
  3. Classify into appropriate subcategories per Schedule III
207
- 4. Convert all amounts to lakhs ( ÷ 100,000) with 2 decimal places
208
  5. Calculate accurate totals ensuring mathematical consistency
209
  6. Structure output in hierarchical "structure" array format
210
 
211
- 🔴 MATHEMATICAL REQUIREMENTS:
212
  - All amounts MUST be in lakhs (divide original by 100,000)
213
  - All subtotals MUST equal the grand total exactly
214
  - Use 0.00 for March 2023 if data missing
@@ -216,16 +205,16 @@ class FlexibleFinancialNoteGenerator:
216
  - Ensure "total" = sum of "value" in subcategories
217
  - Ensure "previous_total" = sum of "previous_value" in subcategories
218
 
219
- 🔴 CLASSIFICATION GUIDANCE FOR NOTE {note_number}:
220
  {classification_guide}
221
 
222
- 🔴 COMPLETE TRIAL BALANCE DATA:
223
  {json.dumps(context, indent=2)}
224
 
225
- 🔴 TEMPLATE STRUCTURE TO FOLLOW:
226
  {json.dumps(template, indent=2)}
227
 
228
- 🔴 VALIDATION RULES:
229
  - If no accounts match this note category, use empty categories with 0.00 totals
230
  - Ensure "metadata.note_number" exactly matches {note_number}
231
  - Document classification logic in "assumptions" field
@@ -236,7 +225,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
236
  return prompt
237
 
238
  def _get_classification_guide(self, note_number: str) -> str:
239
- """Get note-specific classification guidance"""
240
  guides = {
241
  "10": """
242
  **Note 10 - Long Term Loans and Advances:**
@@ -258,7 +246,7 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
258
  """,
259
  "13": """
260
  **Note 13 - Cash and Cash Equivalents:**
261
- - Include: Cash on hand, balances with banks (current/savings), short-term deposits (3 months)
262
  - Separate: Cash and cash equivalents vs Other bank balances (FDs >3 months)
263
  - Show: Balances in current accounts, savings accounts, fixed deposits separately
264
  """,
@@ -279,7 +267,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
279
  return guides.get(note_number, f"**Note {note_number}:** Classify accounts logically based on their nature and the note title.")
280
 
281
  def call_openrouter_api(self, prompt: str) -> Optional[str]:
282
- """Make API call to OpenRouter with model fallback"""
283
  for model in self.recommended_models:
284
  logger.info(f"Trying model: {model}")
285
  payload = {
@@ -312,6 +299,9 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
312
  logger.warning(f"Model {model} not found (404), trying next model")
313
  elif e.response.status_code == 402:
314
  logger.warning(f"Model {model} requires payment (402), trying next model")
 
 
 
315
  else:
316
  logger.error(f"HTTP error with {model}: {e}")
317
  except Exception as e:
@@ -321,11 +311,8 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
321
  return None
322
 
323
  def extract_json_from_markdown(self, response_text: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
324
- """Extract JSON from response, handling markdown code blocks and cleaning"""
325
  response_text = response_text.strip()
326
 
327
- # CRITICAL FIX: Handle concatenated/duplicate JSON (e.g., "}{\n{")
328
- # Find the first complete JSON object
329
  json_objects = []
330
  brace_count = 0
331
  start_idx = -1
@@ -338,12 +325,10 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
338
  elif char == '}':
339
  brace_count -= 1
340
  if brace_count == 0 and start_idx != -1:
341
- # Found complete JSON object
342
  potential_json = response_text[start_idx:i+1]
343
  try:
344
  parsed = json.loads(potential_json)
345
  json_objects.append((parsed, potential_json))
346
- # Use the first valid JSON object
347
  break
348
  except json.JSONDecodeError:
349
  continue
@@ -352,8 +337,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
352
  logger.info("Successfully extracted first valid JSON object from response")
353
  return json_objects[0]
354
 
355
- # Fallback: Try original extraction methods
356
- # Remove any leading/trailing text outside JSON
357
  json_patterns = [
358
  r'```json\s*(.*?)\s*```',
359
  r'```\s*(.*?)\s*```',
@@ -370,12 +353,10 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
370
  except json.JSONDecodeError:
371
  continue
372
 
373
- # Try parsing the entire response as JSON
374
  try:
375
  json_data = json.loads(response_text)
376
  return json_data, response_text
377
  except json.JSONDecodeError:
378
- # Last attempt: find JSON-like structure
379
  try:
380
  start = response_text.find('{')
381
  end = response_text.rfind('}') + 1
@@ -389,13 +370,10 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
389
  return None, None
390
 
391
  def validate_and_fix_json(self, json_data: Dict[str, Any], note_number: str) -> Dict[str, Any]:
392
- """Validate JSON structure and auto-fix missing required fields"""
393
  fixed_data = json_data.copy()
394
 
395
- # Get template for this note
396
  template = self.note_templates.get(note_number, {})
397
 
398
- # Auto-fix title fields
399
  if "title" not in fixed_data or not fixed_data["title"]:
400
  fixed_data["title"] = template.get("title", f"Note {note_number}")
401
  logger.info(f"Auto-fixed missing title field")
@@ -404,18 +382,14 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
404
  fixed_data["full_title"] = template.get("full_title", f"{note_number}. {fixed_data.get('title', 'Financial Note')}")
405
  logger.info(f"Auto-fixed missing full_title field")
406
 
407
- # Auto-fix or create metadata
408
  if "metadata" not in fixed_data or not isinstance(fixed_data["metadata"], dict):
409
  fixed_data["metadata"] = {}
410
  logger.info("Auto-created metadata object")
411
 
412
- # CRITICAL FIX: Ensure note_number is correct integer, not 0.0
413
  metadata_note_num = fixed_data["metadata"].get("note_number")
414
  try:
415
- # Convert note_number string to int
416
  expected_note_num = int(note_number)
417
 
418
- # Check if metadata note_number is wrong (0, 0.0, or mismatch)
419
  if (metadata_note_num is None or
420
  metadata_note_num == 0 or
421
  metadata_note_num == 0.0 or
@@ -431,7 +405,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
431
  fixed_data["metadata"]["generated_on"] = datetime.now().isoformat()
432
  logger.info("Auto-fixed missing metadata.generated_on field")
433
 
434
- # Auto-fix or create structure array
435
  if "structure" not in fixed_data or not isinstance(fixed_data["structure"], list):
436
  logger.warning("Structure array missing, creating default structure")
437
  fixed_data["structure"] = [
@@ -452,7 +425,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
452
  }
453
  ]
454
  else:
455
- # Validate and fix structure elements
456
  if len(fixed_data["structure"]) == 0:
457
  logger.warning("Empty structure array, adding default elements")
458
  fixed_data["structure"] = [
@@ -465,7 +437,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
465
  }
466
  ]
467
 
468
- # Ensure each structure element has required fields
469
  for i, struct_elem in enumerate(fixed_data["structure"]):
470
  if not isinstance(struct_elem, dict):
471
  continue
@@ -476,7 +447,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
476
  if "subcategories" not in struct_elem or not isinstance(struct_elem["subcategories"], list):
477
  struct_elem["subcategories"] = []
478
 
479
- # For data rows (not header), ensure totals exist
480
  if i > 0 and struct_elem.get("subcategories"):
481
  if "total" not in struct_elem:
482
  struct_elem["total"] = sum(
@@ -492,7 +462,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
492
  if isinstance(sub, dict)
493
  )
494
 
495
- # Auto-fix assumptions
496
  if "assumptions" not in fixed_data:
497
  fixed_data["assumptions"] = "Classification based on account names and standard accounting practices"
498
  logger.info("Auto-added default assumptions")
@@ -500,10 +469,8 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
500
  return fixed_data
501
 
502
  def validate_json_structure(self, json_data: Dict[str, Any], note_number: str) -> Tuple[bool, str]:
503
- """Validate that the JSON matches expected structure"""
504
  required_fields = ["title", "full_title", "structure", "metadata", "assumptions"]
505
 
506
- # Check required fields
507
  missing_fields = []
508
  for field in required_fields:
509
  if field not in json_data:
@@ -512,7 +479,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
512
  if missing_fields:
513
  return False, f"Missing required fields: {', '.join(missing_fields)}"
514
 
515
- # Check metadata structure
516
  if not isinstance(json_data.get("metadata"), dict):
517
  return False, "metadata must be an object"
518
 
@@ -523,7 +489,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
523
  if str(metadata.get("note_number", "")) != str(note_number):
524
  return False, f"Note number mismatch: expected {note_number}, got {metadata.get('note_number')}"
525
 
526
- # Check structure array
527
  if not isinstance(json_data.get("structure"), list):
528
  return False, "structure must be an array"
529
 
@@ -533,7 +498,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
533
  return True, "Validation passed"
534
 
535
  def _generate_markdown_from_structure(self, json_data: Dict[str, Any]) -> str:
536
- """Generate markdown table from structure array"""
537
  try:
538
  title = json_data.get("full_title", json_data.get("title", "Financial Note"))
539
  structure = json_data.get("structure", [])
@@ -541,44 +505,36 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
541
  if not structure:
542
  return f"# {title}\n\n*No data available*"
543
 
544
- # Start markdown
545
  md_lines = [f"# {title}\n"]
546
 
547
- # Get header row (first element)
548
  header_elem = structure[0] if len(structure) > 0 else None
549
  if header_elem and header_elem.get("subcategories"):
550
  headers = [sub.get("label", "") for sub in header_elem["subcategories"]]
551
  md_lines.append("| Particulars | " + " | ".join(headers) + " |")
552
  md_lines.append("|" + "---|" * (len(headers) + 1))
553
 
554
- # Process data rows
555
  for i in range(1, len(structure)):
556
  elem = structure[i]
557
  category = elem.get("category", "")
558
  subcategories = elem.get("subcategories", [])
559
 
560
- # Add category header if exists
561
  if category:
562
  md_lines.append(f"\n**{category}**\n")
563
 
564
- # Add subcategory rows
565
  for sub in subcategories:
566
  label = sub.get("label", "")
567
  value = sub.get("value", 0.00)
568
  previous_value = sub.get("previous_value", 0.00)
569
  md_lines.append(f"| {label} | {value:.2f} | {previous_value:.2f} |")
570
 
571
- # Add total row if exists
572
  if "total" in elem:
573
  total = elem.get("total", 0.00)
574
  previous_total = elem.get("previous_total", 0.00)
575
  md_lines.append(f"| **Total {category}** | **{total:.2f}** | **{previous_total:.2f}** |")
576
 
577
- # Add metadata
578
  metadata = json_data.get("metadata", {})
579
  md_lines.append(f"\n\n*Generated on: {metadata.get('generated_on', 'Unknown')}*")
580
 
581
- # Add assumptions if present
582
  assumptions = json_data.get("assumptions", "")
583
  if assumptions:
584
  md_lines.append(f"\n\n**Assumptions:** {assumptions}")
@@ -590,41 +546,32 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
590
  return f"# {json_data.get('full_title', 'Financial Note')}\n\n*Error generating markdown table*"
591
 
592
  def save_generated_note(self, note_data: str, note_number: str, output_dir: str = settings.output_dir) -> bool:
593
- """Save the generated note to file with robust validation and auto-fixing"""
594
  Path(output_dir).mkdir(parents=True, exist_ok=True)
595
  json_output_path = f"{output_dir}/notes.json"
596
  raw_output_path = f"{output_dir}/notes_raw.txt"
597
  formatted_md_path = f"{output_dir}/notes_formatted.md"
598
 
599
  try:
600
- # Always save raw response for debugging
601
  with open(raw_output_path, 'w', encoding='utf-8') as f:
602
  f.write(note_data)
603
 
604
- # Extract and validate JSON
605
  json_data, json_string = self.extract_json_from_markdown(note_data)
606
 
607
  if json_data:
608
- # Auto-fix missing or incorrect fields
609
  json_data = self.validate_and_fix_json(json_data, note_number)
610
 
611
- # Final validation
612
  is_valid, validation_msg = self.validate_json_structure(json_data, note_number)
613
  if not is_valid:
614
  logger.warning(f"JSON validation warning after auto-fix: {validation_msg}")
615
 
616
- # Convert to lakhs if needed
617
  json_data = convert_note_json_to_lakhs(json_data)
618
 
619
- # Save JSON
620
  with open(json_output_path, 'w', encoding='utf-8') as f:
621
  json.dump(json_data, f, indent=2, ensure_ascii=False)
622
  logger.info(f"JSON saved to {json_output_path}")
623
 
624
- # Generate and save markdown
625
  md_content = json_data.get('markdown_content', '')
626
  if not md_content:
627
- # Generate markdown from structure
628
  md_content = self._generate_markdown_from_structure(json_data)
629
  logger.info("Auto-generated markdown from structure array")
630
 
@@ -633,7 +580,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
633
 
634
  return True
635
  else:
636
- # Create fallback JSON with all required fields
637
  template = self.note_templates.get(note_number, {})
638
  fallback_json = {
639
  "title": template.get("title", f"Note {note_number}"),
@@ -672,7 +618,6 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
672
  except Exception as e:
673
  logger.error(f"Error saving files: {e}")
674
 
675
- # Emergency fallback
676
  try:
677
  template = self.note_templates.get(note_number, {})
678
  emergency_json = {
@@ -703,42 +648,35 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
703
  return False
704
 
705
  def generate_note(self, note_number: str, trial_balance_path: str = settings.trial_balance_json) -> bool:
706
- """Generate a specific note based on note number"""
707
  if note_number not in self.note_templates:
708
  logger.error(f"Note template {note_number} not found")
709
  return False
710
 
711
  logger.info(f"Starting Note {note_number} generation...")
712
 
713
- # Load complete trial balance
714
  trial_balance = self.load_trial_balance(trial_balance_path)
715
  if not trial_balance:
716
  return False
717
 
718
- # Build prompt with full trial balance
719
  prompt = self.build_llm_prompt(note_number, trial_balance)
720
  if not prompt:
721
  logger.error("Failed to build prompt")
722
  return False
723
 
724
- # Get LLM response
725
  response = self.call_openrouter_api(prompt)
726
  if not response:
727
  logger.error("Failed to get API response")
728
  return False
729
 
730
- # Save the generated note
731
  success = self.save_generated_note(response, note_number)
732
  logger.info(f"Note {note_number} {'generated successfully' if success else 'generated with issues'}")
733
  return success
734
 
735
  def generate_all_notes(self, trial_balance_path: str = settings.trial_balance_json) -> Dict[str, bool]:
736
- """Generate all available notes and save them in a single notes.json file."""
737
  logger.info(f"Starting generation of all {len(self.note_templates)} notes...")
738
  results = {}
739
  all_notes = []
740
 
741
- # Load trial balance once
742
  trial_balance = self.load_trial_balance(trial_balance_path)
743
  if not trial_balance:
744
  logger.error("Failed to load trial balance")
@@ -747,22 +685,18 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
747
  for note_number in self.note_templates.keys():
748
  logger.info(f"Processing Note {note_number}")
749
 
750
- # Build prompt for this note
751
  prompt = self.build_llm_prompt(note_number, trial_balance)
752
  if not prompt:
753
  results[note_number] = False
754
  continue
755
 
756
- # Get LLM response
757
  response = self.call_openrouter_api(prompt)
758
  if not response:
759
  results[note_number] = False
760
  continue
761
 
762
- # Parse JSON response
763
  json_data, _ = self.extract_json_from_markdown(response)
764
  if json_data:
765
- # Auto-fix and validate
766
  json_data = self.validate_and_fix_json(json_data, note_number)
767
  is_valid, validation_msg = self.validate_json_structure(json_data, note_number)
768
 
@@ -773,13 +707,11 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
773
  logger.info(f"Note {note_number} processed successfully")
774
  else:
775
  logger.warning(f"Note {note_number} validation failed even after auto-fix: {validation_msg}")
776
- # Still include it but mark as failed
777
  json_data = convert_note_json_to_lakhs(json_data)
778
  all_notes.append(json_data)
779
  results[note_number] = False
780
  else:
781
  logger.error(f"Note {note_number}: Could not parse JSON from response")
782
- # Create fallback note with new structure
783
  template = self.note_templates.get(note_number, {})
784
  fallback_note = {
785
  "title": template.get("title", f"Note {note_number}"),
@@ -811,11 +743,9 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
811
  all_notes.append(fallback_note)
812
  results[note_number] = False
813
 
814
- # Brief pause between API calls
815
  import time
816
  time.sleep(2)
817
 
818
- # Save all notes in consolidated file
819
  output_dir = settings.output_dir
820
  Path(output_dir).mkdir(parents=True, exist_ok=True)
821
 
@@ -841,9 +771,7 @@ GENERATE VALID JSON NOW (NO ADDITIONAL TEXT):"""
841
  return results
842
 
843
  def main() -> None:
844
- """Main function to run the flexible note generator"""
845
  try:
846
- # Initialize generator
847
  generator = FlexibleFinancialNoteGenerator()
848
  if not generator.note_templates:
849
  logger.error("No note templates loaded. Check notes_template.py")
@@ -851,9 +779,7 @@ def main() -> None:
851
 
852
  logger.info(f"Loaded {len(generator.note_templates)} note templates")
853
 
854
- # Check for command line arguments
855
  if len(sys.argv) > 1:
856
- # Command line mode
857
  if len(sys.argv) < 3:
858
  logger.error("Usage: python llm_notes_generator.py <mode> <note_numbers>")
859
  logger.error(" mode: 'specific' or 'all'")
@@ -876,7 +802,6 @@ def main() -> None:
876
  if note_number in generator.note_templates:
877
  success = generator.generate_note(note_number)
878
  if success:
879
- # Load the generated note
880
  try:
881
  with open("data/generated_notes/notes.json", "r", encoding="utf-8") as f:
882
  note_data = json.load(f)
@@ -890,7 +815,6 @@ def main() -> None:
890
  else:
891
  logger.error(f"Note {note_number} not found in templates")
892
 
893
- # Save consolidated notes
894
  if all_notes:
895
  output_dir = settings.output_dir
896
  Path(output_dir).mkdir(parents=True, exist_ok=True)
@@ -913,9 +837,8 @@ def main() -> None:
913
  total = len(results)
914
  logger.info(f"{successful}/{total} notes generated successfully")
915
 
916
- # Print detailed results
917
  for note, success in results.items():
918
- status = " SUCCESS" if success else " FAILED"
919
  logger.info(f" Note {note}: {status}")
920
 
921
  else:
@@ -923,7 +846,6 @@ def main() -> None:
923
  sys.exit(1)
924
 
925
  else:
926
- # Interactive mode
927
  choice = input("\nGenerate (1) specific note or (2) all notes? Enter 1 or 2: ").strip()
928
 
929
  if choice == "1":
@@ -943,12 +865,11 @@ def main() -> None:
943
  total = len(results)
944
  logger.info(f"{successful}/{total} notes generated successfully")
945
 
946
- # Print summary
947
  print("\n" + "="*50)
948
  print("GENERATION SUMMARY")
949
  print("="*50)
950
  for note, success in results.items():
951
- status = " SUCCESS" if success else " FAILED"
952
  print(f"Note {note}: {status}")
953
  print("="*50)
954
 
 
 
1
  class FlexibleFinancialNoteGenerator:
2
  def __init__(self):
3
  pass
4
 
5
  def generate_note(self, note_number, trial_balance_path=None):
 
6
  return True
7
 
8
  def generate_all_notes(self, trial_balance_path=None):
 
9
  return {"dummy": True}
10
 
11
  import json
 
24
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
25
  from utils.utils import convert_note_json_to_lakhs
26
 
 
27
  load_dotenv(dotenv_path=Path(__file__).parent.parent / '.env')
28
 
 
29
  logging.basicConfig(level=logging.INFO)
30
  logger = logging.getLogger(__name__)
31
 
32
  class Settings(BaseSettings):
 
33
  openrouter_api_key: str = os.getenv('OPENROUTER_API_KEY', '')
34
  api_url: str = "https://openrouter.ai/api/v1/chat/completions"
35
  output_dir: str = "data/generated_notes"
 
45
  class NoteTemplate(BaseModel):
46
  title: str
47
  full_title: str
 
48
 
49
  class GeneratedNote(BaseModel):
50
  note_number: str
 
52
  grand_total_lakhs: float
53
  generated_on: str
54
  assumptions: Optional[str] = None
 
55
 
56
  class FlexibleFinancialNoteGenerator:
57
+ def __init__(self, user_api_key: Optional[str] = None):
58
+ if user_api_key:
59
+ self.openrouter_api_key = user_api_key
60
+ logger.info("Using user-provided API key")
61
+ else:
62
+ self.openrouter_api_key = settings.openrouter_api_key
63
+ if not self.openrouter_api_key:
64
+ logger.error("OPENROUTER_API_KEY not found in .env file and no user key provided")
65
+ raise ValueError("OPENROUTER_API_KEY not found in .env file and no user key provided")
66
+ logger.info("Using API key from .env file")
67
+
68
  self.api_url = settings.api_url
69
  self.headers = {
70
  "Authorization": f"Bearer {self.openrouter_api_key}",
 
73
  "X-Title": "Financial Note Generator"
74
  }
75
  self.note_templates = self.load_note_templates()
 
76
  self.recommended_models = [
77
+ "deepseek/deepseek-r1",
 
 
78
  "mistralai/mixtral-8x7b-instruct"
79
  ]
80
 
81
  def load_note_templates(self) -> Dict[str, Any]:
 
82
  try:
 
83
  if __name__ == "__main__":
84
  sys.path.append(str(Path(__file__).parent.parent))
85
 
 
93
  return {}
94
 
95
  def load_trial_balance(self, file_path: str = settings.trial_balance_json) -> Optional[Dict[str, Any]]:
 
96
  try:
97
  if file_path.endswith('.json'):
98
  with open(file_path, 'r', encoding='utf-8') as f:
 
122
  return None
123
 
124
  def build_llm_prompt(self, note_number: str, trial_balance_data: Dict[str, Any]) -> Optional[str]:
 
125
  if note_number not in self.note_templates:
126
  return None
127
 
128
  template = self.note_templates[note_number]
129
  all_accounts = trial_balance_data.get("accounts", [])
130
 
 
131
  context = {
132
  "note_info": {
133
  "number": note_number,
 
142
  "financial_year": "2023-24"
143
  }
144
 
 
145
  classification_guide = self._get_classification_guide(note_number)
146
 
147
  prompt = f"""You are a senior financial analyst and chartered accountant with expertise in Indian accounting standards and Schedule III of the Companies Act 2013.
148
 
149
+ 🔴 CRITICAL INSTRUCTIONS - MUST FOLLOW EXACTLY:
150
  1. OUTPUT ONLY VALID JSON - NO MARKDOWN, NO EXPLANATIONS, NO TEXT OUTSIDE JSON
151
  2. START YOUR RESPONSE WITH {{ and END WITH }}
152
  3. DO NOT USE ```json``` CODE BLOCKS
153
  4. DO NOT ADD ANY COMMENTARY OR EXPLANATIONS
154
 
155
+ 🔴 REQUIRED JSON STRUCTURE - ALL FIELDS MANDATORY:
156
  {{
157
  "title": "{template.get('title', '')}",
158
  "full_title": "{template.get('full_title', '')}",
 
180
  "assumptions": "List any assumptions made during classification"
181
  }}
182
 
183
+ 🔴 STRUCTURE ARRAY EXPLAINED:
184
  - First element: Header row with column labels (March 31, 2024, March 31, 2023)
185
  - Subsequent elements: Data categories with subcategories
186
  - Each data category must have:
 
189
  * "total": Sum of current year values in subcategories
190
  * "previous_total": Sum of previous year values in subcategories
191
 
192
+ 🔴 YOUR TASK:
193
  1. Analyze ALL trial balance accounts provided below
194
  2. Identify accounts that belong to "{template['full_title']}"
195
  3. Classify into appropriate subcategories per Schedule III
196
+ 4. Convert all amounts to lakhs (₹ ÷ 100,000) with 2 decimal places
197
  5. Calculate accurate totals ensuring mathematical consistency
198
  6. Structure output in hierarchical "structure" array format
199
 
200
+ 🔴 MATHEMATICAL REQUIREMENTS:
201
  - All amounts MUST be in lakhs (divide original by 100,000)
202
  - All subtotals MUST equal the grand total exactly
203
  - Use 0.00 for March 2023 if data missing
 
205
  - Ensure "total" = sum of "value" in subcategories
206
  - Ensure "previous_total" = sum of "previous_value" in subcategories
207
 
208
+ 🔴 CLASSIFICATION GUIDANCE FOR NOTE {note_number}:
209
  {classification_guide}
210
 
211
+ 🔴 COMPLETE TRIAL BALANCE DATA:
212
  {json.dumps(context, indent=2)}
213
 
214
+ 🔴 TEMPLATE STRUCTURE TO FOLLOW:
215
  {json.dumps(template, indent=2)}
216
 
217
+ 🔴 VALIDATION RULES:
218
  - If no accounts match this note category, use empty categories with 0.00 totals
219
  - Ensure "metadata.note_number" exactly matches {note_number}
220
  - Document classification logic in "assumptions" field
 
225
  return prompt
226
 
227
  def _get_classification_guide(self, note_number: str) -> str:
 
228
  guides = {
229
  "10": """
230
  **Note 10 - Long Term Loans and Advances:**
 
246
  """,
247
  "13": """
248
  **Note 13 - Cash and Cash Equivalents:**
249
+ - Include: Cash on hand, balances with banks (current/savings), short-term deposits (≤3 months)
250
  - Separate: Cash and cash equivalents vs Other bank balances (FDs >3 months)
251
  - Show: Balances in current accounts, savings accounts, fixed deposits separately
252
  """,
 
267
  return guides.get(note_number, f"**Note {note_number}:** Classify accounts logically based on their nature and the note title.")
268
 
269
  def call_openrouter_api(self, prompt: str) -> Optional[str]:
 
270
  for model in self.recommended_models:
271
  logger.info(f"Trying model: {model}")
272
  payload = {
 
299
  logger.warning(f"Model {model} not found (404), trying next model")
300
  elif e.response.status_code == 402:
301
  logger.warning(f"Model {model} requires payment (402), trying next model")
302
+ elif e.response.status_code == 401:
303
+ logger.error(f"Invalid API key (401)")
304
+ return None
305
  else:
306
  logger.error(f"HTTP error with {model}: {e}")
307
  except Exception as e:
 
311
  return None
312
 
313
  def extract_json_from_markdown(self, response_text: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
 
314
  response_text = response_text.strip()
315
 
 
 
316
  json_objects = []
317
  brace_count = 0
318
  start_idx = -1
 
325
  elif char == '}':
326
  brace_count -= 1
327
  if brace_count == 0 and start_idx != -1:
 
328
  potential_json = response_text[start_idx:i+1]
329
  try:
330
  parsed = json.loads(potential_json)
331
  json_objects.append((parsed, potential_json))
 
332
  break
333
  except json.JSONDecodeError:
334
  continue
 
337
  logger.info("Successfully extracted first valid JSON object from response")
338
  return json_objects[0]
339
 
 
 
340
  json_patterns = [
341
  r'```json\s*(.*?)\s*```',
342
  r'```\s*(.*?)\s*```',
 
353
  except json.JSONDecodeError:
354
  continue
355
 
 
356
  try:
357
  json_data = json.loads(response_text)
358
  return json_data, response_text
359
  except json.JSONDecodeError:
 
360
  try:
361
  start = response_text.find('{')
362
  end = response_text.rfind('}') + 1
 
370
  return None, None
371
 
372
  def validate_and_fix_json(self, json_data: Dict[str, Any], note_number: str) -> Dict[str, Any]:
 
373
  fixed_data = json_data.copy()
374
 
 
375
  template = self.note_templates.get(note_number, {})
376
 
 
377
  if "title" not in fixed_data or not fixed_data["title"]:
378
  fixed_data["title"] = template.get("title", f"Note {note_number}")
379
  logger.info(f"Auto-fixed missing title field")
 
382
  fixed_data["full_title"] = template.get("full_title", f"{note_number}. {fixed_data.get('title', 'Financial Note')}")
383
  logger.info(f"Auto-fixed missing full_title field")
384
 
 
385
  if "metadata" not in fixed_data or not isinstance(fixed_data["metadata"], dict):
386
  fixed_data["metadata"] = {}
387
  logger.info("Auto-created metadata object")
388
 
 
389
  metadata_note_num = fixed_data["metadata"].get("note_number")
390
  try:
 
391
  expected_note_num = int(note_number)
392
 
 
393
  if (metadata_note_num is None or
394
  metadata_note_num == 0 or
395
  metadata_note_num == 0.0 or
 
405
  fixed_data["metadata"]["generated_on"] = datetime.now().isoformat()
406
  logger.info("Auto-fixed missing metadata.generated_on field")
407
 
 
408
  if "structure" not in fixed_data or not isinstance(fixed_data["structure"], list):
409
  logger.warning("Structure array missing, creating default structure")
410
  fixed_data["structure"] = [
 
425
  }
426
  ]
427
  else:
 
428
  if len(fixed_data["structure"]) == 0:
429
  logger.warning("Empty structure array, adding default elements")
430
  fixed_data["structure"] = [
 
437
  }
438
  ]
439
 
 
440
  for i, struct_elem in enumerate(fixed_data["structure"]):
441
  if not isinstance(struct_elem, dict):
442
  continue
 
447
  if "subcategories" not in struct_elem or not isinstance(struct_elem["subcategories"], list):
448
  struct_elem["subcategories"] = []
449
 
 
450
  if i > 0 and struct_elem.get("subcategories"):
451
  if "total" not in struct_elem:
452
  struct_elem["total"] = sum(
 
462
  if isinstance(sub, dict)
463
  )
464
 
 
465
  if "assumptions" not in fixed_data:
466
  fixed_data["assumptions"] = "Classification based on account names and standard accounting practices"
467
  logger.info("Auto-added default assumptions")
 
469
  return fixed_data
470
 
471
  def validate_json_structure(self, json_data: Dict[str, Any], note_number: str) -> Tuple[bool, str]:
 
472
  required_fields = ["title", "full_title", "structure", "metadata", "assumptions"]
473
 
 
474
  missing_fields = []
475
  for field in required_fields:
476
  if field not in json_data:
 
479
  if missing_fields:
480
  return False, f"Missing required fields: {', '.join(missing_fields)}"
481
 
 
482
  if not isinstance(json_data.get("metadata"), dict):
483
  return False, "metadata must be an object"
484
 
 
489
  if str(metadata.get("note_number", "")) != str(note_number):
490
  return False, f"Note number mismatch: expected {note_number}, got {metadata.get('note_number')}"
491
 
 
492
  if not isinstance(json_data.get("structure"), list):
493
  return False, "structure must be an array"
494
 
 
498
  return True, "Validation passed"
499
 
500
  def _generate_markdown_from_structure(self, json_data: Dict[str, Any]) -> str:
 
501
  try:
502
  title = json_data.get("full_title", json_data.get("title", "Financial Note"))
503
  structure = json_data.get("structure", [])
 
505
  if not structure:
506
  return f"# {title}\n\n*No data available*"
507
 
 
508
  md_lines = [f"# {title}\n"]
509
 
 
510
  header_elem = structure[0] if len(structure) > 0 else None
511
  if header_elem and header_elem.get("subcategories"):
512
  headers = [sub.get("label", "") for sub in header_elem["subcategories"]]
513
  md_lines.append("| Particulars | " + " | ".join(headers) + " |")
514
  md_lines.append("|" + "---|" * (len(headers) + 1))
515
 
 
516
  for i in range(1, len(structure)):
517
  elem = structure[i]
518
  category = elem.get("category", "")
519
  subcategories = elem.get("subcategories", [])
520
 
 
521
  if category:
522
  md_lines.append(f"\n**{category}**\n")
523
 
 
524
  for sub in subcategories:
525
  label = sub.get("label", "")
526
  value = sub.get("value", 0.00)
527
  previous_value = sub.get("previous_value", 0.00)
528
  md_lines.append(f"| {label} | {value:.2f} | {previous_value:.2f} |")
529
 
 
530
  if "total" in elem:
531
  total = elem.get("total", 0.00)
532
  previous_total = elem.get("previous_total", 0.00)
533
  md_lines.append(f"| **Total {category}** | **{total:.2f}** | **{previous_total:.2f}** |")
534
 
 
535
  metadata = json_data.get("metadata", {})
536
  md_lines.append(f"\n\n*Generated on: {metadata.get('generated_on', 'Unknown')}*")
537
 
 
538
  assumptions = json_data.get("assumptions", "")
539
  if assumptions:
540
  md_lines.append(f"\n\n**Assumptions:** {assumptions}")
 
546
  return f"# {json_data.get('full_title', 'Financial Note')}\n\n*Error generating markdown table*"
547
 
548
  def save_generated_note(self, note_data: str, note_number: str, output_dir: str = settings.output_dir) -> bool:
 
549
  Path(output_dir).mkdir(parents=True, exist_ok=True)
550
  json_output_path = f"{output_dir}/notes.json"
551
  raw_output_path = f"{output_dir}/notes_raw.txt"
552
  formatted_md_path = f"{output_dir}/notes_formatted.md"
553
 
554
  try:
 
555
  with open(raw_output_path, 'w', encoding='utf-8') as f:
556
  f.write(note_data)
557
 
 
558
  json_data, json_string = self.extract_json_from_markdown(note_data)
559
 
560
  if json_data:
 
561
  json_data = self.validate_and_fix_json(json_data, note_number)
562
 
 
563
  is_valid, validation_msg = self.validate_json_structure(json_data, note_number)
564
  if not is_valid:
565
  logger.warning(f"JSON validation warning after auto-fix: {validation_msg}")
566
 
 
567
  json_data = convert_note_json_to_lakhs(json_data)
568
 
 
569
  with open(json_output_path, 'w', encoding='utf-8') as f:
570
  json.dump(json_data, f, indent=2, ensure_ascii=False)
571
  logger.info(f"JSON saved to {json_output_path}")
572
 
 
573
  md_content = json_data.get('markdown_content', '')
574
  if not md_content:
 
575
  md_content = self._generate_markdown_from_structure(json_data)
576
  logger.info("Auto-generated markdown from structure array")
577
 
 
580
 
581
  return True
582
  else:
 
583
  template = self.note_templates.get(note_number, {})
584
  fallback_json = {
585
  "title": template.get("title", f"Note {note_number}"),
 
618
  except Exception as e:
619
  logger.error(f"Error saving files: {e}")
620
 
 
621
  try:
622
  template = self.note_templates.get(note_number, {})
623
  emergency_json = {
 
648
  return False
649
 
650
  def generate_note(self, note_number: str, trial_balance_path: str = settings.trial_balance_json) -> bool:
 
651
  if note_number not in self.note_templates:
652
  logger.error(f"Note template {note_number} not found")
653
  return False
654
 
655
  logger.info(f"Starting Note {note_number} generation...")
656
 
 
657
  trial_balance = self.load_trial_balance(trial_balance_path)
658
  if not trial_balance:
659
  return False
660
 
 
661
  prompt = self.build_llm_prompt(note_number, trial_balance)
662
  if not prompt:
663
  logger.error("Failed to build prompt")
664
  return False
665
 
 
666
  response = self.call_openrouter_api(prompt)
667
  if not response:
668
  logger.error("Failed to get API response")
669
  return False
670
 
 
671
  success = self.save_generated_note(response, note_number)
672
  logger.info(f"Note {note_number} {'generated successfully' if success else 'generated with issues'}")
673
  return success
674
 
675
  def generate_all_notes(self, trial_balance_path: str = settings.trial_balance_json) -> Dict[str, bool]:
 
676
  logger.info(f"Starting generation of all {len(self.note_templates)} notes...")
677
  results = {}
678
  all_notes = []
679
 
 
680
  trial_balance = self.load_trial_balance(trial_balance_path)
681
  if not trial_balance:
682
  logger.error("Failed to load trial balance")
 
685
  for note_number in self.note_templates.keys():
686
  logger.info(f"Processing Note {note_number}")
687
 
 
688
  prompt = self.build_llm_prompt(note_number, trial_balance)
689
  if not prompt:
690
  results[note_number] = False
691
  continue
692
 
 
693
  response = self.call_openrouter_api(prompt)
694
  if not response:
695
  results[note_number] = False
696
  continue
697
 
 
698
  json_data, _ = self.extract_json_from_markdown(response)
699
  if json_data:
 
700
  json_data = self.validate_and_fix_json(json_data, note_number)
701
  is_valid, validation_msg = self.validate_json_structure(json_data, note_number)
702
 
 
707
  logger.info(f"Note {note_number} processed successfully")
708
  else:
709
  logger.warning(f"Note {note_number} validation failed even after auto-fix: {validation_msg}")
 
710
  json_data = convert_note_json_to_lakhs(json_data)
711
  all_notes.append(json_data)
712
  results[note_number] = False
713
  else:
714
  logger.error(f"Note {note_number}: Could not parse JSON from response")
 
715
  template = self.note_templates.get(note_number, {})
716
  fallback_note = {
717
  "title": template.get("title", f"Note {note_number}"),
 
743
  all_notes.append(fallback_note)
744
  results[note_number] = False
745
 
 
746
  import time
747
  time.sleep(2)
748
 
 
749
  output_dir = settings.output_dir
750
  Path(output_dir).mkdir(parents=True, exist_ok=True)
751
 
 
771
  return results
772
 
773
  def main() -> None:
 
774
  try:
 
775
  generator = FlexibleFinancialNoteGenerator()
776
  if not generator.note_templates:
777
  logger.error("No note templates loaded. Check notes_template.py")
 
779
 
780
  logger.info(f"Loaded {len(generator.note_templates)} note templates")
781
 
 
782
  if len(sys.argv) > 1:
 
783
  if len(sys.argv) < 3:
784
  logger.error("Usage: python llm_notes_generator.py <mode> <note_numbers>")
785
  logger.error(" mode: 'specific' or 'all'")
 
802
  if note_number in generator.note_templates:
803
  success = generator.generate_note(note_number)
804
  if success:
 
805
  try:
806
  with open("data/generated_notes/notes.json", "r", encoding="utf-8") as f:
807
  note_data = json.load(f)
 
815
  else:
816
  logger.error(f"Note {note_number} not found in templates")
817
 
 
818
  if all_notes:
819
  output_dir = settings.output_dir
820
  Path(output_dir).mkdir(parents=True, exist_ok=True)
 
837
  total = len(results)
838
  logger.info(f"{successful}/{total} notes generated successfully")
839
 
 
840
  for note, success in results.items():
841
+ status = "✅ SUCCESS" if success else "❌ FAILED"
842
  logger.info(f" Note {note}: {status}")
843
 
844
  else:
 
846
  sys.exit(1)
847
 
848
  else:
 
849
  choice = input("\nGenerate (1) specific note or (2) all notes? Enter 1 or 2: ").strip()
850
 
851
  if choice == "1":
 
865
  total = len(results)
866
  logger.info(f"{successful}/{total} notes generated successfully")
867
 
 
868
  print("\n" + "="*50)
869
  print("GENERATION SUMMARY")
870
  print("="*50)
871
  for note, success in results.items():
872
+ status = "✅ SUCCESS" if success else "❌ FAILED"
873
  print(f"Note {note}: {status}")
874
  print("="*50)
875