import datetime
import json
import os
import tempfile
from abc import ABC, abstractmethod
from typing import Optional, Tuple

import pandas as pd
from fpdf import FPDF  # fpdf2: pure-Python PDF writer


class PDFDocumentGenerator(ABC):
    """
    Abstract base class for generating a PDF document from data using a
    pure-Python library (fpdf2).

    Subclasses implement :meth:`build_document`. They may also set a
    ``logger`` attribute; when present it receives the warnings emitted by
    this class, and when absent those warnings are silently skipped.
    """

    def __init__(self, data: dict):
        """
        Initializes the generator with user-provided data.

        :param data: A dictionary containing the data for the document.
        """
        self.data = data

    @abstractmethod
    def build_document(self, pdf: FPDF):
        """
        Subclasses must implement this method to build the PDF by calling
        methods on the provided fpdf.FPDF object.

        :param pdf: An FPDF object to build upon.
        """
        pass

    def _log_warning(self, message: str) -> None:
        """Send ``message`` to ``self.logger`` if a logger has been attached."""
        # The base __init__ never creates a logger, so it must be optional.
        logger = getattr(self, 'logger', None)
        if logger is not None:
            logger.warning(message)

    def _get_day_suffix(self, day):
        """
        Returns the ordinal suffix (st, nd, rd, th) for a given day.

        :param day: The day number as an integer.
        :return: One of 'st', 'nd', 'rd' or 'th'.
        """
        # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
        # Checking modulo 100 keeps this right for 111-113 and so on;
        # for calendar days (1-31) it is identical to a plain range check.
        if 11 <= day % 100 <= 13:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

    @staticmethod
    def _parse_extraction_result(result_data) -> Optional[dict]:
        """Return ``result_data`` as a dict (decoding JSON strings), else None."""
        if isinstance(result_data, dict):
            return result_data
        if isinstance(result_data, str):
            try:
                decoded = json.loads(result_data)
            except (json.JSONDecodeError, TypeError):
                return None
            # Valid JSON that is not an object (list, number, ...) is unusable.
            return decoded if isinstance(decoded, dict) else None
        return None

    @staticmethod
    def _score_extraction_result(parsed: Optional[dict]) -> int:
        """Score a parsed result: total keys minus keys whose value is None."""
        if not parsed:
            # None (unparseable) and empty dicts both score 0.
            return 0
        return sum(1 for value in parsed.values() if value is not None)

    def _get_smart_doc_data(self, smart_df: pd.DataFrame, doc_type: str) -> dict:
        """
        Finds the best 'extraction_result' for ``doc_type`` in the
        smart-upload DataFrame.

        Each row whose 'doc_type' column equals ``doc_type`` is scored as
        (number of keys) - (number of keys whose value is None). Values that
        are neither a dict nor a JSON string decoding to a dict score 0.
        The highest-scoring result is returned; on a tie the row that
        appears first in ``smart_df`` wins.

        :param smart_df: DataFrame with 'doc_type' and 'extraction_result'
            columns. ``None`` or an empty frame yields ``{}``.
        :param doc_type: The document type to select rows for.
        :return: The best extraction result as a dict, or ``{}`` when there
            are no matching rows or none has a positive score.
        """
        if smart_df is None or smart_df.empty:
            return {}

        candidates = smart_df.loc[smart_df['doc_type'] == doc_type, 'extraction_result']
        if candidates.empty:
            return {}

        # Parse each candidate exactly once and keep the best one seen so far.
        best_score = 0
        best_result = {}
        for raw_result in candidates:
            parsed = self._parse_extraction_result(raw_result)
            score = self._score_extraction_result(parsed)
            # Strict '>' keeps the earliest row on ties (deterministic choice).
            if score > best_score:
                best_score = score
                best_result = parsed

        if best_score <= 0:
            # Every candidate was invalid JSON, a non-dict, empty, or all-null.
            self._log_warning(f"No valid extraction results with non-null keys found for doc_type: {doc_type}")
            return {}

        return best_result

    def compile_pdf(self, tempdir: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Generates the PDF document using fpdf2 and saves it in tempdir.

        :param tempdir: The temporary directory to use for compilation.
        :return: A tuple of (pdf_filepath, error_message).
                 On success, (pdf_filepath, None).
                 On failure, (None, error_message).
        """
        try:
            pdf = FPDF()
            pdf.add_page()
            self.build_document(pdf)

            pdf_filepath = os.path.join(tempdir, "document.pdf")
            pdf.output(pdf_filepath)

            if not os.path.exists(pdf_filepath):
                return None, "PDF file was not created by fpdf2."

            # NOTE(review): this runs after the file has been written and
            # `pdf` is discarded on return, so it cannot affect the saved
            # document. If 25-unit margins are intended, this call probably
            # belongs before add_page() - confirm, since moving it changes
            # the layout produced by every subclass.
            pdf.set_margins(25, 25)
            return pdf_filepath, None
        except Exception as e:
            # Boundary handler: any build/write failure is reported to the
            # caller through the returned error message instead of raising.
            print(f"Error during PDF generation: {e}")
            return None, str(e)

    def get_prefill_data(self) -> dict:
        """
        Fetches and merges all required data from various DB sources
        based on the 'application_id' in self.data.

        Subclasses should override this to implement their specific
        data-fetching and merging logic. This base implementation performs
        no merging: it emits a warning (when a logger is attached) and
        returns the data the generator was constructed with.

        :return: ``self.data`` unchanged.
        """
        self._log_warning("Base get_prefill_data called. No data merging performed.")
        return self.data