Spaces:

spunteam
/

api-web-crawler

Sleeping

App Files Files Community

api-web-crawler / app /util /pdf_document_generator.py

mrfirdauss

feat; add prefill

cce8d96 5 months ago

raw

history blame contribute delete

6.14 kB

	import os
	import tempfile
	import datetime
	from abc import ABC, abstractmethod
	from fpdf import FPDF # <-- Import FPDF
	import json
	import pandas as pd

	class PDFDocumentGenerator(ABC):
	    """
	    Abstract base class for generating a PDF document from data
	    using a pure-Python library (fpdf2).

	    Subclasses implement ``build_document`` to draw onto the FPDF object
	    and may override ``get_prefill_data``. A subclass may set a ``logger``
	    attribute; when it is absent, warnings are silently skipped.
	    """

	    def __init__(self, data: dict):
	        """
	        Initializes the generator with user-provided data.

	        :param data: A dictionary containing the data for the document.
	        """
	        self.data = data

	    @abstractmethod
	    def build_document(self, pdf: "FPDF"):
	        """
	        Subclasses must implement this method to build the PDF
	        by calling methods on the provided fpdf.FPDF object.

	        :param pdf: An FPDF object to build upon.
	        """

	    def _warn(self, message: str) -> None:
	        """
	        Emits a warning through ``self.logger`` when one is attached.

	        ``__init__`` does not create a logger, so every warning in this
	        class goes through this guard instead of touching ``self.logger``
	        directly.

	        :param message: The warning text to log.
	        """
	        logger = getattr(self, 'logger', None)
	        if logger is not None:
	            logger.warning(message)

	    def _get_day_suffix(self, day):
	        """
	        Returns the ordinal suffix (st, nd, rd, th) for a given day.

	        :param day: A positive integer (typically a day of the month).
	        :return: One of 'st', 'nd', 'rd' or 'th'.
	        """
	        # 11, 12 and 13 take 'th'; the modulo keeps that true for
	        # 111-113 and so on.
	        if 11 <= day % 100 <= 13:
	            return 'th'
	        return {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

	    @staticmethod
	    def _parse_extraction_result(raw):
	        """
	        Normalizes one 'extraction_result' cell to a dict.

	        :param raw: A dict, a JSON string, or anything else.
	        :return: The dict itself, the decoded JSON object, or None when
	                 the value is not a dict and does not decode to one.
	        """
	        if isinstance(raw, dict):
	            return raw
	        if isinstance(raw, str):
	            try:
	                parsed = json.loads(raw)
	            except (json.JSONDecodeError, TypeError):
	                return None
	            return parsed if isinstance(parsed, dict) else None
	        return None

	    def _get_smart_doc_data(self, smart_df: pd.DataFrame, doc_type: str) -> dict:
	        """
	        Finds the best 'extraction_result' for a document type.

	        Each row whose 'doc_type' equals ``doc_type`` is scored as
	        (number of keys) - (number of keys whose value is None), i.e. the
	        count of non-None values. The highest-scoring result is returned;
	        on a tie the earliest row wins.

	        :param smart_df: DataFrame with 'doc_type' and 'extraction_result'
	                         columns. 'extraction_result' cells may be dicts
	                         or JSON strings.
	        :param doc_type: The document type to look up.
	        :return: The best extraction result as a dict, or {} when the
	                 frame is missing/empty, lacks the required columns, has
	                 no matching row, or no row has a non-None value.
	        """
	        if smart_df is None or smart_df.empty:
	            return {}
	        if not {'doc_type', 'extraction_result'}.issubset(smart_df.columns):
	            self._warn(
	                "smart_upload data is missing 'doc_type' or "
	                f"'extraction_result' column for doc_type: {doc_type}"
	            )
	            return {}

	        candidates = smart_df.loc[
	            smart_df['doc_type'] == doc_type, 'extraction_result'
	        ]
	        if candidates.empty:
	            return {}

	        # Single pass: parse each cell once and keep the best dict seen.
	        # The strict '>' keeps the first row among equal scores.
	        best_result = {}
	        best_score = 0
	        for raw in candidates:
	            parsed = self._parse_extraction_result(raw)
	            if parsed is None:
	                continue  # invalid JSON, non-dict payload, None, ... -> score 0
	            score = sum(1 for value in parsed.values() if value is not None)
	            if score > best_score:
	                best_result, best_score = parsed, score

	        if best_score == 0:
	            # Every candidate was empty, all-null, or not decodable.
	            self._warn(f"No valid extraction results with non-null keys found for doc_type: {doc_type}")
	        return best_result

	    def compile_pdf(self, tempdir: str) -> "tuple[str | None, str | None]":
	        """
	        Generates the PDF document using fpdf2 and saves it in tempdir.

	        :param tempdir: The temporary directory to use for compilation.
	        :return: A tuple of (pdf_filepath, error_message).
	                 On success, (pdf_filepath, None).
	                 On failure, (None, error_message).
	        """
	        try:
	            pdf = FPDF()
	            pdf.add_page()

	            self.build_document(pdf)

	            pdf_filepath = os.path.join(tempdir, "document.pdf")
	            pdf.output(pdf_filepath)

	            if not os.path.exists(pdf_filepath):
	                return None, "PDF file was not created by fpdf2."

	            # NOTE(review): the previous code called pdf.set_margins(25, 25)
	            # here, after output(), on an object that is discarded right
	            # after -- it could not influence the written file, so it was
	            # removed. If 25-unit margins are actually wanted, call
	            # set_margins() before add_page() above; that changes the layout
	            # of every generated document, so confirm with the subclasses.
	            return pdf_filepath, None

	        except Exception as e:
	            # Boundary handler: callers expect an error tuple, not a raise.
	            print(f"Error during PDF generation: {e}")
	            return None, str(e)

	    def get_prefill_data(self) -> dict:
	        """
	        Fetches and merges all required data from various DB sources
	        based on the 'application_id' in self.data.

	        Subclasses should override this to implement their specific
	        data-fetching and merging logic.

	        :return: The base implementation returns ``self.data`` unchanged.
	        """
	        # Base implementation just returns the data it was given.
	        # This acts as a placeholder for generators that don't need prefill.
	        self._warn("Base get_prefill_data called. No data merging performed.")
	        return self.data