Spaces:
Build error
Build error
| import os | |
| import json | |
| from pydantic import BaseModel, Field, model_validator | |
| from typing import List | |
| import pandas as pd | |
| from utils.logger import setup_logger | |
| logger = setup_logger(__name__) | |
def restructure_documents(original_data: dict):
    """Regroup per-page extraction results by document category.

    ``original_data`` maps a PDF path to a mapping of page-image path ->
    page fields. Every page yields one single-key dict of the form
    ``{document_type: entry}``, appended to the list kept under the page's
    ``document_category``. ``entry`` carries the base names of the PDF and
    of the page image, followed by all of the page's own fields.

    Pages lacking ``document_category`` / ``document_type`` are filed under
    the key ``None``.
    """
    grouped = {}
    for pdf_path, pages in original_data.items():
        pdf_name = os.path.basename(pdf_path)
        for image_path, page_fields in pages.items():
            entry = {
                "uploaded_file_path": pdf_name,
                "uploaded_file_extracted_images": [os.path.basename(image_path)],
            }
            # Page fields are merged last, so a same-named key coming from
            # the page data overrides the two entries set above.
            entry.update(page_fields)

            category = page_fields.get("document_category")
            doc_type = page_fields.get("document_type")

            # Create the category bucket on first sight, then file the page
            # wrapped under its document type.
            if category not in grouped:
                grouped[category] = []
            grouped[category].append({doc_type: entry})
    return grouped
def extract_document_types_from_transformed(transformed_data):
    """Return the sorted, de-duplicated document types seen per category.

    ``transformed_data`` maps a category to a list of dicts keyed by
    document type (e.g. ``{'payslip': {...}}``). The result maps each
    category to the sorted list of distinct keys found in its dicts; a
    category with no items maps to an empty list.
    """
    return {
        category: sorted({doc_type for item in docs for doc_type in item})
        for category, docs in transformed_data.items()
    }
class DocumentTypeByCategory(BaseModel):
    """Document types found for each category, with an upload summary.

    Each list field holds the document-type names recorded for that
    category; an empty list means nothing was recorded for it. The
    ``is_*_valid`` flags mirror whether the matching list is non-empty.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags (declared with exclude=True); set by compute_valid_flags.
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    # Registered as an "after" validator so the flags are populated as soon
    # as the model is built; without it they stayed at their False default
    # unless the caller invoked this method by hand.
    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set each ``is_*_valid`` flag to whether its list is non-empty.

        Returns:
            The model instance itself.
        """
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(self.identity_verification_document)
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Build a one-row-per-category summary table.

        Returns:
            A DataFrame indexed from 1 with the columns
            ``document_category``, ``Uploaded`` ('✅' or '❌') and
            ``document_types`` (comma-separated names, or "Missing" when the
            category has none).
        """
        categories = (
            "bank_statement",
            "income_document",
            "identity_verification_document",
        )
        rows = []
        for category in categories:
            doc_types = getattr(self, category)
            rows.append(
                {
                    "document_category": category,
                    # Derived from the list itself rather than the stored
                    # flag, so the column cannot disagree with
                    # "document_types" when the flags are stale.
                    "Uploaded": bool(doc_types),
                    "document_types": ", ".join(doc_types) if doc_types else "Missing",
                }
            )

        data_df = pd.DataFrame(rows)
        data_df.index += 1  # number rows from 1 instead of 0
        logger.info(f"df: {data_df}")
        data_df['Uploaded'] = data_df['Uploaded'].apply(lambda x: '✅' if x else '❌')
        return data_df