Spaces:
Sleeping
Sleeping
File size: 5,672 Bytes
8c35759 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
"""Project record data model for structured extraction from PDF reports."""
from __future__ import annotations

from dataclasses import dataclass, fields
from typing import Any, Dict, Optional
@dataclass
class ProjectRecord:
    """Canonical structured fields parsed from a single PDF project report.

    Only ``source`` is required; every other field defaults to ``None``
    until a value is assigned.
    """

    # Identification
    source: str
    project_id: Optional[str] = None
    project_name: Optional[str] = None

    # Classification
    industry_code: Optional[str] = None
    project_type: Optional[str] = None
    sector: Optional[str] = None
    sic_code: Optional[str] = None
    sic_product: Optional[str] = None

    # Financial
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None

    # Status
    status: Optional[str] = None
    status_reason: Optional[str] = None
    project_probability: Optional[str] = None

    # Timeline
    last_update: Optional[str] = None
    initial_release: Optional[str] = None
    pec_timing: Optional[str] = None
    pec_activity: Optional[str] = None

    # Location
    address: Optional[str] = None
    city_state_line: Optional[str] = None
    zone_county: Optional[str] = None

    # Plant Info
    plant_owner: Optional[str] = None
    plant_parent: Optional[str] = None
    plant_name: Optional[str] = None
    plant_id: Optional[str] = None
    unit_name: Optional[str] = None

    # Contacts
    project_manager: Optional[str] = None
    project_manager_company: Optional[str] = None
    project_manager_title: Optional[str] = None
    project_manager_email: Optional[str] = None
    project_manager_phone: Optional[str] = None
    engineer_company: Optional[str] = None
    ec_firm: Optional[str] = None
    phone: Optional[str] = None

    # Technical
    scope_text: Optional[str] = None
    project_capacity: Optional[str] = None
    environmental: Optional[str] = None
    construction_labor: Optional[str] = None
    operations_labor: Optional[str] = None
    fuel_type: Optional[str] = None

    # Derived text sections
    schedule_text: Optional[str] = None
    details_text: Optional[str] = None

    @property
    def owner_company(self) -> Optional[str]:
        """Alias for plant_owner (backward compatibility)."""
        return self.plant_owner

    def get_unique_key(self) -> str:
        """Return the first truthy value of project_id, project_name, source.

        Empty strings are skipped just like ``None``. If all three are
        falsy, ``source`` itself is returned as-is.
        """
        return self.project_id or self.project_name or self.source

    def has_budget_info(self) -> bool:
        """Return True when both tiv_amount and tiv_currency are set.

        The check is ``is not None``, so an amount of ``0.0`` still counts.
        """
        return self.tiv_amount is not None and self.tiv_currency is not None

    def has_location_info(self) -> bool:
        """Return True when any of the three location fields is truthy."""
        return any([self.address, self.city_state_line, self.zone_county])

    def has_timeline_info(self) -> bool:
        """Return True when schedule_text is non-empty.

        Only ``schedule_text`` is consulted; the other timeline fields
        (last_update, initial_release, pec_timing, pec_activity) are not.
        """
        return bool(self.schedule_text)

    def to_dict(self) -> Dict[str, Any]:
        """Convert record to dictionary with non-None fields only.

        Keys are the dataclass field names, in declaration order. The
        mapping is derived from the dataclass definition instead of a
        hand-maintained list, so a newly added field (including one added
        by a dataclass subclass) is picked up automatically. Falsy but
        non-None values such as ``""`` or ``0.0`` are kept.
        """
        pairs = ((f.name, getattr(self, f.name)) for f in fields(self))
        return {name: value for name, value in pairs if value is not None}
@dataclass
class Milestone:
    """One named milestone pulled out of a project's schedule text."""

    name: str
    date_text: str = ""
    sentence: str = ""
    source: str = ""

    def to_dict(self) -> Dict[str, str]:
        """Return the milestone as a plain dict.

        ``date_text`` is exported under the key ``dateText``; ``source``
        is not included in the result.
        """
        payload: Dict[str, str] = {}
        payload["name"] = self.name
        payload["dateText"] = self.date_text
        payload["sentence"] = self.sentence
        return payload
@dataclass
class GeoComponents:
    """Geographic pieces (city, state, postal, country) of a city/state line."""

    city: Optional[str] = None
    state: Optional[str] = None
    postal: Optional[str] = None
    country: Optional[str] = None

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return all four components as a dict, keeping ``None`` values."""
        keys = ("city", "state", "postal", "country")
        return {key: getattr(self, key) for key in keys}
|