adi-123's picture
Upload 21 files
8c35759 verified
"""Project record data model for structured extraction from PDF reports."""
from __future__ import annotations

from dataclasses import dataclass, fields
from typing import Any, Dict, Optional
@dataclass
class ProjectRecord:
    """Canonical structured fields parsed from a single PDF project report.

    ``source`` is the only required field; every other field defaults to
    ``None``, which is treated as "not populated" by :meth:`to_dict`.
    """

    # Identification
    source: str
    project_id: Optional[str] = None
    project_name: Optional[str] = None

    # Classification
    industry_code: Optional[str] = None
    project_type: Optional[str] = None
    sector: Optional[str] = None
    sic_code: Optional[str] = None
    sic_product: Optional[str] = None

    # Financial
    # NOTE(review): "tiv" presumably stands for total investment value - confirm.
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None

    # Status
    status: Optional[str] = None
    status_reason: Optional[str] = None
    project_probability: Optional[str] = None

    # Timeline
    last_update: Optional[str] = None
    initial_release: Optional[str] = None
    pec_timing: Optional[str] = None
    pec_activity: Optional[str] = None

    # Location
    address: Optional[str] = None
    city_state_line: Optional[str] = None
    zone_county: Optional[str] = None

    # Plant Info
    plant_owner: Optional[str] = None
    plant_parent: Optional[str] = None
    plant_name: Optional[str] = None
    plant_id: Optional[str] = None
    unit_name: Optional[str] = None

    # Contacts
    project_manager: Optional[str] = None
    project_manager_company: Optional[str] = None
    project_manager_title: Optional[str] = None
    project_manager_email: Optional[str] = None
    project_manager_phone: Optional[str] = None
    engineer_company: Optional[str] = None
    ec_firm: Optional[str] = None
    phone: Optional[str] = None

    # Technical
    scope_text: Optional[str] = None
    project_capacity: Optional[str] = None
    environmental: Optional[str] = None
    construction_labor: Optional[str] = None
    operations_labor: Optional[str] = None
    fuel_type: Optional[str] = None

    # Derived text sections
    schedule_text: Optional[str] = None
    details_text: Optional[str] = None

    @property
    def owner_company(self) -> Optional[str]:
        """Alias for plant_owner (backward compatibility).

        Read-only; it is a property, not a dataclass field, so it does not
        appear in :meth:`to_dict`.
        """
        return self.plant_owner

    def get_unique_key(self) -> str:
        """Return the first truthy value of project_id, project_name, source.

        An empty-string ``project_id`` or ``project_name`` is skipped, just
        like ``None``. ``source`` is the final fallback and is returned as-is.
        """
        return self.project_id or self.project_name or self.source

    def has_budget_info(self) -> bool:
        """Return True when both ``tiv_amount`` and ``tiv_currency`` are set.

        The check is ``is not None``, so an amount of ``0.0`` or an empty
        currency string still counts as present.
        """
        return self.tiv_amount is not None and self.tiv_currency is not None

    def has_location_info(self) -> bool:
        """Return True when any location field is truthy (non-empty)."""
        return any((self.address, self.city_state_line, self.zone_county))

    def has_timeline_info(self) -> bool:
        """Return True when ``schedule_text`` is a non-empty string."""
        return bool(self.schedule_text)

    def to_dict(self) -> Dict[str, Any]:
        """Convert record to dictionary with non-None fields only.

        Keys are the dataclass field names in declaration order. Only
        ``None`` is filtered out; falsy values such as ``""`` or ``0.0`` are
        kept.

        Returns:
            A new dict mapping field name to value for every populated field.
        """
        # Walk the dataclass fields instead of a hand-written key list so the
        # output cannot drift from the field declarations above.
        pairs = ((spec.name, getattr(self, spec.name)) for spec in fields(self))
        return {name: value for name, value in pairs if value is not None}
@dataclass
class Milestone:
    """A project milestone extracted from schedule text.

    Only ``name`` is required; the remaining fields default to an empty
    string.
    """

    name: str
    date_text: str = ""
    sentence: str = ""
    source: str = ""

    def to_dict(self) -> Dict[str, str]:
        """Serialise the milestone to a plain dict.

        The ``date_text`` attribute is emitted under the camelCase key
        ``dateText``. ``source`` is not included in the output.
        """
        return dict(
            name=self.name,
            dateText=self.date_text,
            sentence=self.sentence,
        )
@dataclass
class GeoComponents:
    """Parsed geographic components from city/state line.

    Every component is optional and defaults to ``None``.
    """

    city: Optional[str] = None
    state: Optional[str] = None
    postal: Optional[str] = None
    country: Optional[str] = None

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return all four components as a dict, keeping ``None`` values."""
        component_keys = ("city", "state", "postal", "country")
        return {key: getattr(self, key) for key in component_keys}