Spaces:
Running
Running
| """Project record data model for structured extraction from PDF reports.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Any, Dict, Optional | |
@dataclass
class ProjectRecord:
    """Canonical structured fields parsed from a single PDF project report.

    ``source`` is the only required field; every other field defaults to
    ``None`` until a value is assigned. The class is a dataclass, so
    instances are built with keyword arguments, e.g.
    ``ProjectRecord(source="report.pdf", project_id="123")``.
    """

    # Identification
    source: str
    project_id: Optional[str] = None
    project_name: Optional[str] = None

    # Classification
    industry_code: Optional[str] = None
    project_type: Optional[str] = None
    sector: Optional[str] = None
    sic_code: Optional[str] = None
    sic_product: Optional[str] = None

    # Financial
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None

    # Status
    status: Optional[str] = None
    status_reason: Optional[str] = None
    project_probability: Optional[str] = None

    # Timeline
    last_update: Optional[str] = None
    initial_release: Optional[str] = None
    pec_timing: Optional[str] = None
    pec_activity: Optional[str] = None

    # Location
    address: Optional[str] = None
    city_state_line: Optional[str] = None
    zone_county: Optional[str] = None

    # Plant Info
    plant_owner: Optional[str] = None
    plant_parent: Optional[str] = None
    plant_name: Optional[str] = None
    plant_id: Optional[str] = None
    unit_name: Optional[str] = None

    # Contacts
    project_manager: Optional[str] = None
    project_manager_company: Optional[str] = None
    project_manager_title: Optional[str] = None
    project_manager_email: Optional[str] = None
    project_manager_phone: Optional[str] = None
    engineer_company: Optional[str] = None
    ec_firm: Optional[str] = None
    phone: Optional[str] = None

    # Technical
    scope_text: Optional[str] = None
    project_capacity: Optional[str] = None
    environmental: Optional[str] = None
    construction_labor: Optional[str] = None
    operations_labor: Optional[str] = None
    fuel_type: Optional[str] = None

    # Derived text sections
    schedule_text: Optional[str] = None
    details_text: Optional[str] = None

    def owner_company(self) -> Optional[str]:
        """Return ``plant_owner``; an alias kept for backward compatibility."""
        # NOTE(review): the wording "alias" suggests this may have been meant
        # as a @property; it is kept as a plain method here -- confirm how
        # callers access it before changing.
        return self.plant_owner

    def get_unique_key(self) -> str:
        """Return the first truthy value of project_id, project_name, source."""
        return self.project_id or self.project_name or self.source

    def has_budget_info(self) -> bool:
        """Return True only when both ``tiv_amount`` and ``tiv_currency`` are set."""
        return self.tiv_amount is not None and self.tiv_currency is not None

    def has_location_info(self) -> bool:
        """Return True when any of the three location fields is non-empty."""
        return any([self.address, self.city_state_line, self.zone_county])

    def has_timeline_info(self) -> bool:
        """Return True when ``schedule_text`` is non-empty."""
        return bool(self.schedule_text)

    def to_dict(self) -> Dict[str, Any]:
        """Convert record to dictionary with non-None fields only.

        Keys are the field names in declaration order. Falsy values that are
        not ``None`` (empty strings, ``0.0``) are kept.
        """
        # Local import: the module only imports `dataclass` at file level.
        from dataclasses import fields

        # Deriving the keys from the dataclass keeps the dict in sync with
        # the field list instead of repeating every name by hand.
        pairs = ((f.name, getattr(self, f.name)) for f in fields(self))
        return {name: value for name, value in pairs if value is not None}
@dataclass
class Milestone:
    """A project milestone extracted from schedule text.

    ``name`` is required; ``date_text``, ``sentence`` and ``source`` default
    to empty strings.
    """

    name: str
    date_text: str = ""
    sentence: str = ""
    source: str = ""

    def to_dict(self) -> Dict[str, str]:
        """Return the milestone as a dict with keys name, dateText, sentence.

        ``date_text`` is emitted under the camelCase key ``dateText``;
        ``source`` is not included in the output.
        """
        return {
            "name": self.name,
            "dateText": self.date_text,
            "sentence": self.sentence,
        }
@dataclass
class GeoComponents:
    """Parsed geographic components from city/state line.

    All four components are optional and default to ``None``.
    """

    city: Optional[str] = None
    state: Optional[str] = None
    postal: Optional[str] = None
    country: Optional[str] = None

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return all four components as a dict, keeping ``None`` values."""
        return {
            "city": self.city,
            "state": self.state,
            "postal": self.postal,
            "country": self.country,
        }