# odl-training-data/ingestion/validator.py
# midah's picture
# AI Training Data Deals Dashboard with automated discovery, extraction pipeline, and MCP integration
# 0efb0d1
"""
Deal Validator - Pydantic models for validation
"""
from pydantic import BaseModel, Field, validator
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum
class Modality(str, Enum):
    """Closed set of modality labels a deal may carry.

    Members are also ``str`` instances, so each one compares equal to its
    display value (for example ``Modality.TEXT == "Text"``).
    """

    # Media formats
    TEXT = "Text"
    IMAGE = "Image"
    AUDIO = "Audio"
    VIDEO = "Video"
    SATELLITE = "Satellite"

    # Domain / sector style categories
    HEALTH_BIOTECH = "Health / Biotech"
    CORPORATE_INFRA = "Corporate / data infra"
    LEGAL_BOOKS = "Legal / Books"

    # Remaining categories
    COMMISSIONING = "Commissioning"
    MIXED = "Mixed"
class DealType(str, Enum):
    """Closed set of deal-type labels.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``DealType.PER_UNIT == "per-unit"``).
    """

    AGGREGATE = "aggregate"
    PER_UNIT = "per-unit"  # the only value containing a hyphen
    COMMISSIONING = "commissioning"
    SETTLEMENT = "settlement"
    ACQUISITION = "acquisition"
    COMMONS = "commons"
    IMPLICIT = "implicit"
    HYBRID = "hybrid"
class DealStage(str, Enum):
    """Closed set of deal-stage labels.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``DealStage.SETTLED == "settled"``).
    """

    ANNOUNCED = "announced"
    RUMORED = "rumored"
    CONFIRMED = "confirmed"
    SETTLED = "settled"
def _canonical_enum_value(
    raw: Any, enum_cls: type[Enum], fallback: Optional[str]
) -> Optional[str]:
    """Return the value of *enum_cls* that matches *raw*, else *fallback*.

    Matching ignores surrounding whitespace and letter case, so inputs such
    as ``"image"`` or ``" Confirmed "`` resolve to the canonical spelling
    instead of being silently replaced by the fallback.
    """
    if not isinstance(raw, str):
        return fallback
    wanted = raw.strip().casefold()
    for member in enum_cls:
        if member.value.casefold() == wanted:
            return member.value
    return fallback


class DealData(BaseModel):
    """Validated deal data structure.

    Only ``provider`` and ``buyer`` are required; every other field has a
    default. ``modality``, ``deal_type`` and ``deal_stage`` are stored as
    plain strings and normalized against the ``Modality``, ``DealType`` and
    ``DealStage`` enums: unknown values fall back to a default rather than
    failing validation.

    Raises:
        pydantic.ValidationError: if a required field is missing/empty, a
            numeric bound is violated, ``price_range_max_usd`` is below
            ``price_range_min_usd``, or ``end_date`` precedes ``start_date``.
    """

    # Core fields (required)
    provider: str = Field(..., min_length=1)
    buyer: str = Field(..., min_length=1)
    modality: str = Field(default="Text")
    data_type: Optional[str] = None

    # Pricing
    price_usd: Optional[float] = Field(None, ge=0)
    price_range_min_usd: Optional[float] = Field(None, ge=0)
    price_range_max_usd: Optional[float] = Field(None, ge=0)
    price_currency: str = Field(default="USD")
    reported_terms: Optional[str] = None
    pricing_mechanism: Optional[str] = None
    deal_type: Optional[str] = None

    # Dates
    date: Optional[str] = None  # YYYY-MM-DD or YYYY or YYYY-MM (format is not enforced here)
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None
    duration_years: Optional[float] = Field(None, ge=0)

    # Rights
    exclusive: Optional[bool] = None
    creators_compensated: Optional[bool] = None
    creator_split_percentage: Optional[float] = Field(None, ge=0, le=100)
    revenue_share: Optional[bool] = None

    # Rights granted
    training_allowed: Optional[bool] = None
    finetuning_allowed: Optional[bool] = None
    inference_allowed: Optional[bool] = None
    redistribution_allowed: Optional[bool] = None
    deletion_required: Optional[bool] = None

    # Provenance
    sources: List[str] = Field(default_factory=list)
    source_primary: Optional[str] = None
    discovered_via: Optional[str] = None
    exa_query: Optional[str] = None
    exa_score: Optional[float] = None
    exa_retrieved_at: Optional[str] = None

    # Extraction metadata
    extraction_metadata: Optional[Dict[str, Any]] = None
    raw_text_snippets: List[str] = Field(default_factory=list)
    regex_confidence: Optional[str] = None
    llm_confidence: Optional[str] = None
    last_extracted: Optional[str] = None

    # Linkages
    linkages_metadata: Optional[Dict[str, Any]] = None
    notes: Optional[str] = None
    deal_stage: str = Field(default="announced")
    confidence_score: float = Field(default=0.5, ge=0, le=1)
    version: Optional[str] = None

    @validator("modality")
    def validate_modality(cls, v):
        """Normalize modality to a ``Modality`` value; unknown input becomes "Text"."""
        return _canonical_enum_value(v, Modality, Modality.TEXT.value)

    @validator("deal_type")
    def validate_deal_type(cls, v):
        """Normalize deal type to a ``DealType`` value; empty/unknown input becomes None."""
        return _canonical_enum_value(v, DealType, None)

    @validator("deal_stage")
    def validate_deal_stage(cls, v):
        """Normalize deal stage to a ``DealStage`` value; unknown input becomes "announced"."""
        return _canonical_enum_value(v, DealStage, DealStage.ANNOUNCED.value)

    @validator("price_range_max_usd")
    def validate_price_range(cls, v, values):
        """Ensure max >= min when both bounds are present."""
        # ``values`` holds previously validated fields; the minimum is absent
        # if it was not supplied validly, hence ``.get``.
        lower = values.get("price_range_min_usd")
        # Compare against None explicitly: a truthiness test would skip the
        # check when the maximum is 0 and wrongly accept max=0 with min>0.
        if v is not None and lower is not None and v < lower:
            raise ValueError("price_range_max_usd must be >= price_range_min_usd")
        return v

    @validator("end_date")
    def validate_end_date(cls, v, values):
        """Ensure end_date >= start_date when both dates are present."""
        start = values.get("start_date")
        if v is None or start is None:
            return v
        try:
            ends_before_start = v < start
        except TypeError as exc:
            # Comparing a naive datetime with an aware one raises TypeError;
            # surface it as an ordinary validation failure with a clear cause.
            raise ValueError(
                "end_date and start_date must both be timezone-aware or both naive"
            ) from exc
        if ends_before_start:
            raise ValueError("end_date must be >= start_date")
        return v

    class Config:
        use_enum_values = True
class DealValidator:
    """Non-raising front end for building ``DealData`` from a plain dict."""

    @staticmethod
    def validate(deal_dict: dict) -> tuple[bool, Optional[DealData], Optional[str]]:
        """Try to build a ``DealData`` from *deal_dict*.

        Any exception raised during construction is caught and reported as
        text, so this method itself does not raise for bad input.

        Returns:
            ``(True, deal_data, None)`` on success, otherwise
            ``(False, None, error_message)`` where the message is the
            string form of the caught exception.
        """
        try:
            parsed = DealData(**deal_dict)
        except Exception as exc:
            return False, None, str(exc)
        return True, parsed, None

    @staticmethod
    def is_valid_deal(deal_dict: dict) -> bool:
        """Return only the success flag of ``validate`` for *deal_dict*."""
        return DealValidator.validate(deal_dict)[0]