sadickam's picture
Prepare for HF Space deployment
d01a7e3
"""Source manifest model for dataset freshness validation.
This module defines the Pydantic models for source_manifest.json which
tracks source file hashes and metadata for cache invalidation and
dataset integrity verification.
The source manifest is generated during the build pipeline and stored
alongside the RAG artifacts in the HuggingFace dataset. On server startup,
the manifest is loaded and validated to ensure:
1. Schema version compatibility (forward/backward migrations)
2. Artifact integrity (file hashes match expected values)
3. Build metadata is available for debugging
Manifest Structure:
The manifest contains:
- Schema version for future migrations
- Creation timestamp for the build
- Index version identifier (matches index_version.txt)
- List of source files with hashes and metadata
Schema Versioning:
The EXPECTED_SCHEMA_VERSION constant defines the schema version that
this server code expects. If the downloaded manifest has a different
schema version, validation will fail with a clear error message
indicating the version mismatch.
Lazy Loading:
Pydantic is imported inside a factory function to avoid import
overhead at module load time. This follows the project's lazy
loading pattern used throughout the codebase.
Example:
-------
>>> from rag_chatbot.api.manifest import SourceManifest, SourceFileEntry
>>> from datetime import datetime, UTC
>>>
>>> # Create a source file entry
>>> file_entry = SourceFileEntry(
... path="data/raw/ashrae_55.pdf",
... sha256="a" * 64,  # must be exactly 64 hex characters
... size_bytes=1024000,
... modified_at=datetime.now(UTC),
... )
>>>
>>> # Create a manifest
>>> manifest = SourceManifest(
... schema_version="1.0.0",
... created_at=datetime.now(UTC),
... index_version="2024.01.15.001",
... source_files=[file_entry],
... )
>>> manifest.schema_version
'1.0.0'
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from datetime import datetime
# =============================================================================
# Module Exports
# =============================================================================
# Public names of this module: the two lazy proxy model classes, the
# validation error type, and the schema-version constant.
__all__: list[str] = [
    "SourceManifest",
    "SourceFileEntry",
    "ManifestValidationError",
    "EXPECTED_SCHEMA_VERSION",
]
# =============================================================================
# Constants
# =============================================================================
# Schema version this server code expects to find in source_manifest.json.
# NOTE(review): the models visible in this part of the file only check that
# schema_version is well-formed (MAJOR.MINOR.PATCH); nothing here compares a
# manifest against this constant - presumably a separate validator does, confirm.
EXPECTED_SCHEMA_VERSION: str = "1.0.0"
"""Expected schema version for source_manifest.json.
This constant defines the schema version that the server expects.
If the downloaded manifest has a different schema version, the
server will fail to start with a clear error message.
Versioning scheme follows semantic versioning (MAJOR.MINOR.PATCH):
- MAJOR: Breaking changes that require code changes to handle
- MINOR: Backward-compatible additions (new optional fields)
- PATCH: Backward-compatible bug fixes (documentation, etc.)
History:
- 1.0.0: Initial schema version with core fields
"""
# =============================================================================
# Exceptions
# =============================================================================
class ManifestValidationError(Exception):
    """Signal that a source manifest did not pass validation.

    Besides the human-readable message, the error can carry the pair of
    schema versions involved in a mismatch and the name of the offending
    field, so callers can report precisely what went wrong.

    Attributes:
    ----------
        message : str
            Human-readable description of the validation failure.
        expected_version : str | None
            Schema version the server expects, when relevant.
        actual_version : str | None
            Schema version found in the manifest, when relevant.
        field_name : str | None
            Name of the field that failed validation, when relevant.

    Example:
    -------
        >>> err = ManifestValidationError(
        ...     "Schema version mismatch",
        ...     expected_version="1.0.0",
        ...     actual_version="2.0.0",
        ... )
        >>> str(err)
        'Schema version mismatch (expected: 1.0.0, actual: 2.0.0)'

    """

    def __init__(
        self,
        message: str,
        *,
        expected_version: str | None = None,
        actual_version: str | None = None,
        field_name: str | None = None,
    ) -> None:
        """Store the message and the optional diagnostic context.

        Args:
        ----
            message: Human-readable description of the validation failure.
            expected_version: Schema version expected by the server (optional).
            actual_version: Schema version found in the manifest (optional).
            field_name: Name of the field that failed validation (optional).

        """
        super().__init__(message)
        self.field_name = field_name
        self.actual_version = actual_version
        self.expected_version = expected_version
        self.message = message

    def __str__(self) -> str:
        """Render the message followed by any available context.

        Returns
        -------
            The message, then ``(expected: X, actual: Y)`` when both
            versions are set, then ``[field: NAME]`` when a non-empty
            field name is set, separated by single spaces.

        """
        expected, actual = self.expected_version, self.actual_version
        # Version details are only shown when BOTH sides of the comparison exist.
        version_note = (
            [f"(expected: {expected}, actual: {actual})"]
            if expected is not None and actual is not None
            else []
        )
        # An empty field name is treated the same as a missing one.
        field_note = [f"[field: {self.field_name}]"] if self.field_name else []
        return " ".join([self.message, *version_note, *field_note])
# =============================================================================
# Pydantic Model Factory (Lazy Loading)
# =============================================================================
# This factory function creates the Pydantic models lazily to avoid
# importing Pydantic at module load time. This follows the project's
# lazy loading pattern used throughout the codebase.
# =============================================================================
def _create_source_file_entry_model() -> type:
    """Build the Pydantic model class that backs ``SourceFileEntry``.

    Pydantic is imported inside this function rather than at module
    level, so importing the module does not import Pydantic.

    Returns
    -------
        type: The newly created ``_SourceFileEntry`` model class.

    """
    # ``datetime`` has to be bound in this scope for Pydantic's runtime
    # resolution of the ``modified_at`` annotation.
    from datetime import datetime  # noqa: F401 - Used by Pydantic field annotation

    from pydantic import BaseModel, ConfigDict, Field, field_validator

    # Example payload published in the generated JSON schema.
    schema_example = {
        "path": "data/raw/ashrae_55.pdf",
        # Example SHA256 hash (64 hex characters)
        "sha256": (
            "e3b0c44298fc1c149afbf4c8996fb924"
            "27ae41e4649b934ca495991b7852b855"
        ),
        "size_bytes": 1048576,
        "modified_at": "2024-01-15T10:30:00Z",
    }

    class _SourceFileEntry(BaseModel):
        """Model for a single source file entry in the manifest.

        Each SourceFileEntry represents one source file that was used
        to build the RAG index. It includes the file path, content hash,
        size, and modification timestamp for verification and debugging.

        The SHA256 hash enables verification that source files haven't
        changed since the index was built. If source files change, a
        rebuild is required to keep the index in sync.

        Attributes:
        ----------
            path : str
                Relative path to the source file from the project root.
                Example: "data/raw/ashrae_55.pdf"
            sha256 : str
                SHA256 hash of the file content (64 hex characters).
                Input is stripped and lowercased before validation.
            size_bytes : int
                File size in bytes. Must be non-negative.
            modified_at : datetime
                Last modification timestamp of the source file.
                Should be in UTC timezone for consistency.

        Example:
        -------
            >>> from datetime import datetime, UTC
            >>> entry = _SourceFileEntry(
            ...     path="data/raw/ashrae_55.pdf",
            ...     sha256="a" * 64,
            ...     size_bytes=1024000,
            ...     modified_at=datetime.now(UTC),
            ... )
            >>> entry.path
            'data/raw/ashrae_55.pdf'

        """

        # -----------------------------------------------------------------
        # Model configuration
        # -----------------------------------------------------------------
        model_config = ConfigDict(
            # Unknown keys are rejected so typos in manifest files surface
            extra="forbid",
            # Field assignment on instances is disallowed
            frozen=True,
            # Attach the example above to the generated JSON schema
            json_schema_extra={"examples": [schema_example]},
        )

        # -----------------------------------------------------------------
        # Fields (all required)
        # -----------------------------------------------------------------
        path: str = Field(
            ...,
            min_length=1,
            description=(
                "Relative path to the source file from project root. "
                "Example: 'data/raw/ashrae_55.pdf'"
            ),
        )
        sha256: str = Field(
            ...,
            min_length=64,
            max_length=64,
            pattern=r"^[a-f0-9]{64}$",
            description=(
                "SHA256 hash of the file content as 64 lowercase hex characters. "
                "Used for integrity verification and change detection."
            ),
        )
        size_bytes: int = Field(
            ...,
            ge=0,  # a file cannot have a negative size
            description="File size in bytes. Must be non-negative.",
        )
        modified_at: datetime = Field(
            ...,
            description=(
                "Last modification timestamp of the source file. "
                "Should be in UTC timezone for consistency."
            ),
        )

        # -----------------------------------------------------------------
        # Validators (run before Pydantic's own type/constraint checks)
        # -----------------------------------------------------------------
        @field_validator("path", mode="before")
        @classmethod
        def _normalize_path(cls, value: object) -> str:
            """Strip surrounding whitespace from the path.

            Args:
            ----
                value: Raw input for the ``path`` field.

            Returns:
            -------
                The input converted with ``str()`` and stripped.

            Raises:
            ------
                ValueError: If value is None or empty after stripping.

            """
            if value is None:
                msg = "path cannot be None"
                raise ValueError(msg)
            stripped = str(value).strip()
            if stripped:
                return stripped
            msg = "path cannot be empty"
            raise ValueError(msg)

        @field_validator("sha256", mode="before")
        @classmethod
        def _normalize_sha256(cls, value: object) -> str:
            """Strip and lowercase the hash before the pattern check.

            Args:
            ----
                value: Raw input for the ``sha256`` field.

            Returns:
            -------
                The input converted with ``str()``, stripped and lowercased.

            Raises:
            ------
                ValueError: If value is None or empty after stripping.

            """
            if value is None:
                msg = "sha256 cannot be None"
                raise ValueError(msg)
            # Lowercasing here lets uppercase hex input satisfy the
            # lowercase-only pattern declared on the field.
            digest = str(value).strip().lower()
            if digest:
                return digest
            msg = "sha256 cannot be empty"
            raise ValueError(msg)

    return _SourceFileEntry
def _create_source_manifest_model(source_file_entry_class: type) -> type:
    """Create the SourceManifest Pydantic model.

    This factory function creates the model class with lazy imports
    to avoid loading Pydantic at module import time.

    Args:
    ----
        source_file_entry_class: The SourceFileEntry model class to use
            for the source_files field type annotation.

    Returns:
    -------
        type: The SourceManifest Pydantic model class.

    """
    # Import datetime for Pydantic's runtime type resolution
    from datetime import datetime  # noqa: F401 - Used by Pydantic field annotation

    from pydantic import BaseModel, ConfigDict, Field, field_validator

    class _SourceManifest(BaseModel):
        """Model for the source_manifest.json file.

        The SourceManifest tracks metadata about the source files and
        build process for the RAG index. It enables:
        - Schema version validation for compatibility checking
        - Build timestamp tracking for debugging
        - Index version matching with index_version.txt
        - Source file tracking for change detection

        This manifest is generated during the build pipeline and stored
        alongside the RAG artifacts in the HuggingFace dataset.

        Attributes:
        ----------
            schema_version : str
                Schema version of this manifest (e.g., "1.0.0").
                Only the MAJOR.MINOR.PATCH format is checked by this model.
            created_at : datetime
                When this manifest was generated (build timestamp).
                Should be in UTC timezone for consistency.
            index_version : str
                Index version identifier that matches index_version.txt.
                Used for cache invalidation and version tracking.
            source_files : list[SourceFileEntry]
                List of source files used to build the index.
                Each entry includes path, hash, size, and timestamp.

        Example:
        -------
            >>> from datetime import datetime, UTC
            >>> manifest = _SourceManifest(
            ...     schema_version="1.0.0",
            ...     created_at=datetime.now(UTC),
            ...     index_version="2024.01.15.001",
            ...     source_files=[],
            ... )
            >>> manifest.schema_version
            '1.0.0'

        """

        # =====================================================================
        # Model Configuration
        # =====================================================================

        model_config = ConfigDict(
            # Forbid extra fields to catch typos in manifest files
            extra="forbid",
            # Disallow field assignment on instances
            frozen=True,
            # Enable JSON schema generation with examples
            json_schema_extra={
                "examples": [
                    {
                        "schema_version": "1.0.0",
                        "created_at": "2024-01-15T10:30:00Z",
                        "index_version": "2024.01.15.001",
                        "source_files": [
                            {
                                "path": "data/raw/ashrae_55.pdf",
                                # Full 64-character hash: a truncated
                                # placeholder would not satisfy the entry
                                # model's sha256 pattern, making the
                                # published example invalid.
                                "sha256": (
                                    "e3b0c44298fc1c149afbf4c8996fb924"
                                    "27ae41e4649b934ca495991b7852b855"
                                ),
                                "size_bytes": 1048576,
                                "modified_at": "2024-01-15T10:30:00Z",
                            }
                        ],
                    }
                ]
            },
        )

        # =====================================================================
        # Fields
        # =====================================================================

        schema_version: str = Field(
            ...,  # Required field
            min_length=1,
            pattern=r"^\d+\.\d+\.\d+$",  # Semantic versioning format
            description=(
                "Schema version of this manifest (semantic versioning). "
                "Example: '1.0.0'. Must match EXPECTED_SCHEMA_VERSION."
            ),
        )

        created_at: datetime = Field(
            ...,  # Required field
            description=(
                "When this manifest was generated (build timestamp). "
                "Should be in UTC timezone for consistency."
            ),
        )

        index_version: str = Field(
            ...,  # Required field
            min_length=1,
            description=(
                "Index version identifier that matches index_version.txt. "
                "Used for cache invalidation and version tracking."
            ),
        )

        source_files: list[source_file_entry_class] = Field(  # type: ignore[valid-type]
            default_factory=list,
            description=(
                "List of source files used to build the index. "
                "Each entry includes path, hash, size, and timestamp."
            ),
        )

        # =====================================================================
        # Validators
        # =====================================================================

        @field_validator("schema_version", mode="before")
        @classmethod
        def _normalize_schema_version(cls, value: object) -> str:
            """Normalize the schema_version field.

            Args:
            ----
                value: The input value to normalize.

            Returns:
            -------
                Stripped schema version string.

            Raises:
            ------
                ValueError: If value is None or empty.

            """
            if value is None:
                msg = "schema_version cannot be None"
                raise ValueError(msg)

            version = str(value).strip()
            if not version:
                msg = "schema_version cannot be empty"
                raise ValueError(msg)

            return version

        @field_validator("index_version", mode="before")
        @classmethod
        def _normalize_index_version(cls, value: object) -> str:
            """Normalize the index_version field.

            Args:
            ----
                value: The input value to normalize.

            Returns:
            -------
                Stripped index version string.

            Raises:
            ------
                ValueError: If value is None or empty.

            """
            if value is None:
                msg = "index_version cannot be None"
                raise ValueError(msg)

            version = str(value).strip()
            if not version:
                msg = "index_version cannot be empty"
                raise ValueError(msg)

            return version

        # =====================================================================
        # Instance Methods
        # =====================================================================

        def to_dict(self) -> dict[str, Any]:
            """Convert the manifest to a JSON-serializable dictionary.

            Datetime fields are rendered with ``datetime.isoformat()``, so
            a UTC-aware timestamp ends in ``+00:00`` (not ``Z``) and a
            naive timestamp carries no offset at all.

            Returns:
            -------
                dict[str, Any]
                    Dictionary with all fields, datetimes as ISO 8601 strings.

            Example:
            -------
                >>> manifest.to_dict()
                {
                    "schema_version": "1.0.0",
                    "created_at": "2024-01-15T10:30:00+00:00",
                    "index_version": "2024.01.15.001",
                    "source_files": [...]
                }

            """
            # Entries are instances of the dynamically created entry model,
            # so they are handled as ``Any`` for the type checker.
            entries: Any = self.source_files
            return {
                "schema_version": self.schema_version,
                "created_at": self.created_at.isoformat(),
                "index_version": self.index_version,
                "source_files": [
                    {
                        "path": entry.path,
                        "sha256": entry.sha256,
                        "size_bytes": entry.size_bytes,
                        "modified_at": entry.modified_at.isoformat(),
                    }
                    for entry in entries
                ],
            }

        @property
        def total_source_size_bytes(self) -> int:
            """Calculate the total size of all source files.

            Returns
            -------
                Total size in bytes of all source files in the manifest
                (0 when the manifest lists no files).

            """
            # ``Any`` for the same reason as in to_dict(): dynamic entry type.
            entries: Any = self.source_files
            return sum(int(entry.size_bytes) for entry in entries)

        @property
        def source_file_count(self) -> int:
            """Get the number of source files in the manifest.

            Returns
            -------
                Number of source files.

            """
            return len(self.source_files)

    return _SourceManifest
# =============================================================================
# Model Class Cache
# =============================================================================
# These module-level variables cache the lazily-created Pydantic model classes.
# The first access creates the class; subsequent accesses return the cached class.
# =============================================================================
# Cached entry model class; stays None until _get_source_file_entry() builds it.
_source_file_entry_model: type | None = None
# Cached manifest model class; stays None until _get_source_manifest() builds it.
_source_manifest_model: type | None = None
def _get_source_file_entry() -> type:
    """Return the SourceFileEntry model class, building it on first use.

    The class produced by ``_create_source_file_entry_model()`` is stored
    in the module-level ``_source_file_entry_model`` variable, so every
    later call returns that same class object.

    Returns
    -------
        type: The SourceFileEntry Pydantic model class.

    """
    global _source_file_entry_model  # noqa: PLW0603

    cached = _source_file_entry_model
    if cached is not None:
        return cached

    # First call: build the model class and remember it.
    cached = _create_source_file_entry_model()
    _source_file_entry_model = cached
    return cached
def _get_source_manifest() -> type:
    """Return the SourceManifest model class, building it on first use.

    The manifest model is built against the (also lazily created) entry
    model class and stored in the module-level ``_source_manifest_model``
    variable, so every later call returns that same class object.

    Returns
    -------
        type: The SourceManifest Pydantic model class.

    """
    global _source_manifest_model  # noqa: PLW0603

    cached = _source_manifest_model
    if cached is not None:
        return cached

    # The manifest model's source_files field is typed with the entry
    # model, so that class is obtained (and created if needed) first.
    entry_model = _get_source_file_entry()
    cached = _create_source_manifest_model(entry_model)
    _source_manifest_model = cached
    return cached
# =============================================================================
# Public Model Classes (Lazy Proxies)
# =============================================================================
# These classes act as proxies that defer model creation until first use.
# This enables lazy loading while maintaining the appearance of regular classes.
# =============================================================================
class SourceFileEntry:
    """Lazy-loading proxy for the source file entry model.

    Constructing this class does not produce an instance of the proxy:
    ``__new__`` returns an instance of the underlying Pydantic model,
    which is created on first use so that Pydantic is not imported when
    this module is loaded. As a consequence,
    ``isinstance(entry, SourceFileEntry)`` is False for the returned object.

    Each entry describes one source file used to build the RAG index:
    its path, content hash, size, and modification timestamp.

    Attributes:
    ----------
        path : str
            Relative path to the source file from the project root.
        sha256 : str
            SHA256 hash of the file content (64 hex characters).
        size_bytes : int
            File size in bytes.
        modified_at : datetime
            Last modification timestamp of the source file.

    Example:
    -------
        >>> from datetime import datetime, UTC
        >>> entry = SourceFileEntry(
        ...     path="data/raw/ashrae_55.pdf",
        ...     sha256="a" * 64,
        ...     size_bytes=1024000,
        ...     modified_at=datetime.now(UTC),
        ... )
        >>> entry.path
        'data/raw/ashrae_55.pdf'

    """

    # Declarations for static type checkers only; no values are set here.
    path: str
    sha256: str
    size_bytes: int
    modified_at: datetime

    def __new__(cls, **kwargs: object) -> SourceFileEntry:
        """Build an instance of the underlying Pydantic model.

        Args:
        ----
            **kwargs: Field values for the model. Required fields:
                - path: str
                - sha256: str
                - size_bytes: int
                - modified_at: datetime

        Returns:
        -------
            SourceFileEntry: An instance of the underlying model class.

        Raises:
        ------
            pydantic.ValidationError: If required fields are missing or
                field values fail validation.

        """
        return _get_source_file_entry()(**kwargs)  # type: ignore[no-any-return]

    @classmethod
    def model_validate(cls, obj: object) -> SourceFileEntry:
        """Validate ``obj`` through the underlying model's ``model_validate``.

        Args:
        ----
            obj: Object to validate, e.g. a dict holding the required fields.

        Returns:
        -------
            SourceFileEntry: The validated model instance.

        Raises:
        ------
            pydantic.ValidationError: If validation fails.

        """
        entry_model = _get_source_file_entry()
        return entry_model.model_validate(obj)  # type: ignore[attr-defined, no-any-return]

    @classmethod
    def model_json_schema(cls) -> dict[str, Any]:
        """Return the JSON schema generated by the underlying model.

        Returns
        -------
            dict[str, Any]: JSON schema dictionary.

        """
        entry_model = _get_source_file_entry()
        return entry_model.model_json_schema()  # type: ignore[attr-defined, no-any-return]
class SourceManifest:
    """Lazy-loading proxy for the source_manifest.json model.

    Constructing this class does not produce an instance of the proxy:
    ``__new__`` returns an instance of the underlying Pydantic model,
    which is created on first use so that Pydantic is not imported when
    this module is loaded. The instance methods and properties declared
    here are therefore stubs; the working versions live on the model.

    The manifest records metadata about the source files and the build
    of the RAG index:
    - Schema version, for compatibility checking
    - Build timestamp, for debugging
    - Index version, matching index_version.txt
    - Source file entries, for change detection

    Attributes:
    ----------
        schema_version : str
            Schema version of this manifest (e.g., "1.0.0").
        created_at : datetime
            When this manifest was generated.
        index_version : str
            Index version identifier.
        source_files : list[SourceFileEntry]
            List of source files used to build the index.

    Example:
    -------
        >>> from datetime import datetime, UTC
        >>> manifest = SourceManifest(
        ...     schema_version="1.0.0",
        ...     created_at=datetime.now(UTC),
        ...     index_version="2024.01.15.001",
        ...     source_files=[],
        ... )
        >>> manifest.schema_version
        '1.0.0'

    """

    # Declarations for static type checkers only; no values are set here.
    schema_version: str
    created_at: datetime
    index_version: str
    source_files: list[SourceFileEntry]

    def __new__(cls, **kwargs: object) -> SourceManifest:
        """Build an instance of the underlying Pydantic model.

        Args:
        ----
            **kwargs: Field values for the model. Required fields:
                - schema_version: str
                - created_at: datetime
                - index_version: str
                Optional fields:
                - source_files: list[SourceFileEntry] (defaults to [])

        Returns:
        -------
            SourceManifest: An instance of the underlying model class.

        Raises:
        ------
            pydantic.ValidationError: If required fields are missing or
                field values fail validation.

        """
        return _get_source_manifest()(**kwargs)  # type: ignore[no-any-return]

    @classmethod
    def model_validate(cls, obj: object) -> SourceManifest:
        """Validate ``obj`` through the underlying model's ``model_validate``.

        Args:
        ----
            obj: Object to validate, e.g. a dict holding the required fields.

        Returns:
        -------
            SourceManifest: The validated model instance.

        Raises:
        ------
            pydantic.ValidationError: If validation fails.

        """
        manifest_model = _get_source_manifest()
        return manifest_model.model_validate(obj)  # type: ignore[attr-defined, no-any-return]

    @classmethod
    def model_json_schema(cls) -> dict[str, Any]:
        """Return the JSON schema generated by the underlying model.

        Returns
        -------
            dict[str, Any]: JSON schema dictionary.

        """
        manifest_model = _get_source_manifest()
        return manifest_model.model_json_schema()  # type: ignore[attr-defined, no-any-return]

    def to_dict(self) -> dict[str, Any]:
        """Declare ``to_dict()`` for type checkers; always raises here.

        Objects returned by ``SourceManifest(...)`` are instances of the
        underlying model, whose own ``to_dict()`` does the real work.
        This proxy-level definition is not reached through them.

        Returns:
        -------
            dict[str, Any]: Never returns from this stub.

        Raises:
        ------
            NotImplementedError: Always.

        """
        msg = "Call to_dict() on actual instance"
        raise NotImplementedError(msg)  # pragma: no cover

    @property
    def total_source_size_bytes(self) -> int:
        """Declare the total-size property for type checkers.

        Returns
        -------
            Never returns from this stub; the underlying model provides
            the working property.

        """
        raise NotImplementedError  # pragma: no cover

    @property
    def source_file_count(self) -> int:
        """Declare the file-count property for type checkers.

        Returns
        -------
            Never returns from this stub; the underlying model provides
            the working property.

        """
        raise NotImplementedError  # pragma: no cover