"""
Config loader — read YAML / JSON checkpoint files.
A *checkpoint* file ties together a data source and an expectation suite
so that validations can be run declaratively from the CLI.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict
import yaml # type: ignore
from validatex.core.suite import ExpectationSuite
[docs]
@dataclass
class CheckpointConfig:
    """
    Declarative description of a single validation run.

    A checkpoint names the expectation suite to apply, describes where the
    data comes from, and records which engine and report settings to use.

    Attributes
    ----------
    name : str
        Checkpoint name.
    suite_path : str
        Path to the expectation suite YAML/JSON file.
    data_source : dict
        Data source configuration. ``load_data`` reads the keys ``type``,
        ``path``, ``query`` and ``connection_string`` from it.
    engine : str
        Engine to use: ``"pandas"`` or ``"spark"``.
    report : dict
        Report configuration (format, output_path).
    """

    name: str = "default_checkpoint"
    suite_path: str = ""
    data_source: Dict[str, Any] = field(default_factory=dict)
    engine: str = "pandas"
    report: Dict[str, Any] = field(default_factory=dict)

    def load_suite(self) -> ExpectationSuite:
        """Return the expectation suite stored at ``suite_path``."""
        # Imported only for its side effect: the expectations must be
        # registered before the suite is loaded.
        import validatex.expectations  # noqa: F401

        suite = ExpectationSuite.load(self.suite_path)
        return suite

    def load_data(self, spark_session: Any = None) -> Any:
        """
        Build the data source selected by ``data_source["type"]`` and load it.

        Supported types are ``"csv"`` (used when ``type`` is absent),
        ``"parquet"`` and ``"database"``. File-based sources receive the
        ``path`` entry; the database source receives ``connection_string``
        and ``query``. Each of those entries defaults to an empty string.

        Parameters
        ----------
        spark_session : Any, optional
            Forwarded unchanged to the source's ``load`` call.

        Returns
        -------
        Any
            Whatever the selected source's ``load`` method returns.

        Raises
        ------
        ValueError
            If ``type`` is not one of the supported values.
        """
        settings = self.data_source
        kind = settings.get("type", "csv")

        # Each source class is imported inside its own branch, so only the
        # module backing the selected type is ever imported.
        if kind == "database":
            from validatex.datasources.database_source import DatabaseDataSource

            source: Any = DatabaseDataSource(
                connection_string=settings.get("connection_string", ""),
                query=settings.get("query", ""),
            )
        elif kind == "parquet":
            from validatex.datasources.parquet_source import ParquetDataSource

            source = ParquetDataSource(filepath=settings.get("path", ""))
        elif kind == "csv":
            from validatex.datasources.csv_source import CSVDataSource

            source = CSVDataSource(filepath=settings.get("path", ""))
        else:
            raise ValueError(f"Unsupported data source type: {kind}")

        return source.load(engine=self.engine, spark_session=spark_session)
[docs]
def load_checkpoint(filepath: str) -> CheckpointConfig:
    """
    Load a checkpoint configuration from a YAML or JSON file.

    Files whose name ends in ``.yaml`` or ``.yml`` (case-insensitive) are
    parsed with ``yaml.safe_load``; every other file is parsed as JSON.
    Keys missing from the file fall back to the same defaults that
    ``CheckpointConfig`` declares.

    Parameters
    ----------
    filepath : str
        Path to the checkpoint config file.

    Returns
    -------
    CheckpointConfig

    Raises
    ------
    ValueError
        If the top level of the file is not a mapping.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # Compare the extension case-insensitively so "CHECK.YAML" is not
        # mistakenly handed to the JSON parser.
        if filepath.lower().endswith((".yaml", ".yml")):
            data = yaml.safe_load(f)
        else:
            import json

            data = json.load(f)

    # An empty YAML document (or a JSON ``null``) parses to None; treat it
    # like an empty mapping so every field takes its default.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise ValueError(
            f"Checkpoint file {filepath!r} must contain a mapping at the "
            f"top level, got {type(data).__name__}"
        )

    return CheckpointConfig(
        name=data.get("name", "default_checkpoint"),
        suite_path=data.get("suite_path", ""),
        # A section written as ``data_source:`` with no value parses to
        # None; fall back to an empty dict so later ``.get`` calls work.
        data_source=data.get("data_source") or {},
        engine=data.get("engine", "pandas"),
        report=data.get("report") or {},
    )