"""
Validation result data models.
Every expectation run produces an :class:`ExpectationResult`.
A full validation run aggregates them into a :class:`ValidationResult`.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
# ---------------------------------------------------------------------------
# Severity constants
# ---------------------------------------------------------------------------
SEVERITY_CRITICAL = "critical"
SEVERITY_WARNING = "warning"
SEVERITY_INFO = "info"
SEVERITY_WEIGHTS = {
SEVERITY_CRITICAL: 3,
SEVERITY_WARNING: 2,
SEVERITY_INFO: 1,
}
# Default severity mapping based on expectation type keywords
_SEVERITY_MAP: Dict[str, str] = {
# Critical — data integrity
"expect_column_to_exist": SEVERITY_CRITICAL,
"expect_column_to_not_be_null": SEVERITY_CRITICAL,
"expect_column_values_to_be_unique": SEVERITY_CRITICAL,
"expect_table_row_count_to_equal": SEVERITY_CRITICAL,
"expect_table_row_count_to_be_between": SEVERITY_CRITICAL,
"expect_table_columns_to_match_ordered_list": SEVERITY_CRITICAL,
"expect_table_columns_to_match_set": SEVERITY_CRITICAL,
"expect_table_column_count_to_equal": SEVERITY_CRITICAL,
"expect_compound_columns_to_be_unique": SEVERITY_CRITICAL,
# Warning — data quality
"expect_column_values_to_be_between": SEVERITY_WARNING,
"expect_column_values_to_be_in_set": SEVERITY_WARNING,
"expect_column_values_to_not_be_in_set": SEVERITY_WARNING,
"expect_column_values_to_match_regex": SEVERITY_WARNING,
"expect_column_values_to_be_of_type": SEVERITY_WARNING,
"expect_column_values_to_be_dateutil_parseable": SEVERITY_WARNING,
"expect_column_pair_values_a_to_be_greater_than_b": SEVERITY_WARNING,
"expect_column_pair_values_to_be_equal": SEVERITY_WARNING,
"expect_multicolumn_sum_to_equal": SEVERITY_WARNING,
# Info — statistical / informational
"expect_column_value_lengths_to_be_between": SEVERITY_INFO,
"expect_column_max_to_be_between": SEVERITY_INFO,
"expect_column_min_to_be_between": SEVERITY_INFO,
"expect_column_mean_to_be_between": SEVERITY_INFO,
"expect_column_stdev_to_be_between": SEVERITY_INFO,
"expect_column_distinct_values_to_be_in_set": SEVERITY_INFO,
"expect_column_proportion_of_unique_values_to_be_between": SEVERITY_INFO,
}
[docs]
def get_severity(expectation_type: str, meta: Optional[Dict] = None) -> str:
"""Return severity for an expectation type (user meta overrides default)."""
if meta and "severity" in meta:
return str(meta["severity"])
return _SEVERITY_MAP.get(expectation_type, SEVERITY_WARNING)
# ---------------------------------------------------------------------------
# Native-type coercion helper
# ---------------------------------------------------------------------------
[docs]
def to_native(value: Any) -> Any:
"""
Convert numpy / pandas scalar types to native Python types.
Professional tools NEVER leak internal types like ``np.int64(20)``.
"""
if value is None:
return None
if isinstance(value, (bool,)):
return bool(value)
if isinstance(value, (int, float, str)):
return value
# numpy scalar types
try:
import numpy as np
if isinstance(value, np.integer):
return int(value)
if isinstance(value, np.floating):
return float(value)
if isinstance(value, np.bool_):
return bool(value)
if isinstance(value, np.ndarray):
return [to_native(v) for v in value.tolist()]
except ImportError:
pass
# dict — recursively clean
if isinstance(value, dict):
return {k: to_native(v) for k, v in value.items()}
# list / tuple
if isinstance(value, (list, tuple)):
return [to_native(v) for v in value]
return value
# ---------------------------------------------------------------------------
# ExpectationResult
# ---------------------------------------------------------------------------
[docs]
@dataclass
class ExpectationResult:
"""Result of a single expectation evaluation."""
expectation_type: str
success: bool
column: Optional[str] = None
observed_value: Any = None
element_count: int = 0
unexpected_count: int = 0
unexpected_percent: float = 0.0
unexpected_values: List[Any] = field(default_factory=list)
details: Dict[str, Any] = field(default_factory=dict)
exception_info: Optional[str] = None
meta: Dict[str, Any] = field(default_factory=dict)
def __post_init__(self):
"""Sanitize all numeric values to native Python types."""
self.observed_value = to_native(self.observed_value)
self.element_count = int(self.element_count) if self.element_count else 0
self.unexpected_count = int(self.unexpected_count) if self.unexpected_count else 0
self.unexpected_percent = float(self.unexpected_percent) if self.unexpected_percent else 0.0
self.unexpected_values = [to_native(v) for v in self.unexpected_values]
self.details = to_native(self.details) or {}
@property
def status(self) -> str:
if self.exception_info:
return "ERROR"
return "PASSED" if self.success else "FAILED"
@property
def status_icon(self) -> str:
icons = {"PASSED": "✅", "FAILED": "❌", "ERROR": "⚠️"}
return icons.get(self.status, "❓")
@property
def severity(self) -> str:
"""Return severity level for this expectation."""
return get_severity(self.expectation_type, self.meta)
@property
def severity_icon(self) -> str:
icons = {
SEVERITY_CRITICAL: "🔴",
SEVERITY_WARNING: "🟡",
SEVERITY_INFO: "🔵",
}
return icons.get(self.severity, "🟡")
@property
def human_observed(self) -> str:
"""
Return a human-readable string for the observed value.
Converts raw dicts / technical strings into executive-friendly text.
"""
val = self.observed_value
if val is None:
return "—"
# Dict-style observed values → readable sentences
if isinstance(val, dict):
parts = []
if "min" in val and "max" in val:
parts.append(f"Min: {val['min']} | Max: {val['max']}")
if "min_length" in val and "max_length" in val:
parts.append(f"Length: {val['min_length']} – {val['max_length']}")
if "unique_values" in val:
parts.append(f"Distinct values: {val['unique_values']}")
if "distinct_values" in val:
vals = val["distinct_values"]
if isinstance(vals, list):
parts.append(f"Distinct: {', '.join(str(v) for v in vals[:8])}")
else:
parts.append(f"Distinct values: {vals}")
if parts:
return " · ".join(parts)
# Fallback: key=value pairs
return " · ".join(f"{k}: {v}" for k, v in val.items())
# String containing "unique out of" → reformat
s = str(val)
if "unique out of" in s:
try:
parts = s.split()
uniq = int(parts[0])
total = int(parts[4])
pct = round(uniq / total * 100, 1) if total > 0 else 0
return f"Unique: {uniq}/{total} ({pct}%)"
except (IndexError, ValueError):
pass
# List → join
if isinstance(val, list):
if len(val) == 0:
return "—"
return ", ".join(str(v) for v in val[:10])
return str(val)
[docs]
def to_dict(self) -> Dict[str, Any]:
return {
"expectation_type": self.expectation_type,
"success": self.success,
"status": self.status,
"severity": self.severity,
"column": self.column,
"observed_value": to_native(self.observed_value),
"element_count": self.element_count,
"unexpected_count": self.unexpected_count,
"unexpected_percent": round(self.unexpected_percent, 4),
"unexpected_values": [to_native(v) for v in self.unexpected_values[:20]],
"details": to_native(self.details),
"exception_info": self.exception_info,
"meta": self.meta,
}
# ---------------------------------------------------------------------------
# ColumnHealthSummary
# ---------------------------------------------------------------------------
[docs]
@dataclass
class ColumnHealthSummary:
"""Aggregated health metrics for a single column."""
column: str
checks: int = 0
passed: int = 0
failed: int = 0
errors: int = 0
null_count: Optional[int] = None
null_percent: Optional[float] = None
unique_count: Optional[int] = None
unique_percent: Optional[float] = None
total_rows: Optional[int] = None
@property
def health_score(self) -> float:
if self.checks == 0:
return 100.0
return round((self.passed / self.checks) * 100, 1)
[docs]
def to_dict(self) -> Dict[str, Any]:
return {
"column": self.column,
"checks": self.checks,
"passed": self.passed,
"failed": self.failed,
"errors": self.errors,
"health_score": self.health_score,
"null_count": self.null_count,
"null_percent": (round(self.null_percent, 2) if self.null_percent is not None else None),
"unique_count": self.unique_count,
"unique_percent": (round(self.unique_percent, 2) if self.unique_percent is not None else None),
}
# ---------------------------------------------------------------------------
# ValidationResult
# ---------------------------------------------------------------------------
[docs]
@dataclass
class ValidationResult:
"""Aggregate result of running an entire expectation suite."""
suite_name: str
results: List[ExpectationResult] = field(default_factory=list)
run_time: Optional[datetime] = None
run_duration_seconds: float = 0.0
data_source: Optional[str] = None
engine: str = "pandas"
statistics: Dict[str, Any] = field(default_factory=dict)
def __post_init__(self):
if self.run_time is None:
self.run_time = datetime.now()
@property
def success(self) -> bool:
"""True only if *every* expectation passed."""
return all(r.success for r in self.results)
@property
def total_expectations(self) -> int:
return len(self.results)
@property
def successful_expectations(self) -> int:
return sum(1 for r in self.results if r.success)
@property
def failed_expectations(self) -> int:
return sum(1 for r in self.results if not r.success and not r.exception_info)
@property
def errored_expectations(self) -> int:
return sum(1 for r in self.results if r.exception_info)
@property
def success_percent(self) -> float:
if not self.results:
return 0.0
return round((self.successful_expectations / self.total_expectations) * 100, 2)
# -- Quality Score -----------------------------------------------------
[docs]
def compute_quality_score(self) -> float:
"""
Compute a weighted data quality score (0–100).
Severity weights:
- Critical: ×3
- Warning : ×2
- Info : ×1
Score = 100 × (weighted_passed / weighted_total)
"""
if not self.results:
return 100.0
weighted_passed = 0.0
weighted_total = 0.0
for r in self.results:
w = SEVERITY_WEIGHTS.get(r.severity, 2)
weighted_total += w
if r.success:
weighted_passed += w
if weighted_total == 0:
return 100.0
return round((weighted_passed / weighted_total) * 100, 1)
# -- Column Health Summary ---------------------------------------------
[docs]
def column_health(self) -> List[ColumnHealthSummary]:
"""
Aggregate expectation results by column.
Extracts null % and unique % from specific expectation types
when present.
"""
col_map: Dict[str, ColumnHealthSummary] = {}
for r in self.results:
col = r.column or "__table__"
if col not in col_map:
col_map[col] = ColumnHealthSummary(column=col)
summary = col_map[col]
summary.checks += 1
if r.success:
summary.passed += 1
elif r.exception_info:
summary.errors += 1
else:
summary.failed += 1
# Extract null info
if r.expectation_type == "expect_column_to_not_be_null":
details = r.details or {}
summary.null_count = details.get("null_count", r.unexpected_count)
total = details.get("total_count", r.element_count)
summary.total_rows = total
if total and total > 0:
nc = summary.null_count or 0
summary.null_percent = (nc / total) * 100
# Extract uniqueness info
if r.expectation_type == "expect_column_values_to_be_unique":
details = r.details or {}
total = r.element_count
dup = details.get("duplicate_count", r.unexpected_count)
summary.total_rows = total
if total and total > 0:
summary.unique_count = total - dup
summary.unique_percent = ((total - dup) / total) * 100
# Return column summaries (table-level last)
cols = sorted(
col_map.values(),
key=lambda c: (c.column == "__table__", c.column),
)
return cols
# -- Statistics --------------------------------------------------------
[docs]
def compute_statistics(self) -> Dict[str, Any]:
"""Compute summary statistics and store them."""
self.statistics = {
"total": self.total_expectations,
"passed": self.successful_expectations,
"failed": self.failed_expectations,
"errors": self.errored_expectations,
"success_percent": self.success_percent,
"overall_success": self.success,
"quality_score": self.compute_quality_score(),
"run_time": self.run_time.isoformat() if self.run_time else None,
"run_duration_seconds": round(self.run_duration_seconds, 3),
}
return self.statistics
[docs]
def to_dict(self) -> Dict[str, Any]:
self.compute_statistics()
return {
"suite_name": self.suite_name,
"success": self.success,
"statistics": self.statistics,
"quality_score": self.compute_quality_score(),
"column_health": [c.to_dict() for c in self.column_health()],
"data_source": self.data_source,
"engine": self.engine,
"results": [r.to_dict() for r in self.results],
}
[docs]
def to_json(self, indent: int = 2) -> str:
"""Serialize the full result to a JSON string."""
return json.dumps(self.to_dict(), indent=indent, default=str)
[docs]
def to_json_file(self, filepath: str) -> None:
"""Write the validation result to a JSON file."""
with open(filepath, "w", encoding="utf-8") as f:
f.write(self.to_json())
[docs]
def to_html(self, filepath: str) -> None:
"""Generate a rich HTML report and write to *filepath*."""
from validatex.reporting.html_report import HTMLReportGenerator
generator = HTMLReportGenerator()
generator.generate(self, filepath)
[docs]
def summary(self) -> str:
"""Return a human-readable summary string."""
self.compute_statistics()
score = self.compute_quality_score()
status = "✅ ALL PASSED" if self.success else "❌ SOME FAILED"
lines = [
f"{'='*60}",
f" ValidateX Validation Report — {self.suite_name}",
f"{'='*60}",
f" Status : {status}",
f" Quality Score : {score} / 100",
f" Total Expectations: {self.total_expectations}",
f" Passed : {self.successful_expectations}",
f" Failed : {self.failed_expectations}",
f" Errors : {self.errored_expectations}",
f" Success Rate : {self.success_percent}%",
f" Run Duration : {self.run_duration_seconds:.3f}s",
f" Engine : {self.engine}",
f"{'='*60}",
]
if self.failed_expectations > 0 or self.errored_expectations > 0:
lines.append(" Failed / Errored Expectations:")
lines.append(f" {'-'*56}")
for r in self.results:
if not r.success:
col_str = f" (column: {r.column})" if r.column else ""
sev = r.severity_icon
lines.append(f" {r.status_icon} {sev} {r.expectation_type}{col_str}")
if r.exception_info:
lines.append(f" Error: {r.exception_info}")
elif r.unexpected_count:
lines.append(f" Unexpected: {r.unexpected_count} " f"({r.unexpected_percent:.2f}%)")
lines.append(f"{'='*60}")
return "\n".join(lines)
def __repr__(self) -> str:
return (
f"ValidationResult(suite={self.suite_name!r}, "
f"success={self.success}, "
f"passed={self.successful_expectations}/{self.total_expectations})"
)