# Source code for validatex.profiler.profiler

"""
Data Profiler — analyse a dataset and auto-suggest expectations.

The profiler computes summary statistics for every column and
proposes a reasonable set of expectations that can serve as a
starting point for a quality suite.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import pandas as pd

from validatex.core.suite import ExpectationSuite



# [docs]  (Sphinx viewcode link marker; not part of the module source)
@dataclass
class ColumnProfile:
    """Statistical profile of a single column.

    Count and percentage fields default to zero; the numeric statistics
    (``mean_value``, ``std_value``, ``median_value``) and the string-length
    statistics (``min_length``, ``max_length``) default to ``None`` and are
    only meaningful when whoever builds the profile fills them in.
    """

    name: str
    dtype: str = ""
    total_count: int = 0
    null_count: int = 0
    null_percent: float = 0.0
    unique_count: int = 0
    unique_percent: float = 0.0
    min_value: Any = None
    max_value: Any = None
    mean_value: Optional[float] = None
    std_value: Optional[float] = None
    median_value: Optional[float] = None
    min_length: Optional[int] = None
    max_length: Optional[int] = None
    top_values: List[Dict[str, Any]] = field(default_factory=list)
    sample_values: List[Any] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dictionary.

        Percentages are rounded to 2 decimals, mean/std/median to 4
        decimals, ``top_values`` is truncated to 10 entries and
        ``sample_values`` to 5. ``min_value``, ``max_value`` and the sample
        values are passed through :meth:`_safe`.
        """
        return {
            "name": self.name,
            "dtype": self.dtype,
            "total_count": self.total_count,
            "null_count": self.null_count,
            "null_percent": round(self.null_percent, 2),
            "unique_count": self.unique_count,
            "unique_percent": round(self.unique_percent, 2),
            "min_value": self._safe(self.min_value),
            "max_value": self._safe(self.max_value),
            "mean_value": self._rounded(self.mean_value),
            "std_value": self._rounded(self.std_value),
            "median_value": self._rounded(self.median_value),
            "min_length": self.min_length,
            "max_length": self.max_length,
            "top_values": self.top_values[:10],
            "sample_values": [self._safe(v) for v in self.sample_values[:5]],
        }

    @staticmethod
    def _rounded(value: Optional[float], digits: int = 4) -> Optional[float]:
        """Round *value* to *digits* decimals, passing ``None`` through."""
        return None if value is None else round(value, digits)

    @staticmethod
    def _safe(val: Any) -> Any:
        """Coerce *val* to ``None``, a builtin scalar, or its ``str()`` form.

        Builtin ``int``/``float``/``bool``/``str`` instances are returned
        unchanged. Other objects registered with the :mod:`numbers` ABCs
        (for example NumPy integer scalars, which are not ``int``
        subclasses) are converted to ``int``/``float`` so that numeric
        values stay numeric instead of being stringified. Anything else is
        converted with ``str()``.
        """
        # Local import: keeps this block self-contained without touching
        # the module-level import list.
        import numbers

        if val is None:
            return None
        if isinstance(val, (int, float, bool, str)):
            return val
        if isinstance(val, numbers.Integral):
            return int(val)
        if isinstance(val, numbers.Real):
            return float(val)
        return str(val)




# [docs]  (Sphinx viewcode link marker; not part of the module source)
@dataclass
class DataProfile:
    """Full profile of a DataFrame: its shape plus one profile per column."""

    row_count: int = 0
    column_count: int = 0
    columns: List[ColumnProfile] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dictionary.

        Each entry of ``columns`` is serialised through its own
        ``to_dict()`` method.
        """
        return {
            "row_count": self.row_count,
            "column_count": self.column_count,
            "columns": [c.to_dict() for c in self.columns],
        }

    def summary(self) -> str:
        """Return a human-readable, multi-line summary.

        A header with the row and column counts is followed by one section
        per column. Optional statistics are only printed when present, so a
        column whose ``mean_value`` is set but whose ``std_value`` or
        ``median_value`` is ``None`` no longer raises ``TypeError`` while
        formatting.
        """
        rule = "=" * 60
        lines = [
            rule,
            "  ValidateX Data Profile",
            rule,
            f"  Rows    : {self.row_count:,}",
            f"  Columns : {self.column_count}",
            rule,
        ]
        for cp in self.columns:
            lines.append(f"\n  📊 Column: {cp.name}")
            lines.append(f"     Type       : {cp.dtype}")
            lines.append(f"     Nulls      : {cp.null_count} ({cp.null_percent:.1f}%)")
            lines.append(f"     Unique     : {cp.unique_count} ({cp.unique_percent:.1f}%)")
            if cp.min_value is not None:
                lines.append(f"     Min        : {cp.min_value}")
                lines.append(f"     Max        : {cp.max_value}")
            if cp.mean_value is not None:
                lines.append(f"     Mean       : {cp.mean_value:.4f}")
                # std/median are guarded separately: formatting None with
                # ``.4f`` would raise TypeError.
                if cp.std_value is not None:
                    lines.append(f"     Std Dev    : {cp.std_value:.4f}")
                if cp.median_value is not None:
                    lines.append(f"     Median     : {cp.median_value:.4f}")
            if cp.min_length is not None:
                lines.append(f"     Str Len    : {cp.min_length} – {cp.max_length}")
            if cp.top_values:
                top_str = ", ".join(f"{v['value']}({v['count']})" for v in cp.top_values[:5])
                lines.append(f"     Top Values : {top_str}")
        lines.append(f"\n{rule}")
        return "\n".join(lines)





# [docs]  (Sphinx viewcode link marker; not part of the module source)
class DataProfiler:
    """
    Analyse a Pandas DataFrame and produce a :class:`DataProfile`.

    Usage
    -----
    >>> profiler = DataProfiler()
    >>> profile = profiler.profile(df)
    >>> print(profile.summary())
    >>> suite = profiler.suggest_expectations(df, suite_name="auto_suite")
    """

    # A string column with at most this many distinct values is treated as
    # categorical when suggesting expectations.
    _MAX_CATEGORICAL_VALUES = 20

    def profile(self, df: pd.DataFrame) -> DataProfile:
        """
        Profile every column in *df*.

        Returns
        -------
        DataProfile
            Row/column counts of *df* plus one :class:`ColumnProfile` per
            column, in the order of ``df.columns``.
        """
        return DataProfile(
            row_count=len(df),
            column_count=len(df.columns),
            columns=[self._profile_column(df, col_name) for col_name in df.columns],
        )

    def suggest_expectations(
        self,
        df: pd.DataFrame,
        suite_name: str = "auto_generated_suite",
    ) -> ExpectationSuite:
        """
        Auto-generate an :class:`ExpectationSuite` based on the data profile.

        Heuristics
        ----------
        * Table row count within ±10% of the observed count, and an exact
          column count.
        * Every column → ``expect_column_to_exist``
        * If a column has zero nulls → ``expect_column_to_not_be_null``
        * If a column is fully unique → ``expect_column_values_to_be_unique``
        * For numeric (non-boolean) columns →
          ``expect_column_values_to_be_between`` with the observed min/max
          widened by 10% of the range (or by 1 when min == max).
        * For string columns with at most 20 distinct values →
          ``expect_column_values_to_be_in_set`` listing *all* distinct
          non-null values.
        * For string columns → ``expect_column_value_lengths_to_be_between``
        """
        # Import expectations so they are registered
        import validatex.expectations  # noqa: F401

        profile = self.profile(df)
        suite = ExpectationSuite(name=suite_name)

        # Table-level
        tolerance = profile.row_count // 10
        suite.add(
            "expect_table_row_count_to_be_between",
            min_value=max(0, profile.row_count - tolerance),
            max_value=profile.row_count + tolerance,
        )
        suite.add(
            "expect_table_column_count_to_equal",
            value=profile.column_count,
        )

        for cp in profile.columns:
            # Column existence
            suite.add("expect_column_to_exist", column=cp.name)

            # Null checks
            if cp.null_count == 0:
                suite.add("expect_column_to_not_be_null", column=cp.name)

            # Uniqueness
            if cp.unique_count == cp.total_count and cp.total_count > 0:
                suite.add("expect_column_values_to_be_unique", column=cp.name)

            # Numeric range
            bounds = self._numeric_bounds(cp)
            if bounds is not None:
                suite.add(
                    "expect_column_values_to_be_between",
                    column=cp.name,
                    min_value=bounds[0],
                    max_value=bounds[1],
                )

            # Categorical (string with few distinct values)
            is_categorical = (
                self._is_string_dtype(cp.dtype)
                and 0 < cp.unique_count <= self._MAX_CATEGORICAL_VALUES
                and cp.total_count > 0
            )
            if is_categorical:
                # Read the values from the data, not from ``cp.top_values``:
                # that list is capped at 10 entries, so a column with 11-20
                # distinct values would get a set that rejects its own data.
                values = self._distinct_values(df[cp.name])
                if values:
                    suite.add(
                        "expect_column_values_to_be_in_set",
                        column=cp.name,
                        value_set=values,
                    )

            # String length
            if cp.min_length is not None and cp.max_length is not None:
                suite.add(
                    "expect_column_value_lengths_to_be_between",
                    column=cp.name,
                    min_value=max(0, cp.min_length - 1),
                    max_value=cp.max_length + 10,
                )

        return suite

    # -- internal ----------------------------------------------------------

    @staticmethod
    def _is_string_dtype(dtype: Any) -> bool:
        """Return True when the dtype name denotes a string-like column.

        Matches ``object``, ``str`` and any dtype whose name contains
        ``string`` (case-insensitive).
        """
        name = str(dtype).lower()
        return name in ("object", "str") or "string" in name

    @staticmethod
    def _to_python_scalar(value: Any) -> Any:
        """Unwrap a scalar exposing ``.item()`` (e.g. a NumPy scalar)."""
        item = getattr(value, "item", None)
        return item() if callable(item) else value

    @staticmethod
    def _distinct_values(series: pd.Series) -> List[str]:
        """Return every distinct non-null value of *series* as ``str``.

        Values are ordered as ``value_counts()`` returns them.
        """
        return [str(v) for v in series.dropna().value_counts().index]

    @classmethod
    def _numeric_bounds(cls, cp: ColumnProfile) -> Optional[tuple]:
        """Return ``(low, high)`` for a numeric range check, or ``None``.

        ``None`` is returned when the column has no numeric statistics or is
        boolean. Subtracting two NumPy booleans raises ``TypeError``, and a
        range check on a boolean column carries no information.
        """
        if cp.mean_value is None or cp.min_value is None or cp.max_value is None:
            return None
        low = cls._to_python_scalar(cp.min_value)
        high = cls._to_python_scalar(cp.max_value)
        if str(cp.dtype).lower() in ("bool", "boolean") or isinstance(low, bool):
            return None
        margin = abs(high - low) * 0.1 if high != low else 1
        return low - margin, high + margin

    def _profile_column(self, df: pd.DataFrame, col: str) -> ColumnProfile:
        """Profile a single column.

        Counts are always filled in. Numeric statistics are added for
        numeric dtypes, length and min/max statistics for string-like
        dtypes, and the 10 most frequent values plus the first 5 non-null
        values for any column that has at least one non-null entry.
        """
        series = df[col]
        total = len(series)
        null_count = int(series.isnull().sum())
        non_null = series.dropna()
        unique_count = int(non_null.nunique())

        cp = ColumnProfile(
            name=col,
            dtype=str(series.dtype),
            total_count=total,
            null_count=null_count,
            null_percent=(null_count / total * 100) if total > 0 else 0.0,
            unique_count=unique_count,
            unique_percent=(unique_count / total * 100) if total > 0 else 0.0,
        )

        # Every remaining statistic needs at least one non-null value.
        if len(non_null) == 0:
            return cp

        # Numeric stats
        if pd.api.types.is_numeric_dtype(series):
            # min/max are unwrapped to plain Python scalars, matching the
            # float() coercion applied to mean/std/median below.
            cp.min_value = self._to_python_scalar(non_null.min())
            cp.max_value = self._to_python_scalar(non_null.max())
            cp.mean_value = float(non_null.mean())
            cp.std_value = float(non_null.std()) if len(non_null) > 1 else 0.0
            cp.median_value = float(non_null.median())

        # String stats — handle 'object', 'str', 'string', 'StringDtype' etc.
        if self._is_string_dtype(series.dtype):
            str_series = non_null.astype(str)
            lengths = str_series.str.len()
            cp.min_length = int(lengths.min())
            cp.max_length = int(lengths.max())
            try:
                cp.min_value = str(non_null.min())
                cp.max_value = str(non_null.max())
            except TypeError:
                # Mixed-type object columns cannot be ordered directly;
                # fall back to ordering their string representations.
                cp.min_value = str_series.min()
                cp.max_value = str_series.max()

        # Top values
        value_counts = non_null.value_counts().head(10)
        cp.top_values = [{"value": str(v), "count": int(c)} for v, c in value_counts.items()]

        # Sample values
        cp.sample_values = non_null.head(5).tolist()

        return cp