Source code for validatex.datasources.parquet_source

"""Parquet data source."""

from __future__ import annotations

from typing import Any, Dict, Optional

import pandas as pd

from validatex.datasources.base_source import DataSource


class ParquetDataSource(DataSource):
    """
    Load data from a Parquet file.

    Parameters
    ----------
    filepath : str
        Path to the Parquet file or directory.
    read_options : dict, optional
        Extra kwargs forwarded to ``pd.read_parquet`` / Spark reader.
        ``None`` (or an empty dict) means no extra options.
    name : str, optional
        Name handed to the ``DataSource`` base class. Falls back to
        ``filepath`` when omitted or empty.
    """

    def __init__(
        self,
        filepath: str,
        read_options: Optional[Dict[str, Any]] = None,
        name: Optional[str] = None,
    ):
        super().__init__(name=name or filepath)
        self.filepath = filepath
        self.read_options = read_options or {}

    def load_pandas(self) -> pd.DataFrame:
        """
        Read the Parquet data into a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            Result of ``pd.read_parquet(self.filepath, **self.read_options)``.
        """
        return pd.read_parquet(self.filepath, **self.read_options)

    def load_spark(self, spark_session: Any = None) -> Any:
        """
        Read the Parquet data through a Spark session.

        Parameters
        ----------
        spark_session : Any, optional
            Session object exposing ``read.parquet``. Required in practice;
            the ``None`` default only exists to produce a clear error.

        Returns
        -------
        Any
            Whatever ``spark_session.read.parquet`` returns.

        Raises
        ------
        ValueError
            If ``spark_session`` is ``None``.
        """
        if spark_session is None:
            raise ValueError("A SparkSession is required.")
        # Forward read_options here as well, so the Spark path honours the
        # documented "forwarded to ... Spark reader" contract instead of
        # silently dropping the options.
        return spark_session.read.parquet(self.filepath, **self.read_options)