Source code for validatex.datasources.csv_source

"""CSV data source."""

from __future__ import annotations

from typing import Any, Dict, Optional

import pandas as pd

from validatex.datasources.base_source import DataSource


[docs] class CSVDataSource(DataSource): """ Load data from a CSV file. Parameters ---------- filepath : str Path to the CSV file. read_options : dict, optional Extra keyword arguments forwarded to ``pd.read_csv`` / Spark reader. name : str, optional A label for this source. """ def __init__( self, filepath: str, read_options: Optional[Dict[str, Any]] = None, name: Optional[str] = None, ): super().__init__(name=name or filepath) self.filepath = filepath self.read_options = read_options or {}
[docs] def load_pandas(self) -> pd.DataFrame: return pd.read_csv(self.filepath, **self.read_options)
[docs] def load_spark(self, spark_session: Any = None) -> Any: if spark_session is None: raise ValueError("A SparkSession is required to load CSV as Spark DataFrame.") reader = spark_session.read.option("header", "true").option("inferSchema", "true") for k, v in self.read_options.items(): reader = reader.option(k, v) return reader.csv(self.filepath)