grizz.utils ¶
Contain utility functions.
grizz.utils.column ¶
Contain DataFrame columns utility functions.
grizz.utils.column.check_column_exist_policy ¶
check_column_exist_policy(exist_policy: str) -> None
Check the policy on how to handle existing columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| exist_policy | str | The policy on how to handle existing columns. | required |

Raises:

| Type | Description |
|---|---|
| ValueError | if exist_policy is not a valid policy. |
Example usage:
>>> from grizz.utils.column import check_column_exist_policy
>>> check_column_exist_policy("ignore")
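The error path can also be checked directly; this is a minimal sketch, assuming "unknown" is not one of the supported policies ('ignore', 'raise', 'warn'):
>>> try:
...     check_column_exist_policy("unknown")
... except ValueError:
...     print("invalid policy")
...
invalid policy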
grizz.utils.column.check_column_missing_policy ¶
check_column_missing_policy(missing_policy: str) -> None
Check the policy on how to handle missing columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| missing_policy | str | The policy on how to handle missing columns. | required |

Raises:

| Type | Description |
|---|---|
| ValueError | if missing_policy is not a valid policy. |
Example usage:
>>> from grizz.utils.column import check_column_missing_policy
>>> check_column_missing_policy("ignore")
grizz.utils.column.check_existing_column ¶
check_existing_column(
frame_or_cols: DataFrame | Sequence,
column: str,
exist_policy: str = "raise",
) -> None
Check if a column already exists.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| column | str | The column to check. | required |
| exist_policy | str | The policy on how to handle existing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnExistsError | if the column already exists and exist_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_existing_column
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_existing_column(frame, "col1", exist_policy="warn")
grizz.utils.column.check_existing_columns ¶
check_existing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence,
exist_policy: str = "raise",
) -> None
Check if some columns already exist.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence | The columns to check. | required |
| exist_policy | str | The policy on how to handle existing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnExistsError | if at least one column already exists and exist_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_existing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_existing_columns(frame, ["col1", "col5"], exist_policy="warn")
grizz.utils.column.check_missing_column ¶
check_missing_column(
frame_or_cols: DataFrame | Sequence,
column: str,
missing_policy: str = "raise",
) -> None
Check if a column is missing.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| column | str | The column to check. | required |
| missing_policy | str | The policy on how to handle missing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnNotFoundError | if the column is missing and missing_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_missing_column
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_missing_column(frame, "col1", missing_policy="warn")
grizz.utils.column.check_missing_columns ¶
check_missing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence,
missing_policy: str = "raise",
) -> None
Check if some columns are missing.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence | The columns to check. | required |
| missing_policy | str | The policy on how to handle missing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnNotFoundError | if at least one column is missing and missing_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_missing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_missing_columns(frame, ["col1", "col5"], missing_policy="warn")
grizz.utils.column.find_common_columns ¶
find_common_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence[str],
) -> tuple[str, ...]
Find the common columns that are both in the DataFrame and the given columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence[str] | The columns to check. | required |

Returns:

| Type | Description |
|---|---|
| tuple[str, ...] | The common columns, i.e. the columns that are both in frame_or_cols and columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import find_common_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> cols = find_common_columns(frame, columns=["col1", "col2", "col3", "col4"])
>>> cols
('col1', 'col2', 'col3')
grizz.utils.column.find_missing_columns ¶
find_missing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence[str],
) -> tuple[str, ...]
Find the columns that are in the given columns but not in the DataFrame.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence[str] | The columns to check. | required |

Returns:

| Type | Description |
|---|---|
| tuple[str, ...] | The missing columns, i.e. the columns that are in columns but not in frame_or_cols. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import find_missing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> cols = find_missing_columns(frame, columns=["col1", "col2", "col3", "col4"])
>>> cols
('col4',)
grizz.utils.count ¶
Contain utility functions for counting.
grizz.utils.count.compute_nunique ¶
compute_nunique(frame: DataFrame) -> ndarray
Return the number of unique values in each column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| ndarray | An array with the number of unique values in each column. The shape of the array is the number of columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.count import compute_nunique
>>> frame = pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
>>> count = compute_nunique(frame)
>>> count
array([3, 4, 3])
grizz.utils.count.compute_temporal_count ¶
compute_temporal_count(
frame: DataFrame, temporal_column: str, period: str
) -> tuple[ndarray, list[str]]
Compute the number of rows for each temporal window.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, list[str]] | A tuple with the counts and the temporal steps. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.count import compute_temporal_count
>>> counts, steps = compute_temporal_count(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... temporal_column="datetime",
... period="1mo",
... )
>>> counts
array([3, 1, 1, 1])
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.count.compute_temporal_value_counts ¶
compute_temporal_value_counts(
frame: DataFrame,
column: str,
temporal_column: str,
period: str,
drop_nulls: bool = False,
) -> tuple[ndarray, list[str], list[str]]
Compute the value counts for temporal windows of a given column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| column | str | The column to analyze the temporal value counts. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |
| drop_nulls | bool | If True, the null values are dropped before counting the values. | False |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, list[str], list[str]] | A tuple with 3 items. The first item is a 2-d array that indicates the number of occurrences for each value and time step. The first dimension represents the value and the second dimension represents the steps. The second item is the list of time steps. The third item is the list of string representations of the values. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.count import compute_temporal_value_counts
>>> counts, steps, values = compute_temporal_value_counts(
... frame=pl.DataFrame(
... {
... "col1": [None, 1.0, 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... column="col1",
... temporal_column="datetime",
... period="1mo",
... )
>>> counts
array([[1, 0, 0, 0],
[1, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
[1, 0, 0, 0]])
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
>>> values
['0.0', '1.0', '4.2', '42.0', 'null']
grizz.utils.datetime ¶
Contain utility functions for datetime and date objects.
grizz.utils.datetime.find_end_datetime ¶
find_end_datetime(
start: datetime | date,
interval: str | timedelta,
periods: int,
) -> datetime
Find the upper bound of the datetime range from the lower bound of the datetime range, the interval, and the number of periods.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| start | datetime \| date | The lower bound of the datetime range. | required |
| interval | str \| timedelta | The interval of the range periods, specified as a Python timedelta object or using the Polars duration string language. | required |
| periods | int | The number of periods after the start. | required |

Returns:

| Type | Description |
|---|---|
| datetime | The upper bound of the datetime range. |
Notes
The interval is created according to the following string language:
- 1ns (1 nanosecond)
- 1us (1 microsecond)
- 1ms (1 millisecond)
- 1s (1 second)
- 1m (1 minute)
- 1h (1 hour)
- 1d (1 calendar day)
- 1w (1 calendar week)
Example usage:
>>> from datetime import timedelta, datetime, timezone
>>> from grizz.utils.datetime import find_end_datetime
>>> find_end_datetime(
... start=datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc),
... interval=timedelta(hours=1),
... periods=42,
... )
datetime.datetime(2020, 5, 13, 22, 0, tzinfo=datetime.timezone.utc)
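Since interval also accepts the Polars duration strings listed above, the same call can be written with a string interval; this is a minimal sketch, assuming "1h" is equivalent to timedelta(hours=1):
>>> find_end_datetime(
...     start=datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc),
...     interval="1h",
...     periods=42,
... )
datetime.datetime(2020, 5, 13, 22, 0, tzinfo=datetime.timezone.utc)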
grizz.utils.datetime.to_datetime ¶
to_datetime(dt: datetime | date) -> datetime
Convert a date object to a datetime object.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| dt | datetime \| date | The datetime or date object to convert. | required |

Returns:

| Type | Description |
|---|---|
| datetime | The converted datetime object. |
Example usage:
>>> from datetime import datetime, date, timezone
>>> from grizz.utils.datetime import to_datetime
>>> to_datetime(datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc))
datetime.datetime(2020, 5, 12, 4, 0, tzinfo=datetime.timezone.utc)
>>> to_datetime(date(year=2020, month=5, day=12))
datetime.datetime(2020, 5, 12, 0, 0, tzinfo=datetime.timezone.utc)
grizz.utils.factory ¶
Contain a function to instantiate an object from its configuration.
grizz.utils.factory.setup_object ¶
setup_object(obj_or_config: T | dict) -> T
Set up an object from its configuration.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| obj_or_config | T \| dict | The object or its configuration. | required |

Returns:

| Type | Description |
|---|---|
| T | The instantiated object. |
Example usage:
>>> from grizz.utils.factory import setup_object
>>> obj = setup_object({"_target_": "collections.deque", "iterable": [1, 2, 1, 3]})
>>> obj
deque([1, 2, 1, 3])
>>> setup_object(obj) # Do nothing because the object is already instantiated
deque([1, 2, 1, 3])
grizz.utils.format ¶
Contain utility functions to format strings.
grizz.utils.format.human_byte ¶
human_byte(size: float, decimal: int = 2) -> str
Return a human-readable string representation of byte sizes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| size | float | The number of bytes. | required |
| decimal | int | The number of decimal digits. | 2 |

Returns:

| Type | Description |
|---|---|
| str | The human-readable string representation of byte sizes. |
Example usage:
>>> from grizz.utils.format import human_byte
>>> human_byte(2)
'2.00 B'
>>> human_byte(2048)
'2.00 KB'
>>> human_byte(2097152)
'2.00 MB'
grizz.utils.format.str_boolean_series_stats ¶
str_boolean_series_stats(series: Series) -> str
Return a string about the content of a Boolean series.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| series | Series | The input series. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string about the input series. |

Raises:

| Type | Description |
|---|---|
| ValueError | if series is not a Boolean series. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.format import str_boolean_series_stats
>>> series = pl.Series([True, False, None, None, False, None])
>>> str_boolean_series_stats(series)
true: 1/3 (33.3333 %) | null: 3/6 (50.0000 %)
grizz.utils.format.str_col_diff ¶
str_col_diff(orig: int, final: int) -> str
Return a string that indicates the difference of columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | int | The original number of columns. | required |
| final | int | The final number of columns. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of columns. |
Example usage:
>>> from grizz.utils.format import str_col_diff
>>> str_col_diff(100, 10)
90/100 (90.0000 %) columns have been removed
>>> str_col_diff(100, 99)
1/100 (1.0000 %) column has been removed
grizz.utils.format.str_dataframe_diff ¶
str_dataframe_diff(
orig: DataFrame, final: DataFrame
) -> str
Return a string that shows the difference between DataFrames.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | DataFrame | The original DataFrame. | required |
| final | DataFrame | The final DataFrame. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrames. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.format import str_dataframe_diff
>>> frame1 = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["1", "2", "3", "4", "5"],
... "col4": ["a", "b", "c", "d", "e"],
... }
... )
>>> frame2 = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": [1.0, 2.0, 3.0, 4.0, 5.0],
... }
... )
>>> print(str_dataframe_diff(orig=frame1, final=frame2))
DataFrame shape: (5, 4) -> (5, 3) | 1/4 (25.0000 %) column has been removed
DataFrame estimated size: 55.00 B -> 85.00 B | difference: 30.00 B (54.5455 %)
grizz.utils.format.str_kwargs ¶
str_kwargs(mapping: Mapping) -> str
Return a string of the input mapping.
This function is designed to be used in __repr__ and __str__ methods.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| mapping | Mapping | The mapping. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string. |
Example usage:
>>> from grizz.utils.format import str_kwargs
>>> str_kwargs({"key1": 1})
', key1=1'
>>> str_kwargs({"key1": 1, "key2": 2})
', key1=1, key2=2'
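As an illustration of the intended use inside a __repr__ method, here is a minimal sketch with a hypothetical class (not part of grizz):
>>> class MyTransformer:
...     def __init__(self, column: str, **kwargs) -> None:
...         self._column = column
...         self._kwargs = kwargs
...     def __repr__(self) -> str:
...         # str_kwargs adds the leading ", " only when kwargs are present
...         return f"{self.__class__.__qualname__}(column={self._column}{str_kwargs(self._kwargs)})"
...
>>> MyTransformer("col1", lowercase=True)
MyTransformer(column=col1, lowercase=True)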
grizz.utils.format.str_row_diff ¶
str_row_diff(orig: int, final: int) -> str
Return a string that indicates the difference of rows.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | int | The original number of rows. | required |
| final | int | The final number of rows. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of rows. |
Example usage:
>>> from grizz.utils.format import str_row_diff
>>> str_row_diff(100, 10)
90/100 (90.0000 %) rows have been removed
>>> str_row_diff(100, 99)
1/100 (1.0000 %) row has been removed
grizz.utils.format.str_shape_diff ¶
str_shape_diff(
orig: tuple[int, int], final: tuple[int, int]
) -> str
Return a string that indicates the difference of DataFrame shapes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | tuple[int, int] | The original shape. | required |
| final | tuple[int, int] | The final shape. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrame shapes. |
Example usage:
>>> from grizz.utils.format import str_shape_diff
>>> str_shape_diff(orig=(100, 10), final=(80, 8))
DataFrame shape: (100, 10) -> (80, 8) | 20/100 (20.0000 %) rows have been removed | 2/10 (20.0000 %) columns have been removed
grizz.utils.format.str_size_diff ¶
str_size_diff(orig: float, final: float) -> str
Return a string that indicates the difference of DataFrame sizes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | float | The original size. | required |
| final | float | The final size. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrame sizes. |
Example usage:
>>> from grizz.utils.format import str_size_diff
>>> str_size_diff(orig=100, final=120)
DataFrame estimated size: 100.00 B -> 120.00 B | difference: 20.00 B (20.0000 %)
grizz.utils.hashing ¶
Contain utility functions to compute hash of objects.
grizz.utils.hashing.str_to_sha256 ¶
str_to_sha256(string: str) -> str
Generate the SHA-256 hash of a string.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| string | str | The string to hash. | required |

Returns:

| Type | Description |
|---|---|
| str | The SHA-256 hash. |
Example usage:
>>> from grizz.utils.hashing import str_to_sha256
>>> str_to_sha256("bears are funny")
c97afc5c7f1b598c9f68dc2d6e323b2dd2eaaa31d3a07c98059de6079cbd30e0
grizz.utils.imports ¶
Implement some utility functions to manage optional dependencies.
grizz.utils.imports.check_clickhouse_connect ¶
check_clickhouse_connect() -> None
Check if the clickhouse_connect package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the clickhouse_connect package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_clickhouse_connect
>>> check_clickhouse_connect()
grizz.utils.imports.check_colorlog ¶
check_colorlog() -> None
Check if the colorlog package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the colorlog package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_colorlog
>>> check_colorlog()
grizz.utils.imports.check_pyarrow ¶
check_pyarrow() -> None
Check if the pyarrow package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the pyarrow package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_pyarrow
>>> check_pyarrow()
grizz.utils.imports.check_sklearn ¶
check_sklearn() -> None
Check if the sklearn package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the sklearn package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_sklearn
>>> check_sklearn()
grizz.utils.imports.check_tqdm ¶
check_tqdm() -> None
Check if the tqdm package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the tqdm package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_tqdm
>>> check_tqdm()
grizz.utils.imports.clickhouse_connect_available ¶
clickhouse_connect_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the clickhouse_connect package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import clickhouse_connect_available
>>> @clickhouse_connect_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.colorlog_available ¶
colorlog_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the colorlog package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import colorlog_available
>>> @colorlog_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.is_clickhouse_connect_available cached ¶
is_clickhouse_connect_available() -> bool
Indicate if the clickhouse_connect package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the clickhouse_connect package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_clickhouse_connect_available
>>> is_clickhouse_connect_available()
grizz.utils.imports.is_colorlog_available ¶
is_colorlog_available() -> bool
Indicate if the colorlog package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the colorlog package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_colorlog_available
>>> is_colorlog_available()
grizz.utils.imports.is_pyarrow_available cached ¶
is_pyarrow_available() -> bool
Indicate if the pyarrow package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the pyarrow package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_pyarrow_available
>>> is_pyarrow_available()
grizz.utils.imports.is_sklearn_available cached ¶
is_sklearn_available() -> bool
Indicate if the sklearn package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the sklearn package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_sklearn_available
>>> is_sklearn_available()
grizz.utils.imports.is_tqdm_available cached ¶
is_tqdm_available() -> bool
Indicate if the tqdm package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the tqdm package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_tqdm_available
>>> is_tqdm_available()
grizz.utils.imports.pyarrow_available ¶
pyarrow_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the pyarrow package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import pyarrow_available
>>> @pyarrow_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.sklearn_available ¶
sklearn_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the sklearn package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import sklearn_available
>>> @sklearn_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.tqdm_available ¶
tqdm_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the tqdm package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import tqdm_available
>>> @tqdm_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.interval ¶
Contain interval utility functions.
grizz.utils.interval.find_time_unit ¶
find_time_unit(interval: str) -> str
Find the time unit associated with a polars interval.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| str | The found time unit. |

Raises:

| Type | Description |
|---|---|
| RuntimeError | if no valid time unit can be found. |
Example usage:
>>> from grizz.utils.interval import find_time_unit
>>> find_time_unit("3d12h4m")
m
>>> find_time_unit("3y5mo")
mo
grizz.utils.interval.interval_to_strftime_format ¶
interval_to_strftime_format(interval: str) -> str
Return the default strftime format for a given interval.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| str | The default strftime format. |
Example usage:
>>> from grizz.utils.interval import interval_to_strftime_format
>>> interval_to_strftime_format("1h")
%Y-%m-%d %H:%M
>>> interval_to_strftime_format("3y1mo")
%Y-%m
grizz.utils.interval.interval_to_timedelta ¶
interval_to_timedelta(interval: str) -> timedelta
Convert an interval to a timedelta object.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| timedelta | The timedelta object generated from the interval. |
Example usage:
>>> from grizz.utils.interval import interval_to_timedelta
>>> interval_to_timedelta("5d1h42m")
datetime.timedelta(days=5, seconds=6120)
grizz.utils.interval.time_unit_to_strftime_format ¶
time_unit_to_strftime_format(time_unit: str) -> str
Return the default strftime format for a given time unit.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| time_unit | str | The time unit. | required |

Returns:

| Type | Description |
|---|---|
| str | The default strftime format. |
Example usage:
>>> from grizz.utils.interval import time_unit_to_strftime_format
>>> time_unit_to_strftime_format("h")
%Y-%m-%d %H:%M
>>> time_unit_to_strftime_format("mo")
%Y-%m
grizz.utils.logging ¶
Contain utility functions to configure the standard logging library.
grizz.utils.logging.configure_logging ¶
configure_logging(level: int = INFO) -> None
Configure the logging module with a colored formatter.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| level | int | The minimum logging level. | INFO |
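Example usage (a minimal sketch; colored output assumes the optional colorlog package is installed):
>>> import logging
>>> from grizz.utils.logging import configure_logging
>>> configure_logging(level=logging.INFO)
>>> logging.getLogger(__name__).info("logging is configured")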
grizz.utils.nan ¶
Contain utility functions to transform data with NaNs.
grizz.utils.nan.LowNaN ¶
Bases: float
Implement a NaN representation that is always lower than other numbers.
This class is designed to be used to compare numbers with NaN values and should not be used in other cases.
https://docs.python.org/3/library/functions.html#sorted
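A minimal sketch of the intended usage, assuming LowNaN can be constructed like float("nan") and always compares lower than regular numbers:
>>> import math
>>> from grizz.utils.nan import LowNaN
>>> data = [4.0, float("nan"), 2.0, 1.2]
>>> sorted(data, key=lambda x: LowNaN("nan") if math.isnan(x) else x)
[nan, 1.2, 2.0, 4.0]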
grizz.utils.nan.remove_nan ¶
remove_nan(data: T) -> T
Remove the NaN values from the input sequence.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | T | The input sequence. | required |

Returns:

| Type | Description |
|---|---|
| T | The input sequence without NaN values. |
Example usage:
>>> from grizz.utils.nan import remove_nan
>>> data = [float("nan"), float("-inf"), -2, 1.2]
>>> remove_nan(data)
[-inf, -2, 1.2]
grizz.utils.nan.sortnan ¶
sortnan(
iterable: Iterable[bool | float],
/,
*,
reverse: bool = False,
) -> list[bool | float]
Sort a sequence of numeric values with NaN.
This function is an extension of the built-in sorted function. It sees NaN values as equivalent to -infinity when the values are sorted.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable[bool \| float] | The numeric values to sort. | required |
| reverse | bool | If set to True, the values are sorted in descending order. | False |

Returns:

| Type | Description |
|---|---|
| list[bool \| float] | The sorted list. |
Example usage:
>>> from grizz.utils.nan import sortnan
>>> x = [4, float("nan"), 2, 1.2, 7.9, -2]
>>> sorted(x)
[4, nan, -2, 1.2, 2, 7.9]
>>> sortnan(x)
[nan, -2, 1.2, 2, 4, 7.9]
>>> sortnan(x, reverse=True)
[7.9, 4, 2, 1.2, -2, nan]
grizz.utils.noop ¶
Contain no-op functions.
grizz.utils.noop.tqdm ¶
tqdm(
iterable: Iterable, *args: Any, **kwargs: Any
) -> Iterable
Implement a no-op tqdm progressbar that is used when tqdm is not installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | Iterable to decorate with a progressbar. | required |
| *args | Any | Positional arbitrary arguments. | () |
| **kwargs | Any | Keyword arbitrary arguments. | {} |

Returns:

| Type | Description |
|---|---|
| Iterable | The input iterable. |
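Example usage (a minimal sketch; since this no-op version simply returns its input, any extra tqdm-style arguments are accepted and ignored):
>>> from grizz.utils.noop import tqdm
>>> list(tqdm(range(5), desc="processing"))
[0, 1, 2, 3, 4]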
grizz.utils.null ¶
Contain utility functions to manipulate null values in DataFrames.
grizz.utils.null.compute_null ¶
compute_null(frame: DataFrame) -> DataFrame
Return the number and percentage of null values per column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | A DataFrame with the number and percentage of null values per column. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import compute_null
>>> frame = compute_null(
... pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
... )
>>> frame
shape: (3, 4)
┌────────┬──────┬───────┬──────────┐
│ column ┆ null ┆ total ┆ null_pct │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 │
╞════════╪══════╪═══════╪══════════╡
│ int ┆ 1 ┆ 4 ┆ 0.25 │
│ float ┆ 1 ┆ 4 ┆ 0.25 │
│ str ┆ 2 ┆ 4 ┆ 0.5 │
└────────┴──────┴───────┴──────────┘
grizz.utils.null.compute_null_count ¶
compute_null_count(frame: DataFrame) -> ndarray
Return the number of null values in each column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| ndarray | An array with the number of null values in each column. The shape of the array is the number of columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import compute_null_count
>>> frame = pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
>>> count = compute_null_count(frame)
>>> count
array([1, 1, 2])
grizz.utils.null.compute_temporal_null_count ¶
compute_temporal_null_count(
frame: DataFrame,
columns: Sequence[str],
temporal_column: str,
period: str,
) -> tuple[ndarray, ndarray, list]
Compute the number of null values per temporal segments.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| columns | Sequence[str] | The list of columns to analyze. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, ndarray, list] | A tuple with 3 values. The first value is a numpy NDArray that contains the number of null values per period. The second value is a numpy NDArray that contains the total number of values. The third value is a list that contains the label of each period. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.null import compute_temporal_null_count
>>> nulls, totals, labels = compute_temporal_null_count(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0],
... "col2": [None, 1, 0, None],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... columns=["col1", "col2"],
... temporal_column="datetime",
... period="1mo",
... )
>>> nulls
array([2, 0, 0, 1])
>>> totals
array([2, 2, 2, 2])
>>> labels
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.null.propagate_nulls ¶
propagate_nulls(
frame: DataFrame, frame_with_null: DataFrame
) -> DataFrame
Propagate the null values from frame_with_null to frame.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The input DataFrame where to add the null values. | required |
| frame_with_null | DataFrame | The DataFrame with the null values to propagate. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | The output DataFrame. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import propagate_nulls
>>> frame_with_null = pl.DataFrame(
... {
... "col1": [1, None, 3, float("nan"), 5],
... "col2": ["1", "2", None, "4", "5"],
... "col3": [10, 20, 30, None, 50],
... },
... schema={"col1": pl.Float32, "col2": pl.String, "col3": pl.Int64},
... )
>>> frame = frame_with_null.fill_null(99).fill_nan(99)
>>> frame
shape: (5, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ f32 ┆ str ┆ i64 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 1 ┆ 10 │
│ 99.0 ┆ 2 ┆ 20 │
│ 3.0 ┆ null ┆ 30 │
│ 99.0 ┆ 4 ┆ 99 │
│ 5.0 ┆ 5 ┆ 50 │
└──────┴──────┴──────┘
>>> out = propagate_nulls(frame=frame, frame_with_null=frame_with_null)
>>> out
shape: (5, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ f32 ┆ str ┆ i64 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 1 ┆ 10 │
│ null ┆ 2 ┆ 20 │
│ 3.0 ┆ null ┆ 30 │
│ 99.0 ┆ 4 ┆ null │
│ 5.0 ┆ 5 ┆ 50 │
└──────┴──────┴──────┘
grizz.utils.path ¶
Contain utility functions to manage paths.
grizz.utils.path.find_files ¶
find_files(
path: Path | str,
filter_fn: Callable[[Path], bool],
recursive: bool = True,
) -> list[Path]
Find the paths of all the files in a given path that match the filter function.
This function does not check if a path is a symbolic link, so be careful if you are using a path with symbolic links.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path where to look for the files. | required |
| filter_fn | Callable[[Path], bool] | The path filtering function. The function should return True for the paths to keep and False for the paths to discard. | required |
| recursive | bool | Indicate if it should also check the sub-folders. | True |

Returns:

| Type | Description |
|---|---|
| list[Path] | The list of paths of the matching files. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import find_files
>>> find_files(Path("something"), filter_fn=lambda path: path.name.endswith(".txt"))
[...]
grizz.utils.path.find_parquet_files ¶
find_parquet_files(
path: Path | str, recursive: bool = True
) -> list[Path]
Find the path of all the parquet files in a given path.
This function does not check if a path is a symbolic link, so be careful if you are using a path with symbolic links.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path where to look for the parquet files. | required |
| recursive | bool | Specifies if it should also check the sub-folders. | True |

Returns:

| Type | Description |
|---|---|
| list[Path] | The list of parquet files. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import find_parquet_files
>>> find_parquet_files(Path("something"))
[...]
grizz.utils.path.human_file_size ¶
human_file_size(path: Path | str, decimal: int = 2) -> str
Get a human-readable representation of a file size.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path to the file. | required |
| decimal | int | The number of decimal digits. | 2 |

Returns:

| Type | Description |
|---|---|
| str | The file size in a human-readable format. |
Example usage:
>>> from grizz.utils.path import human_file_size
>>> human_file_size("README.md")
'...B'
grizz.utils.path.sanitize_path ¶
sanitize_path(path: Path | str) -> Path
Sanitize a given path.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path to sanitize. | required |

Returns:

| Type | Description |
|---|---|
| Path | The sanitized path. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import sanitize_path
>>> sanitize_path("something")
PosixPath('.../something')
>>> sanitize_path("")
PosixPath('...')
>>> sanitize_path(Path("something"))
PosixPath('.../something')
>>> sanitize_path(Path("something/./../"))
PosixPath('...')
grizz.utils.series ¶
Contain utility functions for series.
grizz.utils.series.compute_stats_boolean ¶
compute_stats_boolean(series: Series) -> dict[str, float]
Compute some basic statistics about a Boolean series.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| series | Series | The series to analyze. | required |

Returns:

| Type | Description |
|---|---|
| dict[str, float] | The statistics about the input Boolean series. |

Raises:

| Type | Description |
|---|---|
| ValueError | if series is not a Boolean series. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.series import compute_stats_boolean
>>> series = pl.Series([True, False, None, None, False, None])
>>> compute_stats_boolean(series)
{'num_false': 2, 'num_null': 3, 'num_true': 1, 'total': 6}
grizz.utils.sorting ¶
Contain utility functions to sort values from multiple types.
grizz.utils.sorting.mixed_typed_sort ¶
mixed_typed_sort(
iterable: Iterable, /, *, reverse: bool = False
) -> list
Return a new list containing all items from the iterable sorted in ascending order.
This function is an extension of the built-in sorted function that works on a list with multiple types. There is no global order for all types, so the items are sorted only by type. For example, if a list has string and float values, the string values are sorted together and the float values are sorted together. Each type must implement the python sorting interface. The types are sorted by alphabetical order, so in the previous example, the float values are before the string values in the sorted output list. This function uses sortnan to sort numerical values, so it is possible to sort a list with NaNs.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | The data to sort. | required |
| reverse | bool | If set to True, the values are sorted in descending order. | False |

Returns:

| Type | Description |
|---|---|
| list | The sorted data. |
Example usage:
>>> from grizz.utils.sorting import mixed_typed_sort
>>> x = [1, "c", "a", "b", 4, -2]
>>> mixed_typed_sort(x)
[-2, 1, 4, 'a', 'b', 'c']
>>> mixed_typed_sort(x, reverse=True)
[4, 1, -2, 'c', 'b', 'a']
grizz.utils.temporal ¶
Contain utility functions to do temporal transformations.
grizz.utils.temporal.compute_temporal_stats ¶
compute_temporal_stats(
frame: DataFrame,
column: str,
temporal_column: str,
period: str,
) -> DataFrame
Return a DataFrame with stats for each temporal window.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| column | str | The column to analyze. | required |
| temporal_column | str | The temporal column used to create the temporal DataFrames. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | A DataFrame with stats for each temporal window. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import compute_temporal_stats
>>> stats = compute_temporal_stats(
... frame=pl.DataFrame(
... {
... "col": [1.2, 4.2, 0.0, 1.0, 4.2, 42.0],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col": pl.Float64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... column="col",
... temporal_column="datetime",
... period="1mo",
... )
>>> stats
shape: (4, 16)
┌─────────────────────────┬───────┬─────────┬──────┬───┬──────┬──────┬──────┬──────┐
│ step ┆ count ┆ nunique ┆ mean ┆ … ┆ q90 ┆ q95 ┆ q99 ┆ max │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs, UTC] ┆ i64 ┆ i64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════════════════════════╪═══════╪═════════╪══════╪═══╪══════╪══════╪══════╪══════╡
│ 2020-01-01 00:00:00 UTC ┆ 3 ┆ 3 ┆ 1.8 ┆ … ┆ 4.2 ┆ 4.2 ┆ 4.2 ┆ 4.2 │
│ 2020-02-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 1.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │
│ 2020-03-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 4.2 ┆ … ┆ 4.2 ┆ 4.2 ┆ 4.2 ┆ 4.2 │
│ 2020-04-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 42.0 ┆ … ┆ 42.0 ┆ 42.0 ┆ 42.0 ┆ 42.0 │
└─────────────────────────┴───────┴─────────┴──────┴───┴──────┴──────┴──────┴──────┘
grizz.utils.temporal.to_step_names ¶
to_step_names(groups: GroupBy, period: str) -> list[str]
Return the name of each step.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| groups | GroupBy | The DataFrame grouped by step. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| list[str] | A list that contains the name of each step. |

Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import to_step_names
>>> groups = (
... pl.DataFrame(
... {
... "col": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col": pl.Float64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... )
... .sort("datetime")
... .group_by_dynamic("datetime", every="1mo")
... )
>>> steps = to_step_names(groups=groups, period="1mo")
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.temporal.to_temporal_frames ¶
to_temporal_frames(
frame: DataFrame, temporal_column: str, period: str
) -> tuple[list[DataFrame], list[str]]
Return a list of temporal DataFrames and the associated time steps.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| temporal_column | str | The temporal column used to create the temporal DataFrames. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[list[DataFrame], list[str]] | A tuple with the temporal DataFrames and the associated temporal steps. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import to_temporal_frames
>>> frames, steps = to_temporal_frames(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... temporal_column="datetime",
... period="1mo",
... )
>>> frames
[shape: (3, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ null ┆ null ┆ 2020-01-03 00:00:00 UTC │
│ NaN ┆ 1 ┆ 2020-01-04 00:00:00 UTC │
│ 0.0 ┆ 0 ┆ 2020-01-05 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 1.0 ┆ null ┆ 2020-02-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 4.2 ┆ 2 ┆ 2020-03-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 42.0 ┆ 3 ┆ 2020-04-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘]
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']