grizz.utils ¶
Contain utility functions.
grizz.utils.column ¶
Contain DataFrame columns utility functions.
grizz.utils.column.check_column_exist_policy ¶
check_column_exist_policy(exist_policy: str) -> None
Check the policy on how to handle existing columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| exist_policy | str | The policy on how to handle existing columns. | required |

Raises:

| Type | Description |
|---|---|
| ValueError | if exist_policy is not a valid policy. |
Example usage:
>>> from grizz.utils.column import check_column_exist_policy
>>> check_column_exist_policy("ignore")
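The error path can also be checked directly; this is a minimal sketch, assuming "unknown" is not one of the supported policies ('ignore', 'raise', 'warn'):
>>> try:
...     check_column_exist_policy("unknown")
... except ValueError:
...     print("invalid policy")
...
invalid policy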
grizz.utils.column.check_column_missing_policy ¶
check_column_missing_policy(missing_policy: str) -> None
Check the policy on how to handle missing columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| missing_policy | str | The policy on how to handle missing columns. | required |

Raises:

| Type | Description |
|---|---|
| ValueError | if missing_policy is not a valid policy. |
Example usage:
>>> from grizz.utils.column import check_column_missing_policy
>>> check_column_missing_policy("ignore")
grizz.utils.column.check_existing_column ¶
check_existing_column(
frame_or_cols: DataFrame | Sequence,
column: str,
exist_policy: str = "raise",
) -> None
Check if a column already exists.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| column | str | The column to check. | required |
| exist_policy | str | The policy on how to handle existing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnExistsError | if the column already exists and exist_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_existing_column
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_existing_column(frame, "col1", exist_policy="warn")
grizz.utils.column.check_existing_columns ¶
check_existing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence,
exist_policy: str = "raise",
) -> None
Check if some columns already exist.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence | The columns to check. | required |
| exist_policy | str | The policy on how to handle existing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnExistsError | if at least one column already exists and exist_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_existing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_existing_columns(frame, ["col1", "col5"], exist_policy="warn")
grizz.utils.column.check_missing_column ¶
check_missing_column(
frame_or_cols: DataFrame | Sequence,
column: str,
missing_policy: str = "raise",
) -> None
Check if a column is missing.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| column | str | The column to check. | required |
| missing_policy | str | The policy on how to handle missing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnNotFoundError | if the column is missing and missing_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_missing_column
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_missing_column(frame, "col1", missing_policy="warn")
grizz.utils.column.check_missing_columns ¶
check_missing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence,
missing_policy: str = "raise",
) -> None
Check if some columns are missing.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence | The columns to check. | required |
| missing_policy | str | The policy on how to handle missing columns. The following options are available: 'ignore', 'warn', and 'raise'. | 'raise' |

Raises:

| Type | Description |
|---|---|
| ColumnNotFoundError | if at least one column is missing and missing_policy='raise'. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import check_missing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... "col4": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> check_missing_columns(frame, ["col1", "col5"], missing_policy="warn")
grizz.utils.column.find_common_columns ¶
find_common_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence[str],
) -> tuple[str, ...]
Find the common columns that are both in the DataFrame and the given columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence[str] | The columns to check. | required |

Returns:

| Type | Description |
|---|---|
| tuple[str, ...] | The common columns, i.e. the columns that are both in frame_or_cols and columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import find_common_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> cols = find_common_columns(frame, columns=["col1", "col2", "col3", "col4"])
>>> cols
('col1', 'col2', 'col3')
grizz.utils.column.find_missing_columns ¶
find_missing_columns(
frame_or_cols: DataFrame | Sequence,
columns: Sequence[str],
) -> tuple[str, ...]
Find the columns that are in the given columns but not in the DataFrame.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame_or_cols | DataFrame \| Sequence | The DataFrame or its columns. | required |
| columns | Sequence[str] | The columns to check. | required |

Returns:

| Type | Description |
|---|---|
| tuple[str, ...] | The missing columns, i.e. the columns that are in columns but not in frame_or_cols. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.column import find_missing_columns
>>> frame = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["a ", " b", " c ", "d", "e"],
... }
... )
>>> cols = find_missing_columns(frame, columns=["col1", "col2", "col3", "col4"])
>>> cols
('col4',)
grizz.utils.count ¶
Contain utility functions for counting.
grizz.utils.count.compute_nunique ¶
compute_nunique(frame: DataFrame) -> ndarray
Return the number of unique values in each column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| ndarray | An array with the number of unique values in each column. The shape of the array is the number of columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.count import compute_nunique
>>> frame = pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
>>> count = compute_nunique(frame)
>>> count
array([3, 4, 3])
grizz.utils.count.compute_temporal_count ¶
compute_temporal_count(
frame: DataFrame, temporal_column: str, period: str
) -> tuple[ndarray, list[str]]
Compute the number of rows for each temporal window.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, list[str]] | A tuple with the counts and the temporal steps. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.count import compute_temporal_count
>>> counts, steps = compute_temporal_count(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... temporal_column="datetime",
... period="1mo",
... )
>>> counts
array([3, 1, 1, 1])
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.count.compute_temporal_value_counts ¶
compute_temporal_value_counts(
frame: DataFrame,
column: str,
temporal_column: str,
period: str,
drop_nulls: bool = False,
) -> tuple[ndarray, list[str], list[str]]
Compute the value counts for temporal windows of a given column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| column | str | The column to analyze the temporal value counts. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |
| drop_nulls | bool | If True, the null values are dropped before counting the values. | False |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, list[str], list[str]] | A tuple with 3 items. The first item is a 2-d array that indicates the number of occurrences for each value and time step. The first dimension represents the value and the second dimension represents the steps. The second item is the list of time steps. The third item is the list of string representations of the values. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.count import compute_temporal_value_counts
>>> counts, steps, values = compute_temporal_value_counts(
... frame=pl.DataFrame(
... {
... "col1": [None, 1.0, 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... column="col1",
... temporal_column="datetime",
... period="1mo",
... )
>>> counts
array([[1, 0, 0, 0],
[1, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
[1, 0, 0, 0]])
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
>>> values
['0.0', '1.0', '4.2', '42.0', 'null']
grizz.utils.datetime ¶
Contain utility functions for datetime and date objects.
grizz.utils.datetime.find_end_datetime ¶
find_end_datetime(
start: datetime | date,
interval: str | timedelta,
periods: int,
) -> datetime
Find the upper bound of the datetime range from the lower bound of the datetime range, the interval, and the number of periods.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| start | datetime \| date | The lower bound of the datetime range. | required |
| interval | str \| timedelta | The interval of the range periods, specified as a Python timedelta object or using the Polars duration string language. | required |
| periods | int | The number of periods after the start. | required |

Returns:

| Type | Description |
|---|---|
| datetime | The upper bound of the datetime range. |
Notes
The interval is created according to the following string language:
- 1ns (1 nanosecond)
- 1us (1 microsecond)
- 1ms (1 millisecond)
- 1s (1 second)
- 1m (1 minute)
- 1h (1 hour)
- 1d (1 calendar day)
- 1w (1 calendar week)
Example usage:
>>> from datetime import timedelta, datetime, timezone
>>> from grizz.utils.datetime import find_end_datetime
>>> find_end_datetime(
... start=datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc),
... interval=timedelta(hours=1),
... periods=42,
... )
datetime.datetime(2020, 5, 13, 22, 0, tzinfo=datetime.timezone.utc)
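Since interval also accepts the Polars duration strings listed above, the same call can be written with a string interval; this is a minimal sketch, assuming "1h" is equivalent to timedelta(hours=1):
>>> find_end_datetime(
...     start=datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc),
...     interval="1h",
...     periods=42,
... )
datetime.datetime(2020, 5, 13, 22, 0, tzinfo=datetime.timezone.utc)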
grizz.utils.datetime.to_datetime ¶
to_datetime(dt: datetime | date) -> datetime
Convert a date object to a datetime object.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| dt | datetime \| date | The datetime or date object to convert. | required |

Returns:

| Type | Description |
|---|---|
| datetime | The converted datetime object. |
Example usage:
>>> from datetime import datetime, date, timezone
>>> from grizz.utils.datetime import to_datetime
>>> to_datetime(datetime(year=2020, month=5, day=12, hour=4, tzinfo=timezone.utc))
datetime.datetime(2020, 5, 12, 4, 0, tzinfo=datetime.timezone.utc)
>>> to_datetime(date(year=2020, month=5, day=12))
datetime.datetime(2020, 5, 12, 0, 0, tzinfo=datetime.timezone.utc)
grizz.utils.factory ¶
Contain a function to instantiate an object from its configuration.
grizz.utils.factory.setup_object ¶
setup_object(obj_or_config: T | dict) -> T
Set up an object from its configuration.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| obj_or_config | T \| dict | The object or its configuration. | required |

Returns:

| Type | Description |
|---|---|
| T | The instantiated object. |
Example usage:
>>> from grizz.utils.factory import setup_object
>>> obj = setup_object({"_target_": "collections.deque", "iterable": [1, 2, 1, 3]})
>>> obj
deque([1, 2, 1, 3])
>>> setup_object(obj) # Do nothing because the object is already instantiated
deque([1, 2, 1, 3])
grizz.utils.format ¶
Contain utility functions to format strings.
grizz.utils.format.human_byte ¶
human_byte(size: float, decimal: int = 2) -> str
Return a human-readable string representation of byte sizes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| size | float | The number of bytes. | required |
| decimal | int | The number of decimal digits. | 2 |

Returns:

| Type | Description |
|---|---|
| str | The human-readable string representation of byte sizes. |
Example usage:
>>> from grizz.utils.format import human_byte
>>> human_byte(2)
'2.00 B'
>>> human_byte(2048)
'2.00 KB'
>>> human_byte(2097152)
'2.00 MB'
grizz.utils.format.str_boolean_series_stats ¶
str_boolean_series_stats(series: Series) -> str
Return a string about the content of a Boolean series.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| series | Series | The input series. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string about the input series. |

Raises:

| Type | Description |
|---|---|
| ValueError | if series is not a Boolean series. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.format import str_boolean_series_stats
>>> series = pl.Series([True, False, None, None, False, None])
>>> str_boolean_series_stats(series)
true: 1/3 (33.3333 %) | null: 3/6 (50.0000 %)
grizz.utils.format.str_col_diff ¶
str_col_diff(orig: int, final: int) -> str
Return a string that indicates the difference of columns.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | int | The original number of columns. | required |
| final | int | The final number of columns. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of columns. |
Example usage:
>>> from grizz.utils.format import str_col_diff
>>> str_col_diff(100, 10)
90/100 (90.0000 %) columns have been removed
>>> str_col_diff(100, 99)
1/100 (1.0000 %) column has been removed
grizz.utils.format.str_dataframe_diff ¶
str_dataframe_diff(
orig: DataFrame, final: DataFrame
) -> str
Return a string that shows the difference between DataFrames.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | DataFrame | The original DataFrame. | required |
| final | DataFrame | The final DataFrame. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrames. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.format import str_dataframe_diff
>>> frame1 = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": ["1", "2", "3", "4", "5"],
... "col4": ["a", "b", "c", "d", "e"],
... }
... )
>>> frame2 = pl.DataFrame(
... {
... "col1": [1, 2, 3, 4, 5],
... "col2": ["1", "2", "3", "4", "5"],
... "col3": [1.0, 2.0, 3.0, 4.0, 5.0],
... }
... )
>>> print(str_dataframe_diff(orig=frame1, final=frame2))
DataFrame shape: (5, 4) -> (5, 3) | 1/4 (25.0000 %) column has been removed
DataFrame estimated size: 55.00 B -> 85.00 B | difference: 30.00 B (54.5455 %)
grizz.utils.format.str_kwargs ¶
str_kwargs(mapping: Mapping) -> str
Return a string of the input mapping.
This function is designed to be used in __repr__ and __str__ methods.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| mapping | Mapping | The mapping. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string. |
Example usage:
>>> from grizz.utils.format import str_kwargs
>>> str_kwargs({"key1": 1})
', key1=1'
>>> str_kwargs({"key1": 1, "key2": 2})
', key1=1, key2=2'
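As an illustration of the intended use inside a __repr__ method, here is a minimal sketch with a hypothetical class (not part of grizz):
>>> class MyTransformer:
...     def __init__(self, column: str, **kwargs) -> None:
...         self._column = column
...         self._kwargs = kwargs
...     def __repr__(self) -> str:
...         # str_kwargs adds the leading ", " only when kwargs are present
...         return f"{self.__class__.__qualname__}(column={self._column}{str_kwargs(self._kwargs)})"
...
>>> MyTransformer("col1", lowercase=True)
MyTransformer(column=col1, lowercase=True)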
grizz.utils.format.str_row_diff ¶
str_row_diff(orig: int, final: int) -> str
Return a string that indicates the difference of rows.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | int | The original number of rows. | required |
| final | int | The final number of rows. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of rows. |
Example usage:
>>> from grizz.utils.format import str_row_diff
>>> str_row_diff(100, 10)
90/100 (90.0000 %) rows have been removed
>>> str_row_diff(100, 99)
1/100 (1.0000 %) row has been removed
grizz.utils.format.str_shape_diff ¶
str_shape_diff(
orig: tuple[int, int], final: tuple[int, int]
) -> str
Return a string that indicates the difference of DataFrame shapes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | tuple[int, int] | The original shape. | required |
| final | tuple[int, int] | The final shape. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrame shapes. |
Example usage:
>>> from grizz.utils.format import str_shape_diff
>>> str_shape_diff(orig=(100, 10), final=(80, 8))
DataFrame shape: (100, 10) -> (80, 8) | 20/100 (20.0000 %) rows have been removed | 2/10 (20.0000 %) columns have been removed
grizz.utils.format.str_size_diff ¶
str_size_diff(orig: float, final: float) -> str
Return a string that indicates the difference of DataFrame sizes.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| orig | float | The original size. | required |
| final | float | The final size. | required |

Returns:

| Type | Description |
|---|---|
| str | The generated string with the difference of DataFrame sizes. |
Example usage:
>>> from grizz.utils.format import str_size_diff
>>> str_size_diff(orig=100, final=120)
DataFrame estimated size: 100.00 B -> 120.00 B | difference: 20.00 B (20.0000 %)
grizz.utils.hashing ¶
Contain utility functions to compute hash of objects.
grizz.utils.hashing.str_to_sha256 ¶
str_to_sha256(string: str) -> str
Generate the SHA-256 hash of a string.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| string | str | The string to hash. | required |

Returns:

| Type | Description |
|---|---|
| str | The SHA-256 hash. |
Example usage:
>>> from grizz.utils.hashing import str_to_sha256
>>> str_to_sha256("bears are funny")
c97afc5c7f1b598c9f68dc2d6e323b2dd2eaaa31d3a07c98059de6079cbd30e0
grizz.utils.imports ¶
Implement some utility functions to manage optional dependencies.
grizz.utils.imports.check_clickhouse_connect ¶
check_clickhouse_connect() -> None
Check if the clickhouse_connect package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the clickhouse_connect package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_clickhouse_connect
>>> check_clickhouse_connect()
grizz.utils.imports.check_colorlog ¶
check_colorlog() -> None
Check if the colorlog package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the colorlog package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_colorlog
>>> check_colorlog()
grizz.utils.imports.check_pyarrow ¶
check_pyarrow() -> None
Check if the pyarrow package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the pyarrow package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_pyarrow
>>> check_pyarrow()
grizz.utils.imports.check_sklearn ¶
check_sklearn() -> None
Check if the sklearn package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the sklearn package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_sklearn
>>> check_sklearn()
grizz.utils.imports.check_tqdm ¶
check_tqdm() -> None
Check if the tqdm package is installed.
Raises:

| Type | Description |
|---|---|
| RuntimeError | if the tqdm package is not installed. |
Example usage:
>>> from grizz.utils.imports import check_tqdm
>>> check_tqdm()
grizz.utils.imports.clickhouse_connect_available ¶
clickhouse_connect_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the clickhouse_connect package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import clickhouse_connect_available
>>> @clickhouse_connect_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.colorlog_available ¶
colorlog_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the colorlog package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import colorlog_available
>>> @colorlog_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.is_clickhouse_connect_available cached ¶
is_clickhouse_connect_available() -> bool
Indicate if the clickhouse_connect package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the clickhouse_connect package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_clickhouse_connect_available
>>> is_clickhouse_connect_available()
grizz.utils.imports.is_colorlog_available ¶
is_colorlog_available() -> bool
Indicate if the colorlog package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the colorlog package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_colorlog_available
>>> is_colorlog_available()
grizz.utils.imports.is_pyarrow_available cached ¶
is_pyarrow_available() -> bool
Indicate if the pyarrow package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the pyarrow package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_pyarrow_available
>>> is_pyarrow_available()
grizz.utils.imports.is_sklearn_available cached ¶
is_sklearn_available() -> bool
Indicate if the sklearn package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the sklearn package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_sklearn_available
>>> is_sklearn_available()
grizz.utils.imports.is_tqdm_available cached ¶
is_tqdm_available() -> bool
Indicate if the tqdm package is installed or not.
Returns:

| Type | Description |
|---|---|
| bool | True if the tqdm package is installed, otherwise False. |
Example usage:
>>> from grizz.utils.imports import is_tqdm_available
>>> is_tqdm_available()
grizz.utils.imports.pyarrow_available ¶
pyarrow_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the pyarrow package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import pyarrow_available
>>> @pyarrow_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.sklearn_available ¶
sklearn_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the sklearn package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import sklearn_available
>>> @sklearn_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.imports.tqdm_available ¶
tqdm_available(
fn: Callable[..., Any],
) -> Callable[..., Any]
Implement a decorator to execute a function only if the tqdm package is installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fn | Callable[..., Any] | The function to execute. | required |

Returns:

| Type | Description |
|---|---|
| Callable[..., Any] | A wrapper around fn. |
Example usage:
>>> from grizz.utils.imports import tqdm_available
>>> @tqdm_available
... def my_function(n: int = 0) -> int:
... return 42 + n
...
>>> my_function()
grizz.utils.interval ¶
Contain interval utility functions.
grizz.utils.interval.find_time_unit ¶
find_time_unit(interval: str) -> str
Find the time unit associated with a polars interval.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| str | The found time unit. |

Raises:

| Type | Description |
|---|---|
| RuntimeError | if no valid time unit can be found. |
Example usage:
>>> from grizz.utils.interval import find_time_unit
>>> find_time_unit("3d12h4m")
m
>>> find_time_unit("3y5mo")
mo
grizz.utils.interval.interval_to_strftime_format ¶
interval_to_strftime_format(interval: str) -> str
Return the default strftime format for a given interval.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| str | The default strftime format. |
Example usage:
>>> from grizz.utils.interval import interval_to_strftime_format
>>> interval_to_strftime_format("1h")
%Y-%m-%d %H:%M
>>> interval_to_strftime_format("3y1mo")
%Y-%m
grizz.utils.interval.interval_to_timedelta ¶
interval_to_timedelta(interval: str) -> timedelta
Convert an interval to a timedelta object.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| interval | str | The input interval. | required |

Returns:

| Type | Description |
|---|---|
| timedelta | The timedelta object generated from the interval. |
Example usage:
>>> from grizz.utils.interval import interval_to_timedelta
>>> interval_to_timedelta("5d1h42m")
datetime.timedelta(days=5, seconds=6120)
grizz.utils.interval.time_unit_to_strftime_format ¶
time_unit_to_strftime_format(time_unit: str) -> str
Return the default strftime format for a given time unit.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| time_unit | str | The time unit. | required |

Returns:

| Type | Description |
|---|---|
| str | The default strftime format. |
Example usage:
>>> from grizz.utils.interval import time_unit_to_strftime_format
>>> time_unit_to_strftime_format("h")
%Y-%m-%d %H:%M
>>> time_unit_to_strftime_format("mo")
%Y-%m
grizz.utils.logging ¶
Contain utility functions to configure the standard logging library.
grizz.utils.logging.configure_logging ¶
configure_logging(level: int = INFO) -> None
Configure the logging module with a colored formatter.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| level | int | The minimum logging level. | INFO |
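Example usage (a minimal sketch; colored output assumes the optional colorlog package is installed):
>>> import logging
>>> from grizz.utils.logging import configure_logging
>>> configure_logging(level=logging.INFO)
>>> logging.getLogger(__name__).info("logging is configured")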
grizz.utils.nan ¶
Contain utility functions to transform data with NaNs.
grizz.utils.nan.LowNaN ¶
Bases: float
Implement a NaN representation that is always lower than other numbers.
This class is designed to be used to compare numbers with NaN values and should not be used in other cases.
https://docs.python.org/3/library/functions.html#sorted
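A minimal sketch of the intended usage, assuming LowNaN can be constructed like float("nan") and always compares lower than regular numbers:
>>> import math
>>> from grizz.utils.nan import LowNaN
>>> data = [4.0, float("nan"), 2.0, 1.2]
>>> sorted(data, key=lambda x: LowNaN("nan") if math.isnan(x) else x)
[nan, 1.2, 2.0, 4.0]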
grizz.utils.nan.remove_nan ¶
remove_nan(data: T) -> T
Remove the NaN values from the input sequence.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | T | The input sequence. | required |

Returns:

| Type | Description |
|---|---|
| T | The input sequence without NaN values. |
Example usage:
>>> from grizz.utils.nan import remove_nan
>>> data = [float("nan"), float("-inf"), -2, 1.2]
>>> remove_nan(data)
[-inf, -2, 1.2]
grizz.utils.nan.sortnan ¶
sortnan(
iterable: Iterable[bool | float],
/,
*,
reverse: bool = False,
) -> list[bool | float]
Sort a sequence of numeric values with NaN.
This function is an extension of the built-in sorted function. It sees NaN values as equivalent to -infinity when the values are sorted.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable[bool \| float] | The numeric values to sort. | required |
| reverse | bool | If set to True, the values are sorted in descending order. | False |

Returns:

| Type | Description |
|---|---|
| list[bool \| float] | The sorted list. |
Example usage:
>>> from grizz.utils.nan import sortnan
>>> x = [4, float("nan"), 2, 1.2, 7.9, -2]
>>> sorted(x)
[4, nan, -2, 1.2, 2, 7.9]
>>> sortnan(x)
[nan, -2, 1.2, 2, 4, 7.9]
>>> sortnan(x, reverse=True)
[7.9, 4, 2, 1.2, -2, nan]
grizz.utils.noop ¶
Contain no-op functions.
grizz.utils.noop.tqdm ¶
tqdm(
iterable: Iterable, *args: Any, **kwargs: Any
) -> Iterable
Implement a no-op tqdm progressbar that is used when tqdm is not installed.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | Iterable to decorate with a progressbar. | required |
| *args | Any | Positional arbitrary arguments. | () |
| **kwargs | Any | Keyword arbitrary arguments. | {} |

Returns:

| Type | Description |
|---|---|
| Iterable | The input iterable. |
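Example usage (a minimal sketch; since this no-op version simply returns its input, any extra tqdm-style arguments are accepted and ignored):
>>> from grizz.utils.noop import tqdm
>>> list(tqdm(range(5), desc="processing"))
[0, 1, 2, 3, 4]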
grizz.utils.null ¶
Contain utility functions to manipulate null values in DataFrames.
grizz.utils.null.compute_null ¶
compute_null(frame: DataFrame) -> DataFrame
Return the number and percentage of null values per column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | A DataFrame with the number and percentage of null values per column. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import compute_null
>>> frame = compute_null(
... pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
... )
>>> frame
shape: (3, 4)
┌────────┬──────┬───────┬──────────┐
│ column ┆ null ┆ total ┆ null_pct │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 │
╞════════╪══════╪═══════╪══════════╡
│ int ┆ 1 ┆ 4 ┆ 0.25 │
│ float ┆ 1 ┆ 4 ┆ 0.25 │
│ str ┆ 2 ┆ 4 ┆ 0.5 │
└────────┴──────┴───────┴──────────┘
grizz.utils.null.compute_null_count ¶
compute_null_count(frame: DataFrame) -> ndarray
Return the number of null values in each column.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |

Returns:

| Type | Description |
|---|---|
| ndarray | An array with the number of null values in each column. The shape of the array is the number of columns. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import compute_null_count
>>> frame = pl.DataFrame(
... {
... "int": [None, 1, 0, 1],
... "float": [1.2, 4.2, None, 2.2],
... "str": ["A", "B", None, None],
... },
... schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
... )
>>> count = compute_null_count(frame)
>>> count
array([1, 1, 2])
grizz.utils.null.compute_temporal_null_count ¶
compute_temporal_null_count(
frame: DataFrame,
columns: Sequence[str],
temporal_column: str,
period: str,
) -> tuple[ndarray, ndarray, list]
Compute the number of null values per temporal segments.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| columns | Sequence[str] | The list of columns to analyze. | required |
| temporal_column | str | The temporal column used to analyze the temporal distribution. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[ndarray, ndarray, list] | A tuple with 3 values. The first value is a numpy NDArray that contains the number of null values per period. The second value is a numpy NDArray that contains the total number of values. The third value is a list that contains the label of each period. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.null import compute_temporal_null_count
>>> nulls, totals, labels = compute_temporal_null_count(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0],
... "col2": [None, 1, 0, None],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... columns=["col1", "col2"],
... temporal_column="datetime",
... period="1mo",
... )
>>> nulls
array([2, 0, 0, 1])
>>> totals
array([2, 2, 2, 2])
>>> labels
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.null.propagate_nulls ¶
propagate_nulls(
frame: DataFrame, frame_with_null: DataFrame
) -> DataFrame
Propagate the null values from frame_with_null to frame.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The input DataFrame where to add the null values. | required |
| frame_with_null | DataFrame | The DataFrame with the null values to propagate. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | The output DataFrame. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.null import propagate_nulls
>>> frame_with_null = pl.DataFrame(
... {
... "col1": [1, None, 3, float("nan"), 5],
... "col2": ["1", "2", None, "4", "5"],
... "col3": [10, 20, 30, None, 50],
... },
... schema={"col1": pl.Float32, "col2": pl.String, "col3": pl.Int64},
... )
>>> frame = frame_with_null.fill_null(99).fill_nan(99)
>>> frame
shape: (5, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ f32 ┆ str ┆ i64 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 1 ┆ 10 │
│ 99.0 ┆ 2 ┆ 20 │
│ 3.0 ┆ null ┆ 30 │
│ 99.0 ┆ 4 ┆ 99 │
│ 5.0 ┆ 5 ┆ 50 │
└──────┴──────┴──────┘
>>> out = propagate_nulls(frame=frame, frame_with_null=frame_with_null)
>>> out
shape: (5, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ f32 ┆ str ┆ i64 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 1 ┆ 10 │
│ null ┆ 2 ┆ 20 │
│ 3.0 ┆ null ┆ 30 │
│ 99.0 ┆ 4 ┆ null │
│ 5.0 ┆ 5 ┆ 50 │
└──────┴──────┴──────┘
grizz.utils.path ¶
Contain utility functions to manage paths.
grizz.utils.path.find_files ¶
find_files(
path: Path | str,
filter_fn: Callable[[Path], bool],
recursive: bool = True,
) -> list[Path]
Find the paths of all the files in a given path that match the filter function.
This function does not check if a path is a symbolic link, so be careful if you are using a path with symbolic links.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path where to look for the files. | required |
| filter_fn | Callable[[Path], bool] | The path filtering function. The function should return True for the paths to keep and False for the paths to discard. | required |
| recursive | bool | Indicate if it should also check the sub-folders. | True |

Returns:

| Type | Description |
|---|---|
| list[Path] | The list of paths of the matching files. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import find_files
>>> find_files(Path("something"), filter_fn=lambda path: path.name.endswith(".txt"))
[...]
grizz.utils.path.find_parquet_files ¶
find_parquet_files(
path: Path | str, recursive: bool = True
) -> list[Path]
Find the path of all the parquet files in a given path.
This function does not check if a path is a symbolic link, so be careful if you are using a path with symbolic links.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path where to look for the parquet files. | required |
| recursive | bool | Specifies if it should also check the sub-folders. | True |

Returns:

| Type | Description |
|---|---|
| list[Path] | The list of parquet files. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import find_parquet_files
>>> find_parquet_files(Path("something"))
[...]
grizz.utils.path.human_file_size ¶
human_file_size(path: Path | str, decimal: int = 2) -> str
Get a human-readable representation of a file size.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path to the file. | required |
| decimal | int | The number of decimal digits. | 2 |

Returns:

| Type | Description |
|---|---|
| str | The file size in a human-readable format. |
Example usage:
>>> from grizz.utils.path import human_file_size
>>> human_file_size("README.md")
'...B'
grizz.utils.path.sanitize_path ¶
sanitize_path(path: Path | str) -> Path
Sanitize a given path.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | The path to sanitize. | required |

Returns:

| Type | Description |
|---|---|
| Path | The sanitized path. |
Example usage:
>>> from pathlib import Path
>>> from grizz.utils.path import sanitize_path
>>> sanitize_path("something")
PosixPath('.../something')
>>> sanitize_path("")
PosixPath('...')
>>> sanitize_path(Path("something"))
PosixPath('.../something')
>>> sanitize_path(Path("something/./../"))
PosixPath('...')
grizz.utils.series ¶
Contain utility functions for series.
grizz.utils.series.compute_stats_boolean ¶
compute_stats_boolean(series: Series) -> dict[str, float]
Compute some basic statistics about a Boolean series.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| series | Series | The series to analyze. | required |

Returns:

| Type | Description |
|---|---|
| dict[str, float] | The statistics about the input Boolean series. |

Raises:

| Type | Description |
|---|---|
| ValueError | if series is not a Boolean series. |
Example usage:
>>> import polars as pl
>>> from grizz.utils.series import compute_stats_boolean
>>> series = pl.Series([True, False, None, None, False, None])
>>> compute_stats_boolean(series)
{'num_false': 2, 'num_null': 3, 'num_true': 1, 'total': 6}
grizz.utils.sorting ¶
Contain utility functions to sort values from multiple types.
grizz.utils.sorting.mixed_typed_sort ¶
mixed_typed_sort(
iterable: Iterable, /, *, reverse: bool = False
) -> list
Return a new list containing all items from the iterable sorted in ascending order.
This function is an extension of the built-in sorted function that works on a list with multiple types. There is no global order for all types, so the items are sorted only by type. For example, if a list has string and float values, the string values are sorted together and the float values are sorted together. Each type must implement the python sorting interface. The types are sorted by alphabetical order, so in the previous example, the float values are before the string values in the sorted output list. This function uses sortnan to sort numerical values, so it is possible to sort a list with NaNs.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | The data to sort. | required |
| reverse | bool | If set to True, the values are sorted in descending order. | False |

Returns:

| Type | Description |
|---|---|
| list | The sorted data. |
Example usage:
>>> from grizz.utils.sorting import mixed_typed_sort
>>> x = [1, "c", "a", "b", 4, -2]
>>> mixed_typed_sort(x)
[-2, 1, 4, 'a', 'b', 'c']
>>> mixed_typed_sort(x, reverse=True)
[4, 1, -2, 'c', 'b', 'a']
grizz.utils.temporal ¶
Contain utility functions to do temporal transformations.
grizz.utils.temporal.compute_temporal_stats ¶
compute_temporal_stats(
frame: DataFrame,
column: str,
temporal_column: str,
period: str,
) -> DataFrame
Return a DataFrame with stats for each temporal window.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| column | str | The column to analyze. | required |
| temporal_column | str | The temporal column used to create the temporal DataFrames. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| DataFrame | A DataFrame with stats for each temporal window. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import compute_temporal_stats
>>> stats = compute_temporal_stats(
... frame=pl.DataFrame(
... {
... "col": [1.2, 4.2, 0.0, 1.0, 4.2, 42.0],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col": pl.Float64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... column="col",
... temporal_column="datetime",
... period="1mo",
... )
>>> stats
shape: (4, 16)
┌─────────────────────────┬───────┬─────────┬──────┬───┬──────┬──────┬──────┬──────┐
│ step ┆ count ┆ nunique ┆ mean ┆ … ┆ q90 ┆ q95 ┆ q99 ┆ max │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs, UTC] ┆ i64 ┆ i64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════════════════════════╪═══════╪═════════╪══════╪═══╪══════╪══════╪══════╪══════╡
│ 2020-01-01 00:00:00 UTC ┆ 3 ┆ 3 ┆ 1.8 ┆ … ┆ 4.2 ┆ 4.2 ┆ 4.2 ┆ 4.2 │
│ 2020-02-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 1.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │
│ 2020-03-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 4.2 ┆ … ┆ 4.2 ┆ 4.2 ┆ 4.2 ┆ 4.2 │
│ 2020-04-01 00:00:00 UTC ┆ 1 ┆ 1 ┆ 42.0 ┆ … ┆ 42.0 ┆ 42.0 ┆ 42.0 ┆ 42.0 │
└─────────────────────────┴───────┴─────────┴──────┴───┴──────┴──────┴──────┴──────┘
grizz.utils.temporal.to_step_names ¶
to_step_names(groups: GroupBy, period: str) -> list[str]
Return the name of each step.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| groups | GroupBy | The DataFrame grouped by step. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| list[str] | A list that contains the name of each step. |

Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import to_step_names
>>> groups = (
... pl.DataFrame(
... {
... "col": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col": pl.Float64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... )
... .sort("datetime")
... .group_by_dynamic("datetime", every="1mo")
... )
>>> steps = to_step_names(groups=groups, period="1mo")
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']
grizz.utils.temporal.to_temporal_frames ¶
to_temporal_frames(
frame: DataFrame, temporal_column: str, period: str
) -> tuple[list[DataFrame], list[str]]
Return a list of temporal DataFrames and the associated time steps.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| frame | DataFrame | The DataFrame to analyze. | required |
| temporal_column | str | The temporal column used to create the temporal DataFrames. | required |
| period | str | The temporal period, e.g. monthly or daily. | required |

Returns:

| Type | Description |
|---|---|
| tuple[list[DataFrame], list[str]] | A tuple with the temporal DataFrames and the associated temporal steps. |
Example usage:
>>> from datetime import datetime, timezone
>>> import polars as pl
>>> from grizz.utils.temporal import to_temporal_frames
>>> frames, steps = to_temporal_frames(
... frame=pl.DataFrame(
... {
... "col1": [None, float("nan"), 0.0, 1.0, 4.2, 42.0],
... "col2": [None, 1, 0, None, 2, 3],
... "datetime": [
... datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=4, tzinfo=timezone.utc),
... datetime(year=2020, month=1, day=5, tzinfo=timezone.utc),
... datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
... datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
... ],
... },
... schema={
... "col1": pl.Float64,
... "col2": pl.Int64,
... "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
... },
... ),
... temporal_column="datetime",
... period="1mo",
... )
>>> frames
[shape: (3, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ null ┆ null ┆ 2020-01-03 00:00:00 UTC │
│ NaN ┆ 1 ┆ 2020-01-04 00:00:00 UTC │
│ 0.0 ┆ 0 ┆ 2020-01-05 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 1.0 ┆ null ┆ 2020-02-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 4.2 ┆ 2 ┆ 2020-03-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘, shape: (1, 3)
┌──────┬──────┬─────────────────────────┐
│ col1 ┆ col2 ┆ datetime │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ datetime[μs, UTC] │
╞══════╪══════╪═════════════════════════╡
│ 42.0 ┆ 3 ┆ 2020-04-03 00:00:00 UTC │
└──────┴──────┴─────────────────────────┘]
>>> steps
['2020-01', '2020-02', '2020-03', '2020-04']