Skip to content

Dataset

Overview

The dataset is an abstraction to manage collections of shards organized into splits (e.g., train, validation, test) and assets (e.g., metadata, statistics). iden provides a simple and flexible way to organize and access your machine learning datasets without loading all data into memory at once.

Creating a dataset

A dataset can be created using the create_vanilla_dataset function or directly using the VanillaDataset class.

Using create_vanilla_dataset

The create_vanilla_dataset function is the recommended way to create datasets:

>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     # Create shards using create_shard_tuple
...     train_shards = create_shard_tuple(
...         [
...             create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("train1").as_uri()),
...             create_json_shard([4, 5, 6], uri=Path(tmpdir).joinpath("train2").as_uri()),
...         ],
...         uri=Path(tmpdir).joinpath("train_tuple").as_uri(),
...     )
...     # Create shards dictionary
...     shards = create_shard_dict(
...         shards={"train": train_shards},
...         uri=Path(tmpdir).joinpath("shards").as_uri(),
...     )
...     # Create assets (can be empty)
...     assets = create_shard_dict(
...         shards={},
...         uri=Path(tmpdir).joinpath("assets").as_uri(),
...     )
...     # Create dataset
...     dataset = create_vanilla_dataset(
...         shards=shards,
...         assets=assets,
...         uri=Path(tmpdir).joinpath("dataset").as_uri(),
...     )
...     dataset
...
VanillaDataset(
  (uri): file:///.../dataset
  (shards): ShardDict(...)
  (assets): ShardDict(...)
)

Accessing shards

You can access shards by split name using the get_shards method, which returns a tuple of shards:

>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     train_shards = create_shard_tuple(
...         [
...             create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("train1").as_uri()),
...             create_json_shard([4, 5, 6], uri=Path(tmpdir).joinpath("train2").as_uri()),
...         ],
...         uri=Path(tmpdir).joinpath("train_tuple").as_uri(),
...     )
...     val_shards = create_shard_tuple(
...         [create_json_shard([7, 8, 9], uri=Path(tmpdir).joinpath("val1").as_uri())],
...         uri=Path(tmpdir).joinpath("val_tuple").as_uri(),
...     )
...     shards = create_shard_dict(
...         shards={"train": train_shards, "val": val_shards},
...         uri=Path(tmpdir).joinpath("shards").as_uri(),
...     )
...     assets = create_shard_dict(
...         shards={},
...         uri=Path(tmpdir).joinpath("assets").as_uri(),
...     )
...     dataset = create_vanilla_dataset(
...         shards=shards,
...         assets=assets,
...         uri=Path(tmpdir).joinpath("dataset").as_uri(),
...     )
...     # Get training shards
...     train_data = dataset.get_shards("train")
...     # Access individual shards by index
...     first_shard = train_data[0]
...     first_shard.get_data()
...
[1, 2, 3]

Working with assets

Assets allow you to store metadata, statistics, or other auxiliary information alongside your dataset:

>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     # Create minimal shards
...     shards = create_shard_dict(
...         shards={
...             "train": create_shard_tuple(
...                 [
...                     create_json_shard(
...                         [1, 2, 3], uri=Path(tmpdir).joinpath("data").as_uri()
...                     )
...                 ],
...                 uri=Path(tmpdir).joinpath("train_tuple").as_uri(),
...             )
...         },
...         uri=Path(tmpdir).joinpath("shards").as_uri(),
...     )
...     # Create assets
...     assets = create_shard_dict(
...         shards={
...             "metadata": create_json_shard(
...                 {"version": "1.0", "num_samples": 1000},
...                 uri=Path(tmpdir).joinpath("metadata").as_uri(),
...             ),
...             "stats": create_json_shard(
...                 {"mean": 0.5, "std": 0.25},
...                 uri=Path(tmpdir).joinpath("stats").as_uri(),
...             ),
...         },
...         uri=Path(tmpdir).joinpath("assets").as_uri(),
...     )
...     dataset = create_vanilla_dataset(
...         shards=shards,
...         assets=assets,
...         uri=Path(tmpdir).joinpath("dataset").as_uri(),
...     )
...     # Access assets
...     metadata = dataset.get_asset("metadata")
...     metadata.get_data()
...
{'version': '1.0', 'num_samples': 1000}

Available splits

You can check which splits are available in your dataset using get_splits():

>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     shards = create_shard_dict(
...         shards={
...             "train": create_shard_tuple(
...                 [
...                     create_json_shard(
...                         [1, 2, 3], uri=Path(tmpdir).joinpath("train").as_uri()
...                     )
...                 ],
...                 uri=Path(tmpdir).joinpath("train_tuple").as_uri(),
...             ),
...             "val": create_shard_tuple(
...                 [
...                     create_json_shard(
...                         [4, 5, 6], uri=Path(tmpdir).joinpath("val").as_uri()
...                     )
...                 ],
...                 uri=Path(tmpdir).joinpath("val_tuple").as_uri(),
...             ),
...             "test": create_shard_tuple(
...                 [
...                     create_json_shard(
...                         [7, 8, 9], uri=Path(tmpdir).joinpath("test").as_uri()
...                     )
...                 ],
...                 uri=Path(tmpdir).joinpath("test_tuple").as_uri(),
...             ),
...         },
...         uri=Path(tmpdir).joinpath("shards").as_uri(),
...     )
...     assets = create_shard_dict(shards={}, uri=Path(tmpdir).joinpath("assets").as_uri())
...     dataset = create_vanilla_dataset(
...         shards=shards,
...         assets=assets,
...         uri=Path(tmpdir).joinpath("dataset").as_uri(),
...     )
...     sorted(dataset.get_splits())
...
['test', 'train', 'val']

Saving and loading datasets

Datasets are automatically saved when created with create_vanilla_dataset and can be loaded using load_from_uri:

>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset, load_from_uri
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     # Create and save dataset
...     shards = create_shard_dict(
...         shards={
...             "train": create_shard_tuple(
...                 [
...                     create_json_shard(
...                         [1, 2, 3], uri=Path(tmpdir).joinpath("train").as_uri()
...                     )
...                 ],
...                 uri=Path(tmpdir).joinpath("train_tuple").as_uri(),
...             )
...         },
...         uri=Path(tmpdir).joinpath("shards").as_uri(),
...     )
...     assets = create_shard_dict(shards={}, uri=Path(tmpdir).joinpath("assets").as_uri())
...     dataset = create_vanilla_dataset(
...         shards=shards,
...         assets=assets,
...         uri=Path(tmpdir).joinpath("dataset").as_uri(),
...     )
...     # Load dataset from URI
...     loaded_dataset = load_from_uri(Path(tmpdir).joinpath("dataset").as_uri())
...     loaded_dataset.get_uri()
...
'file:///.../dataset'