Dataset
iden.dataset ¶
Contain dataset implementations.
iden.dataset.BaseDataset ¶
Bases: Generic[T], ABC
Define the base class to implement a dataset.
Note that this dataset class is very different from the PyTorch dataset class because it has a different goal. One of its goals is to help organize and manage shards.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(
... uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets
... )
... dataset
...
VanillaDataset(
(uri): file:///.../uri
(shards): ShardDict(
(uri): file:///.../uri_shards
(shards):
(train): ShardTuple(
(uri): file:///.../uri_train
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
(val): ShardTuple(
(uri): file:///.../uri_val
(shards):
)
)
(assets): ShardDict(
(uri): file:///.../uri_assets
(shards):
(stats): JsonShard(uri=file:///.../uri_stats)
)
)
iden.dataset.BaseDataset.equal
abstractmethod
¶
equal(other: Any, equal_nan: bool = False) -> bool
Indicate if two datasets are equal or not.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
other |
Any
|
The object to compare with. |
required |
equal_nan |
bool
|
If True, then two NaN values are considered equal. |
False
|
Returns:
Type | Description |
---|---|
bool
|
True if the two datasets are equal, otherwise False. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset1 = VanillaDataset(
... uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets
... )
... dataset2 = VanillaDataset(
... uri=Path(tmpdir).joinpath("uri2").as_uri(), shards=shards, assets=assets
... )
... dataset1.equal(dataset2)
...
False
iden.dataset.BaseDataset.get_asset
abstractmethod
¶
get_asset(asset_id: str) -> Any
Get a data asset from this sharded dataset.
This method is useful to access data variables/parameters that are only available after the data has been loaded/preprocessed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
asset_id |
str
|
The asset ID used to find the asset. |
required |
Returns:
Type | Description |
---|---|
Any
|
The asset. |
Raises:
Type | Description |
---|---|
AssetNotFoundError
|
if the asset does not exist. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.get_asset('stats').get_data()
...
{'mean': 42}
iden.dataset.BaseDataset.get_num_shards
abstractmethod
¶
get_num_shards(split: str) -> int
Get the number of shards for a given split.
Returns:
Type | Description |
---|---|
int
|
The number of shards in the dataset for a given split. |
Raises:
Type | Description |
---|---|
SplitNotFoundError
|
if the split does not exist. |
Returns:
Type | Description |
---|---|
int
|
The number of shards for the given split. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.get_num_shards('train')
... dataset.get_num_shards('val')
...
2
0
iden.dataset.BaseDataset.get_shards
abstractmethod
¶
get_shards(split: str) -> tuple[BaseShard[T], ...]
Get the shards for a given split.
Returns:
Type | Description |
---|---|
tuple[BaseShard[T], ...]
|
The shards for the given split. The shards are sorted in ascending order of URI. |
Raises:
Type | Description |
---|---|
SplitNotFoundError
|
if the split does not exist. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.get_shards('train')
... dataset.get_shards('val')
...
(JsonShard(uri=file:///.../uri1), JsonShard(uri=file:///.../uri2))
()
iden.dataset.BaseDataset.get_splits
abstractmethod
¶
get_splits() -> set[str]
Get the available dataset splits.
Returns:
Type | Description |
---|---|
set[str]
|
The dataset splits. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... sorted(dataset.get_splits())
...
['train', 'val']
iden.dataset.BaseDataset.get_uri
abstractmethod
¶
get_uri() -> str
Get the Uniform Resource Identifier (URI) of the dataset.
Returns:
Type | Description |
---|---|
str
|
The dataset's URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.get_uri()
...
file:///.../uri
iden.dataset.BaseDataset.has_asset
abstractmethod
¶
has_asset(asset_id: str) -> bool
Indicate if the asset exists or not.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
asset_id |
str
|
The asset ID used to find the asset. |
required |
Returns:
Type | Description |
---|---|
bool
|
True if the asset exists, otherwise False. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.has_asset('stats')
... dataset.has_asset('missing')
...
True
False
iden.dataset.BaseDataset.has_split
abstractmethod
¶
has_split(split: str) -> bool
Indicate if a dataset split exists or not.
Returns:
Type | Description |
---|---|
bool
|
True if the dataset split exists, otherwise False. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... {'mean': 42}, uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets)
... dataset.has_split('train')
... dataset.has_split('missing')
...
True
False
iden.dataset.VanillaDataset ¶
Bases: BaseDataset[T]
Implement a simple dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI associated with the dataset. |
required |
shards |
ShardDict[ShardTuple[BaseShard[T]]]
|
The dataset's shards. Each item in the mapping represents a dataset split, where the key is the split name and the value is the shards of that split. |
required |
assets |
ShardDict
|
The dataset's assets. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = VanillaDataset(
... uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets
... )
... dataset
...
VanillaDataset(
(uri): file:///.../uri
(shards): ShardDict(
(uri): file:///.../uri_shards
(shards):
(train): ShardTuple(
(uri): file:///.../uri_train
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
(val): ShardTuple(
(uri): file:///.../uri_val
(shards):
)
)
(assets): ShardDict(
(uri): file:///.../uri_assets
(shards):
(stats): JsonShard(uri=file:///.../uri_stats)
)
)
iden.dataset.VanillaDataset.from_uri
classmethod
¶
from_uri(uri: str) -> VanillaDataset
Instantiate a dataset from its URI.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI. |
required |
Returns:
Type | Description |
---|---|
VanillaDataset
|
The instantiated dataset. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... uri = Path(tmpdir).joinpath("uri").as_uri()
... _ = create_vanilla_dataset(uri=uri, shards=shards, assets=assets)
... dataset = VanillaDataset.from_uri(uri)
... dataset
...
VanillaDataset(
(uri): file:///.../uri
(shards): ShardDict(
(uri): file:///.../uri_shards
(shards):
(train): ShardTuple(
(uri): file:///.../uri_train
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
(val): ShardTuple(
(uri): file:///.../uri_val
(shards):
)
)
(assets): ShardDict(
(uri): file:///.../uri_assets
(shards):
(stats): JsonShard(uri=file:///.../uri_stats)
)
)
iden.dataset.VanillaDataset.generate_uri_config
classmethod
¶
generate_uri_config(
shards: ShardDict[ShardTuple[BaseShard[T]]],
assets: ShardDict,
) -> dict
Generate the minimal config that is used to load the dataset from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
ShardDict[ShardTuple[BaseShard[T]]]
|
The shards in the dataset. Each item in the mapping represents a dataset split, where the key is the split name and the value is the shards of that split. |
required |
assets |
ShardDict
|
The dataset's assets. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the dataset from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... config = VanillaDataset.generate_uri_config(shards=shards, assets=assets)
... config
...
{'loader': {'_target_': 'iden.dataset.loader.VanillaDatasetLoader'},
'shards': 'file:///.../uri_shards',
'assets': 'file:///.../uri_assets'}
iden.dataset.create_vanilla_dataset ¶
create_vanilla_dataset(
shards: ShardDict[ShardTuple[BaseShard[T]]],
assets: ShardDict,
uri: str,
) -> VanillaDataset
Create a VanillaDataset from its shards.
Note
It is a utility function to create a VanillaDataset from its shards and URI. It is possible to create a VanillaDataset in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
ShardDict[ShardTuple[BaseShard[T]]]
|
The dataset's shards. Each item in the mapping represents a dataset split, where the key is the split name and the value is the shards of that split. |
required |
assets |
ShardDict
|
The dataset's assets. |
required |
uri |
str
|
The URI associated with the dataset. |
required |
Returns:
Type | Description |
---|---|
VanillaDataset
|
The instantiated VanillaDataset object. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... dataset = create_vanilla_dataset(
... uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards, assets=assets
... )
... dataset
...
VanillaDataset(
(uri): file:///.../uri
(shards): ShardDict(
(uri): file:///.../uri_shards
(shards):
(train): ShardTuple(
(uri): file:///.../uri_train
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
(val): ShardTuple(
(uri): file:///.../uri_val
(shards):
)
)
(assets): ShardDict(
(uri): file:///.../uri_assets
(shards):
(stats): JsonShard(uri=file:///.../uri_stats)
)
)
iden.dataset.load_from_uri ¶
load_from_uri(uri: str) -> BaseDataset
Load a dataset from its Uniform Resource Identifier (URI).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI of the dataset. |
required |
Returns:
Type | Description |
---|---|
BaseDataset
|
The dataset associated with the URI. |
Raises:
Type | Description |
---|---|
FileNotFoundError
|
if the URI file does not exist. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import create_vanilla_dataset, load_from_uri
>>> from iden.shard import create_json_shard, create_shard_dict, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = create_shard_dict(
... shards={
... "train": create_shard_tuple(
... [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ],
... uri=Path(tmpdir).joinpath("uri_train").as_uri(),
... ),
... "val": create_shard_tuple(
... shards=[],
... uri=Path(tmpdir).joinpath("uri_val").as_uri(),
... ),
... },
... uri=Path(tmpdir).joinpath("uri_shards").as_uri(),
... )
... assets = create_shard_dict(
... shards={
... "stats": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("uri_stats").as_uri()
... )
... },
... uri=Path(tmpdir).joinpath("uri_assets").as_uri(),
... )
... uri = Path(tmpdir).joinpath("uri").as_uri()
... _ = create_vanilla_dataset(uri=uri, shards=shards, assets=assets)
... dataset = load_from_uri(uri)
... dataset
...
VanillaDataset(
(uri): file:///.../uri
(shards): ShardDict(
(uri): file:///.../uri_shards
(shards):
(train): ShardTuple(
(uri): file:///.../uri_train
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
(val): ShardTuple(
(uri): file:///.../uri_val
(shards):
)
)
(assets): ShardDict(
(uri): file:///.../uri_assets
(shards):
(stats): JsonShard(uri=file:///.../uri_stats)
)
)