iden.shard
iden.shard ¶
Contain shard implementations.
iden.shard.BaseShard ¶
Bases: Generic[T], ABC
Define the base class to implement a shard.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.BaseShard.clear
abstractmethod
¶
clear() -> None
Clear the current shard cache i.e. remove from memory the data if possible.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... data = shard.get_data(cache=True)
... data
... data.append(4) # in-place modification
... data = shard.get_data()
... data
... shard.clear()
... data = shard.get_data()
... data
...
[1, 2, 3]
[1, 2, 3, 4]
[1, 2, 3]
iden.shard.BaseShard.equal
abstractmethod
¶
equal(other: Any, equal_nan: bool = False) -> bool
Indicate if two shards are equal or not.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
other |
Any
|
The object to compare with. |
required |
equal_nan |
bool
|
If True, two NaN values are considered equal. |
False
|
Returns:
Type | Description |
---|---|
bool
|
|
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri1 = Path(tmpdir).joinpath("my_uri1").as_uri()
... uri2 = Path(tmpdir).joinpath("my_uri2").as_uri()
... shard1 = create_json_shard([1, 2, 3], uri=uri1)
... shard2 = create_json_shard([4, 5, 6], uri=uri2)
... shard3 = JsonShard.from_uri(uri=uri1)
... shard1.equal(shard2)
... shard1.equal(shard3)
...
...
False
True
iden.shard.BaseShard.get_data
abstractmethod
¶
get_data(cache: bool = False) -> T
Get the data in the shard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
cache |
bool
|
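If True, the loaded data are kept in memory (cached) so that later calls reuse them until clear() is called. |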
False
|
Returns:
Type | Description |
---|---|
T
|
The data in the shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.BaseShard.get_uri
abstractmethod
¶
get_uri() -> str | None
Get the Uniform Resource Identifier (URI) of the shard.
Returns:
Type | Description |
---|---|
str | None
|
The Uniform Resource Identifier (URI). |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_uri()
...
'file:///.../uri/0001'
iden.shard.BaseShard.is_cached
abstractmethod
¶
is_cached() -> bool
Indicate if the data in the shard are cached or not.
Returns:
Type | Description |
---|---|
bool
|
|
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.is_cached()
... data = shard.get_data(cache=True)
... shard.is_cached()
... shard.clear()
... shard.is_cached()
...
False
True
False
iden.shard.FileShard ¶
Bases: BaseShard[T]
Implement a generic shard where the data are stored in a single file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the file. |
required |
loader |
BaseLoader[T] | dict | None
|
The data loader or its configuration. |
None
|
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard
>>> from iden.io import save_json, JsonLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... shard = FileShard(uri=uri, path=file, loader=JsonLoader())
... shard.get_data()
...
[1, 2, 3]
iden.shard.FileShard.from_uri
classmethod
¶
from_uri(uri: str) -> FileShard
Instantiate a shard from its URI.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI. |
required |
Returns:
Type | Description |
---|---|
FileShard
|
The instantiated shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... shard = FileShard.from_uri(uri)
... shard
...
FileShard(uri=file:///.../my_uri)
iden.shard.FileShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... FileShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.json'},
'loader': {'_target_': 'iden.shard.loader.FileShardLoader'}}
iden.shard.InMemoryShard ¶
Bases: BaseShard[Any]
Implement an in-memory shard.
This shard does not have valid URI as the data are stored in-memory.
Example usage:
>>> from iden.shard import InMemoryShard
>>> shard = InMemoryShard([1, 2, 3])
>>> shard.get_data()
[1, 2, 3]
iden.shard.JsonShard ¶
Bases: FileShard[Any]
Implement a JSON shard.
The data are stored in a JSON file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the JSON file. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard
>>> from iden.io import save_json
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.JsonShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the json file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... JsonShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.json'},
'loader': {'_target_': 'iden.shard.loader.JsonShardLoader'}}
iden.shard.NumpySafetensorsShard ¶
Bases: FileShard[dict[str, ndarray]]
Implement a safetensors shard for numpy.ndarrays.
The data are stored in a safetensors file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the safetensors file. |
required |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import NumpySafetensorsShard
>>> from iden.io.safetensors import NumpySaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... NumpySaver().save({"key1": np.ones((2, 3)), "key2": np.arange(5)}, file)
... shard = NumpySafetensorsShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
{'key1': array([[1., 1., 1.], [1., 1., 1.]]), 'key2': array([0, 1, 2, 3, 4])}
iden.shard.NumpySafetensorsShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the safetensors file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import NumpySafetensorsShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... NumpySafetensorsShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.safetensors'},
'loader': {'_target_': 'iden.shard.loader.NumpySafetensorsShardLoader'}}
iden.shard.PickleShard ¶
Bases: FileShard[Any]
Implement a pickle shard.
The data are stored in a pickle file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the pickle file. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import PickleShard
>>> from iden.io import save_pickle
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... save_pickle([1, 2, 3], file)
... shard = PickleShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.PickleShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the pickle file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import PickleShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... PickleShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.pkl'},
'loader': {'_target_': 'iden.shard.loader.PickleShardLoader'}}
iden.shard.ShardDict ¶
Bases: BaseShard[T]
Implement a data structure to manage a dictionary of shards.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
shards |
dict[str, BaseShard[T]]
|
The dictionary of shards. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shards/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shards/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards)
... sd
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shards/uri1)
(val): JsonShard(uri=file:///.../shards/uri2)
)
iden.shard.ShardDict.from_uri
classmethod
¶
from_uri(uri: str) -> ShardDict[T]
Instantiate a shard from its URI.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI. |
required |
Returns:
Type | Description |
---|---|
ShardDict[T]
|
The instantiated shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard, create_shard_dict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... uri = Path(tmpdir).joinpath("uri").as_uri()
... _ = create_shard_dict(shards, uri=uri)
... shard = ShardDict.from_uri(uri)
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.ShardDict.generate_uri_config
classmethod
¶
generate_uri_config(
shards: dict[str, BaseShard[T]]
) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
dict[str, BaseShard[T]]
|
The shards. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... ShardDict.generate_uri_config(shards)
...
{'shards': {'train': 'file:///.../shard/uri1', 'val': 'file:///.../shard/uri2'},
'loader': {'_target_': 'iden.shard.loader.ShardDictLoader'}}
iden.shard.ShardDict.get_shard ¶
get_shard(shard_id: str) -> Any
Get a shard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard_id |
str
|
The shard ID. |
required |
Returns:
Type | Description |
---|---|
Any
|
The shard. |
Raises:
Type | Description |
---|---|
ShardNotFoundError
|
if the shard does not exist. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sd.get_shard("train")
...
JsonShard(uri=file:///.../uri1)
iden.shard.ShardDict.get_shard_ids ¶
get_shard_ids() -> set[str]
Get the shard IDs.
Returns:
Type | Description |
---|---|
set[str]
|
The shard IDs. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sorted(sd.get_shard_ids())
...
['train', 'val']
iden.shard.ShardDict.has_shard ¶
has_shard(shard_id: str) -> bool
Indicate if the shard exists or not.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard_id |
str
|
The shard ID. |
required |
Returns:
Type | Description |
---|---|
bool
|
|
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sd.has_shard("train")
... sd.has_shard("test")
...
True
False
iden.shard.ShardTuple ¶
Bases: BaseShard[tuple[BaseShard[T], ...]]
Implement a data structure to manage a tuple of shards.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
shards |
Iterable[BaseShard[T]]
|
The tuple of shards. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard import ShardTuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shards/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shards/uri2").as_uri()
... ),
... ]
... sl = ShardTuple(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards)
... sl
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shards/uri1)
(1): JsonShard(uri=file:///.../shards/uri2)
)
iden.shard.ShardTuple.from_uri
classmethod
¶
from_uri(uri: str) -> ShardTuple[T]
Instantiate a shard from its URI.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI. |
required |
Returns:
Type | Description |
---|---|
ShardTuple[T]
|
The instantiated shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... uri = Path(tmpdir).joinpath("uri").as_uri()
... _ = create_shard_tuple(shards, uri=uri)
... shard = ShardTuple.from_uri(uri)
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.ShardTuple.generate_uri_config
classmethod
¶
generate_uri_config(
shards: Iterable[BaseShard[T]],
) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
Iterable[BaseShard[T]]
|
The shards. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... ShardTuple.generate_uri_config(shards)
...
{'shards': ['file:///.../shard/uri1', 'file:///.../shard/uri2'],
'loader': {'_target_': 'iden.shard.loader.ShardTupleLoader'}}
iden.shard.ShardTuple.get ¶
get(index: int) -> BaseShard[T]
Get a shard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
index |
int
|
The shard index to get. |
required |
Returns:
Type | Description |
---|---|
BaseShard[T]
|
The shard. |
Raises:
Type | Description |
---|---|
IndexError
|
if the index is outside the tuple range. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard import ShardTuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... sl = ShardTuple(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sl.get(0)
...
JsonShard(uri=file:///.../uri1)
iden.shard.ShardTuple.is_sorted_by_uri ¶
is_sorted_by_uri() -> bool
Indicate if the shards are sorted by ascending order of URIs or not.
Returns:
Type | Description |
---|---|
bool
|
|
iden.shard.TorchSafetensorsShard ¶
Bases: FileShard[dict[str, Tensor]]
Implement a safetensors shard for torch.Tensors.
The data are stored in a safetensors file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the safetensors file. |
required |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchSafetensorsShard
>>> from iden.io.safetensors import TorchSaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... TorchSaver().save({"key1": torch.ones(2, 3), "key2": torch.arange(5)}, file)
... shard = TorchSafetensorsShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.TorchSafetensorsShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the safetensors file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchSafetensorsShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... TorchSafetensorsShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.safetensors'},
'loader': {'_target_': 'iden.shard.loader.TorchSafetensorsShardLoader'}}
iden.shard.TorchShard ¶
Bases: FileShard[Any]
Implement a PyTorch shard for torch.Tensors.
The data are stored in a PyTorch file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the PyTorch file. |
required |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchShard
>>> from iden.io import TorchSaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pt")
... TorchSaver().save({"key1": torch.ones(2, 3), "key2": torch.arange(5)}, file)
... shard = TorchShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.TorchShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the PyTorch file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import TorchShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pt")
... TorchShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.pt'},
'loader': {'_target_': 'iden.shard.loader.TorchShardLoader'}}
iden.shard.YamlShard ¶
Bases: FileShard[Any]
Implement a YAML shard.
The data are stored in a YAML file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The shard's URI. |
required |
path |
Path | str
|
Specifies the path to the YAML file. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import YamlShard
>>> from iden.io import save_yaml
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.yaml")
... save_yaml([1, 2, 3], file)
... shard = YamlShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.YamlShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the YAML format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path
|
The path to the yaml file. |
required |
Returns:
Type | Description |
---|---|
dict
|
The minimal config to load the shard from its URI. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import YamlShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.yaml")
... YamlShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.yaml'},
'loader': {'_target_': 'iden.shard.loader.YamlShardLoader'}}
iden.shard.create_json_shard ¶
create_json_shard(
data: Any, uri: str, path: Path | None = None
) -> JsonShard
Create a JsonShard
from data.
Note
It is a utility function to create a JsonShard
from its
data and URI. It is possible to create a JsonShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Any
|
The data to save in the json file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the JSON file. If |
None
|
Returns:
Type | Description |
---|---|
JsonShard
|
The |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_numpy_safetensors_shard ¶
create_numpy_safetensors_shard(
data: dict[str, ndarray],
uri: str,
path: Path | None = None,
) -> NumpySafetensorsShard
Create a NumpySafetensorsShard
from data.
Note
It is a utility function to create a NumpySafetensorsShard
from its data and URI. It is possible to create a
NumpySafetensorsShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
dict[str, ndarray]
|
The data to save in the safetensors file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the safetensors file. If |
None
|
Returns:
Type | Description |
---|---|
NumpySafetensorsShard
|
The |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import create_numpy_safetensors_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_numpy_safetensors_shard(
... data={"key1": np.ones((2, 3)), "key2": np.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri()
... )
... shard.get_data()
...
{'key1': array([[1., 1., 1.], [1., 1., 1.]]), 'key2': array([0, 1, 2, 3, 4])}
iden.shard.create_pickle_shard ¶
create_pickle_shard(
data: Any, uri: str, path: Path | None = None
) -> PickleShard
Create a PickleShard
from data.
Note
It is a utility function to create a PickleShard
from its
data and URI. It is possible to create a PickleShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Any
|
The data to save in the pickle file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the pickle file. If |
None
|
Returns:
Type | Description |
---|---|
PickleShard
|
The |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_pickle_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_pickle_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_shard_dict ¶
Create a ShardDict
from a dictionary of shards.
Note
It is a utility function to create a ShardDict
from its
shards and URI. It is possible to create a ShardDict
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
dict[str, BaseShard[T]]
|
The shards. |
required |
uri |
str
|
The shard's URI. |
required |
Returns:
Type | Description |
---|---|
ShardDict[T]
|
The |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard, create_shard_dict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... shard = create_shard_dict(shards, uri=Path(tmpdir).joinpath("uri").as_uri())
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.create_shard_tuple ¶
create_shard_tuple(
shards: Iterable[BaseShard[T]], uri: str
) -> ShardTuple[T]
Create a ShardTuple
from a list of shards.
Note
It is a utility function to create a ShardTuple
from its
shards and URI. It is possible to create a ShardTuple
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
Iterable[BaseShard[T]]
|
The shards. |
required |
uri |
str
|
The shard's URI. |
required |
Returns:
Type | Description |
---|---|
ShardTuple[T]
|
The |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... shard = create_shard_tuple(shards, uri=Path(tmpdir).joinpath("uri").as_uri())
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.create_torch_safetensors_shard ¶
create_torch_safetensors_shard(
data: dict[str, Tensor],
uri: str,
path: Path | None = None,
) -> TorchSafetensorsShard
Create a TorchSafetensorsShard
from data.
Note
It is a utility function to create a TorchSafetensorsShard
from its data and URI. It is possible to create a
TorchSafetensorsShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
dict[str, Tensor]
|
The data to save in the safetensors file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the safetensors file. If |
None
|
Returns:
Type | Description |
---|---|
TorchSafetensorsShard
|
The |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import create_torch_safetensors_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_torch_safetensors_shard(
... data={"key1": torch.ones(2, 3), "key2": torch.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri()
... )
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.create_torch_shard ¶
create_torch_shard(
data: Any, uri: str, path: Path | None = None
) -> TorchShard
Create a TorchShard
from data.
Note
It is a utility function to create a TorchShard
from its
data and URI. It is possible to create a TorchShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Any
|
The data to save in the PyTorch file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the PyTorch file. If |
None
|
Returns:
Type | Description |
---|---|
TorchShard
|
The TorchShard object. |
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> import torch
>>> from iden.shard import create_torch_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_torch_shard(
... data={"key1": torch.ones(2, 3), "key2": torch.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri()
... )
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.create_yaml_shard ¶
create_yaml_shard(
data: Any, uri: str, path: Path | None = None
) -> YamlShard
Create a YamlShard
from data.
Note
It is a utility function to create a YamlShard
from its
data and URI. It is possible to create a YamlShard
in other ways.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Any
|
The data to save in the YAML file. |
required |
uri |
str
|
The shard's URI. |
required |
path |
Path | None
|
The path to the YAML file. If |
None
|
Returns:
Type | Description |
---|---|
YamlShard
|
The YamlShard object. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_yaml_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_yaml_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.get_dict_uris ¶
get_dict_uris(
shards: dict[str, BaseShard]
) -> dict[str, str]
Get the dictionary of the shards' URIs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
dict[str, BaseShard]
|
The dictionary of shards. |
required |
Returns:
Type | Description |
---|---|
dict[str, str]
|
The dictionary of the shards' URIs. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, get_dict_uris
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... get_dict_uris(shards)
...
{'train': 'file:///.../shard/uri1', 'val': 'file:///.../shard/uri2'}
iden.shard.get_list_uris ¶
get_list_uris(shards: Iterable[BaseShard]) -> list[str]
Get the list of the shards' URIs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
Iterable[BaseShard]
|
The shards. |
required |
Returns:
Type | Description |
---|---|
list[str]
|
The list of the shards' URIs. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import get_list_uris, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... get_list_uris(shards)
...
['file:///.../shard/uri1', 'file:///.../shard/uri2']
iden.shard.load_from_uri ¶
load_from_uri(uri: str) -> BaseShard
Load a shard from its Uniform Resource Identifier (URI).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI of the shard. |
required |
Returns:
Type | Description |
---|---|
BaseShard
|
The shard associated with the URI. |
Raises:
Type | Description |
---|---|
FileNotFoundError
|
if the URI file does not exist. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, load_from_uri
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... shard = load_from_uri(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.sort_by_uri ¶
sort_by_uri(shards: Iterable[BaseShard], reverse: bool = False) -> list[BaseShard]
Sort a sequence of shards by their URIs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
Iterable[BaseShard]
|
The shards to sort. |
required |
reverse |
bool
|
If set to |
False
|
Returns:
Type | Description |
---|---|
list[BaseShard]
|
The sorted shards. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, sort_by_uri
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = sort_by_uri(
... [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("uri2").as_uri()),
... create_json_shard([4, 5, 6, 7], uri=Path(tmpdir).joinpath("uri3").as_uri()),
... create_json_shard([4, 5, 6, 7], uri=Path(tmpdir).joinpath("uri1").as_uri()),
... ]
... )
... shards
...
[JsonShard(uri=file:///.../uri1), JsonShard(uri=file:///.../uri2), JsonShard(uri=file:///.../uri3)]
iden.shard.generator ¶
Contain shard generator implementations.
iden.shard.generator.BaseShardGenerator ¶
Bases: Generic[T]
, ABC
Define the base class to create a shard.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.BaseShardGenerator.generate
abstractmethod
¶
generate(shard_id: str) -> BaseShard[T]
Generate a shard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard_id |
str
|
The shard ID. |
required |
Returns:
Type | Description |
---|---|
BaseShard[T]
|
The generated shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... shard = generator.generate("shard1")
... shard
...
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.JsonShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a JSON shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[T] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.NumpySafetensorsShardGenerator ¶
Bases: BaseFileShardGenerator[dict[str, ndarray]]
Implement a safetensors shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[dict[str, ndarray]] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import NumpySafetensorsShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = NumpySafetensorsShardGenerator(
... data=DataGenerator({"key1": np.ones((2, 3)), "key2": np.arange(5)}),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
NumpySafetensorsShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
NumpySafetensorsShard(uri=file:///.../uri/shard1)
iden.shard.generator.PickleShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a pickle shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[T] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import PickleShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = PickleShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
PickleShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
PickleShard(uri=file:///.../uri/shard1)
iden.shard.generator.ShardDictGenerator ¶
Bases: BaseShardGenerator[dict[str, BaseShard]]
Implement a ShardDict
generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shards |
dict[str, BaseShardGenerator | dict]
|
The shard generators or their configurations. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import ShardDictGenerator, JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = ShardDictGenerator(
... shards={
... "train": JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... },
... path_uri=Path(tmpdir).joinpath("uri"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
ShardDictGenerator(
(path_uri): PosixPath('/.../uri')
(shards):
(train): JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
)
ShardDict(
(uri): file:///.../uri/shard1
(shards):
(train): JsonShard(uri=file:///.../uri/train)
)
iden.shard.generator.ShardTupleGenerator ¶
Bases: BaseShardGenerator[tuple[BaseShard[T], ...]]
Implement a ShardTuple
generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard |
BaseShardGenerator[T] | dict
|
The shard generator or its configuration. |
required |
num_shards |
int
|
The number of shards to generate in the ShardTuple.
|
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import ShardTupleGenerator, JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = ShardTupleGenerator(
... shard=JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... ),
... path_uri=Path(tmpdir).joinpath("uri"),
... num_shards=5,
... )
... generator
... shard = generator.generate("shard1")
... shard
...
ShardTupleGenerator(
(path_uri): PosixPath('/.../uri')
(num_shards): 5
(shard): JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
)
ShardTuple(
(uri): file:///.../uri/shard1
(shards):
(0): JsonShard(uri=file:///.../uri/000000001)
(1): JsonShard(uri=file:///.../uri/000000002)
(2): JsonShard(uri=file:///.../uri/000000003)
(3): JsonShard(uri=file:///.../uri/000000004)
(4): JsonShard(uri=file:///.../uri/000000005)
)
iden.shard.generator.TorchSafetensorsShardGenerator ¶
Bases: BaseFileShardGenerator[dict[str, Tensor]]
Implement a safetensors shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[dict[str, Tensor]] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import TorchSafetensorsShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = TorchSafetensorsShardGenerator(
... data=DataGenerator({"key1": torch.ones(2, 3), "key2": torch.arange(5)}),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
TorchSafetensorsShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
TorchSafetensorsShard(uri=file:///.../uri/shard1)
iden.shard.generator.TorchShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a torch shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[T] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import TorchShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = TorchShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
TorchShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
TorchShard(uri=file:///.../uri/shard1)
iden.shard.generator.YamlShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a YAML shard generator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
BaseDataGenerator[T] | dict
|
The data to save in the shard. |
required |
path_uri |
Path
|
The path where to save the URI file. |
required |
path_shard |
Path
|
The path where to save the shard data. |
required |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import YamlShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = YamlShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
YamlShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
YamlShard(uri=file:///.../uri/shard1)
iden.shard.generator.is_shard_generator_config ¶
is_shard_generator_config(config: dict) -> bool
Indicate if the input configuration is a configuration for a
BaseShardGenerator
.
This function only checks if the value of the key _target_
is valid. It does not check the other values. If _target_
indicates a function, the returned type hint is used to check
the class.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config |
dict
|
Specifies the configuration to check. |
required |
Returns:
Type | Description |
---|---|
bool
|
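True if the input configuration is a configuration for a BaseShardGenerator object, otherwise False. |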
Example usage:
>>> from iden.shard.generator import is_shard_generator_config
>>> is_shard_generator_config({"_target_": "iden.shard.generator.JsonShardGenerator"})
True
iden.shard.generator.setup_shard_generator ¶
setup_shard_generator(
shard_generator: BaseShardGenerator[T] | dict,
) -> BaseShardGenerator[T]
Set up a shard generator.
The shard generator is instantiated from its configuration by using the
BaseShardGenerator
factory function.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard_generator |
BaseShardGenerator[T] | dict
|
Specifies the shard generator or its configuration. |
required |
Returns:
Type | Description |
---|---|
BaseShardGenerator[T]
|
The instantiated shard generator. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard.generator import setup_shard_generator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = setup_shard_generator(
... {
... "_target_": "iden.shard.generator.JsonShardGenerator",
... "data": [1, 2, 3],
... "path_uri": Path(tmpdir).joinpath("uri"),
... "path_shard": Path(tmpdir).joinpath("data"),
... }
... )
... generator
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): [1, 2, 3]
)
iden.shard.loader ¶
Contain shard loader implementations.
iden.shard.loader.BaseShardLoader ¶
Bases: Generic[T]
, ABC
Define the base class to implement a shard loader.
A shard loader object makes it possible to load a BaseShard object from
its Uniform Resource Identifier (URI).
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... loader
...
JsonShardLoader()
iden.shard.loader.BaseShardLoader.load
abstractmethod
¶
load(uri: str) -> BaseShard[T]
Load a shard from its Uniform Resource Identifier (URI).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
uri |
str
|
The URI of the shard to load. |
required |
Returns:
Type | Description |
---|---|
BaseShard[T]
|
The loaded shard. |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.FileShardLoader ¶
Bases: BaseShardLoader[Any]
Implement a file-based shard loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import FileShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... loader = FileShardLoader()
... shard = loader.load(uri)
... shard
...
FileShard(uri=file:///.../my_uri)
iden.shard.loader.JsonShardLoader ¶
Bases: BaseShardLoader[Any]
Implement a JSON shard loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.NumpySafetensorsShardLoader ¶
Bases: BaseShardLoader[dict[str, ndarray]]
Implement a safetensors shard loader for numpy.ndarray objects.
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import create_numpy_safetensors_shard
>>> from iden.shard.loader import NumpySafetensorsShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_numpy_safetensors_shard(
... {"key1": np.ones((2, 3)), "key2": np.arange(5)}, uri=uri
... )
... loader = NumpySafetensorsShardLoader()
... shard = loader.load(uri)
... shard
...
NumpySafetensorsShard(uri=file:///.../my_uri)
iden.shard.loader.PickleShardLoader ¶
Bases: BaseShardLoader[Any]
Implement a pickle shard loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_pickle_shard
>>> from iden.shard.loader import PickleShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_pickle_shard([1, 2, 3], uri=uri)
... loader = PickleShardLoader()
... shard = loader.load(uri)
... shard
...
PickleShard(uri=file:///.../my_uri)
iden.shard.loader.ShardDictLoader ¶
Bases: BaseShardLoader[dict[str, BaseShard]]
Implement a ShardDict
loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, create_shard_dict
>>> from iden.shard.loader import ShardDictLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri").as_uri()
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... _ = create_shard_dict(shards, uri=uri)
... loader = ShardDictLoader()
... shard = loader.load(uri)
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.loader.ShardTupleLoader ¶
Bases: BaseShardLoader[tuple[BaseShard, ...]]
Implement a ShardTuple
loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, create_shard_tuple
>>> from iden.shard.loader import ShardTupleLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri").as_uri()
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... _ = create_shard_tuple(shards, uri=uri)
... loader = ShardTupleLoader()
... shard = loader.load(uri)
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.loader.TorchSafetensorsShardLoader ¶
Bases: BaseShardLoader[dict[str, Tensor]]
Implement a safetensors shard loader for torch.Tensor objects.
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import create_torch_safetensors_shard
>>> from iden.shard.loader import TorchSafetensorsShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_torch_safetensors_shard(
... {"key1": torch.ones(2, 3), "key2": torch.arange(5)}, uri=uri
... )
... loader = TorchSafetensorsShardLoader()
... shard = loader.load(uri)
... shard
...
TorchSafetensorsShard(uri=file:///.../my_uri)
iden.shard.loader.TorchShardLoader ¶
Bases: BaseShardLoader[Any]
Implement a PyTorch shard loader.
Raises:
Type | Description |
---|---|
RuntimeError
|
if |
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_torch_shard
>>> from iden.shard.loader import TorchShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_torch_shard([1, 2, 3], uri=uri)
... loader = TorchShardLoader()
... shard = loader.load(uri)
... shard
...
TorchShard(uri=file:///.../my_uri)
iden.shard.loader.YamlShardLoader ¶
Bases: BaseShardLoader[Any]
Implement a YAML shard loader.
Example usage:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_yaml_shard
>>> from iden.shard.loader import YamlShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... _ = create_yaml_shard([1, 2, 3], uri=uri)
... loader = YamlShardLoader()
... shard = loader.load(uri)
... shard
...
YamlShard(uri=file:///.../my_uri)
iden.shard.loader.is_shard_loader_config ¶
is_shard_loader_config(config: dict) -> bool
Indicate if the input configuration is a configuration for a
BaseShardLoader
.
This function only checks if the value of the key _target_
is valid. It does not check the other values. If _target_
indicates a function, the returned type hint is used to check
the class.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config |
dict
|
Specifies the configuration to check. |
required |
Returns:
Type | Description |
---|---|
bool
|
True if the input configuration is a configuration for a BaseShardLoader object, otherwise False. |
Example usage:
>>> from iden.shard.loader import is_shard_loader_config
>>> is_shard_loader_config({"_target_": "iden.shard.loader.JsonShardLoader"})
True
iden.shard.loader.setup_shard_loader ¶
setup_shard_loader(
shard_loader: BaseShardLoader | dict,
) -> BaseShardLoader
Set up a shard loader.
The shard loader is instantiated from its configuration by using the
BaseShardLoader
factory function.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
shard_loader |
BaseShardLoader | dict
|
Specifies the shard loader or its configuration. |
required |
Returns:
Type | Description |
---|---|
BaseShardLoader
|
The instantiated shard loader. |
Example usage:
>>> from iden.shard.loader import setup_shard_loader
>>> shard_loader = setup_shard_loader({"_target_": "iden.shard.loader.JsonShardLoader"})
>>> shard_loader
JsonShardLoader()