rg.Dataset

Dataset is a class that represents a collection of records. It is used to store and manage records in Argilla.

Usage Examples

Creating a Dataset

To create a new dataset you need to define its name and settings. The optional workspace and client parameters let you create the dataset in a specific workspace or on a specific Argilla instance.

dataset = rg.Dataset(
    name="my_dataset",
    settings=rg.Settings(
        fields=[
            rg.TextField(name="text"),
        ],
        questions=[
            rg.TextQuestion(name="response"),
        ],
    ),
)
dataset.create()

For a detailed guide to the dataset creation and publication process, see the Dataset how-to guide.

Retrieving an existing Dataset

To retrieve an existing dataset, use client.datasets("my_dataset"):

dataset = client.datasets("my_dataset")
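
Once retrieved, the dataset's records can be iterated directly. A minimal sketch, assuming the dataset already contains records and has a text field named "text":

for record in dataset.records:
    print(record.fields["text"])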

Dataset

Bases: Resource, HubImportExportMixin, DiskImportExportMixin

Class for interacting with Argilla Datasets

Attributes:

| Name | Type | Description |
|------|------|-------------|
| name | str | Name of the dataset. |
| records | DatasetRecords | The records object for the dataset. Used to interact with the records of the dataset by iterating, searching, etc. |
| settings | Settings | The settings object of the dataset. Used to configure the dataset with fields, questions, guidelines, etc. |
| fields | list | The fields of the dataset, for example the rg.TextField of the dataset. Defined in the settings. |
| questions | list | The questions of the dataset defined in the settings. For example, the rg.TextQuestion that you want labelers to answer. |
| guidelines | str | The guidelines of the dataset defined in the settings. Used to provide instructions to labelers. |
| allow_extra_metadata | bool | True if extra metadata is allowed, False otherwise. |
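
For instance, these attributes can be read from a retrieved dataset. A minimal sketch, assuming a configured client and an existing dataset named "my_dataset":

dataset = client.datasets("my_dataset")
print(dataset.name)
print(dataset.guidelines)
print([field.name for field in dataset.fields])
print([question.name for question in dataset.questions])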

Source code in src/argilla/datasets/_resource.py
class Dataset(Resource, HubImportExportMixin, DiskImportExportMixin):
    """Class for interacting with Argilla Datasets

    Attributes:
        name: Name of the dataset.
        records (DatasetRecords): The records object for the dataset. Used to interact with the records of the dataset by iterating, searching, etc.
        settings (Settings): The settings object of the dataset. Used to configure the dataset with fields, questions, guidelines, etc.
        fields (list): The fields of the dataset, for example the `rg.TextField` of the dataset. Defined in the settings.
        questions (list): The questions of the dataset defined in the settings. For example, the `rg.TextQuestion` that you want labelers to answer.
        guidelines (str): The guidelines of the dataset defined in the settings. Used to provide instructions to labelers.
        allow_extra_metadata (bool): True if extra metadata is allowed, False otherwise.
    """

    name: str
    id: Optional[UUID]

    _api: "DatasetsAPI"
    _model: "DatasetModel"

    def __init__(
        self,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str, UUID]] = None,
        settings: Optional[Settings] = None,
        client: Optional["Argilla"] = None,
    ) -> None:
        """Initializes a new Argilla Dataset object with the given parameters.

        Parameters:
            name (str): Name of the dataset. Replaced by random UUID if not assigned.
            workspace (UUID): Workspace of the dataset. Default is the first workspace found in the server.
            settings (Settings): Settings class to be used to configure the dataset.
            client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
        """
        client = client or Argilla._get_default()
        super().__init__(client=client, api=client.api.datasets)
        if name is None:
            name = f"dataset_{uuid4()}"
            self._log_message(f"Settings dataset name to unique UUID: {name}")

        self._workspace = workspace
        self._model = DatasetModel(name=name)
        self._settings = settings._copy() if settings else Settings(_dataset=self)
        self._settings.dataset = self
        self.__records = DatasetRecords(client=self._client, dataset=self, mapping=self._settings.mapping)

    #####################
    #  Properties       #
    #####################

    @property
    def name(self) -> str:
        return self._model.name

    @name.setter
    def name(self, value: str) -> None:
        self._model.name = value

    @property
    def records(self) -> "DatasetRecords":
        return self.__records

    @property
    def settings(self) -> Settings:
        return self._settings

    @settings.setter
    def settings(self, value: Settings) -> None:
        settings_copy = value._copy()
        settings_copy.dataset = self
        self._settings = settings_copy

    @property
    def fields(self) -> list:
        return self.settings.fields

    @property
    def questions(self) -> list:
        return self.settings.questions

    @property
    def guidelines(self) -> str:
        return self.settings.guidelines

    @guidelines.setter
    def guidelines(self, value: str) -> None:
        self.settings.guidelines = value

    @property
    def allow_extra_metadata(self) -> bool:
        return self.settings.allow_extra_metadata

    @allow_extra_metadata.setter
    def allow_extra_metadata(self, value: bool) -> None:
        self.settings.allow_extra_metadata = value

    @property
    def schema(self) -> dict:
        return self.settings.schema

    @property
    def workspace(self) -> Workspace:
        self._workspace = self._resolve_workspace()
        return self._workspace

    @property
    def distribution(self) -> TaskDistribution:
        return self.settings.distribution

    @distribution.setter
    def distribution(self, value: TaskDistribution) -> None:
        self.settings.distribution = value

    #####################
    #  Core methods     #
    #####################

    def get(self) -> "Dataset":
        super().get()
        self.settings.get()
        return self

    def create(self) -> "Dataset":
        """Creates the dataset on the server with the `Settings` configuration.

        Returns:
            Dataset: The created dataset object.
        """
        try:
            super().create()
        except ForbiddenError as e:
            settings_url = f"{self._client.api_url}/user-settings"
            user_role = self._client.me.role.value
            user_name = self._client.me.username
            workspace_name = self.workspace.name
            message = f"""User '{user_name}' is not authorized to create a dataset in workspace '{workspace_name}'
            with role '{user_role}'. Go to {settings_url} to view your role."""
            raise ForbiddenError(message) from e
        try:
            return self._publish()
        except Exception as e:
            self._log_message(message=f"Error creating dataset: {e}", level="error")
            self._rollback_dataset_creation()
            raise SettingsError from e

    def update(self) -> "Dataset":
        """Updates the dataset on the server with the current settings.

        Returns:
            Dataset: The updated dataset object.
        """
        self.settings.update()
        return self

    def progress(self, with_users_distribution: bool = False) -> dict:
        """Returns the team's progress on the dataset.

        Parameters:
            with_users_distribution (bool): If True, the progress of the dataset is returned
                with users distribution. This includes the number of responses made by each user.

        Returns:
            dict: The team's progress on the dataset.

        An example of a response when `with_users_distribution` is `True`:
        ```json
        {
            "total": 100,
            "completed": 50,
            "pending": 50,
            "users": {
                "user1": {
                   "completed": { "submitted": 10, "draft": 5, "discarded": 5},
                   "pending": { "submitted": 5, "draft": 10, "discarded": 10},
                },
                "user2": {
                   "completed": { "submitted": 20, "draft": 10, "discarded": 5},
                   "pending": { "submitted": 2, "draft": 25, "discarded": 0},
                },
                ...
            }
        }
        ```

        """

        progress = self._api.get_progress(dataset_id=self._model.id).model_dump()

        if with_users_distribution:
            users_progress = self._api.list_users_progress(dataset_id=self._model.id)
            users_distribution = {
                user.username: {
                    "completed": user.completed.model_dump(),
                    "pending": user.pending.model_dump(),
                }
                for user in users_progress
            }

            progress.update({"users": users_distribution})

        return progress

    @classmethod
    def from_model(cls, model: DatasetModel, client: "Argilla") -> "Dataset":
        instance = cls(client=client, workspace=model.workspace_id, name=model.name)
        instance._model = model

        return instance

    #####################
    #  Utility methods  #
    #####################

    def api_model(self) -> DatasetModel:
        self._model.workspace_id = self.workspace.id
        return self._model

    def _publish(self) -> "Dataset":
        self._settings.create()
        self._api.publish(dataset_id=self._model.id)

        return self.get()

    def _resolve_workspace(self) -> Workspace:
        workspace = self._workspace

        if workspace is None:
            workspace = self._client.workspaces.default
            warnings.warn(f"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}")
        elif isinstance(workspace, str):
            workspace = self._client.workspaces(workspace)
            if workspace is None:
                available_workspace_names = [ws.name for ws in self._client.workspaces]
                raise NotFoundError(
                    f"Workspace with name {workspace} not found. Available workspaces: {available_workspace_names}"
                )
        elif isinstance(workspace, UUID):
            ws_model = self._client.api.workspaces.get(workspace)
            workspace = Workspace.from_model(ws_model, client=self._client)
        elif not isinstance(workspace, Workspace):
            raise ValueError(f"Wrong workspace value found {workspace}")

        return workspace

    def _rollback_dataset_creation(self):
        if not self._is_published():
            self.delete()

    def _is_published(self) -> bool:
        return self._model.status == "ready"

    def _with_client(self, client: Argilla) -> "Self":
        return super()._with_client(client=client)

__init__(name=None, workspace=None, settings=None, client=None)

Initializes a new Argilla Dataset object with the given parameters.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| name | str | Name of the dataset. Replaced by random UUID if not assigned. | None |
| workspace | UUID | Workspace of the dataset. Default is the first workspace found in the server. | None |
| settings | Settings | Settings class to be used to configure the dataset. | None |
| client | Argilla | Instance of Argilla to connect with the server. Default is the default client. | None |
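
A construction sketch, assuming a workspace named "my_workspace" exists on the server:

dataset = rg.Dataset(
    name="my_dataset",
    workspace="my_workspace",  # assumed existing workspace
    settings=rg.Settings(
        fields=[rg.TextField(name="text")],
        questions=[rg.TextQuestion(name="response")],
    ),
)
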
Source code in src/argilla/datasets/_resource.py
def __init__(
    self,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str, UUID]] = None,
    settings: Optional[Settings] = None,
    client: Optional["Argilla"] = None,
) -> None:
    """Initializes a new Argilla Dataset object with the given parameters.

    Parameters:
        name (str): Name of the dataset. Replaced by random UUID if not assigned.
        workspace (UUID): Workspace of the dataset. Default is the first workspace found in the server.
        settings (Settings): Settings class to be used to configure the dataset.
        client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
    """
    client = client or Argilla._get_default()
    super().__init__(client=client, api=client.api.datasets)
    if name is None:
        name = f"dataset_{uuid4()}"
        self._log_message(f"Settings dataset name to unique UUID: {name}")

    self._workspace = workspace
    self._model = DatasetModel(name=name)
    self._settings = settings._copy() if settings else Settings(_dataset=self)
    self._settings.dataset = self
    self.__records = DatasetRecords(client=self._client, dataset=self, mapping=self._settings.mapping)

create()

Creates the dataset on the server with the Settings configuration.

Returns:

| Name | Type | Description |
|------|------|-------------|
| Dataset | Dataset | The created dataset object. |
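
A usage sketch for a dataset initialized as above; the import path for ForbiddenError is an assumption:

from argilla._exceptions import ForbiddenError  # assumed import path

try:
    dataset.create()
except ForbiddenError as e:
    print(f"Not authorized to create the dataset: {e}")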

Source code in src/argilla/datasets/_resource.py
def create(self) -> "Dataset":
    """Creates the dataset on the server with the `Settings` configuration.

    Returns:
        Dataset: The created dataset object.
    """
    try:
        super().create()
    except ForbiddenError as e:
        settings_url = f"{self._client.api_url}/user-settings"
        user_role = self._client.me.role.value
        user_name = self._client.me.username
        workspace_name = self.workspace.name
        message = f"""User '{user_name}' is not authorized to create a dataset in workspace '{workspace_name}'
        with role '{user_role}'. Go to {settings_url} to view your role."""
        raise ForbiddenError(message) from e
    try:
        return self._publish()
    except Exception as e:
        self._log_message(message=f"Error creating dataset: {e}", level="error")
        self._rollback_dataset_creation()
        raise SettingsError from e

update()

Updates the dataset on the server with the current settings.

Returns:

| Name | Type | Description |
|------|------|-------------|
| Dataset | Dataset | The updated dataset object. |
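
For example, the guidelines can be edited locally and pushed to the server with update(). A minimal sketch, assuming a configured client and an existing dataset:

dataset = client.datasets("my_dataset")
dataset.guidelines = "Answer the question using only the text field."
dataset.update()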

Source code in src/argilla/datasets/_resource.py
def update(self) -> "Dataset":
    """Updates the dataset on the server with the current settings.

    Returns:
        Dataset: The updated dataset object.
    """
    self.settings.update()
    return self

progress(with_users_distribution=False)

Returns the team's progress on the dataset.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| with_users_distribution | bool | If True, the progress of the dataset is returned with users distribution. This includes the number of responses made by each user. | False |

Returns:

| Name | Type | Description |
|------|------|-------------|
| dict | dict | The team's progress on the dataset. |

An example of a response when with_users_distribution is True:

{
    "total": 100,
    "completed": 50,
    "pending": 50,
    "users": {
        "user1": {
           "completed": { "submitted": 10, "draft": 5, "discarded": 5},
           "pending": { "submitted": 5, "draft": 10, "discarded": 10},
        },
        "user2": {
           "completed": { "submitted": 20, "draft": 10, "discarded": 5},
           "pending": { "submitted": 2, "draft": 25, "discarded": 0},
        },
        ...
    }
}
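
A usage sketch, assuming an existing dataset:

progress = dataset.progress(with_users_distribution=True)
print(f"{progress['completed']} of {progress['total']} records completed")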

Source code in src/argilla/datasets/_resource.py
def progress(self, with_users_distribution: bool = False) -> dict:
    """Returns the team's progress on the dataset.

    Parameters:
        with_users_distribution (bool): If True, the progress of the dataset is returned
            with users distribution. This includes the number of responses made by each user.

    Returns:
        dict: The team's progress on the dataset.

    An example of a response when `with_users_distribution` is `True`:
    ```json
    {
        "total": 100,
        "completed": 50,
        "pending": 50,
        "users": {
            "user1": {
               "completed": { "submitted": 10, "draft": 5, "discarded": 5},
               "pending": { "submitted": 5, "draft": 10, "discarded": 10},
            },
            "user2": {
               "completed": { "submitted": 20, "draft": 10, "discarded": 5},
               "pending": { "submitted": 2, "draft": 25, "discarded": 0},
            },
            ...
        }
    }
    ```

    """

    progress = self._api.get_progress(dataset_id=self._model.id).model_dump()

    if with_users_distribution:
        users_progress = self._api.list_users_progress(dataset_id=self._model.id)
        users_distribution = {
            user.username: {
                "completed": user.completed.model_dump(),
                "pending": user.pending.model_dump(),
            }
            for user in users_progress
        }

        progress.update({"users": users_distribution})

    return progress

DiskImportExportMixin

Bases: ABC

A mixin for exporting and importing datasets to and from disk.

Source code in src/argilla/datasets/_io/_disk.py
class DiskImportExportMixin(ABC):
    """A mixin for exporting and importing datasets to and from disk."""

    _model: DatasetModel
    _DEFAULT_RECORDS_PATH = "records.json"
    _DEFAULT_CONFIG_REPO_DIR = ".argilla"
    _DEFAULT_SETTINGS_PATH = f"{_DEFAULT_CONFIG_REPO_DIR}/settings.json"
    _DEFAULT_DATASET_PATH = f"{_DEFAULT_CONFIG_REPO_DIR}/dataset.json"
    _DEFAULT_CONFIGURATION_FILES = [_DEFAULT_SETTINGS_PATH, _DEFAULT_DATASET_PATH]

    def to_disk(self: "Dataset", path: str, *, with_records: bool = True) -> str:
        """Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

        Parameters:
            path (str): The path to export the dataset to. Must be an empty directory.
            with_records: whether to export the dataset records to disk. Defaults to `True`.
        """
        dataset_path, settings_path, records_path = self._define_child_paths(path=path)
        logging.info(f"Loading dataset from {dataset_path}")
        logging.info(f"Loading settings from {settings_path}")
        logging.info(f"Loading records from {records_path}")
        # Export the dataset model, settings and records
        self._persist_dataset_model(path=dataset_path)
        self.settings.to_json(path=settings_path)
        if with_records:
            self.records.to_json(path=records_path)

        return path

    @classmethod
    def from_disk(
        cls: Type["Dataset"],
        path: str,
        *,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str]] = None,
        client: Optional["Argilla"] = None,
        with_records: bool = True,
    ) -> "Dataset":
        """Imports a dataset from disk as a directory containing the dataset model, settings and records.
        The directory should be defined using the `to_disk` method.

        Parameters:
            path (str): The path to the directory containing the dataset model, settings and records.
            name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
            workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
            client (Argilla, optional): The client to use for the import. Defaults to None and the default client is used.
            with_records: whether to load the records from disk. Defaults to `True`.
        """

        client = client or Argilla._get_default()

        try:
            dataset_path, settings_path, records_path = cls._define_child_paths(path=path)
            logging.info(f"Loading dataset from {dataset_path}")
            logging.info(f"Loading settings from {settings_path}")
            logging.info(f"Loading records from {records_path}")

            dataset_model = cls._load_dataset_model(path=dataset_path)
        except (NotADirectoryError, FileNotFoundError) as e:
            raise ImportDatasetError(f"Error loading dataset from disk. {e}") from e

        # Get the relevant workspace_id of the incoming dataset
        if isinstance(workspace, str):
            workspace_name = workspace
            workspace = client.workspaces(workspace_name)
            if not workspace:
                raise ArgillaError(f"Workspace {workspace_name} not found on the server.")
        elif workspace is None:
            warnings.warn("Workspace not provided. Using default workspace.")
            workspace = client.workspaces.default
        dataset_model.workspace_id = workspace.id

        if name and (name != dataset_model.name):
            logging.info(f"Changing dataset name from {dataset_model.name} to {name}")
            dataset_model.name = name

        if client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace.id):
            warnings.warn(
                f"Loaded dataset name {dataset_model.name} already exists in the workspace {workspace.name} so using it. To create a new dataset, provide a unique name to the `name` parameter."
            )
            dataset_model = client.api.datasets.get_by_name_and_workspace_id(
                name=dataset_model.name, workspace_id=workspace.id
            )
            dataset = cls.from_model(model=dataset_model, client=client)
            dataset.get()
        else:
            # Create a new dataset and load the settings and records
            if not os.path.exists(settings_path):
                raise ImportDatasetError(f"Settings file not found at {settings_path}")

            dataset = cls.from_model(model=dataset_model, client=client)
            dataset.settings = Settings.from_json(path=settings_path)
            dataset.create()

        if os.path.exists(records_path) and with_records:
            try:
                dataset.records.from_json(path=records_path)
            except RecordsIngestionError as e:
                raise RecordsIngestionError(
                    message="Error importing dataset records from disk. "
                    "Records and datasets settings are not compatible."
                ) from e

        return dataset

    ############################
    # Utility methods
    ############################

    def _persist_dataset_model(self, path: Path):
        """Persists the dataset model to disk."""
        if path.exists():
            raise FileExistsError(f"Dataset already exists at {path}")
        with open(file=path, mode="w") as f:
            json.dump(self.api_model().model_dump(), f)

    @classmethod
    def _load_dataset_model(cls, path: Path):
        """Loads the dataset model from disk."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"Dataset model not found at {path}")
        with open(file=path, mode="r") as f:
            dataset_model = json.load(f)
            dataset_model = DatasetModel(**dataset_model)
        return dataset_model

    @classmethod
    def _define_child_paths(cls, path: Union[Path, str]) -> Tuple[Path, Path, Path]:
        path = Path(path)
        if not path.is_dir():
            raise NotADirectoryError(f"Path {path} is not a directory")
        main_path = path / cls._DEFAULT_CONFIG_REPO_DIR
        main_path.mkdir(exist_ok=True)
        dataset_path = path / cls._DEFAULT_DATASET_PATH
        settings_path = path / cls._DEFAULT_SETTINGS_PATH
        records_path = path / cls._DEFAULT_RECORDS_PATH
        return dataset_path, settings_path, records_path

to_disk(path, *, with_records=True)

Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| path | str | The path to export the dataset to. Must be an empty directory. | required |
| with_records | bool | Whether to export the dataset records to disk. Defaults to True. | True |
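
An export sketch; the directory name is hypothetical and must point to an existing, empty directory:

import os

export_dir = "my_dataset_export"  # hypothetical path
os.makedirs(export_dir, exist_ok=True)
dataset.to_disk(path=export_dir)
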
Source code in src/argilla/datasets/_io/_disk.py
def to_disk(self: "Dataset", path: str, *, with_records: bool = True) -> str:
    """Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

    Parameters:
        path (str): The path to export the dataset to. Must be an empty directory.
        with_records: whether to export the dataset records to disk. Defaults to `True`.
    """
    dataset_path, settings_path, records_path = self._define_child_paths(path=path)
    logging.info(f"Loading dataset from {dataset_path}")
    logging.info(f"Loading settings from {settings_path}")
    logging.info(f"Loading records from {records_path}")
    # Export the dataset model, settings and records
    self._persist_dataset_model(path=dataset_path)
    self.settings.to_json(path=settings_path)
    if with_records:
        self.records.to_json(path=records_path)

    return path

from_disk(path, *, name=None, workspace=None, client=None, with_records=True) classmethod

Imports a dataset from disk as a directory containing the dataset model, settings and records. The directory should be defined using the to_disk method.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| path | str | The path to the directory containing the dataset model, settings and records. | required |
| name | str | The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended. | None |
| workspace | Union[Workspace, str] | The workspace to import the dataset to. Defaults to None and the default workspace is used. | None |
| client | Argilla | The client to use for the import. Defaults to None and the default client is used. | None |
| with_records | bool | Whether to load the records from disk. Defaults to True. | True |
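
An import sketch; the path, dataset name, and workspace name are hypothetical:

dataset = rg.Dataset.from_disk(
    path="my_dataset_export",  # hypothetical directory written by to_disk
    name="my_dataset_copy",    # hypothetical new name
    workspace="my_workspace",  # assumed existing workspace
)
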
Source code in src/argilla/datasets/_io/_disk.py
@classmethod
def from_disk(
    cls: Type["Dataset"],
    path: str,
    *,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str]] = None,
    client: Optional["Argilla"] = None,
    with_records: bool = True,
) -> "Dataset":
    """Imports a dataset from disk as a directory containing the dataset model, settings and records.
    The directory should be defined using the `to_disk` method.

    Parameters:
        path (str): The path to the directory containing the dataset model, settings and records.
        name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
        workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
        client (Argilla, optional): The client to use for the import. Defaults to None and the default client is used.
        with_records: whether to load the records from disk. Defaults to `True`.
    """

    client = client or Argilla._get_default()

    try:
        dataset_path, settings_path, records_path = cls._define_child_paths(path=path)
        logging.info(f"Loading dataset from {dataset_path}")
        logging.info(f"Loading settings from {settings_path}")
        logging.info(f"Loading records from {records_path}")

        dataset_model = cls._load_dataset_model(path=dataset_path)
    except (NotADirectoryError, FileNotFoundError) as e:
        raise ImportDatasetError(f"Error loading dataset from disk. {e}") from e

    # Get the relevant workspace_id of the incoming dataset
    if isinstance(workspace, str):
        workspace_name = workspace
        workspace = client.workspaces(workspace_name)
        if not workspace:
            raise ArgillaError(f"Workspace {workspace_name} not found on the server.")
    elif workspace is None:
        warnings.warn("Workspace not provided. Using default workspace.")
        workspace = client.workspaces.default
    dataset_model.workspace_id = workspace.id

    if name and (name != dataset_model.name):
        logging.info(f"Changing dataset name from {dataset_model.name} to {name}")
        dataset_model.name = name

    if client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace.id):
        warnings.warn(
            f"Loaded dataset name {dataset_model.name} already exists in the workspace {workspace.name} so using it. To create a new dataset, provide a unique name to the `name` parameter."
        )
        dataset_model = client.api.datasets.get_by_name_and_workspace_id(
            name=dataset_model.name, workspace_id=workspace.id
        )
        dataset = cls.from_model(model=dataset_model, client=client)
        dataset.get()
    else:
        # Create a new dataset and load the settings and records
        if not os.path.exists(settings_path):
            raise ImportDatasetError(f"Settings file not found at {settings_path}")

        dataset = cls.from_model(model=dataset_model, client=client)
        dataset.settings = Settings.from_json(path=settings_path)
        dataset.create()

    if os.path.exists(records_path) and with_records:
        try:
            dataset.records.from_json(path=records_path)
        except RecordsIngestionError as e:
            raise RecordsIngestionError(
                message="Error importing dataset records from disk. "
                "Records and datasets settings are not compatible."
            ) from e

    return dataset

HubImportExportMixin

Bases: DiskImportExportMixin

Source code in src/argilla/datasets/_io/_hub.py
class HubImportExportMixin(DiskImportExportMixin):
    def to_hub(
        self: "Dataset",
        repo_id: str,
        *,
        with_records: bool = True,
        generate_card: Optional[bool] = True,
        **kwargs: Any,
    ) -> None:
        """Pushes the `Dataset` to the Hugging Face Hub. If the dataset has been previously pushed to the
        Hugging Face Hub, it will be updated instead of creating a new dataset repo.

        Parameters:
            repo_id: the ID of the Hugging Face Hub repo to push the `Dataset` to.
            with_records: whether to push the dataset records to the Hub. Defaults to `True`.
            generate_card: whether to generate a dataset card for the `Dataset` in the Hugging Face Hub. Defaults
                to `True`.
            **kwargs: the kwargs to pass to `datasets.Dataset.push_to_hub`.

        Returns:
            None
        """

        from huggingface_hub import DatasetCardData, HfApi

        from argilla.datasets._io.card import (
            ArgillaDatasetCard,
            size_categories_parser,
        )

        hf_api = HfApi(token=kwargs.get("token"))

        hfds = False
        if with_records:
            hfds = self.records(with_vectors=True, with_responses=True, with_suggestions=True).to_datasets()
            hfds.push_to_hub(repo_id, **kwargs)
        else:
            hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=kwargs.get("exist_ok", True))

        with TemporaryDirectory() as tmpdirname:
            config_dir = os.path.join(tmpdirname)

            self.to_disk(path=config_dir, with_records=False)

            if generate_card:
                sample_argilla_record = next(iter(self.records(with_suggestions=True, with_responses=True)))
                sample_huggingface_record = self._get_sample_hf_record(hfds) if with_records else None
                dataset_size = len(hfds) if with_records else 0
                card = ArgillaDatasetCard.from_template(
                    card_data=DatasetCardData(
                        size_categories=size_categories_parser(dataset_size),
                        tags=["rlfh", "argilla", "human-feedback"],
                    ),
                    repo_id=repo_id,
                    argilla_fields=self.settings.fields,
                    argilla_questions=self.settings.questions,
                    argilla_guidelines=self.settings.guidelines or None,
                    argilla_vectors_settings=self.settings.vectors or None,
                    argilla_metadata_properties=self.settings.metadata,
                    argilla_record=sample_argilla_record.to_dict(),
                    huggingface_record=sample_huggingface_record,
                )
                card.save(filepath=os.path.join(tmpdirname, "README.md"))

            hf_api.upload_folder(
                folder_path=tmpdirname,
                repo_id=repo_id,
                repo_type="dataset",
            )

    @classmethod
    def from_hub(
        cls: Type["Dataset"],
        repo_id: str,
        *,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str]] = None,
        client: Optional["Argilla"] = None,
        with_records: bool = True,
        settings: Union["Settings", Literal["auto", "ui"]] = "ui",
        split: Optional[str] = None,
        subset: Optional[str] = None,
        **kwargs: Any,
    ) -> Union["Dataset", str]:
        """Loads a `Dataset` from the Hugging Face Hub.

        Parameters:
            repo_id: the ID of the Hugging Face Hub repo to load the `Dataset` from.
            name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
            workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
            client: the client to use to load the `Dataset`. If not provided, the default client will be used.
            with_records: whether to load the records from the Hugging Face dataset. Defaults to `True`.
            settings: the settings to use to load the `Dataset`. If settings are "ui", a URL to configure the settings
                through argilla will be returned. If settings are "auto",
                the settings will be inferred from the `Features` of the dataset on the hub. Defaults to "ui".
            split: the split to load from the Hugging Face dataset. If not provided, the first split will be loaded.
            subset: the subset to load from the Hugging Face dataset. If not provided, the first subset will be loaded.
            **kwargs: the kwargs to pass to `datasets.load_dataset`.

        Returns:
            A `Dataset` loaded from the Hugging Face Hub.
        """
        from argilla.settings import Settings
        from datasets import load_dataset
        from huggingface_hub import snapshot_download

        settings = settings or "ui"

        if name is None:
            name = repo_id

        if settings == "ui":
            return cls._run_settings_ui(
                repo_id=repo_id,
                subset=subset,
                split=split,
                client=client,
            )

        elif isinstance(settings, Settings):
            dataset = cls(name=name, settings=settings)
            dataset.create()
        else:
            try:
                # download configuration files from the hub
                folder_path = snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    allow_patterns=cls._DEFAULT_CONFIGURATION_FILES,
                    token=kwargs.get("token"),
                )

                dataset = cls.from_disk(
                    path=folder_path, workspace=workspace, name=name, client=client, with_records=with_records
                )
            except ImportDatasetError:
                from argilla import Settings

                settings = Settings.from_hub(repo_id=repo_id, subset=subset)
                dataset = cls.from_hub(
                    repo_id=repo_id,
                    name=name,
                    workspace=workspace,
                    client=client,
                    with_records=with_records,
                    settings=settings,
                    split=split,
                    subset=subset,
                    **kwargs,
                )
                return dataset

        if with_records:
            try:
                hf_dataset = load_dataset(
                    path=repo_id,
                    split=split,
                    name=subset,
                    **kwargs,
                )  # type: ignore
                hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs)
                cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
            except EmptyDatasetError:
                warnings.warn(
                    message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
                    category=UserWarning,
                )

        return dataset

    @staticmethod
    def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
        """This method extracts the responses from a Hugging Face dataset and returns a list of `Record` objects"""
        # Identify columns that contain responses
        responses_columns = [col for col in hf_dataset.column_names if ".responses" in col]
        response_questions = defaultdict(dict)
        user_ids = {}
        for col in responses_columns:
            question_name = col.split(".")[0]
            if col.endswith("users"):
                response_questions[question_name]["users"] = hf_dataset[col] or []
                for users in hf_dataset[col]:
                    if users is None:
                        continue
                    user_ids.update({UUID(user_id): user_id for user_id in users})
            elif col.endswith("responses"):
                response_questions[question_name]["responses"] = hf_dataset[col]
            elif col.endswith("status"):
                response_questions[question_name]["status"] = hf_dataset[col]

        # Check if all user ids are known to this Argilla client
        known_users_ids = [user.id for user in dataset._client.users]
        unknown_user_ids = set(user_ids.keys()) - set(known_users_ids)
        my_user = dataset._client.me
        if len(unknown_user_ids) > 0:
            warnings.warn(
                message=f"""Found unknown user ids in dataset repo: {unknown_user_ids}.
                    Assigning first response for each record to current user ({my_user.username}) and discarding the rest."""
            )
        for unknown_user_id in unknown_user_ids:
            user_ids[unknown_user_id] = my_user.id

        # Create a mapper to map the Hugging Face dataset to a Record object
        mapping = {}
        for col in hf_dataset.column_names:
            if ".suggestion" in col:
                mapping[col] = col
            elif col.startswith("metadata.") and col.replace("metadata.", "") in dataset.schema:
                mapping[col] = col.replace("metadata.", "")
            elif col.startswith("vector.") and col.replace("vector.", "") in dataset.schema:
                mapping[col] = col.replace("vector.", "")

        mapper = IngestedRecordMapper(dataset=dataset, mapping=mapping, user_id=my_user.id)

        # Extract responses and create Record objects
        records = []
        hf_dataset = HFDatasetsIO.to_argilla(hf_dataset=hf_dataset, mapper=mapper)
        for idx, row in enumerate(hf_dataset):
            record = mapper(row)
            for question_name, values in response_questions.items():
                response_values = values["responses"][idx] or []
                response_users = values["users"][idx] or []
                response_status = values["status"][idx] or []

                used_users = set()
                for value, user_id, status in zip(response_values, response_users, response_status):
                    user_id = user_ids[UUID(user_id)]
                    if user_id in used_users:
                        continue

                    used_users.add(user_id)
                    response = Response(
                        user_id=user_id,
                        question_name=question_name,
                        value=value,
                        status=status,
                    )
                    record.responses.add(response)
            records.append(record)

        try:
            dataset.records.log(records=records)
        except (RecordsIngestionError, UnprocessableEntityError) as e:
            raise SettingsError(
                message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema. Hugging face dataset features: {hf_dataset.features}. Argilla dataset settings : {dataset.settings}"
            ) from e

    @staticmethod
    def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
        """Get a single dataset from a Hugging Face dataset.

        Parameters:
            hf_dataset (HFDataset): The Hugging Face dataset to get a single dataset from.

        Returns:
            HFDataset: The single dataset.
        """

        if isinstance(hf_dataset, DatasetDict) and split is None:
            split = next(iter(hf_dataset.keys()))
            if len(hf_dataset.keys()) > 1:
                warnings.warn(
                    message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
                    f"Available splits are: {', '.join(hf_dataset.keys())}."
                )
            hf_dataset = hf_dataset[split]
        return hf_dataset

    @staticmethod
    def _get_sample_hf_record(hf_dataset: "HFDataset") -> Dict:
        """Get a sample record from a Hugging Face dataset.

        Parameters:
            hf_dataset (HFDataset): The Hugging Face dataset to get a sample record from.

        Returns:
            Dict: The sample record.
        """

        if hf_dataset:
            sample_huggingface_record = {}
            for key, value in hf_dataset[0].items():
                try:
                    json.dumps(value)
                    sample_huggingface_record[key] = value
                except TypeError:
                    if isinstance(value, Image.Image):
                        sample_huggingface_record[key] = pil_to_data_uri(value)
                    else:
                        sample_huggingface_record[key] = "Record value is not serializable"
            return sample_huggingface_record

    @classmethod
    def _run_settings_ui(cls, repo_id: str, subset: str, split: str, client: Optional["Argilla"] = None) -> str:
        from urllib.parse import quote_plus, urlencode
        from argilla.client import Argilla

        import webbrowser

        client = client or Argilla._get_default()

        params = {
            "subset": subset,
            "split": split,
        }

        url = f"{client.api_url.removesuffix('/')}/new/{quote_plus(repo_id)}?{urlencode(params)}"

        try:
            webbrowser.open(url, new=2, autoraise=True)
        except Exception as e:
            warnings.warn(f"Error opening the URL in the browser: {e}")
        finally:
            warnings.warn(f"Open the following URL in your browser to configure the dataset: {url}")
            return url

to_hub(repo_id, *, with_records=True, generate_card=True, **kwargs)

Pushes the Dataset to the Hugging Face Hub. If the dataset has been previously pushed to the Hugging Face Hub, it will be updated instead of creating a new dataset repo.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| repo_id | str | The ID of the Hugging Face Hub repo to push the Dataset to. | required |
| with_records | bool | Whether to push the dataset records to the Hub. Defaults to True. | True |
| generate_card | Optional[bool] | Whether to generate a dataset card for the Dataset in the Hugging Face Hub. Defaults to True. | True |
| **kwargs | Any | The kwargs to pass to datasets.Dataset.push_to_hub. | {} |

Returns:

| Type | Description |
|------|-------------|
| None | None |
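
A push sketch; the repo id is hypothetical, and a valid Hugging Face token is assumed (for example via the token kwarg):

dataset.to_hub(
    repo_id="my-org/my-dataset",  # hypothetical Hub repo id
    with_records=True,
    generate_card=True,
)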

Source code in src/argilla/datasets/_io/_hub.py
def to_hub(
    self: "Dataset",
    repo_id: str,
    *,
    with_records: bool = True,
    generate_card: Optional[bool] = True,
    **kwargs: Any,
) -> None:
    """Pushes the `Dataset` to the Hugging Face Hub. If the dataset has been previously pushed to the
    Hugging Face Hub, it will be updated instead of creating a new dataset repo.

    Parameters:
        repo_id: the ID of the Hugging Face Hub repo to push the `Dataset` to.
        with_records: whether to push the dataset records to the Hub. Defaults to `True`.
        generate_card: whether to generate a dataset card for the `Dataset` in the Hugging Face Hub. Defaults
            to `True`.
        **kwargs: the kwargs to pass to `datasets.Dataset.push_to_hub`.

    Returns:
        None
    """

    from huggingface_hub import DatasetCardData, HfApi

    from argilla.datasets._io.card import (
        ArgillaDatasetCard,
        size_categories_parser,
    )

    hf_api = HfApi(token=kwargs.get("token"))

    hfds = False
    if with_records:
        hfds = self.records(with_vectors=True, with_responses=True, with_suggestions=True).to_datasets()
        hfds.push_to_hub(repo_id, **kwargs)
    else:
        hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=kwargs.get("exist_ok", True))

    with TemporaryDirectory() as tmpdirname:
        config_dir = os.path.join(tmpdirname)

        self.to_disk(path=config_dir, with_records=False)

        if generate_card:
            sample_argilla_record = next(iter(self.records(with_suggestions=True, with_responses=True)))
            sample_huggingface_record = self._get_sample_hf_record(hfds) if with_records else None
            dataset_size = len(hfds) if with_records else 0
            card = ArgillaDatasetCard.from_template(
                card_data=DatasetCardData(
                    size_categories=size_categories_parser(dataset_size),
                    tags=["rlfh", "argilla", "human-feedback"],
                ),
                repo_id=repo_id,
                argilla_fields=self.settings.fields,
                argilla_questions=self.settings.questions,
                argilla_guidelines=self.settings.guidelines or None,
                argilla_vectors_settings=self.settings.vectors or None,
                argilla_metadata_properties=self.settings.metadata,
                argilla_record=sample_argilla_record.to_dict(),
                huggingface_record=sample_huggingface_record,
            )
            card.save(filepath=os.path.join(tmpdirname, "README.md"))

        hf_api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_id,
            repo_type="dataset",
        )

from_hub(repo_id, *, name=None, workspace=None, client=None, with_records=True, settings='ui', split=None, subset=None, **kwargs) classmethod

Loads a Dataset from the Hugging Face Hub.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| repo_id | str | The ID of the Hugging Face Hub repo to load the Dataset from. | required |
| name | str | The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended. | None |
| workspace | Union[Workspace, str] | The workspace to import the dataset to. Defaults to None and the default workspace is used. | None |
| client | Optional[Argilla] | The client to use to load the Dataset. If not provided, the default client will be used. | None |
| with_records | bool | Whether to load the records from the Hugging Face dataset. Defaults to True. | True |
| settings | Union[Settings, Literal['auto', 'ui']] | The settings to use to load the Dataset. If settings is "ui", a URL to configure the settings through Argilla is returned. If settings is "auto", the settings are inferred from the Features of the dataset on the Hub. Defaults to "ui". | 'ui' |
| split | Optional[str] | The split to load from the Hugging Face dataset. If not provided, the first split will be loaded. | None |
| subset | Optional[str] | The subset to load from the Hugging Face dataset. If not provided, the first subset will be loaded. | None |
| **kwargs | Any | The kwargs to pass to datasets.load_dataset. | {} |

Returns:

| Type | Description |
|------|-------------|
| Union[Dataset, str] | A Dataset loaded from the Hugging Face Hub. |
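
A load sketch; the repo id is hypothetical, and settings="auto" infers the settings from the dataset's Features instead of opening the configuration UI:

dataset = rg.Dataset.from_hub(
    repo_id="my-org/my-dataset",  # hypothetical Hub repo id
    settings="auto",
    with_records=True,
)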

Source code in src/argilla/datasets/_io/_hub.py
@classmethod
def from_hub(
    cls: Type["Dataset"],
    repo_id: str,
    *,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str]] = None,
    client: Optional["Argilla"] = None,
    with_records: bool = True,
    settings: Union["Settings", Literal["auto", "ui"]] = "ui",
    split: Optional[str] = None,
    subset: Optional[str] = None,
    **kwargs: Any,
) -> Union["Dataset", str]:
    """Loads a `Dataset` from the Hugging Face Hub.

    Parameters:
        repo_id: the ID of the Hugging Face Hub repo to load the `Dataset` from.
        name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
        workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
        client: the client to use to load the `Dataset`. If not provided, the default client will be used.
        with_records: whether to load the records from the Hugging Face dataset. Defaults to `True`.
        settings: the settings to use to load the `Dataset`. If settings are "ui", a URL to configure the settings
            through argilla will be returned. If settings are "auto",
            the settings will be inferred from the `Features` of the dataset on the hub. Defaults to "ui".
        split: the split to load from the Hugging Face dataset. If not provided, the first split will be loaded.
        subset: the subset to load from the Hugging Face dataset. If not provided, the first subset will be loaded.
        **kwargs: the kwargs to pass to `datasets.load_dataset`.

    Returns:
        A `Dataset` loaded from the Hugging Face Hub.
    """
    from argilla.settings import Settings
    from datasets import load_dataset
    from huggingface_hub import snapshot_download

    settings = settings or "ui"

    if name is None:
        name = repo_id

    if settings == "ui":
        return cls._run_settings_ui(
            repo_id=repo_id,
            subset=subset,
            split=split,
            client=client,
        )

    elif isinstance(settings, Settings):
        dataset = cls(name=name, settings=settings)
        dataset.create()
    else:
        try:
            # download configuration files from the hub
            folder_path = snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                allow_patterns=cls._DEFAULT_CONFIGURATION_FILES,
                token=kwargs.get("token"),
            )

            dataset = cls.from_disk(
                path=folder_path, workspace=workspace, name=name, client=client, with_records=with_records
            )
        except ImportDatasetError:
            from argilla import Settings

            settings = Settings.from_hub(repo_id=repo_id, subset=subset)
            dataset = cls.from_hub(
                repo_id=repo_id,
                name=name,
                workspace=workspace,
                client=client,
                with_records=with_records,
                settings=settings,
                split=split,
                subset=subset,
                **kwargs,
            )
            return dataset

    if with_records:
        try:
            hf_dataset = load_dataset(
                path=repo_id,
                split=split,
                name=subset,
                **kwargs,
            )  # type: ignore
            hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs)
            cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
        except EmptyDatasetError:
            warnings.warn(
                message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
                category=UserWarning,
            )

    return dataset