rg.Dataset

Dataset is a class that represents a collection of records. It is used to store and manage records in Argilla.

Usage Examples

Creating a Dataset

To create a new dataset, define its name and settings. The optional workspace and client parameters let you create the dataset in a specific workspace or on a specific Argilla instance.

dataset = rg.Dataset(
    name="my_dataset",
    settings=rg.Settings(
        fields=[
            rg.TextField(name="text"),
        ],
        questions=[
            rg.TextQuestion(name="response"),
        ],
    ),
)
dataset.create()

For a detailed guide to the dataset creation and publication process, see the Dataset how-to guide.

Retrieving an existing Dataset

To retrieve an existing dataset, use client.datasets("my_dataset"):

dataset = client.datasets("my_dataset")
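
The snippet above assumes an initialized client. A minimal end-to-end sketch, where the api_url and api_key values are placeholders for your own instance:

import argilla as rg

# Connect to a running Argilla server; placeholder URL and key
client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")

# Retrieve the dataset and inspect its configuration
dataset = client.datasets("my_dataset")
print(dataset.settings)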

Dataset

Bases: Resource, HubImportExportMixin, DiskImportExportMixin

Class for interacting with Argilla Datasets

Attributes:

| Name | Type | Description |
|------|------|-------------|
| name | str | Name of the dataset. |
| records | DatasetRecords | The records object for the dataset. Used to interact with the records of the dataset by iterating, searching, etc. |
| settings | Settings | The settings object of the dataset. Used to configure the dataset with fields, questions, guidelines, etc. |
| fields | list | The fields of the dataset, for example the rg.TextField of the dataset. Defined in the settings. |
| questions | list | The questions of the dataset defined in the settings. For example, the rg.TextQuestion that you want labelers to answer. |
| guidelines | str | The guidelines of the dataset defined in the settings. Used to provide instructions to labelers. |
| allow_extra_metadata | bool | True if extra metadata is allowed, False otherwise. |
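
These attributes are read from the dataset's settings, so a quick sketch of accessing them might look like this, assuming an existing dataset object:

# Inspect the configuration attached to the dataset
for field in dataset.fields:
    print(field.name)  # e.g. "text"

print(dataset.guidelines)            # instructions shown to labelers
print(dataset.allow_extra_metadata)  # whether records may carry extra metadata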

Source code in src/argilla/datasets/_resource.py
class Dataset(Resource, HubImportExportMixin, DiskImportExportMixin):
    """Class for interacting with Argilla Datasets

    Attributes:
        name: Name of the dataset.
        records (DatasetRecords): The records object for the dataset. Used to interact with the records of the dataset by iterating, searching, etc.
        settings (Settings): The settings object of the dataset. Used to configure the dataset with fields, questions, guidelines, etc.
        fields (list): The fields of the dataset, for example the `rg.TextField` of the dataset. Defined in the settings.
        questions (list): The questions of the dataset defined in the settings. For example, the `rg.TextQuestion` that you want labelers to answer.
        guidelines (str): The guidelines of the dataset defined in the settings. Used to provide instructions to labelers.
        allow_extra_metadata (bool): True if extra metadata is allowed, False otherwise.
    """

    name: str
    id: Optional[UUID]

    _api: "DatasetsAPI"
    _model: "DatasetModel"

    def __init__(
        self,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str, UUID]] = None,
        settings: Optional[Settings] = None,
        client: Optional["Argilla"] = None,
    ) -> None:
        """Initializes a new Argilla Dataset object with the given parameters.

        Parameters:
            name (str): Name of the dataset. Replaced by random UUID if not assigned.
            workspace (Union[Workspace, str, UUID]): Workspace of the dataset. Default is the first workspace found on the server.
            settings (Settings): Settings class to be used to configure the dataset.
            client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
        """
        client = client or Argilla._get_default()
        super().__init__(client=client, api=client.api.datasets)
        if name is None:
            name = f"dataset_{uuid4()}"
            self._log_message(f"Settings dataset name to unique UUID: {name}")

        self._workspace = workspace
        self._model = DatasetModel(name=name)
        self._settings = settings._copy() if settings else Settings(_dataset=self)
        self._settings.dataset = self
        self.__records = DatasetRecords(client=self._client, dataset=self, mapping=self._settings.mapping)

    #####################
    #  Properties       #
    #####################

    @property
    def name(self) -> str:
        return self._model.name

    @name.setter
    def name(self, value: str) -> None:
        self._model.name = value

    @property
    def records(self) -> "DatasetRecords":
        return self.__records

    @property
    def settings(self) -> Settings:
        return self._settings

    @settings.setter
    def settings(self, value: Settings) -> None:
        settings_copy = value._copy()
        settings_copy.dataset = self
        self._settings = settings_copy

    @property
    def fields(self) -> list:
        return self.settings.fields

    @property
    def questions(self) -> list:
        return self.settings.questions

    @property
    def guidelines(self) -> str:
        return self.settings.guidelines

    @guidelines.setter
    def guidelines(self, value: str) -> None:
        self.settings.guidelines = value

    @property
    def allow_extra_metadata(self) -> bool:
        return self.settings.allow_extra_metadata

    @allow_extra_metadata.setter
    def allow_extra_metadata(self, value: bool) -> None:
        self.settings.allow_extra_metadata = value

    @property
    def schema(self) -> dict:
        return self.settings.schema

    @property
    def workspace(self) -> Workspace:
        self._workspace = self._resolve_workspace()
        return self._workspace

    @property
    def distribution(self) -> TaskDistribution:
        return self.settings.distribution

    @distribution.setter
    def distribution(self, value: TaskDistribution) -> None:
        self.settings.distribution = value

    #####################
    #  Core methods     #
    #####################

    def get(self) -> "Dataset":
        super().get()
        self.settings.get()
        return self

    def create(self) -> "Dataset":
        """Creates the dataset on the server with the `Settings` configuration.

        Returns:
            Dataset: The created dataset object.
        """
        try:
            super().create()
        except ForbiddenError as e:
            settings_url = f"{self._client.api_url}/user-settings"
            user_role = self._client.me.role.value
            user_name = self._client.me.username
            workspace_name = self.workspace.name
            message = f"""User '{user_name}' is not authorized to create a dataset in workspace '{workspace_name}'
            with role '{user_role}'. Go to {settings_url} to view your role."""
            raise ForbiddenError(message) from e
        try:
            return self._publish()
        except Exception as e:
            self._log_message(message=f"Error creating dataset: {e}", level="error")
            self._rollback_dataset_creation()
            raise SettingsError from e

    def update(self) -> "Dataset":
        """Updates the dataset on the server with the current settings.

        Returns:
            Dataset: The updated dataset object.
        """
        self.settings.update()
        return self

    def progress(self, with_users_distribution: bool = False) -> dict:
        """Returns the team's progress on the dataset.

        Parameters:
            with_users_distribution (bool): If True, the progress of the dataset is returned
                with users distribution. This includes the number of responses made by each user.

        Returns:
            dict: The team's progress on the dataset.

        An example of a response when `with_users_distribution` is `True`:
        ```json
        {
            "total": 100,
            "completed": 50,
            "pending": 50,
            "users": {
                "user1": {
                   "completed": { "submitted": 10, "draft": 5, "discarded": 5},
                   "pending": { "submitted": 5, "draft": 10, "discarded": 10},
                },
                "user2": {
                   "completed": { "submitted": 20, "draft": 10, "discarded": 5},
                   "pending": { "submitted": 2, "draft": 25, "discarded": 0},
                },
                ...
            }
        }
        ```

        """

        progress = self._api.get_progress(dataset_id=self._model.id).model_dump()

        if with_users_distribution:
            users_progress = self._api.list_users_progress(dataset_id=self._model.id)
            users_distribution = {
                user.username: {
                    "completed": user.completed.model_dump(),
                    "pending": user.pending.model_dump(),
                }
                for user in users_progress
            }

            progress.update({"users": users_distribution})

        return progress

    @classmethod
    def from_model(cls, model: DatasetModel, client: "Argilla") -> "Dataset":
        instance = cls(client=client, workspace=model.workspace_id, name=model.name)
        instance._model = model

        return instance

    #####################
    #  Utility methods  #
    #####################

    def api_model(self) -> DatasetModel:
        self._model.workspace_id = self.workspace.id
        return self._model

    def _publish(self) -> "Dataset":
        self._settings.create()
        self._api.publish(dataset_id=self._model.id)

        return self.get()

    def _resolve_workspace(self) -> Workspace:
        workspace = self._workspace

        if workspace is None:
            workspace = self._client.workspaces.default
            warnings.warn(f"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}")
        elif isinstance(workspace, str):
            workspace = self._client.workspaces(workspace)
            if workspace is None:
                available_workspace_names = [ws.name for ws in self._client.workspaces]
                raise NotFoundError(
                    f"Workspace with name {workspace} not found. Available workspaces: {available_workspace_names}"
                )
        elif isinstance(workspace, UUID):
            ws_model = self._client.api.workspaces.get(workspace)
            workspace = Workspace.from_model(ws_model, client=self._client)
        elif not isinstance(workspace, Workspace):
            raise ValueError(f"Wrong workspace value found {workspace}")

        return workspace

    def _rollback_dataset_creation(self):
        if not self._is_published():
            self.delete()

    def _is_published(self) -> bool:
        return self._model.status == "ready"

    def _with_client(self, client: Argilla) -> "Self":
        return super()._with_client(client=client)

__init__(name=None, workspace=None, settings=None, client=None)

Initializes a new Argilla Dataset object with the given parameters.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| name | str | Name of the dataset. Replaced by a random UUID if not assigned. | None |
| workspace | Union[Workspace, str, UUID] | Workspace of the dataset. Default is the first workspace found on the server. | None |
| settings | Settings | Settings class to be used to configure the dataset. | None |
| client | Argilla | Instance of Argilla to connect with the server. Default is the default client. | None |

Source code in src/argilla/datasets/_resource.py
def __init__(
    self,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str, UUID]] = None,
    settings: Optional[Settings] = None,
    client: Optional["Argilla"] = None,
) -> None:
    """Initializes a new Argilla Dataset object with the given parameters.

    Parameters:
        name (str): Name of the dataset. Replaced by random UUID if not assigned.
        workspace (Union[Workspace, str, UUID]): Workspace of the dataset. Default is the first workspace found on the server.
        settings (Settings): Settings class to be used to configure the dataset.
        client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
    """
    client = client or Argilla._get_default()
    super().__init__(client=client, api=client.api.datasets)
    if name is None:
        name = f"dataset_{uuid4()}"
        self._log_message(f"Settings dataset name to unique UUID: {name}")

    self._workspace = workspace
    self._model = DatasetModel(name=name)
    self._settings = settings._copy() if settings else Settings(_dataset=self)
    self._settings.dataset = self
    self.__records = DatasetRecords(client=self._client, dataset=self, mapping=self._settings.mapping)

create()

Creates the dataset on the server with the Settings configuration.

Returns:

| Name | Type | Description |
|------|------|-------------|
| Dataset | Dataset | The created dataset object. |

Source code in src/argilla/datasets/_resource.py
def create(self) -> "Dataset":
    """Creates the dataset on the server with the `Settings` configuration.

    Returns:
        Dataset: The created dataset object.
    """
    try:
        super().create()
    except ForbiddenError as e:
        settings_url = f"{self._client.api_url}/user-settings"
        user_role = self._client.me.role.value
        user_name = self._client.me.username
        workspace_name = self.workspace.name
        message = f"""User '{user_name}' is not authorized to create a dataset in workspace '{workspace_name}'
        with role '{user_role}'. Go to {settings_url} to view your role."""
        raise ForbiddenError(message) from e
    try:
        return self._publish()
    except Exception as e:
        self._log_message(message=f"Error creating dataset: {e}", level="error")
        self._rollback_dataset_creation()
        raise SettingsError from e
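
A sketch of the typical creation flow; the record payload is illustrative and assumes a field named text:

dataset = rg.Dataset(name="my_dataset", settings=settings)
dataset.create()

# Once published, records can be logged to the dataset
dataset.records.log([{"text": "Hello world"}])  # illustrative record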

update()

Updates the dataset on the server with the current settings.

Returns:

| Name | Type | Description |
|------|------|-------------|
| Dataset | Dataset | The updated dataset object. |

Source code in src/argilla/datasets/_resource.py
def update(self) -> "Dataset":
    """Updates the dataset on the server with the current settings.

    Returns:
        Dataset: The updated dataset object.
    """
    self.settings.update()
    return self
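
For example, a sketch of changing the guidelines and pushing the update to the server:

dataset = client.datasets("my_dataset")
dataset.guidelines = "Answer the question using the given text."  # illustrative guidelines
dataset.update()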

progress(with_users_distribution=False)

Returns the team's progress on the dataset.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| with_users_distribution | bool | If True, the progress of the dataset is returned with users distribution. This includes the number of responses made by each user. | False |

Returns:

| Name | Type | Description |
|------|------|-------------|
| dict | dict | The team's progress on the dataset. |

An example of a response when with_users_distribution is True:

{
    "total": 100,
    "completed": 50,
    "pending": 50,
    "users": {
        "user1": {
           "completed": { "submitted": 10, "draft": 5, "discarded": 5},
           "pending": { "submitted": 5, "draft": 10, "discarded": 10},
        },
        "user2": {
           "completed": { "submitted": 20, "draft": 10, "discarded": 5},
           "pending": { "submitted": 2, "draft": 25, "discarded": 0},
        },
        ...
    }
}

Source code in src/argilla/datasets/_resource.py
def progress(self, with_users_distribution: bool = False) -> dict:
    """Returns the team's progress on the dataset.

    Parameters:
        with_users_distribution (bool): If True, the progress of the dataset is returned
            with users distribution. This includes the number of responses made by each user.

    Returns:
        dict: The team's progress on the dataset.

    An example of a response when `with_users_distribution` is `True`:
    ```json
    {
        "total": 100,
        "completed": 50,
        "pending": 50,
        "users": {
            "user1": {
               "completed": { "submitted": 10, "draft": 5, "discarded": 5},
               "pending": { "submitted": 5, "draft": 10, "discarded": 10},
            },
            "user2": {
               "completed": { "submitted": 20, "draft": 10, "discarded": 5},
               "pending": { "submitted": 2, "draft": 25, "discarded": 0},
            },
            ...
        }
    }
    ```

    """

    progress = self._api.get_progress(dataset_id=self._model.id).model_dump()

    if with_users_distribution:
        users_progress = self._api.list_users_progress(dataset_id=self._model.id)
        users_distribution = {
            user.username: {
                "completed": user.completed.model_dump(),
                "pending": user.pending.model_dump(),
            }
            for user in users_progress
        }

        progress.update({"users": users_distribution})

    return progress
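
A usage sketch; the keys follow the example response above:

progress = dataset.progress(with_users_distribution=True)
print(progress["total"], progress["completed"], progress["pending"])

# Per-user counts are only present when with_users_distribution=True
for username, counts in progress["users"].items():
    print(username, counts["completed"], counts["pending"])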

DiskImportExportMixin

Bases: ABC

A mixin for exporting and importing datasets to and from disk.

Source code in src/argilla/datasets/_io/_disk.py
class DiskImportExportMixin(ABC):
    """A mixin for exporting and importing datasets to and from disk."""

    _model: DatasetModel
    _DEFAULT_RECORDS_PATH = "records.json"
    _DEFAULT_CONFIG_REPO_DIR = ".argilla"
    _DEFAULT_SETTINGS_PATH = f"{_DEFAULT_CONFIG_REPO_DIR}/settings.json"
    _DEFAULT_DATASET_PATH = f"{_DEFAULT_CONFIG_REPO_DIR}/dataset.json"
    _DEFAULT_CONFIGURATION_FILES = [_DEFAULT_SETTINGS_PATH, _DEFAULT_DATASET_PATH]

    def to_disk(self: "Dataset", path: str, *, with_records: bool = True) -> str:
        """Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

        Parameters:
            path (str): The path to export the dataset to. Must be an empty directory.
            with_records: whether to export the records to disk. Defaults to `True`.
        """
        dataset_path, settings_path, records_path = self._define_child_paths(path=path)
        logging.info(f"Loading dataset from {dataset_path}")
        logging.info(f"Loading settings from {settings_path}")
        logging.info(f"Loading records from {records_path}")
        # Export the dataset model, settings and records
        self._persist_dataset_model(path=dataset_path)
        self.settings.to_json(path=settings_path)
        if with_records:
            self.records.to_json(path=records_path)

        return path

    @classmethod
    def from_disk(
        cls: Type["Dataset"],
        path: str,
        *,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str]] = None,
        client: Optional["Argilla"] = None,
        with_records: bool = True,
    ) -> "Dataset":
        """Imports a dataset from disk as a directory containing the dataset model, settings and records.
        The directory should be defined using the `to_disk` method.

        Parameters:
            path (str): The path to the directory containing the dataset model, settings and records.
            name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
            workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
            client (Argilla, optional): The client to use for the import. Defaults to None and the default client is used.
            with_records: whether to load the records from disk. Defaults to `True`.
        """

        client = client or Argilla._get_default()

        try:
            dataset_path, settings_path, records_path = cls._define_child_paths(path=path)
            logging.info(f"Loading dataset from {dataset_path}")
            logging.info(f"Loading settings from {settings_path}")
            logging.info(f"Loading records from {records_path}")

            dataset_model = cls._load_dataset_model(path=dataset_path)
        except (NotADirectoryError, FileNotFoundError) as e:
            raise ImportDatasetError(f"Error loading dataset from disk. {e}") from e

        # Get the relevant workspace_id of the incoming dataset
        if isinstance(workspace, str):
            workspace = client.workspaces(workspace)
            if not workspace:
                raise ArgillaError(f"Workspace {workspace} not found on the server.")
        else:
            warnings.warn("Workspace not provided. Using default workspace.")
            workspace = client.workspaces.default
        dataset_model.workspace_id = workspace.id

        if name and (name != dataset_model.name):
            logging.info(f"Changing dataset name from {dataset_model.name} to {name}")
            dataset_model.name = name

        if client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace.id):
            warnings.warn(
                f"Loaded dataset name {dataset_model.name} already exists in the workspace {workspace.name} so using it. To create a new dataset, provide a unique name to the `name` parameter."
            )
            dataset_model = client.api.datasets.get_by_name_and_workspace_id(
                name=dataset_model.name, workspace_id=workspace.id
            )
            dataset = cls.from_model(model=dataset_model, client=client)
        else:
            # Create a new dataset and load the settings and records
            if not os.path.exists(settings_path):
                raise ImportDatasetError(f"Settings file not found at {settings_path}")

            dataset = cls.from_model(model=dataset_model, client=client)
            dataset.settings = Settings.from_json(path=settings_path)
            dataset.create()

        if os.path.exists(records_path) and with_records:
            try:
                dataset.records.from_json(path=records_path)
            except RecordsIngestionError as e:
                raise RecordsIngestionError(
                    message="Error importing dataset records from disk. "
                    "Records and datasets settings are not compatible."
                ) from e

        return dataset

    ############################
    # Utility methods
    ############################

    def _persist_dataset_model(self, path: Path):
        """Persists the dataset model to disk."""
        if path.exists():
            raise FileExistsError(f"Dataset already exists at {path}")
        with open(file=path, mode="w") as f:
            json.dump(self.api_model().model_dump(), f)

    @classmethod
    def _load_dataset_model(cls, path: Path):
        """Loads the dataset model from disk."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"Dataset model not found at {path}")
        with open(file=path, mode="r") as f:
            dataset_model = json.load(f)
            dataset_model = DatasetModel(**dataset_model)
        return dataset_model

    @classmethod
    def _define_child_paths(cls, path: Union[Path, str]) -> Tuple[Path, Path, Path]:
        path = Path(path)
        if not path.is_dir():
            raise NotADirectoryError(f"Path {path} is not a directory")
        main_path = path / cls._DEFAULT_CONFIG_REPO_DIR
        main_path.mkdir(exist_ok=True)
        dataset_path = path / cls._DEFAULT_DATASET_PATH
        settings_path = path / cls._DEFAULT_SETTINGS_PATH
        records_path = path / cls._DEFAULT_RECORDS_PATH
        return dataset_path, settings_path, records_path

to_disk(path, *, with_records=True)

Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| path | str | The path to export the dataset to. Must be an empty directory. | required |
| with_records | bool | Whether to export the records to disk. Defaults to True. | True |

Source code in src/argilla/datasets/_io/_disk.py
def to_disk(self: "Dataset", path: str, *, with_records: bool = True) -> str:
    """Exports the dataset to disk in the given path. The dataset is exported as a directory containing the dataset model, settings and records as json files.

    Parameters:
        path (str): The path to export the dataset to. Must be an empty directory.
        with_records: whether to export the records to disk. Defaults to `True`.
    """
    dataset_path, settings_path, records_path = self._define_child_paths(path=path)
    logging.info(f"Loading dataset from {dataset_path}")
    logging.info(f"Loading settings from {settings_path}")
    logging.info(f"Loading records from {records_path}")
    # Export the dataset model, settings and records
    self._persist_dataset_model(path=dataset_path)
    self.settings.to_json(path=settings_path)
    if with_records:
        self.records.to_json(path=records_path)

    return path
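
A usage sketch; the path is a placeholder and should point to an existing, empty directory:

dataset.to_disk(path="my_dataset_dir", with_records=True)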

from_disk(path, *, name=None, workspace=None, client=None, with_records=True) classmethod

Imports a dataset from disk as a directory containing the dataset model, settings and records. The directory should be defined using the to_disk method.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| path | str | The path to the directory containing the dataset model, settings and records. | required |
| name | str, optional | The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended. | None |
| workspace | Union[Workspace, str], optional | The workspace to import the dataset to. Defaults to None and the default workspace is used. | None |
| client | Argilla, optional | The client to use for the import. Defaults to None and the default client is used. | None |
| with_records | bool | Whether to load the records from disk. Defaults to True. | True |
Source code in src/argilla/datasets/_io/_disk.py
@classmethod
def from_disk(
    cls: Type["Dataset"],
    path: str,
    *,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str]] = None,
    client: Optional["Argilla"] = None,
    with_records: bool = True,
) -> "Dataset":
    """Imports a dataset from disk as a directory containing the dataset model, settings and records.
    The directory should be defined using the `to_disk` method.

    Parameters:
        path (str): The path to the directory containing the dataset model, settings and records.
        name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
        workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
        client (Argilla, optional): The client to use for the import. Defaults to None and the default client is used.
        with_records: whether to load the records from disk. Defaults to `True`.
    """

    client = client or Argilla._get_default()

    try:
        dataset_path, settings_path, records_path = cls._define_child_paths(path=path)
        logging.info(f"Loading dataset from {dataset_path}")
        logging.info(f"Loading settings from {settings_path}")
        logging.info(f"Loading records from {records_path}")

        dataset_model = cls._load_dataset_model(path=dataset_path)
    except (NotADirectoryError, FileNotFoundError) as e:
        raise ImportDatasetError(f"Error loading dataset from disk. {e}") from e

    # Get the relevant workspace_id of the incoming dataset
    if isinstance(workspace, str):
        workspace = client.workspaces(workspace)
        if not workspace:
            raise ArgillaError(f"Workspace {workspace} not found on the server.")
    else:
        warnings.warn("Workspace not provided. Using default workspace.")
        workspace = client.workspaces.default
    dataset_model.workspace_id = workspace.id

    if name and (name != dataset_model.name):
        logging.info(f"Changing dataset name from {dataset_model.name} to {name}")
        dataset_model.name = name

    if client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace.id):
        warnings.warn(
            f"Loaded dataset name {dataset_model.name} already exists in the workspace {workspace.name} so using it. To create a new dataset, provide a unique name to the `name` parameter."
        )
        dataset_model = client.api.datasets.get_by_name_and_workspace_id(
            name=dataset_model.name, workspace_id=workspace.id
        )
        dataset = cls.from_model(model=dataset_model, client=client)
    else:
        # Create a new dataset and load the settings and records
        if not os.path.exists(settings_path):
            raise ImportDatasetError(f"Settings file not found at {settings_path}")

        dataset = cls.from_model(model=dataset_model, client=client)
        dataset.settings = Settings.from_json(path=settings_path)
        dataset.create()

    if os.path.exists(records_path) and with_records:
        try:
            dataset.records.from_json(path=records_path)
        except RecordsIngestionError as e:
            raise RecordsIngestionError(
                message="Error importing dataset records from disk. "
                "Records and datasets settings are not compatible."
            ) from e

    return dataset
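
A usage sketch; the path and names are placeholders:

dataset = rg.Dataset.from_disk(
    path="my_dataset_dir",     # directory previously written by to_disk
    name="imported_dataset",   # placeholder name
    workspace="my_workspace",  # placeholder workspace
)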

HubImportExportMixin

Bases: DiskImportExportMixin

Source code in src/argilla/datasets/_io/_hub.py
class HubImportExportMixin(DiskImportExportMixin):
    def to_hub(
        self: "Dataset",
        repo_id: str,
        *,
        with_records: bool = True,
        generate_card: Optional[bool] = True,
        **kwargs: Any,
    ) -> None:
        """Pushes the `Dataset` to the Hugging Face Hub. If the dataset has been previously pushed to the
        Hugging Face Hub, it will be updated instead of creating a new dataset repo.

        Parameters:
            repo_id: the ID of the Hugging Face Hub repo to push the `Dataset` to.
            with_records: whether to push the records to the Hugging Face Hub. Defaults to `True`.
            generate_card: whether to generate a dataset card for the `Dataset` in the Hugging Face Hub. Defaults
                to `True`.
            **kwargs: the kwargs to pass to `datasets.Dataset.push_to_hub`.

        Returns:
            None
        """

        from huggingface_hub import DatasetCardData, HfApi

        from argilla.datasets._io.card import (
            ArgillaDatasetCard,
            size_categories_parser,
        )

        hf_api = HfApi(token=kwargs.get("token"))

        hfds = False
        if with_records:
            hfds = self.records(with_vectors=True, with_responses=True, with_suggestions=True).to_datasets()
            hfds.push_to_hub(repo_id, **kwargs)
        else:
            hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=kwargs.get("exist_ok") or True)

        with TemporaryDirectory() as tmpdirname:
            config_dir = os.path.join(tmpdirname)

            self.to_disk(path=config_dir, with_records=False)

            if generate_card:
                sample_argilla_record = next(iter(self.records(with_suggestions=True, with_responses=True)))
                sample_huggingface_record = self._get_sample_hf_record(hfds) if with_records else None
                dataset_size = len(hfds) if with_records else 0
                card = ArgillaDatasetCard.from_template(
                    card_data=DatasetCardData(
                        size_categories=size_categories_parser(dataset_size),
                        tags=["rlfh", "argilla", "human-feedback"],
                    ),
                    repo_id=repo_id,
                    argilla_fields=self.settings.fields,
                    argilla_questions=self.settings.questions,
                    argilla_guidelines=self.settings.guidelines or None,
                    argilla_vectors_settings=self.settings.vectors or None,
                    argilla_metadata_properties=self.settings.metadata,
                    argilla_record=sample_argilla_record.to_dict(),
                    huggingface_record=sample_huggingface_record,
                )
                card.save(filepath=os.path.join(tmpdirname, "README.md"))

            hf_api.upload_folder(
                folder_path=tmpdirname,
                repo_id=repo_id,
                repo_type="dataset",
            )

    @classmethod
    def from_hub(
        cls: Type["Dataset"],
        repo_id: str,
        *,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str]] = None,
        client: Optional["Argilla"] = None,
        with_records: bool = True,
        settings: Union["Settings", Literal["auto", "ui"]] = "ui",
        split: Optional[str] = None,
        subset: Optional[str] = None,
        **kwargs: Any,
    ) -> Union["Dataset", str]:
        """Loads a `Dataset` from the Hugging Face Hub.

        Parameters:
            repo_id: the ID of the Hugging Face Hub repo to load the `Dataset` from.
            name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
            workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
            client: the client to use to load the `Dataset`. If not provided, the default client will be used.
            with_records: whether to load the records from the Hugging Face dataset. Defaults to `True`.
            settings: the settings to use to load the `Dataset`. If settings are "ui", a URL to configure the settings
                through argilla will be returned. If settings are "auto",
                the settings will be inferred from the `Features` of the dataset on the hub. Defaults to "ui".
            split: the split to load from the Hugging Face dataset. If not provided, the first split will be loaded.
            subset: the subset to load from the Hugging Face dataset. If not provided, the first subset will be loaded.
            **kwargs: the kwargs to pass to `datasets.load_dataset`.

        Returns:
            A `Dataset` loaded from the Hugging Face Hub.
        """
        from argilla.settings import Settings
        from datasets import load_dataset
        from huggingface_hub import snapshot_download

        settings = settings or "ui"

        if name is None:
            name = repo_id

        if settings == "ui":
            return cls._run_settings_ui(
                repo_id=repo_id,
                subset=subset,
                split=split,
                client=client,
            )

        elif isinstance(settings, Settings):
            dataset = cls(name=name, settings=settings)
            dataset.create()
        else:
            try:
                # download configuration files from the hub
                folder_path = snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    allow_patterns=cls._DEFAULT_CONFIGURATION_FILES,
                    token=kwargs.get("token"),
                )

                dataset = cls.from_disk(
                    path=folder_path, workspace=workspace, name=name, client=client, with_records=with_records
                )
            except ImportDatasetError:
                from argilla import Settings

                settings = Settings.from_hub(repo_id=repo_id, subset=subset)
                dataset = cls.from_hub(
                    repo_id=repo_id,
                    name=name,
                    workspace=workspace,
                    client=client,
                    with_records=with_records,
                    settings=settings,
                    split=split,
                    subset=subset,
                    **kwargs,
                )
                return dataset

        if with_records:
            try:
                hf_dataset = load_dataset(
                    path=repo_id,
                    split=split,
                    name=subset,
                    **kwargs,
                )  # type: ignore
                hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs)
                cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
            except EmptyDatasetError:
                warnings.warn(
                    message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
                    category=UserWarning,
                )

        return dataset

    @staticmethod
    def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
        """This method extracts the responses from a Hugging Face dataset and returns a list of `Record` objects"""
        # Identify columns that contain responses
        responses_columns = [col for col in hf_dataset.column_names if ".responses" in col]
        response_questions = defaultdict(dict)
        user_ids = {}
        for col in responses_columns:
            question_name = col.split(".")[0]
            if col.endswith("users"):
                response_questions[question_name]["users"] = hf_dataset[col]
                user_ids.update({UUID(user_id): UUID(user_id) for user_id in set(sum(hf_dataset[col], []))})
            elif col.endswith("responses"):
                response_questions[question_name]["responses"] = hf_dataset[col]
            elif col.endswith("status"):
                response_questions[question_name]["status"] = hf_dataset[col]

        # Check if all user ids are known to this Argilla client
        known_users_ids = [user.id for user in dataset._client.users]
        unknown_user_ids = set(user_ids.keys()) - set(known_users_ids)
        my_user = dataset._client.me
        if len(unknown_user_ids) > 1:
            warnings.warn(
                message=f"""Found unknown user ids in dataset repo: {unknown_user_ids}.
                    Assigning first response for each record to current user ({my_user.username}) and discarding the rest."""
            )
        for unknown_user_id in unknown_user_ids:
            user_ids[unknown_user_id] = my_user.id

        # Create a mapper to map the Hugging Face dataset to a Record object
        mapping = {col: col for col in hf_dataset.column_names if ".suggestion" in col}
        mapper = IngestedRecordMapper(dataset=dataset, mapping=mapping, user_id=my_user.id)

        # Extract responses and create Record objects
        records = []
        hf_dataset = HFDatasetsIO.to_argilla(hf_dataset=hf_dataset, mapper=mapper)
        for idx, row in enumerate(hf_dataset):
            record = mapper(row)
            for question_name, values in response_questions.items():
                response_values = values["responses"][idx]
                response_users = values["users"][idx]
                response_status = values["status"][idx]
                for value, user_id, status in zip(response_values, response_users, response_status):
                    user_id = user_ids[UUID(user_id)]
                    if user_id in response_users:
                        continue
                    response_users[user_id] = True
                    response = Response(
                        user_id=user_id,
                        question_name=question_name,
                        value=value,
                        status=status,
                    )
                    record.responses.add(response)
            records.append(record)

        try:
            dataset.records.log(records=records)
        except (RecordsIngestionError, UnprocessableEntityError) as e:
            raise SettingsError(
                message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema. Hugging face dataset features: {hf_dataset.features}. Argilla dataset settings : {dataset.settings}"
            ) from e

    @staticmethod
    def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
        """Get a single dataset from a Hugging Face dataset.

        Parameters:
            hf_dataset (HFDataset): The Hugging Face dataset to get a single dataset from.

        Returns:
            HFDataset: The single dataset.
        """

        if isinstance(hf_dataset, DatasetDict) and split is None:
            split = next(iter(hf_dataset.keys()))
            if len(hf_dataset.keys()) > 1:
                warnings.warn(
                    message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
                    f"Available splits are: {', '.join(hf_dataset.keys())}."
                )
            hf_dataset = hf_dataset[split]
        return hf_dataset

    @staticmethod
    def _get_sample_hf_record(hf_dataset: "HFDataset") -> Dict:
        """Get a sample record from a Hugging Face dataset.

        Parameters:
            hf_dataset (HFDataset): The Hugging Face dataset to get a sample record from.

        Returns:
            Dict: The sample record.
        """

        if hf_dataset:
            sample_huggingface_record = {}
            for key, value in hf_dataset[0].items():
                try:
                    json.dumps(value)
                    sample_huggingface_record[key] = value
                except TypeError:
                    if isinstance(value, Image.Image):
                        sample_huggingface_record[key] = pil_to_data_uri(value)
                    else:
                        sample_huggingface_record[key] = "Record value is not serializable"
            return sample_huggingface_record

    @classmethod
    def _run_settings_ui(cls, repo_id: str, subset: str, split: str, client: Optional["Argilla"] = None) -> str:
        from urllib.parse import quote_plus, urlencode
        from argilla.client import Argilla

        import webbrowser

        client = client or Argilla._get_default()

        params = {
            "subset": subset,
            "split": split,
        }

        url = f"{client.api_url.removesuffix('/')}/new/{quote_plus(repo_id)}?{urlencode(params)}"

        try:
            webbrowser.open(url, new=2, autoraise=True)
        except Exception as e:
            warnings.warn(f"Error opening the URL in the browser: {e}")
        finally:
            warnings.warn(f"Open the following URL in your browser to configure the dataset: {url}")
            return url

to_hub(repo_id, *, with_records=True, generate_card=True, **kwargs)

Pushes the Dataset to the Hugging Face Hub. If the dataset has been previously pushed to the Hugging Face Hub, it will be updated instead of creating a new dataset repo.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| repo_id | str | The ID of the Hugging Face Hub repo to push the Dataset to. | required |
| with_records | bool | Whether to push the records to the Hugging Face Hub. Defaults to True. | True |
| generate_card | Optional[bool] | Whether to generate a dataset card for the Dataset in the Hugging Face Hub. Defaults to True. | True |
| **kwargs | Any | The kwargs to pass to datasets.Dataset.push_to_hub. | {} |

Returns:

| Type | Description |
|------|-------------|
| None | None |

Source code in src/argilla/datasets/_io/_hub.py
def to_hub(
    self: "Dataset",
    repo_id: str,
    *,
    with_records: bool = True,
    generate_card: Optional[bool] = True,
    **kwargs: Any,
) -> None:
    """Pushes the `Dataset` to the Hugging Face Hub. If the dataset has been previously pushed to the
    Hugging Face Hub, it will be updated instead of creating a new dataset repo.

    Parameters:
        repo_id: the ID of the Hugging Face Hub repo to push the `Dataset` to.
        with_records: whether to push the records to the Hugging Face Hub. Defaults to `True`.
        generate_card: whether to generate a dataset card for the `Dataset` in the Hugging Face Hub. Defaults
            to `True`.
        **kwargs: the kwargs to pass to `datasets.Dataset.push_to_hub`.

    Returns:
        None
    """

    from huggingface_hub import DatasetCardData, HfApi

    from argilla.datasets._io.card import (
        ArgillaDatasetCard,
        size_categories_parser,
    )

    hf_api = HfApi(token=kwargs.get("token"))

    hfds = False
    if with_records:
        hfds = self.records(with_vectors=True, with_responses=True, with_suggestions=True).to_datasets()
        hfds.push_to_hub(repo_id, **kwargs)
    else:
        hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=kwargs.get("exist_ok") or True)

    with TemporaryDirectory() as tmpdirname:
        config_dir = os.path.join(tmpdirname)

        self.to_disk(path=config_dir, with_records=False)

        if generate_card:
            sample_argilla_record = next(iter(self.records(with_suggestions=True, with_responses=True)))
            sample_huggingface_record = self._get_sample_hf_record(hfds) if with_records else None
            dataset_size = len(hfds) if with_records else 0
            card = ArgillaDatasetCard.from_template(
                card_data=DatasetCardData(
                    size_categories=size_categories_parser(dataset_size),
                    tags=["rlfh", "argilla", "human-feedback"],
                ),
                repo_id=repo_id,
                argilla_fields=self.settings.fields,
                argilla_questions=self.settings.questions,
                argilla_guidelines=self.settings.guidelines or None,
                argilla_vectors_settings=self.settings.vectors or None,
                argilla_metadata_properties=self.settings.metadata,
                argilla_record=sample_argilla_record.to_dict(),
                huggingface_record=sample_huggingface_record,
            )
            card.save(filepath=os.path.join(tmpdirname, "README.md"))

        hf_api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_id,
            repo_type="dataset",
        )
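
A usage sketch; the repo ID and token are placeholders:

dataset.to_hub(
    repo_id="username/my_dataset",  # placeholder Hub repo
    with_records=True,
    token="hf_xxx",                 # placeholder Hugging Face token, forwarded via **kwargs
)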

from_hub(repo_id, *, name=None, workspace=None, client=None, with_records=True, settings='ui', split=None, subset=None, **kwargs) classmethod

Loads a Dataset from the Hugging Face Hub.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| repo_id | str | The ID of the Hugging Face Hub repo to load the Dataset from. | required |
| name | str, optional | The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended. | None |
| workspace | Union[Workspace, str], optional | The workspace to import the dataset to. Defaults to None and the default workspace is used. | None |
| client | Optional[Argilla] | The client to use to load the Dataset. If not provided, the default client will be used. | None |
| with_records | bool | Whether to load the records from the Hugging Face dataset. Defaults to True. | True |
| settings | Union[Settings, Literal['auto', 'ui']] | The settings to use to load the Dataset. If "ui", a URL to configure the settings through Argilla is returned. If "auto", the settings are inferred from the Features of the dataset on the Hub. Defaults to "ui". | 'ui' |
| split | Optional[str] | The split to load from the Hugging Face dataset. If not provided, the first split is loaded. | None |
| subset | Optional[str] | The subset to load from the Hugging Face dataset. If not provided, the first subset is loaded. | None |
| **kwargs | Any | The kwargs to pass to datasets.load_dataset. | {} |

Returns:

| Type | Description |
|------|-------------|
| Union[Dataset, str] | A Dataset loaded from the Hugging Face Hub, or a URL to configure the dataset settings in the Argilla UI when settings="ui". |

Source code in src/argilla/datasets/_io/_hub.py
@classmethod
def from_hub(
    cls: Type["Dataset"],
    repo_id: str,
    *,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str]] = None,
    client: Optional["Argilla"] = None,
    with_records: bool = True,
    settings: Union["Settings", Literal["auto", "ui"]] = "ui",
    split: Optional[str] = None,
    subset: Optional[str] = None,
    **kwargs: Any,
) -> Union["Dataset", str]:
    """Loads a `Dataset` from the Hugging Face Hub.

    Parameters:
        repo_id: the ID of the Hugging Face Hub repo to load the `Dataset` from.
        name (str, optional): The name to assign to the new dataset. Defaults to None and the dataset's source name is used, unless it already exists, in which case a unique UUID is appended.
        workspace (Union[Workspace, str], optional): The workspace to import the dataset to. Defaults to None and default workspace is used.
        client: the client to use to load the `Dataset`. If not provided, the default client will be used.
        with_records: whether to load the records from the Hugging Face dataset. Defaults to `True`.
        settings: the settings to use to load the `Dataset`. If settings are "ui", a URL to configure the settings
            through argilla will be returned. If settings are "auto",
            the settings will be inferred from the `Features` of the dataset on the hub. Defaults to "ui".
        split: the split to load from the Hugging Face dataset. If not provided, the first split will be loaded.
        subset: the subset to load from the Hugging Face dataset. If not provided, the first subset will be loaded.
        **kwargs: the kwargs to pass to `datasets.load_dataset`.

    Returns:
        A `Dataset` loaded from the Hugging Face Hub.
    """
    from argilla.settings import Settings
    from datasets import load_dataset
    from huggingface_hub import snapshot_download

    settings = settings or "ui"

    if name is None:
        name = repo_id

    if settings == "ui":
        return cls._run_settings_ui(
            repo_id=repo_id,
            subset=subset,
            split=split,
            client=client,
        )

    elif isinstance(settings, Settings):
        dataset = cls(name=name, settings=settings)
        dataset.create()
    else:
        try:
            # download configuration files from the hub
            folder_path = snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                allow_patterns=cls._DEFAULT_CONFIGURATION_FILES,
                token=kwargs.get("token"),
            )

            dataset = cls.from_disk(
                path=folder_path, workspace=workspace, name=name, client=client, with_records=with_records
            )
        except ImportDatasetError:
            from argilla import Settings

            settings = Settings.from_hub(repo_id=repo_id, subset=subset)
            dataset = cls.from_hub(
                repo_id=repo_id,
                name=name,
                workspace=workspace,
                client=client,
                with_records=with_records,
                settings=settings,
                split=split,
                subset=subset,
                **kwargs,
            )
            return dataset

    if with_records:
        try:
            hf_dataset = load_dataset(
                path=repo_id,
                split=split,
                name=subset,
                **kwargs,
            )  # type: ignore
            hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs)
            cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
        except EmptyDatasetError:
            warnings.warn(
                message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
                category=UserWarning,
            )

    return dataset
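
A usage sketch; the repo ID is a placeholder, and settings="auto" infers the configuration from the dataset's features instead of opening the UI:

dataset = rg.Dataset.from_hub(
    repo_id="username/my_dataset",  # placeholder Hub repo
    settings="auto",
    with_records=True,
)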