"""Deepchem Dataset"""
from collections.abc import Mapping
import numpy as np
from ..utils import match, as_list, update_copy
from .dataset import Dataset, DictDataset, TeachableWrapperDataset
# pylint: disable=import-outside-toplevel
class DeepChemDataset(DictDataset):
    """
    DeepChem dataset

    A `DictDataset` whose columns follow the DeepChem naming scheme
    (`X`, `y`, `w`, `ids`), with optional featurization of `X`.

    Some common featurizers:

    Keras `GraphConvModel`s use the `ConvMolFeaturizer`, which may be
    abbreviated to `'convmol'` in the `featurizer` argument here.

    Pytorch `GCNModel`s use the `MolGraphConvFeaturizer`, which may be
    abbreviated to `'molgraph'` here.
    """

    def __init__(self, data=None, *args, featurizer='dummy', bdim=1, **kwargs):
        """
        Args:
            data: Either a mapping of column name -> values (it must end up
                containing an `'X'` entry), or a `deepchem.data.Dataset`,
                whose `X`, `ids` and (when available) `y`, `w` are copied
                into columns. Defaults to an empty mapping.
            *args: Forwarded positionally to `DictDataset.__init__`.
            featurizer: Anything accepted by `get_featurizer`. The resolved
                featurizer is applied to the `'X'` column. Pass `None` to
                skip featurization entirely.
            bdim: Forwarded unchanged to `DictDataset.__init__`.
            **kwargs: Merged into `data` through `update_copy`
                (presumably as additional/overriding columns -- confirm
                against `update_copy`).
        """
        import deepchem as dc

        # `None` sentinel instead of a shared mutable `{}` default.
        if data is None:
            data = {}

        if isinstance(data, dc.data.Dataset):
            dataset = data
            data = {
                "X": dataset.X,
                "ids": dataset.ids,
            }
            # Best effort: labels/weights are only copied when the DeepChem
            # dataset can provide them; a KeyError is treated as "not there".
            try:
                data["y"] = dataset.y
                data["w"] = dataset.w
            except KeyError:
                pass

        data = update_copy(data, kwargs)

        # Fill in the columns DeepChem expects when the caller omitted them.
        if "ids" not in data:
            data["ids"] = Dataset(np.arange(len(data['X'])))
        if "y" in data and "w" not in data:
            data["w"] = Dataset(np.ones(len(data['X']), dtype=np.float32))

        if featurizer is not None:
            data["X"] = Dataset(self.get_featurizer(featurizer).featurize(data["X"]))

        super().__init__(data, *args, bdim=bdim, has_Xy=True)

    @staticmethod
    def get_featurizer(f, **kwargs):
        """
        Resolves `f` to a DeepChem featurizer instance.

        Args:
            f: One of
                * `None` -- a `dc.feat.DummyFeaturizer` is returned;
                * a `dc.feat.Featurizer` instance -- returned as is;
                * a `dc.feat.Featurizer` subclass -- instantiated with
                  `**kwargs`;
                * anything else (typically a string) -- looked up among the
                  names in `dc.feat` with `match`, using a case-insensitive
                  "is contained in" comparison, and the selected entry is
                  called with `**kwargs`.
            **kwargs: Constructor arguments for the featurizer. Ignored
                when `f` is `None` or already an instance.

        Returns:
            A DeepChem featurizer instance.
        """
        import deepchem as dc

        if f is None:
            return dc.feat.DummyFeaturizer()
        if isinstance(f, dc.feat.Featurizer):
            return f
        if isinstance(f, type) and issubclass(f, dc.feat.Featurizer):
            return f(**kwargs)

        name = match(f, dc.feat.__dict__, lambda x, y: x.lower() in y.lower())
        return dc.feat.__dict__[name](**kwargs)

    @staticmethod
    def from_csv(file, X="X", y=None, featurizer=None, **kwargs):
        """
        Loads a DeepChem dataset from a `.csv` file.

        Args:
            file: The csv file, handed to `CSVLoader.create_dataset`.
            X, y (str): Column names for the X and y data. `y` may also be
                a list of column names, or `None` for no labels.
            featurizer: Specifies the DeepChem featurizer to use, if any.
                `featurizer` may be a DeepChem featurizer class, or a featurizer
                instance, *or* a string contained in the classname of a
                featurizer. (Eg., `'convmol'` matches the DeepChem `ConvMolFeaturizer`.)
            **kwargs: These are passed to the `deepchem.data.CSVLoader`
                constructor (not to the featurizer constructor).

        Returns:
            An `alien.data.DeepChemDataset`
        """
        import deepchem as dc

        y = as_list(y)
        loader = dc.data.CSVLoader(
            y,
            feature_field=X,
            featurizer=DeepChemDataset.get_featurizer(featurizer),
            **kwargs
        )
        disk_dataset = loader.create_dataset(file)

        data = {
            "X": disk_dataset.X,
            "ids": disk_dataset.ids,
        }
        # Labels and weights are only carried over when label columns
        # were actually requested.
        if y:
            data["y"] = disk_dataset.y
            data["w"] = disk_dataset.w

        return DeepChemDataset(data)

    @staticmethod
    def from_df(df, X="X", y=None, ids="ids", weights=None, featurizer=None, **kwargs):
        """
        Returns a DeepChemDataset built from a Pandas DataFrame.

        :param df: The dataframe to convert
        :param X: The name of the feature column. Defaults to "X".
        :param y: The name of the y/label column, or a list of names for
            multi-prediction. By default, no y values are extracted.
        :param ids: The name of the ids column. By default, looks for a column
            named 'ids', and if none is found, uses the dataframe index.
        :param weights: The name of the weights column. If none is given,
            uses 1.0 for all weights.
        :param featurizer: Specifies the DeepChem featurizer to use, if any.
            `featurizer` may be a DeepChem featurizer class, or a featurizer
            instance, *or* a string contained in the classname of a
            featurizer. (Eg., `'convmol'` matches the DeepChem `ConvMolFeaturizer`.)
        :param **kwargs: Any additional keyword args will become columns in
            the dataset; for example, keyword arg `t='timestamp'`, creates
            a column with key `t` and values taken from `df['timestamp']`.
        :raises ValueError: If the feature column, one of the `y` columns or
            the `weights` column is not in the dataframe.
        """
        y = as_list(y)

        # `y` is deliberately NOT forwarded: `_get_data_dict` treats unknown
        # keyword arguments as extra columns, which would bypass the
        # validation below and add a bogus `y` column when no labels are
        # requested.
        data_dict = DeepChemDataset._get_data_dict(df, X=X, ids=ids, **kwargs)

        if len(y) > 0:
            y_cols = []
            for y_col in y:
                if y_col not in df.columns:
                    raise ValueError(f"y-value `{y_col}` is not in the dataframe.")
                y_cols.append(df[y_col].values)
            # One column per label, i.e. shape (n_rows, n_labels).
            data_dict["y"] = np.stack(y_cols, axis=1)

        if weights is not None:
            if weights in df.columns:
                data_dict["w"] = df[weights].values
            else:
                raise ValueError(f"Dataframe doesn't contain the weights column '{weights}'.")
        else:
            data_dict["w"] = np.ones(len(df), dtype=float)

        if featurizer is not None:
            data_dict["X"] = DeepChemDataset.get_featurizer(featurizer).featurize(data_dict["X"])

        return DeepChemDataset(data_dict)

    @staticmethod
    def _get_data_dict(df, X="X", ids="ids", **kwargs):
        """
        Extracts the feature, id and extra columns of `df` into a dict.

        :param df: The source dataframe.
        :param X: Preferred feature column; falls back to a column literally
            named "X" when `X` is not in the dataframe.
        :param ids: Preferred ids column; falls back to a column named "ids",
            and finally to the dataframe index.
        :param **kwargs: `key=column_name` pairs; each adds an entry `key`
            holding `df[column_name].values`.
        :raises ValueError: If no feature column can be found.
        """
        data_dict = {}

        if X in df.columns:
            data_dict["X"] = df[X].values
        elif "X" in df.columns:
            data_dict["X"] = df["X"].values
        else:
            raise ValueError(
                f"Your dataframe must have either an 'X' column, or a user-specified X column. \nInstead, you have columns:\n{df.columns}"
            )

        if ids in df.columns:
            data_dict["ids"] = df[ids].values
        elif "ids" in df.columns:
            data_dict["ids"] = df.ids.values
        else:
            data_dict["ids"] = df.index.values

        for k, c in kwargs.items():
            data_dict[k] = df[c].values

        return data_dict

    def _to_DC(self):
        """
        Converts this dataset into a `deepchem.data.NumpyDataset`.

        Only the `X`, `y`, `w` and `ids` columns are handed over. Any other
        column (e.g. one added through the keyword arguments of `from_df`)
        has no counterpart in `NumpyDataset` and is left out, rather than
        being passed as an unexpected keyword argument.
        """
        import deepchem as dc

        dc_columns = ("X", "y", "w", "ids")
        return dc.data.NumpyDataset(
            **{k: v.data for k, v in self.data.items() if k in dc_columns}
        )
# def append(self, x):
# if 'ids' not in x:
# warn("Failed to include 'ids' key in DeepChemDataset.append")
# if len(self) == 0:
# x['ids'] = 0
# elif isinstance[self.data['ids'][0]] and
def as_DCDataset(data):
    """
    Convert data to a DeepChem dataset.

    * A `deepchem.data.Dataset` is returned untouched.
    * A `TeachableWrapperDataset` is first replaced by its `.data` attribute.
    * A mapping has each of its values converted with `np.asarray` and is
      then passed as keyword arguments to `deepchem.data.NumpyDataset`;
      when it has no `"ids"` entry, ids `0..n-1` are generated, with `n`
      the length of the first value in the mapping.
    * Anything else becomes the `X` array, with ids `0..len(data)-1`.

    Returns:
        A `deepchem.data.Dataset`.
    """
    import deepchem

    if isinstance(data, deepchem.data.Dataset):
        return data

    # Look through the wrapper at the data it holds.
    if isinstance(data, TeachableWrapperDataset):
        data = data.data

    # Plain array-like input: it is the feature array itself.
    if not isinstance(data, Mapping):
        columns = {"X": np.asarray(data), "ids": np.arange(len(data))}
        return deepchem.data.NumpyDataset(**columns)

    columns = {key: np.asarray(value) for key, value in data.items()}
    if "ids" not in columns:
        first_column = next(iter(columns.values()))
        columns["ids"] = np.arange(len(first_column))
    return deepchem.data.NumpyDataset(**columns)