Source code for torchctr.datasets.base

#!/usr/bin/env python
# encoding: utf-8

import math
from torch.utils.data import Dataset
from collections import defaultdict


class BaseDataset(Dataset):
    """Base class for torchctr datasets: maps raw fields and feature values to integer ids."""

    def __init__(self):
        self.feature_mapper = {}  # field index -> {feature value (str): feature id}
        self.field_mapper = {}    # column name -> field index
    def load_data(self):
        """Populate ``self.data``, ``self.x_columns`` and ``self.y_column``; implemented by subclasses."""
        raise NotImplementedError
    def preprocess_x(self, non_categorical=None, non_categorical_func=None):
        """Encode every x column as contiguous per-field integer feature ids."""
        non_categorical = non_categorical or []
        if non_categorical_func is None:
            print("| Warning | No function specified for dense fields, so the default log bucketizer will be used")
            non_categorical_func = lambda x: int(math.log(x + 10) ** 2)
        # Assign every column a field index.
        self.field_mapper = {field: idx for idx, field in enumerate(self.x_columns)}
        # Count how often each feature value occurs, per field.
        feature_counter = defaultdict(lambda: defaultdict(int))
        for c in self.x_columns:
            if c in non_categorical:
                # Bucketize dense values so they can be treated as categories.
                self.data[c] = self.data[c].apply(non_categorical_func)
            di = self.data[c].value_counts()
            di.index = di.index.astype(str)
            feature_counter[self.field_mapper[c]] = di.to_dict()
        # Keep the features that actually occur ...
        feature_mapper = {
            i: {feat for feat, c in cnt.items() if c > 0}
            for i, cnt in feature_counter.items()
        }
        # ... and give each one a contiguous id within its field.
        self.feature_mapper = {
            i: {feat: idx for idx, feat in enumerate(cnt)}
            for i, cnt in feature_mapper.items()
        }
        # Replace every raw value with its feature id.
        self.data = self.data.astype({c: str for c in self.x_columns})
        for c in self.x_columns:
            self.data[c] = self.data[c].apply(
                lambda x: self.feature_mapper[self.field_mapper[c]][x]
            )
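    # Illustration (an assumed example, with x_columns == ["user", "price"] and
    # "price" dense): after preprocess_x,
    #   self.field_mapper   == {"user": 0, "price": 1}
    #   self.feature_mapper == {0: {"u1": <id>, "u2": <id>, ...},
    #                           1: {"9": <id>, "30": <id>, ...}}
    # and every cell of self.data[x_columns] holds the matching integer id.
    # Dense values are bucketized first, then stringified, hence keys like "9".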
    def preprocess_y(self, y_func=None):
        """Optionally transform the target column with ``y_func``."""
        if y_func is None:
            print("| Warning | No function specified for the target column, so the raw values will be used")
        else:
            self.data[self.y_column] = self.data[self.y_column].apply(y_func)
    # ``self.x`` and ``self.y`` are expected to be built by the subclass
    # (e.g. as tensors) after preprocessing.
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
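
For context, a minimal usage sketch (not part of the module): a toy subclass that satisfies the contract above. ``ToyDataset``, its column names, and the tensor-building lines are illustrative assumptions; the base class only requires that ``load_data`` populate ``self.data``, ``self.x_columns`` and ``self.y_column``, and that ``self.x`` / ``self.y`` exist before indexing.

import pandas as pd
import torch

class ToyDataset(BaseDataset):
    def load_data(self):
        # Hypothetical data; any DataFrame with x columns and a y column works.
        self.data = pd.DataFrame({
            "user":  ["u1", "u2", "u1", "u3"],
            "price": [12, 250, 12, 999],  # dense field
            "click": [0, 1, 0, 1],
        })
        self.x_columns = ["user", "price"]
        self.y_column = "click"

ds = ToyDataset()
ds.load_data()
ds.preprocess_x(non_categorical=["price"])  # "price" goes through the log bucketizer
ds.preprocess_y()                           # no y_func: labels are used as-is
# Assumed step: build the tensors that __len__ / __getitem__ read.
ds.x = torch.tensor(ds.data[ds.x_columns].values)
ds.y = torch.tensor(ds.data[ds.y_column].values)
print(len(ds), ds[0])  # -> 4 and the first (features, label) pair

After preprocessing, the encoded columns hold contiguous per-field integer ids, which is the form embedding layers expect.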