Memo: utility snippets for Kaggle and the like

Looking these up every single time felt dumb, so here's a roundup for my own use.
Updated as needed.

  • Timer, seed fixing, confusion matrix, feature importance, etc. (seed-fixing sketch below)
    qiita.com
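
Of these, seed fixing is the one I reach for most often, so here's a minimal sketch of just that part (the other utilities are in the link above):

import os
import random

import numpy as np
import torch

def seed_everything(seed=42):
    # fix every RNG that typically matters in a Kaggle pipeline
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)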

  • Fast heatmap rendering
    spcx8.hatenablog.com
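
I haven't re-checked exactly what the linked post does, but as my own minimal sketch of the idea: for large matrices, matplotlib's imshow rasterizes the whole array as one image and tends to be much faster than seaborn.heatmap:

import matplotlib.pyplot as plt

def fast_heatmap(matrix):
    # one rasterized image instead of one drawn patch per cell
    fig, ax = plt.subplots()
    im = ax.imshow(matrix, cmap="viridis", aspect="auto")
    fig.colorbar(im, ax=ax)
    return fig, ax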

  • Merging the train and test dataframes and splitting them back apart

import pandas as pd

def merge_train_test(df_train, df_test):
    if "target" not in df_test.columns:
        df_test["target"] = -1  # a value that never appears in df_train["target"]
    res = pd.concat([df_train, df_test])
    res.reset_index(inplace=True, drop=True)
    return res

def split_train_test(df):
    # rows flagged with target == -1 are the test rows
    df_train = df[df["target"] != -1].reset_index(drop=True)
    df_test = df[df["target"] == -1].reset_index(drop=True)
    return df_train, df_test
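
Usage sketch (df_train / df_test and the column "col" are hypothetical; the point is to engineer features on train and test in one go):

import numpy as np

df_all = merge_train_test(df_train, df_test)
df_all["col_log"] = np.log1p(df_all["col"])  # "col" is a hypothetical column
df_train, df_test = split_train_test(df_all)
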
  • Count encoding
class CountEncoder:
    def fit(self, series):
        # occurrence count of each value (equivalent to series.value_counts())
        self.counts = series.groupby(series).count()

    def transform(self, series):
        # values unseen during fit map to NaN, so fill with 0
        return series.map(self.counts).fillna(0)
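
Usage sketch (fitting on train only here so nothing leaks from test; "category" is a hypothetical column):

enc = CountEncoder()
enc.fit(df_train["category"])
df_train["category_ce"] = enc.transform(df_train["category"])
df_test["category_ce"] = enc.transform(df_test["category"])  # unseen values become 0
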
  • Collating variable-length sequences into a single mini-batch in PyTorch
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

def pad_collate(batch):
    feature, target = zip(*batch)
    lengths = torch.tensor([len(f) for f in feature])  # original lengths, kept for packing later
    feature = pad_sequence(feature, batch_first=True)  # zero-pad up to the longest sequence in the batch
    target = pad_sequence(target, batch_first=True)
    return feature, target, lengths

# dataset is assumed to yield (feature, target) pairs of varying sequence length
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
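
Quick shape check with a toy batch (three sequences of lengths 3, 1, 2, one feature each):

toy = [
    (torch.randn(3, 1), torch.randn(3)),
    (torch.randn(1, 1), torch.randn(1)),
    (torch.randn(2, 1), torch.randn(2)),
]
feature, target, lengths = pad_collate(toy)
print(feature.shape, target.shape, lengths)  # (3, 3, 1), (3, 3), tensor([3, 1, 2])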

When feeding this into an RNN or the like, pack it as follows:

import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm = nn.LSTM(n_features, 32, batch_first=True, bidirectional=True).to(device)

for feature, target, lengths in dataloader:
    feature, target = feature.to(device), target.to(device)
    # initial states: (num_layers * num_directions, batch, hidden_size)
    h0 = torch.randn(2, feature.size(0), 32, device=device)
    c0 = torch.randn(2, feature.size(0), 32, device=device)

    # lengths has to stay on the CPU for pack_padded_sequence
    feature = pack_padded_sequence(feature, lengths, enforce_sorted=False, batch_first=True)
    x, _ = lstm(feature, (h0, c0))
    x, l = pad_packed_sequence(x, batch_first=True)  # back to a padded (batch, max_len, 64) tensor
    assert (l == lengths).all()

    break
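
When computing a per-step loss on the padded output, the padded positions shouldn't contribute; a minimal masking sketch (my own, with a hypothetical linear head on top of the 32 x 2 = 64 bidirectional LSTM outputs):

head = nn.Linear(32 * 2, 1).to(device)

pred = head(x).squeeze(-1)                              # (batch, max_len)
steps = torch.arange(x.size(1), device=x.device)
mask = steps[None, :] < lengths.to(x.device)[:, None]   # True on real (non-padded) steps
loss = ((pred - target) ** 2)[mask].mean()              # e.g. a masked MSE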