Kaggleとかのユーティリティのメモ
毎回探すのアホだろと思ったので自分用まとめ
適宜更新する
タイマー、seed固定、混同行列、feature importance 等
ヒートマップ高速描画（参考: qiita.com）
trainとtestのdataframeを結合したり切り離したりするやつ（参考: spcx8.hatenablog.com）
def merge_train_test(df_train, df_test):
    """Concatenate train and test frames, marking test rows with target == -1.

    Returns a new DataFrame with a fresh 0..n-1 index; neither input is
    modified (the original version mutated the caller's df_test).
    """
    # Work on a copy so adding the sentinel column does not leak back
    # to the caller's DataFrame.
    df_test = df_test.copy()
    if "target" not in df_test.columns:
        # -1 is assumed to never occur in df_train["target"] — TODO confirm
        # against the actual competition data.
        df_test["target"] = -1
    # ignore_index=True replaces the separate reset_index step.
    return pd.concat([df_train, df_test], ignore_index=True)


def split_train_test(df):
    """Split a merged frame back into (train, test) by the target == -1 sentinel."""
    # Non-inplace reset_index on the mask slices avoids the
    # SettingWithCopyWarning raised by reset_index(inplace=True) on a view.
    df_train = df[df["target"] != -1].reset_index(drop=True)
    df_test = df[df["target"] == -1].reset_index(drop=True)
    return df_train, df_test
# - count encoding
class CountEncoder:
    """Count (frequency) encoder: maps each category to its count in the fit data."""

    def fit(self, series):
        """Learn per-category counts from *series*.

        Returns self so fit/transform can be chained, sklearn-style.
        """
        # value_counts() is the idiomatic equivalent of
        # series.groupby(series).count() — same value -> count mapping.
        self.counts = series.value_counts()
        return self

    def transform(self, series):
        """Map each value to its learned count; categories unseen at fit time become 0."""
        return series.map(self.counts).fillna(0)
# Collate variable-length sequences into a single padded mini-batch for PyTorch.
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence


def pad_collate(batch):
    """Collate (feature, target) pairs of differing lengths into one mini-batch.

    Returns (padded_features, padded_targets, lengths) where lengths holds each
    sequence's true length before padding (needed later for packing).
    """
    features, targets = zip(*batch)
    # Capture the true lengths before padding erases them.
    lengths = torch.tensor([feat.shape[0] for feat in features])
    padded_features = pad_sequence(features, batch_first=True)
    padded_targets = pad_sequence(targets, batch_first=True)
    return padded_features, padded_targets, lengths


dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=pad_collate,
)
# When feeding an RNN, pack the padded batch as follows so the LSTM
# skips the padded timesteps.
lstm = nn.LSTM(n_features, 32, batch_first=True, bidirectional=True)
for feature, target, lengths in dataloader:
    # Size the hidden/cell state from the ACTUAL batch: the DataLoader's
    # last batch may be smaller than BATCH_SIZE unless drop_last=True,
    # which made the original (2, BATCH_SIZE, 32) shapes crash there.
    batch_size = feature.size(0)
    # 2 = num_layers * num_directions (1 layer, bidirectional).
    h0 = torch.randn(2, batch_size, 32, device=device)
    c0 = torch.randn(2, batch_size, 32, device=device)
    # enforce_sorted=False lets batches arrive in any length order.
    feature = pack_padded_sequence(feature, lengths, enforce_sorted=False, batch_first=True)
    x, _ = lstm(feature, (h0, c0))
    # Unpack back to a padded tensor; the returned lengths must match.
    x, l = pad_packed_sequence(x, batch_first=True)
    assert (l == lengths).all()
    break