#!/usr/bin/env python
# coding=utf-8
from ast import Assert
import os, sys
from re import L
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.cuda import FloatTensor, LongTensor
import numpy as np
# Per-model required CSV columns: LPKT needs raw "timestamps" so that
# inter-interaction interval times can be derived during loading.
ModelConf = {"lpkt": ["timestamps"]}
class LPKTDataset(Dataset):
    """Dataset for the LPKT knowledge-tracing model.

    Reads an interaction-sequence CSV, keeps only the requested folds, and
    converts each student's sequence into padded tensors, including the
    answer-time (``utseqs``) and interval-time (``itseqs``) index sequences
    that LPKT consumes. Preprocessed tensors are cached next to the CSV as a
    pickle so repeated runs skip the CSV parse.

    Args:
        file_path (str): path of the sequence CSV file.
        at2idx (dict): maps answer-time strings (seconds) to embedding indices.
        it2idx (dict): maps interval-time strings (minutes) to embedding indices.
        input_type (list[str]): which id columns to load; any of
            ``"questions"`` and ``"concepts"``.
        folds (Iterable[int]): folds to keep from the CSV's ``fold`` column.
        qtest (bool, optional): if True, also load question-level evaluation
            metadata (``qidxs``/``rest``/``orirow`` columns). Defaults to False.
    """

    def __init__(self, file_path, at2idx, it2idx, input_type, folds, qtest=False):
        super(LPKTDataset, self).__init__()
        self.sequence_path = file_path
        self.at2idx = at2idx
        self.it2idx = it2idx
        self.input_type = input_type
        self.qtest = qtest
        folds = list(folds)
        folds_str = "_" + "_".join([str(_) for _ in folds])
        if self.qtest:
            processed_data = file_path + folds_str + "_lpkt_qtest.pkl"
        else:
            processed_data = file_path + folds_str + "_lpkt.pkl"
        if not os.path.exists(processed_data):
            print(f"Start preprocessing {file_path} fold: {folds_str}...")
            if self.qtest:
                self.dori, self.dqtest = self.__load_data__(self.sequence_path, folds)
                save_data = [self.dori, self.dqtest]
            else:
                self.dori = self.__load_data__(self.sequence_path, folds)
                save_data = self.dori
            # Cache the tensors so the next run with the same folds skips parsing.
            pd.to_pickle(save_data, processed_data)
        else:
            print(f"Read data from processed file: {processed_data}")
            if self.qtest:
                self.dori, self.dqtest = pd.read_pickle(processed_data)
            else:
                self.dori = pd.read_pickle(processed_data)
        print(f"file path: {file_path}, qlen: {len(self.dori['qseqs'])}, clen: {len(self.dori['cseqs'])}, rlen: {len(self.dori['rseqs'])}")

    def __len__(self):
        """Return the number of student sequences in the dataset."""
        return len(self.dori["rseqs"])

    def __getitem__(self, index):
        """Return one sequence split into current and one-step-shifted views.

        Args:
            index (int): index of the sequence to fetch.

        Returns:
            (dict or tuple): ``dcur`` mapping each key of ``self.dori`` (except
            the masks) to the masked 0..seqlen-2 slice, and ``"shft_"+key`` to
            the masked 1..seqlen-1 slice, plus ``"masks"`` and ``"smasks"``.
            When ``self.qtest`` is True, additionally returns ``dqtest`` with
            the question-level evaluation rows for this index.
        """
        dcur = dict()
        mseqs = self.dori["masks"][index]
        for key in self.dori:
            if key in ["masks", "smasks"]:
                continue
            if len(self.dori[key]) == 0:
                # Column absent from input_type/CSV: pass the empty tensor through.
                dcur[key] = self.dori[key]
                dcur["shft_" + key] = self.dori[key]
                continue
            # Zero out padded positions via the boolean mask.
            seqs = self.dori[key][index][:-1] * mseqs
            shft_seqs = self.dori[key][index][1:] * mseqs
            dcur[key] = seqs
            dcur["shft_" + key] = shft_seqs
        dcur["masks"] = mseqs
        dcur["smasks"] = self.dori["smasks"][index]
        if not self.qtest:
            return dcur
        dqtest = dict()
        for key in self.dqtest:
            dqtest[key] = self.dqtest[key][index]
        return dcur, dqtest

    def __load_data__(self, sequence_path, folds, pad_val=-1):
        """Parse the sequence CSV into the tensor dict LPKT consumes.

        Args:
            sequence_path (str): file path of the sequences CSV.
            folds (list[int]): folds to keep.
            pad_val (int, optional): pad value. Defaults to -1.

        Returns:
            dict: ``dori`` with LongTensors ``qseqs``/``cseqs``/``tseqs``/
            ``utseqs``/``itseqs``, FloatTensor ``rseqs``, and boolean
            ``masks``/``smasks``. When ``self.qtest`` is True, also returns
            ``dqtest`` with per-question evaluation indices.

        Raises:
            ValueError: if a column required by LPKT (see ``ModelConf``) is
                missing from the CSV.
        """
        dori = {"qseqs": [], "cseqs": [], "rseqs": [], "tseqs": [], "utseqs": [], "smasks": [], "itseqs": []}
        df = pd.read_csv(sequence_path)
        df = df[df["fold"].isin(folds)]
        # Fail fast with a clear message instead of an opaque IndexError later:
        # the interval-time computation below needs the timestamps column.
        missing = [col for col in ModelConf["lpkt"] if col not in df.columns]
        if missing:
            raise ValueError(f"key: {missing} not in data: {self.sequence_path}! can not run lpkt model!")
        interaction_num = 0
        dqtest = {"qidxs": [], "rests": [], "orirow": []}
        for i, row in df.iterrows():
            # use kc_id or question_id as input
            if "concepts" in self.input_type:
                dori["cseqs"].append([int(_) for _ in row["concepts"].split(",")])
            if "questions" in self.input_type:
                dori["qseqs"].append([int(_) for _ in row["questions"].split(",")])
            if "timestamps" in row:
                dori["tseqs"].append([int(_) for _ in row["timestamps"].split(",")])
            if "usetimes" in row:
                # Answer times are mapped through at2idx (keyed by whole seconds).
                at = [self.at2idx[str(int(float(_)))] for _ in row["usetimes"].split(",")]
                dori["utseqs"].append(at)
            dori["rseqs"].append([int(_) for _ in row["responses"].split(",")])
            dori["smasks"].append([int(_) for _ in row["selectmasks"].split(",")])
            # Interval time: minutes since the previous interaction, clipped to
            # [-1, 43200] (43200 min = 30 days), then mapped through it2idx.
            timestamps = dori["tseqs"][-1]
            shft_timestamps = [0] + timestamps[:-1]
            it = np.maximum(np.minimum((np.array(timestamps) - np.array(shft_timestamps)) // 60, 43200), -1)
            dori["itseqs"].append([self.it2idx[str(t)] for t in it])
            interaction_num += dori["smasks"][-1].count(1)
            if self.qtest:
                dqtest["qidxs"].append([int(_) for _ in row["qidxs"].split(",")])
                dqtest["rests"].append([int(_) for _ in row["rest"].split(",")])
                dqtest["orirow"].append([int(_) for _ in row["orirow"].split(",")])
        for key in dori:
            # Only responses are stored as floats; every other column is an index.
            if key == "rseqs":
                dori[key] = FloatTensor(dori[key])
            else:
                dori[key] = LongTensor(dori[key])
        # A position is valid only if both it and its successor are non-padding.
        mask_seqs = (dori["cseqs"][:, :-1] != pad_val) * (dori["cseqs"][:, 1:] != pad_val)
        dori["masks"] = mask_seqs
        dori["smasks"] = (dori["smasks"][:, 1:] != pad_val)
        print(f"interaction_num: {interaction_num}")
        if self.qtest:
            for key in dqtest:
                dqtest[key] = LongTensor(dqtest[key])[:, 1:]
            return dori, dqtest
        return dori