Cool pandas hack - get random rows in a multi-column dataframe
# `pd` is used throughout this snippet but was never imported in it.
import pandas as pd

#
# load inputs
#
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files
# that come from a trusted source.
actives = pd.read_pickle("actives_final.pkl")
decoys = pd.read_pickle("decoys_final.pkl")
#
# stack tables
#
df = pd.concat([actives, decoys])
#
# remove duplicate indices
#
# After concat the index labels of the two tables may repeat; reset_index()
# replaces them with a fresh 0..n-1 range and keeps the old labels as a column.
df = df.reset_index()
#
# one row per (category, molId), taken from the end of the ascending 'tc' order
#
# The chain is wrapped in parentheses so it can span several lines; a line
# starting with a bare ".method()" is a syntax error otherwise.
# NOTE(review): GroupBy.last() takes the last non-null entry of each column,
# so if the data contains NaNs the result may combine values from different
# rows -- confirm the inputs have no missing values.
ordered = (
    df.sort_values(by='tc')
    .groupby(['category', 'molId'])
    .last()
    .reset_index()
)
#
# one random row per (category, molId)
#
# sample(frac=1) returns all rows in shuffled order, so the "last" row of each
# group is a random member of that group; the fixed random_state makes the
# pick reproducible.
shuffled = (
    df.sample(frac=1, random_state=123456)
    .groupby(['category', 'molId'])
    .last()
    .reset_index()
)
Comments
Post a Comment