1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
def _resolve_bin_edges(score_values, n):
    """Return ``(edges, include_lowest)`` used to bin *score_values*.

    ``edges`` is a 1-D array of bin edges for ``pd.cut``; ``include_lowest``
    tells the caller whether the first interval must be closed on the left.
    """
    # Explicit edges supplied by the caller are used verbatim.
    if isinstance(n, (list, tuple, np.ndarray)):
        return np.asarray(n), False

    # ``bool`` is an ``int`` subclass; keep treating it as "not specified".
    if isinstance(n, (int, np.integer)) and not isinstance(n, bool) and n > 0:
        # NOTE(review): a positive int yields the integer edges 0..n-1
        # (n - 1 unit-wide bins), not "n groups" -- confirm with callers
        # before changing; behaviour is kept as-is here.
        return np.arange(n), False

    # Automatic width-2 bins aligned to even numbers.
    present = score_values[~pd.isna(score_values)]
    if present.size == 0:
        raise ValueError("score must contain at least one non-missing value")
    lowest = present.min() // 2 * 2            # round min down to an even edge
    highest = -(-present.max() // 2) * 2       # round max UP to an even edge
    if highest <= lowest:
        highest = lowest + 2                   # always keep at least one bin
    # ``highest + 1`` makes the exclusive stop of arange include ``highest``,
    # so the top scores are no longer dropped from the last bin.
    return np.arange(lowest, highest + 1, 2), True


def cal_score_distribution(labels, score, n=0):
    """Compute per-bin and cumulative classification metrics by score range.

    Scores are cut into right-closed intervals, samples are counted per
    interval, and the table is ordered from the highest interval to the
    lowest so that every cumulative column accumulates from the top score
    downwards.

    Args:
        labels: sequence of binary labels, 1 meaning "bad" and 0 meaning
            "good". Paired with ``score`` by position.
        score: sequence of numeric scores, same length as ``labels``.
        n: binning specification.
            * list / tuple / ndarray -- used directly as bin edges.
            * positive int -- integer edges ``0, 1, ..., n - 1``.
            * anything else (default ``0``) -- width-2 bins aligned to even
              numbers that cover the whole observed score range, including
              the minimum and maximum score.

    Returns:
        pandas.DataFrame with one row per bin (empty bins included), sorted
        by bin in descending order. Columns listed in the internal mapping
        are renamed to their Chinese display labels; ``overdue_rate``,
        ``woe``, ``iv``, ``pass_rate``, ``range_ks`` and ``ks`` keep their
        English names. Samples whose score falls outside explicit edges are
        not counted. Ratios for bins with a zero denominator are
        ``NaN``/``inf``.

    Raises:
        ValueError: if ``labels`` and ``score`` differ in length, or if
            automatic binning is requested and ``score`` has no
            non-missing value.
    """
    column_labels = {
        'precision': '准确率',
        'cum_precision': '累计准确率',
        'recall': '召回率',
        'cum_recall': '累计召回率',
        'disturb': '打扰率',
        'cum_disturb': '累计打扰率',
        'good_rate': '好样本占比',
        'bad_rate': '坏样本占比',
        'total': '区间样本分配',
        'total_rate': '总量占比',
        'pred': '分数区间',
        'good': '好样本',
        'bad': '坏样本',
        'cum_good': '好样本累计总量',
        'cum_bad': '坏样本累计总量',
        'sum': '总量',
    }

    # Work on plain arrays so lists are accepted and pairing is positional.
    score_values = np.asarray(score)
    label_values = np.asarray(labels)
    if len(score_values) != len(label_values):
        raise ValueError("labels and score must have the same length")

    edges, include_lowest = _resolve_bin_edges(score_values, n)
    score_bins = pd.cut(score_values, edges, include_lowest=include_lowest)

    samples = pd.DataFrame({'bad': label_values, 'pred': score_bins})
    samples['good'] = 1 - samples.bad

    # observed=False keeps empty bins regardless of the pandas default.
    per_bin = samples.groupby('pred', observed=False).agg(
        {'good': 'sum', 'bad': 'sum'}
    )
    result_df = per_bin.reset_index().sort_values(by=['pred'], ascending=False)

    total_good = result_df['good'].sum()
    total_bad = result_df['bad'].sum()

    result_df['good_rate'] = result_df['good'] / total_good
    result_df['bad_rate'] = result_df['bad'] / total_bad
    result_df['cum_good'] = result_df.good.cumsum()
    result_df['cum_bad'] = result_df.bad.cumsum()
    result_df['total'] = result_df['good'] + result_df['bad']
    result_df['total_rate'] = result_df['total'] / result_df['total'].sum()
    result_df['sum'] = result_df['cum_good'] + result_df['cum_bad']
    result_df['overdue_rate'] = result_df['cum_bad'] / result_df['sum']
    # Bins with no good or no bad samples give +/-inf or NaN here; silence
    # the numpy warnings, the values themselves are left untouched.
    with np.errstate(divide='ignore', invalid='ignore'):
        result_df['woe'] = np.log(result_df['good_rate'] / result_df['bad_rate'])
    result_df['iv'] = (
        result_df['good_rate'] - result_df['bad_rate']
    ) * result_df['woe']
    result_df['pass_rate'] = result_df['sum'] / (total_good + total_bad)
    result_df['precision'] = result_df['bad'] / result_df['total']
    result_df['cum_precision'] = result_df['cum_bad'] / result_df['sum']
    result_df['recall'] = result_df['bad'] / total_bad
    result_df['cum_recall'] = result_df['cum_bad'] / total_bad
    result_df['disturb'] = result_df['good'] / total_good
    result_df['cum_disturb'] = result_df['cum_good'] / total_good
    result_df['range_ks'] = (
        result_df['cum_bad'] / total_bad - result_df['cum_good'] / total_good
    )
    # Single KS value (largest signed gap) broadcast to every row.
    result_df['ks'] = result_df['range_ks'].max()

    return result_df.rename(columns=column_labels)
|