资金流入流出预测比赛(五)

本文涉及内容为时间序列规则,参考链接为Datawhale 资金流入流出学习内容

数据预处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
from typing import *
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
np.random.seed(1024)

labels = ['total_purchase_amt', 'total_redeem_amt']
1
2
3
4
5
6
7
8
9
10
11
# 分割数据集

def split_data_underline(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the offline (local validation) train and test sets.

    Rows dated 2014-04-01 through 2014-07-31 form the training set; rows dated
    2014-08-01 through 2014-08-31 form the test set.

    Args:
        data: Frame with a ``date`` column (datetime64, ``datetime.date``
            objects, or anything ``pd.to_datetime`` can parse).

    Returns:
        ``(trainset, testset)`` as row subsets of ``data``.
    """
    # Compare Timestamps with Timestamps: current pandas raises TypeError when
    # a datetime64 column is ordered against a plain ``datetime.date``.
    dates = pd.to_datetime(data['date'])
    train_mask = (dates >= pd.Timestamp(2014, 4, 1)) & (dates < pd.Timestamp(2014, 8, 1))
    test_mask = (dates >= pd.Timestamp(2014, 8, 1)) & (dates < pd.Timestamp(2014, 9, 1))
    return data[train_mask], data[test_mask]

def split_data_online(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the online (submission) train and test sets.

    Rows dated 2014-04-01 through 2014-08-31 form the training set; rows dated
    2014-09-01 through 2014-09-30 form the test set.

    Args:
        data: Frame with a ``date`` column (datetime64, ``datetime.date``
            objects, or anything ``pd.to_datetime`` can parse).

    Returns:
        ``(trainset, testset)`` as row subsets of ``data``.
    """
    # Compare Timestamps with Timestamps: current pandas raises TypeError when
    # a datetime64 column is ordered against a plain ``datetime.date``.
    dates = pd.to_datetime(data['date'])
    train_mask = (dates >= pd.Timestamp(2014, 4, 1)) & (dates < pd.Timestamp(2014, 9, 1))
    test_mask = (dates >= pd.Timestamp(2014, 9, 1)) & (dates < pd.Timestamp(2014, 10, 1))
    return data[train_mask], data[test_mask]
1
2
3
4
5
6
7
# 定义评价函数

def AE(y: Iterable, yhat: Iterable)->Iterable:
    """Return the element-wise relative absolute error ``|y - yhat| / |y|``.

    Both arguments must support element-wise arithmetic (numpy arrays or
    pandas Series); the result has the type that arithmetic produces.
    """
    residual = np.abs(y - yhat)
    scale = np.abs(y)
    return residual / scale

def total_AE(purchasehat: Iterable, redeemhat: Iterable, purchase: Iterable, redeem: Iterable, h: int = 0.3)->Iterable:
    """Return the combined competition score for purchase and redeem predictions.

    Each relative error ``e`` (from ``AE``) contributes ``10 * exp(-e / h)``;
    the purchase total is weighted 0.45 and the redeem total 0.55.
    ``h`` is used as a divisor and defaults to the float 0.3 despite its
    ``int`` annotation.
    """
    def _score(actual, predicted):
        # Sum of the per-element scores for one target.
        return sum(np.exp(-err / h) * 10 for err in AE(actual, predicted))

    purchase_score = _score(purchase, purchasehat)
    redeem_score = _score(redeem, redeemhat)
    return purchase_score * 0.45 + redeem_score * 0.55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 在不同的时间段对模型进行验证

def week_evalution_single(data: pd.DataFrame, model: object, types: str)->pd.DataFrame:
    """Score ``model`` on seven sliding validation windows.

    For each split day the model is fitted on the four months before it and
    evaluated on the month starting at it. The score of a window is the sum of
    ``10 * exp(-AE / 0.3)`` over its test rows.

    Args:
        data: Frame with ``date``, ``total_purchase_amt``, ``total_redeem_amt``
            and feature columns; every other column is used as a feature.
        model: Estimator exposing ``fit(X=..., y=...)`` and ``predict(X)``.
        types: ``'purchase'`` or ``'redeem'``; selects ``total_<types>_amt``.

    Returns:
        Single-column frame (column ``0``) with one score per split day, in
        the order of the split days below.
    """
    h = 0.3
    target = 'total_' + types + '_amt'
    a_month = relativedelta(months=1)
    # Compare Timestamps with Timestamps: current pandas raises TypeError when
    # a datetime64 column is ordered against a plain ``datetime.date``.
    dates = pd.to_datetime(data['date'])
    split_days = [
        datetime.date(2014, 8, 1), datetime.date(2014, 7, 25),
        datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
        datetime.date(2014, 7, 4), datetime.date(2014, 6, 27),
        datetime.date(2014, 6, 20),
    ]
    feature = [x for x in data.columns
               if x not in ['total_purchase_amt', 'total_redeem_amt', 'date']]

    results = []
    for i in split_days:
        train_mask = (dates >= pd.Timestamp(i - 4 * a_month)) & (dates < pd.Timestamp(i))
        test_mask = (dates >= pd.Timestamp(i)) & (dates < pd.Timestamp(i + a_month))
        trainset, testset = data[train_mask], data[test_mask]
        if len(testset) == 0 or len(trainset) == 0:
            # Fallback kept from the original: re-split at 2014-04-20 and test
            # on everything up to (excluding) 2014-09-01.
            i = datetime.date(2014, 4, 20)
            train_mask = (dates >= pd.Timestamp(i - 4 * a_month)) & (dates < pd.Timestamp(i))
            test_mask = (dates >= pd.Timestamp(i)) & (dates < pd.Timestamp(2014, 9, 1))
            trainset, testset = data[train_mask], data[test_mask]

        model.fit(X=trainset[feature], y=trainset[target])
        predicted = model.predict(testset[feature])

        results.append(sum(AE(testset[target], predicted).apply(lambda x: np.exp(-x / h)) * 10))
    return pd.DataFrame(results)
1
2
3
4
5
6
7
# 输出评价表格

def draw_eva_table(df: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``df`` with an ``interval`` column of split days.

    ``df`` must have exactly seven rows, one per split day listed below;
    the input frame itself is left untouched.
    """
    split_days = [
        datetime.date(2014, 8, 1),
        datetime.date(2014, 7, 25),
        datetime.date(2014, 7, 18),
        datetime.date(2014, 7, 11),
        datetime.date(2014, 7, 4),
        datetime.date(2014, 6, 27),
        datetime.date(2014, 6, 20),
    ]
    return df.assign(interval=split_days)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# 对生成结果进行可视化

def _plot_real_vs_predicted(dates, predicted, actual, kind):
    """Draw one predicted-vs-real line chart, then a per-day error bar chart."""
    plt.figure(figsize=(10, 4))
    plt.plot(dates, predicted, label='predicted_' + kind)
    plt.plot(dates, actual, label='real_' + kind)

    plt.legend(loc='best')
    plt.title("The distribution of real and predict " + kind)
    plt.xlabel("Time")
    plt.ylabel("Amount")
    plt.show()

    plt.figure(figsize=(10, 4))
    # Keyword arguments: current seaborn no longer accepts positional x/y.
    sns.barplot(x=dates.dt.day, y=predicted - actual)


def visual(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->None:
    """Plot predictions against the real purchase and redeem amounts.

    For each target a line chart of predicted vs. real values is shown,
    followed by a bar chart of ``prediction - real`` per day of month.

    Args:
        result_purchase_lr: Predicted purchase amounts, aligned with ``testset``.
        result_redeem_lr: Predicted redeem amounts, aligned with ``testset``.
        testset: Frame with a datetime64 ``date`` column plus
            ``total_purchase_amt`` and ``total_redeem_amt``.
    """
    # The original labelled the real purchase series 'real_redeem'; the helper
    # derives both legend labels from the target name instead.
    _plot_real_vs_predicted(testset['date'], result_purchase_lr,
                            testset['total_purchase_amt'], 'purchase')
    _plot_real_vs_predicted(testset['date'], result_redeem_lr,
                            testset['total_redeem_amt'], 'redeem')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# 定义提取线下最好效果特征的函数

def feature_extract(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Greedy forward feature selection in a random order.

    Candidate features are shuffled, then each is kept only if adding it
    raises the mean sliding-window score from ``week_evalution_single``.

    Args:
        data: Frame with the label columns, ``date`` and candidate features.
        model: Estimator passed through to ``week_evalution_single``.
        types: ``'purchase'`` or ``'redeem'``.

    Returns:
        ``(selected_features, best_mean_score)``.
    """
    candidates = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(candidates)
    selected = []
    score = -1
    for name in candidates:
        trial = data[selected + [name] + labels + ['date']]
        # Average the score column explicitly (as feature_extract_AIC does) so
        # the comparison below is between scalars, not a pandas object.
        score_update = np.mean(week_evalution_single(trial, model, types)[0])
        if score_update > score:
            score = score_update
            selected.append(name)
    return selected, score

def robust_feature_extract(data: pd.DataFrame, model: object, types: str):
    """Run ``feature_extract`` ten times and return the best feature list.

    Every trial's feature list and score are printed; only the feature list
    of the highest-scoring trial is returned.
    """
    best_features = []
    best_score = -1
    for _ in range(10):
        trial_features, trial_score = feature_extract(data, model, types)
        if trial_score > best_score:
            best_features, best_score = trial_features, trial_score
        print(trial_features, trial_score)
    return best_features
1
2
3
4
5
6
# 定义AIC,BIC评价指标

def AIC(L: int, delta: float, n_features: int):
    """Akaike-style criterion: ``L * log10(delta) + 2 * (n_features + 1)``.

    Args:
        L: Number of training samples.
        delta: Training error (callers in this file pass the training MSE).
        n_features: Number of features used by the model.
    """
    return L * np.log10(delta) + 2 * (n_features + 1)


def BIC(L: int, delta: float, n_features: int):
    """Bayesian-style criterion: ``L * log10(delta) + (n_features + 1) * log10(L)``.

    This definition was previously also named ``AIC`` and silently replaced
    the function above, so callers of ``AIC`` were getting this formula.
    """
    return L * np.log10(delta) + (n_features + 1) * np.log10(L)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# 使用AIC指标融合模型

def feature_extract_AIC(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Pick a random feature subset and return it with its ``AIC`` value.

    Features are visited in shuffled order and kept whenever the mean
    sliding-window score drops below the lowest seen so far. The model is
    then fitted on the offline training split with the kept features, and
    the training MSE is passed to ``AIC``.

    Returns:
        ``(selected_features, aic_value)``.
    """
    target = 'total_' + types + '_amt'
    candidates = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(candidates)

    chosen = []
    lowest = 1e9
    for name in candidates:
        columns = chosen + [name] + labels + ['date']
        trial = np.mean(week_evalution_single(data[columns], model, types)[0])
        # NOTE(review): a feature is kept when the score goes DOWN, although
        # feature_extract treats a higher score as better - confirm intent.
        if trial < lowest:
            lowest = trial
            chosen.append(name)

    trainset, _ = split_data_underline(data)
    model.fit(X=trainset[chosen], y=trainset[target])
    fitted = model.predict(trainset[chosen])
    delta = mean_squared_error(fitted, trainset[target])
    return chosen, AIC(len(trainset), delta, len(chosen))

def multi_model(data: pd.DataFrame, model: object, types: str)->Tuple[List[List[str]], List[float]]:
    """Draw 100 random feature subsets and weight them by their AIC value.

    Each subset comes from ``feature_extract_AIC``. The AIC values are centred
    on their mean, turned into ``10 ** (-delta / 2)`` (base 10 because the
    criterion is built on ``log10``), and normalised to sum to 1, so a lower
    AIC gives a larger weight.

    Returns:
        ``(feature_subsets, weights)``, two lists of length 100.
    """
    features = []
    scores = []
    for _ in range(100):
        selected, aic_value = feature_extract_AIC(data, model, types)
        features.append(selected)
        scores.append(aic_value)

    centred = np.asarray(scores, dtype=float) - np.mean(scores)
    # The original called np.power(-x / 2, 10): base and exponent were swapped,
    # and the even power gave equally large weights to the worst subsets.
    raw = np.power(10.0, -centred / 2)
    weights = (raw / np.sum(raw)).tolist()
    return features, weights
1
2
3
4
5
6
7
# 生成线上结果

def generate_online_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit on the online training split and predict the online test split.

    Args:
        df: Full feature frame; split with ``split_data_online``.
        feature: Names of the feature columns to use.
        model: Estimator with ``fit``/``predict``. Defaults to a fresh
            ``LinearRegression`` per call, so no fitted estimator is shared
            between calls.
        target: Name of the label column to fit.

    Returns:
        The model's predictions for the online test split.
    """
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_online(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
1
2
3
4
5
def generate_under_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit on the offline training split and predict the offline test split.

    Args:
        df: Full feature frame; split with ``split_data_underline``.
        feature: Names of the feature columns to use.
        model: Estimator with ``fit``/``predict``. Defaults to a fresh
            ``LinearRegression`` per call, so no fitted estimator is shared
            between calls.
        target: Name of the label column to fit.

    Returns:
        The model's predictions for the offline test split.
    """
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_underline(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
1
2
3
4
5
6
7
8
9
# 生成线上提交的格式

def normalize_upload_file(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->pd.DataFrame:
    """Build the submission frame: ``date``, purchase and redeem predictions.

    Args:
        result_purchase_lr: Predicted purchase amounts, one per ``testset`` row.
        result_redeem_lr: Predicted redeem amounts, one per ``testset`` row.
        testset: Frame providing the ``date`` column; it is not modified.

    Returns:
        New frame with columns ``date`` (string with the ``-`` separators
        removed), ``total_purchase_amt`` and ``total_redeem_amt``.
    """
    # Work on a private copy: the original overwrote the caller's target
    # columns and assigned into a slice (chained assignment).
    online_upload = testset[['date']].copy()
    online_upload['total_purchase_amt'] = result_purchase_lr
    online_upload['total_redeem_amt'] = result_redeem_lr
    online_upload['date'] = online_upload['date'].astype(str).str.replace('-', '', regex=False)
    return online_upload
1
2
3
4
5
6
7
8
9
10
11
# 线上结果可视化

def draw_result(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame):
    """Plot predicted purchase and redeem amounts against the day of month.

    ``testset`` only supplies the x axis through its datetime64 ``date``
    column. The figure is created and populated but ``plt.show()`` is not
    called here.
    """
    fig = plt.figure(figsize=(10,4))
    series_to_plot = (
        (result_purchase_lr, 'online_purchase'),
        (result_redeem_lr, 'online_redeem'),
    )
    for values, legend_name in series_to_plot:
        plt.plot(testset['date'].dt.day, values, label=legend_name)

    plt.legend(loc='best')
    plt.title("The predict values")
    plt.xlabel("Time")
    plt.ylabel("Amount")
1
2
3
4
5
6
7
8
9
# 重载DataFrame加法

def add_two_df(df1, df2, features = None, left_a = 0.45, right_a = 0.55):
    """Return a copy of ``df1`` whose columns are weighted sums with ``df2``.

    For every column in ``features`` the result holds
    ``df1[col] * left_a + df2[col] * right_a``. When ``features`` is empty or
    ``None``, every column of ``df1`` except ``'interval'`` is combined.
    Neither input frame is modified.
    """
    combined = df1.copy()
    columns = features if features else [c for c in combined.columns if c != 'interval']
    for column in columns:
        weighted_left = combined[column] * left_a
        weighted_right = df2[column] * right_a
        combined[column] = weighted_left + weighted_right
    return combined
1
2
3
4
5
6
7
8
9
# 重载DataFrame乘法

def scale_df(df1, features = None, eta = 1):
    """Return a copy of ``df1`` with the chosen columns multiplied by ``eta``.

    When ``features`` is empty or ``None``, every column except ``'interval'``
    is scaled. The input frame is not modified.
    """
    scaled = df1.copy()
    columns = features if features else [c for c in scaled.columns if c != 'interval']
    for column in columns:
        scaled[column] = scaled[column] * eta
    return scaled

建模测试

一、仅使用IS特征

1
2
data = pd.read_csv('Dataset/feature0522.csv')
data['date'] = pd.to_datetime(data['date'])
1
2
3
trainset, testset = split_data_underline(data)
result_purchase_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_purchase_amt')
result_redeem_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_redeem_amt')

在八月份预测结果

1
2
total_AE(result_purchase_lr, result_redeem_lr, testset['total_purchase_amt'], testset['total_redeem_amt'])
# 189.31445991054966

滑窗测试结果

1
draw_eva_table(week_evalution_single(data, model=LinearRegression(), types = 'purchase'))
0 interval
0 197.277321 2014-08-01
1 167.809363 2014-07-25
2 162.569572 2014-07-18
3 159.214733 2014-07-11
4 144.062633 2014-07-04
5 142.332339 2014-06-27
6 126.240393 2014-06-20
1
draw_eva_table(week_evalution_single(data, LinearRegression(), 'redeem'))
0 interval
0 182.799392 2014-08-01
1 172.093440 2014-07-25
2 181.210211 2014-07-18
3 168.488252 2014-07-11
4 175.160622 2014-07-04
5 174.465494 2014-06-27
6 175.201245 2014-06-20
### 八月份预测图与真实图
1
visual(result_purchase_lr, result_redeem_lr, testset)
![png](../Pic/%E8%B5%84%E9%87%91%E6%B5%81%E5%85%A5%E6%B5%81%E5%87%BA%E9%A2%84%E6%B5%8B%E6%AF%94%E8%B5%9B%EF%BC%88%E4%BA%94%EF%BC%89/output_26_0.png) ![png](../Pic/%E8%B5%84%E9%87%91%E6%B5%81%E5%85%A5%E6%B5%81%E5%87%BA%E9%A2%84%E6%B5%8B%E6%AF%94%E8%B5%9B%EF%BC%88%E4%BA%94%EF%BC%89/output_26_1.png) ![png](../Pic/%E8%B5%84%E9%87%91%E6%B5%81%E5%85%A5%E6%B5%81%E5%87%BA%E9%A2%84%E6%B5%8B%E6%AF%94%E8%B5%9B%EF%BC%88%E4%BA%94%EF%BC%89/output_26_2.png) ![png](../Pic/%E8%B5%84%E9%87%91%E6%B5%81%E5%85%A5%E6%B5%81%E5%87%BA%E9%A2%84%E6%B5%8B%E6%AF%94%E8%B5%9B%EF%BC%88%E4%BA%94%EF%BC%89/output_26_3.png)
1
2
result_purchase_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')
### 九月份预测效果图(线性)
1
2
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
![png](../Pic/%E8%B5%84%E9%87%91%E6%B5%81%E5%85%A5%E6%B5%81%E5%87%BA%E9%A2%84%E6%B5%8B%E6%AF%94%E8%B5%9B%EF%BC%88%E4%BA%94%EF%BC%89/output_29_0.png)
1
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190612_only_is.csv',index=False,header=None)
## 二、多模型对比
1
2
3
4
5
6
7
8
9
10
def multi_model_eva(data, types:str = 'purchase'):
    """Compare six regressors on the sliding-window evaluation.

    Each model is scored with ``week_evalution_single``; its score column is
    renamed to the model's class name (the part of ``repr(model)`` before the
    first ``(``) and the per-model tables are merged on ``interval``.

    Returns:
        Frame with ``interval`` first, followed by one score column per model.
    """
    candidates = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        MLPRegressor(solver='lbfgs'),
        xgb.XGBRegressor(objective='reg:squarederror'),
    ]
    results = pd.DataFrame()
    for model in candidates:
        column_name = repr(model).split('(')[0]
        table = draw_eva_table(week_evalution_single(data, model, types)).rename(columns={0: column_name})
        results = table if results.empty else pd.merge(results, table, on='interval')
    ordered = ['interval'] + [x for x in results.columns if x != 'interval']
    return results[ordered]
1
add_two_df(multi_model_eva(data, 'purchase'), multi_model_eva(data, 'redeem'))
interval LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor
0 2014-08-01 189.314460 188.743896 187.715746 188.808471 188.744889 189.283918
1 2014-07-25 170.165605 171.123954 172.599158 171.155189 169.383859 171.483486
2 2014-07-18 172.821924 175.689847 176.236757 175.700583 174.238004 175.781053
3 2014-07-11 164.315168 167.489060 168.552495 167.497959 164.755084 167.463230
4 2014-07-04 161.166527 164.436476 163.818487 164.457772 162.257028 164.322969
5 2014-06-27 160.005574 163.849417 162.010456 163.820346 160.703909 163.713086
6 2014-06-20 153.168861 156.563393 157.063395 156.511847 156.539468 156.445360

三、劣汰后特征对比

1
2
data_purchase = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
1
2
data_redeem = pd.read_csv('Feature/redeem_feature_droped_0614.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
1
2
3
4
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_lr = generate_under_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_purchase_amt')
1
2
3
4
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_lr = generate_under_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_redeem_amt')
1
2
total_AE(result_purchase_lr, result_redeem_lr, testset_purchase['total_purchase_amt'], testset_redeem['total_redeem_amt'])
# 189.90232809854422
1
add_two_df(multi_model_eva(data_purchase, 'purchase'), multi_model_eva(data_redeem, 'redeem'))
interval LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor
0 2014-08-01 189.902328 167.901814 170.798802 177.216664 186.847975 178.177163
1 2014-07-25 177.650078 167.752839 177.877918 169.722728 180.481298 171.346027
2 2014-07-18 182.113181 184.259270 176.164814 175.773043 180.043535 177.470257
3 2014-07-11 182.092305 178.165773 184.042765 178.501400 179.125274 181.784144
4 2014-07-04 181.210428 172.354412 164.208141 170.061840 176.928324 165.812636
5 2014-06-27 185.309340 178.630136 182.713216 189.882640 177.631714 186.246480
6 2014-06-20 169.342125 168.941842 173.833505 173.421845 169.514554 172.578523

八月份预测效果(线性)

1
2
trainset, testset = split_data_underline(data)
visual(result_purchase_lr, result_redeem_lr, testset)

png

png

png

output_43_3

1
2
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')

生成线上效果(线性)

可以看到28号很高(work in Sunday)

1
2
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

png

purchase feature

‘dis_to_nowork’, ‘dis_to_work’, ‘dis_from_work’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘weekday_onehot_6’,
‘dis_from_nowork’, ‘is_holiday’, ‘weekday_onehot_1’, ‘weekday_onehot_2’,
‘weekday_onehot_0’, ‘dis_from_middleofweek’, ‘dis_from_holiendday’,
‘weekday_onehot_3’, ‘is_lastday_of_holiday’, ‘is_firstday_of_holiday’,
‘weekday_onehot_4’, ‘is_worked_yestday’, ‘is_second_week’,
‘is_third_week’, ‘dis_from_startofmonth’, ‘dis_from_holiday’,
‘dis_to_nowork%%%%dis_from_purchase_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’

Redeem feature

‘is_work’, ‘dis_from_redeem_valley’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘is_gonna_work_tomorrow’,
‘is_holiday’, ‘dis_from_nowork’, ‘weekday_onehot_0’, ‘weekday_onehot_1’,
‘is_firstday_of_holiday’, ‘weekday_onehot_2’, ‘is_lastday_of_holiday’,
‘dis_from_holiday’, ‘is_work_on_sunday’, ‘is_firstday_of_work’,
‘is_secday_of_month’, ‘dis_from_holiendday’,
‘dis_from_redeem_valley%%%%dis_from_redeem_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’

1
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped.csv',index=False,header=None)

生成线上效果(MLP)

1
2
3
4
5
6
7
8
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
if x not in ['total_purchase_amt','total_redeem_amt','date']],
MLPRegressor(solver='lbfgs'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
MLPRegressor(solver='lbfgs'),'total_redeem_amt')
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

png

1
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped_MLP.csv',index=False,header=None)

生成线上效果(Xgboost)

1
2
3
4
5
6
7
8
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
if x not in ['total_purchase_amt','total_redeem_amt','date']],
xgb.XGBRegressor(objective='reg:squarederror'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
xgb.XGBRegressor(objective='reg:squarederror'),'total_redeem_amt')
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

png

1
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190615_droped_XGB.csv',index=False,header=None)

四、AIC模型平均

1
purchase_features, purchase_weight = multi_model(data_purchase, model=LinearRegression(), types = 'purchase')
1
redeem_features, redeem_weight = multi_model(data_redeem, model=LinearRegression(), types = 'redeem')
1
2
3
4
5
6
7
8
9
10
11
def eva_for_aic(data_purchase, purchase_features, purchase_weight, types = 'purchase'):
    """Weighted average of ``multi_model_eva`` tables over feature subsets.

    Args:
        data_purchase: Feature frame for the target being evaluated (despite
            the name it is also used for redeem data).
        purchase_features: List of feature-name lists, one per subset.
        purchase_weight: Weight of each subset, same order as the subsets.
        types: ``'purchase'`` or ``'redeem'``; target passed to
            ``multi_model_eva``. The original hard-coded ``'purchase'``, so
            redeem subsets were scored against the purchase target.

    Returns:
        Frame shaped like a ``multi_model_eva`` result whose score columns are
        the weight-scaled sums over all subsets (empty frame if no subsets).
    """
    results = pd.DataFrame()
    for feature, weight in zip(purchase_features, purchase_weight):
        table = multi_model_eva(data_purchase[['date'] + labels + feature], types)
        if results.empty:
            results = scale_df(table, eta = weight)
        else:
            results = add_two_df(results, table, left_a = 1, right_a = weight)
    return results


add_two_df(eva_for_aic(data_purchase, purchase_features, purchase_weight, types = 'purchase'),
           eva_for_aic(data_redeem, redeem_features, redeem_weight, types = 'redeem'))

interval LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor
0 2014-08-01 196.963095 183.792030 186.524287 190.406781 203.741264 191.603561
1 2014-07-25 166.890259 162.282961 165.155036 167.600603 170.796653 168.847170
2 2014-07-18 164.275651 164.099009 165.696601 165.439729 166.863106 166.027208
3 2014-07-11 165.406218 173.410333 175.492379 174.605650 171.207335 172.707320
4 2014-07-04 159.754349 159.385969 157.523226 159.920750 165.437567 159.528064
5 2014-06-27 170.536885 170.257129 172.330545 172.594357 173.168680 172.330513
6 2014-06-20 171.443469 162.639913 164.623205 164.944005 171.675562 166.762065

五、针对残差建模

1
2
data_purchase = pd.read_csv('Feature/residual_feature_purchase_0621.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
1
2
data_redeem = pd.read_csv('Feature/residual_feature_redeem_0621.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
1
base = pd.read_csv('Dataset/base.csv')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
def generate_residual_result(data, base, model=None, types = 'purchase', split_time = datetime.date(2014,8,1)):
    """Predict ``rate * cycle-factor baseline`` for the month after ``split_time``.

    The model is fitted on rows of ``data`` dated from 2014-04-01 up to
    (excluding) ``split_time`` and predicts the following month. Those
    predictions are multiplied element-wise by the
    ``total_<types>_predicted_by_cycle`` column of ``base`` for that month.

    Args:
        data: Feature frame with ``date`` and ``total_<types>_amt`` columns.
        base: Frame with ``date`` (``%Y%m%d`` values or datetime64) and the
            cycle-factor prediction columns. Its ``date`` column is converted
            to datetime64 in place; other code in this file relies on that.
        model: Estimator with ``fit``/``predict``; a fresh
            ``LinearRegression`` is created when omitted.
        types: ``'purchase'`` or ``'redeem'``.
        split_time: First day of the prediction month.

    Returns:
        numpy array of adjusted predictions for the prediction month.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    # Timestamps on both sides: current pandas raises TypeError when a
    # datetime64 column is ordered against a plain ``datetime.date``.
    start = pd.Timestamp(2014, 4, 1)
    split = pd.Timestamp(split_time)
    end = pd.Timestamp(split_time + a_month)

    dates = pd.to_datetime(data['date'])
    trainset = data[(dates >= start) & (dates < split)]
    testset = data[(dates >= split) & (dates < end)]
    # Use the columns of the frame actually passed in; the original read them
    # from the global ``data_purchase`` regardless of ``data``.
    feature = [x for x in data.columns
               if x not in ['total_purchase_amt','total_redeem_amt','date']]
    model.fit(X=trainset[feature], y=trainset['total_' + types + '_amt'])
    result_rate = model.predict(testset[feature])

    base['date'] = pd.to_datetime(base['date'], format= "%Y%m%d")
    in_month = (base['date'] >= split) & (base['date'] < end)
    result_cycle = np.array(base[in_month]['total_' + types + '_predicted_by_cycle'])
    return result_rate * result_cycle


def generate_evaluate_for_residual(model=None):
    """Score the residual-corrected predictions on the seven sliding windows.

    Reads the module-level frames ``data_purchase``, ``data_redeem``, ``base``
    and ``data``. For every split day both targets are predicted with
    ``generate_residual_result`` and scored with ``total_AE`` against the real
    amounts in ``data``.

    Returns:
        Single-column frame (column ``0``) with one score per split day.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    dates = pd.to_datetime(data['date'])
    split_days = [
        datetime.date(2014, 8, 1), datetime.date(2014, 7, 25),
        datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
        datetime.date(2014, 7, 4), datetime.date(2014, 6, 27),
        datetime.date(2014, 6, 20),
    ]
    result = []
    for i in split_days:
        result_purchase_residual = generate_residual_result(data_purchase, base, model=model, types='purchase', split_time = i)
        # The redeem model is fitted on the redeem feature frame; the original
        # passed ``data_purchase`` for both targets.
        result_redeem_residual = generate_residual_result(data_redeem, base, model=model, types='redeem', split_time = i)
        in_month = (dates >= pd.Timestamp(i)) & (dates < pd.Timestamp(i + a_month))
        testset = data[in_month]
        result.append(total_AE(result_purchase_residual, result_redeem_residual,
                               testset['total_purchase_amt'], testset['total_redeem_amt']))
    return pd.DataFrame(result)
1
2
3
4
5
6
7
8
9
10
def multi_model_eva_for_residual():
    """Compare six regressors on the residual-corrected evaluation.

    Each model is scored with ``generate_evaluate_for_residual``; its score
    column is renamed to the model's class name (the part of ``repr(model)``
    before the first ``(``) and the tables are merged on their shared columns.

    Returns:
        Frame with ``interval`` first, followed by one score column per model.
    """
    candidates = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        MLPRegressor(solver='lbfgs'),
        xgb.XGBRegressor(objective='reg:squarederror'),
    ]
    results = pd.DataFrame()
    for model in candidates:
        column_name = repr(model).split('(')[0]
        table = draw_eva_table(generate_evaluate_for_residual(model)).rename(columns={0: column_name})
        results = table if results.empty else pd.merge(results, table)
    ordered = ['interval'] + [x for x in results.columns if x != 'interval']
    return results[ordered]
1
2
3
4
5
6
7
8
9
10
11
12
13
def generate_evaluate_for_cycle():
    """Score the pure cycle-factor baseline on the seven sliding windows.

    Reads the module-level frames ``base`` (cycle-factor predictions) and
    ``data`` (real amounts). For each split day the month starting at it is
    scored with ``total_AE``.

    Returns:
        Single-column frame named ``PureTimeSeries`` with one score per split
        day.
    """
    a_month = relativedelta(months=1)
    # Parse dates locally so this works whether or not ``base['date']`` has
    # already been converted elsewhere, and compare against Timestamps:
    # current pandas raises TypeError for datetime64 vs ``datetime.date``.
    base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
    data_dates = pd.to_datetime(data['date'])
    split_days = [
        datetime.date(2014, 8, 1), datetime.date(2014, 7, 25),
        datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
        datetime.date(2014, 7, 4), datetime.date(2014, 6, 27),
        datetime.date(2014, 6, 20),
    ]
    result = []
    for i in split_days:
        start, end = pd.Timestamp(i), pd.Timestamp(i + a_month)
        # reset_index so predictions and real values align by position.
        predicted = base[(base_dates >= start) & (base_dates < end)].reset_index(drop=True)
        actual = data[(data_dates >= start) & (data_dates < end)].reset_index(drop=True)
        result.append(total_AE(predicted['total_purchase_predicted_by_cycle'],
                               predicted['total_redeem_predicted_by_cycle'],
                               actual['total_purchase_amt'],
                               actual['total_redeem_amt']))
    return pd.DataFrame(result).rename(columns={0: 'PureTimeSeries'})
1
pd.merge(multi_model_eva_for_residual(), draw_eva_table(generate_evaluate_for_cycle()))

interval LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor PureTimeSeries
0 2014-08-01 180.766534 155.580904 167.712675 171.177173 171.180097 175.335191 175.933714
1 2014-07-25 161.447048 159.351623 161.705225 162.192265 158.220680 163.794727 155.916275
2 2014-07-18 172.796145 153.972984 170.188452 169.297901 156.933699 173.068241 169.890622
3 2014-07-11 165.437181 154.819818 166.759593 161.607800 158.148139 165.108007 165.668307
4 2014-07-04 150.715908 140.131046 148.064148 147.667609 148.554344 151.872781 155.458113
5 2014-06-27 161.207466 157.575158 161.413391 160.319942 145.957128 160.949321 160.758547
6 2014-06-20 146.011833 147.507212 149.043333 154.453050 142.023337 155.435619 154.336379

(1) 只使用周期因子在8月份的预测效果

1
_, testset = split_data_underline(data)
1
2
real_purchase = testset['total_purchase_amt']
real_redeem = testset['total_redeem_amt']
1
2
3
4
# Cycle-factor baseline predictions for August 2014.
# Dates are parsed here and compared against Timestamps: current pandas raises
# TypeError when a datetime64 column is ordered against ``datetime.date``.
_base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
_in_august = (_base_dates >= pd.Timestamp(2014, 8, 1)) & (_base_dates < pd.Timestamp(2014, 9, 1))
result_purchase_cycle = np.array(base[_in_august]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[_in_august]['total_redeem_predicted_by_cycle'])
total_AE(result_purchase_cycle, result_redeem_cycle, real_purchase, real_redeem)
# 175.93371418259747
1
2
trainset, testset = split_data_underline(data)
visual(result_purchase_cycle, result_redeem_cycle, testset)

png

png

png

png

(2) 只使用周期因子+预测残差在8月份的预测效果(比单纯用因子好)

1
2
3
4
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_rate = generate_under_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_purchase_amt')
1
2
3
4
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_rate = generate_under_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_redeem_amt')
1
2
total_AE(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, real_purchase, real_redeem)
# 182.628220303351
1
2
trainset, testset = split_data_underline(data)
visual(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, testset)

png

png

png

png

(3) 生成线上结果

1
2
3
4
trainset_purchase, testset_purchase = split_data_online(data_purchase)
result_purchase_rate = generate_online_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_purchase_amt')
1
2
3
4
trainset_redeem, testset_redeem = split_data_online(data_redeem)
result_redeem_rate = generate_online_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_redeem_amt')
1
2
3
# 修正一下预测结果试试
result_purchase_rate = result_purchase_rate / np.mean(result_purchase_rate)
result_redeem_rate = result_redeem_rate / np.mean(result_redeem_rate)
1
2
# Cycle-factor baseline predictions for September 2014 (the online month).
# Dates are parsed here and compared against Timestamps: current pandas raises
# TypeError when a datetime64 column is ordered against ``datetime.date``.
_base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
_in_september = (_base_dates >= pd.Timestamp(2014, 9, 1)) & (_base_dates < pd.Timestamp(2014, 10, 1))
result_purchase_cycle = np.array(base[_in_september]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[_in_september]['total_redeem_predicted_by_cycle'])
1
result_purchase_residual = result_purchase_rate * result_purchase_cycle
1
result_redeem_residual = result_redeem_rate * result_redeem_cycle

月份周期因子线上结果(135)

1
draw_result(result_purchase_cycle, result_redeem_cycle, testset_redeem)

png

残差处理后结果

1
draw_result(result_purchase_residual, result_redeem_residual, testset_redeem)

png

1
normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).to_csv('20190622_residual_liner.csv',index=False,header=None)

仅用于修正节假日的结果

1
result_score135 = pd.read_csv('Result/timeseries0606.csv',header=None)
1
result_residual = normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).reset_index(drop=True)
1
result_residual['date'] = result_residual['date'].astype(int)
1
2
3
4
5
6
days_need_to_change = [
20140906,
20140907,
20140908,
20140928
]
1
2
3
4
# Replace the purchase (column 1) and redeem (column 2) values of the listed
# days with the residual-corrected predictions. Rows of the two frames are
# matched by index label.
for index in result_score135.index[result_score135[0].isin(days_need_to_change)]:
    result_score135.loc[index, 1] = result_residual.loc[index, 'total_purchase_amt']
    result_score135.loc[index, 2] = result_residual.loc[index, 'total_redeem_amt']
1
result_score135.to_csv('result135_fixed_by_residual_0621.csv',index=False,header=None)
  • 版权声明: 本博客所有文章除特别声明外,著作权归作者所有。转载请注明出处!

扫一扫,分享到微信

微信分享二维码
  • Copyrights © 2020 chenk
  • 由 帅气的CK本尊 强力驱动
  • 访问人数: | 浏览次数:

请我喝杯咖啡吧~

支付宝
微信