轉自:coggle數據科學
近期一些朋友詢問我關於如何做特征工程的問題,有沒有什麽適合初學者的有效操作。
特征工程的問題往往需要具體問題具體分析,當然也有一些暴力的策略,可以在競賽初賽前期帶來較大提升,而很多競賽往往依賴這些資訊就可以拿到非常好的效果,剩余的則需要結合業務邏輯以及很多其他的技巧,此處我們將平時用得最多的聚合操作羅列在下方。
最近剛好看到一篇文章匯總了非常多的聚合函式,就摘錄在下方,供許多初入競賽的朋友參考。
聚合特征匯總
pandas內建的聚合函式
其它重要聚合函式
其它重要聚合函式&分類分別如下。
def median(x):
    """Median of the sample."""
    return np.median(x)


def variation_coefficient(x):
    """Coefficient of variation (population std / mean); NaN when the mean is zero."""
    mean = np.mean(x)
    if mean != 0:
        return np.std(x) / mean
    else:
        return np.nan


def variance(x):
    """Population variance (ddof=0)."""
    return np.var(x)
def skewness(x):
    """Bias-corrected sample skewness (pandas definition)."""
    if not isinstance(x, pd.Series):
        x = pd.Series(x)
    return pd.Series.skew(x)


def kurtosis(x):
    """Bias-corrected sample excess kurtosis (pandas/Fisher definition)."""
    if not isinstance(x, pd.Series):
        x = pd.Series(x)
    return pd.Series.kurtosis(x)


def standard_deviation(x):
    """Population standard deviation (ddof=0)."""
    return np.std(x)
def large_standard_deviation(x):
    """Std normalised by the sample range (max - min); NaN for a constant series."""
    if (np.max(x) - np.min(x)) == 0:
        return np.nan
    else:
        return np.std(x) / (np.max(x) - np.min(x))


# NOTE(review): exact duplicate of variation_coefficient defined earlier in
# this file; kept for fidelity with the source article.
def variation_coefficient(x):
    """Coefficient of variation (population std / mean); NaN when the mean is zero."""
    mean = np.mean(x)
    if mean != 0:
        return np.std(x) / mean
    else:
        return np.nan


def variance_std_ratio(x):
    """Variance / std, i.e. sqrt(variance); NaN for zero variance."""
    y = np.var(x)
    if y != 0:
        return y / np.sqrt(y)
    else:
        return np.nan
def ratio_beyond_r_sigma(x, r):
    """Fraction of values farther than r * std from the mean; NaN for empty input.

    The input is coerced to an ndarray so plain lists work too (the original
    relied on an `x.size` attribute).
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.nan
    else:
        return np.sum(np.abs(x - np.mean(x)) > r * np.std(x)) / x.size


def range_ratio(x):
    """|mean - median| normalised by the sample range; NaN for a constant series."""
    mean_median_difference = np.abs(np.mean(x) - np.median(x))
    max_min_difference = np.max(x) - np.min(x)
    if max_min_difference == 0:
        return np.nan
    else:
        return mean_median_difference / max_min_difference
def has_duplicate_max(x):
    """True when the maximum value occurs at least twice."""
    return np.sum(x == np.max(x)) >= 2


def has_duplicate_min(x):
    """True when the minimum value occurs at least twice."""
    return np.sum(x == np.min(x)) >= 2


def has_duplicate(x):
    """True when any value occurs more than once (expects an ndarray/Series)."""
    return x.size != np.unique(x).size


def count_duplicate_max(x):
    """Number of occurrences of the maximum value."""
    return np.sum(x == np.max(x))


def count_duplicate_min(x):
    """Number of occurrences of the minimum value."""
    return np.sum(x == np.min(x))


def count_duplicate(x):
    """Number of surplus (duplicated) data points (expects an ndarray/Series)."""
    return x.size - np.unique(x).size


def sum_values(x):
    """Sum of all values; 0 for an empty sample."""
    if len(x) == 0:
        return 0
    return np.sum(x)
def log_return(list_stock_prices):
    """First difference of log prices; assumes a pandas Series (uses .diff())."""
    return np.log(list_stock_prices).diff()


def realized_volatility(series):
    """Square root of the sum of squares (realized volatility of returns)."""
    return np.sqrt(np.sum(series ** 2))


def realized_abs_skew(series):
    """Cube root of the absolute sum of cubes."""
    return np.power(np.abs(np.sum(series ** 3)), 1 / 3)


def realized_skew(series):
    """Signed cube root of the sum of cubes."""
    return np.sign(np.sum(series ** 3)) * np.power(np.abs(np.sum(series ** 3)), 1 / 3)


def realized_vol_skew(series):
    """Sixth root of the absolute sum of sixth powers."""
    return np.power(np.abs(np.sum(series ** 6)), 1 / 6)


def realized_quarticity(series):
    """Fourth root of the sum of fourth powers."""
    return np.power(np.sum(series ** 4), 1 / 4)


def count_unique(series):
    """Number of distinct values."""
    return len(np.unique(series))


def count(series):
    """Number of elements (expects an ndarray/Series)."""
    return series.size
# Drawdown/drawup helpers ("drawdowns functions are mine" in the source article).

def maximum_drawdown(series):
    """Largest peak-to-trough drop of the series.

    Returns 0 for fewer than two points, and NaN when the trough falls on the
    first element (no prior peak to measure from).
    """
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    # Index of the deepest point below the running maximum (the original
    # computed this argmax twice; hoisted here, behavior unchanged).
    i = np.argmax(np.maximum.accumulate(series) - series)
    k = series[i]
    if len(series[:i]) < 1:
        return np.nan
    j = np.max(series[:i])  # highest value seen before the trough
    return j - k


def maximum_drawup(series):
    """Largest trough-to-peak rise, computed as the drawdown of the negated series."""
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    series = -series
    i = np.argmax(np.maximum.accumulate(series) - series)
    k = series[i]
    if len(series[:i]) < 1:
        return np.nan
    j = np.max(series[:i])
    return j - k


def drawdown_duration(series):
    """Number of samples from the pre-trough peak to the trough; 0 if trivial."""
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    k = np.argmax(np.maximum.accumulate(series) - series)
    if len(series[:k]) == 0:
        j = k  # trough is the first sample: duration 0
    else:
        j = np.argmax(series[:k])
    return k - j


def drawup_duration(series):
    """Duration of the maximum drawup (drawdown duration of the negated series)."""
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    series = -series
    k = np.argmax(np.maximum.accumulate(series) - series)
    if len(series[:k]) == 0:
        j = k
    else:
        j = np.argmax(series[:k])
    return k - j
def max_over_min(series):
    """Ratio max/min; 0 for fewer than two points, NaN when the minimum is 0."""
    if len(series) < 2:
        return 0
    if np.min(series) == 0:
        return np.nan
    return np.max(series) / np.min(series)


def mean_n_absolute_max(x, number_of_maxima=1):
    """Arithmetic mean of the n largest absolute values of the time series.

    Returns NaN unless len(x) is strictly greater than number_of_maxima
    (tsfresh-style behavior, kept as in the source).
    """
    # Message fixed: the condition requires number_of_maxima > 0, the original
    # text said "not greater than 1".
    assert (
        number_of_maxima > 0
    ), f" number_of_maxima={number_of_maxima} which is not greater than 0"
    n_absolute_maximum_values = np.sort(np.absolute(x))[-number_of_maxima:]
    return np.mean(n_absolute_maximum_values) if len(x) > number_of_maxima else np.nan
def count_above(x, t):
    """Fraction of values >= t; NaN for empty input."""
    if len(x) == 0:
        return np.nan
    else:
        return np.sum(x >= t) / len(x)


def count_below(x, t):
    """Fraction of values <= t; NaN for empty input."""
    if len(x) == 0:
        return np.nan
    else:
        return np.sum(x <= t) / len(x)
# number of valleys = number_peaks(-x, n)
def number_peaks(x, n):
    """
    Number of peaks of at least support n in the time series x.  A peak of
    support n is a value strictly greater than its n neighbours on both sides.

    The source used an undefined helper `_roll`; it is replaced here with the
    equivalent np.roll (circular shift), which the tsfresh helper mirrors.
    """
    x = np.asarray(x)
    x_reduced = x[n:-n]
    res = None
    for i in range(1, n + 1):
        result_first = x_reduced > np.roll(x, i)[n:-n]
        if res is None:
            res = result_first
        else:
            res &= result_first
        res &= x_reduced > np.roll(x, -i)[n:-n]
    return np.sum(res)
def mean_abs_change(x):
    """Mean of absolute first differences."""
    return np.mean(np.abs(np.diff(x)))


def mean_change(x):
    """Mean first difference, i.e. (last - first) / (n - 1); NaN if n < 2."""
    x = np.asarray(x)
    return (x[-1] - x[0]) / (len(x) - 1) if len(x) > 1 else np.nan


def mean_second_derivative_central(x):
    """Mean central second derivative (telescoped form); NaN if n < 3."""
    x = np.asarray(x)
    return (x[-1] - x[-2] - x[1] + x[0]) / (2 * (len(x) - 2)) if len(x) > 2 else np.nan


def root_mean_square(x):
    """Root mean square of the values; NaN for empty input."""
    return np.sqrt(np.mean(np.square(x))) if len(x) > 0 else np.nan


def absolute_sum_of_changes(x):
    """Sum of absolute first differences."""
    return np.sum(np.abs(np.diff(x)))
def _longest_run(mask):
    """Length of the longest run of True values in a boolean mask (0 if none)."""
    best = run = 0
    for flag in mask:
        run = run + 1 if flag else 0
        if run > best:
            best = run
    return best


def longest_strike_below_mean(x):
    """Length of the longest consecutive run of values below the mean.

    The source called the undefined tsfresh helper
    `_get_length_sequences_where`; _longest_run reproduces its
    max-run-length result.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return _longest_run(x < np.mean(x)) if x.size > 0 else 0


def longest_strike_above_mean(x):
    """Length of the longest consecutive run of values above the mean."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return _longest_run(x > np.mean(x)) if x.size > 0 else 0


def count_above_mean(x):
    """Number of values strictly above the mean (expects ndarray/Series)."""
    m = np.mean(x)
    return np.where(x > m)[0].size


def count_below_mean(x):
    """Number of values strictly below the mean (expects ndarray/Series)."""
    m = np.mean(x)
    return np.where(x < m)[0].size
def last_location_of_maximum(x):
    """Relative position (in (0, 1]) of the last occurrence of the maximum; NaN if empty."""
    x = np.asarray(x)
    return 1.0 - np.argmax(x[::-1]) / len(x) if len(x) > 0 else np.nan


def first_location_of_maximum(x):
    """Relative position (in [0, 1)) of the first occurrence of the maximum; NaN if empty."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return np.argmax(x) / len(x) if len(x) > 0 else np.nan


def last_location_of_minimum(x):
    """Relative position (in (0, 1]) of the last occurrence of the minimum; NaN if empty."""
    x = np.asarray(x)
    return 1.0 - np.argmin(x[::-1]) / len(x) if len(x) > 0 else np.nan


def first_location_of_minimum(x):
    """Relative position (in [0, 1)) of the first occurrence of the minimum; NaN if empty."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return np.argmin(x) / len(x) if len(x) > 0 else np.nan
# TODO(review): test non-consecutive, non-reoccurring values?
def percentage_of_reoccurring_values_to_all_values(x):
    """Share of distinct values that occur more than once; NaN for empty input."""
    if len(x) == 0:
        return np.nan
    unique, counts = np.unique(x, return_counts=True)
    if counts.shape[0] == 0:
        return 0
    return np.sum(counts > 1) / float(counts.shape[0])


def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
    """Share of data points whose value occurs more than once; NaN for empty input."""
    if len(x) == 0:
        return np.nan
    if not isinstance(x, pd.Series):
        x = pd.Series(x)
    value_counts = x.value_counts()
    reoccuring_values = value_counts[value_counts > 1].sum()
    if np.isnan(reoccuring_values):
        return 0
    return reoccuring_values / x.size


def sum_of_reoccurring_values(x):
    """Sum of each distinct value that occurs more than once (counted once)."""
    unique, counts = np.unique(x, return_counts=True)
    # Local copies only: zero out singletons, then weight repeats by 1.
    counts[counts < 2] = 0
    counts[counts > 1] = 1
    return np.sum(counts * unique)


def sum_of_reoccurring_data_points(x):
    """Sum of all data points whose value occurs more than once."""
    unique, counts = np.unique(x, return_counts=True)
    counts[counts < 2] = 0
    return np.sum(counts * unique)
def ratio_value_number_to_time_series_length(x):
    """Distinct-value count divided by series length; NaN for empty input."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if x.size == 0:
        return np.nan
    return np.unique(x).size / x.size


def abs_energy(x):
    """Absolute energy: sum of squared values (dot product with itself)."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return np.dot(x, x)


def quantile(x, q):
    """q-th quantile of the sample; NaN for empty input."""
    if len(x) == 0:
        return np.nan
    return np.quantile(x, q)


# crossing the mean ? other levels ?
def number_crossing_m(x, m):
    """Number of times the series crosses the level m."""
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    # From https://stackoverflow.com/questions/3843017/efficiently-detect-sign-changes-in-python
    positive = x > m
    return np.where(np.diff(positive))[0].size
def absolute_maximum(x):
    """Largest absolute value; NaN for empty input."""
    return np.max(np.absolute(x)) if len(x) > 0 else np.nan


def value_count(x, value):
    """Number of occurrences of `value`; NaN-aware.

    `value` must be numeric: np.isnan raises TypeError on non-numeric input.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if np.isnan(value):
        return np.isnan(x).sum()
    else:
        return x[x == value].size


def range_count(x, min, max):
    """Number of values in the half-open interval [min, max).

    NOTE(review): parameter names shadow the builtins min/max -- kept to
    preserve the keyword-argument interface of the source.
    """
    return np.sum((x >= min) & (x < max))


def mean_diff(x):
    """Mean of first differences, ignoring NaNs; assumes a pandas Series (uses .values)."""
    return np.nanmean(np.diff(x.values))
# ---------------------------------------------------------------------------
# Parameter-bound variants referenced by the aggregation lists below.  These
# names were used but never defined in the article excerpt; the bound
# constants follow the referenced Kaggle notebook -- TODO(review): confirm
# against the original source.
# ---------------------------------------------------------------------------
def quantile_01(x): return quantile(x, 0.1)
def quantile_025(x): return quantile(x, 0.25)
def quantile_075(x): return quantile(x, 0.75)
def quantile_09(x): return quantile(x, 0.9)

def number_peaks_2(x): return number_peaks(x, 2)
def number_peaks_5(x): return number_peaks(x, 5)
def number_peaks_10(x): return number_peaks(x, 10)

def mean_n_absolute_max_2(x): return mean_n_absolute_max(x, 2)
def mean_n_absolute_max_5(x): return mean_n_absolute_max(x, 5)
def mean_n_absolute_max_10(x): return mean_n_absolute_max(x, 10)

def count_above_0(x): return count_above(x, 0)
def count_below_0(x): return count_below(x, 0)
def value_count_0(x): return value_count(x, 0)
def count_near_0(x): return range_count(x, -0.00001, 0.00001)
def number_crossing_0(x): return number_crossing_m(x, 0)

def ratio_beyond_01_sigma(x): return ratio_beyond_r_sigma(x, 0.1)
def ratio_beyond_02_sigma(x): return ratio_beyond_r_sigma(x, 0.2)
def ratio_beyond_03_sigma(x): return ratio_beyond_r_sigma(x, 0.3)

# Aggregation groups for e.g. pandas GroupBy.agg: strings are pandas built-in
# aggregations, bare names are the callables defined above.
base_stats = ['mean', 'sum', 'size', 'count', 'std', 'first', 'last', 'min', 'max', median, skewness, kurtosis]
higher_order_stats = [abs_energy, root_mean_square, sum_values, realized_volatility, realized_abs_skew, realized_skew, realized_vol_skew, realized_quarticity]
additional_quantiles = [quantile_01, quantile_025, quantile_075, quantile_09]
other_min_max = [absolute_maximum, max_over_min]
min_max_positions = [last_location_of_maximum, first_location_of_maximum, last_location_of_minimum, first_location_of_minimum]
peaks = [number_peaks_2, mean_n_absolute_max_2, number_peaks_5, mean_n_absolute_max_5, number_peaks_10, mean_n_absolute_max_10]
counts = [count_unique, count, count_above_0, count_below_0, value_count_0, count_near_0]
reoccuring_values = [count_above_mean, count_below_mean, percentage_of_reoccurring_values_to_all_values, percentage_of_reoccurring_datapoints_to_all_datapoints, sum_of_reoccurring_values, sum_of_reoccurring_data_points, ratio_value_number_to_time_series_length]
# NOTE(review): this list rebinds the name of the count_duplicate function
# defined above, shadowing it from here on -- kept as in the source.
count_duplicate = [count_duplicate, count_duplicate_min, count_duplicate_max]
variations = [mean_diff, mean_abs_change, mean_change, mean_second_derivative_central, absolute_sum_of_changes, number_crossing_0]
ranges = [variance_std_ratio, ratio_beyond_01_sigma, ratio_beyond_02_sigma, ratio_beyond_03_sigma, large_standard_deviation, range_ratio]
參考文獻:
https://www.kaggle.com/code/lucasmorin/amex-feature-engineering-2-aggreg-functions