I. Introduction¶
Importations¶
#General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Choice of the features
from ipywidgets import widgets
from pandas_profiling import ProfileReport
#Gap filling
import datetime as dt
#Standardization
from sklearn.preprocessing import StandardScaler
#Denoising
import pywt
#Kernel smoothing
from skfda.representation.grid import FDataGrid
import skfda
import skfda.preprocessing.smoothing.kernel_smoothers as ks
import skfda.preprocessing.smoothing.validation as val
#Detrending
import statsmodels.api
from statsmodels.tsa.seasonal import STL
II. Labelling¶
Column names dictionary¶
In order to access the columns easily, we define a dictionary which matches the French name with the technical id of each column.
This method makes the database easy to use for the users but allows future changes to the names of the columns.
# Maps each human-readable (French) column name to the technical point id of
# the corresponding sensor. Keys are kept identical to the column names that
# load_csv_file produces so the two mappings stay interchangeable
# (previously inconsistent: "étage2" missing a space, "1er stade vibration Y"
# instead of "1e stade vibration Y").
dico = {"Description" : "Point Name",
        "MOTOR CURRENT" : "PARD@3C52@3C52-M@JT7099.CAL",
        "Température palier étage 1" : "PARD@3C52@3C52-M@TE7011A.PNT",
        "Température palier étage 2" : "PARD@3C52@3C52-M@TE7021A.PNT",
        "Température palier étage 3" : "PARD@3C52@3C52-M@TE7031A.PNT",
        "Température palier étage 4" : "PARD@3C52@3C52-M@TE7041A.PNT",
        "Déplacement axiale 1/2" : "PARD@3C52@3C52-M@VT7001.PNT:RAW",
        "Déplacement axiale 3/4" : "PARD@3C52@3C52-M@VT7002.PNT:RAW",
        "1e stade vibration X" : "PARD@3C52@3C52-M@VT7011A.PNT:RAW",
        "1e stade vibration Y" : "PARD@3C52@3C52-M@VT7011B.PNT:RAW",
        "2e stade vibration X" : "PARD@3C52@3C52-M@VT7021A.PNT:RAW",
        "2e stade vibration Y" : "PARD@3C52@3C52-M@VT7021B.PNT:RAW",
        "3e stade vibration X" : "PARD@3C52@3C52-M@VT7031A.PNT:RAW",
        "3e stade vibration Y" : "PARD@3C52@3C52-M@VT7031B.PNT:RAW",
        "4e stade vibration X" : "PARD@3C52@3C52-M@VT7041A.PNT:RAW",
        "4e stade vibration Y" : "PARD@3C52@3C52-M@VT7041B.PNT:RAW",
        "Température huile sortie réfrigerant" : "PARD@3C52@3C52@TE7086.PNT",
        "labels" : "labels"}
# Reverse lookup: technical point id -> human-readable name.
inv_dico = {v: k for k, v in dico.items()}
Thresholding¶
We have noticed an absurd value - always the same - appearing often in the signal, corresponding to the saturation of the sensor, which then returns an arbitrary value.
The following function is dedicated to getting rid of these anomalies by deleting any value above a given threshold.
# Separate reading of file
def load_csv_file(filename):
    """Read one raw export file and return it as a typed DataFrame.

    The raw csv uses the technical point ids as column headers. Each column
    is converted on the fly (timestamps parsed, measurements cast to float)
    and renamed to its human-readable name.

    :param filename: path of the csv export to read
    :returns: a DataFrame with a 'TIMESTAMPS' column (datetime) and one
              float column per sensor
    """
    # (technical id in the file) -> (readable column name, converter)
    data_desc = {
        "Point Name": ("TIMESTAMPS", lambda s: dt.datetime.strptime(s, '%m/%d/%Y %I:%M:%S %p')),
        "PARD@3C52@3C52-M@JT7099.CAL": ("MOTOR CURRENT", float),
        "PARD@3C52@3C52-M@TE7011A.PNT": ("Température palier étage 1", float),
        "PARD@3C52@3C52-M@TE7021A.PNT": ("Température palier étage 2", float),
        "PARD@3C52@3C52-M@TE7031A.PNT": ("Température palier étage 3", float),
        "PARD@3C52@3C52-M@TE7041A.PNT": ("Température palier étage 4", float),
        "PARD@3C52@3C52-M@VT7001.PNT:RAW": ("Déplacement axiale 1/2", float),
        "PARD@3C52@3C52-M@VT7002.PNT:RAW": ("Déplacement axiale 3/4", float),
        "PARD@3C52@3C52-M@VT7011A.PNT:RAW": ("1e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7011B.PNT:RAW": ("1e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7021A.PNT:RAW": ("2e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7021B.PNT:RAW": ("2e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7031A.PNT:RAW": ("3e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7031B.PNT:RAW": ("3e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7041A.PNT:RAW": ("4e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7041B.PNT:RAW": ("4e stade vibration Y", float),
        "PARD@3C52@3C52@TE7086.PNT": ("Température huile sortie réfrigerant", float)
    }
    # Rows 1-5 after the header are skipped (presumably export metadata,
    # not measurements). The dead "if v is not None" filter that used to
    # guard the converters has been removed: no entry is ever None.
    df = pd.read_csv(filename,
                     converters={k: conv for k, (_, conv) in data_desc.items()},
                     skiprows=[1, 2, 3, 4, 5])
    df.rename(columns={k: name for k, (name, _) in data_desc.items()}, inplace=True)
    return df
def apply_thresholding_inplace(x, lower_bound, upper_bound):
    """Replace with NaN every value of the Series ``x`` lying outside
    ``[lower_bound, upper_bound]``. A bound set to None is ignored.
    ``x`` is modified in place; nothing is returned."""
    # Check the lower side first, then the upper side, masking out-of-range
    # values as we go.
    for bound, outside in ((lower_bound, x.lt), (upper_bound, x.gt)):
        if bound is not None:
            x.mask(outside(bound), inplace = True)
def thresholding_inplace(df):
    """Remove the saturated / absurd sensor values from ``df`` in place.

    Each column is restricted to its own plausible physical range
    (established by inspecting the signals): any value outside
    [lower, upper] is replaced with NaN. A bound of None means
    "no bound on this side".

    :param df: DataFrame produced by load_csv_file
    :returns: the same DataFrame object, for chaining convenience
    """
    # column name -> (lower bound, upper bound); replaces 32 copy-pasted
    # calls with one data-driven loop.
    bounds = {
        "MOTOR CURRENT": (300, None),
        "Température palier étage 1": (70, 125),
        "Température palier étage 2": (70, 100),
        "Température palier étage 3": (60, 100),
        "Température palier étage 4": (77, 100),
        "Déplacement axiale 1/2": (None, 50),
        "Déplacement axiale 3/4": (None, 50),
        "1e stade vibration X": (3, 50),
        "1e stade vibration Y": (5, 50),
        "2e stade vibration X": (4, 50),
        "2e stade vibration Y": (6, 50),
        "3e stade vibration X": (None, 50),
        "3e stade vibration Y": (None, 50),
        "4e stade vibration X": (1, 50),
        "4e stade vibration Y": (1, 50),
        "Température huile sortie réfrigerant": (None, 50),
    }
    for column, (lower, upper) in bounds.items():
        series = df[column]
        if lower is not None:
            series.mask(series < lower, inplace = True)
        if upper is not None:
            series.mask(series > upper, inplace = True)
    return df
First, we decided to train our algorithms only on the last three years. First things first: we import those data as Pandas DataFrames and apply the thresholds previously set.
# Load the three most recent years of 10-minute measurements, then clip each
# one to the plausible sensor ranges defined above.
df_2018 = load_csv_file('2018 10min.csv')
df_2019 = load_csv_file('2019 10min.csv')
df_2020 = load_csv_file('2020 10min.csv')
# thresholding_inplace mutates its argument and returns it; the assignment
# just rebinds the same object.
df_2018 = thresholding_inplace(df_2018)
df_2019 = thresholding_inplace(df_2019)
df_2020 = thresholding_inplace(df_2020)
#joining the three years to create a continuous set of data
df_total = pd.concat((df_2018,df_2019,df_2020), axis = 0)
# df_total = thresholding_inplace(df_total)
# df_total
TIMESTAMPS | MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 00:10:00 | 902.333740 | 83.399986 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 |
1 | 2018-01-01 00:20:00 | 902.333740 | 83.599983 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 |
2 | 2018-01-01 00:30:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 |
3 | 2018-01-01 00:40:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 |
4 | 2018-01-01 00:50:00 | 897.018066 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37376 | 2020-09-16 14:30:00 | 855.821594 | 89.399971 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN |
37377 | 2020-09-16 14:40:00 | 847.848083 | 89.399971 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN |
37378 | 2020-09-16 14:50:00 | 843.861389 | 89.199974 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN |
37379 | 2020-09-16 15:00:00 | 849.177002 | 89.399971 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN |
37380 | 2020-09-16 15:10:00 | 850.505981 | 89.399971 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN |
142681 rows × 17 columns
Spotting of the correlations and choice of the main features¶
Why don't we take all the features¶
By looking at the data, we clearly see that, out of the 16 measures we have, most of them are correlated, and it would be useless and time-consuming to use them all, as well as requiring more computation. Furthermore, too many training features can result in lower performance of the algorithm.
How to choose those features?¶
The main idea there is to delete redundancy, to do that we need to visualize them using a correlation matrix. Then we just select data that do not seem too correlated (less than 50% of correlation). Those will be our features.
# Y = df_total.iloc[:,1:]
# profile = ProfileReport(Y, title='Correlation matrix')
# profile
Result¶
We can spot five features which do not seem too correlated :
- MOTOR CURRENT
- Température étage 1
- Déplacement axial 1/2
- Déplacement axial 3/4
- 1e stade vibration X
Now we will only work on those data which are less correlated as we can see below.
# Keep only the timestamp column plus the five weakly-correlated features
# identified from the correlation report above.
selected = ['TIMESTAMPS', 'MOTOR CURRENT', 'Température palier étage 1', 'Déplacement axiale 1/2', 'Déplacement axiale 3/4', '1e stade vibration X']
df_total = df_total[selected]
# df_total
TIMESTAMPS | MOTOR CURRENT | Température palier étage 1 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | |
---|---|---|---|---|---|---|
0 | 2018-01-01 00:10:00 | 902.333740 | 83.399986 | -0.046250 | -0.120000 | 4.625 |
1 | 2018-01-01 00:20:00 | 902.333740 | 83.599983 | -0.046250 | -0.120000 | 4.625 |
2 | 2018-01-01 00:30:00 | 902.333740 | 83.599983 | -0.046250 | -0.120000 | 4.625 |
3 | 2018-01-01 00:40:00 | 902.333740 | 83.599983 | -0.046250 | -0.120000 | 4.625 |
4 | 2018-01-01 00:50:00 | 897.018066 | 83.599983 | -0.046250 | -0.120000 | 4.625 |
... | ... | ... | ... | ... | ... | ... |
37376 | 2020-09-16 14:30:00 | 855.821594 | 89.399971 | -0.033999 | -0.118028 | 5.125 |
37377 | 2020-09-16 14:40:00 | 847.848083 | 89.399971 | -0.033849 | -0.117976 | 5.125 |
37378 | 2020-09-16 14:50:00 | 843.861389 | 89.199974 | -0.033699 | -0.117925 | 5.125 |
37379 | 2020-09-16 15:00:00 | 849.177002 | 89.399971 | -0.033549 | -0.117873 | 5.125 |
37380 | 2020-09-16 15:10:00 | 850.505981 | 89.399971 | -0.033399 | -0.117821 | 5.125 |
142681 rows × 6 columns
# Y = df_total.iloc[:,1:]
# profile = ProfileReport(Y, title='Correlation matrix')
# profile
Labelling¶
This step aims at adding a column to the dataframe in which we assign a specific number according to the status of the given value: part of an anomaly or not.
To decide whether a value is part of an anomaly or not, we focus on the 'Température palier étage 1', and look for oscillations only in that signal.
If the given data does not belong to an anomaly, it is assigned the value 0. Otherwise, it is assigned the number of abnormal oscillations in the actual anomaly it belongs to. If the status of a data is ambiguous, it is assigned the value -1 by default.
# Before the labelling, mark every point as normal (label 0); the anomalous
# spans are overwritten further below.
df_2018['labels'] = 0
df_2019['labels'] = 0
df_2020['labels'] = 0
# Equivalent alternative kept for reference:
#df_2018.insert(len(df_2018.columns), 'labels', 0)
#df_2019.insert(len(df_2019.columns), 'labels', 0)
#df_2020.insert(len(df_2020.columns), 'labels', 0)
The labelling has been performed 'by hand', by observing the signal and counting the number of oscillations.
To do so, we plotted the values and zoomed in on specific periods of time when necessary. We decided to consider as different anomalies abnormal signals that presented different patterns. We paid close attention to the time period of the oscillations.
When two abnormal periods were separated by less than 1 day and were close enough in terms of pattern, they were considered as one.
#Example for year 2020
df_ex = df_2020
COLUMN = "Température palier étage 1"
STEP = 17000
L = len(df_ex[COLUMN])
# Scan "Température palier étage 1" for year 2020 in overlapping windows and
# plot only the windows whose value range exceeds 2 (candidate anomalies).
for i in range(4, L-STEP, STEP) :
    # windows overlap by 2000 points so an anomaly straddling a window
    # boundary is not missed; .loc slicing is inclusive, hence STEP+2001 points
    tab = np.array(df_ex[COLUMN].loc[i:(i+STEP+2000)])
    if (np.nanmax(tab) - np.nanmin(tab) > 2):
        plt.figure(figsize = (30,6))
        plt.plot(range(i,i+STEP+2001), tab) ;
        plt.grid()
        plt.show()
#Zoom on a specified abnormal period
deb, fin = 10000, 13000
tab = np.array(df_ex[COLUMN].loc[deb:fin])
plt.figure(figsize = (30,6))
plt.plot(range(deb,fin +1), tab) ;
plt.grid()
plt.title("Zoom on the abnormal period ranging from 03/13/2020 to 03/29/2020", size = 20)
plt.show()
Above, is an example of a zoom over an abnormal period, presenting 4 oscillations, of similar pattern.
Below, are the different anomalies that we detected. We update the 'labels' column of our dataframe with the number of oscillations.
# Manually-established anomaly labels (one assignment per observed period).
# Each slice receives the number of abnormal oscillations counted in it,
# or -1 when the status of the period is ambiguous.
# .loc replaces .at here: .at is documented for scalar label access only,
# and slice keys only worked through a fallback removed in newer pandas.
# Like .at, .loc label slices are inclusive of both endpoints.
df_2018.loc[28750:29878, 'labels'] = -1
df_2018.loc[31500:35261, 'labels'] = 4
df_2018.loc[35261:36000, 'labels'] = 4
df_2018.loc[36000:41927, 'labels'] = 4
df_2018.loc[51000:51500, 'labels'] = -1
df_2018.loc[51500:, 'labels'] = 9
df_2019.loc[4:1228, 'labels'] = 9
df_2019.loc[1228:3000, 'labels'] = -1
df_2019.loc[3000:9000, 'labels'] = 8
df_2020.loc[7880:8516, 'labels'] = -1
df_2020.loc[8516:9786, 'labels'] = 3
df_2020.loc[10500:12800, 'labels'] = 4
df_2020.loc[12800:14840, 'labels'] = -1
df_2020.loc[15900:20310, 'labels'] = 4
df_2020.loc[22130:23460, 'labels'] = 2
In order to check our labelling, we plotted the signal and overlaid the label. On the following figure, to improve the readability of the figure, we added 84 to the label variable.
# Sanity check: overlay the labels on the 2018 signal.
tab_signal18 = df_2018['Température palier étage 1']
tab_label18 = df_2018['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal18)
# +84 lifts the label curve into the temperature range for readability
plt.plot(tab_label18+84)
plt.show()
# Same check for 2019.
tab_signal19 = df_2019['Température palier étage 1']
tab_label19 = df_2019['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal19)
plt.plot(tab_label19+84)
plt.show()
# Same check for 2020.
tab_signal20 = df_2020['Température palier étage 1']
tab_label20 = df_2020['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal20)
plt.plot(tab_label20+84)
plt.show()
Finally, we concatenate the 3 years that we studied in one final dataframe and we convert it into a .csv file.
# Concatenate the three labelled years into one continuous DataFrame
# (overwrites the reduced df_total built earlier).
df_total = pd.concat((df_2018,df_2019,df_2020), axis = 0)
# df_total
TIMESTAMPS | MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 00:10:00 | 902.333740 | 83.399986 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 | 0 |
1 | 2018-01-01 00:20:00 | 902.333740 | 83.599983 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 | 0 |
2 | 2018-01-01 00:30:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 | 0 |
3 | 2018-01-01 00:40:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 | 0 |
4 | 2018-01-01 00:50:00 | 897.018066 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37376 | 2020-09-16 14:30:00 | 855.821594 | 89.399971 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN | 0 |
37377 | 2020-09-16 14:40:00 | 847.848083 | 89.399971 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN | 0 |
37378 | 2020-09-16 14:50:00 | 843.861389 | 89.199974 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN | 0 |
37379 | 2020-09-16 15:00:00 | 849.177002 | 89.399971 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN | 0 |
37380 | 2020-09-16 15:10:00 | 850.505981 | 89.399971 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN | 0 |
142681 rows × 18 columns
III. Gap Filling¶
This algorithm takes a labeled Pandas dataset and fills in gaps of two types: gaps generated through prior threshold processing, and gaps in the sampling time steps.
Why do we need to gap fill? What is our method?¶
Because of the thresholding used previously, there are many "missing points" in the dataset, as we can see below. There are also points missing for unknown reasons. Some missing points are very punctual and can easily be interpolated and used, but some unknown areas are too wide to be correctly interpolated. To simplify our job we will interpolate all the missing values; the areas we cannot correctly interpolate will also be interpolated, but removed by the end of the work on the data. Labels in the missing points will be set to 0, as they are considered punctual enough not to interfere with the training.
First we do the list of every time interval between two measurements in order to delete the measurement made at the same time and to spot the abnormal measures. Then we create empty patches where needed to complete the dataframe. Finally we interpolate those points from their neighbors using linear interpolation.
Just below we can see an example of a gap induced by thresholding needing to be filled.
# Top: a slice of the 2020 signal showing the gaps left by thresholding.
plt.subplot(211)
tab = np.array(df_2020['Température palier étage 1'].loc[:2500])
plt.plot(tab);
# Bottom: the whole year folded into rows of 144 samples (one day at the
# 10-minute sampling step), displayed as an image so gaps show as stripes.
plt.subplot(212)
tab = np.array(df_2020['Température palier étage 1'])
# zero-pad up to the next multiple of 144 so the reshape is exact
tab = np.pad(tab, (0, 144*(tab.shape[0]//144+1)-tab.shape[0]))
tab = np.reshape(tab, (-1,144))
plt.imshow(tab.T)
<matplotlib.image.AxesImage at 0x7fdc332dbb50>
# DEFINING GLOBAL VARIABLES
# Name of the column holding the measurement datetimes, and the csv field
# separator (apparently unused by the visible code below — TODO confirm).
timestamps_column = 'TIMESTAMPS'
sep = ','
The first step is reading the measurements' timestamps in order to be able to fill in the temporal irregularities.
First, we create a function that reads the dataframe, spots the most frequent time intervals, converts the datetime string used in the dataframe into timestamps easier to use, and finally creates a list with every time interval.
def get_deltas(dataframe: pd.DataFrame):
    """
    Compute the sampling time deltas of a measurement DataFrame.

    The frame is sorted by its 'TIMESTAMPS' column *in place* (callers rely
    on this side effect), then the successive time differences are derived.

    :param dataframe: DataFrame with a 'TIMESTAMPS' datetime column
    :returns: (most frequent time delta as a numpy timedelta64,
               the sorted 'TIMESTAMPS' Series,
               the Series of time deltas in seconds — NaN for the first row)
    """
    dataframe.sort_values("TIMESTAMPS", inplace = True)
    deltas = dataframe["TIMESTAMPS"].diff()
    # value_counts drops the leading NaT and counts each distinct delta;
    # idxmax yields the most frequent one directly. This replaces the
    # previous unique()/map/bincount round-trip, which also counted NaT.
    # .to_timedelta64() keeps the historical return type (numpy timedelta64),
    # which the caller later converts with .astype('timedelta64[s]').
    most_frequent_delta = deltas.value_counts().idxmax().to_timedelta64()
    return most_frequent_delta, dataframe["TIMESTAMPS"], deltas.dt.total_seconds()
# df_total
TIMESTAMPS | MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 00:10:00 | 902.333740 | 83.399986 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 | 0 |
1 | 2018-01-01 00:20:00 | 902.333740 | 83.599983 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 | 0 |
2 | 2018-01-01 00:30:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 | 0 |
3 | 2018-01-01 00:40:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 | 0 |
4 | 2018-01-01 00:50:00 | 897.018066 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37376 | 2020-09-16 14:30:00 | 855.821594 | 89.399971 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN | 0 |
37377 | 2020-09-16 14:40:00 | 847.848083 | 89.399971 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN | 0 |
37378 | 2020-09-16 14:50:00 | 843.861389 | 89.199974 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN | 0 |
37379 | 2020-09-16 15:00:00 | 849.177002 | 89.399971 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN | 0 |
37380 | 2020-09-16 15:10:00 | 850.505981 | 89.399971 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN | 0 |
142681 rows × 18 columns
Reading the data timepoints using get_deltas
¶
As we previously concatenated the dataframes, the indexes have to be reset in order to be unique and ordered. Then we delete the prior indexes, delete the beginning of the dataframe, which only contains useless data for the program, and use the function get_deltas
.
# Compute the dominant sampling step (a numpy timedelta64) and the per-row
# time deltas in seconds; get_deltas also sorts df_total by timestamp in place.
most_frequent_delta, timepoints_dtlist, deltas_seclist = get_deltas(df_total)
# df_total
TIMESTAMPS | MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 00:10:00 | 902.333740 | 83.399986 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 | 0 |
1 | 2018-01-01 00:20:00 | 902.333740 | 83.599983 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 | 0 |
2 | 2018-01-01 00:30:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 | 0 |
3 | 2018-01-01 00:40:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 | 0 |
4 | 2018-01-01 00:50:00 | 897.018066 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37376 | 2020-09-16 14:30:00 | 855.821594 | 89.399971 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN | 0 |
37377 | 2020-09-16 14:40:00 | 847.848083 | 89.399971 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN | 0 |
37378 | 2020-09-16 14:50:00 | 843.861389 | 89.199974 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN | 0 |
37379 | 2020-09-16 15:00:00 | 849.177002 | 89.399971 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN | 0 |
37380 | 2020-09-16 15:10:00 | 850.505981 | 89.399971 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN | 0 |
142681 rows × 18 columns
Dropping duplicate values (time-wise)¶
We create a column with the timestamps and add this column to the dataframe. This column becomes the new index of the dataframe. Finally we delete the rows with the same timestamps e.g. the simultaneous measurements.
# Drop simultaneous measurements (rows sharing the same timestamp), keeping
# the first one, so 'TIMESTAMPS' can later serve as a unique index.
df_total.drop_duplicates(subset=['TIMESTAMPS'], keep='first', inplace = True)
# SETTING INDEX
#df = df.set_index('TIMESTAMPS')
#df.drop(columns=['index'], inplace=True)
Inserting missing samples to get regular time intervals in the data¶
As we extracted the most frequent time intervals, we want to be able to insert lines in the place of temporal gaps in order to obtain a dataframe of regularly spaced data points.
# WARNING: this function uses np.nan as the filler value.
def create_empty_lines(df: pd.DataFrame, t_start: pd.Timestamp, periods: int, delta_t: int) -> pd.DataFrame:
    """
    Build a patch of NaN rows to insert after ``t_start``.

    :param df: DataFrame whose columns the patch must reproduce
    :param t_start: timestamp of the last existing row before the gap
    :param periods: total number of points in the range starting at t_start
    :param delta_t: time step in seconds
    :returns: a DataFrame of NaN values indexed by the ``periods - 1``
              timestamps that follow ``t_start``
    """
    date_range = pd.date_range(t_start, periods=periods, freq=f'{delta_t}s')
    # date_range[0] is t_start itself, which already exists in the caller's
    # frame, hence the [1:] slice.
    return pd.DataFrame(np.nan, index = date_range[1:], columns = df.columns)
def complete(df_in: pd.DataFrame, deltas_seclist: list, delta_t: int) -> pd.DataFrame :
    """
    Complete a timestamp-indexed, sorted dataframe with NaN rows so that its
    time step becomes regular.

    :param df_in: input DataFrame (copied, not mutated), indexed by timestamp
    :param deltas_seclist: per-row time deltas in seconds (from get_deltas)
    :param delta_t: nominal (most frequent) time step in seconds
    :returns: a new DataFrame with NaN patch rows inserted in each gap

    NOTE(review): this function appears unused — its only call site below is
    commented out in favour of the reindex() approach.
    NOTE(review): deltas_seclist[0] is NaN (diff of the first row), so
    int(delta_dt / delta_t) would raise on the first gap check — confirm
    before re-enabling.
    NOTE(review): deltas_seclist[i] looks like the gap *preceding* row i,
    while the patch is inserted *after* row i — possible off-by-one; verify.
    """
    # Stuff
    df = df_in.copy()
    timestamps_list = list(df.index)
    id_new = 0  # running index including inserted rows; computed but never used
    for i in range(len(timestamps_list) - 1) :
        # Getting the time deltas (in seconds)
        delta_dt = deltas_seclist[i]
        # If missing values
        if (delta_dt != delta_t) :
            # Generating the empty lines to insert
            periods = int(delta_dt / delta_t)
            patch = create_empty_lines(df, timestamps_list[i], periods, delta_t)
            # incrementing the new index with the inserted lines
            id_new += periods
            timestamp = timestamps_list[i]
            timestamp_next = timestamps_list[i+1]
            # Concatenating (inserting the lines)
            df_begin = df.loc[:timestamp, :]
            df_end = df.loc[timestamp_next:, :]
            df = pd.concat([df_begin, patch, df_end], axis=0)
        # incrementing the new index
        id_new += 1
    return df
#df_total = complete(df_total, deltas_seclist, most_frequent_delta)
# completing dataframe
# Convert the dominant step to a DateOffset, build the complete regular time
# range spanning the data, then reindex on it: every missing timestamp
# becomes a row of NaN values (filled by interpolation below).
most_frequent_delta_dateoffset = pd.DateOffset(seconds=int(most_frequent_delta.astype('timedelta64[s]').astype(int)))
complete_timerange = pd.date_range(df_total.iloc[0,0],df_total.iloc[-1,0],freq=most_frequent_delta_dateoffset)
df_total.set_index('TIMESTAMPS', inplace = True)
df_total = df_total.reindex(complete_timerange)
Interpolating the data per column in order to remove missing (NaN
) points, and filling in the gaps in the labels with 0
¶
Now we need to fill the lines we inserted in the place of the missing data points. To do so, we use Pandas
’ native interpolate
function which replaces NaN
points with the interpolated values. We chose to use a linear interpolation method. In the case of the labels, we only want to fill in the gaps with the value 0. which represents a normal data point. Missing data points with no leading or trailing actual values will remain untouched in their initial NaN
state.
# Gaps in the labels are considered normal points, so fill them with 0
# before interpolating the measurement columns.
df_total['labels'].fillna(value = 0.0, inplace = True)
interpolated_columns = [c for c in df_total.columns if c not in {'labels','TIMESTAMPS'}]
# 'slinear' is a first-order spline, i.e. linear interpolation; update()
# writes the interpolated values back, leaving non-NaN entries untouched.
# Leading/trailing NaN runs have no neighbours on both sides and stay NaN.
df_total.update(df_total[interpolated_columns].interpolate(method='slinear'))
# Interpolates except if leading/trailing NaN values
#for column_name in df_total.columns[:-1]:
#    column_to_fill = df_total.loc[:, column_name].astype(float)
#    column_to_fill = column_to_fill.interpolate(method='slinear')
#    df_total = df_total.assign(**{column_name: column_to_fill})
# Filling in the gaps in the labels
#df_total['labels'].fillna(value = 0, inplace = True)
#df_total.reset_index(inplace=True)
# df_total
MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-01-01 00:10:00 | 902.333740 | 83.399986 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 | 0.0 |
2018-01-01 00:20:00 | 902.333740 | 83.599983 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 | 0.0 |
2018-01-01 00:30:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 | 0.0 |
2018-01-01 00:40:00 | 902.333740 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 | 0.0 |
2018-01-01 00:50:00 | 897.018066 | 83.599983 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2020-09-16 14:30:00 | 855.821594 | 89.399971 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN | 0.0 |
2020-09-16 14:40:00 | 847.848083 | 89.399971 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN | 0.0 |
2020-09-16 14:50:00 | 843.861389 | 89.199974 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN | 0.0 |
2020-09-16 15:00:00 | 849.177002 | 89.399971 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN | 0.0 |
2020-09-16 15:10:00 | 850.505981 | 89.399971 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN | 0.0 |
142507 rows × 17 columns
Results¶
Just below we can see the data sample previously used to demonstrate the issue, and its result once interpolated. As we can see, the approximation is very naive but won't interfere too much with the programs.
# Visual check of the gap filling.
plt.subplot(311)
# Top: raw sample used earlier to demonstrate the gaps
# (NOTE(review): presumably the same window as the middle plot — confirm).
tab = np.array(df_2020['Température palier étage 1'].loc[4:2500].astype('float32'))
plt.plot(tab)
plt.subplot(312)
# Middle: corresponding window of the interpolated full dataframe.
tab2 = np.array(df_total.iloc[105120:107617].loc[:,'Température palier étage 1'].astype('float32'))
plt.plot(tab2);
plt.subplot(313)
# Bottom: whole series folded into days (144 = 6 samples/hour * 24 h at the
# 10-minute sampling period), padded to a whole number of days, shown as a heatmap.
tab = np.array(df_total['Température palier étage 1'])
tab = np.pad(tab, (0, 144*(tab.shape[0]//144+1)-tab.shape[0]))
tab = np.reshape(tab, (-1,144))
plt.imshow(tab.T)
<matplotlib.image.AxesImage at 0x7fdc33b1fb90>
IV. Standardization¶
Collecting the data¶
Feature : Temperature 1¶
# Feature studied in the rest of the notebook: first-stage bearing temperature.
temp = df_total["Température palier étage 1"]
temp.name  # notebook display: confirms the series' name
'Température palier étage 1'
We can add other features if we want
# Series to standardize; add more features here if needed.
features = [temp]
cles = ["Température palier étage 1"]  # (French "keys") column names matching `features`
nb_features = len(features)
Standardization¶
Why standardize data ?¶
When we have multiple features, it is necessary to reduce them to the same scale by modifying the mean and the variance (indeed, each feature has its own scale: temperature, power, ...)
In our case, it helps our ML algorithms to learn well. This preprocessing step is necessary.
What is the principle ?¶
The goal is to have a null mean $\mu = 0$ and a standard deviation $\sigma = 1$ for our data.
To do this, we make standard transformation on our data to center and to reduce : $ y = \frac{x - \mu}{\sigma} $
Implementation¶
We use the scikit-learn library, and especially the StandardScaler class.
Application to labeled data : standard deviation calculation on normal slices, not on outlier ones¶
# Vectorised segmentation of the series into maximal runs of "normal"
# (label == 0) and "outlier" (label > 0) samples, as half-open index pairs.
labels = df_total.loc[:, "labels"].to_numpy(dtype='i4')
labels_mask = np.where(labels > 0, 1, 0)
# Indices where the mask flips, framed by 0 and len(labels).
change_points = np.argwhere(np.diff(labels_mask) != 0).flatten() + 1
bounds = np.pad(change_points, (1, 1))
bounds[-1] = labels_mask.shape[0]
slices = list(zip(bounds[:-1], bounds[1:]))
# Runs alternate, so the parity of the first run decides which half is which.
first_is_normal = (labels[0] == 0)
normal_slices = list(slices[0::2] if first_is_normal else slices[1::2])
outliers_slices = list(slices[1::2] if first_is_normal else slices[0::2])
print(f"Normal slices: {normal_slices}")
print(f"Outliners slices: {outliers_slices}")
Normal slices: [(0, 31506), (41934, 51500), (52560, 52561), (53788, 55560), (61561, 113636), (114907, 115620), (117803, 117809), (117926, 121026), (125437, 127256), (128587, 142507)] Outliners slices: [(31506, 41934), (51500, 52560), (52561, 53788), (55560, 61561), (113636, 114907), (115620, 117803), (117809, 117926), (121026, 125437), (127256, 128587)]
# collect the labels
# Obtain normal and outlier slices from the labels with a single scan.
# A sample is an outlier iff its label is > 0; anything else (0, and any
# unexpected value such as NaN) counts as normal — the original loop tested
# `== 0` and `> 0` separately, so other values silently extended the current
# run and produced slices inconsistent with the vectorised version above.
label = df_total.loc[:, "labels"].to_numpy()
normal_slices = []
outliers_slices = []
start_slice = 0
in_outlier = bool(label[0] > 0)
for i in range(1, label.size):
    sample_is_outlier = bool(label[i] > 0)
    if sample_is_outlier != in_outlier:
        # Close the run that just ended and open a new one at i.
        if in_outlier:
            outliers_slices += [(start_slice, i)]
        else:
            normal_slices += [(start_slice, i)]
        start_slice = i
        in_outlier = sample_is_outlier
# Close the final (possibly only) run.
if in_outlier:
    outliers_slices += [(start_slice, label.size)]
else:
    normal_slices += [(start_slice, label.size)]
print(f"Normal slices: {normal_slices}")
print(f"Outliners slices: {outliers_slices}")
Normal slices: [(0, 31506), (41934, 51500), (52560, 52561), (61561, 113636), (114907, 115620), (117803, 117809), (119967, 121026), (125437, 127256), (128587, 142507)] Outliners slices: [(31506, 41934), (51500, 52560), (52561, 61561), (113636, 114907), (115620, 117803), (117809, 119967), (121026, 125437), (127256, 128587)]
# standardization : without using scikitlearn
# Mean/std are estimated on the NORMAL samples only, so outliers do not
# distort the scaling, then applied to the whole series.
labels = df_total.loc[:, "labels"].to_numpy(dtype='i4')
normal_mask = (labels == 0)  # also reused by the scikit-learn cell below
for feature in features:
    std_to_apply = np.std(feature[normal_mask])
    mean_to_apply = np.mean(feature[normal_mask])
    print(f"Mean transformation applied: {mean_to_apply}")
    print(f"STD transformation applied: {std_to_apply}")
    # NOTE(review): these in-place ops mutate the underlying df_total column
    # through the shared Series — presumably intended; confirm.
    feature -= mean_to_apply
    feature /= std_to_apply
    # plot
    fig = plt.figure(figsize=(30,6))
    plt.plot(feature) ;
Mean transformation applied: 86.05305514719214 STD transformation applied: 2.3162345287069024
# STANDARDIZATION : with scikitlearn (we finally use this method)
# Relies on `normal_mask` computed in the previous cell.
for feature in features:
    scaler = StandardScaler()
    # Fit mean/std on the normal (label == 0) samples only...
    scaler.fit(feature[normal_mask].to_numpy().reshape((-1,1)))
    # ...then transform the whole series, outliers included, and write the
    # result back into df_total.
    feature[:] = scaler.transform(feature.to_numpy().reshape((-1,1))).flatten()
    df_total.update(feature)
    fig = plt.figure(figsize=(30,6))
    plt.plot(feature)
V. Smoothing¶
Kernel Density Estimation (KDE)¶
We want to smooth out the noise in order to better see the global tendencies of the curve.
Smoothing method¶
The values provided by the captors are discrete, we want to approximate the variation of the features by using the Kernel Density Estimation. Through this method, we take an interval of points around an easting value, and calculate an approximated function that fits the curve and then gives the new smoothed value. The curve returned is continuous and more regular.
Choice of the studied (scaled) data¶
# Window of the standardised series used for the smoothing experiments.
BEGIN = 0
END = 1000
scaled_data1 = df_total["Température palier étage 1"][BEGIN:END]
plt.figure(figsize=(10,5))
plt.title("Features' variation throughout the time")
plt.plot(scaled_data1[:100]);
Choice of the kernel and bandwidth¶
We chose to use the skfda library.
# df_total
def apply_transform(kernel, df):
    """Apply an skfda smoother to a pandas Series.

    The series is wrapped into a one-sample FDataGrid (one value per
    integer grid point), smoothed with `kernel`, and unwrapped into a
    copy of the original series so the pandas index is preserved.
    """
    values = df.to_numpy().reshape((1, -1))
    grid = range(values.shape[1])
    functional_data = FDataGrid(values, grid)
    smoothed = kernel.fit_transform(functional_data)
    result = df.copy()
    result[:] = smoothed.data_matrix.flatten()
    return result
Kernel choice¶
# Local linear regression smoother. (The plot was previously titled
# 'Gaussian kernel' and labelled 'gaussian' although the smoother used
# is LocalLinearRegressionSmoother — labels fixed accordingly.)
smootherReg = ks.LocalLinearRegressionSmoother(smoothing_parameter=10)
smooth_dataReg = apply_transform(smootherReg, scaled_data1)
plt.figure(figsize=(20,10))
plt.title('Local linear regression kernel')
line1, = plt.plot(smooth_dataReg, 'blue', label = 'local linear regression')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
This solution does not follow the curve when there is a rapid variation or when the temperature is constant. For these reasons we put it aside.
# KNN smoother: each point is averaged over its 10 nearest neighbours.
smootherKNN = ks.KNeighborsSmoother(smoothing_parameter=10)
smooth_dataKNN = apply_transform(smootherKNN, scaled_data1)
plt.figure(figsize=(20,10))
line1, = plt.plot(smooth_dataKNN, 'blue', label = 'KNN')
plt.title('KNN kernel')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
# Nadaraya-Watson smoother (kernel-weighted local average, bandwidth 10) —
# the smoother retained for the rest of the notebook.
smootherNW = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_dataNW = apply_transform(smootherNW, scaled_data1)
plt.figure(figsize=(20,10))
line1, = plt.plot(smooth_dataNW, 'blue', label = 'NW')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
The two last kernels suit, we choose the Nadaraya-Watson one.
Bandwidth choice¶
The bandwidth decides the interval of approximation around each easting value. The lower it is, the stronger the approximation fits the curve.
# Bandwidth comparison. Each smoother is applied to the SAME raw series so
# the three curves are directly comparable. (The original chained them —
# bd=5 was applied to the bd=100 output, bd=0.1 to that result — and the
# 'bd=0.1' line actually plotted smooth_dataNW, the bd=10 series from an
# earlier cell, instead of smooth_dataNW_3.)
smootherNW_1 = ks.NadarayaWatsonSmoother(smoothing_parameter=100)
smooth_dataNW_1 = apply_transform(smootherNW_1, scaled_data1)
smootherNW_2 = ks.NadarayaWatsonSmoother(smoothing_parameter=5)
smooth_dataNW_2 = apply_transform(smootherNW_2, scaled_data1)
smootherNW_3 = ks.NadarayaWatsonSmoother(smoothing_parameter=0.1)
smooth_dataNW_3 = apply_transform(smootherNW_3, scaled_data1)
plt.figure(figsize=(20,10))
plt.title('Comparaison BD')
plt.plot(scaled_data1,'blue');
line1, = plt.plot(smooth_dataNW_1, 'yellow', label = 'bd=100')
line2, = plt.plot(smooth_dataNW_2, 'orange', label = 'bd=5')
line3, = plt.plot(smooth_dataNW_3, 'red', label = 'bd=0.1')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
In order to follow the curve while having a visible smoothing effect we choose bandwidth = 10.
Wavelet¶
df_2018_example = load_csv_file("2018 10min.csv")
The working data are noisy and cannot be treated directly, as shown in the following example:
# Sample window illustrating the sensor noise before denoising.
COLUMN = "Température huile sortie réfrigerant"
BGN = 7500
END = 8000
X = np.arange(BGN, END)
data = df_2018_example.iloc[BGN:END][COLUMN]
plt.plot(X, data, color = 'b');
In order to denoise we are going to use an algorithm based on wavelets transform.
The algorithm¶
def sure(lbd, x, N):
    """Stein's Unbiased Risk Estimate (SURE) of soft-thresholding
    the coefficients `x` at threshold `lbd`.

    Parameters:
        lbd: candidate threshold (float)
        x:   detail coefficients (np.array)
        N:   number of coefficients, len(x)
    """
    return (N + np.sum(np.minimum(np.abs(x), lbd) ** 2)
            - 2 * np.sum(np.abs(x) < lbd))
def compute_sure_threshold(x: np.array) -> float:
    """Return the threshold VALUE minimising SURE over candidates in
    [0, sqrt(2 log N)) with step 0.01.

    Bug fix: the original returned np.argmin(...), i.e. the INDEX of the
    best candidate, not the candidate itself; `shrink` then used that
    integer index as the threshold (100x too large beyond 0.01).
    """
    N = x.shape[0]
    lbd_candidates = np.arange(0, np.sqrt(2 * np.log(N)), 0.01)
    scores = np.array([sure(lbd, x, N) for lbd in lbd_candidates])
    return lbd_candidates[np.argmin(scores)]
def shrink(x: np.array, lbd: float) -> np.array:
    """Soft-threshold the wavelet coefficients.

    Coefficients with |value| <= lbd are set to 0; the remaining ones are
    moved toward zero by lbd, keeping their sign.
    """
    reduced_magnitude = np.abs(x) - lbd
    return np.sign(x) * np.where(reduced_magnitude > 0, reduced_magnitude, 0)
def threshold_dwt(df, level = 5, wavelet_type = 'db5') -> (list):
    """Denoise a 1-D signal by wavelet soft-thresholding.

    Decomposes `df` with the chosen wavelet down to `level` levels,
    SURE-thresholds every detail band (the approximation band is kept
    intact) and reconstructs the signal.

    Bug fix: the original hard-coded level=5 in pywt.wavedec, silently
    ignoring the `level` argument — hence the "Level value of 5 is too
    high" warnings even when callers passed a smaller level.
    """
    # wavelet based deconstruction
    coeffs = pywt.wavedec(df, wavelet_type, level = level)
    # coefficient thresholding (approximation coeffs[0] left untouched)
    coeffs = [coeffs[0]] + [shrink(cD, compute_sure_threshold(cD))
                            for cD in coeffs[1:]]
    # signal reconstruction
    rdf = pywt.waverec(coeffs, wavelet_type)
    # waverec may return one extra sample when the input length is odd;
    # trim to the original length
    return rdf[:df.shape[0]]
Principle¶
Wavelets are a generalisation of the Fourier transform and make the link between time and frequencies. The algorithm computes the coefficients of the signal in the chosen wavelet decomposition, then sets to zero the coefficients (already close to zero) associated with the high frequencies that cause the noise. The signal is then reconstructed from the new spectrum.
The threshold (lbd in the code) determines how the coefficients are affected, with the following rule:
- If abs(coefficient) <= lbd, then coefficient = 0
- If coefficient < -lbd, then coefficient = coefficient + lbd
- If coefficient > lbd, then coefficient = coefficient - lbd
The threshold is chosen in order to minimize the quantity "sure", which approximately represents the difference between the noised signal and the average signal.
Level indicates how many times high and low frequencies are split. It does not have a major impact on the computation, so the standard value of 5 is chosen.
Choose the wavelets¶
Here is some approximation using different families of wavelets among Daubechies(db), Symlets(sym) and Coiflets(coif).
# Compare wavelet families (Daubechies, Symlets, Coiflets) and orders.
# Bug fix: the original always built 'db'+str(i), so the 'sym' and 'coif'
# passes silently re-ran the Daubechies wavelets instead of their own.
COLUMN = "Température palier étage 1"
BGN = 0
END = 200
X = np.arange(BGN, END)
level = 4
data = df_2018_example.iloc[BGN:END][COLUMN]
for _type in ['db', 'sym', 'coif']:
    for i in range(2, 9):
        level = i
        denoised_data = threshold_dwt(data, wavelet_type = _type + str(i), level = level)
        plt.xlabel(f"type = {_type}{i}")
        plt.plot(X, data, color = 'b');
        plt.plot(X, denoised_data, color = 'g');
        plt.show()
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects. "boundary effects.").format(level))
After examination, we retain the most accurate and smooth curve. Another condition is not to have jumps that are too high, to ensure good properties. Finally, the 'db5' approximation will be used.
Results with db5¶
# Result of the default db5 denoising on a first sample window.
BGN = 11500
END = 12000
X = np.arange(BGN, END)
data = df_2018_example.iloc[BGN:END][COLUMN]
denoised_data = threshold_dwt(data)
plt.plot(X, data);
plt.plot(X, denoised_data, color = 'g');
plt.xlabel(f"Approximation : green")
plt.show()
# Same check on a second, earlier window.
BGN = 500
END = 1000
X = np.arange(BGN, END)
data = df_2018_example.iloc[BGN:END][COLUMN]
denoised_data = threshold_dwt(data)
plt.plot(X, data);
plt.plot(X, denoised_data, color = 'g');
plt.xlabel(f"Approximation : green")
plt.show()
Order of execution¶
We have two algorithms that smooth the features, we are going to test the efficiency of these methods depending on the order we use them in.
1: kde 2: wavelet¶
# Order 1: KDE smoothing first, then wavelet denoising of the smoothed series.
# kde
smootherNW1 = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_data1 = apply_transform(smootherNW1, scaled_data1)
# wavelet
denoised_data1 = scaled_data1.copy()
denoised_data1[:] = threshold_dwt(smooth_data1)
plt.figure(figsize=(20,10))
# NOTE(review): BGN/END still hold 500/1000 from the wavelet section above,
# so only that sub-window of the raw series is drawn — confirm intended.
line0, = plt.plot(scaled_data1[BGN:END], 'blue', label = 'RAW')
line1, = plt.plot(denoised_data1,'orange', label = 'wavelet')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(smooth_data1, 'yellow', label = 'kde')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
1: wavelet 2: kde¶
# Order 2: wavelet denoising first, then KDE smoothing of the denoised series.
# Wavelet
denoised_data2 = scaled_data1.copy()
denoised_data2[:] = threshold_dwt(scaled_data1)
#kde
smootherNW2 = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_data2 = apply_transform(smootherNW2, denoised_data2)
plt.figure(figsize=(20,10))
line0, = plt.plot(scaled_data1, 'blue', label = 'RAW')
line1, = plt.plot(denoised_data2,'orange', label = 'wavelet')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(smooth_data2, 'yellow', label = 'kde')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');
After comparing the two possible orders, we choose to apply the wavelet method first, and then the KDE. The wavelet reduces the noise, yet when the variation is important this method leaves some fluctuation that the KDE method reduces.
Final solution¶
df = df_total.copy()
# df
MOTOR CURRENT | Température palier étage 1 | Température palier étage 2 | Température palier étage 3 | Température palier étage 4 | Déplacement axiale 1/2 | Déplacement axiale 3/4 | 1e stade vibration X | 1e stade vibration Y | 2e stade vibration X | 2e stade vibration Y | 3e stade vibration X | 3e stade vibration Y | 4e stade vibration X | 4e stade vibration Y | Température huile sortie réfrigerant | labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-01-01 00:10:00 | 902.333740 | -1.145423 | 91.199982 | 78.999985 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.60 | 7.068229 | 5.125000 | 7.328948 | 7.337153 | 44.399990 | 0.0 |
2018-01-01 00:20:00 | 902.333740 | -1.059078 | 91.199982 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.022708 | 5.022708 | 7.183333 | 7.346614 | 44.599991 | 0.0 |
2018-01-01 00:30:00 | 902.333740 | -1.059078 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.125000 | 5.125000 | 7.279167 | 7.329514 | 44.599991 | 0.0 |
2018-01-01 00:40:00 | 902.333740 | -1.059078 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.750 | 7.875 | 8.50 | 7.011652 | 5.125000 | 7.375000 | 7.356651 | 44.599991 | 0.0 |
2018-01-01 00:50:00 | 897.018066 | -1.059078 | 91.399979 | 79.199982 | NaN | -0.046250 | -0.120000 | 4.625 | 6.625 | 7.875 | 8.50 | 7.022708 | 5.002292 | 7.327083 | 7.308574 | 44.799992 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2020-09-16 14:30:00 | 855.821594 | 1.444981 | 93.999977 | 80.599976 | 82.999977 | -0.033999 | -0.118028 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503597 | 6.372628 | 7.250000 | 8.337501 | NaN | 0.0 |
2020-09-16 14:40:00 | 847.848083 | 1.444981 | 93.999977 | 80.399979 | 82.999977 | -0.033849 | -0.117976 | 5.125 | 8.375 | 8.875 | 10.25 | 6.503816 | 6.367173 | 7.250000 | 8.326176 | NaN | 0.0 |
2020-09-16 14:50:00 | 843.861389 | 1.358636 | 93.999977 | 80.399979 | 82.999977 | -0.033699 | -0.117925 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.273644 | 7.250000 | 8.331106 | NaN | 0.0 |
2020-09-16 15:00:00 | 849.177002 | 1.444981 | 94.199974 | 80.599976 | 83.199974 | -0.033549 | -0.117873 | 5.125 | 8.375 | 8.875 | 10.25 | 6.566219 | 6.366466 | 7.250000 | 8.258534 | NaN | 0.0 |
2020-09-16 15:10:00 | 850.505981 | 1.444981 | 94.199974 | 80.399979 | 83.199974 | -0.033399 | -0.117821 | 5.125 | 8.375 | 8.875 | 10.25 | 6.500000 | 6.375000 | 7.250000 | 8.252553 | NaN | 0.0 |
142507 rows × 17 columns
def local_smoothing(df, feature):
    """Denoise one column of `df`: wavelet soft-thresholding first,
    then Nadaraya-Watson kernel smoothing (bandwidth 15).
    Returns the smoothed column as a Series; `df` is not modified."""
    column = df[feature].copy()
    column[:] = threshold_dwt(column)
    nw_smoother = ks.NadarayaWatsonSmoother(smoothing_parameter=15)
    return apply_transform(nw_smoother, column)
def smoothing(df):
    """Smooth every column of `df` (except the first) in place via df.update."""
    # NOTE(review): df.columns[1:] also includes the 'labels' column when it
    # is present — confirm that labels are meant to be smoothed too.
    for feature in df.columns[1:]:
        smooth_data = local_smoothing(df, feature)
        df.update(smooth_data)
    return df
# Chunked smoothing: NadarayaWatsonSmoother builds a full weight matrix,
# so the series is processed slice by slice to bound memory use.
def smoothing_with_slice_inplace(df, feature):
    CHUNK = 6196
    total_rows = df.shape[0]
    for start in range(0, total_rows, CHUNK):
        stop = min(start + CHUNK, total_rows)
        smoothed_chunk = local_smoothing(df.iloc[start:stop], feature)
        df.update(smoothed_chunk)
# modif de la dataframe et visualisation
df_total_ref = df_total.copy()
smoothing_with_slice_inplace(df_total, "Température palier étage 1")
fig = plt.figure(figsize=(30,6))
plt.plot(df_total_ref["Température palier étage 1"])
plt.plot(df_total["Température palier étage 1"])
[<matplotlib.lines.Line2D at 0x7fdc1a1e2050>]
VI. Detrending¶
What is a Detrend?¶
A detrend involves removing the effects of trend from a data set to show only the differences in values from the trend. It allows cyclical and other patterns to be identified. Detrending shows a different aspect of time series data by removing deterministic and stochastic trends.
fig = plt.figure(figsize=(30,6))
temp_palier = df_total.loc[:, 'Température palier étage 1']
plt.plot(temp_palier)
[<matplotlib.lines.Line2D at 0x7fdc1a545710>]
Implementation¶
We use the STL function from statsmodels to decompose our data into 3 components: trend, seasonal and resid. This function takes one argument, 'period', that corresponds to the apparent period of the observed signal. Here, we took a period of one week, which gives relevant graphs.
temp_palier.to_numpy()  # notebook display only; the result is not stored
# STL decomposition; period = 24*6*7 samples = one week at the 10-minute
# sampling rate (6 samples per hour).
res = STL(temp_palier.to_numpy(), period=24*6*7).fit()
plt.figure(figsize=(30,6))
# NOTE(review): res.plot() creates its own figure, so the figure opened just
# above stays blank (see the "<Figure ... with 0 Axes>" output below).
res.plot()
plt.show()
<Figure size 2160x432 with 0 Axes>
Then, we retain the resid graph, that corresponds to the signal observed without the trend and seasonal components.
fig = plt.figure(figsize=(30,6))
plt.plot(res.resid)
[<matplotlib.lines.Line2D at 0x7fdc33e2acd0>]
df_total.loc[:,"Température palier étage 1"] = res.resid
VII. Export clean data in CSV¶
df_total.to_csv("clean_data_detrended.csv")
plt.figure(figsize=(50,20))
plt.plot(df_total.loc[:,"Température palier étage 1"])
[<matplotlib.lines.Line2D at 0x7fdc18c07fd0>]