#General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Choice of the features
from ipywidgets import widgets
from pandas_profiling import ProfileReport

#Gap filling
import datetime as dt

#Standardization
from sklearn.preprocessing import StandardScaler

#Denoising
import pywt

#Kernel smoothing
from skfda.representation.grid import FDataGrid 
import skfda

import skfda.preprocessing.smoothing.kernel_smoothers as ks
import skfda.preprocessing.smoothing.validation as val

#Detrending
import statsmodels.api 
from statsmodels.tsa.seasonal import STL


dico = {"Description" : "Point Name",
        "MOTOR CURRENT" : "PARD@3C52@3C52-M@JT7099.CAL",
        "Température palier étage 1" : "PARD@3C52@3C52-M@TE7011A.PNT",
        "Température palier étage2" : "PARD@3C52@3C52-M@TE7021A.PNT",
        "Température palier étage 3" : "PARD@3C52@3C52-M@TE7031A.PNT",
        "Température palier étage 4" : "PARD@3C52@3C52-M@TE7041A.PNT",
        "Déplacement axiale 1/2" : "PARD@3C52@3C52-M@VT7001.PNT:RAW",
        "Déplacement axiale 3/4" : "PARD@3C52@3C52-M@VT7002.PNT:RAW",
        "1e stade vibration X" : "PARD@3C52@3C52-M@VT7011A.PNT:RAW",
        "1er stade vibration Y" : "PARD@3C52@3C52-M@VT7011B.PNT:RAW",
        "2e stade vibration X" : "PARD@3C52@3C52-M@VT7021A.PNT:RAW",
        "2e stade vibration Y" : "PARD@3C52@3C52-M@VT7021B.PNT:RAW",
        "3e stade vibration X" : "PARD@3C52@3C52-M@VT7031A.PNT:RAW",
        "3e stade vibration Y" : "PARD@3C52@3C52-M@VT7031B.PNT:RAW",
        "4e stade vibration X" : "PARD@3C52@3C52-M@VT7041A.PNT:RAW",
        "4e stade vibration Y" : "PARD@3C52@3C52-M@VT7041B.PNT:RAW",
        "Température huile sortie réfrigerant" : "PARD@3C52@3C52@TE7086.PNT",
        "labels" : "labels"}
inv_dico = {v: k for k, v in dico.items()}


# Separate reading of file
def load_csv_file(filename):
    data_desc = {
        # Use homogenous name
        # Eventualy store the map in file
        "Point Name": ("TIMESTAMPS", lambda s: dt.datetime.strptime(s, '%m/%d/%Y %I:%M:%S %p')),
        "PARD@3C52@3C52-M@JT7099.CAL": ("MOTOR CURRENT", float),
        "PARD@3C52@3C52-M@TE7011A.PNT": ("Température palier étage 1", float),
        "PARD@3C52@3C52-M@TE7021A.PNT": ("Température palier étage 2", float),
        "PARD@3C52@3C52-M@TE7031A.PNT": ("Température palier étage 3", float),
        "PARD@3C52@3C52-M@TE7041A.PNT": ("Température palier étage 4", float),
        "PARD@3C52@3C52-M@VT7001.PNT:RAW": ("Déplacement axiale 1/2", float),
        "PARD@3C52@3C52-M@VT7002.PNT:RAW": ("Déplacement axiale 3/4", float),
        "PARD@3C52@3C52-M@VT7011A.PNT:RAW": ("1e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7011B.PNT:RAW": ("1e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7021A.PNT:RAW": ("2e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7021B.PNT:RAW": ("2e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7031A.PNT:RAW": ("3e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7031B.PNT:RAW": ("3e stade vibration Y", float),
        "PARD@3C52@3C52-M@VT7041A.PNT:RAW": ("4e stade vibration X", float),
        "PARD@3C52@3C52-M@VT7041B.PNT:RAW": ("4e stade vibration Y", float),
        "PARD@3C52@3C52@TE7086.PNT": ("Température huile sortie réfrigerant", float)
    }
    
    df = pd.read_csv(filename, converters = {k: v[1] for k, v in data_desc.items() if v is not None}, skiprows=[1,2,3,4,5])
    df.rename(columns = {k : v[0] for k, v in data_desc.items()}, inplace = True)
    
    return df
    

def apply_thresholding_inplace(x, lower_bound, upper_bound):
    if lower_bound is not None:
        x.mask(x < lower_bound, inplace = True)
    if upper_bound is not None:
        x.mask(x > upper_bound, inplace = True)

def thresholding_inplace(df):
    #Motor current
    apply_thresholding_inplace(df["MOTOR CURRENT"], 300, None)
    
    #Température palier étage 1
    apply_thresholding_inplace(df["Température palier étage 1"], 70, 125)    
    
    #Température palier étage 2
    apply_thresholding_inplace(df["Température palier étage 2"], 70, 100)
    
    #Température palier étage 3
    apply_thresholding_inplace(df["Température palier étage 3"], 60, 100)
    
    #Température palier étage 4
    apply_thresholding_inplace(df["Température palier étage 4"], 77, 100)
   
    #Déplacement axial 1/2
    apply_thresholding_inplace(df["Déplacement axiale 1/2"], None, 50)
    
    #Déplacement axial 3/4
    apply_thresholding_inplace(df["Déplacement axiale 3/4"], None, 50)
    
    #1er stade de vibration X
    apply_thresholding_inplace(df["1e stade vibration X"], 3, 50)

    #1er stade de vibration Y
    apply_thresholding_inplace(df["1e stade vibration Y"], 5, 50)

    #2e stade de vibration X
    apply_thresholding_inplace(df["2e stade vibration X"], 4, 50)
    
    #2e stade de vibration Y
    apply_thresholding_inplace(df["2e stade vibration Y"], 6, 50)
    
    #3e stade de vibration X
    apply_thresholding_inplace(df["3e stade vibration X"], None, 50)

    #3e stade de vibration Y
    apply_thresholding_inplace(df["3e stade vibration Y"], None, 50)

    #4e stade de vibration X
    apply_thresholding_inplace(df["4e stade vibration X"], 1, 50)

    #4e stade de vibration Y
    apply_thresholding_inplace(df["4e stade vibration Y"], 1, 50)
    
    #Température huile sortie réfrigérant
    apply_thresholding_inplace(df["Température huile sortie réfrigerant"], None, 50)
    
    return df


df_2018 = load_csv_file('2018 10min.csv')
df_2019 = load_csv_file('2019 10min.csv')
df_2020 = load_csv_file('2020 10min.csv')

df_2018 = thresholding_inplace(df_2018)
df_2019 = thresholding_inplace(df_2019)
df_2020 = thresholding_inplace(df_2020)

#joining the three years to create a continuous set of data
df_total = pd.concat((df_2018,df_2019,df_2020), axis = 0)
# df_total = thresholding_inplace(df_total)
# df_total


# Y = df_total.iloc[:,1:]
# profile = ProfileReport(Y, title='Correlation matrix')
# profile


# Keep only usefull columns
selected = ['TIMESTAMPS', 'MOTOR CURRENT', 'Température palier étage 1', 'Déplacement axiale 1/2', 'Déplacement axiale 3/4', '1e stade vibration X']
df_total = df_total[selected]
# df_total


# Y = df_total.iloc[:,1:]
# profile = ProfileReport(Y, title='Correlation matrix')
# profile


#Before the labelling, we assign the value 0 to all data
df_2018['labels'] = 0
df_2019['labels'] = 0
df_2020['labels'] = 0
#df_2018.insert(len(df_2018.columns), 'labels', 0)
#df_2019.insert(len(df_2019.columns), 'labels', 0)
#df_2020.insert(len(df_2020.columns), 'labels', 0)


#Example for year 2020
df_ex = df_2020
COLUMN = "Température palier étage 1"
STEP = 17000
L = len(df_ex[COLUMN])

#"Température palier étage 1" for year 2020
for i in range(4, L-STEP, STEP) :
    tab = np.array(df_ex[COLUMN].loc[i:(i+STEP+2000)])
    if (np.nanmax(tab) - np.nanmin(tab) > 2):
        plt.figure(figsize = (30,6))
        plt.plot(range(i,i+STEP+2001), tab) ;
        plt.grid()
        plt.show()

#Zoom on a specified abnormal period
deb, fin = 10000, 13000

tab = np.array(df_ex[COLUMN].loc[deb:fin])
plt.figure(figsize = (30,6))
plt.plot(range(deb,fin +1), tab) ;
plt.grid()
plt.title("Zoom on the abnormal period ranging from 03/13/2020 to 03/29/2020", size = 20)
plt.show()


df_2018.at[28750:29878,'labels'] = -1
df_2018.at[31500:35261,'labels'] = 4
df_2018.at[35261:36000,'labels'] = 4
df_2018.at[36000:41927,'labels'] = 4
df_2018.at[51000:51500,'labels'] = -1
df_2018.at[51500:,'labels'] = 9


df_2019.at[4:1228,'labels'] = 9
df_2019.at[1228:3000,'labels'] = -1
df_2019.at[3000:9000,'labels'] = 8


df_2020.at[7880:8516,'labels'] = -1
df_2020.at[8516:9786,'labels'] = 3
df_2020.at[10500:12800,'labels'] = 4
df_2020.at[12800:14840,'labels'] = -1
df_2020.at[15900:20310,'labels'] = 4
df_2020.at[22130:23460,'labels'] = 2


tab_signal18 = df_2018['Température palier étage 1']
tab_label18 = df_2018['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal18)
plt.plot(tab_label18+84)
plt.show()


tab_signal19 = df_2019['Température palier étage 1']
tab_label19 = df_2019['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal19)
plt.plot(tab_label19+84)
plt.show()


tab_signal20 = df_2020['Température palier étage 1']
tab_label20 = df_2020['labels']
plt.figure(figsize = (15,6))
plt.plot(tab_signal20)
plt.plot(tab_label20+84)
plt.show()


df_total = pd.concat((df_2018,df_2019,df_2020), axis = 0)
# df_total


plt.subplot(211)
tab = np.array(df_2020['Température palier étage 1'].loc[:2500])
plt.plot(tab);

plt.subplot(212)
tab = np.array(df_2020['Température palier étage 1'])
tab = np.pad(tab, (0, 144*(tab.shape[0]//144+1)-tab.shape[0]))
tab = np.reshape(tab, (-1,144))
plt.imshow(tab.T)

<matplotlib.image.AxesImage at 0x7fdc332dbb50>


# DEFINING GLOBAL VARIABLES

timestamps_column = 'TIMESTAMPS'
sep = ','


def get_deltas(dataframe: pd.DataFrame) -> (int, list, list):
    """
    This function reads the timestamps for the values and computes the most frequent time delta in the data series.
    :param dataframe: The path of the csv file
    :returns: The most frequent time delta (in seconds) and the list of timestamps (datetime objects) and the list of time deltas in seconds
    """
    
    dataframe.sort_values("TIMESTAMPS", inplace = True)
    deltas_seclist = dataframe["TIMESTAMPS"].diff()
    
    # extract all unique deltas
    uniques = deltas_seclist.unique()
    
    # maps delta to an integer identifier
    unique_map = {k: v for v, k in enumerate(uniques)}
    
    # find the most frequent delta
    most_frequent_delta = uniques[np.argmax(np.bincount(deltas_seclist.map(unique_map)))]

    return most_frequent_delta, dataframe["TIMESTAMPS"], deltas_seclist.apply(lambda x: x.total_seconds())


# df_total


#    Computing xxxxx
most_frequent_delta, timepoints_dtlist, deltas_seclist = get_deltas(df_total)
# df_total


df_total.drop_duplicates(subset=['TIMESTAMPS'], keep='first', inplace = True)

# SETTING INDEX
#df = df.set_index('TIMESTAMPS')
#df.drop(columns=['index'], inplace=True)


# WARNING: this function uses np for the NaN: replace if possible
def create_empty_lines(df: pd.DataFrame, t_start: pd.Timestamp, periods: int, delta_t: int) -> (pd.DataFrame) :
    """
    This function adds nb_lines of NaN values at the index idx 
    :param df:
    delta_t in sec
    """

    nb_columns = len(df.columns)
    
    freq = f'{delta_t}s'
    date_range = list(pd.date_range(t_start, periods=periods, freq=freq))
    
    return pd.DataFrame(np.nan, index = date_range[1:], columns = list(df.head(0)))
    


    
def complete(df_in: pd.DataFrame, deltas_seclist: list, delta_t: int) -> pd.DataFrame :
    """
    This function takes a dataframe with TIMESTAMPS (in seconds and sorted) as index and complete this dataframe with NaN values to have a regular time step, if needed
    :param df_in: Input DataFrame
    :param deltas_seclist: 
    """
    
    #   Stuff
    df = df_in.copy()
    timestamps_list = list(df.index)
    
    id_new = 0
    for i in range(len(timestamps_list) - 1) :
                
        #   Getting the time deltas (in seconds)
        delta_dt = deltas_seclist[i]
        
        #   If missing values
        if (delta_dt != delta_t) :   
                        
            #    Generating the empty lines to insert
            periods = int(delta_dt / delta_t)
            patch = create_empty_lines(df, timestamps_list[i], periods, delta_t)
            
            #    incrementing the new index with the inserted lines
            id_new += periods
            
            timestamp = timestamps_list[i]
            timestamp_next = timestamps_list[i+1]
            
            #    Concatenating (inserting the lines)
            df_begin = df.loc[:timestamp, :]
            df_end = df.loc[timestamp_next:, :]
            
            df = pd.concat([df_begin, patch, df_end], axis=0)

            
        #   incrementing the new index
        id_new += 1
        
    return df



#df_total = complete(df_total, deltas_seclist, most_frequent_delta)


# completing dataframe
most_frequent_delta_dateoffset = pd.DateOffset(seconds=int(most_frequent_delta.astype('timedelta64[s]').astype(int)))
complete_timerange = pd.date_range(df_total.iloc[0,0],df_total.iloc[-1,0],freq=most_frequent_delta_dateoffset)
df_total.set_index('TIMESTAMPS', inplace = True)
df_total = df_total.reindex(complete_timerange)


df_total['labels'].fillna(value = 0.0, inplace = True)

interpolated_columns = [c for c in df_total.columns if c not in {'labels','TIMESTAMPS'}]

df_total.update(df_total[interpolated_columns].interpolate(method='slinear'))

#    Interpolates except if leading/trailing NaN values
#for column_name in df_total.columns[:-1]:    
#    column_to_fill = df_total.loc[:, column_name].astype(float)
#    column_to_fill = column_to_fill.interpolate(method='slinear')
#    df_total = df_total.assign(**{column_name: column_to_fill})

#   Filling in the gaps in the labels
#df_total['labels'].fillna(value = 0, inplace = True)
#df_total.reset_index(inplace=True)

# df_total


plt.subplot(311)
tab = np.array(df_2020['Température palier étage 1'].loc[4:2500].astype('float32'))
plt.plot(tab)
plt.subplot(312)
tab2 = np.array(df_total.iloc[105120:107617].loc[:,'Température palier étage 1'].astype('float32'))
plt.plot(tab2);

plt.subplot(313)
tab = np.array(df_total['Température palier étage 1'])
tab = np.pad(tab, (0, 144*(tab.shape[0]//144+1)-tab.shape[0]))
tab = np.reshape(tab, (-1,144))
plt.imshow(tab.T)

<matplotlib.image.AxesImage at 0x7fdc33b1fb90>


temp = df_total["Température palier étage 1"]
temp.name

'Température palier étage 1'


features = [temp]
cles = ["Température palier étage 1"]
nb_features = len(features)


labels = df_total.loc[:, "labels"].to_numpy(dtype='i4')
labels_mask = np.where(labels>0, 1, 0)
bounds = np.pad(np.argwhere(np.diff(labels_mask)!=0).flatten()+1, (1,1))
bounds[-1] = labels_mask.shape[0]
slices = list(zip(bounds[0:-1],bounds[1:]))
if (labels[0] == 0):
    normal_slices = list(slices[::2])
    outliers_slices = list(slices[1::2])
else:
    normal_slices = list(slices[1::2])
    outliers_slices = list(slices[0::2])
    
print(f"Normal slices: {normal_slices}")
print(f"Outliners slices: {outliers_slices}")

Normal slices: [(0, 31506), (41934, 51500), (52560, 52561), (53788, 55560), (61561, 113636), (114907, 115620), (117803, 117809), (117926, 121026), (125437, 127256), (128587, 142507)]
Outliners slices: [(31506, 41934), (51500, 52560), (52561, 53788), (55560, 61561), (113636, 114907), (115620, 117803), (117809, 117926), (121026, 125437), (127256, 128587)]


# collect the labels

label = df_total.loc[:, "labels"].to_numpy()

normal_slices = []
outliers_slices = []

start_slice = 0
normal_indic = (label[start_slice] == 0)

# obtain normal and outliers slices from labels

for i in range(1, label.size):
    if label[i] > 0 and normal_indic:
        normal_slices += [(start_slice, i)]
        normal_indic = False
        start_slice = i
    if label[i] == 0 and not normal_indic:
        outliers_slices += [(start_slice, i)]
        normal_indic = True
        start_slice = i
    if i == (label.size - 1):
        if normal_indic:
            normal_slices += [(start_slice, i+1)]
        else:
            outliers_slices += [(start_slice, i+1)]
if normal_indic and normal_slices == []:
    normal_slices += [(start_slice, label.size)]
if not normal_indic and outliers_slices == []:
    outliers_slices += [(start_slice, label.size)]

print(f"Normal slices: {normal_slices}")
print(f"Outliners slices: {outliers_slices}")

Normal slices: [(0, 31506), (41934, 51500), (52560, 52561), (61561, 113636), (114907, 115620), (117803, 117809), (119967, 121026), (125437, 127256), (128587, 142507)]
Outliners slices: [(31506, 41934), (51500, 52560), (52561, 61561), (113636, 114907), (115620, 117803), (117809, 119967), (121026, 125437), (127256, 128587)]


# standardization : without using scikitlearn

labels = df_total.loc[:, "labels"].to_numpy(dtype='i4')
normal_mask = (labels == 0)

for feature in features:
    std_to_apply = np.std(feature[normal_mask])
    mean_to_apply = np.mean(feature[normal_mask])

    print(f"Mean transformation applied: {mean_to_apply}")
    print(f"STD transformation applied: {std_to_apply}")
    
    feature -= mean_to_apply
    feature /= std_to_apply

    # plot

    fig = plt.figure(figsize=(30,6))
    plt.plot(feature) ;

Mean transformation applied: 86.05305514719214
STD transformation applied: 2.3162345287069024


# STANDARDIZATION : with scikitlearn (we finally use this method)

for feature in features:
    scaler = StandardScaler()
    scaler.fit(feature[normal_mask].to_numpy().reshape((-1,1)))
    feature[:] = scaler.transform(feature.to_numpy().reshape((-1,1))).flatten()
    df_total.update(feature)
    
    fig = plt.figure(figsize=(30,6))
    plt.plot(feature)


BEGIN = 0
END = 1000
scaled_data1 = df_total["Température palier étage 1"][BEGIN:END]


plt.figure(figsize=(10,5))
plt.title("Features' variation throughout the time")
plt.plot(scaled_data1[:100]);


# df_total


def apply_transform(kernel, df):
    data_matrix = df.to_numpy().reshape((1,-1))
    grid_points = range(data_matrix.shape[1])
    fd = FDataGrid(data_matrix, grid_points)
    smooth_df = df.copy()
    fd = kernel.fit_transform(fd)
    smooth_df[:] = fd.data_matrix.flatten()
    return smooth_df


smootherReg = ks.LocalLinearRegressionSmoother(smoothing_parameter=10)
smooth_dataReg = apply_transform(smootherReg, scaled_data1)

plt.figure(figsize=(20,10))
plt.title('Gaussian kernel')
line1, = plt.plot(smooth_dataReg, 'blue', label = 'gaussian')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


smootherKNN = ks.KNeighborsSmoother(smoothing_parameter=10)
smooth_dataKNN = apply_transform(smootherKNN, scaled_data1)

plt.figure(figsize=(20,10))
line1, = plt.plot(smooth_dataKNN, 'blue', label = 'KNN')
plt.title('KNN kernel')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


smootherNW = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_dataNW = apply_transform(smootherNW, scaled_data1)

plt.figure(figsize=(20,10))
line1, = plt.plot(smooth_dataNW, 'blue', label = 'NW')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(scaled_data1,'orange', label = 'Raw data')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


smootherNW_1 = ks.NadarayaWatsonSmoother(smoothing_parameter=100)
smooth_dataNW_1 = apply_transform(smootherNW_1, scaled_data1)

smootherNW_2 = ks.NadarayaWatsonSmoother(smoothing_parameter=5)
smooth_dataNW_2 = apply_transform(smootherNW_2, smooth_dataNW_1)

smootherNW_3 = ks.NadarayaWatsonSmoother(smoothing_parameter=0.1)
smooth_dataNW_3 = apply_transform(smootherNW_3, smooth_dataNW_2)

plt.figure(figsize=(20,10))
plt.title('Comparaison BD')
plt.plot(scaled_data1,'blue');

line1, = plt.plot(smooth_dataNW_1, 'yellow', label = 'bd=100')

line2, = plt.plot(smooth_dataNW_2, 'orange', label = 'bd=5')

line3, = plt.plot(smooth_dataNW, 'red', label = 'bd=0.1')

plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


df_2018_example = load_csv_file("2018 10min.csv")


COLUMN = "Température huile sortie réfrigerant"
BGN = 7500
END = 8000
X = np.arange(BGN, END)

data = df_2018_example.iloc[BGN:END][COLUMN]
plt.plot(X, data, color = 'b');


def sure(lbd, x, N):
    """
    return the calculation of sure
    """
    return (N + np.sum(np.minimum(np.abs(x), lbd) ** 2)
            - 2 * np.sum(np.abs(x) < lbd))

def compute_sure_threshold(x: np.array) -> float:
    """
    return the best parameter to optimize the approximation
    """
    N = x.shape[0]
    lbd_candidates = np.arange(0, np.sqrt(2 * np.log(N)), 0.01)
    return np.argmin(np.array([sure(lbd, x, N)
                        for lbd in lbd_candidates]))

def shrink(x: np.array, lbd: float) -> np.array:
    """
    kill or update the coefficients (in the wavelets transform)
    """
    return np.sign(x) * np.maximum(0, np.abs(x) - lbd)

def threshold_dwt(df, level = 5, wavelet_type = 'db5') -> (list):
    """
    calulate coefficients in the wavelets transform choosen of df, treat them and reconstruct the signal 
    """
    # wavelet based deconstruction
    coeffs = pywt.wavedec(df, wavelet_type, level = 5)
    # coefficient thresholding
    coeffs = [coeffs[0]] + [shrink(cD, compute_sure_threshold(cD))
                for cD in coeffs[1:]]
    
    # signal reconstruction
    rdf = pywt.waverec(coeffs, wavelet_type)
    # For unknown reason I get one more element in the output
    return rdf[:df.shape[0]]


COLUMN = "Température palier étage 1"
BGN = 0
END = 200
X = np.arange(BGN, END)

level = 4
data = df_2018_example.iloc[BGN:END][COLUMN]
for _type in ['db', 'sym', 'coif']:
    for i in range(2, 9):
        level = i
        denoised_data = threshold_dwt(data, wavelet_type = 'db'+str(i), level = level)
        plt.xlabel(f"type = {_type}{i}")
        plt.plot(X, data, color = 'b');
        plt.plot(X, denoised_data, color = 'g');
        plt.show()

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))

/opt/tljh/user/lib/python3.7/site-packages/pywt/_multilevel.py:45: UserWarning: Level value of 5 is too high: all coefficients will experience boundary effects.
  "boundary effects.").format(level))


BGN = 11500
END = 12000
X = np.arange(BGN, END)

data = df_2018_example.iloc[BGN:END][COLUMN]
denoised_data = threshold_dwt(data)
plt.plot(X, data);
plt.plot(X, denoised_data, color = 'g');
plt.xlabel(f"Approximation : green")
plt.show()


BGN = 500
END = 1000
X = np.arange(BGN, END)

data = df_2018_example.iloc[BGN:END][COLUMN]
denoised_data = threshold_dwt(data)
plt.plot(X, data);
plt.plot(X, denoised_data, color = 'g');
plt.xlabel(f"Approximation : green")
plt.show()


# kde
smootherNW1 = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_data1 = apply_transform(smootherNW1, scaled_data1)


# wavelet
denoised_data1 = scaled_data1.copy()
denoised_data1[:] = threshold_dwt(smooth_data1)


plt.figure(figsize=(20,10))
line0, = plt.plot(scaled_data1[BGN:END], 'blue', label = 'RAW')
line1, = plt.plot(denoised_data1,'orange', label = 'wavelet')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(smooth_data1, 'yellow', label = 'kde')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


# Wavelet
denoised_data2 = scaled_data1.copy()
denoised_data2[:] = threshold_dwt(scaled_data1)


#kde
smootherNW2 = ks.NadarayaWatsonSmoother(smoothing_parameter=10)
smooth_data2 = apply_transform(smootherNW2, denoised_data2)


plt.figure(figsize=(20,10))
line0, = plt.plot(scaled_data1, 'blue', label = 'RAW')
line1, = plt.plot(denoised_data2,'orange', label = 'wavelet')
plt.title('Nadaraya Watson kernel')
line2, = plt.plot(smooth_data2, 'yellow', label = 'kde')
plt.legend(bbox_to_anchor=(1.05,1), loc = 'upper left');


df = df_total.copy()
# df


def local_smoothing(df, feature):
    denoised_data = df[feature].copy()
    denoised_data[:] = threshold_dwt(denoised_data)
    smootherNW = ks.NadarayaWatsonSmoother(smoothing_parameter=15)
    smooth_data = apply_transform(smootherNW, denoised_data)
    return smooth_data

def smoothing(df):
    for feature in df.columns[1:]:
        smooth_data = local_smoothing(df, feature)
        df.update(smooth_data)
    return df


# Implement a slicing because otherwise NadarayaWatsonSmoother need too much memory
def smoothing_with_slice_inplace(df, feature):
    NB_SAMPLES = 6196
    for i in range(0, df.shape[0], NB_SAMPLES):
        X = local_smoothing(df.iloc[i:min(i+NB_SAMPLES, df.shape[0])], feature)
        df.update(X)


# modif de la dataframe et visualisation
df_total_ref = df_total.copy()
smoothing_with_slice_inplace(df_total, "Température palier étage 1")

fig = plt.figure(figsize=(30,6))
plt.plot(df_total_ref["Température palier étage 1"])
plt.plot(df_total["Température palier étage 1"])

[<matplotlib.lines.Line2D at 0x7fdc1a1e2050>]


fig = plt.figure(figsize=(30,6))
temp_palier = df_total.loc[:, 'Température palier étage 1']
plt.plot(temp_palier)

[<matplotlib.lines.Line2D at 0x7fdc1a545710>]


temp_palier.to_numpy()
res = STL(temp_palier.to_numpy(), period=24*6*7).fit()
plt.figure(figsize=(30,6))
res.plot()
plt.show()

<Figure size 2160x432 with 0 Axes>


fig = plt.figure(figsize=(30,6))
plt.plot(res.resid)

[<matplotlib.lines.Line2D at 0x7fdc33e2acd0>]


df_total.loc[:,"Température palier étage 1"] = res.resid


df_total.to_csv("clean_data_detrended.csv")


plt.figure(figsize=(50,20))
plt.plot(df_total.loc[:,"Température palier étage 1"])

[<matplotlib.lines.Line2D at 0x7fdc18c07fd0>]

	TIMESTAMPS	MOTOR CURRENT	Température palier étage 1	Température palier étage 2	Température palier étage 3	Température palier étage 4	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X	1e stade vibration Y	2e stade vibration X	2e stade vibration Y	3e stade vibration X	3e stade vibration Y	4e stade vibration X	4e stade vibration Y	Température huile sortie réfrigerant
0	2018-01-01 00:10:00	902.333740	83.399986	91.199982	78.999985	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.60	7.068229	5.125000	7.328948	7.337153	44.399990
1	2018-01-01 00:20:00	902.333740	83.599983	91.199982	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.022708	5.022708	7.183333	7.346614	44.599991
2	2018-01-01 00:30:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.125000	5.125000	7.279167	7.329514	44.599991
3	2018-01-01 00:40:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.011652	5.125000	7.375000	7.356651	44.599991
4	2018-01-01 00:50:00	897.018066	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.625	7.875	8.50	7.022708	5.002292	7.327083	7.308574	44.799992
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37376	2020-09-16 14:30:00	855.821594	89.399971	93.999977	80.599976	82.999977	-0.033999	-0.118028	5.125	8.375	8.875	10.25	6.503597	6.372628	7.250000	8.337501	NaN
37377	2020-09-16 14:40:00	847.848083	89.399971	93.999977	80.399979	82.999977	-0.033849	-0.117976	5.125	8.375	8.875	10.25	6.503816	6.367173	7.250000	8.326176	NaN
37378	2020-09-16 14:50:00	843.861389	89.199974	93.999977	80.399979	82.999977	-0.033699	-0.117925	5.125	8.375	8.875	10.25	6.500000	6.273644	7.250000	8.331106	NaN
37379	2020-09-16 15:00:00	849.177002	89.399971	94.199974	80.599976	83.199974	-0.033549	-0.117873	5.125	8.375	8.875	10.25	6.566219	6.366466	7.250000	8.258534	NaN
37380	2020-09-16 15:10:00	850.505981	89.399971	94.199974	80.399979	83.199974	-0.033399	-0.117821	5.125	8.375	8.875	10.25	6.500000	6.375000	7.250000	8.252553	NaN

	TIMESTAMPS	MOTOR CURRENT	Température palier étage 1	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X
0	2018-01-01 00:10:00	902.333740	83.399986	-0.046250	-0.120000	4.625
1	2018-01-01 00:20:00	902.333740	83.599983	-0.046250	-0.120000	4.625
2	2018-01-01 00:30:00	902.333740	83.599983	-0.046250	-0.120000	4.625
3	2018-01-01 00:40:00	902.333740	83.599983	-0.046250	-0.120000	4.625
4	2018-01-01 00:50:00	897.018066	83.599983	-0.046250	-0.120000	4.625
...	...	...	...	...	...	...
37376	2020-09-16 14:30:00	855.821594	89.399971	-0.033999	-0.118028	5.125
37377	2020-09-16 14:40:00	847.848083	89.399971	-0.033849	-0.117976	5.125
37378	2020-09-16 14:50:00	843.861389	89.199974	-0.033699	-0.117925	5.125
37379	2020-09-16 15:00:00	849.177002	89.399971	-0.033549	-0.117873	5.125
37380	2020-09-16 15:10:00	850.505981	89.399971	-0.033399	-0.117821	5.125

	TIMESTAMPS	MOTOR CURRENT	Température palier étage 1	Température palier étage 2	Température palier étage 3	Température palier étage 4	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X	1e stade vibration Y	2e stade vibration X	2e stade vibration Y	3e stade vibration X	3e stade vibration Y	4e stade vibration X	4e stade vibration Y	Température huile sortie réfrigerant	labels
0	2018-01-01 00:10:00	902.333740	83.399986	91.199982	78.999985	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.60	7.068229	5.125000	7.328948	7.337153	44.399990	0
1	2018-01-01 00:20:00	902.333740	83.599983	91.199982	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.022708	5.022708	7.183333	7.346614	44.599991	0
2	2018-01-01 00:30:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.125000	5.125000	7.279167	7.329514	44.599991	0
3	2018-01-01 00:40:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.011652	5.125000	7.375000	7.356651	44.599991	0
4	2018-01-01 00:50:00	897.018066	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.625	7.875	8.50	7.022708	5.002292	7.327083	7.308574	44.799992	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37376	2020-09-16 14:30:00	855.821594	89.399971	93.999977	80.599976	82.999977	-0.033999	-0.118028	5.125	8.375	8.875	10.25	6.503597	6.372628	7.250000	8.337501	NaN	0
37377	2020-09-16 14:40:00	847.848083	89.399971	93.999977	80.399979	82.999977	-0.033849	-0.117976	5.125	8.375	8.875	10.25	6.503816	6.367173	7.250000	8.326176	NaN	0
37378	2020-09-16 14:50:00	843.861389	89.199974	93.999977	80.399979	82.999977	-0.033699	-0.117925	5.125	8.375	8.875	10.25	6.500000	6.273644	7.250000	8.331106	NaN	0
37379	2020-09-16 15:00:00	849.177002	89.399971	94.199974	80.599976	83.199974	-0.033549	-0.117873	5.125	8.375	8.875	10.25	6.566219	6.366466	7.250000	8.258534	NaN	0
37380	2020-09-16 15:10:00	850.505981	89.399971	94.199974	80.399979	83.199974	-0.033399	-0.117821	5.125	8.375	8.875	10.25	6.500000	6.375000	7.250000	8.252553	NaN	0

	TIMESTAMPS	MOTOR CURRENT	Température palier étage 1	Température palier étage 2	Température palier étage 3	Température palier étage 4	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X	1e stade vibration Y	2e stade vibration X	2e stade vibration Y	3e stade vibration X	3e stade vibration Y	4e stade vibration X	4e stade vibration Y	Température huile sortie réfrigerant	labels
0	2018-01-01 00:10:00	902.333740	83.399986	91.199982	78.999985	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.60	7.068229	5.125000	7.328948	7.337153	44.399990	0
1	2018-01-01 00:20:00	902.333740	83.599983	91.199982	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.022708	5.022708	7.183333	7.346614	44.599991	0
2	2018-01-01 00:30:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.125000	5.125000	7.279167	7.329514	44.599991	0
3	2018-01-01 00:40:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.011652	5.125000	7.375000	7.356651	44.599991	0
4	2018-01-01 00:50:00	897.018066	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.625	7.875	8.50	7.022708	5.002292	7.327083	7.308574	44.799992	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37376	2020-09-16 14:30:00	855.821594	89.399971	93.999977	80.599976	82.999977	-0.033999	-0.118028	5.125	8.375	8.875	10.25	6.503597	6.372628	7.250000	8.337501	NaN	0
37377	2020-09-16 14:40:00	847.848083	89.399971	93.999977	80.399979	82.999977	-0.033849	-0.117976	5.125	8.375	8.875	10.25	6.503816	6.367173	7.250000	8.326176	NaN	0
37378	2020-09-16 14:50:00	843.861389	89.199974	93.999977	80.399979	82.999977	-0.033699	-0.117925	5.125	8.375	8.875	10.25	6.500000	6.273644	7.250000	8.331106	NaN	0
37379	2020-09-16 15:00:00	849.177002	89.399971	94.199974	80.599976	83.199974	-0.033549	-0.117873	5.125	8.375	8.875	10.25	6.566219	6.366466	7.250000	8.258534	NaN	0
37380	2020-09-16 15:10:00	850.505981	89.399971	94.199974	80.399979	83.199974	-0.033399	-0.117821	5.125	8.375	8.875	10.25	6.500000	6.375000	7.250000	8.252553	NaN	0

	TIMESTAMPS	MOTOR CURRENT	Température palier étage 1	Température palier étage 2	Température palier étage 3	Température palier étage 4	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X	1e stade vibration Y	2e stade vibration X	2e stade vibration Y	3e stade vibration X	3e stade vibration Y	4e stade vibration X	4e stade vibration Y	Température huile sortie réfrigerant	labels
0	2018-01-01 00:10:00	902.333740	83.399986	91.199982	78.999985	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.60	7.068229	5.125000	7.328948	7.337153	44.399990	0
1	2018-01-01 00:20:00	902.333740	83.599983	91.199982	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.022708	5.022708	7.183333	7.346614	44.599991	0
2	2018-01-01 00:30:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.125000	5.125000	7.279167	7.329514	44.599991	0
3	2018-01-01 00:40:00	902.333740	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.011652	5.125000	7.375000	7.356651	44.599991	0
4	2018-01-01 00:50:00	897.018066	83.599983	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.625	7.875	8.50	7.022708	5.002292	7.327083	7.308574	44.799992	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37376	2020-09-16 14:30:00	855.821594	89.399971	93.999977	80.599976	82.999977	-0.033999	-0.118028	5.125	8.375	8.875	10.25	6.503597	6.372628	7.250000	8.337501	NaN	0
37377	2020-09-16 14:40:00	847.848083	89.399971	93.999977	80.399979	82.999977	-0.033849	-0.117976	5.125	8.375	8.875	10.25	6.503816	6.367173	7.250000	8.326176	NaN	0
37378	2020-09-16 14:50:00	843.861389	89.199974	93.999977	80.399979	82.999977	-0.033699	-0.117925	5.125	8.375	8.875	10.25	6.500000	6.273644	7.250000	8.331106	NaN	0
37379	2020-09-16 15:00:00	849.177002	89.399971	94.199974	80.599976	83.199974	-0.033549	-0.117873	5.125	8.375	8.875	10.25	6.566219	6.366466	7.250000	8.258534	NaN	0
37380	2020-09-16 15:10:00	850.505981	89.399971	94.199974	80.399979	83.199974	-0.033399	-0.117821	5.125	8.375	8.875	10.25	6.500000	6.375000	7.250000	8.252553	NaN	0

	MOTOR CURRENT	Température palier étage 1	Température palier étage 2	Température palier étage 3	Température palier étage 4	Déplacement axiale 1/2	Déplacement axiale 3/4	1e stade vibration X	1e stade vibration Y	2e stade vibration X	2e stade vibration Y	3e stade vibration X	3e stade vibration Y	4e stade vibration X	4e stade vibration Y	Température huile sortie réfrigerant	labels
2018-01-01 00:10:00	902.333740	-1.145423	91.199982	78.999985	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.60	7.068229	5.125000	7.328948	7.337153	44.399990	0.0
2018-01-01 00:20:00	902.333740	-1.059078	91.199982	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.022708	5.022708	7.183333	7.346614	44.599991	0.0
2018-01-01 00:30:00	902.333740	-1.059078	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.125000	5.125000	7.279167	7.329514	44.599991	0.0
2018-01-01 00:40:00	902.333740	-1.059078	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.750	7.875	8.50	7.011652	5.125000	7.375000	7.356651	44.599991	0.0
2018-01-01 00:50:00	897.018066	-1.059078	91.399979	79.199982	NaN	-0.046250	-0.120000	4.625	6.625	7.875	8.50	7.022708	5.002292	7.327083	7.308574	44.799992	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2020-09-16 14:30:00	855.821594	1.444981	93.999977	80.599976	82.999977	-0.033999	-0.118028	5.125	8.375	8.875	10.25	6.503597	6.372628	7.250000	8.337501	NaN	0.0
2020-09-16 14:40:00	847.848083	1.444981	93.999977	80.399979	82.999977	-0.033849	-0.117976	5.125	8.375	8.875	10.25	6.503816	6.367173	7.250000	8.326176	NaN	0.0
2020-09-16 14:50:00	843.861389	1.358636	93.999977	80.399979	82.999977	-0.033699	-0.117925	5.125	8.375	8.875	10.25	6.500000	6.273644	7.250000	8.331106	NaN	0.0
2020-09-16 15:00:00	849.177002	1.444981	94.199974	80.599976	83.199974	-0.033549	-0.117873	5.125	8.375	8.875	10.25	6.566219	6.366466	7.250000	8.258534	NaN	0.0
2020-09-16 15:10:00	850.505981	1.444981	94.199974	80.399979	83.199974	-0.033399	-0.117821	5.125	8.375	8.875	10.25	6.500000	6.375000	7.250000	8.252553	NaN	0.0

I. Introduction¶

Importations¶

II. Labelling¶

Column names dictionnary¶

Thresholding¶

Spotting of the correlations and choice of the main features¶

Why don't we take all the features¶

How to choose those features?¶

Result¶

Labelling¶

III. Gap Filling¶

Why do we need to gap fill? What is our method?¶

Reading the data timepoints using get_deltas¶

Dropping duplicate values (time-wise)¶

Inserting missing samples to get regular time intervals in the data¶

Interpolating the data per column in order to remove missing (NaN) points, and filling in the gaps in the labels with 0¶

Results¶

IV. Standardization¶

Collecting the data¶

Feature : Temperature 1¶

Standardization¶

Why standardize data ?¶

What is the principle ?¶

Implementation¶

Application to labeled data : standard deviation calculation on normal slices, not on outlier ones¶

V. Smoothing¶

Kernel Density Estimation (KDE)¶

Smoothing method¶

Choice of the studied (scaled) data¶

Choice of the kernel and bandwidth¶

Kernel choice¶

Bandwidth choice¶

Wavelet¶

The algorithm¶

Principle¶

Choose the wavelets¶

Results with db5¶

Order of execution¶

1: kde 2: wavelet¶

1: wavelet 2: kde¶

Final solution¶

VI. Detrending¶

What is a Detrend?¶

Implementation¶

VII. Export clean data in CSV¶

Reading the data timepoints using `get_deltas`¶

Interpolating the data per column in order to remove missing (`NaN`) points, and filling in the gaps in the labels with `0`¶