VIRGO SPM dataset¶
[1]:
import os
import numpy as np
import pandas as pd
import pywt
from tsipy.correction import compute_exposure, correct_degradation, load_model
from tsipy.utils import (
plot_signals,
plot_signals_history,
pprint,
pprint_block,
)
Load corrected dataset by C. Fröhlich¶
[2]:
import datetime
def load_dat_file(file_name : str, directory, n_cols : int, sep=" ") -> pd.DataFrame:
rows = []
with open(os.path.join(directory, file_name), "r") as f:
line = f.readline()
n_header = int(line.strip().split(";")[0].strip())
i = 1
while True:
line = f.readline()
i += 1
# Skip header
if i <= n_header:
continue
# EOF with empty line
if not line:
break
line = line.strip().split(sep)
if len(line) != n_cols:
raise ValueError(f"Invalid number of columns {len(line)} in line {i}.")
rows.append(line)
data = np.array(rows)
return data
def parse_timestamp(timestamp, start_date):
date_str, day_frac = timestamp.split(".")
date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
time_delta = date - start_date
return time_delta.days + float(f"0.{day_frac}")
def load_data_spm(file_name : str, directory="../data") -> pd.DataFrame:
if f"{file_name}.h5" in os.listdir(directory):
return pd.read_hdf(os.path.join(directory, f"{file_name}.h5"), "table")
if file_name == "spm_level2_claus":
data = load_dat_file("spm_level2_claus.dat", "../data", 4)
data = pd.DataFrame(data, columns=["t", "r", "g", "b"])
data["t"] = data["t"].map(lambda x: parse_timestamp(x, datetime.datetime(1996, 4, 17)))
for channel in ["r", "g", "b"]:
channel_data = np.copy(data[channel].values.astype(float))
nan_ids = np.equal(channel_data, -9.99999)
channel_data[nan_ids] = np.nan
data[channel] = channel_data
data.to_hdf(os.path.join(directory, f"{file_name}.h5"), "table", mode="w")
return data
[3]:
data = load_data_spm("spm_level2_claus")
t = data["t"].values
r = data["r"].values
g = data["g"].values
b = data["b"].values
_ = plot_signals([(t, r, "$r$", {"c": "tab:red"})])
_ = plot_signals([(t, g, "$g$", {"c": "tab:green"})])
_ = plot_signals([(t, b, "$b$", {"c": "tab:blue"})])
Load level-1 data¶
[4]:
def load_data_spm(file_name : str, directory="../data", n_cols=3) -> pd.DataFrame:
if f"{file_name}.h5" in os.listdir(directory):
return pd.read_hdf(os.path.join(directory, f"{file_name}.h5"), "table")
rows = []
with open(os.path.join(directory, f"{file_name}.txt"), "r") as f:
i = 0
while True:
line = f.readline()
line = line.strip().split()
i += 1
# EOF with empty line
if not line:
break
if len(line) != n_cols:
print(i, line)
raise ValueError("Line {} does not have 3 entries: {}".format(i, line))
rows.append(line)
data = np.array(rows).astype(np.float)
data = pd.DataFrame(data, columns=["t", "a", "b"])
data.to_hdf(os.path.join(directory, f"{file_name}.h5"), "table", mode="w")
return data
[43]:
def moving_filter(x : np.ndarray, n_window : int = 11, n_std : int = 2):
x = pd.Series(x)
x_mean = x.rolling(n_window, min_periods=1, center=True).mean()
x_std = x.rolling(n_window, min_periods=1, center=True).std()
nan_ids = np.greater(np.abs(x - x_mean), n_std * x_std)
print(np.sum(nan_ids), nan_ids.size)
return nan_ids
[60]:
data = load_data_spm(file_name="spm_red_level1", directory="../data", n_cols=3)
t = data["t"].values
a = data["a"].values
b = data["b"].values
a_out = np.copy(a)
nan_ids = np.less(a, 0.58)
a[nan_ids] = np.nan
nan_ids = np.logical_or(nan_ids, moving_filter(a, n_window=200, n_std=5))
a[nan_ids] = np.nan
a_out[~nan_ids] = np.nan
fig, ax = plot_signals(
[(t, a, "$a$", {}), (t, b, "$b$", {})], legend="upper right"
)
# ax.scatter(t, a_out, label="$a_o$", c="tab:red")
_ = plot_signals(
[(t, a, "$a$", {}), (t, b, "$b$", {})], legend="upper right", x_lim=(0.0, 100.0)
)
552 12434400
[6]:
data = load_data_spm(file_name="spm_red_detrended", directory="../data", n_cols=3)
t = data["t"].values
a = data["a"].values
b = data["b"].values
data["a"][data["a"] < 0.005] = np.nan
_ = plot_signals(
[(t, a, "$a$", {}), (t, b, "$b$", {})], legend="upper right"
)
_ = plot_signals(
[(t, a, "$a$", {}), (t, b, "$b$", {})], legend="upper right", x_lim=(0.0, 100.0)
)
[ ]:
[3]:
pprint_block("SPM Dataset", level=1)
data = load_data_spm("SpmRed", directory="../data")
# Treat zero-measurements as nans
data["a"][data["a"] == 0.0] = np.nan
# Compute exposure
e_a = compute_exposure(data["a"].values)
e_b = compute_exposure(data["b"].values)
max_e = max(np.max(e_a), np.max(e_b))
e_a /= max_e
e_b /= max_e
data["e_a"] = e_a
data["e_b"] = e_b
# Channel measurements
data_a = data[["t", "a", "e_a"]].dropna()
data_b = data[["t", "b", "e_b"]].dropna()
t_a_nn, t_b_nn = data_a["t"].values, data_b["t"].values
a_nn, b_nn = data_a["a"].values, data_b["b"].values
e_a_nn, e_b_nn = data_a["e_a"].values, data_b["e_b"].values
pprint("Signal", level=0)
pprint("- t_a_nn", t_a_nn.shape, level=1)
pprint("- a_nn", a_nn.shape, level=1)
pprint("- e_a_nn", e_a_nn.shape, level=1)
pprint("Signal", level=0)
pprint("- t_b_nn", t_b_nn.shape, level=1)
pprint("- b_nn", b_nn.shape, level=1)
pprint("- e_b_nn", e_b_nn.shape, level=1)
# Mutual measurements
pprint_block("Simultaneous measurements", level=1)
data_m = data[["t", "a", "b", "e_a", "e_b"]].dropna()
t_m = data_m["t"].values
a_m, b_m = data_m["a"].values, data_m["b"].values
e_a_m, e_b_m = data_m["e_a"].values, data_m["e_b"].values
pprint("Signal", level=0)
pprint("- t_m", t_m.shape, level=1)
pprint("- a_m", a_m.shape, level=1)
pprint("- e_a_m", e_a_m.shape, level=1)
pprint("Signal", level=0)
pprint("- t_m", t_m.shape, level=1)
pprint("- b_m", b_m.shape, level=1)
pprint("- e_b_m", e_b_m.shape, level=1)
SPM Dataset
---------------------------------------------------------------------------
Signal
- t_a_nn (12385572,)
- a_nn (12385572,)
- e_a_nn (12385572,)
Signal
- t_b_nn (711,)
- b_nn (711,)
- e_b_nn (711,)
Simultaneous measurements
---------------------------------------------------------------------------
Signal
- t_m (708,)
- a_m (708,)
- e_a_m (708,)
Signal
- t_m (708,)
- b_m (708,)
- e_b_m (708,)
[ ]:
[ ]:
Dataset visualization¶
[5]:
fig, ax = plot_signals([(t_a_nn, a_nn, "$a$", {})])
ax.scatter(t_b_nn, b_nn, label="$b$", c="tab:red")
ax.legend(loc="upper right")
fig, ax = plot_signals(
[
(t_m, a_m, "$a_m$", {}),
(t_m, b_m, "$b_m$", {}),
],
legend="upper right"
)
ratio_m = np.divide(a_m, b_m)
fig, ax = plot_signals([(t_m, ratio_m, "$r_m$", {})], legend="upper right")
Degradation Correction (Naive)¶
[17]:
from collections import namedtuple
pprint_block("Degradation Correction", level=1)
Correction = namedtuple(
"Correction",
[
"a_m_c",
"b_m_c",
"d_a_c",
"d_b_c",
"a_c_nn",
"b_c_nn",
]
)
corrections = dict()
for model_name in ["exp", "explin"]:
degradation_model = load_model(model_name)
degradation_model.initial_fit(x_a=e_a_m, y_a=a_m, y_b=b_m)
a_m_c, b_m_c, degradation_model, history = correct_degradation(
t_m=t_m,
a_m=a_m,
e_a_m=e_a_m,
b_m=b_m,
e_b_m=e_b_m,
model=degradation_model,
verbose=True,
)
d_a_c = degradation_model(e_a_nn)
d_b_c = degradation_model(e_b_nn)
a_c_nn = np.divide(a_nn, d_a_c)
b_c_nn = np.divide(b_nn, d_b_c)
correction = Correction(
a_m_c, b_m_c, d_a_c, d_b_c, a_c_nn, b_c_nn
)
corrections[model_name] = correction
signal_fourplets = []
for model_name in ["exp", "explin"]:
correction = corrections[model_name]
signal_fourplets.extend(
[
(t_m, correction.a_m_c, "{} $a_c$".format(model_name), {}),
(t_m, correction.b_m_c, "{} $b_c$".format(model_name), {}),
]
)
_ = plot_signals(signal_fourplets, legend="upper right")
signal_fourplets = []
for model_name in ["exp", "explin"]:
correction = corrections[model_name]
signal_fourplets.extend(
[
(t_a_nn, correction.d_a_c, "{} $d(e_a(t))$".format(model_name), {}),
(t_b_nn, correction.d_b_c, "{} $d(e_b(t))$".format(model_name), {}),
]
)
_ = plot_signals(signal_fourplets, legend="lower left")
Degradation Correction
---------------------------------------------------------------------------
- Corrected in 3 iterations.
- Corrected in 3 iterations.
[18]:
# Smoothing and outlier filtering
# Decomposition
c = pywt.wavedec(a_nn, wavelet="db5", mode="per")
# Denoising - Thresholding
thresh = 1.0 * np.nanmax(a_nn)
c[1:] = (pywt.threshold(coeff, value=thresh, mode="soft") for coeff in c[1:])
# Reconstruction
a_nn_r = pywt.waverec(c, wavelet="db5", mode="per")
fig, ax = plot_signals([
(t_a_nn, a_nn, "$a_{nn}$", {"lw": 2, "alpha": 0.5}),
(t_a_nn, a_nn_r, "$a_{nn, r}$", {"lw": 1, "c": "k"}),
],
legend="lower left"
)
[ ]: