Module tsflex.features.logger
Contains the used variables and functions to provide logging functionality.
See Also
FeatureCollection
- its
logging_file_path
of the calculate
method.
Expand source code
"""Contains the used variables and functions to provide logging functionality.
See Also
--------
FeatureCollection: its `logging_file_path` of the `calculate` method.
"""
__author__ = "Jeroen Van Der Donckt"
import logging
import re
import numpy as np
import pandas as pd
from ..utils.argument_parsing import timedelta_to_str
from ..utils.logging import logging_file_to_df, remove_inner_brackets
# Package specific logger
# The logger itself accepts every record from DEBUG upwards; filtering of what is
# actually emitted is left to the handlers attached to it.
logger = logging.getLogger("feature_calculation_logger")
logger.setLevel(logging.DEBUG)
# Create a console handler which writes WARNING messages or higher to sys.stderr
# (a `StreamHandler` created without an explicit stream uses sys.stderr)
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
logger.addHandler(console)
def _parse_message(message: str) -> list:
    """Parse the message of the logged info.

    Parameters
    ----------
    message: str
        A single logged message. After `remove_inner_brackets` is applied to it,
        the message must contain exactly five ``[...]`` groups, in this order:
        the function, the series key(s), ``window, stride``, the output names,
        and the duration (``<float> seconds``).

    Returns
    -------
    list
        ``[func, key, window, stride, output_names, duration_s]``, where `window`
        is the (stripped) window string, `stride` is either the string
        ``"manual"`` or the parsed Python literal, and `duration_s` is a float.

    Raises
    ------
    AssertionError
        If the message does not contain exactly five bracketed groups.
    ValueError
        If the stride is not a valid Python literal or the duration is not a
        valid float.

    """
    # Local import so this function does not rely on a module-level `import ast`.
    import ast

    regex = r"\[(.*?)\]"
    matches = re.findall(regex, remove_inner_brackets(message))
    assert len(matches) == 5, (
        f"Expected 5 bracketed groups in the logged message, found {len(matches)}"
    )
    func, key_text, window_stride_text, outputs_text, duration_text = matches

    key = key_text.replace("'", "")

    # The window is everything before the first comma, the stride everything
    # after it (the stride is a tuple, so it may contain commas itself).
    window_text, _, stride_text = window_stride_text.partition(",")
    window = window_text.strip()
    stride = stride_text.strip()
    if stride != "manual":
        # SECURITY: the stride originates from a log file on disk, so it is parsed
        # with `ast.literal_eval` (literals only) instead of `eval`, which would
        # execute arbitrary expressions found in that file.
        stride = ast.literal_eval(stride)  # parse the tuple

    output_names = outputs_text.replace("'", "")

    # Remove the unit as a suffix; `str.rstrip(" seconds")` would strip any
    # trailing run of the characters " second" rather than the literal suffix.
    unit_suffix = " seconds"
    if duration_text.endswith(unit_suffix):
        duration_text = duration_text[: -len(unit_suffix)]
    duration_s = float(duration_text)

    return [func, key, window, stride, output_names, duration_s]
def _parse_logging_execution_to_df(logging_file_path: str) -> pd.DataFrame:
    """Turn the logged messages into a dataframe holding the execution info.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, per logged message, the function, input series names,
        window, stride, output names, the calculation duration, and that
        duration as a percentage of the summed duration.

    Note
    ----
    This function only works for a log file that was written through the
    ``logging_file_path`` argument of the ``calculate`` method of a
    ``FeatureCollection``.

    """
    parsed_columns = [
        "function",
        "series_names",
        "window",
        "stride",
        "output_names",
        "duration",
    ]

    def _is_numeric_stride(stride_tuple):  # type: ignore[no-untyped-def]
        # True when every element of the stride tuple is a numeric string
        return np.char.isnumeric(stride_tuple).all()

    def _as_sorted_numbers(stride_tuple):  # type: ignore[no-untyped-def]
        return tuple(sorted(pd.to_numeric(s) for s in stride_tuple))

    def _as_timedelta_strs(stride_tuple):  # type: ignore[no-untyped-def]
        return tuple(timedelta_to_str(pd.to_timedelta(s)) for s in stride_tuple)

    log_df = logging_file_to_df(logging_file_path)

    # Spread the parsed parts of every message over their own columns
    parsed_rows = [_parse_message(msg) for msg in log_df["message"]]
    log_df[parsed_columns] = pd.DataFrame(parsed_rows, index=log_df.index)

    # Parse the window
    manual_window = log_df["window"] == "manual"
    if manual_window.any():
        # All should be manual
        assert manual_window.all()
    elif log_df["window"].str.isnumeric().all():
        log_df["window"] = pd.to_numeric(log_df["window"])
    else:
        window_td = pd.to_timedelta(log_df["window"])
        log_df["window"] = window_td.apply(timedelta_to_str)

    # Parse the stride
    manual_stride = log_df["stride"] == "manual"
    if manual_stride.any():
        # All should be manual
        assert manual_stride.all()
    elif log_df["stride"].apply(_is_numeric_stride).all():
        log_df["stride"] = log_df["stride"].apply(_as_sorted_numbers)
    else:
        log_df["stride"] = log_df["stride"].apply(_as_timedelta_strs)

    # Share (in %) of each execution in the total logged duration
    total_duration = log_df["duration"].sum()
    log_df["duration %"] = (100 * (log_df["duration"] / total_duration)).round(2)

    return log_df.drop(columns=["name", "log_level", "message"])
def get_feature_logs(logging_file_path: str) -> pd.DataFrame:
    """Return the execution (time) info of each feature of a `FeatureCollection`.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, per feature, its function, input series names, and
        (%) calculation duration. The ``duration`` column holds timedeltas.

    """
    feature_logs = _parse_logging_execution_to_df(logging_file_path)
    # Interpret the parsed duration values as seconds
    duration_as_timedelta = pd.to_timedelta(feature_logs["duration"], unit="s")
    feature_logs["duration"] = duration_as_timedelta
    return feature_logs
def get_function_stats(logging_file_path: str) -> pd.DataFrame:
    """Return execution (time) statistics per function.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, for each `function-(window,stride)` combination, the
        sum (time), mean (time), std (time), number of executions,
        sum (% time), and mean (% time).

    """
    exec_df = _parse_logging_execution_to_df(logging_file_path)

    # Function names ordered by ascending mean duration; the position in this
    # list is used as the sort key for the function level of the result.
    mean_per_function = exec_df.groupby(["function"]).agg({"duration": ["mean"]})
    mean_per_function = mean_per_function.sort_values(
        by=("duration", "mean"), ascending=True
    )
    sorted_funcs = mean_per_function.index.to_list()

    def key_func(idx_level):  # type: ignore[no-untyped-def]
        # A level whose values are all known function names is replaced by the
        # rank of those names; any other level is returned unchanged.
        if not all(idx in sorted_funcs for idx in idx_level):
            return idx_level
        return [sorted_funcs.index(idx) for idx in idx_level]

    aggregations = {
        "duration": ["sum", "mean", "std", "count"],
        "duration %": ["sum", "mean"],
    }
    stats = exec_df.groupby(["function", "window", "stride"]).agg(aggregations)
    return stats.sort_index(key=key_func, ascending=False)
def get_series_names_stats(logging_file_path: str) -> pd.DataFrame:
    """Return execution (time) statistics per `key-(window,stride)` combination.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, for each `series_names-(window,stride)` combination,
        the sum (time), mean (time), std (time), number of executions,
        sum (% time), and mean (% time), ordered by descending summed duration.

    """
    exec_df = _parse_logging_execution_to_df(logging_file_path)

    aggregations = {
        "duration": ["sum", "mean", "std", "count"],
        "duration %": ["sum", "mean"],
    }
    stats = exec_df.groupby(["series_names", "window", "stride"]).agg(aggregations)
    # Most time-consuming combinations first
    return stats.sort_values(by=("duration", "sum"), ascending=False)
Functions
def get_feature_logs(logging_file_path)
-
Expand source code
def get_feature_logs(logging_file_path: str) -> pd.DataFrame:
    """Return the execution (time) info of each feature of a `FeatureCollection`.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, per feature, its function, input series names, and
        (%) calculation duration. The ``duration`` column holds timedeltas.

    """
    feature_logs = _parse_logging_execution_to_df(logging_file_path)
    # Interpret the parsed duration values as seconds
    duration_as_timedelta = pd.to_timedelta(feature_logs["duration"], unit="s")
    feature_logs["duration"] = duration_as_timedelta
    return feature_logs
Get execution (time) info for each feature of a
FeatureCollection.
Parameters
logging_file_path
:str
- The file path where the logged messages are stored. This is the file path that
is passed to the
FeatureCollection
its calculate
method.
Returns
pd.DataFrame
- A DataFrame with the features its function, input series names and (%) calculation duration.
def get_function_stats(logging_file_path)
-
Expand source code
def get_function_stats(logging_file_path: str) -> pd.DataFrame:
    """Return execution (time) statistics per function.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, for each `function-(window,stride)` combination, the
        sum (time), mean (time), std (time), number of executions,
        sum (% time), and mean (% time).

    """
    exec_df = _parse_logging_execution_to_df(logging_file_path)

    # Function names ordered by ascending mean duration; the position in this
    # list is used as the sort key for the function level of the result.
    mean_per_function = exec_df.groupby(["function"]).agg({"duration": ["mean"]})
    mean_per_function = mean_per_function.sort_values(
        by=("duration", "mean"), ascending=True
    )
    sorted_funcs = mean_per_function.index.to_list()

    def key_func(idx_level):  # type: ignore[no-untyped-def]
        # A level whose values are all known function names is replaced by the
        # rank of those names; any other level is returned unchanged.
        if not all(idx in sorted_funcs for idx in idx_level):
            return idx_level
        return [sorted_funcs.index(idx) for idx in idx_level]

    aggregations = {
        "duration": ["sum", "mean", "std", "count"],
        "duration %": ["sum", "mean"],
    }
    stats = exec_df.groupby(["function", "window", "stride"]).agg(aggregations)
    return stats.sort_index(key=key_func, ascending=False)
Get execution (time) statistics for each function.
Parameters
logging_file_path
:str
- The file path where the logged messages are stored. This is the file path that
is passed to the
FeatureCollection
its calculate
method.
Returns
pd.DataFrame
- A DataFrame with for each function (i.e.,
function-(window,stride)
) combination the mean (time), std (time), sum (time), sum (% time), mean (% time), and number of executions.
def get_series_names_stats(logging_file_path)
-
Expand source code
def get_series_names_stats(logging_file_path: str) -> pd.DataFrame:
    """Return execution (time) statistics per `key-(window,stride)` combination.

    Parameters
    ----------
    logging_file_path: str
        Path of the file in which the logged messages are stored, i.e., the path
        that was passed to the `calculate` method of the `FeatureCollection`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with, for each `series_names-(window,stride)` combination,
        the sum (time), mean (time), std (time), number of executions,
        sum (% time), and mean (% time), ordered by descending summed duration.

    """
    exec_df = _parse_logging_execution_to_df(logging_file_path)

    aggregations = {
        "duration": ["sum", "mean", "std", "count"],
        "duration %": ["sum", "mean"],
    }
    stats = exec_df.groupby(["series_names", "window", "stride"]).agg(aggregations)
    # Most time-consuming combinations first
    return stats.sort_values(by=("duration", "sum"), ascending=False)
Get execution (time) statistics for each
key-(window,stride)
combination.
Parameters
logging_file_path
:str
- The file path where the logged messages are stored. This is the file path that
is passed to the
FeatureCollection
its calculate
method.
Returns
pd.DataFrame
- A DataFrame with for each key-(window,stride) combination the mean (time), std (time), sum (time), sum (% time), mean (% time), and number of executions.