Direct vs Recursive Forecasting#

The purpose of this notebook is to compare the performance of direct forecasting and recursive forecasting using the MLForecast library. Direct forecasting means that we train a family of models to predict the target value at various horizons in the future, e.g. 1 hour, 2 hours, …, 24 hours ahead. Recursive forecasting (also known as auto-regressive forecasting) means that we train a single model to predict the target value at the next time step, and then use the model recursively to predict the next time step using the previous predictions as input features. Implementing recursive forecasting is a bit cumbersome to do manually, hence we use the MLForecast library to handle this for us.

The objective is to show that recursive forecasting can be more efficient in terms of memory usage and training time. However, it can also lead to a loss of accuracy because recursive calls are fed with previous predictions that do not necessarily match the training distribution of the model, and can therefore lead to degenerate predictions, in particular when the variance of the lagged values is informative.

We highlight this issue with a synthetic dataset that has two types of segments:

  • Segment type “a” has a prefix centered around 0 with low variance and a suffix centered around 1 with low variance.

  • Segment type “b” has a prefix centered around 0 with high variance and a suffix centered around -1 with low variance.

Segments of type “a” and “b” are independently sampled, meaning that it is not possible to forecast beyond the length of the segments. However, it should be quite easy to predict the end of a segment given the prefix of the segment with lagged feature engineering.

from time import perf_counter
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tzdata  # noqa: F401

SEGMENT_LENGTH = 30


def generate_synthetic_1(
    segment_length=SEGMENT_LENGTH,
    n_segments=100,
    low_noise_level=0.01,
    high_noise_level=0.1,
    seed=None,
):
    """Generate synthetic time series data with two types of segments

    - segment type "a" has a prefix centered around 0 and a suffix centered
      around 1.
    - segment type "b" has a prefix centered around 0 with high variance and a
      suffix centered around -1.

    The variance of the prefix is therefore predictive of the suffix.

    The suffix values predictive of the next segment prefix's mean (always 0).
    """
    rng = np.random.default_rng(seed)
    total_length = segment_length * n_segments
    segment_types = rng.choice(["a", "b"], n_segments)
    prefix_length = segment_length // 2
    suffix_length = segment_length - prefix_length

    segments = []
    for segment_type in segment_types:
        if segment_type == "a":
            # Prefix is centered around 0 with low variance
            segments.append(
                rng.normal(loc=0, scale=low_noise_level, size=prefix_length)
            )
            # Suffix is centered around 1 with low variance
            segments.append(
                rng.normal(loc=1, scale=low_noise_level, size=suffix_length)
            )
        elif segment_type == "b":
            # Prefix is also centered around 0 but with high variance
            segments.append(
                rng.normal(loc=0, scale=high_noise_level, size=prefix_length)
            )
            # Suffix is centered around -1 with low variance
            segments.append(
                rng.normal(loc=-1, scale=low_noise_level, size=suffix_length)
            )
    return pd.DataFrame(
        {
            "time": np.arange(total_length),
            "y": np.concatenate(segments),
            "series_id": np.zeros(total_length, dtype=np.int32),
        }
    )


# Generate 500 segments and hold out the last 10 segments as the test set.
data = generate_synthetic_1(n_segments=500, seed=1)
cutoff = -SEGMENT_LENGTH * 10  # 10 segments for testing
data_train = data.iloc[:cutoff]
data_test = data.iloc[cutoff:]
# Plot the full training series for a first visual check.
_ = data_train.plot(x="time", y="y", title="Synthetic data 1", figsize=(15, 5))
../../_images/3dae77693a21412de9f98dc6a584bbb98815064434d1e93c1ff6b7474d054498.png
# Zoom in on the first 10 segments of the training and testing sets so the
# two segment types (low vs high prefix variance) are visible.
_ = data_train.iloc[: SEGMENT_LENGTH * 10].plot(
    x="time",
    y="y",
    title="Synthetic data 1 - First points of training set",
    figsize=(15, 5),
)

_ = data_test.iloc[: SEGMENT_LENGTH * 10].plot(
    x="time",
    y="y",
    title="Synthetic data 1 - First points of testing set",
    figsize=(15, 5),
)
../../_images/d3f3b182cfac49dd7a8fa54b6829ced6e0245575981968685c6cdb4834a7b881.png ../../_images/9860ec69f71e2c26b7058bb6f9a3be21466d2f316c3488497795137a927a3d7b.png
from mlforecast import MLForecast
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer, PolynomialFeatures
from sklearn.kernel_approximation import Nystroem
from mlforecast.lag_transforms import (
    RollingMax,
    RollingMin,
    RollingMean,
    RollingStd,
)
from mlforecast.target_transforms import Differences
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeRegressor
import warnings
import threadpoolctl

# Workaround a performance problem with HistGradientBoostingRegressor on small datasets.
# NOTE(review): presumably limiting OpenMP to one thread avoids thread
# oversubscription since MLForecast parallelizes on its own (num_threads below).
threadpoolctl.threadpool_limits(limits=1, user_api="openmp")
# Hide sklearn RuntimeWarnings to keep the notebook output readable.
warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn")


# MLForecast can train multiple models in parallel, each model can be a
# pipeline of transformers and a regressor. However we focus on a single
# HistGradientBoostingRegressor model to make sure that this notebook runs
# quickly enough. Feel free to uncomment the other models to compare their
# performance if you have enough time and memory available.
#
# Spoiler alert: the HistGradientBoostingRegressor model is the most accurate.

mlf = MLForecast(
    models=[
        # make_pipeline(
        #     SplineTransformer(sparse_output=True, n_knots=10),
        #     PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
        #     # Nystroem(kernel="poly", n_components=200, degree=2, random_state=0),
        #     SelectKBest(k=100),
        #     Ridge(alpha=1e-6),
        # ),
        # RandomForestRegressor(
        #     n_estimators=100,
        #     max_features=0.8,
        #     max_depth=8,
        #     min_samples_leaf=300,
        #     n_jobs=4,
        # ),
        # DecisionTreeRegressor(max_depth=8, min_samples_leaf=300),
        HistGradientBoostingRegressor(),
    ],
    # Integer frequency: the time column is a plain step counter, not dates.
    freq=1,
    # One lag feature per step of a full segment so a model can see the
    # whole previous segment.
    lags=range(1, SEGMENT_LENGTH + 1),
    lag_transforms={
        1: [
            # Rolling stats over the latest half segment; the rolling std is
            # the feature that distinguishes segment types "a" and "b".
            RollingMean(SEGMENT_LENGTH // 2),
            RollingStd(SEGMENT_LENGTH // 2),
        ],
        SEGMENT_LENGTH
        // 2: [
            RollingMax(SEGMENT_LENGTH // 2),
            RollingMin(SEGMENT_LENGTH // 2),
        ],
    },
    # target_transforms=[Differences([1])],
    num_threads=4,
)
# Column-name mapping shared by every preprocess/fit call below.
schema = dict(
    time_col="time",
    id_col="series_id",
    target_col="y",
)
# Inspect the engineered feature matrix (lags + rolling transforms).
mlf.preprocess(data_train, **schema)
time y series_id lag1 lag2 lag3 lag4 lag5 lag6 lag7 ... lag25 lag26 lag27 lag28 lag29 lag30 rolling_mean_lag1_window_size15 rolling_std_lag1_window_size15 rolling_max_lag15_window_size15 rolling_min_lag15_window_size15
30 30 0.086113 0 0.992702 0.993013 1.031000 0.975424 1.003332 0.993741 0.990852 ... -0.000597 -0.003396 0.003249 -0.003548 0.006188 -0.015397 0.996924 0.013039 1.006198 -0.008697
31 31 -0.003983 0 0.086113 0.992702 0.993013 1.031000 0.975424 1.003332 0.993741 ... 0.002458 -0.000597 -0.003396 0.003249 -0.003548 0.006188 0.935585 0.235347 1.006198 -0.008697
32 32 -0.177943 0 -0.003983 0.086113 0.992702 0.993013 1.031000 0.975424 1.003332 ... -0.007467 0.002458 -0.000597 -0.003396 0.003249 -0.003548 0.869826 0.337127 1.006198 -0.008697
33 33 0.062693 0 -0.177943 -0.003983 0.086113 0.992702 0.993013 1.031000 0.975424 ... 0.006787 -0.007467 0.002458 -0.000597 -0.003396 0.003249 0.791984 0.429595 1.006198 -0.008697
34 34 0.085538 0 0.062693 -0.177943 -0.003983 0.086113 0.992702 0.993013 1.031000 ... -0.004699 0.006787 -0.007467 0.002458 -0.000597 -0.003396 0.729470 0.463953 1.006198 -0.008697
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14695 14695 1.010180 0 1.000320 1.014936 1.003430 0.991600 0.998221 0.992632 0.989077 ... 0.008091 0.999022 0.989610 1.017450 0.986895 1.000529 0.664467 0.489913 1.017450 -0.017102
14696 14696 1.019595 0 1.010180 1.000320 1.014936 1.003430 0.991600 0.998221 0.992632 ... 0.010742 0.008091 0.999022 0.989610 1.017450 0.986895 0.731740 0.460728 1.017450 -0.017102
14697 14697 1.014361 0 1.019595 1.010180 1.000320 1.014936 1.003430 0.991600 0.998221 ... 0.012473 0.010742 0.008091 0.999022 0.989610 1.017450 0.800022 0.417685 0.999022 -0.017102
14698 14698 0.987593 0 1.014361 1.019595 1.010180 1.000320 1.014936 1.003430 0.991600 ... -0.006401 0.012473 0.010742 0.008091 0.999022 0.989610 0.867881 0.355933 0.999022 -0.017102
14699 14699 1.002173 0 0.987593 1.014361 1.019595 1.010180 1.000320 1.014936 1.003430 ... -0.001378 -0.006401 0.012473 0.010742 0.008091 0.999022 0.934417 0.260507 0.012473 -0.017102

14670 rows × 37 columns

Recursive or auto-regressive forecasting#

# Time the training of a single one-step-ahead model (recursive mode).
tic = perf_counter()
mlf.fit(data_train, **schema)  # recursive forecasting by default in mlforecast
print(f"Recursive forecasting training time: {perf_counter() - tic:.1f} seconds")
Recursive forecasting training time: 0.4 seconds
PREDICTION_HORIZON = SEGMENT_LENGTH * 2


def collect_predictions(
    mlf, data_test, test_offset=0, horizon=PREDICTION_HORIZON, update_chunk_size=5
):
    """Roll the forecaster through ``data_test`` and collect its forecasts.

    Repeatedly asks ``mlf`` for a ``horizon``-step forecast, then feeds it the
    next ``update_chunk_size`` observed rows of ``data_test`` before
    forecasting again, until the test set is exhausted.

    Parameters
    ----------
    mlf : fitted MLForecast instance supporting ``predict`` and ``update``.
    data_test : DataFrame with columns "time", "series_id" and "y".
    test_offset : index in ``data_test`` to start feeding observations from.
    horizon : number of steps predicted at each call (default keeps the
        previous hard-coded ``PREDICTION_HORIZON`` behavior).
    update_chunk_size : number of observed rows fed back between forecasts
        (default keeps the previous hard-coded value of 5).

    Returns
    -------
    list of DataFrames, one per forecast call, each with a 1-based "horizon"
    column and the ground-truth "y" merged in (NaN past the end of the data).
    """
    all_predictions = []
    while test_offset < len(data_test):
        # Forecast `horizon` steps ahead from the current end of the series.
        new_predictions = mlf.predict(horizon)
        new_predictions["horizon"] = np.arange(new_predictions.shape[0]) + 1
        # Left join keeps forecast rows beyond data_test with y == NaN.
        new_predictions = new_predictions.merge(
            data_test, on=["time", "series_id"], how="left"
        )
        all_predictions.append(new_predictions)

        # Update the forecaster with the next chunk of observed values.
        mlf.update(data_test.iloc[test_offset : test_offset + update_chunk_size])
        test_offset += update_chunk_size

    return all_predictions


# Time the rolling prediction loop for the recursive strategy.
tic = perf_counter()
all_recursive_predictions = collect_predictions(mlf, data_test)
print(f"Recursive forecasting prediction time: {perf_counter() - tic:.1f} seconds")
Recursive forecasting prediction time: 27.8 seconds

Direct forecasting#

Let’s pass max_horizon to force direct forecasting (one model trained per horizon step).

# Time the training of the direct strategy: max_horizon makes MLForecast
# train a separate model for each of the PREDICTION_HORIZON steps.
tic = perf_counter()
mlf.fit(data_train, max_horizon=PREDICTION_HORIZON, **schema)
print(f"Direct forecasting training time: {perf_counter() - tic:.1f} seconds")
Direct forecasting training time: 17.6 seconds
# Time the rolling prediction loop for the direct strategy.
tic = perf_counter()
all_direct_predictions = collect_predictions(mlf, data_test)
print(f"Direct forecasting prediction time: {perf_counter() - tic:.1f} seconds")
Direct forecasting prediction time: 4.5 seconds

Quantitative comparison#

def score_predictions(all_predictions, model_name):
    """Group absolute errors by horizon.

    Concatenates the prediction batches, adds an "absolute_error" column
    (|y - prediction|), drops rows without ground truth, and returns the
    result grouped by "horizon" — callers then aggregate (e.g. ``.mean()``
    for the MAE per horizon).
    """
    combined = pd.concat(all_predictions)
    combined["absolute_error"] = (combined["y"] - combined[model_name]).abs()
    return combined.dropna().groupby("horizon")


import matplotlib.pyplot as plt  # NOTE(review): already imported at the top of the file

# Plot the mean absolute error as a function of the forecasting horizon for
# both strategies on the same axes.
fig, ax = plt.subplots()
score_predictions(
    all_recursive_predictions, "HistGradientBoostingRegressor"
).mean().reset_index().plot(x="horizon", y="absolute_error", label="recursive", ax=ax)
score_predictions(
    all_direct_predictions, "HistGradientBoostingRegressor"
).mean().reset_index().plot(x="horizon", y="absolute_error", label="direct ", ax=ax)
_ = ax.set(ylabel="MAE")
../../_images/48f6476714151be95e4f1f281b24f4fdbcdda968d778c444a24f5ba5dd1e2602.png

Qualitative comparison#

def plot_some_predictions(all_predictions, data_test, model_name, nrows=12, title=None):
    """Overlay the first ``nrows`` forecast batches on the observed test data.

    Each subplot row shows one batch of predictions (one ``mlf.predict``
    call) together with the true "y" over the first three segments of
    ``data_test``.
    """
    fig, axes = plt.subplots(nrows=nrows, figsize=(15, 5 * nrows))
    # zip stops at the shorter sequence, so at most `nrows` batches are drawn.
    for ax, predictions in zip(axes, all_predictions):
        # Drop the forecast frame's "y" column; the observed values come
        # from data_test via the left merge on (time, series_id).
        merged_data = data_test.copy().merge(
            predictions.drop("y", axis=1), on=["time", "series_id"], how="left"
        )
        merged_data.drop(["series_id"], axis=1).iloc[: SEGMENT_LENGTH * 3].plot(
            x="time", y=["y", model_name], ax=ax
        )
        ax.set_title(title)
        ax.set_ylim(-1.2, 1.2)
# Visual check of the recursive forecasts against the observed test values.
plot_some_predictions(
    all_recursive_predictions,
    data_test,
    "HistGradientBoostingRegressor",
    title="recursive",
)
../../_images/a94ab9655d5db62118aa4790423c3733437ee01ad096ce621d757183a193ac16.png
# Visual check of the direct forecasts against the observed test values.
plot_some_predictions(
    all_direct_predictions, data_test, "HistGradientBoostingRegressor", title="direct"
)
../../_images/dc117a595fe3b88b5dc19fdb47255fcb415fa393e15d2fd9efcf2b456a0e6a27.png