Summarize a scoring table with scores relative to a baseline model. — summarise_scores_with_baseline

Light wrapper around scoringutils::summarise_scores() and scoringutils::get_pairwise_comparisons() that automatically adds metrics relative to a baseline.

Usage

summarise_scores_with_baseline(
  scores,
  baseline,
  compare = "model",
  metric_to_compare = intersect(c("wis", "crps", "brier_score"), names(scores)),
  by = NULL,
  ...
)

summarize_scores_with_baseline(
  scores,
  baseline,
  compare = "model",
  metric_to_compare = intersect(c("wis", "crps", "brier_score"), names(scores)),
  by = NULL,
  ...
)

Arguments

scores: A table of scores, as the output of scoringutils::score().
baseline: Name of the baseline, as a string. Should be an element of the column defined by the compare argument.
compare: Name of the column that uniquely identifies the sets of forecasts (typically, models) to compare, as a string. Passed as the compare argument to scoringutils::get_pairwise_comparisons(). Default "model", matching the default of scoringutils::get_pairwise_comparisons().
metric_to_compare: Name of the (absolute) metric for which to compute relative values. Passed as the metric argument to scoringutils::get_pairwise_comparisons(). By default, uses whichever of "wis", "crps", or "brier_score" is present among the column names of scores. This matches the default in scoringutils::get_pairwise_comparisons().
by: Columns, in addition to the column named in compare, to group by when summarising and comparing scores. by is passed directly as the by argument to scoringutils::get_pairwise_comparisons(), and c(compare, by) is passed as the by argument to scoringutils::summarise_scores(). Default NULL.
...: Additional arguments passed to scoringutils::summarise_scores().

Value

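A data frame with summarised scores for each model (or, more generally, each value of the compare column, within each combination of the by columns if given), including relative scores where possible.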

Examples

quantile_summary <- scoringutils::example_quantile |>
  scoringutils::score() |>
  summarise_scores_with_baseline(
  baseline = "EuroCOVIDhub-baseline")

print(quantile_summary)
#>                    model         wis overprediction underprediction dispersion
#>                   <char>       <num>          <num>           <num>      <num>
#> 1: EuroCOVIDhub-ensemble  8992.62316    5025.130095      2120.64029 1846.85278
#> 2: EuroCOVIDhub-baseline 14321.48926    7081.000000      5143.53567 2096.95360
#> 3:  epiforecasts-EpiNow2 10827.40786    6179.439535      1697.23411 2950.73422
#> 4:       UMass-MechBayes    52.65195       8.978601        16.80095   26.87239
#>           bias interval_coverage_50 interval_coverage_90   ae_median
#>          <num>                <num>                <num>       <num>
#> 1:  0.00812500            0.6328125            0.9023438 12077.10156
#> 2:  0.21851562            0.4960938            0.9101562 19353.42969
#> 3: -0.04336032            0.4453441            0.8461538 14521.10526
#> 4: -0.02234375            0.4609375            0.8750000    78.47656
#>    mean_scores_ratio wis_scaled_relative_skill
#>                <num>                     <num>
#> 1:         0.6279112                 0.5036559
#> 2:         1.0000000                 1.0000000
#> 3:         0.7297431                 0.6444541
#> 4:         0.3303053                 0.4662919

sample_summary <- scoringutils::example_sample_discrete |>
  scoringutils::score() |>
  summarise_scores_with_baseline(
  baseline = "EuroCOVIDhub-baseline",
  by = "location")
print(sample_summary)
#>                     model location        bias       dss        crps
#>                    <char>   <char>       <num>     <num>       <num>
#>  1: EuroCOVIDhub-ensemble       DE  0.06640625 14.585791  3996.59153
#>  2: EuroCOVIDhub-baseline       DE  0.45781250 17.207565  7871.84502
#>  3:  epiforecasts-EpiNow2       DE -0.09296875 20.246055  6775.82807
#>  4:       UMass-MechBayes       DE -0.58359375 10.620242    76.81818
#>  5: EuroCOVIDhub-ensemble       FR  0.02929688 20.952984 23600.27501
#>  6: EuroCOVIDhub-baseline       FR  0.15664063       NaN 28147.97039
#>  7:  epiforecasts-EpiNow2       FR -0.03818182 24.751637 31536.43798
#>  8:       UMass-MechBayes       FR  0.05312500 11.322718    85.43188
#>  9: EuroCOVIDhub-ensemble       GB -0.18398438 14.972982  8925.17672
#> 10: EuroCOVIDhub-baseline       GB -0.26875000 18.969516 20479.26157
#> 11:  epiforecasts-EpiNow2       GB  0.12148438 15.042455 10268.71716
#> 12:       UMass-MechBayes       GB  0.62500000  9.041895    40.56682
#> 13: EuroCOVIDhub-ensemble       IT  0.12851562 15.108105  2985.80009
#> 14: EuroCOVIDhub-baseline       IT  0.36562500 16.712333  4739.67421
#> 15:  epiforecasts-EpiNow2       IT -0.09179688 44.180755  1785.94208
#> 16:       UMass-MechBayes       IT -0.20703125  9.360836    37.95668
#>     overprediction underprediction dispersion log_score         mad   ae_median
#>              <num>           <num>      <num>     <num>       <num>       <num>
#>  1:    2379.839063    3.022727e+02 1314.47981  8.236498  5687.57792  5843.27344
#>  2:    6036.800781    1.026883e+02 1732.35596  9.467819  6921.42544 10131.54688
#>  3:    4593.658594    4.848375e+02 1697.33197 14.629958  7209.33941  9319.66406
#>  4:       0.131250    4.125625e+01   35.43068  6.346216   147.47237   113.14062
#>  5:   17010.621875    3.883522e+03 2706.13126 17.372682 10346.15036 27192.31250
#>  6:   18400.061719    5.428659e+03 4319.24930       Inf 12809.53659 32654.15625
#>  7:   22098.452727    4.496860e+03 4941.12525 21.565394 19341.40656 37033.10000
#>  8:      15.978125    2.434844e+01   45.10531  6.519167   186.18213    98.29688
#>  9:     412.983594    4.726980e+03 3785.21344  8.500156 15402.31442 12136.94531
#> 10:      40.872656    1.751203e+04 2926.35532 10.090072 11417.30569 25996.53125
#> 11:    2772.551563    1.768381e+03 5727.78435  8.358450 23875.58191 12975.75781
#> 12:      24.051563    6.093750e-02   16.45432  5.311195    66.99499    62.06250
#> 13:    1323.830469    8.200516e+02  841.91806  8.882401  3618.49379  4451.60938
#> 14:    2329.627344    4.147633e+02 1995.28358  8.992583  7573.38720  6947.88281
#> 15:     605.171094    6.012195e+02  579.55146       Inf  2463.60630  2535.30469
#> 16:       4.673438    1.145781e+01   21.82543  5.592964    93.77445    45.14062
#>          se_mean mean_scores_ratio crps_scaled_relative_skill
#>            <num>             <num>                      <num>
#>  1: 2.247307e+08         0.5077071                  0.4674125
#>  2: 4.670460e+08         1.0000000                  1.0000000
#>  3: 6.432208e+08         0.8607675                  0.7979298
#>  4: 1.543624e+04         0.4954612                  0.5805554
#>  5: 7.135968e+09         0.8384361                  0.6667121
#>  6: 8.267762e+09         1.0000000                  1.0000000
#>  7: 1.153157e+10         0.9637155                  0.8729716
#>  8: 2.700341e+04         0.4213216                  0.5849167
#>  9: 9.704756e+08         0.4358154                  0.3724142
#> 10: 2.614459e+09         1.0000000                  1.0000000
#> 11: 1.580384e+09         0.5014203                  0.4284583
#> 12: 8.934674e+03         0.3699103                  0.5066010
#> 13: 8.093173e+07         0.6299589                  0.5113004
#> 14: 1.909874e+08         1.0000000                  1.0000000
#> 15: 3.224173e+07         0.3768069                  0.3637621
#> 16: 3.483343e+03         0.1939792                  0.2475670