# Source code for dea_tools.validation

# validation.py
"""
Tools for validating outputs and producing accuracy assessment metrics.

License: The code in this notebook is licensed under the Apache License,
Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0). Digital Earth
Australia data is licensed under the Creative Commons by Attribution 4.0
license (https://creativecommons.org/licenses/by/4.0/).

Contact: If you need assistance, please post a question on the Open Data
Cube Slack channel (http://slack.opendatacube.org/) or on the GIS Stack
Exchange (https://gis.stackexchange.com/questions/ask?tags=open-data-cube)
using the `open-data-cube` tag (you can view previously asked questions
here: https://gis.stackexchange.com/questions/tagged/open-data-cube).

If you would like to report an issue with this script, you can file one
on GitHub (https://github.com/GeoscienceAustralia/dea-notebooks/issues/new).

Last modified: April 2023
"""

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from scipy import stats


def eval_metrics(x, y, round: int = 3, all_regress: bool = False) -> pd.Series:
    """
    Calculate a set of common statistical metrics based on two input
    actual and predicted vectors.

    These include:

    - Pearson correlation
    - Root Mean Squared Error
    - Mean Absolute Error
    - R-squared (the squared `rvalue` of the linear regression)
    - Bias (mean of predicted minus actual, i.e. ``mean(y - x)``)
    - Linear regression parameters (slope, p-value, intercept,
      standard error), from regressing `y` on `x`

    Any pair in which either `x` or `y` is NaN is dropped before the
    metrics are calculated.

    Parameters
    ----------
    x : numpy.array
        An array providing "actual" variable values
    y : numpy.array
        An array providing "predicted" variable values
    round : int
        Number of decimal places to round each metric to. Defaults to 3
    all_regress : bool
        Whether to return linear regression p-value, intercept and
        standard error (in addition to only regression slope).
        Defaults to False

    Returns
    -------
    A pandas.Series containing calculated metrics
    """

    # NOTE: the `round` parameter shadows the builtin of the same name
    # inside this function. The name is kept so that existing callers
    # passing `round=` as a keyword keep working.

    # Combine both inputs into one dataframe so that a NaN in either
    # column removes the whole pair
    xy_df = pd.DataFrame({"x": x, "y": y}).dropna()

    # Compute linear regression of predicted (y) against actual (x)
    lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)

    # Calculate statistics
    stats_dict = {
        "Correlation": xy_df.corr().iloc[0, 1],
        "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
        "MAE": mean_absolute_error(xy_df.x, xy_df.y),
        "R-squared": lin_reg.rvalue**2,
        "Bias": (xy_df.y - xy_df.x).mean(),
        "Regression slope": lin_reg.slope,
    }

    # Additional regression params
    if all_regress:
        stats_dict.update(
            {
                "Regression p-value": lin_reg.pvalue,
                "Regression intercept": lin_reg.intercept,
                "Regression standard error": lin_reg.stderr,
            }
        )

    # Return as a pandas.Series rounded to the requested precision
    return pd.Series(stats_dict).round(round)