Source code for HAlphaAnomalyzer._anova_analysis

# Copyright (C) 2024  Mahsa Khazaei, Heba Mahdi, Azim Ahmadzadeh

# This file is part of H-Alpha Anomalyzer.
#
# H-Alpha Anomalyzer is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# H-Alpha Anomalyzer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with H-Alpha Anomalyzer. If not, see <https://www.gnu.org/licenses/>.


import pandas as pd
from scipy.stats import f_oneway
from tqdm import tqdm


[docs]def _preprocess_and_filter_data(data_with_ranges): """ Computes the S statistic and separates data by anomaly labels. This function calculates the S statistic as the sum of absolute deviations between candidate range values and the average pixel values for each grid cell of the training images data. It then separates the data into two DataFrames based on the anomaly label: one for non-anomalous (label 0) and one for anomalous (label 1) data. Parameters ---------- data_with_ranges : pd.DataFrame The DataFrame with candidate ranges for each grid cell of the training images data. Returns ------- df_label_0 : pd.DataFrame A DataFrame with computed S statistics for non-anomalous data (label 0). df_label_1 : pd.DataFrame A DataFrame with computed S statistics for anomalous data (label 1). """ upper_deviation = data_with_ranges['upper_range_val'] - \ data_with_ranges['cell_pixel_avg'] lower_deviation = data_with_ranges['lower_range_val'] - \ data_with_ranges['cell_pixel_avg'] data_with_ranges['S'] = upper_deviation.abs() + lower_deviation.abs() df_label_0 = data_with_ranges[data_with_ranges['label'] == 0] df_label_1 = data_with_ranges[data_with_ranges['label'] == 1] return df_label_0, df_label_1
def _anova_ftest(data_with_ranges, grid_size=8, lower_range_end=20,
                 upper_range_start=80, step_size=2):
    """
    Performs One-way ANOVA F-test on S statistics across grid cells and
    candidate ranges.

    For every combination of grid cell ``(row, column)`` and candidate range
    ``(lower_range, upper_range)``, the S statistics of the non-anomalous
    records (label 0) are compared against those of the anomalous records
    (label 1) with ``scipy.stats.f_oneway``.

    Parameters
    ----------
    data_with_ranges : pd.DataFrame
        The DataFrame with candidate ranges for each grid cell of the
        training images data. Besides the columns needed to compute the S
        statistic, it must contain ``'row'``, ``'column'``, ``'lower_range'``
        and ``'upper_range'``.
    grid_size : int, optional
        The number of rows and columns to divide each image into, by
        default 8.
    lower_range_end : int, optional
        The end (exclusive) of candidate lower ranges, which start at 0, by
        default 20.
    upper_range_start : int, optional
        The start of candidate upper ranges, which end at 100 (exclusive),
        by default 80.
    step_size : int, optional
        The step size for candidate ranges, by default 2.

    Returns
    -------
    df_anova_results : pd.DataFrame
        One row per combination of grid cell and candidate range, ordered by
        row, column, lower range and upper range, with the columns ``'row'``,
        ``'column'``, ``'lower_range'``, ``'upper_range'``,
        ``'lower_range_val'``, ``'upper_range_val'`` and ``'f_statistic'``.

    Notes
    -----
    ``'lower_range_val'`` and ``'upper_range_val'`` are taken from the first
    non-anomalous record of the combination. When a combination has no
    non-anomalous records, both are reported as NaN instead of raising an
    ``IndexError``. ``scipy.stats.f_oneway`` is still called in that case
    and is documented to return NaN for the F statistic when an input is
    empty.
    """
    df_label_0, df_label_1 = _preprocess_and_filter_data(data_with_ranges)

    # Partition each label's records once, keyed by
    # (row, column, lower_range, upper_range). Looking a combination up in
    # these dicts replaces a four-column boolean scan of the whole frame for
    # every single combination.
    key_columns = ['row', 'column', 'lower_range', 'upper_range']
    groups_label_0 = {
        key: group
        for key, group in df_label_0.groupby(key_columns, sort=False)
    }
    groups_label_1 = {
        key: group
        for key, group in df_label_1.groupby(key_columns, sort=False)
    }

    # Zero-row frames with the right columns, standing in for combinations
    # that have no records for a label.
    no_rows_label_0 = df_label_0.iloc[:0]
    no_rows_label_1 = df_label_1.iloc[:0]

    lower_ranges = range(0, lower_range_end, step_size)
    upper_ranges = range(upper_range_start, 100, step_size)

    results = []
    for row in tqdm(range(grid_size), desc="Performing One-way ANOVA F-test"):
        for column in range(grid_size):
            for lower_range in lower_ranges:
                for upper_range in upper_ranges:
                    key = (row, column, lower_range, upper_range)
                    subset_label_0 = groups_label_0.get(key, no_rows_label_0)
                    subset_label_1 = groups_label_1.get(key, no_rows_label_1)

                    f_statistic, _ = f_oneway(
                        subset_label_0['S'].values,
                        subset_label_1['S'].values,
                    )

                    if len(subset_label_0):
                        lower_range_val = \
                            subset_label_0['lower_range_val'].values[0]
                        upper_range_val = \
                            subset_label_0['upper_range_val'].values[0]
                    else:
                        # No non-anomalous record to read the range values
                        # from; keep the row so the output stays complete.
                        lower_range_val = float('nan')
                        upper_range_val = float('nan')

                    results.append([
                        row, column, lower_range, upper_range,
                        lower_range_val, upper_range_val, f_statistic
                    ])

    columns = [
        'row', 'column', 'lower_range', 'upper_range',
        'lower_range_val', 'upper_range_val', 'f_statistic'
    ]
    df_anova_results = pd.DataFrame(results, columns=columns)
    return df_anova_results