# Copyright (C) 2024 Mahsa Khazaei, Heba Mahdi, Azim Ahmadzadeh
# This file is part of H-Alpha Anomalyzer.
#
# H-Alpha Anomalyzer is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# H-Alpha Anomalyzer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with H-Alpha Anomalyzer. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
from scipy.stats import f_oneway
from tqdm import tqdm
[docs]def _preprocess_and_filter_data(data_with_ranges):
"""
Computes the S statistic and separates data by anomaly labels.
This function calculates the S statistic as the sum of absolute deviations
between candidate range values and the average pixel values for each grid
cell of the training images data. It then separates the data into two
DataFrames based on the anomaly label: one for non-anomalous (label 0) and
one for anomalous (label 1) data.
Parameters
----------
data_with_ranges : pd.DataFrame
The DataFrame with candidate ranges for each grid cell of the training
images data.
Returns
-------
df_label_0 : pd.DataFrame
A DataFrame with computed S statistics for non-anomalous data (label
0).
df_label_1 : pd.DataFrame
A DataFrame with computed S statistics for anomalous data (label 1).
"""
upper_deviation = data_with_ranges['upper_range_val'] - \
data_with_ranges['cell_pixel_avg']
lower_deviation = data_with_ranges['lower_range_val'] - \
data_with_ranges['cell_pixel_avg']
data_with_ranges['S'] = upper_deviation.abs() + lower_deviation.abs()
df_label_0 = data_with_ranges[data_with_ranges['label'] == 0]
df_label_1 = data_with_ranges[data_with_ranges['label'] == 1]
return df_label_0, df_label_1
[docs]def _anova_ftest(data_with_ranges, grid_size=8, lower_range_end=20,
upper_range_start=80, step_size=2):
"""
Performs One-way ANOVA F-test on S statistics across grid cells and
candidate ranges.
This function calculates the One-way ANOVA F-test statistic between
anomalous and non-anomalous images for each combination of grid cell and
candidate range of the training images data using the S statistics.
Parameters
----------
data_with_ranges : pd.DataFrame
The DataFrame with candidate ranges for each grid cell of the training
images data.
grid_size : int, optional
The number of rows and columns to divide each image into, by default 8.
lower_range_end : int, optional
The end of candidate lower ranges, by default 20.
upper_range_start : int, optional
The start of candidate upper ranges, by default 80.
step_size : int, optional
The step size for candidate ranges, by default 2.
Returns
-------
df_anova_results : pd.DataFrame
A DataFrame with computed F-statistic for each combination of grid
cell and candidate range of the training images data.
"""
df_label_0, df_label_1 = _preprocess_and_filter_data(data_with_ranges)
df0 = df_label_0.set_index(['row', 'column', 'lower_range', 'upper_range']).sort_index()
df1 = df_label_1.set_index(['row', 'column', 'lower_range', 'upper_range']).sort_index()
results = []
for row in tqdm(range(grid_size), desc="Performing One-way ANOVA F-test"):
for column in range(grid_size):
for lower_range in range(0, lower_range_end, step_size):
for upper_range in range(upper_range_start, 100, step_size):
subset_label_0 = df0.loc[(row, column, lower_range, upper_range)]
subset_label_1 = df1.loc[(row, column, lower_range, upper_range)]
data_label_0 = subset_label_0['S'].values
data_label_1 = subset_label_1['S'].values
f_statistic, _ = f_oneway(*[data_label_0,
data_label_1])
results.append([
row, column, lower_range, upper_range,
subset_label_0['lower_range_val'].values[0],
subset_label_0['upper_range_val'].values[0],
f_statistic
])
columns = [
'row', 'column', 'lower_range', 'upper_range',
'lower_range_val', 'upper_range_val', 'f_statistic'
]
df_anova_results = pd.DataFrame(results, columns=columns)
return df_anova_results