metrics section draft
@@ -765,27 +765,22 @@ We adapted the baseline implementations to our data loader and input format and
\paragraph{Evaluation Metrics}
As discussed in Section~\ref{sec:preprocessing}, the evaluation of model performance in our setup is challenging due to the lack of analog ground truth. Instead, we rely on binary labels that are additionally noisy and subjective. All models under consideration output continuous anomaly scores: DeepSAD produces a positive-valued distance to a hypersphere center, Isolation Forest computes deviations from mean tree depths (which may be negative), and OCSVM yields a signed distance from the decision boundary. Because these scores differ in scale and sign, and due to the lack of a meaningful threshold for degradation, it is not appropriate to evaluate performance using metrics such as accuracy or F1 score, both of which require classification at a fixed threshold.
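For illustration, the following minimal sketch uses placeholder data and scikit-learn-style calls (not the project's actual pipeline); the DeepSAD line is pseudocode, since its score is the distance of an embedding to the learned hypersphere center.

\begin{verbatim}
# Minimal sketch with placeholder data -- not the project's pipeline.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
X_train = rng.normal(size=(1000, 16))   # placeholder feature vectors
X_test  = rng.normal(size=(200, 16))

# Isolation Forest: score derived from mean tree depth; may be negative,
# and higher values mean "more normal" in scikit-learn's convention.
iso = IsolationForest(random_state=0).fit(X_train)
iso_scores = -iso.score_samples(X_test)          # flip: higher = more anomalous

# OCSVM: signed distance to the decision boundary (positive = inlier).
ocsvm = OneClassSVM(nu=0.05).fit(X_train)
ocsvm_scores = -ocsvm.decision_function(X_test)  # flip: higher = more anomalous

# DeepSAD (pseudocode): squared distance of the embedding to the
# hypersphere center c, therefore always non-negative.
# deepsad_scores = ((encoder(X_test) - c) ** 2).sum(axis=1)
\end{verbatim}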
Instead, we adopt threshold-independent evaluation curves that illustrate model behavior across all possible decision thresholds. The most commonly used curve for this is the Receiver Operating Characteristic (ROC)~\cite{roc}, along with its scalar summary metric, ROC AUC. ROC curves plot the true positive rate (TPR) against the false positive rate (FPR), giving insight into how well the model distinguishes between classes as the threshold changes. However, as pointed out in~\cite{roc_vs_prc2,roc_vs_prc} and confirmed in our own testing, ROC AUC can be overly optimistic in the presence of strong class imbalance, which is typical of anomaly detection tasks such as ours.
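As a sketch (placeholder names: binary labels \texttt{y\_true} with 1 marking degraded samples, and continuous \texttt{scores} where higher means more anomalous), the ROC curve and its AUC can be obtained from the scores without fixing any threshold:

\begin{verbatim}
# Sketch: ROC curve and ROC AUC from continuous anomaly scores.
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

y_true = np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1])               # placeholder labels
scores = np.array([.1, .2, .15, .3, .25, .9, .4, .7, .35, .8])  # placeholder scores

fpr, tpr, thresholds = roc_curve(y_true, scores)  # TPR over FPR, all thresholds
roc_auc = roc_auc_score(y_true, scores)           # threshold-free scalar summary
\end{verbatim}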
To mitigate this, we instead use Precision-Recall Curves (PRC)~\cite{prc}, which better reflect model performance on the minority class. PRC plots precision (the proportion of predicted anomalies that are truly anomalous) over recall (the proportion of true anomalies that were identified). As the decision threshold is lowered, recall increases while precision typically decreases due to the inclusion of more false positives. This tradeoff is visualized over the entire threshold range. The metric definitions are as follows:
\[
\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}, \quad
\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}.
\]
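The full precision-recall trade-off can then be traced over every score threshold; a minimal sketch reusing the placeholder \texttt{y\_true} and \texttt{scores} from above:

\begin{verbatim}
# Sketch: precision and recall over all thresholds (placeholder inputs).
from sklearn.metrics import precision_recall_curve

# Example at one fixed threshold: TP = 40, FP = 10, FN = 160
#   precision = 40 / (40 + 10) = 0.8,  recall = 40 / (40 + 160) = 0.2
precision, recall, thresholds = precision_recall_curve(y_true, scores)
\end{verbatim}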
In our evaluations, we found this difference to be practically significant. As shown in Figure~\ref{fig:roc_vs_prc}, ROC AUC values for both Isolation Forest and DeepSAD are similar (0.693 vs. 0.782), suggesting comparable performance. However, the PRC reveals a stark contrast in quality: while DeepSAD maintains high precision, Isolation Forest quickly accumulates false positives as recall increases. The resulting Average Precision (AP)—the area under the PRC—is markedly lower for Isolation Forest (0.207 vs. 0.633), accurately reflecting its poor ranking behavior in the presence of class imbalance.
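For completeness, AP can be computed as a step-wise weighted mean of precisions, where each step is weighted by the recall gained at that threshold (this is, for instance, the estimator implemented by scikit-learn's \texttt{average\_precision\_score}):
\[
\text{AP} = \sum_{n} \left(R_n - R_{n-1}\right) P_n,
\]
where $P_n$ and $R_n$ denote precision and recall at the $n$-th score threshold.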
\figc{roc_vs_prc}{figures/setup_roc_vs_prc.png}{Comparison of ROC and PRC for the same evaluation run. ROC fails to highlight the poor performance of Isolation Forest, which falsely classifies many normal samples as anomalous at lower thresholds. The PRC exposes this behavior, leading to a much lower AP for Isolation Forest than DeepSAD.}{width=.9\textwidth}
In conclusion, the combination of unreliable thresholds and significant class imbalance makes traditional thresholded metrics unsuitable. PRC and AP, on the other hand, offer a theoretically sound and empirically validated alternative, which is why we chose to report them.
\newsection{setup_experiments_environment}{Experiment Overview \& Computational Environment}
@@ -684,6 +684,62 @@ article{ef_concept_source,
  year    = {1986},
  month   = dec,
  pages   = {56--68},
},

@article{roc_vs_prc2,
  title     = {Context discovery for anomaly detection},
  volume    = {19},
  ISSN      = {2364-4168},
  url       = {http://dx.doi.org/10.1007/s41060-024-00586-x},
  DOI       = {10.1007/s41060-024-00586-x},
  number    = {1},
  journal   = {International Journal of Data Science and Analytics},
  publisher = {Springer Science and Business Media LLC},
  author    = {Calikus, Ece and Nowaczyk, Slawomir and Dikmen, Onur},
  year      = {2024},
  month     = jun,
  pages     = {99--113},
},

@article{roc_vs_prc,
  title     = {On the evaluation of unsupervised outlier detection: measures, datasets, and an empirical study},
  volume    = {30},
  ISSN      = {1573-756X},
  url       = {http://dx.doi.org/10.1007/s10618-015-0444-8},
  DOI       = {10.1007/s10618-015-0444-8},
  number    = {4},
  journal   = {Data Mining and Knowledge Discovery},
  publisher = {Springer Science and Business Media LLC},
  author    = {Campos, Guilherme O. and Zimek, Arthur and Sander, J\"{o}rg and Campello, Ricardo J. G. B. and Micenková, Barbora and Schubert, Erich and Assent, Ira and Houle, Michael E.},
  year      = {2016},
  month     = jan,
  pages     = {891--927},
},

@inproceedings{roc,
  title        = {Basic principles of ROC analysis},
  author       = {Metz, Charles E},
  booktitle    = {Seminars in nuclear medicine},
  volume       = {8},
  number       = {4},
  pages        = {283--298},
  year         = {1978},
  organization = {Elsevier},
},

@article{prc,
  title     = {A critical investigation of recall and precision as measures of retrieval system performance},
  volume    = {7},
  ISSN      = {1558-2868},
  url       = {http://dx.doi.org/10.1145/65943.65945},
  DOI       = {10.1145/65943.65945},
  number    = {3},
  journal   = {ACM Transactions on Information Systems},
  publisher = {Association for Computing Machinery (ACM)},
  author    = {Raghavan, Vijay and Bollmann, Peter and Jung, Gwang S.},
  year      = {1989},
  month     = jul,
  pages     = {205--229},
}