This commit is contained in:
Jan Kowalczyk
2025-09-29 18:20:30 +02:00
parent a6f5ecaba2
commit d5f5a09d6f
6 changed files with 239 additions and 98 deletions

View File

@@ -1163,6 +1163,7 @@
\strng{authornamehash}{2348f5826634af872a0634ea83f5916a}
\strng{authorfullhash}{6bbe9b21a1058838c2696c645e510766}
\strng{authorfullhashraw}{6bbe9b21a1058838c2696c645e510766}
\field{extraname}{1}
\field{sortinit}{3}
\field{sortinithash}{ad6fe7482ffbd7b9f99c9e8b5dccd3d7}
\field{labelnamesource}{author}
@@ -1830,6 +1831,130 @@
\verb http://dx.doi.org/10.3389/fmars.2021.717184
\endverb
\endentry
\entry{mobilenet}{misc}{}{}
\name{author}{8}{}{%
{{hash=0cedb03f907400fc304fdfaa1f7e2085}{%
family={Howard},
familyi={H\bibinitperiod},
given={Andrew\bibnamedelima G.},
giveni={A\bibinitperiod\bibinitdelim G\bibinitperiod}}}%
{{hash=d767e8e4d733bcf728bcdf2c193462f7}{%
family={Zhu},
familyi={Z\bibinitperiod},
given={Menglong},
giveni={M\bibinitperiod}}}%
{{hash=31960f03389184b7f052f5b197cc9fdf}{%
family={Chen},
familyi={C\bibinitperiod},
given={Bo},
giveni={B\bibinitperiod}}}%
{{hash=6cbb997a11c6922af719c32863261918}{%
family={Kalenichenko},
familyi={K\bibinitperiod},
given={Dmitry},
giveni={D\bibinitperiod}}}%
{{hash=47ad65c82b1de7d642988df185d7d8ea}{%
family={Wang},
familyi={W\bibinitperiod},
given={Weijun},
giveni={W\bibinitperiod}}}%
{{hash=7dcb9c6d4d4251a7e32b502d03c9354b}{%
family={Weyand},
familyi={W\bibinitperiod},
given={Tobias},
giveni={T\bibinitperiod}}}%
{{hash=8f221f2afb0b5a3d95b3e97101924922}{%
family={Andreetto},
familyi={A\bibinitperiod},
given={Marco},
giveni={M\bibinitperiod}}}%
{{hash=c707ec5b5997dc408a14a34a8380166c}{%
family={Adam},
familyi={A\bibinitperiod},
given={Hartwig},
giveni={H\bibinitperiod}}}%
}
\list{publisher}{1}{%
{arXiv}%
}
\strng{namehash}{e1fc6cab9b6009340e110518e53868c4}
\strng{fullhash}{cffcf38c642164887a370768f5701b8e}
\strng{fullhashraw}{cffcf38c642164887a370768f5701b8e}
\strng{bibnamehash}{cffcf38c642164887a370768f5701b8e}
\strng{authorbibnamehash}{cffcf38c642164887a370768f5701b8e}
\strng{authornamehash}{e1fc6cab9b6009340e110518e53868c4}
\strng{authorfullhash}{cffcf38c642164887a370768f5701b8e}
\strng{authorfullhashraw}{cffcf38c642164887a370768f5701b8e}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{title}{MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}
\field{year}{2017}
\verb{doi}
\verb 10.48550/ARXIV.1704.04861
\endverb
\verb{urlraw}
\verb https://arxiv.org/abs/1704.04861
\endverb
\verb{url}
\verb https://arxiv.org/abs/1704.04861
\endverb
\keyw{Computer Vision and Pattern Recognition (cs.CV),FOS: Computer and information sciences,FOS: Computer and information sciences}
\endentry
\entry{shufflenet}{inproceedings}{}{}
\name{author}{4}{}{%
{{hash=5e72bc22dbcf0984c6d113d280e36990}{%
family={Zhang},
familyi={Z\bibinitperiod},
given={Xiangyu},
giveni={X\bibinitperiod}}}%
{{hash=5c1e75bf6f2b5386a54aff442f04d5cf}{%
family={Zhou},
familyi={Z\bibinitperiod},
given={Xinyu},
giveni={X\bibinitperiod}}}%
{{hash=a24609d7dfcf8f716059c40ed11ba974}{%
family={Lin},
familyi={L\bibinitperiod},
given={Mengxiao},
giveni={M\bibinitperiod}}}%
{{hash=f85751488058842b5777c7b4074077b5}{%
family={Sun},
familyi={S\bibinitperiod},
given={Jian},
giveni={J\bibinitperiod}}}%
}
\list{publisher}{1}{%
{IEEE}%
}
\strng{namehash}{d8f4dadb2f7478b1e29f1f5f541e5a3e}
\strng{fullhash}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{fullhashraw}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{bibnamehash}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{authorbibnamehash}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{authornamehash}{d8f4dadb2f7478b1e29f1f5f541e5a3e}
\strng{authorfullhash}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{authorfullhashraw}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\field{extraname}{2}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{booktitle}{2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition}
\field{month}{6}
\field{title}{ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices}
\field{year}{2018}
\verb{doi}
\verb 10.1109/cvpr.2018.00716
\endverb
\verb{urlraw}
\verb http://dx.doi.org/10.1109/CVPR.2018.00716
\endverb
\verb{url}
\verb http://dx.doi.org/10.1109/CVPR.2018.00716
\endverb
\endentry
\enddatalist
\endrefsection
\endinput

Binary file not shown.

View File

@@ -399,16 +399,17 @@ DeepSAD relies on several tuneable hyperparameters that influence different stag
\item \textbf{Latent space dimensionality $\mathbb{R}^d$} \\
The size of the latent bottleneck is a critical parameter. If $\mathbb{R}^d$ is too small, the network cannot encode all relevant information, leading to information loss and weak representations. If $\mathbb{R}^d$ is too large, the network risks overfitting by encoding irrelevant detail, while also increasing computational cost. These insights stem from autoencoder literature \cite{deep_learning_book}, but it is unclear whether they apply directly to DeepSAD: here the autoencoder serves only for pretraining, and the encoder is subsequently fine-tuned with a different objective. Thus, the optimal choice of $\mathbb{R}^d$ may not coincide with the value that would be ideal for autoencoder reconstruction alone.
\item \textbf{Label weighting $\eta$} \\
The parameter $\eta$ controls the relative contribution of labeled versus unlabeled data in the DeepSAD objective. With $\eta = 1$, both groups contribute equally (normalized by their sample counts). Larger values of $\eta$ emphasize the labeled data, pulling labeled normals closer to the center and pushing labeled anomalies further away. Smaller values emphasize the unlabeled data, effectively reducing the influence of labels. Its impact depends not only on its numerical value but also on the quantity and quality of available labels.
\item \textbf{Learning rates $L_A$ and $L_M$} \\
Two learning rates are defined: $L_A$ for the autoencoder pretraining and $L_M$ for the main DeepSAD training. The learning rate sets the step size used during gradient descent updates and thereby controls the stability and speed of training. If it is too high, the optimization may diverge or oscillate; if too low, convergence becomes excessively slow and may get stuck in poor local minima. Adaptive learning-rate schemes such as Adam can mitigate poor choices.
\item \textbf{Number of epochs $E_A$ and $E_M$} \\
The number of training epochs specifies how many full passes over the dataset are made in pretraining ($E_A$) and in the main DeepSAD training ($E_M$). More epochs allow the model to fit more closely to the training data, but also increase the risk of overfitting to noise or mislabeled samples. In practice, the effective number of epochs depends on dataset size, network architecture, and whether early stopping is applied.
\item \textbf{Regularization rate $\lambda$} \\
The weight of the regularization term; $\lambda$ must be greater than 0 for regularization to take effect. A higher value decreases the chance of overfitting at the cost of reduced model expressiveness. How $\eta$ and $\lambda$ enter the training objective is sketched directly after this list.
\end{itemize}
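To make the roles of $\eta$ and $\lambda$ explicit, the DeepSAD training objective can be summarized as follows (condensed from the original DeepSAD formulation; the notation may differ slightly from section~\ref{sec:algorithm_details}):
\[
\min_{\mathcal{W}} \; \frac{1}{n+m}\sum_{i=1}^{n}\bigl\|\phi(\mathbf{x}_i;\mathcal{W})-\mathbf{c}\bigr\|^{2}
+ \frac{\eta}{n+m}\sum_{j=1}^{m}\Bigl(\bigl\|\phi(\tilde{\mathbf{x}}_j;\mathcal{W})-\mathbf{c}\bigr\|^{2}\Bigr)^{\tilde{y}_j}
+ \frac{\lambda}{2}\sum_{\ell=1}^{L}\bigl\|\mathbf{W}^{\ell}\bigr\|_{F}^{2},
\]
where the $n$ unlabeled samples $\mathbf{x}_i$ are pulled towards the hypersphere center $\mathbf{c}$, the $m$ labeled samples $\tilde{\mathbf{x}}_j$ with labels $\tilde{y}_j \in \{-1,+1\}$ are weighted by $\eta$ (labeled anomalies, $\tilde{y}_j = -1$, are pushed away from $\mathbf{c}$ through the inverted exponent), and $\lambda$ scales the weight-decay term over the network weights $\mathcal{W} = \{\mathbf{W}^{1},\dots,\mathbf{W}^{L}\}$.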
\newchapter{data_preprocessing}{Data and Preprocessing}
Situations such as earthquakes, structural failures, and other emergencies that require rescue robots are fortunately rare. When these operations do occur, the primary focus is on the rapid and safe rescue of survivors rather than on data collection. Consequently, there is a scarcity of publicly available data from such scenarios. To improve any method, however, a large, diverse, and high-quality dataset is essential for comprehensive evaluation. This challenge is further compounded in our work, as we evaluate a training-based approach that imposes even higher demands on the data, especially requiring a great deal of diverse training samples, making it difficult to find a suitable dataset.
In this chapter, we outline the specific requirements we established for the data, describe the dataset selected for this task—including key statistics and notable features—and explain the preprocessing steps applied for training and evaluating the methods.
@@ -501,18 +502,18 @@ In the anomalous experiments, the artificial smoke machine appears to have been
The figures~\ref{fig:data_screenshot_pointcloud}~and~\ref{fig:data_screenshot_camera} show a representative depiction of the environment of the experiments as a camera image of the IR camera and the point cloud created by the OS1 lidar sensor at practically the same time.
\fig{data_screenshot_pointcloud}{figures/data_screenshot_pointcloud.png}{Screenshot of 3D rendering of an experiment's point cloud produced by the OS1-32 lidar sensor without smoke and with illumination (same frame and roughly same alignment as figure~\ref{fig:data_screenshot_camera}). Point color corresponds to measurement range and the axes in the center of the figure mark the lidar's position.}
\fig{data_screenshot_camera}{figures/data_screenshot_camera.png}{Screenshot of IR camera output of an experiment without smoke and with illumination (same frame and roughly same alignment as figure~\ref{fig:data_screenshot_pointcloud}).}
Regarding the dataset volume, the 10 normal experiments ranged from 88.7 to 363.1 seconds, with an average duration of 157.65 seconds. At a capture rate of 10 frames per second, these experiments yield 15,765 non-degraded point clouds. In contrast, the 4 anomalous experiments, including one stationary experiment lasting 11.7 seconds and another extending to 62.1 seconds, averaged 47.33 seconds, resulting in 1,893 degraded point clouds. In total, the dataset comprises 17,658 point clouds, with approximately 89.28\% classified as non-degraded (normal) and 10.72\% as degraded (anomalous). The distribution of experimental data is visualized in figure~\ref{fig:data_points_pie}.
\fig{data_points_pie}{figures/data_points_pie.png}{Pie chart visualizing the amount and distribution of normal and anomalous point clouds in \cite{subter}.}
The artificial smoke introduces measurable changes that clearly separate the \textit{anomalous} runs from the \textit{normal} baseline. One change is a larger share of missing points per scan: smoke particles scatter or absorb the laser beam before it reaches a solid target, so the sensor reports an error instead of a distance. Figure~\ref{fig:data_missing_points} shows the resulting rightward shift of the missing-point histogram, a known effect for lidar sensors in aerosol-filled environments. Another demonstrative effect is the appearance of many spurious returns very close to the sensor; these near-field points arise when back-scatter from the aerosol itself is mistaken for a surface echo. The box plot in figure~\ref{fig:particles_near_sensor} confirms a pronounced increase in sub-50 cm hits under smoke, a range at which we do not expect any non-erroneous measurements. Both effects are consistent with the behaviour reported in \citetitle{when_the_dust_settles}~\cite{when_the_dust_settles}.
\fig{data_missing_points}{figures/data_missing_points.png}{Density histogram showing the percentage of missing measurements per scan for normal experiments without degradation and anomalous experiments with artificial smoke introduced as degradation.}
\fig{particles_near_sensor}{figures/particles_near_sensor_boxplot_zoomed_500.png}{Box diagram depicting the percentage of measurements closer than 50 centimeters to the sensor for normal and anomalous experiments.}
Taken together, the percentage of missing points and the proportion of near-sensor returns provide a concise indication of how strongly the smoke degrades our scans—capturing the two most prominent aerosol effects, drop-outs and back-scatter spikes. They do not, however, reveal the full error landscape discussed earlier (compound errors, temperature drift, multipath, \dots), so they should be read as an easily computed synopsis rather than an exhaustive measure of lidar quality. Next we will discuss how the lidar scans were preprocessed before use and how we actually assigned ground-truth labels to each scan, so we could train and evaluate our degradation quantification methods.
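To make these two indicators concrete, the following minimal Python sketch computes them for a single projected scan; the assumed array layout (a $32 \times 2048$ image of metric ranges with $0$ marking missing returns) is for illustration only and does not correspond to a specific function in our codebase.
\begin{verbatim}
import numpy as np

def degradation_indicators(range_img: np.ndarray) -> tuple[float, float]:
    """Per-scan degradation indicators from a (32, 2048) range image.

    range_img holds measured ranges in meters, with 0 encoding a
    missing (dropped) return. Returns percentages of the full scan.
    """
    total = range_img.size
    missing = np.count_nonzero(range_img == 0)
    # Near-sensor returns: valid hits closer than 0.5 m, which we
    # attribute to back-scatter from the aerosol itself.
    near = np.count_nonzero((range_img > 0) & (range_img < 0.5))
    return 100.0 * missing / total, 100.0 * near / total
\end{verbatim}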
@@ -523,16 +524,16 @@ As described in Section~\ref{sec:algorithm_description}, the method under evalua
For this reason and to simplify the architecture, we converted the point clouds into two-dimensional grayscale images using a spherical projection. This approach—proven successful in related work~\cite{degradation_quantification_rain}—encodes each lidar measurement as a single pixel, where the pixel's grayscale value is determined by the reciprocal range, calculated as $v = \frac{1}{\sqrt{X^2 + Y^2 + Z^2}}$. Given the lidar sensor's configuration, the resulting images have a resolution of 2048 pixels in width and 32 pixels in height. Missing measurements in the point cloud are mapped to pixels with a brightness value of $v = 0$.
To create this mapping, we leveraged the available measurement indices and channel information inherent in the dense point clouds, which are ordered from 0 to 65,535 in a horizontally ascending, channel-by-channel manner. For sparse point clouds without such indices, one would need to rely on the pitch and yaw angles relative to the sensor's origin to correctly map each point to its corresponding pixel, although this often leads to ambiguous mappings due to numerical errors in angle estimation.
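A minimal sketch of this index-based projection is given below; the function name and the exact field layout of the point cloud are illustrative assumptions rather than a verbatim excerpt of our preprocessing code.
\begin{verbatim}
import numpy as np

H, W = 32, 2048  # vertical channels x measurements per rotation

def project_scan(xyz, channel, meas_idx):
    """Project one dense lidar scan to a (H, W) reciprocal-range image.

    xyz:      (N, 3) Cartesian coordinates, zeros for missing returns
    channel:  (N,) vertical channel index in [0, H)
    meas_idx: (N,) running measurement index in [0, H*W)
    """
    img = np.zeros((H, W), dtype=np.float32)   # 0 = missing measurement
    rng = np.linalg.norm(xyz, axis=1)          # Euclidean range per point
    valid = rng > 0
    rows = channel[valid]
    cols = meas_idx[valid] % W                 # column within the 360-degree sweep
    img[rows, cols] = 1.0 / rng[valid]         # reciprocal range as pixel value
    return img
\end{verbatim}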
Figure~\ref{fig:data_projections} displays two examples of lidar point cloud projections to aid in the reader's understanding. Although the original point clouds were converted into grayscale images with a resolution of 2048×32 pixels, these raw images can be challenging to interpret. To enhance human readability, we applied the viridis colormap and vertically stretched the images so that each measurement occupies multiple pixels in height. The top projection is derived from a scan without artificial smoke—and therefore minimal degradation—while the lower projection comes from an experiment where artificial smoke introduced significant degradation.
\fig{data_projections}{figures/data_2d_projections.png}{Two-dimensional projections of two point clouds, one from an experiment without degradation and one from an experiment with artificial smoke as degradation. To aid the reader's perception, the images are vertically stretched and a colormap has been applied to the pixels' reciprocal range values, while the actual training data is grayscale.}
The remaining challenge was labeling a large enough portion of the dataset in a reasonably accurate manner; the difficulties and our general approach were described in section~\ref{sec:data_req}. Since, to our knowledge, neither our chosen dataset nor any other publicly available one provides objective labels for lidar data degradation in the SAR domain, we had to define our own labeling approach. With objective measures of degradation unavailable, we explored alternative labeling methods, such as using the data's statistical properties like the number of missing measurements per point cloud or the higher incidence of erroneous measurements near the sensor, which we described in section~\ref{sec:data_req}. Ultimately, we were concerned that these statistical approaches might lead the method to simply mimic the statistical evaluation rather than to quantify degradation in a generalized and robust manner. After considering these options, we decided to label all point clouds from experiments with artificial smoke as anomalies, while point clouds from experiments without smoke were labeled as normal data. This labeling strategy—based on the presence or absence of smoke—is fundamentally an environmental indicator, independent of the intrinsic data properties recorded during the experiments.
The simplicity of this labeling approach has both advantages and disadvantages. On the positive side, it is easy to implement and creates a clear distinction between normal and anomalous data. However, its simplicity is also its drawback: some point clouds from experiments with artificial smoke do not exhibit perceptible degradation, yet they are still labeled as anomalies. The reason for this is that during the three non-static anomalous experiments the sensor platform starts recording in a tunnel roughly 20 meters from the smoke machine's location. It begins by approaching the smoke machine, navigates close to the machine for some time and then leaves its perimeter once again. Since the artificial smoke's density is far larger near the machine it originates from, the time the sensor platform spent close to it produced highly degraded point clouds, whereas the beginnings and ends of the anomalous experiments capture point clouds which are subjectively not degraded and appear similar to ones from the normal experiments. This effect is clearly illustrated by the degradation indicators discussed earlier (the proportion of missing points and the number of erroneous points close to the sensor per point cloud), as can be seen in figure~\ref{fig:data_anomalies_timeline}.
\fig{data_anomalies_timeline}{figures/data_combined_anomalies_timeline.png}{Missing points and points with a measured range smaller than 50cm per point cloud over a normalized timeline of the individual experiments. This illustrates the rise, plateau and fall of degradation intensity during the anomalous experiments, owed to the spatial proximity to the degradation source (smoke machine). One of the normal experiments (without artificial smoke) is included as a baseline.}
@@ -540,55 +541,40 @@ Afraid that the incorrectly labeled data may negatively impact DeepSAD's semi-su
\begin{enumerate}
\item \textbf{Experiment-based labels:} All scans from anomalous experiments marked anomalous, including border cases—yielding conservative performance metrics that reflect real-world label noise.
\item \textbf{Manually-defined labels:} Only unequivocally degraded scans marked anomalous—producing near-ideal separation in many cases.
\end{enumerate}
Under both evaluation schemes all frames from normal experiments were marked as normal, since they appear to have produced high-quality data throughout. A visualization of how the two evaluation schemes measure up in terms of numbers of samples per class can be seen in figure~\ref{fig:data_eval_labels}.
\fig{data_eval_labels}{figures/data_eval_labels.png}{Pie charts visualizing the number of normal and anomalous labels applied to the dataset per labeling scheme. A large part of the experiment-based anomalous labels had to be removed for the manually-defined scheme, since the corresponding scans were subjectively either clearly not degraded or only questionably degraded.}
By evaluating and comparing both approaches, we hope to demonstrate a more thorough performance investigation than with only one of the two.
\newchapter{experimental_setup}{Experimental Setup}
We built our experiments starting from the official DeepSAD PyTorch implementation and evaluation framework, available at \url{https://github.com/lukasruff/Deep-SAD-PyTorch}. This codebase provides routines for loading standard datasets, training DeepSAD and several baseline models, and evaluating their performance.
In the following sections, we detail our adaptations to this framework:
\begin{itemize}
\item Data integration: preprocessing and loading the dataset from \citetitle{subter}.
\item Model architecture: configuring DeepSAD's encoder to match our point cloud input format, contrasting two distinct neural network architectures to investigate their impact on the method's output.
\item Training \& evaluation: training DeepSAD alongside two classical baselines—Isolation Forest and One-class SVM (OCSVM)—and comparing their degradation-quantification performance.
\item Experimental environment: the hardware and software stack used, with typical training and inference runtimes.
\end{itemize}
Together, these components define the full experimental pipeline, from data loading and preprocessing through method training to the evaluation and comparison of methods.
\section{Framework \& Data Preparation}
%\newsubsubsectionNoTOC{DeepSAD PyTorch codebase and our adaptations}
DeepSAD's PyTorch implementation—our starting point—includes implementations for training on standardized datasets such as MNIST, CIFAR-10 and datasets from \citetitle{odds}~\cite{odds}. The framework can train and test DeepSAD as well as a number of baseline algorithms (namely SSAD, OCSVM, Isolation Forest, KDE and SemiDGM) on the loaded data and evaluate their performance by calculating the Receiver Operating Characteristic (ROC) and its Area Under the Curve (AUC) for all given algorithms. We adapted this implementation, originally developed for Python 3.7, to work with Python 3.12, added data loading for our chosen dataset, added DeepSAD models that work with the lidar projection datatype, extended the evaluation methods and implemented an inference module.
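The ROC and its AUC reduce to a few scikit-learn calls once per-frame anomaly scores are available; the sketch below uses placeholder arrays purely for illustration.
\begin{verbatim}
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

y_true = np.array([0, 0, 1, 1, 0, 1])               # 1 = anomalous (degraded) frame
scores = np.array([0.1, 0.4, 0.8, 0.9, 0.2, 0.3])   # higher = more anomalous
auc = roc_auc_score(y_true, scores)
fpr, tpr, thresholds = roc_curve(y_true, scores)
print(f"ROC AUC: {auc:.3f}")
\end{verbatim}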
The raw SubTER dataset is provided as one ROS bag file per experiment, each containing the dense 3D point clouds from the Ouster OS1-32 lidar. To streamline training and avoid repeated heavy computation, we project these point clouds offline into 2D “range images” as described in section~\ref{sec:preprocessing} and export them to files as NumPy arrays. Storing precomputed projections allows rapid data loading during training and evaluation. Many modern lidars can be configured to output range images directly, which would bypass the need for post-hoc projection. When available, such native range-image streams can simplify preprocessing or even allow skipping this step completely.
We extended the DeepSAD framework's PyTorch \texttt{DataLoader} by implementing a custom \texttt{Dataset} class that ingests our precomputed NumPy range-image files and attaches appropriate evaluation labels. Each experiment's frames are stored as a single \texttt{.npy} file of shape \((\text{Number of Frames}, H, W)\), containing the point clouds' reciprocal range values. Our \texttt{Dataset} initializer scans a directory of these files, loads the NumPy arrays from file into memory, transforms them into PyTorch tensors and assigns evaluation and training labels accordingly.
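A condensed sketch of such a \texttt{Dataset} follows; the class name, file naming and label handling are simplified assumptions rather than a verbatim excerpt of our implementation.
\begin{verbatim}
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset

class LidarProjectionDataset(Dataset):
    """Loads precomputed (frames, H, W) reciprocal-range images."""

    def __init__(self, root, anomalous_experiments):
        frames, labels = [], []
        for npy_file in sorted(Path(root).glob("*.npy")):
            data = np.load(npy_file)          # shape (N, 32, 2048)
            label = 1 if npy_file.stem in anomalous_experiments else 0
            frames.append(torch.from_numpy(data).float())
            labels.append(torch.full((data.shape[0],), label))
        self.frames = torch.cat(frames)       # (total_frames, 32, 2048)
        self.labels = torch.cat(labels)

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        # Add a channel dimension so the encoder sees (1, H, W) images.
        return self.frames[idx].unsqueeze(0), self.labels[idx], idx
\end{verbatim}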
The first labeling scheme, called \emph{experiment-based labels}, assigns
\[
@@ -611,22 +597,18 @@ To obtain a second source of ground truth, we also support \emph{manually-define
\end{cases}
\]
We pass instances of this \texttt{Dataset} to PyTorch's \texttt{DataLoader}, enabling batch sampling, shuffling, and multi-worker loading. The dataloader returns the preprocessed lidar projection, both evaluation labels and a semi-supervised training label.
To control the supervision of DeepSAD's training, our custom PyTorch \texttt{Dataset} accepts two integer parameters, \texttt{num\_labelled\_normal} and \texttt{num\_labelled\_anomalous}, which specify how many samples of each class should retain their labels during training. We start from the manually-defined evaluation labels, so that mislabeled anomalous frames are not used for the semi-supervision. Then, we randomly un-label (set to 0) enough samples of each class until exactly \texttt{num\_labelled\_normal} normals and \texttt{num\_labelled\_anomalous} anomalies remain labeled.
This mechanism allows us to systematically compare the unsupervised mode, where \texttt{num\_labelled\_normal} = \texttt{num\_labelled\_anomalous} = 0, with semi-supervised modes under varying label budgets.
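A minimal sketch of this un-labeling step is shown below; the $+1$/$-1$/$0$ label convention follows the DeepSAD codebase, while the function and variable names are illustrative.
\begin{verbatim}
import numpy as np

def apply_label_budget(semi_labels, num_labelled_normal,
                       num_labelled_anomalous, seed=0):
    """Keep a fixed budget of labeled samples; set the rest to 0 (unknown).

    semi_labels: array with +1 (labeled normal), -1 (labeled anomalous),
                 0 (unknown).
    """
    rng = np.random.default_rng(seed)
    out = semi_labels.copy()
    for value, budget in ((+1, num_labelled_normal),
                          (-1, num_labelled_anomalous)):
        idx = np.flatnonzero(out == value)
        if len(idx) > budget:
            drop = rng.choice(idx, size=len(idx) - budget, replace=False)
            out[drop] = 0
    return out
\end{verbatim}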
To obtain robust performance estimates on our relatively small dataset, we implement $k$-fold cross-validation. A single integer parameter, \texttt{num\_folds}, controls the number of splits. We use scikit-learn's \texttt{KFold} (from \texttt{sklearn.model\_selection}) with \texttt{shuffle=True} and a fixed random seed to partition each experiment's frames into \texttt{num\_folds} disjoint folds. Training then proceeds across $k$ rounds, each time training on $(k-1)/k$ of the data and evaluating on the remaining $1/k$. In our experiments, we set \texttt{num\_folds=5}, yielding an 80/20 train/evaluation split per fold.
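The fold assignment itself can be sketched as follows; this is a simplified illustration, and in our pipeline the resulting indices feed into the \texttt{Dataset} and \texttt{DataLoader} machinery described above.
\begin{verbatim}
import numpy as np
from sklearn.model_selection import KFold

num_frames = 1000                      # illustrative frame count of one experiment
frame_indices = np.arange(num_frames)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, eval_idx) in enumerate(kfold.split(frame_indices)):
    # Train on (k-1)/k of the frames, evaluate on the held-out 1/k.
    print(f"fold {fold}: {len(train_idx)} train / {len(eval_idx)} eval frames")
\end{verbatim}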
For inference (i.e.\ model validation on held-out experiments), we provide a second \texttt{Dataset} class that loads a single experiment's NumPy file (no k-fold splitting), does not assign any labels to the frames nor does it shuffle frames, preserving temporal order. This setup enables seamless, frame-by-frame scoring of complete runs—crucial for analyzing degradation dynamics over an entire experiment.
\section{Model Configuration \& Evaluation Protocol}
Since the neural network architecture trained in the DeepSAD method is not fixed, as described in section~\ref{sec:algorithm_details}, but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed lidar data projections. Since \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain} reported success in training DeepSAD on similar data, we first adapted the network architecture utilized by them for our use case, which is based on the simple and well understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture for DeepSAD's performance and therefore designed a second network architecture, henceforth referred to as the ``efficient architecture'', which incorporates a few modern techniques befitting our use case.
The LeNet-inspired autoencoder can be split into an encoder network (figure~\ref{fig:setup_arch_lenet_encoder}) and a decoder network (figure~\ref{fig:setup_arch_lenet_decoder}) with a latent space in between the two parts. Such an arrangement is typical for autoencoder architectures, as we discussed in section~\ref{sec:autoencoder}. The encoder network is simultaneously DeepSAD's main training architecture, which is used to infer the degradation quantification in our use case once trained.
\figc{setup_arch_lenet_encoder}{diagrams/arch_lenet_encoder}{
@@ -642,7 +624,7 @@ The LeNet-inspired autoencoder can be split into an encoder network (figure~\ref
representation used by DeepSAD for anomaly detection.
}{width=.8\textwidth}
The LeNet-inspired encoder network (see figure~\ref{fig:setup_arch_lenet_encoder}) is a compact convolutional neural network that reduces image data into a lower-dimensional latent space. It consists of two stages of convolution, normalization, non-linear activation, and pooling, followed by a dense layer that defines the latent representation. Conceptually, the convolutional layers learn small filters that detect visual patterns in the input (such as edges or textures). Batch normalization ensures that these learned signals remain numerically stable during training, while a LeakyReLU activation introduces non-linearity, allowing the network to capture more complex relationships. Pooling operations then downsample the feature maps, which reduces the spatial size of the data and emphasizes the most important features. Finally, a dense layer transforms the extracted feature maps into the latent space.
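For readers who prefer code to diagrams, a minimal PyTorch sketch of such an encoder is given below; the channel counts and latent size are illustrative placeholders rather than the exact values of our implementation (bias-free layers follow the DeepSAD convention of avoiding trivial solutions).
\begin{verbatim}
import torch.nn as nn

class LeNetStyleEncoder(nn.Module):
    """Two conv-norm-activation-pool stages and a dense latent mapping."""

    def __init__(self, latent_dim=32):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=5, padding=2, bias=False),
            nn.BatchNorm2d(8, affine=False),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),        # (1, 32, 2048) -> (8, 16, 1024)
            nn.Conv2d(8, 16, kernel_size=5, padding=2, bias=False),
            nn.BatchNorm2d(16, affine=False),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),        # -> (16, 8, 512)
        )
        self.to_latent = nn.Linear(16 * 8 * 512, latent_dim, bias=False)

    def forward(self, x):
        x = self.features(x)
        return self.to_latent(x.flatten(start_dim=1))
\end{verbatim}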
\figc{setup_arch_lenet_decoder}{diagrams/arch_lenet_decoder}{
Architecture of the LeNet-inspired decoder. The input is a latent vector of dimension $d$,
@@ -671,7 +653,7 @@ To adjust for this, we decided to modify the network architecture and included f
\item \textbf{Non-square convolution kernels.} Depthwise-separable convolutions with kernel size $3 \times 17$ are used instead of square kernels, resulting in an RF of $10 \times 52$ pixels, corresponding to $9.93^{\circ} \times 9.14^{\circ}$, substantially more balanced than the LeNet-inspired network's RF.
\item \textbf{Circular padding along azimuth.} The horizontal axis is circularly padded to respect the wrap-around of $360^{\circ}$ lidar data, preventing artificial seams at the image boundaries.
\item \textbf{Aggressive horizontal pooling.} A $1 \times 4$ pooling operation is applied early in the network, which reduces the over-sampled horizontal resolution (2048~px to 512~px) while keeping vertical detail intact.
\item \textbf{Depthwise-separable convolutions with channel shuffle.} Inspired by MobileNet~\cite{mobilenet} and ShuffleNet~\cite{shufflenet}, this reduces the number of parameters and computations while retaining representational capacity, making the network more suitable for embedded platforms, while simultaneously allowing more learnable channels without increasing computational demand (a minimal sketch of such a block follows after this list).
\item \textbf{Max pooling.} Standard max pooling is used instead of average pooling, since it preserves sharp activations that are often indicative of localized degradation.
\item \textbf{Channel compression before latent mapping.} After feature extraction, a $1 \times 1$ convolution reduces the number of channels before flattening, which lowers the parameter count of the final fully connected layer without sacrificing feature richness.
\end{itemize}
@@ -744,16 +726,18 @@ To compare the computational efficiency of the two architectures we show the num
\label{tab:params_lenet_vs_efficient}
\end{table}
\FloatBarrier
\newsubsubsectionNoTOC{Baseline methods (Isolation Forest, OCSVM)}
To contextualize the performance of DeepSAD, we compare against two widely used baselines: Isolation Forest and OCSVM. Both are included in the original DeepSAD codebase and the associated paper, and they represent well-understood but conceptually different families of anomaly detection. In our setting, the raw input dimensionality ($2048 \times 32$ per frame) is too high for a direct OCSVM fit, so we reuse the DeepSAD autoencoder's \emph{encoder} as a learned dimensionality reduction (to the same latent size as DeepSAD), allowing OCSVM training on this latent space. Together, these two baselines cover complementary perspectives: raw-input tree-based partitioning (Isolation Forest) and dimensionality-reduced kernel-based boundary learning (OCSVM), providing a broad and well-established basis for comparison.
Isolation Forest is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated. Isolation Forest is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.
Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient since no explicit density estimation is required, and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the lidar input representation, providing a strong non-neural baseline for comparison against DeepSAD. Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the lidar input representation, providing a strong non-neural baseline for comparison against DeepSAD.
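As a rough illustration of how such a baseline can be applied to flattened range images, the following is a minimal sketch using scikit-learn's IsolationForest (the data arrays are random placeholders; in our setting each row would be a flattened $2048 \times 32$ frame):
\begin{verbatim}
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X_train = rng.normal(size=(500, 1024))   # placeholder for flattened lidar frames
X_test = rng.normal(size=(100, 1024))

forest = IsolationForest(n_estimators=100, random_state=0)
forest.fit(X_train)                      # "training" = building the forest of random trees

# score_samples is higher for easily "kept" (normal) points, so its negation
# acts as an anomaly score: shorter average path length -> higher score.
anomaly_score = -forest.score_samples(X_test)
\end{verbatim}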
While Isolation Forest relies on random partitioning of the input space, OCSVM takes a very different approach by learning a flexible boundary around normal samples. OCSVM is trained only on data assumed to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies. OCSVM takes a very different approach by learning a flexible boundary around normal samples. It assumes all training data to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies.
The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data. The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data.
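A corresponding sketch for the OCSVM baseline, assuming the frames have already been passed through the pretrained encoder so that each row is a latent vector (the latent codes below are random placeholders, and the kernel and $\nu$ settings are illustrative rather than the exact values used):
\begin{verbatim}
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
Z_train = rng.normal(size=(500, 128))    # placeholder for encoder(X_train) latent codes
Z_test = rng.normal(size=(100, 128))

ocsvm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1)  # RBF boundary around normal data
ocsvm.fit(Z_train)                                        # training data assumed normal

# decision_function is positive inside the learned boundary and negative outside,
# so its negation can be read as an anomaly score.
anomaly_score = -ocsvm.decision_function(Z_test)
\end{verbatim}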
@@ -763,25 +747,21 @@ We adapted the baseline implementations to our data loader and input format, and
\newsection{setup_experiments_environment}{Experiment Overview \& Computational Environment} \newsection{setup_experiments_environment}{Experiment Overview \& Computational Environment}
Across all experiments we vary three factors: (i) latent space dimensionality, (ii) encoder architecture (LeNet-inspired vs. Efficient), and (iii) the amount of semi-supervision (labeling regime). To keep results comparable, we fix the remaining training hyperparameters: all autoencoders are pretrained for 50~epochs at a learning rate of $1\cdot 10^{-5}$; all DeepSAD models are then trained for 150~epochs at the same learning rate. The DeepSAD label-weighting parameter is kept at $\eta = 1$ for all runs. Every configuration is evaluated with 5-fold cross-validation, and we report fold means. Across all experiments we vary three factors: (i) latent space dimensionality, (ii) encoder architecture (LeNet-inspired vs. Efficient), and (iii) the amount of semi-supervision (labeling regime). To keep results comparable, we fix the remaining training hyperparameters: all autoencoders are pretrained for $E_A = 50$~epochs with Adam as the optimizer and a starting learning rate of $L_A = 1\cdot 10^{-5}$; all DeepSAD models are then trained for $E_M = 150$~epochs with the same optimizer and starting learning rate ($L_M = 1\cdot 10^{-5}$). The DeepSAD label-weighting parameter is kept at $\eta = 1$ and the regularization rate at $\lambda = 1\cdot 10^{-6}$ for all runs. Every configuration is evaluated with 5-fold cross-validation, and we report fold means.
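For orientation, these fixed hyperparameters can be collected in a small configuration block; a hedged PyTorch-style sketch (the module is a placeholder, and treating $\lambda$ as Adam weight decay is an assumption for illustration):
\begin{verbatim}
import torch

CONFIG = {
    "ae_epochs": 50,        # E_A: autoencoder pretraining epochs
    "ae_lr": 1e-5,          # L_A: autoencoder starting learning rate
    "deepsad_epochs": 150,  # E_M: DeepSAD training epochs
    "deepsad_lr": 1e-5,     # L_M: DeepSAD starting learning rate
    "eta": 1.0,             # label-weighting parameter
    "lambda_reg": 1e-6,     # regularization rate (assumed to act as weight decay)
}

model = torch.nn.Linear(128, 1)  # placeholder standing in for the real network
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=CONFIG["deepsad_lr"],
    weight_decay=CONFIG["lambda_reg"],
)
\end{verbatim}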
We first search over the latent bottleneck size by pretraining autoencoders only. For both encoder backbones, we evaluate latent sizes $32, 64, 128, 256, 512, 768,$ and $1024$. The goal is to identify compact yet expressive representations before moving to anomaly detection. We first search over the latent bottleneck size by pretraining autoencoders only. For both encoder backbones, we evaluate latent sizes $32, 64, 128, 256, 512, 768,$ and $1024$. The goal is to identify compact yet expressive representations and to compare the autoencoding performance of the two network architectures, LeNet-inspired and Efficient. Additionally, we are interested in whether autoencoder performance correlates with downstream DeepSAD anomaly detection performance.
Using the same latent sizes and backbones, we train full DeepSAD models initialized from the pretrained encoders. This stage tests how representation size and architecture transfer to anomaly detection performance under different levels of supervision. Using the same latent sizes and backbones, we train full DeepSAD models initialized from the pretrained encoders. We study three supervision regimes, from unsupervised to strongly supervised (see Table~\ref{tab:labeling_regimes} for proportions within the training folds):
We study three regimes, from unsupervised to strongly supervised (see Table~\ref{tab:labeling_regimes} for proportions within the training folds):
\begin{itemize} \begin{itemize}
\item \textbf{Unsupervised:} $(0,0)$ labeled (normal, anomalous) samples. \item \textbf{Unsupervised:} $(0,0)$ labeled (normal, anomalous) samples.
\item \textbf{Low supervision:} $(50,10)$ labeled samples. \item \textbf{Low supervision:} $(50,10)$ labeled samples.
\item \textbf{High supervision:} $(500,100)$ labeled samples. \item \textbf{High supervision:} $(500,100)$ labeled samples.
\end{itemize} \end{itemize}
Percentages in Table~\ref{tab:labeling_regimes} are computed relative to the training split of each fold (80\% of the data). The classes “normal,” “anomalous,” and “unknown” follow the experiment-based scheme. Importantly, for semi-supervised labels we \emph{only} use hand-selected, unambiguous smoke intervals; frames outside these intervals in smoke runs are treated as “unknown” (not anomalous) to avoid injecting mislabeled data into training. Percentages in Table~\ref{tab:labeling_regimes} are computed relative to the training split of each fold (80\% of the data) from the experiment-based labeling scheme. Importantly, for semi-supervised labels we \emph{only} use hand-selected, unambiguous smoke intervals from the manually-defined evaluation scheme, to avoid injecting mislabeled data into training.
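Read as a labeling rule, the regimes only differ in how many frames receive explicit labels; a hedged sketch (the $+1/-1/0$ target convention follows the original DeepSAD code, while the index arrays and selection logic are illustrative placeholders):
\begin{verbatim}
import numpy as np

def make_semi_targets(n_frames, clean_idx, smoke_interval_idx,
                      n_normal=50, n_anomalous=10, seed=0):
    """+1 = labeled normal, -1 = labeled anomalous, 0 = unlabeled."""
    rng = np.random.default_rng(seed)
    y = np.zeros(n_frames, dtype=np.int64)  # default: unlabeled
    # Labeled normals are drawn from clean experiments only.
    y[rng.choice(clean_idx, size=n_normal, replace=False)] = 1
    # Labeled anomalies come exclusively from manually-defined smoke intervals.
    y[rng.choice(smoke_interval_idx, size=n_anomalous, replace=False)] = -1
    return y

# Placeholder indices; the three regimes differ only in n_normal / n_anomalous.
clean = np.arange(0, 6000)
smoke = np.arange(7000, 7600)
targets = make_semi_targets(10_000, clean, smoke, n_normal=50, n_anomalous=10)
\end{verbatim}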
\begin{table}[h] \begin{table}[h]
\centering \centering
\caption{Proportion of labeled samples in the training folds for each labeling regime. \caption{Proportion of labeled samples in the training folds for each labeling regime. Percentages are computed relative to the available training data after 5-fold splitting (80\% of the dataset per fold). }
Percentages are computed relative to the available training data after 5-fold splitting
(80\% of the dataset per fold). Unknown samples were never labeled.}
\renewcommand{\arraystretch}{1.15} \renewcommand{\arraystretch}{1.15}
\begin{tabularx}{\linewidth}{lYYYY} \begin{tabularx}{\linewidth}{lYYYY}
\toprule \toprule
@@ -883,7 +863,7 @@ Pretraining runtimes for the autoencoders are reported in Table~\ref{tab:ae_pret
\end{tabularx} \end{tabularx}
\end{table} \end{table}
The full DeepSAD training times are shown in Table~\ref{tab:train_runtimes_compact}, alongside the two classical baselines Isolation Forest and One-Class SVM. Here the contrast between methods is clear: while DeepSAD requires on the order of 15--20 minutes of GPU training per configuration and fold, both baselines complete training in seconds on CPU. The OCSVM training can only be this fast due to the reduced input dimensionality from utilizing DeepSAD's pretraining encoder as a preprocessing step, although other dimensionality reduction methods may also be used which could require less computational resources for this step. The full DeepSAD training times are shown in Table~\ref{tab:train_runtimes_compact}, alongside the two classical baselines Isolation Forest and OCSVM. Here the contrast between methods is clear: while DeepSAD requires on the order of 15--20 minutes of GPU training per configuration and fold, both baselines complete training in seconds on CPU. The OCSVM training is only this fast because the input dimensionality is reduced by DeepSAD's pretrained encoder as a preprocessing step; other dimensionality reduction methods could also be used and might require fewer computational resources for this step.
\begin{table} \begin{table}
\centering \centering
@@ -933,7 +913,7 @@ Together, these results provide a comprehensive overview of the computational re
\newchapter{results_discussion}{Results and Discussion} \newchapter{results_discussion}{Results and Discussion}
The experiments described in Chapter~\ref{chp:experimental_setup} are presented in this chapter. We begin in Section~\ref{sec:results_pretraining} with the pretraining stage, where the two autoencoder architectures were trained across multiple latent space dimensionalities. These results provide insight into the representational capacity of each architecture. In Section~\ref{sec:results_deepsad}, we turn to the main experiments: training DeepSAD models and benchmarking them against baseline algorithms (Isolation Forest and One-Class SVM). Finally, in Section~\ref{sec:results_inference}, we present inference results on experiments that were held out during training. These plots illustrate how the algorithms behave when applied sequentially to unseen traversals, offering a more practical perspective on their potential for real-world rescue robotics applications. The experiments described in Chapter~\ref{chp:experimental_setup} are presented in this chapter. We begin in Section~\ref{sec:results_pretraining} with the pretraining stage, where the two autoencoder architectures were trained across multiple latent space dimensionalities. These results provide insight into the representational capacity of each architecture. In Section~\ref{sec:results_deepsad}, we turn to the main experiments: training DeepSAD models and benchmarking them against baseline algorithms (Isolation Forest and OCSVM). Finally, in Section~\ref{sec:results_inference}, we present inference results on experiments that were held out during training. These plots illustrate how the algorithms behave when applied sequentially to unseen experiments, offering a more practical perspective on their potential for real-world rescue robotics applications.
% --- Section: Autoencoder Pretraining Results --- % --- Section: Autoencoder Pretraining Results ---
\newsection{results_pretraining}{Autoencoder Pretraining Results} \newsection{results_pretraining}{Autoencoder Pretraining Results}
@@ -961,13 +941,11 @@ The results of pretraining the two autoencoder architectures are summarized in T
\end{tabularx} \end{tabularx}
\end{table} \end{table}
\figc{ae_loss_overall}{figures/ae_elbow_test_loss_overall.png}{Reconstruction loss across latent dimensions for LeNet-inspired and Efficient architectures.}{width=.9\textwidth} \figc{ae_loss_overall}{figures/ae_elbow_test_loss_overall.png}{Reconstruction loss across latent dimensions for LeNet-inspired and Efficient autoencoder architectures.}{width=.9\textwidth}
Because overall reconstruction loss might obscure how well encoders represent anomalous samples, we additionally evaluate reconstruction errors only on degraded samples from hand-labeled smoke segments (Figure~\ref{fig:ae_loss_degraded}). As expected, reconstruction losses are higher on these challenging samples than in the overall evaluation. However, the relative advantage of the Efficient architecture remains, suggesting that its improvements extend to anomalous inputs as well. Because overall reconstruction loss might obscure how well encoders represent anomalous samples, we additionally evaluate reconstruction errors only on degraded samples from manually-defined smoke segments (Figure~\ref{fig:ae_loss_degraded}). As expected, reconstruction losses are higher on these challenging samples than in the overall evaluation. However, the relative advantage of the Efficient architecture remains, suggesting that its improvements extend to anomalous inputs as well.
\figc{ae_loss_degraded}{figures/ae_elbow_test_loss_anomaly.png}{Reconstruction loss across latent dimensions for LeNet-inspired and Efficient architectures, evaluated only on degraded data from hand-labeled smoke experiments.}{width=.9\textwidth} \figc{ae_loss_degraded}{figures/ae_elbow_test_loss_anomaly.png}{Reconstruction loss across latent dimensions for LeNet-inspired and Efficient autoencoder architectures, evaluated only on degraded data from manually-defined smoke experiments.}{width=.9\textwidth}
Since only per-sample reconstruction losses were retained during pretraining, we report results in reciprocal-range MSE space. While more interpretable metrics in meters and distance-binned analyses would be desirable, the downstream anomaly detection performance did not differ starkly between encoders, so we did not pursue this additional evaluation. Future work could extend the pretraining analysis with physically interpretable metrics.
% --- Section: DeepSAD Training Results --- % --- Section: DeepSAD Training Results ---
\newsection{results_deepsad}{DeepSAD Detection Performance} \newsection{results_deepsad}{DeepSAD Detection Performance}
@@ -975,10 +953,10 @@ Since only per-sample reconstruction losses were retained during pretraining, we
Due to the challenges of ground truth quality, evaluation results must be interpreted with care. As introduced earlier, we consider two complementary evaluation schemes: Due to the challenges of ground truth quality, evaluation results must be interpreted with care. As introduced earlier, we consider two complementary evaluation schemes:
\begin{itemize} \begin{itemize}
\item \textbf{Experiment-based labels:} An objective way to assign anomaly labels to all frames from degraded runs. However, this also marks many near-normal frames at the start and end of runs as anomalous. These knowingly false labels lower the maximum achievable AP, because even an ideal model would be forced to classify some normal frames as anomalous. \item \textbf{Experiment-based labels:} An objective way to assign anomaly labels to all frames from degraded runs. However, this also marks many near-normal frames at the start and end of runs as anomalous. These knowingly false labels lower the maximum achievable AP, because even an ideal model would be forced to classify some normal frames as anomalous.
\item \textbf{Hand-labeled labels:} A cleaner ground truth, containing only clearly degraded frames. This removes mislabeled intervals and allows nearly perfect separation. However, it also simplifies the task too much, because borderline cases are excluded. \item \textbf{Manually-defined labels:} A cleaner ground truth, containing only clearly degraded frames. This removes mislabeled intervals and allows nearly perfect separation. However, it also simplifies the task too much, because borderline cases are excluded.
\end{itemize} \end{itemize}
Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimensions, labeling regimes, and methods. Under experiment-based evaluation, both DeepSAD variants consistently outperform the baselines, reaching AP values around 0.60--0.66 compared to 0.21 for Isolation Forest and 0.31--0.49 for OC-SVM. Under hand-labeled evaluation, DeepSAD achieves nearly perfect AP in all settings, while the baselines remain much lower. This contrast shows that the lower AP under experiment-based evaluation is not a weakness of DeepSAD itself, but a direct result of mislabeled samples in the evaluation data. The hand-labeled scheme therefore confirms that DeepSAD separates clearly normal from clearly degraded frames very well, while also highlighting that label noise must be kept in mind when interpreting the experiment-based results. Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimensions, labeling regimes, and methods. Under experiment-based evaluation, both DeepSAD variants consistently outperform the baselines, reaching AP values around 0.60--0.66 compared to 0.21 for Isolation Forest and 0.31--0.49 for OC-SVM. Under manually-defined evaluation, DeepSAD achieves nearly perfect AP in all settings, while the baselines remain much lower. This contrast shows that the lower AP under experiment-based evaluation is not a weakness of DeepSAD itself, but a direct result of mislabeled samples in the evaluation data. The manually-defined scheme therefore confirms that DeepSAD separates clearly normal from clearly degraded frames very well, while also highlighting that label noise must be kept in mind when interpreting the experiment-based results.
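Conceptually, the AP values in Table~\ref{tab:results_ap} amount to scoring the same per-frame anomaly scores against two different ground-truth vectors; a minimal scikit-learn sketch (all arrays are random placeholders):
\begin{verbatim}
import numpy as np
from sklearn.metrics import average_precision_score

rng = np.random.default_rng(0)
scores = rng.normal(size=1000)                     # per-frame anomaly scores
labels_experiment = rng.integers(0, 2, size=1000)  # 1 = frame from a degraded run
labels_manual = rng.integers(0, 2, size=1000)      # 1 = frame inside a manually-defined smoke interval

# Same scores, two ground-truth definitions: noisy-but-objective vs. clean-but-simplified.
ap_experiment = average_precision_score(labels_experiment, scores)
ap_manual = average_precision_score(labels_manual, scores)
print(f"AP experiment-based: {ap_experiment:.3f}  AP manually-defined: {ap_manual:.3f}")
\end{verbatim}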
\begin{table}[t] \begin{table}[t]
\centering \centering
@@ -986,39 +964,39 @@ Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimen
\label{tab:results_ap} \label{tab:results_ap}
\begin{tabularx}{\textwidth}{c*{4}{Y}|*{4}{Y}} \begin{tabularx}{\textwidth}{c*{4}{Y}|*{4}{Y}}
\toprule \toprule
& \multicolumn{4}{c}{Experiment-based eval.} & \multicolumn{4}{c}{Hand-labeled eval.} \\ & \multicolumn{4}{c}{Experiment-based eval.} & \multicolumn{4}{c}{Manually-defined eval.} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9} \cmidrule(lr){2-5} \cmidrule(lr){6-9}
Latent Dim. & \rotheader{DeepSAD \\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} & \rotheader{DeepSAD\\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} \\ Latent Dim. & \rotheader{DeepSAD \\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} & \rotheader{DeepSAD\\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} \\
\midrule \midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{0/0}\) \textit{(normal/anomalous samples labeled)}} \\ \multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{0/0}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt] \addlinespace[2pt]
32 & \textbf{0.664} & 0.650 & 0.217 & 0.315 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.426 \\ 32 & \textbf{0.664} & 0.650 & 0.217 & 0.315 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.426 \\
64 & 0.635 & \textbf{0.643} & 0.215 & 0.371 & \textbf{1.000} & \textbf{1.000} & 0.233 & 0.531 \\ 64 & 0.635 & \textbf{0.643} & 0.215 & 0.371 & \textbf{1.000} & \textbf{1.000} & 0.233 & 0.531 \\
128 & \textbf{0.642} & \textbf{0.642} & 0.218 & 0.486 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.729 \\ 128 & \textbf{0.642} & \textbf{0.642} & 0.218 & 0.486 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.729 \\
256 & 0.615 & \textbf{0.631} & 0.214 & 0.452 & 0.999 & \textbf{1.000} & 0.236 & 0.664 \\ 256 & 0.615 & \textbf{0.631} & 0.214 & 0.452 & 0.999 & \textbf{1.000} & 0.236 & 0.664 \\
512 & 0.613 & \textbf{0.635} & 0.216 & 0.397 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.550 \\ 512 & 0.613 & \textbf{0.635} & 0.216 & 0.397 & \textbf{1.000} & \textbf{1.000} & 0.241 & 0.550 \\
768 & 0.609 & \textbf{0.617} & 0.219 & 0.439 & 0.997 & \textbf{1.000} & 0.244 & 0.624 \\ 768 & 0.609 & \textbf{0.617} & 0.219 & 0.439 & 0.997 & \textbf{1.000} & 0.244 & 0.624 \\
1024 & 0.607 & \textbf{0.612} & 0.215 & 0.394 & 0.997 & \textbf{1.000} & 0.235 & 0.529 \\ 1024 & 0.607 & \textbf{0.612} & 0.215 & 0.394 & 0.997 & \textbf{1.000} & 0.235 & 0.529 \\
\midrule \midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{50/10}\) \textit{(normal/anomalous samples labeled)}} \\ \multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{50/10}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt] \addlinespace[2pt]
32 & 0.569 & \textbf{0.582} & 0.217 & 0.315 & 0.933 & \textbf{0.976} & 0.241 & 0.426 \\ 32 & 0.569 & \textbf{0.582} & 0.217 & 0.315 & 0.933 & \textbf{0.976} & 0.241 & 0.426 \\
64 & 0.590 & \textbf{0.592} & 0.215 & 0.371 & 0.970 & \textbf{0.986} & 0.233 & 0.531 \\ 64 & 0.590 & \textbf{0.592} & 0.215 & 0.371 & 0.970 & \textbf{0.986} & 0.233 & 0.531 \\
128 & 0.566 & \textbf{0.588} & 0.218 & 0.486 & 0.926 & \textbf{0.983} & 0.241 & 0.729 \\ 128 & 0.566 & \textbf{0.588} & 0.218 & 0.486 & 0.926 & \textbf{0.983} & 0.241 & 0.729 \\
256 & \textbf{0.598} & 0.587 & 0.214 & 0.452 & 0.978 & \textbf{0.984} & 0.236 & 0.664 \\ 256 & \textbf{0.598} & 0.587 & 0.214 & 0.452 & 0.978 & \textbf{0.984} & 0.236 & 0.664 \\
512 & 0.550 & \textbf{0.587} & 0.216 & 0.397 & 0.863 & \textbf{0.978} & 0.241 & 0.550 \\ 512 & 0.550 & \textbf{0.587} & 0.216 & 0.397 & 0.863 & \textbf{0.978} & 0.241 & 0.550 \\
768 & \textbf{0.596} & 0.577 & 0.219 & 0.439 & \textbf{0.992} & 0.974 & 0.244 & 0.624 \\ 768 & \textbf{0.596} & 0.577 & 0.219 & 0.439 & \textbf{0.992} & 0.974 & 0.244 & 0.624 \\
1024 & \textbf{0.601} & 0.568 & 0.215 & 0.394 & \textbf{0.990} & 0.966 & 0.235 & 0.529 \\ 1024 & \textbf{0.601} & 0.568 & 0.215 & 0.394 & \textbf{0.990} & 0.966 & 0.235 & 0.529 \\
\midrule \midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{500/100}\) \textit{(normal/anomalous samples labeled)}} \\ \multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{500/100}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt] \addlinespace[2pt]
32 & \textbf{0.625} & 0.621 & 0.217 & 0.315 & \textbf{0.999} & 0.997 & 0.241 & 0.426 \\ 32 & \textbf{0.625} & 0.621 & 0.217 & 0.315 & \textbf{0.999} & 0.997 & 0.241 & 0.426 \\
64 & 0.611 & \textbf{0.621} & 0.215 & 0.371 & 0.996 & \textbf{0.998} & 0.233 & 0.531 \\ 64 & 0.611 & \textbf{0.621} & 0.215 & 0.371 & 0.996 & \textbf{0.998} & 0.233 & 0.531 \\
128 & 0.607 & \textbf{0.615} & 0.218 & 0.486 & 0.996 & \textbf{0.998} & 0.241 & 0.729 \\ 128 & 0.607 & \textbf{0.615} & 0.218 & 0.486 & 0.996 & \textbf{0.998} & 0.241 & 0.729 \\
256 & 0.604 & \textbf{0.612} & 0.214 & 0.452 & 0.984 & \textbf{0.998} & 0.236 & 0.664 \\ 256 & 0.604 & \textbf{0.612} & 0.214 & 0.452 & 0.984 & \textbf{0.998} & 0.236 & 0.664 \\
512 & 0.578 & \textbf{0.608} & 0.216 & 0.397 & 0.916 & \textbf{0.998} & 0.241 & 0.550 \\ 512 & 0.578 & \textbf{0.608} & 0.216 & 0.397 & 0.916 & \textbf{0.998} & 0.241 & 0.550 \\
768 & 0.597 & \textbf{0.598} & 0.219 & 0.439 & 0.994 & \textbf{0.995} & 0.244 & 0.624 \\ 768 & 0.597 & \textbf{0.598} & 0.219 & 0.439 & 0.994 & \textbf{0.995} & 0.244 & 0.624 \\
1024 & \textbf{0.601} & 0.591 & 0.215 & 0.394 & 0.990 & \textbf{0.993} & 0.235 & 0.529 \\ 1024 & \textbf{0.601} & 0.591 & 0.215 & 0.394 & 0.990 & \textbf{0.993} & 0.235 & 0.529 \\
\bottomrule \bottomrule
\end{tabularx} \end{tabularx}
\end{table} \end{table}
@@ -1026,7 +1004,7 @@ Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimen
The precision--recall curves (Figure~\ref{fig:prc_representative}) illustrate these effects more clearly. For DeepSAD, precision stays close to 1 until about 0.5 recall, after which it drops off sharply. This plateau corresponds to the fraction of truly degraded frames in the anomalous set. Once recall moves beyond this point, the evaluation demands that the model also “find” the mislabeled anomalies near the run boundaries. To do so, the decision threshold must be lowered so far that many normal frames are also flagged, which causes precision to collapse. The baselines behave differently: OC-SVM shows a smooth but weaker decline without a strong high-precision plateau, while Isolation Forest collapses to near-random performance. These operational differences are hidden in a single AP number but are important for judging how the methods would behave in deployment. The precision--recall curves (Figure~\ref{fig:prc_representative}) illustrate these effects more clearly. For DeepSAD, precision stays close to 1 until about 0.5 recall, after which it drops off sharply. This plateau corresponds to the fraction of truly degraded frames in the anomalous set. Once recall moves beyond this point, the evaluation demands that the model also “find” the mislabeled anomalies near the run boundaries. To do so, the decision threshold must be lowered so far that many normal frames are also flagged, which causes precision to collapse. The baselines behave differently: OC-SVM shows a smooth but weaker decline without a strong high-precision plateau, while Isolation Forest collapses to near-random performance. These operational differences are hidden in a single AP number but are important for judging how the methods would behave in deployment.
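The high-precision plateau described above corresponds to an operating point that can be selected explicitly from the precision--recall curve; a hedged sketch (placeholder arrays, and the 0.95 precision floor is an arbitrary example rather than a value used in this work):
\begin{verbatim}
import numpy as np
from sklearn.metrics import precision_recall_curve

rng = np.random.default_rng(0)
scores = rng.normal(size=1000)           # per-frame anomaly scores (placeholder)
labels = rng.integers(0, 2, size=1000)   # 1 = anomalous under the chosen label set

precision, recall, thresholds = precision_recall_curve(labels, scores)

floor = 0.95
ok = precision[:-1] >= floor             # precision has one more entry than thresholds
if ok.any():
    idx = np.where(ok)[0]
    best = idx[np.argmax(recall[idx])]   # admissible threshold with the highest recall
    print(f"threshold={thresholds[best]:.3f}  "
          f"precision={precision[best]:.3f}  recall={recall[best]:.3f}")
else:
    print("No threshold reaches the requested precision floor.")
\end{verbatim}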
Taken together, the two evaluation schemes provide complementary insights. The experiment-based labels offer a noisy but realistic setting that shows how methods cope with ambiguous data, while the hand-labeled labels confirm that DeepSAD can achieve nearly perfect separation when the ground truth is clean. The combination of both evaluations makes clear that (i) DeepSAD is stronger than the baselines under both conditions, (ii) the apparent performance limits under experiment-based labels are mainly due to label noise, and (iii) interpreting results requires care, since performance drops in the curves often reflect mislabeled samples rather than model failures. At the same time, both schemes remain binary classifications and therefore cannot directly evaluate the central question of whether anomaly scores can serve as a continuous measure of degradation. For this reason, we extend the analysis in Section~\ref{sec:results_inference}, where inference on entire unseen experiments is used to provide a more intuitive demonstration of the methods' potential for quantifying lidar degradation in practice. Taken together, the two evaluation schemes provide complementary insights. The experiment-based labels offer a noisy but realistic setting that shows how methods cope with ambiguous data, while the manually-defined labels confirm that DeepSAD can achieve nearly perfect separation when the ground truth is clean. The combination of both evaluations makes clear that (i) DeepSAD is stronger than the baselines under both conditions, (ii) the apparent performance limits under experiment-based labels are mainly due to label noise, and (iii) interpreting results requires care, since performance drops in the curves often reflect mislabeled samples rather than model failures. At the same time, both schemes remain binary classifications and therefore cannot directly evaluate the central question of whether anomaly scores can serve as a continuous measure of degradation. For this reason, we extend the analysis in Section~\ref{sec:results_inference}, where inference on entire unseen experiments is used to provide a more intuitive demonstration of the methods' potential for quantifying lidar degradation in practice.
\fig{prc_representative}{figures/results_prc.png}{Representative precision--recall curves over all latent dimensionalities for semi-labeling regime 0/0 from experiment-based evaluation labels. DeepSAD maintains a large high-precision operating region before collapsing; OC-SVM declines more smoothly but exhibits a high standard deviation between folds; IsoForest collapses quickly and remains flat. DeepSAD's fall-off is at least partly due to known mislabeled evaluation targets.} \fig{prc_representative}{figures/results_prc.png}{Representative precision--recall curves over all latent dimensionalities for semi-labeling regime 0/0 from experiment-based evaluation labels. DeepSAD maintains a large high-precision operating region before collapsing; OC-SVM declines more smoothly but exhibits a high standard deviation between folds; IsoForest collapses quickly and remains flat. DeepSAD's fall-off is at least partly due to known mislabeled evaluation targets.}
@@ -1041,7 +1019,7 @@ During autoencoder pretraining we observed that reconstruction loss decreased mo
\figc{latent_dim_ap}{figures/results_ap_over_latent.png}{AP as a function of latent dimension (experiment-based evaluation). DeepSAD shows inverse correlation between AP and latent space size.}{width=.7\textwidth} \figc{latent_dim_ap}{figures/results_ap_over_latent.png}{AP as a function of latent dimension (experiment-based evaluation). DeepSAD shows inverse correlation between AP and latent space size.}{width=.7\textwidth}
\paragraph{Effect of semi-supervised labeling.} \paragraph{Effect of semi-supervised labeling.}
Table~\ref{tab:results_ap} shows that the unsupervised regime \((0/0)\) achieves the best AP, while the lightly supervised regime \((50/10)\) performs worst. With many labels \((500/100)\), performance improves again but remains slightly below the unsupervised case. This pattern also appears under the hand-labeled evaluation, which excludes mislabeled frames. The drop with light supervision therefore cannot be explained by noisy evaluation targets, but must stem from the training process itself. Table~\ref{tab:results_ap} shows that the unsupervised regime \((0/0)\) achieves the best AP, while the lightly supervised regime \((50/10)\) performs worst. With many labels \((500/100)\), performance improves again but remains slightly below the unsupervised case. This pattern also appears under the manually-defined evaluation, which excludes mislabeled frames. The drop with light supervision therefore cannot be explained by noisy evaluation targets, but must stem from the training process itself.
The precision--recall curves in Figure~\ref{fig:prc_over_semi} show that the overall curve shapes are similar across regimes, but shifted relative to one another in line with the AP ordering \((0/0) > (500/100) > (50/10)\). We attribute these shifts to overfitting: when only a few anomalies are labeled, the model fits them too strongly, and if those examples differ too much from other anomalies, generalization suffers. This explains why lightly supervised training performs even worse than unsupervised training, which avoids this bias. The precision--recall curves in Figure~\ref{fig:prc_over_semi} show that the overall curve shapes are similar across regimes, but shifted relative to one another in line with the AP ordering \((0/0) > (500/100) > (50/10)\). We attribute these shifts to overfitting: when only a few anomalies are labeled, the model fits them too strongly, and if those examples differ too much from other anomalies, generalization suffers. This explains why lightly supervised training performs even worse than unsupervised training, which avoids this bias.
@@ -1083,23 +1061,23 @@ This thesis set out to answer the research question stated in Chapter~\ref{chp:i
\begin{quote} \begin{quote}
Can autonomous robots quantify the reliability of lidar sensor data in hazardous environments to make more informed decisions? Can autonomous robots quantify the reliability of lidar sensor data in hazardous environments to make more informed decisions?
\end{quote} \end{quote}
Our results indicate a qualified “yes.” Using anomaly detection (AD)—in particular DeepSAD—we can obtain scores that (i) separate clearly normal from clearly degraded scans and (ii) track degradation trends over time on held-out traversals (see Sections~\ref{sec:results_deepsad} and \ref{sec:results_inference}). At the same time, the absence of robust ground truth limits how confidently we can assess \emph{continuous} quantification quality and complicates cross-method comparisons. The remainder of this chapter summarizes what we contribute, what we learned, and what is still missing. Our results indicate a qualified “yes.” Using anomaly detection (AD)—in particular DeepSAD—we can obtain scores that (i) separate clearly normal from clearly degraded scans and (ii) track degradation trends over time on held-out experiments (see Sections~\ref{sec:results_deepsad} and \ref{sec:results_inference}). At the same time, the absence of robust ground truth limits how confidently we can assess \emph{continuous} quantification quality and complicates cross-method comparisons. The remainder of this chapter summarizes what we contribute, what we learned, and what is still missing.
\paragraph{Main contributions.} \paragraph{Main contributions.}
\begin{itemize} \begin{itemize}
\item \textbf{Empirical comparison for lidar degradation.} A systematic evaluation of DeepSAD against Isolation Forest and OC-SVM across latent sizes and labeling regimes, showing that DeepSAD consistently outperforms the baselines under both evaluation schemes (Section~\ref{sec:results_deepsad}). \item \textbf{Empirical comparison for lidar degradation.} A systematic evaluation of DeepSAD against Isolation Forest and OC-SVM across latent sizes and labeling regimes, showing that DeepSAD consistently outperforms the baselines under both evaluation schemes (Section~\ref{sec:results_deepsad}).
\item \textbf{Two-track evaluation protocol.} We frame and use two complementary label sets: (i) \emph{experiment-based} labels (objective but noisy at run boundaries), and (ii) \emph{hand-labeled} intervals (clean but simplified). This pairing clarifies what each scheme can—and cannot—tell us about real performance (Section~\ref{sec:results_deepsad}). \item \textbf{Two-track evaluation protocol.} We frame and use two complementary label sets: (i) \emph{experiment-based} labels (objective but noisy at run boundaries), and (ii) \emph{manually-defined} intervals (clean but simplified). This pairing clarifies what each scheme can—and cannot—tell us about real performance (Section~\ref{sec:results_deepsad}).
% \item \textbf{Latent dimensionality insight.} Compact bottlenecks (32128) are more robust under noisy labels and yield the best AP; larger latent spaces amplify precision collapses beyond the high-precision plateau (Figure~\ref{fig:latent_dim_ap}). High-dimensional input data apparently can be compressed quite strongly, which may lead to improved performance and better generalization. % \item \textbf{Latent dimensionality insight.} Compact bottlenecks (32128) are more robust under noisy labels and yield the best AP; larger latent spaces amplify precision collapses beyond the high-precision plateau (Figure~\ref{fig:latent_dim_ap}). High-dimensional input data apparently can be compressed quite strongly, which may lead to improved performance and better generalization.
\item \textbf{Latent dimensionality insight.} \item \textbf{Latent dimensionality insight.}
Autoencoder pretraining loss decreases with larger latent spaces, but DeepSAD performance shows the opposite trend: compact bottlenecks (32128) achieve the highest AP. This contrast demonstrates that pretraining performance does not directly predict DeepSAD performance—latent dimensionality cannot be tuned via autoencoder loss alone, even though it remains useful for comparing architectures. Autoencoder pretraining loss decreases with larger latent spaces, but DeepSAD performance shows the opposite trend: compact bottlenecks (32128) achieve the highest AP. This contrast demonstrates that pretraining performance does not directly predict DeepSAD performance—latent dimensionality cannot be tuned via autoencoder loss alone, even though it remains useful for comparing architectures.
\item \textbf{Semi-supervision insight.} In our data, \emph{unsupervised} DeepSAD performed best; \emph{light} labeling (50/10) performed worst; \emph{many} labels (500/100) partially recovered performance but did not surpass unsupervised. Evidence from PRC shapes and fold variance points to \emph{training-side overfitting to a small labeled set}, an effect that persists even under clean hand-labeled evaluation (Table~\ref{tab:results_ap}, Figure~\ref{fig:prc_over_semi}). \item \textbf{Semi-supervision insight.} In our data, \emph{unsupervised} DeepSAD performed best; \emph{light} labeling (50/10) performed worst; \emph{many} labels (500/100) partially recovered performance but did not surpass unsupervised. Evidence from PRC shapes and fold variance points to \emph{training-side overfitting to a small labeled set}, an effect that persists even under clean manually-defined evaluation (Table~\ref{tab:results_ap}, Figure~\ref{fig:prc_over_semi}).
\item \textbf{Encoder architecture matters.} The Efficient encoder outperformed the LeNet-inspired variant in pretraining and downstream AD, indicating that representation quality substantially affects DeepSAD performance (Section~\ref{sec:results_pretraining}, Section~\ref{sec:results_deepsad}). \item \textbf{Encoder architecture matters.} The Efficient encoder outperformed the LeNet-inspired variant in pretraining and downstream AD, indicating that representation quality substantially affects DeepSAD performance (Section~\ref{sec:results_pretraining}, Section~\ref{sec:results_deepsad}).
\item \textbf{Temporal inference recipe.} For deployment-oriented analysis we propose clean-run $z$-score normalization and causal EMA smoothing to obtain interpretable time-series anomaly scores on full traversals (Section~\ref{sec:results_inference}). \item \textbf{Temporal inference recipe.} For deployment-oriented analysis we propose clean-run $z$-score normalization and causal EMA smoothing to obtain interpretable time-series anomaly scores on full experiments (Section~\ref{sec:results_inference}; see the sketch after this list).
\end{itemize} \end{itemize}
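As referenced in the temporal-inference item above, the normalization and smoothing recipe reduces to a few lines; a minimal NumPy sketch (the clean-run statistics and the smoothing factor alpha are illustrative assumptions):
\begin{verbatim}
import numpy as np

def normalize_and_smooth(scores, clean_scores, alpha=0.1):
    """z-score anomaly scores against a clean run, then apply a causal EMA."""
    mu, sigma = clean_scores.mean(), clean_scores.std() + 1e-12
    z = (scores - mu) / sigma              # clean-run z-score normalization
    smoothed = np.empty_like(z)
    ema = z[0]
    for i, value in enumerate(z):          # causal: uses only current and past frames
        ema = alpha * value + (1.0 - alpha) * ema
        smoothed[i] = ema
    return smoothed

rng = np.random.default_rng(0)
clean_run = rng.normal(0.0, 1.0, size=500)   # scores from a clean traversal
test_run = rng.normal(0.5, 1.2, size=500)    # scores from an unseen experiment
trace = normalize_and_smooth(test_run, clean_run, alpha=0.1)
\end{verbatim}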
\paragraph{Practical recommendations.} \paragraph{Practical recommendations.}


@@ -569,6 +569,32 @@
publisher = {MIT Press}, publisher = {MIT Press},
note = {\url{http://www.deeplearningbook.org}}, note = {\url{http://www.deeplearningbook.org}},
year = {2016}, year = {2016},
},
@misc{mobilenet,
doi = {10.48550/ARXIV.1704.04861},
url = {https://arxiv.org/abs/1704.04861},
author = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko,
Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and
Adam, Hartwig},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and
information sciences, FOS: Computer and information sciences},
title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision
Applications},
publisher = {arXiv},
year = {2017},
copyright = {arXiv.org perpetual, non-exclusive license},
},
@inproceedings{shufflenet,
title = {ShuffleNet: An Extremely Efficient Convolutional Neural Network for
Mobile Devices},
url = {http://dx.doi.org/10.1109/CVPR.2018.00716},
DOI = {10.1109/cvpr.2018.00716},
booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern
Recognition},
publisher = {IEEE},
author = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
year = {2018},
month = jun,
} }


@@ -8,4 +8,5 @@ dependencies = [
"pandas>=2.3.2", "pandas>=2.3.2",
"polars>=1.33.0", "polars>=1.33.0",
"pyarrow>=21.0.0", "pyarrow>=21.0.0",
"tabulate>=0.9.0",
] ]

tools/uv.lock generated

@@ -208,6 +208,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
] ]
[[package]]
name = "tabulate"
version = "0.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
]
[[package]] [[package]]
name = "tools" name = "tools"
version = "0.1.0" version = "0.1.0"
@@ -216,6 +225,7 @@ dependencies = [
{ name = "pandas" }, { name = "pandas" },
{ name = "polars" }, { name = "polars" },
{ name = "pyarrow" }, { name = "pyarrow" },
{ name = "tabulate" },
] ]
[package.metadata] [package.metadata]
@@ -223,6 +233,7 @@ requires-dist = [
{ name = "pandas", specifier = ">=2.3.2" }, { name = "pandas", specifier = ">=2.3.2" },
{ name = "polars", specifier = ">=1.33.0" }, { name = "polars", specifier = ">=1.33.0" },
{ name = "pyarrow", specifier = ">=21.0.0" }, { name = "pyarrow", specifier = ">=21.0.0" },
{ name = "tabulate", specifier = ">=0.9.0" },
] ]
[[package]] [[package]]