% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference contribution to CHEP 2024 (article number 01229, volume 337 of the
% series named in the `series` field).  Conference details live in
% booktitle/venue/eventdate; `year` is the publication year.  `month` is kept
% as exported -- TODO confirm it is the publication month rather than the
% conference month.  `journal` is retained only for tools that read the
% repository export; standard inproceedings styles use `series` instead.
% NOTE(review): the abstract describes a GPU-accelerated STS hit-reconstruction
% chain, whereas the title concerns alignment constraints -- the abstract
% appears to belong to a different record; verify against the DOI landing page.
@inproceedings{Bluhme:364059,
      author       = {Bluhme, Nora and Gorbunov, Sergey and Lindenstruth, Volker},
      title        = {Application of Linear and Non-Linear Constraints in a
                      Brute-Force-Based Alignment Approach for {CBM}},
      booktitle    = {27th International Conference on Computing in High Energy
                      and Nuclear Physics ({CHEP} 2024)},
      venue        = {Krakow, Poland},
      eventdate    = {2024-10-21/2024-10-25},
      series       = {The European physical journal / Web of Conferences},
      journal      = {The European physical journal / Web of Conferences},
      volume       = {337},
      pages        = {01229},
      issn         = {2100-014X},
      publisher    = {EDP Sciences},
      address      = {Les Ulis},
      year         = {2025},
      month        = oct,
      doi          = {10.1051/epjconf/202533701229},
      url          = {https://repository.gsi.de/record/364059},
      reportid     = {GSI-2026-00239},
      note         = {This is an open access article distributed under the terms
                      of the Creative Commons Attribution License 4.0
                      (https://creativecommons.org/licenses/by/4.0/)},
      abstract     = {The Compressed Baryonic Matter (CBM) experiment at FAIR
                      will operate at interaction rates up to 10 MHz, generating
                      data streams averaging 500 GB/s. This necessitates efficient
                      online reconstruction capabilities, particularly for the
                      Silicon Tracking System (STS), which is the key detector for
                      track reconstruction and contributes a large fraction of the
                      expected data volume. We present a GPU-accelerated hit
                      reconstruction chain for the STS that achieves a
                      128$\times$ speedup over the sequential CPU implementation.
                      The implementation features optimized data structures
                      reducing memory footprint, parallel algorithms for sorting,
                      cluster finding, and hit reconstruction, and portability
                      across GPU architectures. Our custom merge sort outperforms
                      library implementations by 10\% while using 33\% less
                      memory. Cluster finding employs a two-phase approach with
                      atomic operations for thread-safe connections between
                      signal clusters. Even before GPU acceleration, algorithmic
                      improvements provide a 3$\times$ speedup in single-threaded
                      execution. Both NVIDIA and AMD GPUs achieve comparable
                      performance of approximately 0.12 s on a timeframe
                      containing 1000 Au+Au events. The reconstruction chain was
                      successfully deployed during the May 2024 mCBM beamtime,
                      processing data rates up to 2.4 GB/s in real-time,
                      demonstrating its viability for CBM’s triggerless data
                      acquisition approach.},
      cin          = {CBM / CBM@FAIR},
      ddc          = {530},
      cid          = {I:(DE-Ds200)CBM-20080821OR102 / I:(DE-Ds200)Coll-FAIR-CBM},
      pnm          = {612 - Cosmic Matter in the Laboratory (POF4-612) /
                      05P21RFFC1 - Verbundprojekt 05P2021 (ErUM-FSP T06) - Aufbau
                      von CBM bei FAIR: Bau des First-Level Event Selectors (FLES)
                      für das CBM-Experiment an FAIR (BMBF-05P21RFFC1)},
      pid          = {G:(DE-HGF)POF4-612 / G:(DE-Ds200)BMBF-05P21RFFC1},
      experiment   = {$EXP:(DE-Ds200)External_experiment-20200803$},
      typ          = {PUB:(DE-HGF)16 / PUB:(DE-HGF)8},
}