<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://veitner.bearblog.dev/</loc>
        <lastmod>2026-03-11</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/simple-math-to-speed-up-gdn-prefill/</loc>
        <lastmod>2026-03-14</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/chunkwise-gated-delta-rule/</loc>
        <lastmod>2026-03-10</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/gated-delta-net-decoding/</loc>
        <lastmod>2026-02-15</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/grouped-blockscaled-gemm-kernel/</loc>
        <lastmod>2026-02-07</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/grouped-blockscaled-gemm-host-code/</loc>
        <lastmod>2026-01-24</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/grouped-block-scaled-gemm-intro/</loc>
        <lastmod>2026-01-22</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/warp-specialisation-in-cutedsl/</loc>
        <lastmod>2026-01-07</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/2-cta-gemm-on-b200/</loc>
        <lastmod>2026-01-04</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/blackwell-pipelining-with-cutedsl/</loc>
        <lastmod>2025-12-23</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/b200-blockscaled-gemm-the-setup/</loc>
        <lastmod>2025-12-06</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/scale-tensor-construction-in-cutedsl/</loc>
        <lastmod>2025-12-03</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/demystifying-numeric-conversions-in-cutedsl/</loc>
        <lastmod>2025-11-23</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/nvfp4-gemv-improved/</loc>
        <lastmod>2025-11-16</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/nvfp4-gemv/</loc>
        <lastmod>2025-11-13</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/bit-counting-and-geometric-series/</loc>
        <lastmod>2025-11-06</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/simple-reduction-in-cutedsl/</loc>
        <lastmod>2025-10-27</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/mma-atoms-in-cute/</loc>
        <lastmod>2025-12-12</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/lru-and-lfu-in-c/</loc>
        <lastmod>2025-10-11</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/mutual-refinement-and-composition/</loc>
        <lastmod>2025-10-03</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/applied-introduction-to-categorical-treatment-of-cute/</loc>
        <lastmod>2025-09-27</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/layout-gymnastics/</loc>
        <lastmod>2025-09-23</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/swizzles-and-their-usage-in-cutedsl-kernels/</loc>
        <lastmod>2025-09-20</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/cute-partitions/</loc>
        <lastmod>2025-09-14</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/tensors-slicing-in-cute/</loc>
        <lastmod>2025-09-09</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/understanding-cute-swizzling-the-math-behind-32b-64b-and-128b-patterns/</loc>
        <lastmod>2026-01-16</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/gpu-l2-cache-persistence/</loc>
        <lastmod>2025-09-05</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/cuda-streams/</loc>
        <lastmod>2025-09-01</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/pingpong-in-the-cutedsl-with-quack/</loc>
        <lastmod>2025-08-23</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/bit-hacking-in-c/</loc>
        <lastmod>2025-08-17</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/intuition-behind-hierarchical-layouts/</loc>
        <lastmod>2025-08-14</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/persistent-float8-dense-gemm-on-hopper/</loc>
        <lastmod>2025-08-09</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/epilogue-h100-cutedsl/</loc>
        <lastmod>2025-12-12</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/let-the-compiler-do-the-work-in-cutedsl/</loc>
        <lastmod>2025-07-30</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/persistent-gemm-in-cutedsl-on-hopper/</loc>
        <lastmod>2025-07-25</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/consumer-producer-pattern-on-h100-in-cutedsl/</loc>
        <lastmod>2025-07-20</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/backprob-through-layernorm/</loc>
        <lastmod>2025-07-13</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/backprop-through-rmsnorm/</loc>
        <lastmod>2025-07-13</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/outperform-compiled-pytorch-code-using-quack/</loc>
        <lastmod>2025-07-12</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/cutedsl-on-hopper-pipelining/</loc>
        <lastmod>2025-07-05</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/cutedsl-on-hopper-wgmma-and-tma-intro/</loc>
        <lastmod>2025-12-29</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/thread-value-layouts-in-cute/</loc>
        <lastmod>2025-06-28</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/sgemm-in-cutedsl/</loc>
        <lastmod>2025-06-26</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/an-applied-introduction-to-cutedsl/</loc>
        <lastmod>2025-06-24</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/calculating-the-fibonacci-numbers-on-gpu/</loc>
        <lastmod>2025-06-21</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/an-introduction-to-thrust/</loc>
        <lastmod>2025-06-17</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/infinite-binary-string/</loc>
        <lastmod>2025-06-09</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/highly-efficient-matrix-transpose-in-mojo/</loc>
        <lastmod>2025-06-06</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/the-bijection-between-natural-numbers-and-binary-strings/</loc>
        <lastmod>2025-06-05</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/use-tma-without-cuda/</loc>
        <lastmod>2025-06-04</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/use-ptx-instructions-in-mojo/</loc>
        <lastmod>2025-05-29</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/very-fast-vector-sum-without-cuda/</loc>
        <lastmod>2025-05-25</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/short-introduction-to-the-mojo-programming-language/</loc>
        <lastmod>2025-05-22</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/bridging-math-and-code-cute-layout-algebra-in-cutedsl/</loc>
        <lastmod>2025-05-18</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/load-and-store-matrices-efficently-with-ptx-instructions/</loc>
        <lastmod>2025-12-12</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/how-to-use-reasoning-models-with-sglang/</loc>
        <lastmod>2025-05-11</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/making-matrix-transpose-really-fast-on-hopper-gpus/</loc>
        <lastmod>2025-05-07</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/tma-introduction/</loc>
        <lastmod>2025-04-27</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/analyze-cuda-programs-by-looking-at-gpu-assembly/</loc>
        <lastmod>2025-04-21</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/making-rmsnorm-really-fast/</loc>
        <lastmod>2025-04-18</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/making-prefix-sum-really-fast/</loc>
        <lastmod>2025-04-13</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/making-vector-sum-really-fast/</loc>
        <lastmod>2025-04-09</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/predication-in-cutlass/</loc>
        <lastmod>2025-03-31</lastmod>
    </url>
    
    <url>
        <loc>https://veitner.bearblog.dev/indexing-in-cuda/</loc>
        <lastmod>2025-03-23</lastmod>
    </url>
    
</urlset>