Login

bench/last of 100 million

Consider this Common Lisp program:

;; Build a fresh list of the integers 0..N inclusive -- note this is
;; N+1 elements, since LOOP's UPTO bound is inclusive.
(defun iota (n)
  (loop for m upto n collect m))

;; Cons up a ~100-million-element list, then walk to its end.
;; CL:LAST returns the last cons cell (a one-element list), which is
;; why the program prints "(100000000)" rather than a bare integer.
(defun main ()
  (princ (last (iota 100000000)))
  (terpri))

(main)

SBCL OOMs on this by default, but with a flag this is how it performs:

$ perf stat sbcl --dynamic-space-size $((5*1024)) --script bin/alloc.lisp 
(100000000)

 Performance counter stats for 'sbcl --dynamic-space-size 5120 --script bin/alloc.lisp':

     2,232,924,511      task-clock:u                     #    0.999 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
           726,546      page-faults:u                    #  325.379 K/sec                     
    23,155,704,327      instructions:u                   #    3.62  insn per cycle            
                                                  #    0.02  stalled cycles per insn   
     6,397,898,139      cycles:u                         #    2.865 GHz                       
       353,307,962      stalled-cycles-frontend:u        #    5.52% frontend cycles idle      
     5,685,685,604      branches:u                       #    2.546 G/sec                     
         1,479,992      branch-misses:u                  #    0.03% of all branches           

       2.234610944 seconds time elapsed

       1.326468000 seconds user
       0.894361000 seconds sys

With (room t) the output includes:

Dynamic space usage is:   1,612,423,280 bytes.                                                                   
Immobile space usage is:     12,816,912 bytes (2,880 bytes overhead).                                            
Read-only space usage is:     8,281,792 bytes.                                                                   
Static space usage is:            2,752 bytes.                                                                   
Control stack usage is:           4,680 bytes.                                                                   
Binding stack usage is:             640 bytes.                                                                   
Control and binding stack usage is for the current thread only.                                                  
Garbage collection is currently enabled.                                                                         
                                                                                                                 
Summary of spaces: dynamic immobile static                                                                       
                                                                                                                 
CONS:                                                                                                            
    1,603,778,944 bytes, 100,236,184 objects, 100% dynamic.

Here's near-identical OCaml:

(* [iota n] builds the list [0; 1; ...; n] (n+1 elements, matching the
   inclusive bound of the Lisp version above).  The [@tail_mod_cons]
   attribute asks the compiler to apply tail-modulo-cons optimization
   to the cons-building recursion, so the 100M-deep call does not
   overflow the stack. *)
let iota n =
  let[@tail_mod_cons] rec loop hi = function
    | n when n = hi -> [n]
    | n -> n :: loop hi (n + 1)
  in
  loop n 0

(* [last l] returns the final element of [l]; raises [Failure "last"]
   on the empty list.  Ordinary tail recursion, so constant stack. *)
let rec last = function
  | [x] -> x
  | _ :: tl -> last tl
  | [] -> failwith "last"

(* Build the 100,000,001-element list and print its last element. *)
let () =
  print_int (last (iota 100_000_000));
  print_newline ()

and its performance:

$ perf stat _build/default/bin/alloc1.exe                                           
100000000                                                                                                        
                                                                                                                 
 Performance counter stats for '_build/default/bin/alloc1.exe':                                                  
                                                                                                                 
     2,723,962,538      task-clock:u                     #    0.999 CPUs utilized                                
                 0      context-switches:u               #    0.000 /sec                                         
                 0      cpu-migrations:u                 #    0.000 /sec                                         
           586,687      page-faults:u                    #  215.380 K/sec                                        
    40,011,378,392      instructions:u                   #    4.53  insn per cycle                               
                                                  #    0.01  stalled cycles per insn                             
     8,837,817,006      cycles:u                         #    3.244 GHz                                          
       207,224,634      stalled-cycles-frontend:u        #    2.34% frontend cycles idle                         
     8,356,370,725      branches:u                       #    3.068 G/sec                                        
         1,492,433      branch-misses:u                  #    0.02% of all branches                              
                                                                                                                 
       2.727611291 seconds time elapsed                                                                          
                                                                                                                 
       1.881649000 seconds user                                                                                  
       0.827292000 seconds sys

and its Gc.print_stat stdout:

minor_collections:      1158                                                                                     
major_collections:      20                                                                                       
compactions:            0                                                                                        
forced_major_collections: 1                                                                                      
                                                                                                                 
minor_words:    300000098
promoted_words: 300000095
major_words:    300000161

top_heap_words: 300318720
heap_words:     300318720
live_words:     300000161
free_words:         25262
largest_free:           0
fragments:         293297

live_blocks: 100000032
free_blocks: 0
heap_chunks: 0

and the poop comparison:

Benchmark 1 (3 runs): sbcl --dynamic-space-size 5120 --script bin/alloc.lisp
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.87s  ± 76.1ms    2.81s  … 2.96s           0 ( 0%)        0%
  peak_rss           2.73GB ± 75.7KB    2.73GB … 2.73GB          0 ( 0%)        0%
  cpu_cycles         9.32G  ±  212M     9.15G  … 9.56G           0 ( 0%)        0%
  instructions       39.3G  ± 95.6      39.3G  … 39.3G           0 ( 0%)        0%
  cache_references    490M  ±  717K      489M  …  490M           0 ( 0%)        0%
  cache_misses       3.12M  ±  145K     2.98M  … 3.27M           0 ( 0%)        0%
  branch_misses      1.63M  ± 5.58K     1.62M  … 1.63M           0 ( 0%)        0%
Benchmark 2 (3 runs): _build/default/bin/alloc1.exe
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.72s  ± 28.1ms    2.69s  … 2.74s           0 ( 0%)          -  5.3% ±  4.5%
  peak_rss           2.41GB ±    0      2.41GB … 2.41GB          0 ( 0%)        ⚡- 11.9% ±  0.0%
  cpu_cycles         8.88G  ± 37.8M     8.84G  … 8.91G           0 ( 0%)        ⚡-  4.7% ±  3.7%
  instructions       40.0G  ±  389K     40.0G  … 40.0G           0 ( 0%)        💩+  1.9% ±  0.0%
  cache_references    655M  ± 3.52M      651M  …  658M           0 ( 0%)        💩+ 33.7% ±  1.2%
  cache_misses       6.23M  ±  318K     5.86M  … 6.42M           0 ( 0%)        💩+ 99.9% ± 18.0%
  branch_misses      1.52M  ± 30.8K     1.50M  … 1.55M           0 ( 0%)        ⚡-  6.8% ±  3.1%

I initially thought that OCaml performed quite a bit worse than SBCL, but it was all added work from Gc.print_stat. Instead OCaml's very similar on this very Lisp-friendly benchmark which mostly tests CONS and the allocator.

For fun here is Standard ML that saves the world after building the big list:

(* iota n: build [0, 1, ..., n] (n+1 elements).  Unlike the OCaml
   version, the cons-building recursion here is not tail recursive --
   SML has no tail-modulo-cons annotation. *)
fun iota n =
  let
    fun loop hi m =
      if m = hi then [m] else m :: loop hi (m + 1)
  in
    loop n 0
  end

(* last l: final element of l; raises Fail "last" on the empty list. *)
fun last [x] = x
  | last (_ :: tl) = last tl
  | last [] = raise Fail "last"

(* Build the big list, then snapshot the entire heap to disk with
   MLton.World.save; a later run resumed via "@MLton load-world"
   restarts execution right here, with the list already built,
   and only has to traverse it and print. *)
val () =
  let
    val list = iota 100000000
    val _ = MLton.World.save "alloc1.world"
    val result = last list
  in
    print (Int.toString result ^ "\n")
  end

MLton and MLKit both need nearly 5 s on this benchmark (in a version without MLton.World.save) and get wiped out by both SBCL and OCaml. But when MLton is allowed to cheat by starting mid-way...

$ poop ./_build/default/bin/alloc1.exe './bin/alloc1 @MLton load-world alloc1.world --'
Benchmark 1 (3 runs): ./_build/default/bin/alloc1.exe
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.71s  ± 17.8ms    2.69s  … 2.72s           0 ( 0%)        0%
  peak_rss           2.40GB ± 75.7KB    2.40GB … 2.41GB          0 ( 0%)        0%
  cpu_cycles         8.88G  ± 25.1M     8.86G  … 8.91G           0 ( 0%)        0%
  instructions       40.0G  ±  581K     40.0G  … 40.0G           0 ( 0%)        0%
  cache_references    653M  ±  409K      653M  …  654M           0 ( 0%)        0%
  cache_misses       6.38M  ± 75.5K     6.30M  … 6.44M           0 ( 0%)        0%
  branch_misses      1.52M  ± 15.7K     1.50M  … 1.53M           0 ( 0%)        0%
Benchmark 2 (4 runs): ./bin/alloc1 @MLton load-world alloc1.world --
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          1.59s  ± 16.6ms    1.57s  … 1.60s           0 ( 0%)        ⚡- 41.5% ±  1.2%
  peak_rss           2.53GB ±  107KB    2.53GB … 2.53GB          0 ( 0%)        💩+  5.1% ±  0.0%
  cpu_cycles         3.24G  ± 11.7M     3.23G  … 3.25G           0 ( 0%)        ⚡- 63.5% ±  0.4%
  instructions       7.90G  ± 6.80      7.90G  … 7.90G           0 ( 0%)        ⚡- 80.2% ±  0.0%
  cache_references    155M  ± 91.3K      155M  …  155M           1 (25%)        ⚡- 76.3% ±  0.1%
  cache_misses        107K  ± 10.4K     95.8K  …  117K           0 ( 0%)        ⚡- 98.3% ±  1.5%
  branch_misses      38.4K  ± 82.9      38.3K  … 38.5K           0 ( 0%)        ⚡- 97.5% ±  1.3%

OCaml can use a memory-mapped bigarray cache, but MLton can do it with any data.

Although,

(* [iota n]: the list [0; 1; ...; n] (n+1 elements).  [@tail_mod_cons]
   makes the cons-building recursion safe at 100M elements. *)
let iota n =
  let[@tail_mod_cons] rec loop hi = function
    | n when n = hi -> [n]
    | n -> n :: loop hi (n + 1)
  in
  loop n 0

(* [last l]: final element of [l]; raises [Failure "last"] on []. *)
let rec last = function
  | [x] -> x
  | _ :: tl -> last tl
  | [] -> failwith "last"

(* "save <file>": build the list and serialize it with Marshal.
   No_sharing skips Marshal's sharing-detection pass -- a pure cons
   list has no sharing, and skipping the check is markedly faster.
   "load <file>": deserialize the cached list instead of rebuilding.
   Either way, traverse with [last] and print the final element. *)
let () =
  let list =
    match Sys.argv with
    | [|_; "save"; filename|] ->
      let list = iota 100_000_000 in
      Out_channel.with_open_bin filename (fun file ->
          Marshal.to_channel file list [Marshal.No_sharing]);
      list
    | [|_; "load"; filename|] -> In_channel.with_open_bin filename Marshal.from_channel
    | _ ->
      prerr_endline "usage: <save|load> <file>";
      exit 1
  in
  print_int (last list);
  print_newline ()

is about twice as slow when generating the cache (or 5x as slow without Marshal.No_sharing) in return for some slight advantages, especially in cache usage:

Benchmark 1 (3 runs): ./_build/default/bin/alloc1.exe
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.74s  ± 35.7ms    2.70s  … 2.77s           0 ( 0%)        0%
  peak_rss           2.41GB ±    0      2.41GB … 2.41GB          0 ( 0%)        0%
  cpu_cycles         9.01G  ± 52.9M     8.95G  … 9.06G           0 ( 0%)        0%
  instructions       40.0G  ±  396K     40.0G  … 40.0G           0 ( 0%)        0%
  cache_references    655M  ±  530K      654M  …  655M           0 ( 0%)        0%
  cache_misses       6.50M  ±  231K     6.23M  … 6.67M           0 ( 0%)        0%
  branch_misses      1.48M  ± 12.8K     1.47M  … 1.50M           0 ( 0%)        0%
Benchmark 2 (3 runs): ./_build/default/bin/alloc5.exe alloc.cache
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          2.46s  ± 20.5ms    2.44s  … 2.48s           0 ( 0%)        ⚡- 10.2% ±  2.4%
  peak_rss           3.81GB ± 86.3KB    3.81GB … 3.81GB          0 ( 0%)        💩+ 58.2% ±  0.0%
  cpu_cycles         5.59G  ± 8.70M     5.58G  … 5.60G           0 ( 0%)        ⚡- 37.9% ±  1.0%
  instructions       24.0G  ± 9.17      24.0G  … 24.0G           0 ( 0%)        ⚡- 39.9% ±  0.0%
  cache_references    220M  ±  581K      219M  …  220M           0 ( 0%)        ⚡- 66.5% ±  0.2%
  cache_misses        714K  ±  176K      524K  …  871K           0 ( 0%)        ⚡- 89.0% ±  7.2%
  branch_misses      1.63M  ± 3.51K     1.63M  … 1.64M           0 ( 0%)        💩+ 10.2% ±  1.4%

GC tuning

Going back to the original OCaml, GC tuning can easily worsen performance enormously:

$ OCAMLRUNPARAM=o=1 getr 1 _build/default/bin/alloc1.exe 
100000000
Real time      : 1 min 3 s
User time      : 62.3452 s
System time    : 1.1240 s
Time           : 1 min 3 s (63469.241 ms/per)
Max RSS        : 2.24 GB
Page reclaims  : 586694
Page faults    : 0
Block inputs   : 0
Block outputs  : 0
vol ctx switches   : 1
invol ctx switches : 1893

And also significantly improve it by making the minor heap size large enough that the list doesn't promote to the major heap:

$ hyperfine --warmup=2 _build/default/bin/alloc1.exe 'OCAMLRUNPARAM=s=300M _build/default/bin/alloc1.exe'
Benchmark 1: _build/default/bin/alloc1.exe
  Time (mean ± σ):      2.845 s ±  0.027 s    [User: 1.946 s, System: 0.878 s]
  Range (min … max):    2.800 s …  2.870 s    10 runs
 
Benchmark 2: OCAMLRUNPARAM=s=300M _build/default/bin/alloc1.exe
  Time (mean ± σ):      1.042 s ±  0.010 s    [User: 0.270 s, System: 0.766 s]
  Range (min … max):    1.027 s …  1.059 s    10 runs
 
Summary
  OCAMLRUNPARAM=s=300M _build/default/bin/alloc1.exe ran
    2.73 ± 0.04 times faster than _build/default/bin/alloc1.exe

v=64 gives decent runtime output from the GC - there's lots of output with default GC settings, and none for a minor heap sized at 300M words (about 2.3 GB on a 64-bit machine).

NB. s=300M doesn't only raise the memory water level by 2.3GB, even in an application with only one domain, because there's other GC tuning that considers the size of the minor heap. A program can run under 3G with default settings, then run at nearly 20G with s=3G.

NB. safepoints added to OCaml code