ML Notes: Standalone binaries

Bytecode portability

To see what a bytecode executable depends on, inspect it with ocamlobjinfo; the DLLs that need to be loaded at runtime are listed first:

$ ocamlobjinfo _build/default/bin/main.bc                       
File _build/default/bin/main.bc                                                                      
Used DLLs:
        dllcamlstrbyt
        dllunixbyt
        dllthreads
        dllsqlite3_stubs
        dllcamlzip

Apart from these DLLs, and of course apart from the more usual environmental dependencies (such as being able to spawn /usr/bin/git), bytecode should only require the same version of ocamlrun on the target.

Some benchmarks

(* o3.ml *)
let () = print_endline "Привет, мир!"; exit 0

(* m3.sml *)
val () = (print "Привет, мир!\n"; OS.Process.exit OS.Process.success);

$ ocamlmktop -o o3.bc -linkall -custom o3.ml
$ ocamlc -o o3.b o3.ml  # not actually standalone
$ ocamlopt -compact -o o3s o3.ml
$ ocamlopt -O2 -o o3 o3.ml
$ mosmlc -o m3 -standalone m3.sml
$ mlton -output m3b -default-ann 'allowExtendedTextConsts true' m3.sml
$ ./o3
Привет, мир!
$ ./m3
Привет, мир!

$ hyperfine ./m3 ./m3b ./o3 ./o3.bc ./o3.b ./o3s
...
  ./m3b ran
    1.04 ± 6.60 times faster than ./o3
    1.20 ± 7.58 times faster than ./o3s
    1.80 ± 10.26 times faster than ./o3.b
   17.05 ± 77.68 times faster than ./m3
  248.06 ± 1111.17 times faster than ./o3.bc
$ du -sh ./m3 ./m3b ./o3 ./o3.bc ./o3.b ./o3s
152K    ./m3
680K    ./m3b
1.7M    ./o3
5.3M    ./o3.bc
28K     ./o3.b
1.7M    ./o3s

Of course, this only measures the startup time of a trivial executable. I included plain ocamlc (o3.b) for its ergonomics even though its output is not standalone; here's something similar for SML/NJ:

(* m4.sml *)
structure Hello =
struct
  fun main (_, _) =
    (print "Привет, мир!\n"; OS.Process.success)
end

(* m4.cm *)
Group is
  $/basis.cm
  m4.sml

$ ml-build m4.cm Hello.main hello
Standard ML of New Jersey [Version 110.99.9; 64-bit; November 4, 2025]
[scanning m4.cm]
[library $SMLNJ-BASIS/basis.cm is stable]
[library $SMLNJ-BASIS/(basis.cm):basis-common.cm is stable]
[loading (m4.cm):m4.sml]
[scanning 185127-export.cm]
[scanning (185127-export.cm):m4.cm]
[parsing (185127-export.cm):185127-export.sml]
[compiling (185127-export.cm):185127-export.sml]
[code: 350, data: 30, env: 39 bytes]
$ sml @SMLload=hello
Привет, мир!
$ du -sh hello.amd64-linux 
428K    hello.amd64-linux
$ hyperfine ./m3 'sml @SMLload=hello'
...
  ./m3 ran
   12.34 ± 7.80 times faster than sml @SMLload=hello

Peak RSS:

m3                    2.23MB
m3b                   2.49MB
o3                    2.62MB
o3.bc                10.00MB
o3.b                  2.88MB
o3s                   2.62MB
sml @SMLload=hello    4.06MB

dune builds standalone bytecode that is surprisingly good: for this simple task it is significantly faster in wall time than even the ocamlopt and MLton native binaries above, while doing worse on every other metric, such as instruction count or cache misses.

(* bin/main.ml *)
let () = print_endline "Привет, мир!"

; bin/dune
(executable
 (public_name o4)
 (name main)
 (libraries o4)
 (modes byte_complete))

Output from poop (the delta column compares against the first benchmark in the same run):

Benchmark 8 (3255 runs): ./o4/_build/default/bin/main.bc.exe
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          1.48ms ±  439us     742us … 4.96ms         77 ( 2%)        ⚡- 11.0% ±  1.4%
  peak_rss           2.80MB ± 65.7KB    2.49MB … 2.88MB          2 ( 0%)        💩+ 35.8% ±  0.2%
  cpu_cycles          599K  ± 29.7K      573K  …  983K         238 ( 7%)        💩+ 72.4% ±  0.4%
  instructions        763K  ± 3.46       763K  …  763K         197 ( 6%)        💩+128.5% ±  0.0%
  cache_references   37.6K  ±  462      36.0K  … 41.5K          72 ( 2%)        💩+ 85.9% ±  0.1%
  cache_misses       8.44K  ±  215      7.33K  … 12.1K         137 ( 4%)        💩+ 39.2% ±  0.2%
  branch_misses      6.30K  ± 34.1      6.14K  … 6.46K          37 ( 1%)        💩+  3.1% ±  0.0%