Miller 5.6.2

Overview   Using   Reference   Background   Repository  

Why?
Why C?
Why C: details
Why call it Miller?
How original is Miller?
Performance

Why C: details
• Overview
• Timings
• Discussion
• Source code
    • C cat using fgets
    • C cat using getdelim
    • C cat using mmap
    • D cat
    • Rust cat
    • Go cat
    • Nim cat
    • D cut
    • Go cut
    • Nim cut
• Makefile
• Compiler versions
• Hardware

Overview

This section contains details substantiating the discussion at Why C?.

I did simple experiments in several languages. In one experiment (cat tests) I just read lines and printed them back out — a line-oriented cat. In another (cut tests) I consumed input lines like x=1,y=2,z=3 one at a time, split them on commas and equals signs to populate hash maps, transformed them (e.g. remove the y field), and emitted them. Basically mlr cut -x -f y with DKVP format. I didn’t do anything fancy — just using each language’s getline, string-split, hashmap-put, etc. (For C, the hashmap logic was my own and has turned into Miller per se.) And nothing was as fast as C, so I used C. Here are the experiments I kept (I failed to keep the Lua code, for example). Note that I re-ran the timings in 2019 using code written in 2015, for purposes of creating this page.

Timings

----------------------------------------------------------------
$ wc -l ../data/bigger
1000000 ../data/bigger

$ ls -lh ../data/bigger
-rw----r-- 1 kerl kerl 59M Sep 18 21:47 ../data/bigger

$ head ../data/bigger
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776
a=eks,b=wye,i=4,x=0.38139939387114097,y=0.13418874328430463
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729
a=zee,b=pan,i=6,x=0.5271261600918548,y=0.49322128674835697
a=eks,b=zee,i=7,x=0.6117840605678454,y=0.1878849191181694
a=zee,b=wye,i=8,x=0.5985540091064224,y=0.976181385699006
a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059
a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864

----------------------------------------------------------------
$ cat timings.sh

time /bin/cat ../data/bigger > /dev/null
time catc < ../data/bigger  > /dev/null
time catc0 < ../data/bigger  > /dev/null
time catm ../data/bigger  > /dev/null
time catd < ../data/bigger  > /dev/null
time catgo < ../data/bigger  > /dev/null
time catrust < ../data/bigger  > /dev/null
time catnim < ../data/bigger  > /dev/null

time cutgo < ../data/bigger  > /dev/null
time cutd < ../data/bigger  > /dev/null
time cutnim < ../data/bigger  > /dev/null

----------------------------------------------------------------
$ bash -x timings.sh

+ /bin/cat ../data/bigger
real    0m0.026s
user    0m0.000s
sys     0m0.026s

+ catc
real    0m0.097s
user    0m0.076s
sys     0m0.021s

+ catc0
real    0m0.057s
user    0m0.028s
sys     0m0.028s

+ catm ../data/bigger
real    0m0.197s
user    0m0.177s
sys     0m0.020s

+ catd
real    0m0.258s
user    0m0.213s
sys     0m0.054s

+ catgo
real    0m0.121s
user    0m0.111s
sys     0m0.024s

+ catrust
real    0m1.055s
user    0m1.035s
sys     0m0.020s

+ catnim
real    0m0.624s
user    0m0.432s
sys     0m0.192s

+ cutgo
real    0m1.073s
user    0m1.079s
sys     0m0.041s

+ cutd
real    0m4.233s
user    0m4.462s
sys     0m0.113s

+ cutnim
real    0m21.380s
user    0m21.350s
sys     0m0.024s

Discussion

Source code

C cat using fgets

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// ----------------------------------------------------------------
static int do_stream(char* file_name) {
	FILE* input_stream  = stdin;
	FILE* output_stream = stdout;

	if (strcmp(file_name, "-")) {
		input_stream = fopen(file_name, "r");
		if (input_stream == NULL) {
			perror(file_name);
			return 0;
		}
	}

	while (1) {
		char* line = NULL;
		size_t linecap = 0;
		ssize_t linelen = getdelim(&line, &linecap, '\n', input_stream);
		if (linelen <= 0) {
			break;
		}
		fputs(line, output_stream);
		free(line);
	}
	if (input_stream != stdin)
		fclose(input_stream);

	return 1;
}

// ================================================================
int main(int argc, char** argv) {
	int ok = 1;
	if (argc == 1) {
		ok = ok && do_stream("-");
	} else {
		for (int argi = 1; argi < argc; argi++) {
		    ok = do_stream(argv[argi]);
		}
	}
	return ok ? 0 : 1;
}

C cat using getdelim

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MYBUFSIZ 8192
static char iobuf[MYBUFSIZ];

// ----------------------------------------------------------------
static int do_stream(char* file_name) {
	FILE* input_stream  = stdin;
	FILE* output_stream = stdout;

	if (strcmp(file_name, "-")) {
		input_stream = fopen(file_name, "r");
		if (input_stream == NULL) {
			perror(file_name);
			return 0;
		}
	}

	while (1) {
		char* line = fgets(iobuf, BUFSIZ, input_stream);
		if (line == NULL)
			break;
		fputs(line, output_stream);
	}
	if (input_stream != stdin)
		fclose(input_stream);

	return 1;
}

// ================================================================
int main(int argc, char** argv) {
	int ok = 1;
	if (argc == 1) {
		ok = ok && do_stream("-");
	} else {
		for (int argi = 1; argi < argc; argi++) {
		    ok = do_stream(argv[argi]);
		}
	}
	return ok ? 0 : 1;
}

C cat using mmap

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>

// ----------------------------------------------------------------
static void emit(char* sol, char* eol, FILE* output_stream) {
	 size_t ntowrite = eol - sol;
     size_t nwritten = fwrite(sol, 1, ntowrite, output_stream);
	 if (nwritten != ntowrite) {
		perror("fwrite");
		exit(1);
	 }
}

// ----------------------------------------------------------------
static int do_stream(char* file_name) {
	FILE* output_stream = stdout;
	int fd = open(file_name, O_RDONLY);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	struct stat stat;
	if (fstat(fd, &stat) < 0) {
		perror("fstat");
		exit(1);
	}
	char* sof = mmap(NULL, (size_t)stat.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE, fd, (off_t)0);
	if (sof == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	char* eof = sof + stat.st_size;
	char* sol = sof;
	char* eol;
	char* p = sof;

	while (p < eof) {
		if (*p == '\n') {
			*p = 0;
			eol = p;
			emit(sol, eol, output_stream);
			p++;
			sol = p;
		} else {
			p++;
		}
	}

	if (close(fd) < 0) {
		perror("close");
		exit(1);
	}

	return 1;
}

// ================================================================
int main(int argc, char** argv) {
	int ok = 1;
	for (int argi = 1; argi < argc; argi++) {
	    ok = do_stream(argv[argi]);
	}
	return ok ? 0 : 1;
}

D cat

// Reads $(D stdin) and writes it to $(D stdout).
import std.stdio;

void main()
{
	string line;
	while ((line = stdin.readln()) !is null)
		write(line);
}

Rust cat

use std::io;
use std::io::BufRead;

fn main() {
    for line in io::stdin().lock().lines() {
        print!("{}", line.unwrap());
    }
}

Go cat

package main

import (
	"bufio"
	"io"
	"log"
	"os"
)

// ----------------------------------------------------------------
func main() {
	args := os.Args[1:]
	includeFields := []string {"a", "x"};

	ok := true
	if len(args) == 0 {
		ok = handle("-", includeFields) && ok
	} else {
		for _, arg := range args {
			ok = handle(arg, includeFields) && ok
		}
	}
	if ok {
		os.Exit(0)
	} else {
		os.Exit(1)
	}
}

// ----------------------------------------------------------------
func handle(fileName string, includeFields []string) (ok bool) {
	inputStream := os.Stdin
	if fileName != "-" {
		var err error
		if inputStream, err = os.Open(fileName); err != nil {
			log.Println(err)
			return false
		}
	}

	reader := bufio.NewReader(inputStream)
	writer := bufio.NewWriter(os.Stdout)
	eof := false

	for !eof {
		line, err := reader.ReadString('\n')
		if err == io.EOF {
			err = nil
			eof = true
		} else if err != nil {
			log.Println(err)
			if fileName != "-" {
				inputStream.Close()
			}
			return false
		} else {
			writer.WriteString(line)
		}
	}
	if fileName != "-" {
		inputStream.Close()
	}
	writer.Flush()

	return true
}

Nim cat

for line in stdin.lines:
  echo(line)

D cut

// Reads $(D stdin) and writes it to $(D stdout).
// http://dlang.org/hash-map.html
import std.stdio;
import std.string;
import std.array;

void main() {
	string[] includeFields = ["a", "x"];
	string line;
	while ((line = stdin.readln()) !is null) {
		// Input string to hashmap.
		string[string] oldmap;
		string[] fields = split(line, ',');
		foreach (field; fields) {
			string[] kvps = split(field, '='); // really want splitN with max #parts = 2
			oldmap[kvps[0]] = kvps[1];
		}

		// Hashmap-to-hashmap transform.
		// Note: unordered hashmap here.
		string[string] newmap;
		foreach (includeField; includeFields) {
			if (includeField in oldmap) {
				newmap[includeField] = oldmap[includeField];
			}
		}

		// Hashmap to output strings.
		int i = 0;
		foreach (key; newmap.keys) {
			if (i > 0)
				write(',');
			write(key);
			write('=');
			write(newmap[key]);
			i++;
		}
		write('\n');
	}
}

Go cut

package main

import (
	"bufio"
	"io"
	"log"
	"os"
	"strings"
)

// ----------------------------------------------------------------
func main() {
	args := os.Args[1:]
	includeFields := []string {"a", "x"};

	ok := true
	if len(args) == 0 {
		ok = handle("-", includeFields) && ok
	} else {
		for _, arg := range args {
			ok = handle(arg, includeFields) && ok
		}
	}
	if ok {
		os.Exit(0)
	} else {
		os.Exit(1)
	}
}

// ----------------------------------------------------------------
func handle(fileName string, includeFields []string) (ok bool) {
	inputStream := os.Stdin
	if fileName != "-" {
		var err error
		if inputStream, err = os.Open(fileName); err != nil {
			log.Println(err)
			return false
		}
	}

	reader := bufio.NewReader(inputStream)
	writer := bufio.NewWriter(os.Stdout)
	eof := false

	for !eof {
		line, err := reader.ReadString('\n')
		if err == io.EOF {
			err = nil
			eof = true
		} else if err != nil {
			log.Println(err)
			if fileName != "-" {
				inputStream.Close()
			}
			return false
		} else {

			// 0.030s

			// Line to map
			mymap := make(map[string]string)
			fields := strings.Split(line, ",");
			for _, field := range(fields) {
				kvps := strings.SplitN(field, "=", 2)
				mymap[kvps[0]] = kvps[1]
			}
			// 0.220s
			// delta 0.190s
			// 27%

			// Map-to-map transform
			newmap := make(map[string]string)
			for _, includeField := range(includeFields) {
				value, present := mymap[includeField]
				if present {
					newmap[includeField] = value
				}
			}
			// 0.280s
			// delta 0.060s
			// 9%

			// Map to string
			outs := make([]string, len(newmap))
			i := 0
			for k, v := range(newmap) {
				outs[i] = k + "=" + v
				i++
			}
			// 0.320s
			// delta 0.040s
			// 6%

			out := strings.Join(outs, ",")
			// 0.330s
			// delta 0.010s
			// 2%

			// Write to output stream
			//fmt.Println("")
			writer.WriteString(out)
			// delta 0.390s
			// 56%

		}
	}
	if fileName != "-" {
		inputStream.Close()
	}
	writer.Flush()

	return true
}

Nim cut

import strutils, tables

for line in stdin.lines:
  #var map: OrderedTable[string,string]
  var map = {"":""}.newOrderedTable
  #var map = initTable[string, string]
  #var map: OrderedTable[string, string]
  #var map: newOrderedTable[string, string](16)
  for word in line.split(","):
      var pair = word.split("=")
      #echo(pair[0])
      #echo(pair[1])
      #echo()
      #map[pair[0]] = pair[1]
      map.add(pair[0], pair[1])

Makefile

all: catc catc0 catm catd catrust catgo catnim cutd cutgo cutnim
allc: catc catc0 catm

catc: catc.c
	gcc -Wall catc.c -o catc
catc0: catc0.c
	gcc -Wall catc0.c -o catc0
catm: catm.c
	gcc -Wall catm.c -o catm
catd: catd.d
	dmd -O catd.d
catrust: catrust.rs
	rustc catrust.rs
catgo: catgo.go
	go build catgo.go
catnim: catnim.nim
	nim compile catnim.nim
cutd: cutd.d
	dmd -O cutd.d
cutgo: cutgo.go
	go build cutgo.go
cutnim: cutnim.nim
	nim compile cutnim.nim

Compiler versions

$ gcc --version
gcc (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

$ dmd --version
DMD64 D Compiler v2.088.0
Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved written by Walter Bright

$ rustc --version
rustc 1.36.0

$ go version
go version go1.10.4 linux/amd64

$ nim --version
Nim Compiler Version 0.17.2 (2018-02-05) [Linux: amd64]
Copyright (c) 2006-2018 by Andreas Rumpf

active boot switches: -d:release

Hardware

$ uname -a
Linux sprax 4.15.0-20-generic #21-Ubuntu SMP Tue Apr 24 06:16:15 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

$ cat /proc/cpuinfo
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 142
model name	: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
stepping	: 10
microcode	: 0x96
cpu MHz		: 796.615
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 22
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat
pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp
lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2
ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt
tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch
cpuid_fault epb invpcid_single pti tpr_shadow vnmi flexpriority ept vpid
fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx rdseed adx smap
clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves ibpb ibrs stibp dtherm ida
arat pln pts hwp hwp_notify hwp_act_window hwp_epp
bugs		: cpu_meltdown spectre_v1 spectre_v2
bogomips	: 3984.00
clflush size	: 64
cache_alignment	: 64
address sizes	: 39 bits physical, 48 bits virtual
power management:

... (8 CPUs total)