Overview: • About Miller • Miller in 10 minutes • File formats • Miller features in the context of the Unix toolkit • Record-heterogeneity • Internationalization Using Miller: • FAQ • Cookbook part 1 • Cookbook part 2 • Cookbook part 3 • Data-diving examples • Manpage • Reference • Reference: Verbs • Reference: DSL • Documents by release • Installation, portability, dependencies, and testing Background: • Why? • Why C? • Why call it Miller? • How original is Miller? • Performance Repository: • Things to do • Contact information • GitHub repo |
• Choices for printing to files • Other-format examples • SQL-output examples • Log-processing examples • More CSV-file examples$ cat example.csv color,shape,flag,index,quantity,rate yellow,triangle,1,11,43.6498,9.8870 red,square,1,15,79.2778,0.0130 red,circle,1,16,13.8103,2.9010 red,square,0,48,77.5542,7.4670 purple,triangle,0,51,81.2290,8.5910 red,square,0,64,77.1991,9.5310 purple,triangle,0,65,80.1405,5.8240 yellow,circle,1,73,63.9785,4.2370 yellow,circle,1,87,63.5058,8.3350 purple,square,0,91,72.3735,8.2430 $ mlr --csv cat example.csv color,shape,flag,index,quantity,rate yellow,triangle,1,11,43.6498,9.8870 red,square,1,15,79.2778,0.0130 red,circle,1,16,13.8103,2.9010 red,square,0,48,77.5542,7.4670 purple,triangle,0,51,81.2290,8.5910 red,square,0,64,77.1991,9.5310 purple,triangle,0,65,80.1405,5.8240 yellow,circle,1,73,63.9785,4.2370 yellow,circle,1,87,63.5058,8.3350 purple,square,0,91,72.3735,8.2430 $ mlr --icsv --opprint cat example.csv color shape flag index quantity rate yellow triangle 1 11 43.6498 9.8870 red square 1 15 79.2778 0.0130 red circle 1 16 13.8103 2.9010 red square 0 48 77.5542 7.4670 purple triangle 0 51 81.2290 8.5910 red square 0 64 77.1991 9.5310 purple triangle 0 65 80.1405 5.8240 yellow circle 1 73 63.9785 4.2370 yellow circle 1 87 63.5058 8.3350 purple square 0 91 72.3735 8.2430 $ mlr --csv head -n 4 example.csv color,shape,flag,index,quantity,rate yellow,triangle,1,11,43.6498,9.8870 red,square,1,15,79.2778,0.0130 red,circle,1,16,13.8103,2.9010 red,square,0,48,77.5542,7.4670 $ mlr --csv tail -n 4 example.csv color,shape,flag,index,quantity,rate purple,triangle,0,65,80.1405,5.8240 yellow,circle,1,73,63.9785,4.2370 yellow,circle,1,87,63.5058,8.3350 purple,square,0,91,72.3735,8.2430 $ mlr --icsv --opprint sort -f shape -nr index example.csv color shape flag index quantity rate yellow circle 1 87 63.5058 8.3350 yellow circle 1 73 63.9785 4.2370 red circle 1 16 13.8103 2.9010 purple square 0 91 72.3735 8.2430 red square 0 64 77.1991 9.5310 red square 0 48 77.5542 7.4670 red square 1 15 79.2778 0.0130 purple triangle 0 65 80.1405 5.8240 purple triangle 0 51 81.2290 8.5910 yellow triangle 1 11 43.6498 9.8870 $ mlr --icsv --opprint cut -f flag,shape example.csv shape flag triangle 1 square 1 circle 1 square 0 triangle 0 square 0 triangle 0 circle 1 circle 1 square 0 $ mlr --icsv --opprint cut -o -f flag,shape example.csv flag shape 1 triangle 1 square 1 circle 0 square 0 triangle 0 square 0 triangle 1 circle 1 circle 0 square $ mlr --icsv --opprint cut -x -f flag,shape example.csv color index quantity rate yellow 11 43.6498 9.8870 red 15 79.2778 0.0130 red 16 13.8103 2.9010 red 48 77.5542 7.4670 purple 51 81.2290 8.5910 red 64 77.1991 9.5310 purple 65 80.1405 5.8240 yellow 73 63.9785 4.2370 yellow 87 63.5058 8.3350 purple 91 72.3735 8.2430 $ mlr --icsv --opprint filter '$color == "red"' example.csv color shape flag index quantity rate red square 1 15 79.2778 0.0130 red circle 1 16 13.8103 2.9010 red square 0 48 77.5542 7.4670 red square 0 64 77.1991 9.5310 $ mlr --icsv --opprint filter '$color == "red" && $flag == 1' example.csv color shape flag index quantity rate red square 1 15 79.2778 0.0130 red circle 1 16 13.8103 2.9010 $ mlr --icsv --opprint put '$ratio = $quantity / $rate; $color_shape = $color . "_" . $shape' example.csv color shape flag index quantity rate ratio color_shape yellow triangle 1 11 43.6498 9.8870 4.414868 yellow_triangle red square 1 15 79.2778 0.0130 6098.292308 red_square red circle 1 16 13.8103 2.9010 4.760531 red_circle red square 0 48 77.5542 7.4670 10.386260 red_square purple triangle 0 51 81.2290 8.5910 9.455127 purple_triangle red square 0 64 77.1991 9.5310 8.099790 red_square purple triangle 0 65 80.1405 5.8240 13.760388 purple_triangle yellow circle 1 73 63.9785 4.2370 15.099953 yellow_circle yellow circle 1 87 63.5058 8.3350 7.619172 yellow_circle purple square 0 91 72.3735 8.2430 8.779995 purple_square $ mlr --icsv --ojson put '$ratio = $quantity/$rate; $shape = toupper($shape)' example.csv { "color": "yellow", "shape": "TRIANGLE", "flag": 1, "index": 11, "quantity": 43.6498, "rate": 9.8870, "ratio": 4.414868 } { "color": "red", "shape": "SQUARE", "flag": 1, "index": 15, "quantity": 79.2778, "rate": 0.0130, "ratio": 6098.292308 } { "color": "red", "shape": "CIRCLE", "flag": 1, "index": 16, "quantity": 13.8103, "rate": 2.9010, "ratio": 4.760531 } { "color": "red", "shape": "SQUARE", "flag": 0, "index": 48, "quantity": 77.5542, "rate": 7.4670, "ratio": 10.386260 } { "color": "purple", "shape": "TRIANGLE", "flag": 0, "index": 51, "quantity": 81.2290, "rate": 8.5910, "ratio": 9.455127 } { "color": "red", "shape": "SQUARE", "flag": 0, "index": 64, "quantity": 77.1991, "rate": 9.5310, "ratio": 8.099790 } { "color": "purple", "shape": "TRIANGLE", "flag": 0, "index": 65, "quantity": 80.1405, "rate": 5.8240, "ratio": 13.760388 } { "color": "yellow", "shape": "CIRCLE", "flag": 1, "index": 73, "quantity": 63.9785, "rate": 4.2370, "ratio": 15.099953 } { "color": "yellow", "shape": "CIRCLE", "flag": 1, "index": 87, "quantity": 63.5058, "rate": 8.3350, "ratio": 7.619172 } { "color": "purple", "shape": "SQUARE", "flag": 0, "index": 91, "quantity": 72.3735, "rate": 8.2430, "ratio": 8.779995 } $ mlr --icsv --ojson --jvstack --jlistwrap tail -n 2 example.csv [ { "color": "yellow", "shape": "circle", "flag": 1, "index": 87, "quantity": 63.5058, "rate": 8.3350 } ,{ "color": "purple", "shape": "square", "flag": 0, "index": 91, "quantity": 72.3735, "rate": 8.2430 } ] $ mlr --icsv --opprint sort -f shape -nr index then head -n 1 -g shape example.csv color shape flag index quantity rate yellow circle 1 87 63.5058 8.3350 purple square 0 91 72.3735 8.2430 purple triangle 0 65 80.1405 5.8240 $ mlr --icsv --oxtab --from example.csv stats1 -a p0,p10,p25,p50,p75,p90,p99,p100 -f rate rate_p0 0.013000 rate_p10 2.901000 rate_p25 4.237000 rate_p50 8.243000 rate_p75 8.591000 rate_p90 9.887000 rate_p99 9.887000 rate_p100 9.887000 $ mlr --icsv --opprint --from example.csv stats1 -a count,min,mean,max -f quantity -g shape shape quantity_count quantity_min quantity_mean quantity_max triangle 3 43.649800 68.339767 81.229000 square 4 72.373500 76.601150 79.277800 circle 3 13.810300 47.098200 63.978500 $ mlr --icsv --opprint --from example.csv stats1 -a count,min,mean,max -f quantity -g shape,color shape color quantity_count quantity_min quantity_mean quantity_max triangle yellow 1 43.649800 43.649800 43.649800 square red 3 77.199100 78.010367 79.277800 circle red 1 13.810300 13.810300 13.810300 triangle purple 2 80.140500 80.684750 81.229000 circle yellow 2 63.505800 63.742150 63.978500 square purple 1 72.373500 72.373500 72.373500 Choices for printing to files
Often we want to print output
$ mlr --csv --from example.csv put -q 'tee > $shape.".csv", $*'
Other-format examples
What’s a CSV file, really? It’s an array of rows, or
records, each being a list of key-value pairs, or fields: for CSV
it so happens that all the keys are shared in the header line and the values
vary data line by data line.
For example, if you have
shape,flag,index circle,1,24 square,0,36 shape=circle,flag=1,index=24 shape=square,flag=0,index=36 CSV PPRINT JSON shape,flag,index shape flag index [ circle,1,24 circle 1 24 { square,0,36 square 0 36 "shape": "circle", "flag": 1, "index": 24 }, DKVP XTAB { shape=circle,flag=1,index=24 shape circle "shape": "square", shape=square,flag=0,index=36 flag 1 "flag": 0, index 24 "index": 36 } shape square ] flag 0 index 36 SQL-output examples
I like to produce SQL-query output with header-column and tab delimiter:
this is CSV but with a tab instead of a comma, also known as TSV. Then I
post-process with mlr --tsv or mlr --tsvlite. This
means I can do some (or all, or none) of my data processing within SQL queries,
and some (or none, or all) of my data processing using Miller — whichever
is most convenient for my needs at the moment.
For example, using default output formatting in mysql we get
formatting like Miller’s --opprint --barred:
$ mysql --database=mydb -e 'show columns in mytable' +------------------+--------------+------+-----+---------+-------+ | Field | Type | Null | Key | Default | Extra | +------------------+--------------+------+-----+---------+-------+ | id | bigint(20) | NO | MUL | NULL | | | category | varchar(256) | NO | | NULL | | | is_permanent | tinyint(1) | NO | | NULL | | | assigned_to | bigint(20) | YES | | NULL | | | last_update_time | int(11) | YES | | NULL | | +------------------+--------------+------+-----+---------+-------+ $ mysql --database=mydb -B -e 'show columns in mytable' | mlr --itsvlite --opprint cat Field Type Null Key Default Extra id bigint(20) NO MUL NULL - category varchar(256) NO - NULL - is_permanent tinyint(1) NO - NULL - assigned_to bigint(20) YES - NULL - last_update_time int(11) YES - NULL - $ mysql --database=mydb -B -e 'show columns in mytable' | mlr --itsvlite --ojson --jlistwrap --jvstack cat [ { "Field": "id", "Type": "bigint(20)", "Null": "NO", "Key": "MUL", "Default": "NULL", "Extra": "" }, { "Field": "category", "Type": "varchar(256)", "Null": "NO", "Key": "", "Default": "NULL", "Extra": "" }, { "Field": "is_permanent", "Type": "tinyint(1)", "Null": "NO", "Key": "", "Default": "NULL", "Extra": "" }, { "Field": "assigned_to", "Type": "bigint(20)", "Null": "YES", "Key": "", "Default": "NULL", "Extra": "" }, { "Field": "last_update_time", "Type": "int(11)", "Null": "YES", "Key": "", "Default": "NULL", "Extra": "" } ] $ mysql --database=mydb -B -e 'select * from mytable' > query.tsv $ mlr --from query.tsv --t2p stats1 -a count -f id -g category,assigned_to category assigned_to id_count special 10000978 207 special 10003924 385 special 10009872 168 standard 10000978 524 standard 10003924 392 standard 10009872 108 ... Log-processing examples
Another of my favorite use-cases for Miller is doing ad-hoc processing of
log-file data. Here’s where DKVP format really shines: one, since the
field names and field values are present on every line, every line stands on
its own. That means you can grep or what have you. Also it means not
every line needs to have the same list of field names (“schema ”).
Again, all the examples in the CSV section apply here — just change
the input-format flags. But there’s more you can do when not all the
records have the same shape.
Writing a program — in any language whatsoever — you can have
it print out log lines as it goes along, with items for various events jumbled
together. After the program has finished running you can sort it all out,
filter it, analyze it, and learn from it.
Suppose your program has printed something like this:
$ cat log.txt op=enter,time=1472819681 op=cache,type=A9,hit=0 op=cache,type=A4,hit=1 time=1472819690,batch_size=100,num_filtered=237 op=cache,type=A1,hit=1 op=cache,type=A9,hit=0 op=cache,type=A1,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A1,hit=1 time=1472819705,batch_size=100,num_filtered=348 op=cache,type=A4,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A4,hit=1 time=1472819713,batch_size=100,num_filtered=493 op=cache,type=A9,hit=1 op=cache,type=A1,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=1 time=1472819720,batch_size=100,num_filtered=554 op=cache,type=A1,hit=0 op=cache,type=A4,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A4,hit=0 op=cache,type=A4,hit=0 op=cache,type=A9,hit=0 time=1472819736,batch_size=100,num_filtered=612 op=cache,type=A1,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 op=cache,type=A4,hit=1 op=cache,type=A1,hit=1 op=cache,type=A9,hit=0 op=cache,type=A9,hit=0 time=1472819742,batch_size=100,num_filtered=728 $ grep op=cache log.txt \ | mlr --idkvp --opprint stats1 -a mean -f hit -g type then sort -f type type hit_mean A1 0.857143 A4 0.714286 A9 0.090909 $ mlr --from log.txt --opprint \ filter 'is_present($batch_size)' \ then step -a delta -f time,num_filtered \ then sec2gmt time time batch_size num_filtered time_delta num_filtered_delta 2016-09-02T12:34:50Z 100 237 0 0 2016-09-02T12:35:05Z 100 348 15 111 2016-09-02T12:35:13Z 100 493 8 145 2016-09-02T12:35:20Z 100 554 7 61 2016-09-02T12:35:36Z 100 612 16 58 2016-09-02T12:35:42Z 100 728 6 116 $ mlr --opprint group-like log.txt op time enter 1472819681 op type hit cache A9 0 cache A4 1 cache A1 1 cache A9 0 cache A1 1 cache A9 0 cache A9 0 cache A1 1 cache A4 1 cache A9 0 cache A9 0 cache A9 0 cache A9 0 cache A4 1 cache A9 1 cache A1 1 cache A9 0 cache A9 0 cache A9 1 cache A1 0 cache A4 1 cache A9 0 cache A9 0 cache A9 0 cache A4 0 cache A4 0 cache A9 0 cache A1 1 cache A9 0 cache A9 0 cache A9 0 cache A9 0 cache A4 1 cache A1 1 cache A9 0 cache A9 0 time batch_size num_filtered 1472819690 100 237 1472819705 100 348 1472819713 100 493 1472819720 100 554 1472819736 100 612 1472819742 100 728 $ mlr --opprint group-like then sec2gmt time log.txt op time enter 2016-09-02T12:34:41Z op type hit cache A9 0 cache A4 1 cache A1 1 cache A9 0 cache A1 1 cache A9 0 cache A9 0 cache A1 1 cache A4 1 cache A9 0 cache A9 0 cache A9 0 cache A9 0 cache A4 1 cache A9 1 cache A1 1 cache A9 0 cache A9 0 cache A9 1 cache A1 0 cache A4 1 cache A9 0 cache A9 0 cache A9 0 cache A4 0 cache A4 0 cache A9 0 cache A1 1 cache A9 0 cache A9 0 cache A9 0 cache A9 0 cache A4 1 cache A1 1 cache A9 0 cache A9 0 time batch_size num_filtered 2016-09-02T12:34:50Z 100 237 2016-09-02T12:35:05Z 100 348 2016-09-02T12:35:13Z 100 493 2016-09-02T12:35:20Z 100 554 2016-09-02T12:35:36Z 100 612 2016-09-02T12:35:42Z 100 728 More |