Mixing with other languages¶
As discussed in the section on File formats, Miller supports several different file formats. Different tools are good at different things, so it’s important to be able to move data into and out of other languages. CSV and JSON are well-known, of course; here are some examples using DKVP format, with Ruby and Python. Last, we show how to use arbitrary shell commands to extend functionality beyond Miller’s domain-specific language.
DKVP I/O in Python¶
Here are the I/O routines:
#!/usr/bin/env python
# ================================================================
# Example of DKVP I/O using Python.
#
# Key point: Use Miller for what it's good at; pass data into/out of tools in
# other languages to do what they're good at.
#
# bash$ python -i dkvp_io.py
#
# # READ
# >>> map = dkvpline2map('x=1,y=2', '=', ',')
# >>> map
# OrderedDict([('x', 1), ('y', 2)])
#
# # MODIFY
# >>> map['z'] = map['x'] + map['y']
# >>> map
# OrderedDict([('x', 1), ('y', 2), ('z', 3)])
#
# # WRITE
# >>> line = map2dkvpline(map, '=', ',')
# >>> line
# 'x=1,y=2,z=3'
#
# ================================================================
import re
import collections
# ----------------------------------------------------------------
# ips and ifs (input pair separator and input field separator) are nominally '=' and ','.
def dkvpline2map(line, ips, ifs):
    """Parse one DKVP line such as 'x=1,y=2' into an ordered map.

    Args:
        line: A single record, without its trailing newline.
        ips: Input pair separator, between a key and its value (nominally '=').
        ifs: Input field separator, between pairs (nominally ',').

    Returns:
        A collections.OrderedDict from field name to value, in input order.
        Values which parse as int become int, values which parse as float
        become float, and everything else is left as a string.

    Both separators are matched literally, not as regular expressions, so
    separators such as '|' or '.' behave as expected. Empty fields (including
    an entirely empty line) are skipped. A field which lacks the pair
    separator is stored under its 1-up position in the line, as a string key.
    """
    record = collections.OrderedDict()
    for position, pair in enumerate(line.split(ifs), start=1):
        if not pair:
            continue
        # Split on the first pair separator only, so values may contain it.
        key, found, value = pair.partition(ips)
        if not found:
            key, value = str(position), pair
        # Type inference: int first, then float, else leave as string.
        try:
            value = int(value)
        except ValueError:
            try:
                value = float(value)
            except ValueError:
                pass
        record[key] = value
    return record
# ----------------------------------------------------------------
# ops and ofs (output pair separator and output field separator) are nominally '=' and ','.
def map2dkvpline(map, ops, ofs):
    """Format a map of fields as one DKVP line such as 'x=1,y=2'.

    Args:
        map: Mapping from field name to value; iterated in its own order.
        ops: Output pair separator, between a key and its value (nominally '=').
        ofs: Output field separator, between pairs (nominally ',').

    Returns:
        The formatted line, without a trailing newline. Keys and values are
        converted with str(). An empty map yields the empty string.
    """
    return ofs.join(str(key) + ops + str(value) for key, value in map.items())
And here is an example using them:
$ cat polyglot-dkvp-io/example.py
#!/usr/bin/env python
"""Example filter: read DKVP records on stdin, modify them, write them to stdout."""
import sys

import dkvp_io

# Iterate to the true end of input rather than stopping at the first empty
# line: a blank line in the middle of the data must not drop what follows it.
for raw_line in sys.stdin:
    line = raw_line.strip()
    if not line:
        continue

    # Read the original record:
    record = dkvp_io.dkvpline2map(line, '=', ',')

    # Drop a field (tolerating records which do not have it):
    record.pop('x', None)

    # Compute some new fields:
    record['ab'] = record['a'] + record['b']
    record['iy'] = record['i'] + record['y']

    # Add new fields which show type of each already-existing field.
    # list(record) snapshots the keys, since the loop body adds to the record.
    for key in list(record):
        # The bare class name, e.g. "int" rather than "<class 'int'>":
        record['t' + key] = type(record[key]).__name__

    # Write the modified record:
    print(dkvp_io.map2dkvpline(record, '=', ','))
Run as-is:
$ python polyglot-dkvp-io/example.py < data/small
a=pan,b=pan,i=1,y=0.7268028627434533,ab=panpan,iy=1.7268028627434533,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
a=eks,b=pan,i=2,y=0.5221511083334797,ab=ekspan,iy=2.5221511083334796,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
a=wye,b=wye,i=3,y=0.33831852551664776,ab=wyewye,iy=3.3383185255166477,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
a=eks,b=wye,i=4,y=0.13418874328430463,ab=ekswye,iy=4.134188743284304,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
a=wye,b=pan,i=5,y=0.8636244699032729,ab=wyepan,iy=5.863624469903273,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
Run as-is, then pipe to Miller for pretty-printing:
$ python polyglot-dkvp-io/example.py < data/small | mlr --opprint cat
a b i y ab iy ta tb ti ty tab tiy
pan pan 1 0.7268028627434533 panpan 1.7268028627434533 str str int float str float
eks pan 2 0.5221511083334797 ekspan 2.5221511083334796 str str int float str float
wye wye 3 0.33831852551664776 wyewye 3.3383185255166477 str str int float str float
eks wye 4 0.13418874328430463 ekswye 4.134188743284304 str str int float str float
wye pan 5 0.8636244699032729 wyepan 5.863624469903273 str str int float str float
DKVP I/O in Ruby¶
Here are the I/O routines:
#!/usr/bin/env ruby
# ================================================================
# Example of DKVP I/O using Ruby.
#
# Key point: Use Miller for what it's good at; pass data into/out of tools in
# other languages to do what they're good at.
#
# bash$ irb -I. -r dkvp_io.rb
#
# # READ
# irb(main):001:0> map = dkvpline2map('x=1,y=2', '=', ',')
# => {"x"=>1, "y"=>2}
#
# # MODIFY
# irb(main):001:0> map['z'] = map['x'] + map['y']
# => 3
#
# # WRITE
# irb(main):002:0> line = map2dkvpline(map, '=', ',')
# => "x=1,y=2,z=3"
#
# ================================================================
# ----------------------------------------------------------------
# ips and ifs (input pair separator and input field separator) are nominally '=' and ','.
# Parses one DKVP line such as 'x=1,y=2' into a Hash, preserving field order.
#
# line - String holding a single record, without its trailing newline.
# ips  - Input pair separator, between a key and its value (nominally '=').
# ifs  - Input field separator, between pairs (nominally ',').
#
# Values accepted by Integer() become Integer, values accepted by Float()
# become Float, and everything else is left as a String. Empty fields are
# skipped. A field which lacks the pair separator is stored under its 1-up
# position in the line, as a String key.
#
# Returns the Hash from field name to type-inferred value.
def dkvpline2map(line, ips, ifs)
  map = {}
  line.split(ifs).each_with_index do |pair, index|
    next if pair.empty?

    # Split on the first pair separator only, so values may contain it.
    k, v = pair.split(ips, 2)
    if v.nil?
      # No pair separator: without this, Integer(nil) raises a TypeError
      # which the ArgumentError rescues below would not catch.
      v = pair
      k = (index + 1).to_s
    end

    # Type inference:
    begin
      v = Integer(v)
    rescue ArgumentError
      begin
        v = Float(v)
      rescue ArgumentError
        # Leave as string
      end
    end
    map[k] = v
  end
  map
end
# ----------------------------------------------------------------
# ops and ofs (output pair separator and output field separator) are nominally '=' and ','.
# Formats a Hash of fields as one DKVP line such as 'x=1,y=2'.
#
# map - Hash from field name to value; emitted in the Hash's own order.
# ops - Output pair separator, between a key and its value (nominally '=').
# ofs - Output field separator, between pairs (nominally ',').
#
# Returns the formatted String, with keys and values converted via to_s.
def map2dkvpline(map, ops, ofs)
  pairs = map.each_with_object([]) do |(key, value), formatted|
    formatted << (key.to_s + ops + value.to_s)
  end
  pairs.join(ofs)
end
And here is an example using them:
$ cat polyglot-dkvp-io/example.rb
#!/usr/bin/env ruby
# Example filter: read DKVP records from the files named on the command line
# (or from standard input), modify them, and write them to standard output.
require 'dkvp_io'

ARGF.each_line do |line|
  line = line.chomp
  # Skip blank lines: they parse to an empty record, and the field
  # arithmetic below would then fail on nil values.
  next if line.strip.empty?

  # Read the original record:
  record = dkvpline2map(line, '=', ',')

  # Drop a field:
  record.delete('x')

  # Compute some new fields:
  record['ab'] = record['a'] + record['b']
  record['iy'] = record['i'] + record['y']

  # Add new fields which show type of each already-existing field.
  # Hash#keys returns a new array, so adding entries inside the loop is safe.
  record.keys.each do |key|
    record['t' + key] = record[key].class
  end

  # Write the modified record:
  puts map2dkvpline(record, '=', ',')
end
Run as-is:
$ ruby -I./polyglot-dkvp-io polyglot-dkvp-io/example.rb data/small
a=pan,b=pan,i=1,y=0.7268028627434533,ab=panpan,iy=1.7268028627434533,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=eks,b=pan,i=2,y=0.5221511083334797,ab=ekspan,iy=2.5221511083334796,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=wye,b=wye,i=3,y=0.33831852551664776,ab=wyewye,iy=3.3383185255166477,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=eks,b=wye,i=4,y=0.13418874328430463,ab=ekswye,iy=4.134188743284304,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=wye,b=pan,i=5,y=0.8636244699032729,ab=wyepan,iy=5.863624469903273,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
Run as-is, then pipe to Miller for pretty-printing:
$ ruby -I./polyglot-dkvp-io polyglot-dkvp-io/example.rb data/small | mlr --opprint cat
a b i y ab iy ta tb ti ty tab tiy
pan pan 1 0.7268028627434533 panpan 1.7268028627434533 String String Integer Float String Float
eks pan 2 0.5221511083334797 ekspan 2.5221511083334796 String String Integer Float String Float
wye wye 3 0.33831852551664776 wyewye 3.3383185255166477 String String Integer Float String Float
eks wye 4 0.13418874328430463 ekswye 4.134188743284304 String String Integer Float String Float
wye pan 5 0.8636244699032729 wyepan 5.863624469903273 String String Integer Float String Float
SQL-output examples¶
Please see SQL-output examples.
SQL-input examples¶
Please see SQL-input examples.
Running shell commands¶
The system DSL function lets you run an arbitrary shell command and put its output – minus the final newline – into a record field. The command is any string expression: either a literal string, or a concatenation of strings which may include field values or other computed values.
$ mlr --opprint put '$o = system("echo hello world")' data/small
a b i x y o
pan pan 1 0.3467901443380824 0.7268028627434533 hello world
eks pan 2 0.7586799647899636 0.5221511083334797 hello world
wye wye 3 0.20460330576630303 0.33831852551664776 hello world
eks wye 4 0.38139939387114097 0.13418874328430463 hello world
wye pan 5 0.5732889198020006 0.8636244699032729 hello world
$ mlr --opprint put '$o = system("echo {" . NR . "}")' data/small
a b i x y o
pan pan 1 0.3467901443380824 0.7268028627434533 {1}
eks pan 2 0.7586799647899636 0.5221511083334797 {2}
wye wye 3 0.20460330576630303 0.33831852551664776 {3}
eks wye 4 0.38139939387114097 0.13418874328430463 {4}
wye pan 5 0.5732889198020006 0.8636244699032729 {5}
$ mlr --opprint put '$o = system("echo -n ".$a."| sha1sum")' data/small
a b i x y o
pan pan 1 0.3467901443380824 0.7268028627434533 f29c748220331c273ef16d5115f6ecd799947f13 -
eks pan 2 0.7586799647899636 0.5221511083334797 456d988ecb3bf1b75f057fc6e9fe70db464e9388 -
wye wye 3 0.20460330576630303 0.33831852551664776 eab0de043d67f441c7fd1e335f0ca38708e6ebf7 -
eks wye 4 0.38139939387114097 0.13418874328430463 456d988ecb3bf1b75f057fc6e9fe70db464e9388 -
wye pan 5 0.5732889198020006 0.8636244699032729 eab0de043d67f441c7fd1e335f0ca38708e6ebf7 -
Note that running a subprocess on every record takes a non-trivial amount of time. Compare asking the system date
command for the current time in nanoseconds with computing the time in-process:
$ mlr --opprint put '$t=system("date +%s.%N")' then step -a delta -f t data/small
a b i x y t t_delta
pan pan 1 0.3467901443380824 0.7268028627434533 1568774318.513903817 0
eks pan 2 0.7586799647899636 0.5221511083334797 1568774318.514722876 0.000819
wye wye 3 0.20460330576630303 0.33831852551664776 1568774318.515618046 0.000895
eks wye 4 0.38139939387114097 0.13418874328430463 1568774318.516547441 0.000929
wye pan 5 0.5732889198020006 0.8636244699032729 1568774318.517518828 0.000971
$ mlr --opprint put '$t=systime()' then step -a delta -f t data/small
a b i x y t t_delta
pan pan 1 0.3467901443380824 0.7268028627434533 1568774318.518699 0
eks pan 2 0.7586799647899636 0.5221511083334797 1568774318.518717 0.000018
wye wye 3 0.20460330576630303 0.33831852551664776 1568774318.518723 0.000006
eks wye 4 0.38139939387114097 0.13418874328430463 1568774318.518727 0.000004
wye pan 5 0.5732889198020006 0.8636244699032729 1568774318.518730 0.000003