At Cozi, our family-oriented software depends on several web services. We have our own deployment tool, called Artemis, to deploy those web services. Artemis has a simple web interface that processes commands using the same quoting rules as Unix's Bourne shell.
Had we written Artemis in Python, we could have simply used Python's built-in shlex module. However, as we chose Erlang for its robust and intelligent built-in support for concurrency and network communication, I found there to be no built-in or easily available version, so I decided to write one myself (if any reader does know of such a thing, please add a comment).
The goals (and non-goals) of the Erlang version of shlex are as follows:
- mimic the behavior of shlex.split, minus the posix and infile options
- performance is not a major concern, as this code is called infrequently
- try to use an Erlang-y coding style
Originally, this code was about half the present length, but it failed many of the unit tests, in particular the ones involving quoting at the end of the string. Probably the most interesting thing about the code (especially for someone, such as myself, with more experience using C-like languages) is the lack of explicit control structures (i.e., there are no if statements). Instead, the "function head" style is used: each case is written as a separate function clause, and pattern matching and guards select which clause runs.
%%% See the documentation for Python's built-in shlex module for what
%%% this does: http://docs.python.org/lib/module-shlex.html
-module(shlex).
%% include_lib expects the first path component to be an application
%% name, so the EUnit header is referenced through the eunit application
%% rather than as a bare file name that needs a custom include path.
-include_lib("eunit/include/eunit.hrl").
-export([split/1]).
%% Character-class tests. These must be macros (not functions) so that
%% they can be used in guard clauses.
%%
%% Each macro expands to a single parenthesised `orelse' expression
%% rather than a `;'-separated guard sequence. That way it behaves as one
%% guard test and can be combined with other tests, e.g.
%% `when ?IS_WHITESPACE(C), C =/= $\n', without the extra test binding
%% only to the last alternative.
-define(IS_WHITESPACE(Char),
        (Char =:= $\s orelse Char =:= $\t orelse
         Char =:= $\r orelse Char =:= $\n)).
-define(IS_QUOTE(Char), (Char =:= $\' orelse Char =:= $\")).
%% Split a command line into words, roughly like Python's shlex.split().
%% shlex's optional 'comments' argument is NOT supported: '#' is handled
%% like any other character.
%%
%% Returns {ok, Words} on success, or {error, Reason} where Reason is one
%% of trailing_backslash, unterminated_double_quote or
%% unterminated_single_quote.
split(String) ->
    %% Initial scanner state: no word in progress, no completed words,
    %% not inside quotes, and not directly after a backslash.
    split(String, [], [], none, false).
%% split(Input, Word, Line, Quote, Escape)
%%
%% The scanner proper. Each clause handles one case; the input is
%% consumed recursively, one character (occasionally two) at a time.
%%
%%   Input  - the characters still to be processed.
%%   Word   - the word currently being built up, kept in REVERSE order so
%%            that adding a character is a cons instead of a list append.
%%            [] means "no word in progress".
%%   Line   - the words completed so far, also in REVERSE order.
%%   Quote  - `none' when outside quotes, otherwise the quote character
%%            ($\" or $\') that opened the current quoted section.
%%   Escape - `true' when the previous character was an escaping
%%            backslash, `false' otherwise.
%%
%% Returns {ok, Words} with Words in input order, or {error, Reason}.
%%
%% Backslash handling follows the rules documented for Python's shlex in
%% POSIX mode: outside quotes a backslash escapes the next character;
%% inside single quotes it is an ordinary character; inside double quotes
%% it escapes only $\" and $\\ and is otherwise kept literally.

%% --- End of input -----------------------------------------------------
split([], _Word, _Line, _Quote, true) ->
    {error, trailing_backslash};
split([], _Word, _Line, $\", false) ->
    {error, unterminated_double_quote};
split([], _Word, _Line, $\', false) ->
    {error, unterminated_single_quote};
split([], [], Line, none, false) ->
    {ok, lists:reverse(Line)};
split([], Word, Line, none, false) ->
    {ok, lists:reverse(push_word(Word, Line))};

%% --- The character right after an escaping backslash --------------------
%% Inside double quotes only \" and \\ are escape sequences ...
split([Char | Rest], Word, Line, $\", true)
  when Char =:= $\"; Char =:= $\\ ->
    split(Rest, [Char | Word], Line, $\", false);
%% ... any other character keeps the backslash in front of it.
split([Char | Rest], Word, Line, $\", true) ->
    split(Rest, [Char, $\\ | Word], Line, $\", false);
%% Outside quotes the escaped character is taken literally.
split([Char | Rest], Word, Line, Quote, true) ->
    split(Rest, [Char | Word], Line, Quote, false);

%% --- A backslash ----------------------------------------------------------
%% Inside single quotes nothing is special, so the backslash is literal.
split([$\\ | Rest], Word, Line, $\', false) ->
    split(Rest, [$\\ | Word], Line, $\', false);
%% Everywhere else it starts an escape.
split([$\\ | Rest], Word, Line, Quote, false) ->
    split(Rest, Word, Line, Quote, true);

%% --- Outside quotes -------------------------------------------------------
%% Whitespace with no word in progress is skipped.
split([Char | Rest], [], Line, none, false)
  when ?IS_WHITESPACE(Char) ->
    split(Rest, [], Line, none, false);
%% Whitespace after a word finishes that word.
split([Char | Rest], Word, Line, none, false)
  when ?IS_WHITESPACE(Char) ->
    split(Rest, [], push_word(Word, Line), none, false);
%% An opening quote; remember which quote character opened the section.
split([Char | Rest], Word, Line, none, false)
  when ?IS_QUOTE(Char) ->
    split(Rest, Word, Line, Char, false);
split([Char | Rest], Word, Line, none, false) ->
    split(Rest, [Char | Word], Line, none, false);

%% --- Inside quotes ----------------------------------------------------------
split([Quote], Word, Line = [_ | _], Quote, false) ->
    %% Special case: "a ''" -> ["a", []]. Don't want this to fire
    %% when Line is empty b/c that would mess up "''" -> [].
    split([], [], push_word(Word, Line), none, false);
split([Quote, Char | Rest], Word, Line, Quote, false)
  when ?IS_WHITESPACE(Char) ->
    %% Special case: "a '' b" -> ["a", [], "b"]. The word is emitted
    %% even when it is empty.
    split(Rest, [], push_word(Word, Line), none, false);
%% Closing quote followed by more of the same word.
split([Quote | Rest], Word, Line, Quote, false) ->
    split(Rest, Word, Line, none, false);
%% Any other character inside quotes is part of the word.
split([Char | Rest], Word, Line, Quote, false) ->
    split(Rest, [Char | Word], Line, Quote, false).

%% Put the finished (reversed) word, restored to reading order, on the
%% front of the reversed list of completed words.
push_word(Word, Line) ->
    [lists:reverse(Word) | Line].
%%% Tests
%% Run every success case. Each case is a list whose head is the input
%% string and whose tail is the expected list of words. Crashes with a
%% badmatch on the first case that split/1 gets wrong; returns ok when
%% all cases pass.
test_happy(Cases) ->
    lists:foreach(
      fun([Input | Expected]) ->
              ?debugFmt("testing ~p -> ~p", [Input, Expected]),
              {ok, Expected} = split(Input)
      end,
      Cases).
%% Run every failure case. Each case is a two-element list: the input
%% string and the error atom split/1 is expected to report for it.
%% Crashes with a badmatch on the first mismatch; returns ok when all
%% cases pass.
test_sad(Cases) ->
    lists:foreach(
      fun([Input, Reason]) ->
              ?debugFmt("testing ~p -> {error, ~p}", [Input, Reason]),
              {error, Reason} = split(Input)
      end,
      Cases).
%% EUnit entry point: checks all the success cases first, then all the
%% failure cases.
big_test() ->
    test_happy(happy_cases()),
    test_sad(sad_cases()).

%% Inputs that must split successfully. In each entry the first string is
%% the input and the remaining strings are the expected words, in order.
happy_cases() ->
    [[""],
     [" "],
     [" "],
     [" \t\n\r\t "],
     ["abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"],
     ["0123456789", "0123456789"],
     ["a ", "a"],
     ["a", "a"],
     [" a", "a"],
     ["\"a\"", "a"],
     ["'a'", "a"],
     ["a''", "a"],
     ["a\"\"", "a"],
     ["\"\"a", "a"],
     ["''a", "a"],
     ["''"],
     [" ''"],
     ["a '' b", "a", "", "b"],
     ["a '' ", "a", ""],
     ["a \"\"", "a", ""],
     ["a ''", "a", ""],
     ["a ' '", "a", " "],
     ["a \" \"", "a", " "],
     ["a \" \" b", "a", " ", "b"],
     ["a \" \" b ' ' c", "a", " ", "b", " ", "c"],
     ["a \"\" b", "a", "", "b"],
     ["12 \"\" 34", "12", "", "34"],
     ["a", "a"],
     ["ab", "ab"],
     ["\\\"ab\\\"", "\"ab\""],
     ["a a", "a", "a"],
     ["a a a", "a", "a", "a"],
     ["a ", "a"],
     ["a ", "a"],
     ["a ", "a"],
     ["a b", "a", "b"],
     ["xy", "xy"],
     ["xyb", "xyb"],
     ["xy xy", "xy", "xy"],
     ["xy xy xy", "xy", "xy", "xy"],
     ["xy ", "xy"],
     ["xy ", "xy"],
     ["xy ", "xy"],
     ["xy b", "xy", "b"],
     ["margle", "margle"],
     ["margle margle", "margle", "margle"],
     ["margle margle margle", "margle", "margle", "margle"],
     ["margle ", "margle"],
     ["margle ", "margle"],
     ["margle ", "margle"],
     ["margle b", "margle", "b"],
     ["\"\""]].

%% Inputs that must be rejected, each paired with the expected error atom.
sad_cases() ->
    [["\"", unterminated_double_quote],
     ["'", unterminated_single_quote],
     ["\\", trailing_backslash],
     ["\\\\\\", trailing_backslash],
     ["\"\"\"", unterminated_double_quote],
     ["'''", unterminated_single_quote],
     ["a \"", unterminated_double_quote],
     ["a '", unterminated_single_quote],
     ["a \\", trailing_backslash],
     ["a \\\\\\", trailing_backslash],
     ["a \"\"\"", unterminated_double_quote],
     ["a '''", unterminated_single_quote],
     ["a '''a", unterminated_single_quote],
     ["a '''a b", unterminated_single_quote],
     ["a '''a b c", unterminated_single_quote]].