1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
//!
//! --- Transition diagram or transition table ---

use std::str::Chars;

/// FSM is an abstraction over behavior of deterministic finite automata. A DFA has a set of states
/// (generic type S) and alphabets (all possible symbols). One of them is a start state (start fn).
/// A transition takes the DFA from one state to other by consuming a symbol. If the input is
/// completely consumed and DFA is in a final state (or accepting state) then we say that the input
/// belongs to the language accepted by the DFA.
pub trait FSM<S> {
    type Symbol;
    fn transition(state: S, symbol: Self::Symbol) -> S;
    fn is_final(state: &S) -> bool;
    fn start() -> S;
}

/// The set of possible states in a DFA that represents a language accepting all valid email
/// addresses. [State::Error] is a dead state (or trap state).
#[derive(Clone, Debug, Copy)]
pub enum State {
    AddrSpec,
    LocalAtom,
    LocalQText,
    LocalDot,
    LocalEscape,
    LocalQString,
    LocalPart,
    DomainAtom,
    DomainDText,
    DomainDot,
    DomainLiteral,
    Error,
}

/// Certain symbols and group of symbols useful for determining transition rules.
impl State {
    const DQUOTE: char = '"';
    const DOT: char = '.';
    const BACKSLASH: char = '\\';
    const AT: char = '@';
    const OPEN_BRACKET: char = '[';
    const CLOSE_BRACKET: char = ']';
    fn is_atext(c: char) -> bool {
        let n: u32 = c.into();
        c == '!'
            || c == '#'
            || c == '$'
            || c == '%'
            || c == '&'
            || c == '\''
            || c == '*'
            || c == '+'
            || c == '-'
            || c == '/'
            || c == '='
            || c == '?'
            || c == '^'
            || c == '_'
            || c == '`'
            || c == '{'
            || c == '}'
            || c == '~'
            || (0x41 <= n && n <= 0x5A) // A-Z
            || (0x61 <= n && n <= 0x7A) // a-z
            || (0x30 <= n && n <= 0x39) // 0-9
    }
    fn is_qtext(c: char) -> bool {
        let n: u32 = c.into();
        n == 33 || (35 <= n && n <= 91) || (93 <= n && n <= 126)
    }
    fn is_dtext(c: char) -> bool {
        let n: u32 = c.into();
        (33 <= n && n <= 90) && (94 <= n && n <= 126)
    }
    fn is_escape(c: char) -> bool {
        let n: u32 = c.into();
        (0x21 <= n && n <= 0x7E) // VCHAR
            || n == 0x20 // SPACE
            || n == 0x09 // HTAB
    }
}

/// State implements FSM and defines a DFA for language accepting all valid email addresses.
/// The set of states in DFA is itself. All transitions are defined in the implementation itself.
/// Start state of DFA is [State::AddrSpec].
impl FSM<State> for State {
    type Symbol = char;
    fn transition(state: Self, c: char) -> State {
        match state {
            Self::AddrSpec => match c {
                Self::DQUOTE => State::LocalQText,
                c if Self::is_atext(c) => State::LocalAtom,
                _ => State::Error,
            },
            Self::LocalAtom => match c {
                Self::DOT => State::LocalDot,
                Self::AT => State::LocalPart,
                c if Self::is_atext(c) => State::LocalAtom,
                _ => State::Error,
            },
            Self::LocalQText => match c {
                Self::BACKSLASH => State::LocalEscape,
                Self::DQUOTE => State::LocalQString,
                c if Self::is_qtext(c) => State::LocalQText,
                _ => State::Error,
            },
            Self::LocalDot => match c {
                c if Self::is_atext(c) => State::LocalAtom,
                _ => State::Error,
            },
            Self::LocalEscape => match c {
                c if Self::is_escape(c) => State::LocalQText,
                _ => State::Error,
            },
            Self::LocalQString => match c {
                Self::AT => State::LocalPart,
                _ => State::Error,
            },
            Self::LocalPart => match c {
                Self::OPEN_BRACKET => State::DomainDText,
                c if Self::is_atext(c) => State::DomainAtom,
                _ => Self::Error,
            },
            Self::DomainAtom => match c {
                Self::DOT => State::DomainDot,
                c if Self::is_atext(c) => State::DomainAtom,
                _ => Self::Error,
            },
            Self::DomainDText => match c {
                Self::CLOSE_BRACKET => State::DomainLiteral,
                c if Self::is_dtext(c) => State::DomainDText,
                _ => Self::Error,
            },
            Self::DomainDot => match c {
                c if Self::is_atext(c) => State::DomainAtom,
                _ => Self::Error,
            },
            Self::DomainLiteral => match c {
                _ => Self::Error,
            },
            Self::Error => Self::Error,
        }
    }
    fn is_final(state: &Self) -> bool {
        match state {
            Self::DomainLiteral | Self::DomainAtom => true,
            _ => false,
        }
    }
    fn start() -> Self {
        State::AddrSpec
    }
}

/// Iterator for the DFA implemented by [Machine]. Each step through the iterator consumes
/// an input symbol (from input iterator) and transitions the DFA through the corresponding state.
/// The iterator is exhausted when the input iterator gets exhausted. If any invalid character or
/// invalid email address syntax is encountered, then transition is to [State::Error]. For every
/// input symbol, this state consumes the symbol and remains in [State::Error]. Thus, parse errors
/// can be checked just by investigating the state of DFA when input gets exhausted.
///
/// Examining the last state of DFA can be done via [Iterator::last] method. If it is [None], then
/// empty input and parse failed. Otherwise, there will be at least one transition and hence we get
/// some last state. By checking if it is an accepting state, parsing success can be determined.
pub struct MachineIterator<'a> {
    input: Chars<'a>,
    state: State,
}

/// MachineIterator just wraps over input iterator and performs transitions at every step.
/// It keeps track of current state as well. Thus, next state is determined using current state as
/// well as the input symbol based on the transition rules defined.
impl<'a> Iterator for MachineIterator<'a> {
    type Item = State;
    fn next(&mut self) -> Option<Self::Item> {
        let c = self.input.next()?;
        self.state = State::transition(self.state, c);
        Some(self.state)
    }
}

/// Machine is the core export of the module. It is an [IntoIterator] and consuming the iterator
/// determines if given string literal is a valid email address or not.
pub struct Machine<'a> {
    input: &'a str,
}

impl<'a> Machine<'a> {
    pub fn new(s: &'a str) -> Self {
        Machine { input: s }
    }
}

impl<'a> IntoIterator for Machine<'a> {
    type Item = State;
    type IntoIter = MachineIterator<'a>;
    fn into_iter(self) -> Self::IntoIter {
        MachineIterator {
            state: State::AddrSpec,
            input: self.input.chars(),
        }
    }
}