1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
use std::ops::Range;
use std::str::FromStr;
use std::sync::Arc;
use std::vec;

use crate::{
    common::BlockDatabase,
    compile::{Builder, Flags, Pattern},
    runtime::Matching,
    Error, Result,
};

/// Match represents a single match of a regex in a haystack.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Match<'t> {
    text: &'t str,
    start: usize,
    end: usize,
}

impl<'t> Match<'t> {
    /// Returns the starting byte offset of the match in the haystack.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the ending byte offset of the match in the haystack.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns the range over the starting and ending byte offsets of the
    /// match in the haystack.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..self.end
    }

    /// Returns the matched text.
    #[inline]
    pub fn as_str(&self) -> &'t str {
        &self.text[self.start..self.end]
    }

    /// Creates a new match from the given haystack and byte offsets.
    #[inline]
    fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
        Match {
            text: haystack,
            start,
            end,
        }
    }
}

impl<'t> From<Match<'t>> for &'t str {
    fn from(m: Match<'t>) -> &'t str {
        m.as_str()
    }
}

impl<'t> From<Match<'t>> for Range<usize> {
    fn from(m: Match<'t>) -> Range<usize> {
        m.range()
    }
}

/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a `Match` value. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
pub struct Matches<'t>(&'t str, vec::IntoIter<Range<usize>>);

impl<'t> Matches<'t> {
    /// Return the text being searched.
    pub fn text(&self) -> &'t str {
        self.0
    }
}

impl<'t> Iterator for Matches<'t> {
    type Item = Match<'t>;

    fn next(&mut self) -> Option<Self::Item> {
        self.1.next().map(|range| Match::new(self.0, range.start, range.end))
    }
}

impl<'t> DoubleEndedIterator for Matches<'t> {
    fn next_back(&mut self) -> Option<Self::Item> {
        self.1
            .next_back()
            .map(|range| Match::new(self.0, range.start, range.end))
    }
}

/// A compiled regular expression for matching Unicode strings.
#[derive(Clone)]
pub struct Regex(pub(crate) Arc<BlockDatabase>);

impl FromStr for Regex {
    type Err = Error;

    /// Attempts to parse a string into a regular expression
    fn from_str(s: &str) -> Result<Regex> {
        Regex::new(s)
    }
}

/// Core regular expression methods.
impl Regex {
    /// Compiles a regular expression.
    /// Once compiled, it can be used repeatedly to search, split or replace text in a string.
    ///
    /// If an invalid expression is given, then an error is returned.
    pub fn new<S: Into<String>>(re: S) -> Result<Regex> {
        Self::with_flags(re, Flags::empty())
    }

    pub(crate) fn with_flags<S: Into<String>>(re: S, flags: Flags) -> Result<Regex> {
        Pattern::with_flags(re, flags | Flags::SOM_LEFTMOST | Flags::UTF8)?
            .build()
            .map(|db| Regex(Arc::new(db)))
    }

    /// Returns true if and only if the regex matches the string given.
    ///
    /// It is recommended to use this method if all you need to do is test a match,
    /// since the underlying matching engine may be able to do less work.
    ///
    /// # Examples
    ///
    /// Test if some text contains at least one word with exactly 13 Unicode word characters:
    ///
    /// ```rust
    /// # use hyperscan::regex::Regex;
    /// let text = "I categorically deny having triskaidekaphobia.";
    /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
    /// ```
    pub fn is_match(&self, text: &str) -> bool {
        let mut matched = false;

        let s = self.0.alloc_scratch().unwrap();
        let _ = self.0.scan(text, &s, |_, _, _, _| {
            matched = true;

            Matching::Terminate
        });

        matched
    }

    /// Returns the start and end byte range of the leftmost-first match in text. If no match exists, then None is returned.
    ///
    /// Note that this should only be used if you want to discover the position of the match. Testing the existence of a match is faster if you use is_match.
    ///
    /// # Examples
    ///
    /// Find the start and end location of the first word with exactly 13 Unicode word characters:
    ///
    /// ```rust
    /// # use hyperscan::regex::Regex;
    /// let text = "I categorically deny having triskaidekaphobia.";
    /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
    /// assert_eq!(mat.start(), 2);
    /// assert_eq!(mat.end(), 15);
    /// ```
    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
        let mut matched = vec![];

        let s = self.0.alloc_scratch().unwrap();
        let _ = self.0.scan(text, &s, |_, from, to, _| {
            matched.push((from as usize, to as usize));

            Matching::Terminate
        });

        matched
            .first()
            .map(|&(start, end)| Match::new(&text[start..end], start, end))
    }

    /// Returns an iterator for each successive non-overlapping match in
    /// `text`, returning the start and end byte indices with respect to
    /// `text`.
    ///
    /// # Examples
    ///
    /// Find the start and end location of every word with exactly 13 Unicode
    /// word characters:
    ///
    /// ```rust
    /// # use hyperscan::regex::Regex;
    /// let text = "Retroactively relinquishing remunerations is reprehensible.";
    /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
    ///     println!("{:?}", mat);
    /// }
    /// ```
    pub fn find_iter<'t>(&self, text: &'t str) -> Matches<'t> {
        let mut matched = Vec::<Range<usize>>::new();

        let s = self.0.alloc_scratch().unwrap();
        let _ = self.0.scan(text, &s, |_, from, to, _| {
            let range = from as usize..to as usize;

            match matched.last() {
                Some(last) if last.start == range.start && last.end < range.end => {
                    // only the non-overlapping match should be return
                    *matched.last_mut().unwrap() = range;
                }
                _ => matched.push(range),
            }

            Matching::Continue
        });

        Matches(text, matched.into_iter())
    }

    /// Returns an iterator of substrings of `text` delimited by a match of the
    /// regular expression. Namely, each element of the iterator corresponds to
    /// text that *isn't* matched by the regular expression.
    ///
    /// This method will *not* copy the text given.
    ///
    /// # Examples
    ///
    /// To split a string delimited by arbitrary amounts of spaces or tabs:
    ///
    /// ```rust
    /// # use hyperscan::regex::Regex;
    /// let re = Regex::new(r"[ \t]+").unwrap();
    /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
    /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
    /// ```
    pub fn split<'t>(&self, text: &'t str) -> Split<'t> {
        Split {
            finder: self.find_iter(text),
            last: 0,
        }
    }

    /// Returns an iterator of at most `limit` substrings of `text` delimited
    /// by a match of the regular expression. (A `limit` of `0` will return no
    /// substrings.) Namely, each element of the iterator corresponds to text
    /// that *isn't* matched by the regular expression. The remainder of the
    /// string that is not split will be the last element in the iterator.
    ///
    /// This method will *not* copy the text given.
    ///
    /// # Examples
    ///
    /// Get the first two words in some text:
    ///
    /// ```rust
    /// # use hyperscan::regex::Regex;
    /// let re = Regex::new(r"\W+").unwrap();
    /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
    /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
    /// ```
    pub fn splitn<'t>(&self, text: &'t str, limit: usize) -> SplitN<'t> {
        SplitN {
            splits: self.split(text),
            n: limit,
        }
    }
}

/// Yields all substrings delimited by a regular expression match.
///
/// `'t` is the lifetime of the string being split.
pub struct Split<'t> {
    finder: Matches<'t>,
    last: usize,
}

impl<'t> Iterator for Split<'t> {
    type Item = &'t str;

    fn next(&mut self) -> Option<&'t str> {
        let text = self.finder.text();
        match self.finder.next() {
            None => {
                if self.last > text.len() {
                    None
                } else {
                    let s = &text[self.last..];
                    self.last = text.len() + 1; // Next call will return None
                    Some(s)
                }
            }
            Some(m) => {
                let matched = &text[self.last..m.start()];
                self.last = m.end();
                Some(matched)
            }
        }
    }
}

/// Yields at most `N` substrings delimited by a regular expression match.
///
/// The last substring will be whatever remains after splitting.
///
/// `'t` is the lifetime of the string being split.
pub struct SplitN<'t> {
    splits: Split<'t>,
    n: usize,
}

impl<'t> Iterator for SplitN<'t> {
    type Item = &'t str;

    fn next(&mut self) -> Option<&'t str> {
        if self.n == 0 {
            return None;
        }

        self.n -= 1;
        if self.n > 0 {
            return self.splits.next();
        }

        let text = self.splits.finder.text();
        if self.splits.last > text.len() {
            // We've already returned all substrings.
            None
        } else {
            // self.n == 0, so future calls will return None immediately
            Some(&text[self.splits.last..])
        }
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn test_find_iter() {
        let regex = r"\b\w{13}\b";
        let text = "Retroactively relinquishing remunerations is reprehensible.";

        assert_eq!(
            regex::Regex::new(regex)
                .unwrap()
                .find_iter(text)
                .map(|m| m.range())
                .collect::<Vec<_>>(),
            super::Regex::new(regex)
                .unwrap()
                .find_iter(text)
                .map(|m| m.range())
                .collect::<Vec<_>>()
        );
    }
}