From 3265624259b40c7617894711cf96d1f94214acba Mon Sep 17 00:00:00 2001 From: Fenrir Date: Thu, 6 Oct 2016 01:31:27 -0700 Subject: [PATCH] Add memchr --- src/memchr.rs | 397 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 src/memchr.rs diff --git a/src/memchr.rs b/src/memchr.rs new file mode 100644 index 0000000..c97ce01 --- /dev/null +++ b/src/memchr.rs @@ -0,0 +1,397 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + + + +/// A safe interface to `memchr`. +/// +/// Returns the index corresponding to the first occurrence of `needle` in +/// `haystack`, or `None` if one is not found. +/// +/// memchr reduces to super-optimized machine code at around an order of +/// magnitude faster than `haystack.iter().position(|&b| b == needle)`. +/// (See benchmarks.) +/// +/// # Example +/// +/// This shows how to find the first position of a byte in a byte string. +/// +/// ```rust,ignore +/// use memchr::memchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr(b'k', haystack), Some(8)); +/// ``` +pub fn memchr(needle: u8, haystack: &[u8]) -> Option { + fn memchr_specific(needle: u8, haystack: &[u8]) -> Option { + use libctru::libc; + + let p = unsafe { + libc::memchr(haystack.as_ptr() as *const libc::c_void, + needle as libc::c_int, + haystack.len() as libc::size_t) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } + } + memchr_specific(needle, haystack) +} + +/// A safe interface to `memrchr`. +/// +/// Returns the index corresponding to the last occurrence of `needle` in +/// `haystack`, or `None` if one is not found. +/// +/// # Example +/// +/// This shows how to find the last position of a byte in a byte string. +/// +/// ```rust,ignore +/// use memchr::memrchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr(b'o', haystack), Some(17)); +/// ``` +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { + fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { + use libc; + + // GNU's memrchr() will - unlike memchr() - error if haystack is empty. + if haystack.is_empty() { + return None; + } + let p = unsafe { + libc::memrchr(haystack.as_ptr() as *const libc::c_void, + needle as libc::c_int, + haystack.len() as libc::size_t) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } + } + memrchr_specific(needle, haystack) +} + +#[allow(dead_code)] +mod fallback { + use core::cmp; + use core::mem; + + const LO_U64: u64 = 0x0101010101010101; + const HI_U64: u64 = 0x8080808080808080; + + // use truncation + const LO_USIZE: usize = LO_U64 as usize; + const HI_USIZE: usize = HI_U64 as usize; + + /// Return `true` if `x` contains any zero byte. + /// + /// From *Matters Computational*, J. Arndt + /// + /// "The idea is to subtract one from each of the bytes and then look for + /// bytes where the borrow propagated all the way to the most significant + /// bit." + #[inline] + fn contains_zero_byte(x: usize) -> bool { + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 + } + + #[cfg(target_pointer_width = "32")] + #[inline] + fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep + } + + #[cfg(target_pointer_width = "64")] + #[inline] + fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep = rep << 32 | rep; + rep + } + + /// Return the first index matching the byte `a` in `text`. + pub fn memchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned initial part, before the first word aligned address in text + // - body, scan by 2 words at a time + // - the last remaining part, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search up to an aligned boundary + let align = (ptr as usize) & (usize_bytes - 1); + let mut offset; + if align > 0 { + offset = cmp::min(usize_bytes - align, len); + if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { + return Some(index); + } + } else { + offset = 0; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + if len >= 2 * usize_bytes { + while offset <= len - 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize) as *const usize); + let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset += usize_bytes * 2; + } + } + + // find the byte after the point the body loop stopped + text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) + } + + /// Return the last index matching the byte `a` in `text`. + pub fn memrchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned tail, after the last word aligned address in text + // - body, scan by 2 words at a time + // - the first remaining bytes, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search to an aligned boundary + let end_align = (ptr as usize + len) & (usize_bytes - 1); + let mut offset; + if end_align > 0 { + offset = if end_align >= len { + 0 + } else { + len - end_align + }; + if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { + return Some(offset + index); + } + } else { + offset = len; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + while offset >= 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); + let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset -= 2 * usize_bytes; + } + + // find the byte before the point the body loop stopped + text[..offset].iter().rposition(|elt| *elt == x) + } + + // test fallback implementations on all platforms + #[test] + fn matches_one() { + assert_eq!(Some(0), memchr(b'a', b"a")); + } + + #[test] + fn matches_begin() { + assert_eq!(Some(0), memchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end() { + assert_eq!(Some(4), memchr(b'z', b"aaaaz")); + } + + #[test] + fn matches_nul() { + assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul() { + assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); + } + + #[test] + fn no_match_empty() { + assert_eq!(None, memchr(b'a', b"")); + } + + #[test] + fn no_match() { + assert_eq!(None, memchr(b'a', b"xyz")); + } + + #[test] + fn matches_one_reversed() { + assert_eq!(Some(0), memrchr(b'a', b"a")); + } + + #[test] + fn matches_begin_reversed() { + assert_eq!(Some(3), memrchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); + } + + #[test] + fn matches_nul_reversed() { + assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); + } + + #[test] + fn no_match_empty_reversed() { + assert_eq!(None, memrchr(b'a', b"")); + } + + #[test] + fn no_match_reversed() { + assert_eq!(None, memrchr(b'a', b"xyz")); + } + + #[test] + fn each_alignment_reversed() { + let mut data = [1u8; 64]; + let needle = 2; + let pos = 40; + data[pos] = needle; + for start in 0..16 { + assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); + } + } +} + +#[cfg(test)] +mod tests { + // test the implementations for the current plattform + use super::{memchr, memrchr}; + + #[test] + fn matches_one() { + assert_eq!(Some(0), memchr(b'a', b"a")); + } + + #[test] + fn matches_begin() { + assert_eq!(Some(0), memchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end() { + assert_eq!(Some(4), memchr(b'z', b"aaaaz")); + } + + #[test] + fn matches_nul() { + assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul() { + assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); + } + + #[test] + fn no_match_empty() { + assert_eq!(None, memchr(b'a', b"")); + } + + #[test] + fn no_match() { + assert_eq!(None, memchr(b'a', b"xyz")); + } + + #[test] + fn matches_one_reversed() { + assert_eq!(Some(0), memrchr(b'a', b"a")); + } + + #[test] + fn matches_begin_reversed() { + assert_eq!(Some(3), memrchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); + } + + #[test] + fn matches_nul_reversed() { + assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); + } + + #[test] + fn no_match_empty_reversed() { + assert_eq!(None, memrchr(b'a', b"")); + } + + #[test] + fn no_match_reversed() { + assert_eq!(None, memrchr(b'a', b"xyz")); + } + + #[test] + fn each_alignment() { + let mut data = [1u8; 64]; + let needle = 2; + let pos = 40; + data[pos] = needle; + for start in 0..16 { + assert_eq!(Some(pos - start), memchr(needle, &data[start..])); + } + } +}