123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332 |
- use serde::{de, de::Visitor, Deserialize, Deserializer, Serialize, Serializer};
- use std::{fmt, fmt::Formatter};
/// A text value whose storage is a plain UTF-8 [`String`].
///
/// The wrapped string is public and can be reached directly as field `0`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OTString(pub String);
- impl OTString {
- /// Returns the number of UTF-16 code units in this string.
- ///
- /// The length of strings behaves differently in different languages. For example: [Dart] string's
- /// length is calculated with UTF-16 code units. The method [utf16_len] returns the length of a
- /// String in UTF-16 code units.
- ///
- /// # Examples
- ///
- /// ```
- /// use lib_ot::core::OTString;
- /// let utf16_len = OTString::from("👋").utf16_len();
- /// assert_eq!(utf16_len, 2);
- /// let bytes_len = String::from("👋").len();
- /// assert_eq!(bytes_len, 4);
- ///
- /// ```
- pub fn utf16_len(&self) -> usize {
- count_utf16_code_units(&self.0)
- }
- pub fn utf16_iter(&self) -> Utf16CodeUnitIterator {
- Utf16CodeUnitIterator::new(self)
- }
- /// Returns a new string with the given [Interval]
- /// # Examples
- ///
- /// ```
- /// use lib_ot::core::{OTString, Interval};
- /// let s: OTString = "你好\n😁".into();
- /// assert_eq!(s.utf16_len(), 5);
- /// let output1 = s.sub_str(Interval::new(0, 2)).unwrap();
- /// assert_eq!(output1, "你好");
- ///
- /// let output2 = s.sub_str(Interval::new(2, 3)).unwrap();
- /// assert_eq!(output2, "\n");
- ///
- /// let output3 = s.sub_str(Interval::new(3, 5)).unwrap();
- /// assert_eq!(output3, "😁");
- /// ```
- pub fn sub_str(&self, interval: Interval) -> Option<String> {
- let mut iter = Utf16CodeUnitIterator::new(self);
- let mut buf = vec![];
- while let Some((byte, _len)) = iter.next() {
- if iter.utf16_offset >= interval.start && iter.utf16_offset < interval.end {
- buf.extend_from_slice(byte);
- }
- }
- if buf.is_empty() {
- return None;
- }
- match str::from_utf8(&buf) {
- Ok(item) => Some(item.to_owned()),
- Err(_e) => None,
- }
- }
- /// Return a new string with the given [Interval]
- /// # Examples
- ///
- /// ```
- /// use lib_ot::core::OTString;
- /// let s: OTString = "👋😁👋".into(); ///
- /// let mut iter = s.utf16_code_point_iter();
- /// assert_eq!(iter.next().unwrap(), "👋".to_string());
- /// assert_eq!(iter.next().unwrap(), "😁".to_string());
- /// assert_eq!(iter.next().unwrap(), "👋".to_string());
- /// assert_eq!(iter.next(), None);
- ///
- /// let s: OTString = "👋12ab一二👋".into(); ///
- /// let mut iter = s.utf16_code_point_iter();
- /// assert_eq!(iter.next().unwrap(), "👋".to_string());
- /// assert_eq!(iter.next().unwrap(), "1".to_string());
- /// assert_eq!(iter.next().unwrap(), "2".to_string());
- ///
- /// assert_eq!(iter.skip(OTString::from("ab一二").utf16_len()).next().unwrap(), "👋".to_string());
- /// ```
- #[allow(dead_code)]
- pub fn utf16_code_point_iter(&self) -> OTUtf16CodePointIterator {
- OTUtf16CodePointIterator::new(self, 0)
- }
- }
- impl std::ops::Deref for OTString {
- type Target = String;
- fn deref(&self) -> &Self::Target {
- &self.0
- }
- }
- impl std::ops::DerefMut for OTString {
- fn deref_mut(&mut self) -> &mut Self::Target {
- &mut self.0
- }
- }
- impl std::convert::From<String> for OTString {
- fn from(s: String) -> Self {
- OTString(s)
- }
- }
- impl std::convert::From<&str> for OTString {
- fn from(s: &str) -> Self {
- s.to_owned().into()
- }
- }
- impl std::fmt::Display for OTString {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
- f.write_str(&self.0)
- }
- }
- impl std::ops::Add<&str> for OTString {
- type Output = OTString;
- fn add(self, rhs: &str) -> OTString {
- let new_value = self.0 + rhs;
- new_value.into()
- }
- }
- impl std::ops::AddAssign<&str> for OTString {
- fn add_assign(&mut self, rhs: &str) {
- self.0 += rhs;
- }
- }
- impl Serialize for OTString {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: Serializer,
- {
- serializer.serialize_str(&self.0)
- }
- }
- impl<'de> Deserialize<'de> for OTString {
- fn deserialize<D>(deserializer: D) -> Result<OTString, D::Error>
- where
- D: Deserializer<'de>,
- {
- struct OTStringVisitor;
- impl<'de> Visitor<'de> for OTStringVisitor {
- type Value = OTString;
- fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
- formatter.write_str("a str")
- }
- fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
- where
- E: de::Error,
- {
- Ok(s.into())
- }
- }
- deserializer.deserialize_str(OTStringVisitor)
- }
- }
- pub struct Utf16CodeUnitIterator<'a> {
- s: &'a OTString,
- byte_offset: usize,
- utf16_offset: usize,
- utf16_count: usize,
- }
- impl<'a> Utf16CodeUnitIterator<'a> {
- pub fn new(s: &'a OTString) -> Self {
- Utf16CodeUnitIterator {
- s,
- byte_offset: 0,
- utf16_offset: 0,
- utf16_count: 0,
- }
- }
- }
- impl<'a> Iterator for Utf16CodeUnitIterator<'a> {
- type Item = (&'a [u8], usize);
- fn next(&mut self) -> Option<Self::Item> {
- let _len = self.s.len();
- if self.byte_offset == self.s.len() {
- None
- } else {
- let b = self.s.as_bytes()[self.byte_offset];
- let start = self.byte_offset;
- let end = self.byte_offset + len_utf8_from_first_byte(b);
- if (b as i8) >= -0x40 {
- self.utf16_count += 1;
- }
- if b >= 0xf0 {
- self.utf16_count += 1;
- }
- if self.utf16_count > 0 {
- self.utf16_offset = self.utf16_count - 1;
- }
- self.byte_offset = end;
- let byte = &self.s.as_bytes()[start..end];
- Some((byte, end - start))
- }
- }
- }
- pub struct OTUtf16CodePointIterator<'a> {
- s: &'a OTString,
- offset: usize,
- }
- impl<'a> OTUtf16CodePointIterator<'a> {
- pub fn new(s: &'a OTString, offset: usize) -> Self {
- OTUtf16CodePointIterator { s, offset }
- }
- }
- use crate::core::interval::Interval;
- use std::str;
- impl<'a> Iterator for OTUtf16CodePointIterator<'a> {
- type Item = String;
- fn next(&mut self) -> Option<Self::Item> {
- if self.offset == self.s.len() {
- None
- } else {
- let byte = self.s.as_bytes()[self.offset];
- let end = len_utf8_from_first_byte(byte);
- let buf = &self.s.as_bytes()[self.offset..self.offset + end];
- self.offset += end;
- match str::from_utf8(buf) {
- Ok(item) => Some(item.to_string()),
- Err(_e) => None,
- }
- }
- }
- }
/// Returns how many UTF-16 code units are needed to encode `s`.
///
/// The count is derived from the UTF-8 bytes alone: every byte that is not a continuation
/// byte contributes one unit, and every byte of `0xF0` or above contributes one more.
pub fn count_utf16_code_units(s: &str) -> usize {
    s.bytes()
        .map(|byte| {
            // Continuation bytes have the form 0b10xx_xxxx.
            let starts_code_point = (byte & 0xC0) != 0x80;
            // Lead bytes from 0xF0 open a 4-byte sequence, i.e. a surrogate pair in UTF-16.
            let needs_surrogate_pair = byte >= 0xF0;
            usize::from(starts_code_point) + usize::from(needs_surrogate_pair)
        })
        .sum()
}
/// Returns how many bytes a UTF-8 encoded code point occupies, judged from its first byte.
///
/// Only the thresholds `0x80`, `0xE0` and `0xF0` are compared; the byte is not validated, so
/// a continuation byte (`0x80..=0xBF`) is reported as `2`.
/// RFC reference : https://tools.ietf.org/html/rfc3629#section-4
pub fn len_utf8_from_first_byte(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
#[cfg(test)]
mod tests {
    use crate::core::interval::Interval;
    use crate::core::ot_str::OTString;

    /// Slicing by UTF-16 offsets around emoji that occupy two code units each.
    #[test]
    fn flowy_str_code_unit() {
        let size = OTString::from("👋").utf16_len();
        assert_eq!(size, 2);

        let s = OTString::from("👋 \n👋");
        let slice = |start: usize, end: usize| s.sub_str(Interval::new(start, end)).unwrap();

        assert_eq!(slice(0, size), "👋");
        assert_eq!(slice(2, 3), " ");
        assert_eq!(slice(3, 4), "\n");
        assert_eq!(slice(4, 4 + size), "👋");
    }

    /// An emoji followed by single-unit characters splits at the expected offsets.
    #[test]
    fn flowy_str_sub_str_in_chinese2() {
        let s = OTString::from("😁 \n");
        assert_eq!(s.utf16_len(), 4);

        let head = s.sub_str(Interval::new(0, 3)).unwrap();
        let tail = s.sub_str(Interval::new(3, 4)).unwrap();
        assert_eq!(head, "😁 ");
        assert_eq!(tail, "\n");
    }

    /// For ASCII text the whole-range slice returns the input unchanged.
    #[test]
    fn flowy_str_sub_str_in_english() {
        let s = OTString::from("ab");
        assert_eq!(s.utf16_len(), 2);

        let whole = s.sub_str(Interval::new(0, 2)).unwrap();
        assert_eq!(whole, "ab");
    }

    /// `skip` and `take` on the code point iterator operate on whole code points.
    #[test]
    fn flowy_str_utf16_code_point_iter_test2() {
        let s = OTString::from("👋😁👋");
        let second: String = s.utf16_code_point_iter().skip(1).take(1).collect();
        assert_eq!(second, "😁".to_string());
    }
}
|