// ot_str.rs (9.0 KB)
  1. use serde::{de, de::Visitor, Deserialize, Deserializer, Serialize, Serializer};
  2. use std::{fmt, fmt::Formatter};
  3. /// [OTString] uses [String] as its inner container.
  4. #[derive(Clone, Debug, Eq, PartialEq)]
  5. pub struct OTString(pub String);
  6. impl OTString {
  7. /// Returns the number of UTF-16 code units in this string.
  8. ///
  9. /// The length of strings behaves differently in different languages. For example: [Dart] string's
  10. /// length is calculated with UTF-16 code units. The method [utf16_len] returns the length of a
  11. /// String in UTF-16 code units.
  12. ///
  13. /// # Examples
  14. ///
  15. /// ```
  16. /// use lib_ot::core::OTString;
  17. /// let utf16_len = OTString::from("👋").utf16_len();
  18. /// assert_eq!(utf16_len, 2);
  19. /// let bytes_len = String::from("👋").len();
  20. /// assert_eq!(bytes_len, 4);
  21. ///
  22. /// ```
  23. pub fn utf16_len(&self) -> usize {
  24. count_utf16_code_units(&self.0)
  25. }
  26. pub fn utf16_iter(&self) -> Utf16CodeUnitIterator {
  27. Utf16CodeUnitIterator::new(self)
  28. }
  29. /// Returns a new string with the given [Interval]
  30. /// # Examples
  31. ///
  32. /// ```
  33. /// use lib_ot::core::{OTString, Interval};
  34. /// let s: OTString = "你好\n😁".into();
  35. /// assert_eq!(s.utf16_len(), 5);
  36. /// let output1 = s.sub_str(Interval::new(0, 2)).unwrap();
  37. /// assert_eq!(output1, "你好");
  38. ///
  39. /// let output2 = s.sub_str(Interval::new(2, 3)).unwrap();
  40. /// assert_eq!(output2, "\n");
  41. ///
  42. /// let output3 = s.sub_str(Interval::new(3, 5)).unwrap();
  43. /// assert_eq!(output3, "😁");
  44. /// ```
  45. pub fn sub_str(&self, interval: Interval) -> Option<String> {
  46. let mut iter = Utf16CodeUnitIterator::new(self);
  47. let mut buf = vec![];
  48. while let Some((byte, _len)) = iter.next() {
  49. if iter.utf16_offset >= interval.start && iter.utf16_offset < interval.end {
  50. buf.extend_from_slice(byte);
  51. }
  52. }
  53. if buf.is_empty() {
  54. return None;
  55. }
  56. match str::from_utf8(&buf) {
  57. Ok(item) => Some(item.to_owned()),
  58. Err(_e) => None,
  59. }
  60. }
  61. /// Return a new string with the given [Interval]
  62. /// # Examples
  63. ///
  64. /// ```
  65. /// use lib_ot::core::OTString;
  66. /// let s: OTString = "👋😁👋".into(); ///
  67. /// let mut iter = s.utf16_code_point_iter();
  68. /// assert_eq!(iter.next().unwrap(), "👋".to_string());
  69. /// assert_eq!(iter.next().unwrap(), "😁".to_string());
  70. /// assert_eq!(iter.next().unwrap(), "👋".to_string());
  71. /// assert_eq!(iter.next(), None);
  72. ///
  73. /// let s: OTString = "👋12ab一二👋".into(); ///
  74. /// let mut iter = s.utf16_code_point_iter();
  75. /// assert_eq!(iter.next().unwrap(), "👋".to_string());
  76. /// assert_eq!(iter.next().unwrap(), "1".to_string());
  77. /// assert_eq!(iter.next().unwrap(), "2".to_string());
  78. ///
  79. /// assert_eq!(iter.skip(OTString::from("ab一二").utf16_len()).next().unwrap(), "👋".to_string());
  80. /// ```
  81. #[allow(dead_code)]
  82. pub fn utf16_code_point_iter(&self) -> OTUtf16CodePointIterator {
  83. OTUtf16CodePointIterator::new(self, 0)
  84. }
  85. }
  86. impl std::ops::Deref for OTString {
  87. type Target = String;
  88. fn deref(&self) -> &Self::Target {
  89. &self.0
  90. }
  91. }
  92. impl std::ops::DerefMut for OTString {
  93. fn deref_mut(&mut self) -> &mut Self::Target {
  94. &mut self.0
  95. }
  96. }
  97. impl std::convert::From<String> for OTString {
  98. fn from(s: String) -> Self {
  99. OTString(s)
  100. }
  101. }
  102. impl std::convert::From<&str> for OTString {
  103. fn from(s: &str) -> Self {
  104. s.to_owned().into()
  105. }
  106. }
  107. impl std::fmt::Display for OTString {
  108. fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
  109. f.write_str(&self.0)
  110. }
  111. }
  112. impl std::ops::Add<&str> for OTString {
  113. type Output = OTString;
  114. fn add(self, rhs: &str) -> OTString {
  115. let new_value = self.0 + rhs;
  116. new_value.into()
  117. }
  118. }
  119. impl std::ops::AddAssign<&str> for OTString {
  120. fn add_assign(&mut self, rhs: &str) {
  121. self.0 += rhs;
  122. }
  123. }
  124. impl Serialize for OTString {
  125. fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  126. where
  127. S: Serializer,
  128. {
  129. serializer.serialize_str(&self.0)
  130. }
  131. }
  132. impl<'de> Deserialize<'de> for OTString {
  133. fn deserialize<D>(deserializer: D) -> Result<OTString, D::Error>
  134. where
  135. D: Deserializer<'de>,
  136. {
  137. struct OTStringVisitor;
  138. impl<'de> Visitor<'de> for OTStringVisitor {
  139. type Value = OTString;
  140. fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
  141. formatter.write_str("a str")
  142. }
  143. fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
  144. where
  145. E: de::Error,
  146. {
  147. Ok(s.into())
  148. }
  149. }
  150. deserializer.deserialize_str(OTStringVisitor)
  151. }
  152. }
  153. pub struct Utf16CodeUnitIterator<'a> {
  154. s: &'a OTString,
  155. byte_offset: usize,
  156. utf16_offset: usize,
  157. utf16_count: usize,
  158. }
  159. impl<'a> Utf16CodeUnitIterator<'a> {
  160. pub fn new(s: &'a OTString) -> Self {
  161. Utf16CodeUnitIterator {
  162. s,
  163. byte_offset: 0,
  164. utf16_offset: 0,
  165. utf16_count: 0,
  166. }
  167. }
  168. }
  169. impl<'a> Iterator for Utf16CodeUnitIterator<'a> {
  170. type Item = (&'a [u8], usize);
  171. fn next(&mut self) -> Option<Self::Item> {
  172. let _len = self.s.len();
  173. if self.byte_offset == self.s.len() {
  174. None
  175. } else {
  176. let b = self.s.as_bytes()[self.byte_offset];
  177. let start = self.byte_offset;
  178. let end = self.byte_offset + len_utf8_from_first_byte(b);
  179. if (b as i8) >= -0x40 {
  180. self.utf16_count += 1;
  181. }
  182. if b >= 0xf0 {
  183. self.utf16_count += 1;
  184. }
  185. if self.utf16_count > 0 {
  186. self.utf16_offset = self.utf16_count - 1;
  187. }
  188. self.byte_offset = end;
  189. let byte = &self.s.as_bytes()[start..end];
  190. Some((byte, end - start))
  191. }
  192. }
  193. }
  194. pub struct OTUtf16CodePointIterator<'a> {
  195. s: &'a OTString,
  196. offset: usize,
  197. }
  198. impl<'a> OTUtf16CodePointIterator<'a> {
  199. pub fn new(s: &'a OTString, offset: usize) -> Self {
  200. OTUtf16CodePointIterator { s, offset }
  201. }
  202. }
  203. use crate::core::interval::Interval;
  204. use std::str;
  205. impl<'a> Iterator for OTUtf16CodePointIterator<'a> {
  206. type Item = String;
  207. fn next(&mut self) -> Option<Self::Item> {
  208. if self.offset == self.s.len() {
  209. None
  210. } else {
  211. let byte = self.s.as_bytes()[self.offset];
  212. let end = len_utf8_from_first_byte(byte);
  213. let buf = &self.s.as_bytes()[self.offset..self.offset + end];
  214. self.offset += end;
  215. match str::from_utf8(buf) {
  216. Ok(item) => Some(item.to_string()),
  217. Err(_e) => None,
  218. }
  219. }
  220. }
  221. }
  222. pub fn count_utf16_code_units(s: &str) -> usize {
  223. let mut utf16_count = 0;
  224. for &b in s.as_bytes() {
  225. if (b as i8) >= -0x40 {
  226. utf16_count += 1;
  227. }
  228. if b >= 0xf0 {
  229. utf16_count += 1;
  230. }
  231. }
  232. utf16_count
  233. }
  234. /// Given the initial byte of a UTF-8 codepoint, returns the number of
  235. /// bytes required to represent the codepoint.
  236. /// RFC reference : https://tools.ietf.org/html/rfc3629#section-4
  237. pub fn len_utf8_from_first_byte(b: u8) -> usize {
  238. match b {
  239. b if b < 0x80 => 1,
  240. b if b < 0xe0 => 2,
  241. b if b < 0xf0 => 3,
  242. _ => 4,
  243. }
  244. }
  245. #[cfg(test)]
  246. mod tests {
  247. use crate::core::interval::Interval;
  248. use crate::core::ot_str::OTString;
  249. #[test]
  250. fn flowy_str_code_unit() {
  251. let size = OTString::from("👋").utf16_len();
  252. assert_eq!(size, 2);
  253. let s: OTString = "👋 \n👋".into();
  254. let output = s.sub_str(Interval::new(0, size)).unwrap();
  255. assert_eq!(output, "👋");
  256. let output = s.sub_str(Interval::new(2, 3)).unwrap();
  257. assert_eq!(output, " ");
  258. let output = s.sub_str(Interval::new(3, 4)).unwrap();
  259. assert_eq!(output, "\n");
  260. let output = s.sub_str(Interval::new(4, 4 + size)).unwrap();
  261. assert_eq!(output, "👋");
  262. }
  263. #[test]
  264. fn flowy_str_sub_str_in_chinese2() {
  265. let s: OTString = "😁 \n".into();
  266. let size = s.utf16_len();
  267. assert_eq!(size, 4);
  268. let output1 = s.sub_str(Interval::new(0, 3)).unwrap();
  269. let output2 = s.sub_str(Interval::new(3, 4)).unwrap();
  270. assert_eq!(output1, "😁 ");
  271. assert_eq!(output2, "\n");
  272. }
  273. #[test]
  274. fn flowy_str_sub_str_in_english() {
  275. let s: OTString = "ab".into();
  276. let size = s.utf16_len();
  277. assert_eq!(size, 2);
  278. let output = s.sub_str(Interval::new(0, 2)).unwrap();
  279. assert_eq!(output, "ab");
  280. }
  281. #[test]
  282. fn flowy_str_utf16_code_point_iter_test2() {
  283. let s: OTString = "👋😁👋".into();
  284. let iter = s.utf16_code_point_iter();
  285. let result = iter.skip(1).take(1).collect::<String>();
  286. assert_eq!(result, "😁".to_string());
  287. }
  288. }