flowy_str.rs 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. use serde::{de, de::Visitor, Deserialize, Deserializer, Serialize, Serializer};
  2. use std::{fmt, fmt::Formatter};
  /// Newtype around an owned `String`; the wrapped value is public.
  #[derive(Debug, Clone, PartialEq, Eq)]
  pub struct FlowyStr(pub String);
  5. impl FlowyStr {
  6. // https://stackoverflow.com/questions/2241348/what-is-unicode-utf-8-utf-16
  7. pub fn utf16_size(&self) -> usize {
  8. count_utf16_code_units(&self.0)
  9. }
  10. pub fn utf16_code_unit_iter(&self) -> Utf16CodeUnitIterator {
  11. Utf16CodeUnitIterator::new(self)
  12. }
  13. pub fn sub_str(&self, interval: Interval) -> Option<String> {
  14. let mut iter = Utf16CodeUnitIterator::new(self);
  15. let mut buf = vec![];
  16. while let Some((byte, _len)) = iter.next() {
  17. if iter.utf16_offset >= interval.start && iter.utf16_offset < interval.end {
  18. buf.extend_from_slice(byte);
  19. }
  20. }
  21. if buf.is_empty() {
  22. return None;
  23. }
  24. match str::from_utf8(&buf) {
  25. Ok(item) => Some(item.to_owned()),
  26. Err(_e) => None,
  27. }
  28. }
  29. #[allow(dead_code)]
  30. fn utf16_code_point_iter(&self) -> FlowyUtf16CodePointIterator {
  31. FlowyUtf16CodePointIterator::new(self, 0)
  32. }
  33. }
  34. impl std::ops::Deref for FlowyStr {
  35. type Target = String;
  36. fn deref(&self) -> &Self::Target {
  37. &self.0
  38. }
  39. }
  40. impl std::ops::DerefMut for FlowyStr {
  41. fn deref_mut(&mut self) -> &mut Self::Target {
  42. &mut self.0
  43. }
  44. }
  45. impl std::convert::From<String> for FlowyStr {
  46. fn from(s: String) -> Self {
  47. FlowyStr(s)
  48. }
  49. }
  50. impl std::convert::From<&str> for FlowyStr {
  51. fn from(s: &str) -> Self {
  52. s.to_owned().into()
  53. }
  54. }
  55. impl std::fmt::Display for FlowyStr {
  56. fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
  57. f.write_str(&self.0)
  58. }
  59. }
  60. impl std::ops::Add<&str> for FlowyStr {
  61. type Output = FlowyStr;
  62. fn add(self, rhs: &str) -> FlowyStr {
  63. let new_value = self.0 + rhs;
  64. new_value.into()
  65. }
  66. }
  67. impl std::ops::AddAssign<&str> for FlowyStr {
  68. fn add_assign(&mut self, rhs: &str) {
  69. self.0 += rhs;
  70. }
  71. }
  72. impl Serialize for FlowyStr {
  73. fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  74. where
  75. S: Serializer,
  76. {
  77. serializer.serialize_str(&self.0)
  78. }
  79. }
  80. impl<'de> Deserialize<'de> for FlowyStr {
  81. fn deserialize<D>(deserializer: D) -> Result<FlowyStr, D::Error>
  82. where
  83. D: Deserializer<'de>,
  84. {
  85. struct FlowyStrVisitor;
  86. impl<'de> Visitor<'de> for FlowyStrVisitor {
  87. type Value = FlowyStr;
  88. fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
  89. formatter.write_str("a str")
  90. }
  91. fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
  92. where
  93. E: de::Error,
  94. {
  95. Ok(s.into())
  96. }
  97. }
  98. deserializer.deserialize_str(FlowyStrVisitor)
  99. }
  100. }
  101. pub struct Utf16CodeUnitIterator<'a> {
  102. s: &'a FlowyStr,
  103. byte_offset: usize,
  104. utf16_offset: usize,
  105. utf16_count: usize,
  106. }
  107. impl<'a> Utf16CodeUnitIterator<'a> {
  108. pub fn new(s: &'a FlowyStr) -> Self {
  109. Utf16CodeUnitIterator {
  110. s,
  111. byte_offset: 0,
  112. utf16_offset: 0,
  113. utf16_count: 0,
  114. }
  115. }
  116. }
  117. impl<'a> Iterator for Utf16CodeUnitIterator<'a> {
  118. type Item = (&'a [u8], usize);
  119. fn next(&mut self) -> Option<Self::Item> {
  120. let _len = self.s.len();
  121. if self.byte_offset == self.s.len() {
  122. None
  123. } else {
  124. let b = self.s.as_bytes()[self.byte_offset];
  125. let start = self.byte_offset;
  126. let end = self.byte_offset + len_utf8_from_first_byte(b);
  127. if (b as i8) >= -0x40 {
  128. self.utf16_count += 1;
  129. }
  130. if b >= 0xf0 {
  131. self.utf16_count += 1;
  132. }
  133. if self.utf16_count > 0 {
  134. self.utf16_offset = self.utf16_count - 1;
  135. }
  136. self.byte_offset = end;
  137. let byte = &self.s.as_bytes()[start..end];
  138. Some((byte, end - start))
  139. }
  140. }
  141. }
  142. pub struct FlowyUtf16CodePointIterator<'a> {
  143. s: &'a FlowyStr,
  144. offset: usize,
  145. }
  146. impl<'a> FlowyUtf16CodePointIterator<'a> {
  147. pub fn new(s: &'a FlowyStr, offset: usize) -> Self {
  148. FlowyUtf16CodePointIterator { s, offset }
  149. }
  150. }
  151. use crate::core::Interval;
  152. use std::str;
  153. impl<'a> Iterator for FlowyUtf16CodePointIterator<'a> {
  154. type Item = String;
  155. fn next(&mut self) -> Option<Self::Item> {
  156. if self.offset == self.s.len() {
  157. None
  158. } else {
  159. let byte = self.s.as_bytes()[self.offset];
  160. let end = len_utf8_from_first_byte(byte);
  161. let buf = &self.s.as_bytes()[self.offset..self.offset + end];
  162. self.offset += end;
  163. match str::from_utf8(buf) {
  164. Ok(item) => Some(item.to_string()),
  165. Err(_e) => None,
  166. }
  167. }
  168. }
  169. }
  /// Counts the UTF-16 code units needed to encode `s`.
  ///
  /// Works directly on the UTF-8 bytes: every byte that is not a continuation
  /// byte contributes one unit, and a lead byte of `0xf0` or above (a four-byte
  /// sequence) contributes one more.
  pub fn count_utf16_code_units(s: &str) -> usize {
      s.bytes()
          .map(|byte| {
              let not_continuation = (byte as i8) >= -0x40;
              let four_byte_lead = byte >= 0xf0;
              usize::from(not_continuation) + usize::from(four_byte_lead)
          })
          .sum()
  }
  /// Returns how many bytes a UTF-8 encoded code point occupies, judged solely
  /// from its first byte `b`.
  ///
  /// Bytes below `0x80` map to 1, below `0xe0` to 2, below `0xf0` to 3 and
  /// everything else to 4; the byte is not checked for being a valid lead byte.
  /// RFC reference : https://tools.ietf.org/html/rfc3629#section-4
  pub fn len_utf8_from_first_byte(b: u8) -> usize {
      match b {
          0x00..=0x7f => 1,
          0x80..=0xdf => 2,
          0xe0..=0xef => 3,
          0xf0..=0xff => 4,
      }
  }
  #[cfg(test)]
  mod tests {
      use crate::core::{FlowyStr, Interval};

      // An emoji takes two UTF-16 units; slicing by unit offsets returns whole characters.
      #[test]
      fn flowy_str_code_unit() {
          let emoji_units = FlowyStr::from("👋").utf16_size();
          assert_eq!(emoji_units, 2);

          let text = FlowyStr::from("👋 \n👋");
          let first = text.sub_str(Interval::new(0, emoji_units)).unwrap();
          assert_eq!(first, "👋");
          let space = text.sub_str(Interval::new(2, 3)).unwrap();
          assert_eq!(space, " ");
          let newline = text.sub_str(Interval::new(3, 4)).unwrap();
          assert_eq!(newline, "\n");
          let last = text.sub_str(Interval::new(4, 4 + emoji_units)).unwrap();
          assert_eq!(last, "👋");
      }

      // Three-byte characters count as one unit each, the emoji as two.
      #[test]
      fn flowy_str_sub_str_in_chinese() {
          let text = FlowyStr::from("你好\n😁");
          assert_eq!(text.utf16_size(), 5);

          let head = text.sub_str(Interval::new(0, 2)).unwrap();
          let middle = text.sub_str(Interval::new(2, 3)).unwrap();
          let tail = text.sub_str(Interval::new(3, 5)).unwrap();
          assert_eq!(head, "你好");
          assert_eq!(middle, "\n");
          assert_eq!(tail, "😁");
      }

      // An interval that covers the emoji and the following space.
      #[test]
      fn flowy_str_sub_str_in_chinese2() {
          let text = FlowyStr::from("😁 \n");
          assert_eq!(text.utf16_size(), 4);

          let head = text.sub_str(Interval::new(0, 3)).unwrap();
          let tail = text.sub_str(Interval::new(3, 4)).unwrap();
          assert_eq!(head, "😁 ");
          assert_eq!(tail, "\n");
      }

      // Plain ASCII: one unit per character.
      #[test]
      fn flowy_str_sub_str_in_english() {
          let text = FlowyStr::from("ab");
          assert_eq!(text.utf16_size(), 2);

          let whole = text.sub_str(Interval::new(0, 2)).unwrap();
          assert_eq!(whole, "ab");
      }

      // The code point iterator yields one character per step, then `None`.
      #[test]
      fn flowy_str_utf16_code_point_iter_test1() {
          let text = FlowyStr::from("👋😁👋");
          let mut chars = text.utf16_code_point_iter();
          assert_eq!(chars.next(), Some("👋".to_string()));
          assert_eq!(chars.next(), Some("😁".to_string()));
          assert_eq!(chars.next(), Some("👋".to_string()));
          assert_eq!(chars.next(), None);
      }

      // Standard iterator adaptors work on the code point iterator.
      #[test]
      fn flowy_str_utf16_code_point_iter_test2() {
          let text = FlowyStr::from("👋😁👋");
          let picked: String = text.utf16_code_point_iter().skip(1).take(1).collect();
          assert_eq!(picked, "😁".to_string());
      }
  }