flowy_str.rs 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. use serde::{de, de::Visitor, Deserialize, Deserializer, Serialize, Serializer};
  2. use std::{fmt, fmt::Formatter, slice};
  /// Owned string newtype used by this module's UTF-16-aware helpers.
  ///
  /// The wrapped [`String`] is public and can be read or replaced through `.0`.
  #[derive(Debug, Clone, PartialEq, Eq)]
  pub struct FlowyStr(pub String);
  5. impl FlowyStr {
  6. pub fn count_utf16_code_units(&self) -> usize { count_utf16_code_units(&self.0) }
  7. pub fn utf16_iter(&self) -> FlowyUtf16Iterator { FlowyUtf16Iterator::new(self, 0) }
  8. pub fn code_point_iter(&self) -> CodePointIterator { CodePointIterator::new(self) }
  9. pub fn sub_str(&self, interval: Interval) -> String {
  10. match self.with_interval(interval) {
  11. None => "".to_owned(),
  12. Some(s) => s.0,
  13. }
  14. }
  15. pub fn with_interval(&self, interval: Interval) -> Option<FlowyStr> {
  16. let mut iter = CodePointIterator::new(self);
  17. let mut buf = vec![];
  18. while let Some((byte, _len)) = iter.next() {
  19. if interval.start < iter.code_point_offset && interval.end >= iter.code_point_offset {
  20. buf.extend_from_slice(byte);
  21. }
  22. }
  23. if buf.is_empty() {
  24. return None;
  25. }
  26. match str::from_utf8(&buf) {
  27. Ok(item) => Some(item.into()),
  28. Err(_e) => None,
  29. }
  30. }
  31. }
  32. pub struct CodePointIterator<'a> {
  33. s: &'a FlowyStr,
  34. bytes_offset: usize,
  35. code_point_offset: usize,
  36. iter_index: usize,
  37. iter: slice::Iter<'a, u8>,
  38. }
  39. impl<'a> CodePointIterator<'a> {
  40. pub fn new(s: &'a FlowyStr) -> Self {
  41. CodePointIterator {
  42. s,
  43. bytes_offset: 0,
  44. code_point_offset: 0,
  45. iter_index: 0,
  46. iter: s.as_bytes().iter(),
  47. }
  48. }
  49. }
  50. impl<'a> Iterator for CodePointIterator<'a> {
  51. type Item = (&'a [u8], usize);
  52. fn next(&mut self) -> Option<Self::Item> {
  53. let start = self.bytes_offset;
  54. let _end = start;
  55. while let Some(&b) = self.iter.next() {
  56. self.iter_index += 1;
  57. let mut code_point_count = 0;
  58. if self.bytes_offset > self.iter_index {
  59. continue;
  60. }
  61. if self.bytes_offset == self.iter_index {
  62. break;
  63. }
  64. if (b as i8) >= -0x40 {
  65. code_point_count += 1
  66. }
  67. if b >= 0xf0 {
  68. code_point_count += 1
  69. }
  70. self.bytes_offset += len_utf8_from_first_byte(b);
  71. self.code_point_offset += code_point_count;
  72. if code_point_count == 1 {
  73. break;
  74. }
  75. }
  76. if start == self.bytes_offset {
  77. return None;
  78. }
  79. let byte = &self.s.as_bytes()[start..self.bytes_offset];
  80. Some((byte, self.bytes_offset - start))
  81. }
  82. }
  83. impl std::ops::Deref for FlowyStr {
  84. type Target = String;
  85. fn deref(&self) -> &Self::Target { &self.0 }
  86. }
  87. impl std::ops::DerefMut for FlowyStr {
  88. fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 }
  89. }
  90. impl std::convert::From<String> for FlowyStr {
  91. fn from(s: String) -> Self { FlowyStr(s) }
  92. }
  93. impl std::convert::From<&str> for FlowyStr {
  94. fn from(s: &str) -> Self { s.to_owned().into() }
  95. }
  96. impl std::fmt::Display for FlowyStr {
  97. fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(&self.0) }
  98. }
  99. impl std::ops::Add<&str> for FlowyStr {
  100. type Output = FlowyStr;
  101. fn add(self, rhs: &str) -> FlowyStr {
  102. let new_value = self.0 + rhs;
  103. new_value.into()
  104. }
  105. }
  106. impl std::ops::AddAssign<&str> for FlowyStr {
  107. fn add_assign(&mut self, rhs: &str) { self.0 += rhs; }
  108. }
  109. impl Serialize for FlowyStr {
  110. fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  111. where
  112. S: Serializer,
  113. {
  114. serializer.serialize_str(&self.0)
  115. }
  116. }
  117. impl<'de> Deserialize<'de> for FlowyStr {
  118. fn deserialize<D>(deserializer: D) -> Result<FlowyStr, D::Error>
  119. where
  120. D: Deserializer<'de>,
  121. {
  122. struct FlowyStrVisitor;
  123. impl<'de> Visitor<'de> for FlowyStrVisitor {
  124. type Value = FlowyStr;
  125. fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a str") }
  126. fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
  127. where
  128. E: de::Error,
  129. {
  130. Ok(s.into())
  131. }
  132. }
  133. deserializer.deserialize_str(FlowyStrVisitor)
  134. }
  135. }
  136. pub struct FlowyUtf16Iterator<'a> {
  137. s: &'a FlowyStr,
  138. offset: usize,
  139. }
  140. impl<'a> FlowyUtf16Iterator<'a> {
  141. pub fn new(s: &'a FlowyStr, offset: usize) -> Self { FlowyUtf16Iterator { s, offset } }
  142. }
  143. use crate::core::Interval;
  144. use std::str;
  145. impl<'a> Iterator for FlowyUtf16Iterator<'a> {
  146. type Item = String;
  147. fn next(&mut self) -> Option<Self::Item> {
  148. if self.offset == self.s.len() {
  149. None
  150. } else {
  151. let byte = self.s.as_bytes()[self.offset];
  152. let end = len_utf8_from_first_byte(byte);
  153. let buf = &self.s.as_bytes()[self.offset..self.offset + end];
  154. self.offset += end;
  155. match str::from_utf8(buf) {
  156. Ok(item) => Some(item.to_string()),
  157. Err(_e) => None,
  158. }
  159. }
  160. }
  161. }
  /// Returns the number of UTF-16 code units needed to encode `s`.
  ///
  /// Each character contributes one unit, except characters outside the Basic
  /// Multilingual Plane (four bytes in UTF-8), which contribute two.
  pub fn count_utf16_code_units(s: &str) -> usize {
      s.chars().map(char::len_utf16).sum()
  }
  /// Returns how many bytes a UTF-8 encoded code point occupies, judging only
  /// by its first byte `b`.
  ///
  /// The byte is not validated: every value in `0x80..=0xdf`, which includes
  /// continuation bytes, is reported as the start of a 2-byte sequence.
  /// RFC reference : https://tools.ietf.org/html/rfc3629#section-4
  pub fn len_utf8_from_first_byte(b: u8) -> usize {
      match b {
          0x00..=0x7f => 1,
          0x80..=0xdf => 2,
          0xe0..=0xef => 3,
          0xf0..=0xff => 4,
      }
  }
  #[cfg(test)]
  mod tests {
      use crate::core::{FlowyStr, Interval};

      /// `utf16_iter` yields one emoji per step and then stops.
      #[test]
      fn flowy_str_utf16_test() {
          let s = FlowyStr::from("👋😁👋😁");
          let mut iter = s.utf16_iter();
          for expected in ["👋", "😁", "👋", "😁"].iter() {
              assert_eq!(iter.next().unwrap(), expected.to_string());
          }
          assert_eq!(iter.next(), None);
      }

      /// `utf16_iter` composes with the standard `skip` / `take` adaptors.
      #[test]
      fn flowy_str_utf16_iter_test() {
          let s = FlowyStr::from("👋👋😁😁👋👋");
          let result: String = s.utf16_iter().skip(2).take(2).collect();
          assert_eq!(result, "😁😁".to_string());
      }

      /// `with_interval` selects text by UTF-16 offsets; each emoji spans two units.
      #[test]
      fn flowy_str_code_point_test() {
          let s = FlowyStr::from("👋 \n👋");
          let cases = [(0, 2, "👋"), (2, 3, " "), (3, 4, "\n"), (4, 6, "👋")];
          for &(start, end, expected) in cases.iter() {
              let output = s.with_interval(Interval::new(start, end)).unwrap().0;
              assert_eq!(output, expected);
          }
      }
  }