1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
use crate::decode::XmlDecodeError;
use std::borrow::Cow;
/// Unescapes XML character data, resolving entity and character references.
///
/// Supports the five predefined XML entities (`&lt;`, `&gt;`, `&amp;`, `&quot;`, `&apos;`)
/// as well as decimal (`&#10;`) and hexadecimal (`&#xA;`) character references.
///
/// When `s` contains no `&` the input is returned as `Cow::Borrowed` without allocating;
/// otherwise a `Cow::Owned` string with every reference replaced is returned.
///
/// # Errors
///
/// Returns an [`XmlDecodeError`] built with `XmlDecodeError::invalid_escape` when:
/// - an `&` is not followed by a terminating `;` before the next `&` or the end of input,
/// - the text between `&` and `;` is neither a predefined entity nor starts with `#`,
/// - a numeric reference is empty, contains anything but digits of its base
///   (including a leading `+`), or does not fit in a `u32`,
/// - the number is not a valid Unicode scalar value (e.g. a surrogate).
pub fn unescape(s: &str) -> Result<Cow<'_, str>, XmlDecodeError> {
    // Fast path: without `&` there is nothing to unescape.
    if !s.contains('&') {
        return Ok(Cow::Borrowed(s));
    }
    // Every reference is at least as long as the character it encodes, so the output
    // never exceeds the input length and this capacity avoids reallocation.
    let mut res = String::with_capacity(s.len());
    let mut sections = s.split('&');
    // Everything before the first `&` is copied verbatim.
    if let Some(prefix) = sections.next() {
        res.push_str(prefix);
    }
    // Each remaining section starts right after an `&`: `<entity>;<literal text>`.
    for section in sections {
        let idx = section
            .find(';')
            .ok_or_else(|| XmlDecodeError::invalid_escape("unterminated pattern"))?;
        res.push(resolve_entity(&section[..idx])?);
        // Literal text between the `;` and the next `&`.
        res.push_str(&section[idx + 1..]);
    }
    Ok(Cow::Owned(res))
}

/// Resolves the text found between `&` and `;` to the character it denotes.
fn resolve_entity(entity: &str) -> Result<char, XmlDecodeError> {
    let (digits, radix) = match entity {
        "lt" => return Ok('<'),
        "gt" => return Ok('>'),
        "amp" => return Ok('&'),
        "quot" => return Ok('"'),
        "apos" => return Ok('\''),
        other => {
            // `#x` must be tested first: `#` alone is a prefix of it.
            if let Some(hex) = other.strip_prefix("#x") {
                (hex, 16)
            } else if let Some(dec) = other.strip_prefix('#') {
                (dec, 10)
            } else {
                return Err(XmlDecodeError::invalid_escape(other));
            }
        }
    };
    let not_a_number = || {
        XmlDecodeError::invalid_escape(format!(
            "expected numeric escape in base {}; got: {}",
            radix, digits
        ))
    };
    // `from_str_radix` tolerates a leading `+`, which an XML character reference
    // does not allow, so require every character to be a digit of the base.
    if !digits.chars().all(|c| c.is_digit(radix)) {
        return Err(not_a_number());
    }
    let char_code = u32::from_str_radix(digits, radix).map_err(|_| not_a_number())?;
    std::char::from_u32(char_code).ok_or_else(|| {
        XmlDecodeError::invalid_escape(format!("invalid char code: {}", char_code))
    })
}
#[cfg(test)]
mod test {
    use crate::unescape::unescape;
    use std::borrow::Cow;

    /// The five predefined XML entities are replaced by their characters.
    #[test]
    fn basic_unescape() {
        assert_eq!(
            unescape("&lt; &gt; &apos; &quot; &amp;").unwrap(),
            "< > ' \" &"
        );
        assert_eq!(
            unescape("Since a &gt; b, b is less than a").unwrap(),
            "Since a > b, b is less than a"
        );
    }

    /// Input without `&` is handed back borrowed, untouched.
    #[test]
    fn no_need_to_escape() {
        assert_eq!(unescape("hello 🍕!").unwrap(), Cow::Borrowed("hello 🍕!"));
    }

    /// Adjacent entities, trailing `;` characters and already-unescaped output.
    #[test]
    fn complex_unescape() {
        assert_eq!(
            unescape("a&lt;b&gt;c&quot;d&apos;e&amp;f;;").unwrap(),
            "a<b>c\"d'e&f;;"
        );
        assert_eq!(unescape("&lt;").unwrap(), "<");
        // Unescaping is applied exactly once: an escaped `&` followed by `lt;`
        // must not be resolved a second time.
        assert_eq!(unescape("&amp;lt;").unwrap(), "&lt;")
    }

    /// Decimal character references for line feed and carriage return.
    #[test]
    fn newline_encoding() {
        assert_eq!(unescape("&#10;").unwrap(), "\n");
        assert_eq!(unescape("&#13;").unwrap(), "\r");
    }

    /// Hexadecimal character references for the various end-of-line characters.
    #[test]
    fn xml_eol_encoding() {
        assert_eq!(unescape("&#xA; &#xA;").unwrap(), "\n \n");
        assert_eq!(
            unescape("a&#xD;&#xA; b&#xA; c&#xD;").unwrap(),
            "a\r\n b\n c\r"
        );
        assert_eq!(
            unescape("a&#xD;&#x85; b&#x85;").unwrap(),
            "a\r\u{0085} b\u{0085}"
        );
        assert_eq!(
            unescape("a&#xD;&#x2028; b&#x85; c&#x2028;").unwrap(),
            "a\r\u{2028} b\u{0085} c\u{2028}"
        );
    }

    /// Malformed references are reported as errors instead of being passed through.
    #[test]
    fn invalid_escapes() {
        unescape("&lte;").expect_err("lte does not make a ≤");
        unescape("&lt").expect_err("unterminated escape sequence");
        unescape("&#Q1234;").expect_err("Q does not began a numeric sequence");
        unescape("&#3.14;").expect_err("decimal escape");
        unescape("&#xZZ").expect_err("Z is not hex");
        unescape("here is a & but without an escape sequence...").expect_err("naked &");
    }

    use proptest::prelude::*;
    proptest! {
        /// Arbitrary input never panics, and any input containing `&` either
        /// fails or produces an owned (rewritten) string.
        #[test]
        fn no_panics(s: String) {
            let unescaped = unescape(&s);
            if s.contains('&') {
                assert!(
                    matches!(unescaped, Ok(Cow::Owned(_)) | Err(_))
                );
            }
        }
    }
}