1use once_cell::sync::Lazy;
2use regex::Regex;
3
4static MIME_WORD_RE: Lazy<Regex> =
5 Lazy::new(|| Regex::new(r"=\?([^?]+)\?([BbQq])\?([^?]*)\?=").unwrap());
6
/// A single header field as a name/body pair.
///
/// `PartialEq`/`Eq` are derived so headers can be compared directly, for
/// example in assertions or when de-duplicating.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Header {
    /// Field name. `parse_headers` stores it trimmed and lowercased.
    pub name: String,
    /// Field body. `parse_headers` stores it trimmed, with continuation
    /// lines already joined by single spaces.
    pub body: String,
}
12
13pub fn parse_headers(data: &[u8]) -> Vec<Header> {
14 let text = String::from_utf8_lossy(data);
15 let mut headers = Vec::new();
16 let mut current_name = String::new();
17 let mut current_body = String::new();
18 let mut in_headers = true;
19
20 for line in text.lines() {
21 if in_headers {
22 if line.is_empty() {
23 in_headers = false;
24 if !current_name.is_empty() {
25 headers.push(Header {
26 name: current_name.trim().to_lowercase(),
27 body: current_body.trim().to_string(),
28 });
29 current_name.clear();
30 current_body.clear();
31 }
32 continue;
33 }
34
35 if line.starts_with([' ', '\t']) {
36 if !current_name.is_empty() {
37 current_body.push(' ');
38 current_body.push_str(line.trim());
39 }
40 } else if let Some((name, body)) = line.split_once(':') {
41 if !current_name.is_empty() {
42 headers.push(Header {
43 name: current_name.trim().to_lowercase(),
44 body: current_body.trim().to_string(),
45 });
46 }
47 current_name = name.to_string();
48 current_body = body.to_string();
49 }
50 }
51 }
52
53 if !current_name.is_empty() {
54 headers.push(Header {
55 name: current_name.trim().to_lowercase(),
56 body: current_body.trim().to_string(),
57 });
58 }
59
60 headers
61}
62
63pub fn find_header<'a>(headers: &'a [Header], name: &str) -> Option<&'a str> {
64 let lower = name.to_lowercase();
65 headers.iter().find(|h| h.name == lower).map(|h| h.body.as_str())
66}
67
68pub fn find_headers<'a>(headers: &'a [Header], name: &str) -> Vec<&'a str> {
70 let lower = name.to_lowercase();
71 headers.iter().filter(|h| h.name == lower).map(|h| h.body.as_str()).collect()
72}
73
74pub fn decode_mime_words(s: &str) -> String {
75 let re = &*MIME_WORD_RE;
76
77 struct Match {
81 start: usize,
82 end: usize,
83 charset: String,
84 raw_bytes: Vec<u8>,
85 }
86
87 let mut matches: Vec<Match> = Vec::new();
88 for caps in re.captures_iter(s) {
89 let full = caps.get(0).unwrap();
90 let charset = caps.get(1).unwrap().as_str().to_string();
91 let encoding = caps.get(2).unwrap().as_str();
92 let encoded = caps.get(3).unwrap().as_str();
93
94 let raw_bytes = match encoding.to_uppercase().as_str() {
95 "B" => decode_base64_mime_bytes(encoded),
96 "Q" => decode_quoted_printable_bytes(encoded),
97 _ => encoded.as_bytes().to_vec(),
98 };
99
100 matches.push(Match { start: full.start(), end: full.end(), charset, raw_bytes });
101 }
102
103 if matches.is_empty() {
104 return s.to_string();
105 }
106
107 let mut result = String::new();
109 let mut prev_end: usize = 0;
110
111 let mut i = 0;
112 while i < matches.len() {
113 result.push_str(&s[prev_end..matches[i].start]);
115
116 let mut group_bytes: Vec<u8> = matches[i].raw_bytes.clone();
118 let mut group_charset = matches[i].charset.clone();
119 let mut group_end = matches[i].end;
120 let mut j = i + 1;
121
122 while j < matches.len() {
123 let between = &s[group_end..matches[j].start];
124 if between.chars().all(|c| c == ' ' || c == '\t' || c == '\r' || c == '\n') {
125 if matches[j].charset.eq_ignore_ascii_case(&group_charset) {
128 group_bytes.extend_from_slice(&matches[j].raw_bytes);
129 } else {
130 result.push_str(&decode_to_utf8(&group_bytes, &group_charset));
132 group_bytes = matches[j].raw_bytes.clone();
133 group_charset = matches[j].charset.clone();
134 }
135 group_end = matches[j].end;
136 j += 1;
137 } else {
138 break;
139 }
140 }
141
142 result.push_str(&decode_to_utf8(&group_bytes, &group_charset));
143 prev_end = group_end;
144 i = j;
145 }
146
147 result.push_str(&s[prev_end..]);
149 result
150}
151
/// Lowercases a charset label and rewrites the aliases `iso-8859-75` and
/// `iso885975` to `iso-8859-7`. Every other label is returned lowercased.
fn normalize_charset(charset: &str) -> String {
    let lowered = charset.to_lowercase();
    if matches!(lowered.as_str(), "iso-8859-75" | "iso885975") {
        String::from("iso-8859-7")
    } else {
        lowered
    }
}
163
164pub fn decode_to_utf8(data: &[u8], charset: &str) -> String {
165 let charset_normalized = normalize_charset(charset);
166 let charset_lower = charset_normalized.as_str();
167
168 if charset_lower == "utf-8" || charset_lower == "utf8" || data.is_ascii() {
169 return String::from_utf8_lossy(data).to_string();
170 }
171
172 if charset_lower == "iso-8859-1" || charset_lower == "iso-8859-15" {
175 for fallback in &["iso-8859-7", "windows-1253"] {
177 if let Some(encoding) = encoding_rs::Encoding::for_label(fallback.as_bytes()) {
178 let (cow, _, _) = encoding.decode(data);
179
180 let greek_count =
182 cow.chars().filter(|c| ('\u{0370}'..='\u{03FF}').contains(c)).count();
183 let total_alpha = cow.chars().filter(|c| c.is_alphabetic()).count();
184
185 if total_alpha > 0 && greek_count * 100 / total_alpha > 30 {
187 return cow.into_owned();
188 }
189 }
190 }
191 }
193
194 if let Some(encoding) = encoding_rs::Encoding::for_label(charset_normalized.as_bytes()) {
196 let (cow, _, had_errors) = encoding.decode(data);
197 if !had_errors || !cow.contains('\u{FFFD}') {
199 return cow.into_owned();
200 }
201 }
202
203 for fallback in &["iso-8859-7", "windows-1253", "windows-1252", "iso-8859-1"] {
205 if let Some(encoding) = encoding_rs::Encoding::for_label(fallback.as_bytes()) {
206 let (cow, _, _) = encoding.decode(data);
207 if !cow.contains('\u{FFFD}') {
208 return cow.into_owned();
209 }
210 }
211 }
212
213 String::from_utf8_lossy(data).to_string()
215}
216
217fn decode_base64_mime_bytes(s: &str) -> Vec<u8> {
218 use base64::Engine as _;
219 let engine = base64::engine::general_purpose::STANDARD;
220 engine.decode(s).unwrap_or_else(|_| s.as_bytes().to_vec())
221}
222
223fn decode_quoted_printable_bytes(s: &str) -> Vec<u8> {
224 let s = s.replace('_', " ");
225 let data = s.as_bytes();
226 let mut result = Vec::with_capacity(data.len());
227 let mut i = 0;
228
229 while i < data.len() {
230 if data[i] == b'=' && i + 2 < data.len() {
231 if let (Some(h), Some(l)) = (hex_val(data[i + 1]), hex_val(data[i + 2])) {
232 result.push(h << 4 | l);
233 i += 3;
234 continue;
235 }
236 }
237 if data[i] != b'\r' {
238 result.push(data[i]);
239 }
240 i += 1;
241 }
242
243 result
244}
245
/// Returns the numeric value (0-15) of an ASCII hex digit in either case, or
/// `None` when `b` is not a hex digit.
fn hex_val(b: u8) -> Option<u8> {
    char::from(b)
        .to_digit(16)
        .and_then(|digit| u8::try_from(digit).ok())
}
254
/// Unfolds a header value onto a single line.
///
/// All carriage returns and line feeds are removed. A tab that directly
/// follows a line feed is turned into a space; spaces and all other
/// characters are kept as they are.
pub fn unfold_header(s: &str) -> String {
    let mut unfolded = String::with_capacity(s.len());
    // True only while the previously seen character was a line feed.
    let mut after_line_feed = false;

    for ch in s.chars() {
        match ch {
            '\n' => after_line_feed = true,
            '\r' => after_line_feed = false,
            '\t' if after_line_feed => {
                unfolded.push(' ');
                after_line_feed = false;
            }
            other => {
                unfolded.push(other);
                after_line_feed = false;
            }
        }
    }

    unfolded
}
263
/// Splits an address string into `(display name, email address)`.
///
/// The trimmed input is checked against these forms, in order:
///
/// * `Name <addr>`: the name is the text before `<`, trimmed and with
///   surrounding `"` removed (`None` when `<` is the first character); the
///   address is the text between `<` and the next `>` (`None` without a `>`).
/// * `addr (Name)`: the address is the trimmed text before `(`; the name is
///   the text between `(` and the next `)` (`None` without a `)`).
/// * Anything containing `@` is returned as an address with no name.
/// * Anything else is returned as a name with no address.
pub fn parse_email_address(s: &str) -> (Option<String>, Option<String>) {
    let trimmed = s.trim();

    if let Some((before, after)) = trimmed.split_once('<') {
        let name = (!before.is_empty()).then(|| before.trim().trim_matches('"').to_string());
        let email = after.split_once('>').map(|(address, _rest)| address.to_string());
        return (name, email);
    }

    if let Some((before, after)) = trimmed.split_once('(') {
        let email = Some(before.trim().to_string());
        let name = after.split_once(')').map(|(comment, _rest)| comment.to_string());
        return (name, email);
    }

    let whole = trimmed.to_string();
    if trimmed.contains('@') {
        (None, Some(whole))
    } else {
        (Some(whole), None)
    }
}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `decode_mime_words(input)` yields exactly `expected`.
    #[track_caller]
    fn assert_decodes(input: &str, expected: &str) {
        assert_eq!(decode_mime_words(input), expected);
    }

    // ---- header parsing and lookup ----

    #[test]
    fn test_parse_simple_headers() {
        let parsed = parse_headers(b"From: alice@example.com\nSubject: Hello\n\nBody text\n");
        assert_eq!(parsed.len(), 2);

        let (first, second) = (&parsed[0], &parsed[1]);
        assert_eq!(first.name, "from");
        assert_eq!(first.body, "alice@example.com");
        assert_eq!(second.name, "subject");
        assert_eq!(second.body, "Hello");
    }

    #[test]
    fn test_find_header() {
        let parsed = parse_headers(b"From: alice@example.com\nSubject: Test\n\nBody\n");
        assert_eq!(find_header(&parsed, "From"), Some("alice@example.com"));
        assert_eq!(find_header(&parsed, "Subject"), Some("Test"));
        assert_eq!(find_header(&parsed, "Date"), None);
    }

    #[test]
    fn test_folded_headers() {
        let parsed = parse_headers(b"Subject: A very long\n subject header\n\nBody\n");
        assert_eq!(find_header(&parsed, "Subject"), Some("A very long subject header"));
    }

    // ---- basic encoded-word decoding ----

    #[test]
    fn test_decode_mime_b() {
        assert_decodes("=?UTF-8?B?SGVsbG8gV29ybGQ=?=", "Hello World");
    }

    #[test]
    fn test_decode_mime_q() {
        assert_decodes("=?utf-8?Q?H=C3=A5kan?=", "Håkan");
    }

    #[test]
    fn test_decode_mime_mixed() {
        assert_decodes("Re: =?UTF-8?B?SGVsbG8=?=", "Re: Hello");
    }

    #[test]
    fn test_decode_mime_q_iso8859_1() {
        assert_decodes("=?ISO-8859-1?Q?H=E5kan?=", "Håkan");
    }

    #[test]
    fn test_decode_mime_b_iso8859_1() {
        assert_decodes("=?ISO-8859-1?B?SOVrYW4=?=", "Håkan");
    }

    #[test]
    fn test_decode_mime_q_shift_jis() {
        assert_decodes("=?Shift_JIS?B?k/qWe4zq?=", "日本語");
    }

    #[test]
    fn test_decode_mime_adjacent_words() {
        assert_decodes("=?UTF-8?Q?Hello=20?==?UTF-8?Q?World?=", "Hello World");
    }

    // ---- addresses, unfolding, empty header section ----

    #[test]
    fn test_parse_email() {
        assert_eq!(
            parse_email_address("Alice <alice@example.com>"),
            (Some("Alice".to_string()), Some("alice@example.com".to_string()))
        );
    }

    #[test]
    fn test_parse_email_no_name() {
        assert_eq!(
            parse_email_address("alice@example.com"),
            (None, Some("alice@example.com".to_string()))
        );
    }

    #[test]
    fn test_unfold_header() {
        assert_eq!(
            unfold_header("Subject: A very\r\n long subject"),
            "Subject: A very long subject"
        );
    }

    #[test]
    fn test_empty_headers() {
        assert!(parse_headers(b"\nJust body\n").is_empty());
    }

    // ---- Greek charsets ----

    #[test]
    fn test_decode_mime_b_iso_8859_7_kalimera() {
        assert_decodes("=?ISO-8859-7?B?yuHr5+zl8eE=?=", "Καλημερα");
    }

    #[test]
    fn test_decode_mime_q_iso_8859_7_geia() {
        assert_decodes("=?ISO-8859-7?Q?=C3=E5=E9=E1?=", "Γεια");
    }

    #[test]
    fn test_decode_mime_b_windows_1253_anthropos() {
        assert_decodes("=?windows-1253?B?3O3o8fnw7/I=?=", "άνθρωπος");
    }

    #[test]
    fn test_decode_mime_greek_mixed_text() {
        assert_decodes("Re: =?ISO-8859-7?B?yuHr5+zl8eE=?=", "Re: Καλημερα");
    }

    #[test]
    fn test_decode_mime_greek_adjacent_words() {
        assert_decodes("=?ISO-8859-7?B?w+Xp4Q==?= =?ISO-8859-7?B?8+/1?=", "Γειασου");
    }

    #[test]
    fn test_decode_mime_utf8_greek() {
        assert_decodes("=?UTF-8?B?zprOsc67zrfOvM61z4HOsQ==?=", "Καλημερα");
    }

    #[test]
    fn test_decode_mime_multiple_charsets() {
        assert_decodes("=?ISO-8859-7?B?w+Xp4Q==?= =?ISO-8859-1?Q?H=E5kan?=", "ΓειαHåkan");
    }

    #[test]
    fn test_decode_mime_iso_8859_7_tonos() {
        assert_decodes("=?ISO-8859-7?B?3O3o8fnw7/I=?=", "άνθρωπος");
    }

    #[test]
    fn test_parse_headers_greek_subject() {
        let parsed = parse_headers(b"Subject: \xC3\xE5\xE9\xE1\nFrom: test@test.com\n\nBody\n");
        let subject = find_header(&parsed, "Subject").unwrap();
        let acceptable = subject.contains('\u{FFFD}') || subject == "Γεια";
        assert!(
            acceptable,
            "Raw non-UTF-8 subject should either contain replacement chars or be valid UTF-8"
        );
    }

    #[test]
    fn test_decode_mime_uppercase_tonos_iso_8859_7() {
        assert_decodes("=?ISO-8859-7?B?tu3o8fnw7/I=?=", "Άνθρωπος");
    }

    #[test]
    fn test_decode_mime_uppercase_tonos_windows_1253() {
        assert_decodes("=?windows-1253?B?ou3o8fnw7/I=?=", "Άνθρωπος");
    }

    #[test]
    fn test_decode_mime_real_world_greeting() {
        assert_decodes(
            "=?ISO-8859-7?Q?=CA=E1=EB=FC_=E1=F0=FC=E3=E5=F5=EC=E1?=",
            "Καλό απόγευμα",
        );
    }

    #[test]
    fn test_decode_mime_greek_question() {
        assert_decodes("=?ISO-8859-7?Q?=D0=FE=F2_=E5=DF=F3=E1=E9;?=", "Πώς είσαι;");
    }

    #[test]
    fn test_decode_mime_mixed_greek_latin_subject() {
        assert_decodes("Re: =?ISO-8859-7?B?yuHr5+zl8eE=?=", "Re: Καλημερα");
    }

    #[test]
    fn test_decode_mime_greek_with_numbers() {
        assert_decodes("=?ISO-8859-7?Q?=D3=E5=EB=DF=E4=E1_123?=", "Σελίδα 123");
    }

    #[test]
    fn test_decode_mime_fwd_greek() {
        assert_decodes("Fwd: =?UTF-8?B?zpXOu867zrfOvc65zrrOrA==?=", "Fwd: Ελληνικά");
    }

    #[test]
    fn test_decode_mime_greek_parentheses() {
        assert_decodes("(=?ISO-8859-7?B?0+fs4e306er8?=)", "(Σημαντικό)");
    }

    #[test]
    fn test_decode_mime_multiple_greek_words_adjacent() {
        assert_decodes("=?ISO-8859-7?B?yuHr3g==?= =?ISO-8859-7?B?7N3x4Q==?=", "Καλήμέρα");
    }

    #[test]
    fn test_decode_mime_greek_diaeresis() {
        assert_decodes("=?ISO-8859-7?B?+uTp7/I=?=", "ϊδιος");
    }

    #[test]
    fn test_decode_mime_windows_1253_real_world() {
        assert_decodes("=?windows-1253?B?xfX34fHp8/T+?=", "Ευχαριστώ");
    }

    // ---- charset detection in decode_to_utf8 ----

    #[test]
    fn test_decode_mislabeled_iso_8859_1_as_greek() {
        let result = decode_to_utf8(
            b"\xD3\xF9\xF3\xF4\xDC\x20\xFC\xEB\xE1\x20\xE1\xF5\xF4\xDC",
            "iso-8859-1",
        );
        let detected = result.contains("Σωστά") || result.contains("ωστά");
        assert!(detected, "Should detect Greek in mislabeled iso-8859-1: got '{}'", result);
    }

    #[test]
    fn test_decode_correct_iso_8859_1_latin() {
        let result = decode_to_utf8(b"Caf\xE9 r\xE9sum\xE9", "iso-8859-1");
        assert_eq!(result, "Café résumé", "Should preserve correct Latin-1 text");
    }

    // ---- multi-value lookup and remaining address forms ----

    #[test]
    fn test_find_headers_multiple_values() {
        let parsed =
            parse_headers(b"Received: from a\nReceived: from b\nFrom: alice@example.com\n\nBody\n");
        let received = find_headers(&parsed, "Received");
        assert_eq!(received.len(), 2);
        assert!(received.contains(&"from a"));
        assert!(received.contains(&"from b"));
    }

    #[test]
    fn test_find_headers_none_found() {
        let parsed = parse_headers(b"From: alice@example.com\n\nBody\n");
        assert!(find_headers(&parsed, "X-Missing").is_empty());
    }

    #[test]
    fn test_unfold_header_bare_lf() {
        assert_eq!(unfold_header("Subject: Long\n header"), "Subject: Long header");
    }

    #[test]
    fn test_parse_email_paren_style() {
        assert_eq!(
            parse_email_address("alice@example.com (Alice)"),
            (Some("Alice".to_string()), Some("alice@example.com".to_string()))
        );
    }

    #[test]
    fn test_parse_email_bare_name() {
        assert_eq!(parse_email_address("Alice"), (Some("Alice".to_string()), None));
    }
}