Skip to main content

hypermail/
mbox.rs

1use crate::error::{HypermailError, Result};
2use std::io::{BufRead, BufReader, Read};
3
/// Default cap on the bytes buffered for one email message: 100 MiB.
///
/// Bounding the per-message buffer guards against denial of service through
/// oversized messages that would otherwise exhaust memory. Legitimate mail,
/// even with large attachments, normally stays below 50 MB.
const MAX_MESSAGE_SIZE: usize = 100 << 20;
10
/// Longest single line (header or body) the reader accepts: 10 MiB.
///
/// RFC 5322 (like RFC 2822 before it) limits a line to 998 characters and
/// recommends at most 78, both excluding the CRLF. The cap here is far more
/// generous so that malformed messages and large unwrapped MIME parts still
/// load. Lines longer than this are rejected with a parse error.
const MAX_LINE_SIZE: usize = 10 << 20;
17
/// Supported mbox format variants, which differ in how `From ` lines inside a
/// message are escaped and how message boundaries are determined.
///
/// The reader in this file only changes behaviour for [`MboxFormat::MboxRd`]
/// (it unquotes `>From ` lines); the other variants are all split purely on
/// `From ` separator lines.
///
/// The enum is a plain tag, so it is `Copy` and fully comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MboxFormat {
    /// Original format: body lines starting with `From ` are written as
    /// `>From `. The quoting is not reversible, so nothing is unquoted on read.
    MboxO,
    /// Reversible quoting: every line matching `>*From ` gains one extra `>`
    /// on write, and one `>` is removed again on read.
    MboxRd,
    /// `Content-Length`-based variant that also applies `MboxO`-style quoting.
    /// NOTE(review): no `Content-Length` handling is visible in this file.
    MboxCl,
    /// `Content-Length`-based variant without any `From ` quoting.
    /// NOTE(review): no `Content-Length` handling is visible in this file.
    MboxCl2,
}
26
/// A single raw email message split into envelope line, headers, and body bytes.
///
/// Headers and body are kept as unparsed bytes; no character-set decoding is
/// applied to them. The type is comparable and has an empty default so that
/// parsed results can be checked directly in tests and callers.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RawMessage {
    /// The mbox envelope (`From ...`) line that introduced the message,
    /// without its line terminator.
    pub from_line: String,
    /// Raw bytes of the header section.
    pub headers: Vec<u8>,
    /// Raw bytes of the message body; empty when the message has none.
    pub body: Vec<u8>,
}
34
35/// Streaming iterator that splits an mbox file into individual messages.
36///
37/// # Security
38///
39/// Enforces per-message and per-line size limits to prevent memory exhaustion
40/// from malicious or malformed input.
41pub struct MboxReader<R: Read> {
42    reader: BufReader<R>,
43    format: MboxFormat,
44    line_num: usize,
45    buffer: Vec<u8>,
46    eof: bool,
47    max_message_size: usize,
48}
49
50impl<R: Read> MboxReader<R> {
51    /// Creates a new mbox reader with the given format variant.
52    pub fn new(reader: R, format: MboxFormat) -> Self {
53        MboxReader {
54            reader: BufReader::new(reader),
55            format,
56            line_num: 0,
57            buffer: Vec::new(),
58            eof: false,
59            max_message_size: MAX_MESSAGE_SIZE,
60        }
61    }
62
63    /// Overrides the default maximum message size limit.
64    pub fn with_max_message_size(mut self, size: usize) -> Self {
65        self.max_message_size = size;
66        self
67    }
68
69    fn is_from_line(line: &[u8]) -> bool {
70        line.starts_with(b"From ") && line.len() > 5
71    }
72
73    #[allow(dead_code)]
74    fn is_from_line_mboxrd(line: &[u8]) -> bool {
75        line.starts_with(b">From ") || Self::is_from_line(line)
76    }
77
78    fn unescape_mboxrd(line: &[u8]) -> Vec<u8> {
79        if line.starts_with(b">From ") {
80            line[1..].to_vec()
81        } else {
82            line.to_vec()
83        }
84    }
85}
86
87impl<R: Read> Iterator for MboxReader<R> {
88    type Item = Result<RawMessage>;
89
90    fn next(&mut self) -> Option<Self::Item> {
91        if self.eof {
92            return None;
93        }
94
95        loop {
96            let mut line = Vec::new();
97            self.line_num += 1;
98
99            match self.reader.read_until(b'\n', &mut line) {
100                Ok(0) => {
101                    self.eof = true;
102                    if self.buffer.is_empty() {
103                        return None;
104                    }
105                    break;
106                },
107                Ok(_) => {},
108                Err(e) => {
109                    return Some(Err(HypermailError::MboxParse {
110                        line: self.line_num,
111                        message: format!("read error: {e}"),
112                    }))
113                },
114            }
115
116            // Security: Check line size to prevent memory exhaustion
117            // from pathological inputs with extremely long lines
118            if line.len() > MAX_LINE_SIZE {
119                return Some(Err(HypermailError::MboxParse {
120                    line: self.line_num,
121                    message: format!(
122                        "line exceeds maximum size ({} bytes > {} bytes)",
123                        line.len(),
124                        MAX_LINE_SIZE
125                    ),
126                }));
127            }
128
129            if line.last() == Some(&b'\n') {
130                line.pop();
131                if line.last() == Some(&b'\r') {
132                    line.pop();
133                }
134            }
135
136            if self.buffer.is_empty() {
137                if Self::is_from_line(&line) {
138                    self.buffer = line;
139                    self.buffer.push(b'\n');
140                    continue;
141                }
142                self.buffer = line;
143                self.buffer.push(b'\n');
144                continue;
145            }
146
147            if Self::is_from_line(&line) {
148                let raw = self.buffer.split_off(0);
149                self.buffer = line;
150                self.buffer.push(b'\n');
151                return Some(Ok(parse_raw_message(&raw)));
152            }
153
154            // Security: Check total message size before appending
155            // Prevents denial-of-service via extremely large messages
156            let new_size = self.buffer.len() + line.len() + 1;
157            if new_size > self.max_message_size {
158                self.buffer.clear();
159                return Some(Err(HypermailError::MboxParse {
160                    line: self.line_num,
161                    message: format!(
162                        "message exceeds maximum size ({} bytes > {} bytes)",
163                        new_size, self.max_message_size
164                    ),
165                }));
166            }
167
168            if self.format == MboxFormat::MboxRd {
169                let unescaped = Self::unescape_mboxrd(&line);
170                self.buffer.extend_from_slice(&unescaped);
171            } else {
172                self.buffer.extend_from_slice(&line);
173            }
174            self.buffer.push(b'\n');
175        }
176
177        if self.buffer.is_empty() {
178            return None;
179        }
180        let raw = std::mem::take(&mut self.buffer);
181        Some(Ok(parse_raw_message(&raw)))
182    }
183}
184
185fn parse_raw_message(data: &[u8]) -> RawMessage {
186    let from_end = data.iter().position(|&b| b == b'\n').unwrap_or(data.len());
187    let from_line = String::from_utf8_lossy(&data[..from_end]).trim_end().to_string();
188
189    let rest = if from_end + 1 < data.len() {
190        &data[from_end + 1..]
191    } else {
192        &[]
193    };
194
195    let sep = rest
196        .windows(2)
197        .position(|w| w == b"\n\n")
198        .or_else(|| rest.windows(4).position(|w| w == b"\r\n\r\n").map(|p| p + 2));
199
200    if let Some(headers_end) = sep {
201        let header_bytes = rest[..headers_end].to_vec();
202        let body_bytes = if headers_end + 2 < rest.len() {
203            let skip = if rest[headers_end..].starts_with(b"\r\n") {
204                4
205            } else {
206                2
207            };
208            if headers_end + skip < rest.len() {
209                rest[headers_end + skip..].to_vec()
210            } else {
211                Vec::new()
212            }
213        } else {
214            Vec::new()
215        };
216        RawMessage { from_line, headers: header_bytes, body: body_bytes }
217    } else {
218        RawMessage { from_line, headers: rest.to_vec(), body: Vec::new() }
219    }
220}
221
222/// Reads an entire mbox file into a vector of raw messages.
223pub fn read_mbox_file(path: &str, format: MboxFormat) -> Result<Vec<RawMessage>> {
224    let file = std::fs::File::open(path).map_err(HypermailError::Io)?;
225    let reader = MboxReader::new(file, format);
226    let mut messages = Vec::new();
227    for msg in reader {
228        messages.push(msg?);
229    }
230    Ok(messages)
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Alias for the associated functions under test that need no real input.
    type EmptyReader = MboxReader<std::io::Empty>;

    /// Runs an `MboxReader` over in-memory bytes and gathers every item it yields.
    fn read_all(bytes: &[u8], format: MboxFormat) -> Vec<Result<RawMessage>> {
        MboxReader::new(Cursor::new(bytes), format).collect()
    }

    /// Fixture with three messages (alice, bob replying to alice, carol).
    fn create_mbox_data() -> Vec<u8> {
        b"From alice@example.com Mon Jan 01 12:00:00 2024\n\
          From: Alice <alice@example.com>\n\
          Subject: First message\n\
          Message-ID: <001@example.com>\n\
          Date: Mon, 01 Jan 2024 12:00:00 +0000\n\
          \n\
          This is the first message body.\n\
          \n\
          From bob@example.com Mon Jan 01 13:00:00 2024\n\
          From: Bob <bob@example.com>\n\
          Subject: Re: First message\n\
          Message-ID: <002@example.com>\n\
          In-Reply-To: <001@example.com>\n\
          Date: Mon, 01 Jan 2024 13:00:00 +0000\n\
          \n\
          This is a reply.\n\
          \n\
          From carol@example.com Mon Jan 01 14:00:00 2024\n\
          From: Carol <carol@example.com>\n\
          Subject: Another thread\n\
          Message-ID: <003@example.com>\n\
          Date: Mon, 01 Jan 2024 14:00:00 +0000\n\
          \n\
          A different conversation.\n"
            .to_vec()
    }

    // The fixture splits into three messages and the first one carries the
    // expected envelope line and Subject header.
    #[test]
    fn test_parse_mbox_basic() {
        let parsed = read_all(&create_mbox_data(), MboxFormat::MboxO);
        assert_eq!(parsed.len(), 3);

        let first = parsed[0].as_ref().unwrap();
        assert!(first.from_line.contains("alice@example.com"));

        let parsed_headers = crate::headers::parse_headers(&first.headers);
        assert_eq!(
            crate::headers::find_header(&parsed_headers, "Subject"),
            Some("First message")
        );
    }

    // Only "From " followed by more text counts as a separator line.
    #[test]
    fn test_from_line_parsing() {
        assert!(EmptyReader::is_from_line(b"From alice@example.com Mon Jan 01"));
        assert!(!EmptyReader::is_from_line(b"From: alice@example.com"));
        assert!(!EmptyReader::is_from_line(b""));
    }

    // Empty input produces no items at all.
    #[test]
    fn test_empty_mbox() {
        let yielded = read_all(b"", MboxFormat::MboxO).len();
        assert_eq!(yielded, 0);
    }

    // A message whose blank line directly follows the envelope still parses.
    #[test]
    fn test_mbox_no_headers() {
        let parsed = read_all(b"From alice@example.com\n\nJust a body\n", MboxFormat::MboxO);
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].is_ok());
    }

    // A quoted ">From " body line must not start a new message.
    #[test]
    fn test_multipart_message_with_from_in_body() {
        let input = b"From alice@example.com\n\
                      From: Alice <alice@example.com>\n\
                      Subject: Test\n\
                      \n\
                      This line looks like\n\
                      >From someone else\n\
                      but shouldn't be split.\n";
        let parsed = read_all(input, MboxFormat::MboxRd);
        assert_eq!(parsed.len(), 1);

        let only = parsed[0].as_ref().unwrap();
        assert_eq!(only.from_line, "From alice@example.com");
        let text = std::str::from_utf8(&only.body).unwrap();
        assert!(text.contains("From someone else"));
    }

    // A tiny per-message limit turns an ordinary message into an error item.
    #[test]
    fn test_max_message_size_exceeded() {
        let input = b"From alice@example.com\n\
                      From: Alice <alice@example.com>\n\
                      \n\
                      This is a very long body line that exceeds our tiny limit.\n";
        let limited = MboxReader::new(Cursor::new(&input[..]), MboxFormat::MboxO)
            .with_max_message_size(10);
        let outcomes: Vec<Result<RawMessage>> = limited.collect();
        assert!(
            outcomes.iter().any(|r| r.is_err()),
            "Should fail when message exceeds size limit"
        );
    }

    // MboxRd: ">From " at the start of a body line is an escaped "From " and
    // must be unescaped when reading.
    #[test]
    fn test_mboxrd_unescape() {
        let input = b"From alice@example.com\n\
                      Subject: Test\n\
                      \n\
                      >From someone we know\n";
        let parsed = read_all(input, MboxFormat::MboxRd);
        assert_eq!(parsed.len(), 1);

        let only = parsed[0].as_ref().unwrap();
        let text = std::str::from_utf8(&only.body).unwrap();
        assert!(text.contains("From someone"), "'>From' should be unescaped to 'From'");
        assert!(!text.contains(">From"), "unescaped body should not contain '>From'");
    }

    // Bodies of the first two fixture messages end up in the right message.
    #[test]
    fn test_parse_mbox_three_messages_bodies() {
        let parsed = read_all(&create_mbox_data(), MboxFormat::MboxO);
        assert_eq!(parsed.len(), 3);

        let first_body = std::str::from_utf8(&parsed[0].as_ref().unwrap().body).unwrap();
        assert!(first_body.contains("first message body"));
        let second_body = std::str::from_utf8(&parsed[1].as_ref().unwrap().body).unwrap();
        assert!(second_body.contains("reply"));
    }
}