1use crate::error::{HypermailError, Result};
2use std::io::{BufRead, BufReader, Read};
3
/// Default upper bound, in bytes, on a single buffered message (100 MiB).
///
/// `MboxReader::new` installs this value as the reader's per-message limit;
/// `MboxReader::with_max_message_size` replaces it.
const MAX_MESSAGE_SIZE: usize = 100 * 1024 * 1024;

/// Upper bound, in bytes, on a single input line (10 MiB).
///
/// The reader reports a parse error for any line longer than this.
const MAX_LINE_SIZE: usize = 10 * 1024 * 1024;
17
/// The mbox dialect a mailbox is declared to be written in.
///
/// Within this module only [`MboxFormat::MboxRd`] changes how lines are
/// processed (its `>From ` quoting is reversed); every variant is split on
/// `From ` separator lines in the same way.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MboxFormat {
    /// The "mboxo" dialect. No variant-specific handling is applied here.
    MboxO,
    /// The "mboxrd" dialect: quoted `>From ` body lines are unquoted on read.
    MboxRd,
    /// The "mboxcl" dialect. NOTE(review): conventionally delimited with a
    /// `Content-Length` header, which this module does not consult — confirm
    /// whether a caller handles it.
    MboxCl,
    /// The "mboxcl2" dialect. NOTE(review): same caveat as `MboxCl`.
    MboxCl2,
}
26
/// One message cut out of an mbox stream, still in raw (undecoded) form.
///
/// Equality is field-by-field, which lets callers and tests compare parsed
/// messages directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawMessage {
    /// The first line of the message (normally the `From ` separator),
    /// decoded lossily as UTF-8 with trailing whitespace removed.
    pub from_line: String,
    /// Raw bytes of the header section.
    pub headers: Vec<u8>,
    /// Raw bytes of the body; empty when the message has no body section.
    pub body: Vec<u8>,
}
34
/// Streaming reader that cuts an mbox byte stream into [`RawMessage`]s.
///
/// Messages are produced one at a time through the reader's `Iterator`
/// implementation, so the whole mailbox never has to be held in memory.
pub struct MboxReader<R: Read> {
    /// Buffered wrapper around the underlying input.
    reader: BufReader<R>,
    /// Dialect of the input; consulted when lines are appended to `buffer`.
    format: MboxFormat,
    /// Running count of line reads; reported in parse errors.
    line_num: usize,
    /// Bytes of the message currently being assembled, first line included.
    buffer: Vec<u8>,
    /// Set once no further input will be read; `next` then yields `None`.
    eof: bool,
    /// Largest size, in bytes, `buffer` may reach before an error is reported.
    max_message_size: usize,
}
49
50impl<R: Read> MboxReader<R> {
51 pub fn new(reader: R, format: MboxFormat) -> Self {
53 MboxReader {
54 reader: BufReader::new(reader),
55 format,
56 line_num: 0,
57 buffer: Vec::new(),
58 eof: false,
59 max_message_size: MAX_MESSAGE_SIZE,
60 }
61 }
62
63 pub fn with_max_message_size(mut self, size: usize) -> Self {
65 self.max_message_size = size;
66 self
67 }
68
69 fn is_from_line(line: &[u8]) -> bool {
70 line.starts_with(b"From ") && line.len() > 5
71 }
72
73 #[allow(dead_code)]
74 fn is_from_line_mboxrd(line: &[u8]) -> bool {
75 line.starts_with(b">From ") || Self::is_from_line(line)
76 }
77
78 fn unescape_mboxrd(line: &[u8]) -> Vec<u8> {
79 if line.starts_with(b">From ") {
80 line[1..].to_vec()
81 } else {
82 line.to_vec()
83 }
84 }
85}
86
87impl<R: Read> Iterator for MboxReader<R> {
88 type Item = Result<RawMessage>;
89
90 fn next(&mut self) -> Option<Self::Item> {
91 if self.eof {
92 return None;
93 }
94
95 loop {
96 let mut line = Vec::new();
97 self.line_num += 1;
98
99 match self.reader.read_until(b'\n', &mut line) {
100 Ok(0) => {
101 self.eof = true;
102 if self.buffer.is_empty() {
103 return None;
104 }
105 break;
106 },
107 Ok(_) => {},
108 Err(e) => {
109 return Some(Err(HypermailError::MboxParse {
110 line: self.line_num,
111 message: format!("read error: {e}"),
112 }))
113 },
114 }
115
116 if line.len() > MAX_LINE_SIZE {
119 return Some(Err(HypermailError::MboxParse {
120 line: self.line_num,
121 message: format!(
122 "line exceeds maximum size ({} bytes > {} bytes)",
123 line.len(),
124 MAX_LINE_SIZE
125 ),
126 }));
127 }
128
129 if line.last() == Some(&b'\n') {
130 line.pop();
131 if line.last() == Some(&b'\r') {
132 line.pop();
133 }
134 }
135
136 if self.buffer.is_empty() {
137 if Self::is_from_line(&line) {
138 self.buffer = line;
139 self.buffer.push(b'\n');
140 continue;
141 }
142 self.buffer = line;
143 self.buffer.push(b'\n');
144 continue;
145 }
146
147 if Self::is_from_line(&line) {
148 let raw = self.buffer.split_off(0);
149 self.buffer = line;
150 self.buffer.push(b'\n');
151 return Some(Ok(parse_raw_message(&raw)));
152 }
153
154 let new_size = self.buffer.len() + line.len() + 1;
157 if new_size > self.max_message_size {
158 self.buffer.clear();
159 return Some(Err(HypermailError::MboxParse {
160 line: self.line_num,
161 message: format!(
162 "message exceeds maximum size ({} bytes > {} bytes)",
163 new_size, self.max_message_size
164 ),
165 }));
166 }
167
168 if self.format == MboxFormat::MboxRd {
169 let unescaped = Self::unescape_mboxrd(&line);
170 self.buffer.extend_from_slice(&unescaped);
171 } else {
172 self.buffer.extend_from_slice(&line);
173 }
174 self.buffer.push(b'\n');
175 }
176
177 if self.buffer.is_empty() {
178 return None;
179 }
180 let raw = std::mem::take(&mut self.buffer);
181 Some(Ok(parse_raw_message(&raw)))
182 }
183}
184
185fn parse_raw_message(data: &[u8]) -> RawMessage {
186 let from_end = data.iter().position(|&b| b == b'\n').unwrap_or(data.len());
187 let from_line = String::from_utf8_lossy(&data[..from_end]).trim_end().to_string();
188
189 let rest = if from_end + 1 < data.len() {
190 &data[from_end + 1..]
191 } else {
192 &[]
193 };
194
195 let sep = rest
196 .windows(2)
197 .position(|w| w == b"\n\n")
198 .or_else(|| rest.windows(4).position(|w| w == b"\r\n\r\n").map(|p| p + 2));
199
200 if let Some(headers_end) = sep {
201 let header_bytes = rest[..headers_end].to_vec();
202 let body_bytes = if headers_end + 2 < rest.len() {
203 let skip = if rest[headers_end..].starts_with(b"\r\n") {
204 4
205 } else {
206 2
207 };
208 if headers_end + skip < rest.len() {
209 rest[headers_end + skip..].to_vec()
210 } else {
211 Vec::new()
212 }
213 } else {
214 Vec::new()
215 };
216 RawMessage { from_line, headers: header_bytes, body: body_bytes }
217 } else {
218 RawMessage { from_line, headers: rest.to_vec(), body: Vec::new() }
219 }
220}
221
222pub fn read_mbox_file(path: &str, format: MboxFormat) -> Result<Vec<RawMessage>> {
224 let file = std::fs::File::open(path).map_err(HypermailError::Io)?;
225 let reader = MboxReader::new(file, format);
226 let mut messages = Vec::new();
227 for msg in reader {
228 messages.push(msg?);
229 }
230 Ok(messages)
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Reader type used only to reach the associated helper functions.
    type EmptyReader = MboxReader<std::io::Empty>;

    /// Three-message mailbox shared by several tests.
    const THREE_MESSAGES: &[u8] = b"From alice@example.com Mon Jan 01 12:00:00 2024\n\
        From: Alice <alice@example.com>\n\
        Subject: First message\n\
        Message-ID: <001@example.com>\n\
        Date: Mon, 01 Jan 2024 12:00:00 +0000\n\
        \n\
        This is the first message body.\n\
        \n\
        From bob@example.com Mon Jan 01 13:00:00 2024\n\
        From: Bob <bob@example.com>\n\
        Subject: Re: First message\n\
        Message-ID: <002@example.com>\n\
        In-Reply-To: <001@example.com>\n\
        Date: Mon, 01 Jan 2024 13:00:00 +0000\n\
        \n\
        This is a reply.\n\
        \n\
        From carol@example.com Mon Jan 01 14:00:00 2024\n\
        From: Carol <carol@example.com>\n\
        Subject: Another thread\n\
        Message-ID: <003@example.com>\n\
        Date: Mon, 01 Jan 2024 14:00:00 +0000\n\
        \n\
        A different conversation.\n";

    /// Runs a reader over `data` to completion and gathers everything it yields.
    fn read_all(data: &[u8], format: MboxFormat) -> Vec<Result<RawMessage>> {
        MboxReader::new(Cursor::new(data), format).collect()
    }

    /// Decodes a message body that the fixtures guarantee to be valid UTF-8.
    fn body_text(message: &RawMessage) -> &str {
        std::str::from_utf8(&message.body).unwrap()
    }

    #[test]
    fn test_parse_mbox_basic() {
        let parsed = read_all(THREE_MESSAGES, MboxFormat::MboxO);
        assert_eq!(parsed.len(), 3);

        let first = parsed[0].as_ref().unwrap();
        assert!(first.from_line.contains("alice@example.com"));

        let header_list = crate::headers::parse_headers(&first.headers);
        let subject = crate::headers::find_header(&header_list, "Subject");
        assert_eq!(subject, Some("First message"));
    }

    #[test]
    fn test_from_line_parsing() {
        assert!(EmptyReader::is_from_line(b"From alice@example.com Mon Jan 01"));
        assert!(!EmptyReader::is_from_line(b"From: alice@example.com"));
        assert!(!EmptyReader::is_from_line(b""));
    }

    #[test]
    fn test_empty_mbox() {
        let yielded = MboxReader::new(Cursor::new(b""), MboxFormat::MboxO).count();
        assert_eq!(yielded, 0);
    }

    #[test]
    fn test_mbox_no_headers() {
        let parsed = read_all(b"From alice@example.com\n\nJust a body\n", MboxFormat::MboxO);
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].is_ok());
    }

    #[test]
    fn test_multipart_message_with_from_in_body() {
        let input: &[u8] = b"From alice@example.com\n\
            From: Alice <alice@example.com>\n\
            Subject: Test\n\
            \n\
            This line looks like\n\
            >From someone else\n\
            but shouldn't be split.\n";

        let parsed = read_all(input, MboxFormat::MboxRd);
        assert_eq!(parsed.len(), 1);

        let only = parsed[0].as_ref().unwrap();
        assert_eq!(only.from_line, "From alice@example.com");
        assert!(body_text(only).contains("From someone else"));
    }

    #[test]
    fn test_max_message_size_exceeded() {
        let input: &[u8] = b"From alice@example.com\n\
            From: Alice <alice@example.com>\n\
            \n\
            This is a very long body line that exceeds our tiny limit.\n";

        let outcomes: Vec<Result<RawMessage>> = MboxReader::new(Cursor::new(input), MboxFormat::MboxO)
            .with_max_message_size(10)
            .collect();

        let saw_failure = outcomes.iter().any(|outcome| outcome.is_err());
        assert!(saw_failure, "Should fail when message exceeds size limit");
    }

    #[test]
    fn test_mboxrd_unescape() {
        let input: &[u8] = b"From alice@example.com\n\
            Subject: Test\n\
            \n\
            >From someone we know\n";

        let parsed = read_all(input, MboxFormat::MboxRd);
        assert_eq!(parsed.len(), 1);

        let text = body_text(parsed[0].as_ref().unwrap());
        assert!(text.contains("From someone"), "'>From' should be unescaped to 'From'");
        assert!(!text.contains(">From"), "unescaped body should not contain '>From'");
    }

    #[test]
    fn test_parse_mbox_three_messages_bodies() {
        let parsed = read_all(THREE_MESSAGES, MboxFormat::MboxO);
        assert_eq!(parsed.len(), 3);

        let first_body = body_text(parsed[0].as_ref().unwrap());
        assert!(first_body.contains("first message body"));

        let second_body = body_text(parsed[1].as_ref().unwrap());
        assert!(second_body.contains("reply"));
    }
}