Files
rs-pop-imap-importer/src/email_processor.rs
T
herel e0b0c5e964 fix: add X-WS-* header detection and attachment test
- Detect X-WS-* headers (e.g., X-WS-Attachment-UUID) as MIME headers
  to ensure attachment headers like Content-Type and Content-Disposition
  get normalized properly
- Add test case for attachment header normalization
- Bump version to 0.5.0

The normalization fixes malformed continuation lines in attachment
headers generated by Infomaniak webmail, where lines like
"name=file.pdf" are missing the required leading whitespace.
2025-12-10 15:22:55 +01:00

260 lines
10 KiB
Rust

//! Email processing utilities for normalizing and fixing email format issues
use std::error::Error;
/// Normalizes a section of headers by fixing continuation lines.
///
/// The section is split with [`str::lines`] and re-joined with `line_ending`,
/// so a trailing line terminator on `headers` is not reproduced in the output.
///
/// Each line is classified as one of:
/// - a MIME boundary delimiter (starts with `--` and is longer than two bytes):
///   copied verbatim, and it ends the current header run;
/// - a header start (ASCII letter first, then only ASCII alphanumerics, `-` or
///   `_` up to the first `:`): copied verbatim, and it starts a header run;
/// - any other line inside a header run: treated as a continuation; a single
///   space is prepended when the line is non-empty and does not already start
///   with whitespace;
/// - any other line outside a header run: copied verbatim.
///
/// # Arguments
/// * `headers` - The header section, without the blank line that terminates it
/// * `line_ending` - The terminator to write between lines (`"\r\n"` or `"\n"`)
///
/// # Returns
/// * The section with every continuation line starting with whitespace
fn normalize_header_section(headers: &str, line_ending: &str) -> String {
    let mut result = String::with_capacity(headers.len());
    let mut previous_line_was_header = false;

    for (idx, line) in headers.lines().enumerate() {
        // Terminators go *between* lines, never after the last one.
        if idx > 0 {
            result.push_str(line_ending);
        }

        // MIME boundary delimiters must keep their leading "--" in column 0.
        // Folding one into the preceding header would corrupt the multipart
        // structure, so it is never treated as a continuation.
        let is_boundary = line.len() > 2 && line.starts_with("--");

        // Deliberately narrower than RFC 5322's "printable ASCII except colon":
        // the narrow set keeps unfolded values such as `name=a:b` from being
        // mistaken for a new header field.
        let is_header_start = line.starts_with(|c: char| c.is_ascii_alphabetic())
            && line.find(':').map_or(false, |pos| {
                line[..pos]
                    .chars()
                    .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
            });

        if is_boundary {
            previous_line_was_header = false;
        } else if is_header_start {
            previous_line_was_header = true;
        } else if previous_line_was_header
            && !line.is_empty()
            && !line.starts_with(char::is_whitespace)
        {
            // Continuation line missing its leading whitespace - add a space.
            result.push(' ');
        }

        result.push_str(line);
    }

    result
}
/// Repairs folded header lines throughout a raw email message.
///
/// RFC 5322 Section 2.2.3 requires every continuation line of a folded header
/// to begin with a space or tab. Some producers omit that whitespace; this
/// function puts it back, both in the top-level header block and in any
/// MIME part header block found inside the body.
///
/// The line-ending style is detected once: if the message contains `"\r\n"`
/// anywhere it is handled as CRLF, otherwise as LF. The blank line that
/// separates sections is searched for in that same style.
///
/// # Arguments
/// * `email` - The raw email content as a string
///
/// # Returns
/// * `Ok` with the normalized message. When no blank-line separator is found
///   the input is returned unchanged. The visible code never produces `Err`.
pub fn normalize_headers(email: &str) -> Result<String, Box<dyn Error>> {
    // Decides whether one blank-line-delimited chunk of the body consists of
    // MIME part headers: it must contain at least one `Content-*`,
    // `MIME-Version` or `X-WS-*` header line, and no body-like line may appear
    // before the first header line.
    fn is_mime_header_block(section: &str) -> bool {
        const MIME_PREFIXES: [&str; 3] = ["Content-", "MIME-Version", "X-WS-"];

        let mut saw_header = false;
        let mut saw_mime_header = false;

        for line in section.lines() {
            // Blank lines and boundary markers say nothing either way.
            if line.is_empty() || (line.len() > 2 && line.starts_with("--")) {
                continue;
            }

            let field_name_ok = match line.find(':') {
                Some(colon) => line[..colon]
                    .chars()
                    .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_'),
                None => false,
            };
            let begins_with_letter = line.starts_with(|c: char| c.is_ascii_alphabetic());

            if begins_with_letter && field_name_ok {
                saw_header = true;
                if MIME_PREFIXES.iter().any(|prefix| line.starts_with(*prefix)) {
                    saw_mime_header = true;
                }
            } else if !saw_header && !line.starts_with(char::is_whitespace) {
                // Neither indented nor preceded by a header: genuine body text.
                return false;
            }
            // Anything else is a (possibly malformed) continuation line.
        }

        saw_mime_header
    }

    // CRLF (Windows/SMTP) if present anywhere, otherwise LF (Unix).
    let (line_ending, separator) = if email.contains("\r\n") {
        ("\r\n", "\r\n\r\n")
    } else {
        ("\n", "\n\n")
    };

    // Without a blank line there is no header/body split to work with.
    let header_len = match email.find(separator) {
        None => return Ok(email.to_string()),
        Some(offset) => offset,
    };

    let (main_headers, tail) = email.split_at(header_len);
    let mut result = normalize_header_section(main_headers, line_ending);
    result.push_str(separator);

    // Walk the body one blank-line-delimited chunk at a time.
    let mut remaining = &tail[separator.len()..];
    while !remaining.is_empty() {
        match remaining.find(separator) {
            Some(sep_at) => {
                let chunk_end = sep_at + separator.len();
                let section = &remaining[..sep_at];

                if is_mime_header_block(section) {
                    result.push_str(&normalize_header_section(section, line_ending));
                    result.push_str(separator);
                } else {
                    // Not MIME headers: copy the chunk and its separator verbatim.
                    result.push_str(&remaining[..chunk_end]);
                }

                remaining = &remaining[chunk_end..];
            }
            None => {
                // No further separators: the rest is copied as-is.
                result.push_str(remaining);
                break;
            }
        }
    }

    Ok(result)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the normalizer and unwraps the result.
    fn run(raw: &str) -> String {
        normalize_headers(raw).unwrap()
    }

    /// A correctly indented continuation line survives as-is.
    #[test]
    fn test_normalize_headers_with_proper_continuation() {
        let normalized =
            run("From: test@example.com\nSubject: Test\n line 2\nTo: user@example.com\n\nBody");
        assert!(normalized.contains("Subject: Test\n line 2\n"));
    }

    /// A continuation line without leading whitespace gains one space.
    #[test]
    fn test_normalize_headers_with_missing_whitespace() {
        let normalized =
            run("From: test@example.com\nSubject: Test\nline 2\nTo: user@example.com\n\nBody");
        assert!(normalized.contains("Subject: Test\n line 2\n"));
    }

    /// Plain body text after the header block is not rewritten.
    #[test]
    fn test_normalize_headers_preserves_body() {
        let normalized = run("From: test@example.com\nSubject: Test\n\nBody line 1\nBody line 2");
        assert!(normalized.contains("Body line 1\nBody line 2"));
    }

    /// Several unindented continuation lines in a row are all repaired.
    #[test]
    fn test_normalize_headers_complex_continuation() {
        let raw = [
            "ARC-Seal: i=1; a=rsa-sha256; t=1764789271; cv=none;",
            "d=google.com; s=arc-20240605;",
            "b=WzYePPFoiBLQx6r6obqcdcSu658wc1rT9O383Yux3i6ngaTS4Z4Jc1vKOZ128wn1rR",
            "To: test@example.com",
            "",
            "Body",
        ]
        .join("\n");
        let normalized = run(&raw);
        assert!(normalized.contains(" d=google.com; s=arc-20240605;"));
        assert!(normalized
            .contains(" b=WzYePPFoiBLQx6r6obqcdcSu658wc1rT9O383Yux3i6ngaTS4Z4Jc1vKOZ128wn1rR"));
    }

    /// CRLF input stays CRLF; no bare double-LF is introduced.
    #[test]
    fn test_normalize_headers_preserves_crlf() {
        let normalized = run("From: test@example.com\r\nSubject: Test\r\n\r\nBody");
        assert!(normalized.contains("\r\n"));
        assert!(!normalized.contains("\n\n")); // Should not have double LF
    }

    /// Continuation repair also works with CRLF line endings.
    #[test]
    fn test_normalize_headers_crlf_continuation() {
        let normalized = run(
            "From: test@example.com\r\nSubject: Test\r\nline 2\r\nTo: user@example.com\r\n\r\nBody",
        );
        assert!(normalized.contains("Subject: Test\r\n line 2\r\n"));
    }

    /// An already compliant message comes back byte-for-byte identical.
    #[test]
    fn test_normalize_headers_no_changes_needed() {
        let email =
            "From: test@example.com\r\nSubject: Test\r\n line 2\r\nTo: user@example.com\r\n\r\nBody";
        let result = run(email);
        assert_eq!(email, result, "Email should not be modified if already compliant");
    }

    /// MIME part headers after a boundary marker are repaired as well.
    #[test]
    fn test_normalize_attachment_headers() {
        let raw = [
            "From: test@example.com",
            "Subject: Test",
            "",
            "--boundary",
            "X-WS-Attachment-UUID: 123",
            "Content-Type: application/pdf;",
            "name=test.pdf",
            "Content-Disposition: attachment;",
            "filename=test.pdf",
            "",
            "data",
        ]
        .join("\r\n");
        let normalized = run(&raw);
        assert!(
            normalized.contains("Content-Type: application/pdf;\r\n name=test.pdf"),
            "Should add space to Content-Type continuation"
        );
        assert!(
            normalized.contains("Content-Disposition: attachment;\r\n filename=test.pdf"),
            "Should add space to Content-Disposition continuation"
        );
    }
}