// rs-pop-imap-importer/src/email_processor.rs

//! Email processing utilities for normalizing and fixing email format issues.
use std::error::Error;

/// Normalizes a section of headers by fixing continuation lines.
///
/// A line counts as the start of a header field when it begins with an ASCII
/// letter and everything before its first `:` is ASCII alphanumeric, `-` or `_`.
/// Any other non-empty line that follows a header field is treated as a folded
/// continuation of that field; if it does not already begin with a space or a
/// tab (the only folding whitespace RFC 5322 allows), a single space is
/// inserted in front of it.
///
/// An empty line ends the current header field, so text that follows a blank
/// line is never mistaken for a continuation and is copied through untouched.
///
/// # Arguments
/// * `headers` - The header block to repair (may contain non-header lines such
///   as MIME boundary markers, which are copied through unchanged)
/// * `line_ending` - The terminator written between output lines
///
/// # Returns
/// * The repaired block. Lines are re-joined with `line_ending`; a trailing
///   line break on the input is kept (written as `line_ending`).
fn normalize_header_section(headers: &str, line_ending: &str) -> String {
    /// Returns true when `line` looks like `Field-Name: ...`.
    fn starts_field(line: &str) -> bool {
        line.chars().next().map_or(false, |c| c.is_ascii_alphabetic())
            && line.find(':').map_or(false, |colon| {
                // Every character before the colon must be a field-name character.
                line[..colon]
                    .chars()
                    .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
            })
    }

    // A little slack for the spaces that may be inserted.
    let mut result = String::with_capacity(headers.len() + 8);
    let mut in_header_field = false;

    for (idx, line) in headers.lines().enumerate() {
        // Join lines with the requested terminator (nothing before the first).
        if idx > 0 {
            result.push_str(line_ending);
        }

        if starts_field(line) {
            in_header_field = true;
        } else if line.is_empty() {
            // A blank line terminates the header field; whatever follows is
            // not a continuation and must not be modified.
            in_header_field = false;
        } else if in_header_field && !line.starts_with(|c: char| c == ' ' || c == '\t') {
            // Unfolded continuation: supply the missing leading whitespace.
            result.push(' ');
        }

        result.push_str(line);
    }

    // `lines()` swallows a final line break; put it back so nothing is lost.
    if headers.ends_with('\n') {
        result.push_str(line_ending);
    }

    result
}

/// Normalizes email headers to ensure RFC 5322 compliance
///
/// This function fixes improperly formatted header continuation lines by ensuring
/// that continuation lines start with at least one whitespace character (space or tab).
///
/// According to RFC 5322 Section 2.2.3:
/// - Header fields can be continued on subsequent lines
/// - Continuation lines MUST begin with at least one WSP (space or tab)
///
/// This function processes both main email headers AND MIME part headers within the body.
/// The main header block ends at the first blank line in the message; the style of that
/// blank line (CRLF or LF) is the line ending used for the rest of the processing, so a
/// stray line break of the other style inside the body cannot shift the header/body
/// boundary. Body sections that are not recognised as MIME part headers are copied
/// through byte-for-byte.
///
/// # Arguments
/// * `email` - The raw email content as a string
///
/// # Returns
/// * The normalized email with properly formatted header continuation lines. If the
///   message contains no blank line at all it is returned unchanged.
///
/// # Errors
/// The current implementation never returns `Err`; the `Result` is part of the
/// existing interface.
pub fn normalize_headers(email: &str) -> Result<String, Box<dyn Error>> {
    // The header block ends at the earliest blank line, whichever style it uses.
    // (The two patterns start with different characters, so they can never match
    // at the same offset.)
    let crlf_blank = email.find("\r\n\r\n");
    let lf_blank = email.find("\n\n");
    let (main_headers_end, line_ending, separator) = match (crlf_blank, lf_blank) {
        (Some(crlf), Some(lf)) if crlf < lf => (crlf, "\r\n", "\r\n\r\n"),
        (Some(crlf), None) => (crlf, "\r\n", "\r\n\r\n"),
        (_, Some(lf)) => (lf, "\n", "\n\n"),
        // No header/body separator: nothing we can safely identify as headers.
        (None, None) => return Ok(email.to_string()),
    };

    let mut result = String::with_capacity(email.len() + 16);
    result.push_str(&normalize_header_section(
        &email[..main_headers_end],
        line_ending,
    ));
    result.push_str(separator);

    // Walk the body one blank-line-delimited section at a time. MIME part headers
    // appear after a boundary marker and before the next blank line.
    let mut rest = &email[main_headers_end + separator.len()..];
    while let Some(sep_pos) = rest.find(separator) {
        let section = &rest[..sep_pos];
        if is_mime_header_block(section) {
            result.push_str(&normalize_header_section(section, line_ending));
        } else {
            // Not MIME headers, copy as-is
            result.push_str(section);
        }
        result.push_str(separator);
        rest = &rest[sep_pos + separator.len()..];
    }

    // Whatever follows the last blank line is body content.
    result.push_str(rest);

    Ok(result)
}

/// Decides whether a blank-line-delimited body section consists solely of MIME part headers.
///
/// The section qualifies when it contains at least one `Content-*`, `MIME-Version` or
/// `X-WS-*` header line and no line that has to be body content. Boundary markers
/// (`--...`) are ignored. A line directly following a header (or starting with
/// whitespace) is accepted as a continuation, possibly one that lost its leading
/// whitespace. A blank line inside the section ends the current header, so text after
/// it is judged as body content rather than as a continuation.
fn is_mime_header_block(section: &str) -> bool {
    /// Returns true when `line` looks like `Field-Name: ...`.
    fn starts_field(line: &str) -> bool {
        line.chars().next().map_or(false, |c| c.is_ascii_alphabetic())
            && line.find(':').map_or(false, |colon| {
                line[..colon]
                    .chars()
                    .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
            })
    }

    let mut has_mime_headers = false;
    let mut last_was_header = false;

    for line in section.lines() {
        if line.is_empty() {
            last_was_header = false;
            continue;
        }

        // Boundary markers are expected around part headers; skip them.
        if line.starts_with("--") && line.len() > 2 {
            continue;
        }

        if starts_field(line) {
            last_was_header = true;
            if line.starts_with("Content-")
                || line.starts_with("MIME-Version")
                || line.starts_with("X-WS-")
            {
                has_mime_headers = true;
            }
        } else if last_was_header || line.chars().next().map_or(false, |c| c.is_whitespace()) {
            // Proper continuation, or a malformed one missing its whitespace.
            continue;
        } else {
            // Not a header, not a continuation - this is body content.
            return false;
        }
    }

    has_mime_headers
}

/// Unit tests for `normalize_headers`.
///
/// Each test compares the complete output against the exact expected message,
/// so an unintended change anywhere in the headers or body is caught, not just
/// the presence of one expected fragment.
#[cfg(test)]
mod tests {
    use super::*;

    /// A correctly folded header must come back byte-for-byte identical.
    #[test]
    fn test_normalize_headers_with_proper_continuation() {
        let email = "From: test@example.com\nSubject: Test\n line 2\nTo: user@example.com\n\nBody";
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, email);
    }

    /// A continuation line without leading whitespace gets exactly one space.
    #[test]
    fn test_normalize_headers_with_missing_whitespace() {
        let email = "From: test@example.com\nSubject: Test\nline 2\nTo: user@example.com\n\nBody";
        let result = normalize_headers(email).unwrap();
        assert_eq!(
            result,
            "From: test@example.com\nSubject: Test\n line 2\nTo: user@example.com\n\nBody"
        );
    }

    /// Plain body text is passed through untouched.
    #[test]
    fn test_normalize_headers_preserves_body() {
        let email = "From: test@example.com\nSubject: Test\n\nBody line 1\nBody line 2";
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, email);
    }

    /// Several consecutive unfolded continuation lines are all repaired.
    #[test]
    fn test_normalize_headers_complex_continuation() {
        let email = concat!(
            "ARC-Seal: i=1; a=rsa-sha256; t=1764789271; cv=none;\n",
            "d=google.com; s=arc-20240605;\n",
            "b=WzYePPFoiBLQx6r6obqcdcSu658wc1rT9O383Yux3i6ngaTS4Z4Jc1vKOZ128wn1rR\n",
            "To: test@example.com\n",
            "\n",
            "Body"
        );
        let expected = concat!(
            "ARC-Seal: i=1; a=rsa-sha256; t=1764789271; cv=none;\n",
            " d=google.com; s=arc-20240605;\n",
            " b=WzYePPFoiBLQx6r6obqcdcSu658wc1rT9O383Yux3i6ngaTS4Z4Jc1vKOZ128wn1rR\n",
            "To: test@example.com\n",
            "\n",
            "Body"
        );
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, expected);
    }

    /// CRLF line endings survive normalization; no bare LF is introduced.
    #[test]
    fn test_normalize_headers_preserves_crlf() {
        let email = "From: test@example.com\r\nSubject: Test\r\n\r\nBody";
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, email, "CRLF message must not be altered");
    }

    /// Continuation repair also works on CRLF-terminated headers.
    #[test]
    fn test_normalize_headers_crlf_continuation() {
        let email = "From: test@example.com\r\nSubject: Test\r\nline 2\r\nTo: user@example.com\r\n\r\nBody";
        let result = normalize_headers(email).unwrap();
        assert_eq!(
            result,
            "From: test@example.com\r\nSubject: Test\r\n line 2\r\nTo: user@example.com\r\n\r\nBody"
        );
    }

    #[test]
    fn test_normalize_headers_no_changes_needed() {
        let email = "From: test@example.com\r\nSubject: Test\r\n line 2\r\nTo: user@example.com\r\n\r\nBody";
        let result = normalize_headers(email).unwrap();
        assert_eq!(email, result, "Email should not be modified if already compliant");
    }

    /// MIME part headers inside the body are repaired like the main headers.
    #[test]
    fn test_normalize_attachment_headers() {
        let email = concat!(
            "From: test@example.com\r\n",
            "Subject: Test\r\n",
            "\r\n",
            "--boundary\r\n",
            "X-WS-Attachment-UUID: 123\r\n",
            "Content-Type: application/pdf;\r\n",
            "name=test.pdf\r\n",
            "Content-Disposition: attachment;\r\n",
            "filename=test.pdf\r\n",
            "\r\n",
            "data"
        );
        let expected = concat!(
            "From: test@example.com\r\n",
            "Subject: Test\r\n",
            "\r\n",
            "--boundary\r\n",
            "X-WS-Attachment-UUID: 123\r\n",
            "Content-Type: application/pdf;\r\n",
            " name=test.pdf\r\n",
            "Content-Disposition: attachment;\r\n",
            " filename=test.pdf\r\n",
            "\r\n",
            "data"
        );
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, expected, "Should add space to MIME part header continuations");
    }

    /// Without a blank line there is no header/body boundary; input is returned as-is.
    #[test]
    fn test_normalize_headers_without_separator() {
        let email = "From: a@example.com\nSubject: x";
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, email);
    }

    /// A body paragraph that merely contains a colon line is not treated as MIME headers.
    #[test]
    fn test_normalize_headers_leaves_body_paragraphs_alone() {
        let email = "From: a@example.com\n\nDear user,\nNote: see below\nthanks\n\nBye";
        let result = normalize_headers(email).unwrap();
        assert_eq!(result, email);
    }
}