Coverage for adhoc-cicd-odoo-odoo / odoo / tools / mail.py: 60%
490 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-09 18:05 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-09 18:05 +0000
1# -*- coding: utf-8 -*-
2# Part of Odoo. See LICENSE file for full copyright and licensing details.
4import base64
5import collections
6import itertools
7import logging
8import random
9import re
10import socket
11import time
12import email.utils
13from email.utils import getaddresses as orig_getaddresses
14from urllib.parse import urlparse
15from typing import Literal
16import html as htmllib
18import idna
19import markupsafe
20from lxml import etree, html
21from lxml.html import (
22 XHTML_NAMESPACE,
23 _contains_block_level_tag,
24 _looks_like_full_html_bytes,
25 _looks_like_full_html_unicode,
26 clean,
27 defs,
28 document_fromstring,
29 html_parser,
30)
31from werkzeug import urls
33from odoo.tools import misc
# Public API of this module; several names (email_domain_extract,
# email_normalize, formataddr, ...) are defined further down the file.
__all__ = [
    "email_domain_extract",
    "email_domain_normalize",
    "email_normalize",
    "email_normalize_all",
    "email_split",
    "encapsulate_email",
    "formataddr",
    "html2plaintext",
    "html_normalize",
    "html_sanitize",
    "is_html_empty",
    "parse_contact_from_email",
    "plaintext2html",
    "single_email_re",
]
_logger = logging.getLogger(__name__)


# disable strict mode when present: we rely on original non-strict
# parsing, and we know that it isn't reliable, that ok.
# cfr python/cpython@4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19
if not hasattr(email.utils, 'supports_strict_parsing'):
    # old runtime: getaddresses has no 'strict' parameter, use it as-is
    getaddresses = orig_getaddresses
else:
    def getaddresses(fieldvalues):
        """Wrapper around email.utils.getaddresses forcing non-strict parsing."""
        return orig_getaddresses(fieldvalues, strict=False)
#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------

# Attribute whitelist handed to the Cleaner when `sanitize_attributes` is on:
# lxml's default safe attributes plus `style` and the data-* attributes the
# Odoo editor, mailing and website features rely on.
safe_attrs = defs.safe_attrs | frozenset(
    ['style',
     'data-o-mail-quote', 'data-o-mail-quote-node',  # quote detection
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
     'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
     'data-oe-transient-content', 'data-behavior-props', 'data-prop-name', 'data-width', 'data-height', 'data-scale-x', 'data-scale-y', 'data-x', 'data-y',  # legacy editor
     'data-oe-role', 'data-oe-aria-label',
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
     'data-attachment-id', 'data-format-mimetype',
     'data-ai-field', 'data-ai-record-id',
     'data-heading-link-id',
     'data-mimetype-before-conversion',
     'data-language-id',
     'data-bs-toggle',  # support nav-tabs
     ])

# Tag policy applied by html_sanitize when `sanitize_tags` is on:
# allow_tags is a whitelist, kill_tags drops tag + content,
# remove_tags drops the tag but keeps its content.
SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
    'allow_tags': defs.tags | frozenset('article bdi section header footer hgroup nav aside figure main'.split() + [etree.Comment]),
    'kill_tags': ['base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
                  'noscript', 'object', 'script', 'style', 'title'],
    'remove_tags': ['html', 'body'],
}
class _Cleaner(clean.Cleaner):
    """lxml ``Cleaner`` specialized for Odoo html fields.

    On top of the base lxml cleaning, it can optionally strip ``class``
    attributes (``strip_classes``) and whitelist-filter inline ``style``
    declarations (``sanitize_style``), and it makes the removal of Outlook
    conditional comments opt-out through ``conditional_comments``.
    """

    # matches a single "property: value" declaration in a style attribute
    _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')

    # CSS properties kept by parse_style(); everything else is dropped.
    # NOTE: the duplicate 'text-decoration' entry present historically has
    # been removed (membership semantics are unchanged).
    _style_whitelist = [
        'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
        'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'opacity',
        'float', 'vertical-align', 'display', 'object-fit',
        'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
        'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
        'white-space',
        # appearance
        'background-image', 'background-position', 'background-size', 'background-repeat', 'background-origin',
        # box model
        'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
        'height', 'width', 'max-width', 'min-width', 'min-height',
        # tables
        'border-collapse', 'border-spacing', 'caption-side', 'empty-cells', 'table-layout']

    _style_whitelist.extend(
        ['border-%s-%s' % (position, attribute)
            for position in ['top', 'bottom', 'left', 'right']
            for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')]
    )

    strip_classes = False          # when True, drop class attributes
    sanitize_style = False         # when True, whitelist-filter style attributes
    conditional_comments = True    # when False, keep <!--[if ...]> comments

    def __call__(self, doc):
        """Clean ``doc`` in place, then apply the extra class/style passes."""
        super().__call__(doc)

        # if we keep attributes but still remove classes
        if not getattr(self, 'safe_attrs_only', False) and self.strip_classes:
            for el in doc.iter(tag=etree.Element):
                self.strip_class(el)

        # if we keep style attribute, sanitize them
        if not self.style and self.sanitize_style:
            for el in doc.iter(tag=etree.Element):
                self.parse_style(el)

    def strip_class(self, el):
        """Remove the ``class`` attribute of ``el``, if any."""
        if el.attrib.get('class'):
            del el.attrib['class']

    def parse_style(self, el):
        """Rewrite ``el``'s ``style`` attribute, keeping only whitelisted
        properties; remove the attribute entirely when nothing survives."""
        attributes = el.attrib
        styling = attributes.get('style')
        if styling:
            valid_styles = collections.OrderedDict()
            styles = self._style_re.findall(styling)
            for style in styles:
                if style[0].lower() in self._style_whitelist:
                    valid_styles[style[0].lower()] = style[1]
            if valid_styles:
                el.attrib['style'] = '; '.join('%s:%s' % (key, val) for (key, val) in valid_styles.items())
            else:
                del el.attrib['style']

    def kill_conditional_comments(self, doc):
        """Override the default behavior of lxml.

        https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510

        In some use cases, e.g. templates used for mass mailing,
        we send emails containing conditional comments targeting Microsoft Outlook,
        to give special styling instructions.
        https://github.com/odoo/odoo/pull/119325/files#r1301064789

        Within these conditional comments, unsanitized HTML can lie.
        However, in modern browser, these comments are considered as simple comments,
        their content is not executed.
        https://caniuse.com/sr_ie-features
        """
        if self.conditional_comments:
            super().kill_conditional_comments(doc)
def tag_quote(el):
    """Decorate ``el`` with quote/signature markers for collapsing in the UI.

    Known mail-client wrappers (Gmail, Yahoo, Outlook, MS Office, SkyDrive)
    and textual conventions ('>'-prefixed lines, '--' signature separators)
    get ``data-o-mail-quote`` / ``data-o-mail-quote-container`` attributes.

    Fix: the SkyDrivePlaceholder check used to test ``el_class`` twice; the
    second operand now checks ``el_id``, matching the class-or-id pattern
    used for the 'stopSpelling' marker below.
    """
    def _create_new_node(tag, text, tail=None, attrs=None):
        # build a bare element carrying the matched quoted text
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        if attrs:
            for key, val in attrs.items():
                new_node.set(key, val)
        return new_node

    def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
        # wrap every match of `regex` found in node.text into a dedicated
        # child element carrying `attrs`, preserving surrounding text
        text = node.text or ''
        if not re.search(regex, text):
            return

        child_node = None
        idx, node_idx = 0, 0
        for item in re.finditer(regex, text):
            new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
            if child_node is None:
                node.text = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            else:
                child_node.tail = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            child_node = new_node
            idx = item.end()
            node_idx = node_idx + 1

    el_class = el.get('class', '') or ''
    el_id = el.get('id', '') or ''

    # gmail or yahoo // # outlook, html // # msoffice
    if 'gmail_extra' in el_class or \
            ('SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_id):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
            'yahoo_quoted' in el_class:
        # Quote all elements after this one
        el.set('data-o-mail-quote', '1')
        for sibling in el.itersiblings(preceding=False):
            sibling.set('data-o-mail-quote', '1')

    # odoo, gmail and outlook automatic signature wrapper
    is_signature_wrapper = 'odoo_signature_wrapper' in el_class or 'gmail_signature' in el_class or el_id == "Signature"
    is_outlook_auto_message = 'appendonsend' in el_id
    # gmail and outlook reply quote
    is_outlook_reply_quote = 'divRplyFwdMsg' in el_id
    is_gmail_quote = 'gmail_quote' in el_class
    is_quote_wrapper = is_signature_wrapper or is_gmail_quote or is_outlook_reply_quote
    if is_quote_wrapper:
        el.set('data-o-mail-quote-container', '1')
        el.set('data-o-mail-quote', '1')

    # outlook reply wrapper is preceded with <hr> and a div containing recipient info
    if is_outlook_reply_quote:
        hr = el.getprevious()
        reply_quote = el.getnext()
        if hr is not None and hr.tag == 'hr':
            hr.set('data-o-mail-quote', '1')
        if reply_quote is not None:
            reply_quote.set('data-o-mail-quote-container', '1')
            reply_quote.set('data-o-mail-quote', '1')

    if is_outlook_auto_message:
        if not el.text or not el.text.strip():
            el.set('data-o-mail-quote-container', '1')
            el.set('data-o-mail-quote', '1')

    # html signature (-- <br />blah)
    signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
    if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    # text-based quotes (>, >>) and signatures (-- Signature)
    text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
    if not el.get('data-o-mail-quote'):
        _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})

    if el.tag == 'blockquote':
        # remove single node
        el.set('data-o-mail-quote-node', '1')
        el.set('data-o-mail-quote', '1')
    if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
        if el.getparent().get('data-o-mail-quote'):
            el.set('data-o-mail-quote', '1')
        # only quoting the elements following the first quote in the container
        # avoids issues with repeated calls to html_normalize
        elif el.getparent().get('data-o-mail-quote-container'):
            if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
                siblings = el.getparent().getchildren()
                quote_index = siblings.index(first_sibling_quote)
                element_index = siblings.index(el)
                if quote_index < element_index:
                    el.set('data-o-mail-quote', '1')
            if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
                el.set('data-o-mail-quote', '1')
def fromstring(html_, base_url=None, parser=None, **kw):
    """
    This function mimics lxml.html.fromstring. It not only returns the parsed
    element/document but also a flag indicating whether the input is for a
    single body element or not.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    :return: ``(element_or_document, is_single_body_element)`` pair
    """
    if parser is None:
        parser = html_parser
    # full-document detection mirrors lxml.html.fromstring
    if isinstance(html_, bytes):
        is_full_html = _looks_like_full_html_bytes(html_)
    else:
        is_full_html = _looks_like_full_html_unicode(html_)
    doc = document_fromstring(html_, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc, False
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # retry with the XHTML namespace when no plain <body> is found
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc, False
    if body is None:
        return doc, False
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0], True
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in. We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body, False
def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

    The string is parsed as an html tag soup, made valid, then decorated for
    "email quote" detection, and prepared for an optional filtering.
    The filtering step (e.g. sanitization) should be performed by the
    `filter_callback` function (to avoid multiple parsing operations, and
    normalize the result).

    :param src: the html string to normalize
    :param filter_callback: optional callable taking a single `etree._Element`
        document parameter, to be called during normalization in order to
        filter the output document
    :param output_method: defines the output method to pass to `html.tostring`.
        It defaults to 'html', but can also be 'xml' for xhtml output.
    """
    if not src:
        return src

    # html: remove encoding attribute inside tags
    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src)

    # repair malformed / empty comments before parsing
    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
    # On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
    # in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
    src = re.sub(r'</?o:.*?>', '', src)

    try:
        doc, single_body_element = fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
            return ""
        raise

    # perform quote detection before cleaning and class removal
    for el in doc.iter(tag=etree.Element):
        tag_quote(el)

    # serialize and reparse so the quote attributes are part of the tree
    # handed to the filter callback
    doc = html.fromstring(html.tostring(doc, method=output_method))

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

    if not single_body_element and src.startswith('<div>') and src.endswith('</div>'):
        # the <div></div> may come from 2 places
        # 1. the src is parsed as multiple body elements
        #    <div></div> wraps all elements.
        # 2. the src is parsed as not only body elements
        #    <html></html> wraps all elements.
        #    then the Cleaner as the filter_callback which has 'html' in its
        #    'remove_tags' will write <html></html> to <div></div> since it
        #    cannot directly drop the parent-most tag
        src = src[5:-6]

    # html considerations so real html content match database value:
    # no-break spaces are stored as '&nbsp;' entities in html fields
    # (the entity literal was garbled to a raw space in a previous revision)
    src = src.replace(u'\xa0', u'&nbsp;')

    return src
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
    """Normalize and clean ``src`` for safe storage/rendering.

    The content goes through ``html_normalize`` with a ``_Cleaner`` (built
    from the keyword flags) as filter callback. Parsing errors are swallowed
    unless ``silent`` is falsy, in which case they are re-raised.

    :return: ``markupsafe.Markup`` with the sanitized html (or ``src``
        unchanged when falsy)
    """
    if not src:
        return src

    logger = logging.getLogger(__name__ + '.html_sanitize')

    def sanitize_handler(doc):
        # translate the sanitize_*/strip_* flags into a Cleaner configuration
        options = {
            'page_structure': True,
            'style': strip_style,              # True = remove style tags/attrs
            'sanitize_style': sanitize_style,  # True = sanitize styling
            'forms': sanitize_form,            # True = remove form tags
            'remove_unknown_tags': False,
            'comments': False,
            'conditional_comments': sanitize_conditional_comments,  # True = remove conditional comments
            'processing_instructions': False,
        }
        if sanitize_tags:
            options.update(SANITIZE_TAGS)

        if sanitize_attributes:  # We keep all attributes in order to keep "style"
            allowed = safe_attrs - frozenset(['class']) if strip_classes else safe_attrs
            options['safe_attrs_only'] = True
            options['safe_attrs'] = allowed
        else:
            options['safe_attrs_only'] = False      # keep oe-data attributes + style
            options['strip_classes'] = strip_classes  # remove classes, even when keeping other attributes

        _Cleaner(**options)(doc)
        return doc

    try:
        sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>Unknown error when sanitizing</p>'

    return markupsafe.Markup(sanitized)
# ----------------------------------------------------------
# HTML/Text management
# ----------------------------------------------------------

# link protocols that must never be rewritten into http links
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
# matches an href="..." attribute whose target is a regular web URL
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
# matches a bare http(s) URL inside plain text
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
# retrieve inner content of the link
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
# crude tag matcher, used to strip markup from already-sanitized html
HTML_TAGS_REGEX = re.compile('<.*?>')
# opening tags that imply a line break when flattening html to text
HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n')
def validate_url(url):
    """Ensure ``url`` carries a browsable scheme, prefixing 'http://' otherwise.

    Uses the stdlib ``urllib.parse.urlparse`` (already imported in this
    module) instead of the deprecated ``werkzeug.urls.url_parse``, which was
    removed from recent Werkzeug releases; scheme extraction is equivalent
    for this check (both lowercase the scheme).

    :param str url: user-provided URL
    :return: the URL, guaranteed to start with an allowed scheme
    """
    if urlparse(url).scheme not in ('http', 'https', 'ftp', 'ftps'):
        return 'http://' + url

    return url
def is_html_empty(html_content: str | markupsafe.Markup | Literal[False] | None) -> bool:
    """Check if a html content is empty. If there are only formatting tags with style
    attributes or a void content return True. Famous use case if a
    '<p style="..."><br></p>' added by some web editor.

    :param html_content: html content, coming from example from an HTML field
    :returns: True if no content found or if containing only void formatting tags
    """
    if not html_content:
        return True
    # an icon element (fontawesome / open-iconic class) counts as content
    icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
    # pure formatting tags, stripped before checking for remaining text
    tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
    remaining = htmllib.unescape(re.sub(tag_re, '', html_content))
    if remaining.strip():
        return False
    return not re.search(icon_re, html_content)
def html_keep_url(text):
    """ Transform the url into clickable link with <a/> tag """
    link_tags = re.compile(r"""(?<!["'])((ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"']+)(:[0-9]+)?(\/|\/([^\s<"']))?)(?![^\s<"']*["']|[^\s<"']*</a>)""")
    # substituting each match is equivalent to the historical
    # finditer-and-accumulate loop, untouched text is kept verbatim
    return link_tags.sub(lambda match: create_link(match.group(0), match.group(0)), text)
def html_to_inner_content(html):
    """Returns unformatted text after removing html tags and excessive whitespace from a
    string/Markup. Passed strings will first be sanitized.
    """
    if is_html_empty(html):
        return ''
    # plain strings are untrusted: sanitize before flattening
    if not isinstance(html, markupsafe.Markup):
        html = html_sanitize(html)
    text = re.sub(HTML_NEWLINES_REGEX, ' ', html)
    text = re.sub(HTML_TAGS_REGEX, '', text)
    text = re.sub(r' {2,}|\t', ' ', text)
    text = text.replace("\xa0", " ")
    return htmllib.unescape(text).strip()
def create_link(url, label):
    """Return an ``<a>`` tag opening ``url`` in a new tab, labelled ``label``."""
    return '<a href="{}" target="_blank" rel="noreferrer noopener">{}</a>'.format(url, label)
def html2plaintext(
    html: str | markupsafe.Markup | Literal[False] | None,
    body_id: str | None = None,
    encoding: str = 'utf-8',
    include_references: bool = True
) -> str:
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param include_references: If False, numbered references and
        URLs for links and images will not be included.

    Fix: the html entity literals below ('&#13;', '&gt;', '&lt;', '&amp;',
    '&nbsp;') had been double-unescaped into their literal characters,
    turning them into no-op (or destructive) replacements.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    if not (html and html.strip()):
        return ''

    if isinstance(html, bytes):
        html = html.decode(encoding)
    else:
        assert isinstance(html, str), f"expected str got {html.__class__.__name__}"

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # replace <a>/<img> by "text [N]" markers, collecting their URLs for the
    # numbered reference list appended at the end
    url_index = []
    linkrefs = itertools.count(1)
    if include_references:
        for link in tree.findall('.//a'):
            if url := link.get('href'):
                link.tag = 'span'
                link.text = f'{link.text} [{next(linkrefs)}]'
                url_index.append(url)

        for img in tree.findall('.//img'):
            if src := img.get('src'):
                img.tag = 'span'
                if src.startswith('data:'):
                    img_name = None  # base64 image
                else:
                    img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
                img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
                url_index.append(src)

    html = etree.tostring(tree, encoding="unicode")
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # unescape the remaining entities last, so they cannot be mistaken for markup
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    html = html.replace('&nbsp;', '\N{NO-BREAK SPACE}')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, start=1):
            html += f'[{i}] {url}\n'

    return html.strip()
def plaintext2html(text: str, container_tag: str | None = None, with_paragraph: bool = True) -> markupsafe.Markup:
    r"""Convert plaintext into html. Content of the text is escaped to manage
    html entities, using :func:`~odoo.tools.misc.html_escape`.

    - all ``\n``, ``\r`` are replaced by ``<br/>``
    - convert url into clickable link

    :param text: plaintext to convert
    :param container_tag: container of the html; by default the content is
        embedded into a ``<div>``
    :param with_paragraph: whether or not considering 2 or more consecutive ``<br/>``
        as paragraph breaks and enclosing content in ``<p>``
    """
    assert isinstance(text, str)
    escaped = misc.html_escape(text)

    # 1. replace \n and \r
    escaped = re.sub(r'(\r\n|\r|\n)', '<br/>', escaped)

    # 2. clickable links
    escaped = html_keep_url(escaped)

    # 3-4: form paragraphs: each run of 2+ <br/> becomes a paragraph break
    final = escaped
    if with_paragraph:
        br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
        final = '<p>%s</p>' % br_tags.sub('</p><p>', escaped)

    # 5. container
    if container_tag:  # FIXME: validate that container_tag is just a simple tag?
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return markupsafe.Markup(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is ``False``.

    Content conversion can be done in two ways:

    - wrapping it into a pre (``preserve=True``)
    - use plaintext2html (``preserve=False``, using ``container_tag`` to
      wrap the whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
        be wrapped in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param str container_tag: tag to wrap the content into, defaults to `div`.
    :rtype: markupsafe.Markup
    """
    if plaintext:
        if preserve:
            content = '\n<pre>%s</pre>\n' % misc.html_escape(content)
        else:
            content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # strip enclosing document structure before embedding
        content = '\n%s\n' % re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
                  lambda m: '%s%s%s' % (m[1], m[2].lower(), m[3]), html)
    # insert before </body> if present, else before </html>, else append
    for closing in ('</body>', '</html>'):
        insert_location = html.find(closing)
        if insert_location != -1:
            return markupsafe.Markup('%s%s%s' % (html[:insert_location], content, html[insert_location:]))
    return markupsafe.Markup('%s%s' % (html, content))
def prepend_html_content(html_body, html_content):
    """Prepend some HTML content at the beginning of an other HTML content."""
    stripped = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', html_content)
    # keep the Markup-ness of the injected content
    if isinstance(html_content, markupsafe.Markup):
        stripped = markupsafe.Markup(stripped)
    html_content = stripped.strip()

    # insert right after the opening <body> (or <html>) tag when present
    opening_tag = re.search(r'<body[^>]*>', html_body) or re.search(r'<html[^>]*>', html_body)
    insert_index = opening_tag.end() if opening_tag else 0

    return html_body[:insert_index] + html_content + html_body[insert_index:]
#----------------------------------------------------------
# Emails
#----------------------------------------------------------

# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE)

# matches an angle-bracketed Message-ID token, e.g. '<some-id@host>'
mail_header_msgid_re = re.compile('<[^<>]+>')

# characters requiring a backslash escape in a quoted display-name
email_addr_escapes_re = re.compile(r'[\\"]')
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

    Used to track the replies related to a given object thanks to the "In-Reply-To"
    or "References" fields that Mail User Agents will set.
    """
    # prefer the OS entropy source; fall back to the PRNG if unavailable
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        rnd = random.random()
    token = f"{rnd:.15f}"[2:]  # keep only the fractional digits
    return f"<{token}.{time.time():.15f}-openerp-{res_id}@{socket.gethostname()}>"
def email_split_tuples(text):
    """ Return a list of (name, email) address tuples found in ``text`` . Note
    that text should be an email header or a stringified email list as it may
    give broader results than expected on actual text. """
    def _parse_based_on_spaces(pair):
        """ With input 'name email@domain.com' (missing quotes for a formatting)
        getaddresses returns ('', 'name email@domain.com). This when having no
        name and an email a fallback to enhance parsing is to redo a getaddresses
        by replacing spaces by commas. The new email will be split into sub pairs
        allowing to find the email and name parts, allowing to make a new name /
        email pair. Emails should not contain spaces thus this is coherent with
        email formation. """
        name, email = pair
        if not name and email and ' ' in email:
            # re-parse 'word word addr@domain' as comma-separated tokens
            inside_pairs = getaddresses([email.replace(' ', ',')])
            name_parts, found_email = [], False
            for pair in inside_pairs:
                # tokens without '@' are treated as name fragments,
                # the last token containing '@' wins as the address
                if pair[1] and '@' not in pair[1]:
                    name_parts.append(pair[1])
                if pair[1] and '@' in pair[1]:
                    found_email = pair[1]
            name, email = (' '.join(name_parts), found_email) if found_email else (name, email)
        return (name, email)

    if not text:
        return []

    # found valid pairs, filtering out failed parsing
    valid_pairs = [
        (addr[0], addr[1]) for addr in getaddresses([text])
        # getaddresses() returns '' when email parsing fails, and
        # sometimes returns emails without at least '@'. The '@'
        # is strictly required in RFC2822's `addr-spec`.
        if addr[1] and '@' in addr[1]
    ]
    # corner case: returning '@gmail.com'-like email (see test_email_split)
    if any(pair[1].startswith('@') for pair in valid_pairs):
        # fall back on a raw regex scan of the input, dropping local-part-less
        # results; names cannot be recovered in this mode
        filtered = [
            found_email for found_email in email_re.findall(text)
            if found_email and not found_email.startswith('@')
        ]
        if filtered:
            valid_pairs = [('', found_email) for found_email in filtered]

    return list(map(_parse_based_on_spaces, valid_pairs))
def email_split(text):
    """ Return a list of the email addresses found in ``text`` """
    return [pair[1] for pair in email_split_tuples(text)]
def email_split_and_format(text):
    """ Return a list of email addresses found in ``text``, each one
    pretty-formatted through formataddr. """
    return [formataddr(pair) for pair in email_split_tuples(text)]
def email_split_and_normalize(text):
    """ Same as 'email_split_tuples' but with the email part normalized.

    Note: unlike 'email_split' (which returns bare addresses), this returns
    a list of (name, normalized_email) tuples, mirroring the behavior of
    'email_split_tuples'.
    """
    return [(name, _normalize_email(email)) for (name, email) in email_split_tuples(text)]
def email_split_and_format_normalize(text):
    """ Same as 'email_split_and_format' but normalizing the email part. """
    pairs = email_split_tuples(text)
    return [formataddr((name, _normalize_email(addr))) for name, addr in pairs]
def email_normalize(text, strict=True):
    """ Sanitize and standardize a single email address entry.

    The local part is lowered only when it is plain ascii: as of rfc5322
    section 3.4.1 the local-part is case-sensitive, and with SMTP-UTF8
    non-ascii local parts are kept as-is. The domain part is always lowered
    (international domains are handled through IDNA encoding elsewhere).

    A normalized email has a left part + @ + a right part (the domain may
    lack a '.something') and no leading display name ('Name <>').
    Ex:
      - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>'
      - Normalized Output Email : 'name@domain.com'

    :param boolean strict: if True (default behavior in stable 14+), ``text``
        must contain a single email, otherwise no normalized email is
        returned. If False the first found candidate is used, e.g. for
        'tony@e.com, "Tony2" <tony2@e.com>' the result is either False
        (strict=True), either 'tony@e.com' (strict=False).
    :return: False if no email found (or if more than 1 email found when
        being in strict mode); normalized email otherwise;
    """
    found = email_split(text)
    if not found:
        return False
    if strict and len(found) != 1:
        return False
    return _normalize_email(found[0])
def email_normalize_all(text):
    """ Extract email addresses from a text input and return the normalized
    version of every found email. A void list is returned when no email is
    found.

    e.g. for 'tony@e.com, "Tony2" <tony2@e.com>' the result is
    ['tony@e.com', 'tony2@e.com']

    :return list: list of normalized emails found in text
    """
    normalized = (_normalize_email(email) for email in email_split(text))
    return [email for email in normalized if email]
858def _normalize_email(email):
859 """ As of rfc5322 section 3.4.1 local-part is case-sensitive. However most
860 main providers do consider the local-part as case insensitive. With the
861 introduction of smtp-utf8 within odoo, this assumption is certain to fall
862 short for international emails. We now consider that
864 * if local part is ascii: normalize still 'lower' ;
865 * else: use as it, SMTP-UF8 is made for non-ascii local parts;
867 Concerning domain part of the address, as of v14 international domain (IDNA)
868 are handled fine. The domain is always lowercase, lowering it is fine as it
869 is probably an error. With the introduction of IDNA, there is an encoding
870 that allow non-ascii characters to be encoded to ascii ones, using 'idna.encode'.
872 A normalized email is considered as :
873 - having a left part + @ + a right part (the domain can be without '.something')
874 - having no name before the address. Typically, having no 'Name <>'
875 Ex:
876 - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>'
877 - Normalized Output Email : 'name@domain.com'
878 """
879 local_part, at, domain = email.rpartition('@')
880 try:
881 local_part.encode('ascii')
882 except UnicodeEncodeError:
883 pass
884 else:
885 local_part = local_part.lower()
887 return local_part + at + domain.lower()
def email_anonymize(normalized_email, *, redact_domain=False):
    """
    Hide most of the local part of an email address behind '*' while
    keeping enough characters for debugging purpose. The input must
    already be normalized.

    >>> email_anonymize('admin@example.com')
    'a****@example.com'
    >>> email_anonymize('portal@example.com')
    'p***al@example.com'
    >>> email_anonymize('portal@example.com', redact_domain=True)
    'p***al@e******.com'
    """
    if not normalized_email:
        return normalized_email

    local, at, domain = normalized_email.partition('@')
    # keep the last two characters only when the local part is long enough
    tail = local[-2:] if len(local) > 5 else ''
    masked_local = local[:1] + '*' * (len(local) - 1 - len(tail)) + tail

    host, dot, tld = domain.rpartition('.')
    # never redact literal domains ('[...]') or dot-less domains
    redactable = redact_domain and not domain.startswith('[') and all((host, dot, tld))
    masked_host = host[0] + '*' * (len(host) - 1) if redactable else host

    return f'{masked_local}{at}{masked_host}{dot}{tld}'
def email_domain_extract(email):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is extracted from email information e.g:

    - info@proximus.be -> proximus.be

    :return: the domain part of the normalized email, False when the email
        cannot be normalized
    """
    normalized = email_normalize(email)
    if not normalized:
        return False
    return normalized.split('@')[1]
def email_domain_normalize(domain):
    """Return the domain normalized or False if the domain is invalid."""
    if not domain:
        return False
    if '@' in domain:
        # a full address is not a valid domain
        return False
    return domain.lower()
def url_domain_extract(url):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is extracted from an URL e.g:

    - www.info.proximus.be -> proximus.be

    :return: the last two dot-separated hostname labels, False when the URL
        has no hostname or the hostname contains no dot
    """
    hostname = urlparse(url).hostname
    if not hostname or '.' not in hostname:
        return False
    # strip subdomains, keep only domain + tld
    return '.'.join(hostname.split('.')[-2:])
def email_escape_char(email_address):
    """ Escape problematic characters in the given email address string
    (backslash, and the SQL LIKE wildcards '%' and '_'). """
    # single-pass equivalent of chaining .replace() for each character
    return email_address.translate(
        str.maketrans({'\\': '\\\\', '%': '\\%', '_': '\\_'})
    )
# was mail_thread.decode_header()
def decode_message_header(message, header, separator=' '):
    """Join all non-empty values of ``header`` found in ``message`` with
    ``separator``; returns an empty string when the header is absent."""
    values = message.get_all(header, [])
    return separator.join(value for value in values if value)
def formataddr(pair, charset='utf-8'):
    """Pretty format a 2-tuple of the form (realname, email_address).

    If the first element of pair is falsy then only the email address
    is returned.

    Set the charset to ascii to get a RFC-2822 compliant email: the
    realname is then base64 encoded (when necessary) and the domain part
    of the email punycode encoded (when necessary). The local part is left
    unchanged, thus requiring the SMTPUTF8 extension when it contains
    non-ascii characters.

    >>> formataddr(('John Doe', 'johndoe@example.com'))
    '"John Doe" <johndoe@example.com>'

    >>> formataddr(('', 'johndoe@example.com'))
    'johndoe@example.com'
    """
    name, address = pair
    local, _, domain = address.rpartition('@')

    try:
        domain.encode(charset)
    except UnicodeEncodeError:
        # rfc5890 - Internationalized Domain Names for Applications (IDNA)
        domain = idna.encode(domain).decode('ascii')

    if not name:
        return f"{local}@{domain}"

    try:
        name.encode(charset)
    except UnicodeEncodeError:
        # charset mismatch, encode as utf-8/base64
        # rfc2047 - MIME Message Header Extensions for Non-ASCII Text
        encoded = base64.b64encode(name.encode('utf-8')).decode('ascii')
        return f"=?utf-8?b?{encoded}?= <{local}@{domain}>"

    # ascii name, escape it if needed
    # rfc2822 - Internet Message Format
    # #section-3.4 - Address Specification
    escaped = email_addr_escapes_re.sub(r'\\\g<0>', name)
    return f'"{escaped}" <{local}@{domain}>'
def encapsulate_email(old_email, new_email):
    """Change the FROM of the message and use the old one as name.

    e.g.
    * Old From: "Admin" <admin@gmail.com>
    * New From: notifications@odoo.com
    * Output: "Admin" <notifications@odoo.com>
    """
    old_pairs = getaddresses([old_email])
    if not old_pairs or not old_pairs[0]:
        # nothing parseable: keep the original header untouched
        return old_email

    new_pairs = getaddresses([new_email])
    if not new_pairs or not new_pairs[0]:
        return

    old_name, old_addr = old_pairs[0]
    # fall back on the old local part when there was no display name
    name_part = old_name or old_addr.split("@")[0]
    return formataddr((name_part, new_pairs[0][1]))
def parse_contact_from_email(text):
    """ Parse contact name and email (given by text) in order to find
    contact information, able to populate records like partners, leads, ...
    Supported syntax:

      * Raoul <raoul@grosbedon.fr>
      * "Raoul le Grand" <raoul@grosbedon.fr>
      * Raoul raoul@grosbedon.fr (strange fault tolerant support from
        df40926d2a57c101a3e2d221ecfd08fbb4fea30e now supported directly
        in 'email_split_tuples');

    Otherwise: default, text is set as name.

    :return: name, email (normalized if possible)
    """
    if not text or not text.strip():
        return '', ''

    pairs = email_split_tuples(text)
    name, email = pairs[0] if pairs else ('', '')
    if not email:
        # no email found: the whole input becomes the name
        return text, ''
    return name, email_normalize(email, strict=False) or email
def unfold_references(msg_references):
    """ As declared in [RFC2822], long header bodies can be "folded" using
    CRLF+WSP, and some mail clients split the References header body
    (which contains Message-IDs) on "\n ". Return the list of Message-ID
    tokens with any such folding whitespace stripped out.

    RFC2882: https://tools.ietf.org/html/rfc2822#section-2.2.3 """
    # deleting every CR/LF/TAB/space is equivalent to the former
    # re.sub(r'[\r\n\t ]+', '', ref) "unfolding"
    folding_chars = str.maketrans('', '', '\r\n\t ')
    return [
        ref.translate(folding_chars)
        for ref in mail_header_msgid_re.findall(msg_references)
    ]