Coverage for adhoc-cicd-odoo-odoo / odoo / tools / mail.py: 60%

490 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-09 18:05 +0000

1# -*- coding: utf-8 -*- 

2# Part of Odoo. See LICENSE file for full copyright and licensing details. 

3 

4import base64 

5import collections 

6import itertools 

7import logging 

8import random 

9import re 

10import socket 

11import time 

12import email.utils 

13from email.utils import getaddresses as orig_getaddresses 

14from urllib.parse import urlparse 

15from typing import Literal 

16import html as htmllib 

17 

18import idna 

19import markupsafe 

20from lxml import etree, html 

21from lxml.html import ( 

22 XHTML_NAMESPACE, 

23 _contains_block_level_tag, 

24 _looks_like_full_html_bytes, 

25 _looks_like_full_html_unicode, 

26 clean, 

27 defs, 

28 document_fromstring, 

29 html_parser, 

30) 

31from werkzeug import urls 

32 

33from odoo.tools import misc 

34 

35__all__ = [ 

36 "email_domain_extract", 

37 "email_domain_normalize", 

38 "email_normalize", 

39 "email_normalize_all", 

40 "email_split", 

41 "encapsulate_email", 

42 "formataddr", 

43 "html2plaintext", 

44 "html_normalize", 

45 "html_sanitize", 

46 "is_html_empty", 

47 "parse_contact_from_email", 

48 "plaintext2html", 

49 "single_email_re", 

50] 

51 

52_logger = logging.getLogger(__name__) 

53 

54 

55# disable strict mode when present: we rely on original non-strict 

56# parsing, and we know that it isn't reliable, that ok. 

57# cfr python/cpython@4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19 

# `supports_strict_parsing` only exists on CPython builds whose
# email.utils.getaddresses() grew a `strict` parameter (CVE-2023-27043
# hardening); on those builds strict parsing became the default.
if hasattr(email.utils, 'supports_strict_parsing'):
    def getaddresses(fieldvalues):
        # force the historical non-strict behaviour: Odoo best-effort
        # parses malformed headers instead of rejecting them
        return orig_getaddresses(fieldvalues, strict=False)
else:
    # older Python: getaddresses() has no `strict` parameter at all
    getaddresses = orig_getaddresses

63 

64 

65#---------------------------------------------------------- 

66# HTML Sanitizer 

67#---------------------------------------------------------- 

68 

# Attributes preserved by the sanitizer on top of lxml's defaults: inline
# style plus the data-* attributes used by quote detection, the Odoo
# editor(s) and various website/mail widgets.
safe_attrs = defs.safe_attrs | frozenset(
    ['style',
     'data-o-mail-quote', 'data-o-mail-quote-node',  # quote detection
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
     'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
     'data-oe-transient-content', 'data-behavior-props', 'data-prop-name', 'data-width', 'data-height', 'data-scale-x', 'data-scale-y', 'data-x', 'data-y',  # legacy editor
     'data-oe-role', 'data-oe-aria-label',
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
     'data-attachment-id', 'data-format-mimetype',
     'data-ai-field', 'data-ai-record-id',
     'data-heading-link-id',
     'data-mimetype-before-conversion',
     'data-language-id',
     'data-bs-toggle',  # support nav-tabs
     ])
# Tag policy handed to the _Cleaner: which tags survive, which are removed
# with their content ('kill'), which are unwrapped ('remove').
SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
    'allow_tags': defs.tags | frozenset('article bdi section header footer hgroup nav aside figure main'.split() + [etree.Comment]),
    'kill_tags': ['base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
                  'noscript', 'object', 'script', 'style', 'title'],
    'remove_tags': ['html', 'body'],
}

93 

94 

class _Cleaner(clean.Cleaner):
    """lxml Cleaner specialized for Odoo html field sanitization.

    On top of the base lxml cleaning, this class can:

    - strip ``class`` attributes even when other attributes are kept
      (``strip_classes``);
    - whitelist-filter inline ``style`` declarations instead of dropping
      them entirely (``sanitize_style``);
    - optionally keep conditional comments (``conditional_comments=False``),
      used e.g. for Outlook-targeted styling in mass mailings.
    """

    # captures "property: value" pairs of a style attribute; quoted values
    # may contain ';' without terminating the declaration
    _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')

    _style_whitelist = [
        'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
        'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'text-decoration-line', 'opacity',
        'float', 'vertical-align', 'display', 'object-fit',
        'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
        'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
        'white-space',
        # appearance
        'background-image', 'background-position', 'background-size', 'background-repeat', 'background-origin',
        # box model
        'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
        'height', 'width', 'max-width', 'min-width', 'min-height',
        # tables
        'border-collapse', 'border-spacing', 'caption-side', 'empty-cells', 'table-layout']

    _style_whitelist.extend(
        ['border-%s-%s' % (position, attribute)
            for position in ['top', 'bottom', 'left', 'right']
            for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')]
    )

    strip_classes = False
    sanitize_style = False
    conditional_comments = True

    def __call__(self, doc):
        super(_Cleaner, self).__call__(doc)

        # if we keep attributes but still remove classes
        if not getattr(self, 'safe_attrs_only', False) and self.strip_classes:
            for el in doc.iter(tag=etree.Element):
                self.strip_class(el)

        # if we keep style attribute, sanitize them
        if not self.style and self.sanitize_style:
            for el in doc.iter(tag=etree.Element):
                self.parse_style(el)

    def strip_class(self, el):
        # remove the class attribute from `el`, if any
        if el.attrib.get('class'):
            del el.attrib['class']

    def parse_style(self, el):
        # keep only whitelisted css properties inside the style attribute;
        # drop the attribute entirely when nothing survives the filter
        attributes = el.attrib
        styling = attributes.get('style')
        if styling:
            valid_styles = collections.OrderedDict()
            styles = self._style_re.findall(styling)
            for style in styles:
                if style[0].lower() in self._style_whitelist:
                    valid_styles[style[0].lower()] = style[1]
            if valid_styles:
                el.attrib['style'] = '; '.join('%s:%s' % (key, val) for (key, val) in valid_styles.items())
            else:
                del el.attrib['style']

    def kill_conditional_comments(self, doc):
        """Override the default behavior of lxml.

        https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510

        In some use cases, e.g. templates used for mass mailing,
        we send emails containing conditional comments targeting Microsoft Outlook,
        to give special styling instructions.
        https://github.com/odoo/odoo/pull/119325/files#r1301064789

        Within these conditional comments, unsanitized HTML can lie.
        However, in modern browser, these comments are considered as simple comments,
        their content is not executed.
        https://caniuse.com/sr_ie-features
        """
        if self.conditional_comments:
            super().kill_conditional_comments(doc)

172 

173 

def tag_quote(el):
    """Decorate `el` with quote/signature markers for email display.

    Inspects the element's tag, class and id to recognize quoted replies
    and signatures produced by various mail clients (Gmail, Outlook,
    Yahoo, msoffice, Odoo itself) and sets ``data-o-mail-quote`` (plus the
    ``-container`` / ``-node`` variants) accordingly. Text-based quotes
    ('>' prefixed lines) and '--' signatures are wrapped in marker spans.
    """
    def _create_new_node(tag, text, tail=None, attrs=None):
        # build a fresh element carrying the matched text
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        if attrs:
            for key, val in attrs.items():
                new_node.set(key, val)
        return new_node

    def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
        # wrap every match of `regex` inside node.text into its own child node
        text = node.text or ''
        if not re.search(regex, text):
            return

        child_node = None
        idx, node_idx = 0, 0
        for item in re.finditer(regex, text):
            new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
            if child_node is None:
                node.text = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            else:
                child_node.tail = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            child_node = new_node
            idx = item.end()
            node_idx = node_idx + 1

    el_class = el.get('class', '') or ''
    el_id = el.get('id', '') or ''

    # gmail or yahoo // # outlook, html // # msoffice
    # BUGFIX: the second alternative used to re-check el_class twice,
    # making the el_id marker (outlook/msoffice) undetectable
    if 'gmail_extra' in el_class or \
            ('SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_id):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
            'yahoo_quoted' in el_class:
        # Quote all elements after this one
        el.set('data-o-mail-quote', '1')
        for sibling in el.itersiblings(preceding=False):
            sibling.set('data-o-mail-quote', '1')

    # odoo, gmail and outlook automatic signature wrapper
    is_signature_wrapper = 'odoo_signature_wrapper' in el_class or 'gmail_signature' in el_class or el_id == "Signature"
    is_outlook_auto_message = 'appendonsend' in el_id
    # gmail and outlook reply quote
    is_outlook_reply_quote = 'divRplyFwdMsg' in el_id
    is_gmail_quote = 'gmail_quote' in el_class
    is_quote_wrapper = is_signature_wrapper or is_gmail_quote or is_outlook_reply_quote
    if is_quote_wrapper:
        el.set('data-o-mail-quote-container', '1')
        el.set('data-o-mail-quote', '1')

    # outlook reply wrapper is preceded with <hr> and a div containing recipient info
    if is_outlook_reply_quote:
        hr = el.getprevious()
        reply_quote = el.getnext()
        if hr is not None and hr.tag == 'hr':
            hr.set('data-o-mail-quote', '1')
        if reply_quote is not None:
            reply_quote.set('data-o-mail-quote-container', '1')
            reply_quote.set('data-o-mail-quote', '1')

    if is_outlook_auto_message:
        if not el.text or not el.text.strip():
            el.set('data-o-mail-quote-container', '1')
            el.set('data-o-mail-quote', '1')

    # html signature (-- <br />blah)
    signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
    if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    # text-based quotes (>, >>) and signatures (-- Signature)
    text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
    if not el.get('data-o-mail-quote'):
        _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})

    if el.tag == 'blockquote':
        # remove single node
        el.set('data-o-mail-quote-node', '1')
        el.set('data-o-mail-quote', '1')
    if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
        if el.getparent().get('data-o-mail-quote'):
            el.set('data-o-mail-quote', '1')
        # only quoting the elements following the first quote in the container
        # avoids issues with repeated calls to html_normalize
        elif el.getparent().get('data-o-mail-quote-container'):
            if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
                siblings = el.getparent().getchildren()
                quote_index = siblings.index(first_sibling_quote)
                element_index = siblings.index(el)
                if quote_index < element_index:
                    el.set('data-o-mail-quote', '1')
            if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
                el.set('data-o-mail-quote', '1')

278 

279 

def fromstring(html_, base_url=None, parser=None, **kw):
    """
    This function mimics lxml.html.fromstring. It not only returns the parsed
    element/document but also a flag indicating whether the input is for a
    single body element or not.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    :return: a ``(element_or_document, single_body_element)`` pair; the flag
        is True only when the input boiled down to exactly one element
        inside <body>
    """
    if parser is None:
        parser = html_parser
    # heuristic sniffing: a full document (doctype/<html>) is returned as-is
    if isinstance(html_, bytes):
        is_full_html = _looks_like_full_html_bytes(html_)
    else:
        is_full_html = _looks_like_full_html_unicode(html_)
    doc = document_fromstring(html_, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc, False
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # fall back to the XHTML-namespaced body
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc, False
    if body is None:
        return doc, False
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0], True
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in. We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body, False

348 

349 

def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

    The string is parsed as an html tag soup, made valid, then decorated for
    "email quote" detection, and prepared for an optional filtering.
    The filtering step (e.g. sanitization) should be performed by the
    `filter_callback` function (to avoid multiple parsing operations, and
    normalize the result).

    :param src: the html string to normalize
    :param filter_callback: optional callable taking a single `etree._Element`
        document parameter, to be called during normalization in order to
        filter the output document
    :param output_method: defines the output method to pass to `html.tostring`.
        It defaults to 'html', but can also be 'xml' for xhtml output.
    """
    if not src:
        return src

    # html: remove encoding attribute inside tags
    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src)

    # normalize malformed comment terminators and empty comments
    # before parsing, so the parser does not mangle them
    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
    # On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
    # in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
    src = re.sub(r'</?o:.*?>', '', src)

    try:
        doc, single_body_element = fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
            return ""
        raise

    # perform quote detection before cleaning and class removal
    for el in doc.iter(tag=etree.Element):
        tag_quote(el)

    # reserialize + reparse so the callback sees a tree matching the
    # chosen output method
    doc = html.fromstring(html.tostring(doc, method=output_method))

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

    if not single_body_element and src.startswith('<div>') and src.endswith('</div>'):
        # the <div></div> may come from 2 places
        # 1. the src is parsed as multiple body elements
        #    <div></div> wraps all elements.
        # 2. the src is parsed as not only body elements
        #    <html></html> wraps all elements.
        #    then the Cleaner as the filter_callback which has 'html' in its
        #    'remove_tags' will write <html></html> to <div></div> since it
        #    cannot directly drop the parent-most tag
        src = src[5:-6]

    # html considerations so real html content match database value
    src = src.replace(u'\xa0', u'&nbsp;')

    return src

412 

413 

def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
    """Sanitize an html fragment through `html_normalize` + `_Cleaner`.

    :param src: html string to sanitize; falsy values are returned unchanged
    :param silent: when True, parsing/sanitizing errors are swallowed and a
        placeholder paragraph is returned instead of raising
    :return: a `markupsafe.Markup` holding the sanitized html
    """
    if not src:
        return src

    logger = logging.getLogger(__name__ + '.html_sanitize')

    def sanitize_handler(doc):
        # assemble the _Cleaner configuration from the caller's flags
        options = dict(
            page_structure=True,
            style=strip_style,                                  # True = remove style tags/attrs
            sanitize_style=sanitize_style,                      # True = sanitize styling
            forms=sanitize_form,                                # True = remove form tags
            remove_unknown_tags=False,
            comments=False,
            conditional_comments=sanitize_conditional_comments, # True = remove conditional comments
            processing_instructions=False,
        )
        if sanitize_tags:
            options.update(SANITIZE_TAGS)

        if sanitize_attributes:  # We keep all attributes in order to keep "style"
            allowed = safe_attrs - frozenset(['class']) if strip_classes else safe_attrs
            options['safe_attrs_only'] = True
            options['safe_attrs'] = allowed
        else:
            options['safe_attrs_only'] = False      # keep oe-data attributes + style
            options['strip_classes'] = strip_classes  # remove classes, even when keeping other attributes

        _Cleaner(**options)(doc)
        return doc

    try:
        sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>Unknown error when sanitizing</p>'

    return markupsafe.Markup(sanitized)

467 

468# ---------------------------------------------------------- 

469# HTML/Text management 

470# ---------------------------------------------------------- 

471 

# protocols that must not be turned into clickable http links
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
# href="..." attributes whose target is not one of the skipped protocols
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
# bare http(s) urls found in plain text
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
# retrieve inner content of the link
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
# any markup tag, non-greedy
HTML_TAGS_REGEX = re.compile('<.*?>')
# opening tags that imply a line break, plus literal newlines
HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n')

479 

480 

def validate_url(url):
    """Return `url`, prefixed with ``http://`` unless it already carries an
    explicitly allowed scheme (http/https/ftp/ftps).

    Uses the stdlib ``urlparse`` (already imported at module level) instead
    of the deprecated ``werkzeug.urls.url_parse``; both lowercase the
    scheme, so behaviour is unchanged.
    """
    if urlparse(url).scheme not in ('http', 'https', 'ftp', 'ftps'):
        return 'http://' + url

    return url

486 

487 

def is_html_empty(html_content: str | markupsafe.Markup | Literal[False] | None) -> bool:
    """Tell whether an html content carries no real content.

    Formatting-only markup — e.g. the '<p style="..."><br></p>' left behind
    by a web editor — counts as empty; a font-awesome/odoo icon tag does not.

    :param html_content: html content, coming for example from an HTML field
    :returns: True if no content found or if containing only void formatting tags
    """
    if not html_content:
        return True
    icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
    tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
    # strip formatting tags, then unescape entities: any remaining
    # non-whitespace character means real content
    remaining = htmllib.unescape(re.sub(tag_re, '', html_content))
    if remaining.strip():
        return False
    # no text left: still non-empty if an icon tag is present
    return re.search(icon_re, html_content) is None

502 

503 

def html_keep_url(text):
    """ Transform the url into clickable link with <a/> tag """
    link_tags = re.compile(r"""(?<!["'])((ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"']+)(:[0-9]+)?(\/|\/([^\s<"']))?)(?![^\s<"']*["']|[^\s<"']*</a>)""")
    pieces = []
    cursor = 0
    for match in link_tags.finditer(text):
        # copy the text before the url, then the url as an anchor
        pieces.append(text[cursor:match.start()])
        pieces.append(create_link(match.group(0), match.group(0)))
        cursor = match.end()
    pieces.append(text[cursor:])
    return ''.join(pieces)

515 

516 

def html_to_inner_content(html):
    """Returns unformatted text after removing html tags and excessive whitespace from a
    string/Markup. Passed strings will first be sanitized.
    """
    if is_html_empty(html):
        return ''
    # raw (non-Markup) strings go through the sanitizer first
    if not isinstance(html, markupsafe.Markup):
        html = html_sanitize(html)
    # line-breaking tags -> spaces, then drop every remaining tag
    text = HTML_NEWLINES_REGEX.sub(' ', html)
    text = HTML_TAGS_REGEX.sub('', text)
    # collapse runs of spaces/tabs and non-breaking spaces
    text = re.sub(r' {2,}|\t', ' ', text)
    text = text.replace("\xa0", " ")
    text = htmllib.unescape(text)
    return text.strip()

531 

532 

def create_link(url, label):
    """Return an html anchor to `url`, labelled `label`, opening in a new tab."""
    attrs = f'href="{url}" target="_blank" rel="noreferrer noopener"'
    return f'<a {attrs}>{label}</a>'

535 

536 

def html2plaintext(
    html: str | markupsafe.Markup | Literal[False] | None,
    body_id: str | None = None,
    encoding: str = 'utf-8',
    include_references: bool = True
) -> str:
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param encoding: codec used to decode `html` when it is bytes
    :param include_references: If False, numbered references and
        URLs for links and images will not be included.
    :return: best-effort plain-text rendering of `html` ('' when falsy/blank)
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    if not (html and html.strip()):
        return ''

    if isinstance(html, bytes):
        html = html.decode(encoding)
    else:
        assert isinstance(html, str), f"expected str got {html.__class__.__name__}"

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    # narrow the tree to the requested element (or <body> by default)
    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # replace <a href>/<img src> with "[N]" markers and collect their
    # targets to append as a reference list at the end
    url_index = []
    linkrefs = itertools.count(1)
    if include_references:
        for link in tree.findall('.//a'):
            if url := link.get('href'):
                link.tag = 'span'
                link.text = f'{link.text} [{next(linkrefs)}]'
                url_index.append(url)

        for img in tree.findall('.//img'):
            if src := img.get('src'):
                img.tag = 'span'
                if src.startswith('data:'):
                    img_name = None  # base64 image
                else:
                    img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
                img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
                url_index.append(src)

    html = etree.tostring(tree, encoding="unicode")
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    # poor-man's markdown-ish rendering of emphasis and headings,
    # then flatten the remaining markup
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # decode the entities the serializer escaped
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    html = html.replace('&nbsp;', '\N{NO-BREAK SPACE}')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the collected link/image targets as numbered references
    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, start=1):
            html += f'[{i}] {url}\n'

    return html.strip()

618 

619 

def plaintext2html(text: str, container_tag: str | None = None, with_paragraph: bool = True) -> markupsafe.Markup:
    r"""Convert plaintext into html. Content of the text is escaped to manage
    html entities, using :func:`~odoo.tools.misc.html_escape`.

    - all ``\n``, ``\r`` are replaced by ``<br/>``
    - convert url into clickable link

    :param text: plaintext to convert
    :param container_tag: container of the html; by default the content is
        embedded into a ``<div>``
    :param with_paragraph: whether or not considering 2 or more consecutive ``<br/>``
        as paragraph breaks and enclosing content in ``<p>``
    """
    assert isinstance(text, str)
    text = misc.html_escape(text)

    # 1. replace \n and \r
    text = re.sub(r'(\r\n|\r|\n)', '<br/>', text)

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs
    final = text
    if with_paragraph:
        br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
        pieces = ['<p>']
        cursor = 0
        for match in br_tags.finditer(text):
            # runs of >= 2 <br/> become paragraph boundaries
            pieces.append(text[cursor:match.start()])
            pieces.append('</p><p>')
            cursor = match.end()
        pieces.append(text[cursor:])
        pieces.append('</p>')
        final = ''.join(pieces)

    # 5. container
    if container_tag:  # FIXME: validate that container_tag is just a simple tag?
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return markupsafe.Markup(final)

657 

def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is ``False``.

    Content conversion can be done in two ways:

    - wrapping it into a pre (``preserve=True``)
    - use plaintext2html (``preserve=False``, using ``container_tag`` to
      wrap the whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
        be wrapped in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param str container_tag: tag to wrap the content into, defaults to `div`.
    :rtype: markupsafe.Markup
    """
    if plaintext and preserve:
        content = '\n<pre>%s</pre>\n' % misc.html_escape(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # strip structural wrappers from html content before embedding it
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = '\n%s\n' % content
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
                  lambda m: '%s%s%s' % (m[1], m[2].lower(), m[3]), html)
    # insert before </body>, else before </html>, else append at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return markupsafe.Markup('%s%s' % (html, content))
    return markupsafe.Markup('%s%s%s' % (html[:insert_location], content, html[insert_location:]))

699 

700 

def prepend_html_content(html_body, html_content):
    """Prepend some HTML content at the beginning of an other HTML content."""
    # strip structural wrappers from the content; keep markup-safety
    stripped = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', html_content)
    if isinstance(html_content, markupsafe.Markup):
        stripped = markupsafe.Markup(stripped)
    html_content = stripped.strip()

    # insert right after <body> (or <html> as a fallback), else at the start
    anchor = re.search(r'<body[^>]*>', html_body) or re.search(r'<html[^>]*>', html_body)
    insert_at = anchor.end() if anchor else 0

    return html_body[:insert_at] + html_content + html_body[insert_at:]

711 

712#---------------------------------------------------------- 

713# Emails 

714#---------------------------------------------------------- 

715 

716# matches any email in a body of text 

717email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE) 

718 

719# matches a string containing only one email 

720single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE) 

721 

722mail_header_msgid_re = re.compile('<[^<>]+>') 

723 

724email_addr_escapes_re = re.compile(r'[\\"]') 

725 

def generate_tracking_message_id(res_id):
    """Build a value suitable for the Message-ID RFC822 header field.

    The returned id embeds ``res_id`` so replies referencing it (through the
    "In-Reply-To" or "References" headers set by Mail User Agents) can be
    routed back to the originating record.
    """
    try:
        # prefer a cryptographic source of randomness when the OS offers one
        rnd_value = random.SystemRandom().random()
    except NotImplementedError:
        rnd_value = random.random()
    random_part = ("%.15f" % rnd_value)[2:]
    return f"<{random_part}.{time.time():.15f}-openerp-{res_id}@{socket.gethostname()}>"

738 

def email_split_tuples(text):
    """ Return a list of (name, email) address tuples found in ``text``.

    Note that ``text`` should be an email header or a stringified email list,
    as parsing may give broader results than expected on free-form text.

    :param str text: header-like string to parse, e.g. '"Tony" <tony@e.com>'
    :return list: list of (name, email) string tuples; empty when ``text``
        is falsy or no parsable address with an '@' is found
    """
    def _parse_based_on_spaces(pair):
        """ Fallback parsing for improperly quoted addresses.

        With input 'name email@domain.com' (quotes missing around the name),
        getaddresses returns ('', 'name email@domain.com'). When we get no
        name and an email containing spaces, retry getaddresses with spaces
        replaced by commas: the string then splits into sub-pairs from which
        the actual email (the part with an '@') and the name parts (all the
        others, rejoined with spaces) can be recovered. Emails should not
        contain spaces, so this is coherent with email formation. """
        name, email = pair
        if not name and email and ' ' in email:
            inside_pairs = getaddresses([email.replace(' ', ',')])
            name_parts, found_email = [], False
            for pair in inside_pairs:
                if pair[1] and '@' not in pair[1]:
                    name_parts.append(pair[1])
                if pair[1] and '@' in pair[1]:
                    # keeps the last '@'-bearing token when several exist
                    found_email = pair[1]
            name, email = (' '.join(name_parts), found_email) if found_email else (name, email)
        return (name, email)

    if not text:
        return []

    # found valid pairs, filtering out failed parsing
    valid_pairs = [
        (addr[0], addr[1]) for addr in getaddresses([text])
        # getaddresses() returns '' when email parsing fails, and
        # sometimes returns emails without at least '@'. The '@'
        # is strictly required in RFC2822's `addr-spec`.
        if addr[1] and '@' in addr[1]
    ]
    # corner case: returning '@gmail.com'-like email (see test_email_split);
    # when getaddresses produced a local-part-less address, fall back to a
    # plain regex scan of the original text
    if any(pair[1].startswith('@') for pair in valid_pairs):
        filtered = [
            found_email for found_email in email_re.findall(text)
            if found_email and not found_email.startswith('@')
        ]
        if filtered:
            # regex fallback cannot recover display names, only addresses
            valid_pairs = [('', found_email) for found_email in filtered]

    return list(map(_parse_based_on_spaces, valid_pairs))

784 

785 

def email_split(text):
    """Return the bare email addresses found in ``text``."""
    return [address for _name, address in email_split_tuples(text)]

789 

790 

def email_split_and_format(text):
    """Return the addresses found in ``text``, each pretty-printed with
    formataddr (i.e. '"Name" <email>' when a display name is present)."""
    return [formataddr(pair) for pair in email_split_tuples(text)]

795 

796 

def email_split_and_normalize(text):
    """ Same as 'email_split_tuples' but with the email part normalized.

    Note: unlike 'email_split' this keeps the display name of each pair;
    the result is a list of (name, normalized_email) tuples, not a list of
    bare addresses.
    """
    return [(name, _normalize_email(email)) for (name, email) in email_split_tuples(text)]

800 

801 

def email_split_and_format_normalize(text):
    """Same as 'email_split_and_format' but with normalized email addresses."""
    pairs = email_split_tuples(text)
    return [formataddr((name, _normalize_email(address))) for name, address in pairs]

809 

def email_normalize(text, strict=True):
    """ Sanitize and standardize a single email address found in ``text``.

    As of rfc5322 section 3.4.1 the local-part is case-sensitive, but most
    main providers treat it as case-insensitive; with SMTP-UTF8 this cannot
    be assumed for international addresses. Hence:

    * ascii local part: lowered;
    * non-ascii local part: kept as-is (SMTP-UTF8 handles it);
    * domain: always lowered (IDNA domains are handled fine since v14,
      'idna.encode' can later produce an ascii form if needed).

    A normalized email has a local part + '@' + a domain (possibly without
    a '.something' suffix) and no leading display name ('Name <>' form).
    Ex: 'Name <NaMe@DoMaIn.CoM>' -> 'name@domain.com'

    :param boolean strict: if True (default in stable 14+), ``text`` must
        contain a single email or False is returned. If False, the first
        candidate found is used, e.g. for 'tony@e.com, "Tony2" <tony2@e.com>'
        the result is False (strict) or 'tony@e.com' (non-strict).
    :return: normalized email, or False when no email is found (or several
        are found in strict mode)
    """
    found = email_split(text)
    if not found:
        return False
    if strict and len(found) != 1:
        return False
    return _normalize_email(found[0])

845 

def email_normalize_all(text):
    """ Extract every email address from ``text`` and return their normalized
    forms. Returns an empty list when nothing is found.

    e.g. 'tony@e.com, "Tony2" <tony2@e.com>' -> ['tony@e.com', 'tony2@e.com']

    :return list: normalized emails found in ``text``
    """
    normalized = (_normalize_email(address) for address in email_split(text))
    return [address for address in normalized if address]

857 

858def _normalize_email(email): 

859 """ As of rfc5322 section 3.4.1 local-part is case-sensitive. However most 

860 main providers do consider the local-part as case insensitive. With the 

861 introduction of smtp-utf8 within odoo, this assumption is certain to fall 

862 short for international emails. We now consider that 

863 

864 * if local part is ascii: normalize still 'lower' ; 

865 * else: use as it, SMTP-UF8 is made for non-ascii local parts; 

866 

867 Concerning domain part of the address, as of v14 international domain (IDNA) 

868 are handled fine. The domain is always lowercase, lowering it is fine as it 

869 is probably an error. With the introduction of IDNA, there is an encoding 

870 that allow non-ascii characters to be encoded to ascii ones, using 'idna.encode'. 

871 

872 A normalized email is considered as : 

873 - having a left part + @ + a right part (the domain can be without '.something') 

874 - having no name before the address. Typically, having no 'Name <>' 

875 Ex: 

876 - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>' 

877 - Normalized Output Email : 'name@domain.com' 

878 """ 

879 local_part, at, domain = email.rpartition('@') 

880 try: 

881 local_part.encode('ascii') 

882 except UnicodeEncodeError: 

883 pass 

884 else: 

885 local_part = local_part.lower() 

886 

887 return local_part + at + domain.lower() 

888 

def email_anonymize(normalized_email, *, redact_domain=False):
    """
    Replace most characters of the local part of the email address with
    '*' to hide the recipient, while keeping enough characters for
    debugging purposes. The input must already be normalized.

    >>> email_anonymize('admin@example.com')
    'a****@example.com'
    >>> email_anonymize('portal@example.com')
    'p***al@example.com'
    >>> email_anonymize('portal@example.com', redact_domain=True)
    'p***al@e******.com'
    """
    if not normalized_email:
        return normalized_email

    local, at, domain = normalized_email.partition('@')
    # keep the last two characters of long local parts for readability
    tail = local[-2:] if len(local) > 5 else ''
    head = local[:1]
    anon_local = head + '*' * (len(local) - len(head) - len(tail)) + tail

    host, dot, tld = domain.rpartition('.')
    # never redact address-literal domains ('[...]') or dot-less domains
    if redact_domain and host and dot and tld and not domain.startswith('['):
        host = host[0] + '*' * (len(host) - 1)

    return f'{anon_local}{at}{host}{dot}{tld}'

920 

def email_domain_extract(email):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is taken from the normalized email, e.g.:

    - info@proximus.be -> proximus.be

    :return: the domain part, or False when ``email`` cannot be normalized
    """
    normalized = email_normalize(email)
    if not normalized:
        return False
    return normalized.split('@')[1]

931 

def email_domain_normalize(domain):
    """Return the lowercased domain, or False when ``domain`` is falsy or
    looks like a full email address (contains an '@')."""
    if domain and '@' not in domain:
        return domain.lower()
    return False

938 

def url_domain_extract(url):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is taken from an URL, dropping subdomains, e.g.:

    - www.info.proximus.be -> proximus.be

    :return: the last two labels of the hostname, or False when the URL has
        no dotted hostname
    """
    hostname = urlparse(url).hostname
    if not hostname or '.' not in hostname:
        return False
    # remove subdomains, keep the last two dot-separated labels
    return '.'.join(hostname.split('.')[-2:])

950 

def email_escape_char(email_address):
    """Backslash-escape the problematic characters '\\', '%' and '_' in the
    given email address string (single pass, each original character is
    mapped independently)."""
    return email_address.translate(str.maketrans({'\\': '\\\\', '%': '\\%', '_': '\\_'}))

954 

# was mail_thread.decode_header()
def decode_message_header(message, header, separator=' '):
    """Join all non-empty values of ``header`` found in ``message`` with
    ``separator`` (a header may appear several times in a message)."""
    values = message.get_all(header, [])
    return separator.join(value for value in values if value)

958 

def formataddr(pair, charset='utf-8'):
    """Pretty format a 2-tuple of the form (realname, email_address).

    If the first element of pair is falsy then only the email address
    is returned.

    Set the charset to ascii to get a RFC-2822 compliant email. The
    realname will be base64 encoded (if necessary) and the domain part
    of the email will be punycode encoded (if necessary). The local part
    is left unchanged thus require the SMTPUTF8 extension when there are
    non-ascii characters.

    >>> formataddr(('John Doe', 'johndoe@example.com'))
    '"John Doe" <johndoe@example.com>'

    >>> formataddr(('', 'johndoe@example.com'))
    'johndoe@example.com'
    """
    name, address = pair
    local, _, domain = address.rpartition('@')

    try:
        domain.encode(charset)
    except UnicodeEncodeError:
        # rfc5890 - Internationalized Domain Names for Applications (IDNA):
        # punycode-encode a domain that does not fit the requested charset
        domain = idna.encode(domain).decode('ascii')

    if not name:
        return f"{local}@{domain}"

    try:
        name.encode(charset)
    except UnicodeEncodeError:
        # charset mismatch, encode as utf-8/base64
        # rfc2047 - MIME Message Header Extensions for Non-ASCII Text
        encoded_name = base64.b64encode(name.encode('utf-8')).decode('ascii')
        return f"=?utf-8?b?{encoded_name}?= <{local}@{domain}>"

    # ascii-compatible name: quote it, escaping '"' and '\'
    # rfc2822 - Internet Message Format, section 3.4 - Address Specification
    escaped_name = re.sub(r'[\\"]', r'\\\g<0>', name)
    return f'"{escaped_name}" <{local}@{domain}>'

1001 

def encapsulate_email(old_email, new_email):
    """Change the FROM of the message and use the old one as name.

    e.g.
    * Old From: "Admin" <admin@gmail.com>
    * New From: notifications@odoo.com
    * Output: "Admin" <notifications@odoo.com>

    :return: the formatted address, the unchanged ``old_email`` when it
        cannot be parsed, or None when ``new_email`` cannot be parsed
    """
    old_pairs = getaddresses([old_email])
    if not old_pairs or not old_pairs[0]:
        return old_email

    new_pairs = getaddresses([new_email])
    if not new_pairs or not new_pairs[0]:
        return

    old_name, old_address = old_pairs[0]
    # fall back on the old local part when no display name was given
    display_name = old_name or old_address.split("@")[0]

    return formataddr((display_name, new_pairs[0][1]))

1028 

def parse_contact_from_email(text):
    """ Parse contact name and email from ``text``, to populate records like
    partners, leads, ... Supported syntax:

    * Raoul <raoul@grosbedon.fr>
    * "Raoul le Grand" <raoul@grosbedon.fr>
    * Raoul raoul@grosbedon.fr (strange fault tolerant support from
      df40926d2a57c101a3e2d221ecfd08fbb4fea30e now supported directly
      in 'email_split_tuples';

    Otherwise: default, text is set as name.

    :return: name, email (normalized if possible)
    """
    if not text or not text.strip():
        return '', ''

    pairs = email_split_tuples(text)
    if not pairs:
        # nothing parsable: the whole text becomes the name
        return text, ''

    name, email = pairs[0]
    if not email:
        return text, ''
    return name, email_normalize(email, strict=False) or email

1055 

def unfold_references(msg_references):
    """Extract the Message-IDs from a References header value, undoing the
    header "folding" (CRLF + whitespace) that some mail clients insert even
    inside the ids themselves.

    RFC2822: https://tools.ietf.org/html/rfc2822#section-2.2.3

    :return list: Message-ID strings with all internal whitespace removed
    """
    return [
        re.sub(r'[\r\n\t ]+', '', msg_id)  # "Unfold" buggy references
        for msg_id in re.findall(r'<[^<>]+>', msg_references)
    ]