Coverage for adhoc-cicd-odoo-odoo / odoo / tools / mail.py: 60%

490 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-09 18:05 +0000

1# -*- coding: utf-8 -*- 

2# Part of Odoo. See LICENSE file for full copyright and licensing details. 

3 

4import base64 

5import collections 

6import itertools 

7import logging 

8import random 

9import re 

10import socket 

11import time 

12import email.utils 

13from email.utils import getaddresses as orig_getaddresses 

14from urllib.parse import urlparse 

15from typing import Literal 

16import html as htmllib 

17 

18import idna 

19import markupsafe 

20from lxml import etree, html 

21from lxml.html import ( 

22 XHTML_NAMESPACE, 

23 _contains_block_level_tag, 

24 _looks_like_full_html_bytes, 

25 _looks_like_full_html_unicode, 

26 clean, 

27 defs, 

28 document_fromstring, 

29 html_parser, 

30) 

31from werkzeug import urls 

32 

33from odoo.tools import misc 

34 

35__all__ = [ 

36 "email_domain_extract", 

37 "email_domain_normalize", 

38 "email_normalize", 

39 "email_normalize_all", 

40 "email_split", 

41 "encapsulate_email", 

42 "formataddr", 

43 "html2plaintext", 

44 "html_normalize", 

45 "html_sanitize", 

46 "is_html_empty", 

47 "parse_contact_from_email", 

48 "plaintext2html", 

49 "single_email_re", 

50] 

51 

52_logger = logging.getLogger(__name__) 

53 

54 

55# disable strict mode when present: we rely on original non-strict 

56# parsing, and we know that it isn't reliable, that ok. 

57# cfr python/cpython@4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19 

# `supports_strict_parsing` only exists on CPython builds whose
# email.utils.getaddresses() grew a `strict` parameter (CVE-2023-27043
# hardening); on those builds strict parsing became the default.
if hasattr(email.utils, 'supports_strict_parsing'):
    def getaddresses(fieldvalues):
        # force the historical non-strict behaviour: Odoo best-effort
        # parses malformed headers instead of rejecting them
        return orig_getaddresses(fieldvalues, strict=False)
else:
    # older Python: getaddresses() has no `strict` parameter at all
    getaddresses = orig_getaddresses

63 

64 

65#---------------------------------------------------------- 

66# HTML Sanitizer 

67#---------------------------------------------------------- 

68 

# Attributes preserved by the sanitizer on top of lxml's defaults: inline
# style plus the data-* attributes used by quote detection, the Odoo
# editor(s) and various website/mail widgets.
safe_attrs = defs.safe_attrs | frozenset(
    ['style',
     'data-o-mail-quote', 'data-o-mail-quote-node',  # quote detection
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
     'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
     'data-oe-transient-content', 'data-behavior-props', 'data-prop-name', 'data-width', 'data-height', 'data-scale-x', 'data-scale-y', 'data-x', 'data-y',  # legacy editor
     'data-oe-role', 'data-oe-aria-label',
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
     'data-attachment-id', 'data-format-mimetype',
     'data-ai-field', 'data-ai-record-id',
     'data-heading-link-id',
     'data-mimetype-before-conversion',
     'data-language-id',
     'data-bs-toggle',  # support nav-tabs
     ])
# Tag policy handed to the _Cleaner: which tags survive, which are removed
# with their content ('kill'), which are unwrapped ('remove').
SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
    'allow_tags': defs.tags | frozenset('article bdi section header footer hgroup nav aside figure main'.split() + [etree.Comment]),
    'kill_tags': ['base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
                  'noscript', 'object', 'script', 'style', 'title'],
    'remove_tags': ['html', 'body'],
}

93 

94 

class _Cleaner(clean.Cleaner):
    """lxml Cleaner specialized for Odoo html field sanitization.

    On top of the base lxml cleaning, this class can:

    - strip ``class`` attributes even when other attributes are kept
      (``strip_classes``);
    - whitelist-filter inline ``style`` declarations instead of dropping
      them entirely (``sanitize_style``);
    - optionally keep conditional comments (``conditional_comments=False``),
      used e.g. for Outlook-targeted styling in mass mailings.
    """

    # captures "property: value" pairs of a style attribute; quoted values
    # may contain ';' without terminating the declaration
    _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')

    _style_whitelist = [
        'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
        'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'text-decoration-line', 'opacity',
        'float', 'vertical-align', 'display', 'object-fit',
        'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
        'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
        'white-space',
        # appearance
        'background-image', 'background-position', 'background-size', 'background-repeat', 'background-origin',
        # box model
        'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
        'height', 'width', 'max-width', 'min-width', 'min-height',
        # tables
        'border-collapse', 'border-spacing', 'caption-side', 'empty-cells', 'table-layout']

    _style_whitelist.extend(
        ['border-%s-%s' % (position, attribute)
            for position in ['top', 'bottom', 'left', 'right']
            for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')]
    )

    strip_classes = False
    sanitize_style = False
    conditional_comments = True

    def __call__(self, doc):
        super(_Cleaner, self).__call__(doc)

        # if we keep attributes but still remove classes
        if not getattr(self, 'safe_attrs_only', False) and self.strip_classes:
            for el in doc.iter(tag=etree.Element):
                self.strip_class(el)

        # if we keep style attribute, sanitize them
        if not self.style and self.sanitize_style:
            for el in doc.iter(tag=etree.Element):
                self.parse_style(el)

    def strip_class(self, el):
        # remove the class attribute from `el`, if any
        if el.attrib.get('class'):
            del el.attrib['class']

    def parse_style(self, el):
        # keep only whitelisted css properties inside the style attribute;
        # drop the attribute entirely when nothing survives the filter
        attributes = el.attrib
        styling = attributes.get('style')
        if styling:
            valid_styles = collections.OrderedDict()
            styles = self._style_re.findall(styling)
            for style in styles:
                if style[0].lower() in self._style_whitelist:
                    valid_styles[style[0].lower()] = style[1]
            if valid_styles:
                el.attrib['style'] = '; '.join('%s:%s' % (key, val) for (key, val) in valid_styles.items())
            else:
                del el.attrib['style']

    def kill_conditional_comments(self, doc):
        """Override the default behavior of lxml.

        https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510

        In some use cases, e.g. templates used for mass mailing,
        we send emails containing conditional comments targeting Microsoft Outlook,
        to give special styling instructions.
        https://github.com/odoo/odoo/pull/119325/files#r1301064789

        Within these conditional comments, unsanitized HTML can lie.
        However, in modern browser, these comments are considered as simple comments,
        their content is not executed.
        https://caniuse.com/sr_ie-features
        """
        if self.conditional_comments:
            super().kill_conditional_comments(doc)

172 

173 

def tag_quote(el):
    """Decorate `el` with quote/signature markers for email display.

    Inspects the element's tag, class and id to recognize quoted replies
    and signatures produced by various mail clients (Gmail, Outlook,
    Yahoo, msoffice, Odoo itself) and sets ``data-o-mail-quote`` (plus the
    ``-container`` / ``-node`` variants) accordingly. Text-based quotes
    ('>' prefixed lines) and '--' signatures are wrapped in marker spans.
    """
    def _create_new_node(tag, text, tail=None, attrs=None):
        # build a fresh element carrying the matched text
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        if attrs:
            for key, val in attrs.items():
                new_node.set(key, val)
        return new_node

    def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
        # wrap every match of `regex` inside node.text into its own child node
        text = node.text or ''
        if not re.search(regex, text):
            return

        child_node = None
        idx, node_idx = 0, 0
        for item in re.finditer(regex, text):
            new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
            if child_node is None:
                node.text = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            else:
                child_node.tail = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            child_node = new_node
            idx = item.end()
            node_idx = node_idx + 1

    el_class = el.get('class', '') or ''
    el_id = el.get('id', '') or ''

    # gmail or yahoo // # outlook, html // # msoffice
    # BUGFIX: the second alternative used to re-check el_class twice,
    # making the el_id marker (outlook/msoffice) undetectable
    if 'gmail_extra' in el_class or \
            ('SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_id):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
            'yahoo_quoted' in el_class:
        # Quote all elements after this one
        el.set('data-o-mail-quote', '1')
        for sibling in el.itersiblings(preceding=False):
            sibling.set('data-o-mail-quote', '1')

    # odoo, gmail and outlook automatic signature wrapper
    is_signature_wrapper = 'odoo_signature_wrapper' in el_class or 'gmail_signature' in el_class or el_id == "Signature"
    is_outlook_auto_message = 'appendonsend' in el_id
    # gmail and outlook reply quote
    is_outlook_reply_quote = 'divRplyFwdMsg' in el_id
    is_gmail_quote = 'gmail_quote' in el_class
    is_quote_wrapper = is_signature_wrapper or is_gmail_quote or is_outlook_reply_quote
    if is_quote_wrapper:
        el.set('data-o-mail-quote-container', '1')
        el.set('data-o-mail-quote', '1')

    # outlook reply wrapper is preceded with <hr> and a div containing recipient info
    if is_outlook_reply_quote:
        hr = el.getprevious()
        reply_quote = el.getnext()
        if hr is not None and hr.tag == 'hr':
            hr.set('data-o-mail-quote', '1')
        if reply_quote is not None:
            reply_quote.set('data-o-mail-quote-container', '1')
            reply_quote.set('data-o-mail-quote', '1')

    if is_outlook_auto_message:
        if not el.text or not el.text.strip():
            el.set('data-o-mail-quote-container', '1')
            el.set('data-o-mail-quote', '1')

    # html signature (-- <br />blah)
    signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
    if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    # text-based quotes (>, >>) and signatures (-- Signature)
    text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
    if not el.get('data-o-mail-quote'):
        _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})

    if el.tag == 'blockquote':
        # remove single node
        el.set('data-o-mail-quote-node', '1')
        el.set('data-o-mail-quote', '1')
    if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
        if el.getparent().get('data-o-mail-quote'):
            el.set('data-o-mail-quote', '1')
        # only quoting the elements following the first quote in the container
        # avoids issues with repeated calls to html_normalize
        elif el.getparent().get('data-o-mail-quote-container'):
            if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
                siblings = el.getparent().getchildren()
                quote_index = siblings.index(first_sibling_quote)
                element_index = siblings.index(el)
                if quote_index < element_index:
                    el.set('data-o-mail-quote', '1')
            if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
                el.set('data-o-mail-quote', '1')

278 

279 

def fromstring(html_, base_url=None, parser=None, **kw):
    """
    This function mimics lxml.html.fromstring. It not only returns the parsed
    element/document but also a flag indicating whether the input is for a
    single body element or not.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    :return: a ``(element_or_document, single_body_element)`` pair; the flag
        is True only when the input boiled down to exactly one element
        inside <body>
    """
    if parser is None:
        parser = html_parser
    # heuristic sniffing: a full document (doctype/<html>) is returned as-is
    if isinstance(html_, bytes):
        is_full_html = _looks_like_full_html_bytes(html_)
    else:
        is_full_html = _looks_like_full_html_unicode(html_)
    doc = document_fromstring(html_, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc, False
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # fall back to the XHTML-namespaced body
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc, False
    if body is None:
        return doc, False
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0], True
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in. We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body, False

348 

349 

def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

    The string is parsed as an html tag soup, made valid, then decorated for
    "email quote" detection, and prepared for an optional filtering.
    The filtering step (e.g. sanitization) should be performed by the
    `filter_callback` function (to avoid multiple parsing operations, and
    normalize the result).

    :param src: the html string to normalize
    :param filter_callback: optional callable taking a single `etree._Element`
        document parameter, to be called during normalization in order to
        filter the output document
    :param output_method: defines the output method to pass to `html.tostring`.
        It defaults to 'html', but can also be 'xml' for xhtml output.
    """
    if not src:
        return src

    # html: remove encoding attribute inside tags
    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src)

    # normalize malformed comment terminators and empty comments
    # before parsing, so the parser does not mangle them
    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
    # On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
    # in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
    src = re.sub(r'</?o:.*?>', '', src)

    try:
        doc, single_body_element = fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
            return ""
        raise

    # perform quote detection before cleaning and class removal
    for el in doc.iter(tag=etree.Element):
        tag_quote(el)

    # reserialize + reparse so the callback sees a tree matching the
    # chosen output method
    doc = html.fromstring(html.tostring(doc, method=output_method))

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

    if not single_body_element and src.startswith('<div>') and src.endswith('</div>'):
        # the <div></div> may come from 2 places
        # 1. the src is parsed as multiple body elements
        #    <div></div> wraps all elements.
        # 2. the src is parsed as not only body elements
        #    <html></html> wraps all elements.
        #    then the Cleaner as the filter_callback which has 'html' in its
        #    'remove_tags' will write <html></html> to <div></div> since it
        #    cannot directly drop the parent-most tag
        src = src[5:-6]

    # html considerations so real html content match database value
    src = src.replace(u'\xa0', u'&nbsp;')

    return src

412 

413 

def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
    """Sanitize an html fragment through `html_normalize` + `_Cleaner`.

    :param src: html string to sanitize; falsy values are returned unchanged
    :param silent: when True, parsing/sanitizing errors are swallowed and a
        placeholder paragraph is returned instead of raising
    :return: a `markupsafe.Markup` holding the sanitized html
    """
    if not src:
        return src

    logger = logging.getLogger(__name__ + '.html_sanitize')

    def sanitize_handler(doc):
        # assemble the _Cleaner configuration from the caller's flags
        options = dict(
            page_structure=True,
            style=strip_style,                                  # True = remove style tags/attrs
            sanitize_style=sanitize_style,                      # True = sanitize styling
            forms=sanitize_form,                                # True = remove form tags
            remove_unknown_tags=False,
            comments=False,
            conditional_comments=sanitize_conditional_comments, # True = remove conditional comments
            processing_instructions=False,
        )
        if sanitize_tags:
            options.update(SANITIZE_TAGS)

        if sanitize_attributes:  # We keep all attributes in order to keep "style"
            allowed = safe_attrs - frozenset(['class']) if strip_classes else safe_attrs
            options['safe_attrs_only'] = True
            options['safe_attrs'] = allowed
        else:
            options['safe_attrs_only'] = False      # keep oe-data attributes + style
            options['strip_classes'] = strip_classes  # remove classes, even when keeping other attributes

        _Cleaner(**options)(doc)
        return doc

    try:
        sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>Unknown error when sanitizing</p>'

    return markupsafe.Markup(sanitized)

467 

468# ---------------------------------------------------------- 

469# HTML/Text management 

470# ---------------------------------------------------------- 

471 

# protocols that must not be turned into clickable http links
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
# href="..." attributes whose target is not one of the skipped protocols
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
# bare http(s) urls found in plain text
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
# retrieve inner content of the link
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
# any markup tag, non-greedy
HTML_TAGS_REGEX = re.compile('<.*?>')
# opening tags that imply a line break, plus literal newlines
HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n')

479 

480 

def validate_url(url):
    """Return `url`, prefixed with ``http://`` unless it already carries an
    explicitly allowed scheme (http/https/ftp/ftps).

    Uses the stdlib ``urlparse`` (already imported at module level) instead
    of the deprecated ``werkzeug.urls.url_parse``; both lowercase the
    scheme, so behaviour is unchanged.
    """
    if urlparse(url).scheme not in ('http', 'https', 'ftp', 'ftps'):
        return 'http://' + url

    return url

486 

487 

def is_html_empty(html_content: str | markupsafe.Markup | Literal[False] | None) -> bool:
    """Tell whether an html content carries no real content.

    Formatting-only markup — e.g. the '<p style="..."><br></p>' left behind
    by a web editor — counts as empty; a font-awesome/odoo icon tag does not.

    :param html_content: html content, coming for example from an HTML field
    :returns: True if no content found or if containing only void formatting tags
    """
    if not html_content:
        return True
    icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
    tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
    # strip formatting tags, then unescape entities: any remaining
    # non-whitespace character means real content
    remaining = htmllib.unescape(re.sub(tag_re, '', html_content))
    if remaining.strip():
        return False
    # no text left: still non-empty if an icon tag is present
    return re.search(icon_re, html_content) is None

502 

503 

def html_keep_url(text):
    """ Transform the url into clickable link with <a/> tag """
    link_tags = re.compile(r"""(?<!["'])((ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"']+)(:[0-9]+)?(\/|\/([^\s<"']))?)(?![^\s<"']*["']|[^\s<"']*</a>)""")
    pieces = []
    cursor = 0
    for match in link_tags.finditer(text):
        # copy the text before the url, then the url as an anchor
        pieces.append(text[cursor:match.start()])
        pieces.append(create_link(match.group(0), match.group(0)))
        cursor = match.end()
    pieces.append(text[cursor:])
    return ''.join(pieces)

515 

516 

def html_to_inner_content(html):
    """Returns unformatted text after removing html tags and excessive whitespace from a
    string/Markup. Passed strings will first be sanitized.
    """
    if is_html_empty(html):
        return ''
    # raw (non-Markup) strings go through the sanitizer first
    if not isinstance(html, markupsafe.Markup):
        html = html_sanitize(html)
    # line-breaking tags -> spaces, then drop every remaining tag
    text = HTML_NEWLINES_REGEX.sub(' ', html)
    text = HTML_TAGS_REGEX.sub('', text)
    # collapse runs of spaces/tabs and non-breaking spaces
    text = re.sub(r' {2,}|\t', ' ', text)
    text = text.replace("\xa0", " ")
    text = htmllib.unescape(text)
    return text.strip()

531 

532 

def create_link(url, label):
    """Return an html anchor to `url`, labelled `label`, opening in a new tab."""
    attrs = f'href="{url}" target="_blank" rel="noreferrer noopener"'
    return f'<a {attrs}>{label}</a>'

535 

536 

def html2plaintext(
    html: str | markupsafe.Markup | Literal[False] | None,
    body_id: str | None = None,
    encoding: str = 'utf-8',
    include_references: bool = True
) -> str:
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param encoding: codec used to decode `html` when it is bytes
    :param include_references: If False, numbered references and
        URLs for links and images will not be included.
    :return: best-effort plain-text rendering of `html` ('' when falsy/blank)
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    if not (html and html.strip()):
        return ''

    if isinstance(html, bytes):
        html = html.decode(encoding)
    else:
        assert isinstance(html, str), f"expected str got {html.__class__.__name__}"

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    # narrow the tree to the requested element (or <body> by default)
    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # replace <a href>/<img src> with "[N]" markers and collect their
    # targets to append as a reference list at the end
    url_index = []
    linkrefs = itertools.count(1)
    if include_references:
        for link in tree.findall('.//a'):
            if url := link.get('href'):
                link.tag = 'span'
                link.text = f'{link.text} [{next(linkrefs)}]'
                url_index.append(url)

        for img in tree.findall('.//img'):
            if src := img.get('src'):
                img.tag = 'span'
                if src.startswith('data:'):
                    img_name = None  # base64 image
                else:
                    img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
                img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
                url_index.append(src)

    html = etree.tostring(tree, encoding="unicode")
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    # poor-man's markdown-ish rendering of emphasis and headings,
    # then flatten the remaining markup
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # decode the entities the serializer escaped
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    html = html.replace('&nbsp;', '\N{NO-BREAK SPACE}')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the collected link/image targets as numbered references
    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, start=1):
            html += f'[{i}] {url}\n'

    return html.strip()

618 

619 

def plaintext2html(text: str, container_tag: str | None = None, with_paragraph: bool = True) -> markupsafe.Markup:
    r"""Convert plaintext into html. Content of the text is escaped to manage
    html entities, using :func:`~odoo.tools.misc.html_escape`.

    - all ``\n``, ``\r`` are replaced by ``<br/>``
    - convert url into clickable link

    :param text: plaintext to convert
    :param container_tag: container of the html; by default the content is
        embedded into a ``<div>``
    :param with_paragraph: whether or not considering 2 or more consecutive ``<br/>``
        as paragraph breaks and enclosing content in ``<p>``
    """
    assert isinstance(text, str)
    text = misc.html_escape(text)

    # 1. replace \n and \r
    text = re.sub(r'(\r\n|\r|\n)', '<br/>', text)

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs
    final = text
    if with_paragraph:
        br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
        pieces = ['<p>']
        cursor = 0
        for match in br_tags.finditer(text):
            # runs of >= 2 <br/> become paragraph boundaries
            pieces.append(text[cursor:match.start()])
            pieces.append('</p><p>')
            cursor = match.end()
        pieces.append(text[cursor:])
        pieces.append('</p>')
        final = ''.join(pieces)

    # 5. container
    if container_tag:  # FIXME: validate that container_tag is just a simple tag?
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return markupsafe.Markup(final)

657 

def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is ``False``.

    Content conversion can be done in two ways:

    - wrapping it into a pre (``preserve=True``)
    - use plaintext2html (``preserve=False``, using ``container_tag`` to
      wrap the whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
        be wrapped in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param str container_tag: tag to wrap the content into, defaults to `div`.
    :rtype: markupsafe.Markup
    """
    if plaintext and preserve:
        content = '\n<pre>%s</pre>\n' % misc.html_escape(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # strip structural wrappers from html content before embedding it
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = '\n%s\n' % content
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
                  lambda m: '%s%s%s' % (m[1], m[2].lower(), m[3]), html)
    # insert before </body>, else before </html>, else append at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return markupsafe.Markup('%s%s' % (html, content))
    return markupsafe.Markup('%s%s%s' % (html[:insert_location], content, html[insert_location:]))

699 

700 

def prepend_html_content(html_body, html_content):
    """Prepend some HTML content at the beginning of an other HTML content."""
    # strip structural wrappers from the content; keep markup-safety
    stripped = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', html_content)
    if isinstance(html_content, markupsafe.Markup):
        stripped = markupsafe.Markup(stripped)
    html_content = stripped.strip()

    # insert right after <body> (or <html> as a fallback), else at the start
    anchor = re.search(r'<body[^>]*>', html_body) or re.search(r'<html[^>]*>', html_body)
    insert_at = anchor.end() if anchor else 0

    return html_body[:insert_at] + html_content + html_body[insert_at:]

711 

712#---------------------------------------------------------- 

713# Emails 

714#---------------------------------------------------------- 

715 

716# matches any email in a body of text 

717email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE) 

718 

719# matches a string containing only one email 

720single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE) 

721 

722mail_header_msgid_re = re.compile('<[^<>]+>') 

723 

724email_addr_escapes_re = re.compile(r'[\\"]') 

725 

def generate_tracking_message_id(res_id):
    """Build a value suitable for the Message-ID RFC822 header field.

    The returned id embeds ``res_id`` so replies referencing it (through the
    "In-Reply-To" or "References" headers set by Mail User Agents) can be
    routed back to the originating record.
    """
    try:
        # prefer a cryptographic source of randomness when the OS offers one
        rnd_value = random.SystemRandom().random()
    except NotImplementedError:
        rnd_value = random.random()
    random_part = ("%.15f" % rnd_value)[2:]
    return f"<{random_part}.{time.time():.15f}-openerp-{res_id}@{socket.gethostname()}>"

738 

def email_split_tuples(text):
    """ Return a list of (name, email) address tuples found in ``text``.

    Note that ``text`` should be an email header or a stringified email list,
    as parsing may give broader results than expected on free-form text.

    :param str text: header-like string to parse, e.g. '"Tony" <tony@e.com>'
    :return list: list of (name, email) string tuples; empty when ``text``
        is falsy or no parsable address with an '@' is found
    """
    def _parse_based_on_spaces(pair):
        """ Fallback parsing for improperly quoted addresses.

        With input 'name email@domain.com' (quotes missing around the name),
        getaddresses returns ('', 'name email@domain.com'). When we get no
        name and an email containing spaces, retry getaddresses with spaces
        replaced by commas: the string then splits into sub-pairs from which
        the actual email (the part with an '@') and the name parts (all the
        others, rejoined with spaces) can be recovered. Emails should not
        contain spaces, so this is coherent with email formation. """
        name, email = pair
        if not name and email and ' ' in email:
            inside_pairs = getaddresses([email.replace(' ', ',')])
            name_parts, found_email = [], False
            for pair in inside_pairs:
                if pair[1] and '@' not in pair[1]:
                    name_parts.append(pair[1])
                if pair[1] and '@' in pair[1]:
                    # keeps the last '@'-bearing token when several exist
                    found_email = pair[1]
            name, email = (' '.join(name_parts), found_email) if found_email else (name, email)
        return (name, email)

    if not text:
        return []

    # found valid pairs, filtering out failed parsing
    valid_pairs = [
        (addr[0], addr[1]) for addr in getaddresses([text])
        # getaddresses() returns '' when email parsing fails, and
        # sometimes returns emails without at least '@'. The '@'
        # is strictly required in RFC2822's `addr-spec`.
        if addr[1] and '@' in addr[1]
    ]
    # corner case: returning '@gmail.com'-like email (see test_email_split);
    # when getaddresses produced a local-part-less address, fall back to a
    # plain regex scan of the original text
    if any(pair[1].startswith('@') for pair in valid_pairs):
        filtered = [
            found_email for found_email in email_re.findall(text)
            if found_email and not found_email.startswith('@')
        ]
        if filtered:
            # regex fallback cannot recover display names, only addresses
            valid_pairs = [('', found_email) for found_email in filtered]

    return list(map(_parse_based_on_spaces, valid_pairs))

784 

785 

def email_split(text):
    """Return the bare email addresses found in ``text``."""
    return [address for _name, address in email_split_tuples(text)]

789 

790 

def email_split_and_format(text):
    """Return the addresses found in ``text``, each pretty-printed with
    formataddr (i.e. '"Name" <email>' when a display name is present)."""
    return [formataddr(pair) for pair in email_split_tuples(text)]

795 

796 

def email_split_and_normalize(text):
    """ Same as 'email_split_tuples' but with the email part normalized.

    Note: unlike 'email_split' this keeps the display name of each pair;
    the result is a list of (name, normalized_email) tuples, not a list of
    bare addresses.
    """
    return [(name, _normalize_email(email)) for (name, email) in email_split_tuples(text)]

800 

801 

def email_split_and_format_normalize(text):
    """Same as 'email_split_and_format' but with normalized email addresses."""
    pairs = email_split_tuples(text)
    return [formataddr((name, _normalize_email(address))) for name, address in pairs]

809 

def email_normalize(text, strict=True):
    """ Sanitize and standardize a single email address found in ``text``.

    As of rfc5322 section 3.4.1 the local-part is case-sensitive, but most
    main providers treat it as case-insensitive; with SMTP-UTF8 this cannot
    be assumed for international addresses. Hence:

    * ascii local part: lowered;
    * non-ascii local part: kept as-is (SMTP-UTF8 handles it);
    * domain: always lowered (IDNA domains are handled fine since v14,
      'idna.encode' can later produce an ascii form if needed).

    A normalized email has a local part + '@' + a domain (possibly without
    a '.something' suffix) and no leading display name ('Name <>' form).
    Ex: 'Name <NaMe@DoMaIn.CoM>' -> 'name@domain.com'

    :param boolean strict: if True (default in stable 14+), ``text`` must
        contain a single email or False is returned. If False, the first
        candidate found is used, e.g. for 'tony@e.com, "Tony2" <tony2@e.com>'
        the result is False (strict) or 'tony@e.com' (non-strict).
    :return: normalized email, or False when no email is found (or several
        are found in strict mode)
    """
    found = email_split(text)
    if not found:
        return False
    if strict and len(found) != 1:
        return False
    return _normalize_email(found[0])

845 

def email_normalize_all(text):
    """ Extract every email address from ``text`` and return their normalized
    forms. Returns an empty list when nothing is found.

    e.g. 'tony@e.com, "Tony2" <tony2@e.com>' -> ['tony@e.com', 'tony2@e.com']

    :return list: normalized emails found in ``text``
    """
    normalized = (_normalize_email(address) for address in email_split(text))
    return [address for address in normalized if address]

857 

858def _normalize_email(email): 

859 """ As of rfc5322 section 3.4.1 local-part is case-sensitive. However most 

860 main providers do consider the local-part as case insensitive. With the 

861 introduction of smtp-utf8 within odoo, this assumption is certain to fall 

862 short for international emails. We now consider that 

863 

864 * if local part is ascii: normalize still 'lower' ; 

865 * else: use as it, SMTP-UF8 is made for non-ascii local parts; 

866 

867 Concerning domain part of the address, as of v14 international domain (IDNA) 

868 are handled fine. The domain is always lowercase, lowering it is fine as it 

869 is probably an error. With the introduction of IDNA, there is an encoding 

870 that allow non-ascii characters to be encoded to ascii ones, using 'idna.encode'. 

871 

872 A normalized email is considered as : 

873 - having a left part + @ + a right part (the domain can be without '.something') 

874 - having no name before the address. Typically, having no 'Name <>' 

875 Ex: 

876 - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>' 

877 - Normalized Output Email : 'name@domain.com' 

878 """ 

879 local_part, at, domain = email.rpartition('@') 

880 try: 

881 local_part.encode('ascii') 

882 except UnicodeEncodeError: 

883 pass 

884 else: 

885 local_part = local_part.lower() 

886 

887 return local_part + at + domain.lower() 

888 

def email_anonymize(normalized_email, *, redact_domain=False):
    """
    Replace most characters of the local part of the email address with
    '*' to hide the recipient, while keeping enough characters for
    debugging purposes. The input must already be normalized.

    >>> email_anonymize('admin@example.com')
    'a****@example.com'
    >>> email_anonymize('portal@example.com')
    'p***al@example.com'
    >>> email_anonymize('portal@example.com', redact_domain=True)
    'p***al@e******.com'
    """
    if not normalized_email:
        return normalized_email

    local, at, domain = normalized_email.partition('@')
    # keep the last two characters of long local parts for readability
    tail = local[-2:] if len(local) > 5 else ''
    head = local[:1]
    anon_local = head + '*' * (len(local) - len(head) - len(tail)) + tail

    host, dot, tld = domain.rpartition('.')
    # never redact address-literal domains ('[...]') or dot-less domains
    if redact_domain and host and dot and tld and not domain.startswith('['):
        host = host[0] + '*' * (len(host) - 1)

    return f'{anon_local}{at}{host}{dot}{tld}'

920 

def email_domain_extract(email):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is taken from the normalized email, e.g.:

    - info@proximus.be -> proximus.be

    :return: the domain part, or False when ``email`` cannot be normalized
    """
    normalized = email_normalize(email)
    if not normalized:
        return False
    return normalized.split('@')[1]

931 

def email_domain_normalize(domain):
    """Return the lowercased domain, or False when ``domain`` is falsy or
    looks like a full email address (contains an '@')."""
    if domain and '@' not in domain:
        return domain.lower()
    return False

938 

def url_domain_extract(url):
    """ Extract the company domain, to be used by IAP services notably.
    The domain is taken from an URL, dropping subdomains, e.g.:

    - www.info.proximus.be -> proximus.be

    :return: the last two labels of the hostname, or False when the URL has
        no dotted hostname
    """
    hostname = urlparse(url).hostname
    if not hostname or '.' not in hostname:
        return False
    # remove subdomains, keep the last two dot-separated labels
    return '.'.join(hostname.split('.')[-2:])

950 

def email_escape_char(email_address):
    """Backslash-escape the problematic characters '\\', '%' and '_' in the
    given email address string (single pass, each original character is
    mapped independently)."""
    return email_address.translate(str.maketrans({'\\': '\\\\', '%': '\\%', '_': '\\_'}))

954 

# was mail_thread.decode_header()
def decode_message_header(message, header, separator=' '):
    """Join all non-empty values of ``header`` found in ``message`` with
    ``separator`` (a header may appear several times in a message)."""
    values = message.get_all(header, [])
    return separator.join(value for value in values if value)

958 

def formataddr(pair, charset='utf-8'):
    """Pretty format a 2-tuple of the form (realname, email_address).

    If the first element of pair is falsy then only the email address
    is returned.

    Set the charset to ascii to get a RFC-2822 compliant email. The
    realname will be base64 encoded (if necessary) and the domain part
    of the email will be punycode encoded (if necessary). The local part
    is left unchanged thus require the SMTPUTF8 extension when there are
    non-ascii characters.

    >>> formataddr(('John Doe', 'johndoe@example.com'))
    '"John Doe" <johndoe@example.com>'

    >>> formataddr(('', 'johndoe@example.com'))
    'johndoe@example.com'
    """
    name, address = pair
    local, _, domain = address.rpartition('@')

    try:
        domain.encode(charset)
    except UnicodeEncodeError:
        # rfc5890 - Internationalized Domain Names for Applications (IDNA):
        # punycode-encode a domain that does not fit the requested charset
        domain = idna.encode(domain).decode('ascii')

    if not name:
        return f"{local}@{domain}"

    try:
        name.encode(charset)
    except UnicodeEncodeError:
        # charset mismatch, encode as utf-8/base64
        # rfc2047 - MIME Message Header Extensions for Non-ASCII Text
        encoded_name = base64.b64encode(name.encode('utf-8')).decode('ascii')
        return f"=?utf-8?b?{encoded_name}?= <{local}@{domain}>"

    # ascii-compatible name: quote it, escaping '"' and '\'
    # rfc2822 - Internet Message Format, section 3.4 - Address Specification
    escaped_name = re.sub(r'[\\"]', r'\\\g<0>', name)
    return f'"{escaped_name}" <{local}@{domain}>'

1001 

def encapsulate_email(old_email, new_email):
    """Change the FROM of the message and use the old one as name.

    e.g.
    * Old From: "Admin" <admin@gmail.com>
    * New From: notifications@odoo.com
    * Output: "Admin" <notifications@odoo.com>

    :return: the formatted address, the unchanged ``old_email`` when it
        cannot be parsed, or None when ``new_email`` cannot be parsed
    """
    old_pairs = getaddresses([old_email])
    if not old_pairs or not old_pairs[0]:
        return old_email

    new_pairs = getaddresses([new_email])
    if not new_pairs or not new_pairs[0]:
        return

    old_name, old_address = old_pairs[0]
    # fall back on the old local part when no display name was given
    display_name = old_name or old_address.split("@")[0]

    return formataddr((display_name, new_pairs[0][1]))

1028 

def parse_contact_from_email(text):
    """ Parse contact name and email from ``text``, to populate records like
    partners, leads, ... Supported syntax:

    * Raoul <raoul@grosbedon.fr>
    * "Raoul le Grand" <raoul@grosbedon.fr>
    * Raoul raoul@grosbedon.fr (strange fault tolerant support from
      df40926d2a57c101a3e2d221ecfd08fbb4fea30e now supported directly
      in 'email_split_tuples';

    Otherwise: default, text is set as name.

    :return: name, email (normalized if possible)
    """
    if not text or not text.strip():
        return '', ''

    pairs = email_split_tuples(text)
    if not pairs:
        # nothing parsable: the whole text becomes the name
        return text, ''

    name, email = pairs[0]
    if not email:
        return text, ''
    return name, email_normalize(email, strict=False) or email

1055 

def unfold_references(msg_references):
    """Extract the Message-IDs from a References header value, undoing the
    header "folding" (CRLF + whitespace) that some mail clients insert even
    inside the ids themselves.

    RFC2822: https://tools.ietf.org/html/rfc2822#section-2.2.3

    :return list: Message-ID strings with all internal whitespace removed
    """
    return [
        re.sub(r'[\r\n\t ]+', '', msg_id)  # "Unfold" buggy references
        for msg_id in re.findall(r'<[^<>]+>', msg_references)
    ]