The entire concept of replacing various tags with <invalidTag>
is kludgy.
Thus I completely disabled scriptprotect for all of my Lucee apps years ago.
My solution for dealing with potentially malicious user-or-bot-content was to create a library of security functions utilizing the OWASP-based SanitizeHTML()
customized via their HtmlPolicyBuilder Java object. Then I clean an entire form with one simple command.
Usage example:
strError = Request.udf.CleanForm(
strip="UserEmail,Subject,Referer",
safe="Body"
);
That strips all HTML and/or removes unsafe HTML from the specified Form fields.
Here's my code!
/*
These functions are intended to be encapsulated inside a
Request.udf object via cfinclude.
See usage example below in the comments for CleanForm().
A dummy call of EncodeForHTML('') is required prior to
instantiating the OWASP object to avoid the risk of the
OWASP extension not loading after Lucee restart.
This bug was reported in 2019 and at this rate it may
never be resolved because it's extremely complicated!
https://dev.lucee.org/t/error-with-esapi-functions-esapi-properties-could-not-be-loaded-by-any-means/5513/5
https://luceeserver.atlassian.net/browse/LDEV-2293
*/
// Dummy ESAPI call; it must run BEFORE the OWASP object is created below
// (the result is intentionally discarded).
EncodeForHTML('');
// Shared handle to the OWASP HtmlPolicyBuilder class; SafeHTML() and
// StripHTML() call .init() on it to build their sanitizer policies.
Request.objHtmlPolicyBuilder = CreateObject("java", "org.owasp.html.HtmlPolicyBuilder");
function CleanForm(string strip="", string safe="") {
    /*
    Cleans a set of Form fields in place and reports the first problem found.

    Arguments.strip: comma-delimited list of Form field names whose values
                     are passed through this.CleanField() in strip mode
                     (all HTML removed).
    Arguments.safe:  comma-delimited list of Form field names whose values
                     are passed through this.CleanField() in safe mode
                     (only unsafe HTML removed).

    Returns an empty string when every field was processed without error,
    otherwise the first error message produced by CleanField().
    Processing stops at the first error so the user can fix and resubmit.

    Usage example:
    strError = Request.udf.CleanForm(strip="UserEmail,Subject,Referer", safe="Body");
    */
    var fieldName = "";
    var errorMessage = "";

    // Fields that must end up as plain text.
    for (fieldName in ListToArray(strip)) {
        errorMessage = this.CleanField(fieldName, true);
        if (Len(errorMessage)) {
            return errorMessage;
        }
    }

    // Fields that may keep whitelisted HTML.
    for (fieldName in ListToArray(safe)) {
        errorMessage = this.CleanField(fieldName, false);
        if (Len(errorMessage)) {
            return errorMessage;
        }
    }

    // Empty string signals success.
    return "";
}
function CleanField(required string str, required boolean stripmode) {
    /*
    Cleans a single Form field in place.

    Arguments.str:       name of the Form field (surrounding whitespace is ignored).
    Arguments.stripmode: true  = clean with this.StripHTML(),
                         false = clean with this.SafeHTML().

    Returns an empty string on success, or an HTML-formatted error message
    when the field name is empty, the field is missing from Form, or the
    cleaned value came back empty.

    Side effect: Form[str] is overwritten with its trimmed value, and then
    with the cleaned value when cleaning succeeds.
    */
    var rawValue = "";
    var result = "";

    str = trim(str);
    if (!Len(str)) {
        return "<b>Error: One or more CleanForm() arguments are empty.</b>";
    }
    if (!StructKeyExists(Form, str)) {
        return '<b>Error: Form["#str#"] does not exist.</b>';
    }

    rawValue = trim(Form[str]);
    Form[str] = rawValue;
    if (!Len(rawValue)) {
        // Nothing to clean; an empty field is not an error.
        return "";
    }

    if (stripmode) {
        result = this.StripHTML(rawValue);
    } else {
        result = this.SafeHTML(rawValue);
    }

    // If ALL of the content was removed by cleaning, the input is assumed
    // to have been unsafe; report it and leave the trimmed value in Form.
    if (!Len(result)) {
        return "<b>Error: Form.#str# contains potentially unsafe code.</b><br>#EncodeForHTML(Form[str])#";
    }

    Form[str] = result;
    return ""; // empty string indicates no error
}
function SafeHTML(required string str) {
    /*
    Returns Arguments.str sanitized with a whitelist policy built from the
    OWASP HtmlPolicyBuilder (Request.objHtmlPolicyBuilder).

    The ALL CAPS comments below refer to the pre-packaged OWASP Sanitizers
    which translate to the code defined here, but with some customizations
    such as percentages allowed in <img> border, height, and width.

    For performance, because most forms have multiple fields, the policy is
    built only once per Request and cached in Request.SafeHtmlPolicy.
    Application scope is slightly less performant and has risk of locking,
    but try it again after Lucee 6?
    */
    var Pattern = "";

    if (!Request.keyExists("SafeHtmlPolicy")) {
        // Only needed while building the policy, so it is created here
        // rather than on every call.
        Pattern = CreateObject("java", "java.util.regex.Pattern");

        Request.SafeHtmlPolicy = Request.objHtmlPolicyBuilder.init()
            .allowCommonBlockElements() // BLOCKS
            .allowCommonInlineFormattingElements() // FORMATTING
            // LINKS
            .allowStandardUrlProtocols()
            .allowElements(["a"])
            .allowAttributes(["href", "target"]).onElements(["a"]).requireRelNofollowOnLinks()
            .allowStyling() // STYLES
            .allowElements(["img"]) // IMAGES
            .allowAttributes(["alt", "src"]).onElements(["img"])
            // digits and percent signs only, e.g. 100 or 50%
            .allowAttributes(["border", "height", "width"]).matching(Pattern.compile("[0-9%]+")).onElements(["img"])
            // TABLES
            .allowElements(["table", "tr", "td", "th", "colgroup", "caption", "col", "thead", "tbody", "tfoot"])
            .allowAttributes(["summary"]).onElements(["table"])
            .allowAttributes(["align"]).matching(true, ["center", "left", "right", "justify"]).globally() // true = ignoreCase
            .allowAttributes(["valign"]).matching(true, ["top", "middle", "bottom", "baseline"])
            .onElements(["table", "tr", "td", "th", "colgroup", "col", "thead", "tbody", "tfoot"])
            .allowTextIn(["table"])
            // OTHER
            .allowAttributes(["class", "title"]).globally()
            .allowAttributes(["lang"]).matching(Pattern.compile("[a-zA-Z]{2,20}")).globally()
            .toFactory()
        ;
    }

    return str.SanitizeHTML(Request.SafeHtmlPolicy);
}
function StripHTML(required string str) {
    /*
    Returns Arguments.str with all HTML removed, as plain text.

    This is useful for keeping XSS out of databases, for example.
    Of course encoding all output is still necessary, but for validating
    input that's intended to be non-HTML, cleaning prevents potentially
    ugly output that's confusing to users, and also stops XSS before it's
    potentially uploaded via API, etc.

    The policy is built once per Request and cached in
    Request.StripHtmlPolicy.
    */
    var plainText = "";

    // A policy with nothing allowed: the default removes ALL HTML.
    if (!StructKeyExists(Request, "StripHtmlPolicy")) {
        Request.StripHtmlPolicy = Request.objHtmlPolicyBuilder.init().toFactory();
    }

    // Escape literal percent signs before canonicalizing; workaround for
    // https://dev.lucee.org/t/canonicalize-illegal-hex-characters-in-escape-pattern/10675
    plainText = Canonicalize(Replace(str, "%", "%25", "ALL"), false, false);

    plainText = SanitizeHtml(plainText, Request.StripHtmlPolicy);

    // DecodeForHTML reverts characters @, &, <, >, etc?
    // from HTML entities back to plain text
    return DecodeForHTML(plainText);
}
function StripAndEncodeForHTML(required string str) {
    /*
    Returns Arguments.str passed through StripHTML() and then
    EncodeForHTML(), for output of potentially unsafe data (e.g. XSS code).

    Stripping first avoids confusing users with the display of bare code;
    encoding afterwards is one more security measure in case StripHTML()
    missed anything.

    The only time stripping is not appropriate is when the content is
    intended to be a code block, in which case just use EncodeForHTML().
    */
    var stripped = StripHTML(str);
    return EncodeForHTML(stripped);
}