Word HTML Sanitizer is an online tool for cleaning HTML tags when copying text from Word and other text processors to your CMS.
http://www.igorware.com/tools/word-html-sanitizer
http://www.igorware.com/tools/word-html-sanitizer
JavaScript:
//developed by Igor Jerosimić
<!DOCTYPE html>
<html itemscope itemtype="http://schema.org/ItemPage" lang="en">
<head>
<meta charset="utf-8">
<!-- Page metadata: viewport, title, and descriptive/authorship tags. -->
<meta name="viewport" content="width=device-width">
<title>Word HTML Sanitizer</title>
<meta name="application-name" content="IgorWare">
<meta property="og:title" content="Word HTML Sanitizer">
<meta name="keywords" content="html sanitizer,word cleaning lady,word html,html cleanup,html tidy">
<meta name="description" content="Free online tool for cleaning HTML tags when copying text from Word to your CMS.">
<meta name="author" content="Igor Jerosimić">
<meta property="dc.creator" content="Igor Jerosimić">
<style>
/* Strip that holds the tab selectors; the negative margin pulls it above its .border box. */
.tabline {
  margin-top: -27px;
  overflow: hidden;
}

/* A single tab selector. */
.tabselector {
  float: right;
  margin: 0 5px 0 0;
  padding: 2px 7px;
  cursor: pointer;
  color: #000;
  background-color: #e9e9e9;
  border-color: #ccc #ccc #e9e9e9;
  border-width: 1px;
  border-style: solid;
  border-top-right-radius: 3px;
  border-top-left-radius: 3px;
}

/* Modifier toggled by the script (tab_switch / preview_switch). */
.tabselector_active {
  color: #999;
  background-color: #f1f1f1;
  border-color: #ccc;
}

/* Frame around each editor/preview pane. */
.border {
  clear: both;
  height: 200px;
  border: 1px solid #ccc;
  border-radius: 3px;
}

/* The panes draw no outline of their own (outline:none below), so show
   keyboard focus on the surrounding frame instead. */
.border:focus-within {
  border-color: #39f;
  box-shadow: 0 0 0 2px rgba(51, 153, 255, 0.35);
}

/* Real textareas and the div-based panes that imitate them. */
textarea,
.textarea {
  clear: both;
  margin: 0;
  padding: 2px;
  width: 100%;
  height: 200px;
  overflow-x: hidden;
  overflow-y: auto;
  border: none;
  outline: none;
  border-top: 7px solid #e9e9e9;
  -moz-box-sizing: border-box;
  box-sizing: border-box;
}

/* Result counters written by the script. */
#space_saved,
#removed_tags {
  color: blue;
}

#warning {
  color: red;
}
</style>
</head>
<body>
<!-- Short description of the tool (exposed as the schema.org "description" item property). -->
<div itemprop="description" class="appdescription">
  <strong>Word HTML Sanitizer</strong> is an online tool for cleaning HTML tags when copying text from Word and other text processors to your CMS.
</div>
<!-- Script-driven UI; hidden through the .jsrequired rule inside <noscript> when scripting is off. -->
<div class="jsrequired">
  <label for="input">Input</label>
  <div class="border">
    <div class="tabline">
      <div id="tabselector_input" class="tabselector tabselector_active" onclick="tab_switch('input');">HTML</div>
      <div id="tabselector_inputdiv" class="tabselector" onclick="tab_switch('inputdiv');">Visual</div>
    </div>
    <div id="inputdiv" class="textarea" contenteditable="true" onpaste="handlepaste(this, event)" onkeyup="inputdiv_change();"></div>
    <textarea id="input" style="display:none;" autofocus="autofocus" onkeyup="input_change();"></textarea>
  </div>
  <label for="output">Output</label>
  <div class="border">
    <div class="tabline">
      <div id="tabselector_preview" class="tabselector tabselector_active" onclick="preview_switch();" title="View preview">Preview</div>
    </div>
    <textarea id="output" onchange="output_change();"></textarea>
    <div id="preview" class="textarea" style="display:none;"></div>
  </div>
  <!-- #space_saved, #removed_tags and #warning are filled in by run() / remove_and_clean(). -->
  Saved space: <strong id="space_saved">0</strong> bytes
  <input id="run" name="run" type="button" style="float:right;" value="Clean It" onclick="run();" />
  <input id="runauto" name="runauto" type="checkbox" style="float:right;" checked="checked" />
  <label for="runauto" style="float:right;">Auto Clean on paste</label>
  <div>Excluded tags:</div>
  <strong id="removed_tags"></strong>
  <strong id="warning"></strong>
  <hr />
  Word HTML Sanitizer <strong>v2013-03-21</strong>
</div>
<noscript>
  <!-- Rendered only when scripting is disabled: hide the script-driven UI and say why. -->
  <style>.jsrequired { display: none !important; }</style>
  NOTE: You need to turn on javascript for this tool to work!
</noscript>
<script>
/**
 * Cleans the HTML held in the #input textarea and publishes the result.
 *
 * Reads:  #input (value).
 * Writes: #output (value), #preview (innerHTML), #space_saved (innerHTML);
 *         on empty input also clears #removed_tags and #warning and focuses #input.
 * Tag filtering itself is delegated to remove_and_clean().
 *
 * NOTE(review): in the copy of this file that was reviewed, several regex and
 * string literals had lost their markup (<em>, <strong>, <p>, &nbsp;) and were
 * split across lines. They are reconstructed here from the surviving closing
 * tags and comments; lines marked "reconstructed" should be confirmed against
 * the upstream source.
 */
function run()
{
    var input_el = document.getElementById("input");
    var hinput = input_el.value;
    if (hinput === "")
    {
        // nothing to clean: reset every result field
        document.getElementById("output").value = "";
        document.getElementById("removed_tags").innerHTML = "";
        document.getElementById("warning").innerHTML = "";
        document.getElementById("space_saved").innerHTML = "0";
        input_el.focus();
        return;
    }
    var input_len = hinput.length;

    // turn line breaks and tabs into spaces; it must be a space, otherwise words end up glued together
    hinput = hinput.replace(/\n/g, " ");
    hinput = hinput.replace(/\t/g, " ");
    // drop comments
    hinput = hinput.replace(/<!--.*?-->/g, " ");
    // convert nbsp (entity or literal U+00A0) to a plain space -- reconstructed
    hinput = hinput.replace(/&nbsp;|\u00a0/g, " ");
    // collapse repeated spaces
    hinput = hinput.replace(/ +/g, " ");

    // delete disallowed tags and strip attributes from the allowed ones
    hinput = remove_and_clean(hinput);

    // drop empty <em>/<strong>; keep a <br /> if it was their only content -- reconstructed
    hinput = hinput.replace(/<em>( *(?:<br \/>)? *)<\/em>/gi, "$1");
    hinput = hinput.replace(/<strong>( *(?:<br \/>)? *)<\/strong>/gi, "$1");
    // merge adjacent runs: "</em> <em>" and "</strong> <strong>" -- reconstructed
    hinput = hinput.replace(/<\/em>( *?)<em>/gi, "$1");
    hinput = hinput.replace(/<\/strong>( *?)<strong>/gi, "$1");
    hinput = hinput.replace(/<\/em>( *?)<em>/gi, "$1"); // once more, after the <strong> merge

    // collapse a tag nested directly inside the same tag (never ul/li):
    // each pass removes one inner opening tag and the first matching closing tag
    var nested = /<((?:(?!\/|ul|li).)*?)>((?:(?!<\/?\1>).)*?)<\1>((?:(?!<\1>).)*?)<\/\1>((?:(?!<\1>).)*?)/i;
    while (nested.test(hinput))
    {
        hinput = hinput.replace(nested, "<$1>$2$3$4");
    }

    // collapse repeated spaces
    hinput = hinput.replace(/ +/g, " ");
    // collapse runs of empty paragraphs into a single one
    // NOTE(review): reconstructed; the original replacement may have been "<p>&nbsp;</p>"
    hinput = hinput.replace(/(?: *<p> *<\/p> *)+/gi, "<p></p>");
    // drop leading empty paragraphs / line breaks -- reconstructed
    hinput = hinput.replace(/^(?: *<p> ?<\/p> *)+/gi, " ");
    hinput = hinput.replace(/^(?: *<br \/>)+/gi, " ");
    // drop trailing empty paragraphs / line breaks -- reconstructed
    hinput = hinput.replace(/(?: *<p> ?<\/p> *)+$/gi, "");
    hinput = hinput.replace(/(?: *<br \/> *)+$/gi, "");

    // beautify: start a new line before every opening tag except em/strong/sub/sup
    hinput = hinput.replace(/<(?!\/|em|strong|sub|sup)/gi, "\n<");
    // tidy spaces around the inserted line breaks (global: the original only fixed the first one)
    hinput = hinput.replace(/( ?\n ?)+/g, "\n");
    hinput = hinput.replace(/^\n/, "");

    // publish the result
    document.getElementById("output").value = hinput;
    document.getElementById("preview").innerHTML = hinput;
    // report how many characters were saved
    document.getElementById("space_saved").innerHTML = (input_len - hinput.length);
}
/**
 * Removes every tag that is not on the allow-list and strips attributes from
 * the allowed ones (a keeps href/title/name, img keeps src/width/height/alt/title,
 * col keeps width; b/i become strong/em).
 *
 * Side effects: writes the per-tag removal counts to #removed_tags and the
 * "check <a>/<img>/<col>" reminders to #warning (both via innerHTML).
 *
 * NOTE(review): the "<br />" and "&lt;…&gt;" string literals were missing from
 * the reviewed copy of this file and are reconstructed here; confirm against
 * the upstream source.
 *
 * @param {string} hinput HTML to filter.
 * @returns {string} The filtered HTML.
 */
function remove_and_clean(hinput)
{
    // allowed tags; "h" stands for h1, h2, h3... (the name pattern below does not capture digits)
    var allowed = ["a", "p", "h", "br", "ul", "ol", "li", "strong", "b", "em", "i", "img", "table", "thead", "tbody", "colgroup", "col", "th", "tr", "td", "sub", "sup"];
    var warning_tags = {"a":{count:0}, "col":{count:0}, "img":{count:0}};
    var removed_tags = [];
    var i, poz1 = 0, poz2 = 0, htmlelement, newhtmlelement, name, len, newstr;

    // attribute value patterns: quoted with either quote type, or an unquoted fallback
    var ANY_VALUE = "(?:(\"|')(.*?)\\1|((?:[^\"' \\/>].)*))";
    var DIGIT_VALUE = "(?:(\"|')(\\d+)\\1|([^\"' \\/>]\\d+))";

    // Returns ' attr_name="value"' taken from `tag`, or "" when the attribute is absent.
    function attr(tag, attr_name, value_pattern)
    {
        if (!new RegExp(attr_name + "=" + value_pattern, "i").test(tag))
        {
            return "";
        }
        return tag.replace(new RegExp("^.*" + attr_name + "=" + value_pattern + ".*$", "i"), " " + attr_name + "=\"$2$3\"");
    }

    // Escapes text before it is written through innerHTML.
    function escape_html(text)
    {
        return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
    }

    while (true)
    {
        // find the start of the next tag
        poz1 = hinput.indexOf("<", poz2);
        if (poz1 === -1) break;
        // find its end
        poz2 = hinput.indexOf(">", poz1);
        if (poz2 === -1) break;
        // the closing '>' belongs to the tag as well
        poz2 += 1;

        htmlelement = hinput.substring(poz1, poz2);
        // the whole tag is deliberately not lowercased: that would mangle alt/title values on images
        name = htmlelement.replace(/^< *\/? *([a-zA-Z:]+) *?.*?\/? *?>$/g, "$1").toLowerCase();

        // remember the original length so the scan position can be corrected afterwards
        len = htmlelement.length;
        newhtmlelement = hinput.substring(0, poz1);

        if (allowed.indexOf(name) !== -1)
        {
            // allowed tag: rebuild it without unwanted attributes
            switch (name)
            {
                case "br":
                    newstr = "<br />";
                    break;
                case "a":
                    if (htmlelement.replace(/^< *(\/?) *a.*?>$/gi, "$1") !== "/")
                    {
                        newstr = "<a"
                            + attr(htmlelement, "href", ANY_VALUE)
                            + attr(htmlelement, "title", ANY_VALUE)
                            + attr(htmlelement, "name", ANY_VALUE)
                            + ">";
                        warning_tags.a.count++;
                    }
                    else
                    {
                        newstr = "</a>";
                    }
                    break;
                case "img":
                    // attr() yields "" when src is missing, so the rebuilt tag stays well-formed
                    // (previously the untouched original tag was used as the prefix in that case)
                    newstr = "<img"
                        + attr(htmlelement, "src", ANY_VALUE)
                        + attr(htmlelement, "width", DIGIT_VALUE)
                        + attr(htmlelement, "height", DIGIT_VALUE)
                        + attr(htmlelement, "alt", ANY_VALUE)
                        + attr(htmlelement, "title", ANY_VALUE)
                        + " />";
                    warning_tags.img.count++;
                    break;
                case "b":
                    newstr = htmlelement.replace(/^< *(\/?) *b.*?>$/gi, "<$1strong>");
                    break;
                case "i":
                    newstr = htmlelement.replace(/^< *(\/?) *i.*?>$/gi, "<$1em>");
                    break;
                case "col":
                    newstr = htmlelement.replace(/^< *(\/?) *col.*?>$/gi, "<col")
                        + attr(htmlelement, "width", DIGIT_VALUE)
                        + " />";
                    warning_tags.col.count++;
                    break;
                case "h":
                    newstr = htmlelement.replace(/^< *(\/?) *h(\d).*?>$/gi, "<$1h$2>");
                    break;
                default:
                    newstr = htmlelement.replace(/^< *(\/?) *([a-zA-Z]+).*?( \/)?>$/g, "<$1$2$3>");
                    newstr = newstr.toLowerCase();
            }
            len -= newstr.length;
            newhtmlelement += newstr;
        }
        else
        {
            // disallowed tag: leave it out of the output and count it
            name = htmlelement.replace(/^< *\/? *([a-z0-9:]+) *?.*?\/?>$/gi, "$1").toLowerCase();
            // NOTE: comments are removed before this function runs, but markers such as
            // <![if !vml]> still arrive here. Empty names, closing tags and "<!" markers are not counted.
            if (name !== "" && htmlelement[1] !== "/" && name[1] !== "!")
            {
                var is_found_removed = false;
                for (i = 0; i < removed_tags.length; i++)
                {
                    if (removed_tags[i].tag === name)
                    {
                        removed_tags[i].count++;
                        is_found_removed = true;
                        break;
                    }
                }
                if (!is_found_removed)
                {
                    removed_tags.push({"tag":name, "count":1});
                }
            }
        }

        newhtmlelement += hinput.substring(poz2);
        hinput = newhtmlelement;
        // the string changed, so move the scan position back by the number of characters removed
        poz2 -= len;
    }

    var removed_tags_out = "";
    for (i = 0; i < removed_tags.length; i++)
    {
        // when the name pattern did not match, `tag` holds the raw tag text, so escape it
        removed_tags_out += escape_html(removed_tags[i].tag) + " - " + removed_tags[i].count + "<br />";
    }
    document.getElementById("removed_tags").innerHTML = removed_tags_out;

    var warning_tags_out = "";
    if (warning_tags.a.count) warning_tags_out += "<br />check &lt;a&gt;" + " - " + warning_tags.a.count;
    if (warning_tags.img.count) warning_tags_out += "<br />check &lt;img&gt;" + " - " + warning_tags.img.count;
    if (warning_tags.col.count) warning_tags_out += "<br />check &lt;col&gt;" + " - " + warning_tags.col.count;
    document.getElementById("warning").innerHTML = warning_tags_out;

    return hinput;
}
/**
 * Paste handler (wired through the onpaste attribute of #inputdiv).
 *
 * When #inputdiv is not explicitly displayed as "block" and the event exposes
 * clipboardData, the clipboard content is read directly, copied into both the
 * #input textarea and #inputdiv, and the default paste is cancelled.
 * Otherwise the element is emptied, the browser performs the paste, and
 * waitforpastedata() picks the result up afterwards.
 *
 * @param {Element} elem Element that received the paste.
 * @param {Event} e The paste event.
 * @returns {boolean} false when the paste was handled here, true when the
 *                    browser should carry it out.
 */
function handlepaste(elem, e)
{
    var savedcontent = elem.innerHTML;
    var visual_shown = document.getElementById("inputdiv").style.display === "block";
    var clipboard = e && e.clipboardData;

    if (!visual_shown && clipboard && clipboard.getData)
    {
        // `types` may be an array or an array-like list, so search it by index
        // instead of relying on how it converts to a string
        var types = clipboard.types || [];
        var has_type = function (mime)
        {
            return Array.prototype.indexOf.call(types, mime) !== -1;
        };

        var pasted = "";
        if (has_type("text/html"))
        {
            pasted = clipboard.getData("text/html");
        }
        else if (has_type("text/plain"))
        {
            pasted = clipboard.getData("text/plain");
        }

        elem.value = pasted;
        // run() reads #input, so it has to receive the pasted data too;
        // previously only `elem` and #inputdiv were updated here
        document.getElementById("input").value = pasted;
        document.getElementById("inputdiv").innerHTML = pasted;
        if (document.getElementById("runauto").checked) run();
        if (e.preventDefault)
        {
            e.stopPropagation();
            e.preventDefault();
        }
        return false;
    }

    // everything else: empty the element, let the browser paste into it, then collect the result
    elem.innerHTML = "";
    waitforpastedata(elem, savedcontent);
    return true;
}
/**
 * Polls `elem` every 20 ms until it has child nodes, then hands over to
 * processpaste().
 *
 * The number of polls is bounded: if nothing shows up, the previously saved
 * content is put back and polling stops. Without the bound, a paste that
 * inserted nothing left the element empty and the timer running indefinitely.
 *
 * @param {Element} elem Element the browser pastes into.
 * @param {string} savedcontent innerHTML of `elem` from before it was emptied.
 * @param {number} [retries=100] Remaining polls (100 x 20 ms is about two seconds).
 */
function waitforpastedata(elem, savedcontent, retries)
{
    if (retries === undefined)
    {
        retries = 100;
    }

    if (elem.childNodes && elem.childNodes.length > 0)
    {
        processpaste(elem, savedcontent);
        return;
    }

    if (retries <= 0)
    {
        // nothing was pasted: restore what was there before
        elem.innerHTML = savedcontent;
        return;
    }

    setTimeout(function ()
    {
        waitforpastedata(elem, savedcontent, retries - 1);
    }, 20);
}
/**
 * Collects what the browser pasted into `elem`, restores the element's
 * previous content, copies the pasted HTML into #input and #inputdiv, and
 * runs the cleaner when the #runauto checkbox is checked.
 *
 * @param {Element} elem Element that received the paste.
 * @param {string} savedcontent innerHTML to restore on `elem`.
 */
function processpaste(elem, savedcontent)
{
    var pasted_html = elem.innerHTML;
    // the DOM could be walked here instead (elem.childNodes or elem.getElementsByTagName)
    elem.innerHTML = savedcontent;

    var html_box = document.getElementById("input");
    html_box.value = pasted_html;

    var visual_box = document.getElementById("inputdiv");
    visual_box.innerHTML = pasted_html;

    var auto_clean = document.getElementById("runauto");
    if (auto_clean.checked)
    {
        run();
    }
}
/**
 * Copies the markup of #inputdiv into the #input textarea, then runs the
 * cleaner when the #runauto checkbox is checked.
 */
function inputdiv_change()
{
    var visual_box = document.getElementById("inputdiv");
    var html_box = document.getElementById("input");
    html_box.value = visual_box.innerHTML;

    var auto_clean = document.getElementById("runauto");
    if (auto_clean.checked)
    {
        run();
    }
}
/**
 * Copies the text of the #input textarea into #inputdiv as markup, then runs
 * the cleaner when the #runauto checkbox is checked.
 */
function input_change()
{
    var html_box = document.getElementById("input");
    var visual_box = document.getElementById("inputdiv");
    visual_box.innerHTML = html_box.value;

    var auto_clean = document.getElementById("runauto");
    if (auto_clean.checked)
    {
        run();
    }
}
/**
 * Mirrors the current value of the #output textarea into #preview as markup.
 */
function output_change()
{
    var output_box = document.getElementById("output");
    var preview_box = document.getElementById("preview");
    preview_box.innerHTML = output_box.value;
}
/**
 * Switches the input area between its two panes.
 *
 * "inputdiv" shows #inputdiv and hides #input; "input" does the opposite.
 * The selector of the pane being shown gets the plain "tabselector" class and
 * the other selector gets "tabselector tabselector_active".
 * Any other id is ignored.
 *
 * @param {string} id "inputdiv" or "input".
 */
function tab_switch(id)
{
    var show_visual;
    if (id === "inputdiv")
    {
        show_visual = true;
    }
    else if (id === "input")
    {
        show_visual = false;
    }
    else
    {
        return;
    }

    var plain = "tabselector";
    var marked = "tabselector tabselector_active";

    document.getElementById("inputdiv").style.display = show_visual ? "block" : "none";
    document.getElementById("input").style.display = show_visual ? "none" : "block";
    document.getElementById("tabselector_inputdiv").className = show_visual ? plain : marked;
    document.getElementById("tabselector_input").className = show_visual ? marked : plain;
}
/**
 * Toggles the output area between the #output textarea and the rendered
 * #preview pane, updating the class and tooltip of #tabselector_preview.
 */
function preview_switch()
{
    var preview_box = document.getElementById("preview");
    var output_box = document.getElementById("output");
    var selector = document.getElementById("tabselector_preview");

    // the preview counts as shown only when its inline display is exactly "block"
    var preview_shown = preview_box.style.display === "block";

    if (preview_shown)
    {
        preview_box.style.display = "none";
        output_box.style.display = "block";
        selector.className = "tabselector tabselector_active";
        selector.title = "View Preview";
        return;
    }

    preview_box.style.display = "block";
    output_box.style.display = "none";
    selector.className = "tabselector";
    selector.title = "Hide Preview";
}
</script>
</body>
</html>