[webapp] Word HTML Cleaner

Costas

Administrator
Staff member
Word HTML Sanitizer is an online tool for cleaning up HTML tags when copying text from Word and other word processors into your CMS.

http://www.igorware.com/tools/word-html-sanitizer

JavaScript:
//developed by Igor Jerosimić
<!DOCTYPE html>
<html itemscope itemtype="http://schema.org/ItemPage" lang="en">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width" />

<title>Word HTML Sanitizer</title>
<meta name="application-name" content="IgorWare" />
<meta property="og:title" content="Word HTML Sanitizer" />
<meta name="keywords" content="html sanitizer,word cleaning lady,word html,html cleanup,html tidy" />
<meta name="description" content="Free online tool for cleaning HTML tags when copying text from Word to your CMS." />
<meta name="author" content="Igor Jerosimić" />
<meta property="dc.creator" content="Igor Jerosimić" />


<style>
/* Row that holds the tab selectors; the negative top margin shifts it 27px upwards. */
.tabline{margin-top:-27px;overflow:hidden;}
/* A single tab. Tabs are floated right; the bottom border uses the same colour as the tab background. */
.tabselector
{
	float:right;
	margin:0 5px 0 0;
	padding:2px 7px;
	cursor:pointer;
	color:#000;
	background-color:#e9e9e9;
	border-color:#ccc #ccc #e9e9e9;
	border-width:1px;
	border-style:solid;
	border-top-right-radius:3px;border-top-left-radius:3px;
}
/*
 * Dimmed tab variant. Despite the name, tab_switch() and preview_switch()
 * put this class on the tab whose view is currently hidden and remove it
 * from the tab whose view is being shown.
 */
.tabselector_active
{
	color:#999;
	background-color:#f1f1f1;
	border-color:#ccc;
}
/* Fixed-height bordered box that wraps a tab line and its editing/preview areas. */
.border
{
	clear:both;
	height:200px;
	border:1px solid #ccc;
	border-radius:3px;
}
/*
 * Shared look for real <textarea> elements and for the <div class="textarea">
 * elements (#inputdiv, #preview). The 7px top border has the same colour as
 * the .tabselector background.
 * NOTE(review): outline:none removes the focus outline and no replacement
 * focus style is defined here.
 */
textarea,
.textarea
{
	clear:both;
	margin:0;
	padding:2px;
	width:100%;
	height:200px;
	overflow-x:hidden;
	overflow-y:auto;
	border:none;
	outline:none;
	border-top:7px solid #e9e9e9;
	-moz-box-sizing:border-box;box-sizing:border-box;
}
/* Result fields; the script looks these ids up with getElementById and fills them. */
#space_saved,#removed_tags{color:blue;}
#warning{color:red;}
</style>

</head>
<body>



<div itemprop="description" class="appdescription">
	<strong>Word HTML Sanitizer</strong> is an online tool for cleaning HTML tags when copying text from Word and other text processors to your CMS.
</div>

<!--
	Main tool UI (hidden by the noscript rule when JavaScript is off).
	The elements #space_saved, #removed_tags and #warning are written to by
	run() and remove_and_clean(); they must exist or those functions throw.
-->
<div class="jsrequired">
	<label for="input">Input</label>
	<div class="border">
		<div class="tabline">
			<div id="tabselector_input" class="tabselector tabselector_active" onclick="tab_switch('input');">HTML</div>
			<div id="tabselector_inputdiv" class="tabselector" onclick="tab_switch('inputdiv');">Visual</div>
		</div>
		<div id="inputdiv" class="textarea" contenteditable="true" onpaste="handlepaste(this, event)" onkeyup="inputdiv_change();"></div>
		<textarea id="input" style="display:none;" autofocus="autofocus" onkeyup="input_change();"></textarea>
	</div>

	<label for="output">Output</label>
	<div class="border">
		<div class="tabline">
			<div id="tabselector_preview" class="tabselector tabselector_active" onclick="preview_switch();" title="View preview">Preview</div>
		</div>
		<textarea id="output" onchange="output_change();"></textarea>
		<div id="preview" class="textarea" style="display:none;"></div>
	</div>

	<p>Saved space: <strong id="space_saved">0</strong> bytes</p>

	<input id="run" name="run" type="button" style="float:right;" value="Clean It" onclick="run();" />
	<input id="runauto" name="runauto" type="checkbox" style="float:right;" checked="checked" />
	<label for="runauto" style="float:right;">Auto Clean on paste</label>

	<p>Excluded tags:<br />
	<strong id="removed_tags"></strong></p>

	<p><strong id="warning"></strong></p>

	<hr />
	<p>Word HTML Sanitizer <strong>v2013-03-21</strong></p>
</div>
<noscript>
<style>.jsrequired{display:none!important;}</style>

NOTE: You need to turn on javascript for this tool to work!

</noscript>

<script>

/**
 * Cleans the markup held in the #input textarea and publishes the result.
 *
 * Steps: normalise whitespace, strip HTML comments, hand the text to
 * remove_and_clean() for tag filtering, then collapse redundant <em>/<strong>
 * pairs, directly nested duplicate tags and empty paragraphs, and finally put
 * every non-inline opening tag on its own line.
 *
 * Side effects: writes the cleaned markup to #output (value) and #preview
 * (innerHTML) and the difference in string length to #space_saved. With an
 * empty input it resets #output, #removed_tags, #warning and #space_saved and
 * focuses #input instead.
 *
 * Takes no arguments and returns nothing.
 */
function run()
{
	var hinput = document.getElementById("input").value;
	if (hinput === "")
	{
		document.getElementById("output").value = "";
		document.getElementById("removed_tags").innerHTML = "";
		document.getElementById("warning").innerHTML = "";
		document.getElementById("space_saved").innerHTML = "0";
		document.getElementById("input").focus();
		return;
	}

	var input_len = hinput.length;

	// turn line breaks and tabs into spaces; it has to be a space, otherwise words would end up glued together
	hinput = hinput.replace(/\n/g, " ");
	hinput = hinput.replace(/\t/g, " ");

	// strip comments
	hinput = hinput.replace(/<!--.*?-->/g, " ");

	// convert non-breaking spaces (entity or literal character) into plain spaces
	hinput = hinput.replace(/&nbsp;|\u00A0/g, " ");

	// collapse runs of spaces
	hinput = hinput.replace(/ +/g, " ");

	// drop disallowed tags and strip attributes from the allowed ones
	hinput = remove_and_clean(hinput);

	// drop empty <em>/<strong>, keeping only a <br /> when that is the sole content
	hinput = hinput.replace(/<em>( *(?:<br \/>)? *)<\/em>/gi, "$1");
	hinput = hinput.replace(/<strong>( *(?:<br \/>)? *)<\/strong>/gi, "$1");

	// merge adjacent runs: "</em> <em>" and "</strong> <strong>" collapse to the whitespace between them
	hinput = hinput.replace(/<\/em>( *?)<em>/gi, "$1");
	hinput = hinput.replace(/<\/strong>( *?)<strong>/gi, "$1");
	hinput = hinput.replace(/<\/em>( *?)<em>/gi, "$1");	// once more, after the strong merge

	// Remove a tag that is opened again inside itself (e.g. <p><p>x</p></p>),
	// one inner pair per pass until none is left. List tags are excluded
	// because lists nest legitimately; "ol" is excluded alongside "ul"/"li"
	// so nested ordered lists are not flattened.
	var nested = /<((?:(?!\/|ul|ol|li).)*?)>((?:(?!<\/?\1>).)*?)<\1>((?:(?!<\1>).)*?)<\/\1>((?:(?!<\1>).)*?)/i;
	while (nested.test(hinput))
	{
		hinput = hinput.replace(nested, "<$1>$2$3$4");
	}

	// collapse runs of spaces again
	hinput = hinput.replace(/ +/g, " ");

	// collapse runs of empty paragraphs into a single one
	hinput = hinput.replace(/(?: *<p> *<\/p> *)+/gi, "<p> </p>");

	// drop empty paragraphs and line breaks from the start
	hinput = hinput.replace(/^(?: *<p> ?<\/p> *)+/gi, " ");
	hinput = hinput.replace(/^(?: *<br \/>)+/gi, " ");

	// drop empty paragraphs and line breaks from the end
	hinput = hinput.replace(/(?: *<p> ?<\/p> *)+$/gi, "");
	hinput = hinput.replace(/(?: *<br \/> *)+$/gi, "");

	// beautify: every opening tag except the inline ones starts a new line
	hinput = hinput.replace(/<(?!\/|em|strong|sub|sup)/gi, "\n<");
	// trim spaces around every line break (needs the g flag, otherwise only the first one is touched)
	hinput = hinput.replace(/( ?\n ?)+/g, "\n");
	hinput = hinput.replace(/^\n/, "");

	// publish the result
	var houtput = document.getElementById("output");
	houtput.value = hinput;
	document.getElementById("preview").innerHTML = hinput;

	// report how much shorter the output is
	var output_len = hinput.length;
	document.getElementById("space_saved").innerHTML = (input_len - output_len);
}

/**
 * Builds a normalised ` attr="value"` fragment from a raw tag.
 *
 * Accepts double-quoted, single-quoted and unquoted values. Any double quote
 * inside the value is escaped so it cannot terminate the rebuilt attribute.
 *
 * @param {string} htmlelement raw tag text, from "<" to ">"
 * @param {string} attr attribute name to look for (matched case-insensitively)
 * @param {boolean} [numeric] when true only digit values are accepted
 * @returns {string} the rebuilt attribute with a leading space, or "" when the
 *                   attribute is absent or its value is not acceptable
 */
function clean_attribute(htmlelement, attr, numeric)
{
	var quoted = numeric ? "(\\d+)" : "(.*?)";
	var bare = numeric ? "([^\"' \\/>]\\d+)" : "((?:[^\"' \\/>].)*)";
	var match = new RegExp("^.*" + attr + "=(?:(\"|')" + quoted + "\\1|" + bare + ").*$", "i").exec(htmlelement);
	if (!match) return "";

	// group 2 holds a quoted value, group 3 an unquoted one; only one of them is set
	var value = (match[2] || "") + (match[3] || "");
	return " " + attr + "=\"" + value.replace(/"/g, "&quot;") + "\"";
}

/**
 * Removes disallowed tags and strips attributes from the allowed ones.
 *
 * Walks the text tag by tag ("<" up to the next ">"). Tags outside the allow
 * list are deleted (their text content stays). Allowed tags are rewritten in
 * lowercase without attributes, except:
 *   - <a> keeps href, title and name,
 *   - <img> keeps src, width, height, alt and title,
 *   - <col> keeps width,
 *   - <b>/<i> become <strong>/<em>,
 *   - <br> becomes "<br />".
 *
 * Side effects: writes the per-tag removal counts to #removed_tags and the
 * "check <a>/<img>/<col>" reminders to #warning.
 *
 * @param {string} hinput markup to clean
 * @returns {string} the cleaned markup
 */
function remove_and_clean(hinput)
{
	// allowed tags; "h" stands for h1, h2, h3...
	var allowed = ["a", "p", "h", "br", "ul", "ol", "li", "strong", "b", "em", "i", "img", "table", "thead", "tbody", "colgroup", "col", "th", "tr", "td", "sub", "sup"];

	var i, poz1 = 0, poz2 = 0, htmlelement, newstr, name;
	var warning_tags = {"a":{count:0}, "col":{count:0}, "img":{count:0}}, removed_tags = [];

	while (true)
	{
		// find the start of the next tag
		poz1 = hinput.indexOf("<", poz2);
		if (poz1 === -1) break;

		// find the end of that tag
		poz2 = hinput.indexOf(">", poz1);
		if (poz2 === -1) break;

		// include the closing ">" itself
		poz2 += 1;

		// take the tag; it is not lowercased here because that would ruin alt and title texts
		htmlelement = hinput.substring(poz1, poz2);

		// take the tag name
		name = htmlelement.replace(/^< *\/? *([a-zA-Z:]+) *?.*?\/? *?>$/g, "$1").toLowerCase();

		if (allowed.indexOf(name) !== -1)
		{
			// allowed tag: rebuild it
			switch (name)
			{
			case "br":
				newstr = "<br />";
				break;
			case "a":
				if (htmlelement.replace(/^< *(\/?) *a.*?>$/gi, "$1") !== "/")
				{
					newstr = "<a"
						+ clean_attribute(htmlelement, "href")
						+ clean_attribute(htmlelement, "title")
						+ clean_attribute(htmlelement, "name")
						+ ">";

					warning_tags.a.count++;
				}
				else
				{
					newstr = "</a>";
				}
				break;
			case "img":
				newstr = "<img"
					+ clean_attribute(htmlelement, "src")
					+ clean_attribute(htmlelement, "width", true)
					+ clean_attribute(htmlelement, "height", true)
					+ clean_attribute(htmlelement, "alt")
					+ clean_attribute(htmlelement, "title")
					+ " />";
				warning_tags.img.count++;
				break;
			case "b":
				newstr = htmlelement.replace(/^< *(\/?) *b.*?>$/gi, "<$1strong>");
				break;
			case "i":
				newstr = htmlelement.replace(/^< *(\/?) *i.*?>$/gi, "<$1em>");
				break;
			case "col":
				newstr = "<col" + clean_attribute(htmlelement, "width", true) + " />";
				warning_tags.col.count++;
				break;
			case "h":
				newstr = htmlelement.replace(/^< *(\/?) *h(\d).*?>$/gi, "<$1h$2>");
				break;
			default:
				newstr = htmlelement.replace(/^< *(\/?) *([a-zA-Z]+).*?( \/)?>$/g, "<$1$2$3>");
				newstr = newstr.toLowerCase();
			}
		}
		else
		{
			// disallowed tag: delete it and count it
			newstr = "";
			name = htmlelement.replace(/^< *\/? *([a-z0-9:]+) *?.*?\/?>$/gi, "$1").toLowerCase();

			// NOTE: comments were stripped earlier, but tags such as <![if !vml]> still show up here.
			// Empty names, closing tags and those "<!" tags are not counted.
			if (name !== "" && htmlelement[1] !== "/" && name[1] !== "!")
			{
				var is_found_removed = false;
				for (i = 0; i < removed_tags.length; i++)
				{
					if (removed_tags[i].tag === name)
					{
						removed_tags[i].count++;
						is_found_removed = true;
						break;
					}
				}

				if (!is_found_removed)
				{
					removed_tags.push({"tag":name, "count":1});
				}
			}
		}

		// splice the replacement in and continue scanning right after it
		hinput = hinput.substring(0, poz1) + newstr + hinput.substring(poz2);
		poz2 = poz1 + newstr.length;
	}

	var removed_tags_out = "";
	for (i = 0; i < removed_tags.length; i++)
	{
		// the name can be a whole raw tag when the name pattern did not match, so escape it before it reaches innerHTML
		removed_tags_out += removed_tags[i].tag.replace(/&/g, "&amp;").replace(/</g, "&lt;") + " - " + removed_tags[i].count + "<br />";
	}
	document.getElementById("removed_tags").innerHTML = removed_tags_out;

	var warning_tags_out = "";
	if (warning_tags.a.count) warning_tags_out += "<br />check &lt;a&gt;" + " - " + warning_tags.a.count;
	if (warning_tags.img.count) warning_tags_out += "<br />check &lt;img&gt;" + " - " + warning_tags.img.count;
	if (warning_tags.col.count) warning_tags_out += "<br />check &lt;col&gt;" + " - " + warning_tags.col.count;
	document.getElementById("warning").innerHTML = warning_tags_out;

	return hinput;
}


/**
 * onpaste handler for the input editor.
 *
 * When #inputdiv has the inline style display:block the browser is allowed to
 * paste into the (emptied) element and the result is picked up by
 * waitforpastedata(). Otherwise, if the event exposes clipboardData, the
 * clipboard content ("text/html" preferred, then "text/plain") is copied into
 * elem.value, the #input textarea and #inputdiv, run() is called when
 * #runauto is checked, and the default paste is cancelled. Without
 * clipboardData it falls back to the empty-and-wait approach.
 *
 * NOTE(review): #inputdiv carries no inline display value until tab_switch()
 * has run, so right after page load the first test is false even though the
 * element is visible - confirm that this is intended.
 *
 * @param {Element} elem element that received the paste
 * @param {Event} e the paste event
 * @returns {boolean} false when the paste was handled here, true when the
 *                    browser should perform its default paste
 */
function handlepaste(elem, e)
{
	var savedcontent = elem.innerHTML;

	if (document.getElementById("inputdiv").style.display === "block")
	{
		// Visual
		elem.innerHTML = "";
		waitforpastedata(elem, savedcontent);
		return true;
	}

	// HTML

	// clipboard API available - read the data ourselves, then cancel the event
	if (e && e.clipboardData && e.clipboardData.getData)
	{
		var pasted = "";
		if (/text\/html/.test(e.clipboardData.types))
		{
			pasted = e.clipboardData.getData("text/html");
		}
		else if (/text\/plain/.test(e.clipboardData.types))
		{
			pasted = e.clipboardData.getData("text/plain");
		}

		elem.value = pasted;
		// run() reads the #input textarea, so it has to receive the pasted data as well;
		// elem is normally #inputdiv, whose "value" is not read by anything else
		document.getElementById("input").value = pasted;
		document.getElementById("inputdiv").innerHTML = pasted;

		if (document.getElementById("runauto").checked) run();

		if (e.preventDefault)
		{
			e.stopPropagation();
			e.preventDefault();
		}
		return false;
	}

	// everything else - empty the element, let the browser paste into it, then clean up
	elem.innerHTML = "";
	waitforpastedata(elem, savedcontent);
	return true;
}

/**
 * Polls elem every 20 ms until the browser has pasted something into it, then
 * hands over to processpaste().
 *
 * The caller empties elem before the paste, so "has child nodes" means the
 * pasted content has arrived. Polling stops after a fixed number of attempts
 * (about five seconds); in that case the content saved before the paste is
 * put back, so an empty or blocked paste neither loses the previous content
 * nor leaves a timer running forever.
 *
 * @param {Element} elem element the browser pastes into
 * @param {string} savedcontent innerHTML of elem before it was emptied
 * @param {number} [attempts] number of polls already made; callers omit it
 */
function waitforpastedata(elem, savedcontent, attempts)
{
	var max_attempts = 250;

	if (elem.childNodes && elem.childNodes.length > 0)
	{
		processpaste(elem, savedcontent);
		return;
	}

	attempts = attempts || 0;
	if (attempts >= max_attempts)
	{
		// nothing arrived - give up and restore what was there before
		elem.innerHTML = savedcontent;
		return;
	}

	setTimeout(function()
	{
		waitforpastedata(elem, savedcontent, attempts + 1);
	}, 20);
}

/**
 * Takes over what the browser pasted into elem.
 *
 * The pasted markup is read from elem, elem gets its earlier content back,
 * and the pasted markup is copied into both #input and #inputdiv. run() is
 * called afterwards when #runauto is checked.
 *
 * @param {Element} elem element holding the freshly pasted content
 * @param {string} savedcontent content elem had before the paste
 */
function processpaste(elem, savedcontent)
{
	var pasted = elem.innerHTML;
	var autorun = document.getElementById("runauto");

	// restore first: when elem is #inputdiv itself, the assignment below overwrites it again
	elem.innerHTML = savedcontent;

	document.getElementById("input").value = pasted;
	document.getElementById("inputdiv").innerHTML = pasted;

	if (autorun.checked)
	{
		run();
	}
}

/**
 * keyup handler of #inputdiv: copies its innerHTML into the #input textarea
 * and calls run() when #runauto is checked.
 */
function inputdiv_change()
{
	var source = document.getElementById("inputdiv");
	var target = document.getElementById("input");

	target.value = source.innerHTML;

	if (document.getElementById("runauto").checked)
	{
		run();
	}
}

/**
 * keyup handler of the #input textarea: copies its value into #inputdiv as
 * innerHTML and calls run() when #runauto is checked.
 */
function input_change()
{
	var source = document.getElementById("input");
	var target = document.getElementById("inputdiv");

	target.innerHTML = source.value;

	if (document.getElementById("runauto").checked)
	{
		run();
	}
}

/**
 * change handler of the #output textarea: mirrors its value into #preview as
 * innerHTML.
 */
function output_change()
{
	var edited = document.getElementById("output").value;
	var preview = document.getElementById("preview");

	preview.innerHTML = edited;
}

/**
 * Switches the input area between the Visual editor (#inputdiv) and the HTML
 * textarea (#input).
 *
 * The tab of the view being shown gets the plain "tabselector" class; the
 * other tab gets "tabselector tabselector_active". Any id other than
 * "inputdiv" or "input" is ignored.
 *
 * @param {string} id "inputdiv" to show the Visual editor, "input" to show the HTML textarea
 */
function tab_switch(id)
{
	if (id !== "inputdiv" && id !== "input")
	{
		return;
	}

	var visual = (id === "inputdiv");
	var shown_tab = "tabselector";
	var other_tab = "tabselector tabselector_active";

	document.getElementById("inputdiv").style.display = visual ? "block" : "none";
	document.getElementById("input").style.display = visual ? "none" : "block";
	document.getElementById("tabselector_inputdiv").className = visual ? shown_tab : other_tab;
	document.getElementById("tabselector_input").className = visual ? other_tab : shown_tab;
}

/**
 * Toggles the output area between the #output textarea and the rendered
 * #preview.
 *
 * The preview counts as shown only when its inline display is "block". The
 * class and tooltip of #tabselector_preview are updated to match the new state.
 */
function preview_switch()
{
	var preview = document.getElementById("preview");
	var output = document.getElementById("output");
	var tab = document.getElementById("tabselector_preview");
	var was_shown = (preview.style.display === "block");

	preview.style.display = was_shown ? "none" : "block";
	output.style.display = was_shown ? "block" : "none";
	tab.className = was_shown ? "tabselector tabselector_active" : "tabselector";
	tab.title = was_shown ? "View Preview" : "Hide Preview";
}

</script>

</body>
</html>
 
Top