# Implements a view helper method that lets you conditionally sanitize HTML
# as provided directly by a user, or via RedCloth or other markup/markdown
# creating libraries.
module HtmlFilterHelper
  # Tags allowed (with every attribute stripped) by the default profile.
  DEFAULT_PROFILE_TAGS = %w(
    strong b ul li ol i u code pre p div br
    table tr td th tbody thead span
    h1 h2 h3 h4 h5 h6 dl dt
  ).freeze

  # This helper is a flexible HTML sanitizer/whitelist that allows you to easily
  # configure "profiles" for allowed tags. You can also specify what tag
  # attributes (and values for those attributes) you consider "safe."
  #
  # Whitelisting some HTML without inspecting attributes is pointless - an
  # attacker could add cookie-stealing onmouseover/onclick handlers to gain
  # admin control, or use CSS to load arbitrary images as backgrounds. You need
  # to filter out attributes if you're going to allow user-specified HTML at all.
  #
  # The idea behind supporting multiple "HTML profiles" is simple - sometimes
  # you want to be able to use a wider range of HTML than others, yet you still
  # want to maintain some control over your output. Admins get to put in a large
  # subset of HTML tags and attributes, while anonymous comments can only use
  # simple formatting tags.
  #
  # PROFILES:
  #
  # A "profile" is an optional hash that defines a filtering profile to use.
  # When +profile+ is not a Hash, the default profile (see #get_profile) is
  # used. It allows the following tags with all attributes stripped: strong, b,
  # ul, li, ol, i, u, code, pre, p, div, br, table, tr, td, th, tbody, thead,
  # span, h1, h2, h3, h4, h5, h6, dl, and dt.
  #
  # If a tag doesn't exist as a key in the profile, it will be "deactivated" by
  # having its opening bracket replaced with the HTML entity representing an
  # open bracket (&lt;), and its attributes will be left untouched. Attributes
  # that aren't allowed on OK tags are stripped altogether.
  #
  # A profile is a hash of hashes. The first-level keys are HTML tag names. The
  # second-level keys are attribute names, plus two special keywords:
  #
  #   'none' => 1   allow the tag but strip every attribute
  #   'any'  => 1   allow the tag and keep every attribute with any value
  #
  # Otherwise each attribute name maps to an array of the values that attribute
  # may have. An array containing the string 'any' accepts every value for that
  # attribute.
  #
  # Example:
  #
  #   class User < ActiveRecord::Base
  #     USER_PROFILE={
  #       'b'=>{'none'=>1},
  #       'strong'=>{'class'=>['foo','bar','blee']},
  #       'img'=>{'any'=>1}
  #     }
  #   end
  #
  #   .... somewhere in a view ....
  #
  #   <%= filter_html(@html, User::USER_PROFILE) %>
  #
  # will "deactivate" all HTML tags except <b>, <strong> and <img>, leaving the
  # attributes on the deactivated tags untouched. It will strip all attributes
  # from <b>. It will keep the attribute "class" on <strong> when its value is
  # exactly "foo", "bar", or "blee". It will allow any attributes on <img>
  # tags, which is HIGHLY UNSAFE, but here for demonstration purposes.
  #
  # More Examples:
  #
  # NOTE(review): the markup of this example was lost from the file and has
  # been reconstructed to match the profile below; the exact serialization of
  # the output depends on HTML::Tag#to_s - confirm against a real run.
  #
  #   filter_html(%Q|
  #     <p class="body" onclick="alert('hi')">Foo!</p>
  #     <p align="justify">Am I justified?</p>
  #     <ul class="menu">
  #       <li class="first">Item 1</li>
  #       <li class="fourth">Item 2</li>
  #     </ul>
  #   |,{
  #     'p'=>{'align'=>['center','left','right'], 'class'=>['body']},
  #     'ul'=>{'none'=>1},
  #     'li'=>{'class'=>['first','second','third']}
  #     }
  #   )
  #
  # should give output along these lines:
  #
  #     <p class="body">Foo!</p>
  #     <p>Am I justified?</p>
  #     <ul>
  #       <li class="first">Item 1</li>
  #       <li>Item 2</li>
  #     </ul>
  #
  # The idea is that you'd define your profiles as CONSTANTS in your models, and
  # then pass the appropriate profile into your filter_html() method in your
  # view right before displaying the content.
  #
  # The neat thing about using this method is that you leave the user's original
  # data untouched and you can relax/restrict the HTML tag profile more or less
  # in the future without borking the original content. The bad thing is that
  # you're parsing HTML right before each display. Fragment caching can help
  # with that, though.
  #
  # SEE ALSO:
  #
  #   http://svn.techno-weenie.net/projects/plugins/white_list/
  #
  # AUTHOR:
  #   Dan Collis-Puro - dan at endNOSPAMSUCKERpoint dot com
  #   http://www.endpoint.com - work
  #   http://www.kookdujour.com - blog
  #
  # Based on the "Easy HTML Whitelists" recipe in "Rails Recipes".
  #
  # The "profile" idea is loosely based on HTML::TagFilter on www.cpan.org
  #
  # Parameters:
  #   html    - the markup to filter; nil and strings without a '<' are
  #             returned as-is without being tokenized.
  #   profile - a profile Hash as described above; anything that is not a Hash
  #             (including the default '') selects the default profile.
  #
  # Returns the filtered markup as a new string, or +html+ itself when there
  # was nothing to filter.
  def filter_html(html, profile='')
    # is_a? (rather than comparing the class name) also accepts Hash subclasses.
    profile = get_profile unless profile.is_a?(Hash)

    # Nothing to tokenize: nil input or text that cannot contain a tag.
    return html if html.nil? || !html.index('<')

    tokenizer = HTML::Tokenizer.new(html)
    new_text = ''
    while (token = tokenizer.next)
      node = HTML::Node.parse(nil, 0, 0, token, false)
      if node.is_a?(HTML::Tag) && profile[node.name]
        # Whitelisted tag: rebuild it with only the attributes the profile allows.
        allowed_attributes = filter_attributes(node, profile[node.name])
        new_node = HTML::Tag.new(node.parent, node.line, node.position,
                                 node.name, allowed_attributes, node.closing)
        new_text << new_node.to_s
      else
        # Everything else is "deactivated" by escaping its opening brackets.
        new_text << node.to_s.gsub(/</, '&lt;')
      end
    end
    new_text
  end

  # Builds the default profile: every tag in DEFAULT_PROFILE_TAGS is allowed
  # with all of its attributes stripped ('none' => 1).
  #
  # Returns a freshly built Hash on every call, so callers may modify the
  # result without affecting later calls.
  def get_profile
    profile = {}
    DEFAULT_PROFILE_TAGS.each { |tag| profile[tag] = {'none' => 1} }
    profile
  end

  # Picks the attributes of +node+ that the profile entry permits.
  #
  # Parameters:
  #   node               - an object responding to #attributes, which is
  #                        expected to be a Hash of name => value, or nil.
  #   allowed_attributes - the profile entry for the node's tag (see
  #                        #filter_html for the format).
  #
  # Returns a Hash of the attributes that may be kept. With 'any' => 1 the
  # node's own attributes object is returned unchanged (which may be nil).
  def filter_attributes(node, allowed_attributes)
    # No attributes are safe. Strip 'em all.
    return {} if allowed_attributes['none'] == 1
    # We'll allow all attributes with all values. DANGER WILL ROBINSON!
    return node.attributes if allowed_attributes['any'] == 1

    safe_attributes = {}
    # If a node doesn't have attributes, don't bother inspecting them.
    return safe_attributes unless node.attributes

    node.attributes.each do |attribute, value|
      allowed_values = allowed_attributes[attribute]
      # Attributes without an array of permitted values are dropped.
      next unless allowed_values.is_a?(Array)

      # Keep the attribute when every value is allowed ('any') or when this
      # exact value is listed.
      if allowed_values.include?('any') || allowed_values.include?(value)
        safe_attributes[attribute] = value
      end
    end
    safe_attributes
  end
end