MultipartParser

class
Superclass
Object
Included Modules
Configurability
Extended With
Loggability
Strelka::MethodUtilities
Strelka::MultipartParser::FileInputField

A parser for extracting uploaded files and parameters from the body of a multipart/form-data request.

Synopsis

require 'strelka/multipartparser'

parser = Strelka::MultipartParser.new( io, 'boundary' )
fields = parser.parse

Authors

Constants

BLANK_LINE_REGEXP

Pattern for matching a blank line

CONFIG_DEFAULTS

Configurability API – configuration defaults

CRLF_REGEXP

Line-ending regexp. Supports UNIX line-endings for testing.

HEADER_FIELD_EOL

Line-ending for RFC5322 header fields; EOL not followed by a WSP char

Attributes

buffer[R]

The current buffer for unparsed data

fields[R]

Parsed form fields

Public Class Methods

anchor
configure( options=nil )

Configurability API – configure the parser with the 'mimeparser' section of the config:

bufsize

the size of the buffer (in bytes) to use when reading the document. Larger sizes use more heap, but are faster.

spooldir

the directory to spool file upload parts to.

# File lib/strelka/multipartparser.rb, line 77
# Configurability API -- set the class-level parser settings.
#
# options:: a Hash-like object that may carry +:bufsize+ (anything Integer()
#           accepts) and +:spooldir+ (anything Pathname() accepts), or +nil+
#           to reset both settings to CONFIG_DEFAULTS.
#
# A key that is missing or nil falls back to the matching CONFIG_DEFAULTS entry.
def self::configure( options=nil )
        if options
                self.log.debug "Configuring the %p: %p" % [ self, options ]

                # Integer(nil) and Pathname(nil) raise TypeError instead of returning
                # nil, so the default has to be selected before coercing.
                size = options[:bufsize]
                dir  = options[:spooldir]

                self.bufsize  = size ? Integer( size ) : CONFIG_DEFAULTS[:bufsize]
                self.spooldir = dir  ? Pathname( dir ) : CONFIG_DEFAULTS[:spooldir]
        else
                self.log.debug "Configuring %p with defaults: %p" % [ self, CONFIG_DEFAULTS ]
                self.bufsize  = CONFIG_DEFAULTS[:bufsize]
                self.spooldir = CONFIG_DEFAULTS[:spooldir]
        end
end
anchor
new( io, boundary )

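Create a new Strelka::MultipartParser that reads the request body from io (an IO-like object, or a String) and splits it on the given multipart boundary.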

# File lib/strelka/multipartparser.rb, line 91
def initialize( io, boundary )
        io        = StringIO.new( io ) unless io.respond_to?( :read )
        boundary  = '--' + boundary # unless boundary.start_with?( '--' )

        @bufsize  = self.class.bufsize || CONFIG_DEFAULTS[:bufsize]
        @spooldir = self.class.spooldir || CONFIG_DEFAULTS[:spooldir]
        @io       = io
        @boundary = boundary
        @fields   = {}
        @buffer   = ''

        # Ensure that the buffer can contain at least a whole boundary,
        # otherwise we can't scan for it.
        @bufsize  = @boundary.bytesize * 1.5 if @bufsize < @boundary.bytesize * 1.5
        @spooldir.mkpath
end

Public Instance Methods

anchor
bufsize()

The configured buffer size to use when parsing

# File lib/strelka/multipartparser.rb, line 64
# Class-level reader/writer for the read buffer size (in bytes). ::configure
# assigns it and #initialize reads it through self.class.bufsize.
# NOTE(review): singleton_attr_accessor is presumably supplied by
# Strelka::MethodUtilities -- confirm.
singleton_attr_accessor :bufsize
anchor
parse()

Parse the form data from the IO and return it as a Hash.

# File lib/strelka/multipartparser.rb, line 121
# Parse the whole multipart body and return the Hash of parsed fields.
#
# Raises Strelka::ParseError if the body does not begin with the boundary.
def parse
        self.log.debug "Starting parse: %p" % [ self ]

        # The body has to open with a boundary line
        unless self.strip_boundary
                raise Strelka::ParseError, "No initial boundary"
        end

        # After each part's boundary is stripped, a buffer starting with '--'
        # means that boundary was the closing one.
        until @buffer.start_with?( '--' )
                self.scan_part
        end

        field_count = self.fields.length
        self.log.debug "Finished parse. %d fields" % [ field_count ]
        return self.fields
end
anchor
spooldir()

The configured spool directory for storing attachments

# File lib/strelka/multipartparser.rb, line 68
# Class-level reader/writer for the directory uploaded files are spooled to.
# ::configure assigns it and #initialize reads it through self.class.spooldir.
# NOTE(review): singleton_attr_accessor is presumably supplied by
# Strelka::MethodUtilities -- confirm.
singleton_attr_accessor :spooldir

Protected Instance Methods

anchor
read_at_least( bytecount )

Read at least bytecount bytes from the io, appending the data onto the buffer.

# File lib/strelka/multipartparser.rb, line 321
# Fill the buffer until it holds at least +bytecount+ bytes or the IO runs dry.
#
# Returns false if the IO was already at EOF when called (nothing is read),
# and true otherwise -- including when the buffer was already big enough.
def read_at_least( bytecount )
        return false if @io.eof?

        while @buffer.bytesize < bytecount && !@io.eof?
                self.read_some_more
        end

        true
end
anchor
read_chunk()

Read data from the parser's IO until the buffer contains at least bufsize bytes, or the IO is at EOF.

# File lib/strelka/multipartparser.rb, line 312
# Top the buffer up to the configured buffer size. Returns whatever
# #read_at_least returns for that size.
def read_chunk
        wanted = @bufsize
        return self.read_at_least( wanted )
end
anchor
read_some_more()

Try to read another chunk of up to bufsize bytes from the parser's IO onto the end of the buffer, returning true if the buffer grew and false if the IO is at EOF.

# File lib/strelka/multipartparser.rb, line 338
# Append one more read of up to @bufsize bytes to the buffer.
#
# Returns true once the buffer has grown, or false if the IO reaches EOF
# before any new bytes arrive.
def read_some_more
        return false if @io.eof?

        size_before = @buffer.bytesize
        @buffer << @io.read( @bufsize )

        # Retry while the read produced nothing, yielding to other threads
        # between attempts.
        while @buffer.bytesize <= size_before
                return false if @io.eof?
                Thread.pass
                @buffer << @io.read( @bufsize )
        end

        true
end
anchor
scan_file_field( filename, headers )

Scan the body of the current document part, spooling the data to a tempfile on disk and returning the resulting filehandle.

# File lib/strelka/multipartparser.rb, line 234
# Spool the body of the current part to disk and return the spooled file,
# extended with FileInputField and tagged with the upload's metadata.
#
# filename:: the client-supplied file name for the upload
# headers::  the part's headers; 'content-type' is copied onto the result
def scan_file_field( filename, headers )
        self.log.info "Parsing file '%s'" % [ filename ]

        spooled, bytes = self.spool_file_upload

        spooled.extend( FileInputField )
        spooled.filename       = filename
        spooled.content_type   = headers['content-type']
        spooled.content_length = bytes

        self.log.debug "Scanned file %p to: %s (%d bytes)" %
                [ spooled.filename, spooled.path, bytes ]
        spooled
end
anchor
scan_headers()

Scan the buffer for MIME headers and return them as a Hash.

# File lib/strelka/multipartparser.rb, line 179
# Scan the MIME headers at the head of the buffer, remove them (and the blank
# line that ends them) from the buffer, and return them as a Hash keyed by
# downcased header name. Folded header lines are joined into one value.
#
# Raises Strelka::ParseError if the IO hits EOF before the blank line that
# terminates the header block is found.
def scan_headers
        # Drop the line ending left after the boundary line. Anchored with \A so
        # that only a line ending at the very head of the buffer is removed; a
        # ^ anchor can match at any line start and so remove a later blank line.
        @buffer.slice!( /\A#{CRLF_REGEXP}/ )

        # Find the end of the header block. Loop on the search result (not on
        # whether the extracted text is empty) so an empty header block ends the
        # loop, and use #read_some_more so every retry either grows the buffer
        # or reports EOF -- #read_at_least returns true without reading once the
        # buffer already holds @bufsize bytes.
        pos = @buffer.index( BLANK_LINE_REGEXP )
        until pos
                self.log.debug "Couldn't find a blank line in the first %d bytes (%p)" %
                        [ @buffer.bytesize, @buffer[0..100] ]
                self.read_some_more or
                        raise Strelka::ParseError, "EOF while searching for headers"
                pos = @buffer.index( BLANK_LINE_REGEXP )
        end
        headerlines = @buffer.slice!( 0, pos )

        # put headers into a hash
        headers = {}
        headerlines.strip.split( HEADER_FIELD_EOL ).each do |line|
                unfolded = line.gsub( CRLF_REGEXP, '' ) # Un-fold long headers
                key, val = unfolded.split( /:\s*/, 2 )
                headers[ key.downcase ] = val
        end
        self.log.debug "Scanned headers: %p" % [headers]

        # remove the blank line that separated the headers from the body
        @buffer.slice!( /\A#{BLANK_LINE_REGEXP}/ )

        return headers
end
anchor
scan_part()

Scan a part from the buffer.

# File lib/strelka/multipartparser.rb, line 142
# Scan one part of the body: read its headers, read its value (spooling it to
# disk if it is a file upload), store the value in @fields under the part's
# field name, and strip the boundary that follows.
#
# A field name that occurs more than once has its values collected in an Array.
#
# Returns the result of #strip_boundary, or nil if the part declares a
# filename but none could be extracted.
# Raises Strelka::ParseError if the part has no Content-Disposition header or
# no field name, and UnimplementedError for dispositions other than form-data.
def scan_part
        headers = self.scan_headers
        disposition = headers['content-disposition'] or
                raise Strelka::ParseError, "no content-disposition header: %p" % [ headers ]

        raise UnimplementedError, "don't know what to do with %p parts" % [ disposition ] unless
                disposition.start_with?( 'form-data' )
        key = disposition[ /\bname="(\S+)"/i, 1 ] or
                raise Strelka::ParseError, "no field name: %p" % [ disposition ]

        # :TODO: Support for content-type and content-transfer-encoding headers for parts.

        # If it's a file, spool it out to a tempfile
        if disposition =~ /\bfilename=/i
                # Capture only the final path segment: the optional group swallows
                # everything up to the last backslash of a Windows-style client path.
                # NOTE(review): this early return leaves the part body and the
                # following boundary in the buffer -- confirm that is intended.
                file = disposition[ /\bfilename="(?:.*\\)?(.+?)"/i, 1 ] or return nil
                self.log.debug "Parsing an uploaded file %p (%p)" % [ key, file ]
                val = self.scan_file_field( file, headers )

        # otherwise just read it as a regular parameter
        else
                self.log.debug "Parsing a form parameter (%p)" % [ key ]
                val = self.scan_regular_field( key )
        end

        # Convert the value to an Array if there are more than one
        if @fields.key?( key )
                @fields[ key ] = [ @fields[key] ] unless @fields[ key ].is_a?( Array )
                @fields[ key ] << val
        else
                @fields[ key ] = val
        end

        self.strip_boundary
end
anchor
scan_regular_field( key )

Scan the buffer up to the next boundary and return that data, minus its trailing line ending, as the value of the form parameter key.

# File lib/strelka/multipartparser.rb, line 214
# Read the value of a regular (non-file) form parameter: everything in the
# buffer up to the next boundary, with one trailing line ending removed.
#
# key:: the field name; used only for logging
#
# Returns the value as a String (empty if the boundary follows immediately).
# Raises Strelka::ParseError if the IO hits EOF before a boundary is found.
def scan_regular_field( key )
        self.log.debug "Scanning form parameter: %p" % [key]

        # Loop on whether the boundary was found, not on whether the extracted
        # value is non-empty: a boundary at offset 0 yields an empty value and
        # would otherwise repeat the same zero-length slice forever.
        start = @buffer.index( @boundary )
        until start
                self.read_some_more or raise Strelka::ParseError,
                        "EOF while scanning a form parameter"
                start = @buffer.index( @boundary )
        end

        self.log.debug "Found the end of the parameter."
        param = @buffer.slice!( 0, start )

        return param.chomp
end
anchor
spool_file_upload()

Scan the file data at the head of the buffer up to the next boundary, spooling it into a temporary file in the spool directory. Returns the (closed) tempfile object and the number of bytes written to it.

# File lib/strelka/multipartparser.rb, line 252
# Spool the body of the current part (everything up to the next boundary)
# into a Tempfile created in @spooldir.
#
# Returns a two-element Array: the closed Tempfile and the number of bytes
# written to it.
# Raises Strelka::ParseError if the IO hits EOF before a boundary is found;
# the partially-written tempfile is closed and removed first.
def spool_file_upload
        self.log.debug "Spooling file from upload"
        tmpfile = Tempfile.open( 'filedata', @spooldir.to_s, encoding: 'ascii-8bit' )
        size = 0

        begin
                # :TODO: Use mmap(2) to map the resulting IOs from mongrel's spool file
                # rather than writing them all out to disk a second time.
                until tmpfile.closed?

                        # look for end, store everything until boundary
                        if start = @buffer.index( @boundary )
                                self.log.debug "Found the end of the file"
                                leavings = @buffer.slice!( 0, start )

                                # Drop only the line ending that precedes the boundary, so
                                # that a bare LF does not cost a byte of file data.
                                if leavings.end_with?( "\r\n" )
                                        leavings.slice!( -2, 2 )
                                elsif leavings.end_with?( "\n" )
                                        leavings.slice!( -1, 1 )
                                end

                                tmpfile.write( leavings )
                                size += leavings.bytesize
                                tmpfile.close

                        # not at the end yet, buffer this chunk to disk
                        elsif @buffer.bytesize >= @bufsize
                                # make sure we're never writing a portion of the boundary
                                # out while we're buffering: keep @bufsize bytes back
                                buf = @buffer.slice!( 0, @buffer.bytesize - @bufsize )
                                # #write rather than #print: #print also appends $\ when set
                                tmpfile.write( buf )
                                size += buf.bytesize
                        end

                        # put some more data into the buffer
                        unless tmpfile.closed?
                                self.read_some_more or
                                        raise Strelka::ParseError, "EOF while spooling file upload"
                        end
                end
        rescue
                # Don't leave a half-written spool file behind
                tmpfile.close!
                raise
        end

        return tmpfile, size
end
anchor
strip( pat )

Strip data from the head of the buffer that matches pat, returning it if successful, or returning nil if not. The matched data should fit within the parser's chunk size.

# File lib/strelka/multipartparser.rb, line 294
# Remove +pat+ (a String or Regexp) from the front of the buffer, after first
# topping the buffer up with #read_chunk.
#
# Returns the removed text, or nil when the first match of +pat+ is not at
# the very start of the buffer (the buffer is then left alone).
def strip( pat )
        self.read_chunk

        return @buffer.slice!( pat ) if @buffer.index( pat ) == 0
        return nil
end
anchor
strip_boundary()

Strip the boundary that's at the front of the buffer, reading more data into it as necessary. Returns the boundary if successful, or nil if there wasn't a boundary in the buffer.

# File lib/strelka/multipartparser.rb, line 304
# Strip the boundary from the front of the buffer via #strip, logging the
# boundary and the first 40 characters of the buffer beforehand.
#
# Returns whatever #strip returns for the boundary.
def strip_boundary
        preview = @buffer[ 0, 40 ]
        self.log.debug "Stripping boundary:\n%p at:\n%p" % [ @boundary, preview ]

        return self.strip( @boundary )
end