Prepping the Reuters 21578 classification sample dataset

I’ve been playing around with some topic models and decided to look at the Reuters 21578 dataset. For your convenience, this dataset is stored as XML split across 20 or so files, and invalid XML at that. I prefer to work with flat text files, so this bit of Ruby turns the XML into a single file with one line per input document, lightly cleaned by stripping trailing periods and commas from each word. Hopefully this will save someone some time; I got to spend an hour or more trying to figure out how to strip invalid UTF-8. This works in Ruby 1.9 with the obvious gems installed and the Reuters dataset as of today.

Updated May 2015: If you are using Ruby 1.9+ or are having trouble installing iconv, you can use this version instead:

# dump the reuters dataset from http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html
# into a single file, one line per article

require 'hpricot'

# config
directory = 'reuters21578'
do_downcase = true
strip_reuter = true           # remove the last word of an article when it matches "reuter"
strip_all_reuter = false      # remove every word that matches "reuter", in any letter case
strip_trailing_comma = true
strip_trailing_period = true

# Only the *.sgm files in the dataset directory are read. Dir.entries makes no
# promise about ordering, so the names are sorted to keep the line order of
# all.txt the same from run to run.
sgm_files = Dir.entries(directory).grep(/\.sgm\z/).sort

# The block form of File.open closes all.txt even if a file fails to parse.
File.open('all.txt', 'w') do |output|
  sgm_files.each do |filename|
    # File.read closes the input handle once the contents are loaded.
    file = File.read(File.join(directory, filename))
    xml = Hpricot(file)
    articles = xml.search('/REUTERS/*/BODY')

    puts "reading #{filename} : #{ articles.length}"

    articles.each do |article|
      a = article.innerHTML
      # strip some bad unicode in reut2-017.sgm: the text is converted as raw
      # bytes, and anything that cannot be converted to UTF-8 becomes "?"
      a.encode!("UTF-8", "binary", :invalid => :replace, :undef => :replace, :replace => "?")

      # One token per whitespace-separated word. Empty tokens are dropped here
      # so they cannot hide a trailing "reuter" from the check below; tokens
      # that start with "&" (entities) are dropped as well.
      a = a.split(/\s+/).reject { |t| t.empty? || t.start_with?('&') }

      a.map! { |t| t.downcase } if do_downcase
      a = a[0..-2] if strip_reuter && a.last =~ /reuter/i
      # Case-insensitive so this still works when do_downcase is turned off.
      a.reject! { |t| t =~ /reuter/i } if strip_all_reuter

      a.map! { |t| t.sub(/,$/, '') } if strip_trailing_comma
      a.map! { |t| t.sub(/\.$/, '') } if strip_trailing_period

      # A token that was only "," or "." is empty by now; skip it.
      output.puts(a.reject { |t| t.empty? }.join(' '))
    end
  end
end

Original version (requires iconv):

# dump the reuters dataset from http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html
# into a single file, one line per article

require 'hpricot'
require 'iconv'

# config
directory = 'reuters21578'
do_downcase = true
strip_reuter = true           # remove the last word of an article when it matches "reuter"
strip_all_reuter = false      # remove every word that matches "reuter", in any letter case
strip_trailing_comma = true
strip_trailing_period = true

iconv = Iconv.new('UTF-8//IGNORE', 'UTF-8') # used to turn invalid utf-8 into valid

# Only the *.sgm files in the dataset directory are read. Dir.entries makes no
# promise about ordering, so the names are sorted to keep the line order of
# all.txt the same from run to run.
sgm_files = Dir.entries(directory).grep(/\.sgm\z/).sort

# The block form of File.open closes all.txt even if a file fails to parse.
File.open('all.txt', 'w') do |output|
  sgm_files.each do |filename|
    # File.read closes the input handle once the contents are loaded.
    file = File.read(File.join(directory, filename))
    xml = Hpricot(file)
    articles = xml.search('/REUTERS/*/BODY')

    puts "reading #{filename} : #{ articles.length}"

    articles.each do |article|
      a = iconv.iconv(article.innerHTML)

      # One token per whitespace-separated word. Empty tokens are dropped here
      # so they cannot hide a trailing "reuter" from the check below; tokens
      # that start with "&" (entities) are dropped as well.
      a = a.split(/\s+/).reject { |t| t.empty? || t.start_with?('&') }

      a.map! { |t| t.downcase } if do_downcase
      a = a[0..-2] if strip_reuter && a.last =~ /reuter/i
      # Case-insensitive so this still works when do_downcase is turned off.
      a.reject! { |t| t =~ /reuter/i } if strip_all_reuter

      a.map! { |t| t.sub(/,$/, '') } if strip_trailing_comma
      a.map! { |t| t.sub(/\.$/, '') } if strip_trailing_period

      # A token that was only "," or "." is empty by now; skip it.
      output.puts(a.reject { |t| t.empty? }.join(' '))
    end
  end
end

Stochastic Nonsense

Put something smart here.

Prepping the Reuters 21578 Classification Sample Dataset