# fake tf.idf calculation; for each document and each word calculate
# (nr of times the word occurs in the document) / (total nr of times the word occurs in all docs)

######## CLASS DEFINITIONS ##########
# A single text document reduced to its per-word occurrence counts.
# Every Document that is created also adds its words to a corpus-wide
# tally shared by all instances, which #tfidf uses as the denominator.
class Document
  # word => total number of occurrences over all documents read so far
  @@allwords = Hash.new(0)

  # constructor for Document class
  # input  : file-name of the document to be opened
  # working: read the document line by line, treat every non-word
  #          character (regexp \W) as a separator and lowercase the rest.
  #          Count the words for this document and also add them to the
  #          classvar that stores all the words that have been seen so far.
  def initialize(d)
    # word => number of occurrences in this document
    @words = Hash.new(0)

    # File.foreach closes the file once the last line has been read
    File.foreach(d) do |line|
      # Use the non-destructive gsub/downcase: the bang variants return nil
      # when they change nothing, which broke the chained call on a line
      # without any non-word character (e.g. a last line lacking a newline).
      line.gsub(/\W/, ' ').downcase.split.each do |word|
        @words[word] += 1
        @@allwords[word] += 1
      end
    end
  end

  # show the words nicely formatted
  # returns one "word (count) " line per distinct word of this document,
  # in the order the words were first seen; empty string if there are none
  def to_s
    @words.map { |word, count| "#{word} (#{count}) \n" }.join
  end

  # calculate the (fake) tf.idf value for the word that is passed in as
  # argument: occurrences in this document divided by the occurrences in
  # all documents read so far
  # returns a Float; 0.0 if the word doesn't occur in this document
  def tfidf(w)
    return 0.0 unless @words.key?(w)

    @words[w].to_f / @@allwords[w]
  end

  # [ CLASS METHOD ] return all words that have been read during
  # the processing of documents
  def self.allwords
    @@allwords.keys
  end
end

######## DO THE PROCESSING ##########

## keep track of the files
fl = []

## loop over files in data dir and do the magic
Dir.chdir("data")
# Dir.glob("*") also yields sub-directories, which cannot be read as a
# document, so only regular files are turned into Document objects.
Dir.glob("*") { |f| fl.push(Document.new(f)) if File.file?(f) }

## loop over the words that we know and nicely output the tf.idf scores:
## one row per word, one score column per document (in the order of fl)
Document.allwords.each do |word|
  printf("%15s", word)
  fl.each { |file| printf(" %3.4f", file.tfidf(word)) }
  print("\n")
end