# Simplified ("fake") tf.idf calculation: for each document, calculate
# (nr of times a word occurs in the document) / (total nr of times the word occurs in all docs)
######## CLASS DEFINITIONS ##########
class Document
@@allwords = Hash.new
# constructor for Document class
# input : file-name of the document to be opened
# working: open the document and read all words, ignoring punctuation marks.
# Make sure all words are lowercased. Store the words for later
# processing per documentand also keep a classvar to store all the
# words that have been seen so far.
def initialize(d)
@words = Hash.new
File.open(d).each_line{ |line|
# substitute non-words with whitespace
line.gsub!(/\W/, ' ').downcase!
# do the math. Note that:
# if the class variable @@allwords does not have the new word then the
# local wordlist for the present file doesn't have it either
line.split.each{ |word|
# check the classvar
if @@allwords.has_key?( word) then
@@allwords[word] += 1
# check the local var
if @words.has_key?( word ) then @words[word] += 1 else @words[word] = 1 end
else
@@allwords[word] = 1
@words[word] = 1
end
}
}
end
# show the words nicely formatted
# compact => gets rid of possibly empty stuff in the words array
# inject => starts with empty string, store that in s, then do some
# processing and pass the result to the s of the new iteration
def to_s
@words.to_a.compact.inject(""){ |s,w| s += "#{w[0]} (#{w[1]}) \n" }
end
# calculate the tf.idf value for the word that is passed in as argument
# return 0 if the word doesn't exist for this particular word
def tfidf(w)
if @words.include?(w)
tf = @words[w].to_f
df = @@allwords[w].to_f
tfidf = tf/df
return tfidf
else
return "0"
end
end
# [ CLASS METHOD ] return all words that have been read during
# the processing of documents
def Document.allwords
return @@allwords.keys
end
end
######## DO THE PROCESSING ##########
## keep track of the files
fl = Array.new
## loop over files in data dir and do the magic
Dir.chdir("data")
Dir.glob("*") { |f| fl.push( Document.new(f) ) }
## loop over the words that we know and nicely output the tf.idf scores
Document.allwords.each{|word|
printf("%15s", word)
fl.each{ |file| printf(" %3.4f", file.tfidf(word)) }
print("\n")
}