Monday, May 11, 2009

A simple scraper with Ruby Mechanize


require 'rubygems'
require 'mechanize'

# Shared HTTP agent used by every scraper below.
#
# Mechanize 2.x removed the legacy WWW:: namespace, so prefer the top-level
# Mechanize class and only fall back to WWW::Mechanize on old (0.9.x) gems
# where the top-level constant does not exist.
mechanize_class = defined?(::Mechanize) ? ::Mechanize : WWW::Mechanize

mech = mechanize_class.new do |agent|
  # Identify as a desktop browser rather than the default Mechanize agent.
  agent.user_agent_alias = 'Mac Safari'
end

# Extracts the names listed in the first column of the names table on +page+.
#
# page - an object whose +root+ responds to +xpath+ (e.g. a Mechanize page).
#
# The first matched row is skipped (presumably the table header). Rows that
# contain no <td> cell (spacer or <th>-only rows) and cells whose text is
# blank are ignored instead of raising NoMethodError or yielding empty names.
#
# Returns an Array of whitespace-stripped name Strings (possibly empty).
def getNames(page)
  rows = page.root.xpath('//table[@cellspacing="3"]/tr')
  names = []
  rows.each_with_index do |row, index|
    next if index.zero?

    cell = row.search('td').first
    next if cell.nil?

    name = cell.text.strip
    names << name unless name.empty?
  end
  names
end

# Scrapes an index page plus every continuation page it links to.
#
# mech         - agent responding to +get(url)+ and returning a page.
# index_url    - URL of the first page of the list.
# link_pattern - Regexp selecting the hrefs of continuation pages.
#
# Each matching href is fetched only once, even when the index page links to
# it several times, so names are not duplicated in the result.
#
# Returns an Array of names in page order (index page first).
def scrape_name_pages(mech, index_url, link_pattern)
  index_page = mech.get(index_url)
  names = []
  names.concat(getNames(index_page))

  hrefs = index_page.links.map { |link| link.href }
  # Regexp#=~ returns nil for a nil href, so links without one are dropped.
  hrefs = hrefs.select { |href| link_pattern =~ href }.uniq

  hrefs.each { |href| names.concat(getNames(mech.get(href))) }
  names
end

# Returns every male first name from names.mongabay.com: the index page
# followed by each linked "/male_names<digit>" continuation page.
def getMaleNames(mech)
  scrape_name_pages(mech, "http://names.mongabay.com/male_names.htm", /\/male_names\d/)
end

# Returns every female first name from names.mongabay.com: the index page
# followed by each linked "/female_names<digit>" continuation page.
def getFemaleNames(mech)
  scrape_name_pages(mech, "http://names.mongabay.com/female_names.htm", /\/female_names\d/)
end

# Collects surnames from the ten list pages
# http://names.mongabay.com/data/1000.html .. 10000.html.
#
# mech - agent responding to +get(url)+ and returning a page whose +root+
#        responds to +xpath+.
#
# On every page the first row of the "boldtable" table is skipped (presumably
# the header). Rows without a <td> cell and cells with blank text are ignored
# rather than raising NoMethodError or producing empty entries.
#
# Returns an Array of whitespace-stripped surname Strings in page order.
def getSurnames(mech)
  surnames = []
  1.upto(10) do |page_number|
    page = mech.get(sprintf("http://names.mongabay.com/data/%d000.html", page_number))
    rows = page.root.xpath('//table[@class="boldtable"]/tr')

    rows.each_with_index do |row, index|
      next if index.zero?

      cell = row.search("td").first
      next if cell.nil?

      surname = cell.text.strip
      surnames << surname unless surname.empty?
    end
  end
  surnames
end

# Scrape each list and dump it to its own text file, one name per line.
#
# Each list is scraped completely BEFORE its file is opened, and the file is
# opened with "w" (truncate) instead of "a+" (append): re-running the script
# therefore replaces the previous dump instead of appending a second copy of
# every name, and a failed scrape leaves the existing file untouched.
[
  ["get male names",   "malenames.txt",   :getMaleNames],
  ["get female names", "femalenames.txt", :getFemaleNames],
  ["get surnames",     "surnames.txt",    :getSurnames]
].each do |label, path, scraper|
  puts label
  names = send(scraper, mech)
  File.open(path, "w") do |file|
    names.each { |name| file << name << "\n" }
  end
end

No comments:

Post a Comment