# Demonstration of screen scraping using Hpricot
# Scrapes the CricInfo WAP site for commentary from Australian Cricket
# games and posts them to twitter.
#
# This is for educational use only! Don't come crying to me if
# you screen scrape a site without permission and you get in trouble.
#
# Copyright Myles Eftos 2007 http://myles.eftos.id.au
# Released under the GNU General Public License
# See http://www.gnu.org/copyleft/gpl.html

require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'twitter'

username = 'your_twitter_username'
password = 'your_twitter_password'

# A modification to the String class to allow truncation
class String
  # Shortens the string so the result is at most +number+ characters long,
  # including the +truncator+ suffix.
  #
  # number    - maximum length of the returned string.
  # truncator - suffix appended when the string had to be cut.
  #
  # Returns the receiver itself when it already fits (callers must not rely
  # on getting a copy), otherwise a new, shortened string.
  def truncate(number, truncator = '...')
    return self if size <= number

    # Leave room for the suffix; never ask for a negative slice length.
    keep = [number - truncator.size, 0].max
    self[0, keep] + truncator
  end
end

# Some additions to the Hpricot module to allow complete stripping of tags
# "Borrowed" from http://underpantsgnome.com/2007/01/20/hpricot-scrub/
module Hpricot
  class Elements
    # Calls #strip on every member of the collection.
    def strip
      each { |x| x.strip }
    end

    # Calls #strip_attributes(safe) on every member of the collection.
    def strip_attributes(safe = [])
      each { |x| x.strip_attributes(safe) }
    end
  end

  class Elem
    # Deletes this element from its parent's list of children.
    def remove
      parent.children.delete(self)
    end

    # Strips the children first (text nodes are skipped), then either removes
    # this element entirely (see #strip_removes?) or replaces it in its
    # parent with its own inner HTML.
    def strip
      children.each { |x| x.strip unless x.class == Hpricot::Text }

      if strip_removes?
        remove
      else
        parent.replace_child self, Hpricot.make(inner_html) unless parent.nil?
      end
    end

    # Removes every attribute whose name is not listed in +safe+.
    def strip_attributes(safe = [])
      attributes.each { |atr|
        remove_attribute(atr[0]) unless safe.include?(atr[0])
      } unless attributes.nil?
    end

    # Truthy when the element's "type" attribute mentions script or css, in
    # which case #strip drops the element instead of keeping its content.
    def strip_removes?
      # I'm sure there are others that should be ripped instead of stripped
      attributes && attributes['type'] =~ /script|css/
    end
  end

  class Doc
    # Scrubs the document in place and returns self.
    #
    # config - Hash with optional keys:
    #          :nuke_tags        - selectors whose matches are removed
    #          :allow_tags       - tag names left in place; only their
    #                              attributes are stripped
    #          :allow_attributes - attribute names kept on allowed tags
    def scrub(config = {})
      config = {
        :nuke_tags => [],
        :allow_tags => [],
        :allow_attributes => []
      }.merge(config)

      config[:nuke_tags].each { |tag| (self/tag).remove }
      config[:allow_tags].each { |tag|
        (self/tag).strip_attributes(config[:allow_attributes])
      }
      children.reverse.each { |e|
        e.strip unless e.class == Hpricot::Text || config[:allow_tags].include?(e.name)
      }
      self
    end
  end
end

# Downloads +url+ and returns the response body as a String.
#
# Kernel#open stopped handling URLs in Ruby 3.0, so URI.open is used when
# open-uri provides it; older Rubies fall back to the patched Kernel#open.
def fetch_page(url)
  headers = { "User-Agent" => "Ruby/#{RUBY_VERSION}" }
  if URI.respond_to?(:open)
    URI.open(url, headers) { |f| f.read }
  else
    open(url, headers) { |f| f.read }
  end
end

# Get a list of current games
url = 'http://ci.plusmo.com/cricket/wap/'

# Read in the webpage
response = fetch_page(url)

games = []
doc = Hpricot(response)

# Search for the correct set of table rows
doc.search("/html/body/div[4]/table/tr").each do |row|
  row.search("/td/a").each do |a|
    if a.inner_html.downcase =~ /australia/
      # Any row with the word australia in it (case insensitive) is
      # added to the games array
      games << a.attributes["href"].to_s
    end
  end
end

# Iterate over the games array to pull in the list of commentary links
games.each do |game|
  # We'll use the id later on to save the comparison file.
  # The id comes from a scraped href, so restrict it to safe characters
  # before it becomes part of a path under /tmp.
  file = game.split("?id=").last.to_s.gsub(/[^A-Za-z0-9_-]/, '_')
  state_path = "/tmp/cric_info_#{file}.txt"

  response = fetch_page('http://ci.plusmo.com' + game)
  doc = Hpricot(response)

  # The magic begins - parse out the summary text
  doc.search("div[@class=dat]").each do |div|
    summaries = []
    details = []
    score = ""
    over = 0
    ball = 0
    new_over_index = nil

    div.search("div[@class=summary]").each do |node|
      # NB: #strip on an Hpricot element is the tag-stripping patch defined
      # above, not String#strip.
      summary = node.at("strong").strip.to_s
      match = /(\d+)\.(\d) (.+)/.match(summary.strip)
      if match
        this_over = match[1].to_i
        this_ball = match[2].to_i
        summary = match[3]

        # Track the latest over and ball numbers
        # so we can update the score later
        if this_over > over
          over = this_over
          ball = this_ball
        elsif this_over == over && this_ball > ball
          ball = this_ball
        end

        # If it is the first ball of the over, then save the index ready for
        # display. summaries.size is the slot this entry is about to occupy,
        # so the index stays correct even if some entries have no over/ball
        # prefix.
        new_over_index = summaries.size if this_over == over && this_ball == 1
      end
      summaries << summary
    end

    # Parse the details
    div.search("div[@class=detail]").each do |node|
      details << node.strip.to_s
    end

    # Parse the score. The block parameter must not be called "score":
    # on Ruby 1.9+ it would shadow the outer variable and the parsed value
    # would be thrown away.
    # NOTE(review): the second gsub looks like a no-op as written; presumably
    # its first argument was meant to be a non-breaking space - confirm.
    div.search("div[@id=score]").each do |node|
      score = node.strip.to_s.gsub("?", "").gsub(" ", " ")
    end

    # Merge the two halves of the array. A summary ending in "!" is joined
    # with a space, anything else with ". ". A missing detail is treated as
    # an empty string rather than raising.
    summaries.each_index do |i|
      separator = summaries[i].end_with?("!") ? " " : ". "
      summaries[i] = summaries[i].strip + separator + details[i].to_s.strip
    end

    # Let's check where we are at in the past tweeted file
    old_summaries = []
    if File.exist?(state_path)
      # File.foreach closes the file when done.
      File.foreach(state_path) { |line| old_summaries << line.strip }
    end

    # If there are any text messages on the website that aren't in
    # the saved text file, we will post them
    summaries_to_post = summaries.reject do |summary|
      old_summaries.include?(summary.strip)
    end

    # Write a new text file out for the next run
    File.open(state_path, "w") { |f| f << summaries.join("\n") }

    # And now post to twitter!
    twitter = Twitter::Base.new(username, password)
    summaries_to_post.reverse_each do |message|
      twitter.post(message.truncate(140, "").strip)

      # Find the index of the status for the first ball of an over
      # if it exists, print out the current score
      if new_over_index && message == summaries[new_over_index]
        twitter.post("Current score: #{score} (Over: #{over})")
      end
    end
  end
end