# Patent downloading utility
# Blake Watters <blake@near-time.com>
# 2/3/2007
#
# Usage: ruby grab_patent.rb <patent_number> [output_path]

require 'fileutils'
require 'open-uri'
require 'rubygems'
require 'hpricot'

# The first command-line argument is the patent number to look up.
patent_id = ARGV.first

# String#to_i yields 0 when the text has no leading digits, so a missing
# argument or one that converts to zero is rejected here.
if patent_id.nil? || patent_id.to_i.zero?
  raise ArgumentError, "A numeric Patent ID must be provided. Usage : grab_patent.rb <patent_number> [output_path]"
end

# Search Google Patents for the patent number and treat the first result link
# as the patent's home page.
url = "http://www.google.com/patents?q=#{patent_id}&btnG=Search+Patents"

# Fetch through URI#read (added to HTTP URIs by open-uri) instead of
# Kernel#open: open-uri no longer patches Kernel#open for URLs on Ruby 3.0+.
doc = Hpricot(URI.parse(url).read)
patent_link = doc/"/html/body/p/a"

# An empty result set would otherwise surface as a NoMethodError on nil.
first_result = patent_link.first
raise "No search result found for Patent ID #{patent_id}" unless first_result

puts "Downloading Patent ID #{patent_id} : '#{first_result.inner_html}'"
patent_url = first_result['href']

# Find the Abstract link on the patent's home page; it is the entry point
# into the page-by-page viewing mode.
#
# patent_url was scraped from a web page, so it is parsed as a URI and read
# through open-uri rather than handed to Kernel#open, which would run a
# command if the string started with "|".
doc = Hpricot(URI.parse(patent_url).read)
abstract_link = doc.at('#summarytable a')
raise "Could not find the Abstract link on #{patent_url}" unless abstract_link

abstract_url = abstract_link['href']

# Start at the Abstract and follow the "Next Page" arrow from page to page,
# recording each page's image URL. The walk stops when the next link points
# at a page already visited, or when a page has no "Next Page" link at all.
scraped_pages = []
images_to_download = []
next_url = abstract_url
loop do
  scraped_pages << next_url

  # The URL came from scraped HTML, so read it as a URI instead of passing it
  # to Kernel#open (which would treat a leading "|" as a command to run).
  page = Hpricot(URI.parse(next_url).read)

  page_image = page.at('#viewport img')
  raise "No page image found at #{next_url}" unless page_image
  images_to_download << page_image['src']

  # Arrow links without an <img> child are skipped rather than crashing.
  next_link = page.search('td.arrow a').find do |anchor|
    arrow = anchor.at('img')
    arrow && arrow['alt'] == 'Next Page'
  end
  break unless next_link

  next_url = next_link['href']
  break if scraped_pages.include?(next_url)
end

# Save every collected page image into the output directory: the second
# command-line argument when given, otherwise ~/Desktop/Patents/<patent_id>.
# Dir.home is used instead of ENV['HOME'] so an unset HOME does not make
# File.join fail on nil.
base_path = File.join(Dir.home, 'Desktop', 'Patents')
output_path = ARGV[1] || File.join(base_path, patent_id.to_s)

# mkdir_p creates any missing parent directories and is a no-op when the
# directory already exists, so nested custom paths work. The default folder
# is only created when it is actually the destination.
FileUtils.mkdir_p(output_path)

puts "Found #{images_to_download.size} pages for Patent ID #{patent_id}, downloading to #{output_path}"
images_to_download.each_with_index do |image_url, index|
  page_number = index + 1
  file_path = File.join(output_path, "#{patent_id} - Page #{page_number}.png")

  # image_url was scraped from HTML: open it as a URI, never via Kernel#open.
  URI.parse(image_url).open do |image_stream|
    # Binary mode keeps the image bytes intact on platforms that translate
    # line endings in text mode.
    File.open(file_path, 'wb') { |f| f.write(image_stream.read) }
  end
end

puts "Download of Patent #{patent_id} is complete."

# Reveal the output directory with the `open` command. The path is passed as
# its own argument so no shell parses it: paths containing spaces or shell
# metacharacters are handled safely.
system('open', output_path)
