#!/usr/bin/ruby
#
# maven-proxy.rb -- a caching-proxy CGI script for lazily populating
# local Maven repositories.
#
# Program: maven-proxy.rb
# Version: 1.0 [11/26/05 16:46 NJS]
# License: MIT
#
# Copyright (c) 2005 Nick Sieger <nicksieger@gmail.com>
# 
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# FEATURES
#
# - Lazily caches resources from upstream repositories.
# - Does not use local cache for SNAPSHOT resources.
# - Overrides local cache and refetches from upstream if request URI
#   contains the query string '?refetch'.
# - Proxies but does not cache index (directory listing) content.
#   This allows for seamless navigation of multiple upstream
#   directories under a single hierarchy of the local repository.
# - For directory listings, reports the upstream location from where
#   it came.
# - Propagates remote redirects (e.g., /res to /res/) to keep child
#   links in directory listings correct.
# - For use with Ruby 1.8.x (built and tested on Ruby 1.8.2
#   i386-mswin32)
#
# OPERATION
#
# - Serves files from local repository, if they exist, unless:
#   a. The URI contains the string 'SNAPSHOT' -- snapshots should
#      always be resolved to the source
#   b. The query string on the URI is the string 'refetch'
# - Checks remote repositories in the order listed for the requested
#   URI.  If a remote repository responds, the request is downloaded
#   and cached in the local repository, except if the request appears
#   to be for an index page, in which case the content is proxied but
#   not cached.
# - Responds with 404 if request is not cached and no upstream
#   repositories have the resource.
#
# CONFIGURATION
#
# 1. Check (and set if necessary) the path to your Ruby 1.8 executable
#    in the #! line above.
# 2. Set configuration variables below according to your installation.
#     LOCAL      -- full path to local repository location
#     REMOTE     -- space-separated list of remote repository urls
#                   (inside the %w{})
#     ALIAS      -- alias/leading URI to repository in the local webserver
#                   (optional; comment out to just use the CGI script path)
#     MIME_TYPES -- path to Apache-style mime.types file for additional
#                   types (optional)
#     LOG_FILE   -- path to log file (optional); comment out to disable logging
#
# APACHE
#
# If you're using this CGI inside Apache, a recommended way to use the
# proxy is to let Apache serve up cached files and directories and
# only defer to the CGI for missing and 'refetch'-ed files.  You can
# do this with rewrite rules as follows.
#
# ScriptAlias /cgi/ "/path/to/your/cgi/"
# <Directory /path/to/your/cgi>
#     Options ExecCGI
#     AddHandler cgi-script .rb
#     Allow from all
#     Order allow,deny
# </Directory>
#
# Alias /repos/ "/path/to/your/local/repos/"
# <Directory /path/to/your/local/repos>
#     Options Indexes MultiViews FollowSymLinks
#     AllowOverride None
#     Allow from all
#     Order allow,deny
#     RewriteEngine on
#     RewriteBase /repos/
#     # Always defer to cgi when query string == 'refetch'
#     RewriteCond %{QUERY_STRING} ^refetch$
#     RewriteRule ^(.*)$ /cgi/maven-proxy.rb/$1 [QSA,L]
#     # Defer to cgi for missing files and directories
#     RewriteCond %{REQUEST_FILENAME} !-d
#     RewriteCond %{REQUEST_FILENAME} !-f
#     RewriteRule ^(.*)$ /cgi/maven-proxy.rb/$1 [QSA,L]
# </Directory>

# ==== Configuration section ====

# Root directory of the local cache; request paths are joined onto it to
# locate or store cached files.
LOCAL      = 'c:/projects/ruby/maven-repos'
# Upstream repository base URLs, consulted in the order given after the
# local cache has been checked.
REMOTE     = %w{http://www.ibiblio.org/maven2}
# URL prefix used when building redirect locations (and by the self-tests
# as the request prefix).  When undefined, the CGI script name is used.
ALIAS      = '/repos'
# Apache-style mime.types file whose entries extend the built-in table.
MIME_TYPES = 'c:/tools/apache/Apache/conf/mime.types'
# Uncomment to log to a file; while LOG_FILE is undefined, log output is
# written to an in-memory buffer and discarded.
# LOG_FILE   = 'c:/tools/apache/Apache/logs/maven-proxy.log'

# Set umask for the CGI process (if desired).
# 0002 masks only the "other" write bit on files the proxy creates.
File.umask(0002)

# ==== Program below here -- no configuration needed ====

require 'cgi'
require 'webrick/httputils'
require 'net/http'
require 'uri'
require 'fileutils'
require 'logger'

# Script-wide logger.  Messages go to LOG_FILE when that constant is defined
# and holds a non-empty path; otherwise they are written to an in-memory
# StringIO that is never read, which effectively disables logging.
LOGGER =
  if !defined?(LOG_FILE) || LOG_FILE.nil? || LOG_FILE.length == 0
    require 'stringio'
    Logger.new(StringIO.new)
  else
    Logger.new(LOG_FILE)
  end

class MimeLookup
  include WEBrick::HTTPUtils
  def initialize()
    @mime_types = {}
    @mime_types.update(WEBrick::HTTPUtils::DefaultMimeTypes)
    @mime_types.update(load_mime_types(MIME_TYPES)) if defined?(MIME_TYPES)
    @mime_types.update({"md5" => "text/plain", "sha1" => "text/plain", "pom" => "text/xml"})
  end
  def mime_type(filename)
    super(filename, @mime_types)
  end
end

# Shared lookup instance, built once when the script loads; Repository#stream
# asks it for the Content-Type of each file it serves.
MimeTypes = MimeLookup.new

# Behaviour shared by the local cache and the remote repositories: mapping
# request paths onto files below the local repository root, building URLs
# that point back at this proxy, and streaming a cached file to the client.
module Repository
  # Remembers the local repository root without a trailing slash.
  # A copy is taken so that the LOCAL constant itself is never modified
  # (an in-place sub! would also raise if LOCAL were frozen).
  def initialize
    @local = LOCAL.sub(%r{/$}, '')
  end

  # Builds the proxy-relative URL for a resource.
  #
  # cgi       -- the current request; supplies script name and query string
  # path_info -- path to use instead of the request's own path (optional)
  #
  # The prefix is ALIAS when that constant is defined and set, otherwise the
  # CGI script name.  A non-empty query string is carried over unchanged.
  def make_path(cgi, path_info = nil)
    path_info = cgi.path_info if path_info.nil?
    prefix = (defined?(ALIAS) && ALIAS) ? ALIAS : cgi.script_name
    query = cgi.query_string
    url = "#{prefix}#{path_info}"
    url << "?#{query}" if query && query.length > 0
    url
  end

  # Maps a request path onto a file name below the local repository root.
  #
  # A nil path is treated as the empty path.  Raises ArgumentError when the
  # path contains a '..' segment: the path comes straight from the request,
  # and such a segment could make reads or cache writes land outside the
  # repository root.  The message deliberately omits the offending path.
  def filename(path)
    relative = path.to_s.sub(%r{^/}, '')
    if relative.split(%r{[/\\]}).include?('..')
      raise ArgumentError, "Path must not contain '..' segments"
    end
    File.join(@local, relative)
  end

  # Sends the cached file for the request's path as a 200 response, with the
  # Content-Type taken from MimeTypes.  Raises RuntimeError when the file is
  # missing or unreadable.
  def stream(cgi)
    fn = filename(cgi.path_info)
    unless File.exist?(fn) && File.readable?(fn)
      raise RuntimeError, "Unable to read file"
    end
    # Binary mode so artifacts such as jars are passed through untouched.
    File.open(fn, 'rb') do |f|
      cgi.out("status" => "200",
              "type" => MimeTypes.mime_type(fn),
              "length" => File.size(fn)) { f.read }
    end
  end
end

# The on-disk cache.  A request resolves here when a regular cached entry
# exists for its path and no refetch has been asked for.
class LocalRepository
  include Repository

  # Truthy when the cache must be bypassed: the path mentions SNAPSHOT
  # (the match position is returned in that case) or the query string is
  # exactly 'refetch'.
  def refetch?(cgi)
    snapshot_at = (cgi.path_info =~ /SNAPSHOT/)
    return snapshot_at if snapshot_at
    query = cgi.query_string || ""
    query == "refetch"
  end

  # True when the request can be answered from the cache, i.e. no refetch
  # is required and the mapped file exists and is not a directory.
  # A cache hit is logged.
  def resolve?(cgi)
    cached = filename(cgi.path_info)
    hit = if refetch?(cgi)
            false
          else
            File.exist?(cached) && !File.directory?(cached)
          end
    LOGGER.info "found at #{cached}" if hit
    hit
  end

  # The local repository root, used in log messages.
  def to_s
    @local
  end
end

# A read-through view of one upstream Maven repository.
#
# resolve? fetches the requested path from the upstream server and then
# either caches the body below the local root, keeps it in memory (HTML
# index pages, which are proxied but never cached), or relays an upstream
# redirect to the client.  stream sends the response matching that outcome.
class RemoteRepository
  include Repository

  # remote -- base URL of the upstream repository; a trailing slash is dropped.
  def initialize(remote)
    super()
    @remote = URI.parse(remote.sub(%r{/$}, ''))
    # Set once a redirect has been written to the client by resolve?.
    @redirected = false
  end

  # Asks the upstream server for the request's path.
  #
  # Returns true when the request has been handled by this repository
  # (content fetched, or a redirect already sent to the client) and false
  # when the upstream does not have the resource or redirects to another
  # host/port.
  def resolve?(cgi)
    path = cgi.path_info
    response = Net::HTTP.start(@remote.host, @remote.port) do |http|
      http.get("#{@remote.path}#{path}")
    end

    case response
    when Net::HTTPSuccess
      LOGGER.info "found at #{@remote}, downloading"
      if index_page?(response, path)
        @contents = response.body
      else
        store(path, response.body)
      end
      true
    when Net::HTTPRedirection
      relay_redirect(cgi, response)
    else
      LOGGER.info "not found, received #{response.inspect}"
      false
    end
  end

  # Sends the response for a request that resolve? accepted: nothing when a
  # redirect was already written, the in-memory index page (prefixed with a
  # banner naming the upstream location) when one was fetched, and otherwise
  # the freshly cached file.
  def stream(cgi)
    # cgi.out was already called for the redirect; a second response on the
    # same output would corrupt it.
    return if @redirected
    if defined?(@contents)
      # The path comes from the request, so escape it before embedding it
      # in the generated HTML.
      upstream = CGI.escapeHTML("#{@remote}#{cgi.path_info}")
      cgi.out("status" => "200") do
        "<i>These contents are from the upstream location <a href=\"#{upstream}\">#{upstream}</a>.</i><br/><hr/>#{@contents}"
      end
    else
      super
    end
  end

  # The upstream base URL, used in log messages.
  def to_s
    @remote.to_s
  end

  private

  # True for HTML responses that look like directory listings: the path
  # either does not end in .html or is an index.html.  Such pages are not
  # downloaded into the cache.
  def index_page?(response, path)
    response['Content-Type'] =~ %r{text/html} && (path !~ /\.html$/ || path =~ %r{/index\.html$})
  end

  # Writes a downloaded body to its place in the local repository, creating
  # parent directories as needed.
  def store(path, body)
    fn = filename(path)
    FileUtils.mkdir_p(File.dirname(fn))
    File.open(fn, "wb") do |dest|
      dest << body
    end
  end

  # Passes an upstream redirect on to the client, rewritten to point at this
  # proxy.  Only redirects that stay on the same upstream host and port are
  # relayed; for any other target nothing is sent and false is returned so
  # the caller can try the next repository.
  def relay_redirect(cgi, response)
    LOGGER.info "redirect: #{response['location']}"
    target = URI.parse(response['location'])
    return false unless target.host == @remote.host && target.port == @remote.port

    # Strip the upstream base path literally; it is not a regex pattern.
    newpath = target.path.sub(/^#{Regexp.escape(@remote.path)}/, '')
    cgi.out("status" => "#{response.code} #{response.message}",
            "Location" => make_path(cgi, newpath)) { "Redirect" }
    @redirected = true
    true
  end
end

class Main
  def initialize
    @cgi = CGI.new
  end


  def validate_config
    raise ArgumentError, "Invalid local repository path" unless File.exist? LOCAL
  end

  def fetch?
    resolved = false
    LOGGER.info "fetching #{@cgi.path_info}"
    repositories = [LocalRepository.new, REMOTE.map{|r| RemoteRepository.new(r)}].flatten
    repositories.each do |repos|
      LOGGER.info "checking repos: #{repos.to_s}"
      if repos.resolve?(@cgi)
        repos.stream @cgi
        resolved = true
        break
      end
    end
    resolved
  end

  def main
    begin
      validate_config
      unless fetch?
        @cgi.out("status" => "404 Not Found") do
          "<h1>File not found</h1><p>#{@cgi.path_info} was not found upstream in any of:</p>#{REMOTE.join('<br/>')}"
        end
      end
    rescue Exception => e
      LOGGER.error(e.backtrace.join("\n"))
      CGI.new.out("status" => "500 Internal Server Error") do
        "<h1>Error executing script</h1><p>#{e.to_s}</p>"
      end
    end
  end
end

# Only launch CGI in online mode, otherwise run tests
if ENV['REQUEST_METHOD']
  Main.new.main
else
  require 'test/unit'
  require 'fileutils'

  # Hostname where proxy is running -- assume localhost
  HOSTNAME = 'localhost'

  # HTTP and assertion helpers for exercising a running proxy from a
  # Test::Unit test case.
  module ProxyTestHelper
    # Records the repository root, proxy host and URL prefix from the
    # script configuration after the including class has been set up.
    def initialize(*args)
      super
      @local, @hostname, @script_name = LOCAL, HOSTNAME, ALIAS
    end

    # Deletes +dir+ (relative to the local repository root) recursively.
    def rm_rf(dir)
      target = File.join(@local, dir)
      FileUtils.rm_rf target
    end

    # Modification time of the cached file at +path+.
    def current_time(path)
      File.mtime(File.join(LOCAL, path))
    end

    # Issues a GET for +path+ below the proxy's URL prefix and returns the
    # Net::HTTP response.
    def get(path)
      Net::HTTP.start(@hostname) { |http| http.get("#{@script_name}/#{path}") }
    end

    # Matcher proc: truthy when its argument contains +expr+ literally.
    def contains(expr)
      literal = Regexp.new(Regexp.escape(expr))
      proc { |text| text =~ literal }
    end

    # Matcher proc: true when its argument does not contain +expr+ literally.
    def does_not_contain(expr)
      literal = Regexp.new(Regexp.escape(expr))
      proc { |text| text !~ literal }
    end

    # Asserts that +resp+ is a redirect whose Location satisfies +place+,
    # which is either a matcher proc or a value matched via assert_match.
    def assert_redirect_to(place, resp)
      assert_kind_of Net::HTTPRedirection, resp, "Not redirected"
      location = resp['location']
      failure = "Location '#{location}' doesn't match"
      if place.is_a?(Proc)
        assert place.call(location), failure
      else
        assert_match place.to_s, location, failure
      end
    end

    # Asserts that +resp+ succeeded and that its body satisfies +expr+,
    # which is either a matcher proc or a value matched via assert_match.
    def assert_body(expr, resp)
      assert_kind_of Net::HTTPSuccess, resp, "Request failed"
      body = resp.body
      if expr.is_a?(Proc)
        assert expr.call(body), "Body doesn't match"
      else
        assert_match expr.to_s, body, "Body doesn't match"
      end
    end

    # Asserts that +resp+ succeeded and carries every header/value pair
    # given in +hash+.
    def assert_header(hash, resp)
      assert_kind_of Net::HTTPSuccess, resp, "Request failed"
      hash.each_pair { |name, expected| assert_equal expected, resp[name] }
    end

    # Asserts that +path+ exists below the local repository root.
    def assert_cached(path)
      cached = File.join(LOCAL, path)
      assert File.exist?(cached), "File '#{path}' not cached locally"
    end
  end

  # End-to-end test against a running proxy (reached at HOSTNAME under the
  # ALIAS prefix, see ProxyTestHelper#get).  Everything lives in a single
  # test method because the assertions build on the cache state left by
  # the requests before them.
  class MavenProxyTest < Test::Unit::TestCase
    include ProxyTestHelper

    # Start every run with nothing cached below org/apache/maven.
    def setup
      rm_rf('org/apache/maven')
    end

    def test_all
      # redirects from remote server are propagated
      assert_redirect_to contains('org/apache/maven/'), get('org/apache/maven')
      # directory listings are proxied with the upstream banner prepended
      assert_body contains('contents are from the upstream location'), get('org/apache/maven/')

      # check for proper content-type headers
      assert_header({'content-type' => 'text/xml'}, get('org/apache/maven/maven/maven-metadata.xml'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml'
      assert_header({'content-type' => 'text/plain'}, get('org/apache/maven/maven/maven-metadata.xml.md5'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml.md5'
      assert_header({'content-type' => 'text/plain'}, get('org/apache/maven/maven/maven-metadata.xml.sha1'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml.sha1'
      assert_header({'content-type' => 'application/octet-stream'}, get('org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'))
      assert_cached 'org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'

      # directory should be cached locally now
      # NOTE(review): the banner-free listing is presumably served by the web
      # server from the cached directory (see the APACHE notes in the file
      # header) rather than by the CGI -- confirm for other setups.
      assert_cached 'org/apache/maven'
      assert_body does_not_contain('contents are from the upstream location'), get('org/apache/maven/')

      # refetch -- should be from remote
      assert_body contains('contents are from the upstream location'), get('org/apache/maven/?refetch')
      # the query string must survive the propagated redirect
      assert_redirect_to contains('org/apache/maven/?refetch'), get('org/apache/maven?refetch')

      # check that this is really a POM
      assert_body contains('<groupId>org.apache.maven</groupId>'), get('org/apache/maven/maven/2.0/maven-2.0.pom')
      assert_cached 'org/apache/maven/maven/2.0/maven-2.0.pom'
      
      # Now refetch and ensure that it was redownloaded
      time = current_time('org/apache/maven/maven/2.0/maven-2.0.pom')
      # Wait so that a rewritten file gets a strictly later mtime than the
      # one recorded above (presumably to allow for one-second timestamp
      # resolution).
      sleep 1
      assert_body contains('<groupId>org.apache.maven</groupId>'), get('org/apache/maven/maven/2.0/maven-2.0.pom?refetch')
      assert current_time('org/apache/maven/maven/2.0/maven-2.0.pom') > time

      # Ensure 404 is propagated
      assert_kind_of Net::HTTPClientError, get('org/apache/maven/maven/2.0/zzz-2.0.pom')
    end
  end
end