From cfd685d0e2b49a2ea0a4f5f179d8f3d852b60caa Mon Sep 17 00:00:00 2001
From: Jeff Hobbs <hobbs@users.sourceforge.net>
Date: Tue, 19 Sep 2006 22:00:39 +0000
Subject: [PATCH] 	* library/httpvfs.tcl (vfs::http::urlparse): add
 method to 	deconstruct the url using RFC 3986 semantics. 
 (vfs::http::Mount): add support for HTTP basic auth if a user was 	passed
 in the url

---
 ChangeLog           |   7 ++
 library/httpvfs.tcl | 270 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 229 insertions(+), 48 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cbd3ad4..d001120 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2006-09-19  Jeff Hobbs  <jeffh@ActiveState.com>
+
+	* library/httpvfs.tcl (vfs::http::urlparse): add method to
+	deconstruct the url using RFC 3986 semantics.
+	(vfs::http::Mount): add support for HTTP basic auth if a user was
+	passed in the url
+
 2006-09-15  Jeff Hobbs  <jeffh@ActiveState.com>
 
 	* library/pkgIndex.tcl: bump vfs::http to 0.6
diff --git a/library/httpvfs.tcl b/library/httpvfs.tcl
index 3717ec3..5dd5942 100644
--- a/library/httpvfs.tcl
+++ b/library/httpvfs.tcl
@@ -4,13 +4,14 @@ package provide vfs::http 0.6
 package require vfs 1.0
 package require http
 
-# This works for basic operations, but has not been very debugged.
+# This works for basic operations, using http GET and HEAD requests
+# to serve http data in a read-only file system.
 
 namespace eval vfs::http {
     # Allow for options when mounting an http URL
     variable options
     # -urlencode means automatically parse "foo/my file (2).txt" as
-    # "foo/my%20file%20%282%29.txt", as per RFC 3986, for the user.
+    # "foo/my%20file%20%282%29.txt", per RFC 3986, for the user.
     set options(-urlencode) 1
     # -urlparse would further parse URLs for ? (query string) and # (anchor)
     # components, leaving those unencoded. Only works when -urlencode is true.
@@ -22,6 +23,8 @@ proc vfs::http::Mount {dirurl local args} {
     variable options
     foreach {key val} $args {
 	# only do exact option name matching for now
+	# We could consider allowing general http options here,
+	# but those would be per-mount
 	if {[info exists options($key)]} {
 	    # currently only boolean values
 	    if {![string is boolean -strict $val]} {
@@ -30,62 +33,234 @@ proc vfs::http::Mount {dirurl local args} {
 	    set options($key) $val
 	}
     }
-    if {[string index $dirurl end] ne "/"} {
-	append dirurl "/"
-    }
-    if {[string match "http://*" $dirurl]} {
-	set rest [string range $dirurl 7 end]
-    } else {
-	set rest $dirurl
-	set dirurl "http://${dirurl}"
-    }
 
-    if {![regexp {(([^:]*)(:([^@]*))?@)?([^/]*)(/(.*/)?([^/]*))?$} $rest \
-	      junk junk user junk pass host junk path file]} {
-	return -code error "unable to parse url \"$dirurl\""
-    }
+    # Break the url into parts, verifying url
+    array set parts [urlparse $dirurl]
 
-    if {[string length $file]} {
-	return -code error "Can only mount directories, not\
-	  files (perhaps you need a trailing '/' - I understood\
-	  a path '$path' and file '$file')"
+    if {[info exists parts(query)] || [info exists parts(anchor)]} {
+	return -code error "invalid url \"$dirurl\":\
+		no query string or anchor fragments allowed"
     }
 
-    if {$user eq ""} {
-	set user anonymous
+    if {[info exists parts(user)]} {
+	# At this point we need base64 for HTTP Basic AUTH
+	package require base64
+	foreach {user passwd} [split $parts(user) :] { break }
+	set auth "Basic [base64::encode $user:$passwd]"
+	set headers [list Authorization $auth]
+    } else {
+	set headers ""
     }
 
-    set token [::http::geturl $dirurl -validate 1]
+    set token [::http::geturl $parts(url) -validate 1 -headers $headers]
     http::wait $token
     set status [http::status $token]
     http::cleanup $token
     if {$status ne "ok"} {
 	# we'll take whatever http agrees is "ok"
-	return -code error "received status \"$status\" for \"$dirurl\""
+	return -code error "received status \"$status\" for \"$parts(url)\""
     }
 
-    if {![catch {vfs::filesystem info $dirurl}]} {
+    # Add a / to make sure the url and names are clearly separated later
+    if {[string index $parts(url) end] ne "/"} {
+	append parts(url) "/"
+    }
+
+    if {![catch {vfs::filesystem info $parts(url)}]} {
 	# unmount old mount
-	::vfs::log "ftp-vfs: unmounted old mount point at $dirurl"
-	vfs::unmount $dirurl
+	::vfs::log "ftp-vfs: unmounted old mount point at $parts(url)"
+	vfs::unmount $parts(url)
     }
-    ::vfs::log "http $host, $path mounted at $local"
-    vfs::filesystem mount $local [list vfs::http::handler $dirurl $path]
-    # Register command to unmount
-    vfs::RegisterMount $local [list ::vfs::http::Unmount $dirurl]
-    return $dirurl
+    ::vfs::log "http $dirurl ($parts(url)) mounted at $local"
+    # Pass headers along as they may include authentication
+    vfs::filesystem mount $local \
+	[list vfs::http::handler $parts(url) $headers $parts(file)]
+    # Register command to unmount - headers not needed
+    vfs::RegisterMount $local [list ::vfs::http::Unmount $parts(url)]
+    return $parts(url)
 }
 
-proc vfs::http::Unmount {dirurl local} {
+proc vfs::http::Unmount {url local} {
     vfs::filesystem unmount $local
 }
 
-proc vfs::http::handler {dirurl path cmd root relative actualpath args} {
+proc vfs::http::handler {url headers path cmd root relative actualpath args} {
     if {$cmd eq "matchindirectory"} {
-	eval [linsert $args 0 $cmd $dirurl $relative $actualpath]
+	eval [linsert $args 0 $cmd $url $headers $relative $actualpath]
+    } else {
+	eval [linsert $args 0 $cmd $url $headers $relative]
+    }
+}
+
+proc vfs::http::urlparse {url} {
+    # Taken from http 2.5.3
+
+    # Validate URL by parts.  We suck out user:pass if it exists as the
+    # core http package does not automate HTTP Basic Auth yet.
+
+    # Returns data in [array get] format.  The url, host and file keys are
+    # guaranteed to exist.  proto, port, query, anchor, and user should be
+    # checked with [info exists]. (user may contain password)
+
+    # URLs have basically four parts.
+    # First, before the colon, is the protocol scheme (e.g. http)
+    # Second, for HTTP-like protocols, is the authority
+    #	The authority is preceded by // and lasts up to (but not including)
+    #	the following / and it identifies up to four parts, of which only one,
+    #	the host, is required (if an authority is present at all). All other
+    #	parts of the authority (user name, password, port number) are optional.
+    # Third is the resource name, which is split into two parts at a ?
+    #	The first part (from the single "/" up to "?") is the path, and the
+    #	second part (from that "?" up to "#") is the query. *HOWEVER*, we do
+    #	not need to separate them; we send the whole lot to the server.
+    # Fourth is the fragment identifier, which is everything after the first
+    #	"#" in the URL. The fragment identifier MUST NOT be sent to the server
+    #	and indeed, we don't bother to validate it (it could be an error to
+    #	pass it in here, but it's cheap to strip).
+    #
+    # An example of a URL that has all the parts:
+    #   http://jschmoe:xyzzy@www.bogus.net:8000/foo/bar.tml?q=foo#changes
+    # The "http" is the protocol, the user is "jschmoe", the password is
+    # "xyzzy", the host is "www.bogus.net", the port is "8000", the path is
+    # "/foo/bar.tml", the query is "q=foo", and the fragment is "changes".
+    #
+    # Note that the RE actually combines the user and password parts, as
+    # recommended in RFC 3986. Indeed, that RFC states that putting passwords
+    # in URLs is a Really Bad Idea, something with which I would agree utterly.
+    # Also note that we do not currently support IPv6 addresses.
+    #
+    # From a validation perspective, we need to ensure that the parts of the
+    # URL that are going to the server are correctly encoded.
+
+    set URLmatcher {(?x)		# this is _expanded_ syntax
+	^
+	(?: (\w+) : ) ?			# <protocol scheme>
+	(?: //
+	    (?:
+		(
+		    [^@/\#?]+		# <userinfo part of authority>
+		) @
+	    )?
+	    ( [^/:\#?]+ )		# <host part of authority>
+	    (?: : (\d+) )?		# <port part of authority>
+	)?
+	( / [^\#?]* (?: \? [^\#?]* )?)?	# <path> (including query)
+	(?: \# (.*) )?			# <fragment> (aka anchor)
+	$
+    }
+
+    # Phase one: parse
+    if {![regexp -- $URLmatcher $url -> proto user host port srvurl anchor]} {
+	unset $token
+	return -code error "Unsupported URL: $url"
+    }
+    # Phase two: validate
+    if {$host eq ""} {
+	# Caller has to provide a host name; we do not have a "default host"
+	# that would enable us to handle relative URLs.
+	unset $token
+	return -code error "Missing host part: $url"
+	# Note that we don't check the hostname for validity here; if it's
+	# invalid, we'll simply fail to resolve it later on.
+    }
+    if {$port ne "" && $port>65535} {
+	unset $token
+	return -code error "Invalid port number: $port"
+    }
+    # The user identification and resource identification parts of the URL can
+    # have encoded characters in them; take care!
+    if {$user ne ""} {
+	# Check for validity according to RFC 3986, Appendix A
+	set validityRE {(?xi)
+	    ^
+	    (?: [-\w.~!$&'()*+,;=:] | %[0-9a-f][0-9a-f] )+
+	    $
+	}
+	if {![regexp -- $validityRE $user]} {
+	    unset $token
+	    # Provide a better error message in this error case
+	    if {[regexp {(?i)%(?![0-9a-f][0-9a-f]).?.?} $user bad]} {
+		return -code error \
+			"Illegal encoding character usage \"$bad\" in URL user"
+	    }
+	    return -code error "Illegal characters in URL user"
+	}
+    }
+    if {$srvurl ne ""} {
+	# Check for validity according to RFC 3986, Appendix A
+	set validityRE {(?xi)
+	    ^
+	    # Path part (already must start with / character)
+	    (?:	      [-\w.~!$&'()*+,;=:@/]  | %[0-9a-f][0-9a-f] )*
+	    # Query part (optional, permits ? characters)
+	    (?: \? (?: [-\w.~!$&'()*+,;=:@/?] | %[0-9a-f][0-9a-f] )* )?
+	    $
+	}
+	if {![regexp -- $validityRE $srvurl]} {
+	    unset $token
+	    # Provide a better error message in this error case
+	    if {[regexp {(?i)%(?![0-9a-f][0-9a-f])..} $srvurl bad]} {
+		return -code error \
+			"Illegal encoding character usage \"$bad\" in URL path"
+	    }
+	    return -code error "Illegal characters in URL path"
+	}
     } else {
-	eval [linsert $args 0 $cmd $dirurl $relative]
+	set srvurl /
+    }
+    if {$proto eq ""} {
+	set proto http
+    } else {
+	set result(proto) $proto
+    }
+
+    # Here we vary from core http
+
+    # vfs::http - we only support http at this time.  Perhaps https later?
+    if {$proto ne "http"} {
+	return -code error "Unsupported URL type \"$proto\""
     }
+
+    # OK, now reassemble into a full URL, with result containing the
+    # parts that exist and will be returned to the user
+    array set result {}
+    set url ${proto}://
+    if {$user ne ""} {
+	set result(user) $user
+	# vfs::http will do HTTP basic auth on their existence,
+	# but we pass these through as they are innocuous
+	append url $user
+	append url @
+    }
+    append url $host
+    set result(host) $host
+    if {$port ne ""} {
+	# don't bother with adding default port
+	append url : $port
+	set result(port) $port
+    }
+    append url $srvurl
+    if {$anchor ne ""} {
+	# XXX: Don't append see the anchor, as it is generally a client-side
+	# XXX: item.  The user can add it back if they want.
+	#append url \# $anchor
+	set result(anchor) $anchor
+    }
+
+    set idx [string first ? $srvurl]
+    if {$idx >= 0} {
+	set query [string range [expr {$idx+1}] end]
+	set file  [string range 0 [expr {$idx-1}]]
+	set result(file) $file
+	set result(query) $query
+    } else {
+	set result(file) $srvurl
+    }
+
+    set result(url) $url
+
+    # return array format list of items
+    return [array get result]
 }
 
 proc vfs::http::urlname {name} {
@@ -116,7 +291,7 @@ proc vfs::http::urlname {name} {
 # If we implement the commands below, we will have a perfect
 # virtual file system for remote http sites.
 
-proc vfs::http::stat {dirurl name} {
+proc vfs::http::stat {dirurl headers name} {
     set urlname [urlname $name]
     ::vfs::log "stat $name ($urlname)"
 
@@ -124,7 +299,7 @@ proc vfs::http::stat {dirurl name} {
     # as a file (not a directory) since with http, even directories
     # really behave as the index.html they contain.
 
-    set token [::http::geturl "$dirurl$urlname" -validate 1]
+    set token [::http::geturl "$dirurl$urlname" -validate 1 -headers $headers]
     http::wait $token
     set ncode [http::ncode $token]
     if {$ncode == 404 || [http::status $token] ne "ok"} {
@@ -143,7 +318,7 @@ proc vfs::http::stat {dirurl name} {
     return $res
 }
 
-proc vfs::http::access {dirurl name mode} {
+proc vfs::http::access {dirurl headers name mode} {
     set urlname [urlname $name]
     ::vfs::log "access $name $mode ($urlname)"
     if {$mode & 2} {
@@ -151,7 +326,7 @@ proc vfs::http::access {dirurl name mode} {
 	return -code error "read-only"
     }
     if {$name == ""} { return 1 }
-    set token [::http::geturl "$dirurl$urlname" -validate 1]
+    set token [::http::geturl "$dirurl$urlname" -validate 1 -headers $headers]
     http::wait $token
     set ncode [http::ncode $token]
     if {$ncode == 404 || [http::status $token] ne "ok"} {
@@ -169,7 +344,7 @@ proc vfs::http::access {dirurl name mode} {
 
 # We've chosen to implement these channels by using a memchan.
 # The alternative would be to use temporary files.
-proc vfs::http::open {dirurl name mode permissions} {
+proc vfs::http::open {dirurl headers name mode permissions} {
     set urlname [urlname $name]
     ::vfs::log "open $name $mode $permissions ($urlname)"
     # return a list of two elements:
@@ -179,7 +354,7 @@ proc vfs::http::open {dirurl name mode permissions} {
     switch -glob -- $mode {
 	"" -
 	"r" {
-	    set token [::http::geturl "$dirurl$urlname"]
+	    set token [::http::geturl "$dirurl$urlname" -headers $headers]
 
 	    set filed [vfs::memchan]
 	    fconfigure $filed -translation binary
@@ -202,7 +377,7 @@ proc vfs::http::open {dirurl name mode permissions} {
     }
 }
 
-proc vfs::http::matchindirectory {dirurl path actualpath pattern type} {
+proc vfs::http::matchindirectory {dirurl headers path actualpath pattern type} {
     ::vfs::log "matchindirectory $path $pattern $type"
     set res [list]
 
@@ -218,22 +393,22 @@ proc vfs::http::matchindirectory {dirurl path actualpath pattern type} {
     return $res
 }
 
-proc vfs::http::createdirectory {dirurl name} {
+proc vfs::http::createdirectory {dirurl headers name} {
     ::vfs::log "createdirectory $name"
     vfs::filesystem posixerror $::vfs::posix(EROFS)
 }
 
-proc vfs::http::removedirectory {dirurl name recursive} {
+proc vfs::http::removedirectory {dirurl headers name recursive} {
     ::vfs::log "removedirectory $name"
     vfs::filesystem posixerror $::vfs::posix(EROFS)
 }
 
-proc vfs::http::deletefile {dirurl name} {
+proc vfs::http::deletefile {dirurl headers name} {
     ::vfs::log "deletefile $name"
     vfs::filesystem posixerror $::vfs::posix(EROFS)
 }
 
-proc vfs::http::fileattributes {dirurl path args} {
+proc vfs::http::fileattributes {dirurl headers path args} {
     ::vfs::log "fileattributes $args"
     switch -- [llength $args] {
 	0 {
@@ -253,7 +428,6 @@ proc vfs::http::fileattributes {dirurl path args} {
     }
 }
 
-proc vfs::http::utime {dirurl path actime mtime} {
+proc vfs::http::utime {dirurl headers path actime mtime} {
     vfs::filesystem posixerror $::vfs::posix(EROFS)
 }
-
-- 
2.23.0