# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""
Utility functions for parsing, formatting, and manipulating URLs.
"""
import itertools
import os
import posixpath
import sys
import urllib.parse
import urllib.request
from llnl.path import convert_to_posix_path
from spack.util.path import sanitize_filename
def validate_scheme(scheme):
    """Check whether *scheme* is a URL scheme Spack generally understands.

    Used mainly to tell apart paths and URLs: a Windows path such as
    C:/x/y/z can parse as a URL with scheme ``C`` and path ``/x/y/z``,
    which this predicate rejects.
    """
    known_schemes = {"file", "http", "https", "ftp", "s3", "gs", "ssh", "git"}
    return scheme in known_schemes
def _split_all(path):
"""Split path into its atomic components.
Returns the shortest list, L, of strings such that posixpath.join(*L) ==
path and posixpath.split(element) == ('', element) for every element in L
except possibly the first. This first element may possibly have the value
of '/'.
"""
result = []
a = path
old_a = None
while a != old_a:
(old_a, (a, b)) = a, posixpath.split(a)
if a or b:
result.insert(0, b or "/")
return result
def local_file_path(url):
    """Return the local filesystem path referenced by *url*.

    ``url`` may be a string or an already-parsed URL object. For a
    ``file://`` URL the absolute path of the referenced file or
    directory is returned; for any other scheme the result is ``None``.
    """
    parsed = urllib.parse.urlparse(url) if isinstance(url, str) else url
    if parsed.scheme != "file":
        return None
    return urllib.request.url2pathname(parsed.path)
def path_to_file_url(path):
    """Convert a local filesystem *path* into a ``file:`` URL.

    A relative path is first made absolute with respect to the current
    working directory.
    """
    absolute = path if os.path.isabs(path) else os.path.abspath(path)
    return urllib.parse.urljoin("file:", urllib.request.pathname2url(absolute))
def file_url_string_to_path(url):
    """Convert a ``file:`` URL string back into a local filesystem path."""
    parsed = urllib.parse.urlparse(url)
    return urllib.request.url2pathname(parsed.path)
def is_path_instead_of_url(path_or_url):
    """Historically some config files and spack commands used paths
    where urls should be used. This utility can be used to validate
    and promote paths to urls.

    Returns True when *path_or_url* looks like a filesystem path rather
    than a URL.
    """
    scheme = urllib.parse.urlparse(path_or_url).scheme
    # On non-Windows, no scheme means it's likely a path
    if not sys.platform == "win32":
        return not scheme
    # On Windows a drive letter parses as a one-character scheme, which
    # urlparse lowercases ("C:/x/y" -> scheme "c"), so comparing against
    # uppercase bounds ("A" <= scheme <= "Z") could never match; and an
    # empty scheme (a plain relative path) must also count as a path.
    # Every real URL scheme Spack knows is at least two characters long.
    return len(scheme) <= 1
def join(base_url, path, *extra, **kwargs):
    """Joins a base URL with one or more local URL path components

    If resolve_href is True, treat the base URL as though it where the locator
    of a web page, and the remaining URL path components as though they formed
    a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
    The result is an absolute URL to the resource to which a user's browser
    would navigate if they clicked on a link with an "href" attribute equal to
    the relative URL.

    If resolve_href is False (default), then the URL path components are joined
    as in posixpath.join().

    Note: file:// URL path components are not canonicalized as part of this
    operation.  To canonicalize, pass the joined url to format().

    Examples:
        base_url = 's3://bucket/index.html'
        body = fetch_body(prefix)
        link = get_href(body)  # link == '../other-bucket/document.txt'

        # wrong - link is a local URL that needs to be resolved against base_url
        spack.util.url.join(base_url, link)
        's3://bucket/other_bucket/document.txt'

        # correct - resolve local URL against base_url
        spack.util.url.join(base_url, link, resolve_href=True)
        's3://other_bucket/document.txt'

        prefix = 'https://mirror.spack.io/build_cache'

        # wrong - prefix is just a URL prefix
        spack.util.url.join(prefix, 'my-package', resolve_href=True)
        'https://mirror.spack.io/my-package'

        # correct - simply append additional URL path components
        spack.util.url.join(prefix, 'my-package', resolve_href=False)  # default
        'https://mirror.spack.io/build_cache/my-package'

        # For canonicalizing file:// URLs, take care to explicitly differentiate
        # between absolute and relative join components.
    """
    # Accept both strings and already-parsed URL objects in any position.
    paths = [
        (x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
    ]
    paths = [convert_to_posix_path(x) for x in paths]
    n = len(paths)
    last_abs_component = None
    scheme = ""
    # Scan backwards: any absolute component (with a scheme, or starting
    # with "/") discards everything before it.
    for i in range(n - 1, -1, -1):
        obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
        scheme = obj.scheme
        # in either case the component is absolute
        if scheme or obj.path.startswith("/"):
            if not scheme:
                # Without a scheme, we have to go back looking for the
                # next-last component that specifies a scheme.
                for j in range(i - 1, -1, -1):
                    obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
                    if obj.scheme:
                        # Borrow the earlier component's scheme (and netloc,
                        # except for s3 where the netloc is the bucket and
                        # must not be prepended) onto the absolute path.
                        paths[i] = "{SM}://{NL}{PATH}".format(
                            SM=obj.scheme,
                            NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
                            PATH=paths[i][1:],
                        )
                        break
            last_abs_component = i
            break
    if last_abs_component is not None:
        # Drop every component before the last absolute one.
        paths = paths[last_abs_component:]
        if len(paths) == 1:
            # A single remaining component: no scheme means it is a bare
            # absolute path, which defaults to a file:// URL.
            result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
            # another subtlety: If the last argument to join() is an absolute
            # file:// URL component with a relative path, the relative path
            # needs to be resolved.
            if result.scheme == "file" and result.netloc:
                result = urllib.parse.ParseResult(
                    scheme=result.scheme,
                    netloc="",
                    path=posixpath.abspath(result.netloc + result.path),
                    params=result.params,
                    query=result.query,
                    fragment=None,
                )
            return result.geturl()
    # Two or more components remain: delegate the actual path joining.
    return _join(*paths, **kwargs)
def _join(base_url, path, *extra, **kwargs):
    """Join *path* and *extra* components onto the parsed *base_url*.

    Helper for join(); *base_url* is guaranteed by the caller to carry a
    scheme.  Honors the same ``resolve_href`` keyword as join().
    """
    base_url = urllib.parse.urlparse(base_url)
    resolve_href = kwargs.get("resolve_href", False)
    (scheme, netloc, base_path, params, query, _) = base_url
    scheme = scheme.lower()
    # Flatten all join arguments into individual path tokens, dropping
    # empty components and bare "/" roots.
    path_tokens = [
        part
        for part in itertools.chain(
            _split_all(path),
            itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
        )
        if part and part != "/"
    ]
    # Join under a synthetic "/fake-root" prefix so posixpath semantics
    # apply; it is stripped again by the relpath() call below.
    base_path_args = ["/fake-root"]
    if scheme == "s3":
        # For s3 the netloc is the bucket name and joins as a path piece.
        if netloc:
            base_path_args.append(netloc)
    if base_path.startswith("/"):
        base_path = base_path[1:]
    base_path_args.append(base_path)
    if resolve_href:
        # href resolution: the last component of the base path is treated
        # as a document name and dropped before appending.
        new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
        base_path_args = [new_base_path]
    base_path_args.extend(path_tokens)
    base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")
    if scheme == "s3":
        # Re-extract the bucket: the first token of the joined path
        # becomes the netloc again.
        path_tokens = [part for part in _split_all(base_path) if part and part != "/"]
        if path_tokens:
            netloc = path_tokens.pop(0)
            base_path = posixpath.join("", *path_tokens)
    if sys.platform == "win32":
        base_path = convert_to_posix_path(base_path)
    # NOTE(review): `format` here is presumably this module's URL
    # canonicalizer (referenced in join()'s docstring), shadowing the
    # builtin — confirm against the rest of the file.
    return format(
        urllib.parse.ParseResult(
            scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
        )
    )
def default_download_filename(url: str) -> str:
    """This method computes a default file name for a given URL.
    Note that it makes no request, so this is not the same as the
    option curl -O, which uses the remote file name from the response
    header."""
    parsed_url = urllib.parse.urlparse(url)
    # Only use the last path component + params + query + fragment
    name = urllib.parse.urlunparse(
        parsed_url._replace(scheme="", netloc="", path=posixpath.basename(parsed_url.path))
    )
    valid_name = sanitize_filename(name)
    # Don't download to hidden files please. Use startswith() rather than
    # indexing valid_name[0], which raised IndexError when sanitization
    # yields an empty name (e.g. a URL whose path ends in "/" and has no
    # params, query, or fragment).
    if valid_name.startswith("."):
        valid_name = "_" + valid_name[1:]
    return valid_name