Source code for spack.util.elf

# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import bisect
import re
import struct
from collections import namedtuple
from struct import calcsize, unpack, unpack_from

ElfHeader = namedtuple(
    "ElfHeader",
    [
        "e_type",
        "e_machine",
        "e_version",
        "e_entry",
        "e_phoff",
        "e_shoff",
        "e_flags",
        "e_ehsize",
        "e_phentsize",
        "e_phnum",
        "e_shentsize",
        "e_shnum",
        "e_shstrndx",
    ],
)

SectionHeader = namedtuple(
    "SectionHeader",
    [
        "sh_name",
        "sh_type",
        "sh_flags",
        "sh_addr",
        "sh_offset",
        "sh_size",
        "sh_link",
        "sh_info",
        "sh_addralign",
        "sh_entsize",
    ],
)

ProgramHeader32 = namedtuple(
    "ProgramHeader32",
    ["p_type", "p_offset", "p_vaddr", "p_paddr", "p_filesz", "p_memsz", "p_flags", "p_align"],
)

ProgramHeader64 = namedtuple(
    "ProgramHeader64",
    ["p_type", "p_flags", "p_offset", "p_vaddr", "p_paddr", "p_filesz", "p_memsz", "p_align"],
)


[docs]class ELF_CONSTANTS:
    MAGIC = b"\x7fELF"
    CLASS32 = 1
    CLASS64 = 2
    DATA2LSB = 1
    DATA2MSB = 2
    ET_EXEC = 2
    ET_DYN = 3
    PT_LOAD = 1
    PT_DYNAMIC = 2
    PT_INTERP = 3
    DT_NULL = 0
    DT_NEEDED = 1
    DT_STRTAB = 5
    DT_SONAME = 14
    DT_RPATH = 15
    DT_RUNPATH = 29
    SHT_STRTAB = 3


[docs]class ElfFile(object):
    """Parsed ELF file."""

    __slots__ = [
        "is_64_bit",
        "is_little_endian",
        "byte_order",
        "elf_hdr",
        "pt_load",
        # pt_interp
        "has_pt_interp",
        "pt_interp_p_offset",
        "pt_interp_p_filesz",
        "pt_interp_str",
        # pt_dynamic
        "has_pt_dynamic",
        "pt_dynamic_p_offset",
        "pt_dynamic_p_filesz",
        "pt_dynamic_strtab_offset",  # string table for dynamic section
        # rpath
        "has_rpath",
        "dt_rpath_offset",
        "dt_rpath_str",
        "rpath_strtab_offset",
        "is_runpath",
        # dt needed
        "has_needed",
        "dt_needed_strtab_offsets",
        "dt_needed_strs",
        # dt soname
        "has_soname",
        "dt_soname_strtab_offset",
        "dt_soname_str",
    ]

    def __init__(self):
        self.dt_needed_strtab_offsets = []
        self.has_soname = False
        self.has_rpath = False
        self.has_needed = False
        self.pt_load = []
        self.has_pt_dynamic = False
        self.has_pt_interp = False


[docs]def parse_c_string(byte_string, start=0):
    """
    Retrieve a C-string at a given offset in a byte string

    Arguments:
        byte_string (bytes): String
        start (int): Offset into the string

    Returns:
        bytes: A copy of the C-string excluding the terminating null byte
    """
    str_end = byte_string.find(b"\0", start)
    if str_end == -1:
        raise ElfParsingError("C-string is not null terminated")
    return byte_string[start:str_end]


[docs]def read_exactly(f, num_bytes, msg):
    """
    Read exactly num_bytes at the current offset, otherwise raise
    a parsing error with the given error message.

    Arguments:
        f: file handle
        num_bytes (int): Number of bytes to read
        msg (str): Error to show when bytes cannot be read

    Returns:
        bytes: the ``num_bytes`` bytes that were read.
    """
    data = f.read(num_bytes)
    if len(data) != num_bytes:
        raise ElfParsingError(msg)
    return data


[docs]def parse_program_headers(f, elf):
    """
    Parse program headers

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
    """
    # Forward to the program header
    f.seek(elf.elf_hdr.e_phoff)

    # Here we have to make a mapping from virtual address to offset in the file.
    ProgramHeader = ProgramHeader64 if elf.is_64_bit else ProgramHeader32
    ph_fmt = elf.byte_order + ("LLQQQQQQ" if elf.is_64_bit else "LLLLLLLL")
    ph_size = calcsize(ph_fmt)
    ph_num = elf.elf_hdr.e_phnum

    # Read all program headers in one go
    data = read_exactly(f, ph_num * ph_size, "Malformed program header")

    for i in range(ph_num):
        ph = ProgramHeader._make(unpack_from(ph_fmt, data, i * ph_size))

        # Skip segments of size 0; we don't distinguish between missing segment and
        # empty segments. I've see an empty PT_DYNAMIC section for an ELF file that
        # contained debug data.
        if ph.p_filesz == 0:
            continue

        # For PT_LOAD entries: Save offsets and virtual addrs of the loaded ELF segments
        # This way we can map offsets by virtual address to offsets in the file.
        if ph.p_type == ELF_CONSTANTS.PT_LOAD:
            elf.pt_load.append((ph.p_offset, ph.p_vaddr))

        elif ph.p_type == ELF_CONSTANTS.PT_INTERP:
            elf.pt_interp_p_offset = ph.p_offset
            elf.pt_interp_p_filesz = ph.p_filesz
            elf.has_pt_interp = True

        elif ph.p_type == ELF_CONSTANTS.PT_DYNAMIC:
            elf.pt_dynamic_p_offset = ph.p_offset
            elf.pt_dynamic_p_filesz = ph.p_filesz
            elf.has_pt_dynamic = True

    # The linker sorts PT_LOAD segments by vaddr, but let's do it just to be sure, since
    # patchelf for example has a flag to leave them in an arbitrary order.
    elf.pt_load.sort(key=lambda x: x[1])


[docs]def parse_pt_interp(f, elf):
    """
    Parse the interpreter (i.e. absolute path to the dynamic linker)

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
    """
    f.seek(elf.pt_interp_p_offset)
    data = read_exactly(f, elf.pt_interp_p_filesz, "Malformed PT_INTERP entry")
    elf.pt_interp_str = parse_c_string(data)


[docs]def find_strtab_size_at_offset(f, elf, offset):
    """
    Retrieve the size of a string table section at a particular known offset

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
        offset (int): offset of the section in the file (i.e. ``sh_offset``)

    Returns:
        int: the size of the string table in bytes
    """
    section_hdr_fmt = elf.byte_order + ("LLQQQQLLQQ" if elf.is_64_bit else "LLLLLLLLLL")
    section_hdr_size = calcsize(section_hdr_fmt)
    f.seek(elf.elf_hdr.e_shoff)
    for _ in range(elf.elf_hdr.e_shnum):
        data = read_exactly(f, section_hdr_size, "Malformed section header")
        sh = SectionHeader._make(unpack(section_hdr_fmt, data))
        if sh.sh_type == ELF_CONSTANTS.SHT_STRTAB and sh.sh_offset == offset:
            return sh.sh_size

    raise ElfParsingError("Could not determine strtab size")


[docs]def retrieve_strtab(f, elf, offset):
    """
    Read a full string table at the given offset, which
    requires looking it up in the section headers.

    Arguments:
        elf (ElfFile): ELF file parser data
        vaddr (int): virtual address

    Returns:
        bytes: file offset
    """
    size = find_strtab_size_at_offset(f, elf, offset)
    f.seek(offset)
    return read_exactly(f, size, "Could not read string table")


[docs]def vaddr_to_offset(elf, vaddr):
    """
    Given a virtual address, find the corresponding offset in the ELF file itself.

    Arguments:
        elf (ElfFile): ELF file parser data
        vaddr (int): virtual address
    """
    idx = bisect.bisect_right([p_vaddr for (p_offset, p_vaddr) in elf.pt_load], vaddr) - 1
    p_offset, p_vaddr = elf.pt_load[idx]
    return p_offset - p_vaddr + vaddr


[docs]def parse_pt_dynamic(f, elf):
    """
    Parse the dynamic section of an ELF file

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parse data
    """
    dynamic_array_fmt = elf.byte_order + ("qQ" if elf.is_64_bit else "lL")
    dynamic_array_size = calcsize(dynamic_array_fmt)

    current_offset = elf.pt_dynamic_p_offset
    count_rpath = 0
    count_runpath = 0
    count_strtab = 0

    f.seek(elf.pt_dynamic_p_offset)

    # In case of broken ELF files, don't read beyond the advertized size.
    for _ in range(elf.pt_dynamic_p_filesz // dynamic_array_size):
        data = read_exactly(f, dynamic_array_size, "Malformed dynamic array entry")
        tag, val = unpack(dynamic_array_fmt, data)
        if tag == ELF_CONSTANTS.DT_NULL:
            break
        elif tag == ELF_CONSTANTS.DT_RPATH:
            count_rpath += 1
            elf.rpath_strtab_offset = val
            elf.dt_rpath_offset = current_offset
            elf.is_runpath = False
            elf.has_rpath = True
        elif tag == ELF_CONSTANTS.DT_RUNPATH:
            count_runpath += 1
            elf.rpath_strtab_offset = val
            elf.dt_rpath_offset = current_offset
            elf.is_runpath = True
            elf.has_rpath = True
        elif tag == ELF_CONSTANTS.DT_STRTAB:
            count_strtab += 1
            strtab_vaddr = val
        elif tag == ELF_CONSTANTS.DT_NEEDED:
            elf.has_needed = True
            elf.dt_needed_strtab_offsets.append(val)
        elif tag == ELF_CONSTANTS.DT_SONAME:
            elf.has_soname = True
            elf.dt_soname_strtab_offset = val
        current_offset += dynamic_array_size

    # No rpath/runpath, that happens.
    if count_rpath == count_runpath == 0:
        elf.has_rpath = False
    elif count_rpath + count_runpath != 1:
        raise ElfParsingError("Could not find a unique rpath/runpath.")

    if count_strtab != 1:
        raise ElfParsingError("Could not find a unique strtab of for the dynamic section strings")

    # Nothing to retrieve, so don't bother getting the string table.
    if not (elf.has_rpath or elf.has_soname or elf.has_needed):
        return

    elf.pt_dynamic_strtab_offset = vaddr_to_offset(elf, strtab_vaddr)
    string_table = retrieve_strtab(f, elf, elf.pt_dynamic_strtab_offset)

    if elf.has_needed:
        elf.dt_needed_strs = list(
            parse_c_string(string_table, offset) for offset in elf.dt_needed_strtab_offsets
        )

    if elf.has_soname:
        elf.dt_soname_str = parse_c_string(string_table, elf.dt_soname_strtab_offset)

    if elf.has_rpath:
        elf.dt_rpath_str = parse_c_string(string_table, elf.rpath_strtab_offset)


[docs]def parse_header(f, elf):
    # Read the 32/64 bit class independent part of the header and validate
    e_ident = f.read(16)

    # Require ELF magic bytes.
    if len(e_ident) != 16 or e_ident[:4] != ELF_CONSTANTS.MAGIC:
        raise ElfParsingError("Not an ELF file")

    # Defensively require a valid class and data.
    e_ident_class, e_ident_data = e_ident[4], e_ident[5]

    if e_ident_class not in (ELF_CONSTANTS.CLASS32, ELF_CONSTANTS.CLASS64):
        raise ElfParsingError("Invalid class found")

    if e_ident_data not in (ELF_CONSTANTS.DATA2LSB, ELF_CONSTANTS.DATA2MSB):
        raise ElfParsingError("Invalid data type")

    elf.is_64_bit = e_ident_class == ELF_CONSTANTS.CLASS64
    elf.is_little_endian = e_ident_data == ELF_CONSTANTS.DATA2LSB

    # Set up byte order and types for unpacking
    elf.byte_order = "<" if elf.is_little_endian else ">"

    # Parse the rest of the header
    elf_header_fmt = elf.byte_order + ("HHLQQQLHHHHHH" if elf.is_64_bit else "HHLLLLLHHHHHH")
    hdr_size = calcsize(elf_header_fmt)
    data = read_exactly(f, hdr_size, "ELF header malformed")
    elf.elf_hdr = ElfHeader._make(unpack(elf_header_fmt, data))


def _do_parse_elf(f, interpreter=True, dynamic_section=True):
    # We don't (yet?) allow parsing ELF files at a nonzero offset, we just
    # jump to absolute offsets as they are specified in the ELF file.
    if f.tell() != 0:
        raise ElfParsingError("Cannot parse at a nonzero offset")

    elf = ElfFile()
    parse_header(f, elf)

    # We don't handle anything but executables and shared libraries now.
    if elf.elf_hdr.e_type not in (ELF_CONSTANTS.ET_EXEC, ELF_CONSTANTS.ET_DYN):
        raise ElfParsingError("Not an ET_DYN or ET_EXEC type")

    parse_program_headers(f, elf)

    # Parse PT_INTERP section
    if interpreter and elf.has_pt_interp:
        parse_pt_interp(f, elf)

    # Parse PT_DYNAMIC section.
    if dynamic_section and elf.has_pt_dynamic and len(elf.pt_load) > 0:
        parse_pt_dynamic(f, elf)

    return elf


[docs]def parse_elf(f, interpreter=False, dynamic_section=False):
    """Given a file handle f for an ELF file opened in binary mode, return an ElfFile
    object that is stores data about rpaths"""
    try:
        return _do_parse_elf(f, interpreter, dynamic_section)
    except (DeprecationWarning, struct.error):
        # According to the docs old versions of Python can throw DeprecationWarning
        # instead of struct.error.
        raise ElfParsingError("Malformed ELF file")


[docs]def get_rpaths(path):
    """Returns list of rpaths of the given file as UTF-8 strings, or None if the file
    does not have any rpaths."""
    try:
        with open(path, "rb") as f:
            elf = parse_elf(f, interpreter=False, dynamic_section=True)
    except ElfParsingError:
        return None

    if not elf.has_rpath:
        return None

    # If it does, split the string in components
    rpath = elf.dt_rpath_str
    rpath = rpath.decode("utf-8")
    return rpath.split(":")


[docs]def replace_rpath_in_place_or_raise(path, substitutions):
    regex = re.compile(b"|".join(re.escape(p) for p in substitutions.keys()))

    try:
        with open(path, "rb+") as f:
            elf = parse_elf(f, interpreter=False, dynamic_section=True)

            # If there's no RPATH, then there's no need to replace anything.
            if not elf.has_rpath:
                return False

            # Get the non-empty rpaths. Sometimes there's a bunch of trailing
            # colons ::::: used for padding, we don't add them back to make it
            # more likely that the string doesn't grow.
            rpaths = list(filter(len, elf.dt_rpath_str.split(b":")))

            num_rpaths = len(rpaths)

            if num_rpaths == 0:
                return False

            changed = False
            for i in range(num_rpaths):
                old_rpath = rpaths[i]
                match = regex.match(old_rpath)
                if match:
                    changed = True
                    rpaths[i] = substitutions[match.group()] + old_rpath[match.end() :]

            # Nothing to replace!
            if not changed:
                return False

            new_rpath_string = b":".join(rpaths)

            pad = len(elf.dt_rpath_str) - len(new_rpath_string)

            if pad < 0:
                raise ElfDynamicSectionUpdateFailed(elf.dt_rpath_str, new_rpath_string)

            # We zero out the bits we shortened because (a) it should be a
            # C-string and (b) it's nice not to have spurious parts of old
            # paths in the output of `strings file`. Note that we're all
            # good when pad == 0; the original terminating null is used.
            new_rpath_string += b"\x00" * pad

            # The rpath is at a given offset in the string table used by the
            # dynamic section.
            rpath_offset = elf.pt_dynamic_strtab_offset + elf.rpath_strtab_offset

            f.seek(rpath_offset)
            f.write(new_rpath_string)
            return True

    except ElfParsingError:
        # This just means the file wasnt an elf file, so there's no point
        # in updating its rpath anyways; ignore this problem.
        return False


[docs]class ElfDynamicSectionUpdateFailed(Exception):
    def __init__(self, old, new):
        self.old = old
        self.new = new
        super(ElfDynamicSectionUpdateFailed, self).__init__(
            "New rpath {} is longer than old rpath {}".format(
                new.decode("utf-8"), old.decode("utf-8")
            )
        )


[docs]class ElfParsingError(Exception):
    pass