Source code for luigi.contrib.gcs

# -*- coding: utf-8 -*-
#
# Copyright 2015 Twitter Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""luigi bindings for Google Cloud Storage"""

import io
import logging
import mimetypes
import os
import tempfile
import time
from urllib.parse import urlsplit
from io import BytesIO

from tenacity import retry
from tenacity import retry_if_exception
from tenacity import retry_if_exception_type
from tenacity import wait_exponential
from tenacity import stop_after_attempt
from tenacity import after_log
from luigi.contrib import gcp
import luigi.target
from luigi.format import FileWrapper

logger = logging.getLogger('luigi-interface')

# Retry when the following errors occur
RETRYABLE_ERRORS = None

try:
    import httplib2

    from googleapiclient import errors
    from googleapiclient import discovery
    from googleapiclient import http
except ImportError:
    logger.warning("Loading GCS module without the python packages googleapiclient & google-auth. \
        This will crash at runtime if GCS functionality is used.")
else:
    RETRYABLE_ERRORS = (httplib2.HttpLib2Error, IOError)

# Number of bytes to send/receive in each request.
CHUNKSIZE = 10 * 1024 * 1024

# Mimetype to use if one can't be guessed from the file extension.
DEFAULT_MIMETYPE = 'application/octet-stream'

# Time to sleep while waiting for eventual consistency to finish.
EVENTUAL_CONSISTENCY_SLEEP_INTERVAL = 0.1

# Maximum number of sleeps for eventual consistency.
EVENTUAL_CONSISTENCY_MAX_SLEEPS = 300

# URI for batch requests
GCS_BATCH_URI = 'https://storage.googleapis.com/batch/storage/v1'


# Retry configurations. For more details, see https://tenacity.readthedocs.io/en/latest/
def is_error_5xx(err):
    return isinstance(err, errors.HttpError) and err.resp.status >= 500


gcs_retry = retry(retry=(retry_if_exception(is_error_5xx) | retry_if_exception_type(RETRYABLE_ERRORS)),
                  wait=wait_exponential(multiplier=1, min=1, max=10),
                  stop=stop_after_attempt(5),
                  reraise=True,
                  after=after_log(logger, logging.WARNING))


def _wait_for_consistency(checker):
    """Eventual consistency: wait until GCS reports something is true.

    This is necessary for e.g. create/delete, where the operation might return,
    but won't be reflected for a bit.
    """
    for _ in range(EVENTUAL_CONSISTENCY_MAX_SLEEPS):
        if checker():
            return

        time.sleep(EVENTUAL_CONSISTENCY_SLEEP_INTERVAL)

    logger.warning('Exceeded wait for eventual GCS consistency - this may be a '
                   'bug in the library or something is terribly wrong.')
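

# Illustrative sketch, not part of the original module: the pattern used
# throughout this file is to decorate flaky GCS calls with ``gcs_retry`` and
# then poll with ``_wait_for_consistency`` until the change is actually
# visible. The service object, bucket and object names below are hypothetical.
def _example_delete_and_wait(service, bucket, obj):
    @gcs_retry  # retries 5xx HttpErrors and transient transport errors with exponential backoff
    def _delete():
        service.objects().delete(bucket=bucket, object=obj).execute()

    @gcs_retry
    def _exists():
        try:
            service.objects().get(bucket=bucket, object=obj).execute()
            return True
        except errors.HttpError as ex:
            if ex.resp['status'] == '404':
                return False
            raise

    _delete()
    # The delete may not be reflected immediately; poll until GCS agrees.
    _wait_for_consistency(lambda: not _exists())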


class InvalidDeleteException(luigi.target.FileSystemException):
    pass


class GCSClient(luigi.target.FileSystem):
    """An implementation of a FileSystem over Google Cloud Storage.

    There are several ways to use this class. By default it will use the app
    default credentials, as described at
    https://developers.google.com/identity/protocols/application-default-credentials .
    Alternatively, you may pass a google-auth credentials object. e.g. to use a service account::

        credentials = google.auth.jwt.Credentials.from_service_account_info(
            '012345678912-ThisIsARandomServiceAccountEmail@developer.gserviceaccount.com',
            'These are the contents of the p12 file that came with the service account',
            scope='https://www.googleapis.com/auth/devstorage.read_write')
        client = GCSClient(oauth_credentials=credentials)

    The chunksize parameter specifies how much data to transfer when downloading
    or uploading files.

    .. warning::
      By default this class will use "automated service discovery" which will require
      a connection to the web. The google api client downloads a JSON file to "create" the
      library interface on the fly. If you want a more hermetic build, you can pass the
      contents of this file (currently found at
      https://www.googleapis.com/discovery/v1/apis/storage/v1/rest )
      as the ``descriptor`` argument.
    """

    def __init__(self, oauth_credentials=None, descriptor='', http_=None,
                 chunksize=CHUNKSIZE, **discovery_build_kwargs):
        self.chunksize = chunksize
        authenticate_kwargs = gcp.get_authenticate_kwargs(oauth_credentials, http_)

        build_kwargs = authenticate_kwargs.copy()
        build_kwargs.update(discovery_build_kwargs)

        if descriptor:
            self.client = discovery.build_from_document(descriptor, **build_kwargs)
        else:
            build_kwargs.setdefault('cache_discovery', False)
            self.client = discovery.build('storage', 'v1', **build_kwargs)

    def _path_to_bucket_and_key(self, path):
        (scheme, netloc, path, _, _) = urlsplit(path)
        assert scheme == 'gs'
        path_without_initial_slash = path[1:]
        return netloc, path_without_initial_slash

    def _is_root(self, key):
        return len(key) == 0 or key == '/'

    def _add_path_delimiter(self, key):
        return key if key[-1:] == '/' else key + '/'

    @gcs_retry
    def _obj_exists(self, bucket, obj):
        try:
            self.client.objects().get(bucket=bucket, object=obj).execute()
        except errors.HttpError as ex:
            if ex.resp['status'] == '404':
                return False
            raise
        else:
            return True

    def _list_iter(self, bucket, prefix):
        request = self.client.objects().list(bucket=bucket, prefix=prefix)
        response = request.execute()

        while response is not None:
            for it in response.get('items', []):
                yield it

            request = self.client.objects().list_next(request, response)
            if request is None:
                break

            response = request.execute()

    @gcs_retry
    def _do_put(self, media, dest_path):
        bucket, obj = self._path_to_bucket_and_key(dest_path)

        request = self.client.objects().insert(bucket=bucket, name=obj, media_body=media)
        if not media.resumable():
            return request.execute()

        response = None
        while response is None:
            status, response = request.next_chunk()
            if status:
                logger.debug('Upload progress: %.2f%%', 100 * status.progress())

        _wait_for_consistency(lambda: self._obj_exists(bucket, obj))
        return response

    def exists(self, path):
        bucket, obj = self._path_to_bucket_and_key(path)
        if self._obj_exists(bucket, obj):
            return True

        return self.isdir(path)

    def isdir(self, path):
        bucket, obj = self._path_to_bucket_and_key(path)
        if self._is_root(obj):
            try:
                self.client.buckets().get(bucket=bucket).execute()
            except errors.HttpError as ex:
                if ex.resp['status'] == '404':
                    return False
                raise

        obj = self._add_path_delimiter(obj)
        if self._obj_exists(bucket, obj):
            return True

        # Any objects with this prefix
        resp = self.client.objects().list(bucket=bucket, prefix=obj, maxResults=20).execute()
        lst = next(iter(resp.get('items', [])), None)
        return bool(lst)

    def remove(self, path, recursive=True):
        (bucket, obj) = self._path_to_bucket_and_key(path)

        if self._is_root(obj):
            raise InvalidDeleteException(
                'Cannot delete root of bucket at path {}'.format(path))

        if self._obj_exists(bucket, obj):
            self.client.objects().delete(bucket=bucket, object=obj).execute()
            _wait_for_consistency(lambda: not self._obj_exists(bucket, obj))
            return True

        if self.isdir(path):
            if not recursive:
                raise InvalidDeleteException(
                    'Path {} is a directory. Must use recursive delete'.format(path))

            req = http.BatchHttpRequest(batch_uri=GCS_BATCH_URI)
            for it in self._list_iter(bucket, self._add_path_delimiter(obj)):
                req.add(self.client.objects().delete(bucket=bucket, object=it['name']))
            req.execute()

            _wait_for_consistency(lambda: not self.isdir(path))
            return True

        return False

    def put(self, filename, dest_path, mimetype=None, chunksize=None):
        chunksize = chunksize or self.chunksize
        resumable = os.path.getsize(filename) > 0

        mimetype = mimetype or mimetypes.guess_type(dest_path)[0] or DEFAULT_MIMETYPE
        media = http.MediaFileUpload(filename, mimetype=mimetype, chunksize=chunksize, resumable=resumable)

        self._do_put(media, dest_path)

    def _forward_args_to_put(self, kwargs):
        return self.put(**kwargs)

    def put_multiple(self, filepaths, remote_directory, mimetype=None, chunksize=None, num_process=1):
        if isinstance(filepaths, str):
            raise ValueError(
                'filenames must be a list of strings. If you want to put a single file, '
                'use the `put(self, filename, ...)` method'
            )

        put_kwargs_list = [
            {
                'filename': filepath,
                'dest_path': os.path.join(remote_directory, os.path.basename(filepath)),
                'mimetype': mimetype,
                'chunksize': chunksize,
            }
            for filepath in filepaths
        ]

        if num_process > 1:
            from multiprocessing import Pool
            from contextlib import closing
            with closing(Pool(num_process)) as p:
                return p.map(self._forward_args_to_put, put_kwargs_list)
        else:
            for put_kwargs in put_kwargs_list:
                self._forward_args_to_put(put_kwargs)

    def put_string(self, contents, dest_path, mimetype=None):
        mimetype = mimetype or mimetypes.guess_type(dest_path)[0] or DEFAULT_MIMETYPE
        assert isinstance(mimetype, str)
        if not isinstance(contents, bytes):
            contents = contents.encode("utf-8")
        media = http.MediaIoBaseUpload(BytesIO(contents), mimetype, resumable=bool(contents))
        self._do_put(media, dest_path)

    def mkdir(self, path, parents=True, raise_if_exists=False):
        if self.exists(path):
            if raise_if_exists:
                raise luigi.target.FileAlreadyExists()
            elif not self.isdir(path):
                raise luigi.target.NotADirectory()
            else:
                return

        self.put_string(b"", self._add_path_delimiter(path), mimetype='text/plain')

    def copy(self, source_path, destination_path):
        src_bucket, src_obj = self._path_to_bucket_and_key(source_path)
        dest_bucket, dest_obj = self._path_to_bucket_and_key(destination_path)

        if self.isdir(source_path):
            src_prefix = self._add_path_delimiter(src_obj)
            dest_prefix = self._add_path_delimiter(dest_obj)

            source_path = self._add_path_delimiter(source_path)
            copied_objs = []
            for obj in self.listdir(source_path):
                suffix = obj[len(source_path):]

                self.client.objects().copy(
                    sourceBucket=src_bucket,
                    sourceObject=src_prefix + suffix,
                    destinationBucket=dest_bucket,
                    destinationObject=dest_prefix + suffix,
                    body={}).execute()
                copied_objs.append(dest_prefix + suffix)

            _wait_for_consistency(
                lambda: all(self._obj_exists(dest_bucket, obj) for obj in copied_objs))
        else:
            self.client.objects().copy(
                sourceBucket=src_bucket,
                sourceObject=src_obj,
                destinationBucket=dest_bucket,
                destinationObject=dest_obj,
                body={}).execute()
            _wait_for_consistency(lambda: self._obj_exists(dest_bucket, dest_obj))

    def rename(self, *args, **kwargs):
        """
        Alias for ``move()``
        """
        self.move(*args, **kwargs)

    def move(self, source_path, destination_path):
        """
        Rename/move an object from one GCS location to another.
        """
        self.copy(source_path, destination_path)
        self.remove(source_path)

    def listdir(self, path):
        """
        Get an iterable with GCS folder contents.

        Iterable contains paths relative to queried path.
        """
        bucket, obj = self._path_to_bucket_and_key(path)

        obj_prefix = self._add_path_delimiter(obj)
        if self._is_root(obj_prefix):
            obj_prefix = ''

        obj_prefix_len = len(obj_prefix)
        for it in self._list_iter(bucket, obj_prefix):
            yield self._add_path_delimiter(path) + it['name'][obj_prefix_len:]

    def list_wildcard(self, wildcard_path):
        """Yields full object URIs matching the given wildcard.

        Currently only the '*' wildcard after the last path delimiter is supported.

        (If we need "full" wildcard functionality we should bring in gsutil dependency with its
        https://github.com/GoogleCloudPlatform/gsutil/blob/master/gslib/wildcard_iterator.py...)
        """
        path, wildcard_obj = wildcard_path.rsplit('/', 1)
        assert '*' not in path, "The '*' wildcard character is only supported after the last '/'"
        wildcard_parts = wildcard_obj.split('*')
        assert len(wildcard_parts) == 2, "Only one '*' wildcard is supported"

        for it in self.listdir(path):
            if it.startswith(path + '/' + wildcard_parts[0]) and it.endswith(wildcard_parts[1]) and \
                    len(it) >= len(path + '/' + wildcard_parts[0]) + len(wildcard_parts[1]):
                yield it

    @gcs_retry
    def download(self, path, chunksize=None, chunk_callback=lambda _: False):
        """Downloads the object contents to local file system.

        Optionally stops after the first chunk for which chunk_callback returns True.
        """
        chunksize = chunksize or self.chunksize
        bucket, obj = self._path_to_bucket_and_key(path)

        with tempfile.NamedTemporaryFile(delete=False) as fp:
            # We can't return the tempfile reference because of a bug in python: http://bugs.python.org/issue18879
            return_fp = _DeleteOnCloseFile(fp.name, 'r')

            # Special case empty files because chunk-based downloading doesn't work.
            result = self.client.objects().get(bucket=bucket, object=obj).execute()
            if int(result['size']) == 0:
                return return_fp

            request = self.client.objects().get_media(bucket=bucket, object=obj)
            downloader = http.MediaIoBaseDownload(fp, request, chunksize=chunksize)

            done = False
            while not done:
                _, done = downloader.next_chunk()
                if chunk_callback(fp):
                    done = True

        return return_fp
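

# Illustrative sketch, not part of the original module: typical ad-hoc use of
# ``GCSClient`` with application-default credentials. The bucket and object
# paths are hypothetical.
def _example_client_usage():
    client = GCSClient()  # application-default credentials by default

    # Upload a local file; the MIME type is guessed from the destination name.
    client.put('/tmp/report.csv', 'gs://my-bucket/reports/report.csv')

    # "Directories" are just object-name prefixes ending in '/'.
    for uri in client.listdir('gs://my-bucket/reports/'):
        print(uri)

    # download() returns a file-like object backed by a temp file that removes
    # itself on close.
    with client.download('gs://my-bucket/reports/report.csv') as fp:
        return fp.read()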


class _DeleteOnCloseFile(io.FileIO):
    def close(self):
        super(_DeleteOnCloseFile, self).close()
        try:
            os.remove(self.name)
        except OSError:
            # Catch a potential threading race condition and also allow this
            # method to be called multiple times.
            pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return True


class AtomicGCSFile(luigi.target.AtomicLocalFile):
    """
    A GCS file that writes to a temp file and is put to GCS on close.
    """

    def __init__(self, path, gcs_client):
        self.gcs_client = gcs_client
        super(AtomicGCSFile, self).__init__(path)

    def move_to_final_destination(self):
        self.gcs_client.put(self.tmp_path, self.path)


class GCSTarget(luigi.target.FileSystemTarget):
    fs = None

    def __init__(self, path, format=None, client=None):
        super(GCSTarget, self).__init__(path)
        if format is None:
            format = luigi.format.get_default_format()

        self.format = format
        self.fs = client or GCSClient()

    def open(self, mode='r'):
        if mode == 'r':
            return self.format.pipe_reader(
                FileWrapper(io.BufferedReader(self.fs.download(self.path))))
        elif mode == 'w':
            return self.format.pipe_writer(AtomicGCSFile(self.path, self.fs))
        else:
            raise ValueError("Unsupported open mode '{}'".format(mode))
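

# Illustrative sketch, not part of the original module: a task writing its
# output through ``GCSTarget``. The task, parameter and bucket path are
# hypothetical; writes go to a local temp file and are uploaded atomically
# when the handle is closed.
class _ExampleReportTask(luigi.Task):
    date = luigi.DateParameter()

    def output(self):
        return GCSTarget('gs://my-bucket/reports/{}.csv'.format(self.date))

    def run(self):
        with self.output().open('w') as out:
            out.write('id,value\n')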


class GCSFlagTarget(GCSTarget):
    """
    Defines a target directory with a flag-file (defaults to `_SUCCESS`) used
    to signify job success.

    This checks for two things:

    * the path exists (just like the GCSTarget)
    * the _SUCCESS file exists within the directory.

    Because Hadoop outputs into a directory and not a single file,
    the path is assumed to be a directory.

    This is meant to be a handy alternative to AtomicGCSFile.

    The AtomicFile approach can be burdensome for GCS since there are no directories, per se.

    If we have 1,000,000 output files, then we have to rename 1,000,000 objects.
    """

    fs = None

    def __init__(self, path, format=None, client=None, flag='_SUCCESS'):
        """
        Initializes a GCSFlagTarget.

        :param path: the directory where the files are stored.
        :type path: str
        :param client: the GCS client to use; defaults to a new ``GCSClient``.
        :type client: GCSClient
        :param flag: the name of the flag file within the directory.
        :type flag: str
        """
        if format is None:
            format = luigi.format.get_default_format()

        if path[-1] != "/":
            raise ValueError("GCSFlagTarget requires the path to be to a "
                             "directory. It must end with a slash ( / ).")
        super(GCSFlagTarget, self).__init__(path, format=format, client=client)
        self.format = format
        self.fs = client or GCSClient()
        self.flag = flag

    def exists(self):
        flag_target = self.path + self.flag
        return self.fs.exists(flag_target)
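

# Illustrative sketch, not part of the original module: depending on a
# Hadoop/Spark-style output directory via ``GCSFlagTarget``. The path is
# hypothetical; the target counts as complete once ``_SUCCESS`` exists
# under the directory.
def _example_flag_target_exists():
    target = GCSFlagTarget('gs://my-bucket/output/2023-01-01/')
    # True only if gs://my-bucket/output/2023-01-01/_SUCCESS exists.
    return target.exists()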