Source code for pyveg.src.pyveg_pipeline

"""
Definitions:
============

A PIPELINE is the whole analysis procedure for one set of coordinates.
It will likely consist of a couple of SEQUENCES - e.g.
one for vegetation data and one for weather data.

A SEQUENCE is composed of one or more MODULES, each of which does a specific
task, e.g. download data, process images, calculate quantities from images.

A special type of MODULE may be placed at the end of a PIPELINE to combine
the results of the different SEQUENCES into one output file.
"""

import os
import json
import time

import logging
from logging.handlers import RotatingFileHandler

logger = logging.getLogger("pyveg_logger")
formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
logger.setLevel(logging.INFO)

c_handler = logging.StreamHandler()
c_handler.setFormatter(formatter)
f_handler = RotatingFileHandler(
    "pyveg_{}.log".format(time.strftime("%Y-%m-%d_%H-%M-%S")),
    maxBytes=5 * 1024 * 1024, backupCount=10
)
f_handler.setFormatter(formatter)

logger.addHandler(f_handler)
logger.addHandler(c_handler)


from shutil import copyfile

from pyveg.src.file_utils import save_json

try:
    from pyveg.src import azure_utils
    from pyveg.src import batch_utils
except ImportError:
    print("Azure utils could not be imported - is the Azure SDK installed?")


class Pipeline(object):
    """
    A Pipeline contains all the Sequences we want to run on a particular
    set of coordinates and a date range, e.g. there might be one Sequence
    for vegetation data and one for weather data.
    """

    def __init__(self, name):
        self.name = name
        self.sequences = []
        self.coords = None
        self.date_range = None
        self.output_location = None
        self.output_location_type = None
        self.is_configured = False

    def __iadd__(self, sequence):
        """
        Overload the '+=' operator, so we can add Sequences
        directly to the pipeline.
        """
        sequence.parent = self
        self.__setattr__(sequence.name, sequence)
        self.sequences.append(sequence)
        return self

    def __repr__(self):
        """
        Overload the builtin operator for printing the pipeline.
        """
        output = "\n[Pipeline]: {} \n".format(self.name)
        output += "=======================\n"
        output += "coordinates: {}\n".format(self.coords)
        output += "date_range: {}\n".format(self.date_range)
        output += "output_location: {}\n".format(self.output_location)
        output += "output_location_type: {}\n".format(self.output_location_type)
        output += "\n ------- Sequences ----------\n\n"
        for s in self.sequences:
            output += s.__repr__()
        output += "=======================\n"
        return output

    def get(self, seq_name):
        """
        Return a Sequence object when asked for by name.
        """
        for sequence in self.sequences:
            if sequence.name == seq_name:
                return sequence

    def configure(self):
        """
        Configure all the Sequences in this Pipeline.
        """
        for var in ["coords", "date_range", "output_location", "output_location_type"]:
            if (var not in vars(self)) or (not self.__getattribute__(var)):
                raise RuntimeError(
                    "{}: need to set {} before calling configure()".format(
                        self.name, var
                    )
                )
        if self.output_location_type == "azure":
            # ensure that the output location conforms to Azure
            # container-name requirements
            container_name = azure_utils.sanitize_container_name(self.output_location)
            self.output_location = container_name
        for sequence in self.sequences:
            if "coords" not in vars(sequence):
                sequence.coords = self.coords
            if "date_range" not in vars(sequence):
                sequence.date_range = self.date_range
            sequence.configure()
        self.is_configured = True

    def run(self):
        """
        Run all the Sequences in this Pipeline.
        """
        for sequence in self.sequences:
            sequence.run()
        self.print_run_status()
        self.cleanup()

    def print_run_status(self):
        for sequence in self.sequences:
            sequence.print_run_status()

    def cleanup(self):
        """
        Call cleanup() for all our Sequences.
        """
        for sequence in self.sequences:
            sequence.cleanup()
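
# Because Pipeline.__iadd__ registers each added Sequence as an attribute,
# a Sequence can be retrieved either by name or as an attribute (a sketch;
# "vegetation" is an illustrative Sequence name):
#
#   seq = pipeline.get("vegetation")
#   seq is pipeline.vegetation   # True - both refer to the same object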


class Sequence(object):
    """
    A Sequence is a collection of Modules where the output of one Module
    is typically the input to the next one.
    It will typically correspond to a particular data collection, e.g. for
    vegetation imagery we might have one Module to download the images,
    one to process them, and one to analyze the processed images.
    """

    def __init__(self, name):
        self.name = name
        self.modules = []
        self.depends_on = []
        self.parent = None
        self.output_location = None
        self.output_location_type = None
        self.is_configured = False
        self.is_finished = False
        self.run_status = {}

    def __iadd__(self, module):
        """
        Overload the '+=' operator so we can add Modules
        directly to the Sequence.
        """
        module.parent = self
        self.modules.append(module)
        # if the module doesn't already have a name, or has only the default
        # one, name it <sequence_name>_<class_name> here
        if (not module.name) or (module.name == module.__class__.__name__):
            module.name = "{}_{}".format(self.name, module.__class__.__name__)
        # add the module name as an attribute of the sequence
        self.__setattr__(module.name, module)
        return self

    def join_path(self, *path_elements):
        """
        If output_location_type is 'local', we just use os.path.join, which
        puts in a "/" separator for posix or a "\\" for windows. However,
        if output_location_type is 'azure', we always want "/".

        Parameters
        ==========
        path_elements: list of strings. Directory-like path elements.

        Returns
        =======
        path: str, the path elements joined by "/" or "\\".
        """
        if self.output_location_type == "azure":
            path = "/".join(path_elements)
        else:
            path = os.path.join(*path_elements)
        return path
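
    # For example, join_path("mycontainer", "subdir", "results.json") gives
    # "mycontainer/subdir/results.json" when output_location_type is "azure",
    # while local output uses the platform's own separator via os.path.join.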

    def set_output_location(self):
        if self.parent:
            self.output_location_type = self.parent.output_location_type
            self.output_location = self.join_path(
                self.parent.output_location,
                f"gee_{self.coords[0]}_{self.coords[1]}"
                + "_"
                + self.name.replace("/", "-"),
            )
        else:
            self.output_location = (
                f"gee_{self.coords[0]}_{self.coords[1]}"
                + "_"
                + self.name.replace("/", "-")
            )
            self.output_location_type = "local"

    def set_config(self, config_dict):
        for k, v in config_dict.items():
            logger.info("{}: setting {} to {}".format(self.name, k, v))
            self.__setattr__(k, v)

    def configure(self):
        if (not self.coords) or (not self.date_range):
            raise RuntimeError(
                "{}: need to set coords and date_range before calling configure()".format(
                    self.name
                )
            )
        if not self.output_location:
            self.set_output_location()
        # set the input location for each module to be the output of the
        # previous one
        for i, module in enumerate(self.modules):
            module.output_location = self.output_location
            module.output_location_type = self.output_location_type
            if i > 0:
                module.input_location = self.modules[i - 1].output_location
                module.input_location_type = self.modules[i - 1].output_location_type
                # modules depend on the previous module in the sequence
                module.depends_on.append(self.modules[i - 1].name)
            module.coords = self.coords
            module.date_range = self.date_range
            module.configure()
        self.is_configured = True

    def run(self):
        """
        Before we run the Modules in this Sequence, check whether there are
        any other Sequences on which we depend, and if so, wait for them to
        finish.
        """
        if len(self.depends_on) > 0:
            logger.info(
                "{}: checking whether all dependency Sequences have finished".format(
                    self.name
                )
            )
            dependencies_finished = False
            while not dependencies_finished:
                num_seq_finished = 0
                for seq_name in self.depends_on:
                    seq = self.parent.get(seq_name)
                    logger.info(
                        "{}: checking status of {}".format(self.name, seq.name)
                    )
                    if seq.check_if_finished():
                        logger.info("  {} ... finished".format(seq.name))
                        num_seq_finished += 1
                dependencies_finished = num_seq_finished == len(self.depends_on)
                logger.info(
                    "{}: {} / {} dependencies finished".format(
                        self.name, num_seq_finished, len(self.depends_on)
                    )
                )
                if not dependencies_finished:
                    # wait before polling the dependencies again
                    time.sleep(10)
        self.create_batch_job_if_needed()
        for module in self.modules:
            self.run_status[module.name] = module.run()

    def __repr__(self):
        if not self.is_configured:
            return "Sequence not configured\n"
        output = "\n [Sequence]: {} \n".format(self.name)
        output += " =======================\n"
        for k, v in vars(self).items():
            # exclude the things we don't want to print
            if (
                k == "name"
                or k == "modules"
                or k == "parent"
                or isinstance(v, BaseModule)
            ):
                continue
            output += " {}: {}\n".format(k, v)
        output += "\n ------- Modules ----------\n\n"
        for m in self.modules:
            output += m.__repr__()
        output += " =======================\n\n"
        return output

    def print_run_status(self):
        """
        For all Modules in the Sequence, print out how many jobs succeeded
        or failed.
        """
        logger.info("\nSequence {}".format(self.name))
        logger.info("-----------------\n")
        for module in self.modules:
            module.print_run_status()
        logger.info("\n")

    def get(self, mod_name):
        """
        Return a Module object when asked for by name or by class name.
        """
        for module in self.modules:
            if module.name == mod_name or module.__class__.__name__ == mod_name:
                return module

    def has_batch_job(self):
        """
        Do any of the Modules in this Sequence have run_mode == 'batch'?
        """
        for module in self.modules:
            if "run_mode" in vars(module) and module.run_mode == "batch":
                return True
        return False

    def create_batch_job_if_needed(self):
        """
        If any Modules in this Sequence are to be run in batch mode,
        create a batch job for them.
        """
        if self.has_batch_job():
            self.batch_job_id = self.name + "_" + time.strftime("%Y-%m-%d_%H-%M-%S")
            batch_utils.create_job(self.batch_job_id)
            logger.info(
                "Sequence {}: creating batch job {}".format(
                    self.name, self.batch_job_id
                )
            )

    def check_if_finished(self):
        """
        Only relevant when one or more Modules are running in batch mode.
        Sequences that depend on this Sequence will call this function
        while they wait for all of its Modules to finish.
        """
        num_modules_finished = 0
        for module in self.modules:
            logger.info("{}: checking status of {}".format(self.name, module.name))
            if module.check_if_finished():
                logger.info("{} ... finished".format(module.name))
                num_modules_finished += 1
        logger.info(
            "{} / {} modules finished".format(num_modules_finished, len(self.modules))
        )
        self.is_finished = num_modules_finished == len(self.modules)
        return self.is_finished

    def cleanup(self):
        """
        If we have batch resources (job/pool), remove them to avoid charges.
        """
        if self.has_batch_job() and "batch_job_id" in vars(self):
            batch_utils.delete_job(self.batch_job_id)
            batch_utils.delete_pool()
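
# Sequences can be ordered relative to one another via their depends_on
# lists, which Sequence.run() polls before starting its own Modules.
# A sketch (the Sequence names are illustrative):
#
#   combine_seq = Sequence("combine")
#   combine_seq.depends_on = ["vegetation", "weather"]
#
# combine_seq.run() will then call check_if_finished() on both named
# Sequences (looked up via the parent Pipeline) every 10 seconds until
# all of their Modules report that they are done.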


class BaseModule(object):
    """
    A Module is a building block of a Sequence: it takes some input, does
    something (e.g. downloads data from GEE, processes some images, ...)
    and produces some output.
    The working directory for all Modules within a Sequence is given by
    the Sequence - Modules may write output to subdirectories of this
    (e.g. for different dates), but what we call "output_location" is the
    base directory common to all Modules, and contains info about the
    image collection name and the coordinates.
    """

    def __init__(self, name=None):
        if name:
            self.name = name
        else:
            self.name = self.__class__.__name__
        self.params = []
        self.parent = None
        self.depends_on = []
        self.is_configured = False
        self.is_finished = False
        self.run_status = {"succeeded": 0, "failed": 0, "incomplete": 0}

    def set_parameters(self, config_dict):
        for k, v in config_dict.items():
            logger.info("{}: setting {} to {}".format(self.name, k, v))
            self.__setattr__(k, v)

    def configure(self, config_dict=None):
        """
        Order of preference for configuration:
        1) config_dict
        2) values held by the parent Sequence
        3) default values
        So we set them in reverse order here, so that higher priorities
        override lower ones.
        """
        self.set_default_parameters()
        if self.parent:
            for param, _ in self.params:
                if param in vars(self.parent):
                    self.__setattr__(param, self.parent.__getattribute__(param))
        if config_dict:
            self.set_parameters(config_dict)
        self.check_config()
        self.is_configured = True

    def check_config(self):
        """
        Loop through the list of parameters, each of which is a tuple
        (name, [allowed_types]), and check that the parameter exists
        and is of the correct type.
        """
        for param in self.params:
            if param[0] not in vars(self):
                raise RuntimeError(
                    "{}: {} needs to be set.".format(self.name, param[0])
                )
            val = self.__getattribute__(param[0])
            type_ok = False
            for param_type in param[1]:
                if isinstance(val, param_type):
                    type_ok = True
                    break
            if not type_ok:
                raise TypeError(
                    "{}: {} should be {}, got {}:{}".format(
                        self.name, param[0], param[1], val, type(val)
                    )
                )
        return True
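
    # The params list that check_config() validates is a list of
    # (name, [allowed_types]) tuples, typically populated by a subclass's
    # set_default_parameters(). A hypothetical example:
    #
    #   self.params = [
    #       ("collection_name", [str]),
    #       ("num_files_per_point", [int]),
    #       ("replace_existing_files", [bool]),
    #   ]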

    def set_default_parameters(self):
        pass

    def prepare_for_run(self):
        if not self.is_configured:
            raise RuntimeError(
                "Module {} needs to be configured before running".format(self.name)
            )
        if self.output_location_type == "azure":
            # if we're running this module standalone on Azure, we might need
            # to create the output container on the blob storage account
            output_location_base = self.output_location.split("/")[0]
            container_name = azure_utils.sanitize_container_name(output_location_base)
            if not azure_utils.check_container_exists(container_name):
                logger.info("Creating container {}".format(container_name))
                azure_utils.create_container(container_name)
        elif self.output_location_type == "local" and not os.path.exists(
            self.output_location
        ):
            os.makedirs(self.output_location, exist_ok=True)

    def check_if_finished(self):
        return self.is_finished

    def __repr__(self):
        if not self.is_configured:
            return "\n Module not configured"
        output = " [Module]: {} \n".format(self.name)
        output += " =======================\n"
        for k, v in vars(self).items():
            # exclude the things we don't want to print
            if k == "name" or k == "parent" or k == "params":
                continue
            output += " {}: {}\n".format(k, v)
        output += " =======================\n\n"
        return output

    def join_path(self, *path_elements):
        """
        If output_location_type is 'local', we just use os.path.join, which
        puts in a "/" separator for posix or a "\\" for windows. However,
        if output_location_type is 'azure', we always want "/".

        Parameters
        ==========
        path_elements: list of strings. Directory-like path elements.

        Returns
        =======
        path: str, the path elements joined by "/" or "\\".
        """
        if self.output_location_type == "azure":
            path = "/".join(path_elements)
        else:
            path = os.path.join(*path_elements)
        return path

    def copy_to_output_location(self, tmpdir, output_location, file_endings=[]):
        """
        Copy the contents of a temporary directory to a specified output
        location.

        Parameters
        ==========
        tmpdir: str, location of the temporary directory
        output_location: str, either the path to a local directory (if
                  self.output_location_type is "local") or an Azure
                  <container>/<blob_path> (if self.output_location_type
                  is "azure")
        file_endings: list of str, optional. If given, only files with
                  these endings will be copied.
        """
        if self.output_location_type == "local":
            os.makedirs(output_location, exist_ok=True)
            for root, dirs, files in os.walk(tmpdir):
                for filename in files:
                    if file_endings:
                        for ending in file_endings:
                            if filename.endswith(ending):
                                copyfile(
                                    os.path.join(root, filename),
                                    os.path.join(output_location, filename),
                                )
                    else:
                        # copy the file directly - copyfile is portable,
                        # unlike shelling out to "cp"
                        copyfile(
                            os.path.join(root, filename),
                            os.path.join(output_location, filename),
                        )
        elif self.output_location_type == "azure":
            # first part of self.output_location should be the container name
            container_name = self.output_location.split("/")[0]
            azure_utils.write_files_to_blob(
                tmpdir, container_name, output_location, file_endings
            )
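
    # Example call (a sketch): copy only the .json and .png files produced
    # in a temporary working directory up to this Module's output location:
    #
    #   self.copy_to_output_location(
    #       tmpdir, self.output_location, file_endings=[".json", ".png"]
    #   )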

    def list_directory(self, directory_path, location_type):
        """
        List the contents of a directory, either on the local filesystem
        or on Azure blob storage.
        """
        if location_type == "local":
            if not os.path.isdir(directory_path):
                return []
            return os.listdir(directory_path)
        elif location_type == "azure":
            # first part of self.output_location should be the container name
            container_name = self.output_location.split("/")[0]
            return azure_utils.list_directory(directory_path, container_name)
        else:
            raise RuntimeError("Unknown location_type - must be 'local' or 'azure'")

    def save_json(self, data, filename, location, location_type):
        """
        Save json to the local filesystem or to blob storage, depending on
        location_type.
        """
        if location_type == "local":
            save_json(data, location, filename)
        elif location_type == "azure":
            # first part of self.output_location should be the container name
            container_name = self.output_location.split("/")[0]
            azure_utils.save_json(data, location, filename, container_name)
        else:
            raise RuntimeError("Unknown location_type - must be 'local' or 'azure'")

    def get_json(self, filepath, location_type):
        """
        Read a json file from either local or blob storage.
        """
        if location_type == "local":
            if not os.path.exists(filepath):
                return None
            # use a context manager so the file handle is closed promptly
            with open(filepath) as json_file:
                return json.load(json_file)
        elif location_type == "azure":
            # first part of filepath should be the container name
            container_name = filepath.split("/")[0]
            return azure_utils.read_json(filepath, container_name)
        else:
            raise RuntimeError("Unknown location_type - must be 'local' or 'azure'")

    def get_file(self, filename, location_type):
        """
        Just return the filename if location_type is "local". Otherwise
        return a tempfile with the contents of a blob if the location is
        "azure".
        """
        if location_type == "local":
            return filename
        elif location_type == "azure":
            # first part of self.output_location should be the container name
            container_name = self.output_location.split("/")[0]
            return azure_utils.get_blob_to_tempfile(filename, container_name)
        else:
            raise RuntimeError("Unknown location_type - must be 'local' or 'azure'")

    def check_for_existing_files(self, location, num_files_expected):
        """
        See if there are already num_files_expected files in the specified
        location. If "replace_existing_files" is set to True, always
        return False.
        """
        if self.output_location_type == "local":
            os.makedirs(location, exist_ok=True)
        # if we haven't specified the number of expected files per point,
        # it will be -1
        if num_files_expected < 0:
            return False
        if self.replace_existing_files:
            return False
        existing_files = self.list_directory(location, self.output_location_type)
        if len(existing_files) == num_files_expected:
            logger.info(
                "{}: already found {} files in {} - skipping".format(
                    self.name, num_files_expected, location
                )
            )
            return True
        return False

    def get_config(self):
        """
        Get the configuration of this Module as a dict.
        """
        config_dict = {}
        for param, _ in self.params:
            config_dict[param] = self.__getattribute__(param)
        config_dict["class_name"] = self.__class__.__name__
        return config_dict

    def save_config(self, config_location):
        """
        Write out the configuration of this Module as a json file.
        """
        config_dict = self.get_config()
        output_config_dir = os.path.dirname(config_location)
        if output_config_dir and not os.path.exists(output_config_dir):
            os.makedirs(output_config_dir)
        with open(config_location, "w") as output_json:
            json.dump(config_dict, output_json)
        logger.info("{}: wrote config to {}".format(self.name, config_location))

    def print_run_status(self):
        """
        Print out how many jobs succeeded or failed.
        """
        logger.info(
            "{}: Succeeded: {} Failed: {} Incomplete: {}".format(
                self.name,
                self.run_status["succeeded"],
                self.run_status["failed"],
                self.run_status["incomplete"],
            )
        )
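
# A minimal sketch of a concrete Module subclass. Everything here
# ("GreetingModule" and its "greeting" parameter) is hypothetical; it just
# illustrates the configure()/run() contract that BaseModule expects:
#
#   class GreetingModule(BaseModule):
#       def set_default_parameters(self):
#           self.params += [("greeting", [str])]
#           if "greeting" not in vars(self):
#               self.greeting = "hello"
#
#       def run(self):
#           self.prepare_for_run()
#           logger.info("{}: {}".format(self.name, self.greeting))
#           self.run_status["succeeded"] += 1
#           self.is_finished = True
#           return self.run_status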