Source code for ms_graph_exporter.celery.graph_api_base

# -*- coding: utf-8 -*-
"""Module implements base class :class:`GraphApiTask` for data extraction Celery tasks.

:class:`GraphApiTask` defines Redis and MS Graph API clients as attributes to be shared
by all tasks within a Celery worker.
"""
from datetime import datetime, timedelta
from functools import partial
from json import dumps as json_dumps
from logging import Logger
from typing import Any, Callable, Dict, List, Optional, Tuple

from celery import Task
from celery.utils.log import get_task_logger

import gevent
from gevent.pool import Group

from redis import BlockingConnectionPool, ConnectionPool, RedisError
from redis.client import Redis

from yaml import YAMLError, safe_load

from ms_graph_exporter.ms_graph.api import MsGraph
from ms_graph_exporter.ms_graph.response import MsGraphResponse


class GraphApiTask(Task):
    """Base class for MS Graph API data extraction Celery tasks.

    Allows sharing instances of the Redis connection pool
    :class:`BlockingConnectionPool <redis:redis.BlockingConnectionPool>`
    and of :class:`~ms_graph_exporter.ms_graph.api.MsGraph` between all
    tasks executed within a single Celery worker.

    Attributes
    ----------
    _config : :obj:`~typing.Dict` [:obj:`str`, :obj:`~typing.Any`]
        Dictionary holding configuration parameters that are initialized
        by the worker on startup.

    _logger : :obj:`~logging.Logger`
        Channel to be used for log output specific to the worker.

    _ms_graph : :obj:`~ms_graph_exporter.ms_graph.api.MsGraph`
        MS Graph API client instance to be used for queries.

    _redis_client : :obj:`Redis <redis:redis.Redis>`
        Redis client instance used to queue response data.

    _redis_conn_pool : :obj:`BlockingConnectionPool <redis:redis.BlockingConnectionPool>`
        Connection pool to be used by Redis clients for response data queuing.

    """

    _config: Dict[str, Any] = {
        "graph_app_config": "instance/app_config.yaml",
        "graph_client_id": "",
        "graph_client_secret": "",
        "graph_greenlets_count": 10,
        "graph_page_size": 50,
        "graph_queue_backend": "redis",
        "graph_queue_type": "list",
        "graph_queue_key": "ms_graph_exporter",
        "graph_redis_url": "redis://localhost:6379?db=0",
        "graph_redis_pool_block": True,
        "graph_redis_pool_gevent_queue": True,
        "graph_redis_pool_max_connections": 15,
        "graph_redis_pool_timeout": 1,
        "graph_streams": 1,
        "graph_stream_frame": 60,
        "graph_tenant": "",
        "graph_timelag": 120,
    }

    _logger: Logger = get_task_logger(__name__)

    _ms_graph: Optional[MsGraph] = None

    _redis_conn_pool: Optional[ConnectionPool] = None

    _redis_client: Optional[Redis] = None

    _redis_methods_map: Dict[str, str] = {"list": "RPUSH", "channel": "PUBLISH"}
    """Map between Redis queue types and data pushing operation."""

    @classmethod
    def config_update(cls, **options) -> None:
        """Update internal configuration.

        Accept an arbitrary set of keyword arguments and update the
        ``_config`` dict with all values whose keys contain ``graph_``.

        Parameters
        ----------
        **options
            Set of arbitrary keyword arguments used to update ``_config``.

        """
        cls._logger.debug("[%s]: Update config from kwargs.", cls.__name__)

        config_update: Dict = {
            k: options[k] for k in options.keys() if "graph_" in k
        }
        cls._config.update(config_update)

        cls._logger.debug("[%s]: Final config: %s", cls.__name__, cls._config)
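
    # Illustrative call (argument names assumed for the example): only keyword
    # arguments whose names contain "graph_" are merged into ``_config``;
    # anything else is silently dropped.
    #
    #     GraphApiTask.config_update(graph_page_size=100, broker_url="amqp://")
    #     # _config["graph_page_size"] -> 100; "broker_url" is ignored.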

    @classmethod
    def config_update_from_file(cls, config_file: str = None) -> None:
        """Update internal configuration from file.

        Read the YAML ``config_file`` and update the ``_config`` dict with
        the values found there; every top-level key is prefixed with
        ``graph_`` before the merge.

        Parameters
        ----------
        config_file
            Path to YAML config file. If not defined, tries to load the file
            referenced by ``_config["graph_app_config"]``.

        """
        if config_file is None:
            config_file = cls._config["graph_app_config"]

        if config_file != "":
            cls._logger.debug(
                "[%s]: Update config from file: %s", cls.__name__, config_file
            )

            cls._config["graph_app_config"] = config_file

            config_yaml: Dict = {}

            with open(config_file, "r") as f:
                try:
                    config_yaml = safe_load(f)
                except YAMLError:
                    cls._logger.exception(
                        "Exception loading config file '%s'", config_file
                    )

            if config_yaml:
                config_update = {
                    "graph_{}".format(k): config_yaml.get(k, None)
                    for k in config_yaml.keys()
                }
                cls._config.update(config_update)

        cls._logger.debug("[%s]: Final config: %s", cls.__name__, cls._config)
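
    # Illustrative mapping (file contents assumed for the example): a YAML file
    #
    #     tenant: "contoso.onmicrosoft.com"
    #     page_size: 100
    #
    # loaded by config_update_from_file() updates ``_config["graph_tenant"]``
    # and ``_config["graph_page_size"]``, since every top-level key is
    # prefixed with "graph_" before the merge.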

    @property
    def graph(self) -> MsGraph:
        """Provide a shared instance of MS Graph API client."""
        if self._ms_graph is None:
            self._ms_graph = MsGraph(
                client_id=self._config["graph_client_id"],
                client_secret=self._config["graph_client_secret"],
                tenant=self._config["graph_tenant"],
            )

        return self._ms_graph

    def _records_to_redis_naive(self, records: List[Any]) -> bool:
        """Push records to Redis.

        Push ``records`` into a Redis list or publish them to a channel.
        Does not apply any optimizations. Yields control to the Gevent Hub
        at each iteration.

        Parameters
        ----------
        records
            List of records to store.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """
        redis_client: Redis = self.redis_client
        queue_type: str = self._config["graph_queue_type"]
        queue_key: str = self._config["graph_queue_key"]

        try:
            redis_action = getattr(
                redis_client, self._redis_methods_map[queue_type].lower()
            )
            for r in records:
                gevent.sleep()
                redis_action(queue_key, json_dumps(r))
        except RedisError as e:
            self._logger.exception("Redis Exception: %s", str(e))  # noqa: G200
            result = False
        else:
            result = True

        return result

    def _records_to_redis_pipe(self, records: List[Any]) -> bool:
        """Push records to Redis with pipelining.

        Push ``records`` into a Redis list or publish them to a channel.
        Utilizes pipelining to minimize connection overhead. Yields control
        to the Gevent Hub at each iteration.

        Parameters
        ----------
        records
            List of records to store.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """
        redis_client: Redis = self.redis_client
        queue_type: str = self._config["graph_queue_type"]
        queue_key: str = self._config["graph_queue_key"]

        try:
            with redis_client.pipeline() as pipe:
                pipe.multi()
                redis_action = getattr(
                    pipe, self._redis_methods_map[queue_type].lower()
                )
                for r in records:
                    gevent.sleep()
                    redis_action(queue_key, json_dumps(r))
                pipe.execute()
        except RedisError as e:
            self._logger.exception("Redis Exception: %s", str(e))  # noqa: G200
            result = False
        else:
            result = True

        return result
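
    # Both push strategies share the same call signature, e.g.
    # ``self._records_to_redis_pipe(records=[{"id": 1}])`` (payload assumed for
    # the example). The "naive" variant issues one Redis round-trip per record,
    # while the "pipe" variant buffers the same commands in a MULTI/EXEC
    # pipeline and flushes them with a single ``execute()`` call.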

    @property
    def redis_client(self) -> Redis:
        """Provide an instance of Redis client."""
        if self._redis_client is None:
            redis_client = Redis(connection_pool=self.redis_conn_pool)
            self._redis_client = redis_client

            self._logger.debug(
                "[%s]: Initialized Redis client: %s", self.__name__, self._redis_client
            )

        return self._redis_client

    @property
    def redis_conn_pool(self) -> ConnectionPool:
        """Provide an instance of Redis connection pool."""
        if self._redis_conn_pool is None:
            if self._config["graph_redis_pool_block"]:
                pool_class: Callable = BlockingConnectionPool
            else:
                pool_class = ConnectionPool

            # ``from_url`` is a classmethod, so the pool class is used directly
            # instead of instantiating a throwaway pool first.
            if self._config["graph_redis_pool_gevent_queue"]:
                redis_conn_pool = pool_class.from_url(
                    self._config["graph_redis_url"],
                    decode_components=True,
                    max_connections=self._config["graph_redis_pool_max_connections"],
                    timeout=self._config["graph_redis_pool_timeout"],
                    queue_class=gevent.queue.LifoQueue,
                )
            else:
                redis_conn_pool = pool_class.from_url(
                    self._config["graph_redis_url"],
                    decode_components=True,
                    max_connections=self._config["graph_redis_pool_max_connections"],
                    timeout=self._config["graph_redis_pool_timeout"],
                )

            self._redis_conn_pool = redis_conn_pool

            self._logger.debug(
                "[%s]: Initialized Redis connection pool: %s",
                self.__name__,
                self._redis_conn_pool,
            )

        return self._redis_conn_pool
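
    # ``graph_redis_url`` follows the redis-py URL scheme, e.g.
    # "redis://:s3cret@redis.example.com:6379?db=2" (host, password and db are
    # placeholders). When ``graph_redis_pool_gevent_queue`` is set, the blocking
    # pool hands out connections through ``gevent.queue.LifoQueue``, so waiting
    # for a free connection yields to the Gevent Hub instead of blocking the
    # whole worker.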

    def redis_rm_queue(self) -> bool:
        """Delete Redis queue storing data records.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """
        redis_client: Redis = self.redis_client
        queue_type: str = self._config["graph_queue_type"]
        queue_key: str = self._config["graph_queue_key"]

        try:
            redis_client.delete(queue_key)
        except RedisError as e:
            result: bool = False
            self._logger.exception(  # noqa: G200
                "Exception deleting Redis key: %s", str(e)
            )
        else:
            result = True
            self._logger.info("Cleared %s key '%s'", queue_type.upper(), queue_key)

        return result

    def task_fetch_slice(
        self, slice_start: datetime = None, slice_end: datetime = None
    ) -> MsGraphResponse:
        """Fetch a time slice of records.

        Request the time-domain data slice ``[slice_start - slice_end]``.

        Parameters
        ----------
        slice_start
            Beginning of the time-domain data slice.

        slice_end
            End of the time-domain data slice.

        Returns
        -------
        :obj:`~ms_graph_exporter.ms_graph.response.MsGraphResponse`
            Response with the sign-in records for the requested slice.

        """
        result = self.graph.get_signins(
            timestamp_start=slice_start,
            timestamp_end=slice_end,
            page_size=self._config["graph_page_size"],
        )

        return result

    def task_get_time_slices(
        self, timestamp: datetime = None
    ) -> List[Tuple[datetime, datetime]]:
        """Calculate time slices from the point in time.

        Based on the available configuration, generates a series of
        consecutive time slices (streams) going back in time from
        ``timestamp``.

        Parameters
        ----------
        timestamp
            Point in time from which to calculate time slices. Microseconds
            are zeroed and ``timestamp`` is rounded up to a second. If not
            defined, uses current time.

        Returns
        -------
        :obj:`~typing.List` [:obj:`~typing.Tuple`]
            ``(slice_start, slice_end)`` tuples, ordered from the most recent
            slice to the oldest.

        Note
        ----
        The following ``_config`` options influence the result:

        * ``timelag`` - seconds to shift back from ``timestamp``, adjusting
          the whole time-frame for the series.

        * ``streams`` - number of time slices to generate.

        * ``stream_frame`` - size of each time slice in seconds.

        So, with ``timestamp=...-07T22:02:53.123``, ``timelag=60``,
        ``streams=3`` and ``stream_frame=10`` the following series is
        generated:

        .. code-block:: text

            1. [...-07T22:01:44 - ...-07T22:01:53]
            2. [...-07T22:01:34 - ...-07T22:01:43]
            3. [...-07T22:01:24 - ...-07T22:01:33]

        """
        total_streams: int = self._config["graph_streams"]

        t_now: datetime = (
            timestamp.replace(microsecond=0)
            if timestamp is not None
            else datetime.utcnow().replace(microsecond=0)
        )

        t_lag: timedelta = timedelta(seconds=self._config["graph_timelag"])
        t_sec: timedelta = timedelta(seconds=1)
        t_delta: timedelta = timedelta(seconds=self._config["graph_stream_frame"])

        frame_end: datetime = t_now - t_lag - t_sec
        frame_start: datetime = frame_end + t_sec - t_delta * total_streams

        self._logger.info(
            "Split [%s - %s] into %s slices",
            frame_start.isoformat(),
            frame_end.isoformat(),
            total_streams,
        )

        result: List[Tuple[datetime, datetime]] = []

        for i in range(total_streams):
            slice_start: datetime = frame_end + t_sec - t_delta * (i + 1)
            slice_end: datetime = frame_end - t_delta * i
            result.append((slice_start, slice_end))

        return result
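
    # Illustrative pairing (orchestration assumed for the example): each slice
    # produced above can be fed to task_fetch_slice(), e.g.
    #
    #     for slice_start, slice_end in self.task_get_time_slices():
    #         response = self.task_fetch_slice(slice_start, slice_end)
    #
    # How these calls are chained into an actual Celery workflow is not
    # defined by this base class.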

    def task_records_to_log(self, records: List[Any]) -> None:
        """Push records to log.

        Output ``records`` in the log.

        Parameters
        ----------
        records
            List of records to log.

        """
        for r in records:
            self._logger.info(
                "[%s]: %s - %s - %s/%s",
                r["createdDateTime"],
                r["userPrincipalName"],
                r["ipAddress"],
                r["location"]["countryOrRegion"],
                r["location"]["city"],
            )

    def task_redis_push_single(
        self, records: List[Any], push_mode: str = "pipe"
    ) -> bool:
        """Push records to Redis in a single batch.

        Note
        ----
        See the note for :meth:`task_redis_push_multi_spawn` for a detailed
        explanation of the parameter ``push_mode``.

        Parameters
        ----------
        records
            List of records to queue.

        push_mode
            Specifies the data transfer strategy to use.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """
        result: bool = False

        push_method = getattr(self, "_records_to_redis_{}".format(push_mode))
        result = push_method(records=records)

        return result

    def task_redis_push_multi_spawn(
        self, records: List[Any], push_mode: str = "pipe"
    ) -> bool:
        """Push records to Redis in parallel.

        Split the ``records`` list into smaller data sets to be pushed to
        Redis with multiple Greenlets.

        Note
        ----
        Parameter ``push_mode`` defines which data transfer strategy to use
        by providing the suffix for the specific ``_records_to_redis_*``
        method to be called. Currently accepts ``naive`` or ``pipe``.

        Parameters
        ----------
        records
            List of records to store in Redis.

        push_mode
            Specifies the data transfer strategy to use.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """  # noqa: D202

        def chunks(l: List, n: int):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i : i + n]  # noqa: E203

        result: bool = False
        batch_size: int = 0

        records_len: int = len(records)
        greenlets_count: int = self._config["graph_greenlets_count"]

        if records_len > greenlets_count:
            # Spread records evenly across Greenlets, rounding the batch
            # size half up.
            batch_size = (((records_len << 1) // greenlets_count) + 1) >> 1
        elif records_len > 0:
            batch_size = records_len

        if records_len:
            greenlets: Group = Group()

            # See https://docs.python.org/3/library/functools.html#functools.partial
            redis_push_func_partial: Callable = partial(
                getattr(self, "_records_to_redis_{}".format(push_mode))
            )

            greenlet_list: List = []

            for batch in chunks(records, batch_size):
                greenlet_list.append(
                    greenlets.spawn(redis_push_func_partial, records=batch)
                )

            greenlets.join(raise_error=True)

            greenlet_results = [g.value for g in greenlet_list]

            self._logger.debug(
                "%s Greenlets spawned: %s", len(greenlet_results), greenlet_results
            )

            result = False if False in greenlet_results else True

            self._logger.debug("Summary result: %s", result)

        return result
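
    # Worked example for the batch sizing above (values assumed): with 105
    # records and ``graph_greenlets_count = 10`` the expression evaluates to
    # ((105 << 1) // 10 + 1) >> 1 == (21 + 1) >> 1 == 11, i.e. 105 / 10 = 10.5
    # rounded half up, so chunks() yields nine 11-record batches plus a final
    # batch of 6.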

    def task_redis_push_multi_imap(
        self, records: List[Any], push_mode: str = "pipe"
    ) -> bool:
        """Push records to Redis in parallel.

        Split the ``records`` list into smaller data sets to be pushed to
        Redis with multiple Greenlets. Parameter ``push_mode`` defines which
        data transfer strategy to use.

        Note
        ----
        See the note for :meth:`task_redis_push_multi_spawn` for a detailed
        explanation of the parameter ``push_mode``.

        Parameters
        ----------
        records
            List of records to store in Redis.

        push_mode
            Specifies the data transfer strategy to use.

        Returns
        -------
        bool
            Flag indicating success or failure of the operation.

        """  # noqa: D202

        def chunks(l: List, n: int):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i : i + n]  # noqa: E203

        result: bool = False
        batch_size: int = 0

        records_len: int = len(records)
        greenlets_count: int = self._config["graph_greenlets_count"]

        if records_len > greenlets_count:
            batch_size = (((records_len << 1) // greenlets_count) + 1) >> 1
        elif records_len > 0:
            batch_size = records_len

        if batch_size:
            greenlets: Group = Group()

            redis_push_func_partial: Callable = partial(
                getattr(self, "_records_to_redis_{}".format(push_mode))
            )

            greenlet_results: List = []

            for g in greenlets.imap_unordered(
                redis_push_func_partial, chunks(records, batch_size)
            ):
                greenlet_results.append(g)

            self._logger.debug(
                "%s Greenlets returned: %s", len(greenlet_results), greenlet_results
            )

            result = False if False in greenlet_results else True

            self._logger.debug("Summary result: %s", result)

        return result