"""This module will contain some vanilla Rewarders for the :class:`~qgym.envs.Routing`
environment.
Usage:
The rewarders in this module can be customized by initializing the rewarders with
different values.
.. code-block:: python
from qgym.envs.routing import BasicRewarder
rewarder = BasicRewarder(
illegal_action_penalty = -1,
update_cycle_penalty = -2,
schedule_gate_bonus: = 3,
)
After initialization, the rewarders can be given to the :class:`~qgym.envs.Routing`
environment.
.. note::
When implementing custom rewarders, they should inherit from
:class:`~qgym.templates.Rewarder`. Furthermore, they must implement the
:func:`~qgym.templates.Rewarder.compute_reward` method. Which takes as input the
old state, the new state and the given action. See the documentation of the
:obj:`~qgym.envs.routing.routing` module for more information on the state and
action space.
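
A minimal sketch of such a custom rewarder is given below. The reward logic is purely
illustrative; only the interface (the ``compute_reward`` signature and the
``_reward_range`` attribute, as used by the rewarders in this module) is taken from
this package.

.. code-block:: python

    from qgym.envs.routing.routing_state import RoutingState
    from qgym.templates import Rewarder


    class SwapCountRewarder(Rewarder):
        # Toy rewarder that only penalizes the number of inserted SWAP gates.

        def __init__(self) -> None:
            self._reward_range = (-float("inf"), 0)

        def compute_reward(
            self, *, old_state: RoutingState, action: int, new_state: RoutingState
        ) -> float:
            # Purely illustrative: one unit of penalty per SWAP gate inserted so far.
            return -float(len(new_state.swap_gates_inserted))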
"""
import warnings
from qgym.envs.routing.routing_state import RoutingState
from qgym.templates import Rewarder
from qgym.utils.input_validation import check_real, warn_if_negative, warn_if_positive


class BasicRewarder(Rewarder):
    """RL rewarder for computing rewards on the
    :class:`~qgym.envs.routing.RoutingState`.
    """

    def __init__(
        self,
        illegal_action_penalty: float = -50,
        penalty_per_swap: float = -10,
        reward_per_surpass: float = 10,
    ) -> None:
        """Set the rewards and penalties.

        Args:
            illegal_action_penalty: Penalty for performing an illegal action. An action
                is illegal when the action means 'surpass' even though the next gate
                cannot be surpassed. This value should be negative (though this is not
                required) and defaults to -50.
            penalty_per_swap: Penalty for placing a swap. In general, we want to insert
                as few swaps as possible. Therefore, this value should be negative and
                defaults to -10.
            reward_per_surpass: Reward given for surpassing a gate. In general, we want
                to get to the end of the circuit as fast as possible. Therefore, this
                value should be positive and defaults to 10.
        """
        self._illegal_action_penalty = check_real(
            illegal_action_penalty, "illegal_action_penalty"
        )
        self._penalty_per_swap = check_real(penalty_per_swap, "penalty_per_swap")
        self._reward_per_surpass = check_real(reward_per_surpass, "reward_per_surpass")
        self._set_reward_range()

        warn_if_positive(self._illegal_action_penalty, "illegal_action_penalty")
        warn_if_positive(self._penalty_per_swap, "penalty_per_swap")
        warn_if_negative(self._reward_per_surpass, "reward_per_surpass")

    def compute_reward(
        self, *, old_state: RoutingState, action: int, new_state: RoutingState
    ) -> float:
        """Compute a reward, based on the old state, new state, and the given action.

        Args:
            old_state: :class:`~qgym.envs.routing.RoutingState` before the current
                action.
            action: Action that has just been taken.
            new_state: :class:`~qgym.envs.routing.RoutingState` after the current
                action.

        Returns:
            The reward for this action.
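
            The reward is computed from the full state rather than as an increment. As
            a purely illustrative worked example with the default values: if the agent
            has already surpassed 3 gates, has inserted 1 SWAP and now takes a legal
            surpass action, the reward is ``3 * 10 + 1 * (-10) + 10 = 30``.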
"""
if self._is_illegal(action, old_state):
return self._illegal_action_penalty
reward = old_state.position * self._reward_per_surpass
reward += len(old_state.swap_gates_inserted) * self._penalty_per_swap
if action == old_state.n_connections:
reward += self._reward_per_surpass
else:
reward += self._penalty_per_swap
return reward

    def _is_illegal(self, action: int, old_state: RoutingState) -> bool:
        """Checks whether an action chosen by the agent is illegal.

        Returns:
            Boolean value stating whether the action was illegal or not.
        """
        if action != old_state.n_connections:
            return False

        qubit1, qubit2 = old_state.interaction_circuit[old_state.position]
        return not old_state.is_legal_surpass(qubit1, qubit2)

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if (
            self._illegal_action_penalty >= 0
            and self._penalty_per_swap >= 0
            and self._reward_per_surpass >= 0
        ):
            l_bound = 0

        u_bound = float("inf")
        if (
            self._illegal_action_penalty <= 0
            and self._penalty_per_swap <= 0
            and self._reward_per_surpass <= 0
        ):
            u_bound = 0

        self._reward_range = (l_bound, u_bound)


class SwapQualityRewarder(BasicRewarder):
    """Rewarder for the :class:`~qgym.envs.Routing` environment which takes swap
    qualities into account.

    The :class:`SwapQualityRewarder` has an adjusted reward w.r.t. the
    :class:`BasicRewarder` in the sense that good SWAPs give lower penalties and bad
    SWAPs give higher penalties.
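
    A minimal usage sketch is given below. The parameter values are illustrative, and
    the import path is assumed to mirror the ``BasicRewarder`` example in the module
    docstring.

    .. code-block:: python

        from qgym.envs.routing import SwapQualityRewarder

        rewarder = SwapQualityRewarder(
            penalty_per_swap=-10,
            good_swap_reward=5,
        )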
"""

    def __init__(
        self,
        illegal_action_penalty: float = -50,
        penalty_per_swap: float = -10,
        reward_per_surpass: float = 10,
        good_swap_reward: float = 5,
    ) -> None:
        """Set the rewards and penalties.

        Args:
            illegal_action_penalty: Penalty for performing an illegal action. An action
                is illegal when the action means 'surpass' even though the next gate
                cannot be surpassed. This value should be negative (though this is not
                required) and defaults to -50.
            penalty_per_swap: Penalty for placing a swap. In general, we want to insert
                as few swaps as possible. Therefore, this value should be negative and
                defaults to -10.
            reward_per_surpass: Reward given for surpassing a gate. In general, we want
                to get to the end of the circuit as fast as possible. Therefore, this
                value should be positive and defaults to 10.
            good_swap_reward: Reward given for placing a good swap. In general, we want
                to place as few swaps as possible. However, when a swap is good, the
                penalty for placing it should be softened by this reward. The value
                should therefore be positive and smaller than the absolute value of
                `penalty_per_swap`, so that swaps never yield a positive reward.
                Defaults to 5.
        """
        super().__init__(
            illegal_action_penalty=illegal_action_penalty,
            penalty_per_swap=penalty_per_swap,
            reward_per_surpass=reward_per_surpass,
        )
        self._good_swap_reward = check_real(good_swap_reward, "reward_per_good_swap")

        if not 0 <= self._good_swap_reward < -self._penalty_per_swap:
            warnings.warn("Good swaps should not result in positive rewards.")

        warn_if_negative(self._good_swap_reward, "reward_per_good_swap")

    def compute_reward(
        self,
        *,
        old_state: RoutingState,
        action: int,
        new_state: RoutingState,
    ) -> float:
        """Compute a reward, based on the old state, the given action and the new state.

        Specifically, the change in observation reach is used.

        Args:
            old_state: :class:`~qgym.envs.routing.RoutingState` before the current
                action.
            action: Action that has just been taken.
            new_state: :class:`~qgym.envs.routing.RoutingState` after the current
                action.

        Returns:
            The reward for this action. If the action is illegal, then the reward is
            `illegal_action_penalty`. If the action is a legal surpass, then the reward
            is just `reward_per_surpass`. For a legal swap, the reward is adjusted with
            respect to the :class:`BasicRewarder`: the penalty of a swap is reduced if
            it increases the observation reach and increased if the observation reach
            decreases.
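
            As a purely illustrative worked example with the default values: a swap
            that raises the number of executable gates ahead by 2, with a
            ``max_observation_reach`` of 5, yields a reward of
            ``-10 + 5 * (2 / 5) = -8`` instead of the plain -10 swap penalty.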
"""
if self._is_illegal(action, old_state):
return self._illegal_action_penalty
if action == old_state.n_connections:
return self._reward_per_surpass
return (
self._penalty_per_swap
+ self._good_swap_reward
* self._observation_enhancement_factor(old_state, new_state)
)

    def _observation_enhancement_factor(
        self,
        old_state: RoutingState,
        new_state: RoutingState,
    ) -> float:
        """Calculates the change of the observation reach as an effect of a swap.

        Args:
            old_state: ``RoutingState`` before the current action.
            new_state: ``RoutingState`` after the current action.

        Returns:
            A fraction that expresses the relative improvement w.r.t. the `old_state`'s
            observation.
        """
        try:
            is_legal_surpass = old_state.obtain_observation()["is_legal_surpass"]
            old_executable_gates_ahead = int(is_legal_surpass.sum())

            is_legal_surpass = new_state.obtain_observation()["is_legal_surpass"]
            new_executable_gates_ahead = int(is_legal_surpass.sum())
        except KeyError as error:
            if not old_state.observe_legal_surpasses:
                msg = "observe_legal_surpasses needs to be True to compute the "
                msg += "observation_enhancement_factor"
                raise ValueError(msg) from error
            raise error

        return (
            new_executable_gates_ahead - old_executable_gates_ahead
        ) / old_state.max_observation_reach

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if (
            self._illegal_action_penalty >= 0
            and self._penalty_per_swap >= 0
            and self._reward_per_surpass >= 0
            and self._good_swap_reward >= 0
        ):
            l_bound = 0

        u_bound = float("inf")
        if (
            self._illegal_action_penalty <= 0
            and self._penalty_per_swap <= 0
            and self._reward_per_surpass <= 0
            and self._good_swap_reward <= 0
        ):
            u_bound = 0

        self._reward_range = (l_bound, u_bound)


class EpisodeRewarder(BasicRewarder):
    """Rewarder for the ``Routing`` environment which only gives a reward at the end
    of a full episode. The reward is highest for the lowest number of SWAPs.

    This could be improved by also taking into account the fidelity of the edges on
    which the circuit is executed.
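
    A minimal usage sketch is given below. The parameter value is illustrative, and the
    import path is assumed to mirror the ``BasicRewarder`` example in the module
    docstring.

    .. code-block:: python

        from qgym.envs.routing import EpisodeRewarder

        # Only the number of inserted SWAP gates matters at the end of the episode.
        rewarder = EpisodeRewarder(penalty_per_swap=-1)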
"""

    def compute_reward(
        self,
        *,
        old_state: RoutingState,
        action: int,
        new_state: RoutingState,
    ) -> float:
        """Compute a reward, based on the new state and the given action.

        Args:
            old_state: ``RoutingState`` before the current action.
            action: Action that has just been taken.
            new_state: ``RoutingState`` after the current action.

        Returns:
            If the action is illegal, the `illegal_action_penalty`. If the episode is
            finished, the reward calculated over the episode, otherwise 0.
        """
        if self._is_illegal(action, old_state):
            return self._illegal_action_penalty

        if not new_state.is_done():
            return 0

        return len(new_state.swap_gates_inserted) * self._penalty_per_swap

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if self._illegal_action_penalty >= 0 and self._penalty_per_swap >= 0:
            l_bound = 0

        u_bound = float("inf")
        if self._illegal_action_penalty <= 0 and self._penalty_per_swap <= 0:
            u_bound = 0

        self._reward_range = (l_bound, u_bound)