Source code for qgym.envs.routing.routing_rewarders

"""This module will contain some vanilla Rewarders for the :class:`~qgym.envs.Routing`
environment.

Usage:
    The rewarders in this module can be customized by initializing the rewarders with
    different values.

    .. code-block:: python

        from qgym.envs.routing import BasicRewarder

        rewarder = BasicRewarder(
            illegal_action_penalty=-1,
            penalty_per_swap=-2,
            reward_per_surpass=3,
        )

    After initialization, the rewarders can be given to the :class:`~qgym.envs.Routing`
    environment.
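
    For example, a minimal sketch (the exact :class:`~qgym.envs.Routing`
    constructor arguments, such as ``connection_graph``, are assumptions here and
    may differ per qgym version):

    .. code-block:: python

        import networkx as nx

        from qgym.envs import Routing

        env = Routing(connection_graph=nx.cycle_graph(4), rewarder=rewarder)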

.. note::
    Custom rewarders should inherit from :class:`~qgym.templates.Rewarder`.
    Furthermore, they must implement the
    :func:`~qgym.templates.Rewarder.compute_reward` method, which takes as input the
    old state, the new state and the given action. See the documentation of the
    :obj:`~qgym.envs.routing.routing` module for more information on the state and
    action space.
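
    A minimal sketch of such a custom rewarder (``ConstantRewarder`` and its fixed
    reward are illustrative, not part of qgym):

    .. code-block:: python

        from qgym.envs.routing.routing_state import RoutingState
        from qgym.templates import Rewarder

        class ConstantRewarder(Rewarder):
            # Toy rewarder that returns the same reward for every action.

            def __init__(self, reward: float = 1.0) -> None:
                self._reward = reward
                self._reward_range = (reward, reward)

            def compute_reward(
                self, *, old_state: RoutingState, action: int, new_state: RoutingState
            ) -> float:
                return self._reward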

"""

import warnings

from qgym.envs.routing.routing_state import RoutingState
from qgym.templates import Rewarder
from qgym.utils.input_validation import check_real, warn_if_negative, warn_if_positive


class BasicRewarder(Rewarder):
    """RL Rewarder, for computing rewards on the
    :class:`~qgym.envs.routing.RoutingState`.
    """

    def __init__(
        self,
        illegal_action_penalty: float = -50,
        penalty_per_swap: float = -10,
        reward_per_surpass: float = 10,
    ) -> None:
        """Set the rewards and penalties.

        Args:
            illegal_action_penalty: Penalty for performing an illegal action. An
                action is illegal when the action means 'surpass' even though the
                next gate cannot be surpassed. This value should be negative (but is
                not required to be) and defaults to -50.
            penalty_per_swap: Penalty for placing a swap. In general, we want to use
                as few swaps as possible. Therefore, this value should be negative
                and defaults to -10.
            reward_per_surpass: Reward given for surpassing a gate. In general, we
                want to get to the end of the circuit as fast as possible.
                Therefore, this value should be positive and defaults to 10.
        """
        self._illegal_action_penalty = check_real(
            illegal_action_penalty, "illegal_action_penalty"
        )
        self._penalty_per_swap = check_real(penalty_per_swap, "penalty_per_swap")
        self._reward_per_surpass = check_real(reward_per_surpass, "reward_per_surpass")
        self._set_reward_range()

        warn_if_positive(self._illegal_action_penalty, "illegal_action_penalty")
        warn_if_positive(self._penalty_per_swap, "penalty_per_swap")
        warn_if_negative(self._reward_per_surpass, "reward_per_surpass")

    def compute_reward(
        self, *, old_state: RoutingState, action: int, new_state: RoutingState
    ) -> float:
        """Compute a reward, based on the old state, new state, and the given action.

        Args:
            old_state: :class:`~qgym.envs.routing.RoutingState` before the current
                action.
            action: Action that has just been taken.
            new_state: :class:`~qgym.envs.routing.RoutingState` after the current
                action.

        Returns:
            The reward for this action.
        """
        if self._is_illegal(action, old_state):
            return self._illegal_action_penalty

        reward = old_state.position * self._reward_per_surpass
        reward += len(old_state.swap_gates_inserted) * self._penalty_per_swap
        if action == old_state.n_connections:
            reward += self._reward_per_surpass
        else:
            reward += self._penalty_per_swap

        return reward

    def _is_illegal(self, action: int, old_state: RoutingState) -> bool:
        """Checks whether an action chosen by the agent is illegal.

        Returns:
            Boolean value stating whether the action was illegal or not.
        """
        if action != old_state.n_connections:
            return False
        qubit1, qubit2 = old_state.interaction_circuit[old_state.position]
        return not old_state.is_legal_surpass(qubit1, qubit2)

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if (
            self._illegal_action_penalty >= 0
            and self._penalty_per_swap >= 0
            and self._reward_per_surpass >= 0
        ):
            l_bound = 0

        u_bound = float("inf")
        if (
            self._illegal_action_penalty <= 0
            and self._penalty_per_swap <= 0
            and self._reward_per_surpass <= 0
        ):
            u_bound = 0

        self._reward_range = (l_bound, u_bound)
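

# Worked example (illustrative, not part of qgym): with the defaults above,
# suppose the agent has surpassed 2 gates (old_state.position == 2) and has
# inserted 1 swap so far. A legal surpass action is then rewarded with
#     2 * 10 + 1 * -10 + 10 = 20,
# whereas inserting another swap would yield 2 * 10 + 1 * -10 + -10 = 0.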


class SwapQualityRewarder(BasicRewarder):
    """Rewarder for the :class:`~qgym.envs.Routing` environment which takes swap
    qualities into account.

    The :class:`SwapQualityRewarder` has an adjusted reward w.r.t. the
    :class:`BasicRewarder` in the sense that good SWAPs give lower penalties and bad
    SWAPs give higher penalties.
    """

    def __init__(
        self,
        illegal_action_penalty: float = -50,
        penalty_per_swap: float = -10,
        reward_per_surpass: float = 10,
        good_swap_reward: float = 5,
    ) -> None:
        """Set the rewards and penalties.

        Args:
            illegal_action_penalty: Penalty for performing an illegal action. An
                action is illegal when the action means 'surpass' even though the
                next gate cannot be surpassed. This value should be negative (but is
                not required to be) and defaults to -50.
            penalty_per_swap: Penalty for placing a swap. In general, we want to use
                as few swaps as possible. Therefore, this value should be negative
                and defaults to -10.
            reward_per_surpass: Reward given for surpassing a gate. In general, we
                want to get to the end of the circuit as fast as possible.
                Therefore, this value should be positive and defaults to 10.
            good_swap_reward: Reward given for placing a good swap. In general, we
                want to place as few swaps as possible. However, when a swap is good,
                the penalty for its placement should be dampened; that is what this
                reward does. Hence, the value should be positive and smaller than the
                absolute value of penalty_per_swap, so that even good swaps do not
                yield positive rewards. Defaults to 5.
        """
        # Set the good swap reward before calling super().__init__, since the base
        # initializer calls self._set_reward_range(), which reads this attribute.
        self._good_swap_reward = check_real(good_swap_reward, "good_swap_reward")
        super().__init__(
            illegal_action_penalty=illegal_action_penalty,
            penalty_per_swap=penalty_per_swap,
            reward_per_surpass=reward_per_surpass,
        )

        if not 0 <= self._good_swap_reward < -self._penalty_per_swap:
            warnings.warn("Good swaps should not result in positive rewards.")

        warn_if_negative(self._good_swap_reward, "good_swap_reward")

    def compute_reward(
        self,
        *,
        old_state: RoutingState,
        action: int,
        new_state: RoutingState,
    ) -> float:
        """Compute a reward, based on the old state, the given action and the new
        state. Specifically, the change in observation reach is used.

        Args:
            old_state: :class:`~qgym.envs.routing.RoutingState` before the current
                action.
            action: Action that has just been taken.
            new_state: :class:`~qgym.envs.routing.RoutingState` after the current
                action.

        Returns:
            The reward for this action. If the action is illegal, then the reward is
            `illegal_action_penalty`. If the action is a legal surpass, then the
            reward is just `reward_per_surpass`. For a legal swap, the reward is
            adjusted w.r.t. the :class:`BasicRewarder`: the penalty of a swap is
            reduced if it increases the observation reach and increased if it
            decreases the observation reach.
        """
        if self._is_illegal(action, old_state):
            return self._illegal_action_penalty

        if action == old_state.n_connections:
            return self._reward_per_surpass

        return (
            self._penalty_per_swap
            + self._good_swap_reward
            * self._observation_enhancement_factor(old_state, new_state)
        )

    def _observation_enhancement_factor(
        self,
        old_state: RoutingState,
        new_state: RoutingState,
    ) -> float:
        """Calculates the change of the observation reach as an effect of a swap.

        Args:
            old_state: ``RoutingState`` before the current action.
            new_state: ``RoutingState`` after the current action.

        Returns:
            A fraction that expresses the percentage improvement w.r.t. the
            `old_state`'s observation.
        """
        try:
            is_legal_surpass = old_state.obtain_observation()["is_legal_surpass"]
            old_executable_gates_ahead = int(is_legal_surpass.sum())

            is_legal_surpass = new_state.obtain_observation()["is_legal_surpass"]
            new_executable_gates_ahead = int(is_legal_surpass.sum())
        except KeyError as error:
            if not old_state.observe_legal_surpasses:
                msg = (
                    "observe_legal_surpasses needs to be True to compute the "
                    "observation_enhancement_factor"
                )
                raise ValueError(msg) from error
            raise error

        return (
            new_executable_gates_ahead - old_executable_gates_ahead
        ) / old_state.max_observation_reach

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if (
            self._illegal_action_penalty >= 0
            and self._penalty_per_swap >= 0
            and self._reward_per_surpass >= 0
            and self._good_swap_reward >= 0
        ):
            l_bound = 0

        u_bound = float("inf")
        if (
            self._illegal_action_penalty <= 0
            and self._penalty_per_swap <= 0
            and self._reward_per_surpass <= 0
            and self._good_swap_reward <= 0
        ):
            u_bound = 0

        self._reward_range = (l_bound, u_bound)
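

# Worked example (illustrative, not part of qgym): with the defaults above, if a
# swap raises the number of executable gates ahead from 2 to 3 and
# max_observation_reach is 5, the enhancement factor is (3 - 2) / 5 = 0.2, so the
# swap's reward is
#     -10 + 5 * 0.2 = -9.0,
# i.e. a smaller penalty than the BasicRewarder's flat -10.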


class EpisodeRewarder(BasicRewarder):
    """Rewarder for the ``Routing`` environment which only gives a reward at the end
    of a full episode. The reward is highest for the lowest number of SWAPs.

    This could be improved by also taking into account the fidelity of the edges on
    which the circuit is executed.
    """

    def compute_reward(
        self,
        *,
        old_state: RoutingState,
        action: int,
        new_state: RoutingState,
    ) -> float:
        """Compute a reward, based on the new state and the given action.

        Args:
            old_state: ``RoutingState`` before the current action.
            action: Action that has just been taken.
            new_state: ``RoutingState`` after the current action.

        Returns:
            If the action is illegal, the `illegal_action_penalty`. If the episode
            is finished, the reward calculated over the episode; otherwise 0.
        """
        if self._is_illegal(action, old_state):
            return self._illegal_action_penalty

        if not new_state.is_done():
            return 0

        return len(new_state.swap_gates_inserted) * self._penalty_per_swap

    def _set_reward_range(self) -> None:
        """Set the reward range."""
        l_bound = -float("inf")
        if self._illegal_action_penalty >= 0 and self._penalty_per_swap >= 0:
            l_bound = 0

        u_bound = float("inf")
        if self._illegal_action_penalty <= 0 and self._penalty_per_swap <= 0:
            u_bound = 0

        self._reward_range = (l_bound, u_bound)
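

# Worked example (illustrative, not part of qgym): with the defaults above, an
# episode that ends with 3 inserted swaps receives a final reward of
#     3 * -10 = -30,
# while every earlier legal action receives 0.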