Source code for irlc.ex13.dqn_network
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
class DQNNetwork:
"""
A class representing a deep Q network.
Note that this function is batched. I.e. ``s`` is assumed to be a numpy array of dimension ``batch_size x n``
The following example shows how you can evaluate the Q-values in a given state. An example:
.. runblock:: pycon
>>> from irlc.ex13.torch_networks import TorchNetwork
>>> import gymnasium as gym
>>> import numpy as np
>>> env = gym.make("CartPole-v1")
>>> Q = TorchNetwork(env, trainable=True, learning_rate=0.001) # DQN network requires an env to set network dimensions
>>> batch_size = 32 # As an example
>>> states = np.random.rand(batch_size, env.observation_space.shape[0]) # Creates some dummy input
>>> states.shape # batch_size x n
>>> qvals = Q(states) # Evaluate Q(s,a)
>>> qvals.shape # This is a tensor of dimension batch_size x actions
>>> print(qvals[0,1]) # Get Q(s_0, 1)
>>> Y = np.random.rand(batch_size, env.action_space.n) # Generate target Q-values (training data)
>>> Q.fit(states, Y) # Train the Q-network for 1 gradient descent step
"""
    def update_Phi(self, source, tau=0.01):
        r"""
        Update (adapt) the weights in this network towards those in the ``source`` network by a small amount.

        For each weight :math:`w_i` in (this) network, and each corresponding weight :math:`w'_i` in the ``source`` network,
        the following Polyak update is performed:

        .. math::
            w_i \leftarrow w_i + \tau (w'_i - w_i)

        :param source: The network to update towards (its weights are :math:`w'_i`)
        :param tau: Update rate (rate of change :math:`\tau`)
        :return: ``None``
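
        As an illustration of the update rule, here is a sketch of the arithmetic using plain ``numpy`` arrays
        (stand-ins for the network weights, not the actual parameters of this class):

        .. runblock:: pycon

            >>> import numpy as np
            >>> w = np.array([1.0, 2.0])          # weights w_i in this network
            >>> w_source = np.array([3.0, 0.0])   # weights w'_i in the source network
            >>> tau = 0.01
            >>> w + tau * (w_source - w)          # each weight moves a fraction tau towards the source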
"""
raise NotImplementedError
    def __call__(self, s):
        """
        Evaluate the Q-values in the given (batched) state.

        :param s: A matrix of size ``batch_size x n`` where :math:`n` is the state dimension.
        :return: The Q-values as a ``batch_size x d`` dimensional matrix where :math:`d` is the number of actions.
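
        A typical use of the returned Q-values is greedy action selection. A sketch (``qvals`` below is a random
        stand-in for the ``batch_size x d`` matrix this function would return):

        .. runblock:: pycon

            >>> import numpy as np
            >>> qvals = np.random.rand(32, 2)              # stand-in for Q(states) on an environment with 2 actions
            >>> greedy_actions = np.argmax(qvals, axis=1)  # greedy action in each of the 32 states
            >>> greedy_actions.shape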
"""
raise NotImplementedError
    def fit(self, s, target):
        r"""
        Fit the network weights by minimizing

        .. math::
            \frac{1}{B}\sum_{i=1}^B \sum_{a=1}^K \| q_\phi(s_i)_a - y_{i,a} \|^2

        where ``target`` corresponds to :math:`y` and is a ``batch_size x actions`` matrix of target Q-values.

        :param s: A ``batch_size x n`` matrix of states :math:`s_i`
        :param target: A ``batch_size x actions`` matrix of target Q-values :math:`y`
        :return:
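
        As a numerical sketch of the quantity being minimized (plain ``numpy``, with ``qvals`` standing in for the
        network output :math:`q_\phi(s_i)` and ``Y`` for the targets :math:`y`):

        .. runblock:: pycon

            >>> import numpy as np
            >>> B, K = 32, 2                               # batch size and number of actions
            >>> qvals = np.random.rand(B, K)               # stand-in for the network output
            >>> Y = np.random.rand(B, K)                   # target Q-values y
            >>> np.mean(np.sum((qvals - Y) ** 2, axis=1))  # (1/B) * sum_i sum_a (q(s_i)_a - y_ia)^2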
"""
raise NotImplementedError