import math
from typing import Tuple

import numpy as np
import torch
from torch import nn


class RunningMeanStd(object):
    def __init__(self, epsilon: float = 1e-4, shape: Tuple[int, ...] = ()):
        """
        Calculates the running mean and std of a data stream using the
        parallel algorithm of Chan et al.:
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

        :param epsilon: small initial count that avoids division by zero
            before the first update
        :param shape: the shape of the data stream's output
        """
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon

    def update(self, arr: np.ndarray) -> None:
        """Fold a batch of samples (stacked along axis 0) into the running moments."""
        batch_mean = np.mean(arr, axis=0)
        batch_var = np.var(arr, axis=0)
        batch_count = arr.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean: np.ndarray, batch_var: np.ndarray,
                            batch_count: int) -> None:
        """Merge externally computed batch moments into the running moments."""
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / tot_count
        # Combine the two sums of squared deviations (the M2 terms), plus a
        # correction for the shift between the two means.
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        new_var = m_2 / tot_count

        self.mean = new_mean
        self.var = new_var
        self.count = tot_count


class Normalizer(RunningMeanStd):
    def __init__(self, input_dim, epsilon=1e-4, clip_obs=10.0):
        super().__init__(shape=input_dim)
        self.epsilon = epsilon
        self.clip_obs = clip_obs

    def normalize(self, input):
        """Standardize a NumPy observation and clip it to [-clip_obs, clip_obs]."""
        return np.clip(
            (input - self.mean) / np.sqrt(self.var + self.epsilon),
            -self.clip_obs, self.clip_obs)

    def normalize_torch(self, input, device):
        """Same as `normalize`, but for torch tensors on the given device."""
        mean_torch = torch.tensor(
            self.mean, device=device, dtype=torch.float32)
        std_torch = torch.sqrt(torch.tensor(
            self.var + self.epsilon, device=device, dtype=torch.float32))
        return torch.clamp(
            (input - mean_torch) / std_torch, -self.clip_obs, self.clip_obs)

    def update_normalizer(self, rollouts, expert_loader):
        """Update the running moments from both policy rollouts and expert data."""
        policy_data_generator = rollouts.feed_forward_generator_amp(
            None, mini_batch_size=expert_loader.batch_size)
        expert_data_generator = expert_loader.dataset.feed_forward_generator_amp(
            expert_loader.batch_size)

        for expert_batch, policy_batch in zip(expert_data_generator,
                                              policy_data_generator):
            self.update(
                torch.vstack(tuple(policy_batch) + tuple(expert_batch)).cpu().numpy())


def build_mlp(input_dim, output_dim, hidden_units=(64, 64),
              hidden_activation=nn.Tanh(), output_activation=None):
    """Build an MLP that alternates Linear layers with the hidden activation."""
    layers = []
    units = input_dim
    for next_units in hidden_units:
        layers.append(nn.Linear(units, next_units))
        layers.append(hidden_activation)
        units = next_units
    layers.append(nn.Linear(units, output_dim))
    if output_activation is not None:
        layers.append(output_activation)
    return nn.Sequential(*layers)


def calculate_log_pi(log_stds, noises, actions):
    """
    Log-probability of tanh-squashed Gaussian actions.

    The first term is the diagonal-Gaussian log-density of the pre-squash
    sample; the second is the log-determinant of the tanh Jacobian,
    sum(log(1 - tanh(u)^2)), with 1e-6 added for numerical stability.
    """
    gaussian_log_probs = (-0.5 * noises.pow(2) - log_stds).sum(
        dim=-1, keepdim=True) - 0.5 * math.log(2 * math.pi) * log_stds.size(-1)

    return gaussian_log_probs - torch.log(
        1 - actions.pow(2) + 1e-6).sum(dim=-1, keepdim=True)


def reparameterize(means, log_stds):
    """Sample a squashed action with the reparameterization trick."""
    noises = torch.randn_like(means)
    us = means + noises * log_stds.exp()
    actions = torch.tanh(us)
    return actions, calculate_log_pi(log_stds, noises, actions)


def atanh(x):
    """Numerically stabilized inverse tanh."""
    return 0.5 * (torch.log(1 + x + 1e-6) - torch.log(1 - x + 1e-6))


def evaluate_log_pi(means, log_stds, actions):
    """Log-probability of given squashed actions under the current policy."""
    noises = (atanh(actions) - means) / (log_stds.exp() + 1e-8)
    return calculate_log_pi(log_stds, noises, actions)
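

# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustrative addition, not part of the module's
# original API): exercises RunningMeanStd/Normalizer on synthetic data and
# round-trips the squashed-Gaussian helpers. All shapes, seeds, and constants
# below are assumptions chosen for the demo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    # Stream two batches through the running statistics; mean/var should
    # approach the true moments of the generating distribution (~2.0, ~0.25).
    rms = RunningMeanStd(shape=(3,))
    for _ in range(2):
        rms.update(rng.normal(loc=2.0, scale=0.5, size=(1024, 3)))
    print("running mean:", rms.mean)
    print("running var: ", rms.var)

    # The Normalizer standardizes and clips; the torch path should agree with
    # the NumPy path up to float32 precision.
    norm = Normalizer(input_dim=(3,), clip_obs=5.0)
    norm.update(rng.normal(size=(4096, 3)))
    obs = rng.normal(size=(3,))
    print("normalized (numpy):", norm.normalize(obs))
    print("normalized (torch):",
          norm.normalize_torch(torch.tensor(obs, dtype=torch.float32), "cpu"))

    # build_mlp stacks Linear + Tanh pairs, e.g. a 3 -> 64 -> 64 -> 2 network.
    print(build_mlp(input_dim=3, output_dim=2))

    # Squashed-Gaussian round trip: the log-probs returned by reparameterize
    # should match evaluate_log_pi recomputed from the sampled actions, up to
    # the small error introduced by the stabilized atanh.
    means = torch.zeros(4, 2)
    log_stds = torch.full((4, 2), -1.0)
    actions, log_pis = reparameterize(means, log_stds)
    log_pis_again = evaluate_log_pi(means, log_stds, actions)
    print("max log-prob discrepancy:",
          (log_pis - log_pis_again).abs().max().item())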