Info
- Title: MoCoGAN: Decomposing Motion and Content for Video Generation
- Task: Video Generation
- Author: Sergey Tulyakov, Ming-Yu Liu, Xiaodong Yang, Jan Kautz
- Date: July 2017
- Arxiv: 1707.04993
- Published: CVPR 2018
Abstract
Visual signals in a video can be divided into content and motion. While content specifies which objects are in the video, motion describes their dynamics. Based on this prior, we propose the Motion and Content decomposed Generative Adversarial Network (MoCoGAN) framework for video generation. The proposed framework generates a video by mapping a sequence of random vectors to a sequence of video frames. Each random vector consists of a content part and a motion part. While the content part is kept fixed, the motion part is realized as a stochastic process. To learn motion and content decomposition in an unsupervised manner, we introduce a novel adversarial learning scheme utilizing both image and video discriminators. Extensive experimental results on several challenging datasets with qualitative and quantitative comparison to the state-of-the-art approaches, verify effectiveness of the proposed framework. In addition, we show that MoCoGAN allows one to generate videos with same content but different motion as well as videos with different content and same motion.
Highlights & Drawbacks
- Propose a novel GAN framework for unconditional video generation, mapping noise vectors to videos.
- Show the proposed framework provides a means to control content and motion in video generation, which is absent in the existing video generation frameworks.
Motivation & Design
The MoCoGAN framework for video generation. For a video, the content vector, $z_C$, is sampled once and fixed. Then, a series of random variables $[ε(1), …, ε(K)]$ is sampled and mapped to a series of motion $M$ z(k)’s are from the recurrent neural network, $R_M$. A generator $G_I$ produces a frame,$ x ̃$ , using the content and the motion vectors ${zC, z(k)}$. The discriminators, DIM and DV, are trained on real and fake images and videos, respectively, sampled from the training set v and the generated set $v ̃$. The function S1 samples a single frame from a video, $S_T$ samples $T$ consequtive frames.
Performance & Ablation Study
Code
Video Generator
class VideoGenerator(nn.Module):
def __init__(self, n_channels, dim_z_content, dim_z_category, dim_z_motion,
video_length, ngf=64):
super(VideoGenerator, self).__init__()
self.n_channels = n_channels
self.dim_z_content = dim_z_content
self.dim_z_category = dim_z_category
self.dim_z_motion = dim_z_motion
self.video_length = video_length
dim_z = dim_z_motion + dim_z_category + dim_z_content
self.recurrent = nn.GRUCell(dim_z_motion, dim_z_motion)
self.main = nn.Sequential(
nn.ConvTranspose2d(dim_z, ngf * 8, 4, 1, 0, bias=False),
nn.BatchNorm2d(ngf * 8),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 4),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 2),
nn.ReLU(True),
nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf),
nn.ReLU(True),
nn.ConvTranspose2d(ngf, self.n_channels, 4, 2, 1, bias=False),
nn.Tanh()
)
def sample_z_m(self, num_samples, video_len=None):
video_len = video_len if video_len is not None else self.video_length
h_t = [self.get_gru_initial_state(num_samples)]
for frame_num in range(video_len):
e_t = self.get_iteration_noise(num_samples)
h_t.append(self.recurrent(e_t, h_t[-1]))
z_m_t = [h_k.view(-1, 1, self.dim_z_motion) for h_k in h_t]
z_m = torch.cat(z_m_t[1:], dim=1).view(-1, self.dim_z_motion)
return z_m
def sample_z_categ(self, num_samples, video_len):
video_len = video_len if video_len is not None else self.video_length
if self.dim_z_category <= 0:
return None, np.zeros(num_samples)
classes_to_generate = np.random.randint(self.dim_z_category, size=num_samples)
one_hot = np.zeros((num_samples, self.dim_z_category), dtype=np.float32)
one_hot[np.arange(num_samples), classes_to_generate] = 1
one_hot_video = np.repeat(one_hot, video_len, axis=0)
one_hot_video = torch.from_numpy(one_hot_video)
if torch.cuda.is_available():
one_hot_video = one_hot_video.cuda()
return Variable(one_hot_video), classes_to_generate
def sample_z_content(self, num_samples, video_len=None):
video_len = video_len if video_len is not None else self.video_length
content = np.random.normal(0, 1, (num_samples, self.dim_z_content)).astype(np.float32)
content = np.repeat(content, video_len, axis=0)
content = torch.from_numpy(content)
if torch.cuda.is_available():
content = content.cuda()
return Variable(content)
def sample_z_video(self, num_samples, video_len=None):
z_content = self.sample_z_content(num_samples, video_len)
z_category, z_category_labels = self.sample_z_categ(num_samples, video_len)
z_motion = self.sample_z_m(num_samples, video_len)
if z_category is not None:
z = torch.cat([z_content, z_category, z_motion], dim=1)
else:
z = torch.cat([z_content, z_motion], dim=1)
return z, z_category_labels
def sample_videos(self, num_samples, video_len=None):
video_len = video_len if video_len is not None else self.video_length
z, z_category_labels = self.sample_z_video(num_samples, video_len)
h = self.main(z.view(z.size(0), z.size(1), 1, 1))
h = h.view(h.size(0) / video_len, video_len, self.n_channels, h.size(3), h.size(3))
z_category_labels = torch.from_numpy(z_category_labels)
if torch.cuda.is_available():
z_category_labels = z_category_labels.cuda()
h = h.permute(0, 2, 1, 3, 4)
return h, Variable(z_category_labels, requires_grad=False)
def sample_images(self, num_samples):
z, z_category_labels = self.sample_z_video(num_samples * self.video_length * 2)
j = np.sort(np.random.choice(z.size(0), num_samples, replace=False)).astype(np.int64)
z = z[j, ::]
z = z.view(z.size(0), z.size(1), 1, 1)
h = self.main(z)
return h, None
Train Generator
def sample_fake_image_batch(batch_size):
return generator.sample_images(batch_size)
def sample_fake_video_batch(batch_size):
return generator.sample_videos(batch_size)
def train_generator(self,
image_discriminator, video_discriminator,
sample_fake_images, sample_fake_videos,
opt):
opt.zero_grad()
# train on images
fake_batch, generated_categories = sample_fake_images(self.image_batch_size)
fake_labels, fake_categorical = image_discriminator(fake_batch)
all_ones = self.ones_like(fake_labels)
l_generator = self.gan_criterion(fake_labels, all_ones)
# train on videos
fake_batch, generated_categories = sample_fake_videos(self.video_batch_size)
fake_labels, fake_categorical = video_discriminator(fake_batch)
all_ones = self.ones_like(fake_labels)
l_generator += self.gan_criterion(fake_labels, all_ones)
if self.use_infogan:
# Ask the generator to generate categories recognizable by the discriminator
l_generator += self.category_criterion(fake_categorical.squeeze(), generated_categories)
l_generator.backward()
opt.step()
return l_generator
Video Discriminator
class PatchVideoDiscriminator(nn.Module):
def __init__(self, n_channels, n_output_neurons=1, bn_use_gamma=True, use_noise=False, noise_sigma=None, ndf=64):
super(PatchVideoDiscriminator, self).__init__()
self.n_channels = n_channels
self.n_output_neurons = n_output_neurons
self.use_noise = use_noise
self.bn_use_gamma = bn_use_gamma
self.main = nn.Sequential(
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(n_channels, ndf, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.LeakyReLU(0.2, inplace=True),
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(ndf, ndf * 2, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.BatchNorm3d(ndf * 2),
nn.LeakyReLU(0.2, inplace=True),
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(ndf * 2, ndf * 4, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.BatchNorm3d(ndf * 4),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv3d(ndf * 4, 1, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
)
def forward(self, input):
h = self.main(input).squeeze()
return h, None
class VideoDiscriminator(nn.Module):
def __init__(self, n_channels, n_output_neurons=1, bn_use_gamma=True, use_noise=False, noise_sigma=None, ndf=64):
super(VideoDiscriminator, self).__init__()
self.n_channels = n_channels
self.n_output_neurons = n_output_neurons
self.use_noise = use_noise
self.bn_use_gamma = bn_use_gamma
self.main = nn.Sequential(
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(n_channels, ndf, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.LeakyReLU(0.2, inplace=True),
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(ndf, ndf * 2, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.BatchNorm3d(ndf * 2),
nn.LeakyReLU(0.2, inplace=True),
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(ndf * 2, ndf * 4, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.BatchNorm3d(ndf * 4),
nn.LeakyReLU(0.2, inplace=True),
Noise(use_noise, sigma=noise_sigma),
nn.Conv3d(ndf * 4, ndf * 8, 4, stride=(1, 2, 2), padding=(0, 1, 1), bias=False),
nn.BatchNorm3d(ndf * 8),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv3d(ndf * 8, n_output_neurons, 4, 1, 0, bias=False),
)
def forward(self, input):
h = self.main(input).squeeze()
return h, None
class CategoricalVideoDiscriminator(VideoDiscriminator):
def __init__(self, n_channels, dim_categorical, n_output_neurons=1, use_noise=False, noise_sigma=None):
super(CategoricalVideoDiscriminator, self).__init__(n_channels=n_channels,
n_output_neurons=n_output_neurons + dim_categorical,
use_noise=use_noise,
noise_sigma=noise_sigma)
self.dim_categorical = dim_categorical
def split(self, input):
return input[:, :input.size(1) - self.dim_categorical], input[:, input.size(1) - self.dim_categorical:]
def forward(self, input):
h, _ = super(CategoricalVideoDiscriminator, self).forward(input)
labels, categ = self.split(h)
return labels, categ
Train Discriminator
def train_discriminator(self, discriminator, sample_true, sample_fake, opt, batch_size, use_categories):
opt.zero_grad()
real_batch = sample_true()
batch = Variable(real_batch['images'], requires_grad=False)
# util.show_batch(batch.data)
fake_batch, generated_categories = sample_fake(batch_size)
real_labels, real_categorical = discriminator(batch)
fake_labels, fake_categorical = discriminator(fake_batch.detach())
ones = self.ones_like(real_labels)
zeros = self.zeros_like(fake_labels)
l_discriminator = self.gan_criterion(real_labels, ones) + \
self.gan_criterion(fake_labels, zeros)
if use_categories:
# Ask the video discriminator to learn categories from training videos
categories_gt = Variable(torch.squeeze(real_batch['categories'].long()), requires_grad=False)
l_discriminator += self.category_criterion(real_categorical.squeeze(), categories_gt)
l_discriminator.backward()
opt.step()
return l_discriminator