Commit b565bb97 authored by Jan Schlüter's avatar Jan Schlüter

Commit of MIREX 2013 version

==== Submission to the MIREX 2013 Audio Onset Detection task ====
=== Short description ===
This submission applies a trained convolutional neural network to a mel-scaled
spectrogram to detect the positions of likely onsets.
=== Contact Information ===
Jan Schlüter <jan.schlueter@ofai.at>
Sebastian Böck <sebastian.boeck@jku.at>
=== Calling format ===
The main program to run is onset_detector_cnn.py. It requires all other files to
be extracted to the same directory. Input and output files can be located
anywhere and specified with absolute or relative paths. To obtain a parameter
sweep over the binarization threshold, perform the following ten runs:
python onset_detector_cnn.py %input .25 %output
python onset_detector_cnn.py %input .3 %output
python onset_detector_cnn.py %input .35 %output
python onset_detector_cnn.py %input .4 %output
python onset_detector_cnn.py %input .45 %output
python onset_detector_cnn.py %input .5 %output
python onset_detector_cnn.py %input .55 %output
python onset_detector_cnn.py %input .6 %output
python onset_detector_cnn.py %input .65 %output
python onset_detector_cnn.py %input .7 %output
The best run should be the .55 setting.
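For example, with a hypothetical input file music.wav and output file
music.onsets.txt, the recommended run would be:
python onset_detector_cnn.py music.wav .55 music.onsets.txt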
=== Software requirements ===
The program requires Python 2.6 or 2.7, numpy, and scipy (the ffmpeg fallback
additionally uses subprocess.check_output and thus needs Python 2.7). The
software has been tested under Ubuntu 12.04 LTS, but should run on other
Linux distributions, Mac OS X and probably even Windows.
=== Resource requirements ===
To analyze a 1 minute file, the software will use:
- 1 thread (no multiprocessing support available)
- 12 seconds CPU time on a 3.33 GHz i5; 8 seconds CPU time on a 3.4 GHz i7
- 100 MiB memory
- no temporary disk space
=== Special notes ===
The software will try to interpret the input file as a wave file, stereo or
mono, 44.1 kHz sampling rate, 16-bit sample width. If this fails, it will try
to decode and resample the input file with ffmpeg.
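For reference, the ffmpeg fallback is equivalent to a call like
ffmpeg -v quiet -y -i INFILE -f f32le -ac 1 -ar 44100 pipe:1
with the raw float samples read from the pipe.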
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Applies a trained CNN to given input data.
For usage information, call without any parameters.
Author: Jan Schlüter
"""
import sys
import os
import numpy as np
import scipy.signal
import scipy.ndimage
# note: compare the version as a tuple; in Python 2, a list compares to a
# tuple by type name and would always be "less than" it
if tuple(map(int, scipy.__version__.split('.'))) < (0, 12, 0):
    import warnings
    warnings.simplefilter("ignore", np.ComplexWarning)

def print_usage():
    print 'Applies a trained CNN to given input data, using numpy/scipy.'
    print 'Usage: %s INFILE BLOCKLEN FEAT [FEAT ...] CNNFILE OUTFILE' % sys.argv[0]
    print ' INFILE: .npz file of input data'
    print ' BLOCKLEN: number of data points to use per prediction (centered on'
    print ' frame to classify)'
    print ' FEAT: name of feature in the .npz file. If multiple feature names'
    print ' are given, they will be stacked as separate channels.'
    print ' CNNFILE: .h5 or .npz file of convolutional neural network weights'
    print ' OUTFILE: .npy file to write the network output to'

def load_cnn(modelfile):
    if modelfile.endswith('.h5'):
        import h5py
        model = list()
        with h5py.File(modelfile, 'r') as f:
            for l in sorted(f['layers'].keys()):
                params = f['layers'][l]
                model.append((params.attrs['class'],
                        dict((p, params[p].value) for p in params.keys())))
    else:
        cnn = np.load(modelfile)
        model = list()
        for i in xrange(cnn['num_layers']):
            layertype = str(cnn['%d_class' % i])
            layerparams = dict((p.split('_', 1)[1], cnn[p])
                    for p in cnn.files
                    if p.startswith('%d_' % i) and (p != '%d_class' % i))
            model.append((layertype, layerparams))
    return model
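
# Note on the .npz model layout (inferred from the loader above): the file
# holds 'num_layers' and, for each layer i, a '%d_class' % i entry naming the
# layer type, plus parameter arrays named '%d_<param>' % i (e.g. '0_W',
# '0_bias'), which are passed as keyword arguments to the layer function.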

def forward_pass(datapoints, blocklen, model):
    # this is a non-optimal reimplementation of what could be done with
    # theano, to avoid the dependency on theano
    def conv(datapoints, blocklen, W, bias, maxpool=None):
        # convolve each channel separately with each filter
        datapoints = [[scipy.signal.convolve2d(channel, w, mode='valid')
                for w in W[:, c]] for c, channel in enumerate(datapoints)]
        blocklen = blocklen - W.shape[2] + 1
        # sum results of each input channel
        datapoints = [sum(data[f] for data in datapoints)
                for f in xrange(W.shape[0])]
        # max-pool
        if maxpool is not None:
            datapoints = [scipy.ndimage.filters.maximum_filter(
                    channel, tuple(maxpool), mode='constant')[
                            maxpool[0]/2:-maxpool[0]/2 if maxpool[0] > 1 else None,
                            maxpool[1]/2::maxpool[1]]
                    for channel in datapoints]
            blocklen = blocklen / maxpool[0]
        # join channels to single matrix
        datapoints = np.vstack([channel[np.newaxis, ...]
                for channel in datapoints])
        # add bias per output channel
        datapoints += bias[:, np.newaxis, np.newaxis]
        # compute tanh nonlinearity (in-place)
        np.tanh(datapoints, datapoints)
        return datapoints, blocklen

    def full(datapoints, blocklen, W, bias):
        # if blocklen > 1, we could perform another convolution to simulate a
        # fully-connected pass for each flattened block of blocklen frames.
        # without theano, a convolution with 256 output channels is too slow,
        # though, so we iterate over flattened blocks instead.
        if blocklen > 1:
            # XXX: if num_channels was not the first dimension, we could use
            # stride_tricks instead of copying the data...
            datapoints = np.vstack([datapoints[:, idx:idx+blocklen].ravel()
                    for idx in xrange(datapoints.shape[1] - blocklen + 1)])
            datapoints = np.dot(datapoints, W) + bias
            blocklen = 1
        # otherwise, if blocklen == 1, we can do a standard forward pass
        elif blocklen == 1:
            # flatten datapoints in case it's still a 3D tensor
            if datapoints.ndim > 2:
                datapoints = datapoints.reshape(datapoints.shape[1], -1)
            # compute neuron activations
            datapoints = np.dot(datapoints, W) + bias
        else:
            raise ValueError("blocklen must be greater than zero")
        # apply sigmoid nonlinearity
        datapoints = 1. / (1 + np.exp(-datapoints))
        return datapoints, blocklen

    for layertype, layerparams in model:
        if layertype == 'ConvolutionalLayer':
            datapoints, blocklen = conv(datapoints, blocklen, **layerparams)
        else:
            datapoints, blocklen = full(datapoints, blocklen, **layerparams)
    return datapoints

def apply_cnn(indata, blocklen, modelfile):
    # load cnn model
    model = load_cnn(modelfile)
    # stack channels if needed
    if isinstance(indata, (list, tuple)):
        # -> num_channels x num_datapoints x num_feats
        indata = np.vstack([channel[np.newaxis, ...] for channel in indata])
    # zero-pad if needed (actually, we pad by repeating the first or last
    # frame)
    if blocklen > 1:
        zeropad = np.zeros((indata.shape[0], blocklen/2, indata.shape[2]),
                dtype=indata.dtype)
        indata = np.concatenate((zeropad + indata[:, :1], indata,
                zeropad + indata[:, -1:]), axis=1)
    # pass the full data array to the CNN
    outputs = forward_pass(indata, blocklen, model)
    return outputs
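
# Minimal usage sketch (hypothetical file names; assumes the features were
# extracted with extract_spect.py and normalized with meanstd.py):
#   >>> data = np.load('input_normalized.npz')
#   >>> feats = [data['melspect2048'], data['melspect1024'],
#   ...          data['melspect4096']]
#   >>> out = apply_cnn(feats, blocklen=15, modelfile='cnn.npz')
#   # out has one row per input frame, holding the network's onset
#   # activation(s) for that frame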

def main():
    if len(sys.argv) < 6:
        print_usage()
        return
    # 'parse' command line
    infile, blocklen = sys.argv[1:3]
    blocklen = int(blocklen)
    featnames = sys.argv[3:-2]
    cnnfile, outfile = sys.argv[-2:]
    # load input data
    indata = np.load(infile)
    indata = [indata[featname] for featname in featnames]
    # call apply_cnn()
    outdata = apply_cnn(indata, blocklen, cnnfile)
    # write output data
    np.save(outfile, outdata)

if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Reads a .wav or other input file and writes out one or more mel spectrograms.
For usage information, call without any parameters.
Author: Jan Schlüter
"""
import sys
import os
import numpy as np
import wave
import melbank
SAMPLE_RATE = 44100

def print_usage():
    print 'Reads a .wav or other input file and writes out one or more'
    print ' mel spectrograms.'
    print 'Usage: %s INFILE OUTFILE FRAMELEN [FRAMELEN ...]' % sys.argv[0]
    print ' INFILE: .wav or other input file, %g kHz, 16 bit, mono or stereo' % (SAMPLE_RATE / 1e3)
    print ' OUTFILE: .npz output file; features will be called \'melspect<framelen>\''
    print ' FRAMELEN: spectrogram frame length'

def read_wave(infile):
    f = wave.open(infile)
    try:
        num_channels = f.getnchannels()
        num_bits = f.getsampwidth() * 8
        sample_rate = f.getframerate()
        num_samples = f.getnframes()
        samples = f.readframes(num_samples)
    finally:
        f.close()
    if (num_bits != 16) or (sample_rate != SAMPLE_RATE):
        raise ValueError("Unsupported wave file. Needs 16 bits, %d Hz." %
                SAMPLE_RATE)
    samples = np.frombuffer(samples, dtype=np.int16)
    samples = (samples / 2.**15).astype(np.float32)
    if num_channels == 1:
        pass
    elif num_channels == 2:
        samples = (samples[::2] + samples[1::2]) / 2
    else:
        raise ValueError("Unsupported wave file. Needs mono or stereo.")
    return samples

def read_ffmpeg(infile):
    import subprocess
    call = ["ffmpeg", "-v", "quiet", "-y", "-i", infile, "-f", "f32le",
            "-ac", "1", "-ar", str(SAMPLE_RATE), "pipe:1"]
    samples = subprocess.check_output(call)
    samples = np.frombuffer(samples, dtype=np.float32)
    return samples

#def yaafe_hanning(winlen):
#    window = np.empty(winlen)
#    window[0] = 0
#    half = winlen / 2
#    i = np.arange(half)
#    window[1:half+1] = 0.5 * (1.0 - np.cos((i+1) * 2 * np.pi / winlen))
#    window[half+1:] = window[half:1:-1]
#    return window

def filtered_stft(samples, framelen, hopsize, transmat):
    window = np.hanning(framelen)
    #window = yaafe_hanning(framelen)
    zeropad = np.zeros(framelen/2, dtype=samples.dtype)
    samples = np.concatenate((zeropad, samples, zeropad))
    spect = np.vstack([np.dot(np.abs(np.fft.rfft(
            samples[pos:pos+framelen] * window)), transmat)
            for pos in xrange(0, len(samples) - framelen, hopsize)])
    return spect

def logarithmize(spect):
    # float64 machine epsilon, for compatibility with yaafe
    eps = 2.220446049250313e-16
    np.maximum(spect, eps, spect)
    np.log(spect, spect)

def extract_spect(infile, framelens, fps=100):
    # read input samples
    try:
        samples = read_wave(infile)
    except wave.Error:
        samples = read_ffmpeg(infile)
    # apply STFTs and mel bank and logarithmize
    hopsize = SAMPLE_RATE / fps
    result = list()
    for framelen in framelens:
        meltrans = melbank.MelBank(framelen / 2 + 1, SAMPLE_RATE,
                num_filters=80, min_freq=27.5, max_freq=16000,
                dtype=np.double)
        spect = filtered_stft(samples, framelen, hopsize,
                meltrans.as_matrix())
        logarithmize(spect)
        result.append(spect.astype(np.float32))
    return result
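
# Shape sketch (following the code above): at fps=100, a 60 s input yields
# about 6000 frames, so each returned spectrogram is roughly 6000 x 80
# (frames x mel bands), independent of the frame length used.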

def main():
    if len(sys.argv) < 4:
        print_usage()
        return
    # 'parse' command line
    infile, outfile = sys.argv[1:3]
    framelens = map(int, sys.argv[3:])
    # call extract_spect()
    spects = extract_spect(infile, framelens)
    # write output file
    np.savez(outfile, **dict(('melspect%d' % framelen, spect)
            for framelen, spect in zip(framelens, spects)))

if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Performs mean/std normalization of given data.
For usage information, call without any parameters.
Author: Jan Schlüter
"""
import sys
import os
import numpy as np
from itertools import izip

def print_usage():
    print 'Performs mean/std normalization of given data.'
    print 'Usage: %s INFILE FEATNAME COVMEAN [FEATNAME COVMEAN ...] OUTFILE' % sys.argv[0]
    print ' INFILE: .npz file of input data'
    print ' FEATNAME: name of a feature in the .npz file'
    print ' COVMEAN: .npz file of covariance and mean vector for FEATNAME'
    print ' OUTFILE: .npz file to write processed data to'

def meanstd(data, covmean, in_place=True):
    """
    Apply mean/std normalization to the given data.
    @param data: A numpy array of data to transform, with datapoints in rows
    @param covmean: A dict-like with 'mean' mapping to a mean vector and
        'cov' mapping to a covariance matrix (only its diagonal is used)
    @param in_place: If True (the default), input data is modified in place.
    @return: The data, mean-centered and divided by standard deviation.
    """
    m = covmean['mean'].ravel()
    c = covmean['cov'].diagonal()
    s = np.sqrt(c)
    if in_place:
        data -= m
    else:
        data = data - np.array(m, dtype=data.dtype, copy=False)
    data /= s
    return data
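
# Minimal usage sketch (synthetic data; names are hypothetical):
#   >>> data = np.random.rand(100, 80).astype(np.float32)
#   >>> covmean = {'mean': data.mean(axis=0),
#   ...            'cov': np.cov(data, rowvar=0)}
#   >>> normed = meanstd(data, covmean, in_place=False)
#   >>> np.allclose(normed.mean(axis=0), 0, atol=1e-4)
#   True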

def main():
    if (len(sys.argv) < 5) or ((len(sys.argv) - 3) % 2 != 0):
        print_usage()
        return
    # 'parse' command line
    infile = sys.argv[1]
    transforms = sys.argv[2:-1]
    outfile = sys.argv[-1]
    # wrap transformations into a dictionary featname -> covmean
    transforms = dict(izip(transforms[::2],
            (np.load(t) for t in transforms[1::2])))
    # load input data
    indata = np.load(infile)
    # transform input data
    outdata = dict((featname, meanstd(indata[featname], transforms[featname]))
            for featname in transforms)
    # write output data
    np.savez(outfile, **outdata)

if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module for creating and applying a mel filterbank.
Author: Jan Schlüter
"""
import numpy as np

class MelBank(object):
    """
    Encapsulates a mel filterbank. Offers to apply the filterbank to given
    input data, or to return a transformation matrix representing the bank.
    """

    def __init__(self, length, sample_rate, num_filters, min_freq=130.0,
            max_freq=6854.0, norm=True, shape='tri', dtype=np.double):
        """
        Creates a new mel filterbank instance.
        @param length: Length of frames (in samples) the bank is to be
            applied to
        @param sample_rate: Sample rate of input data (used to calculate
            the cutoff frequencies)
        @param num_filters: The number of filters (the number of mel bands)
        @param min_freq: The low cutoff point of the lowest filter
        @param max_freq: The high cutoff point of the highest filter
        @param norm: Whether to normalize each filter to unit area
        @param shape: The filter shape: 'tri' for triangular, 'hann' for hann
        @param dtype: The dtype of the filters
        """
        # Creates a mel filter bank, compatible to what yaafe uses
        self.length = length
        self.num_filters = num_filters
        self.dtype = dtype
        # - minimum, maximum and filter band width in mel, using the scale
        #   mel(f) = 1127 * ln(1 + f / 700)
        min_mel = 1127 * np.log(1 + min_freq / 700.0)
        max_mel = 1127 * np.log(1 + max_freq / 700.0)
        band_width = (max_mel - min_mel) / (num_filters + 1)
        # - peaks of the filters in mel and in Hz
        peaks_mel = min_mel + np.arange(num_filters + 2) * band_width
        peaks_freq = 700 * (np.exp(peaks_mel / 1127) - 1)
        # - frequency at each fft bin
        fft_freqs = np.arange(length) * float(sample_rate) / ((length - 1) * 2)
        # - prepare list of filters
        self._filters = []
        # - create triangular filters
        for b in xrange(1, num_filters + 1):
            # The triangle starts at the previous filter's peak
            # (peaks_freq[b-1]), has its maximum at peaks_freq[b] and ends
            # at peaks_freq[b+1].
            left, top, right = peaks_freq[b-1:b+2]
            # Find out where to put the triangle in frequency space
            l, t, r = np.searchsorted(fft_freqs, [left, top, right])
            # Create the filter (equal to yaafe):
            if shape == 'tri':
                filt = np.empty(r - l, dtype=dtype)
                filt[:t-l] = (fft_freqs[l:t] - left) / (top - left)
                filt[t-l:] = (right - fft_freqs[t:r]) / (right - top)
            elif shape == 'hann':
                filt = np.hanning(r - l).astype(dtype)
            else:
                raise ValueError('Unsupported value for parameter `shape`.')
            if norm:  # Normalize each filter to unit filter energy.
                filt *= 2.0 / (right - left)
            # Append to the list of filters
            self._filters.append((l, filt))

    def as_matrix(self):
        """
        Returns the filterbank as a transformation matrix of shape
        (self.length, self.num_filters). This can be right-multiplied
        to input data to apply the filterbank (inefficiently, however).
        """
        mat = np.zeros((self.length, self.num_filters), dtype=self.dtype)
        for b, (l, filt) in enumerate(self._filters):
            mat[l:l+len(filt), b] = filt
        return mat

    def apply(self, data):
        """
        Applies the filterbank to the given input data. This is meant to be
        more efficient than a dot product with the filter matrix, but it can
        actually be slower (depending on your BLAS implementation).
        @param data: Input data as a 1-dimensional or 2-dimensional matrix.
            For 2-dimensional matrices, input frames are expected in rows.
            Each row must have a length equal to self.length (as specified
            in the filterbank constructor).
        @return: The transformed input data; again in rows, same dtype as input.
        """
        if data.ndim not in (1, 2):
            raise ValueError("Only handles 1- and 2-dimensional data, "
                    "got %d dimensions." % data.ndim)
        if data.shape[-1] != self.length:
            raise ValueError("Expected data.shape[-1] of %d, got %d." %
                    (self.length, data.shape[-1]))
        # For performance reasons, we handle 1 and 2 dimensions separately
        # (this allows us to use np.dot in the 1D case)
        if data.ndim == 1:
            outdata = np.empty(self.num_filters, dtype=data.dtype)
            for b, (l, filt) in enumerate(self._filters):
                outdata[b] = np.dot(data[l:l+len(filt)], filt)
        else:
            outdata = np.empty((data.shape[0], self.num_filters),
                    dtype=data.dtype)
            for b, (l, filt) in enumerate(self._filters):
                outdata[:, b] = (data[:, l:l+len(filt)] * filt).sum(axis=1)
        return outdata
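
# Minimal usage sketch (illustrative values; a 2048-sample frame gives 1025
# rfft bins, matching the `length` parameter):
#   >>> import numpy as np
#   >>> from melbank import MelBank
#   >>> bank = MelBank(1025, 44100, num_filters=80, min_freq=27.5,
#   ...                max_freq=16000)
#   >>> spectrum = np.abs(np.fft.rfft(np.random.randn(2048)))
#   >>> bank.apply(spectrum).shape
#   (80,)
#   >>> np.allclose(bank.apply(spectrum), np.dot(spectrum, bank.as_matrix()))
#   True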
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Performs onset detection with a CNN.
MIREX 2013 submission of Schlüter/Böck.
For usage information, call without any parameters.
Author: Jan Schlüter
"""
import sys
import os
import numpy as np
from itertools import izip
import extract_spect
import meanstd
import apply_cnn
import peakpick

def print_usage():
    print 'Performs onset detection with a CNN. MIREX 2013 submission of'
    print ' Schlüter/Böck.'
    print 'Usage: %s INFILE [THRESHOLD] OUTFILE' % sys.argv[0]
    print ' INFILE: wave file or other input file to process'
    print ' THRESHOLD: detection threshold to use. If omitted, will use'
    print ' hard-coded default value.'
    print ' OUTFILE: text file of detected onset positions'

def onset_detector_cnn(infile, outfile, threshold=0.54):
    # feature extraction
    framelens = [2048, 1024, 4096]
    feats = extract_spect.extract_spect(infile, framelens, fps=100)
    # feature preprocessing
    for feat, framelen in izip(feats, framelens):
        meanstd.meanstd(feat, np.load(os.path.join(os.path.dirname(__file__),
                'cov_mean.%d.npz' % framelen)), in_place=True)
    # pass through CNN
    activations = apply_cnn.apply_cnn(feats, blocklen=15,
            modelfile=os.path.join(os.path.dirname(__file__), 'cnn.npz'))
    # peak-picking
    detections = peakpick.peakpick(activations, threshold)
    # save to text file
    peakpick.onsets_to_txt(outfile, detections, fps=100)
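
# Pipeline sketch (shapes inferred from the modules above): the three
# normalized spectrograms of shape (num_frames, 80) are stacked into a
# (3, num_frames, 80) tensor, classified by the CNN with a 15-frame context
# window, then thresholded and peak-picked into onset positions that
# onsets_to_txt converts to seconds at fps=100.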

def main():
    if len(sys.argv) not in (3, 4):
        print_usage()
        return
    # 'parse' command line
    infile = sys.argv[1]
    outfile = sys.argv[-1]
    od_args = dict()
    if len(sys.argv) > 3:
        od_args['threshold'] = float(sys.argv[2])
    # call onset_detector_cnn() function
    onset_detector_cnn(infile, outfile, **od_args)

if __name__ == "__main__":
    main()
#!/bin/bash
# prepare files
#cp -a /home/jan/data/phd/onsets/pretraining/sb_onset_db.ismir2012/mellogsb1024/blocks1/cov_mean.npz cov_mean.1024.npz
#cp -a /home/jan/data/phd/onsets/pretraining/sb_onset_db.ismir2012/mellogsb/blocks1/cov_mean.npz cov_mean.2048.npz
#cp -a /home/jan/data/phd/onsets/pretraining/sb_onset_db.ismir2012/mellogsb4096/blocks1/cov_mean.npz cov_mean.4096.npz
#./cnn_h5_to_npz.py /home/jan/data/phd/onsets/finetuning/sb_onset_db.mirex2013/mellogsb,1024,4096/blocks15/mean/std/cnn/testE_mom0.9-10-20_eta1-0.995_drop0.5_noval_moreon0.25_ep300/testE_fold0.h5 cnn.npz
# define file list
files="README onset_detector_cnn.py extract_spect.py melbank.py meanstd.py apply_cnn.py peakpick.py cov_mean.*.npz cnn.npz"
# create tar archive
tar -czf onset_detector_cnn.tgz $files
# create zip archive
tmpdir=$(mktemp -d /dev/shm/tmp.XXXXX)
mkdir "$tmpdir"/schlueter_onsets
cp -a $files "$tmpdir"/schlueter_onsets
pushd "$tmpdir" > /dev/null
mv schlueter_onsets/README{,.txt}
zip -r schlueter_onsets.zip schlueter_onsets
popd > /dev/null
mv "$tmpdir/schlueter_onsets.zip" .
rm -r "$tmpdir"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Picks peaks in an onset detection curve.
For usage information, call without any parameters.
Author: Jan Schlüter
"""
import sys
import os
import numpy as np

def print_usage():
    print 'Picks peaks in an onset detection curve.'
    print 'Usage: %s INFILE THRESHOLD [SMOOTH [FPS]] OUTFILE' % sys.argv[0]
    print ' INFILE: .npy file of real-valued predictions between 0 and 1'
    print ' THRESHOLD: binarization threshold'
    print ' SMOOTH: amount of median smoothing over time (default: 5 frames)'
    print ' FPS: number of frames per second (default: 100)'
    print ' OUTFILE: .txt file to write detected onsets to, one per line'