这些服务的一个关键特性是播放列表,通常按流派分组。这些数据可能来自出版歌曲的人手工标注。但这并不是一个很好的划分,因为可能是一些艺人想利用一个特定流派的流行趋势。更好的选择是依靠自动音乐类型分类。与我的两位合作者张伟信(Wilson Cheung)和顾长乐(Joy Gu)一起,我们试图比较不同的音乐样本分类方法。特别是,我们评估了标准机器学习和深度学习方法的性能。我们发现特征工程是至关重要的,而领域知识可以真正提高性能。
有关如何下载数据的说明,请参阅存储库中包含的自述文件。我们非常感谢Michaël Defferrard、Kirell Benzi、Pierre Vandergheynst、Xavier Bresson将这些数据整合在一起并免费提供,但我们只能想象Spotify或Pandora Radio拥有的数据规模所能提供的见解。有了这些数据,我们可以描述各种模型来执行手头的任务。
在声谱图上应用迁移学习的卷积神经网络性能最好,不过支持向量机(SVM)和高斯朴素贝叶斯(Gaussian Naive Bayes)的性能与之相近(考虑到后者所做的简化假设,这一点本身就很有趣)。我们在报告中描述了最佳的超参数和模型体系结构。
# import libraries import pandas as pd import tensorflow as tf from IPython.display import Audio import os import matplotlib.pyplot as plt import numpy as np import math import sys from datetime import datetime import pickle import librosa import ast import scipy import librosa.display from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from tensorflow import keras from google.colab import files keras.backend.clear_session() tf.random.set_seed(42) np.random.seed(42)
# mount the drive # adapted from https://colab.sandbox.google.com/notebooks/io.ipynb#scrollTo=S7c8WYyQdh5i from google.colab import drive drive.mount('/content/drive') # load the metadata to Colab from Drive, will greatly speed up the I/O process zip_path_metadata = "/content/drive/My Drive/master_degree/machine_learning/Project/fma_metadata.zip" !cp "{zip_path_metadata}" . !unzip -q fma_metadata.zip !rm fma_metadata.zip # authenticate for GCS access if 'google.colab' in sys.modules: from google.colab import auth auth.authenticate_user()
# set some variables for creating the dataset

# --- tf.data tuning -------------------------------------------------------
AUTO = tf.data.experimental.AUTOTUNE  # used in tf.data.Dataset API

# --- problem dimensions ---------------------------------------------------
N_CLASSES = 8
SHARDS = 16
DATA_SIZE = (224, 224, 3)  # required data size for transfer learning
window_size = 10000  # number of raw audio samples
length_size_2d = 50176  # number of data points to form the Mel spectrogram
feature_size = 85210  # size of the feature vector

# --- Google Cloud Storage locations ---------------------------------------
# input: the raw .wav files of the small subset
GCS_PATTERN = 'gs://music-genre-classification-project-isye6740/fma_small_wav/*/*.wav'
# outputs: each value is a *prefix* for the TFRecord file names
# first type of model
GCS_OUTPUT_1D = 'gs://music-genre-classification-project-isye6740/tfrecords-wav-1D/songs'
# second type of model
GCS_OUTPUT_2D = 'gs://music-genre-classification-project-isye6740/tfrecords-wav-2D/songs'
# models built with extracted features
GCS_OUTPUT_FEATURES = 'gs://music-genre-classification-project-isye6740/tfrecords-features/songs'
# function to load metadata
# adapted from https://github.com/mdeff/fma/blob/master/utils.py
def metadata_load(filepath):
    """Load one of the FMA metadata CSV files into a DataFrame.

    The parsing strategy is chosen from the file name:

    * names containing ``features`` or ``echonest`` are read with a
      three-level column header;
    * names containing ``genres`` are read with a flat header;
    * names containing ``tracks`` are read with a two-level column header and
      then post-processed (list-like columns parsed, date columns converted,
      categorical columns typed).

    Args:
        filepath (str): path to the CSV file.

    Returns:
        pandas.DataFrame for a recognised file name, otherwise ``None``.
    """
    filename = os.path.basename(filepath)

    # both file families share the same three-level header layout
    if 'features' in filename or 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # these columns are stored as the string repr of Python literals
        literal_columns = [('track', 'tags'), ('album', 'tags'),
                           ('artist', 'tags'), ('track', 'genres'),
                           ('track', 'genres_all')]
        for column in literal_columns:
            tracks[column] = tracks[column].map(ast.literal_eval)

        date_columns = [('track', 'date_created'), ('track', 'date_recorded'),
                        ('album', 'date_created'), ('album', 'date_released'),
                        ('artist', 'date_created'),
                        ('artist', 'active_year_begin'),
                        ('artist', 'active_year_end')]
        for column in date_columns:
            tracks[column] = pd.to_datetime(tracks[column])

        # Ordered categorical so that subsets can be compared with <=.
        # The original wrapped this in a try/except whose two branches were
        # identical, so the fallback could never succeed where the first
        # attempt failed; a single statement behaves the same.
        SUBSETS = ('small', 'medium', 'large')
        tracks['set', 'subset'] = tracks['set', 'subset'].astype(
            pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        category_columns = [('track', 'genre_top'), ('track', 'license'),
                            ('album', 'type'), ('album', 'information'),
                            ('artist', 'bio')]
        for column in category_columns:
            tracks[column] = tracks[column].astype('category')

        return tracks

    # unrecognised file name: keep the original implicit behaviour
    return None


# function to get genre information for each track ID
def track_genre_information(GENRE_PATH, TRACKS_PATH, subset):
    """Build a table mapping each track ID of a subset to its top genre.

    Args:
        GENRE_PATH (str): path to the csv with the genre metadata. It is not
            needed to build the table and is no longer read; the parameter is
            kept so existing callers keep working.
        TRACKS_PATH (str): path to the csv with the track metadata.
        subset (str): the subset of the data desired. Tracks whose subset is
            less than or equal to this value (in the ordered categories
            defined in ``metadata_load``) are kept.

    Returns:
        pandas.DataFrame with columns ``track_id``, ``genre`` and
        ``genre_nb`` (the genre encoded as an integer by ``LabelEncoder``).
    """
    # load metadata on all the tracks
    tracks = metadata_load(TRACKS_PATH)

    # focus on the specific subset tracks
    subset_tracks = tracks[tracks['set', 'subset'] <= subset]

    # extract track ID and genre information for each track
    subset_tracks_genre = np.array([np.array(subset_tracks.index),
                                    np.array(subset_tracks['track', 'genre_top'])]).T

    # combine the information in a dataframe
    tracks_genre_df = pd.DataFrame({'track_id': subset_tracks_genre[:, 0],
                                    'genre': subset_tracks_genre[:, 1]})

    # label classes with numbers
    encoder = LabelEncoder()
    tracks_genre_df['genre_nb'] = encoder.fit_transform(tracks_genre_df.genre)

    return tracks_genre_df


# get genre information for all tracks from the small subset
GENRE_PATH = "fma_metadata/genres.csv"
TRACKS_PATH = "fma_metadata/tracks.csv"
subset = 'small'
small_tracks_genre = track_genre_information(GENRE_PATH, TRACKS_PATH, subset)
# check the number of songs which are stored in GCS
nb_songs = len(tf.io.gfile.glob(GCS_PATTERN))
shard_size = math.ceil(1.0 * nb_songs / SHARDS)
print("Pattern matches {} songs which will be rewritten as {} .tfrec files containing {} songs each.".format(nb_songs, SHARDS, shard_size))


# functions to create the dataset from raw audio

# define a function to get the label associated with a file path
def get_label(file_path, genre_df=small_tracks_genre):
    """Return the integer genre label of the track stored at ``file_path``.

    Args:
        file_path: eager string tensor holding the path of a .wav file whose
            base name (without extension) is the zero-padded track ID.
        genre_df: DataFrame with ``track_id`` and ``genre_nb`` columns.

    Returns:
        tf.Tensor of shape (1,) and dtype int32 holding the label.

    Raises:
        IndexError: if the track ID is absent from ``genre_df``.
    """
    path = file_path.numpy().decode("utf-8")
    # int() already ignores leading zeros; stripping them first would turn an
    # all-zero name into an empty string and make int() fail
    track_id = int(path.split('/')[-1].split('.')[0])
    matches = genre_df.loc[genre_df.track_id == track_id, 'genre_nb'].values
    if len(matches) == 0:
        raise IndexError("no genre label found for track id {}".format(track_id))
    # pin the dtype to the one declared as Tout in `parser`
    return tf.constant([matches[0]], dtype=tf.int32)


# define a function that extracts the desired features from a file path
def get_audio(file_path, window_size=window_size):
    """Decode a mono .wav file and keep its first ``window_size`` samples."""
    wav = tf.io.read_file(file_path)
    audio = tf.audio.decode_wav(wav, desired_channels=1).audio
    filtered_audio = audio[:window_size, :]
    return filtered_audio


# process the path
def process_path(file_path, window_size=window_size):
    """Return the ``(audio, label)`` pair for one file path."""
    label = get_label(file_path)
    audio = get_audio(file_path, window_size)
    return audio, label


# parser, wrap around the processing function and specify output shape
def parser(file_path, window_size=window_size):
    """Graph-compatible wrapper around ``process_path``.

    ``process_path`` needs eager execution (it calls ``.numpy()``), so it is
    run through ``tf.py_function``; the static shapes are then restored
    because ``py_function`` outputs have unknown shape.

    Args:
        file_path: string tensor with the path of a .wav file.
        window_size (int): number of audio samples to keep.

    Returns:
        ``(audio, label)`` with static shapes ``(window_size, 1)`` and ``(1,)``.
    """
    # Forward window_size explicitly: previously process_path always used its
    # own default, so a non-default value here cropped to one length and then
    # declared another.
    audio, label = tf.py_function(
        lambda path: process_path(path, window_size),
        [file_path],
        (tf.float32, tf.int32))
    audio.set_shape((window_size, 1))
    label.set_shape((1,))
    return audio, label


filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=35155)  # This also shuffles the song files
dataset_1d = filenames.map(parser, num_parallel_calls=AUTO)
# one batch per output shard
dataset_1d = dataset_1d.batch(shard_size)
现在我们有了数据集,接下来使用TFRecord格式将其存储在GCS上。这是GPU和TPU推荐使用的格式,因为并行化可以带来快速的I/O。其核心概念是tf.train.Feature和tf.train.Example:我们把数据集中的每条样本写成这样的Example,再存储到GCS上。这部分代码迁移到其他项目时只需要极少的改动,主要是更改特征的类型。如果数据已经以TFRecord格式上传过一次,则可以跳过此部分。本节中的大部分代码都改编自TensorFlow官方文档以及一篇有关音频管道的教程。
# write to TFRecord
# need to TFRecord to greatly speed up the I/O process, previously a bottleneck

# functions to create various features
# adapted from https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
# and https://www.tensorflow.org/tutorials/load_data/tfrecord
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))


def _int_feature(list_of_ints):  # int64
    """Wrap a list of integers in a ``tf.train.Feature``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))


def _float_feature(list_of_floats):  # float32
    """Wrap a list of floats in a ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))


# writer function
def to_tfrecord(tfrec_filewriter, song, label):
    """Build the ``tf.train.Example`` for one song.

    Args:
        tfrec_filewriter: unused; kept so existing callers keep working.
        song (numpy.ndarray): audio samples of one song, any shape (flattened
            before being stored).
        label: class index, either a scalar or a one-element array.

    Returns:
        tf.train.Example with the features ``song``, ``class`` and
        ``one_hot_class``.
    """
    # Reduce the label to a plain Python int so that the same code works for
    # a scalar and for a one-element array, and so that the protobuf list
    # receives an int rather than a NumPy array.
    label_id = int(np.ravel(label)[0])
    one_hot_class = np.eye(N_CLASSES)[label_id]
    feature = {
        "song": _float_feature(song.flatten().tolist()),  # one song in the list
        "class": _int_feature([label_id]),  # one class in the list
        "one_hot_class": _float_feature(one_hot_class.tolist())  # list of N_CLASSES floats
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def write_tfrecord(dataset, GCS_OUTPUT):
    """Write every batch of ``dataset`` to its own TFRecord file.

    Args:
        dataset: iterable of ``(song, label)`` batches of eager tensors; each
            batch becomes one shard.
        GCS_OUTPUT (str): prefix of the output file names. Files are named
            ``<prefix><shard index>-<number of records>.tfrec``.
    """
    print("Writing TFRecords")
    for shard, (song, label) in enumerate(dataset):
        # convert once per shard instead of once per record
        songs = song.numpy()
        labels = label.numpy()
        # batch size used as shard size here
        shard_size = songs.shape[0]
        # good practice to have the number of records in the filename
        filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, shard_size)

        with tf.io.TFRecordWriter(filename) as out_file:
            for one_song, one_label in zip(songs, labels):
                example = to_tfrecord(out_file, one_song, one_label)
                out_file.write(example.SerializeToString())
        print("Wrote file {} containing {} records".format(filename, shard_size))
一旦这些记录被存储,我们还需要其他函数来读取它们。这些函数依次处理每个样本,从TFRecord中提取相关信息并重新构造出tf.data.Dataset。这看起来像是在绕圈子(创建一个tf.data.Dataset→作为TFRecord上传到GCS→再将TFRecord读回tf.data.Dataset),但这实际上通过简化I/O过程带来了巨大的速度提升。如果I/O是瓶颈,那么使用GPU或TPU也无济于事;这种方法通过优化数据加载,让我们能够在训练期间充分利用它们带来的速度增益。
# function to parse an example and return the song feature and the one-hot class
# adapted from https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
# and https://www.tensorflow.org/tutorials/load_data/tfrecord
def read_tfrecord_1d(example):
    """Parse one serialized example into ``(song, one_hot_class)``.

    Args:
        example: scalar string tensor holding a serialized
            ``tf.train.Example`` with the features ``song``, ``class`` and
            ``one_hot_class``.

    Returns:
        Tuple ``(song, one_hot_class)``: a float32 tensor of shape
        ``(window_size, 1)`` and a float32 tensor of shape ``(N_CLASSES,)``.
    """
    features = {
        "song": tf.io.FixedLenFeature([window_size], tf.float32),  # window_size float samples
        "class": tf.io.FixedLenFeature([1], tf.int64),  # one integer label
        "one_hot_class": tf.io.VarLenFeature(tf.float32),  # parsed as a sparse tensor
    }
    parsed = tf.io.parse_single_example(example, features)

    # already float32 per the feature spec; only add the channel axis
    song = tf.reshape(parsed['song'], [window_size, 1])

    # the integer "class" feature is parsed (and so validated) but only the
    # one-hot encoding is returned
    one_hot_class = tf.sparse.to_dense(parsed['one_hot_class'])
    one_hot_class = tf.reshape(one_hot_class, [N_CLASSES])

    return song, one_hot_class


# function to load the dataset from TFRecords
def load_dataset_1d(filenames):
    """Build an unbatched dataset of ``(song, one_hot_class)`` pairs.

    Args:
        filenames: list of TFRecord file paths.

    Returns:
        tf.data.Dataset yielding the output of ``read_tfrecord_1d``; records
        that raise an error while being read or parsed are dropped.
    """
    # read from TFRecords. For optimal performance, read from multiple
    # TFRecord files at once and set the option experimental_deterministic = False
    # to allow order-altering optimizations.
    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = False

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    dataset = dataset.with_options(option_no_order)
    dataset = dataset.map(read_tfrecord_1d, num_parallel_calls=AUTO)
    # ignore potentially corrupted records
    dataset = dataset.apply(tf.data.experimental.ignore_errors())
    return dataset
def _count_records(filenames):
    """Sum the record counts embedded in TFRecord file names.

    The writer in this file names shards ``<prefix><index>-<count>.tfrec``.
    Returns ``None`` as soon as one name does not end with ``-<count>.tfrec``.
    """
    import re  # local import: only this helper needs it

    total = 0
    for name in filenames:
        match = re.search(r"-(\d+)\.tfrec$", name)
        if match is None:
            return None
        total += int(match.group(1))
    return total


# function to create training, validation and testing sets
# adapted from https://colab.sandbox.google.com/notebooks/tpu.ipynb
# and https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
def create_train_validation_testing_sets(TFREC_PATTERN,
                                         VALIDATION_SPLIT=0.2,
                                         TESTING_SPLIT=0.2):
    """Pick a distribution strategy and split the TFRecord files.

    Args:
        TFREC_PATTERN: string pattern for the TFREC bucket on GCS
        VALIDATION_SPLIT (float): fraction of *all* files used for validation.
        TESTING_SPLIT (float): fraction of *all* files used for testing.

    Returns:
        Tuple ``(tpu, BATCH_SIZE, strategy, training_filenames,
        validation_filenames, testing_filenames, steps_per_epoch)``.
        ``tpu`` is ``None`` when no TPU was detected.

    Raises:
        ValueError: if no file matches ``TFREC_PATTERN``.
    """
    # see which accelerator is available
    tpu = None
    try:  # detect TPUs
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    except ValueError:  # detect GPUs
        strategy = tf.distribute.MirroredStrategy()  # for GPU or multi-GPU machines

    print("Number of accelerators: ", strategy.num_replicas_in_sync)

    # Configuration
    # adapted from https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
    if tpu:
        BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # A TPU has 8 cores so this will be 128
    else:
        BATCH_SIZE = 32  # On Colab/GPU, a higher batch size does not help and sometimes does not fit on the GPU (OOM)

    # splitting data files between training, validation and testing
    filenames = tf.io.gfile.glob(TFREC_PATTERN)
    if not filenames:
        # previously surfaced as an opaque ZeroDivisionError further down
        raise ValueError("No TFRecord file matches pattern {!r}".format(TFREC_PATTERN))

    testing_split = int(len(filenames) * TESTING_SPLIT)
    testing_filenames = filenames[:testing_split]
    remaining_filenames = filenames[testing_split:]

    # the validation share is also taken as a fraction of the total file count
    validation_split = int(len(filenames) * VALIDATION_SPLIT)
    validation_filenames = remaining_filenames[:validation_split]
    training_filenames = remaining_filenames[validation_split:]

    # Number of training records: read it from the file names when they carry
    # it. Otherwise fall back to the original estimate, which assumed 3670
    # records in total. NOTE(review): 3670 does not relate to anything else in
    # this file; presumably it was inherited from the codelab this code was
    # adapted from - confirm the real dataset size.
    n_training = _count_records(training_filenames)
    if n_training is None:
        n_training = 3670 // len(filenames) * len(training_filenames)
    steps_per_epoch = n_training // BATCH_SIZE

    return tpu, BATCH_SIZE, strategy, training_filenames, validation_filenames, testing_filenames, steps_per_epoch


# get the batched dataset, optimizing for I/O performance
# follow best practice for shuffling and repeating data
def get_batched_dataset(filenames, load_func, train=False, batch_size=None):
    """Load, cache, optionally repeat, batch and prefetch a dataset.

    Args:
        filenames: filenames to load
        load_func: specific loading function to use
        train: Boolean, whether this is a training set
        batch_size: batch size to use; when ``None`` (the default) the
            module-level ``BATCH_SIZE`` is used, as before.

    Returns:
        tf.data.Dataset of batches; it repeats indefinitely when ``train`` is
        true.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    dataset = load_func(filenames)
    dataset = dataset.cache()  # This dataset fits in RAM
    if train:
        # Best practices for Keras:
        # Training dataset: repeat then batch
        # Evaluation dataset: do not repeat
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTO)  # prefetch next batch while training (autotune prefetch buffer size)
    # should shuffle too but this dataset was well shuffled on disk already
    return dataset
    # source: Dataset performance guide: https://www.tensorflow.org/guide/performance/datasets


# instantiate the datasets
# NOTE(review): BATCH_SIZE and the *_filenames_1d names used below are not
# assigned anywhere in the visible code; presumably they come from unpacking
# the return value of create_train_validation_testing_sets - confirm.
training_dataset_1d = get_batched_dataset(training_filenames_1d, load_dataset_1d,
                                          train=True)
validation_dataset_1d = get_batched_dataset(validation_filenames_1d, load_dataset_1d,
                                            train=False)
testing_dataset_1d = get_batched_dataset(testing_filenames_1d, load_dataset_1d,
                                         train=False)
# create a CNN model
# the model is built and compiled inside the strategy scope so that its
# variables are created under the selected distribution strategy
with strategy.scope():
    # create the model: two Conv1D/MaxPooling1D stages on the raw waveform,
    # followed by a small dense classifier with dropout
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu',
                               input_shape=[window_size, 1], name='conv1'),
        tf.keras.layers.MaxPooling1D(name='max1'),
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu',
                               name='conv2'),
        tf.keras.layers.MaxPooling1D(name='max2'),
        tf.keras.layers.Flatten(name='flatten'),
        tf.keras.layers.Dense(100, activation='relu', name='dense1'),
        tf.keras.layers.Dropout(0.5, name='dropout2'),
        tf.keras.layers.Dense(20, activation='relu', name='dense2'),
        tf.keras.layers.Dropout(0.5, name='dropout3'),
        # one logit per class; tied to N_CLASSES so it cannot drift from the
        # width of the one-hot labels
        tf.keras.layers.Dense(N_CLASSES, name='dense3')
    ])

    # compile
    # from_logits=True because the last layer has no softmax activation
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.summary()

# train the model
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
EPOCHS = 100

# steps_per_epoch is passed explicitly: get_batched_dataset(train=True)
# repeats the training data indefinitely
raw_audio_history = model.fit(training_dataset_1d,
                              steps_per_epoch=steps_per_epoch,
                              validation_data=validation_dataset_1d,
                              epochs=EPOCHS,
                              callbacks=[tensorboard_callback])  # the API documents a list

# evaluate on the test data
model.evaluate(testing_dataset_1d)
# IPython magics (notebook only): load the TensorBoard extension, then open
# TensorBoard on the directory the training cell writes its logs under
%load_ext tensorboard
%tensorboard --logdir logs/scalars
作者:Célestin Hermez
本文代码 https://github.com/celestinhermez/music-genre-classification
deephub 翻译组