Hello!
This Notebook is associated with the ICASSP 2022 submission, presenting audio outputs of the Nonnegative Tucker Decomposition (NTD) when optimizing different loss functions. In particular, the three evaluated loss functions are three special cases of the more general $\beta$-divergence: the Euclidean norm ($\beta = 2$), the Kullback-Leibler divergence ($\beta = 1$) and the Itakura-Saito divergence ($\beta = 0$).
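As a reminder (the notation here is ours, recalled for convenience), for scalars $x, y > 0$, the $\beta$-divergence is:

$$d_\beta(x|y) = \frac{1}{\beta(\beta - 1)}\left(x^\beta + (\beta - 1)\,y^\beta - \beta\,x\,y^{\beta - 1}\right), \qquad \beta \in \mathbb{R} \setminus \{0, 1\},$$

with the limit cases $d_1(x|y) = x \log\frac{x}{y} - x + y$ (Kullback-Leibler) and $d_0(x|y) = \frac{x}{y} - \log\frac{x}{y} - 1$ (Itakura-Saito); $\beta = 2$ yields half the squared Euclidean distance, $d_2(x|y) = \frac{1}{2}(x - y)^2$.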
More details about our algorithm can be found in the ICASSP submission (which is probably the reason you are on this page). Audio signals are obtained by applying the Griffin-Lim algorithm to the STFT spectrograms.
This notebook will present signals, showing results of:
- the Griffin-Lim reconstruction of the original barwise tensor-spectrogram, used as a reference for the reconstruction quality;
- the reconstructions obtained from the NTD with each of the three loss functions.
Note, though, that signals representing songs will be limited to their first 16 bars, in order to limit the size of this HTML page.
We insist on the fact that, while these audio signals are listenable, they are not of professional musical quality, either due to inaccuracies in the decomposition or due to the phase-estimation algorithm that we use (Griffin-Lim). Improving the reconstruction of these signals could constitute future work.
In the meantime, we believe that these audio examples illustrate well the potential and outputs of the NTD, and allow a qualitative evaluation of the differences between the loss functions.
Let's start by importing external libraries (which are installed automatically if you used `pip install`; otherwise, you should install them manually).
# External imports
# Module for manipulating arrays
import numpy as np
# Module for loading signals
import soundfile as sf
# Module for manipulating signals, notably computing the STFT and Griffin-Lim
import librosa
# Module for displaying audio players in the Notebook
import IPython.display as ipd
And now, let's import the `nn_fac` and `MusicNTD` code (respectively, code for Nonnegative Factorization methods, and code for everything else associated with NTD for music segmentation: data manipulation, segmentation, etc.):
# Module containing our NTD resolution algorithm
import nn_fac.ntd as NTD
# Module encapsulating the computation of features from the signal
import musicntd.model.features as features
# General module for manipulating data: conversion between time, bars, frame indexes, loading of data, ...
import musicntd.data_manipulation as dm
# Module constructing the tensor, starting from the spectrogram
import musicntd.tensor_factory as tf
# Plotting module
from musicntd.model.current_plot import *
Next, we need to load the song to decompose. We used "Come Together" by The Beatles as an example, but feel free to choose any song you'd like (in WAV format, though)!
NB: this comment only applies if you're running the Notebook yourself, and not reading the HTML, as the HTML is static.
# Song
song_path = "C:/Users/amarmore/this_folder/The Beatles - Come Together.wav"
the_signal, sampling_rate = sf.read(song_path)
# Get the downbeats
bars = dm.get_bars_from_audio(song_path)
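NB: `bars` contains the bar boundaries as (start, end) times in seconds, one tuple per bar; the rest of the Notebook indexes them from `bars[1]` onward (our reading of the code: the segment before the first downbeat is skipped).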
Let's compute the STFT of the song:
n_fft = 2048
hop_length = 32
# Compute the STFT of the first channel, then add the STFTs of the remaining channels
stft_complex = librosa.core.stft(np.asfortranarray(the_signal[:,0]), n_fft=n_fft, hop_length=hop_length)
for i in range(1, the_signal.shape[1]):
    stft_complex += librosa.core.stft(np.asfortranarray(the_signal[:,i]), n_fft=n_fft, hop_length=hop_length)
mag, phase = librosa.magphase(stft_complex, power=1) # Magnitude spectrogram
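NB: since the STFT is linear, summing the STFTs of the channels is equivalent to computing the STFT of the mono downmix (the sum of the channels).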
and then form the tensor-spectrogram of this STFT:
hop_length_seconds = hop_length / sampling_rate
subdivision = 96
tensor_stft = tf.tensorize_barwise(mag, bars, hop_length_seconds, subdivision)
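For intuition, here is a minimal, hypothetical sketch of what this barwise tensorization amounts to: for each bar, `subdivision` (here, 96) regularly spaced frames are selected in the spectrogram, and the resulting matrices are stacked into a third-order tensor. This sketch is an illustration only; refer to `musicntd.tensor_factory` for the actual implementation.

```python
def sketch_tensorize_barwise(spectrogram, bars, hop_length_seconds, subdivision):
    # Illustrative, hypothetical re-implementation: the real code is tf.tensorize_barwise.
    barwise_slices = []
    for (start, end) in bars[1:]:  # bar boundaries, in seconds
        # Indexes of `subdivision` regularly spaced frames within this bar
        frame_indexes = [int((start + k * (end - start) / subdivision) / hop_length_seconds)
                         for k in range(subdivision)]
        barwise_slices.append(spectrogram[:, frame_indexes])
    # Shape: (frequency, subdivision, number of bars)
    return np.stack(barwise_slices, axis=-1)
```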
We reconstruct the song from the unfolded tensor-spectrogram. Hence, the song will be reconstructed from the 96 chosen frames per bar.
To reconstruct the song, the algorithm needs the hop length of the STFT. As bars can be of different lengths, we compute the median hop length over the different bars, and apply it to all bars of the song.
hops = []
for bar_idx in range(tensor_stft.shape[2]):
    len_sig = bars[bar_idx+1][1] - bars[bar_idx+1][0]
    hop = int(len_sig/96 * sampling_rate)
    hops.append(hop)
median_hop = int(np.median(hops))
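For instance (hypothetical values, for illustration only), a bar lasting 2 seconds at a sampling rate of 44.1 kHz would yield hop = int(2/96 × 44100) = 918 samples.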
Now, let's recreate the signal from the barwise STFT, in order to assess the reconstruction quality of the Griffin-Lim algorithm alone. We limit the song to a certain number of bars (so as not to overload the final HTML file).
nb_bars = 16 # you can set it to 89 if you use the executable format, and listen to the whole song.
time = nb_bars * subdivision
audio_stft = librosa.griffinlim(np.reshape(tensor_stft[:,:,:nb_bars], (1025, time), order = 'F'), hop_length = median_hop)
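Note that this Fortran-order reshape is simply the unfolding of the tensor along its time dimension: it concatenates the bars one after the other along the time axis. The following equivalent formulation (a sanity check, not needed for the rest of the Notebook) may be easier to read:

```python
# Equivalent, more explicit unfolding: concatenate the barwise slices along the time axis
unfolded = np.concatenate([tensor_stft[:,:,bar] for bar in range(nb_bars)], axis=1)
assert np.allclose(unfolded, np.reshape(tensor_stft[:,:,:nb_bars], (1025, time), order='F'))
```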
Let's hear it:
ipd.Audio(audio_stft, rate=sampling_rate)
We already hear some artifacts coming from this reconstruction. Hence, the signals reconstructed from the decompositions won't sound better than this one, which is already degraded.
Let's compute the NTD of this tensor-spectrogram, and study the reconstructed signal and the barwise patterns obtained in the decomposition.
As a reminder, NTD is a tensor decomposition method, which can be used to retrieve patterns from data.
We refer to the ICASSP submission or to [1] for details.
First, we need to set the dimensions of the decomposition, corresponding to the dimensions of the core tensor. They were set empirically here.
ranks = [32,24,12] # Dimensions of the decomposition (frequency, intra-bar time, and pattern modes, respectively)
n_iter_max = 100
Below, the NTD is computed with the HALS algorithm, minimizing the Euclidean norm ($\beta$-divergence with $\beta = 2$) between the original and the reconstructed tensor.
core_beta2, factors_beta2 = NTD.ntd(tensor_stft, ranks = ranks, init = "tucker", verbose = False, deterministic = True,
                                    sparsity_coefficients = [None, None, None, None], normalize = [True, True, False, True], mode_core_norm = 2, n_iter_max = n_iter_max)
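To make the outputs concrete, here are the shapes one should expect for the returned core and factors, given the mode interpretations above and a 1025 × 96 × (number of bars) tensor-spectrogram (this is our reading of the decomposition, not an output excerpt):

```python
# Expected: [(1025, 32), (96, 24), (nb_of_bars, 12)]
# i.e., frequency templates, intra-bar temporal templates, and barwise pattern weights
print([factor.shape for factor in factors_beta2])
print(core_beta2.shape)  # Expected: (32, 24, 12), linking the three modes
```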
Below, the NTD is computed with the MU algorithm, minimizing the Kullback-Leibler divergence ($\beta$-divergence with $\beta = 1$) between the original and the reconstructed tensor.
core_beta1, factors_beta1 = NTD.ntd_mu(tensor_stft, ranks = ranks, init = "tucker", verbose = False, deterministic = True, beta = 1,
                                       sparsity_coefficients = [None, None, None, None], normalize = [True, True, False, True], mode_core_norm = 2, n_iter_max = n_iter_max)
Below, the NTD is computed with the MU algorithm, minimizing the Itakura-Saito divergence ($\beta$-divergence with $\beta = 0$) between the original and the reconstructed tensor.
core_beta0, factors_beta0 = NTD.ntd_mu(tensor_stft, ranks = ranks, init = "tucker", verbose = False, deterministic = True, beta = 0,
                                       sparsity_coefficients = [None, None, None, None], normalize = [True, True, False, True], mode_core_norm = 2, n_iter_max = n_iter_max)
Having decomposed the song with the three different loss functions, we will now compare the resulting decompositions by listening to them.
To that end, we recompose the barwise spectrograms from the factors of each NTD and use the Griffin-Lim algorithm to reconstruct a signal.
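Concretely (with notation of our own: $W$, $H$, $Q$ for `factors[0]`, `factors[1]`, `factors[2]`, and $G$ for the core), the spectrogram of the $b$-th bar is recomposed as:

$$\hat{X}_b = \sum_{p} Q_{b,p} \; W \, G_{:,:,p} \, H^T,$$

i.e., a weighted sum of the musical patterns $W G_{:,:,p} H^T$, which is exactly what the function below computes.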
# Function reconstructing the signal from the NTD results.
def reconstruct_song_from_ntd(core, factors, bars, nb_bars = None):
    if nb_bars is None:
        nb_bars = factors[2].shape[0]
    barwise_spec_shape = (factors[0]@core[:,:,0]@factors[1].T).shape
    signal_content = None
    hops = []
    for bar_idx in range(nb_bars):
        # Hop length of this bar (as above, the median over bars is used for the reconstruction)
        len_sig = bars[bar_idx+1][1] - bars[bar_idx+1][0]
        hops.append(int(len_sig/96 * sampling_rate))
        # Weighted sum of the patterns, forming the spectrogram of this bar
        patterns_weights = factors[2][bar_idx]
        bar_content = np.zeros(barwise_spec_shape)
        for pat_idx in range(ranks[2]):
            bar_content += patterns_weights[pat_idx] * factors[0]@core[:,:,pat_idx]@factors[1].T
        # Concatenate the bars along the time axis
        signal_content = np.concatenate((signal_content, bar_content), axis=1) if signal_content is not None else bar_content
    median_hop = int(np.median(hops))
    reconstructed_song = librosa.griffinlim(signal_content, hop_length = median_hop)
    return reconstructed_song
audio_beta2 = reconstruct_song_from_ntd(core_beta2, factors_beta2, bars, nb_bars = nb_bars)
signal_beta2 = ipd.Audio(audio_beta2, rate=sampling_rate)
audio_beta1 = reconstruct_song_from_ntd(core_beta1, factors_beta1, bars, nb_bars = nb_bars)
signal_beta1 = ipd.Audio(audio_beta1, rate=sampling_rate)
audio_beta0 = reconstruct_song_from_ntd(core_beta0, factors_beta0, bars, nb_bars = nb_bars)
signal_beta0 = ipd.Audio(audio_beta0, rate=sampling_rate)
plot_audio_diff_beta_in_dataframe(signal_beta2, signal_beta1, signal_beta0)
| beta = 2 | beta = 1 | beta = 0 |
| --- | --- | --- |
| (audio player) | (audio player) | (audio player) |