Statistical Modeling in MIR: Hidden Markov Models and Gaussian Mixtures
Master the probabilistic frameworks that power chord recognition, beat tracking, and musical structure analysis. Learn how HMMs capture temporal dependencies and GMMs model complex distributions in music.
The Power of Probabilistic Models in Music
Music is inherently uncertain and variable. The same chord progression can be played with different voicings, the same rhythm with subtle timing variations. Statistical models provide a principled framework for handling this uncertainty, learning patterns from data, and making probabilistic predictions about musical content.
- • Handle uncertainty: Music performance varies; models capture this variability
- • Learn from data: Parameters estimated from training examples
- • Temporal modeling: Capture how musical events evolve over time
- • Interpretability: Model parameters often have musical meaning
Gaussian Mixture Models (GMMs)
GMMs model complex probability distributions as weighted sums of Gaussian components. In music, they're used for timbre modeling, speaker/instrument identification, and clustering acoustic features.
A GMM with $K$ components models the probability density:
$$p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(\mathbf{x} \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)$$
Where:
- $\pi_k$ = mixing weight (prior probability) of component $k$
- $\boldsymbol{\mu}_k$ = mean vector of component $k$
- $\boldsymbol{\Sigma}_k$ = covariance matrix of component $k$
- $\sum_{k=1}^{K} \pi_k = 1$ (weights sum to 1)
Parameters are learned with the EM algorithm.
E-step: compute responsibilities
$$\gamma(z_{nk}) = \frac{\pi_k \, \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}$$
the posterior probability that point $\mathbf{x}_n$ belongs to component $k$.
M-step: update parameters
$$\boldsymbol{\mu}_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk}) \, \mathbf{x}_n, \qquad \boldsymbol{\Sigma}_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk}) (\mathbf{x}_n - \boldsymbol{\mu}_k)(\mathbf{x}_n - \boldsymbol{\mu}_k)^\top, \qquad \pi_k = \frac{N_k}{N}$$
where $N_k = \sum_{n=1}^{N} \gamma(z_{nk})$.
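To connect these formulas to code before the full scikit-learn example below, here is a minimal NumPy/SciPy sketch of the E-step; the function name toy_gmm_responsibilities and all numbers are illustrative, not part of any library.

import numpy as np
from scipy.stats import multivariate_normal

def toy_gmm_responsibilities(X, weights, means, covariances):
    """E-step: posterior probability of each component for each point."""
    # Weighted density of every component evaluated at every point
    densities = np.column_stack([
        w * multivariate_normal.pdf(X, mean=m, cov=c)
        for w, m, c in zip(weights, means, covariances)
    ])
    # Normalize rows so the responsibilities for each point sum to 1
    return densities / densities.sum(axis=1, keepdims=True)

# Two toy components, e.g. a "bright" vs. a "dark" timbre cluster
weights = np.array([0.6, 0.4])
means = np.array([[0.0, 0.0], [3.0, 3.0]])
covariances = np.array([np.eye(2), np.eye(2)])
X = np.array([[0.1, -0.2], [2.8, 3.1], [1.5, 1.5]])

gamma = toy_gmm_responsibilities(X, weights, means, covariances)
print(gamma)  # each row sums to 1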
import numpy as np
from sklearn.mixture import GaussianMixture
import librosa
import matplotlib.pyplot as plt

class MusicGMM:
    def __init__(self, n_components=8, covariance_type='full'):
        """
        Initialize GMM for music analysis

        Parameters:
        - n_components: Number of Gaussian components
        - covariance_type: 'full', 'tied', 'diag', 'spherical'
        """
        self.gmm = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            max_iter=100,
            n_init=3,
            random_state=42
        )
        self.feature_extractor = None

    def extract_features(self, audio, sr=22050):
        """Extract MFCC features for timbre modeling"""
        # Compute MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Add delta and delta-delta features
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)

        # Stack features
        features = np.vstack([mfccs, delta_mfccs, delta2_mfccs])

        # Transpose for (n_samples, n_features) format
        return features.T

    def fit_instrument_model(self, audio_samples, labels=None):
        """
        Fit GMM to model instrument timbres

        Parameters:
        - audio_samples: List of audio arrays
        - labels: Optional instrument labels for supervised learning
        """
        all_features = []

        for audio in audio_samples:
            features = self.extract_features(audio)
            all_features.append(features)

        # Concatenate all features
        X = np.vstack(all_features)

        # Fit GMM
        self.gmm.fit(X)

        # Store results
        self.means_ = self.gmm.means_
        self.covariances_ = self.gmm.covariances_
        self.weights_ = self.gmm.weights_

        return self

    def score_audio(self, audio, sr=22050):
        """Compute log-likelihood of audio under the model"""
        features = self.extract_features(audio, sr)
        return self.gmm.score(features)

    def classify_frames(self, audio, sr=22050):
        """Classify each frame to most likely component"""
        features = self.extract_features(audio, sr)
        labels = self.gmm.predict(features)
        probabilities = self.gmm.predict_proba(features)

        return labels, probabilities

    def generate_samples(self, n_samples=100):
        """Generate new samples from the learned distribution"""
        samples, labels = self.gmm.sample(n_samples)
        return samples, labels


# Universal Background Model for speaker/instrument verification
class UBM_GMM:
    def __init__(self, n_components=64):
        self.ubm = GaussianMixture(n_components=n_components, covariance_type='diag')
        self.adapted_models = {}

    def train_ubm(self, background_data):
        """Train Universal Background Model on diverse data"""
        features = []
        for audio in background_data:
            mfccs = librosa.feature.mfcc(y=audio, n_mfcc=13)
            features.append(mfccs.T)

        X = np.vstack(features)
        self.ubm.fit(X)
        return self

    def adapt_model(self, target_audio, speaker_id, relevance_factor=16):
        """
        MAP adaptation for speaker/instrument-specific model

        Uses Maximum a Posteriori adaptation from UBM
        """
        # Extract features
        mfccs = librosa.feature.mfcc(y=target_audio, n_mfcc=13)
        X = mfccs.T

        # E-step: compute responsibilities
        responsibilities = self.ubm.predict_proba(X)

        # Compute sufficient statistics (zeroth, first, second order)
        n = responsibilities.sum(axis=0) + 1e-10  # small epsilon guards against empty components
        F = responsibilities.T @ X
        S = responsibilities.T @ (X ** 2)  # second-order stats; unused when adapting means only

        # MAP adaptation coefficient per component
        alpha = n / (n + relevance_factor)

        # Adapt means only (common approach)
        adapted_means = alpha[:, np.newaxis] * (F / n[:, np.newaxis]) + (1 - alpha[:, np.newaxis]) * self.ubm.means_

        # Create adapted model
        adapted_gmm = GaussianMixture(n_components=self.ubm.n_components)
        adapted_gmm.means_ = adapted_means
        adapted_gmm.covariances_ = self.ubm.covariances_
        adapted_gmm.weights_ = self.ubm.weights_
        adapted_gmm.precisions_cholesky_ = self.ubm.precisions_cholesky_

        self.adapted_models[speaker_id] = adapted_gmm
        return adapted_gmm


# Example: Music genre classification with GMM
def genre_classification_gmm(audio_files, genres, test_audio):
    """
    Classify music genre using GMM likelihood
    """
    genre_models = {}

    # Train one GMM per genre
    for genre in np.unique(genres):
        genre_audio = [audio_files[i] for i in range(len(audio_files))
                       if genres[i] == genre]

        model = MusicGMM(n_components=16)
        model.fit_instrument_model(genre_audio)
        genre_models[genre] = model

    # Classify test audio
    scores = {}
    for genre, model in genre_models.items():
        scores[genre] = model.score_audio(test_audio)

    # Return genre with highest likelihood
    return max(scores, key=scores.get), scores
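As a usage sketch for the classes above (the file paths and the clip list are hypothetical placeholders):

# Hypothetical usage of MusicGMM; file paths are placeholders
import librosa

violin_clips = [librosa.load(path, sr=22050)[0]
                for path in ['violin_01.wav', 'violin_02.wav']]
violin_model = MusicGMM(n_components=8).fit_instrument_model(violin_clips)

test_audio, _ = librosa.load('unknown_clip.wav', sr=22050)
print("Log-likelihood under violin model:", violin_model.score_audio(test_audio))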
Hidden Markov Models (HMMs)
HMMs model sequences where the system transitions through hidden states, each emitting observable outputs. They're fundamental for chord recognition, beat tracking, and structural segmentation in music.
States and Observations
- • Hidden states: $S = \{s_1, \ldots, s_N\}$ (e.g., chord types)
- • Observations: $O = (o_1, \ldots, o_T)$ (e.g., audio features)
Model Parameters
An HMM is specified by $\lambda = (A, B, \pi)$:
- • $A = \{a_{ij}\}$: transition probabilities, $a_{ij} = P(q_{t+1} = s_j \mid q_t = s_i)$
- • $B = \{b_j(o_t)\}$: emission probabilities (often one GMM per state)
- • $\pi = \{\pi_i\}$: initial state probabilities
Forward Algorithm
Compute the probability of an observation sequence, $P(O \mid \lambda)$. Recursion:
$$\alpha_t(j) = \left[ \sum_{i=1}^{N} \alpha_{t-1}(i)\, a_{ij} \right] b_j(o_t)$$
Viterbi Algorithm
Find the most likely state sequence. Recursion:
$$\delta_t(j) = \max_{1 \le i \le N} \left[ \delta_{t-1}(i)\, a_{ij} \right] b_j(o_t), \qquad \psi_t(j) = \arg\max_{i} \left[ \delta_{t-1}(i)\, a_{ij} \right]$$
Baum-Welch (EM)
Learn the HMM parameters via EM: the E-step computes expected state-occupancy and transition counts from the forward-backward probabilities; the M-step updates $A$, $B$, and $\pi$ from these expected counts.
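Before turning to hmmlearn, a minimal NumPy sketch of the forward recursion and Viterbi decoding on a toy two-chord HMM may help map the formulas to code; all probabilities here are made-up illustrative values.

import numpy as np

# Toy 2-state HMM: states = ("C:maj", "G:maj")
pi = np.array([0.7, 0.3])                # initial state probabilities
A = np.array([[0.8, 0.2], [0.3, 0.7]])   # transition matrix a_ij
# Emission likelihoods b_j(o_t) for a short observation sequence, shape (T, N)
B = np.array([[0.9, 0.2], [0.6, 0.5], [0.1, 0.8]])

# Forward algorithm: P(O | lambda)
alpha = pi * B[0]
for t in range(1, len(B)):
    alpha = (alpha @ A) * B[t]
print("Sequence likelihood:", alpha.sum())

# Viterbi: most likely state path
delta = pi * B[0]
psi = []
for t in range(1, len(B)):
    trans = delta[:, None] * A          # delta_{t-1}(i) * a_ij
    psi.append(trans.argmax(axis=0))    # best predecessor for each state j
    delta = trans.max(axis=0) * B[t]
path = [int(delta.argmax())]
for back in reversed(psi):              # backtracking
    path.append(int(back[path[-1]]))
print("Most likely path:", path[::-1])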
import numpy as np
from hmmlearn import hmm
import librosa

class ChordRecognitionHMM:
    def __init__(self, n_states=24, n_mix=4):
        """
        HMM for chord recognition

        Parameters:
        - n_states: Number of chord types (e.g., 12 major + 12 minor)
        - n_mix: Number of Gaussian mixtures per state
        """
        self.n_states = n_states
        self.chord_labels = self._generate_chord_labels()

        # Use Gaussian Mixture HMM for continuous observations
        self.model = hmm.GMMHMM(
            n_components=n_states,
            n_mix=n_mix,
            covariance_type="diag",
            n_iter=100,
            init_params="mcw"  # keep the music-theory startprob_/transmat_ set below as initialization
        )

        # Music theory constraints
        self._init_with_music_theory()

    def _generate_chord_labels(self):
        """Generate chord labels (C, C#, D, ..., B for major and minor)"""
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        labels = []
        for note in notes:
            labels.append(f"{note}:maj")
        for note in notes:
            labels.append(f"{note}:min")
        return labels[:self.n_states]

    def _init_with_music_theory(self):
        """Initialize transition matrix with music theory knowledge"""
        # Create transition matrix based on circle of fifths
        trans_mat = np.ones((self.n_states, self.n_states)) * 0.01

        for i in range(self.n_states):
            # Self-transition (chord sustains)
            trans_mat[i, i] = 0.6

            # Common progressions (I-IV-V, ii-V-I, etc.)
            # Simplified: increase probability for fifth relationships
            fifth_up = (i + 7) % 12
            fifth_down = (i - 7) % 12
            fourth_up = (i + 5) % 12

            # Account for major/minor
            if i < 12:  # Major chord
                trans_mat[i, fifth_up] = 0.15   # V
                trans_mat[i, fourth_up] = 0.1   # IV
                trans_mat[i, (i + 12) % self.n_states] = 0.05  # Parallel minor (same root)
            else:  # Minor chord
                trans_mat[i, fifth_up + 12] = 0.15
                trans_mat[i, fourth_up + 12] = 0.1
                trans_mat[i, i - 12] = 0.05  # Parallel major (same root)

        # Normalize rows
        trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=True)
        self.model.transmat_ = trans_mat

        # Initialize start probabilities (often starts on tonic)
        start_prob = np.ones(self.n_states) * 0.01
        start_prob[0] = 0.3   # C major
        start_prob[12] = 0.2  # C minor
        self.model.startprob_ = start_prob / start_prob.sum()

    def extract_chroma_features(self, audio, sr=22050):
        """Extract chromagram features for chord recognition"""
        # Compute constant-Q chromagram
        chroma_cq = librosa.feature.chroma_cqt(y=audio, sr=sr, norm=2)

        # Add energy and spectral features
        rms = librosa.feature.rms(y=audio)
        spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)

        # Combine features
        features = np.vstack([
            chroma_cq,
            rms / rms.max(),
            spectral_centroid / sr
        ])

        return features.T

    def train(self, audio_files, chord_annotations):
        """
        Train HMM on annotated audio

        Parameters:
        - audio_files: List of audio file paths
        - chord_annotations: List of chord sequences (time-aligned)
        """
        all_features = []
        all_lengths = []

        for audio_file, annotations in zip(audio_files, chord_annotations):
            audio, sr = librosa.load(audio_file, sr=22050)
            features = self.extract_chroma_features(audio, sr)

            all_features.append(features)
            all_lengths.append(len(features))

        # Concatenate all sequences
        X = np.vstack(all_features)

        # Train HMM
        self.model.fit(X, lengths=all_lengths)

        return self

    def recognize_chords(self, audio, sr=22050):
        """Recognize chord sequence from audio"""
        features = self.extract_chroma_features(audio, sr)

        # Decode most likely state sequence
        states = self.model.predict(features)

        # Convert states to chord labels
        chords = [self.chord_labels[state] for state in states]

        # Get frame times
        hop_length = 512
        frame_times = librosa.frames_to_time(
            np.arange(len(chords)),
            sr=sr,
            hop_length=hop_length
        )

        return chords, frame_times, states

    def smooth_chord_sequence(self, chords, min_duration=0.1):
        """Post-process to remove very short chords"""
        smoothed = []
        current_chord = chords[0]
        chord_start = 0

        for i, chord in enumerate(chords[1:], 1):
            if chord != current_chord:
                duration = i - chord_start
                if duration >= min_duration * 22050 / 512:  # Convert to frames
                    smoothed.extend([current_chord] * (i - chord_start))
                else:
                    # Too short, extend previous chord
                    if smoothed:
                        smoothed.extend([smoothed[-1]] * (i - chord_start))
                    else:
                        smoothed.extend([chord] * (i - chord_start))
                current_chord = chord
                chord_start = i

        # Handle last segment
        smoothed.extend([current_chord] * (len(chords) - chord_start))

        return smoothed


# Beat tracking with HMM
class BeatTrackingHMM:
    def __init__(self, tempo_range=(60, 180), n_tempos=120):
        """
        HMM for beat tracking

        States represent beat positions within a bar
        """
        self.tempo_range = tempo_range
        self.n_tempos = n_tempos
        self.tempos = np.linspace(tempo_range[0], tempo_range[1], n_tempos)

    def build_transition_matrix(self, tempo, sr=22050, hop_length=512):
        """Build transition matrix for given tempo"""
        # Beat period in frames
        beat_period = 60.0 / tempo * sr / hop_length

        # Number of states (quantized beat positions)
        n_states = int(beat_period * 4)  # 4 beats per bar

        # Transition matrix
        A = np.zeros((n_states, n_states))

        for i in range(n_states):
            # Most likely: advance to next position
            next_pos = (i + 1) % n_states
            A[i, next_pos] = 0.9

            # Small probability of staying (rubato)
            A[i, i] = 0.05

            # Small probability of skipping (syncopation)
            skip_pos = (i + 2) % n_states
            A[i, skip_pos] = 0.05

        return A

    def compute_onset_strength(self, audio, sr=22050):
        """Compute onset strength envelope"""
        onset_env = librosa.onset.onset_strength(y=audio, sr=sr)

        # Normalize
        onset_env = onset_env / onset_env.max()

        return onset_env

    def track_beats(self, audio, sr=22050):
        """Track beats using dynamic programming"""
        onset_env = self.compute_onset_strength(audio, sr)

        # Find best tempo using autocorrelation
        tempo, beats = librosa.beat.beat_track(
            onset_envelope=onset_env,
            sr=sr,
            trim=False
        )

        # Refine with HMM
        A = self.build_transition_matrix(tempo, sr)
        n_states = A.shape[0]

        # Viterbi decoding for beat positions
        T = len(onset_env)
        delta = np.zeros((T, n_states))
        psi = np.zeros((T, n_states), dtype=int)

        # Initialization
        delta[0, :] = onset_env[0]

        # Recursion
        for t in range(1, T):
            for j in range(n_states):
                prob = delta[t-1, :] * A[:, j]
                psi[t, j] = np.argmax(prob)
                delta[t, j] = np.max(prob) * onset_env[t] if j == 0 else np.max(prob)

        # Backtracking
        states = np.zeros(T, dtype=int)
        states[-1] = np.argmax(delta[-1, :])

        for t in range(T-2, -1, -1):
            states[t] = psi[t+1, states[t+1]]

        # Extract beat times
        beat_frames = np.where(states == 0)[0]
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)

        return beat_times, tempo
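A brief usage sketch for the classes above (the audio path is a placeholder; chord decoding additionally requires training on annotated data, so only beat tracking is exercised here):

# Hypothetical usage; 'my_song.wav' is a placeholder path
import librosa

# A short excerpt keeps the pure-Python Viterbi loop fast
audio, sr = librosa.load('my_song.wav', sr=22050, duration=15)

beat_tracker = BeatTrackingHMM()
beat_times, tempo = beat_tracker.track_beats(audio, sr)
print("Estimated tempo (BPM):", tempo)
print("First beat times (s):", beat_times[:4])

chord_hmm = ChordRecognitionHMM(n_states=24, n_mix=4)
# chord_hmm.train(train_files, train_annotations)        # needs time-aligned chord annotations
# chords, times, states = chord_hmm.recognize_chords(audio, sr)  # after training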
Advanced Applications
Hierarchical HMMs model musical structure at multiple time scales:
- • Level 1: Note-level transitions
- • Level 2: Chord progressions
- • Level 3: Sections (verse, chorus)
Factorial HMMs model multiple independent processes simultaneously; for example, separate chains for harmony, rhythm, and melody (a product-state sketch follows below).
Input-output (conditional) HMMs condition transitions on input features, which is useful for score-following and alignment tasks.
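For the factorial case, one simple approximation is to fold two independent chains into a single HMM over the product state space; the joint transition matrix is then the Kronecker product of the individual matrices. A minimal sketch with toy values:

import numpy as np

# Toy transition matrices for two independent processes
A_harmony = np.array([[0.9, 0.1], [0.2, 0.8]])   # e.g. tonic/dominant
A_rhythm = np.array([[0.7, 0.3], [0.4, 0.6]])    # e.g. downbeat/offbeat

# Joint transition matrix over the product state space (4 states)
A_joint = np.kron(A_harmony, A_rhythm)
print(A_joint.shape)          # (4, 4)
print(A_joint.sum(axis=1))    # rows still sum to 1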
Practical Implementation Tips
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from hmmlearn import hmm
import madmom

class MusicAnalysisPipeline:
    def __init__(self):
        self.gmm_timbre = GaussianMixture(n_components=8)
        self.hmm_chord = hmm.GMMHMM(n_components=24, n_mix=4)
        self.hmm_beat = None  # Initialized per tempo

    def analyze_complete_track(self, audio_file):
        """Complete analysis pipeline for a music track"""
        # Load audio
        audio, sr = librosa.load(audio_file, sr=22050)

        results = {}

        # 1. Tempo and Beat Tracking
        tempo, beats = self.detect_tempo_beats(audio, sr)
        results['tempo'] = tempo
        results['beats'] = beats

        # 2. Chord Recognition
        chords, chord_times = self.recognize_chords(audio, sr)
        results['chords'] = list(zip(chord_times, chords))

        # 3. Structure Segmentation
        segments = self.segment_structure(audio, sr)
        results['structure'] = segments

        # 4. Timbre Analysis
        timbre_profile = self.analyze_timbre(audio, sr)
        results['timbre'] = timbre_profile

        # 5. Key Detection
        key = self.detect_key(audio, sr)
        results['key'] = key

        return results

    def detect_tempo_beats(self, audio, sr):
        """Tempo and beat detection using madmom"""
        # Use madmom's DBN beat tracker (HMM-based)
        proc = madmom.features.beats.DBNBeatTrackingProcessor(fps=100)
        act = madmom.features.beats.RNNBeatProcessor()(audio)
        beats = proc(act)

        # Estimate tempo from the median inter-beat interval
        tempo = 60 / np.median(np.diff(beats))

        return tempo, beats

    def recognize_chords(self, audio, sr):
        """Chord recognition with pre/post processing"""
        # Extract features
        chroma = librosa.feature.chroma_cqt(y=audio, sr=sr)

        # Smooth chroma
        chroma_smooth = librosa.decompose.nn_filter(
            chroma,
            aggregate=np.median,
            metric='cosine'
        )

        # Prepare for HMM
        features = chroma_smooth.T

        # Decode chords
        if hasattr(self.hmm_chord, 'transmat_'):
            states = self.hmm_chord.predict(features)
            chord_labels = self._state_to_chord(states)
        else:
            # Fallback to template matching
            chord_labels = self._template_matching(chroma_smooth)

        # Time stamps
        hop_length = 512
        times = librosa.frames_to_time(
            np.arange(len(chord_labels)),
            sr=sr,
            hop_length=hop_length
        )

        return chord_labels, times

    def _state_to_chord(self, states):
        """Map decoded HMM state indices to chord labels (12 major + 12 minor)."""
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        labels = [f"{n}:maj" for n in notes] + [f"{n}:min" for n in notes]
        return [labels[s] for s in states]

    def _template_matching(self, chroma):
        """Simple fallback: match each frame against binary major/minor triad templates."""
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        templates, labels = [], []
        for root in range(12):
            for quality, intervals in (('maj', (0, 4, 7)), ('min', (0, 3, 7))):
                t = np.zeros(12)
                t[[(root + i) % 12 for i in intervals]] = 1
                templates.append(t / np.linalg.norm(t))
                labels.append(f"{notes[root]}:{quality}")
        scores = np.array(templates) @ chroma  # (24 templates, n_frames)
        return [labels[i] for i in np.argmax(scores, axis=0)]

    def segment_structure(self, audio, sr):
        """Segment song structure using self-similarity and GMM clustering"""
        # Compute self-similarity matrix
        chroma = librosa.feature.chroma_cqt(y=audio, sr=sr)
        ssm = librosa.segment.recurrence_matrix(
            chroma,
            mode='affinity',
            metric='cosine'
        )

        # Enhance diagonal structure (useful for inspection/visualization)
        ssm_enhanced = librosa.segment.path_enhance(ssm, 15)

        # Find segment boundaries
        boundaries = librosa.segment.agglomerative(chroma, 10)

        # Label segments using GMM clustering
        segment_features = []
        labels = []

        for i in range(len(boundaries) - 1):
            start, end = boundaries[i], boundaries[i+1]
            segment_chroma = chroma[:, start:end]

            # Compute segment statistics
            features = np.concatenate([
                np.mean(segment_chroma, axis=1),
                np.std(segment_chroma, axis=1)
            ])
            segment_features.append(features)

        # Cluster segments
        if len(segment_features) > 3:
            gmm = GaussianMixture(n_components=min(4, len(segment_features)))
            labels = gmm.fit_predict(np.array(segment_features))
        else:
            labels = list(range(len(segment_features)))

        # Map to section labels
        section_names = ['Intro', 'Verse', 'Chorus', 'Bridge', 'Outro']
        segments = []
        for i, (start, end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
            time_start = librosa.frames_to_time(start, sr=sr)
            time_end = librosa.frames_to_time(end, sr=sr)
            label = section_names[labels[i] % len(section_names)]
            segments.append({
                'start': time_start,
                'end': time_end,
                'label': label
            })

        return segments

    def analyze_timbre(self, audio, sr):
        """Timbre analysis using GMM on spectral features"""
        # Extract multiple timbral features
        features = []

        # MFCCs
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        features.append(np.mean(mfcc, axis=1))
        features.append(np.std(mfcc, axis=1))

        # Spectral features
        cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
        features.append([np.mean(cent), np.std(cent)])

        rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
        features.append([np.mean(rolloff), np.std(rolloff)])

        zcr = librosa.feature.zero_crossing_rate(audio)
        features.append([np.mean(zcr), np.std(zcr)])

        # Flatten features
        feature_vector = np.concatenate(features)

        return {
            'brightness': float(np.mean(cent)),
            'roughness': float(np.std(zcr)),
            'warmth': float(np.mean(mfcc[1])),  # MFCC 1 correlates with warmth
            'features': feature_vector.tolist()
        }

    def detect_key(self, audio, sr):
        """Key detection using pitch class profiles"""
        # Compute chromagram
        chroma = librosa.feature.chroma_cqt(y=audio, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)

        # Krumhansl-Schmuckler key profiles
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
                                  2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
                                  2.54, 4.75, 3.98, 2.69, 3.34, 3.17])

        # Correlate with all keys
        correlations = {}
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F',
                 'F#', 'G', 'G#', 'A', 'A#', 'B']

        for shift in range(12):
            # Rotate chroma so that pitch class `shift` aligns with the profile's tonic
            rotated_chroma = np.roll(chroma_mean, -shift)

            # Correlate with profiles
            major_corr = np.corrcoef(rotated_chroma, major_profile)[0, 1]
            minor_corr = np.corrcoef(rotated_chroma, minor_profile)[0, 1]

            correlations[f"{notes[shift]} major"] = major_corr
            correlations[f"{notes[shift]} minor"] = minor_corr

        # Return most likely key
        detected_key = max(correlations, key=correlations.get)
        confidence = correlations[detected_key]

        return {
            'key': detected_key,
            'confidence': confidence,
            'all_correlations': correlations
        }


# Usage example
pipeline = MusicAnalysisPipeline()
results = pipeline.analyze_complete_track('song.mp3')

print(f"Tempo: {results['tempo']:.1f} BPM")
print(f"Key: {results['key']['key']} (confidence: {results['key']['confidence']:.2f})")
print(f"Structure: {results['structure']}")
print(f"First 10 chords: {results['chords'][:10]}")
Model Selection Guidelines
Use GMMs when:
- ✓ Modeling complex distributions
- ✓ Clustering acoustic features
- ✓ Speaker/instrument identification
- ✓ No temporal dependencies
- ✓ Need soft clustering
Use HMMs when:
- ✓ Sequential data with states
- ✓ Chord/key recognition
- ✓ Beat tracking
- ✓ Musical structure analysis
- ✓ Score following
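Within either family, the number of components or states is itself a model-selection choice. A minimal sketch that picks a GMM size by the Bayesian Information Criterion with scikit-learn; the feature matrix X here is random placeholder data standing in for MFCC frames:

import numpy as np
from sklearn.mixture import GaussianMixture

# X is assumed to be an (n_frames, n_features) matrix of MFCC frames
X = np.random.randn(500, 13)  # placeholder data for illustration

bic_scores = {}
for k in (2, 4, 8, 16):
    gmm = GaussianMixture(n_components=k, covariance_type='diag', random_state=0).fit(X)
    bic_scores[k] = gmm.bic(X)

best_k = min(bic_scores, key=bic_scores.get)  # lower BIC is better
print(f"Selected {best_k} components (BIC: {bic_scores[best_k]:.1f})")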
Key Takeaways
- GMMs model distributions: Excellent for capturing the statistical properties of timbre and spectral features without temporal constraints.
- HMMs capture sequences: Ideal for modeling how musical events evolve over time, from beats to large-scale structure.
- EM algorithm is key: Both models use Expectation-Maximization for parameter learning from data.
- Domain knowledge helps: Incorporating music theory constraints significantly improves model performance.