14. Code describing the Viterbi algorithm#
We start with defining HMM as a list of states, each state defined as a dictionary. For keeping the example simple we model alignments of DNA sequences i.e. the model emits sequences from an alfabeth of 4 letters, rather than the 20 letters of amino acid sequences. We designed the HMM so that the match states favors a sequence “ACGT”. We also include a dictionary giving realtive positions to the previous state of each type of state (M, I, D, etc.)
import numpy as np
profile_hmm = [
{'type': 'S', 'emission': {}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}}, # Start State
{'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}}, # Insert State 1
{'type': 'D', 'emission': {}, 'transition': {'M': 0.9, 'D': 0.1}}, # Delete State 1
{'type': 'M', 'emission': {'A': 0.6, 'C': 0.1, 'G': 0.2, 'T': 0.1}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}}, # Match State 1
{'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}}, # Insert State 2
{'type': 'D', 'emission': {}, 'transition': {'M': 0.9, 'D': 0.1}}, # Delete State 2
{'type': 'M', 'emission': {'A': 0.2, 'C': 0.6, 'G': 0.1, 'T': 0.1}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}}, # Match State 2
{'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}}, # Insert State 3
{'type': 'D', 'emission': {}, 'transition': {'M': 0.9, 'D': 0.1}}, # Delete State 3
{'type': 'M', 'emission': {'A': 0.1, 'C': 0.2, 'G': 0.5, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}}, # Match State 3
{'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'D': 0.1}}, # Insert State 4
{'type': 'D', 'emission': {}, 'transition': {'E': 1.0}}, # Delete State 4
{'type': 'M', 'emission': {'A': 0.2, 'C': 0.2, 'G': 0.2, 'T': 0.4}, 'transition': {'E': 1.0}}, # Match State 4
{'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'E': 1.0}}, # Insert State 5
{'type': 'E', 'emission': {}, 'transition': {}}, # End State
]
prev_rel_states = { # Relative position to previous state of each type
'S': {},
'M': {'S': -3, 'M':-3, 'I':-2, 'D':-4},
'I': {'S': -1, 'M':-1, 'I':0 },
'D': {'S': -2, 'M':-2, 'D':-3},
'E': { 'M':-2, 'I':-1, 'D':-3},
}
Then we define the dynamic programming algorithm to compute the Viterbi matrix, and backtracking the optimal path (the Viterbi path) through the model.
def viterbi(profile_hmm, sequence):
num_states = len(profile_hmm)
num_bases = len(sequence)
# Initialize the Viterbi and path matrices
viterbi_matrix = np.zeros((num_states, num_bases+2))
viterbi_path = np.zeros((num_states, num_bases+2), dtype=int)
# Initialize the first column of the Viterbi matrix
viterbi_matrix[0, 0] = 1.0
# Fill the Viterbi matrix
for base_idx in range(1, num_bases+2):
for state in range(num_states):
transition_probs = {}
current_type = profile_hmm[state]['type'] # Is this a 'M', I', or 'D' state?
isCurrentSilent = not profile_hmm[state]['emission']
# Get the previous states that can transition to the current state
prev_abs_states = { t : state + rel for t, rel in prev_rel_states[current_type].items() if (state + rel >= 0) and (t == profile_hmm[state+rel]['type']) and (current_type in profile_hmm[state+rel]['transition'])}
# Get the previous base index, it is different for silent states (S, E and D)
prev_abs_base = base_idx if (isCurrentSilent) else base_idx -1
for prev_type, prev_abs_state in prev_abs_states.items():
transition_prob = profile_hmm[prev_abs_state]['transition'][current_type]
prev_score = viterbi_matrix[prev_abs_state, prev_abs_base]
transition_probs[prev_abs_state] = transition_prob * prev_score
if transition_probs: # Check if the list is not empty
max_prev_state = max(transition_probs, key=transition_probs.get)
max_transition_prob = transition_probs[max_prev_state]
# print(max_prev_state, max_transition_prob)
if profile_hmm[state]['emission'] and base_idx <= num_bases:
emission_prob = profile_hmm[state]['emission'].get(sequence[base_idx-1], 0)
else:
emission_prob = 1.0
viterbi_matrix[state, base_idx] = max_transition_prob * emission_prob
viterbi_path[state, base_idx] = max_prev_state
# Trace back to find the most probable path
state = num_states - 1
base_idx = num_bases + 1
letter='-'
best_path = []
while base_idx>=1 and state>0:
best_path.append((state, profile_hmm[state]['type'], letter ))
state = viterbi_path[state, base_idx]
isSilent = not profile_hmm[state]['emission']
if isSilent:
letter = '-'
else:
base_idx -= 1
letter = sequence[base_idx-1]
best_path.append((state, profile_hmm[state]['type'], letter ))
best_path.reverse()
return best_path
Lets first try with a “ACGT” sequence.
decoded_path = viterbi(profile_hmm, 'ACGT')
print("Decoded path:")
for state, type, letter in decoded_path:
print(f"State {state} of type {type} emitted {letter}")
Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 6 of type M emitted C
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -
We could try a sequence that is longer than the model.
decoded_path = viterbi(profile_hmm, 'ACAAGT')
print("Decoded path:")
for state, type, letter in decoded_path:
print(f"State {state} of type {type} emitted {letter}")
Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 6 of type M emitted C
State 7 of type I emitted A
State 7 of type I emitted A
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -
And a shorter sequence.
decoded_path = viterbi(profile_hmm, 'AGT')
print("Decoded path:")
for state, type, letter in decoded_path:
print(f"State {state} of type {type} emitted {letter}")
Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 5 of type D emitted -
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -