Code describing the Viterbi algorithm

14. Code describing the Viterbi algorithm#

We start with defining HMM as a list of states, each state defined as a dictionary. For keeping the example simple we model alignments of DNA sequences i.e. the model emits sequences from an alfabeth of 4 letters, rather than the 20 letters of amino acid sequences. We designed the HMM so that the match states favors a sequence “ACGT”. We also include a dictionary giving realtive positions to the previous state of each type of state (M, I, D, etc.)

import numpy as np

profile_hmm = [
    {'type': 'S', 'emission': {},                                       'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}},    # Start State
    {'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}},                # Insert State 1
    {'type': 'D', 'emission': {},                                       'transition': {'M': 0.9, 'D': 0.1}},                # Delete State 1
    {'type': 'M', 'emission': {'A': 0.6, 'C': 0.1, 'G': 0.2, 'T': 0.1}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}},    # Match State 1
    {'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}},                # Insert State 2
    {'type': 'D', 'emission': {},                                       'transition': {'M': 0.9, 'D': 0.1}},                # Delete State 2
    {'type': 'M', 'emission': {'A': 0.2, 'C': 0.6, 'G': 0.1, 'T': 0.1}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}},    # Match State 2
    {'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.1}},                # Insert State 3
    {'type': 'D', 'emission': {},                                       'transition': {'M': 0.9, 'D': 0.1}},                # Delete State 3
    {'type': 'M', 'emission': {'A': 0.1, 'C': 0.2, 'G': 0.5, 'T': 0.2}, 'transition': {'M': 0.9, 'I': 0.05, 'D': 0.05}},    # Match State 3
    {'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'M': 0.9, 'D': 0.1}},                # Insert State 4
    {'type': 'D', 'emission': {},                                       'transition': {'E': 1.0}},                          # Delete State 4
    {'type': 'M', 'emission': {'A': 0.2, 'C': 0.2, 'G': 0.2, 'T': 0.4}, 'transition': {'E': 1.0}},                          # Match State 4
    {'type': 'I', 'emission': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2}, 'transition': {'E': 1.0}},                          # Insert State 5
    {'type': 'E', 'emission': {},                                       'transition': {}},                                  # End State 
]

prev_rel_states = {  # Relative position to previous state of each type
    'S': {},
    'M': {'S': -3, 'M':-3, 'I':-2, 'D':-4},
    'I': {'S': -1, 'M':-1, 'I':0         },
    'D': {'S': -2, 'M':-2,         'D':-3},
    'E': {         'M':-2, 'I':-1, 'D':-3},
}

Then we define the dynamic programming algorithm to compute the Viterbi matrix, and backtracking the optimal path (the Viterbi path) through the model.

def viterbi(profile_hmm, sequence):
    num_states = len(profile_hmm)
    num_bases = len(sequence)

    # Initialize the Viterbi and path matrices
    viterbi_matrix = np.zeros((num_states, num_bases+2))
    viterbi_path = np.zeros((num_states, num_bases+2), dtype=int)

    # Initialize the first column of the Viterbi matrix
    viterbi_matrix[0, 0] = 1.0

    # Fill the Viterbi matrix
    for base_idx in range(1, num_bases+2):
        for state in range(num_states):
            transition_probs = {}
            current_type = profile_hmm[state]['type']  # Is this a 'M', I', or 'D' state?
            isCurrentSilent = not profile_hmm[state]['emission']
            # Get the previous states that can transition to the current state
            prev_abs_states = { t : state + rel for t, rel in prev_rel_states[current_type].items() if (state + rel >= 0) and (t == profile_hmm[state+rel]['type']) and (current_type in profile_hmm[state+rel]['transition'])}
            # Get the previous base index, it is different for silent states (S, E and D)
            prev_abs_base = base_idx if (isCurrentSilent) else base_idx -1  
            for prev_type, prev_abs_state in prev_abs_states.items():
                transition_prob = profile_hmm[prev_abs_state]['transition'][current_type]
                prev_score = viterbi_matrix[prev_abs_state, prev_abs_base]
                transition_probs[prev_abs_state] = transition_prob * prev_score
            if transition_probs:  # Check if the list is not empty
                max_prev_state = max(transition_probs, key=transition_probs.get)
                max_transition_prob = transition_probs[max_prev_state]
                # print(max_prev_state, max_transition_prob)
                if profile_hmm[state]['emission'] and base_idx <= num_bases:
                    emission_prob = profile_hmm[state]['emission'].get(sequence[base_idx-1], 0)
                else:
                    emission_prob = 1.0
                viterbi_matrix[state, base_idx] = max_transition_prob * emission_prob
                viterbi_path[state, base_idx] = max_prev_state
    # Trace back to find the most probable path
    state = num_states - 1
    base_idx = num_bases + 1
    letter='-'
    best_path = []
    while base_idx>=1 and state>0:
        best_path.append((state, profile_hmm[state]['type'], letter ))
        state = viterbi_path[state, base_idx]
        isSilent = not profile_hmm[state]['emission']
        if isSilent:
            letter = '-'
        else:
            base_idx -= 1
            letter = sequence[base_idx-1]
    best_path.append((state, profile_hmm[state]['type'], letter ))
    best_path.reverse()
    return best_path

Lets first try with a “ACGT” sequence.

decoded_path = viterbi(profile_hmm, 'ACGT')
print("Decoded path:")
for state, type, letter in decoded_path:
    print(f"State {state} of type {type} emitted {letter}")

Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 6 of type M emitted C
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -

We could try a sequence that is longer than the model.

decoded_path = viterbi(profile_hmm, 'ACAAGT')
print("Decoded path:")
for state, type, letter in decoded_path:
    print(f"State {state} of type {type} emitted {letter}")

Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 6 of type M emitted C
State 7 of type I emitted A
State 7 of type I emitted A
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -

And a shorter sequence.

decoded_path = viterbi(profile_hmm, 'AGT')
print("Decoded path:")
for state, type, letter in decoded_path:
    print(f"State {state} of type {type} emitted {letter}")

Decoded path:
State 0 of type S emitted -
State 3 of type M emitted A
State 5 of type D emitted -
State 9 of type M emitted G
State 12 of type M emitted T
State 14 of type E emitted -