Code: Aligning amino acid sequences

10. Code: Aligning amino acid sequences#

Here we resuse the code from previous chapters. We have hidden this code to make you focus on the salient parts of the reasoning.

Hide code cell source
import numpy as np

# Print 2 sequences on top of each other
def print_alignment(seqA,seqB):
    print(seqA)
    print(seqB)

# Print a dynamic programming score matrix
# together with its sequences
def print_dynamic(seqA,seqB,dpm):
    seqA,seqB = "-" + seqA, "-" + seqB
    m,n = len(seqA),len(seqB)
    print('{:^5}'.format(" "), end=""),
    for j in range(n):
        print('{:^5}'.format(seqB[j]), end="")
    print()
    for i in range(m):
        print ('{:^5}'.format(seqA[i]), end="")
        for j in range(n):
            print ('{:5.1f}'.format(dpm[i,j]), end="")
        print()
    print()

# Format an alignment by inserting gaps in sequences
def format_alignment(seqA,seqB,S,trace):
    print("Best score: " + str(S[-1,-1]))
    outA,outB = "",""
    i,j = len(seqA),len(seqB)
    while i>0 or j>0:
        di,dj = trace[i,j]
        i += int(di)
        j += int(dj)
        if di == 0:
            outA = "-" + outA
        else:
            outA = seqA[i] + outA
        if dj == 0:
            outB = "-" + outB
        else:
            outB = seqB[j] + outB
    return outA,outB

# Initiating dynamic programming matrices, S and trace
def initiate_global_dp(m,n):
    S = np.zeros((m,n))       # An m*n matrix, initiated with 0's
    trace = np.zeros((m,n,2)) # An m*n matrix, initiated with (0,0)'s
    S[0,0] = 0.
    trace[0,0,:] = (0.,0.)
    for i in range(1,m):
        S[i,0] = i * gap_penalty()
        trace[i,0,:] =(-1,0)
    for j in range(1,n):
        S[0,j] = j * gap_penalty()
        trace[0,j,:] =(0,-1)
    return S,trace
def global_align(seqA,seqB,print_dynamic_matrix = False):
    # Initiating variables
    m, n = len(seqA)+1, len(seqB)+1
    S,trace = initiate_global_dp(m,n)
    # Fill in the rest of the dynamic programming matrix
    for i in range(1,m):
        for j in range(1,n):
            # Note the subtraction of 1 for the sequence position
            # In python sequences are indexed from 0 to len-1
            match = S[i-1,j-1] + match_score(seqA[i-1],seqB[j-1]) 
            delete = S[i-1,j] + match_score(seqA[i-1],'-') 
            insert = S[i,j-1] + match_score('-',seqB[j-1]) 
            S[i,j] = max(match,delete,insert)
            if match >= max(insert,delete):
                trace[i,j,:] = (-1,-1)
            elif delete >= insert:
                trace[i,j,:] = (-1,0)
            else:
                trace[i,j,:] = (0,-1)
    if print_dynamic_matrix:
        print_dynamic(seqA,seqB,S)
    return format_alignment(seqA,seqB,S,trace)

def format_local_alignment(seqA,seqB,S,trace):
    outA,outB = "",""
    i,j = np.unravel_index(S.argmax(),S.shape)
    print("Best score: " + str(S[i,j]))
    while min(trace[i,j])<0:
        di,dj = trace[i,j]
        i += int(di)
        j += int(dj)
        if di == 0:
            outA = "-" + outA
        else:
            outA = seqA[i] + outA
        if dj == 0:
            outB = "-" + outB
        else:
            outB = seqB[j] + outB
    return outA,outB

# Initiating dynamic programming matrices, S and trace
def initiate_local_dp(m,n):
    S = np.zeros((m,n))
    trace = np.zeros((m,n,2))
    S[0,0] = 0.
    trace[0,0,:] = (0.,0.)
    for i in range(1,m):
        S[i,0] = 0
        trace[i,0,:] =(0,0)
    for j in range(1,n):
        S[0,j] = 0
        trace[0,j,:] =(0,0)
    return S,trace
def local_align(seqA,seqB,print_dynamic_matrix = False):
    # Initiating variables
    m, n = len(seqA)+1, len(seqB)+1
    S,trace = initiate_local_dp(m,n)
    # Fill in the rest of the dynamic programming matrix
    for i in range(1,m):
        for j in range(1,n):
            match = S[i-1][j-1] + match_score(seqA[i-1],seqB[j-1])
            delete = S[i-1,j] + match_score(seqA[i-1],'-') 
            insert = S[i,j-1] + match_score('-',seqB[j-1]) 
            S[i,j] = max(match, delete, insert, 0.)
            if match >= max(delete,insert,0.):
                trace[i,j,:] = (-1,-1.)
            elif delete >= max(insert,0.):
                trace[i,j,:] = (-1.,0.)
            elif insert >= 0.:
                trace[i,j,:] = (0.,-1.)
            else:
                trace[i,j,:] = (0.,0.)
    if print_dynamic_matrix:
        print_dynamic(seqA,seqB,S)
    return format_local_alignment(seqA,seqB,S,trace)

def format_semiglobal_alignment(seqA,seqB,S,trace):
    outA,outB = "",""
    m,n = S.shape[0]-1, S.shape[1]-1
    i,j = S[:,n].argmax(), S[m,:].argmax()
    if S[i,n] > S[m,j]:
        print("Best score: " + str(S[i,n]))
        j = n
        # point the alignmnént from (m,n) to here
        for ix in range(i+1,m+1):
            print(ix,n)
            trace[ix,n,:] = (-1,0)
    else:
        print("Best score: " + str(S[m,j]))
        i = m    
        # point the alignmnént from (m,n) to here
        for ix in range(j+1,n+1):
            print(m,ix)
            trace[m,ix,:] = (0,-1)
    i,j = m,n
    while min(trace[i,j])<0:
        di,dj = trace[i,j]
        i += int(di)
        j += int(dj)
        if di == 0:
            outA = "-" + outA
        else:
            outA = seqA[i] + outA
        if dj == 0:
            outB = "-" + outB
        else:
            outB = seqB[j] + outB
    return outA,outB

# Initiating dynamic programming matrices, S and trace
def initiate_semiglobal_dp(m,n):
    S = np.zeros((m,n))
    trace = np.zeros((m,n,2))
    S[0,0] = 0.
    trace[0,0,:] = (0.,0.)
    for i in range(1,m):
        S[i,0] = 0
        trace[i,0,:] =(-1,0)
    for j in range(1,n):
        S[0,j] = 0
        trace[0,j,:] =(0,-1)
    return S,trace
def semiglobal_align(seqA,seqB,print_dynamic_matrix = False):
    # Initiating variables
    m, n = len(seqA)+1, len(seqB)+1
    S,trace = initiate_semiglobal_dp(m,n)
    # Fill in the rest of the dynamic programming matrix
    for i in range(1,m):
        for j in range(1,n):
            match = S[i-1][j-1] + match_score(seqA[i-1],seqB[j-1])
            delete = S[i-1,j] + match_score(seqA[i-1],'-') 
            insert = S[i,j-1] + match_score('-',seqB[j-1]) 
            S[i,j] = max(match, delete, insert)
            if match >= max(delete,insert):
                trace[i,j,:] = (-1,-1.)
            elif delete >= insert:
                trace[i,j,:] = (-1.,0.)
            else:
                trace[i,j,:] = (0.,-1.)
    if print_dynamic_matrix:
        print_dynamic(seqA,seqB,S)
    return format_semiglobal_alignment(seqA,seqB,S,trace)

10.1. Protein sequences#

We can use the exact same alignment methods for protein sequences, as long as we define an appropriate scoring function. Here we will use a PAM250 matrix for the matches, and a gap penalty of 10.

PAM250 = {
'A': {'A': 2,  'C': -2, 'D':  0, 'E': 0, 'F': -3, 'G':  1, 'H': -1, 'I': -1, 'K': -1, 'L': -2, 'M': -1, 'N':  0, 'P':  1, 'Q':  0, 'R': -2, 'S':  1, 'T':  1, 'V':  0, 'W': -6, 'Y': -3},
'C': {'A': -2, 'C': 12, 'D': -5, 'E':-5, 'F': -4, 'G': -3, 'H': -3, 'I': -2, 'K': -5, 'L': -6, 'M': -5, 'N': -4, 'P': -3, 'Q': -5, 'R': -4, 'S':  0, 'T': -2, 'V': -2, 'W': -8, 'Y':  0},
'D': {'A': 0,  'C': -5, 'D':  4, 'E': 3, 'F': -6, 'G':  1, 'H':  1, 'I': -2, 'K':  0, 'L': -4, 'M': -3, 'N':  2, 'P': -1, 'Q':  2, 'R': -1, 'S':  0, 'T':  0, 'V': -2, 'W': -7, 'Y': -4},
'E': {'A': 0,  'C': -5, 'D':  3, 'E': 4, 'F': -5, 'G':  0, 'H':  1, 'I': -2, 'K':  0, 'L': -3, 'M': -2, 'N':  1, 'P': -1, 'Q':  2, 'R': -1, 'S':  0, 'T':  0, 'V': -2, 'W': -7, 'Y': -4},
'F': {'A': -3, 'C': -4, 'D': -6, 'E':-5, 'F':  9, 'G': -5, 'H': -2, 'I':  1, 'K': -5, 'L':  2, 'M':  0, 'N': -3, 'P': -5, 'Q': -5, 'R': -4, 'S': -3, 'T': -3, 'V': -1, 'W':  0, 'Y':  7},
'G': {'A': 1,  'C': -3, 'D':  1, 'E': 0, 'F': -5, 'G':  5, 'H': -2, 'I': -3, 'K': -2, 'L': -4, 'M': -3, 'N':  0, 'P':  0, 'Q': -1, 'R': -3, 'S':  1, 'T':  0, 'V': -1, 'W': -7, 'Y': -5},
'H': {'A': -1, 'C': -3, 'D':  1, 'E': 1, 'F': -2, 'G': -2, 'H':  6, 'I': -2, 'K':  0, 'L': -2, 'M': -2, 'N':  2, 'P':  0, 'Q':  3, 'R':  2, 'S': -1, 'T': -1, 'V': -2, 'W': -3, 'Y':  0},
'I': {'A': -1, 'C': -2, 'D': -2, 'E':-2, 'F':  1, 'G': -3, 'H': -2, 'I':  5, 'K': -2, 'L':  2, 'M':  2, 'N': -2, 'P': -2, 'Q': -2, 'R': -2, 'S': -1, 'T':  0, 'V':  4, 'W': -5, 'Y': -1},
'K': {'A': -1, 'C': -5, 'D':  0, 'E': 0, 'F': -5, 'G': -2, 'H':  0, 'I': -2, 'K':  5, 'L': -3, 'M':  0, 'N':  1, 'P': -1, 'Q':  1, 'R':  3, 'S':  0, 'T':  0, 'V': -2, 'W': -3, 'Y': -4},
'L': {'A': -2, 'C': -6, 'D': -4, 'E':-3, 'F':  2, 'G': -4, 'H': -2, 'I':  2, 'K': -3, 'L':  6, 'M':  4, 'N': -3, 'P': -3, 'Q': -2, 'R': -3, 'S': -3, 'T': -2, 'V':  2, 'W': -2, 'Y': -1},
'M': {'A': -1, 'C': -5, 'D': -3, 'E':-2, 'F':  0, 'G': -3, 'H': -2, 'I':  2, 'K':  0, 'L':  4, 'M':  6, 'N': -2, 'P': -2, 'Q': -1, 'R':  0, 'S': -2, 'T': -1, 'V':  2, 'W': -4, 'Y': -2},
'N': {'A': 0,  'C': -4, 'D':  2, 'E': 1, 'F': -3, 'G':  0, 'H':  2, 'I': -2, 'K':  1, 'L': -3, 'M': -2, 'N':  2, 'P':  0, 'Q':  1, 'R':  0, 'S':  1, 'T':  0, 'V': -2, 'W': -4, 'Y': -2},
'P': {'A': 1,  'C': -3, 'D': -1, 'E':-1, 'F': -5, 'G':  0, 'H':  0, 'I': -2, 'K': -1, 'L': -3, 'M': -2, 'N':  0, 'P':  6, 'Q':  0, 'R':  0, 'S':  1, 'T':  0, 'V': -1, 'W': -6, 'Y': -5},
'Q': {'A': 0,  'C': -5, 'D':  2, 'E': 2, 'F': -5, 'G': -1, 'H':  3, 'I': -2, 'K':  1, 'L': -2, 'M': -1, 'N':  1, 'P':  0, 'Q':  4, 'R':  1, 'S': -1, 'T': -1, 'V': -2, 'W': -5, 'Y': -4},
'R': {'A': -2, 'C': -4, 'D': -1, 'E':-1, 'F': -4, 'G': -3, 'H':  2, 'I': -2, 'K':  3, 'L': -3, 'M':  0, 'N':  0, 'P':  0, 'Q':  1, 'R':  6, 'S':  0, 'T': -1, 'V': -2, 'W':  2, 'Y': -4},
'S': {'A': 1,  'C':  0, 'D':  0, 'E': 0, 'F': -3, 'G':  1, 'H': -1, 'I': -1, 'K':  0, 'L': -3, 'M': -2, 'N':  1, 'P':  1, 'Q': -1, 'R':  0, 'S':  2, 'T':  1, 'V': -1, 'W': -2, 'Y': -3},
'T': {'A': 1,  'C': -2, 'D':  0, 'E': 0, 'F': -3, 'G':  0, 'H': -1, 'I':  0, 'K':  0, 'L': -2, 'M': -1, 'N':  0, 'P':  0, 'Q': -1, 'R': -1, 'S':  1, 'T':  3, 'V':  0, 'W': -5, 'Y': -3},
'V': {'A': 0,  'C': -2, 'D': -2, 'E':-2, 'F': -1, 'G': -1, 'H': -2, 'I':  4, 'K': -2, 'L':  2, 'M':  2, 'N': -2, 'P': -1, 'Q': -2, 'R': -2, 'S': -1, 'T':  0, 'V':  4, 'W': -6, 'Y': -2},
'W': {'A': -6, 'C': -8, 'D': -7, 'E':-7, 'F':  0, 'G': -7, 'H': -3, 'I': -5, 'K': -3, 'L': -2, 'M': -4, 'N': -4, 'P': -6, 'Q': -5, 'R':  2, 'S': -2, 'T': -5, 'V': -6, 'W': 17, 'Y':  0},
'Y': {'A': -3, 'C':  0, 'D': -4, 'E':-4, 'F':  7, 'G': -5, 'H':  0, 'I': -1, 'K': -4, 'L': -1, 'M': -2, 'N': -2, 'P': -5, 'Q': -4, 'R': -4, 'S': -3, 'T': -3, 'V': -2, 'W':  0, 'Y': 10}}

def gap_penalty():
    return -10.0

def match_score(letterA,letterB):
    if letterA == '-' or letterB == '-':
        return gap_penalty()
    else:
        return PAM250[letterA][letterB]
seqA,seqB = global_align("IAMAPEPTIDE","IAMPEPPED",True)
print_alignment(seqA,seqB)
       -    I    A    M    P    E    P    P    E    D  
  -    0.0-10.0-20.0-30.0-40.0-50.0-60.0-70.0-80.0-90.0
  I  -10.0  5.0 -5.0-15.0-25.0-35.0-45.0-55.0-65.0-75.0
  A  -20.0 -5.0  7.0 -3.0-13.0-23.0-33.0-43.0-53.0-63.0
  M  -30.0-15.0 -3.0 13.0  3.0 -7.0-17.0-27.0-37.0-47.0
  A  -40.0-25.0-13.0  3.0 14.0  4.0 -6.0-16.0-26.0-36.0
  P  -50.0-35.0-23.0 -7.0  9.0 13.0 10.0  0.0-10.0-20.0
  E  -60.0-45.0-33.0-17.0 -1.0 13.0 12.0  9.0  4.0 -6.0
  P  -70.0-55.0-43.0-27.0-11.0  3.0 19.0 18.0  8.0  3.0
  T  -80.0-65.0-53.0-37.0-21.0 -7.0  9.0 19.0 18.0  8.0
  I  -90.0-75.0-63.0-47.0-31.0-17.0 -1.0  9.0 17.0 16.0
  D  -100.0-85.0-73.0-57.0-41.0-27.0-11.0 -1.0 12.0 21.0
  E  -110.0-95.0-83.0-67.0-51.0-37.0-21.0-11.0  3.0 15.0

Best score: 15.0
IAMAPEPTIDE
IAM-PEPP-ED
seqA,seqB = global_align("MYPERFECTDAY","THERESAMAY",False)
print_alignment(seqA,seqB)
Best score: 3.0
MYPERFECTDAY
T-HER-ESAMAY
seqA,seqB = local_align("MYPERFECTDAY","THERESAMAY",False)
print_alignment(seqA,seqB)
Best score: 14.0
PERFECTDAY
HER-ESAMAY