Tutorial 3: Analysing alignment(s)

[1]:

import requests
import re
import logomaker as lmaker
import numpy as np
import matplotlib.pyplot as plt

Functions to download, read and transform alignment files

[2]:

def download_alignment_files(uniprot_id):
    url="https://pycom.brunel.ac.uk/alignments/"
    file_name=uniprot_id+".aln"

    web_url=url+file_name

    try:
        response = requests.get(web_url, stream=True)

        # Check if the request was successful (HTTP status code 200)
        response.raise_for_status()
        if response.status_code == 200:
        # Open the file in binary write mode and save the content
            with open(file_name, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            print(f"File downloaded successfully and saved as '{file_name}'.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")

def convert_to_fasta(aln_file_in,fasta_file_out):
    file_data = open(aln_file_in,'r')
    count=0
    out=""
    for line in file_data:
        out+=">SEQ_%d\n"%(count)
        out+="%s"%(line)
        count+=1
    f=open(fasta_file_out,'w')
    f.write(out)
    f.close()

def read_alignment_file(file_path):
    with open(file_path, "r") as file:
        content = file.read()
        sequences = re.findall(r'[A-Z\-]+', content)
    return sequences

Download the alignment file

[3]:

uniprot_id="P0C5H4"
# download the alignment file from PyCoM (pycom.brunel.ac.uk) server
download_alignment_files(uniprot_id)

File downloaded successfully and saved as 'P0C5H4.aln'.

Read the alignment file

[4]:

# get the sequences from the alignment file
alignment_file_path = uniprot_id+".aln"
sequences = read_alignment_file(alignment_file_path)

Convert the sequence to FASTA format

[5]:

# Generate sequence alignment compatible with https://alignmentviewer.org/ format
# you can upload the fasta file to https://alignmentviewer.org/ for analysis of the alignment
convert_to_fasta("P0C5H4.aln","P0C5H4.fasta")

[6]:

# number of sequences in the alignment file
len(sequences)

[6]:

Transfrom sequence list to frequency matrix

[14]:

# convert the sequence list to a pandas dataframe with count's of each aminoacid at each position
df_sequence_count_matrix=lmaker.alignment_to_matrix(sequences)

plt.axis('off')
plt.imshow(df_sequence_count_matrix.T, aspect='auto')

df_sequence_count_matrix

[14]:

	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y
pos
0	0	0	0	0	0	0	0	2	0	19	0	0	0	0	2	0	0	0	0	0
1	0	0	0	4	0	0	0	1	17	0	0	0	0	1	0	0	0	0	0	0
2	0	23	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	1	0	0	0	2	0	2	0	0	17	0	0	0	0	0	0	0	1
4	0	0	0	0	0	0	0	0	12	0	0	0	0	3	2	1	5	0	0	0
5	0	0	0	0	0	0	0	1	0	16	0	0	4	1	0	0	0	1	0	0
6	0	0	0	0	1	1	0	7	0	5	0	0	0	0	0	0	0	9	0	0
7	0	0	1	0	0	0	0	0	1	0	0	0	20	0	1	0	0	0	0	0
8	0	0	0	0	6	0	0	5	0	6	0	0	5	0	0	0	0	0	1	0
9	6	0	0	0	8	0	1	3	0	3	0	0	0	0	0	0	0	2	0	0
10	0	0	0	0	0	0	3	0	1	0	0	0	0	0	0	5	1	0	5	8
11	0	1	0	0	0	0	0	1	20	0	1	0	0	0	0	0	0	0	0	0
12	0	0	0	0	0	0	0	0	1	0	0	0	1	0	0	0	21	0	0	0
13	0	22	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0
14	0	0	0	0	0	0	0	0	1	0	0	0	22	0	0	0	0	0	0	0
15	5	0	0	15	0	0	0	0	1	0	0	0	1	0	0	0	0	1	0	0
16	0	0	0	0	0	22	0	0	0	0	0	0	0	0	0	0	0	1	0	0
17	0	0	0	0	0	0	0	0	17	0	0	2	1	3	0	0	0	0	0	0
18	0	0	1	0	1	0	0	0	0	0	0	21	0	0	0	0	0	0	0	0
19	0	0	0	0	0	0	0	1	0	22	0	0	0	0	0	0	0	0	0	0

../_images/tutorials_03_Alignment_analysis_12_1.png

Plot the Logo images

[8]:

# Based on tutorial from logo https://logomaker.readthedocs.io/en/latest/examples.html#ww-domain-information-logo

seq_logo=lmaker.Logo(df_sequence_count_matrix,
            color_scheme="chemistry",
            font_name='Arial Rounded MT Bold')
seq_logo.style_xticks(anchor=0, spacing=2, rotation=45)
#seq_logo.highlight_position(p=4, color='gold', alpha=.5)
#seq_logo.highlight_position(p=26, color='gold', alpha=.5)

# style using Axes methods
seq_logo.ax.set_ylabel('Count')
seq_logo.ax.set_xlim([-1, len(df_sequence_count_matrix)])

[8]:

(-1.0, 20.0)

Matrix transformation examples

[9]:

# convert frequency matrix to probability matrix
df_probability_matrix=lmaker.transform_matrix(df_sequence_count_matrix,normalize_values=True)
# convert probability matrix to information matrix
df_information_matrix=lmaker.transform_matrix(df_probability_matrix,from_type="probability",to_type="information",normalize_values=False)

Some more Logo images

[10]:

df_information_matrix

[10]:

	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y
pos
0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.302734	0.000000	2.875969	0.000000	0.000000	0.000000	0.000000	0.302734	0.000000	0.000000	0.000000	0.000000	0.000000
1	0.000000	0.000000	0.000000	0.550845	0.000000	0.000000	0.000000	0.137711	2.341092	0.000000	0.000000	0.000000	0.000000	0.137711	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2	0.000000	4.321928	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
3	0.000000	0.000000	0.130150	0.000000	0.000000	0.000000	0.260300	0.000000	0.260300	0.000000	0.000000	2.212548	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.130150
4	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.287255	0.000000	0.000000	0.000000	0.000000	0.321814	0.214543	0.107271	0.536356	0.000000	0.000000	0.000000
5	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.127339	0.000000	2.037427	0.000000	0.000000	0.509357	0.127339	0.000000	0.000000	0.000000	0.127339	0.000000	0.000000
6	0.000000	0.000000	0.000000	0.000000	0.104259	0.104259	0.000000	0.729811	0.000000	0.521294	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.938329	0.000000	0.000000
7	0.000000	0.000000	0.154633	0.000000	0.000000	0.000000	0.000000	0.000000	0.154633	0.000000	0.000000	0.000000	3.092665	0.000000	0.154633	0.000000	0.000000	0.000000	0.000000	0.000000
8	0.000000	0.000000	0.000000	0.000000	0.562585	0.000000	0.000000	0.468821	0.000000	0.562585	0.000000	0.000000	0.468821	0.000000	0.000000	0.000000	0.000000	0.000000	0.093764	0.000000
9	0.526072	0.000000	0.000000	0.000000	0.701429	0.000000	0.087679	0.263036	0.000000	0.263036	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.175357	0.000000	0.000000
10	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.268449	0.000000	0.089483	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.447416	0.089483	0.000000	0.447416	0.715865
11	0.000000	0.154633	0.000000	0.000000	0.000000	0.000000	0.000000	0.154633	3.092665	0.000000	0.154633	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
12	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.165598	0.000000	0.000000	0.000000	0.165598	0.000000	0.000000	0.000000	3.477548	0.000000	0.000000	0.000000
13	0.000000	3.887218	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.176692	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
14	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.176692	0.000000	0.000000	0.000000	3.887218	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
15	0.619805	0.000000	0.000000	1.859416	0.000000	0.000000	0.000000	0.000000	0.123961	0.000000	0.000000	0.000000	0.123961	0.000000	0.000000	0.000000	0.000000	0.123961	0.000000	0.000000
16	0.000000	0.000000	0.000000	0.000000	0.000000	3.887218	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.176692	0.000000	0.000000
17	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.301079	0.000000	0.000000	0.270715	0.135358	0.406073	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
18	0.000000	0.000000	0.165598	0.000000	0.165598	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.477548	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
19	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.176692	0.000000	3.887218	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000

[11]:

seq_logo=lmaker.Logo(df_information_matrix,vpad=0.1,width=0.8,figsize=(21,4),color_scheme="chemistry",font_name='Arial Rounded MT Bold')
seq_logo.style_xticks(anchor=0, spacing=2, rotation=45)
#seq_logo.highlight_position(p=4, color='gold', alpha=.5)
#seq_logo.highlight_position(p=26, color='gold', alpha=.5)

# style using Axes methods
seq_logo.ax.set_ylabel('Information (bits)')
seq_logo.ax.set_xticks(np.arange(0,len(df_sequence_count_matrix)+4,2))
seq_logo.ax.set_xlim([-1, len(df_sequence_count_matrix)])

[11]:

(-1.0, 20.0)