{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tutorial 4: Generate images from the paper" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# importing all usefull classes from PyCoM\n", "from pycom import PyCom, ProteinParams,CoMAnalysis\n", "import pandas as pd\n", "import numpy as np\n", "# matplotlib\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "matplotlib.rcParams['pdf.fonttype'] = 42\n", "matplotlib.rcParams['font.family'] = \"sans-serif\"\n", "matplotlib.rcParams['font.sans-serif'] = \"Arial\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#set the path to the database\n", "database_folder_path=\"/Volumes/mason/Work/Sarath/Research/pycom/\"\n", "file_matrix_db = database_folder_path+\"pycom.mat\"\n", "file_protein_db= database_folder_path+\"pycom.db\"\n", "my_color=\"#6495ED\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "obj_pycom = PyCom(db_path=file_protein_db, mat_path=file_matrix_db)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "obj_pycom = PyCom(remote=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Construct your query (its empty as I want all information)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Here we are asking for all the proteins that match the enzyme class 3 and have been associated with the disease cancer.\n", "query_parameters={}\n", "# executing the query returns a pandas dataframe with information about all the proteins which match the query" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "Finding out dimensions of the dataframe:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"/Users/sdantu/Work/pyc_wspace/pycom/pycom/pycom/interface/_find_helper.py:19: UserWarning: No constraints were passed to find(). This will return all proteins in the database.\n", " warn('No constraints were passed to find(). This will return all proteins in the database.')\n" ] } ], "source": [ "entries_data_frame=obj_pycom.find(query_parameters)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
neffsequence_lengthhelix_fracturn_fracstrand_frachas_ptmhas_pdbhas_substrate
count457622.000000457622.000000457622.000000457622.000000457622.000000457622.000000457622.000000457622.000000
mean8.397407251.2787340.0139260.0012620.0091620.0649230.0502380.427045
std2.498266124.6276420.0760690.0083480.0521920.2463890.2184360.494649
min1.0000005.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%6.928000147.0000000.0000000.0000000.0000000.0000000.0000000.000000
50%8.621000243.0000000.0000000.0000000.0000000.0000000.0000000.000000
75%10.176000351.0000000.0000000.0000000.0000000.0000000.0000001.000000
max17.205000500.0000000.9565220.5428570.8169011.0000001.0000001.000000
\n", "
" ], "text/plain": [ " neff sequence_length helix_frac turn_frac \\\n", "count 457622.000000 457622.000000 457622.000000 457622.000000 \n", "mean 8.397407 251.278734 0.013926 0.001262 \n", "std 2.498266 124.627642 0.076069 0.008348 \n", "min 1.000000 5.000000 0.000000 0.000000 \n", "25% 6.928000 147.000000 0.000000 0.000000 \n", "50% 8.621000 243.000000 0.000000 0.000000 \n", "75% 10.176000 351.000000 0.000000 0.000000 \n", "max 17.205000 500.000000 0.956522 0.542857 \n", "\n", " strand_frac has_ptm has_pdb has_substrate \n", "count 457622.000000 457622.000000 457622.000000 457622.000000 \n", "mean 0.009162 0.064923 0.050238 0.427045 \n", "std 0.052192 0.246389 0.218436 0.494649 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 1.000000 \n", "max 0.816901 1.000000 1.000000 1.000000 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entries_data_frame.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save the query to a csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame.to_csv(\"Full_DB_Query.csv\",index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read query data from csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#entries_data_frame=pd.read_csv(\"Full_DB.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find unique entries in a column:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame['has_ptm'].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame['has_pdb'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "entries_data_frame[\"neff\"].min()" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Supported query keywords:\n", "\n", "* `uniprot_id`: The UniProt ID of the protein.\n", "* `sequence`: The amino acid sequence of protein to search for. (full match)\n", "* `min_length` / `max_length`: Min/Max number of residues in the protein.\n", "* `min_helix` / `max_helix`: Min/Max percentage of helical structure in the protein.\n", "* `min_turn` / `max_turn`: Min/Max percentage of turn structure in the protein.\n", "* `min_strand` / `max_strand`: Min/Max percentage of beta strand structure in the protein.\n", "* `organism`: Taxonomic name of the genus / species of the protein. (case-insensitive)\n", " * Species name or any parent taxonomic level can be used. (`pyc.get_organism_list()` for full list)\n", " * Surround with `:` to get precise results\n", " * `:homo:` returns `Homo sapiens` & `Homo sapiens neanderthalensis`)\n", " * `homo` also returns **homo**eomma, t**homo**mys, and *hundreds* others\n", "* `organism_id`: Precise NCBI Taxonomy ID of the species of the protein. (prefer to use `organism` instead)\n", "* `cath`: CATH classification of the protein (`3.40.50.360` or `3.40.*.*` or `3.*`).\n", "* `enzyme`: Enzyme Commission number of the protein. (`1.3.1.3` or `1.3.*.*` or `1.*`).\n", "* `has_substrate`: Whether the protein has a known substrate. (`True`/`False`)\n", "* `has_ptm`: Whether the protein has a known post-translational modification. (`True`/`False`)\n", "* `has_pbd`: Whether the protein has a known PDB structure. (`True`/`False`)\n", "* `disease`: The disease associated with the protein. (name of disease, case-insensitive, e.g `cancer`)\n", " * Use `pyc.get_disease_list()` for full list.\n", " * `cancer` searches for `Ovarian cancer`, `Lung cancer`, ...\n", "* `disease_id`: The ID of the disease associated with the protein. 
(`DI-02205`, get_disease_list()\n", "* `has_disease`: Whether the protein is associated with a disease. (`True`/`False`)\n", "* `cofactor`: The cofactor associated with the protein. (name of cofactor, case-insensitive, e.g `Zn(2+)`])\n", "* `cofactor_id`: The ID of the cofactor associated with the protein. (`CHEBI:00001`, get_cofactor_list())\n", "* `biological_process`: Biological process associated with the protein. (e.g `antiviral defense`, use `pyc.get_biological_process_list()` for full list)\n", "* `cellular_component`: Cellular component associated with the protein. (e.g `nucleus`, use `pyc.get_cellular_component_list()` for full list\n", "* `domain`: Domain associated with the protein. (e.g `zinc-finger`, use `pyc.get_domain_list()` for full list)\n", "* `ligand`: Ligand associated with the protein. (e.g `zinc`, use `pyc.get_ligand_list()` for full list\n", "* `molecular_function`: Molecular function associated with the protein. (e.g `antioxidant activity`, use `pyc.get_molecular_function_list()` for full list\n", "* `ptm`: Post-translational modification associated with the protein. 
(e.g. `phosphoprotein`, use `pyc.get_ptm_list()` for full list)\n", "\n", "\n", "Here is an example of making a large query, then paginating the results:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plotting parameters\n", "ticks_font=12\n", "labels_font=14\n", "\n", "def plot_histogram(values,xlabel,xticks,out_file,ylabel=\"Count\",bins=50):\n", "    \"\"\"Draw a histogram of `values` on a new 4x3 inch figure and save it to `out_file`.\n", "\n", "    values   : data handed to plt.hist\n", "    xlabel   : label of the x axis\n", "    xticks   : tick positions of the x axis\n", "    out_file : file name handed to plt.savefig (300 dpi, transparent background)\n", "    ylabel   : label of the y axis\n", "    bins     : number of histogram bins\n", "\n", "    Uses the notebook-level my_color, labels_font and ticks_font settings.\n", "    Returns whatever plt.hist returns.\n", "    \"\"\"\n", "    plt.figure(figsize=(4,3))\n", "    hist=plt.hist(values,bins=bins,color=my_color,cumulative=False,density=False,alpha=0.5)\n", "    plt.xlabel(xlabel,fontsize=labels_font)\n", "    plt.ylabel(ylabel,fontsize=labels_font)\n", "    plt.xticks(xticks,fontsize=ticks_font)\n", "    plt.yticks(fontsize=ticks_font)\n", "    plt.grid(linestyle=\"--\",lw=1)\n", "    plt.tight_layout()\n", "    plt.savefig(out_file,dpi=300,transparent=True)\n", "    return hist" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot N_eff" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2023-07-27T17:10:24.136598Z", "start_time": "2023-07-27T17:10:11.777570Z" } }, "outputs": [], "source": [ "# raw string, so the backslash of \\mathrm is not read as a string escape sequence\n", "neff_hist=plot_histogram(entries_data_frame[\"neff\"],xlabel=r'$\\mathrm{N}_{eff}$',xticks=np.arange(0,18,2),out_file=\"Neff.png\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot Sequence length distribution" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "seq_hist=plot_histogram(entries_data_frame[\"sequence_length\"],xlabel='Sequence length',xticks=np.arange(0,550,100),out_file=\"seq_len.png\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ "#This is not informative\n", 
"\n", "xlabel='Secondary structure (%)'\n", "ylabel=\"Count\"\n", "helix_hist=plt.hist(entries_has_pdb[\"helix_frac\"]*100,bins=50,color=\"#6495ED\",cumulative=False,density=False,alpha=0.4)\n", "strand_hist=plt.hist(entries_has_pdb[\"strand_frac\"]*100,bins=50,color=\"#FFBF00\",cumulative=False,density=False,alpha=0.4)\n", "turn_hist=plt.hist(entries_has_pdb[\"helix_frac\"]*100,bins=50,color=\"#DE3163\",cumulative=False,density=False,alpha=0.4)\n", "\n", "plt.xlabel(xlabel,fontsize=labels_font)\n", "plt.ylabel(ylabel,fontsize=labels_font)\n", "plt.xticks(np.arange(0,110,10),fontsize=ticks_font)\n", "plt.yticks(fontsize=ticks_font)\n", "plt.savefig(\"sstruc.png\",dpi=300,transparent=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get columns in the dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add biological features to the dataframe for each protein\n", "\n", "Initialise the object loader class and then call each add function\n", "\n", "1. Add Enzyme Classification \n", "2. Add CATH Class\n", "3. Add Co-factors\n", "4. Add PTM\n", "5. 
Add Diseases\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "obj_data_loader=obj_pycom.get_data_loader()\n", "entries_data_frame=obj_data_loader.add_enzyme_commission(entries_data_frame,force_single_entry=False)\n", "entries_data_frame=obj_data_loader.add_cath_class(entries_data_frame,force_single_entry=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame=obj_data_loader.add_pdbs(entries_data_frame,force_single_entry=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame=obj_data_loader.add_cofactors(entries_data_frame,force_single_entry=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame=obj_data_loader.add_ptm(entries_data_frame,force_single_entry=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame=obj_data_loader.add_diseases(entries_data_frame,force_single_entry=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save the progress to a csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame.to_csv(\"Full_DB_With_Details.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# number of distinct sequences; calling .sum() on the array of unique strings would concatenate them instead of counting\n", "entries_data_frame['sequence'].nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Number of entries with PDB files available" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame[\"pdb_id\"].notna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# number of entries with an Enzyme Commission annotation\n", "entries_data_frame[\"enzyme_commission\"].notna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "entries_data_frame[\"cofactor\"].notna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_data_frame[\"cath_class\"].notna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_with_ec_data=entries_data_frame[entries_data_frame[\"enzyme_commission\"].notna()]\n", "entries_with_cath_data=entries_data_frame[entries_data_frame[\"cath_class\"].notna()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_with_cath_data[\"cath_class\"].isna().value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "from collections import OrderedDict\n", "\n", "def group_data_by_class(data,data_type=1):\n", " global dict_group_data\n", " dict_group_data={}\n", "\n", " data_class=\"enzyme_commission\"\n", " if(data_type==1):\n", " data_class=\"enzyme_commission\"\n", " if(data_type==2):\n", " data_class=\"cath_class\"\n", "\n", " for i_data in data[data_class]:\n", " n=len(i_data)\n", " if(n==1):\n", " classid=i_data[0].split('.')[0]\n", " update_dict_group(classid)\n", " if(n>1):\n", " for j in i_data:\n", " update_dict_group(j[0].split('.')[0])\n", " dict_group_data=dict(sorted(dict_group_data.items())) \n", " return dict_group_data\n", "def update_dict_group(classid):\n", " global dict_group_data\n", "\n", " if(classid in dict_group_data.keys()):\n", " dict_group_data[classid]=dict_group_data[classid]+1\n", " else:\n", " dict_group_data[classid]=1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ec_numbers=group_data_by_class(entries_with_ec_data,data_type=1)\n", "ec_numbers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cath_numbers=group_data_by_class(entries_with_cath_data,data_type=2)\n", "cath_numbers" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_with_cath_data[\"cath_class\"].isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entries_with_cath_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ec_pie=plt.pie(ec_numbers.values(),\n", " labels=ec_numbers.keys(),\n", " autopct='%1.1f%%',\n", " textprops=dict(color=\"w\",fontsize=10),\n", " startangle=90)\n", "plt.savefig(\"ecdata.png\",dpi=300,transparent=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cath_pie=plt.pie(cath_numbers.values(),\n", " labels=cath_numbers.keys(),\n", " autopct='%1.1f%%',\n", " textprops=dict(color=\"black\",fontsize=10),\n", " startangle=90)\n", "plt.tight_layout()\n", "plt.savefig(\"cathdata.png\",dpi=300,transparent=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2023-07-27T17:15:43.086844Z", "start_time": "2023-07-27T17:15:05.298653Z" } }, "outputs": [], "source": [ "plt.figure(figsize=(4,3))\n", "plt.bar(cath_numbers.keys(),\n", " height=cath_numbers.values(),\n", " color=my_color \n", " )\n", "plt.xlabel(\"CATH Class\",fontsize=labels_font)\n", "plt.ylabel(\"Count\",fontsize=labels_font)\n", "plt.xticks(fontsize=ticks_font)\n", "plt.yticks(fontsize=ticks_font)\n", "plt.grid(axis='y',ls=\"--\",lw=1)\n", "plt.tight_layout()\n", "plt.savefig(\"cathdata.png\",dpi=300,transparent=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "plt.figure(figsize=(4,3))\n", "plt.bar(ec_numbers.keys(),\n", " height=ec_numbers.values(),\n", " color=my_color,\n", " )\n", "plt.xlabel(\"Enzyme Commission Class\",fontsize=labels_font)\n", "plt.ylabel(\"Count\",fontsize=labels_font)\n", "plt.xticks(fontsize=ticks_font)\n", "plt.yticks(fontsize=ticks_font)\n", "plt.grid(axis='y',ls=\"--\",lw=1)\n", 
"plt.tight_layout()\n", "plt.savefig(\"ecdata.png\",dpi=300,transparent=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dis=entries_data_frame[entries_data_frame['enzyme_commission'].notna()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dis[\"enzyme_commission\"].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Protein" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3-conda (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }