In [3]:
import pandas as pd
import matplotlib.pyplot as plt
In [23]:
#read the downloaded TCGA BRCA SOmatic Variants MC3 gene-level non-silent mutation data. The downloaded text file (after unzip) is 64MB.
#Be sure to move the downloaded file into the directory: AI_class
TCGA_BRCA_MC3=pd.read_csv("mc3_gene_level_BRCA_mc3_gene_level.txt", sep='\t')
In [24]:
#the data frame contains 792 columns, corresponds to 791 samples, the first column is gene name
#mutations has been mapped onto each gene. So you are dealing with files that have been processed by bioinformatics pipeline
TCGA_BRCA_MC3
Out[24]:
sample TCGA-3C-AAAU-01 TCGA-3C-AALI-01 TCGA-3C-AALJ-01 TCGA-3C-AALK-01 TCGA-4H-AAAK-01 TCGA-5L-AAT0-01 TCGA-5L-AAT1-01 TCGA-5T-A9QA-01 TCGA-A1-A0SB-01 ... TCGA-UL-AAZ6-01 TCGA-UU-A93S-01 TCGA-V7-A7HQ-01 TCGA-W8-A86G-01 TCGA-WT-AB41-01 TCGA-WT-AB44-01 TCGA-XX-A899-01 TCGA-XX-A89A-01 TCGA-Z7-A8R5-01 TCGA-Z7-A8R6-01
0 UBE2Q2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 CHMP1B 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 PSMA2P1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 SHQ1P1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 CPHL1P 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
40538 PTRF 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
40539 DIAPH2-AS1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
40540 SELV 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
40541 NFIX 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
40542 SELP 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

40543 rows × 792 columns

In [ ]:
#here the 0 means non-silent mutaton is not present; 1 means non-silent mutatoin is present
In [26]:
#this data frame can also give us most frequently mutated
data = TCGA_BRCA_MC3.copy().set_index(TCGA_BRCA_MC3.columns[0])

gene_mutated_counts=data.sum(axis=1)
In [28]:
gene_mutated_counts
Out[28]:
sample
UBE2Q2        2
CHMP1B        1
PSMA2P1       0
SHQ1P1        0
CPHL1P        0
             ..
PTRF          6
DIAPH2-AS1    0
SELV          0
NFIX          1
SELP          6
Length: 40543, dtype: int64
In [30]:
#looking for genes most frequently mutated
top_values = gene_mutated_counts.sort_values(ascending=False).head(25)
top_values
Out[30]:
sample
PIK3CA    273
TP53      264
TTN       137
CDH1      102
GATA3      97
MUC16      78
KMT2C      77
MAP3K1     66
SYNE1      50
PTEN       48
RYR2       47
FLG        43
HMCN1      43
SPTA1      42
USH2A      41
DMD        40
NEB        39
ZFHX4      38
NCOR1      38
OBSCN      38
CSMD3      35
RUNX1      32
MAP2K4     32
LRP2       32
MUC4       32
dtype: int64
In [31]:
import matplotlib.pyplot as plt
plt.figure(figsize=(9,6))
top_values[::-1].plot(kind='barh', edgecolor='black')
plt.xlabel('Number of TCGA-BRCA tumours with ≥1 nonsilent mutation')
plt.title('Most frequently mutated genes in TCGA Breast Cancer (MC3)')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [11]:
#check your path
import os

# Show current path
print(os.getcwd())
/Users/yongmeiwang/yongmei_linux/Jupyter_notebook/AI_Class_summer2025
In [4]:
#donwload the other public version, smaller file, 8.5MB after unzip it
#make sure you have the file under the path, then the following command will read it. If you did not unzip it, pd.read can still read it. 

TCGA_BRCA_MC3_Public=pd.read_csv("mc3_BRCA_mc3.txt", sep='\t')
In [5]:
TCGA_BRCA_MC3_Public
Out[5]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [7]:
TCGA_BRCA_MC3_Public_v2 = pd.read_csv("mc3_BRCA_mc3.txt", sep='\t', encoding='utf-8-sig')
TCGA_BRCA_MC3_Public_v2
Out[7]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [6]:
#the above file contains mutation types observed. The first column is sample ID that link mutations observed to patient ID and samples. 
#The column heading "gene" is the HUGO gene symbols. We will work with this one to figure out what types of mutatoins are present in the TCGA BRCA study

# Show available columns
print(TCGA_BRCA_MC3_Public.columns)
Index(['sample', 'chr', 'start', 'end', 'reference', 'alt', 'gene', 'effect',
       'Amino_Acid_Change', 'DNA_VAF', 'SIFT', 'PolyPhen'],
      dtype='object')
In [8]:
#The effect columns give the types of mutations present. Lets look at the mutation types more carefully
mutation_type_counts=TCGA_BRCA_MC3_Public['effect'].value_counts()
In [9]:
mutation_type_counts
Out[9]:
effect
Missense_Mutation         45634
Silent                    17122
Frame_Shift_Del            8522
3'UTR                      6695
Nonsense_Mutation          3666
Intron                     3212
5'UTR                      2492
Splice_Site                1399
RNA                        1160
Frame_Shift_Ins             610
3'Flank                     530
5'Flank                     443
In_Frame_Del                441
Translation_Start_Site       74
Nonstop_Mutation             66
In_Frame_Ins                 34
large deletion               19
Name: count, dtype: int64
In [10]:
#Lets look at SIFT column. This column tell us Whether a missense mutation (amino acid change) is 
#likely to be damaging or tolerated by the protein.
print(TCGA_BRCA_MC3_Public['SIFT'].value_counts())
SIFT
deleterious(0)                    11747
deleterious(0.01)                  3740
deleterious(0.02)                  2256
deleterious(0.03)                  1784
deleterious(0.04)                  1400
                                  ...  
tolerated_low_confidence(0.97)        1
tolerated_low_confidence(0.77)        1
tolerated_low_confidence(0.92)        1
tolerated_low_confidence(0.93)        1
tolerated_low_confidence(0.87)        1
Name: count, Length: 204, dtype: int64
In [12]:
#also lets try to look at sample unique value
samples_types=TCGA_BRCA_MC3_Public['sample'].value_counts()
In [13]:
samples_types
Out[13]:
sample
TCGA-AC-A23H-01    6405
TCGA-EW-A2FV-01    4231
TCGA-D8-A27V-01    3332
TCGA-5L-AAT1-01    1995
TCGA-BH-A18G-01    1899
                   ... 
TCGA-AO-A03U-01       7
TCGA-A2-A25F-01       6
TCGA-LL-A440-01       6
TCGA-EW-A1P1-01       3
TCGA-AC-A2FK-01       3
Name: count, Length: 791, dtype: int64
In [14]:
gene_types=TCGA_BRCA_MC3_Public['gene'].value_counts()
In [15]:
gene_types
Out[15]:
gene
PIK3CA     315
TTN        285
TP53       273
MUC16      141
CDH1       108
          ... 
ZNF587B      1
BLZF1        1
SLPI         1
PLTP         1
FOXQ1        1
Name: count, Length: 18065, dtype: int64
In [18]:
#Plot the top 50 genes most frequently mutated
import matplotlib.pyplot as plt

top_genes = gene_types.head(25)

plt.figure(figsize=(10, 6))
top_genes.plot(kind='barh', color='blue', edgecolor='black')
plt.xlabel("Number of Mutations")
plt.ylabel("Gene")
plt.title("Top 25 Most Frequently Mutated Genes in TCGA BRCA (MC3)")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [19]:
#look into mutation types on TP53
TP53_mutations = TCGA_BRCA_MC3_Public[TCGA_BRCA_MC3_Public['gene'] == 'TP53']
print(TP53_mutations['effect'].value_counts())
effect
Missense_Mutation    166
Nonsense_Mutation     40
Frame_Shift_Del       32
Splice_Site           18
Frame_Shift_Ins       10
In_Frame_Del           3
Silent                 3
3'UTR                  1
Name: count, dtype: int64
In [20]:
TP53_mutations
Out[20]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53 Nonsense_Mutation p.S183* 0.65 NaN NaN
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308)
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122)
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1)
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53 Nonsense_Mutation p.S183* 0.49 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308)
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1)
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN NaN
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1)
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985)

273 rows × 12 columns

In [21]:
TP53_mutations.head
Out[21]:
<bound method NDFrame.head of                 sample chr    start      end reference alt  gene  \
347    TCGA-3C-AALI-01  17  7578382  7578382         G   T  TP53   
3791   TCGA-A1-A0SI-01  17  7578406  7578406         C   T  TP53   
4064   TCGA-A1-A0SK-01  17  7578532  7578532         A   T  TP53   
4354   TCGA-A1-A0SO-01  17  7578190  7578190         T   C  TP53   
4633   TCGA-A1-A0SP-01  17  7578382  7578382         G   C  TP53   
...                ...  ..      ...      ...       ...  ..   ...   
90161  TCGA-PL-A8LZ-01  17  7578406  7578406         C   T  TP53   
90450  TCGA-S3-AA0Z-01  17  7578550  7578550         G   T  TP53   
90551  TCGA-S3-AA10-01  17  7579335  7579336         -   C  TP53   
91294  TCGA-UU-A93S-01  17  7578523  7578523         T   G  TP53   
92047  TCGA-Z7-A8R6-01  17  7577085  7577085         C   T  TP53   

                  effect Amino_Acid_Change  DNA_VAF             SIFT  \
347    Nonsense_Mutation           p.S183*     0.65              NaN   
3791   Missense_Mutation           p.R175H     0.30  tolerated(0.11)   
4064   Missense_Mutation           p.M133K     0.98   deleterious(0)   
4354   Missense_Mutation           p.Y220C     0.87   deleterious(0)   
4633   Nonsense_Mutation           p.S183*     0.49              NaN   
...                  ...               ...      ...              ...   
90161  Missense_Mutation           p.R175H     0.32  tolerated(0.11)   
90450  Missense_Mutation           p.S127Y     0.93   deleterious(0)   
90551    Frame_Shift_Ins      p.T118Dfs*31     0.33              NaN   
91294  Missense_Mutation           p.Q136P     0.62   deleterious(0)   
92047  Missense_Mutation           p.E285K     0.88   deleterious(0)   

                       PolyPhen  
347                         NaN  
3791              benign(0.308)  
4064              benign(0.122)  
4354       probably_damaging(1)  
4633                        NaN  
...                         ...  
90161             benign(0.308)  
90450      probably_damaging(1)  
90551                       NaN  
91294      probably_damaging(1)  
92047  probably_damaging(0.985)  

[273 rows x 12 columns]>
In [32]:
PIK3CA_mutations=TCGA_BRCA_MC3_Public[TCGA_BRCA_MC3_Public['gene'] == 'PIK3CA']
print(PIK3CA_mutations['effect'].value_counts())
effect
Missense_Mutation    298
In_Frame_Del          10
Silent                 7
Name: count, dtype: int64
In [33]:
PIK3CA_mutations.head
Out[33]:
<bound method NDFrame.head of                 sample chr      start        end reference alt    gene  \
979    TCGA-3C-AALK-01   3  178936082  178936082         G   A  PIK3CA   
980    TCGA-3C-AALK-01   3  178951957  178951957         G   T  PIK3CA   
1215   TCGA-5L-AAT0-01   3  178952085  178952085         A   T  PIK3CA   
2631   TCGA-5L-AAT1-01   3  178916876  178916876         G   A  PIK3CA   
2632   TCGA-5L-AAT1-01   3  178936082  178936082         G   A  PIK3CA   
...                ...  ..        ...        ...       ...  ..     ...   
90859  TCGA-S3-AA14-01   3  178936082  178936082         G   A  PIK3CA   
91557  TCGA-WT-AB41-01   3  178917478  178917478         G   A  PIK3CA   
91681  TCGA-XX-A899-01   3  178916861  178916861         T   C  PIK3CA   
91870  TCGA-XX-A89A-01   3  178936082  178936082         G   A  PIK3CA   
91992  TCGA-Z7-A8R5-01   3  178952085  178952085         A   G  PIK3CA   

                  effect Amino_Acid_Change  DNA_VAF               SIFT  \
979    Missense_Mutation           p.E542K     0.21  deleterious(0.04)   
980    Missense_Mutation          p.M1004I     0.19  deleterious(0.01)   
1215   Missense_Mutation          p.H1047L     0.22    tolerated(0.44)   
2631   Missense_Mutation            p.R88Q     0.17    tolerated(0.06)   
2632   Missense_Mutation           p.E542K     0.18  deleterious(0.04)   
...                  ...               ...      ...                ...   
90859  Missense_Mutation           p.E542K     0.26  deleterious(0.04)   
91557  Missense_Mutation           p.G118D     0.36    tolerated(0.05)   
91681  Missense_Mutation            p.F83S     0.21  deleterious(0.02)   
91870  Missense_Mutation           p.E542K     0.27  deleterious(0.04)   
91992  Missense_Mutation          p.H1047R     0.16    tolerated(0.11)   

                       PolyPhen  
979     probably_damaging(0.96)  
980               benign(0.331)  
1215              benign(0.085)  
2631   probably_damaging(0.998)  
2632    probably_damaging(0.96)  
...                         ...  
90859   probably_damaging(0.96)  
91557  possibly_damaging(0.704)  
91681              benign(0.09)  
91870   probably_damaging(0.96)  
91992  possibly_damaging(0.529)  

[315 rows x 12 columns]>
In [34]:
#try to see if missense mutatons correlated 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your DataFrame is called tp53_df
# Extract numeric SIFT score from string like 'tolerated(0.11)'
TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)

# Filter only missense mutations with valid SIFT scores
missense_tp53 = TP53_mutations[
    (TP53_mutations['effect'] == 'Missense_Mutation') &
    (TP53_mutations['SIFT_score'].notna())
]

# Plot distribution
plt.figure(figsize=(8, 5))
sns.histplot(missense_tp53['SIFT_score'], bins=10, kde=True, color='steelblue')
plt.axvline(0.05, color='red', linestyle='--', label='Deleterious Threshold (0.05)')
plt.xlabel("SIFT Score")
plt.ylabel("Number of Missense Mutations")
plt.title("SIFT Score Distribution for TP53 Missense Mutations")
plt.legend()
plt.tight_layout()
plt.show()
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/714209853.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)
No description has been provided for this image
In [35]:
missense_tp53
Out[35]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen SIFT_score
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308) 0.11
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122) 0.00
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1) 0.00
4973 TCGA-A2-A04W-01 17 7577124 7577124 C T TP53 Missense_Mutation p.V272M 0.34 deleterious(0) probably_damaging(0.997) 0.00
5208 TCGA-A2-A0CL-01 17 7577120 7577120 C A TP53 Missense_Mutation p.R273L 0.18 deleterious(0) probably_damaging(0.993) 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ...
89947 TCGA-PL-A8LV-01 17 7577114 7577114 C T TP53 Missense_Mutation p.C275Y 0.45 deleterious(0) probably_damaging(0.994) 0.00
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308) 0.11
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1) 0.00
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1) 0.00
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985) 0.00

166 rows × 13 columns

In [36]:
TP53_mutations
Out[36]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen SIFT_score
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53 Nonsense_Mutation p.S183* 0.65 NaN NaN NaN
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308) 0.11
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122) 0.00
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1) 0.00
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53 Nonsense_Mutation p.S183* 0.49 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308) 0.11
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1) 0.00
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN NaN NaN
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1) 0.00
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985) 0.00

273 rows × 13 columns

In [39]:
# correlate DNA_VAF with SIFT score 
import pandas as pd
import matplotlib.pyplot as plt

# Extract numeric SIFT score from text like 'tolerated(0.11)'
TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)

# Convert DNA_VAF to numeric if it's not already
TP53_mutations['DNA_VAF'] = pd.to_numeric(TP53_mutations['DNA_VAF'], errors='coerce')

# Filter for valid missense mutations with VAF and SIFT scores
missense_tp53 = TP53_mutations[
    (TP53_mutations['effect'] == 'Missense_Mutation') &
    (TP53_mutations['SIFT_score'].notna()) &
    (TP53_mutations['DNA_VAF'].notna())
]

# Create scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(missense_tp53['SIFT_score'], missense_tp53['DNA_VAF'], color='steelblue', alpha=0.7, edgecolor='black')
plt.axvline(0.05, color='red', linestyle='--', label='SIFT Deleterious Threshold (0.05)')
plt.xlabel("SIFT Score")
plt.ylabel("DNA Variant Allele Frequency (VAF)")
plt.title("TP53 Missense Mutations: SIFT Score vs. VAF")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/1021251088.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/1021251088.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TP53_mutations['DNA_VAF'] = pd.to_numeric(TP53_mutations['DNA_VAF'], errors='coerce')
No description has been provided for this image
In [40]:
TP53_mutations
Out[40]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen SIFT_score
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53 Nonsense_Mutation p.S183* 0.65 NaN NaN NaN
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308) 0.11
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122) 0.00
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1) 0.00
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53 Nonsense_Mutation p.S183* 0.49 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308) 0.11
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1) 0.00
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN NaN NaN
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1) 0.00
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985) 0.00

273 rows × 13 columns

In [42]:
TP53_mutations['effect'].value_counts()
Out[42]:
effect
Missense_Mutation    166
Nonsense_Mutation     40
Frame_Shift_Del       32
Splice_Site           18
Frame_Shift_Ins       10
In_Frame_Del           3
Silent                 3
3'UTR                  1
Name: count, dtype: int64
In [43]:
PIK3CA_mutations['effect'].value_counts()
Out[43]:
effect
Missense_Mutation    298
In_Frame_Del          10
Silent                 7
Name: count, dtype: int64
In [50]:
# figure out position of missense on PIK3CA_mutations
# Filter for PIK3CA missense mutations
#pik3ca_missense = TCGA_BRCA_MC3_Public[
#    (TCGA_BRCA_MC3_Public['gene'] == 'PIK3CA') &
#    (TCGA_BRCA_MC3_Public['effect'].str.contains("missense", case=False, na=False))
#]

pik3ca_missense = PIK3CA_mutations[PIK3CA_mutations['effect'].str.contains("missense", case=False, na=False)]


# Show relevant columns: position and amino acid change
result = pik3ca_missense[['sample', 'chr', 'start', 'end', 'Amino_Acid_Change', 'DNA_VAF', 'SIFT']]

# Sort by genomic start position (optional)
result = result.sort_values(by='start')

# Display the result
result
Out[50]:
sample chr start end Amino_Acid_Change DNA_VAF SIFT
77187 TCGA-EW-A1OV-01 3 178916854 178916854 p.E81K 0.12 deleterious(0)
34962 TCGA-BH-A0B6-01 3 178916854 178916854 p.E81K 0.26 deleterious(0)
91681 TCGA-XX-A899-01 3 178916861 178916861 p.F83S 0.21 deleterious(0.02)
2631 TCGA-5L-AAT1-01 3 178916876 178916876 p.R88Q 0.17 tolerated(0.06)
73544 TCGA-E9-A1RE-01 3 178916924 178916924 p.P104L 0.62 deleterious(0.01)
... ... ... ... ... ... ... ...
47642 TCGA-C8-A131-01 3 178952085 178952085 p.H1047R 0.32 tolerated(0.11)
43758 TCGA-BH-A1F8-01 3 178952085 178952085 p.H1047R 0.37 tolerated(0.11)
5693 TCGA-A2-A0CW-01 3 178952090 178952090 p.G1049R 0.21 tolerated(0.09)
45306 TCGA-BH-A2L8-01 3 178952090 178952090 p.G1049R 0.16 tolerated(0.09)
5785 TCGA-A2-A0EN-01 3 178952139 178952139 p.H1065L 0.23 tolerated_low_confidence(0.13)

298 rows × 7 columns

In [51]:
pik3ca_missense
Out[51]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
979 TCGA-3C-AALK-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.21 deleterious(0.04) probably_damaging(0.96)
980 TCGA-3C-AALK-01 3 178951957 178951957 G T PIK3CA Missense_Mutation p.M1004I 0.19 deleterious(0.01) benign(0.331)
1215 TCGA-5L-AAT0-01 3 178952085 178952085 A T PIK3CA Missense_Mutation p.H1047L 0.22 tolerated(0.44) benign(0.085)
2631 TCGA-5L-AAT1-01 3 178916876 178916876 G A PIK3CA Missense_Mutation p.R88Q 0.17 tolerated(0.06) probably_damaging(0.998)
2632 TCGA-5L-AAT1-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.18 deleterious(0.04) probably_damaging(0.96)
... ... ... ... ... ... ... ... ... ... ... ... ...
90859 TCGA-S3-AA14-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.26 deleterious(0.04) probably_damaging(0.96)
91557 TCGA-WT-AB41-01 3 178917478 178917478 G A PIK3CA Missense_Mutation p.G118D 0.36 tolerated(0.05) possibly_damaging(0.704)
91681 TCGA-XX-A899-01 3 178916861 178916861 T C PIK3CA Missense_Mutation p.F83S 0.21 deleterious(0.02) benign(0.09)
91870 TCGA-XX-A89A-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.27 deleterious(0.04) probably_damaging(0.96)
91992 TCGA-Z7-A8R5-01 3 178952085 178952085 A G PIK3CA Missense_Mutation p.H1047R 0.16 tolerated(0.11) possibly_damaging(0.529)

298 rows × 12 columns

In [52]:
# Extract numeric part from the amino acid change
pik3ca_missense['AA_position'] = pik3ca_missense['Amino_Acid_Change'].str.extract(r'p\.\D+(\d+)\D*').astype(float)

# Drop NAs and sort
pik3ca_missense = pik3ca_missense.dropna(subset=['AA_position']).sort_values(by='AA_position')

# View the result
print(pik3ca_missense[['Amino_Acid_Change', 'AA_position', 'DNA_VAF']])
      Amino_Acid_Change  AA_position  DNA_VAF
34962            p.E81K         81.0     0.26
77187            p.E81K         81.0     0.12
91681            p.F83S         83.0     0.21
2631             p.R88Q         88.0     0.17
73544           p.P104L        104.0     0.62
...                 ...          ...      ...
47642          p.H1047R       1047.0     0.32
43758          p.H1047R       1047.0     0.37
5693           p.G1049R       1049.0     0.21
45306          p.G1049R       1049.0     0.16
5785           p.H1065L       1065.0     0.23

[298 rows x 3 columns]
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/87756107.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pik3ca_missense['AA_position'] = pik3ca_missense['Amino_Acid_Change'].str.extract(r'p\.\D+(\d+)\D*').astype(float)
In [53]:
#generate AA position scatter plot
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 2))
plt.scatter(pik3ca_missense['AA_position'], [1]*len(pik3ca_missense),  # y=1 for all to align horizontally
            color='darkorange', edgecolor='black', alpha=0.8)

plt.xlabel("Amino Acid Position (PIK3CA)")
plt.yticks([])  # Hide y-axis ticks
plt.title("Missense Mutations in PIK3CA (Amino Acid Positions)")
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [54]:
#change to bar plot
import matplotlib.pyplot as plt

# Count how many times each AA_position occurs
aa_position_counts = pik3ca_missense['AA_position'].value_counts().sort_index()

# Plot
plt.figure(figsize=(12, 4))
plt.bar(aa_position_counts.index, aa_position_counts.values, color='tomato', edgecolor='black')
plt.xlabel("Amino Acid Position (PIK3CA)")
plt.ylabel("Mutation Count")
plt.title("Frequency of Missense Mutations in PIK3CA by Amino Acid Position")
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: