In [3]:
import pandas as pd
import matplotlib.pyplot as plt
In [23]:
#read the downloaded TCGA BRCA SOmatic Variants MC3 gene-level non-silent mutation data. The downloaded text file (after unzip) is 64MB.
#Be sure to move the downloaded file into the directory: AI_class
TCGA_BRCA_MC3=pd.read_csv("mc3_gene_level_BRCA_mc3_gene_level.txt", sep='\t')
In [24]:
#the data frame contains 792 columns, corresponds to 791 samples, the first column is gene name
#mutations has been mapped onto each gene. So you are dealing with files that have been processed by bioinformatics pipeline
TCGA_BRCA_MC3
Out[24]:
| sample | TCGA-3C-AAAU-01 | TCGA-3C-AALI-01 | TCGA-3C-AALJ-01 | TCGA-3C-AALK-01 | TCGA-4H-AAAK-01 | TCGA-5L-AAT0-01 | TCGA-5L-AAT1-01 | TCGA-5T-A9QA-01 | TCGA-A1-A0SB-01 | ... | TCGA-UL-AAZ6-01 | TCGA-UU-A93S-01 | TCGA-V7-A7HQ-01 | TCGA-W8-A86G-01 | TCGA-WT-AB41-01 | TCGA-WT-AB44-01 | TCGA-XX-A899-01 | TCGA-XX-A89A-01 | TCGA-Z7-A8R5-01 | TCGA-Z7-A8R6-01 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | UBE2Q2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | CHMP1B | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | PSMA2P1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | SHQ1P1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | CPHL1P | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 40538 | PTRF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 40539 | DIAPH2-AS1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 40540 | SELV | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 40541 | NFIX | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 40542 | SELP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
40543 rows × 792 columns
In [ ]:
#here the 0 means non-silent mutaton is not present; 1 means non-silent mutatoin is present
In [26]:
#this data frame can also give us most frequently mutated
data = TCGA_BRCA_MC3.copy().set_index(TCGA_BRCA_MC3.columns[0])
gene_mutated_counts=data.sum(axis=1)
In [28]:
gene_mutated_counts
Out[28]:
sample
UBE2Q2 2
CHMP1B 1
PSMA2P1 0
SHQ1P1 0
CPHL1P 0
..
PTRF 6
DIAPH2-AS1 0
SELV 0
NFIX 1
SELP 6
Length: 40543, dtype: int64
In [30]:
#looking for genes most frequently mutated
top_values = gene_mutated_counts.sort_values(ascending=False).head(25)
top_values
Out[30]:
sample PIK3CA 273 TP53 264 TTN 137 CDH1 102 GATA3 97 MUC16 78 KMT2C 77 MAP3K1 66 SYNE1 50 PTEN 48 RYR2 47 FLG 43 HMCN1 43 SPTA1 42 USH2A 41 DMD 40 NEB 39 ZFHX4 38 NCOR1 38 OBSCN 38 CSMD3 35 RUNX1 32 MAP2K4 32 LRP2 32 MUC4 32 dtype: int64
In [31]:
import matplotlib.pyplot as plt
plt.figure(figsize=(9,6))
top_values[::-1].plot(kind='barh', edgecolor='black')
plt.xlabel('Number of TCGA-BRCA tumours with ≥1 nonsilent mutation')
plt.title('Most frequently mutated genes in TCGA Breast Cancer (MC3)')
plt.tight_layout()
plt.show()
In [11]:
#check your path
import os
# Show current path
print(os.getcwd())
/Users/yongmeiwang/yongmei_linux/Jupyter_notebook/AI_Class_summer2025
In [4]:
#donwload the other public version, smaller file, 8.5MB after unzip it
#make sure you have the file under the path, then the following command will read it. If you did not unzip it, pd.read can still read it.
TCGA_BRCA_MC3_Public=pd.read_csv("mc3_BRCA_mc3.txt", sep='\t')
In [5]:
TCGA_BRCA_MC3_Public
Out[5]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
| 1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
| 2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
| 3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
| 4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
| 92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
| 92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
| 92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
| 92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [7]:
TCGA_BRCA_MC3_Public_v2 = pd.read_csv("mc3_BRCA_mc3.txt", sep='\t', encoding='utf-8-sig')
TCGA_BRCA_MC3_Public_v2
Out[7]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
| 1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
| 2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
| 3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
| 4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
| 92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
| 92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
| 92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
| 92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [6]:
#the above file contains mutation types observed. The first column is sample ID that link mutations observed to patient ID and samples.
#The column heading "gene" is the HUGO gene symbols. We will work with this one to figure out what types of mutatoins are present in the TCGA BRCA study
# Show available columns
print(TCGA_BRCA_MC3_Public.columns)
Index(['sample', 'chr', 'start', 'end', 'reference', 'alt', 'gene', 'effect',
'Amino_Acid_Change', 'DNA_VAF', 'SIFT', 'PolyPhen'],
dtype='object')
In [8]:
#The effect columns give the types of mutations present. Lets look at the mutation types more carefully
mutation_type_counts=TCGA_BRCA_MC3_Public['effect'].value_counts()
In [9]:
mutation_type_counts
Out[9]:
effect Missense_Mutation 45634 Silent 17122 Frame_Shift_Del 8522 3'UTR 6695 Nonsense_Mutation 3666 Intron 3212 5'UTR 2492 Splice_Site 1399 RNA 1160 Frame_Shift_Ins 610 3'Flank 530 5'Flank 443 In_Frame_Del 441 Translation_Start_Site 74 Nonstop_Mutation 66 In_Frame_Ins 34 large deletion 19 Name: count, dtype: int64
In [10]:
#Lets look at SIFT column. This column tell us Whether a missense mutation (amino acid change) is
#likely to be damaging or tolerated by the protein.
print(TCGA_BRCA_MC3_Public['SIFT'].value_counts())
SIFT
deleterious(0) 11747
deleterious(0.01) 3740
deleterious(0.02) 2256
deleterious(0.03) 1784
deleterious(0.04) 1400
...
tolerated_low_confidence(0.97) 1
tolerated_low_confidence(0.77) 1
tolerated_low_confidence(0.92) 1
tolerated_low_confidence(0.93) 1
tolerated_low_confidence(0.87) 1
Name: count, Length: 204, dtype: int64
In [12]:
#also lets try to look at sample unique value
samples_types=TCGA_BRCA_MC3_Public['sample'].value_counts()
In [13]:
samples_types
Out[13]:
sample
TCGA-AC-A23H-01 6405
TCGA-EW-A2FV-01 4231
TCGA-D8-A27V-01 3332
TCGA-5L-AAT1-01 1995
TCGA-BH-A18G-01 1899
...
TCGA-AO-A03U-01 7
TCGA-A2-A25F-01 6
TCGA-LL-A440-01 6
TCGA-EW-A1P1-01 3
TCGA-AC-A2FK-01 3
Name: count, Length: 791, dtype: int64
In [14]:
gene_types=TCGA_BRCA_MC3_Public['gene'].value_counts()
In [15]:
gene_types
Out[15]:
gene
PIK3CA 315
TTN 285
TP53 273
MUC16 141
CDH1 108
...
ZNF587B 1
BLZF1 1
SLPI 1
PLTP 1
FOXQ1 1
Name: count, Length: 18065, dtype: int64
In [18]:
#Plot the top 50 genes most frequently mutated
import matplotlib.pyplot as plt
top_genes = gene_types.head(25)
plt.figure(figsize=(10, 6))
top_genes.plot(kind='barh', color='blue', edgecolor='black')
plt.xlabel("Number of Mutations")
plt.ylabel("Gene")
plt.title("Top 25 Most Frequently Mutated Genes in TCGA BRCA (MC3)")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
In [19]:
#look into mutation types on TP53
TP53_mutations = TCGA_BRCA_MC3_Public[TCGA_BRCA_MC3_Public['gene'] == 'TP53']
print(TP53_mutations['effect'].value_counts())
effect Missense_Mutation 166 Nonsense_Mutation 40 Frame_Shift_Del 32 Splice_Site 18 Frame_Shift_Ins 10 In_Frame_Del 3 Silent 3 3'UTR 1 Name: count, dtype: int64
In [20]:
TP53_mutations
Out[20]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 347 | TCGA-3C-AALI-01 | 17 | 7578382 | 7578382 | G | T | TP53 | Nonsense_Mutation | p.S183* | 0.65 | NaN | NaN |
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) |
| 4633 | TCGA-A1-A0SP-01 | 17 | 7578382 | 7578382 | G | C | TP53 | Nonsense_Mutation | p.S183* | 0.49 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) |
| 90551 | TCGA-S3-AA10-01 | 17 | 7579335 | 7579336 | - | C | TP53 | Frame_Shift_Ins | p.T118Dfs*31 | 0.33 | NaN | NaN |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) |
273 rows × 12 columns
In [21]:
TP53_mutations.head
Out[21]:
<bound method NDFrame.head of sample chr start end reference alt gene \
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53
... ... .. ... ... ... .. ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53
effect Amino_Acid_Change DNA_VAF SIFT \
347 Nonsense_Mutation p.S183* 0.65 NaN
3791 Missense_Mutation p.R175H 0.30 tolerated(0.11)
4064 Missense_Mutation p.M133K 0.98 deleterious(0)
4354 Missense_Mutation p.Y220C 0.87 deleterious(0)
4633 Nonsense_Mutation p.S183* 0.49 NaN
... ... ... ... ...
90161 Missense_Mutation p.R175H 0.32 tolerated(0.11)
90450 Missense_Mutation p.S127Y 0.93 deleterious(0)
90551 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN
91294 Missense_Mutation p.Q136P 0.62 deleterious(0)
92047 Missense_Mutation p.E285K 0.88 deleterious(0)
PolyPhen
347 NaN
3791 benign(0.308)
4064 benign(0.122)
4354 probably_damaging(1)
4633 NaN
... ...
90161 benign(0.308)
90450 probably_damaging(1)
90551 NaN
91294 probably_damaging(1)
92047 probably_damaging(0.985)
[273 rows x 12 columns]>
In [32]:
PIK3CA_mutations=TCGA_BRCA_MC3_Public[TCGA_BRCA_MC3_Public['gene'] == 'PIK3CA']
print(PIK3CA_mutations['effect'].value_counts())
effect Missense_Mutation 298 In_Frame_Del 10 Silent 7 Name: count, dtype: int64
In [33]:
PIK3CA_mutations.head
Out[33]:
<bound method NDFrame.head of sample chr start end reference alt gene \
979 TCGA-3C-AALK-01 3 178936082 178936082 G A PIK3CA
980 TCGA-3C-AALK-01 3 178951957 178951957 G T PIK3CA
1215 TCGA-5L-AAT0-01 3 178952085 178952085 A T PIK3CA
2631 TCGA-5L-AAT1-01 3 178916876 178916876 G A PIK3CA
2632 TCGA-5L-AAT1-01 3 178936082 178936082 G A PIK3CA
... ... .. ... ... ... .. ...
90859 TCGA-S3-AA14-01 3 178936082 178936082 G A PIK3CA
91557 TCGA-WT-AB41-01 3 178917478 178917478 G A PIK3CA
91681 TCGA-XX-A899-01 3 178916861 178916861 T C PIK3CA
91870 TCGA-XX-A89A-01 3 178936082 178936082 G A PIK3CA
91992 TCGA-Z7-A8R5-01 3 178952085 178952085 A G PIK3CA
effect Amino_Acid_Change DNA_VAF SIFT \
979 Missense_Mutation p.E542K 0.21 deleterious(0.04)
980 Missense_Mutation p.M1004I 0.19 deleterious(0.01)
1215 Missense_Mutation p.H1047L 0.22 tolerated(0.44)
2631 Missense_Mutation p.R88Q 0.17 tolerated(0.06)
2632 Missense_Mutation p.E542K 0.18 deleterious(0.04)
... ... ... ... ...
90859 Missense_Mutation p.E542K 0.26 deleterious(0.04)
91557 Missense_Mutation p.G118D 0.36 tolerated(0.05)
91681 Missense_Mutation p.F83S 0.21 deleterious(0.02)
91870 Missense_Mutation p.E542K 0.27 deleterious(0.04)
91992 Missense_Mutation p.H1047R 0.16 tolerated(0.11)
PolyPhen
979 probably_damaging(0.96)
980 benign(0.331)
1215 benign(0.085)
2631 probably_damaging(0.998)
2632 probably_damaging(0.96)
... ...
90859 probably_damaging(0.96)
91557 possibly_damaging(0.704)
91681 benign(0.09)
91870 probably_damaging(0.96)
91992 possibly_damaging(0.529)
[315 rows x 12 columns]>
In [34]:
#try to see if missense mutatons correlated
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming your DataFrame is called tp53_df
# Extract numeric SIFT score from string like 'tolerated(0.11)'
TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)
# Filter only missense mutations with valid SIFT scores
missense_tp53 = TP53_mutations[
(TP53_mutations['effect'] == 'Missense_Mutation') &
(TP53_mutations['SIFT_score'].notna())
]
# Plot distribution
plt.figure(figsize=(8, 5))
sns.histplot(missense_tp53['SIFT_score'], bins=10, kde=True, color='steelblue')
plt.axvline(0.05, color='red', linestyle='--', label='Deleterious Threshold (0.05)')
plt.xlabel("SIFT Score")
plt.ylabel("Number of Missense Mutations")
plt.title("SIFT Score Distribution for TP53 Missense Mutations")
plt.legend()
plt.tight_layout()
plt.show()
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/714209853.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)
In [35]:
missense_tp53
Out[35]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | SIFT_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) | 0.11 |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) | 0.00 |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) | 0.00 |
| 4973 | TCGA-A2-A04W-01 | 17 | 7577124 | 7577124 | C | T | TP53 | Missense_Mutation | p.V272M | 0.34 | deleterious(0) | probably_damaging(0.997) | 0.00 |
| 5208 | TCGA-A2-A0CL-01 | 17 | 7577120 | 7577120 | C | A | TP53 | Missense_Mutation | p.R273L | 0.18 | deleterious(0) | probably_damaging(0.993) | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 89947 | TCGA-PL-A8LV-01 | 17 | 7577114 | 7577114 | C | T | TP53 | Missense_Mutation | p.C275Y | 0.45 | deleterious(0) | probably_damaging(0.994) | 0.00 |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) | 0.11 |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) | 0.00 |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) | 0.00 |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) | 0.00 |
166 rows × 13 columns
In [36]:
TP53_mutations
Out[36]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | SIFT_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 347 | TCGA-3C-AALI-01 | 17 | 7578382 | 7578382 | G | T | TP53 | Nonsense_Mutation | p.S183* | 0.65 | NaN | NaN | NaN |
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) | 0.11 |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) | 0.00 |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) | 0.00 |
| 4633 | TCGA-A1-A0SP-01 | 17 | 7578382 | 7578382 | G | C | TP53 | Nonsense_Mutation | p.S183* | 0.49 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) | 0.11 |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) | 0.00 |
| 90551 | TCGA-S3-AA10-01 | 17 | 7579335 | 7579336 | - | C | TP53 | Frame_Shift_Ins | p.T118Dfs*31 | 0.33 | NaN | NaN | NaN |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) | 0.00 |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) | 0.00 |
273 rows × 13 columns
In [39]:
# correlate DNA_VAF with SIFT score
import pandas as pd
import matplotlib.pyplot as plt
# Extract numeric SIFT score from text like 'tolerated(0.11)'
TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float)
# Convert DNA_VAF to numeric if it's not already
TP53_mutations['DNA_VAF'] = pd.to_numeric(TP53_mutations['DNA_VAF'], errors='coerce')
# Filter for valid missense mutations with VAF and SIFT scores
missense_tp53 = TP53_mutations[
(TP53_mutations['effect'] == 'Missense_Mutation') &
(TP53_mutations['SIFT_score'].notna()) &
(TP53_mutations['DNA_VAF'].notna())
]
# Create scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(missense_tp53['SIFT_score'], missense_tp53['DNA_VAF'], color='steelblue', alpha=0.7, edgecolor='black')
plt.axvline(0.05, color='red', linestyle='--', label='SIFT Deleterious Threshold (0.05)')
plt.xlabel("SIFT Score")
plt.ylabel("DNA Variant Allele Frequency (VAF)")
plt.title("TP53 Missense Mutations: SIFT Score vs. VAF")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/1021251088.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy TP53_mutations['SIFT_score'] = TP53_mutations['SIFT'].str.extract(r"\((.*?)\)").astype(float) /var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/1021251088.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy TP53_mutations['DNA_VAF'] = pd.to_numeric(TP53_mutations['DNA_VAF'], errors='coerce')
In [40]:
TP53_mutations
Out[40]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | SIFT_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 347 | TCGA-3C-AALI-01 | 17 | 7578382 | 7578382 | G | T | TP53 | Nonsense_Mutation | p.S183* | 0.65 | NaN | NaN | NaN |
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) | 0.11 |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) | 0.00 |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) | 0.00 |
| 4633 | TCGA-A1-A0SP-01 | 17 | 7578382 | 7578382 | G | C | TP53 | Nonsense_Mutation | p.S183* | 0.49 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) | 0.11 |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) | 0.00 |
| 90551 | TCGA-S3-AA10-01 | 17 | 7579335 | 7579336 | - | C | TP53 | Frame_Shift_Ins | p.T118Dfs*31 | 0.33 | NaN | NaN | NaN |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) | 0.00 |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) | 0.00 |
273 rows × 13 columns
In [42]:
TP53_mutations['effect'].value_counts()
Out[42]:
effect Missense_Mutation 166 Nonsense_Mutation 40 Frame_Shift_Del 32 Splice_Site 18 Frame_Shift_Ins 10 In_Frame_Del 3 Silent 3 3'UTR 1 Name: count, dtype: int64
In [43]:
PIK3CA_mutations['effect'].value_counts()
Out[43]:
effect Missense_Mutation 298 In_Frame_Del 10 Silent 7 Name: count, dtype: int64
In [50]:
# figure out position of missense on PIK3CA_mutations
# Filter for PIK3CA missense mutations
#pik3ca_missense = TCGA_BRCA_MC3_Public[
# (TCGA_BRCA_MC3_Public['gene'] == 'PIK3CA') &
# (TCGA_BRCA_MC3_Public['effect'].str.contains("missense", case=False, na=False))
#]
pik3ca_missense = PIK3CA_mutations[PIK3CA_mutations['effect'].str.contains("missense", case=False, na=False)]
# Show relevant columns: position and amino acid change
result = pik3ca_missense[['sample', 'chr', 'start', 'end', 'Amino_Acid_Change', 'DNA_VAF', 'SIFT']]
# Sort by genomic start position (optional)
result = result.sort_values(by='start')
# Display the result
result
Out[50]:
| sample | chr | start | end | Amino_Acid_Change | DNA_VAF | SIFT | |
|---|---|---|---|---|---|---|---|
| 77187 | TCGA-EW-A1OV-01 | 3 | 178916854 | 178916854 | p.E81K | 0.12 | deleterious(0) |
| 34962 | TCGA-BH-A0B6-01 | 3 | 178916854 | 178916854 | p.E81K | 0.26 | deleterious(0) |
| 91681 | TCGA-XX-A899-01 | 3 | 178916861 | 178916861 | p.F83S | 0.21 | deleterious(0.02) |
| 2631 | TCGA-5L-AAT1-01 | 3 | 178916876 | 178916876 | p.R88Q | 0.17 | tolerated(0.06) |
| 73544 | TCGA-E9-A1RE-01 | 3 | 178916924 | 178916924 | p.P104L | 0.62 | deleterious(0.01) |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 47642 | TCGA-C8-A131-01 | 3 | 178952085 | 178952085 | p.H1047R | 0.32 | tolerated(0.11) |
| 43758 | TCGA-BH-A1F8-01 | 3 | 178952085 | 178952085 | p.H1047R | 0.37 | tolerated(0.11) |
| 5693 | TCGA-A2-A0CW-01 | 3 | 178952090 | 178952090 | p.G1049R | 0.21 | tolerated(0.09) |
| 45306 | TCGA-BH-A2L8-01 | 3 | 178952090 | 178952090 | p.G1049R | 0.16 | tolerated(0.09) |
| 5785 | TCGA-A2-A0EN-01 | 3 | 178952139 | 178952139 | p.H1065L | 0.23 | tolerated_low_confidence(0.13) |
298 rows × 7 columns
In [51]:
pik3ca_missense
Out[51]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 979 | TCGA-3C-AALK-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.21 | deleterious(0.04) | probably_damaging(0.96) |
| 980 | TCGA-3C-AALK-01 | 3 | 178951957 | 178951957 | G | T | PIK3CA | Missense_Mutation | p.M1004I | 0.19 | deleterious(0.01) | benign(0.331) |
| 1215 | TCGA-5L-AAT0-01 | 3 | 178952085 | 178952085 | A | T | PIK3CA | Missense_Mutation | p.H1047L | 0.22 | tolerated(0.44) | benign(0.085) |
| 2631 | TCGA-5L-AAT1-01 | 3 | 178916876 | 178916876 | G | A | PIK3CA | Missense_Mutation | p.R88Q | 0.17 | tolerated(0.06) | probably_damaging(0.998) |
| 2632 | TCGA-5L-AAT1-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.18 | deleterious(0.04) | probably_damaging(0.96) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90859 | TCGA-S3-AA14-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.26 | deleterious(0.04) | probably_damaging(0.96) |
| 91557 | TCGA-WT-AB41-01 | 3 | 178917478 | 178917478 | G | A | PIK3CA | Missense_Mutation | p.G118D | 0.36 | tolerated(0.05) | possibly_damaging(0.704) |
| 91681 | TCGA-XX-A899-01 | 3 | 178916861 | 178916861 | T | C | PIK3CA | Missense_Mutation | p.F83S | 0.21 | deleterious(0.02) | benign(0.09) |
| 91870 | TCGA-XX-A89A-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.27 | deleterious(0.04) | probably_damaging(0.96) |
| 91992 | TCGA-Z7-A8R5-01 | 3 | 178952085 | 178952085 | A | G | PIK3CA | Missense_Mutation | p.H1047R | 0.16 | tolerated(0.11) | possibly_damaging(0.529) |
298 rows × 12 columns
In [52]:
# Extract numeric part from the amino acid change
pik3ca_missense['AA_position'] = pik3ca_missense['Amino_Acid_Change'].str.extract(r'p\.\D+(\d+)\D*').astype(float)
# Drop NAs and sort
pik3ca_missense = pik3ca_missense.dropna(subset=['AA_position']).sort_values(by='AA_position')
# View the result
print(pik3ca_missense[['Amino_Acid_Change', 'AA_position', 'DNA_VAF']])
Amino_Acid_Change AA_position DNA_VAF 34962 p.E81K 81.0 0.26 77187 p.E81K 81.0 0.12 91681 p.F83S 83.0 0.21 2631 p.R88Q 88.0 0.17 73544 p.P104L 104.0 0.62 ... ... ... ... 47642 p.H1047R 1047.0 0.32 43758 p.H1047R 1047.0 0.37 5693 p.G1049R 1049.0 0.21 45306 p.G1049R 1049.0 0.16 5785 p.H1065L 1065.0 0.23 [298 rows x 3 columns]
/var/folders/zj/bjsfy87j7xxdx6w16y54dr_h0000gn/T/ipykernel_34821/87756107.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pik3ca_missense['AA_position'] = pik3ca_missense['Amino_Acid_Change'].str.extract(r'p\.\D+(\d+)\D*').astype(float)
In [53]:
#generate AA position scatter plot
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 2))
plt.scatter(pik3ca_missense['AA_position'], [1]*len(pik3ca_missense), # y=1 for all to align horizontally
color='darkorange', edgecolor='black', alpha=0.8)
plt.xlabel("Amino Acid Position (PIK3CA)")
plt.yticks([]) # Hide y-axis ticks
plt.title("Missense Mutations in PIK3CA (Amino Acid Positions)")
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
In [54]:
#change to bar plot
import matplotlib.pyplot as plt
# Count how many times each AA_position occurs
aa_position_counts = pik3ca_missense['AA_position'].value_counts().sort_index()
# Plot
plt.figure(figsize=(12, 4))
plt.bar(aa_position_counts.index, aa_position_counts.values, color='tomato', edgecolor='black')
plt.xlabel("Amino Acid Position (PIK3CA)")
plt.ylabel("Mutation Count")
plt.title("Frequency of Missense Mutations in PIK3CA by Amino Acid Position")
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
In [ ]: