In [1]:
import pandas as pd
import numpy as np
In [2]:
import sys
print(sys.executable)
/Users/yongmeiwang/opt/anaconda3/envs/py3.9/bin/python
need to download a file called survival data from Xena browser
In [3]:
TCGA_survival=pd.read_csv("survival_BRCA_survival.txt",sep="\t")
In [4]:
TCGA_survival
Out[4]:
| sample | _PATIENT | OS | OS.time | DSS | DSS.time | DFI | DFI.time | PFI | PFI.time | Redaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | TCGA-3C-AAAU | 0 | 4047.0 | 0.0 | 4047.0 | 1.0 | 1808.0 | 1 | 1808.0 | NaN |
| 1 | TCGA-3C-AALI-01 | TCGA-3C-AALI | 0 | 4005.0 | 0.0 | 4005.0 | 0.0 | 4005.0 | 0 | 4005.0 | NaN |
| 2 | TCGA-3C-AALJ-01 | TCGA-3C-AALJ | 0 | 1474.0 | 0.0 | 1474.0 | 0.0 | 1474.0 | 0 | 1474.0 | NaN |
| 3 | TCGA-3C-AALK-01 | TCGA-3C-AALK | 0 | 1448.0 | 0.0 | 1448.0 | NaN | NaN | 0 | 1448.0 | NaN |
| 4 | TCGA-4H-AAAK-01 | TCGA-4H-AAAK | 0 | 348.0 | 0.0 | 348.0 | 0.0 | 348.0 | 0 | 348.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1231 | TCGA-WT-AB44-01 | TCGA-WT-AB44 | 0 | 883.0 | 0.0 | 883.0 | 0.0 | 883.0 | 0 | 883.0 | NaN |
| 1232 | TCGA-XX-A899-01 | TCGA-XX-A899 | 0 | 467.0 | 0.0 | 467.0 | 0.0 | 467.0 | 0 | 467.0 | NaN |
| 1233 | TCGA-XX-A89A-01 | TCGA-XX-A89A | 0 | 488.0 | 0.0 | 488.0 | 0.0 | 488.0 | 0 | 488.0 | NaN |
| 1234 | TCGA-Z7-A8R5-01 | TCGA-Z7-A8R5 | 0 | 3287.0 | 0.0 | 3287.0 | NaN | NaN | 1 | 181.0 | NaN |
| 1235 | TCGA-Z7-A8R6-01 | TCGA-Z7-A8R6 | 0 | 3256.0 | 0.0 | 3256.0 | 0.0 | 3256.0 | 0 | 3256.0 | NaN |
1236 rows × 11 columns
we will use OS.time¶
In [5]:
TCGA_survival.sort_values("OS.time",ascending=False)
Out[5]:
| sample | _PATIENT | OS | OS.time | DSS | DSS.time | DFI | DFI.time | PFI | PFI.time | Redaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 508 | TCGA-B6-A0RU-01 | TCGA-B6-A0RU | 0 | 8605.0 | 0.0 | 8605.0 | 1.0 | 3076.0 | 1 | 3076.0 | NaN |
| 479 | TCGA-B6-A0I5-01 | TCGA-B6-A0I5 | 0 | 8556.0 | 0.0 | 8556.0 | 0.0 | 8556.0 | 0 | 8556.0 | NaN |
| 483 | TCGA-B6-A0IA-01 | TCGA-B6-A0IA | 0 | 8391.0 | 0.0 | 8391.0 | 0.0 | 8391.0 | 0 | 8391.0 | NaN |
| 502 | TCGA-B6-A0RN-01 | TCGA-B6-A0RN | 0 | 8008.0 | 0.0 | 8008.0 | 0.0 | 8008.0 | 0 | 8008.0 | NaN |
| 496 | TCGA-B6-A0RE-01 | TCGA-B6-A0RE | 0 | 7777.0 | 0.0 | 7777.0 | 0.0 | 7777.0 | 0 | 7777.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 770 | TCGA-C8-A12K-01 | TCGA-C8-A12K | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | NaN |
| 213 | TCGA-A8-A08H-01 | TCGA-A8-A08H | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | NaN |
| 202 | TCGA-A8-A081-01 | TCGA-A8-A081 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | NaN |
| 204 | TCGA-A8-A083-01 | TCGA-A8-A083 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | NaN |
| 1197 | TCGA-OL-A66H-01 | TCGA-OL-A66H | 0 | NaN | 0.0 | NaN | 0.0 | NaN | 0 | NaN | NaN |
1236 rows × 11 columns
In [ ]:
In [6]:
#here the column OS is the "overall survival events", 1 would mean the patient died, 0 mean the patient censored (meaning patient has not died)
#check how many died
num_deaths = TCGA_survival['OS'].sum()
print(f"Number of patients who died: {num_deaths}")
Number of patients who died: 202
In [7]:
#this is a better way to check
death_counts = TCGA_survival['OS'].value_counts()
print(death_counts)
OS 0 1034 1 202 Name: count, dtype: int64
In [8]:
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Use your dataframe
df = TCGA_survival.copy()
In [9]:
#Define time and event columns
time = df['OS.time'] # Time to event or censoring
event = df['OS'] # 1 = event (death), 0 = censored
In [10]:
#make sure the data do not have nan; and are numeric
print(df['OS.time'].isna().sum())
print(df['OS'].isna().sum())
print(df['OS.time'].dtype)
print(df['OS'].dtype)
1 0 float64 int64
In [11]:
#OS.time has one Na. otherwise they are of numeric
#drop one Na.
In [12]:
df.shape #check shape before drop
df = df.dropna(subset=['OS.time', 'OS'])
df.shape #check again
Out[12]:
(1235, 11)
In [13]:
#ready to make KaplanMeier plot using the BRCA survival OS events.
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
kmf = KaplanMeierFitter()
kmf.fit(df['OS.time'], df['OS'], label="Overall Survival")
plt.figure(figsize=(8, 6))
kmf.plot()
plt.title("Kaplan-Meier Survival Curve - TCGA BRCA")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.grid(True)
plt.tight_layout()
plt.show()
Do that again for your cancer of interest¶
In [14]:
## next section we will see how mutations on the frequently mutated genes PIK3CA, TP53 impact survival.
## so we will need to define PIK3CA mutation status for patient sample ID(example: TCGA-B6-A0RU-01).
# you will use this mutation files downloaded from Xena browser : mc3_BRCA_mc3.txt
In [15]:
#donwload the other public version, smaller file, 8.5MB after unzip it
#make sure you have the file under the path, then the following command will read it. If you did not unzip it, pd.read can still read it.
TCGA_BRCA_MC3_Public=pd.read_csv("mc3_BRCA_mc3.txt", sep='\t')
In [16]:
#take a look at the data
TCGA_BRCA_MC3_Public
Out[16]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
| 1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
| 2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
| 3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
| 4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
| 92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
| 92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
| 92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
| 92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [19]:
#we will filter tumors that harbor PIK3CA mutations.
pik = TCGA_BRCA_MC3_Public[
TCGA_BRCA_MC3_Public["gene"] == "PIK3CA"
]
pik
Out[19]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 979 | TCGA-3C-AALK-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.21 | deleterious(0.04) | probably_damaging(0.96) |
| 980 | TCGA-3C-AALK-01 | 3 | 178951957 | 178951957 | G | T | PIK3CA | Missense_Mutation | p.M1004I | 0.19 | deleterious(0.01) | benign(0.331) |
| 1215 | TCGA-5L-AAT0-01 | 3 | 178952085 | 178952085 | A | T | PIK3CA | Missense_Mutation | p.H1047L | 0.22 | tolerated(0.44) | benign(0.085) |
| 2631 | TCGA-5L-AAT1-01 | 3 | 178916876 | 178916876 | G | A | PIK3CA | Missense_Mutation | p.R88Q | 0.17 | tolerated(0.06) | probably_damaging(0.998) |
| 2632 | TCGA-5L-AAT1-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.18 | deleterious(0.04) | probably_damaging(0.96) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90859 | TCGA-S3-AA14-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.26 | deleterious(0.04) | probably_damaging(0.96) |
| 91557 | TCGA-WT-AB41-01 | 3 | 178917478 | 178917478 | G | A | PIK3CA | Missense_Mutation | p.G118D | 0.36 | tolerated(0.05) | possibly_damaging(0.704) |
| 91681 | TCGA-XX-A899-01 | 3 | 178916861 | 178916861 | T | C | PIK3CA | Missense_Mutation | p.F83S | 0.21 | deleterious(0.02) | benign(0.09) |
| 91870 | TCGA-XX-A89A-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.27 | deleterious(0.04) | probably_damaging(0.96) |
| 91992 | TCGA-Z7-A8R5-01 | 3 | 178952085 | 178952085 | A | G | PIK3CA | Missense_Mutation | p.H1047R | 0.16 | tolerated(0.11) | possibly_damaging(0.529) |
315 rows × 12 columns
In [23]:
# the gene name will tell us mutations observed on the gene. We will select only PIK3CA gene from that column, and identify non-silent
#mutations on that gene. We define nonsilent mutations as the following under effect
nonsilent = [
"Missense_Mutation",
"Nonsense_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins",
"Splice_Site",
"Nonstop_Mutation",
"In_Frame_Del",
"In_Frame_Ins"
]
pik = pik[pik["effect"].isin(nonsilent)]
pik
Out[23]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 979 | TCGA-3C-AALK-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.21 | deleterious(0.04) | probably_damaging(0.96) |
| 980 | TCGA-3C-AALK-01 | 3 | 178951957 | 178951957 | G | T | PIK3CA | Missense_Mutation | p.M1004I | 0.19 | deleterious(0.01) | benign(0.331) |
| 1215 | TCGA-5L-AAT0-01 | 3 | 178952085 | 178952085 | A | T | PIK3CA | Missense_Mutation | p.H1047L | 0.22 | tolerated(0.44) | benign(0.085) |
| 2631 | TCGA-5L-AAT1-01 | 3 | 178916876 | 178916876 | G | A | PIK3CA | Missense_Mutation | p.R88Q | 0.17 | tolerated(0.06) | probably_damaging(0.998) |
| 2632 | TCGA-5L-AAT1-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.18 | deleterious(0.04) | probably_damaging(0.96) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90859 | TCGA-S3-AA14-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.26 | deleterious(0.04) | probably_damaging(0.96) |
| 91557 | TCGA-WT-AB41-01 | 3 | 178917478 | 178917478 | G | A | PIK3CA | Missense_Mutation | p.G118D | 0.36 | tolerated(0.05) | possibly_damaging(0.704) |
| 91681 | TCGA-XX-A899-01 | 3 | 178916861 | 178916861 | T | C | PIK3CA | Missense_Mutation | p.F83S | 0.21 | deleterious(0.02) | benign(0.09) |
| 91870 | TCGA-XX-A89A-01 | 3 | 178936082 | 178936082 | G | A | PIK3CA | Missense_Mutation | p.E542K | 0.27 | deleterious(0.04) | probably_damaging(0.96) |
| 91992 | TCGA-Z7-A8R5-01 | 3 | 178952085 | 178952085 | A | G | PIK3CA | Missense_Mutation | p.H1047R | 0.16 | tolerated(0.11) | possibly_damaging(0.529) |
308 rows × 12 columns
In [30]:
#now extract patient ID from samples
pik_patient_ids = pik["sample"].astype(str).str[:12].dropna().unique()
pik_patient_ids
Out[30]:
array(['TCGA-3C-AALK', 'TCGA-5L-AAT0', 'TCGA-5L-AAT1', 'TCGA-A1-A0SI',
'TCGA-A2-A04N', 'TCGA-A2-A04W', 'TCGA-A2-A0CR', 'TCGA-A2-A0CS',
'TCGA-A2-A0CW', 'TCGA-A2-A0D3', 'TCGA-A2-A0EN', 'TCGA-A2-A0EW',
'TCGA-A2-A0SY', 'TCGA-A2-A0T4', 'TCGA-A2-A0T7', 'TCGA-A2-A0YC',
'TCGA-A2-A0YH', 'TCGA-A2-A0YI', 'TCGA-A2-A0YK', 'TCGA-A2-A0YL',
'TCGA-A2-A0YT', 'TCGA-A2-A1FZ', 'TCGA-A2-A1G0', 'TCGA-A2-A25A',
'TCGA-A2-A25C', 'TCGA-A2-A25D', 'TCGA-A2-A4S2', 'TCGA-A7-A0D9',
'TCGA-A7-A0DB', 'TCGA-A7-A13G', 'TCGA-A7-A26E', 'TCGA-A7-A26H',
'TCGA-A7-A5ZW', 'TCGA-A7-A5ZX', 'TCGA-A7-A6VX', 'TCGA-A8-A075',
'TCGA-AC-A23C', 'TCGA-AC-A23E', 'TCGA-AC-A23H', 'TCGA-AC-A2B8',
'TCGA-AC-A2FF', 'TCGA-AC-A2FO', 'TCGA-AC-A3OD', 'TCGA-AC-A3YJ',
'TCGA-AC-A5EH', 'TCGA-AC-A5XS', 'TCGA-AC-A6NO', 'TCGA-AC-A8OS',
'TCGA-AN-A0XL', 'TCGA-AN-A0XO', 'TCGA-AN-A0XP', 'TCGA-AN-A0XS',
'TCGA-AO-A03M', 'TCGA-AO-A03N', 'TCGA-AO-A125', 'TCGA-AO-A126',
'TCGA-AO-A12A', 'TCGA-AO-A1KR', 'TCGA-AQ-A0Y5', 'TCGA-AQ-A1H2',
'TCGA-AQ-A54O', 'TCGA-AQ-A7U7', 'TCGA-AR-A0TR', 'TCGA-AR-A0TZ',
'TCGA-AR-A1AL', 'TCGA-AR-A1AO', 'TCGA-AR-A1AS', 'TCGA-AR-A1AV',
'TCGA-AR-A1AW', 'TCGA-AR-A24K', 'TCGA-AR-A24M', 'TCGA-AR-A24O',
'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A255', 'TCGA-AR-A2LK',
'TCGA-AR-A2LM', 'TCGA-AR-A2LO', 'TCGA-AR-A5QM', 'TCGA-AR-A5QP',
'TCGA-AR-A5QQ', 'TCGA-B6-A0RH', 'TCGA-B6-A0RN', 'TCGA-B6-A0RO',
'TCGA-B6-A0RP', 'TCGA-B6-A0RQ', 'TCGA-B6-A0WW', 'TCGA-B6-A0WY',
'TCGA-B6-A0X0', 'TCGA-B6-A0X5', 'TCGA-B6-A0X7', 'TCGA-B6-A401',
'TCGA-B6-A40B', 'TCGA-BH-A0B0', 'TCGA-BH-A0B6', 'TCGA-BH-A0BA',
'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BJ', 'TCGA-BH-A0BM',
'TCGA-BH-A0BO', 'TCGA-BH-A0BQ', 'TCGA-BH-A0BT', 'TCGA-BH-A0DE',
'TCGA-BH-A0DK', 'TCGA-BH-A0DL', 'TCGA-BH-A0DO', 'TCGA-BH-A0DP',
'TCGA-BH-A0DT', 'TCGA-BH-A0DV', 'TCGA-BH-A0DX', 'TCGA-BH-A0EA',
'TCGA-BH-A0H3', 'TCGA-BH-A0H9', 'TCGA-BH-A0HA', 'TCGA-BH-A0HI',
'TCGA-BH-A0HN', 'TCGA-BH-A0W3', 'TCGA-BH-A0W5', 'TCGA-BH-A0W7',
'TCGA-BH-A18F', 'TCGA-BH-A18H', 'TCGA-BH-A18I', 'TCGA-BH-A18J',
'TCGA-BH-A1ET', 'TCGA-BH-A1EU', 'TCGA-BH-A1EY', 'TCGA-BH-A1F8',
'TCGA-BH-A1FE', 'TCGA-BH-A201', 'TCGA-BH-A202', 'TCGA-BH-A203',
'TCGA-BH-A208', 'TCGA-BH-A2L8', 'TCGA-BH-A42V', 'TCGA-BH-A5J0',
'TCGA-BH-A8FY', 'TCGA-C8-A12L', 'TCGA-C8-A12N', 'TCGA-C8-A12T',
'TCGA-C8-A12U', 'TCGA-C8-A12Y', 'TCGA-C8-A130', 'TCGA-C8-A131',
'TCGA-C8-A133', 'TCGA-C8-A1HE', 'TCGA-C8-A1HF', 'TCGA-C8-A26W',
'TCGA-C8-A26X', 'TCGA-C8-A274', 'TCGA-C8-A278', 'TCGA-C8-A3M7',
'TCGA-C8-A3M8', 'TCGA-C8-A8HQ', 'TCGA-D8-A143', 'TCGA-D8-A145',
'TCGA-D8-A146', 'TCGA-D8-A1J8', 'TCGA-D8-A1JD', 'TCGA-D8-A1JE',
'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JH', 'TCGA-D8-A1JJ',
'TCGA-D8-A1JK', 'TCGA-D8-A1JN', 'TCGA-D8-A1JP', 'TCGA-D8-A1JS',
'TCGA-D8-A1JU', 'TCGA-D8-A1XB', 'TCGA-D8-A1XL', 'TCGA-D8-A1XM',
'TCGA-D8-A1XO', 'TCGA-D8-A1XS', 'TCGA-D8-A1XY', 'TCGA-D8-A1Y1',
'TCGA-D8-A1Y2', 'TCGA-D8-A27G', 'TCGA-D8-A27K', 'TCGA-D8-A27L',
'TCGA-D8-A27P', 'TCGA-D8-A27T', 'TCGA-D8-A3Z6', 'TCGA-D8-A73U',
'TCGA-D8-A73X', 'TCGA-E2-A105', 'TCGA-E2-A108', 'TCGA-E2-A10B',
'TCGA-E2-A10C', 'TCGA-E2-A10F', 'TCGA-E2-A14V', 'TCGA-E2-A14Y',
'TCGA-E2-A153', 'TCGA-E2-A154', 'TCGA-E2-A156', 'TCGA-E2-A159',
'TCGA-E2-A15C', 'TCGA-E2-A15E', 'TCGA-E2-A15G', 'TCGA-E2-A15K',
'TCGA-E2-A15L', 'TCGA-E2-A15P', 'TCGA-E2-A1B4', 'TCGA-E2-A1BC',
'TCGA-E2-A1BD', 'TCGA-E2-A1IF', 'TCGA-E2-A1IL', 'TCGA-E2-A1IN',
'TCGA-E2-A1IO', 'TCGA-E2-A1L6', 'TCGA-E2-A1L8', 'TCGA-E2-A576',
'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1NG', 'TCGA-E9-A1NH',
'TCGA-E9-A1R0', 'TCGA-E9-A1R3', 'TCGA-E9-A1R4', 'TCGA-E9-A1R5',
'TCGA-E9-A1RA', 'TCGA-E9-A1RC', 'TCGA-E9-A1RD', 'TCGA-E9-A1RE',
'TCGA-E9-A1RH', 'TCGA-E9-A1RI', 'TCGA-E9-A226', 'TCGA-E9-A227',
'TCGA-E9-A229', 'TCGA-E9-A249', 'TCGA-E9-A295', 'TCGA-E9-A3X8',
'TCGA-EW-A1IW', 'TCGA-EW-A1IX', 'TCGA-EW-A1IZ', 'TCGA-EW-A1J1',
'TCGA-EW-A1J5', 'TCGA-EW-A1OV', 'TCGA-EW-A1P0', 'TCGA-EW-A1P5',
'TCGA-EW-A1PE', 'TCGA-EW-A2FV', 'TCGA-EW-A423', 'TCGA-EW-A6SC',
'TCGA-GM-A2D9', 'TCGA-GM-A2DD', 'TCGA-GM-A2DH', 'TCGA-GM-A2DI',
'TCGA-GM-A3NW', 'TCGA-HN-A2OB', 'TCGA-JL-A3YX', 'TCGA-LD-A66U',
'TCGA-LD-A74U', 'TCGA-LL-A50Y', 'TCGA-LL-A5YN', 'TCGA-LL-A6FP',
'TCGA-LL-A7T0', 'TCGA-LL-A9Q3', 'TCGA-OK-A5Q2', 'TCGA-OL-A5D6',
'TCGA-OL-A5DA', 'TCGA-OL-A5RX', 'TCGA-OL-A5S0', 'TCGA-OL-A66L',
'TCGA-OL-A6VQ', 'TCGA-PE-A5DD', 'TCGA-PE-A5DE', 'TCGA-S3-AA11',
'TCGA-S3-AA14', 'TCGA-WT-AB41', 'TCGA-XX-A899', 'TCGA-XX-A89A',
'TCGA-Z7-A8R5'], dtype=object)
In [31]:
len(pik_patient_ids)
Out[31]:
273
In [32]:
# we have 273 patients with PIK3C mutations. we make a dataframe to identify patient mutation status
import pandas as pd
pik_status = pd.DataFrame({"patient_id": pik_patient_ids})
pik_status["PIK3CA_mut"] = 1
pik_status.head()
Out[32]:
| patient_id | PIK3CA_mut | |
|---|---|---|
| 0 | TCGA-3C-AALK | 1 |
| 1 | TCGA-5L-AAT0 | 1 |
| 2 | TCGA-5L-AAT1 | 1 |
| 3 | TCGA-A1-A0SI | 1 |
| 4 | TCGA-A2-A04N | 1 |
In [33]:
#check TCGA survival
TCGA_survival
Out[33]:
| sample | _PATIENT | OS | OS.time | DSS | DSS.time | DFI | DFI.time | PFI | PFI.time | Redaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | TCGA-3C-AAAU | 0 | 4047.0 | 0.0 | 4047.0 | 1.0 | 1808.0 | 1 | 1808.0 | NaN |
| 1 | TCGA-3C-AALI-01 | TCGA-3C-AALI | 0 | 4005.0 | 0.0 | 4005.0 | 0.0 | 4005.0 | 0 | 4005.0 | NaN |
| 2 | TCGA-3C-AALJ-01 | TCGA-3C-AALJ | 0 | 1474.0 | 0.0 | 1474.0 | 0.0 | 1474.0 | 0 | 1474.0 | NaN |
| 3 | TCGA-3C-AALK-01 | TCGA-3C-AALK | 0 | 1448.0 | 0.0 | 1448.0 | NaN | NaN | 0 | 1448.0 | NaN |
| 4 | TCGA-4H-AAAK-01 | TCGA-4H-AAAK | 0 | 348.0 | 0.0 | 348.0 | 0.0 | 348.0 | 0 | 348.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1231 | TCGA-WT-AB44-01 | TCGA-WT-AB44 | 0 | 883.0 | 0.0 | 883.0 | 0.0 | 883.0 | 0 | 883.0 | NaN |
| 1232 | TCGA-XX-A899-01 | TCGA-XX-A899 | 0 | 467.0 | 0.0 | 467.0 | 0.0 | 467.0 | 0 | 467.0 | NaN |
| 1233 | TCGA-XX-A89A-01 | TCGA-XX-A89A | 0 | 488.0 | 0.0 | 488.0 | 0.0 | 488.0 | 0 | 488.0 | NaN |
| 1234 | TCGA-Z7-A8R5-01 | TCGA-Z7-A8R5 | 0 | 3287.0 | 0.0 | 3287.0 | NaN | NaN | 1 | 181.0 | NaN |
| 1235 | TCGA-Z7-A8R6-01 | TCGA-Z7-A8R6 | 0 | 3256.0 | 0.0 | 3256.0 | 0.0 | 3256.0 | 0 | 3256.0 | NaN |
1236 rows × 11 columns
In [55]:
#realized that the above surival table has duplicate patient ID. run the following
surv = TCGA_survival.rename(columns={"_PATIENT":"patient_id", "OS.time":"OS_time", "OS":"OS_event"}).copy()
print("rows in survival:", len(surv))
print("unique patients in survival:", surv["patient_id"].nunique())
rows in survival: 1236 unique patients in survival: 1097
In [56]:
#lets remove duplicate
surv = TCGA_survival.rename(columns={
"_PATIENT": "patient_id",
"OS.time": "OS_time",
"OS": "OS_event"
}).copy()
print("Before dedup:", len(surv), surv["patient_id"].nunique())
# Keep one record per patient
surv_unique = surv.drop_duplicates(subset="patient_id")
print("After dedup:", len(surv_unique), surv_unique["patient_id"].nunique())
Before dedup: 1236 1097 After dedup: 1097 1097
In [57]:
# prepare survival table
df = surv_unique.merge(pik_status, on="patient_id", how="left")
df["PIK3CA_mut"] = df["PIK3CA_mut"].fillna(0).astype(int)
print(df["PIK3CA_mut"].value_counts())
print("Unique mutated patients:",
df.loc[df["PIK3CA_mut"] == 1, "patient_id"].nunique())
PIK3CA_mut 0 824 1 273 Name: count, dtype: int64 Unique mutated patients: 273
In [58]:
df
Out[58]:
| sample | patient_id | OS_event | OS_time | DSS | DSS.time | DFI | DFI.time | PFI | PFI.time | Redaction | PIK3CA_mut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | TCGA-3C-AAAU | 0 | 4047.0 | 0.0 | 4047.0 | 1.0 | 1808.0 | 1 | 1808.0 | NaN | 0 |
| 1 | TCGA-3C-AALI-01 | TCGA-3C-AALI | 0 | 4005.0 | 0.0 | 4005.0 | 0.0 | 4005.0 | 0 | 4005.0 | NaN | 0 |
| 2 | TCGA-3C-AALJ-01 | TCGA-3C-AALJ | 0 | 1474.0 | 0.0 | 1474.0 | 0.0 | 1474.0 | 0 | 1474.0 | NaN | 0 |
| 3 | TCGA-3C-AALK-01 | TCGA-3C-AALK | 0 | 1448.0 | 0.0 | 1448.0 | NaN | NaN | 0 | 1448.0 | NaN | 1 |
| 4 | TCGA-4H-AAAK-01 | TCGA-4H-AAAK | 0 | 348.0 | 0.0 | 348.0 | 0.0 | 348.0 | 0 | 348.0 | NaN | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1092 | TCGA-WT-AB44-01 | TCGA-WT-AB44 | 0 | 883.0 | 0.0 | 883.0 | 0.0 | 883.0 | 0 | 883.0 | NaN | 0 |
| 1093 | TCGA-XX-A899-01 | TCGA-XX-A899 | 0 | 467.0 | 0.0 | 467.0 | 0.0 | 467.0 | 0 | 467.0 | NaN | 1 |
| 1094 | TCGA-XX-A89A-01 | TCGA-XX-A89A | 0 | 488.0 | 0.0 | 488.0 | 0.0 | 488.0 | 0 | 488.0 | NaN | 1 |
| 1095 | TCGA-Z7-A8R5-01 | TCGA-Z7-A8R5 | 0 | 3287.0 | 0.0 | 3287.0 | NaN | NaN | 1 | 181.0 | NaN | 1 |
| 1096 | TCGA-Z7-A8R6-01 | TCGA-Z7-A8R6 | 0 | 3256.0 | 0.0 | 3256.0 | 0.0 | 3256.0 | 0 | 3256.0 | NaN | 0 |
1097 rows × 12 columns
In [63]:
#still need to fix some problems in OS_time
df[["OS_time","OS_event","PIK3CA_mut"]].dtypes
df[["OS_time","OS_event","PIK3CA_mut"]].isna().sum()
df[["OS_time","OS_event"]]
Out[63]:
| OS_time | OS_event | |
|---|---|---|
| 0 | 4047.0 | 0 |
| 1 | 4005.0 | 0 |
| 2 | 1474.0 | 0 |
| 3 | 1448.0 | 0 |
| 4 | 348.0 | 0 |
| ... | ... | ... |
| 1092 | 883.0 | 0 |
| 1093 | 467.0 | 0 |
| 1094 | 488.0 | 0 |
| 1095 | 3287.0 | 0 |
| 1096 | 3256.0 | 0 |
1097 rows × 2 columns
In [64]:
import pandas as pd
df2 = df.copy()
df2["OS_time"] = pd.to_numeric(df2["OS_time"], errors="coerce")
df2["OS_event"] = pd.to_numeric(df2["OS_event"], errors="coerce")
# Keep only valid rows: time present and >=0, event is 0 or 1
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]
# Ensure integer event
df2["OS_event"] = df2["OS_event"].astype(int)
df2[["OS_time","OS_event"]].describe()
df2["OS_event"].value_counts()
Out[64]:
OS_event 0 945 1 151 Name: count, dtype: int64
In [65]:
#make kaplain-meier plot using df2
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
kmf = KaplanMeierFitter()
plt.figure(figsize=(7,5))
for val, label in [(1, "PIK3CA Mutated"), (0, "PIK3CA Wild-Type")]:
mask = df2["PIK3CA_mut"] == val
kmf.fit(df2.loc[mask, "OS_time"],
event_observed=df2.loc[mask, "OS_event"],
label=label)
kmf.plot_survival_function()
plt.title("TCGA BRCA Overall Survival: PIK3CA Mutation")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.show()
In [66]:
##calculate log-rank P
from lifelines.statistics import logrank_test
mut = df2["PIK3CA_mut"] == 1
wt = df2["PIK3CA_mut"] == 0
results = logrank_test(
df2.loc[mut, "OS_time"],
df2.loc[wt, "OS_time"],
event_observed_A=df2.loc[mut, "OS_event"],
event_observed_B=df2.loc[wt, "OS_event"]
)
print("Log-rank p-value:", results.p_value)
Log-rank p-value: 0.8706629249616884
In [68]:
#we will filter tumors that harbor TP53 mutations.
TP53 = TCGA_BRCA_MC3_Public[
TCGA_BRCA_MC3_Public["gene"] == "TP53"
]
TP53
Out[68]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 347 | TCGA-3C-AALI-01 | 17 | 7578382 | 7578382 | G | T | TP53 | Nonsense_Mutation | p.S183* | 0.65 | NaN | NaN |
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) |
| 4633 | TCGA-A1-A0SP-01 | 17 | 7578382 | 7578382 | G | C | TP53 | Nonsense_Mutation | p.S183* | 0.49 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) |
| 90551 | TCGA-S3-AA10-01 | 17 | 7579335 | 7579336 | - | C | TP53 | Frame_Shift_Ins | p.T118Dfs*31 | 0.33 | NaN | NaN |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) |
273 rows × 12 columns
In [69]:
#make sure TP53 mutations are non-silent
# the gene name will tell us mutations observed on the gene. We will select only PIK3CA gene from that column, and identify non-silent
#mutations on that gene. We define nonsilent mutations as the following under effect
nonsilent = [
"Missense_Mutation",
"Nonsense_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins",
"Splice_Site",
"Nonstop_Mutation",
"In_Frame_Del",
"In_Frame_Ins"
]
TP53 = TP53[TP53["effect"].isin(nonsilent)]
TP53
Out[69]:
| sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 347 | TCGA-3C-AALI-01 | 17 | 7578382 | 7578382 | G | T | TP53 | Nonsense_Mutation | p.S183* | 0.65 | NaN | NaN |
| 3791 | TCGA-A1-A0SI-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.30 | tolerated(0.11) | benign(0.308) |
| 4064 | TCGA-A1-A0SK-01 | 17 | 7578532 | 7578532 | A | T | TP53 | Missense_Mutation | p.M133K | 0.98 | deleterious(0) | benign(0.122) |
| 4354 | TCGA-A1-A0SO-01 | 17 | 7578190 | 7578190 | T | C | TP53 | Missense_Mutation | p.Y220C | 0.87 | deleterious(0) | probably_damaging(1) |
| 4633 | TCGA-A1-A0SP-01 | 17 | 7578382 | 7578382 | G | C | TP53 | Nonsense_Mutation | p.S183* | 0.49 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90161 | TCGA-PL-A8LZ-01 | 17 | 7578406 | 7578406 | C | T | TP53 | Missense_Mutation | p.R175H | 0.32 | tolerated(0.11) | benign(0.308) |
| 90450 | TCGA-S3-AA0Z-01 | 17 | 7578550 | 7578550 | G | T | TP53 | Missense_Mutation | p.S127Y | 0.93 | deleterious(0) | probably_damaging(1) |
| 90551 | TCGA-S3-AA10-01 | 17 | 7579335 | 7579336 | - | C | TP53 | Frame_Shift_Ins | p.T118Dfs*31 | 0.33 | NaN | NaN |
| 91294 | TCGA-UU-A93S-01 | 17 | 7578523 | 7578523 | T | G | TP53 | Missense_Mutation | p.Q136P | 0.62 | deleterious(0) | probably_damaging(1) |
| 92047 | TCGA-Z7-A8R6-01 | 17 | 7577085 | 7577085 | C | T | TP53 | Missense_Mutation | p.E285K | 0.88 | deleterious(0) | probably_damaging(0.985) |
269 rows × 12 columns
In [70]:
#now extract patient ID from samples
tp53_patient_ids = TP53["sample"].astype(str).str[:12].dropna().unique()
tp53_patient_ids
Out[70]:
array(['TCGA-3C-AALI', 'TCGA-A1-A0SI', 'TCGA-A1-A0SK', 'TCGA-A1-A0SO',
'TCGA-A1-A0SP', 'TCGA-A2-A04U', 'TCGA-A2-A04W', 'TCGA-A2-A0CL',
'TCGA-A2-A0CW', 'TCGA-A2-A0ST', 'TCGA-A2-A0SX', 'TCGA-A2-A0T1',
'TCGA-A2-A0T3', 'TCGA-A2-A0YE', 'TCGA-A2-A0YG', 'TCGA-A2-A0YH',
'TCGA-A2-A0YJ', 'TCGA-A2-A0YM', 'TCGA-A2-A0YT', 'TCGA-A2-A1G1',
'TCGA-A2-A3XT', 'TCGA-A2-A3XV', 'TCGA-A2-A3XX', 'TCGA-A2-A3Y0',
'TCGA-A2-A4S1', 'TCGA-A2-A4S3', 'TCGA-A7-A0DA', 'TCGA-A7-A13D',
'TCGA-A7-A13E', 'TCGA-A7-A26F', 'TCGA-A7-A26G', 'TCGA-A7-A26I',
'TCGA-A7-A2KD', 'TCGA-A7-A4SD', 'TCGA-A7-A4SE', 'TCGA-A7-A56D',
'TCGA-A7-A5ZV', 'TCGA-A7-A6VV', 'TCGA-A7-A6VW', 'TCGA-A7-A6VX',
'TCGA-A7-A6VY', 'TCGA-A8-A075', 'TCGA-AC-A23C', 'TCGA-AC-A2BK',
'TCGA-AC-A2QH', 'TCGA-AC-A3OD', 'TCGA-AC-A5EH', 'TCGA-AC-A5XU',
'TCGA-AC-A62X', 'TCGA-AC-A6IW', 'TCGA-AC-A7VC', 'TCGA-AC-A8OQ',
'TCGA-AN-A0XN', 'TCGA-AN-A0XT', 'TCGA-AN-A0XU', 'TCGA-AO-A03N',
'TCGA-AO-A03V', 'TCGA-AO-A124', 'TCGA-AO-A128', 'TCGA-AO-A129',
'TCGA-AO-A12D', 'TCGA-AO-A12G', 'TCGA-AQ-A04H', 'TCGA-AR-A0TP',
'TCGA-AR-A0TS', 'TCGA-AR-A0TV', 'TCGA-AR-A0TX', 'TCGA-AR-A0U0',
'TCGA-AR-A0U1', 'TCGA-AR-A0U2', 'TCGA-AR-A0U4', 'TCGA-AR-A1AH',
'TCGA-AR-A1AI', 'TCGA-AR-A1AJ', 'TCGA-AR-A1AN', 'TCGA-AR-A1AP',
'TCGA-AR-A1AQ', 'TCGA-AR-A1AR', 'TCGA-AR-A1AS', 'TCGA-AR-A1AW',
'TCGA-AR-A1AX', 'TCGA-AR-A1AY', 'TCGA-AR-A24K', 'TCGA-AR-A24P',
'TCGA-AR-A24Q', 'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A24U',
'TCGA-AR-A251', 'TCGA-AR-A254', 'TCGA-AR-A256', 'TCGA-AR-A2LH',
'TCGA-AR-A2LR', 'TCGA-AR-A5QQ', 'TCGA-B6-A0I1', 'TCGA-B6-A0RH',
'TCGA-B6-A0RU', 'TCGA-B6-A0WX', 'TCGA-B6-A0X1', 'TCGA-B6-A1KN',
'TCGA-B6-A401', 'TCGA-B6-A409', 'TCGA-BH-A0AV', 'TCGA-BH-A0B0',
'TCGA-BH-A0B7', 'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BG',
'TCGA-BH-A0BL', 'TCGA-BH-A0BP', 'TCGA-BH-A0BT', 'TCGA-BH-A0C0',
'TCGA-BH-A0C3', 'TCGA-BH-A0DI', 'TCGA-BH-A0DL', 'TCGA-BH-A0DZ',
'TCGA-BH-A0RX', 'TCGA-BH-A0WA', 'TCGA-BH-A18H', 'TCGA-BH-A18Q',
'TCGA-BH-A18T', 'TCGA-BH-A18U', 'TCGA-BH-A18V', 'TCGA-BH-A1EY',
'TCGA-BH-A1F0', 'TCGA-BH-A1F2', 'TCGA-BH-A1F6', 'TCGA-BH-A1FC',
'TCGA-BH-A1FE', 'TCGA-BH-A1FN', 'TCGA-BH-A1FU', 'TCGA-BH-A202',
'TCGA-BH-A203', 'TCGA-BH-A208', 'TCGA-BH-A5IZ', 'TCGA-BH-A5J0',
'TCGA-C8-A12K', 'TCGA-C8-A12L', 'TCGA-C8-A12O', 'TCGA-C8-A12P',
'TCGA-C8-A12Q', 'TCGA-C8-A12V', 'TCGA-C8-A12W', 'TCGA-C8-A12Z',
'TCGA-C8-A130', 'TCGA-C8-A131', 'TCGA-C8-A134', 'TCGA-C8-A135',
'TCGA-C8-A138', 'TCGA-C8-A1HF', 'TCGA-C8-A1HG', 'TCGA-C8-A1HJ',
'TCGA-C8-A1HK', 'TCGA-C8-A1HM', 'TCGA-C8-A26V', 'TCGA-C8-A26W',
'TCGA-C8-A26Y', 'TCGA-C8-A275', 'TCGA-C8-A278', 'TCGA-C8-A27A',
'TCGA-C8-A27B', 'TCGA-C8-A8HP', 'TCGA-D8-A13Y', 'TCGA-D8-A13Z',
'TCGA-D8-A142', 'TCGA-D8-A143', 'TCGA-D8-A147', 'TCGA-D8-A1J9',
'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JJ', 'TCGA-D8-A1JK',
'TCGA-D8-A1JL', 'TCGA-D8-A1JM', 'TCGA-D8-A1X5', 'TCGA-D8-A1XA',
'TCGA-D8-A1XL', 'TCGA-D8-A1XQ', 'TCGA-D8-A1XT', 'TCGA-D8-A1XW',
'TCGA-D8-A1XZ', 'TCGA-D8-A1Y3', 'TCGA-D8-A27F', 'TCGA-D8-A27M',
'TCGA-D8-A27N', 'TCGA-E2-A108', 'TCGA-E2-A109', 'TCGA-E2-A14N',
'TCGA-E2-A14P', 'TCGA-E2-A14R', 'TCGA-E2-A14X', 'TCGA-E2-A14Y',
'TCGA-E2-A14Z', 'TCGA-E2-A150', 'TCGA-E2-A152', 'TCGA-E2-A155',
'TCGA-E2-A158', 'TCGA-E2-A159', 'TCGA-E2-A15E', 'TCGA-E2-A15M',
'TCGA-E2-A1AZ', 'TCGA-E2-A1B0', 'TCGA-E2-A1B1', 'TCGA-E2-A1B6',
'TCGA-E2-A1II', 'TCGA-E2-A1IN', 'TCGA-E2-A1L7', 'TCGA-E2-A1LG',
'TCGA-E2-A1LH', 'TCGA-E2-A1LK', 'TCGA-E2-A573', 'TCGA-E2-A574',
'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1N8', 'TCGA-E9-A1N9',
'TCGA-E9-A1NF', 'TCGA-E9-A1RB', 'TCGA-E9-A1RH', 'TCGA-E9-A226',
'TCGA-E9-A22E', 'TCGA-E9-A22G', 'TCGA-E9-A243', 'TCGA-E9-A244',
'TCGA-E9-A248', 'TCGA-E9-A5FL', 'TCGA-E9-A6HE', 'TCGA-EW-A1OZ',
'TCGA-EW-A1P1', 'TCGA-EW-A1P4', 'TCGA-EW-A1P8', 'TCGA-EW-A1PA',
'TCGA-EW-A1PB', 'TCGA-EW-A1PH', 'TCGA-EW-A2FR', 'TCGA-EW-A6S9',
'TCGA-EW-A6SB', 'TCGA-EW-A6SD', 'TCGA-GM-A2DB', 'TCGA-GM-A2DD',
'TCGA-GM-A2DF', 'TCGA-GM-A2DH', 'TCGA-GM-A2DL', 'TCGA-GM-A3XL',
'TCGA-LD-A74U', 'TCGA-LD-A7W5', 'TCGA-LL-A5YO', 'TCGA-LL-A5YP',
'TCGA-LL-A6FR', 'TCGA-LL-A73Y', 'TCGA-LL-A7SZ', 'TCGA-LL-A8F5',
'TCGA-OL-A5D6', 'TCGA-OL-A5D7', 'TCGA-OL-A5RW', 'TCGA-OL-A66I',
'TCGA-OL-A6VO', 'TCGA-PE-A5DE', 'TCGA-PL-A8LV', 'TCGA-PL-A8LZ',
'TCGA-S3-AA0Z', 'TCGA-S3-AA10', 'TCGA-UU-A93S', 'TCGA-Z7-A8R6'],
dtype=object)
In [72]:
len(tp53_patient_ids)
Out[72]:
264
In [73]:
# we have 209 patients with TP53 mutations. we make a dataframe to identify patient mutation status
import pandas as pd
TP53_status = pd.DataFrame({"patient_id": tp53_patient_ids})
TP53_status["TP53_mut"] = 1
TP53_status
Out[73]:
| patient_id | TP53_mut | |
|---|---|---|
| 0 | TCGA-3C-AALI | 1 |
| 1 | TCGA-A1-A0SI | 1 |
| 2 | TCGA-A1-A0SK | 1 |
| 3 | TCGA-A1-A0SO | 1 |
| 4 | TCGA-A1-A0SP | 1 |
| ... | ... | ... |
| 259 | TCGA-PL-A8LZ | 1 |
| 260 | TCGA-S3-AA0Z | 1 |
| 261 | TCGA-S3-AA10 | 1 |
| 262 | TCGA-UU-A93S | 1 |
| 263 | TCGA-Z7-A8R6 | 1 |
264 rows × 2 columns
In [74]:
##merge with survial info
df_tp53 = surv_unique.merge(TP53_status, on="patient_id", how="left")
df_tp53["TP53_mut"] = df_tp53["TP53_mut"].fillna(0).astype(int)
df_tp53["TP53_mut"].value_counts()
Out[74]:
TP53_mut 0 833 1 264 Name: count, dtype: int64
In [75]:
df_tp53
Out[75]:
| sample | patient_id | OS_event | OS_time | DSS | DSS.time | DFI | DFI.time | PFI | PFI.time | Redaction | TP53_mut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCGA-3C-AAAU-01 | TCGA-3C-AAAU | 0 | 4047.0 | 0.0 | 4047.0 | 1.0 | 1808.0 | 1 | 1808.0 | NaN | 0 |
| 1 | TCGA-3C-AALI-01 | TCGA-3C-AALI | 0 | 4005.0 | 0.0 | 4005.0 | 0.0 | 4005.0 | 0 | 4005.0 | NaN | 1 |
| 2 | TCGA-3C-AALJ-01 | TCGA-3C-AALJ | 0 | 1474.0 | 0.0 | 1474.0 | 0.0 | 1474.0 | 0 | 1474.0 | NaN | 0 |
| 3 | TCGA-3C-AALK-01 | TCGA-3C-AALK | 0 | 1448.0 | 0.0 | 1448.0 | NaN | NaN | 0 | 1448.0 | NaN | 0 |
| 4 | TCGA-4H-AAAK-01 | TCGA-4H-AAAK | 0 | 348.0 | 0.0 | 348.0 | 0.0 | 348.0 | 0 | 348.0 | NaN | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1092 | TCGA-WT-AB44-01 | TCGA-WT-AB44 | 0 | 883.0 | 0.0 | 883.0 | 0.0 | 883.0 | 0 | 883.0 | NaN | 0 |
| 1093 | TCGA-XX-A899-01 | TCGA-XX-A899 | 0 | 467.0 | 0.0 | 467.0 | 0.0 | 467.0 | 0 | 467.0 | NaN | 0 |
| 1094 | TCGA-XX-A89A-01 | TCGA-XX-A89A | 0 | 488.0 | 0.0 | 488.0 | 0.0 | 488.0 | 0 | 488.0 | NaN | 0 |
| 1095 | TCGA-Z7-A8R5-01 | TCGA-Z7-A8R5 | 0 | 3287.0 | 0.0 | 3287.0 | NaN | NaN | 1 | 181.0 | NaN | 0 |
| 1096 | TCGA-Z7-A8R6-01 | TCGA-Z7-A8R6 | 0 | 3256.0 | 0.0 | 3256.0 | 0.0 | 3256.0 | 0 | 3256.0 | NaN | 1 |
1097 rows × 12 columns
In [76]:
##ready for kaplain-mieer plots
import pandas as pd
df2 = df_tp53.copy()
df2["OS_time"] = pd.to_numeric(df2["OS_time"], errors="coerce")
df2["OS_event"] = pd.to_numeric(df2["OS_event"], errors="coerce")
# Keep only valid rows: time present and >=0, event is 0 or 1
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]
# Ensure integer event
df2["OS_event"] = df2["OS_event"].astype(int)
df2[["OS_time","OS_event"]].describe()
df2["OS_event"].value_counts()
Out[76]:
OS_event 0 945 1 151 Name: count, dtype: int64
In [77]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
kmf = KaplanMeierFitter()
plt.figure(figsize=(7,5))
for val, label in [(1, "TP53 Mutated"), (0, "TP53 Wild-Type")]:
mask = df2["TP53_mut"] == val
kmf.fit(df2.loc[mask, "OS_time"],
event_observed=df2.loc[mask, "OS_event"],
label=label)
kmf.plot_survival_function()
plt.title("TCGA BRCA Overall Survival: TP53 Mutation")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.show()
In [79]:
##calculate log-rank P
from lifelines.statistics import logrank_test
mut = df2["TP53_mut"] == 1
wt = df2["TP53_mut"] == 0
results = logrank_test(
df2.loc[mut, "OS_time"],
df2.loc[wt, "OS_time"],
event_observed_A=df2.loc[mut, "OS_event"],
event_observed_B=df2.loc[wt, "OS_event"]
)
print("Log-rank p-value:", results.p_value)
Log-rank p-value: 0.767454702553489
TP53 mutation showed a little differnece in surival.¶
In [ ]: