In [1]:
import pandas as pd                                      
import numpy as np  
In [2]:
import sys
print(sys.executable)
/Users/yongmeiwang/opt/anaconda3/envs/py3.9/bin/python

need to download a file called survival data from Xena browser

In [3]:
TCGA_survival=pd.read_csv("survival_BRCA_survival.txt",sep="\t")
In [4]:
TCGA_survival
Out[4]:
sample _PATIENT OS OS.time DSS DSS.time DFI DFI.time PFI PFI.time Redaction
0 TCGA-3C-AAAU-01 TCGA-3C-AAAU 0 4047.0 0.0 4047.0 1.0 1808.0 1 1808.0 NaN
1 TCGA-3C-AALI-01 TCGA-3C-AALI 0 4005.0 0.0 4005.0 0.0 4005.0 0 4005.0 NaN
2 TCGA-3C-AALJ-01 TCGA-3C-AALJ 0 1474.0 0.0 1474.0 0.0 1474.0 0 1474.0 NaN
3 TCGA-3C-AALK-01 TCGA-3C-AALK 0 1448.0 0.0 1448.0 NaN NaN 0 1448.0 NaN
4 TCGA-4H-AAAK-01 TCGA-4H-AAAK 0 348.0 0.0 348.0 0.0 348.0 0 348.0 NaN
... ... ... ... ... ... ... ... ... ... ... ...
1231 TCGA-WT-AB44-01 TCGA-WT-AB44 0 883.0 0.0 883.0 0.0 883.0 0 883.0 NaN
1232 TCGA-XX-A899-01 TCGA-XX-A899 0 467.0 0.0 467.0 0.0 467.0 0 467.0 NaN
1233 TCGA-XX-A89A-01 TCGA-XX-A89A 0 488.0 0.0 488.0 0.0 488.0 0 488.0 NaN
1234 TCGA-Z7-A8R5-01 TCGA-Z7-A8R5 0 3287.0 0.0 3287.0 NaN NaN 1 181.0 NaN
1235 TCGA-Z7-A8R6-01 TCGA-Z7-A8R6 0 3256.0 0.0 3256.0 0.0 3256.0 0 3256.0 NaN

1236 rows × 11 columns

we will use OS.time¶

In [5]:
TCGA_survival.sort_values("OS.time",ascending=False)
Out[5]:
sample _PATIENT OS OS.time DSS DSS.time DFI DFI.time PFI PFI.time Redaction
508 TCGA-B6-A0RU-01 TCGA-B6-A0RU 0 8605.0 0.0 8605.0 1.0 3076.0 1 3076.0 NaN
479 TCGA-B6-A0I5-01 TCGA-B6-A0I5 0 8556.0 0.0 8556.0 0.0 8556.0 0 8556.0 NaN
483 TCGA-B6-A0IA-01 TCGA-B6-A0IA 0 8391.0 0.0 8391.0 0.0 8391.0 0 8391.0 NaN
502 TCGA-B6-A0RN-01 TCGA-B6-A0RN 0 8008.0 0.0 8008.0 0.0 8008.0 0 8008.0 NaN
496 TCGA-B6-A0RE-01 TCGA-B6-A0RE 0 7777.0 0.0 7777.0 0.0 7777.0 0 7777.0 NaN
... ... ... ... ... ... ... ... ... ... ... ...
770 TCGA-C8-A12K-01 TCGA-C8-A12K 0 0.0 0.0 0.0 0.0 0.0 0 0.0 NaN
213 TCGA-A8-A08H-01 TCGA-A8-A08H 0 0.0 0.0 0.0 0.0 0.0 0 0.0 NaN
202 TCGA-A8-A081-01 TCGA-A8-A081 0 0.0 0.0 0.0 0.0 0.0 0 0.0 NaN
204 TCGA-A8-A083-01 TCGA-A8-A083 0 0.0 0.0 0.0 0.0 0.0 0 0.0 NaN
1197 TCGA-OL-A66H-01 TCGA-OL-A66H 0 NaN 0.0 NaN 0.0 NaN 0 NaN NaN

1236 rows × 11 columns

In [ ]:
 
In [6]:
#here the column OS is the "overall survival events", 1 would mean the patient died, 0 mean the patient censored (meaning patient has not died)
#check how many died 
num_deaths = TCGA_survival['OS'].sum()
print(f"Number of patients who died: {num_deaths}")
Number of patients who died: 202
In [7]:
#this is a better way to check
death_counts = TCGA_survival['OS'].value_counts()
print(death_counts)
OS
0    1034
1     202
Name: count, dtype: int64

now lets make a kaplan meyer plot using Overall SUrival (OS). you need to install lifelines¶

pip3 install lifelines in another terminal¶

In [8]:
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Use your dataframe
df = TCGA_survival.copy()
In [9]:
#Define time and event columns
time = df['OS.time']      # Time to event or censoring
event = df['OS']          # 1 = event (death), 0 = censored
In [10]:
#make sure the data do not have nan; and are numeric
print(df['OS.time'].isna().sum())
print(df['OS'].isna().sum())
print(df['OS.time'].dtype)
print(df['OS'].dtype)
1
0
float64
int64
In [11]:
#OS.time has one Na. otherwise they are of numeric 
#drop one Na.
In [12]:
df.shape #check shape before drop
df = df.dropna(subset=['OS.time', 'OS'])
df.shape #check again
Out[12]:
(1235, 11)
In [13]:
#ready to make KaplanMeier plot using the BRCA survival OS events. 

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
kmf.fit(df['OS.time'], df['OS'], label="Overall Survival")

plt.figure(figsize=(8, 6))
kmf.plot()
plt.title("Kaplan-Meier Survival Curve - TCGA BRCA")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Do that again for your cancer of interest¶

In [14]:
## next section we will see how mutations on the frequently mutated genes PIK3CA, TP53 impact survival. 
## so we will need to define PIK3CA mutation status for patient sample ID(example: TCGA-B6-A0RU-01).
# you will use this mutation files downloaded from Xena browser : mc3_BRCA_mc3.txt
In [15]:
#donwload the other public version, smaller file, 8.5MB after unzip it
#make sure you have the file under the path, then the following command will read it. If you did not unzip it, pd.read can still read it. 

TCGA_BRCA_MC3_Public=pd.read_csv("mc3_BRCA_mc3.txt", sep='\t')
In [16]:
#take a look at the data
TCGA_BRCA_MC3_Public
Out[16]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [19]:
#we will filter tumors that harbor PIK3CA mutations.
pik = TCGA_BRCA_MC3_Public[
    TCGA_BRCA_MC3_Public["gene"] == "PIK3CA"
]
pik
Out[19]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
979 TCGA-3C-AALK-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.21 deleterious(0.04) probably_damaging(0.96)
980 TCGA-3C-AALK-01 3 178951957 178951957 G T PIK3CA Missense_Mutation p.M1004I 0.19 deleterious(0.01) benign(0.331)
1215 TCGA-5L-AAT0-01 3 178952085 178952085 A T PIK3CA Missense_Mutation p.H1047L 0.22 tolerated(0.44) benign(0.085)
2631 TCGA-5L-AAT1-01 3 178916876 178916876 G A PIK3CA Missense_Mutation p.R88Q 0.17 tolerated(0.06) probably_damaging(0.998)
2632 TCGA-5L-AAT1-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.18 deleterious(0.04) probably_damaging(0.96)
... ... ... ... ... ... ... ... ... ... ... ... ...
90859 TCGA-S3-AA14-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.26 deleterious(0.04) probably_damaging(0.96)
91557 TCGA-WT-AB41-01 3 178917478 178917478 G A PIK3CA Missense_Mutation p.G118D 0.36 tolerated(0.05) possibly_damaging(0.704)
91681 TCGA-XX-A899-01 3 178916861 178916861 T C PIK3CA Missense_Mutation p.F83S 0.21 deleterious(0.02) benign(0.09)
91870 TCGA-XX-A89A-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.27 deleterious(0.04) probably_damaging(0.96)
91992 TCGA-Z7-A8R5-01 3 178952085 178952085 A G PIK3CA Missense_Mutation p.H1047R 0.16 tolerated(0.11) possibly_damaging(0.529)

315 rows × 12 columns

In [23]:
# the gene name will tell us mutations observed on the gene. We will select only PIK3CA gene from that column, and identify non-silent 
#mutations on that gene. We define nonsilent mutations as the following under effect 
nonsilent = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Splice_Site",
    "Nonstop_Mutation",
    "In_Frame_Del",
    "In_Frame_Ins"
]

pik = pik[pik["effect"].isin(nonsilent)]
pik
Out[23]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
979 TCGA-3C-AALK-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.21 deleterious(0.04) probably_damaging(0.96)
980 TCGA-3C-AALK-01 3 178951957 178951957 G T PIK3CA Missense_Mutation p.M1004I 0.19 deleterious(0.01) benign(0.331)
1215 TCGA-5L-AAT0-01 3 178952085 178952085 A T PIK3CA Missense_Mutation p.H1047L 0.22 tolerated(0.44) benign(0.085)
2631 TCGA-5L-AAT1-01 3 178916876 178916876 G A PIK3CA Missense_Mutation p.R88Q 0.17 tolerated(0.06) probably_damaging(0.998)
2632 TCGA-5L-AAT1-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.18 deleterious(0.04) probably_damaging(0.96)
... ... ... ... ... ... ... ... ... ... ... ... ...
90859 TCGA-S3-AA14-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.26 deleterious(0.04) probably_damaging(0.96)
91557 TCGA-WT-AB41-01 3 178917478 178917478 G A PIK3CA Missense_Mutation p.G118D 0.36 tolerated(0.05) possibly_damaging(0.704)
91681 TCGA-XX-A899-01 3 178916861 178916861 T C PIK3CA Missense_Mutation p.F83S 0.21 deleterious(0.02) benign(0.09)
91870 TCGA-XX-A89A-01 3 178936082 178936082 G A PIK3CA Missense_Mutation p.E542K 0.27 deleterious(0.04) probably_damaging(0.96)
91992 TCGA-Z7-A8R5-01 3 178952085 178952085 A G PIK3CA Missense_Mutation p.H1047R 0.16 tolerated(0.11) possibly_damaging(0.529)

308 rows × 12 columns

In [30]:
#now extract patient ID from samples 

pik_patient_ids = pik["sample"].astype(str).str[:12].dropna().unique()
pik_patient_ids
Out[30]:
array(['TCGA-3C-AALK', 'TCGA-5L-AAT0', 'TCGA-5L-AAT1', 'TCGA-A1-A0SI',
       'TCGA-A2-A04N', 'TCGA-A2-A04W', 'TCGA-A2-A0CR', 'TCGA-A2-A0CS',
       'TCGA-A2-A0CW', 'TCGA-A2-A0D3', 'TCGA-A2-A0EN', 'TCGA-A2-A0EW',
       'TCGA-A2-A0SY', 'TCGA-A2-A0T4', 'TCGA-A2-A0T7', 'TCGA-A2-A0YC',
       'TCGA-A2-A0YH', 'TCGA-A2-A0YI', 'TCGA-A2-A0YK', 'TCGA-A2-A0YL',
       'TCGA-A2-A0YT', 'TCGA-A2-A1FZ', 'TCGA-A2-A1G0', 'TCGA-A2-A25A',
       'TCGA-A2-A25C', 'TCGA-A2-A25D', 'TCGA-A2-A4S2', 'TCGA-A7-A0D9',
       'TCGA-A7-A0DB', 'TCGA-A7-A13G', 'TCGA-A7-A26E', 'TCGA-A7-A26H',
       'TCGA-A7-A5ZW', 'TCGA-A7-A5ZX', 'TCGA-A7-A6VX', 'TCGA-A8-A075',
       'TCGA-AC-A23C', 'TCGA-AC-A23E', 'TCGA-AC-A23H', 'TCGA-AC-A2B8',
       'TCGA-AC-A2FF', 'TCGA-AC-A2FO', 'TCGA-AC-A3OD', 'TCGA-AC-A3YJ',
       'TCGA-AC-A5EH', 'TCGA-AC-A5XS', 'TCGA-AC-A6NO', 'TCGA-AC-A8OS',
       'TCGA-AN-A0XL', 'TCGA-AN-A0XO', 'TCGA-AN-A0XP', 'TCGA-AN-A0XS',
       'TCGA-AO-A03M', 'TCGA-AO-A03N', 'TCGA-AO-A125', 'TCGA-AO-A126',
       'TCGA-AO-A12A', 'TCGA-AO-A1KR', 'TCGA-AQ-A0Y5', 'TCGA-AQ-A1H2',
       'TCGA-AQ-A54O', 'TCGA-AQ-A7U7', 'TCGA-AR-A0TR', 'TCGA-AR-A0TZ',
       'TCGA-AR-A1AL', 'TCGA-AR-A1AO', 'TCGA-AR-A1AS', 'TCGA-AR-A1AV',
       'TCGA-AR-A1AW', 'TCGA-AR-A24K', 'TCGA-AR-A24M', 'TCGA-AR-A24O',
       'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A255', 'TCGA-AR-A2LK',
       'TCGA-AR-A2LM', 'TCGA-AR-A2LO', 'TCGA-AR-A5QM', 'TCGA-AR-A5QP',
       'TCGA-AR-A5QQ', 'TCGA-B6-A0RH', 'TCGA-B6-A0RN', 'TCGA-B6-A0RO',
       'TCGA-B6-A0RP', 'TCGA-B6-A0RQ', 'TCGA-B6-A0WW', 'TCGA-B6-A0WY',
       'TCGA-B6-A0X0', 'TCGA-B6-A0X5', 'TCGA-B6-A0X7', 'TCGA-B6-A401',
       'TCGA-B6-A40B', 'TCGA-BH-A0B0', 'TCGA-BH-A0B6', 'TCGA-BH-A0BA',
       'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BJ', 'TCGA-BH-A0BM',
       'TCGA-BH-A0BO', 'TCGA-BH-A0BQ', 'TCGA-BH-A0BT', 'TCGA-BH-A0DE',
       'TCGA-BH-A0DK', 'TCGA-BH-A0DL', 'TCGA-BH-A0DO', 'TCGA-BH-A0DP',
       'TCGA-BH-A0DT', 'TCGA-BH-A0DV', 'TCGA-BH-A0DX', 'TCGA-BH-A0EA',
       'TCGA-BH-A0H3', 'TCGA-BH-A0H9', 'TCGA-BH-A0HA', 'TCGA-BH-A0HI',
       'TCGA-BH-A0HN', 'TCGA-BH-A0W3', 'TCGA-BH-A0W5', 'TCGA-BH-A0W7',
       'TCGA-BH-A18F', 'TCGA-BH-A18H', 'TCGA-BH-A18I', 'TCGA-BH-A18J',
       'TCGA-BH-A1ET', 'TCGA-BH-A1EU', 'TCGA-BH-A1EY', 'TCGA-BH-A1F8',
       'TCGA-BH-A1FE', 'TCGA-BH-A201', 'TCGA-BH-A202', 'TCGA-BH-A203',
       'TCGA-BH-A208', 'TCGA-BH-A2L8', 'TCGA-BH-A42V', 'TCGA-BH-A5J0',
       'TCGA-BH-A8FY', 'TCGA-C8-A12L', 'TCGA-C8-A12N', 'TCGA-C8-A12T',
       'TCGA-C8-A12U', 'TCGA-C8-A12Y', 'TCGA-C8-A130', 'TCGA-C8-A131',
       'TCGA-C8-A133', 'TCGA-C8-A1HE', 'TCGA-C8-A1HF', 'TCGA-C8-A26W',
       'TCGA-C8-A26X', 'TCGA-C8-A274', 'TCGA-C8-A278', 'TCGA-C8-A3M7',
       'TCGA-C8-A3M8', 'TCGA-C8-A8HQ', 'TCGA-D8-A143', 'TCGA-D8-A145',
       'TCGA-D8-A146', 'TCGA-D8-A1J8', 'TCGA-D8-A1JD', 'TCGA-D8-A1JE',
       'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JH', 'TCGA-D8-A1JJ',
       'TCGA-D8-A1JK', 'TCGA-D8-A1JN', 'TCGA-D8-A1JP', 'TCGA-D8-A1JS',
       'TCGA-D8-A1JU', 'TCGA-D8-A1XB', 'TCGA-D8-A1XL', 'TCGA-D8-A1XM',
       'TCGA-D8-A1XO', 'TCGA-D8-A1XS', 'TCGA-D8-A1XY', 'TCGA-D8-A1Y1',
       'TCGA-D8-A1Y2', 'TCGA-D8-A27G', 'TCGA-D8-A27K', 'TCGA-D8-A27L',
       'TCGA-D8-A27P', 'TCGA-D8-A27T', 'TCGA-D8-A3Z6', 'TCGA-D8-A73U',
       'TCGA-D8-A73X', 'TCGA-E2-A105', 'TCGA-E2-A108', 'TCGA-E2-A10B',
       'TCGA-E2-A10C', 'TCGA-E2-A10F', 'TCGA-E2-A14V', 'TCGA-E2-A14Y',
       'TCGA-E2-A153', 'TCGA-E2-A154', 'TCGA-E2-A156', 'TCGA-E2-A159',
       'TCGA-E2-A15C', 'TCGA-E2-A15E', 'TCGA-E2-A15G', 'TCGA-E2-A15K',
       'TCGA-E2-A15L', 'TCGA-E2-A15P', 'TCGA-E2-A1B4', 'TCGA-E2-A1BC',
       'TCGA-E2-A1BD', 'TCGA-E2-A1IF', 'TCGA-E2-A1IL', 'TCGA-E2-A1IN',
       'TCGA-E2-A1IO', 'TCGA-E2-A1L6', 'TCGA-E2-A1L8', 'TCGA-E2-A576',
       'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1NG', 'TCGA-E9-A1NH',
       'TCGA-E9-A1R0', 'TCGA-E9-A1R3', 'TCGA-E9-A1R4', 'TCGA-E9-A1R5',
       'TCGA-E9-A1RA', 'TCGA-E9-A1RC', 'TCGA-E9-A1RD', 'TCGA-E9-A1RE',
       'TCGA-E9-A1RH', 'TCGA-E9-A1RI', 'TCGA-E9-A226', 'TCGA-E9-A227',
       'TCGA-E9-A229', 'TCGA-E9-A249', 'TCGA-E9-A295', 'TCGA-E9-A3X8',
       'TCGA-EW-A1IW', 'TCGA-EW-A1IX', 'TCGA-EW-A1IZ', 'TCGA-EW-A1J1',
       'TCGA-EW-A1J5', 'TCGA-EW-A1OV', 'TCGA-EW-A1P0', 'TCGA-EW-A1P5',
       'TCGA-EW-A1PE', 'TCGA-EW-A2FV', 'TCGA-EW-A423', 'TCGA-EW-A6SC',
       'TCGA-GM-A2D9', 'TCGA-GM-A2DD', 'TCGA-GM-A2DH', 'TCGA-GM-A2DI',
       'TCGA-GM-A3NW', 'TCGA-HN-A2OB', 'TCGA-JL-A3YX', 'TCGA-LD-A66U',
       'TCGA-LD-A74U', 'TCGA-LL-A50Y', 'TCGA-LL-A5YN', 'TCGA-LL-A6FP',
       'TCGA-LL-A7T0', 'TCGA-LL-A9Q3', 'TCGA-OK-A5Q2', 'TCGA-OL-A5D6',
       'TCGA-OL-A5DA', 'TCGA-OL-A5RX', 'TCGA-OL-A5S0', 'TCGA-OL-A66L',
       'TCGA-OL-A6VQ', 'TCGA-PE-A5DD', 'TCGA-PE-A5DE', 'TCGA-S3-AA11',
       'TCGA-S3-AA14', 'TCGA-WT-AB41', 'TCGA-XX-A899', 'TCGA-XX-A89A',
       'TCGA-Z7-A8R5'], dtype=object)
In [31]:
len(pik_patient_ids)
Out[31]:
273
In [32]:
# we have 273 patients with PIK3C mutations. we make a dataframe to identify patient mutation status
import pandas as pd
pik_status = pd.DataFrame({"patient_id": pik_patient_ids})
pik_status["PIK3CA_mut"] = 1
pik_status.head()
Out[32]:
patient_id PIK3CA_mut
0 TCGA-3C-AALK 1
1 TCGA-5L-AAT0 1
2 TCGA-5L-AAT1 1
3 TCGA-A1-A0SI 1
4 TCGA-A2-A04N 1
In [33]:
#check TCGA survival
TCGA_survival
Out[33]:
sample _PATIENT OS OS.time DSS DSS.time DFI DFI.time PFI PFI.time Redaction
0 TCGA-3C-AAAU-01 TCGA-3C-AAAU 0 4047.0 0.0 4047.0 1.0 1808.0 1 1808.0 NaN
1 TCGA-3C-AALI-01 TCGA-3C-AALI 0 4005.0 0.0 4005.0 0.0 4005.0 0 4005.0 NaN
2 TCGA-3C-AALJ-01 TCGA-3C-AALJ 0 1474.0 0.0 1474.0 0.0 1474.0 0 1474.0 NaN
3 TCGA-3C-AALK-01 TCGA-3C-AALK 0 1448.0 0.0 1448.0 NaN NaN 0 1448.0 NaN
4 TCGA-4H-AAAK-01 TCGA-4H-AAAK 0 348.0 0.0 348.0 0.0 348.0 0 348.0 NaN
... ... ... ... ... ... ... ... ... ... ... ...
1231 TCGA-WT-AB44-01 TCGA-WT-AB44 0 883.0 0.0 883.0 0.0 883.0 0 883.0 NaN
1232 TCGA-XX-A899-01 TCGA-XX-A899 0 467.0 0.0 467.0 0.0 467.0 0 467.0 NaN
1233 TCGA-XX-A89A-01 TCGA-XX-A89A 0 488.0 0.0 488.0 0.0 488.0 0 488.0 NaN
1234 TCGA-Z7-A8R5-01 TCGA-Z7-A8R5 0 3287.0 0.0 3287.0 NaN NaN 1 181.0 NaN
1235 TCGA-Z7-A8R6-01 TCGA-Z7-A8R6 0 3256.0 0.0 3256.0 0.0 3256.0 0 3256.0 NaN

1236 rows × 11 columns

In [55]:
#realized that the above surival table has duplicate patient ID. run the following
surv = TCGA_survival.rename(columns={"_PATIENT":"patient_id", "OS.time":"OS_time", "OS":"OS_event"}).copy()

print("rows in survival:", len(surv))
print("unique patients in survival:", surv["patient_id"].nunique())
rows in survival: 1236
unique patients in survival: 1097
In [56]:
#lets remove duplicate
surv = TCGA_survival.rename(columns={
    "_PATIENT": "patient_id",
    "OS.time": "OS_time",
    "OS": "OS_event"
}).copy()

print("Before dedup:", len(surv), surv["patient_id"].nunique())

# Keep one record per patient
surv_unique = surv.drop_duplicates(subset="patient_id")

print("After dedup:", len(surv_unique), surv_unique["patient_id"].nunique())
Before dedup: 1236 1097
After dedup: 1097 1097
In [57]:
# prepare survival table
df = surv_unique.merge(pik_status, on="patient_id", how="left")
df["PIK3CA_mut"] = df["PIK3CA_mut"].fillna(0).astype(int)

print(df["PIK3CA_mut"].value_counts())
print("Unique mutated patients:",
      df.loc[df["PIK3CA_mut"] == 1, "patient_id"].nunique())
PIK3CA_mut
0    824
1    273
Name: count, dtype: int64
Unique mutated patients: 273
In [58]:
df
Out[58]:
sample patient_id OS_event OS_time DSS DSS.time DFI DFI.time PFI PFI.time Redaction PIK3CA_mut
0 TCGA-3C-AAAU-01 TCGA-3C-AAAU 0 4047.0 0.0 4047.0 1.0 1808.0 1 1808.0 NaN 0
1 TCGA-3C-AALI-01 TCGA-3C-AALI 0 4005.0 0.0 4005.0 0.0 4005.0 0 4005.0 NaN 0
2 TCGA-3C-AALJ-01 TCGA-3C-AALJ 0 1474.0 0.0 1474.0 0.0 1474.0 0 1474.0 NaN 0
3 TCGA-3C-AALK-01 TCGA-3C-AALK 0 1448.0 0.0 1448.0 NaN NaN 0 1448.0 NaN 1
4 TCGA-4H-AAAK-01 TCGA-4H-AAAK 0 348.0 0.0 348.0 0.0 348.0 0 348.0 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ...
1092 TCGA-WT-AB44-01 TCGA-WT-AB44 0 883.0 0.0 883.0 0.0 883.0 0 883.0 NaN 0
1093 TCGA-XX-A899-01 TCGA-XX-A899 0 467.0 0.0 467.0 0.0 467.0 0 467.0 NaN 1
1094 TCGA-XX-A89A-01 TCGA-XX-A89A 0 488.0 0.0 488.0 0.0 488.0 0 488.0 NaN 1
1095 TCGA-Z7-A8R5-01 TCGA-Z7-A8R5 0 3287.0 0.0 3287.0 NaN NaN 1 181.0 NaN 1
1096 TCGA-Z7-A8R6-01 TCGA-Z7-A8R6 0 3256.0 0.0 3256.0 0.0 3256.0 0 3256.0 NaN 0

1097 rows × 12 columns

In [63]:
#still need to fix some problems in OS_time
df[["OS_time","OS_event","PIK3CA_mut"]].dtypes
df[["OS_time","OS_event","PIK3CA_mut"]].isna().sum()
df[["OS_time","OS_event"]]
Out[63]:
OS_time OS_event
0 4047.0 0
1 4005.0 0
2 1474.0 0
3 1448.0 0
4 348.0 0
... ... ...
1092 883.0 0
1093 467.0 0
1094 488.0 0
1095 3287.0 0
1096 3256.0 0

1097 rows × 2 columns

In [64]:
import pandas as pd

df2 = df.copy()

df2["OS_time"] = pd.to_numeric(df2["OS_time"], errors="coerce")
df2["OS_event"] = pd.to_numeric(df2["OS_event"], errors="coerce")

# Keep only valid rows: time present and >=0, event is 0 or 1
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]

# Ensure integer event
df2["OS_event"] = df2["OS_event"].astype(int)

df2[["OS_time","OS_event"]].describe()
df2["OS_event"].value_counts()
Out[64]:
OS_event
0    945
1    151
Name: count, dtype: int64
In [65]:
#make kaplain-meier plot using df2
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
plt.figure(figsize=(7,5))

for val, label in [(1, "PIK3CA Mutated"), (0, "PIK3CA Wild-Type")]:
    mask = df2["PIK3CA_mut"] == val
    kmf.fit(df2.loc[mask, "OS_time"],
            event_observed=df2.loc[mask, "OS_event"],
            label=label)
    kmf.plot_survival_function()

plt.title("TCGA BRCA Overall Survival: PIK3CA Mutation")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.show()
No description has been provided for this image
In [66]:
##calculate log-rank P
from lifelines.statistics import logrank_test

mut = df2["PIK3CA_mut"] == 1
wt = df2["PIK3CA_mut"] == 0

results = logrank_test(
    df2.loc[mut, "OS_time"],
    df2.loc[wt, "OS_time"],
    event_observed_A=df2.loc[mut, "OS_event"],
    event_observed_B=df2.loc[wt, "OS_event"]
)

print("Log-rank p-value:", results.p_value)
Log-rank p-value: 0.8706629249616884

Conclusion: PIK3CA mutation does not drive worse surival.¶

lets redo this analysis using TP53¶

In [68]:
#we will filter tumors that harbor TP53 mutations.
TP53 = TCGA_BRCA_MC3_Public[
    TCGA_BRCA_MC3_Public["gene"] == "TP53"
]
TP53
Out[68]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53 Nonsense_Mutation p.S183* 0.65 NaN NaN
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308)
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122)
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1)
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53 Nonsense_Mutation p.S183* 0.49 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308)
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1)
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN NaN
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1)
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985)

273 rows × 12 columns

In [69]:
#make sure TP53 mutations are non-silent
# the gene name will tell us mutations observed on the gene. We will select only PIK3CA gene from that column, and identify non-silent 
#mutations on that gene. We define nonsilent mutations as the following under effect 
nonsilent = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Splice_Site",
    "Nonstop_Mutation",
    "In_Frame_Del",
    "In_Frame_Ins"
]

TP53 = TP53[TP53["effect"].isin(nonsilent)]
TP53
Out[69]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
347 TCGA-3C-AALI-01 17 7578382 7578382 G T TP53 Nonsense_Mutation p.S183* 0.65 NaN NaN
3791 TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308)
4064 TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122)
4354 TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1)
4633 TCGA-A1-A0SP-01 17 7578382 7578382 G C TP53 Nonsense_Mutation p.S183* 0.49 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
90161 TCGA-PL-A8LZ-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.32 tolerated(0.11) benign(0.308)
90450 TCGA-S3-AA0Z-01 17 7578550 7578550 G T TP53 Missense_Mutation p.S127Y 0.93 deleterious(0) probably_damaging(1)
90551 TCGA-S3-AA10-01 17 7579335 7579336 - C TP53 Frame_Shift_Ins p.T118Dfs*31 0.33 NaN NaN
91294 TCGA-UU-A93S-01 17 7578523 7578523 T G TP53 Missense_Mutation p.Q136P 0.62 deleterious(0) probably_damaging(1)
92047 TCGA-Z7-A8R6-01 17 7577085 7577085 C T TP53 Missense_Mutation p.E285K 0.88 deleterious(0) probably_damaging(0.985)

269 rows × 12 columns

In [70]:
#now extract patient ID from samples 

tp53_patient_ids = TP53["sample"].astype(str).str[:12].dropna().unique()
tp53_patient_ids
Out[70]:
array(['TCGA-3C-AALI', 'TCGA-A1-A0SI', 'TCGA-A1-A0SK', 'TCGA-A1-A0SO',
       'TCGA-A1-A0SP', 'TCGA-A2-A04U', 'TCGA-A2-A04W', 'TCGA-A2-A0CL',
       'TCGA-A2-A0CW', 'TCGA-A2-A0ST', 'TCGA-A2-A0SX', 'TCGA-A2-A0T1',
       'TCGA-A2-A0T3', 'TCGA-A2-A0YE', 'TCGA-A2-A0YG', 'TCGA-A2-A0YH',
       'TCGA-A2-A0YJ', 'TCGA-A2-A0YM', 'TCGA-A2-A0YT', 'TCGA-A2-A1G1',
       'TCGA-A2-A3XT', 'TCGA-A2-A3XV', 'TCGA-A2-A3XX', 'TCGA-A2-A3Y0',
       'TCGA-A2-A4S1', 'TCGA-A2-A4S3', 'TCGA-A7-A0DA', 'TCGA-A7-A13D',
       'TCGA-A7-A13E', 'TCGA-A7-A26F', 'TCGA-A7-A26G', 'TCGA-A7-A26I',
       'TCGA-A7-A2KD', 'TCGA-A7-A4SD', 'TCGA-A7-A4SE', 'TCGA-A7-A56D',
       'TCGA-A7-A5ZV', 'TCGA-A7-A6VV', 'TCGA-A7-A6VW', 'TCGA-A7-A6VX',
       'TCGA-A7-A6VY', 'TCGA-A8-A075', 'TCGA-AC-A23C', 'TCGA-AC-A2BK',
       'TCGA-AC-A2QH', 'TCGA-AC-A3OD', 'TCGA-AC-A5EH', 'TCGA-AC-A5XU',
       'TCGA-AC-A62X', 'TCGA-AC-A6IW', 'TCGA-AC-A7VC', 'TCGA-AC-A8OQ',
       'TCGA-AN-A0XN', 'TCGA-AN-A0XT', 'TCGA-AN-A0XU', 'TCGA-AO-A03N',
       'TCGA-AO-A03V', 'TCGA-AO-A124', 'TCGA-AO-A128', 'TCGA-AO-A129',
       'TCGA-AO-A12D', 'TCGA-AO-A12G', 'TCGA-AQ-A04H', 'TCGA-AR-A0TP',
       'TCGA-AR-A0TS', 'TCGA-AR-A0TV', 'TCGA-AR-A0TX', 'TCGA-AR-A0U0',
       'TCGA-AR-A0U1', 'TCGA-AR-A0U2', 'TCGA-AR-A0U4', 'TCGA-AR-A1AH',
       'TCGA-AR-A1AI', 'TCGA-AR-A1AJ', 'TCGA-AR-A1AN', 'TCGA-AR-A1AP',
       'TCGA-AR-A1AQ', 'TCGA-AR-A1AR', 'TCGA-AR-A1AS', 'TCGA-AR-A1AW',
       'TCGA-AR-A1AX', 'TCGA-AR-A1AY', 'TCGA-AR-A24K', 'TCGA-AR-A24P',
       'TCGA-AR-A24Q', 'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A24U',
       'TCGA-AR-A251', 'TCGA-AR-A254', 'TCGA-AR-A256', 'TCGA-AR-A2LH',
       'TCGA-AR-A2LR', 'TCGA-AR-A5QQ', 'TCGA-B6-A0I1', 'TCGA-B6-A0RH',
       'TCGA-B6-A0RU', 'TCGA-B6-A0WX', 'TCGA-B6-A0X1', 'TCGA-B6-A1KN',
       'TCGA-B6-A401', 'TCGA-B6-A409', 'TCGA-BH-A0AV', 'TCGA-BH-A0B0',
       'TCGA-BH-A0B7', 'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BG',
       'TCGA-BH-A0BL', 'TCGA-BH-A0BP', 'TCGA-BH-A0BT', 'TCGA-BH-A0C0',
       'TCGA-BH-A0C3', 'TCGA-BH-A0DI', 'TCGA-BH-A0DL', 'TCGA-BH-A0DZ',
       'TCGA-BH-A0RX', 'TCGA-BH-A0WA', 'TCGA-BH-A18H', 'TCGA-BH-A18Q',
       'TCGA-BH-A18T', 'TCGA-BH-A18U', 'TCGA-BH-A18V', 'TCGA-BH-A1EY',
       'TCGA-BH-A1F0', 'TCGA-BH-A1F2', 'TCGA-BH-A1F6', 'TCGA-BH-A1FC',
       'TCGA-BH-A1FE', 'TCGA-BH-A1FN', 'TCGA-BH-A1FU', 'TCGA-BH-A202',
       'TCGA-BH-A203', 'TCGA-BH-A208', 'TCGA-BH-A5IZ', 'TCGA-BH-A5J0',
       'TCGA-C8-A12K', 'TCGA-C8-A12L', 'TCGA-C8-A12O', 'TCGA-C8-A12P',
       'TCGA-C8-A12Q', 'TCGA-C8-A12V', 'TCGA-C8-A12W', 'TCGA-C8-A12Z',
       'TCGA-C8-A130', 'TCGA-C8-A131', 'TCGA-C8-A134', 'TCGA-C8-A135',
       'TCGA-C8-A138', 'TCGA-C8-A1HF', 'TCGA-C8-A1HG', 'TCGA-C8-A1HJ',
       'TCGA-C8-A1HK', 'TCGA-C8-A1HM', 'TCGA-C8-A26V', 'TCGA-C8-A26W',
       'TCGA-C8-A26Y', 'TCGA-C8-A275', 'TCGA-C8-A278', 'TCGA-C8-A27A',
       'TCGA-C8-A27B', 'TCGA-C8-A8HP', 'TCGA-D8-A13Y', 'TCGA-D8-A13Z',
       'TCGA-D8-A142', 'TCGA-D8-A143', 'TCGA-D8-A147', 'TCGA-D8-A1J9',
       'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JJ', 'TCGA-D8-A1JK',
       'TCGA-D8-A1JL', 'TCGA-D8-A1JM', 'TCGA-D8-A1X5', 'TCGA-D8-A1XA',
       'TCGA-D8-A1XL', 'TCGA-D8-A1XQ', 'TCGA-D8-A1XT', 'TCGA-D8-A1XW',
       'TCGA-D8-A1XZ', 'TCGA-D8-A1Y3', 'TCGA-D8-A27F', 'TCGA-D8-A27M',
       'TCGA-D8-A27N', 'TCGA-E2-A108', 'TCGA-E2-A109', 'TCGA-E2-A14N',
       'TCGA-E2-A14P', 'TCGA-E2-A14R', 'TCGA-E2-A14X', 'TCGA-E2-A14Y',
       'TCGA-E2-A14Z', 'TCGA-E2-A150', 'TCGA-E2-A152', 'TCGA-E2-A155',
       'TCGA-E2-A158', 'TCGA-E2-A159', 'TCGA-E2-A15E', 'TCGA-E2-A15M',
       'TCGA-E2-A1AZ', 'TCGA-E2-A1B0', 'TCGA-E2-A1B1', 'TCGA-E2-A1B6',
       'TCGA-E2-A1II', 'TCGA-E2-A1IN', 'TCGA-E2-A1L7', 'TCGA-E2-A1LG',
       'TCGA-E2-A1LH', 'TCGA-E2-A1LK', 'TCGA-E2-A573', 'TCGA-E2-A574',
       'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1N8', 'TCGA-E9-A1N9',
       'TCGA-E9-A1NF', 'TCGA-E9-A1RB', 'TCGA-E9-A1RH', 'TCGA-E9-A226',
       'TCGA-E9-A22E', 'TCGA-E9-A22G', 'TCGA-E9-A243', 'TCGA-E9-A244',
       'TCGA-E9-A248', 'TCGA-E9-A5FL', 'TCGA-E9-A6HE', 'TCGA-EW-A1OZ',
       'TCGA-EW-A1P1', 'TCGA-EW-A1P4', 'TCGA-EW-A1P8', 'TCGA-EW-A1PA',
       'TCGA-EW-A1PB', 'TCGA-EW-A1PH', 'TCGA-EW-A2FR', 'TCGA-EW-A6S9',
       'TCGA-EW-A6SB', 'TCGA-EW-A6SD', 'TCGA-GM-A2DB', 'TCGA-GM-A2DD',
       'TCGA-GM-A2DF', 'TCGA-GM-A2DH', 'TCGA-GM-A2DL', 'TCGA-GM-A3XL',
       'TCGA-LD-A74U', 'TCGA-LD-A7W5', 'TCGA-LL-A5YO', 'TCGA-LL-A5YP',
       'TCGA-LL-A6FR', 'TCGA-LL-A73Y', 'TCGA-LL-A7SZ', 'TCGA-LL-A8F5',
       'TCGA-OL-A5D6', 'TCGA-OL-A5D7', 'TCGA-OL-A5RW', 'TCGA-OL-A66I',
       'TCGA-OL-A6VO', 'TCGA-PE-A5DE', 'TCGA-PL-A8LV', 'TCGA-PL-A8LZ',
       'TCGA-S3-AA0Z', 'TCGA-S3-AA10', 'TCGA-UU-A93S', 'TCGA-Z7-A8R6'],
      dtype=object)
In [72]:
len(tp53_patient_ids)
Out[72]:
264
In [73]:
# we have 209 patients with TP53 mutations. we make a dataframe to identify patient mutation status
import pandas as pd
TP53_status = pd.DataFrame({"patient_id": tp53_patient_ids})
TP53_status["TP53_mut"] = 1
TP53_status
Out[73]:
patient_id TP53_mut
0 TCGA-3C-AALI 1
1 TCGA-A1-A0SI 1
2 TCGA-A1-A0SK 1
3 TCGA-A1-A0SO 1
4 TCGA-A1-A0SP 1
... ... ...
259 TCGA-PL-A8LZ 1
260 TCGA-S3-AA0Z 1
261 TCGA-S3-AA10 1
262 TCGA-UU-A93S 1
263 TCGA-Z7-A8R6 1

264 rows × 2 columns

In [74]:
##merge with survial info
df_tp53 = surv_unique.merge(TP53_status, on="patient_id", how="left")

df_tp53["TP53_mut"] = df_tp53["TP53_mut"].fillna(0).astype(int)

df_tp53["TP53_mut"].value_counts()
Out[74]:
TP53_mut
0    833
1    264
Name: count, dtype: int64
In [75]:
df_tp53
Out[75]:
sample patient_id OS_event OS_time DSS DSS.time DFI DFI.time PFI PFI.time Redaction TP53_mut
0 TCGA-3C-AAAU-01 TCGA-3C-AAAU 0 4047.0 0.0 4047.0 1.0 1808.0 1 1808.0 NaN 0
1 TCGA-3C-AALI-01 TCGA-3C-AALI 0 4005.0 0.0 4005.0 0.0 4005.0 0 4005.0 NaN 1
2 TCGA-3C-AALJ-01 TCGA-3C-AALJ 0 1474.0 0.0 1474.0 0.0 1474.0 0 1474.0 NaN 0
3 TCGA-3C-AALK-01 TCGA-3C-AALK 0 1448.0 0.0 1448.0 NaN NaN 0 1448.0 NaN 0
4 TCGA-4H-AAAK-01 TCGA-4H-AAAK 0 348.0 0.0 348.0 0.0 348.0 0 348.0 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ...
1092 TCGA-WT-AB44-01 TCGA-WT-AB44 0 883.0 0.0 883.0 0.0 883.0 0 883.0 NaN 0
1093 TCGA-XX-A899-01 TCGA-XX-A899 0 467.0 0.0 467.0 0.0 467.0 0 467.0 NaN 0
1094 TCGA-XX-A89A-01 TCGA-XX-A89A 0 488.0 0.0 488.0 0.0 488.0 0 488.0 NaN 0
1095 TCGA-Z7-A8R5-01 TCGA-Z7-A8R5 0 3287.0 0.0 3287.0 NaN NaN 1 181.0 NaN 0
1096 TCGA-Z7-A8R6-01 TCGA-Z7-A8R6 0 3256.0 0.0 3256.0 0.0 3256.0 0 3256.0 NaN 1

1097 rows × 12 columns

In [76]:
##ready for kaplain-mieer plots
import pandas as pd

df2 = df_tp53.copy()

df2["OS_time"] = pd.to_numeric(df2["OS_time"], errors="coerce")
df2["OS_event"] = pd.to_numeric(df2["OS_event"], errors="coerce")

# Keep only valid rows: time present and >=0, event is 0 or 1
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]

# Ensure integer event
df2["OS_event"] = df2["OS_event"].astype(int)

df2[["OS_time","OS_event"]].describe()
df2["OS_event"].value_counts()
Out[76]:
OS_event
0    945
1    151
Name: count, dtype: int64
In [77]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
plt.figure(figsize=(7,5))

for val, label in [(1, "TP53 Mutated"), (0, "TP53 Wild-Type")]:
    mask = df2["TP53_mut"] == val
    kmf.fit(df2.loc[mask, "OS_time"],
            event_observed=df2.loc[mask, "OS_event"],
            label=label)
    kmf.plot_survival_function()

plt.title("TCGA BRCA Overall Survival: TP53 Mutation")
plt.xlabel("Time (days)")
plt.ylabel("Survival Probability")
plt.show()
No description has been provided for this image
In [79]:
##calculate log-rank P
from lifelines.statistics import logrank_test

mut = df2["TP53_mut"] == 1
wt = df2["TP53_mut"] == 0

results = logrank_test(
    df2.loc[mut, "OS_time"],
    df2.loc[wt, "OS_time"],
    event_observed_A=df2.loc[mut, "OS_event"],
    event_observed_B=df2.loc[wt, "OS_event"]
)

print("Log-rank p-value:", results.p_value)
Log-rank p-value: 0.767454702553489

TP53 mutation showed a little differnece in surival.¶

In [ ]: