import pandas as pd                                      
import numpy as np

import sys
# Show which Python interpreter is running this notebook/script, to confirm
# that the intended environment is active.
print(sys.executable)

/Users/yongmeiwang/opt/anaconda3/envs/py3.9/bin/python

# Load the tab-separated TCGA BRCA survival table.
TCGA_survival = pd.read_csv("survival_BRCA_survival.txt", sep="\t")

# Preview the table.
TCGA_survival

# View the rows ordered from longest to shortest overall-survival time.
# sort_values returns a new frame; TCGA_survival itself is left unsorted.
TCGA_survival.sort_values(by="OS.time", ascending=False)

# The OS column is the overall-survival event flag: 1 means the patient died,
# 0 means the patient was censored (no death observed).
# Because the flag is 0/1, summing the column counts the deaths.
os_flags = TCGA_survival["OS"]
num_deaths = os_flags.sum()
print(f"Number of patients who died: {num_deaths}")

Number of patients who died: 202

# A better check: value_counts() tabulates every value of the OS flag, so the
# censored (0) and died (1) groups are both reported.
os_column = TCGA_survival["OS"]
death_counts = os_column.value_counts()
print(death_counts)

OS
0    1034
1     202
Name: count, dtype: int64

import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Work on a copy so the original survival table is left untouched.
df = TCGA_survival.copy()

# Columns used for the survival analysis.
time = df["OS.time"]   # time to event or censoring
event = df["OS"]       # 1 = event (death), 0 = censored

# Verify the inputs are complete and numeric:
# first the missing-value count of each column, then the dtype of each column.
for column in (time, event):
    print(column.isna().sum())
for column in (time, event):
    print(column.dtype)

1
0
float64
int64

# OS.time has one missing value; otherwise both columns are numeric.
# Drop the incomplete row.
#
# The shapes are printed explicitly: a bare `df.shape` that is not the last
# expression of a cell is evaluated and then discarded, so the "before" shape
# would otherwise never be shown.
print("Shape before dropna:", df.shape)
df = df.dropna(subset=['OS.time', 'OS'])
print("Shape after dropna:", df.shape)

(1235, 11)

# Kaplan-Meier curve of overall survival (OS) for the whole BRCA cohort.

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
kmf.fit(df["OS.time"], event_observed=df["OS"], label="Overall Survival")

# Draw on an explicit Figure/Axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 6))
kmf.plot(ax=ax)
ax.set_title("Kaplan-Meier Survival Curve - TCGA BRCA")
ax.set_xlabel("Time (days)")
ax.set_ylabel("Survival Probability")
ax.grid(True)
fig.tight_layout()
plt.show()

## Next section: how do mutations in the frequently mutated genes PIK3CA and
## TP53 relate to survival?
## This requires a PIK3CA mutation status for each patient sample ID
## (example: TCGA-B6-A0RU-01).
# The mutation file used here was downloaded from the Xena browser: mc3_BRCA_mc3.txt

# Download the public version (the smaller file, about 8.5 MB after unzipping).
# Make sure the file is on the path below; pandas can also read it if it has
# not been unzipped.

TCGA_BRCA_MC3_Public = pd.read_csv("mc3_BRCA_mc3.txt", sep="\t")

# Take a look at the data.
TCGA_BRCA_MC3_Public

# Keep only the mutation records whose gene is PIK3CA.
is_pik3ca = TCGA_BRCA_MC3_Public["gene"] == "PIK3CA"
pik = TCGA_BRCA_MC3_Public[is_pik3ca]
pik

# `pik` already holds only PIK3CA records; now keep the non-silent ones.
# A mutation is treated as non-silent when its "effect" value is one of:
nonsilent = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Splice_Site",
    "Nonstop_Mutation",
    "In_Frame_Del",
    "In_Frame_Ins",
]

is_nonsilent = pik["effect"].isin(nonsilent)
pik = pik[is_nonsilent]
pik

# Extract the patient ID (the first 12 characters of the sample barcode,
# e.g. "TCGA-3C-AALK" from "TCGA-3C-AALK-01") for every PIK3CA-mutated sample.
#
# Missing sample values are dropped BEFORE the string conversion: astype(str)
# turns NaN into the literal string "nan", so a dropna() placed after it can
# never remove anything and "nan" would end up listed as a patient ID.
pik_patient_ids = pik["sample"].dropna().astype(str).str[:12].unique()
pik_patient_ids

array(['TCGA-3C-AALK', 'TCGA-5L-AAT0', 'TCGA-5L-AAT1', 'TCGA-A1-A0SI',
       'TCGA-A2-A04N', 'TCGA-A2-A04W', 'TCGA-A2-A0CR', 'TCGA-A2-A0CS',
       'TCGA-A2-A0CW', 'TCGA-A2-A0D3', 'TCGA-A2-A0EN', 'TCGA-A2-A0EW',
       'TCGA-A2-A0SY', 'TCGA-A2-A0T4', 'TCGA-A2-A0T7', 'TCGA-A2-A0YC',
       'TCGA-A2-A0YH', 'TCGA-A2-A0YI', 'TCGA-A2-A0YK', 'TCGA-A2-A0YL',
       'TCGA-A2-A0YT', 'TCGA-A2-A1FZ', 'TCGA-A2-A1G0', 'TCGA-A2-A25A',
       'TCGA-A2-A25C', 'TCGA-A2-A25D', 'TCGA-A2-A4S2', 'TCGA-A7-A0D9',
       'TCGA-A7-A0DB', 'TCGA-A7-A13G', 'TCGA-A7-A26E', 'TCGA-A7-A26H',
       'TCGA-A7-A5ZW', 'TCGA-A7-A5ZX', 'TCGA-A7-A6VX', 'TCGA-A8-A075',
       'TCGA-AC-A23C', 'TCGA-AC-A23E', 'TCGA-AC-A23H', 'TCGA-AC-A2B8',
       'TCGA-AC-A2FF', 'TCGA-AC-A2FO', 'TCGA-AC-A3OD', 'TCGA-AC-A3YJ',
       'TCGA-AC-A5EH', 'TCGA-AC-A5XS', 'TCGA-AC-A6NO', 'TCGA-AC-A8OS',
       'TCGA-AN-A0XL', 'TCGA-AN-A0XO', 'TCGA-AN-A0XP', 'TCGA-AN-A0XS',
       'TCGA-AO-A03M', 'TCGA-AO-A03N', 'TCGA-AO-A125', 'TCGA-AO-A126',
       'TCGA-AO-A12A', 'TCGA-AO-A1KR', 'TCGA-AQ-A0Y5', 'TCGA-AQ-A1H2',
       'TCGA-AQ-A54O', 'TCGA-AQ-A7U7', 'TCGA-AR-A0TR', 'TCGA-AR-A0TZ',
       'TCGA-AR-A1AL', 'TCGA-AR-A1AO', 'TCGA-AR-A1AS', 'TCGA-AR-A1AV',
       'TCGA-AR-A1AW', 'TCGA-AR-A24K', 'TCGA-AR-A24M', 'TCGA-AR-A24O',
       'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A255', 'TCGA-AR-A2LK',
       'TCGA-AR-A2LM', 'TCGA-AR-A2LO', 'TCGA-AR-A5QM', 'TCGA-AR-A5QP',
       'TCGA-AR-A5QQ', 'TCGA-B6-A0RH', 'TCGA-B6-A0RN', 'TCGA-B6-A0RO',
       'TCGA-B6-A0RP', 'TCGA-B6-A0RQ', 'TCGA-B6-A0WW', 'TCGA-B6-A0WY',
       'TCGA-B6-A0X0', 'TCGA-B6-A0X5', 'TCGA-B6-A0X7', 'TCGA-B6-A401',
       'TCGA-B6-A40B', 'TCGA-BH-A0B0', 'TCGA-BH-A0B6', 'TCGA-BH-A0BA',
       'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BJ', 'TCGA-BH-A0BM',
       'TCGA-BH-A0BO', 'TCGA-BH-A0BQ', 'TCGA-BH-A0BT', 'TCGA-BH-A0DE',
       'TCGA-BH-A0DK', 'TCGA-BH-A0DL', 'TCGA-BH-A0DO', 'TCGA-BH-A0DP',
       'TCGA-BH-A0DT', 'TCGA-BH-A0DV', 'TCGA-BH-A0DX', 'TCGA-BH-A0EA',
       'TCGA-BH-A0H3', 'TCGA-BH-A0H9', 'TCGA-BH-A0HA', 'TCGA-BH-A0HI',
       'TCGA-BH-A0HN', 'TCGA-BH-A0W3', 'TCGA-BH-A0W5', 'TCGA-BH-A0W7',
       'TCGA-BH-A18F', 'TCGA-BH-A18H', 'TCGA-BH-A18I', 'TCGA-BH-A18J',
       'TCGA-BH-A1ET', 'TCGA-BH-A1EU', 'TCGA-BH-A1EY', 'TCGA-BH-A1F8',
       'TCGA-BH-A1FE', 'TCGA-BH-A201', 'TCGA-BH-A202', 'TCGA-BH-A203',
       'TCGA-BH-A208', 'TCGA-BH-A2L8', 'TCGA-BH-A42V', 'TCGA-BH-A5J0',
       'TCGA-BH-A8FY', 'TCGA-C8-A12L', 'TCGA-C8-A12N', 'TCGA-C8-A12T',
       'TCGA-C8-A12U', 'TCGA-C8-A12Y', 'TCGA-C8-A130', 'TCGA-C8-A131',
       'TCGA-C8-A133', 'TCGA-C8-A1HE', 'TCGA-C8-A1HF', 'TCGA-C8-A26W',
       'TCGA-C8-A26X', 'TCGA-C8-A274', 'TCGA-C8-A278', 'TCGA-C8-A3M7',
       'TCGA-C8-A3M8', 'TCGA-C8-A8HQ', 'TCGA-D8-A143', 'TCGA-D8-A145',
       'TCGA-D8-A146', 'TCGA-D8-A1J8', 'TCGA-D8-A1JD', 'TCGA-D8-A1JE',
       'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JH', 'TCGA-D8-A1JJ',
       'TCGA-D8-A1JK', 'TCGA-D8-A1JN', 'TCGA-D8-A1JP', 'TCGA-D8-A1JS',
       'TCGA-D8-A1JU', 'TCGA-D8-A1XB', 'TCGA-D8-A1XL', 'TCGA-D8-A1XM',
       'TCGA-D8-A1XO', 'TCGA-D8-A1XS', 'TCGA-D8-A1XY', 'TCGA-D8-A1Y1',
       'TCGA-D8-A1Y2', 'TCGA-D8-A27G', 'TCGA-D8-A27K', 'TCGA-D8-A27L',
       'TCGA-D8-A27P', 'TCGA-D8-A27T', 'TCGA-D8-A3Z6', 'TCGA-D8-A73U',
       'TCGA-D8-A73X', 'TCGA-E2-A105', 'TCGA-E2-A108', 'TCGA-E2-A10B',
       'TCGA-E2-A10C', 'TCGA-E2-A10F', 'TCGA-E2-A14V', 'TCGA-E2-A14Y',
       'TCGA-E2-A153', 'TCGA-E2-A154', 'TCGA-E2-A156', 'TCGA-E2-A159',
       'TCGA-E2-A15C', 'TCGA-E2-A15E', 'TCGA-E2-A15G', 'TCGA-E2-A15K',
       'TCGA-E2-A15L', 'TCGA-E2-A15P', 'TCGA-E2-A1B4', 'TCGA-E2-A1BC',
       'TCGA-E2-A1BD', 'TCGA-E2-A1IF', 'TCGA-E2-A1IL', 'TCGA-E2-A1IN',
       'TCGA-E2-A1IO', 'TCGA-E2-A1L6', 'TCGA-E2-A1L8', 'TCGA-E2-A576',
       'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1NG', 'TCGA-E9-A1NH',
       'TCGA-E9-A1R0', 'TCGA-E9-A1R3', 'TCGA-E9-A1R4', 'TCGA-E9-A1R5',
       'TCGA-E9-A1RA', 'TCGA-E9-A1RC', 'TCGA-E9-A1RD', 'TCGA-E9-A1RE',
       'TCGA-E9-A1RH', 'TCGA-E9-A1RI', 'TCGA-E9-A226', 'TCGA-E9-A227',
       'TCGA-E9-A229', 'TCGA-E9-A249', 'TCGA-E9-A295', 'TCGA-E9-A3X8',
       'TCGA-EW-A1IW', 'TCGA-EW-A1IX', 'TCGA-EW-A1IZ', 'TCGA-EW-A1J1',
       'TCGA-EW-A1J5', 'TCGA-EW-A1OV', 'TCGA-EW-A1P0', 'TCGA-EW-A1P5',
       'TCGA-EW-A1PE', 'TCGA-EW-A2FV', 'TCGA-EW-A423', 'TCGA-EW-A6SC',
       'TCGA-GM-A2D9', 'TCGA-GM-A2DD', 'TCGA-GM-A2DH', 'TCGA-GM-A2DI',
       'TCGA-GM-A3NW', 'TCGA-HN-A2OB', 'TCGA-JL-A3YX', 'TCGA-LD-A66U',
       'TCGA-LD-A74U', 'TCGA-LL-A50Y', 'TCGA-LL-A5YN', 'TCGA-LL-A6FP',
       'TCGA-LL-A7T0', 'TCGA-LL-A9Q3', 'TCGA-OK-A5Q2', 'TCGA-OL-A5D6',
       'TCGA-OL-A5DA', 'TCGA-OL-A5RX', 'TCGA-OL-A5S0', 'TCGA-OL-A66L',
       'TCGA-OL-A6VQ', 'TCGA-PE-A5DD', 'TCGA-PE-A5DE', 'TCGA-S3-AA11',
       'TCGA-S3-AA14', 'TCGA-WT-AB41', 'TCGA-XX-A899', 'TCGA-XX-A89A',
       'TCGA-Z7-A8R5'], dtype=object)

# Number of distinct patients with a non-silent PIK3CA mutation.
len(pik_patient_ids)

273

# 273 patients carry a PIK3CA mutation. Build a lookup table with one row per
# mutated patient and a constant flag column marking them as mutated.
import pandas as pd
pik_status = pd.DataFrame({"patient_id": pik_patient_ids, "PIK3CA_mut": 1})
pik_status.head()

# Look at the survival table again.
TCGA_survival

# The survival table contains duplicate patient IDs. Rename the key columns
# to analysis-friendly names, then compare the row count with the number of
# distinct patients.
renamed_columns = {"_PATIENT": "patient_id", "OS.time": "OS_time", "OS": "OS_event"}
surv = TCGA_survival.rename(columns=renamed_columns).copy()

print("rows in survival:", len(surv))
print("unique patients in survival:", surv["patient_id"].nunique())

rows in survival: 1236
unique patients in survival: 1097

# Remove the duplicated patients.
surv = TCGA_survival.rename(
    columns={"_PATIENT": "patient_id", "OS.time": "OS_time", "OS": "OS_event"}
).copy()

print("Before dedup:", len(surv), surv["patient_id"].nunique())

# Keep one record per patient: the first row seen for each patient_id.
is_repeat = surv.duplicated(subset="patient_id", keep="first")
surv_unique = surv[~is_repeat]

print("After dedup:", len(surv_unique), surv_unique["patient_id"].nunique())

Before dedup: 1236 1097
After dedup: 1097 1097

# Build the analysis table: every de-duplicated survival row, annotated with
# the PIK3CA flag. The left join keeps patients that are not in pik_status;
# their flag comes back as NaN and is set to 0 (wild-type).
# NOTE(review): patients that are absent from the mutation file altogether are
# also coded 0 here, i.e. treated as wild-type. Confirm that every survival
# patient was actually sequenced, or restrict to patients present in
# TCGA_BRCA_MC3_Public.
df = pd.merge(surv_unique, pik_status, how="left", on="patient_id")
df["PIK3CA_mut"] = df["PIK3CA_mut"].fillna(0).astype(int)

print(df["PIK3CA_mut"].value_counts())

mutated_ids = df.loc[df["PIK3CA_mut"] == 1, "patient_id"]
print("Unique mutated patients:", mutated_ids.nunique())

PIK3CA_mut
0    824
1    273
Name: count, dtype: int64
Unique mutated patients: 273

df

# OS_time still has problems to fix; inspect the analysis columns first.
# The dtype and missing-value summaries are printed explicitly: as bare
# expressions that are not the last statement of a cell, their results would be
# computed and then discarded.
analysis_columns = ["OS_time", "OS_event", "PIK3CA_mut"]
print(df[analysis_columns].dtypes)
print(df[analysis_columns].isna().sum())
df[["OS_time", "OS_event"]]

import pandas as pd

# Cleaned copy of the PIK3CA analysis table used for the survival fits.
df2 = df.copy()

# Coerce both survival columns to numeric; unparseable entries become NaN and
# are removed by the dropna below.
for column in ("OS_time", "OS_event"):
    df2[column] = pd.to_numeric(df2[column], errors="coerce")

# Keep only valid rows: time present and >= 0, event equal to 0 or 1.
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]

# Ensure the event flag is an integer.
df2["OS_event"] = df2["OS_event"].astype(int)

# Print the summary statistics: as a bare expression ahead of the final line,
# the describe() result would be computed and then discarded.
print(df2[["OS_time", "OS_event"]].describe())
df2["OS_event"].value_counts()

OS_event
0    945
1    151
Name: count, dtype: int64

# Kaplan-Meier curves from df2, one per PIK3CA status, drawn on shared axes.
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(7, 5))

# Mutated group first, then wild-type; the same fitter is refit for each group.
group_labels = {1: "PIK3CA Mutated", 0: "PIK3CA Wild-Type"}
for status, group_label in group_labels.items():
    group = df2[df2["PIK3CA_mut"] == status]
    kmf.fit(group["OS_time"], event_observed=group["OS_event"], label=group_label)
    kmf.plot_survival_function(ax=ax)

ax.set_title("TCGA BRCA Overall Survival: PIK3CA Mutation")
ax.set_xlabel("Time (days)")
ax.set_ylabel("Survival Probability")
plt.show()

## Log-rank test: PIK3CA-mutated versus wild-type patients.
from lifelines.statistics import logrank_test

mut = df2["PIK3CA_mut"] == 1
wt = df2["PIK3CA_mut"] == 0

mutated_rows = df2[mut]
wildtype_rows = df2[wt]

results = logrank_test(
    durations_A=mutated_rows["OS_time"],
    durations_B=wildtype_rows["OS_time"],
    event_observed_A=mutated_rows["OS_event"],
    event_observed_B=wildtype_rows["OS_event"],
)

print("Log-rank p-value:", results.p_value)

Log-rank p-value: 0.8706629249616884

# Keep only the mutation records whose gene is TP53.
is_tp53 = TCGA_BRCA_MC3_Public["gene"] == "TP53"
TP53 = TCGA_BRCA_MC3_Public[is_tp53]
TP53

# Restrict the TP53 records to non-silent mutations.
# A mutation is treated as non-silent when its "effect" value is one of:
nonsilent = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Splice_Site",
    "Nonstop_Mutation",
    "In_Frame_Del",
    "In_Frame_Ins",
]

is_nonsilent = TP53["effect"].isin(nonsilent)
TP53 = TP53[is_nonsilent]
TP53

# Extract the patient ID (the first 12 characters of the sample barcode) for
# every TP53-mutated sample.
#
# Missing sample values are dropped BEFORE the string conversion: astype(str)
# turns NaN into the literal string "nan", so a dropna() placed after it can
# never remove anything and "nan" would end up listed as a patient ID.
tp53_patient_ids = TP53["sample"].dropna().astype(str).str[:12].unique()
tp53_patient_ids

array(['TCGA-3C-AALI', 'TCGA-A1-A0SI', 'TCGA-A1-A0SK', 'TCGA-A1-A0SO',
       'TCGA-A1-A0SP', 'TCGA-A2-A04U', 'TCGA-A2-A04W', 'TCGA-A2-A0CL',
       'TCGA-A2-A0CW', 'TCGA-A2-A0ST', 'TCGA-A2-A0SX', 'TCGA-A2-A0T1',
       'TCGA-A2-A0T3', 'TCGA-A2-A0YE', 'TCGA-A2-A0YG', 'TCGA-A2-A0YH',
       'TCGA-A2-A0YJ', 'TCGA-A2-A0YM', 'TCGA-A2-A0YT', 'TCGA-A2-A1G1',
       'TCGA-A2-A3XT', 'TCGA-A2-A3XV', 'TCGA-A2-A3XX', 'TCGA-A2-A3Y0',
       'TCGA-A2-A4S1', 'TCGA-A2-A4S3', 'TCGA-A7-A0DA', 'TCGA-A7-A13D',
       'TCGA-A7-A13E', 'TCGA-A7-A26F', 'TCGA-A7-A26G', 'TCGA-A7-A26I',
       'TCGA-A7-A2KD', 'TCGA-A7-A4SD', 'TCGA-A7-A4SE', 'TCGA-A7-A56D',
       'TCGA-A7-A5ZV', 'TCGA-A7-A6VV', 'TCGA-A7-A6VW', 'TCGA-A7-A6VX',
       'TCGA-A7-A6VY', 'TCGA-A8-A075', 'TCGA-AC-A23C', 'TCGA-AC-A2BK',
       'TCGA-AC-A2QH', 'TCGA-AC-A3OD', 'TCGA-AC-A5EH', 'TCGA-AC-A5XU',
       'TCGA-AC-A62X', 'TCGA-AC-A6IW', 'TCGA-AC-A7VC', 'TCGA-AC-A8OQ',
       'TCGA-AN-A0XN', 'TCGA-AN-A0XT', 'TCGA-AN-A0XU', 'TCGA-AO-A03N',
       'TCGA-AO-A03V', 'TCGA-AO-A124', 'TCGA-AO-A128', 'TCGA-AO-A129',
       'TCGA-AO-A12D', 'TCGA-AO-A12G', 'TCGA-AQ-A04H', 'TCGA-AR-A0TP',
       'TCGA-AR-A0TS', 'TCGA-AR-A0TV', 'TCGA-AR-A0TX', 'TCGA-AR-A0U0',
       'TCGA-AR-A0U1', 'TCGA-AR-A0U2', 'TCGA-AR-A0U4', 'TCGA-AR-A1AH',
       'TCGA-AR-A1AI', 'TCGA-AR-A1AJ', 'TCGA-AR-A1AN', 'TCGA-AR-A1AP',
       'TCGA-AR-A1AQ', 'TCGA-AR-A1AR', 'TCGA-AR-A1AS', 'TCGA-AR-A1AW',
       'TCGA-AR-A1AX', 'TCGA-AR-A1AY', 'TCGA-AR-A24K', 'TCGA-AR-A24P',
       'TCGA-AR-A24Q', 'TCGA-AR-A24S', 'TCGA-AR-A24T', 'TCGA-AR-A24U',
       'TCGA-AR-A251', 'TCGA-AR-A254', 'TCGA-AR-A256', 'TCGA-AR-A2LH',
       'TCGA-AR-A2LR', 'TCGA-AR-A5QQ', 'TCGA-B6-A0I1', 'TCGA-B6-A0RH',
       'TCGA-B6-A0RU', 'TCGA-B6-A0WX', 'TCGA-B6-A0X1', 'TCGA-B6-A1KN',
       'TCGA-B6-A401', 'TCGA-B6-A409', 'TCGA-BH-A0AV', 'TCGA-BH-A0B0',
       'TCGA-BH-A0B7', 'TCGA-BH-A0BC', 'TCGA-BH-A0BF', 'TCGA-BH-A0BG',
       'TCGA-BH-A0BL', 'TCGA-BH-A0BP', 'TCGA-BH-A0BT', 'TCGA-BH-A0C0',
       'TCGA-BH-A0C3', 'TCGA-BH-A0DI', 'TCGA-BH-A0DL', 'TCGA-BH-A0DZ',
       'TCGA-BH-A0RX', 'TCGA-BH-A0WA', 'TCGA-BH-A18H', 'TCGA-BH-A18Q',
       'TCGA-BH-A18T', 'TCGA-BH-A18U', 'TCGA-BH-A18V', 'TCGA-BH-A1EY',
       'TCGA-BH-A1F0', 'TCGA-BH-A1F2', 'TCGA-BH-A1F6', 'TCGA-BH-A1FC',
       'TCGA-BH-A1FE', 'TCGA-BH-A1FN', 'TCGA-BH-A1FU', 'TCGA-BH-A202',
       'TCGA-BH-A203', 'TCGA-BH-A208', 'TCGA-BH-A5IZ', 'TCGA-BH-A5J0',
       'TCGA-C8-A12K', 'TCGA-C8-A12L', 'TCGA-C8-A12O', 'TCGA-C8-A12P',
       'TCGA-C8-A12Q', 'TCGA-C8-A12V', 'TCGA-C8-A12W', 'TCGA-C8-A12Z',
       'TCGA-C8-A130', 'TCGA-C8-A131', 'TCGA-C8-A134', 'TCGA-C8-A135',
       'TCGA-C8-A138', 'TCGA-C8-A1HF', 'TCGA-C8-A1HG', 'TCGA-C8-A1HJ',
       'TCGA-C8-A1HK', 'TCGA-C8-A1HM', 'TCGA-C8-A26V', 'TCGA-C8-A26W',
       'TCGA-C8-A26Y', 'TCGA-C8-A275', 'TCGA-C8-A278', 'TCGA-C8-A27A',
       'TCGA-C8-A27B', 'TCGA-C8-A8HP', 'TCGA-D8-A13Y', 'TCGA-D8-A13Z',
       'TCGA-D8-A142', 'TCGA-D8-A143', 'TCGA-D8-A147', 'TCGA-D8-A1J9',
       'TCGA-D8-A1JF', 'TCGA-D8-A1JG', 'TCGA-D8-A1JJ', 'TCGA-D8-A1JK',
       'TCGA-D8-A1JL', 'TCGA-D8-A1JM', 'TCGA-D8-A1X5', 'TCGA-D8-A1XA',
       'TCGA-D8-A1XL', 'TCGA-D8-A1XQ', 'TCGA-D8-A1XT', 'TCGA-D8-A1XW',
       'TCGA-D8-A1XZ', 'TCGA-D8-A1Y3', 'TCGA-D8-A27F', 'TCGA-D8-A27M',
       'TCGA-D8-A27N', 'TCGA-E2-A108', 'TCGA-E2-A109', 'TCGA-E2-A14N',
       'TCGA-E2-A14P', 'TCGA-E2-A14R', 'TCGA-E2-A14X', 'TCGA-E2-A14Y',
       'TCGA-E2-A14Z', 'TCGA-E2-A150', 'TCGA-E2-A152', 'TCGA-E2-A155',
       'TCGA-E2-A158', 'TCGA-E2-A159', 'TCGA-E2-A15E', 'TCGA-E2-A15M',
       'TCGA-E2-A1AZ', 'TCGA-E2-A1B0', 'TCGA-E2-A1B1', 'TCGA-E2-A1B6',
       'TCGA-E2-A1II', 'TCGA-E2-A1IN', 'TCGA-E2-A1L7', 'TCGA-E2-A1LG',
       'TCGA-E2-A1LH', 'TCGA-E2-A1LK', 'TCGA-E2-A573', 'TCGA-E2-A574',
       'TCGA-E2-A9RU', 'TCGA-E9-A1N5', 'TCGA-E9-A1N8', 'TCGA-E9-A1N9',
       'TCGA-E9-A1NF', 'TCGA-E9-A1RB', 'TCGA-E9-A1RH', 'TCGA-E9-A226',
       'TCGA-E9-A22E', 'TCGA-E9-A22G', 'TCGA-E9-A243', 'TCGA-E9-A244',
       'TCGA-E9-A248', 'TCGA-E9-A5FL', 'TCGA-E9-A6HE', 'TCGA-EW-A1OZ',
       'TCGA-EW-A1P1', 'TCGA-EW-A1P4', 'TCGA-EW-A1P8', 'TCGA-EW-A1PA',
       'TCGA-EW-A1PB', 'TCGA-EW-A1PH', 'TCGA-EW-A2FR', 'TCGA-EW-A6S9',
       'TCGA-EW-A6SB', 'TCGA-EW-A6SD', 'TCGA-GM-A2DB', 'TCGA-GM-A2DD',
       'TCGA-GM-A2DF', 'TCGA-GM-A2DH', 'TCGA-GM-A2DL', 'TCGA-GM-A3XL',
       'TCGA-LD-A74U', 'TCGA-LD-A7W5', 'TCGA-LL-A5YO', 'TCGA-LL-A5YP',
       'TCGA-LL-A6FR', 'TCGA-LL-A73Y', 'TCGA-LL-A7SZ', 'TCGA-LL-A8F5',
       'TCGA-OL-A5D6', 'TCGA-OL-A5D7', 'TCGA-OL-A5RW', 'TCGA-OL-A66I',
       'TCGA-OL-A6VO', 'TCGA-PE-A5DE', 'TCGA-PL-A8LV', 'TCGA-PL-A8LZ',
       'TCGA-S3-AA0Z', 'TCGA-S3-AA10', 'TCGA-UU-A93S', 'TCGA-Z7-A8R6'],
      dtype=object)

# Number of distinct patients with a non-silent TP53 mutation.
len(tp53_patient_ids)

264

# 264 patients carry a TP53 mutation. Build a lookup table with one row per
# mutated patient and a constant flag column marking them as mutated.
import pandas as pd
TP53_status = pd.DataFrame({"patient_id": tp53_patient_ids, "TP53_mut": 1})
TP53_status

## Merge the TP53 flag into the de-duplicated survival table.
# The left join keeps patients that are not in TP53_status; their flag comes
# back as NaN and is set to 0 (wild-type).
# NOTE(review): patients that are absent from the mutation file altogether are
# also coded 0 here, i.e. treated as wild-type. Confirm that every survival
# patient was actually sequenced, or restrict to patients present in
# TCGA_BRCA_MC3_Public.
df_tp53 = pd.merge(surv_unique, TP53_status, how="left", on="patient_id")

tp53_flag = df_tp53["TP53_mut"].fillna(0)
df_tp53["TP53_mut"] = tp53_flag.astype(int)

df_tp53["TP53_mut"].value_counts()

TP53_mut
0    833
1    264
Name: count, dtype: int64

df_tp53

## Prepare the TP53 table for the Kaplan-Meier plots.
import pandas as pd

# Cleaned copy of the TP53 analysis table (rebinds the df2 name).
df2 = df_tp53.copy()

# Coerce both survival columns to numeric; unparseable entries become NaN and
# are removed by the dropna below.
for column in ("OS_time", "OS_event"):
    df2[column] = pd.to_numeric(df2[column], errors="coerce")

# Keep only valid rows: time present and >= 0, event equal to 0 or 1.
df2 = df2.dropna(subset=["OS_time", "OS_event"])
df2 = df2[df2["OS_time"] >= 0]
df2 = df2[df2["OS_event"].isin([0, 1])]

# Ensure the event flag is an integer.
df2["OS_event"] = df2["OS_event"].astype(int)

# Print the summary statistics: as a bare expression ahead of the final line,
# the describe() result would be computed and then discarded.
print(df2[["OS_time", "OS_event"]].describe())
df2["OS_event"].value_counts()

OS_event
0    945
1    151
Name: count, dtype: int64

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Kaplan-Meier curves from df2, one per TP53 status, drawn on shared axes.
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(7, 5))

# Mutated group first, then wild-type; the same fitter is refit for each group.
group_labels = {1: "TP53 Mutated", 0: "TP53 Wild-Type"}
for status, group_label in group_labels.items():
    group = df2[df2["TP53_mut"] == status]
    kmf.fit(group["OS_time"], event_observed=group["OS_event"], label=group_label)
    kmf.plot_survival_function(ax=ax)

ax.set_title("TCGA BRCA Overall Survival: TP53 Mutation")
ax.set_xlabel("Time (days)")
ax.set_ylabel("Survival Probability")
plt.show()

## Log-rank test: TP53-mutated versus wild-type patients.
from lifelines.statistics import logrank_test

mut = df2["TP53_mut"] == 1
wt = df2["TP53_mut"] == 0

mutated_rows = df2[mut]
wildtype_rows = df2[wt]

results = logrank_test(
    durations_A=mutated_rows["OS_time"],
    durations_B=wildtype_rows["OS_time"],
    event_observed_A=mutated_rows["OS_event"],
    event_observed_B=wildtype_rows["OS_event"],
)

print("Log-rank p-value:", results.p_value)

Log-rank p-value: 0.767454702553489

	sample	_PATIENT	OS	OS.time	DSS	DSS.time	DFI	DFI.time	PFI	PFI.time	Redaction
0	TCGA-3C-AAAU-01	TCGA-3C-AAAU	0	4047.0	0.0	4047.0	1.0	1808.0	1	1808.0	NaN
1	TCGA-3C-AALI-01	TCGA-3C-AALI	0	4005.0	0.0	4005.0	0.0	4005.0	0	4005.0	NaN
2	TCGA-3C-AALJ-01	TCGA-3C-AALJ	0	1474.0	0.0	1474.0	0.0	1474.0	0	1474.0	NaN
3	TCGA-3C-AALK-01	TCGA-3C-AALK	0	1448.0	0.0	1448.0	NaN	NaN	0	1448.0	NaN
4	TCGA-4H-AAAK-01	TCGA-4H-AAAK	0	348.0	0.0	348.0	0.0	348.0	0	348.0	NaN
...	...	...	...	...	...	...	...	...	...	...	...
1231	TCGA-WT-AB44-01	TCGA-WT-AB44	0	883.0	0.0	883.0	0.0	883.0	0	883.0	NaN
1232	TCGA-XX-A899-01	TCGA-XX-A899	0	467.0	0.0	467.0	0.0	467.0	0	467.0	NaN
1233	TCGA-XX-A89A-01	TCGA-XX-A89A	0	488.0	0.0	488.0	0.0	488.0	0	488.0	NaN
1234	TCGA-Z7-A8R5-01	TCGA-Z7-A8R5	0	3287.0	0.0	3287.0	NaN	NaN	1	181.0	NaN
1235	TCGA-Z7-A8R6-01	TCGA-Z7-A8R6	0	3256.0	0.0	3256.0	0.0	3256.0	0	3256.0	NaN

	sample	_PATIENT	OS	OS.time	DSS	DSS.time	DFI	DFI.time	PFI	PFI.time	Redaction
508	TCGA-B6-A0RU-01	TCGA-B6-A0RU	0	8605.0	0.0	8605.0	1.0	3076.0	1	3076.0	NaN
479	TCGA-B6-A0I5-01	TCGA-B6-A0I5	0	8556.0	0.0	8556.0	0.0	8556.0	0	8556.0	NaN
483	TCGA-B6-A0IA-01	TCGA-B6-A0IA	0	8391.0	0.0	8391.0	0.0	8391.0	0	8391.0	NaN
502	TCGA-B6-A0RN-01	TCGA-B6-A0RN	0	8008.0	0.0	8008.0	0.0	8008.0	0	8008.0	NaN
496	TCGA-B6-A0RE-01	TCGA-B6-A0RE	0	7777.0	0.0	7777.0	0.0	7777.0	0	7777.0	NaN
...	...	...	...	...	...	...	...	...	...	...	...
770	TCGA-C8-A12K-01	TCGA-C8-A12K	0	0.0	0.0	0.0	0.0	0.0	0	0.0	NaN
213	TCGA-A8-A08H-01	TCGA-A8-A08H	0	0.0	0.0	0.0	0.0	0.0	0	0.0	NaN
202	TCGA-A8-A081-01	TCGA-A8-A081	0	0.0	0.0	0.0	0.0	0.0	0	0.0	NaN
204	TCGA-A8-A083-01	TCGA-A8-A083	0	0.0	0.0	0.0	0.0	0.0	0	0.0	NaN
1197	TCGA-OL-A66H-01	TCGA-OL-A66H	0	NaN	0.0	NaN	0.0	NaN	0	NaN	NaN

	sample	chr	start	end	reference	alt	gene	effect	Amino_Acid_Change	DNA_VAF	SIFT	PolyPhen
0	TCGA-3C-AAAU-01	10	122668955	122668955	G	A	WDR11	3'UTR	NaN	0.39	NaN	NaN
1	TCGA-3C-AAAU-01	10	8115874	8115875	-	A	GATA3	Frame_Shift_Ins	p.P409Afs*99	0.34	NaN	NaN
2	TCGA-3C-AAAU-01	11	65272906	65272908	AAA	-	MALAT1	RNA	NaN	0.27	NaN	NaN
3	TCGA-3C-AAAU-01	11	66082467	66082467	C	T	CD248	Missense_Mutation	p.E678K	0.07	tolerated(0.12)	benign(0.001)
4	TCGA-3C-AAAU-01	11	66193652	66193652	G	C	NPAS4	3'UTR	NaN	0.20	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...
92114	TCGA-Z7-A8R6-01	9	95396703	95396703	C	T	IPPK	Missense_Mutation	p.E379K	0.16	deleterious(0.01)	probably_damaging(0.968)
92115	TCGA-Z7-A8R6-01	X	123217344	123217344	C	T	STAG2	Missense_Mutation	p.L1000F	0.39	deleterious(0)	probably_damaging(1)
92116	TCGA-Z7-A8R6-01	X	30671631	30671631	G	A	GK	5'UTR	NaN	0.36	NaN	NaN
92117	TCGA-Z7-A8R6-01	X	51151398	51151398	C	G	CXorf67	3'UTR	NaN	0.32	NaN	NaN
92118	TCGA-Z7-A8R6-01	X	54014379	54014379	T	A	PHF8	Splice_Site	p.X613_splice	0.07	NaN	NaN

	sample	chr	start	end	reference	alt	gene	effect	Amino_Acid_Change	DNA_VAF	SIFT	PolyPhen
979	TCGA-3C-AALK-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.21	deleterious(0.04)	probably_damaging(0.96)
980	TCGA-3C-AALK-01	3	178951957	178951957	G	T	PIK3CA	Missense_Mutation	p.M1004I	0.19	deleterious(0.01)	benign(0.331)
1215	TCGA-5L-AAT0-01	3	178952085	178952085	A	T	PIK3CA	Missense_Mutation	p.H1047L	0.22	tolerated(0.44)	benign(0.085)
2631	TCGA-5L-AAT1-01	3	178916876	178916876	G	A	PIK3CA	Missense_Mutation	p.R88Q	0.17	tolerated(0.06)	probably_damaging(0.998)
2632	TCGA-5L-AAT1-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.18	deleterious(0.04)	probably_damaging(0.96)
...	...	...	...	...	...	...	...	...	...	...	...	...
90859	TCGA-S3-AA14-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.26	deleterious(0.04)	probably_damaging(0.96)
91557	TCGA-WT-AB41-01	3	178917478	178917478	G	A	PIK3CA	Missense_Mutation	p.G118D	0.36	tolerated(0.05)	possibly_damaging(0.704)
91681	TCGA-XX-A899-01	3	178916861	178916861	T	C	PIK3CA	Missense_Mutation	p.F83S	0.21	deleterious(0.02)	benign(0.09)
91870	TCGA-XX-A89A-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.27	deleterious(0.04)	probably_damaging(0.96)
91992	TCGA-Z7-A8R5-01	3	178952085	178952085	A	G	PIK3CA	Missense_Mutation	p.H1047R	0.16	tolerated(0.11)	possibly_damaging(0.529)

	sample	chr	start	end	reference	alt	gene	effect	Amino_Acid_Change	DNA_VAF	SIFT	PolyPhen
979	TCGA-3C-AALK-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.21	deleterious(0.04)	probably_damaging(0.96)
980	TCGA-3C-AALK-01	3	178951957	178951957	G	T	PIK3CA	Missense_Mutation	p.M1004I	0.19	deleterious(0.01)	benign(0.331)
1215	TCGA-5L-AAT0-01	3	178952085	178952085	A	T	PIK3CA	Missense_Mutation	p.H1047L	0.22	tolerated(0.44)	benign(0.085)
2631	TCGA-5L-AAT1-01	3	178916876	178916876	G	A	PIK3CA	Missense_Mutation	p.R88Q	0.17	tolerated(0.06)	probably_damaging(0.998)
2632	TCGA-5L-AAT1-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.18	deleterious(0.04)	probably_damaging(0.96)
...	...	...	...	...	...	...	...	...	...	...	...	...
90859	TCGA-S3-AA14-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.26	deleterious(0.04)	probably_damaging(0.96)
91557	TCGA-WT-AB41-01	3	178917478	178917478	G	A	PIK3CA	Missense_Mutation	p.G118D	0.36	tolerated(0.05)	possibly_damaging(0.704)
91681	TCGA-XX-A899-01	3	178916861	178916861	T	C	PIK3CA	Missense_Mutation	p.F83S	0.21	deleterious(0.02)	benign(0.09)
91870	TCGA-XX-A89A-01	3	178936082	178936082	G	A	PIK3CA	Missense_Mutation	p.E542K	0.27	deleterious(0.04)	probably_damaging(0.96)
91992	TCGA-Z7-A8R5-01	3	178952085	178952085	A	G	PIK3CA	Missense_Mutation	p.H1047R	0.16	tolerated(0.11)	possibly_damaging(0.529)

We will use OS.time (overall-survival time, in days).

Now let's make a Kaplan-Meier plot using overall survival (OS). You need to install lifelines first.

Run `pip3 install lifelines` in another terminal.

Do that again for your cancer of interest.

Conclusion: PIK3CA mutation status is not associated with a significant difference in overall survival in this cohort (log-rank p ≈ 0.87).

Let's redo this analysis using TP53.

TP53 mutation showed only a small difference in survival, and it is not statistically significant (log-rank p ≈ 0.77).

	patient_id	PIK3CA_mut
0	TCGA-3C-AALK	1
1	TCGA-5L-AAT0	1
2	TCGA-5L-AAT1	1
3	TCGA-A1-A0SI	1
4	TCGA-A2-A04N	1

	sample	chr	start	end	reference	alt	gene	effect	Amino_Acid_Change	DNA_VAF	SIFT	PolyPhen
347	TCGA-3C-AALI-01	17	7578382	7578382	G	T	TP53	Nonsense_Mutation	p.S183*	0.65	NaN	NaN
3791	TCGA-A1-A0SI-01	17	7578406	7578406	C	T	TP53	Missense_Mutation	p.R175H	0.30	tolerated(0.11)	benign(0.308)
4064	TCGA-A1-A0SK-01	17	7578532	7578532	A	T	TP53	Missense_Mutation	p.M133K	0.98	deleterious(0)	benign(0.122)
4354	TCGA-A1-A0SO-01	17	7578190	7578190	T	C	TP53	Missense_Mutation	p.Y220C	0.87	deleterious(0)	probably_damaging(1)
4633	TCGA-A1-A0SP-01	17	7578382	7578382	G	C	TP53	Nonsense_Mutation	p.S183*	0.49	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...
90161	TCGA-PL-A8LZ-01	17	7578406	7578406	C	T	TP53	Missense_Mutation	p.R175H	0.32	tolerated(0.11)	benign(0.308)
90450	TCGA-S3-AA0Z-01	17	7578550	7578550	G	T	TP53	Missense_Mutation	p.S127Y	0.93	deleterious(0)	probably_damaging(1)
90551	TCGA-S3-AA10-01	17	7579335	7579336	-	C	TP53	Frame_Shift_Ins	p.T118Dfs*31	0.33	NaN	NaN
91294	TCGA-UU-A93S-01	17	7578523	7578523	T	G	TP53	Missense_Mutation	p.Q136P	0.62	deleterious(0)	probably_damaging(1)
92047	TCGA-Z7-A8R6-01	17	7577085	7577085	C	T	TP53	Missense_Mutation	p.E285K	0.88	deleterious(0)	probably_damaging(0.985)

	patient_id	TP53_mut
0	TCGA-3C-AALI	1
1	TCGA-A1-A0SI	1
2	TCGA-A1-A0SK	1
3	TCGA-A1-A0SO	1
4	TCGA-A1-A0SP	1
...	...	...
259	TCGA-PL-A8LZ	1
260	TCGA-S3-AA0Z	1
261	TCGA-S3-AA10	1
262	TCGA-UU-A93S	1
263	TCGA-Z7-A8R6	1