RAW (Vendor) Format | Mass spectrometry data uploaded by the data submitters as RAW or vendor format files corresponding to the mass spectrometers used to acquire the spectra. |
mzML Format | RAW format spectra converted to the HUPO Proteomics Standards Initiative (PSI) compliant mzML format. |
import requests

# Define the PDC study ID we would like to query.
pdc_study_id = 'PDC000127'


# Next, we define a function to make the GraphQL queries.
def query_pdc(query):
    """Post a GraphQL query to the PDC API and return the decoded JSON body.

    Args:
        query: The GraphQL query document, as a string.

    Returns:
        The value of ``pdc_response.json()`` for a successful response.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
    """
    url = 'https://pdc.cancer.gov/graphql'
    pdc_response = requests.post(url, json={'query': query})
    # raise_for_status() does nothing for a successful response and raises
    # for an error status, so this function either returns JSON or raises.
    pdc_response.raise_for_status()
    return pdc_response.json()


# First, we will retrieve the experimental design for this study using the
# 'studyExperimentalDesign' API.
study_fields = [
    'pdc_study_id',
    'study_run_metadata_id',
    'study_run_metadata_submitter_id',
    'study_id',
    'study_submitter_id',
    'analyte',
    'acquisition_type',
    'experiment_type',
    'plex_dataset_name',
    'experiment_number',
    'number_of_fractions',
]

# Every label channel is requested with the same three identifiers, so the
# channel selections are generated from one list rather than typed out one by
# one (the hand-written version had misspelled channel names).
label_channels = [
    'label_free',
    'itraq_113', 'itraq_114', 'itraq_115', 'itraq_116',
    'itraq_117', 'itraq_118', 'itraq_119', 'itraq_121',
    'tmt_126', 'tmt_127n', 'tmt_127c', 'tmt_128n', 'tmt_128c',
    'tmt_129n', 'tmt_129c', 'tmt_130n', 'tmt_130c', 'tmt_131', 'tmt_131c',
]
aliquot_fields = '{aliquot_id, aliquot_run_metadata_id, aliquot_submitter_id}'

experimental_design_query = (
    '{ studyExperimentalDesign(pdc_study_id: "' + pdc_study_id + '" acceptDUA: true) { '
    + ', '.join(study_fields + [channel + aliquot_fields for channel in label_channels])
    + ' } }'
)

decoded = query_pdc(experimental_design_query)
# You can check the output for errors here.
experimental_design = decoded['data']['studyExperimentalDesign']

# Note that for each channel within a given plex there are three identifiers:
#   1. aliquot_id: the aliquot's unique identifier
#   2. aliquot_run_metadata_id: a unique identifier for that specific plex,
#      channel, and aliquot
#   3. aliquot_submitter_id: the submitter's name/identifier for that aliquot
#
# Show the first plex. A single channel of that plex can be read with, e.g.,
# experimental_design[0]['tmt_127n'].
experimental_design[0]
OUTPUT for one plex: { 'pdc_study_id': 'PDC000127', 'study_run_metadata_id': 'de9ebff4-2074-11e9-b7f8-0a80fada099c', 'study_run_metadata_submitter_id': 'S044-1-1', 'study_id': 'dbe94609-1fb3-11e9-b7f8-0a80fada099c', 'study_submitter_id': 'CPTAC CCRCC Discovery Study - Proteome S044-1', 'analyte': 'Proteome', 'acquisition_type': 'DDA', 'experiment_type': 'TMT10', 'plex_dataset_name': '01CPTAC_CCRCC_Proteome_JHU_20171007', 'experiment_number': 1, 'number_of_fractions': '25', 'label_free': None, 'itraq_113': None, 'itraq_114': None, 'itraq_115': None, 'itraq_116': None, 'itraq_117': None, 'itraq_118': None, 'itraq_119': None, 'itraq_121': None, 'tmt_126': [{'aliquot_id': 'f02c5363-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': 'f663550c-207a-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0079430001'}], 'tmt_127n': [{'aliquot_id': 'af16a68f-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '3c8eb15b-207b-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0023360001'}], 'tmt_127c': [{'aliquot_id': 'ae4c8249-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '8346c222-207b-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0023350003'}], 'tmt_128n': [{'aliquot_id': 'ef59c2bc-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': 'ca3dd19f-207b-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0079410003'}], 'tmt_128c': [{'aliquot_id': '0310b558-2054-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '1157f93f-207c-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0087040003'}], 'tmt_129n': [{'aliquot_id': 'dbf3270e-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '5eeab785-207c-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0077310003'}], 'tmt_129c': [{'aliquot_id': 'dcbb61d1-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': 'a6231a02-207c-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0077320001'}], 'tmt_130n': [{'aliquot_id': '03e278f2-2054-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': 
'eab2a877-207c-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0087050003'}], 'tmt_130c': [{'aliquot_id': '67fd9d0c-2053-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '306e5186-207d-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0002270011'}], 'tmt_131': [{'aliquot_id': '3040dd8d-2054-11e9-b7f8-0a80fada099c', 'aliquot_run_metadata_id': '83eb8d70-207d-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'Pooled Sample'}], 'tmt_131c': None}
# Next, we will retrieve the clinical data for each participant in the study.
# This requires that we first retrieve the list of case identifiers for those
# involved in this study. For this, we will use the biospecimenPerStudy API.
# This will also provide the case:sample:aliquot mapping for the study.
#
# The braces below are balanced: one pair for the query document, one for
# biospecimenPerStudy, and one for externalReferences.
biospecimen_query = (
    '{ biospecimenPerStudy (pdc_study_id: "' + pdc_study_id + '" acceptDUA: true) {'
    ' aliquot_id sample_id case_id'
    ' aliquot_submitter_id sample_submitter_id case_submitter_id'
    ' aliquot_status case_status sample_status'
    ' project_name sample_type disease_type primary_site pool taxon'
    ' externalReferences {'
    ' external_reference_id reference_resource_shortname'
    ' reference_resource_name reference_entity_location'
    ' } } }'
)

decoded = query_pdc(biospecimen_query)
# You can check the output for errors here.
biospecimen_data = decoded['data']['biospecimenPerStudy']

# Show the first biospecimen record.
biospecimen_data[0]
OUTPUT: { 'aliquot_id': 'bd34fbb3-2053-11e9-b7f8-0a80fada099c', 'sample_id': 'b72322c6-204c-11e9-b7f8-0a80fada099c', 'case_id': 'dae8930e-1fb8-11e9-b7f8-0a80fada099c', 'aliquot_submitter_id': 'CPT0026410003', 'sample_submitter_id': 'C3L-00791-01', 'case_submitter_id': 'C3L-00791', 'aliquot_status': 'Qualified', 'case_status': 'Qualified', 'sample_status': 'Qualified', 'project_name': 'CPTAC3 Discovery and Confirmatory', 'sample_type': 'Primary Tumor', 'disease_type': 'Clear Cell Renal Cell Carcinoma', 'primary_site': 'Kidney', 'pool': 'No', 'taxon': 'Homo sapiens', 'externalReferences': [{'external_reference_id': 'ad9a7ce1-9f9e-4092-8eae-493297289022', 'reference_resource_shortname': 'GDC', 'reference_resource_name': 'Genomic Data Commons', 'reference_entity_location': 'https://portal.gdc.cancer.gov/cases/ad9a7ce1-9f9e-4092-8eae-493297289022'}, {'external_reference_id': 'CPTAC-CCRCC', 'reference_resource_shortname': 'TCIA', 'reference_resource_name': 'The Cancer Imaging Archive', 'reference_entity_location': 'https://doi.org/10.7937/K9/TCIA.2018.OBLAMN27'}]}
def _selection(name, fields):
    """Return the GraphQL selection ``name { field field ... }``.

    ``fields`` is a whitespace-separated list of field names (nested
    selections written with space-separated braces are passed through).
    Runs of whitespace, including newlines, are collapsed to single spaces.
    """
    return name + ' { ' + ' '.join(fields.split()) + ' }'


# Get a list of the unique case identifiers. Sorting makes the generated query
# reproducible from run to run (set iteration order is arbitrary).
case_list = sorted({bio_dict['case_id'] for bio_dict in biospecimen_data})
case_string = ';'.join(case_list)

case_fields = ' '.join('''
    case_submitter_id days_to_lost_to_followup disease_type index_date
    lost_to_followup primary_site consent_type days_to_consent
'''.split())

external_references = _selection('externalReferences', '''
    external_reference_id reference_resource_shortname
    reference_resource_name reference_entity_location
''')

demographics = _selection('demographics', '''
    case_submitter_id demographic_id ethnicity gender race cause_of_death
    days_to_birth days_to_death vital_status year_of_birth year_of_death
    age_at_index premature_at_birth weeks_gestation_at_birth age_is_obfuscated
    cause_of_death_source occupation_duration_years
    country_of_residence_at_enrollment
''')

diagnoses = _selection('diagnoses', '''
    case_submitter_id diagnosis_id age_at_diagnosis classification_of_tumor
    days_to_last_follow_up days_to_last_known_disease_status days_to_recurrence
    diagnosis_is_primary_disease last_known_disease_status morphology
    primary_diagnosis progression_or_recurrence site_of_resection_or_biopsy
    tissue_or_organ_of_origin tumor_grade tumor_stage prior_malignancy
    ajcc_clinical_m ajcc_clinical_n ajcc_clinical_stage ajcc_clinical_t
    ajcc_pathologic_m ajcc_pathologic_n ajcc_pathologic_stage ajcc_pathologic_t
    anaplasia_present anaplasia_present_type ann_arbor_b_symptoms
    ann_arbor_clinical_stage ann_arbor_extranodal_involvement
    ann_arbor_pathologic_stage best_overall_response breslow_thickness
    burkitt_lymphoma_clinical_variant child_pugh_classification
    circumferential_resection_margin cog_liver_stage
    cog_neuroblastoma_risk_group cog_renal_stage
    cog_rhabdomyosarcoma_risk_group colon_polyps_history
    days_to_best_overall_response days_to_diagnosis days_to_hiv_diagnosis
    days_to_new_event enneking_msts_grade enneking_msts_metastasis
    enneking_msts_stage enneking_msts_tumor_site
    esophageal_columnar_dysplasia_degree
    esophageal_columnar_metaplasia_present figo_stage
    first_symptom_prior_to_diagnosis gastric_esophageal_junction_involvement
    goblet_cells_columnar_mucosa_present gross_tumor_weight hiv_positive
    hpv_positive_type hpv_status icd_10_code inpc_grade inpc_histologic_group
    inrg_stage irs_group irs_stage ishak_fibrosis_score iss_stage laterality
    ldh_level_at_diagnosis ldh_normal_range_upper lymph_nodes_positive
    lymph_nodes_tested lymphatic_invasion_present
    medulloblastoma_molecular_classification metastasis_at_diagnosis
    metastasis_at_diagnosis_site method_of_diagnosis
    mitosis_karyorrhexis_index new_event_anatomic_site new_event_type
    overall_survival perineural_invasion_present
    peripancreatic_lymph_nodes_positive prior_treatment
    progression_free_survival progression_free_survival_event
    residual_disease supratentorial_localization synchronous_malignancy
    tumor_confined_to_organ_of_origin tumor_focality
    tumor_largest_dimension_diameter tumor_regression_grade
    vascular_invasion_present vascular_invasion_type
    wilms_tumor_histologic_subtype year_of_diagnosis gleason_grade_group
    igcccg_stage international_prognostic_index
    largest_extrapelvic_peritoneal_focus masaoka_stage
    non_nodal_regional_disease non_nodal_tumor_deposits
    ovarian_specimen_status ovarian_surface_involvement
    percent_tumor_invasion peritoneal_fluid_cytological_status
    primary_gleason_grade secondary_gleason_grade weiss_assessment_score
''')

exposures = _selection('exposures', '''
    case_submitter_id alcohol_days_per_week alcohol_drinks_per_day
    alcohol_history alcohol_intensity asbestos_exposure cigarettes_per_day
    coal_dust_exposure environmental_tobacco_smoke_exposure pack_years_smoked
    radon_exposure respirable_crystalline_silica_exposure smoking_frequency
    time_between_waking_and_first_smoke tobacco_smoking_onset_year
    tobacco_smoking_quit_year tobacco_smoking_status type_of_smoke_exposure
    type_of_tobacco_used years_smoked age_at_onset alcohol_type
    exposure_duration exposure_duration_years exposure_type
    marijuana_use_per_week parent_with_radiation_exposure
    secondhand_smoke_as_child smokeless_tobacco_quit_age tobacco_use_per_day
''')

follow_ups = _selection('follow_ups', '''
    case_submitter_id adverse_event barretts_esophagus_goblet_cells_present
    bmi cause_of_response comorbidity comorbidity_method_of_diagnosis
    days_to_adverse_event days_to_comorbidity days_to_follow_up
    days_to_progression days_to_progression_free days_to_recurrence
    diabetes_treatment_type disease_response dlco_ref_predictive_percent
    ecog_performance_status fev1_ref_post_bronch_percent
    fev1_ref_pre_bronch_percent fev1_fvc_pre_bronch_percent
    fev1_fvc_post_bronch_percent height
    hepatitis_sustained_virological_response hpv_positive_type
    karnofsky_performance_status menopause_status pancreatitis_onset_year
    progression_or_recurrence progression_or_recurrence_anatomic_site
    progression_or_recurrence_type reflux_treatment_type risk_factor
    risk_factor_treatment viral_hepatitis_serologies weight
    adverse_event_grade aids_risk_factors body_surface_area cd4_count
    cdc_hiv_risk_factors days_to_imaging evidence_of_recurrence_type eye_color
    haart_treatment_indicator history_of_tumor history_of_tumor_type
    hiv_viral_load hormonal_contraceptive_type hormonal_contraceptive_use
    hormone_replacement_therapy_type hysterectomy_margins_involved
    hysterectomy_type imaging_result imaging_type
    immunosuppressive_treatment_type nadir_cd4_count pregnancy_outcome
    procedures_performed recist_targeted_regions_number
    recist_targeted_regions_sum scan_tracer_used undescended_testis_corrected
    undescended_testis_corrected_age undescended_testis_corrected_laterality
    undescended_testis_corrected_method undescended_testis_history
    undescended_testis_history_laterality
''')

# The samples selection nests two further selections: the diagnoses linked to
# each sample and the aliquots derived from it.
samples = _selection('samples', '''
    sample_submitter_id sample_id sample_type sample_type_id gdc_sample_id
    gdc_project_id biospecimen_anatomic_site composition current_weight
    days_to_collection days_to_sample_procurement
    diagnosis_pathologically_confirmed freezing_method initial_weight
    intermediate_dimension longest_dimension method_of_sample_procurement
    pathology_report_uuid preservation_method shortest_dimension
    time_between_clamping_and_freezing time_between_excision_and_freezing
    tissue_type tumor_code tumor_code_id tumor_descriptor
    biospecimen_laterality catalog_reference distance_normal_to_tumor
    distributor_reference growth_rate passage_count sample_ordinal
    tissue_collection_type
    diagnoses { diagnosis_id annotation }
    aliquots {
        aliquot_submitter_id aliquot_id analyte_type aliquot_is_ref pool
        status aliquot_quantity aliquot_volume amount concentration
    }
''')

# NOTE(review): the original snippet referenced an undefined name `case_uuid`
# here. `case_string` (built just above and otherwise unused) is presumably
# what was intended -- confirm that the `case` API accepts a ';'-delimited
# list of case IDs.
case_query = (
    '{ case (case_id: "' + case_string + '" acceptDUA: true) { '
    + ' '.join([
        case_fields,
        external_references,
        demographics,
        diagnoses,
        exposures,
        follow_ups,
        samples,
    ])
    + ' } }'
)

decoded = query_pdc(case_query)
# You can check the output for errors here.
case_data = decoded['data']['case']

# Show the first case record.
case_data[0]
OUTPUT: {'case_submitter_id': 'NCI7-2', 'days_to_lost_to_followup': 0, 'disease_type': 'Other', 'index_date': '', 'lost_to_followup': '', 'primary_site': 'Not Reported', 'consent_type': None, 'days_to_consent': None, 'externalReferences': [], 'demographics': [{'case_submitter_id': 'NCI7-2', 'demographic_id': '2ae1252e-1fd1-11e9-b7f8-0a80fada099c', 'ethnicity': 'Not Reported', 'gender': 'Not Reported', 'race': 'Not Reported', 'cause_of_death': 'Not Reported', 'days_to_birth': '0', 'days_to_death': None, 'vital_status': 'Not Reported', 'year_of_birth': None, 'year_of_death': None, 'age_at_index': None, 'premature_at_birth': None, 'weeks_gestation_at_birth': None, 'age_is_obfuscated': None, 'cause_of_death_source': None, 'occupation_duration_years': None, 'country_of_residence_at_enrollment': None}], 'diagnoses': [{'case_submitter_id': 'NCI7-2', 'diagnosis_id': '7342064a-3040-11e9-8379-0a80fada099c', 'age_at_diagnosis': '0', 'classification_of_tumor': 'Not Reported', 'days_to_last_follow_up': '0.00', 'days_to_last_known_disease_status': '0.00', 'days_to_recurrence': '0.00', 'diagnosis_is_primary_disease': None, 'last_known_disease_status': 'Not Reported', 'morphology': 'Not Reported', 'primary_diagnosis': 'Not Reported', 'progression_or_recurrence': 'Not Reported', 'site_of_resection_or_biopsy': 'Not Reported', 'tissue_or_organ_of_origin': 'Not Reported', 'tumor_grade': 'Not Reported', 'tumor_stage': 'Not Reported', 'prior_malignancy': 'Not Reported', 'ajcc_clinical_m': 'Not Reported', 'ajcc_clinical_n': 'Not Reported', 'ajcc_clinical_stage': 'Not Reported', 'ajcc_clinical_t': 'Not Reported', 'ajcc_pathologic_m': 'Not Reported', 'ajcc_pathologic_n': 'Not Reported', 'ajcc_pathologic_stage': 'Not Reported', 'ajcc_pathologic_t': 'Not Reported', 'anaplasia_present': None, 'anaplasia_present_type': None, 'ann_arbor_b_symptoms': 'Not Reported', 'ann_arbor_clinical_stage': 'Not Reported', 'ann_arbor_extranodal_involvement': 'Not Reported', 'ann_arbor_pathologic_stage': 'Not 
Reported', 'best_overall_response': None, 'breslow_thickness': None, 'burkitt_lymphoma_clinical_variant': 'Not Reported', 'child_pugh_classification': None, 'circumferential_resection_margin': None, 'cog_liver_stage': None, 'cog_neuroblastoma_risk_group': None, 'cog_renal_stage': None, 'cog_rhabdomyosarcoma_risk_group': None, 'colon_polyps_history': None, 'days_to_best_overall_response': None, 'days_to_diagnosis': None, 'days_to_hiv_diagnosis': None, 'days_to_new_event': None, 'enneking_msts_grade': None, 'enneking_msts_metastasis': None, 'enneking_msts_stage': None, 'enneking_msts_tumor_site': None, 'esophageal_columnar_dysplasia_degree': None, 'esophageal_columnar_metaplasia_present': None, 'figo_stage': 'Not Reported', 'first_symptom_prior_to_diagnosis': None, 'gastric_esophageal_junction_involvement': None, 'goblet_cells_columnar_mucosa_present': None, 'gross_tumor_weight': None, 'hiv_positive': None, 'hpv_positive_type': None, 'hpv_status': None, 'icd_10_code': None, 'inpc_grade': None, 'inpc_histologic_group': None, 'inrg_stage': None, 'irs_group': None, 'irs_stage': None, 'ishak_fibrosis_score': None, 'iss_stage': 'Not Reported', 'laterality': 'Not Reported', 'ldh_level_at_diagnosis': None, 'ldh_normal_range_upper': None, 'lymph_nodes_positive': None, 'lymph_nodes_tested': None, 'lymphatic_invasion_present': 'Not Reported', 'medulloblastoma_molecular_classification': None, 'metastasis_at_diagnosis': None, 'metastasis_at_diagnosis_site': None, 'method_of_diagnosis': 'Not Reported', 'mitosis_karyorrhexis_index': None, 'new_event_anatomic_site': None, 'new_event_type': None, 'overall_survival': None, 'perineural_invasion_present': 'Not Reported', 'peripancreatic_lymph_nodes_positive': None, 'prior_treatment': 'Not Reported', 'progression_free_survival': None, 'progression_free_survival_event': None, 'residual_disease': 'Not Reported', 'supratentorial_localization': None, 'synchronous_malignancy': None, 'tumor_confined_to_organ_of_origin': None, 
'tumor_focality': None, 'tumor_largest_dimension_diameter': None, 'tumor_regression_grade': None, 'vascular_invasion_present': 'Not Reported', 'vascular_invasion_type': None, 'wilms_tumor_histologic_subtype': None, 'year_of_diagnosis': None, 'gleason_grade_group': None, 'igcccg_stage': None, 'international_prognostic_index': None, 'largest_extrapelvic_peritoneal_focus': None, 'masaoka_stage': None, 'non_nodal_regional_disease': None, 'non_nodal_tumor_deposits': None, 'ovarian_specimen_status': None, 'ovarian_surface_involvement': None, 'percent_tumor_invasion': None, 'peritoneal_fluid_cytological_status': None, 'primary_gleason_grade': None, 'secondary_gleason_grade': None, 'weiss_assessment_score': None}], 'exposures': [], 'follow_ups': [], 'samples': [{'sample_submitter_id': 'NCI7-2', 'sample_id': 'fc86d5da-204d-11e9-b7f8-0a80fada099c', 'sample_type': 'Cell Lines', 'sample_type_id': None, 'gdc_sample_id': None, 'gdc_project_id': None, 'biospecimen_anatomic_site': 'Not Reported', 'composition': 'Not Reported', 'current_weight': None, 'days_to_collection': None, 'days_to_sample_procurement': None, 'diagnosis_pathologically_confirmed': 'Not Reported', 'freezing_method': '', 'initial_weight': None, 'intermediate_dimension': None, 'longest_dimension': None, 'method_of_sample_procurement': 'Not Reported', 'pathology_report_uuid': None, 'preservation_method': 'Not Reported', 'shortest_dimension': None, 'time_between_clamping_and_freezing': None, 'time_between_excision_and_freezing': None, 'tissue_type': 'Not Reported', 'tumor_code': None, 'tumor_code_id': None, 'tumor_descriptor': 'Not Reported', 'biospecimen_laterality': None, 'catalog_reference': None, 'distance_normal_to_tumor': None, 'distributor_reference': None, 'growth_rate': None, 'passage_count': None, 'sample_ordinal': None, 'tissue_collection_type': None, 'diagnoses': [], 'aliquots': [{'aliquot_submitter_id': 'NCI7-2', "aliquot_id": "1cb3cb3d-2054-11e9-b7f8-0a80fada099c", "analyte_type": "Protein", 
"aliquot_is_ref": "no", "pool": "No", "status": "Qualified", "aliquot_quantity": None, "aliquot_volume": None, "amount": None, "concentration": None}]}]}