pgscatalog/pgsc_calc 2.2.0
See the online documentation for additional explanation of the terms and data presented in this report.
1. Metadata
1.1 Workflow version
1.2 Workflow command
Code
cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { print " " $0}' | sed 's/$/\\/' | sed '$ s/.$//'
$ nextflow run pgsc_calc/main.nf -profile singularity -c ../codon.config --input \
../hgdp_samplesheet.csv --target_build GRCh38 --pgs_id PGS001229 --max_cpus 4 \
--max_memory 128.GB --run_ancestry \
/nfs/production/parkinso/spot/intervene/pgsc_1000G_v1.tar.zst -resume
1.3 Scoring file metadata
Code
# Parse the scoring-file log into a nested R list. simplifyVector = FALSE stops
# jsonlite from collapsing JSON arrays/objects into atomic vectors or data
# frames, so every entry keeps the same list shape.
json_list <- jsonlite::fromJSON(
  txt = params$log_scorefiles,
  simplifyVector = FALSE
)
# Build HTML links to the EFO ontology for the mapped traits of one score.
#
# Arguments:
#   trait_efo  EFO identifiers for one scoring file. May be empty, or the ""
#              placeholder that extract_chr_handle_null() substitutes when the
#              field is absent from the header.
#   mapped     Display labels, parallel to trait_efo.
#
# Returns a character vector: one <a> element per trait; the plain labels when
# there is no EFO identifier to link to; "" when there is nothing to show.
link_traits <- function(trait_efo, mapped) {
  if (length(trait_efo) == 0) {
    return("")
  }

  # A missing trait_efo field arrives here as "" (length 1), not as an empty
  # vector. Linking it would produce an anchor pointing at the bare
  # ".../efo/" URL, so fall back to the unlinked labels instead.
  if (identical(trait_efo, "")) {
    labels <- as.character(unlist(mapped))
    if (length(labels) == 0) {
      return("")
    }
    return(labels)
  }

  purrr::map2_chr(
    trait_efo,
    mapped,
    ~ stringr::str_glue('<a href="http://www.ebi.ac.uk/efo/{.x}">{.y}</a>')
  )
}
# Render the trait cell (reported trait plus linked mapped traits) for every
# scoring-file entry in the parsed log.
#
# Arguments:
#   x  List of scoring-file entries.
#
# Returns a character vector of HTML fragments, one per entry of x.
extract_traits <- function(x) {
  # Pull one header field out of every entry ("" where the field is missing).
  header_field <- function(field) {
    purrr::map(x, function(entry) extract_chr_handle_null(entry$header, field))
  }

  linked_traits <- purrr::map2(
    header_field("trait_efo"),
    header_field("trait_mapped"),
    link_traits
  )
  mapped_html <- purrr::map_chr(
    linked_traits,
    function(links) paste(links, collapse = "<br />")
  )

  # NOTE(review): trait_reported is read from the entry itself, not from
  # entry$header like the other fields - confirm this matches the log layout.
  reported <- purrr::map(
    x,
    function(entry) extract_chr_handle_null(entry, "trait_reported")
  )

  purrr::map2_chr(reported, mapped_html, function(reported_trait, mapped_links) {
    stringr::str_glue("<u>Reported trait:</u> {reported_trait} <br /> <u>Mapped trait(s):</u> {mapped_links}")
  })
}
# Fetch x[[field]], substituting "" when the field is absent (NULL).
#
# Arguments:
#   x      A list (or NULL) to look the field up in.
#   field  Name of the element to extract.
#
# Returns the stored value, or "" when x[[field]] is NULL.
extract_chr_handle_null <- function(x, field) {
  value <- x[[field]]
  # Same sub-assignment that replace() performs: it only writes "" when the
  # lookup came back NULL.
  value[is.null(value)] <- ""
  value
}
# Wrap a PGS Catalog identifier in a link to its catalog page.
#
# Arguments:
#   id         A single identifier string; "" is passed through untouched.
#   link_type  Path segment placed between the site root and the identifier
#              (the callers in this report use "score" and "publication").
#
# Returns the HTML anchor for id, or id itself when it is "".
link_pgscatalog <- function(id, link_type) {
  if (id == "") {
    return(id)
  }
  stringr::str_glue('<a href="https://www.pgscatalog.org/{link_type}/{id}">{id}</a>')
}
# Append a small-print note on a new line beneath an identifier.
#
# Arguments:
#   id    A single string (possibly already HTML); "" is passed through.
#   note  Text shown inside <small> below the identifier.
#
# Returns the combined HTML fragment, or id itself when it is "".
add_note <- function(id, note) {
  if (id == "") {
    return(id)
  }
  stringr::str_glue("{id} <br /> <small>{note}</small>")
}
# Format the original and harmonised genome builds as one HTML fragment.
#
# Arguments:
#   original_build    Build string to show after "Original build:".
#   harmonised_build  Build string to show after "Harmonised build:".
#
# Returns the interpolated HTML string.
annotate_genome_build <- function(original_build, harmonised_build) {
  # str_glue() resolves the {placeholders} against this function's arguments.
  template <- "<u>Original build:</u> {original_build} <br /> <u>Harmonised build:</u> {harmonised_build}"
  stringr::str_glue(template)
}
# Build the scoring-file metadata table: one row per entry of json_list, with
# identifiers turned into PGS Catalog links and annotated for display.
scorefile_metadata <- tibble(
  # raw fields pulled from the parsed log ("" where a header field is missing)
  pgs_id = map_chr(json_list, "pgs_id"),
  pgs_name = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "pgs_name")),
  pgp_id = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "pgp_id")),
  citation = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "citation")),
  trait_display = extract_traits(json_list),
  genome_build = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "genome_build")),
  harmonised_build = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "HmPOS_build")),
  n_variants = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "variants_number")),
  compatible_effect_type = map_lgl(json_list, "compatible_effect_type"),
  has_complex_alleles = map_lgl(json_list, "has_complex_alleles")
) %>%
  mutate(
    # link each identifier to the PGS Catalog, then attach its note
    # (citation for publications, score name for scores)
    pgp_id = purrr::map2_chr(
      pgp_id, citation,
      ~ add_note(link_pgscatalog(.x, "publication"), .y)
    ),
    pgs_id = purrr::map2_chr(
      pgs_id, pgs_name,
      ~ add_note(link_pgscatalog(.x, "score"), .y)
    ),
    # show original and harmonised builds together in a single cell
    genome_build = purrr::map2_chr(
      genome_build, harmonised_build,
      annotate_genome_build
    )
  ) %>%
  # keep only the columns that are displayed
  select(
    pgs_id, pgp_id, trait_display, n_variants, genome_build,
    has_complex_alleles, compatible_effect_type
  )

- Some scoring files contain complex alleles (e.g. APOE / HLA / CYP)
- These variants are excluded from the PGS calculation in the current version
- Please check Appendix A - Curation of PGS including complex alleles for more detailed information
2. Variant matching
2.1 Parameters
Code
cat params.txt
keep_multiallelic: false
keep_ambiguous : false
min_overlap : 0.75
2.2 Reference matching summary
2.3 Summary
Code
# Summarise variant matching per sampleset and scoring file: collapse every
# non-"matched" status into "unmatched", total the variant counts, convert
# them to percentages and spread matched/unmatched into separate columns.
compat <- log_df %>%
  mutate(
    match_status = forcats::fct_collapse(
      match_status,
      matched = "matched",
      other_level = "unmatched"
    )
  ) %>%
  group_by(sampleset, accession, match_status, score_pass) %>%
  # weighted count: sums the existing `count` column into `n`
  count(wt = count) %>%
  group_by(sampleset, accession) %>%
  mutate(
    percent = round(n / sum(n) * 100, 1),
    n_variants = sum(n)
  ) %>%
  arrange(accession, desc(percent)) %>%
  tidyr::pivot_wider(
    names_from = match_status,
    values_from = c(n, percent)
  ) %>%
  # cells left NA by the pivot become 0
  replace(., is.na(.), 0)
Code
# pivot_wider() only creates columns for match statuses that actually occur.
# If every variant matched there is no n_unmatched column; if none matched
# there is no n_matched / percent_matched column. The summary table below
# selects all three, so back-fill any that are absent with 0.
for (missing_col in setdiff(c("n_matched", "n_unmatched", "percent_matched"),
                            colnames(compat))) {
  compat <- compat %>%
    mutate(!!missing_col := 0)
}
# Interactive matching summary: one row per sampleset / scoring file, with a
# CSV export button and the scoring-file cell coloured by pass/fail status.
compat %>%
  select(
    sampleset, accession, n_variants, score_pass,
    percent_matched, n_matched, n_unmatched
  ) %>%
  mutate(score_pass = as.logical(score_pass)) %>%
  DT::datatable(
    rownames = FALSE,
    extensions = "Buttons",
    options = list(dom = "Bfrtip", buttons = c("csv")),
    # display name = underlying column
    colnames = c(
      "Sampleset" = "sampleset",
      "Scoring file" = "accession",
      "Number of variants" = "n_variants",
      "Passed matching" = "score_pass",
      "Match %" = "percent_matched",
      "Total matched" = "n_matched",
      "Total unmatched" = "n_unmatched"
    )
  ) %>%
  # colour the "Scoring file" cell from the value of "Passed matching"
  DT::formatStyle(
    "Scoring file",
    valueColumns = "Passed matching",
    backgroundColor = DT::styleEqual(
      c(FALSE, TRUE),
      c("#c2a5cf", "#a6dba0")
    )
  )
- Low variant overlap or a high proportion of excluded variants may indicate differences in genome build, imputation quality, or allele coding.
- Scores with limited variant matching are likely to perform poorly and should be interpreted cautiously.
2.4 Detailed log
- Strand-ambiguous or multi-allelic variants increase uncertainty in scoring accuracy.
- Review variant counts and matching types to confirm the score is compatible with your dataset.
3. Genetic ancestry similarity
3.1 Principal Component Analysis table
# A tibble: 6 × 23
sampleset FID IID PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 hgdp HGDP00… HGDP… -19.6 -31.0 -19.0 -2.26 0.542 -5.35 -1.35 -1.44
2 hgdp HGDP00… HGDP… -19.7 -29.4 -20.5 -1.40 -0.253 -6.91 0.199 -1.30
3 hgdp HGDP00… HGDP… -19.3 -29.4 -18.9 -4.45 2.25 -4.99 -2.08 -0.479
4 hgdp HGDP00… HGDP… -19.0 -31.4 -22.4 0.0416 0.433 -6.46 -1.50 0.689
5 hgdp HGDP00… HGDP… -18.5 -30.3 -20.9 -0.652 1.12 -5.74 -0.0217 0.804
6 hgdp HGDP00… HGDP… -20.0 -30.7 -22.6 -0.545 -0.0196 -6.03 0.876 -2.15
# ℹ 12 more variables: PC9 <dbl>, PC10 <dbl>, Unrelated <lgl>, RF_P_AFR <dbl>,
# RF_P_AMR <dbl>, RF_P_EAS <dbl>, RF_P_EUR <dbl>, RF_P_SAS <dbl>,
# MostSimilarPop <chr>, MostSimilarPop_LowConfidence <lgl>, REFERENCE <lgl>,
# SuperPop <chr>
3.2 Principal Component Analysis plots
- This plot shows how target samples relate to reference populations used for ancestry adjustment.
- Samples located far from any reference group may have uncertain ancestry assignments and should be reviewed.
3.3 Population similarity summary
- Population proportions provide context for ancestry-aware scoring.
4. Scores
- All requested scores were calculated successfully
1 score for 3504 samples processed.
4.1 Score data
4.1.1 Density plot(s)
The summary density plots show up to six scoring files
- These distributions show variation of calculated scores across individuals.
- Unusually narrow or multimodal distributions may reflect technical issues or population structure rather than biological signals.
4.2 Get all scores
All scores can be found in the results directory, at:
hgdp/score/hgdp_pgs.txt.gz
- Polygenic scores represent relative, probabilistic estimates of genetic predisposition.
- They should be interpreted within the context of ancestry, phenotype definition, and sample characteristics, not as absolute risk predictions.
5. Citation
Samuel A. Lambert, Benjamin Wingfield, Joel T. Gibson, Laurent Gil, Santhi Ramachandran, Florent Yvon, Shirin Saverimuttu, Emily Tinsley, Elizabeth Lewis, Scott C. Ritchie, Jingqin Wu, Rodrigo Canovas, Aoife McMahon, Laura W. Harris, Helen Parkinson, Michael Inouye. Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization. Nature Genetics (2024) | doi: 10.1038/s41588-024-01937-x
For scores from the PGS Catalog, please remember to cite the original publications from which they came (these are listed in the metadata table).
6. Score licenses
- Scores deposited in the PGS Catalog may have specific license terms
- It’s important to follow the license terms when you reuse scoring files
- Please check below for a summary of license terms
- License terms for custom scoring files aren’t reported here, please check how the creators of the scoring file licensed their data
Code
# Licence summary table for PGS Catalog scores.
# as of 2023-12-12 only non-default licenses are recorded in the scoring file
# header, so an empty license field is shown as the default EBI terms below.
default_ebi_terms <- "PGS obtained from the Catalog should be cited appropriately, and used in accordance with any licensing restrictions set by the authors. See EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional details."

tibble(
  pgs_id = map_chr(json_list, "pgs_id"),
  license_text = map_chr(
    json_list,
    ~ extract_chr_handle_null(.x$header, "license")
  )
) %>%
  # display license terms for files in the PGS Catalog only (with a PGS ID)
  filter(startsWith(pgs_id, "PGS")) %>%
  mutate(
    license_text = ifelse(license_text == "", default_ebi_terms, license_text)
  ) %>%
  DT::datatable(
    colnames = c(
      "PGS ID" = "pgs_id",
      "License text" = "license_text"
    )
  )
Code
message(stringr::str_glue("End of report, finished at {Sys.time()}"))
End of report, finished at 2025-12-04 14:07:33.42629