Skip to contents
# Create a BrAPI connection to the T3 wheat sandbox
conn <- BrAPI::createBrAPIConnection(
  "wheat-sandbox.triticeaetoolbox.org",
  is_breedbase = TRUE
)

# Convert protocol metadata to a tibble
geno_proto <- conn$wizard("genotyping_protocols")$content[[1]]

extract_db_id <- function(subList){
  c(sapply(subList, FUN=function(v) return(v[[1]])))
}

extract_name <- function(subList){
  c(sapply(subList, FUN=function(v) return(v[[2]])))
}

extract_project_meta <- function(projSubList, name_or_id="name"){
  if ("message" %in% names(projSubList)){
    meta <- c(NULL)
  } else{
    idx <- dplyr::if_else(name_or_id == "name", 2, 1)
    meta <- sapply(projSubList$list, FUN=function(v) return(v[[idx]]))
  }
  return(meta)
}

genotyping_protocols <- tibble::tibble(
  genotyping_protocol_db_id=extract_db_id(geno_proto),
  genotyping_protocol_name=extract_name(geno_proto))

# For each protocol, retrieve genotyping projects
geno_projects <- purrr::map(
  .x=genotyping_protocols$genotyping_protocol_db_id,
  .f=function(pid){
    res <- conn$wizard(data_type="genotyping_projects",
                       filters=list(genotyping_protocols=pid))$content
  },
  .progress=TRUE)

genotyping_protocols <- genotyping_protocols |>
  dplyr::mutate(
    genotyping_project_db_id=purrr::map(geno_projects, extract_project_meta, 
                              name_or_id="id"))

genotyping_protocols <- tidyr::unnest(genotyping_protocols,
                                      genotyping_project_db_id)

genotyping_protocols <- genotyping_protocols |> 
  dplyr::mutate(
    genotyping_project_name=purrr::map(geno_projects, extract_project_meta, 
                             name_or_id="name") |> unlist()) 

# For each project, retrieve accessions
project_accessions <- purrr::map(
  .x=genotyping_protocols$genotyping_project_db_id,
  .f=function(pid){
    res <- conn$wizard(data_type="accessions",
                       filters=list(genotyping_projects=pid))$content[[1]]
  },
  .progress = TRUE)

genotyping_protocols <- genotyping_protocols |>
  dplyr::mutate(
    accessions_db_id=purrr::map(project_accessions, extract_db_id)) |> 
  dplyr::mutate(
    accessions_name=purrr::map(project_accessions, extract_name)) 

# Find pairs of projects that share > x accessions in common
find_projects_overlapping_accessions <- function(accessions, projects, 
                                                 min_overlap=10){
  min_overlap <- min(min_overlap, length(accessions))
  overlapping_projects <- integer()
  for (i in 1:nrow(projects)){
    if (length(intersect(accessions, projects$accessions_db_id[[i]])) >= 
        min_overlap){
      overlapping_projects <- c(overlapping_projects,
                                projects$genotyping_project_db_id[i])
    }
  }
  return(overlapping_projects)
}

genotyping_protocols <- genotyping_protocols |>
  dplyr::mutate(
    overlapping_projects=purrr::map(genotyping_protocols$accessions_db_id, 
                                find_projects_overlapping_accessions,
                                genotyping_protocols))

# Create sets of projects where all projects share > x accessions with at least
# one other project

# Optional output
# readr::write_csv(genotyping_protocols, "archived_vcf_projects.csv")
# saveRDS(genotyping_protocols, "archived_vcf_projects.rds")