# Fetch the CRAN package index and convert it to a tibble.
utils::available.packages() %>% as_tibble()
Diving into dependen-“sea”
How CRAN packages are interconnected
When writing a package, we may want to use functions from other packages. This creates a dependency for our package and a reverse dependency for the package we borrow functions from. As one of the recipients of the isoband email[1], I’m curious to know how interconnected CRAN packages are. Luckily, it is not too hard to get data on this, and so the journey begins…
Preparing dependency data
The `utils` package provides the function `available.packages()` to extract CRAN package information. The data includes information on the package name, version, dependencies, and license:
code
# A tibble: 18,650 × 17
Package Version Priority Depends Imports LinkingTo Suggests Enhances License
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 A3 1.0.0 <NA> R (>= … <NA> <NA> randomF… <NA> GPL (>…
2 AATtools 0.0.2 <NA> R (>= … magrit… <NA> <NA> <NA> GPL-3
3 ABACUS 1.0.0 <NA> R (>= … ggplot… <NA> rmarkdo… <NA> GPL-3
4 abbrevi… 0.1 <NA> <NA> <NA> <NA> testtha… <NA> GPL-3
5 abbyyR 0.5.5 <NA> R (>= … httr, … <NA> testtha… <NA> MIT + …
6 abc 2.2.1 <NA> R (>= … <NA> <NA> <NA> <NA> GPL (>…
7 abc.data 1.0 <NA> R (>= … <NA> <NA> <NA> <NA> GPL (>…
8 ABC.RAP 0.9.0 <NA> R (>= … graphi… <NA> knitr, … <NA> GPL-3
9 abcADM 1.0 <NA> <NA> Rcpp (… Rcpp, BH <NA> <NA> GPL-3
10 ABCanal… 1.2.1 <NA> R (>= … plotrix <NA> <NA> <NA> GPL-3
# ℹ 18,640 more rows
# ℹ 8 more variables: License_is_FOSS <chr>, License_restricts_use <chr>,
# OS_type <chr>, Archs <chr>, MD5sum <chr>, NeedsCompilation <chr>,
# File <chr>, Repository <chr>
From this, we can extract a table to map out the direct dependency every CRAN package has. In this post we will focus on the two strong dependencies: Depends and Imports:
code
# Split the comma-separated Depends/Imports fields into one package per row,
# then strip version constraints such as "(>= 3.5.0)" and stray whitespace.
all_pkgs <- raw %>%
  tidyr::separate_rows(Imports, sep = ",") %>%
  tidyr::separate_rows(Depends, sep = ",") %>%
  mutate(
    # The pattern matches a literal parenthesised group and has no capture
    # group, so the replacement is simply the empty string.
    across(c(Depends, Imports), ~ str_trim(gsub("\\(.*\\)", "", .x)))
  )

# Long lookup table of direct strong dependencies, one row per
# (downstream package, dependency type, upstream package).
# The outer parentheses print the result in addition to assigning it.
(dep_lookup_tbl <- all_pkgs %>%
  dplyr::select(Package, Depends, Imports) %>%
  rename(downstream = Package) %>%
  pivot_longer(Depends:Imports, names_to = "type", values_to = "upstream") %>%
  distinct() %>%
  # Drop missing fields, empty entries and the dependency on R itself.
  filter(!is.na(upstream), !upstream %in% c("R", "")) %>%
  arrange(downstream))
# A tibble: 96,576 × 3
downstream type upstream
<chr> <chr> <chr>
1 A3 Depends xtable
2 A3 Depends pbapply
3 AATtools Imports magrittr
4 AATtools Imports dplyr
5 AATtools Imports doParallel
6 AATtools Imports foreach
7 ABACUS Imports ggplot2
8 ABACUS Imports shiny
9 ABC.RAP Imports graphics
10 ABC.RAP Imports stats
# ℹ 96,566 more rows
Dependency is a transitive relation: a package also (indirectly) depends on all the dependencies of the packages it imports, and so on. Changes to a package will propagate downwards through its dependency chain. With the direct dependency table above, we can iteratively construct the extended dependency tree:
code
#' Collect every direct and indirect reverse dependency of a package.
#'
#' Walks the global `dep_lookup_tbl` (columns `upstream`, `downstream`)
#' breadth-first, starting from the package's direct reverse dependencies.
#'
#' The walk continues until no new package is discovered. Stopping when the
#' joined table stops growing in row count is not sufficient: a chain in which
#' every package has exactly one reverse dependency never adds rows, so the
#' search would end before the chain does. Tracking the packages already seen
#' also guarantees termination if the lookup table contains a cycle.
#'
#' @param upstream Name of the package whose reverse dependencies are wanted.
#'   Not needed for the search itself; kept so the function can be called via
#'   `map2(upstream, direct_deps, find_all_deps)`.
#' @param data Table of direct reverse dependencies with a `downstream` column.
#' @return A one-column tibble `downstream` holding the distinct, non-missing,
#'   sorted names of all packages that depend on `upstream`.
find_all_deps <- function(upstream, data) {
  seen <- character()
  frontier <- unique(data$downstream[!is.na(data$downstream)])

  while (length(frontier) > 0) {
    seen <- c(seen, frontier)
    # Packages that directly depend on anything in the current frontier.
    found <- dep_lookup_tbl$downstream[dep_lookup_tbl$upstream %in% frontier]
    # setdiff() de-duplicates and drops what has already been visited.
    frontier <- setdiff(found[!is.na(found)], seen)
  }

  tibble(downstream = sort(seen))
}
# For every upstream package, nest its direct reverse dependencies and expand
# them into the full (direct + indirect) set with find_all_deps().
dep_all <- dep_lookup_tbl %>%
  # Negating desc() gives an ascending sort on upstream.
  arrange(-desc(upstream)) %>%
  nest(direct_deps = -upstream) %>%
  mutate(all_deps = map2(upstream, direct_deps, find_all_deps))

# Flatten to one row per (upstream, downstream) pair.
# The outer parentheses print the result in addition to assigning it.
(edges <- dep_all %>%
  select(-direct_deps) %>%
  unnest(all_deps) %>%
  filter(!is.na(upstream), !is.na(downstream)))
# A tibble: 550,306 × 2
upstream downstream
<chr> <chr>
1 a4Core nlcv
2 abc abctools
3 abc EasyABC
4 abc ecolottery
5 abc nlrx
6 abc paleopop
7 abc poems
8 abc.data abc
9 abc.data abctools
10 abc.data EasyABC
# ℹ 550,296 more rows
The plot below shows the number of dependencies and reverse dependencies a package has.
code
# One row per package appearing in `edges`, with its number of reverse
# dependencies (times seen as upstream) and dependencies (times seen as
# downstream).
nodes <- tibble(id = unique(c(edges$upstream, edges$downstream))) %>%
  left_join(
    count(edges, upstream, name = "n_revdep"),
    by = c("id" = "upstream")
  ) %>%
  left_join(
    count(edges, downstream, name = "n_dep"),
    by = c("id" = "downstream")
  ) %>%
  filter(!is.na(id)) %>%
  # A package missing from one side of the join has a count of zero there.
  mutate(
    n_revdep = coalesce(n_revdep, 0L),
    n_dep = coalesce(n_dep, 0L)
  )
################################################################
# deriving color categories

# Packages flagged with Priority "recommended" in the CRAN index.
recommended <- raw %>%
  filter(Priority == "recommended") %>%
  pull(Package)

base <- c(
  "base", "compiler", "datasets", "grDevices", "graphics", "grid", "methods",
  "parallel", "splines", "stats", "stats4", "tcltk", "tools", "translations",
  "utils"
)

# Repository names of the r-lib and tidyverse GitHub organisations.
# NOTE(review): results are capped by `.limit`; confirm 200 / 40 cover all
# repositories of each organisation.
r_lib_gh <- gh("GET /orgs/{username}/repos", username = "r-lib", .limit = 200)
r_lib <- vapply(r_lib_gh, "[[", "", "name")

r_tidyverse_gh <- gh(
  "GET /orgs/{username}/repos",
  username = "tidyverse", .limit = 40
)
tidyverse <- vapply(r_tidyverse_gh, "[[", "", "name")

# The first matching rule wins; everything else falls into "zzz".
nodes <- nodes %>%
  mutate(
    category = case_when(
      id %in% tidyverse ~ "tidyverse",
      id %in% base ~ "base",
      id %in% r_lib ~ "r-lib",
      id %in% recommended ~ "recommended",
      TRUE ~ "zzz"
    )
  )
################################################################
# to deal with the zero mark after the sqrt transform
# https://github.com/tidyverse/ggplot2/issues/980

#' Square-root scale transformation whose inverse clamps negatives to zero.
#'
#' @return A transformation object named "mysqrt", built with
#'   `scales::trans_new()`, with domain `[0, Inf)`.
mysqrt_trans <- function() {
  scales::trans_new(
    "mysqrt",
    transform = base::sqrt,
    # Map negative inputs back to 0 rather than squaring them.
    inverse = function(x) ifelse(x < 0, 0, x^2),
    domain = c(0, Inf)
  )
}
# Scatter plot of dependencies vs. reverse dependencies, one point per package.
# Points carry a tooltip; packages with more than 3100 reverse dependencies
# are labelled and coloured by category.
p <- nodes %>%
  mutate(
    tooltip = glue::glue("Pkg: {id}, dep: {n_dep}, revdep: {n_revdep}")
  ) %>%
  ggplot(aes(x = n_dep, y = n_revdep)) +
  geom_point_interactive(aes(tooltip = tooltip)) +
  ggrepel::geom_text_repel(
    data = nodes %>% filter(n_revdep > 3100),
    aes(color = category, label = id), min.segment.length = 0
  ) +
  scale_color_brewer(palette = "Set1") +
  # NOTE(review): trans = "mysqrt" is expected to resolve to mysqrt_trans()
  # by name; confirm with the installed ggplot2/scales versions.
  scale_y_continuous(
    breaks = c(0, 50, 200, 500, 1000, 2500, 5000, 7500, 10000, 15000),
    trans = "mysqrt"
  ) +
  scale_x_continuous(
    breaks = c(0, 1, 5, 10, 20, 40, 80, 120, 160, 200),
    trans = "mysqrt"
  ) +
  theme(
    panel.grid.minor = element_blank(),
    legend.position = "bottom"
  ) +
  xlab("Number of dependencies") +
  ylab("Number of reverse dependencies")

# Render the ggplot object as an interactive SVG widget.
girafe(ggobj = p, width_svg = 16, height_svg = 12)