This generates an interaction_model
object. If you are comfortable thinking about matrices, you can think of this as a matrix-like object.
Usage
make_interaction_model(
.data,
formula,
duplicates = "add",
parse_text = FALSE,
dropNA = TRUE,
data_prefix = NULL,
...
)
Arguments
- .data
a tibble that contains the variables in the formula. The only exception is the left-hand side of the formula, which can be 1; in that case it does not need to be in .data.
- formula
a formula, like outcome ~ row_id * (measurement_type & context).
- ...
additional arguments passed to
tidytext::unnest_tokens
Value
a list with four elements. First, the interaction_tibble, akin to a sparse matrix in triplet form. Second, row_universe, which is akin to the row names of a matrix A, but in a tidy form. Third, column_universe, which is like row_universe but for the columns. Fourth, some settings.
Examples
library(nycflights13)
im = make_interaction_model(flights,~(month & day)*dest)
names(im)
#> [1] "interaction_tibble" "row_universe" "column_universe"
#> [4] "settings"
im$row_universe
#> # A tibble: 365 × 4
#> month day n row_num
#> <int> <int> <int> <int>
#> 1 11 27 1014 1
#> 2 7 11 1006 2
#> 3 7 8 1004 3
#> 4 7 10 1004 4
#> 5 12 2 1004 5
#> 6 7 18 1003 6
#> 7 7 25 1003 7
#> 8 7 12 1002 8
#> 9 7 9 1001 9
#> 10 7 17 1001 10
#> # ℹ 355 more rows
im$column_universe
#> # A tibble: 105 × 3
#> dest n col_num
#> <chr> <int> <int>
#> 1 ORD 17283 1
#> 2 ATL 17215 2
#> 3 LAX 16174 3
#> 4 BOS 15508 4
#> 5 MCO 14082 5
#> 6 CLT 14064 6
#> 7 SFO 13331 7
#> 8 FLL 12055 8
#> 9 MIA 11728 9
#> 10 DCA 9705 10
#> # ℹ 95 more rows
im$interaction_tibble
#> # A tibble: 31,229 × 3
#> row_num col_num outcome
#> <int> <int> <dbl>
#> 1 1 1 52
#> 2 1 2 51
#> 3 1 3 49
#> 4 1 4 43
#> 5 1 5 40
#> 6 1 6 42
#> 7 1 7 43
#> 8 1 8 38
#> 9 1 9 37
#> 10 1 10 28
#> # ℹ 31,219 more rows
im$settings
#> $fo
#> 1 ~ (month & day) * dest
#> <environment: 0x7f9ba08a5ce0>
#>
#> $data_prefix
#> NULL
#>
#> $outcome_aggregation
#> [1] "count"
#>
#> $outcome_variables
#> [1] "outcome_unweighted_1"
#>
#> $row_variables
#> [1] "month" "day"
#>
#> $column_variables
#> [1] "dest"
#>
# you can extract the sparse Matrix:
A = longpca:::get_Matrix(im, import_names = TRUE)
str(A)
#> Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
#> ..@ i : int [1:31229] 0 1 2 3 4 5 6 7 8 9 ...
#> ..@ p : int [1:106] 0 365 730 1095 1460 1825 2190 2555 2920 3285 ...
#> ..@ Dim : int [1:2] 365 105
#> ..@ Dimnames:List of 2
#> .. ..$ : chr [1:365] "27/11" "11/7" "8/7" "10/7" ...
#> .. ..$ : chr [1:105] "ORD" "ATL" "LAX" "BOS" ...
#> ..@ x : num [1:31229] 52 55 55 55 49 54 55 55 54 55 ...
#> ..@ factors : list()
im = make_interaction_model(all_packages, ~Package*Imports, parse_text = TRUE)
names(im)
#> [1] "interaction_tibble" "row_universe" "column_universe"
#> [4] "settings"
im$row_universe
#> # A tibble: 20,319 × 3
#> Package n row_num
#> <chr> <int> <int>
#> 1 Seurat 64 1
#> 2 tidyverse 60 2
#> 3 radiant.data 58 3
#> 4 radiant.model 58 4
#> 5 SSDM 55 5
#> 6 BasketballAnalyzeR 53 6
#> 7 tRigon 49 7
#> 8 AFM 48 8
#> 9 dextergui 48 9
#> 10 proteus 48 10
#> # ℹ 20,309 more rows
im$column_universe
#> # A tibble: 6,230 × 4
#> from_text token n col_num
#> <chr> <chr> <int> <int>
#> 1 Imports stats 5442 1
#> 2 Imports utils 3423 2
#> 3 Imports dplyr 3299 3
#> 4 Imports methods 3210 4
#> 5 Imports ggplot2 3135 5
#> 6 Imports rcpp 2548 6
#> 7 Imports rlang 2172 7
#> 8 Imports graphics 2158 8
#> 9 Imports magrittr 1954 9
#> 10 Imports stringr 1698 10
#> # ℹ 6,220 more rows
im$interaction_tibble
#> # A tibble: 114,833 × 3
#> row_num col_num outcome
#> <int> <int> <dbl>
#> 1 1 1 1
#> 2 1 2 1
#> 3 1 5 1
#> 4 1 6 1
#> 5 1 7 1
#> 6 1 8 1
#> 7 1 12 1
#> 8 1 13 1
#> 9 1 14 1
#> 10 1 15 1
#> # ℹ 114,823 more rows
im$settings
#> $fo
#> 1 ~ Package * Imports
#> <environment: 0x7f9bb01affa8>
#>
#> $data_prefix
#> [1] "text"
#>
#> $outcome_aggregation
#> [1] "count"
#>
#> $outcome_variables
#> [1] "outcome_unweighted_1"
#>
#> $row_variables
#> [1] "Package"
#>
#> $column_variables
#> [1] "from_text" "token"
#>
# with text, there are often a great number of weakly connected words (words that appear once).
# you can remove words that appear fewer than 10 times (and documents that have fewer than 10 words) via:
core(im, core_threshold = 10)
#> [1] "adding graph summaries (coreness and connected components)."
#> $row_universe
#> # A tibble: 3,058 × 5
#> Package n coreness component_label row_num
#> <chr> <int> <dbl> <dbl> <int>
#> 1 Seurat 64 16 1 1
#> 2 tidyverse 60 16 1 2
#> 3 radiant.data 58 16 1 3
#> 4 radiant.model 58 16 1 4
#> 5 SSDM 55 16 1 5
#> 6 BasketballAnalyzeR 53 16 1 6
#> 7 tRigon 49 16 1 7
#> 8 AFM 48 16 1 8
#> 9 dextergui 48 16 1 9
#> 10 proteus 48 16 1 10
#> # ℹ 3,048 more rows
#>
#> $column_universe
#> # A tibble: 666 × 6
#> from_text token n coreness component_label col_num
#> <chr> <chr> <int> <dbl> <dbl> <int>
#> 1 Imports stats 5442 16 1 1
#> 2 Imports utils 3423 16 1 2
#> 3 Imports dplyr 3299 16 1 3
#> 4 Imports methods 3210 16 1 4
#> 5 Imports ggplot2 3135 16 1 5
#> 6 Imports rcpp 2548 16 1 6
#> 7 Imports rlang 2172 16 1 7
#> 8 Imports graphics 2158 16 1 8
#> 9 Imports magrittr 1954 16 1 9
#> 10 Imports stringr 1698 16 1 10
#> # ℹ 656 more rows
#>
# core retains the k-core of the "largest connected component" in the bipartite graph between rows and columns.
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
# You can provide more than one text column to make_interaction_model:
im_text = make_interaction_model(top_packages, ~Package*(Title & Description), parse_text= TRUE)
im_text$column_universe |> arrange(desc(n))
#> # A tibble: 9,800 × 4
#> from_text token n col_num
#> <chr> <chr> <int> <int>
#> 1 Description and 2571 1
#> 2 Description the 1999 2
#> 3 Description of 1388 3
#> 4 Description for 1350 4
#> 5 Description to 1104 5
#> 6 Description a 1013 6
#> 7 Description in 722 7
#> 8 Description functions 581 8
#> 9 Description data 541 9
#> 10 Description package 515 10
#> # ℹ 9,790 more rows
# remove stop words by removing them from the column_universe,
# then use the function subset_im to renumber the columns/rows and remove any lines from interaction_tibble
im_text$column_universe = im_text$column_universe |> anti_join(tidytext::stop_words, by = c("token"="word"))
im_text = im_text |> subset_im()
im_text$column_universe |> arrange(desc(n))
#> # A tibble: 9,214 × 4
#> from_text token n col_num
#> <chr> <chr> <int> <int>
#> 1 Description functions 581 1
#> 2 Description data 541 2
#> 3 Description package 515 3
#> 4 Description models 319 4
#> 5 Description doi 285 5
#> 6 Description methods 226 6
#> 7 Description analysis 218 7
#> 8 Description function 164 8
#> 9 Description model 155 9
#> 10 Description based 142 10
#> # ℹ 9,204 more rows