Skip to contents

This generates an interaction_model object. If you are comfortable thinking about matrices, you can think of this as a matrix-like object.

Usage

make_interaction_model(
  .data,
  formula,
  duplicates = "add",
  parse_text = FALSE,
  dropNA = TRUE,
  data_prefix = NULL,
  ...
)

Arguments

.data

a tibble that contains the variables in the formula. The only exception is that the left-hand-side can be 1 and this does not need to be in .data.

formula

a formula, like outcome ~ row_id * (measurement_type & context).

...

additional arguments passed to tidytext::unnest_tokens

Value

a list with four elements. First, the interaction_tibble, akin to a sparse matrix in triplet form. Second, row_universe, which is akin to the row names of that matrix, but in a tidy form. Third, column_universe, which is like row_universe but for the columns. Fourth, some settings.

Examples

library(nycflights13)
im = make_interaction_model(flights,~(month & day)*dest)
names(im)
#> [1] "interaction_tibble" "row_universe"       "column_universe"   
#> [4] "settings"          
im$row_universe
#> # A tibble: 365 × 4
#>    month   day     n row_num
#>    <int> <int> <int>   <int>
#>  1    11    27  1014       1
#>  2     7    11  1006       2
#>  3     7     8  1004       3
#>  4     7    10  1004       4
#>  5    12     2  1004       5
#>  6     7    18  1003       6
#>  7     7    25  1003       7
#>  8     7    12  1002       8
#>  9     7     9  1001       9
#> 10     7    17  1001      10
#> # ℹ 355 more rows
im$column_universe
#> # A tibble: 105 × 3
#>    dest      n col_num
#>    <chr> <int>   <int>
#>  1 ORD   17283       1
#>  2 ATL   17215       2
#>  3 LAX   16174       3
#>  4 BOS   15508       4
#>  5 MCO   14082       5
#>  6 CLT   14064       6
#>  7 SFO   13331       7
#>  8 FLL   12055       8
#>  9 MIA   11728       9
#> 10 DCA    9705      10
#> # ℹ 95 more rows
im$interaction_tibble
#> # A tibble: 31,229 × 3
#>    row_num col_num outcome
#>      <int>   <int>   <dbl>
#>  1       1       1      52
#>  2       1       2      51
#>  3       1       3      49
#>  4       1       4      43
#>  5       1       5      40
#>  6       1       6      42
#>  7       1       7      43
#>  8       1       8      38
#>  9       1       9      37
#> 10       1      10      28
#> # ℹ 31,219 more rows
im$settings
#> $fo
#> 1 ~ (month & day) * dest
#> <environment: 0x7f9ba08a5ce0>
#> 
#> $data_prefix
#> NULL
#> 
#> $outcome_aggregation
#> [1] "count"
#> 
#> $outcome_variables
#> [1] "outcome_unweighted_1"
#> 
#> $row_variables
#> [1] "month" "day"  
#> 
#> $column_variables
#> [1] "dest"
#> 
# you can extract the sparse Matrix:
A = longpca:::get_Matrix(im,  import_names = TRUE)
str(A)
#> Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
#>   ..@ i       : int [1:31229] 0 1 2 3 4 5 6 7 8 9 ...
#>   ..@ p       : int [1:106] 0 365 730 1095 1460 1825 2190 2555 2920 3285 ...
#>   ..@ Dim     : int [1:2] 365 105
#>   ..@ Dimnames:List of 2
#>   .. ..$ : chr [1:365] "27/11" "11/7" "8/7" "10/7" ...
#>   .. ..$ : chr [1:105] "ORD" "ATL" "LAX" "BOS" ...
#>   ..@ x       : num [1:31229] 52 55 55 55 49 54 55 55 54 55 ...
#>   ..@ factors : list()
im = make_interaction_model(all_packages, ~Package*Imports, parse_text = TRUE)
names(im)
#> [1] "interaction_tibble" "row_universe"       "column_universe"   
#> [4] "settings"          
im$row_universe
#> # A tibble: 20,319 × 3
#>    Package                n row_num
#>    <chr>              <int>   <int>
#>  1 Seurat                64       1
#>  2 tidyverse             60       2
#>  3 radiant.data          58       3
#>  4 radiant.model         58       4
#>  5 SSDM                  55       5
#>  6 BasketballAnalyzeR    53       6
#>  7 tRigon                49       7
#>  8 AFM                   48       8
#>  9 dextergui             48       9
#> 10 proteus               48      10
#> # ℹ 20,309 more rows
im$column_universe
#> # A tibble: 6,230 × 4
#>    from_text token        n col_num
#>    <chr>     <chr>    <int>   <int>
#>  1 Imports   stats     5442       1
#>  2 Imports   utils     3423       2
#>  3 Imports   dplyr     3299       3
#>  4 Imports   methods   3210       4
#>  5 Imports   ggplot2   3135       5
#>  6 Imports   rcpp      2548       6
#>  7 Imports   rlang     2172       7
#>  8 Imports   graphics  2158       8
#>  9 Imports   magrittr  1954       9
#> 10 Imports   stringr   1698      10
#> # ℹ 6,220 more rows
im$interaction_tibble
#> # A tibble: 114,833 × 3
#>    row_num col_num outcome
#>      <int>   <int>   <dbl>
#>  1       1       1       1
#>  2       1       2       1
#>  3       1       5       1
#>  4       1       6       1
#>  5       1       7       1
#>  6       1       8       1
#>  7       1      12       1
#>  8       1      13       1
#>  9       1      14       1
#> 10       1      15       1
#> # ℹ 114,823 more rows
im$settings
#> $fo
#> 1 ~ Package * Imports
#> <environment: 0x7f9bb01affa8>
#> 
#> $data_prefix
#> [1] "text"
#> 
#> $outcome_aggregation
#> [1] "count"
#> 
#> $outcome_variables
#> [1] "outcome_unweighted_1"
#> 
#> $row_variables
#> [1] "Package"
#> 
#> $column_variables
#> [1] "from_text" "token"    
#> 
# With text, there are often a great number of weakly connected words (e.g. words that appear only once).
# You can remove words that appear fewer than 10 times (and documents that have fewer than 10 words) via:
core(im, core_threshold = 10)
#> [1] "adding graph summaries (coreness and connected components)."
#> $row_universe
#> # A tibble: 3,058 × 5
#>    Package                n coreness component_label row_num
#>    <chr>              <int>    <dbl>           <dbl>   <int>
#>  1 Seurat                64       16               1       1
#>  2 tidyverse             60       16               1       2
#>  3 radiant.data          58       16               1       3
#>  4 radiant.model         58       16               1       4
#>  5 SSDM                  55       16               1       5
#>  6 BasketballAnalyzeR    53       16               1       6
#>  7 tRigon                49       16               1       7
#>  8 AFM                   48       16               1       8
#>  9 dextergui             48       16               1       9
#> 10 proteus               48       16               1      10
#> # ℹ 3,048 more rows
#> 
#> $column_universe
#> # A tibble: 666 × 6
#>    from_text token        n coreness component_label col_num
#>    <chr>     <chr>    <int>    <dbl>           <dbl>   <int>
#>  1 Imports   stats     5442       16               1       1
#>  2 Imports   utils     3423       16               1       2
#>  3 Imports   dplyr     3299       16               1       3
#>  4 Imports   methods   3210       16               1       4
#>  5 Imports   ggplot2   3135       16               1       5
#>  6 Imports   rcpp      2548       16               1       6
#>  7 Imports   rlang     2172       16               1       7
#>  8 Imports   graphics  2158       16               1       8
#>  9 Imports   magrittr  1954       16               1       9
#> 10 Imports   stringr   1698       16               1      10
#> # ℹ 656 more rows
#> 
# core retains the k-core of the "largest connected component" in the bipartite graph between rows and columns.

library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
# You can provide more than one text column to make_interaction_model:
im_text = make_interaction_model(top_packages, ~Package*(Title & Description), parse_text= TRUE)
im_text$column_universe |> arrange(desc(n))
#> # A tibble: 9,800 × 4
#>    from_text   token         n col_num
#>    <chr>       <chr>     <int>   <int>
#>  1 Description and        2571       1
#>  2 Description the        1999       2
#>  3 Description of         1388       3
#>  4 Description for        1350       4
#>  5 Description to         1104       5
#>  6 Description a          1013       6
#>  7 Description in          722       7
#>  8 Description functions   581       8
#>  9 Description data        541       9
#> 10 Description package     515      10
#> # ℹ 9,790 more rows
# Remove stop words by removing them from the column_universe,
#  then use the function subset_im to renumber the columns/rows and remove any lines from interaction_tibble that refer to the dropped columns.
im_text$column_universe = im_text$column_universe |> anti_join(tidytext::stop_words, by = c("token"="word"))
im_text = im_text |> subset_im()
im_text$column_universe |> arrange(desc(n))
#> # A tibble: 9,214 × 4
#>    from_text   token         n col_num
#>    <chr>       <chr>     <int>   <int>
#>  1 Description functions   581       1
#>  2 Description data        541       2
#>  3 Description package     515       3
#>  4 Description models      319       4
#>  5 Description doi         285       5
#>  6 Description methods     226       6
#>  7 Description analysis    218       7
#>  8 Description function    164       8
#>  9 Description model       155       9
#> 10 Description based       142      10
#> # ℹ 9,204 more rows