Fits a decision tree in one data set and tests the performance in another

cross_validate(
  train,
  test,
  cluster,
  genes_use = Seurat::VariableFeatures(train),
  warn.gene.removal = TRUE,
  ...
)

Arguments

train	a Seurat object to be used for training.
test	another Seurat object to be used for testing.
cluster	the cluster whose equivalence needs to be found.
genes_use	character vector specifying which genes to use for the classification, defaults to Seurat::VariableFeatures(train)
warn.gene.removal	logical indicating whether to warn the user when genes are removed because they are missing in one of the datasets. Defaults to TRUE.
...	additional arguments to be passed to ranger_importances.Seurat

Value

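a list containing (1) the tree fit (party_fit), (2) a confusion matrix of predicted versus actual clusters in the test data (confusion_matrix), (3) the consensus rules of the tree (concensus_rules), (4) the ranger significance table (ranger_significance_table), and (5) the suggested genes for the gating (gating_genes)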

Examples

cross_validate(small_5050_mix, small_9901_mix, cluster = "0")
#> Warning: Only few negative importance values found, inaccurate p-values. Consider the 'altmann' approach.
#> 
#> This can be done by setting the argument 'imp_method' to 'altmann', note that this method is extremely computationally intensive.
#> 
#> This warning can be disabled by setting the argument `warn.imp.method` to `FALSE`
#> 
#> For more information please refer to ?ranger::ranger
#> Warning: Some important genes were removed because they are not present in the test dataset. 
#> Removed genes: ASNS, CD3D, HEY1, ADA, XIST, CDKN2A, RPL26, TSC22D3, CA2, AIF1, GAL, MDK, MAP1A, PSMB9, TSTD1, FAM127B, ID2, DMKN, PSMB8, KRT18, HOXA9, ZNF503, MAP1B, PYGL, HSPA1B, CSRP2
#> $party_fit
#> 
#> Model formula:
#> ident ~ ARHGDIB + TMSB4X + MZB1 + SOX4 + FYB + CD1E + UBE2C + 
#>     HIST1H1E + ITGA4 + CDK1 + ITM2A + CHI3L2 + HIST1H4C + MALAT1 + 
#>     JUN + CXCR4 + DDIT4 + HIST1H2BK
#> 
#> Fitted party:
#> [1] root
#> |   [2] ARHGDIB <= 2.29381
#> |   |   [3] SOX4 <= 2.74241: not clus 0 (n = 90, err = 27.8%)
#> |   |   [4] SOX4 > 2.74241: clus 0 (n = 8, err = 0.0%)
#> |   [5] ARHGDIB > 2.29381
#> |   |   [6] DDIT4 <= 2.13797: clus 0 (n = 148, err = 4.1%)
#> |   |   [7] DDIT4 > 2.13797: clus 0 (n = 9, err = 44.4%)
#> 
#> Number of inner nodes:    3
#> Number of terminal nodes: 4
#> 
#> $confusion_matrix
#>             cluster
#> predicted      0   1   2
#>   clus 0     263  98   8
#>   not clus 0   0   0  15
#> 
#> $concensus_rules
#> Cluster-clus 0: 
#> 	all elements:
#> 		ARHGDIB +
#> 	majority elements:
#> 		DDIT4 -
#> Cluster-not clus 0: 
#> 	all elements:
#> 		ARHGDIB -
#> 		SOX4 -
#> 
#> $ranger_significance_table
#>           importance      pvalue      gene
#> ASNS       6.1981030 0.000000000      ASNS
#> ARHGDIB    4.6568825 0.000000000   ARHGDIB
#> CD3D       4.5486561 0.000000000      CD3D
#> TMSB4X     3.8594044 0.000000000    TMSB4X
#> HEY1       3.4640647 0.000000000      HEY1
#> ADA        2.7639487 0.000000000       ADA
#> MZB1       2.5351154 0.000000000      MZB1
#> XIST       2.1050942 0.000000000      XIST
#> CDKN2A     2.0906554 0.000000000    CDKN2A
#> RPL26      2.0897659 0.000000000     RPL26
#> SOX4       2.0034795 0.000000000      SOX4
#> TSC22D3    1.9670025 0.000000000   TSC22D3
#> CA2        1.9547972 0.000000000       CA2
#> FYB        1.6685485 0.000000000       FYB
#> AIF1       1.5561841 0.000000000      AIF1
#> GAL        1.1628424 0.000000000       GAL
#> MDK        1.0391597 0.000000000       MDK
#> CD1E       0.9016758 0.000000000      CD1E
#> UBE2C      0.7935225 0.009433962     UBE2C
#> MAP1A      0.7925588 0.009433962     MAP1A
#> PSMB9      0.6315636 0.009433962     PSMB9
#> TSTD1      0.5575521 0.009433962     TSTD1
#> HIST1H1E   0.5060435 0.009433962  HIST1H1E
#> FAM127B    0.5018757 0.009433962   FAM127B
#> ID2        0.4493514 0.009433962       ID2
#> DMKN       0.4488922 0.009433962      DMKN
#> PSMB8      0.4441328 0.009433962     PSMB8
#> ITGA4      0.4201880 0.009433962     ITGA4
#> CDK1       0.4042143 0.009433962      CDK1
#> ITM2A      0.3883246 0.009433962     ITM2A
#> KRT18      0.3738098 0.009433962     KRT18
#> CHI3L2     0.3534820 0.009433962    CHI3L2
#> HIST1H4C   0.3438604 0.009433962  HIST1H4C
#> HOXA9      0.3198131 0.009433962     HOXA9
#> ZNF503     0.3125776 0.009433962    ZNF503
#> MALAT1     0.3075915 0.009433962    MALAT1
#> JUN        0.3029134 0.009433962       JUN
#> MAP1B      0.3000326 0.009433962     MAP1B
#> PYGL       0.2815934 0.018867925      PYGL
#> CXCR4      0.2759808 0.018867925     CXCR4
#> DDIT4      0.2183946 0.028301887     DDIT4
#> HSPA1B     0.2162486 0.028301887    HSPA1B
#> HIST1H2BK  0.2082975 0.037735849 HIST1H2BK
#> CSRP2      0.2003424 0.037735849     CSRP2
#> 
#> $gating_genes
#> [1] "ARHGDIB" "SOX4"    "DDIT4"  
#> 
cross_validate(small_5050_mix, small_9901_mix, cluster = "ALL")
#> Warning: Only few negative importance values found, inaccurate p-values. Consider the 'altmann' approach.
#> 
#> This can be done by setting the argument 'imp_method' to 'altmann', note that this method is extremely computationally intensive.
#> 
#> This warning can be disabled by setting the argument `warn.imp.method` to `FALSE`
#> 
#> For more information please refer to ?ranger::ranger
#> Warning: Some important genes were removed because they are not present in the test dataset. 
#> Removed genes: ASNS, CD3D, ADA, HEY1, TSC22D3, XIST, MAP1A, RPL26, CA2, AIF1, CDKN2A, FAM127B, GAL, CSRP2, PSMB8, TSTD1, MDK, HOXA9, ZNF503, DMKN, ID2, CTC1, IFI16, KRT18, PYGL, PSMB9, MAP1B, SPRED2, RNF138, LNP1
#> $party_fit
#> 
#> Model formula:
#> ident ~ TMSB4X + ARHGDIB + MZB1 + SOX4 + FYB + UBE2C + CD1E + 
#>     ITM2A + HIST1H1E + HIST1H4C + DDIT4 + CHI3L2 + JUN + CDK1 + 
#>     ITGA4 + CXCR4 + HIST1H1C + MYC
#> 
#> Fitted party:
#> [1] root
#> |   [2] ARHGDIB <= 2.29381
#> |   |   [3] SOX4 <= 2.74241: 1 (n = 90, err = 27.8%)
#> |   |   [4] SOX4 > 2.74241: 0 (n = 8, err = 0.0%)
#> |   [5] ARHGDIB > 2.29381
#> |   |   [6] DDIT4 <= 2.13797: 0 (n = 148, err = 4.1%)
#> |   |   [7] DDIT4 > 2.13797: 0 (n = 9, err = 44.4%)
#> 
#> Number of inner nodes:    3
#> Number of terminal nodes: 4
#> 
#> $confusion_matrix
#>          cluster
#> predicted   0   1   2
#>         0 263  98   8
#>         1   0   0  15
#> 
#> $concensus_rules
#> Cluster-0: 
#> 	all elements:
#> 		ARHGDIB +
#> 	majority elements:
#> 		DDIT4 -
#> Cluster-1: 
#> 	all elements:
#> 		ARHGDIB -
#> 		SOX4 -
#> 
#> $ranger_significance_table
#>          importance     pvalue     gene
#> ASNS      5.0523069 0.00000000     ASNS
#> CD3D      4.5759691 0.00000000     CD3D
#> TMSB4X    4.3567501 0.00000000   TMSB4X
#> ARHGDIB   4.3289627 0.00000000  ARHGDIB
#> MZB1      3.5292376 0.00000000     MZB1
#> ADA       3.0016297 0.00000000      ADA
#> HEY1      2.7798605 0.00000000     HEY1
#> TSC22D3   2.3415148 0.00000000  TSC22D3
#> XIST      2.0798456 0.00000000     XIST
#> SOX4      1.8714852 0.00000000     SOX4
#> MAP1A     1.6867208 0.00000000    MAP1A
#> RPL26     1.6366171 0.00000000    RPL26
#> CA2       1.4931733 0.00000000      CA2
#> AIF1      1.4917225 0.00000000     AIF1
#> FYB       1.4892930 0.00000000      FYB
#> CDKN2A    1.4302791 0.00000000   CDKN2A
#> UBE2C     1.1031831 0.00000000    UBE2C
#> CD1E      0.9464429 0.00000000     CD1E
#> FAM127B   0.9218146 0.00000000  FAM127B
#> GAL       0.7591950 0.00000000      GAL
#> ITM2A     0.6595954 0.00000000    ITM2A
#> HIST1H1E  0.6520071 0.00000000 HIST1H1E
#> CSRP2     0.5320566 0.00000000    CSRP2
#> HIST1H4C  0.4876504 0.01020408 HIST1H4C
#> PSMB8     0.4840830 0.01020408    PSMB8
#> DDIT4     0.3981723 0.01020408    DDIT4
#> TSTD1     0.3894432 0.01020408    TSTD1
#> MDK       0.3768194 0.01020408      MDK
#> HOXA9     0.3689784 0.01020408    HOXA9
#> ZNF503    0.3525577 0.01020408   ZNF503
#> CHI3L2    0.3054693 0.01020408   CHI3L2
#> DMKN      0.2965305 0.01020408     DMKN
#> JUN       0.2865324 0.01020408      JUN
#> CDK1      0.2831493 0.01020408     CDK1
#> ID2       0.2830716 0.01020408      ID2
#> CTC1      0.2813576 0.01020408     CTC1
#> IFI16     0.2710781 0.01020408    IFI16
#> KRT18     0.2704436 0.01020408    KRT18
#> ITGA4     0.2689324 0.01020408    ITGA4
#> PYGL      0.2584941 0.01020408     PYGL
#> PSMB9     0.2523764 0.01020408    PSMB9
#> MAP1B     0.2228242 0.04081633    MAP1B
#> CXCR4     0.2227893 0.04081633    CXCR4
#> HIST1H1C  0.2195790 0.04081633 HIST1H1C
#> MYC       0.2099993 0.04081633      MYC
#> SPRED2    0.1998300 0.04081633   SPRED2
#> RNF138    0.1982636 0.04081633   RNF138
#> LNP1      0.1882561 0.04081633     LNP1
#> 
#> $gating_genes
#> [1] "ARHGDIB" "SOX4"    "DDIT4"  
#>

Fits a decision tree in one data set and tests the performance in another

Arguments

Value

Examples

Contents