# Print the `tekstid` table to inspect its columns.
tekstid %>% print()
## # A tibble: 300 x 10
##    kood  keeletase sonad spikkus lpikkus lemmad yld_kaanded yld_Par S_ains
##    <chr>     <dbl> <dbl>   <dbl>   <dbl>  <dbl>       <dbl>   <dbl>  <dbl>
##  1 A2I_…         1    39    4.87    4.33     31           4  0.128   0.256
##  2 A2I_…         1    72    4.19    6        43           7  0.0417  0.194
##  3 A2I_…         1    38    5.74    6.33     26           8  0.158   0.237
##  4 A2I_…         1    61    5.46    7.62     43           7  0.0656  0.262
##  5 A2I_…         1    75    4.77    6.25     45          10  0.08    0.24 
##  6 A2II…         1    31    5.10    5.17     26           7  0.161   0.452
##  7 A2II…         1    47    4.64    7.83     41           6  0.0213  0.447
##  8 A2II…         1    40    4.78    6.67     32           5  0.1     0.35 
##  9 A2II…         1    43    5       6.14     33           8  0.0465  0.326
## 10 A2II…         1    57    4.60    7.12     42           7  0.0526  0.351
## # … with 290 more rows, and 1 more variable: V_aux <dbl>

Osakaalude asemel on vaja absoluutarve. Ainsuslike nimisõnade arvu leidmiseks korrutan nende osakaalu sõnade arvuga tekstis ja talletan korrutise uues tulbas.

# Turn the singular-noun proportion into an absolute count per text
# (proportion times the number of words in the text).
tekstid <- mutate(tekstid, S_ains_arv = S_ains * sonad)

Leian kõigi sõnade arvu ning ainsuslike nimisõnade arvu keeleoskustasemete kaupa.

# Total number of words per proficiency level.
summarise(group_by(tekstid, keeletase), sonad_sum = sum(sonad))
## # A tibble: 4 x 2
##   keeletase sonad_sum
##       <dbl>     <dbl>
## 1         1      3767
## 2         2      8716
## 3         3     12500
## 4         4     16460
# Total number of singular nouns per proficiency level.
summarise(group_by(tekstid, keeletase), S_ains_sum = sum(S_ains_arv))
## # A tibble: 4 x 2
##   keeletase S_ains_sum
##       <dbl>      <dbl>
## 1         1      1015.
## 2         2      1961.
## 3         3      2448.
## 4         4      3715.

Ainsuslike nimisõnade osakaal B1-tasemega õppijate kirjutistes jääb 95% tõenäosusega vahemikku 21,6% kuni 23,4%. Kui soovin üldistada 99% kindlusega, siis on vahemikuks 21,4% kuni 23,7%.

# 95% confidence interval for the share of singular nouns at level B1
# (1961 singular nouns out of 8716 words).
prop.test(x = 1961, n = 8716)
## 
##  1-sample proportions test with continuity correction
## 
## data:  1961 out of 8716, null probability 0.5
## X-squared = 2635.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2162878 0.2339332
## sample estimates:
##         p 
## 0.2249885
# Same B1 proportion, now with a 99% confidence interval.
prop.test(x = 1961, n = 8716, conf.level = 0.99)
## 
##  1-sample proportions test with continuity correction
## 
## data:  1961 out of 8716, null probability 0.5
## X-squared = 2635.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 99 percent confidence interval:
##  0.2136228 0.2367747
## sample estimates:
##         p 
## 0.2249885

Ainsuslike nimisõnade osakaal B2-tasemega õppijate kirjutistes jääb 99% tõenäosusega vahemikku 18,7% kuni 20,5%.

# 99% confidence interval for the share of singular nouns at level B2
# (2448 singular nouns out of 12500 words).
prop.test(x = 2448, n = 12500, conf.level = 0.99)
## 
##  1-sample proportions test with continuity correction
## 
## data:  2448 out of 12500, null probability 0.5
## X-squared = 4624.4, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 99 percent confidence interval:
##  0.1868202 0.2051839
## sample estimates:
##       p 
## 0.19584

Loon maatriksi B1- ja B2-taseme summadega ning võrdlen ainsuslike nimisõnade osakaalu kahel tasemel. 95% tõenäosusega esineb B1-tasemel 1,8% kuni 4,0% võrra rohkem ainsuslikke nimisõnu. Kui soovin olla üldistatavas vahemikus 99% kindel, siis on erinevus üldkogumis 1,4% kuni 4,4%.

# prop.test() reads a two-column matrix as one ROW per group, with successes in
# column 1 and failures in column 2. Row 1 = B1, row 2 = B2; column 1 = singular
# nouns, column 2 = all other words (total words minus singular nouns).
# The earlier layout c(1961, 8716, 2448, 12500) put the two success counts in
# row 1 and the two totals in row 2, so the test compared meaningless ratios.
matrix(nrow=2, ncol=2, c(1961, 2448, 8716 - 1961, 12500 - 2448))
##      [,1]  [,2]
## [1,] 1961  6755
## [2,] 2448 10052
prop.test(matrix(nrow=2, ncol=2, c(1961, 2448, 8716 - 1961, 12500 - 2448)))
# The previously rendered output belonged to the wrong matrix and was removed;
# re-knit to regenerate it. Hand-computed approximation (to be confirmed):
# prop 1 = 0.2250, prop 2 = 0.1958, 95% CI of the difference 0.018 to 0.040.
prop.test(matrix(nrow=2, ncol=2, c(1961, 2448, 8716 - 1961, 12500 - 2448)), conf.level=0.99)
# Re-knit to regenerate the output. Hand-computed approximation (to be
# confirmed): 99% CI of the difference 0.014 to 0.044.

Loon iga keeletaseme jaoks uue andmetabeli kahe summamuutujaga ning kasutan maatriksites tabelitest võetud muutujate nimesid.

# Helper: one-row table with the total word count (sonad_sum) and the total
# singular-noun count (S_ains_sum) of the texts whose `keeletase` equals `tase`.
taseme_summad <- function(tase) {
  valitud <- filter(tekstid, keeletase == !!tase)
  summarise(
    valitud,
    sonad_sum = sum(sonad),
    S_ains_sum = sum(S_ains_arv)
  )
}

# keeletase codes 1..4 are stored under the level names A2, B1, B2, C1.
summad_A2 <- taseme_summad(1)
summad_B1 <- taseme_summad(2)
summad_B2 <- taseme_summad(3)
summad_C1 <- taseme_summad(4)

A2- ja B1-taseme võrdlus toob esile, et 95% tõenäosusega esineb A2-tasemel 2,8% kuni 6,1% võrra rohkem ainsuslikke nimisõnu. 99% usaldusnivoo juures jääb üldistatav erinevus vahemikku 2,2% kuni 6,7%.

# prop.test() reads a two-column matrix as one ROW per group, with successes in
# column 1 and failures in column 2. Row 1 = A2, row 2 = B1; column 1 = singular
# nouns, column 2 = all other words. The earlier layout paired the two success
# counts in row 1 and the two totals in row 2, which tested the wrong ratios.
# Counts are rounded because proportion * word count leaves floating-point
# noise (e.g. 1015.001).
ains_A2 <- round(summad_A2$S_ains_sum)
ains_B1 <- round(summad_B1$S_ains_sum)
A2_vs_B1 <- matrix(nrow=2, ncol=2, c(ains_A2, ains_B1,
                                     summad_A2$sonad_sum - ains_A2,
                                     summad_B1$sonad_sum - ains_B1))
A2_vs_B1
##      [,1] [,2]
## [1,] 1015 2752
## [2,] 1961 6755
prop.test(A2_vs_B1)
# The previously rendered output belonged to the wrong matrix and was removed;
# re-knit to regenerate it. Hand-computed approximation (to be confirmed):
# prop 1 = 0.2694, prop 2 = 0.2250, 95% CI of the difference 0.028 to 0.061.
prop.test(A2_vs_B1, conf.level=0.99)
# Re-knit to regenerate the output. Hand-computed approximation (to be
# confirmed): 99% CI of the difference 0.022 to 0.067.

B2- ja C1-taseme võrdluses selgub, et 95% tõenäosusega on C1-tasemel ainsuslikke nimisõnu 2,0% kuni 3,9% võrra rohkem.

# prop.test() reads a two-column matrix as one ROW per group, with successes in
# column 1 and failures in column 2. Row 1 = B2, row 2 = C1; column 1 = singular
# nouns, column 2 = all other words. The earlier layout paired the two success
# counts in row 1 and the two totals in row 2, which tested the wrong ratios.
# Counts are rounded because proportion * word count leaves floating-point
# noise (e.g. 2447.999).
ains_B2 <- round(summad_B2$S_ains_sum)
ains_C1 <- round(summad_C1$S_ains_sum)
B2_vs_C1 <- matrix(nrow=2, ncol=2, c(ains_B2, ains_C1,
                                     summad_B2$sonad_sum - ains_B2,
                                     summad_C1$sonad_sum - ains_C1))
B2_vs_C1
##      [,1]  [,2]
## [1,] 2448 10052
## [2,] 3715 12745
prop.test(B2_vs_C1)
# The previously rendered output belonged to the wrong matrix and was removed;
# re-knit to regenerate it. Hand-computed approximation (to be confirmed):
# prop 1 (B2) = 0.1958, prop 2 (C1) = 0.2257, 95% CI of the difference
# (B2 minus C1) -0.039 to -0.020, i.e. the C1 share is the higher one.

Salvestan osastavas käändes sõnade arvu eraldi tulbana ning arvutan summad keeleoskustasemeti.

# Absolute number of partitive-case words per text (proportion times word
# count), followed by the totals per proficiency level.
tekstid <- mutate(tekstid, yld_Par_arv = yld_Par * sonad)
summarise(group_by(tekstid, keeletase), yld_Par_sum = sum(yld_Par_arv))
## # A tibble: 4 x 2
##   keeletase yld_Par_sum
##       <dbl>       <dbl>
## 1         1        216.
## 2         2        544.
## 3         3        933.
## 4         4       1305.

Võrdlen kõrvutiasetsevaid keeleoskustasemeid. Erinevus on statistiliselt oluline vaid B1- ja B2-taseme vahel (p = 0,001).

# prop.test() reads a two-column matrix as one ROW per group, with successes in
# column 1 and failures in column 2. Each matrix below therefore holds, per
# level, the partitive-case word count and the remaining words (total minus
# partitive). The earlier layout c(succ1, total1, succ2, total2) paired the two
# success counts in row 1 and the two totals in row 2, so the reported
# "proportions" (e.g. 0.28 vs 0.30) were not shares of partitive words at all.
# The previously rendered outputs were removed; re-knit to regenerate them.
# Values quoted below are hand-computed approximations (to be confirmed).

# A2 vs B1: about 0.057 vs 0.062, p about 0.3 (not significant).
prop.test(matrix(nrow=2, ncol=2, c(216, 544, 3767 - 216, 8716 - 544)))
# B1 vs B2: about 0.062 vs 0.075, p about 0.0006 (significant).
prop.test(matrix(nrow=2, ncol=2, c(544, 933, 8716 - 544, 12500 - 933)))
# B2 vs C1: about 0.075 vs 0.079, p about 0.15 (not significant).
prop.test(matrix(nrow=2, ncol=2, c(933, 1305, 12500 - 933, 16460 - 1305)))

Võtan 20 korral valimist välja 50 juhuslikku B2-taseme teksti (kokku on 80 teksti) ning leian neis kõigi sõnade arvu ja osastavas käändes sõnade arvu. Võrdlen usaldusintervalli alam- ja ülempiire. Mõnel katsel tulemuseks saadud ülempiir on väiksem kui teisel katsel saadud alampiir.

yldarv <- 50 # size of each random sample (number of texts)
katseid <- 20 # number of repeated draws

# Restrict to level B2 (keeletase == 3) BEFORE sampling. Sampling 50 texts from
# the whole table and filtering afterwards keeps only the B2 texts that happen
# to be among those 50, not 50 B2 texts.
tekstid_B2 <- tekstid %>% filter(keeletase == 3)

# Preallocated per-trial totals: partitive-case words and all words.
Par_kokku <- numeric(katseid)
sonad_kokku <- numeric(katseid)

for (i in seq_len(katseid)) {
  juhuvalim <- tekstid_B2 %>%
    sample_n(yldarv) %>%
    summarise(
      yld_Par_sum = sum(yld_Par_arv),
      sonad_sum = sum(sonad)
    )
  Par_kokku[i] <- juhuvalim$yld_Par_sum
  sonad_kokku[i] <- juhuvalim$sonad_sum
}

# Empty matrix with one row per trial and four columns: lower bound, 0,
# upper bound, 0. The zeros serve as y coordinates when the intervals are drawn.
testivastused <- matrix(nrow = katseid, ncol = 4)

# Store the confidence-interval bounds of every trial as one matrix row.
for (i in seq_len(katseid)) {
  pt <- prop.test(Par_kokku[i], sonad_kokku[i])
  testivastused[i, ] <- c(pt$conf.int[1], 0, pt$conf.int[2], 0)
}

print(testivastused)
# NOTE(review): the previously rendered matrix came from the old sampling order
# and was removed. The draws are random, so re-knit to regenerate the output and
# re-check the statement about non-overlapping intervals in the paragraph above.
# Draw every trial's confidence interval as a curve from its lower bound (V1)
# to its upper bound (V3); V2 and V4 are the y coordinates of the two ends.
# The column names are set explicitly because as_tibble() on an unnamed matrix
# only produces V1..V4 through a deprecated compatibility name repair, which
# emits a warning.
ggplot() + xlim(0.04, 0.1) +
  geom_curve(aes(x=V1, y=V2, xend=V3, yend=V4), 
             data=setNames(as.data.frame(testivastused), paste0("V", 1:4)))

Kuna osastavas käändes sõnade osakaal on tasemeti pigem sarnane, siis leian osastavas käändes sõnade üldarvu ja kõigi sõnade üldarvu valimis. Selliste sõnade osakaalu saab 95% tõenäosusega üldistada 0,5% täpsusega: 7,0% - 7,5%.

# Total number of partitive-case words over all texts.
summarise(tekstid, yld_Par_sum = sum(yld_Par_arv))
## # A tibble: 1 x 1
##   yld_Par_sum
##         <dbl>
## 1       2998.
# Total number of words over all texts.
summarise(tekstid, sonad_sum = sum(sonad))
## # A tibble: 1 x 1
##   sonad_sum
##       <dbl>
## 1     41443
# 95% confidence interval for the overall share of partitive-case words
# (2998 out of 41443 words).
prop.test(x = 2998, n = 41443)
## 
##  1-sample proportions test with continuity correction
## 
## data:  2998 out of 41443, null probability 0.5
## X-squared = 30317, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.06987383 0.07488648
## sample estimates:
##          p 
## 0.07234032