前回までに分布特性を把握するためのいくつかの指標を説明し、
その使い方や注意点についても述べた。またグループ分けが有用なことも説明した。
解析の過程では、特徴の異なるサンプルや外れ値を除外することもあるので、
その方法について紹介する。
また、単純集計としてよく利用される頻度集計やクロス集計の方法についても
紹介する。
/* Lesson 11-1 */
/* File Name = les1101.sas   12/10/08 */

/* Read the survey file starting at its second record and drop the
   observations that must not enter the analysis. */
data gakusei;
  infile 'all08ce.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;
  if sex^='M' & sex^='F' then delete;   /* exclude records whose sex is neither M nor F */
  if kodukai>=200000 then delete;       /* exclude records with kodukai of 200,000 yen or more */

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* Summary statistics, distribution details and bar charts for all records. */
proc means data=gakusei;
run;
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
run;
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
run;

/* BY-group processing requires the data to be sorted by the BY variable. */
proc sort data=gakusei;
  by sex;
run;

/* The same analyses repeated separately for each value of sex. */
proc means data=gakusei;
  by sex;
run;
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
  by sex;
run;
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* group= places both sexes in one chart instead of one chart per BY group. */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai/group=sex;
run;
SAS システム 2 11:52 Thursday, December 4, 2008 Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 382 167.8473822 8.2930247 145.0000000 188.0000000 TAIJYUU 343 58.9533528 9.3369344 35.0000000 100.0000000 KYOUI 115 86.6347826 7.5618275 56.0000000 112.0000000 KODUKAI 367 44820.16 41260.85 0 180000.00 TSUUWA 173 6612.32 4375.11 0 30000.00 --------------------------------------------------------------------- SAS システム 21 11:52 Thursday, December 4, 2008 Univariate Procedure Variable=KODUKAI Moments N 367 Sum Wgts 367 Mean 44820.16 Sum 16449000 Std Dev 41260.85 Variance 1.7025E9 Skewness 1.1826 Kurtosis 0.795026 USS 1.36E12 CSS 6.231E11 CV 92.05869 Std Mean 2153.799 T:Mean=0 20.80982 Pr>|T| 0.0001 Num ^= 0 310 Num > 0 310 M(Sign) 155 Pr>=|M| 0.0001 Sgn Rank 24102.5 Pr>=|S| 0.0001 SAS システム 22 11:52 Thursday, December 4, 2008 Univariate Procedure Variable=KODUKAI Quantiles(Def=5) 100% Max 180000 99% 160000 75% Q3 60000 95% 140000 50% Med 30000 90% 100000 25% Q1 20000 10% 0 0% Min 0 5% 0 1% 0 Range 180000 Q3-Q1 40000 Mode 0 SAS システム 25 11:52 Thursday, December 4, 2008 Univariate Procedure Variable=KODUKAI Histogram # Boxplot 190000+* 1 0 .** 6 0 .**** 12 0 130000+***** 14 0 .********* 25 | .**** 11 | 70000+************* 38 +-----+ .******************* 56 | + | .*************************************** 116 *-----* 10000+****************************** 88 | ----+----+----+----+----+----+----+---- * may represent up to 3 counts SAS システム 34 11:52 Thursday, December 4, 2008 --------------------------------- SEX=F -------------------------------- Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 125 158.9016000 5.2486555 145.0000000 171.0000000 TAIJYUU 86 48.8720930 4.7986187 35.0000000 60.0000000 KYOUI 43 83.0000000 3.9400266 70.0000000 90.0000000 KODUKAI 121 44752.07 35327.05 0 180000.00 TSUUWA 68 6576.24 4187.38 80.0000000 25000.00 
--------------------------------------------------------------------- SAS システム 35 11:52 Thursday, December 4, 2008 --------------------------------- SEX=M -------------------------------- Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 257 172.1984436 5.5615025 156.0000000 188.0000000 TAIJYUU 257 62.3268482 7.9531354 46.0000000 100.0000000 KYOUI 72 88.8055556 8.3575704 56.0000000 112.0000000 KODUKAI 246 44853.66 43954.46 0 165000.00 TSUUWA 105 6635.70 4512.19 0 30000.00 --------------------------------------------------------------------- SAS システム 92 11:52 Thursday, December 4, 2008 Univariate Procedure Schematic Plots Variable=SHINTYOU 200 + | | 0 180 + | | | *--+--* | | +-----+ 160 + *--+--* 0 | +-----+ 0 | 0 140 + ------------+-----------+----------- SEX F M SAS システム 93 11:52 Thursday, December 4, 2008 Univariate Procedure Schematic Plots Variable=TAIJYUU | 100 + * | 0 | | *--+--* 50 + *--+--* +-----+ | 0 | 0 + ------------+-----------+----------- SEX F M SAS システム 107 11:52 Thursday, December 4, 2008 SEX SHINTYOU Cum. Cum. Midpoint Freq Freq Percent Percent | F 144 | 1 1 0.26 0.26 148 |* 5 6 1.31 1.57 152 |*** 16 22 4.19 5.76 156 |***** 27 49 7.07 12.83 160 |******* 36 85 9.42 22.25 164 |***** 26 111 6.81 29.06 168 |** 12 123 3.14 32.20 172 | 2 125 0.52 32.72 176 | 0 125 0.00 32.72 180 | 0 125 0.00 32.72 184 | 0 125 0.00 32.72 188 | 0 125 0.00 32.72 | M 144 | 0 125 0.00 32.72 148 | 0 125 0.00 32.72 152 | 0 125 0.00 32.72 156 | 2 127 0.52 33.25 160 |* 7 134 1.83 35.08 164 |**** 18 152 4.71 39.79 168 |********* 45 197 11.78 51.57 172 |****************** 91 288 23.82 75.39 176 |********** 49 337 12.83 88.22 180 |******* 33 370 8.64 96.86 184 |** 9 379 2.36 99.21 188 |* 3 382 0.79 100.00 | ----+---+---+---+-- 20 40 60 80 Frequency SAS システム 113 11:52 Thursday, December 4, 2008 SEX KODUKAI Cum. Cum. 
Midpoint Freq Freq Percent Percent | F 0 |***** 13 13 3.54 3.54 15000 |********* 23 36 6.27 9.81 30000 |********** 24 60 6.54 16.35 45000 |********* 23 83 6.27 22.62 60000 |****** 15 98 4.09 26.70 75000 |**** 10 108 2.72 29.43 90000 | 0 108 0.00 29.43 105000 |** 5 113 1.36 30.79 120000 |* 3 116 0.82 31.61 135000 |* 2 118 0.54 32.15 150000 |* 2 120 0.54 32.70 165000 | 0 120 0.00 32.70 180000 | 1 121 0.27 32.97 | M 0 |********************* 52 173 14.17 47.14 15000 |**************** 41 214 11.17 58.31 30000 |******************** 51 265 13.90 72.21 45000 |************* 32 297 8.72 80.93 60000 |***** 12 309 3.27 84.20 75000 |***** 12 321 3.27 87.47 90000 | 1 322 0.27 87.74 105000 |******** 19 341 5.18 92.92 120000 |*** 8 349 2.18 95.10 135000 |* 3 352 0.82 95.91 150000 |**** 9 361 2.45 98.37 165000 |** 6 367 1.63 100.00 180000 | 0 367 0.00 100.00 | ----+---+---+---+---+- 10 20 30 40 50 Frequency
data seito08;
  infile 'seito.prn';
  /* univ is read with an explicit informat: a bare `$` in list input gives
     an 8-byte variable, which cannot hold the 5-character (multi-byte)
     university names compared below, so those comparisons would never match. */
  input id $ sex $ kesseki $ univ : $20. koku $ suu1 $ suu2 $ tireki $ koumin $ rika $;
  if sex^='M' then delete;        /* male only */
  if kesseki^='0' then delete;    /* keep only records with kesseki = '0' (attendees) */

  /* Without this, area would take its length from the first assignment
     ("不明", 2 characters) and the 3-character values below would be truncated. */
  length area $ 12;
  area="不明";
  if univ="早稲田大学" then area="東日本";
  if univ="慶応大学" then area="東日本";
  if univ="関西大学" then area="西日本";
  if univ="同志社大学" then area="西日本";

  /* Collapse the suffixed subject codes into the plain subject name. */
  if tireki="世界史-0" then tireki="世界史";
  if tireki="世界史-2" then tireki="世界史";
  if tireki="日本史-2" then tireki="日本史";
  if tireki="日本史-3" then tireki="日本史";
  ...
[例4] 複数の処理をさせたい場合 : do 〜 end で囲む
/* When one condition must trigger several statements, they are wrapped in do ... end. */
/* NOTE(review): `koumin=.` assigns the numeric missing value. The seito08 INPUT statement
   elsewhere in this document reads koumin with `$` (character); if that is the same
   variable, a blank (' ') may have been intended - confirm. */
if tireki="世界史-0" then do; tireki="世界史"; koumin=.; end; ...
[比較演算子]
/* Lesson 11-2 */
/* File Name = les1102.sas   12/10/08 */

/* Read the survey file starting at its second record. */
data gakusei;
  infile 'all08ce.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* Frequency counts, one table per variable. */
proc freq data=gakusei;
  tables sex jitaku carryer;
run;

/* Frequency counts for pairs of variables (cross tabulation). */
proc freq data=gakusei;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
SAS システム 1 11:52 Thursday, December 4, 2008 OBS SEX SHINTYOU TAIJYUU KYOUI JITAKU KODUKAI CARRYER TSUUWA 1 F 145.0 38 . J 10000 . 2 F 146.7 41 85 J 10000 Vodafone 6000 3 F 148.0 42 . J 50000 . 4 F 148.0 43 80 J 50000 DoCoMo 4000 5 F 148.9 . . J 60000 . SAS システム 2 11:52 Thursday, December 4, 2008 Cumulative Cumulative SEX Frequency Percent Frequency Percent ------------------------------------------------- F 134 33.6 134 33.6 M 265 66.4 399 100.0 Frequency Missing = 5 Cumulative Cumulative JITAKU Frequency Percent Frequency Percent ---------------------------------------------------- G 130 37.5 130 37.5 J 217 62.5 347 100.0 Frequency Missing = 57 SAS システム 4 11:52 Thursday, December 4, 2008 Cumulative Cumulative CARRYER Frequency Percent Frequency Percent ------------------------------------------------------ DDIp 2 1.1 2 1.1 DoCoMo 71 40.6 73 41.7 J-PHONE 10 5.7 83 47.4 KDDI 1 0.6 84 48.0 No 5 2.9 89 50.9 Vodafone 20 11.4 109 62.3 Willcom 1 0.6 110 62.9 au 46 26.3 156 89.1 au+willc 1 0.6 157 89.7 au/willc 1 0.6 158 90.3 docomo 8 4.6 166 94.9 docomo+w 1 0.6 167 95.4 docomo/a 1 0.6 168 96.0 docomo/w 1 0.6 169 96.6 softbank 5 2.9 174 99.4 vodafone 1 0.6 175 100.0 Frequency Missing = 229 SAS システム 6 11:52 Thursday, December 4, 2008 TABLE OF SEX BY JITAKU SEX JITAKU Frequency| Percent | Row Pct | Col Pct |G |J | Total ---------+--------+--------+ F | 40 | 75 | 115 | 11.59 | 21.74 | 33.33 | 34.78 | 65.22 | | 31.01 | 34.72 | ---------+--------+--------+ M | 89 | 141 | 230 | 25.80 | 40.87 | 66.67 | 38.70 | 61.30 | | 68.99 | 65.28 | ---------+--------+--------+ Total 129 216 345 37.39 62.61 100.00 Frequency Missing = 59 SAS システム 9 11:52 Thursday, December 4, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No |Vodafone| Total ---------+--------+--------+--------+--------+--------+--------+ F | 1 | 30 | 4 | 0 | 1 | 9 | 66 | 0.57 | 17.24 | 2.30 | 0.00 | 0.57 | 5.17 | 37.93 | 1.52 | 45.45 | 6.06 | 0.00 | 1.52 | 13.64 | 
| 50.00 | 42.25 | 44.44 | 0.00 | 20.00 | 45.00 | ---------+--------+--------+--------+--------+--------+--------+ M | 1 | 41 | 5 | 1 | 4 | 11 | 108 | 0.57 | 23.56 | 2.87 | 0.57 | 2.30 | 6.32 | 62.07 | 0.93 | 37.96 | 4.63 | 0.93 | 3.70 | 10.19 | | 50.00 | 57.75 | 55.56 | 100.00 | 80.00 | 55.00 | ---------+--------+--------+--------+--------+--------+--------+ Total 2 71 9 1 5 20 174 1.15 40.80 5.17 0.57 2.87 11.49 100.00 (Continued) SAS システム 11 11:52 Thursday, December 4, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |Willcom |au |au+willc|au/willc|docomo |docomo+w| Total ---------+--------+--------+--------+--------+--------+--------+ F | 1 | 15 | 1 | 0 | 1 | 0 | 66 | 0.57 | 8.62 | 0.57 | 0.00 | 0.57 | 0.00 | 37.93 | 1.52 | 22.73 | 1.52 | 0.00 | 1.52 | 0.00 | | 100.00 | 32.61 | 100.00 | 0.00 | 12.50 | 0.00 | ---------+--------+--------+--------+--------+--------+--------+ M | 0 | 31 | 0 | 1 | 7 | 1 | 108 | 0.00 | 17.82 | 0.00 | 0.57 | 4.02 | 0.57 | 62.07 | 0.00 | 28.70 | 0.00 | 0.93 | 6.48 | 0.93 | | 0.00 | 67.39 | 0.00 | 100.00 | 87.50 | 100.00 | ---------+--------+--------+--------+--------+--------+--------+ Total 1 46 1 1 8 1 174 0.57 26.44 0.57 0.57 4.60 0.57 100.00 (Continued) SAS システム 13 11:52 Thursday, December 4, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |docomo/a|docomo/w|softbank|vodafone| Total ---------+--------+--------+--------+--------+ F | 0 | 0 | 3 | 0 | 66 | 0.00 | 0.00 | 1.72 | 0.00 | 37.93 | 0.00 | 0.00 | 4.55 | 0.00 | | 0.00 | 0.00 | 60.00 | 0.00 | ---------+--------+--------+--------+--------+ M | 1 | 1 | 2 | 1 | 108 | 0.57 | 0.57 | 1.15 | 0.57 | 62.07 | 0.93 | 0.93 | 1.85 | 0.93 | | 100.00 | 100.00 | 40.00 | 100.00 | ---------+--------+--------+--------+--------+ Total 1 1 5 1 174 0.57 0.57 2.87 0.57 100.00 Frequency Missing = 230 SAS システム 16 11:52 Thursday, December 4, 2008 TABLE OF JITAKU BY CARRYER JITAKU CARRYER Frequency| Percent | Row Pct | Col Pct |DDIp |DoCoMo 
|J-PHONE |KDDI |No |Vodafone| Total ---------+--------+--------+--------+--------+--------+--------+ G | 1 | 27 | 4 | 1 | 0 | 4 | 56 | 0.66 | 17.88 | 2.65 | 0.66 | 0.00 | 2.65 | 37.09 | 1.79 | 48.21 | 7.14 | 1.79 | 0.00 | 7.14 | | 100.00 | 44.26 | 44.44 | 100.00 | 0.00 | 23.53 | ---------+--------+--------+--------+--------+--------+--------+ J | 0 | 34 | 5 | 0 | 4 | 13 | 95 | 0.00 | 22.52 | 3.31 | 0.00 | 2.65 | 8.61 | 62.91 | 0.00 | 35.79 | 5.26 | 0.00 | 4.21 | 13.68 | | 0.00 | 55.74 | 55.56 | 0.00 | 100.00 | 76.47 | ---------+--------+--------+--------+--------+--------+--------+ Total 1 61 9 1 4 17 151 0.66 40.40 5.96 0.66 2.65 11.26 100.00 (Continued) ≪以下略≫
/* Recode lower-case spellings of carrier names to their capitalised spellings,
   presumably so that PROC FREQ counts each carrier as a single level. */
≪前略≫ if carryer="au+willc" then carryer="au+Willc"; if carryer="docomo" then carryer="DoCoMo"; if carryer="docomo+w" then carryer="DoCoMo+W"; if carryer="vodafone" then carryer="Vodafone"; ≪後略≫
≪前略≫
/* order=freq lists the levels in descending order of frequency. */
proc freq data=gakusei order=freq;
  tables sex jitaku carryer;
run;

/* Same ordering applied to the two-way tables. */
proc freq data=gakusei order=freq;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
≪後略≫
/* Lesson 11-5 */
/* File Name = les1105.sas   12/10/08 */

/* Read the survey file starting at its second record. */
data gakusei;
  infile 'all08ce.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;

/* Define the height classes; the format name clshint stands for "class shintyou". */
proc format;
  value clshint low-<150=' -149'      /* class 1 */
                150-<160='150-159'    /* class 2 */
                160-<170='160-169'    /* class 3 */
                170-<180='170-179'    /* class 4 */
                180-high='180- '      /* class 5 */
                other   ='missing';   /* class 6 */
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* One-way frequency table of height. */
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes above */
run;

/* Two-way frequency table of sex by height class. */
proc freq data=gakusei;
  tables sex*shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes above */
run;

/* The same breakdown obtained with the approach used so far: sort, then BY-group processing. */
proc sort data=gakusei;
  by sex;
run;
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes above */
  by sex;                     /* one table per sex */
run;
SAS システム 2 11:52 Thursday, December 4, 2008 Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ -149 6 1.5 6 1.5 150-159 61 15.7 67 17.2 160-169 131 33.7 198 50.9 170-179 163 41.9 361 92.8 180- 28 7.2 389 100.0 Frequency Missing = 15 SAS システム 3 11:52 Thursday, December 4, 2008 TABLE OF SEX BY SHINTYOU SEX SHINTYOU Frequency| Percent | Row Pct | Col Pct | -149 |150-159 |160-169 |170-179 |180- | Total ---------+--------+--------+--------+--------+--------+ F | 6 | 58 | 61 | 2 | 0 | 127 | 1.55 | 14.95 | 15.72 | 0.52 | 0.00 | 32.73 | 4.72 | 45.67 | 48.03 | 1.57 | 0.00 | | 100.00 | 95.08 | 46.92 | 1.23 | 0.00 | ---------+--------+--------+--------+--------+--------+ M | 0 | 3 | 69 | 161 | 28 | 261 | 0.00 | 0.77 | 17.78 | 41.49 | 7.22 | 67.27 | 0.00 | 1.15 | 26.44 | 61.69 | 10.73 | | 0.00 | 4.92 | 53.08 | 98.77 | 100.00 | ---------+--------+--------+--------+--------+--------+ Total 6 61 130 163 28 388 1.55 15.72 33.51 42.01 7.22 100.00 Frequency Missing = 16 SAS システム 6 11:52 Thursday, December 4, 2008 ------------------------------- SEX=' ' -------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ 160-169 1 100.0 1 100.0 Frequency Missing = 4 SAS システム 7 11:52 Thursday, December 4, 2008 -------------------------------- SEX=F --------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ -149 6 4.7 6 4.7 150-159 58 45.7 64 50.4 160-169 61 48.0 125 98.4 170-179 2 1.6 127 100.0 Frequency Missing = 7 SAS システム 8 11:52 Thursday, December 4, 2008 -------------------------------- SEX=M --------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ 150-159 3 1.1 3 1.1 160-169 69 26.4 72 27.6 170-179 161 61.7 233 89.3 180- 28 10.7 261 
100.0 Frequency Missing = 4
data mon2008;
  /* All INFILE options must sit in one statement: a `;` after truncover would
     end the statement and leave `missover dsd` as an invalid statement.
     dsd makes consecutive commas read as missing values and honours quoted fields.
     truncover keeps a short record from pulling values from the next line;
     missover is not specified because it addresses the same situation and
     only one of the two should be given. */
  infile 'd:\home\mon05d.csv' dlm=',' dsd firstobs=2 truncover;
  /* The `:` modifier reads each field up to the next delimiter, using the
     given informat only to set the variable's length. */
  input No $
        Univ    : $30.
        SName   : $40.
        Faculty : $50.
        Dept    : $50.
        Center1 : $8.
        Center2 : $8.
        Sel1    : $8.
        Sel2    : $8.
        Book1   : $10.
        Book2   : $10.
        Vol0 VolS VolT
        ZenKou $
        ScoreS ScoreT KoKouSi
  ;
/* Tab-delimited variant: dlm='09'x gives the delimiter as a hexadecimal character
   constant (09 is the tab character in ASCII); reading starts at the second record. */
data mon2008; infile 'd:\home\mon05e.txt' dlm='09'x firstobs=2 truncover;
/* lrecl=230 sets the logical record length used when reading foo.dat to 230. */
data math; infile 'foo.dat' lrecl=230;
/* Same INFILE with truncover added, so that a record shorter than the columns
   requested by INPUT does not cause SAS to continue onto the next record. */
data math; infile 'foo.dat' lrecl=230 truncover;
/* Column input: each variable is read from the fixed column (or column range)
   that follows its name; `$` marks character variables. */
input kamoku $ 2 kesseki $ 3 k_code $ 10-11 t_score 12-14 s_scor01 103-104 s_scor02 105-106 s_scor03 107-108 s_scor04 109-110 ;
/* firstobs=4 starts reading foo.dat at its fourth record, skipping the first three. */
data math; infile 'foo.dat' firstobs=4;