前回までに、分布特性を把握するためのいくつかの指標を説明し、
その使い方を示すとともに注意点を述べた。またグループ分けが有用なことも説明した。
解析の過程では、特徴の異なるサンプルや外れ値を除外することもあるので、
その方法について紹介する。
また、単純集計としてよく利用される頻度集計やクロス集計の方法についても
紹介する。
/* Lesson 11-1 */
/* File Name = les1101.sas   06/19/08 */

/*
 * Read the student data, drop unusable records, then summarise the
 * numeric variables for the whole sample and again separately by sex.
 */
data gakusei;
  infile 'all08ae.prn' firstobs=2;      /* firstobs=2: begin reading at record 2 of the file */
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;
  if sex^='M' & sex^='F' then delete;   /* exclude records whose sex is neither M nor F */
  if kodukai>=200000 then delete;       /* exclude records with kodukai of 200,000 (yen, per the original note) or more */
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* ---- Whole-sample summaries ---- */
proc means data=gakusei;
run;

proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
run;

proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
run;

/* ---- Summaries by sex ---- */
/* BY-group processing expects the data set to be sorted by the BY variable. */
proc sort data=gakusei;
  by sex;
run;

proc means data=gakusei;
  by sex;
run;

proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* One set of charts per sex (BY statement) ... */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* ... versus both sexes side by side in a single chart (GROUP= option). */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai / group=sex;
run;
SAS システム 2 19:37 Tuesday, June 17, 2008 Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 374 167.7489305 8.2695466 145.0000000 188.0000000 TAIJYUU 335 58.7367164 9.2250113 35.0000000 100.0000000 KYOUI 113 86.4867257 7.5155969 56.0000000 112.0000000 KODUKAI 359 44732.59 41443.43 0 180000.00 TSUUWA 165 6556.25 4366.12 0 30000.00 --------------------------------------------------------------------- SAS システム 21 19:37 Tuesday, June 17, 2008 Univariate Procedure Variable=KODUKAI Moments N 359 Sum Wgts 359 Mean 44732.59 Sum 16059000 Std Dev 41443.43 Variance 1.7176E9 Skewness 1.188306 Kurtosis 0.804002 USS 1.333E12 CSS 6.149E11 CV 92.64707 Std Mean 2187.301 T:Mean=0 20.45105 Pr>|T| 0.0001 Num ^= 0 302 Num > 0 302 M(Sign) 151 Pr>=|M| 0.0001 Sgn Rank 22876.5 Pr>=|S| 0.0001 SAS システム 22 19:37 Tuesday, June 17, 2008 Univariate Procedure Variable=KODUKAI Quantiles(Def=5) 100% Max 180000 99% 160000 75% Q3 60000 95% 150000 50% Med 30000 90% 100000 25% Q1 20000 10% 0 0% Min 0 5% 0 1% 0 Range 180000 Q3-Q1 40000 Mode 0 SAS システム 25 19:37 Tuesday, June 17, 2008 Univariate Procedure Variable=KODUKAI Histogram # Boxplot 190000+* 1 0 .** 6 0 .**** 12 0 130000+***** 14 0 .******** 23 | .**** 11 | 70000+************* 38 +-----+ .****************** 54 | + | .************************************** 112 *-----* 10000+****************************** 88 | ----+----+----+----+----+----+----+--- * may represent up to 3 counts SAS システム 33 19:37 Tuesday, June 17, 2008 --------------------------------- SEX=F -------------------------------- Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 125 158.9016000 5.2486555 145.0000000 171.0000000 TAIJYUU 86 48.8720930 4.7986187 35.0000000 60.0000000 KYOUI 43 83.0000000 3.9400266 70.0000000 90.0000000 KODUKAI 121 44752.07 35327.05 0 180000.00 TSUUWA 68 6576.24 4187.38 80.0000000 25000.00 
--------------------------------------------------------------------- SAS システム 34 19:37 Tuesday, June 17, 2008 --------------------------------- SEX=M -------------------------------- Variable N Mean Std Dev Minimum Maximum --------------------------------------------------------------------- SHINTYOU 249 172.1903614 5.4608197 156.0000000 188.0000000 TAIJYUU 249 62.1437751 7.8306863 46.0000000 100.0000000 KYOUI 70 88.6285714 8.3668972 56.0000000 112.0000000 KODUKAI 238 44722.69 44300.74 0 165000.00 TSUUWA 97 6542.25 4508.67 0 30000.00 --------------------------------------------------------------------- SAS システム 91 19:37 Tuesday, June 17, 2008 Univariate Procedure Schematic Plots Variable=SHINTYOU 200 + | | 0 180 + | | | *--+--* | | +-----+ 160 + *--+--* 0 | +-----+ 0 | 0 140 + ------------+-----------+----------- SEX F M SAS システム 92 19:37 Tuesday, June 17, 2008 Univariate Procedure Schematic Plots Variable=TAIJYUU | 100 + * | 0 | | *--+--* 50 + *--+--* +-----+ | 0 | 0 + ------------+-----------+----------- SEX F M SAS システム 105 19:37 Tuesday, June 17, 2008 SEX SHINTYOU Cum. Cum. Midpoint Freq Freq Percent Percent | F 144 | 1 1 0.27 0.27 148 |* 5 6 1.34 1.60 152 |*** 16 22 4.28 5.88 156 |***** 27 49 7.22 13.10 160 |******* 36 85 9.63 22.73 164 |***** 26 111 6.95 29.68 168 |** 12 123 3.21 32.89 172 | 2 125 0.53 33.42 176 | 0 125 0.00 33.42 180 | 0 125 0.00 33.42 184 | 0 125 0.00 33.42 188 | 0 125 0.00 33.42 | M 144 | 0 125 0.00 33.42 148 | 0 125 0.00 33.42 152 | 0 125 0.00 33.42 156 | 2 127 0.53 33.96 160 |* 6 133 1.60 35.56 164 |*** 17 150 4.55 40.11 168 |********* 44 194 11.76 51.87 172 |****************** 89 283 23.80 75.67 176 |********** 48 331 12.83 88.50 180 |****** 32 363 8.56 97.06 184 |** 9 372 2.41 99.47 188 | 2 374 0.53 100.00 | ----+---+---+---+-- 20 40 60 80 Frequency SAS システム 111 19:37 Tuesday, June 17, 2008 SEX KODUKAI Cum. Cum. 
Midpoint Freq Freq Percent Percent | F 0 |****** 15 15 4.18 4.18 20000 |********** 26 41 7.24 11.42 40000 |*********** 28 69 7.80 19.22 60000 |************ 29 98 8.08 27.30 80000 |**** 10 108 2.79 30.08 100000 |** 5 113 1.39 31.48 120000 |* 3 116 0.84 32.31 140000 |* 2 118 0.56 32.87 160000 |* 2 120 0.56 33.43 180000 | 1 121 0.28 33.70 | M 0 |********************* 52 173 14.48 48.19 20000 |******************* 47 220 13.09 61.28 40000 |********************* 53 273 14.76 76.04 60000 |************ 30 303 8.36 84.40 80000 |***** 12 315 3.34 87.74 100000 |******* 18 333 5.01 92.76 120000 |*** 8 341 2.23 94.99 140000 |* 3 344 0.84 95.82 160000 |****** 15 359 4.18 100.00 180000 | 0 359 0.00 100.00 | ----+---+---+---+---+- 10 20 30 40 50 Frequency
/*
 * Build seito08 from seito.prn: keep male attendees only, derive a
 * region (area) from the university name, and merge the sub-codes of
 * the history subject (tireki) into a single label per subject.
 */
data seito08;
  infile 'seito.prn';
  /* List input with a bare $ stores only 8 bytes. A five-kanji name such as
     "早稲田大学" needs at least 10 bytes, so it would be cut short and the
     comparisons below could never match. The ": $20." informat widens univ
     without changing the variable order. tireki is widened the same way so
     that values like "世界史-0" also fit when characters take 3 bytes. */
  input id $ sex $ kesseki $ univ : $20. koku $ suu1 $ suu2 $
        tireki : $12. koumin $ rika $;

  if sex^='M' then delete;         /* male only */
  if kesseki^='0' then delete;     /* attendees only */

  /* Without an explicit LENGTH, area would take its length from the first
     assignment ("不明") and the longer values "東日本"/"西日本" would be
     truncated. */
  length area $ 12;
  area="不明";
  if univ="早稲田大学" then area="東日本";
  if univ="慶応大学"   then area="東日本";
  if univ="関西大学"   then area="西日本";
  if univ="同志社大学" then area="西日本";

  /* Collapse the subject sub-codes into one label per subject. */
  if tireki="世界史-0" then tireki="世界史";
  if tireki="世界史-2" then tireki="世界史";
  if tireki="日本史-2" then tireki="日本史";
  if tireki="日本史-3" then tireki="日本史";

  /* ... remaining statements omitted in the original ... */
[例4] 複数の処理をさせたい場合 : do 〜 end で囲む
/* When the condition holds, execute several statements as one unit by
   wrapping them in DO ... END. */
if tireki="世界史-0" then
  do;
    tireki="世界史";
    koumin=.;   /* NOTE(review): "." is the numeric missing value - confirm koumin is numeric in the enclosing DATA step */
  end;
...
[比較演算子]
/* Lesson 11-2 */
/* File Name = les1102.sas   06/19/08 */

/* Read the student data (no records are excluded in this lesson). */
data gakusei;
  infile 'all08ae.prn' firstobs=2;   /* firstobs=2: begin reading at record 2 of the file */
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* Frequency counts, one variable at a time (one-way tables). */
proc freq data=gakusei;
  tables sex jitaku carryer;
run;

/* Frequency counts for pairs of variables (two-way cross tables). */
proc freq data=gakusei;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
SAS システム 1 19:37 Tuesday, June 17, 2008 OBS SEX SHINTYOU TAIJYUU KYOUI JITAKU KODUKAI CARRYER TSUUWA 1 F 145.0 38 . J 10000 . 2 F 146.7 41 85 J 10000 Vodafone 6000 3 F 148.0 42 . J 50000 . 4 F 148.0 43 80 J 50000 DoCoMo 4000 5 F 148.9 . . J 60000 . SAS システム 2 19:37 Tuesday, June 17, 2008 Cumulative Cumulative SEX Frequency Percent Frequency Percent ------------------------------------------------- F 134 34.4 134 34.4 M 256 65.6 390 100.0 Frequency Missing = 5 Cumulative Cumulative JITAKU Frequency Percent Frequency Percent ---------------------------------------------------- G 129 38.2 129 38.2 J 209 61.8 338 100.0 Frequency Missing = 57 SAS システム 4 19:37 Tuesday, June 17, 2008 Cumulative Cumulative CARRYER Frequency Percent Frequency Percent ------------------------------------------------------ DDIp 2 1.2 2 1.2 DoCoMo 71 42.8 73 44.0 J-PHONE 10 6.0 83 50.0 KDDI 1 0.6 84 50.6 No 5 3.0 89 53.6 Vodafone 20 12.0 109 65.7 Willcom 1 0.6 110 66.3 au 44 26.5 154 92.8 au+willc 1 0.6 155 93.4 docomo 5 3.0 160 96.4 docomo+w 1 0.6 161 97.0 softbank 4 2.4 165 99.4 vodafone 1 0.6 166 100.0 Frequency Missing = 229 SAS システム 6 19:37 Tuesday, June 17, 2008 TABLE OF SEX BY JITAKU SEX JITAKU Frequency| Percent | Row Pct | Col Pct |G |J | Total ---------+--------+--------+ F | 40 | 75 | 115 | 11.90 | 22.32 | 34.23 | 34.78 | 65.22 | | 31.25 | 36.06 | ---------+--------+--------+ M | 88 | 133 | 221 | 26.19 | 39.58 | 65.77 | 39.82 | 60.18 | | 68.75 | 63.94 | ---------+--------+--------+ Total 128 208 336 38.10 61.90 100.00 Frequency Missing = 59 SAS システム 9 19:37 Tuesday, June 17, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No | Total ---------+--------+--------+--------+--------+--------+ F | 1 | 30 | 4 | 0 | 1 | 66 | 0.61 | 18.18 | 2.42 | 0.00 | 0.61 | 40.00 | 1.52 | 45.45 | 6.06 | 0.00 | 1.52 | | 50.00 | 42.25 | 44.44 | 0.00 | 20.00 | ---------+--------+--------+--------+--------+--------+ M | 1 | 41 | 5 | 1 | 4 | 99 | 
0.61 | 24.85 | 3.03 | 0.61 | 2.42 | 60.00 | 1.01 | 41.41 | 5.05 | 1.01 | 4.04 | | 50.00 | 57.75 | 55.56 | 100.00 | 80.00 | ---------+--------+--------+--------+--------+--------+ Total 2 71 9 1 5 165 1.21 43.03 5.45 0.61 3.03 100.00 (Continued) SAS システム 11 19:37 Tuesday, June 17, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |Vodafone|Willcom |au |au+willc|docomo | Total ---------+--------+--------+--------+--------+--------+ F | 9 | 1 | 15 | 1 | 1 | 66 | 5.45 | 0.61 | 9.09 | 0.61 | 0.61 | 40.00 | 13.64 | 1.52 | 22.73 | 1.52 | 1.52 | | 45.00 | 100.00 | 34.09 | 100.00 | 20.00 | ---------+--------+--------+--------+--------+--------+ M | 11 | 0 | 29 | 0 | 4 | 99 | 6.67 | 0.00 | 17.58 | 0.00 | 2.42 | 60.00 | 11.11 | 0.00 | 29.29 | 0.00 | 4.04 | | 55.00 | 0.00 | 65.91 | 0.00 | 80.00 | ---------+--------+--------+--------+--------+--------+ Total 20 1 44 1 5 165 12.12 0.61 26.67 0.61 3.03 100.00 (Continued) SAS システム 13 19:37 Tuesday, June 17, 2008 TABLE OF SEX BY CARRYER SEX CARRYER Frequency| Percent | Row Pct | Col Pct |docomo+w|softbank|vodafone| Total ---------+--------+--------+--------+ F | 0 | 3 | 0 | 66 | 0.00 | 1.82 | 0.00 | 40.00 | 0.00 | 4.55 | 0.00 | | 0.00 | 75.00 | 0.00 | ---------+--------+--------+--------+ M | 1 | 1 | 1 | 99 | 0.61 | 0.61 | 0.61 | 60.00 | 1.01 | 1.01 | 1.01 | | 100.00 | 25.00 | 100.00 | ---------+--------+--------+--------+ Total 1 4 1 165 0.61 2.42 0.61 100.00 Frequency Missing = 230 SAS システム 16 19:37 Tuesday, June 17, 2008 TABLE OF JITAKU BY CARRYER JITAKU CARRYER Frequency| Percent | Row Pct | Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No | Total ---------+--------+--------+--------+--------+--------+ G | 1 | 27 | 4 | 1 | 0 | 55 | 0.70 | 19.01 | 2.82 | 0.70 | 0.00 | 38.73 | 1.82 | 49.09 | 7.27 | 1.82 | 0.00 | | 100.00 | 44.26 | 44.44 | 100.00 | 0.00 | ---------+--------+--------+--------+--------+--------+ J | 0 | 34 | 5 | 0 | 4 | 87 | 0.00 | 23.94 | 3.52 | 0.00 | 2.82 | 61.27 | 0.00 | 39.08 | 5.75 | 0.00 
| 4.60 | | 0.00 | 55.74 | 55.56 | 0.00 | 100.00 | ---------+--------+--------+--------+--------+--------+ Total 1 61 9 1 4 142 0.70 42.96 6.34 0.70 2.82 100.00 (Continued) ≪以下略≫
/* ... preceding statements omitted in the original ... */

/* Normalise the capitalisation of carrier names, e.g. so that "docomo"
   is stored with the same spelling as "DoCoMo". The source and
   target spellings never overlap, so an IF / ELSE IF chain is equivalent
   to testing each condition independently. */
if      carryer="au+willc" then carryer="au+Willc";
else if carryer="docomo"   then carryer="DoCoMo";
else if carryer="docomo+w" then carryer="DoCoMo+W";
else if carryer="vodafone" then carryer="Vodafone";

/* ... following statements omitted in the original ... */
/* ... preceding statements omitted in the original ... */

/* order=freq: list the categories from most to least frequent. */
proc freq data=gakusei order=freq;
  tables sex jitaku carryer;
run;

/* Same ordering applied to the two-way cross tables. */
proc freq data=gakusei order=freq;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;

/* ... following statements omitted in the original ... */
/* Lesson 11-5 */
/* File Name = les1105.sas   06/19/08 */

/* Read the student data. */
data gakusei;
  infile 'all08ae.prn' firstobs=2;   /* firstobs=2: begin reading at record 2 of the file */
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;
run;

/* Define the classes used to group height. The format name clshint
   stands for "class shintyou". */
proc format;
  value clshint
    low-<150 = ' -149'      /* class 1: below 150 */
    150-<160 = '150-159'    /* class 2 */
    160-<170 = '160-169'    /* class 3 */
    170-<180 = '170-179'    /* class 4 */
    180-high = '180- '      /* class 5: 180 and above */
    other    = 'missing';   /* class 6: anything not covered above */
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* One-way frequency table of height. */
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes defined above */
run;

/* Two-way table: sex by height class. */
proc freq data=gakusei;
  tables sex*shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes defined above */
run;

/* The same breakdown done the way used in earlier lessons:
   sort by sex, then produce a separate table for each sex. */
proc sort data=gakusei;
  by sex;
run;

proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into the classes defined above */
  by sex;                     /* one table per sex */
run;
SAS システム 2 19:37 Tuesday, June 17, 2008 Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ -149 6 1.6 6 1.6 150-159 60 15.8 66 17.4 160-169 129 33.9 195 51.3 170-179 159 41.8 354 93.2 180- 26 6.8 380 100.0 Frequency Missing = 15 SAS システム 3 19:37 Tuesday, June 17, 2008 TABLE OF SEX BY SHINTYOU SEX SHINTYOU Frequency| Percent | Row Pct | Col Pct | -149 |150-159 |160-169 |170-179 |180- | Total ---------+--------+--------+--------+--------+--------+ F | 6 | 58 | 61 | 2 | 0 | 127 | 1.58 | 15.30 | 16.09 | 0.53 | 0.00 | 33.51 | 4.72 | 45.67 | 48.03 | 1.57 | 0.00 | | 100.00 | 96.67 | 47.66 | 1.26 | 0.00 | ---------+--------+--------+--------+--------+--------+ M | 0 | 2 | 67 | 157 | 26 | 252 | 0.00 | 0.53 | 17.68 | 41.42 | 6.86 | 66.49 | 0.00 | 0.79 | 26.59 | 62.30 | 10.32 | | 0.00 | 3.33 | 52.34 | 98.74 | 100.00 | ---------+--------+--------+--------+--------+--------+ Total 6 60 128 159 26 379 1.58 15.83 33.77 41.95 6.86 100.00 Frequency Missing = 16 SAS システム 6 19:37 Tuesday, June 17, 2008 ------------------------------- SEX=' ' -------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ 160-169 1 100.0 1 100.0 Frequency Missing = 4 SAS システム 7 19:37 Tuesday, June 17, 2008 -------------------------------- SEX=F --------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ -149 6 4.7 6 4.7 150-159 58 45.7 64 50.4 160-169 61 48.0 125 98.4 170-179 2 1.6 127 100.0 Frequency Missing = 7 SAS システム 8 19:37 Tuesday, June 17, 2008 -------------------------------- SEX=M --------------------------------- Cumulative Cumulative SHINTYOU Frequency Percent Frequency Percent ------------------------------------------------------ 150-159 2 0.8 2 0.8 160-169 67 26.6 69 27.4 170-179 157 62.3 226 89.7 180- 26 10.3 252 100.0 Frequency 
Missing = 4
/*
 * Read a comma-separated file into mon2008.
 * All INFILE options must sit inside a single INFILE statement; a semicolon
 * placed before the last options would end the statement early and leave
 * those options behind as an invalid statement of their own.
 */
data mon2008;
  infile 'd:\home\mon05d.csv'
         dlm=','        /* fields are separated by commas */
         dsd            /* two consecutive delimiters mean a missing value; quotes around values are removed */
         firstobs=2     /* begin reading at record 2 of the file */
         truncover;     /* a short record does not make INPUT continue onto the next line;
                           MISSOVER is not specified as well because the two options conflict */
  input No       $
        Univ     : $30.
        SName    : $40.
        Faculty  : $50.
        Dept     : $50.
        Center1  : $8.
        Center2  : $8.
        Sel1     : $8.
        Sel2     : $8.
        Book1    : $10.
        Book2    : $10.
        Vol0
        VolS
        VolT
        ZenKou   $
        ScoreS
        ScoreT
        KoKouSi
        ;
/* Tab-delimited variant: '09'x is the hexadecimal character constant
   used here as the field delimiter. */
data mon2008;
  infile 'd:\home\mon05e.txt'
         dlm='09'x
         firstobs=2
         truncover;
/* Read foo.dat with the logical record length set to 230 (lrecl=230). */
data math;
  infile 'foo.dat'
         lrecl=230;
/* Read foo.dat with the logical record length set to 230 (lrecl=230),
   additionally specifying truncover for records shorter than INPUT expects. */
data math;
  infile 'foo.dat'
         lrecl=230
         truncover;
/* Column input: each variable is read from the column position(s)
   written after its name; "$" marks a character variable. */
input kamoku   $ 2
      kesseki  $ 3
      k_code   $ 10-11
      t_score    12-14
      s_scor01   103-104
      s_scor02   105-106
      s_scor03   107-108
      s_scor04   109-110
      ;
/* Read foo.dat starting from record 4 (firstobs=4). */
data math;
  infile 'foo.dat'
         firstobs=4;