前回までに、分布特性を把握するためのいくつかの指標を説明し、
その使い方を示すとともに、利用上の注意点についても注意を喚起した。また、グループ分けが有用であることも説明した。
解析の過程では、特徴の異なるサンプルや外れ値を除外することもあるので、
その方法について紹介する。
また、単純集計としてよく利用される頻度集計やクロス集計の方法についても
紹介する。
/* Lesson 11-1 */
/* File Name = les1101.sas 06/19/08 */

/* Read the survey file and drop records that should not enter the analysis.
   firstobs=2 starts reading at the second line of the file
   (presumably the first line is a header row - confirm against the data). */
data gakusei;
  infile 'all08ae.prn'
    firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;
  if sex^='M' & sex^='F' then delete;   /* exclude records whose sex is neither M nor F */
  if kodukai>=200000 then delete;       /* exclude records with kodukai of 200,000 or more;
                                           a missing kodukai compares low and is kept */
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* Whole-sample summaries: basic statistics, detailed distribution
   statistics with plots, and horizontal bar charts. */
proc means data=gakusei;
run;
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
run;
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
run;

/* BY-group processing requires the data to be sorted by the BY variable. */
proc sort data=gakusei;
  by sex;
run;

/* The same summaries, produced separately for each value of sex. */
proc means data=gakusei;
  by sex;
run;
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
  by sex;
run;
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* group=sex places both sexes in a single chart instead of one chart per BY group. */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai/group=sex;
run;
SAS システム 2
19:37 Tuesday, June 17, 2008
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 374 167.7489305 8.2695466 145.0000000 188.0000000
TAIJYUU 335 58.7367164 9.2250113 35.0000000 100.0000000
KYOUI 113 86.4867257 7.5155969 56.0000000 112.0000000
KODUKAI 359 44732.59 41443.43 0 180000.00
TSUUWA 165 6556.25 4366.12 0 30000.00
---------------------------------------------------------------------
SAS システム 21
19:37 Tuesday, June 17, 2008
Univariate Procedure
Variable=KODUKAI
Moments
N 359 Sum Wgts 359
Mean 44732.59 Sum 16059000
Std Dev 41443.43 Variance 1.7176E9
Skewness 1.188306 Kurtosis 0.804002
USS 1.333E12 CSS 6.149E11
CV 92.64707 Std Mean 2187.301
T:Mean=0 20.45105 Pr>|T| 0.0001
Num ^= 0 302 Num > 0 302
M(Sign) 151 Pr>=|M| 0.0001
Sgn Rank 22876.5 Pr>=|S| 0.0001
SAS システム 22
19:37 Tuesday, June 17, 2008
Univariate Procedure
Variable=KODUKAI
Quantiles(Def=5)
100% Max 180000 99% 160000
75% Q3 60000 95% 150000
50% Med 30000 90% 100000
25% Q1 20000 10% 0
0% Min 0 5% 0
1% 0
Range 180000
Q3-Q1 40000
Mode 0
SAS システム 25
19:37 Tuesday, June 17, 2008
Univariate Procedure
Variable=KODUKAI
Histogram # Boxplot
190000+* 1 0
.** 6 0
.**** 12 0
130000+***** 14 0
.******** 23 |
.**** 11 |
70000+************* 38 +-----+
.****************** 54 | + |
.************************************** 112 *-----*
10000+****************************** 88 |
----+----+----+----+----+----+----+---
* may represent up to 3 counts
SAS システム 33
19:37 Tuesday, June 17, 2008
--------------------------------- SEX=F --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 125 158.9016000 5.2486555 145.0000000 171.0000000
TAIJYUU 86 48.8720930 4.7986187 35.0000000 60.0000000
KYOUI 43 83.0000000 3.9400266 70.0000000 90.0000000
KODUKAI 121 44752.07 35327.05 0 180000.00
TSUUWA 68 6576.24 4187.38 80.0000000 25000.00
---------------------------------------------------------------------
SAS システム 34
19:37 Tuesday, June 17, 2008
--------------------------------- SEX=M --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 249 172.1903614 5.4608197 156.0000000 188.0000000
TAIJYUU 249 62.1437751 7.8306863 46.0000000 100.0000000
KYOUI 70 88.6285714 8.3668972 56.0000000 112.0000000
KODUKAI 238 44722.69 44300.74 0 165000.00
TSUUWA 97 6542.25 4508.67 0 30000.00
---------------------------------------------------------------------
SAS システム 91
19:37 Tuesday, June 17, 2008
Univariate Procedure
Schematic Plots
Variable=SHINTYOU
200 +
|
| 0
180 + |
| | *--+--*
| | +-----+
160 + *--+--* 0
| +-----+ 0
| 0
140 +
------------+-----------+-----------
SEX F M
SAS システム 92
19:37 Tuesday, June 17, 2008
Univariate Procedure
Schematic Plots
Variable=TAIJYUU
|
100 + *
| 0
| | *--+--*
50 + *--+--* +-----+
| 0
|
0 +
------------+-----------+-----------
SEX F M
SAS システム 105
19:37 Tuesday, June 17, 2008
SEX SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 144 | 1 1 0.27 0.27
148 |* 5 6 1.34 1.60
152 |*** 16 22 4.28 5.88
156 |***** 27 49 7.22 13.10
160 |******* 36 85 9.63 22.73
164 |***** 26 111 6.95 29.68
168 |** 12 123 3.21 32.89
172 | 2 125 0.53 33.42
176 | 0 125 0.00 33.42
180 | 0 125 0.00 33.42
184 | 0 125 0.00 33.42
188 | 0 125 0.00 33.42
|
M 144 | 0 125 0.00 33.42
148 | 0 125 0.00 33.42
152 | 0 125 0.00 33.42
156 | 2 127 0.53 33.96
160 |* 6 133 1.60 35.56
164 |*** 17 150 4.55 40.11
168 |********* 44 194 11.76 51.87
172 |****************** 89 283 23.80 75.67
176 |********** 48 331 12.83 88.50
180 |****** 32 363 8.56 97.06
184 |** 9 372 2.41 99.47
188 | 2 374 0.53 100.00
|
----+---+---+---+--
20 40 60 80
Frequency
SAS システム 111
19:37 Tuesday, June 17, 2008
SEX KODUKAI Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 0 |****** 15 15 4.18 4.18
20000 |********** 26 41 7.24 11.42
40000 |*********** 28 69 7.80 19.22
60000 |************ 29 98 8.08 27.30
80000 |**** 10 108 2.79 30.08
100000 |** 5 113 1.39 31.48
120000 |* 3 116 0.84 32.31
140000 |* 2 118 0.56 32.87
160000 |* 2 120 0.56 33.43
180000 | 1 121 0.28 33.70
|
M 0 |********************* 52 173 14.48 48.19
20000 |******************* 47 220 13.09 61.28
40000 |********************* 53 273 14.76 76.04
60000 |************ 30 303 8.36 84.40
80000 |***** 12 315 3.34 87.74
100000 |******* 18 333 5.01 92.76
120000 |*** 8 341 2.23 94.99
140000 |* 3 344 0.84 95.82
160000 |****** 15 359 4.18 100.00
180000 | 0 359 0.00 100.00
|
----+---+---+---+---+-
10 20 30 40 50
Frequency
data seito08;
  infile 'seito.prn';
  /* univ is read with an explicit width: default list input keeps only
     8 bytes, which cannot hold the 5-character university names compared
     below, so those comparisons would never be true.
     NOTE(review): tireki is still read at the default 8 bytes; that is enough
     for "世界史-0" in a 2-byte encoding but not in UTF-8 - confirm the session
     encoding. */
  input id $ sex $ kesseki $ univ : $20.
        koku $ suu1 $ suu2 $ tireki $ koumin $ rika $;
  if sex^='M' then delete; /* male only */
  if kesseki^='0' then delete; /* attendees only */
  /* Declare the length before the first assignment. Without this, area
     takes the length of "不明" and the longer region names assigned below
     are silently truncated. */
  length area $ 12;
  area="不明";
  if univ="早稲田大学" then area="東日本";
  if univ="慶応大学" then area="東日本";
  if univ="関西大学" then area="西日本";
  if univ="同志社大学" then area="西日本";
  /* Collapse the sub-coded subject names into the plain subject name. */
  if tireki="世界史-0" then tireki="世界史";
  if tireki="世界史-2" then tireki="世界史";
  if tireki="日本史-2" then tireki="日本史";
  if tireki="日本史-3" then tireki="日本史";
...
[例4] 複数の処理をさせたい場合 : do 〜 end で囲む
/* Several statements under one condition are wrapped in do ... end. */
if tireki="世界史-0" then do;
  tireki="世界史";
  /* koumin is read as a character variable (input ... koumin $), so its
     missing value is a blank. Assigning the numeric missing value (.)
     would store a converted "." string instead. */
  koumin=" ";
end;
...
[比較演算子]
/* Lesson 11-2 */
/* File Name = les1102.sas 06/19/08 */
/* Read the survey file starting at line 2; no records are excluded here. */
data gakusei;
  infile 'all08ae.prn'
    firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* One-way frequency tables, one per variable. */
proc freq data=gakusei;
  tables sex jitaku carryer;
run;

/* Two-way cross tabulations for each pair of variables. */
proc freq data=gakusei;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
SAS システム 1
19:37 Tuesday, June 17, 2008
OBS SEX SHINTYOU TAIJYUU KYOUI JITAKU KODUKAI CARRYER TSUUWA
1 F 145.0 38 . J 10000 .
2 F 146.7 41 85 J 10000 Vodafone 6000
3 F 148.0 42 . J 50000 .
4 F 148.0 43 80 J 50000 DoCoMo 4000
5 F 148.9 . . J 60000 .
SAS システム 2
19:37 Tuesday, June 17, 2008
Cumulative Cumulative
SEX Frequency Percent Frequency Percent
-------------------------------------------------
F 134 34.4 134 34.4
M 256 65.6 390 100.0
Frequency Missing = 5
Cumulative Cumulative
JITAKU Frequency Percent Frequency Percent
----------------------------------------------------
G 129 38.2 129 38.2
J 209 61.8 338 100.0
Frequency Missing = 57
SAS システム 4
19:37 Tuesday, June 17, 2008
Cumulative Cumulative
CARRYER Frequency Percent Frequency Percent
------------------------------------------------------
DDIp 2 1.2 2 1.2
DoCoMo 71 42.8 73 44.0
J-PHONE 10 6.0 83 50.0
KDDI 1 0.6 84 50.6
No 5 3.0 89 53.6
Vodafone 20 12.0 109 65.7
Willcom 1 0.6 110 66.3
au 44 26.5 154 92.8
au+willc 1 0.6 155 93.4
docomo 5 3.0 160 96.4
docomo+w 1 0.6 161 97.0
softbank 4 2.4 165 99.4
vodafone 1 0.6 166 100.0
Frequency Missing = 229
SAS システム 6
19:37 Tuesday, June 17, 2008
TABLE OF SEX BY JITAKU
SEX JITAKU
Frequency|
Percent |
Row Pct |
Col Pct |G |J | Total
---------+--------+--------+
F | 40 | 75 | 115
| 11.90 | 22.32 | 34.23
| 34.78 | 65.22 |
| 31.25 | 36.06 |
---------+--------+--------+
M | 88 | 133 | 221
| 26.19 | 39.58 | 65.77
| 39.82 | 60.18 |
| 68.75 | 63.94 |
---------+--------+--------+
Total 128 208 336
38.10 61.90 100.00
Frequency Missing = 59
SAS システム 9
19:37 Tuesday, June 17, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No | Total
---------+--------+--------+--------+--------+--------+
F | 1 | 30 | 4 | 0 | 1 | 66
| 0.61 | 18.18 | 2.42 | 0.00 | 0.61 | 40.00
| 1.52 | 45.45 | 6.06 | 0.00 | 1.52 |
| 50.00 | 42.25 | 44.44 | 0.00 | 20.00 |
---------+--------+--------+--------+--------+--------+
M | 1 | 41 | 5 | 1 | 4 | 99
| 0.61 | 24.85 | 3.03 | 0.61 | 2.42 | 60.00
| 1.01 | 41.41 | 5.05 | 1.01 | 4.04 |
| 50.00 | 57.75 | 55.56 | 100.00 | 80.00 |
---------+--------+--------+--------+--------+--------+
Total 2 71 9 1 5 165
1.21 43.03 5.45 0.61 3.03 100.00
(Continued)
SAS システム 11
19:37 Tuesday, June 17, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |Vodafone|Willcom |au |au+willc|docomo | Total
---------+--------+--------+--------+--------+--------+
F | 9 | 1 | 15 | 1 | 1 | 66
| 5.45 | 0.61 | 9.09 | 0.61 | 0.61 | 40.00
| 13.64 | 1.52 | 22.73 | 1.52 | 1.52 |
| 45.00 | 100.00 | 34.09 | 100.00 | 20.00 |
---------+--------+--------+--------+--------+--------+
M | 11 | 0 | 29 | 0 | 4 | 99
| 6.67 | 0.00 | 17.58 | 0.00 | 2.42 | 60.00
| 11.11 | 0.00 | 29.29 | 0.00 | 4.04 |
| 55.00 | 0.00 | 65.91 | 0.00 | 80.00 |
---------+--------+--------+--------+--------+--------+
Total 20 1 44 1 5 165
12.12 0.61 26.67 0.61 3.03 100.00
(Continued)
SAS システム 13
19:37 Tuesday, June 17, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |docomo+w|softbank|vodafone| Total
---------+--------+--------+--------+
F | 0 | 3 | 0 | 66
| 0.00 | 1.82 | 0.00 | 40.00
| 0.00 | 4.55 | 0.00 |
| 0.00 | 75.00 | 0.00 |
---------+--------+--------+--------+
M | 1 | 1 | 1 | 99
| 0.61 | 0.61 | 0.61 | 60.00
| 1.01 | 1.01 | 1.01 |
| 100.00 | 25.00 | 100.00 |
---------+--------+--------+--------+
Total 1 4 1 165
0.61 2.42 0.61 100.00
Frequency Missing = 230
SAS システム 16
19:37 Tuesday, June 17, 2008
TABLE OF JITAKU BY CARRYER
JITAKU CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No | Total
---------+--------+--------+--------+--------+--------+
G | 1 | 27 | 4 | 1 | 0 | 55
| 0.70 | 19.01 | 2.82 | 0.70 | 0.00 | 38.73
| 1.82 | 49.09 | 7.27 | 1.82 | 0.00 |
| 100.00 | 44.26 | 44.44 | 100.00 | 0.00 |
---------+--------+--------+--------+--------+--------+
J | 0 | 34 | 5 | 0 | 4 | 87
| 0.00 | 23.94 | 3.52 | 0.00 | 2.82 | 61.27
| 0.00 | 39.08 | 5.75 | 0.00 | 4.60 |
| 0.00 | 55.74 | 55.56 | 0.00 | 100.00 |
---------+--------+--------+--------+--------+--------+
Total 1 61 9 1 4 142
0.70 42.96 6.34 0.70 2.82 100.00
(Continued)
≪以下略≫
≪前略≫
/* Normalise the spelling variants of the carrier name. */
select (carryer);
  when ("au+willc") carryer="au+Willc";
  when ("docomo")   carryer="DoCoMo";
  when ("docomo+w") carryer="DoCoMo+W";
  when ("vodafone") carryer="Vodafone";
  otherwise;        /* every other value is left unchanged */
end;
≪後略≫
≪前略≫
/* One-way tables with levels listed in descending order of frequency. */
proc freq data=gakusei order=freq;
  tables sex jitaku carryer;
run;

/* Cross tabulations, also ordered by descending frequency. */
proc freq data=gakusei order=freq;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
≪後略≫
/* Lesson 11-5 */
/* File Name = les1105.sas 06/19/08 */

/* Read the survey file starting at line 2. */
data gakusei;
  infile 'all08ae.prn'
    firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;
run;

/* Define height classes; the format name stands for "class of shintyou". */
proc format;
  value clshint low-<150=' -149'      /* class 1 */
                150-<160='150-159'    /* class 2 */
                160-<170='160-169'    /* class 3 */
                170-<180='170-179'    /* class 4 */
                180-high='180- '      /* class 5 */
                other ='missing';     /* class 6: anything not covered above */
run;

/* Show the first five observations as a sanity check. */
proc print data=gakusei(obs=5);
run;

/* One-way frequency table of height, grouped by the class format. */
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;           /* group the continuous variable into classes */
run;

/* Cross tabulation of sex by height class. */
proc freq data=gakusei;
  tables sex*shintyou;
  format shintyou clshint.;           /* group the continuous variable into classes */
run;

/* The same breakdown using the approach shown so far: sort, then BY. */
proc sort data=gakusei;
  by sex;
run;
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;           /* group the continuous variable into classes */
  by sex;                             /* one table per sex */
run;
SAS システム 2
19:37 Tuesday, June 17, 2008
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
-149 6 1.6 6 1.6
150-159 60 15.8 66 17.4
160-169 129 33.9 195 51.3
170-179 159 41.8 354 93.2
180- 26 6.8 380 100.0
Frequency Missing = 15
SAS システム 3
19:37 Tuesday, June 17, 2008
TABLE OF SEX BY SHINTYOU
SEX SHINTYOU
Frequency|
Percent |
Row Pct |
Col Pct | -149 |150-159 |160-169 |170-179 |180- | Total
---------+--------+--------+--------+--------+--------+
F | 6 | 58 | 61 | 2 | 0 | 127
| 1.58 | 15.30 | 16.09 | 0.53 | 0.00 | 33.51
| 4.72 | 45.67 | 48.03 | 1.57 | 0.00 |
| 100.00 | 96.67 | 47.66 | 1.26 | 0.00 |
---------+--------+--------+--------+--------+--------+
M | 0 | 2 | 67 | 157 | 26 | 252
| 0.00 | 0.53 | 17.68 | 41.42 | 6.86 | 66.49
| 0.00 | 0.79 | 26.59 | 62.30 | 10.32 |
| 0.00 | 3.33 | 52.34 | 98.74 | 100.00 |
---------+--------+--------+--------+--------+--------+
Total 6 60 128 159 26 379
1.58 15.83 33.77 41.95 6.86 100.00
Frequency Missing = 16
SAS システム 6
19:37 Tuesday, June 17, 2008
------------------------------- SEX=' ' --------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
160-169 1 100.0 1 100.0
Frequency Missing = 4
SAS システム 7
19:37 Tuesday, June 17, 2008
-------------------------------- SEX=F ---------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
-149 6 4.7 6 4.7
150-159 58 45.7 64 50.4
160-169 61 48.0 125 98.4
170-179 2 1.6 127 100.0
Frequency Missing = 7
SAS システム 8
19:37 Tuesday, June 17, 2008
-------------------------------- SEX=M ---------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
150-159 2 0.8 2 0.8
160-169 67 26.6 69 27.4
170-179 157 62.3 226 89.7
180- 26 10.3 252 100.0
Frequency Missing = 4
data mon2008;
  /* Comma-separated input, read from line 2 onward.
     All options must sit inside the single INFILE statement. The stray
     semicolon that used to follow TRUNCOVER ended the statement early and
     left "missover dsd ;" as an invalid statement, so DSD was never applied.
     MISSOVER is dropped because it and TRUNCOVER are alternative settings
     for how short records are handled; TRUNCOVER is the one kept.
     DSD treats consecutive delimiters as missing values and strips quotes. */
  infile 'd:\home\mon05d.csv' dlm=','
         firstobs=2
         truncover
         dsd
         ;
  /* The ":" modifier reads each delimited field up to the given width. */
  input No $ Univ : $30. SName : $40. Faculty : $50. Dept : $50.
        Center1 : $8. Center2 : $8. Sel1 : $8. Sel2 : $8.
        Book1 : $10. Book2 : $10.
        Vol0 VolS VolT
        ZenKou $ ScoreS ScoreT KoKouSi
        ;
/* Tab-delimited variant: '09'x is the hexadecimal code of the tab character. */
data mon2008;
  infile 'd:\home\mon05e.txt'
         delimiter='09'x
         firstobs=2
         truncover;
/* Set the logical record length to 230 for this input file. */
data math;
  infile 'foo.dat' lrecl=230;
/* Fixed-column input; TRUNCOVER keeps short records from spilling onto the next line. */
data math;
  infile 'foo.dat' lrecl=230 truncover;
  input kamoku   $ 2-2
        kesseki  $ 3-3
        k_code   $ 10-11
        t_score    12-14
        s_scor01   103-104
        s_scor02   105-106
        s_scor03   107-108
        s_scor04   109-110;
/* Start reading at the fourth line of the file. */
data math;
  infile 'foo.dat' firstobs=4;