前々回までに、分布特性を把握するためのいくつかの指標とその使い方を説明し、注意点を示した。それらを踏まえて、データの特性を考慮しながらグループ毎の集計を行なうと、今までは判らなかったデータの特徴を把握することができる。また、外れ値を除外して解析する方法についても紹介する。
/* Lesson 7-01 */
/* File Name = les0701.sas 11/15/06 */

/* Build data set gakusei from all06be.prn; reading starts at the 2nd record. */
data gakusei;
  infile 'all06be.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;
run;

/* List the first five observations as a quick check of the import. */
proc print data=gakusei(obs=5);
run;

/* Summary statistics; with no VAR statement every numeric variable is used. */
proc means data=gakusei;
run;

/* Detailed univariate statistics, with plots, for the selected variables. */
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
run;
proc chart data=gakusei;                            /* histogram */
  hbar shintyou taijyuu kyoui kodukai;              /* horizontal bar chart of each listed variable */
run;

proc sort data=gakusei;                             /* sort the data set */
  by sex;                                           /* by sex (required before the BY-group steps below) */
run;

proc means data=gakusei;                            /* compute means */
  by sex;                                           /* separately for each sex */
run;
proc univariate data=gakusei plot;                  /* compute basic statistics */
  var shintyou taijyuu kyoui kodukai;               /* for the listed variables */
  by sex;                                           /* separately for each sex */
run;
proc chart data=gakusei;                            /* histogram */
  hbar shintyou taijyuu kyoui kodukai;              /* horizontal bar chart of each listed variable */
  by sex;                                           /* separately for each sex */
run;

proc chart data=gakusei;                            /* histogram */
  hbar shintyou taijyuu kyoui kodukai/group=sex;    /* sexes placed side by side in one chart */
run;
SAS システム 2
19:32 Tuesday, November 14, 2006
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 353 167.9450425 8.1156439 145.0000000 186.0000000
TAIJYUU 320 58.7368750 9.2439500 35.0000000 100.0000000
KYOUI 112 86.5446429 7.5168141 56.0000000 112.0000000
KODUKAI 341 48020.53 47469.51 0 300000.00
TSUUWA 141 6526.26 4353.60 0 30000.00
---------------------------------------------------------------------
SAS システム 3
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=SHINTYOU
Moments
N 353 Sum Wgts 353
Mean 167.945 Sum 59284.6
Std Dev 8.115644 Variance 65.86368
Skewness -0.37194 Kurtosis -0.39749
USS 9979739 CSS 23184.01
CV 4.832321 Std Mean 0.431952
T:Mean=0 388.8047 Pr>|T| 0.0001
Num ^= 0 353 Num > 0 353
M(Sign) 176.5 Pr>=|M| 0.0001
Sgn Rank 31240.5 Pr>=|S| 0.0001
SAS システム 4
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=SHINTYOU
Quantiles(Def=5)
100% Max 186 99% 184
75% Q3 173.8 95% 180
50% Med 169.3 90% 178
25% Q1 162 10% 156
0% Min 145 5% 153
1% 148
Range 41
Q3-Q1 11.8
Mode 170
SAS システム 7
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=SHINTYOU
Histogram # Boxplot
187.5+* 2 |
.******* 21 |
.****************** 54 |
.********************************* 98 +-----+
167.5+*********************** 68 *--+--*
.****************** 52 +-----+
.************ 36 |
.****** 16 |
147.5+** 6 |
----+----+----+----+----+----+---
* may represent up to 3 counts
SAS システム 21
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Moments
N 341 Sum Wgts 341
Mean 48020.53 Sum 16375000
Std Dev 47469.51 Variance 2.2534E9
Skewness 1.749481 Kurtosis 4.445074
USS 1.552E12 CSS 7.661E11
CV 98.85253 Std Mean 2570.618
T:Mean=0 18.68054 Pr>|T| 0.0001
Num ^= 0 288 Num > 0 288
M(Sign) 144 Pr>=|M| 0.0001
Sgn Rank 20808 Pr>=|S| 0.0001
SAS システム 22
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Quantiles(Def=5)
100% Max 300000 99% 200000
75% Q3 60000 95% 150000
50% Med 30000 90% 120000
25% Q1 20000 10% 0
0% Min 0 5% 0
1% 0
Range 300000
Q3-Q1 40000
Mode 0
SAS システム 25
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Histogram # Boxplot
325000+* 2 *
.
.* 2 *
175000+**** 18 0
.******** 37 0
.**************** 76 +-----+
25000+****************************************** 206 *--+--*
----+----+----+----+----+----+----+----+--
* may represent up to 5 counts
SAS システム 27
19:32 Tuesday, November 14, 2006
SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
146 |* 2 2 0.57 0.57
150 |*** 8 10 2.27 2.83
154 |****** 16 26 4.53 7.37
158 |************* 32 58 9.07 16.43
162 |****************** 45 103 12.75 29.18
166 |******************* 47 150 13.31 42.49
170 |****************************** 74 224 20.96 63.46
174 |**************************** 70 294 19.83 83.29
178 |************** 36 330 10.20 93.48
182 |******** 19 349 5.38 98.87
186 |** 4 353 1.13 100.00
|
----+---+---+---+---+---+---+--
10 20 30 40 50 60 70
SAS システム 31
19:32 Tuesday, November 14, 2006
KODUKAI Cum. Cum.
Midpoint Freq Freq Percent Percent
|
0 |*************** 76 76 22.29 22.29
30000 |************************* 126 202 36.95 59.24
60000 |************* 65 267 19.06 78.30
90000 |******* 37 304 10.85 89.15
120000 |*** 14 318 4.11 93.26
150000 |*** 17 335 4.99 98.24
180000 | 2 337 0.59 98.83
210000 | 2 339 0.59 99.41
240000 | 0 339 0.00 99.41
270000 | 0 339 0.00 99.41
300000 | 2 341 0.59 100.00
|
----+---+---+---+---+---+-
20 40 60 80 100 120
SAS システム 33
19:32 Tuesday, November 14, 2006
--------------------------------- SEX=F --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 114 159.1245614 5.3845002 145.0000000 171.0000000
TAIJYUU 81 48.7160494 4.7777468 35.0000000 60.0000000
KYOUI 42 82.9523810 3.9382575 70.0000000 90.0000000
KODUKAI 111 49081.08 44989.87 0 300000.00
TSUUWA 55 6820.07 4413.43 200.0000000 25000.00
---------------------------------------------------------------------
SAS システム 34
19:32 Tuesday, November 14, 2006
--------------------------------- SEX=M --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 238 172.1697479 5.3528108 156.0000000 186.0000000
TAIJYUU 238 62.1420168 7.8319964 46.0000000 100.0000000
KYOUI 70 88.7000000 8.3133383 56.0000000 112.0000000
KODUKAI 228 47385.96 48771.07 0 300000.00
TSUUWA 85 6295.27 4337.49 0 30000.00
---------------------------------------------------------------------
SAS システム 53
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=F ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Moments
N 114 Sum Wgts 114
Mean 159.1246 Sum 18140.2
Std Dev 5.3845 Variance 28.99284
Skewness -0.236 Kurtosis -0.31118
USS 2889828 CSS 3276.191
CV 3.383827 Std Mean 0.504305
T:Mean=0 315.5326 Pr>|T| 0.0001
Num ^= 0 114 Num > 0 114
M(Sign) 57 Pr>=|M| 0.0001
Sgn Rank 3277.5 Pr>=|S| 0.0001
SAS システム 55
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=F ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Quantiles(Def=5)
100% Max 171 99% 170
75% Q3 163 95% 167
50% Med 160 90% 166
25% Q1 156 10% 152
0% Min 145 5% 149
1% 146.7
Range 26
Q3-Q1 7
Mode 156
SAS システム 58
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=F ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Stem Leaf # Boxplot
17 001 3 |
16 555555666666677778 18 |
16 00000000000000111112222222222333344444 38 +-----+
15 555566666666666667777888888999999 33 +--+--+
15 0112222333333444 16 |
14 578899 6 0
----+----+----+----+----+----+----+---
Multiply Stem.Leaf by 10**+1
SAS システム 81
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=M ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Moments
N 238 Sum Wgts 238
Mean 172.1697 Sum 40976.4
Std Dev 5.352811 Variance 28.65258
Skewness -0.12712 Kurtosis 0.159543
USS 7061687 CSS 6790.662
CV 3.109031 Std Mean 0.346971
T:Mean=0 496.2076 Pr>|T| 0.0001
Num ^= 0 238 Num > 0 238
M(Sign) 119 Pr>=|M| 0.0001
Sgn Rank 14220.5 Pr>=|S| 0.0001
SAS システム 83
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=M ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Quantiles(Def=5)
100% Max 186 99% 184
75% Q3 175 95% 181
50% Med 172 90% 179.9
25% Q1 169 10% 165
0% Min 156 5% 163
1% 160
Range 30
Q3-Q1 6
Mode 170
SAS システム 86
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=M ---------------------------------
Univariate Procedure
Variable=SHINTYOU
Histogram # Boxplot
187.5+* 2 0
.*********** 21 |
.*************************** 54 +-----+
172.5+************************************************ 96 *--+--*
.************************ 48 +-----+
.******** 15 |
157.5+* 2 0
----+----+----+----+----+----+----+----+----+---
* may represent up to 2 counts
SAS システム 109
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=SHINTYOU
200 +
|
| 0
180 + |
| | *--+--*
| *--+--* | +-----+
160 + *--+--* 0
| +-----+ 0
| 0
140 +
------------+-----------+-----------+-----------
SEX F M
SAS システム 110
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=TAIJYUU
|
100 + *
| 0
| *--+--* | *--+--*
50 + *--+--* +-----+
| 0
|
0 +
------------+-----------+-----------+-----------
SEX F M
SAS システム 111
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=KYOUI
|
150 +
|
| 0
100 + +-----+
| *--+--* *--+--*
| 0 0
50 + *
------------+-----------+-----------+-----------
SEX F M
SAS システム 112
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=KODUKAI
300000 + * *
|
|
200000 + * 0
| 0 0
| 0 |
100000 + +-----+ | |
| *--+--* +-----+ +-----+
| +-----+ *--+--* *--+--*
0 + | +-----+
------------+-----------+-----------+-----------
SEX F M
SAS システム 116
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=F ---------------------------------
SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
144 |* 1 1 0.88 0.88
147 |*** 3 4 2.63 3.51
150 |***** 5 9 4.39 7.89
153 |************* 13 22 11.40 19.30
156 |********************* 21 43 18.42 37.72
159 |************************** 26 69 22.81 60.53
162 |******************* 19 88 16.67 77.19
165 |****************** 18 106 15.79 92.98
168 |***** 5 111 4.39 97.37
171 |*** 3 114 2.63 100.00
|
-----+----+----+----+----+-
5 10 15 20 25
Frequency
SAS システム 121
19:32 Tuesday, November 14, 2006
-------------------------------- SEX=M ---------------------------------
SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
156 |* 2 2 0.84 0.84
159 |** 5 7 2.10 2.94
162 |*** 8 15 3.36 6.30
165 |***** 12 27 5.04 11.34
168 |*************** 38 65 15.97 27.31
171 |************************** 64 129 26.89 54.20
174 |******************** 50 179 21.01 75.21
177 |*********** 28 207 11.76 86.97
180 |******** 21 228 8.82 95.80
183 |*** 8 236 3.36 99.16
186 |* 2 238 0.84 100.00
|
----+---+---+---+---+---+--
10 20 30 40 50 60
Frequency
SAS システム 128
19:32 Tuesday, November 14, 2006
SEX SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
146 | 0 0 0.00 0.00
150 | 0 0 0.00 0.00
154 | 0 0 0.00 0.00
158 | 0 0 0.00 0.00
162 | 0 0 0.00 0.00
166 | 0 0 0.00 0.00
170 | 1 1 0.28 0.28
174 | 0 1 0.00 0.28
178 | 0 1 0.00 0.28
182 | 0 1 0.00 0.28
186 | 0 1 0.00 0.28
|
F 146 | 2 3 0.57 0.85
150 |** 8 11 2.27 3.12
154 |*** 16 27 4.53 7.65
158 |****** 30 57 8.50 16.15
162 |****** 32 89 9.07 25.21
166 |**** 22 111 6.23 31.44
170 |* 4 115 1.13 32.58
174 | 0 115 0.00 32.58
178 | 0 115 0.00 32.58
182 | 0 115 0.00 32.58
186 | 0 115 0.00 32.58
|
M 146 | 0 115 0.00 32.58
150 | 0 115 0.00 32.58
154 | 0 115 0.00 32.58
158 | 2 117 0.57 33.14
162 |*** 13 130 3.68 36.83
166 |***** 25 155 7.08 43.91
170 |************** 69 224 19.55 63.46
174 |************** 70 294 19.83 83.29
178 |******* 36 330 10.20 93.48
182 |**** 19 349 5.38 98.87
186 |* 4 353 1.13 100.00
|
----+---+---+--
20 40 60
Frequency
SAS システム 136
19:32 Tuesday, November 14, 2006
SEX KODUKAI Cum. Cum.
Midpoint Freq Freq Percent Percent
|
0 | 0 0 0.00 0.00
30000 | 1 1 0.29 0.29
60000 | 0 1 0.00 0.29
90000 | 1 2 0.29 0.59
120000 | 0 2 0.00 0.59
150000 | 0 2 0.00 0.59
180000 | 0 2 0.00 0.59
210000 | 0 2 0.00 0.59
240000 | 0 2 0.00 0.59
270000 | 0 2 0.00 0.59
300000 | 0 2 0.00 0.59
|
F 0 |*** 17 19 4.99 5.57
30000 |********* 43 62 12.61 18.18
60000 |****** 32 94 9.38 27.57
90000 |** 10 104 2.93 30.50
120000 |* 4 108 1.17 31.67
150000 | 2 110 0.59 32.26
180000 | 1 111 0.29 32.55
210000 | 1 112 0.29 32.84
240000 | 0 112 0.00 32.84
270000 | 0 112 0.00 32.84
300000 | 1 113 0.29 33.14
|
M 0 |************ 59 172 17.30 50.44
30000 |**************** 82 254 24.05 74.49
60000 |******* 33 287 9.68 84.16
90000 |***** 26 313 7.62 91.79
120000 |** 10 323 2.93 94.72
150000 |*** 15 338 4.40 99.12
180000 | 1 339 0.29 99.41
210000 | 1 340 0.29 99.71
240000 | 0 340 0.00 99.71
270000 | 0 340 0.00 99.71
300000 | 1 341 0.29 100.00
|
----+---+---+---+
20 40 60 80
Frequency
/* Lesson 7-02 */
/* File Name = les0702.sas 11/15/06 */
/* Same import as Lesson 7-01, but outlying or unusable observations are dropped. */
data gakusei;
  infile 'all06be.prn'
    firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;
  if kodukai>=200000 then delete;        /* drop observations with kodukai of 200,000 yen or more */
  if sex^='M' & sex^='F' then delete;    /* drop observations whose sex is neither M nor F */
(以下略)
SAS システム 2
19:32 Tuesday, November 14, 2006
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 348 167.9212644 8.1408570 145.0000000 186.0000000
TAIJYUU 315 58.7961905 9.2728999 35.0000000 100.0000000
KYOUI 109 86.6330275 7.5983192 56.0000000 112.0000000
KODUKAI 335 45528.36 41941.97 0 180000.00
TSUUWA 140 6501.44 4359.22 0 30000.00
---------------------------------------------------------------------
SAS システム 21
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Moments
N 335 Sum Wgts 335
Mean 45528.36 Sum 15252000
Std Dev 41941.97 Variance 1.7591E9
Skewness 1.163738 Kurtosis 0.714217
USS 1.282E12 CSS 5.875E11
CV 92.12273 Std Mean 2291.535
T:Mean=0 19.86807 Pr>|T| 0.0001
Num ^= 0 282 Num > 0 282
M(Sign) 141 Pr>=|M| 0.0001
Sgn Rank 19951.5 Pr>=|S| 0.0001
SAS システム 22
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Quantiles(Def=5)
100% Max 180000 99% 160000
75% Q3 60000 95% 150000
50% Med 30000 90% 100000
25% Q1 20000 10% 0
0% Min 0 5% 0
1% 0
Range 180000
Q3-Q1 40000
Mode 0
SAS システム 25
19:32 Tuesday, November 14, 2006
Univariate Procedure
Variable=KODUKAI
Histogram # Boxplot
190000+* 1 0
.** 6 0
.**** 12 0
130000+***** 13 0
.******** 22 |
.**** 11 |
70000+************ 35 +-----+
.***************** 50 | + |
.*********************************** 105 *-----*
10000+*************************** 80 |
----+----+----+----+----+----+----+
* may represent up to 3 counts
SAS システム 32
19:32 Tuesday, November 14, 2006
--------------------------------- SEX=F --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 112 159.0375000 5.3858360 145.0000000 171.0000000
TAIJYUU 79 48.7468354 4.8339302 35.0000000 60.0000000
KYOUI 40 82.9750000 4.0350230 70.0000000 90.0000000
KODUKAI 109 45394.50 35411.80 0 180000.00
TSUUWA 55 6820.07 4413.43 200.0000000 25000.00
---------------------------------------------------------------------
SAS システム 33
19:32 Tuesday, November 14, 2006
--------------------------------- SEX=M --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 236 172.1372881 5.3566709 156.0000000 186.0000000
TAIJYUU 236 62.1601695 7.8627497 46.0000000 100.0000000
KYOUI 69 88.7536232 8.3620392 56.0000000 112.0000000
KODUKAI 226 45592.92 44826.31 0 165000.00
TSUUWA 85 6295.27 4337.49 0 30000.00
---------------------------------------------------------------------
SAS システム 90
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=SHINTYOU
200 +
|
| 0
180 + |
| | *--+--*
| | +-----+
160 + *--+--* 0
| +-----+ 0
| 0
140 +
------------+-----------+-----------
SEX F M
SAS システム 93
19:32 Tuesday, November 14, 2006
Univariate Procedure
Schematic Plots
Variable=KODUKAI
|
200000 +
| 0 0
| 0 |
100000 + | |
| +-----+ +-----+
| *--+--* *--+--*
0 + | +-----+
------------+-----------+-----------
SEX F M
SAS システム 105
19:32 Tuesday, November 14, 2006
SEX SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 146 | 2 2 0.57 0.57
150 |** 8 10 2.30 2.87
154 |*** 16 26 4.60 7.47
158 |****** 30 56 8.62 16.09
162 |****** 31 87 8.91 25.00
166 |**** 21 108 6.03 31.03
170 |* 4 112 1.15 32.18
174 | 0 112 0.00 32.18
178 | 0 112 0.00 32.18
182 | 0 112 0.00 32.18
186 | 0 112 0.00 32.18
|
M 146 | 0 112 0.00 32.18
150 | 0 112 0.00 32.18
154 | 0 112 0.00 32.18
158 | 2 114 0.57 32.76
162 |*** 13 127 3.74 36.49
166 |***** 25 152 7.18 43.68
170 |************** 69 221 19.83 63.51
174 |************** 69 290 19.83 83.33
178 |******* 35 325 10.06 93.39
182 |**** 19 344 5.46 98.85
186 |* 4 348 1.15 100.00
|
----+---+---+--
20 40 60
Frequency
SAS システム 111
19:32 Tuesday, November 14, 2006
SEX KODUKAI Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 0 |***** 13 13 3.88 3.88
20000 |********* 22 35 6.57 10.45
40000 |*********** 27 62 8.06 18.51
60000 |********** 25 87 7.46 25.97
80000 |**** 10 97 2.99 28.96
100000 |** 5 102 1.49 30.45
120000 |* 3 105 0.90 31.34
140000 | 1 106 0.30 31.64
160000 |* 2 108 0.60 32.24
180000 | 1 109 0.30 32.54
|
M 0 |******************** 49 158 14.63 47.16
20000 |***************** 43 201 12.84 60.00
40000 |******************** 51 252 15.22 75.22
60000 |*********** 28 280 8.36 83.58
80000 |***** 12 292 3.58 87.16
100000 |******* 17 309 5.07 92.24
120000 |*** 8 317 2.39 94.63
140000 |* 3 320 0.90 95.52
160000 |****** 15 335 4.48 100.00
180000 | 0 335 0.00 100.00
|
----+---+---+---+---+
10 20 30 40 50
Frequency
data seito06;
  infile 'seito.prn';
  /* List input with a plain `$` keeps at most 8 bytes per value, which would
     cut off five-character names such as "早稲田大学" so that the comparisons
     below could never match.  The `:` modifier with an explicit width keeps
     list-style (delimiter-separated) reading while allowing longer values;
     tireki is widened as well so its labels fit regardless of the session
     encoding. */
  input id $ sex $ kesseki $ univ :$20.
        koku $ suu1 $ suu2 $ tireki :$12. koumin $ rika $;
  if sex^='M' then delete; /* male only */
  if kesseki^='0' then delete; /* attendees only (kesseki = '0') */
  /* Declare the length of area explicitly: otherwise the first assignment
     ("不明") fixes its length and the longer region names are truncated. */
  length area $ 12;
  area="不明";
  if univ="早稲田大学" then area="東日本";
  if univ="慶応大学" then area="東日本";
  if univ="関西大学" then area="西日本";
  if univ="同志社大学" then area="西日本";
  /* Collapse the sub-coded subject labels into the plain subject name. */
  if tireki="世界史-0" then tireki="世界史";
  if tireki="世界史-2" then tireki="世界史";
  if tireki="日本史-2" then tireki="日本史";
  if tireki="日本史-3" then tireki="日本史";
...
[例4] 複数の処理をさせたい場合 : do 〜 end で囲む
/* Run several statements under one condition by wrapping them in do ... end. */
if tireki="世界史-0" then do;
  tireki="世界史";
  /* koumin is read as a character variable (`koumin $` in the seito06 INPUT
     statement), so its missing value is a blank string; assigning numeric `.`
     would trigger an implicit numeric-to-character conversion instead. */
  koumin=" ";
end;
...
[比較演算子]
data math; infile 'foo.dat' lrecl=230;
/* Build data set math from foo.dat using column input (fixed column positions). */
/* lrecl=230 sets the logical record length; truncover keeps INPUT on the      */
/* current record when it is shorter than the requested columns, instead of    */
/* continuing onto the next record.                                            */
data math; infile 'foo.dat' lrecl=230 truncover;
input
kamoku $ 2 /* character, column 2 */
kesseki $ 3 /* character, column 3 */
k_code $ 10-11 /* character, columns 10-11 */
t_score 12-14 /* numeric, columns 12-14 */
s_scor01 103-104 /* numeric, columns 103-104 */
s_scor02 105-106 /* numeric, columns 105-106 */
s_scor03 107-108 /* numeric, columns 107-108 */
s_scor04 109-110 /* numeric, columns 109-110 */
;
data math; infile 'foo.dat' firstobs=4;