前回までに分布特性を把握するためのいくつかの指標を説明し、
その使い方を示すとともに注意点を述べた。またグループ分けが有用なことも説明した。
解析の過程では、特徴の異なるサンプルや外れ値を除外することもあるので、
その方法について紹介する。
また、単純集計としてよく利用される頻度集計やクロス集計の方法についても
紹介する。
/* Lesson 11-1 */
/* File Name = les1101.sas 12/10/08 */
/* Read the student survey file and keep only the usable observations. */
data gakusei;
  /* firstobs=2: start reading at the second record of the file. */
  infile 'all08ce.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;
  /* Exclude observations whose sex is neither 'M' nor 'F'. */
  if sex^='M' & sex^='F' then delete;
  /* Exclude observations with kodukai of 200,000 (yen) or more. */
  if kodukai>=200000 then delete;
run;
/* List the first five observations as a quick check of the input. */
proc print data=gakusei (obs=5);
run;

/* Basic summary statistics; no VAR statement, so every numeric variable is used. */
proc means data=gakusei;
run;

/* Detailed distribution statistics with plots for the four listed variables. */
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
run;

/* Horizontal bar charts (with frequency columns) for the same variables. */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
run;
/* BY-group processing below requires the data to be sorted by sex. */
proc sort data=gakusei;
  by sex;
run;

/* Summary statistics, one table per sex. */
proc means data=gakusei;
  by sex;
run;

/* Distribution statistics and plots per sex. */
proc univariate data=gakusei plot;
  var shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* Separate horizontal bar charts for each sex (BY groups). */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai;
  by sex;
run;

/* Both sexes in a single chart, placed side by side via GROUP=. */
proc chart data=gakusei;
  hbar shintyou taijyuu kyoui kodukai / group=sex;
run;
SAS システム 2
11:52 Thursday, December 4, 2008
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 382 167.8473822 8.2930247 145.0000000 188.0000000
TAIJYUU 343 58.9533528 9.3369344 35.0000000 100.0000000
KYOUI 115 86.6347826 7.5618275 56.0000000 112.0000000
KODUKAI 367 44820.16 41260.85 0 180000.00
TSUUWA 173 6612.32 4375.11 0 30000.00
---------------------------------------------------------------------
SAS システム 21
11:52 Thursday, December 4, 2008
Univariate Procedure
Variable=KODUKAI
Moments
N 367 Sum Wgts 367
Mean 44820.16 Sum 16449000
Std Dev 41260.85 Variance 1.7025E9
Skewness 1.1826 Kurtosis 0.795026
USS 1.36E12 CSS 6.231E11
CV 92.05869 Std Mean 2153.799
T:Mean=0 20.80982 Pr>|T| 0.0001
Num ^= 0 310 Num > 0 310
M(Sign) 155 Pr>=|M| 0.0001
Sgn Rank 24102.5 Pr>=|S| 0.0001
SAS システム 22
11:52 Thursday, December 4, 2008
Univariate Procedure
Variable=KODUKAI
Quantiles(Def=5)
100% Max 180000 99% 160000
75% Q3 60000 95% 140000
50% Med 30000 90% 100000
25% Q1 20000 10% 0
0% Min 0 5% 0
1% 0
Range 180000
Q3-Q1 40000
Mode 0
SAS システム 25
11:52 Thursday, December 4, 2008
Univariate Procedure
Variable=KODUKAI
Histogram # Boxplot
190000+* 1 0
.** 6 0
.**** 12 0
130000+***** 14 0
.********* 25 |
.**** 11 |
70000+************* 38 +-----+
.******************* 56 | + |
.*************************************** 116 *-----*
10000+****************************** 88 |
----+----+----+----+----+----+----+----
* may represent up to 3 counts
SAS システム 34
11:52 Thursday, December 4, 2008
--------------------------------- SEX=F --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 125 158.9016000 5.2486555 145.0000000 171.0000000
TAIJYUU 86 48.8720930 4.7986187 35.0000000 60.0000000
KYOUI 43 83.0000000 3.9400266 70.0000000 90.0000000
KODUKAI 121 44752.07 35327.05 0 180000.00
TSUUWA 68 6576.24 4187.38 80.0000000 25000.00
---------------------------------------------------------------------
SAS システム 35
11:52 Thursday, December 4, 2008
--------------------------------- SEX=M --------------------------------
Variable N Mean Std Dev Minimum Maximum
---------------------------------------------------------------------
SHINTYOU 257 172.1984436 5.5615025 156.0000000 188.0000000
TAIJYUU 257 62.3268482 7.9531354 46.0000000 100.0000000
KYOUI 72 88.8055556 8.3575704 56.0000000 112.0000000
KODUKAI 246 44853.66 43954.46 0 165000.00
TSUUWA 105 6635.70 4512.19 0 30000.00
---------------------------------------------------------------------
SAS システム 92
11:52 Thursday, December 4, 2008
Univariate Procedure
Schematic Plots
Variable=SHINTYOU
200 +
|
| 0
180 + |
| | *--+--*
| | +-----+
160 + *--+--* 0
| +-----+ 0
| 0
140 +
------------+-----------+-----------
SEX F M
SAS システム 93
11:52 Thursday, December 4, 2008
Univariate Procedure
Schematic Plots
Variable=TAIJYUU
|
100 + *
| 0
| | *--+--*
50 + *--+--* +-----+
| 0
|
0 +
------------+-----------+-----------
SEX F M
SAS システム 107
11:52 Thursday, December 4, 2008
SEX SHINTYOU Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 144 | 1 1 0.26 0.26
148 |* 5 6 1.31 1.57
152 |*** 16 22 4.19 5.76
156 |***** 27 49 7.07 12.83
160 |******* 36 85 9.42 22.25
164 |***** 26 111 6.81 29.06
168 |** 12 123 3.14 32.20
172 | 2 125 0.52 32.72
176 | 0 125 0.00 32.72
180 | 0 125 0.00 32.72
184 | 0 125 0.00 32.72
188 | 0 125 0.00 32.72
|
M 144 | 0 125 0.00 32.72
148 | 0 125 0.00 32.72
152 | 0 125 0.00 32.72
156 | 2 127 0.52 33.25
160 |* 7 134 1.83 35.08
164 |**** 18 152 4.71 39.79
168 |********* 45 197 11.78 51.57
172 |****************** 91 288 23.82 75.39
176 |********** 49 337 12.83 88.22
180 |******* 33 370 8.64 96.86
184 |** 9 379 2.36 99.21
188 |* 3 382 0.79 100.00
|
----+---+---+---+--
20 40 60 80
Frequency
SAS システム 113
11:52 Thursday, December 4, 2008
SEX KODUKAI Cum. Cum.
Midpoint Freq Freq Percent Percent
|
F 0 |***** 13 13 3.54 3.54
15000 |********* 23 36 6.27 9.81
30000 |********** 24 60 6.54 16.35
45000 |********* 23 83 6.27 22.62
60000 |****** 15 98 4.09 26.70
75000 |**** 10 108 2.72 29.43
90000 | 0 108 0.00 29.43
105000 |** 5 113 1.36 30.79
120000 |* 3 116 0.82 31.61
135000 |* 2 118 0.54 32.15
150000 |* 2 120 0.54 32.70
165000 | 0 120 0.00 32.70
180000 | 1 121 0.27 32.97
|
M 0 |********************* 52 173 14.17 47.14
15000 |**************** 41 214 11.17 58.31
30000 |******************** 51 265 13.90 72.21
45000 |************* 32 297 8.72 80.93
60000 |***** 12 309 3.27 84.20
75000 |***** 12 321 3.27 87.47
90000 | 1 322 0.27 87.74
105000 |******** 19 341 5.18 92.92
120000 |*** 8 349 2.18 95.10
135000 |* 3 352 0.82 95.91
150000 |**** 9 361 2.45 98.37
165000 |** 6 367 1.63 100.00
180000 | 0 367 0.00 100.00
|
----+---+---+---+---+-
10 20 30 40 50
Frequency
data seito08;
  infile 'seito.prn';
  /* univ and tireki get explicit widths (": $w." = list input with a length).
     With a plain "$" they default to 8 bytes, which is too short for
     multi-byte values such as "早稲田大学"; the value would be truncated on
     input and the equality tests below would never match. */
  input id $ sex $ kesseki $ univ : $20.
        koku $ suu1 $ suu2 $ tireki : $12. koumin $ rika $;
  if sex^='M' then delete;      /* male only */
  if kesseki^='0' then delete;  /* keep only records with kesseki '0' (attendees) */
  /* Without LENGTH, area would take its length from the first assignment
     ("不明") and the longer region names assigned later would be truncated. */
  length area $ 12;
  area="不明";
  if univ="早稲田大学" then area="東日本";
  if univ="慶応大学" then area="東日本";
  if univ="関西大学" then area="西日本";
  if univ="同志社大学" then area="西日本";
  /* Collapse the suffixed subject codes into the plain subject name. */
  if tireki="世界史-0" then tireki="世界史";
  if tireki="世界史-2" then tireki="世界史";
  if tireki="日本史-2" then tireki="日本史";
  if tireki="日本史-3" then tireki="日本史";
...
[例4] 複数の処理をさせたい場合 : do 〜 end で囲む
if tireki="世界史-0" then do;
  tireki="世界史";
  /* koumin is read as a character variable (input ... koumin $), so its
     missing value is a blank; assigning numeric "." would rely on an
     implicit numeric-to-character conversion. */
  koumin=" ";
end;
...
[比較演算子]
/* Lesson 11-2 */
/* File Name = les1102.sas 12/10/08 */
/* Read the survey file without filtering; records with missing values are kept. */
data gakusei;
  infile 'all08ce.prn' firstobs=2;
  input sex $ shintyou taijyuu kyoui jitaku $ kodukai carryer $ tsuuwa;

/* List the first five observations. */
proc print data=gakusei (obs=5);
run;
/* One-way frequency tables, one per variable. */
proc freq data=gakusei;
  tables sex jitaku carryer;
run;

/* Two-way cross-tabulations for each pair of variables. */
proc freq data=gakusei;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
SAS システム 1
11:52 Thursday, December 4, 2008
OBS SEX SHINTYOU TAIJYUU KYOUI JITAKU KODUKAI CARRYER TSUUWA
1 F 145.0 38 . J 10000 .
2 F 146.7 41 85 J 10000 Vodafone 6000
3 F 148.0 42 . J 50000 .
4 F 148.0 43 80 J 50000 DoCoMo 4000
5 F 148.9 . . J 60000 .
SAS システム 2
11:52 Thursday, December 4, 2008
Cumulative Cumulative
SEX Frequency Percent Frequency Percent
-------------------------------------------------
F 134 33.6 134 33.6
M 265 66.4 399 100.0
Frequency Missing = 5
Cumulative Cumulative
JITAKU Frequency Percent Frequency Percent
----------------------------------------------------
G 130 37.5 130 37.5
J 217 62.5 347 100.0
Frequency Missing = 57
SAS システム 4
11:52 Thursday, December 4, 2008
Cumulative Cumulative
CARRYER Frequency Percent Frequency Percent
------------------------------------------------------
DDIp 2 1.1 2 1.1
DoCoMo 71 40.6 73 41.7
J-PHONE 10 5.7 83 47.4
KDDI 1 0.6 84 48.0
No 5 2.9 89 50.9
Vodafone 20 11.4 109 62.3
Willcom 1 0.6 110 62.9
au 46 26.3 156 89.1
au+willc 1 0.6 157 89.7
au/willc 1 0.6 158 90.3
docomo 8 4.6 166 94.9
docomo+w 1 0.6 167 95.4
docomo/a 1 0.6 168 96.0
docomo/w 1 0.6 169 96.6
softbank 5 2.9 174 99.4
vodafone 1 0.6 175 100.0
Frequency Missing = 229
SAS システム 6
11:52 Thursday, December 4, 2008
TABLE OF SEX BY JITAKU
SEX JITAKU
Frequency|
Percent |
Row Pct |
Col Pct |G |J | Total
---------+--------+--------+
F | 40 | 75 | 115
| 11.59 | 21.74 | 33.33
| 34.78 | 65.22 |
| 31.01 | 34.72 |
---------+--------+--------+
M | 89 | 141 | 230
| 25.80 | 40.87 | 66.67
| 38.70 | 61.30 |
| 68.99 | 65.28 |
---------+--------+--------+
Total 129 216 345
37.39 62.61 100.00
Frequency Missing = 59
SAS システム 9
11:52 Thursday, December 4, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No |Vodafone| Total
---------+--------+--------+--------+--------+--------+--------+
F | 1 | 30 | 4 | 0 | 1 | 9 | 66
| 0.57 | 17.24 | 2.30 | 0.00 | 0.57 | 5.17 | 37.93
| 1.52 | 45.45 | 6.06 | 0.00 | 1.52 | 13.64 |
| 50.00 | 42.25 | 44.44 | 0.00 | 20.00 | 45.00 |
---------+--------+--------+--------+--------+--------+--------+
M | 1 | 41 | 5 | 1 | 4 | 11 | 108
| 0.57 | 23.56 | 2.87 | 0.57 | 2.30 | 6.32 | 62.07
| 0.93 | 37.96 | 4.63 | 0.93 | 3.70 | 10.19 |
| 50.00 | 57.75 | 55.56 | 100.00 | 80.00 | 55.00 |
---------+--------+--------+--------+--------+--------+--------+
Total 2 71 9 1 5 20 174
1.15 40.80 5.17 0.57 2.87 11.49 100.00
(Continued)
SAS システム 11
11:52 Thursday, December 4, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |Willcom |au |au+willc|au/willc|docomo |docomo+w| Total
---------+--------+--------+--------+--------+--------+--------+
F | 1 | 15 | 1 | 0 | 1 | 0 | 66
| 0.57 | 8.62 | 0.57 | 0.00 | 0.57 | 0.00 | 37.93
| 1.52 | 22.73 | 1.52 | 0.00 | 1.52 | 0.00 |
| 100.00 | 32.61 | 100.00 | 0.00 | 12.50 | 0.00 |
---------+--------+--------+--------+--------+--------+--------+
M | 0 | 31 | 0 | 1 | 7 | 1 | 108
| 0.00 | 17.82 | 0.00 | 0.57 | 4.02 | 0.57 | 62.07
| 0.00 | 28.70 | 0.00 | 0.93 | 6.48 | 0.93 |
| 0.00 | 67.39 | 0.00 | 100.00 | 87.50 | 100.00 |
---------+--------+--------+--------+--------+--------+--------+
Total 1 46 1 1 8 1 174
0.57 26.44 0.57 0.57 4.60 0.57 100.00
(Continued)
SAS システム 13
11:52 Thursday, December 4, 2008
TABLE OF SEX BY CARRYER
SEX CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |docomo/a|docomo/w|softbank|vodafone| Total
---------+--------+--------+--------+--------+
F | 0 | 0 | 3 | 0 | 66
| 0.00 | 0.00 | 1.72 | 0.00 | 37.93
| 0.00 | 0.00 | 4.55 | 0.00 |
| 0.00 | 0.00 | 60.00 | 0.00 |
---------+--------+--------+--------+--------+
M | 1 | 1 | 2 | 1 | 108
| 0.57 | 0.57 | 1.15 | 0.57 | 62.07
| 0.93 | 0.93 | 1.85 | 0.93 |
| 100.00 | 100.00 | 40.00 | 100.00 |
---------+--------+--------+--------+--------+
Total 1 1 5 1 174
0.57 0.57 2.87 0.57 100.00
Frequency Missing = 230
SAS システム 16
11:52 Thursday, December 4, 2008
TABLE OF JITAKU BY CARRYER
JITAKU CARRYER
Frequency|
Percent |
Row Pct |
Col Pct |DDIp |DoCoMo |J-PHONE |KDDI |No |Vodafone| Total
---------+--------+--------+--------+--------+--------+--------+
G | 1 | 27 | 4 | 1 | 0 | 4 | 56
| 0.66 | 17.88 | 2.65 | 0.66 | 0.00 | 2.65 | 37.09
| 1.79 | 48.21 | 7.14 | 1.79 | 0.00 | 7.14 |
| 100.00 | 44.26 | 44.44 | 100.00 | 0.00 | 23.53 |
---------+--------+--------+--------+--------+--------+--------+
J | 0 | 34 | 5 | 0 | 4 | 13 | 95
| 0.00 | 22.52 | 3.31 | 0.00 | 2.65 | 8.61 | 62.91
| 0.00 | 35.79 | 5.26 | 0.00 | 4.21 | 13.68 |
| 0.00 | 55.74 | 55.56 | 0.00 | 100.00 | 76.47 |
---------+--------+--------+--------+--------+--------+--------+
Total 1 61 9 1 4 17 151
0.66 40.40 5.96 0.66 2.65 11.26 100.00
(Continued)
≪以下略≫
/* ... earlier statements omitted ... */
/* Unify spelling variants of the carrier names. */
if carryer="au+willc" then carryer="au+Willc";
if carryer="docomo" then carryer="DoCoMo";
if carryer="docomo+w" then carryer="DoCoMo+W";
if carryer="vodafone" then carryer="Vodafone";
/* ... later statements omitted ... */
≪前略≫
/* One-way tables; order=freq lists levels in descending order of frequency. */
proc freq data=gakusei order=freq;
  tables sex jitaku carryer;
run;

/* Two-way tables, also ordered by descending frequency. */
proc freq data=gakusei order=freq;
  tables sex*jitaku;
  tables sex*carryer;
  tables jitaku*carryer;
run;
≪後略≫
/* Lesson 11-5 */
/* File Name = les1105.sas 12/10/08 */
/* Read the survey file (no filtering). */
data gakusei;
  infile 'all08ce.prn'
         firstobs=2;
  input sex $ shintyou taijyuu kyoui
        jitaku $ kodukai carryer $ tsuuwa;

/* Define height classes; the format name clshint stands for "class shintyou". */
proc format;
  value clshint low-<150=' -149'     /* class 1 */
                150-<160='150-159'   /* class 2 */
                160-<170='160-169'   /* class 3 */
                170-<180='170-179'   /* class 4 */
                180-high='180- '     /* class 5 */
                other ='missing';    /* class 6: anything not covered above */
run;

/* List the first five observations. */
proc print data=gakusei(obs=5);
run;
/* One-way frequency table of height, grouped into classes by the clshint format. */
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into classes */
run;

/* Two-way table: sex by height class. */
proc freq data=gakusei;
  tables sex*shintyou;
  format shintyou clshint.;   /* group the continuous variable into classes */
run;

/* The same breakdown done the earlier way: sort by sex, then use BY groups. */
proc sort data=gakusei;
  by sex;
run;
proc freq data=gakusei;
  tables shintyou;
  format shintyou clshint.;   /* group the continuous variable into classes */
  by sex;                     /* one table per sex */
run;
SAS システム 2
11:52 Thursday, December 4, 2008
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
-149 6 1.5 6 1.5
150-159 61 15.7 67 17.2
160-169 131 33.7 198 50.9
170-179 163 41.9 361 92.8
180- 28 7.2 389 100.0
Frequency Missing = 15
SAS システム 3
11:52 Thursday, December 4, 2008
TABLE OF SEX BY SHINTYOU
SEX SHINTYOU
Frequency|
Percent |
Row Pct |
Col Pct | -149 |150-159 |160-169 |170-179 |180- | Total
---------+--------+--------+--------+--------+--------+
F | 6 | 58 | 61 | 2 | 0 | 127
| 1.55 | 14.95 | 15.72 | 0.52 | 0.00 | 32.73
| 4.72 | 45.67 | 48.03 | 1.57 | 0.00 |
| 100.00 | 95.08 | 46.92 | 1.23 | 0.00 |
---------+--------+--------+--------+--------+--------+
M | 0 | 3 | 69 | 161 | 28 | 261
| 0.00 | 0.77 | 17.78 | 41.49 | 7.22 | 67.27
| 0.00 | 1.15 | 26.44 | 61.69 | 10.73 |
| 0.00 | 4.92 | 53.08 | 98.77 | 100.00 |
---------+--------+--------+--------+--------+--------+
Total 6 61 130 163 28 388
1.55 15.72 33.51 42.01 7.22 100.00
Frequency Missing = 16
SAS システム 6
11:52 Thursday, December 4, 2008
------------------------------- SEX=' ' --------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
160-169 1 100.0 1 100.0
Frequency Missing = 4
SAS システム 7
11:52 Thursday, December 4, 2008
-------------------------------- SEX=F ---------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
-149 6 4.7 6 4.7
150-159 58 45.7 64 50.4
160-169 61 48.0 125 98.4
170-179 2 1.6 127 100.0
Frequency Missing = 7
SAS システム 8
11:52 Thursday, December 4, 2008
-------------------------------- SEX=M ---------------------------------
Cumulative Cumulative
SHINTYOU Frequency Percent Frequency Percent
------------------------------------------------------
150-159 3 1.1 3 1.1
160-169 69 26.4 72 27.6
170-179 161 61.7 233 89.3
180- 28 10.7 261 100.0
Frequency Missing = 4
/* Read a comma-separated file whose first line is skipped (firstobs=2). */
data mon2008;
  /* All options must sit inside ONE infile statement: a semicolon after
     truncover would end the statement and leave dsd unapplied.
     dsd       : consecutive delimiters denote a missing value; quotes are handled.
     truncover : a record that ends early leaves the remaining variables
                 missing instead of reading on into the next line.
     missover serves the same purpose for short records, so only truncover is kept. */
  infile 'd:\home\mon05d.csv' dlm=',' dsd
         firstobs=2
         truncover
  ;
  input No $ Univ : $30. SName : $40. Faculty : $50. Dept : $50.
        Center1 : $8. Center2 : $8. Sel1 : $8. Sel2 : $8.
        Book1 : $10. Book2 : $10.
        Vol0 VolS VolT
        ZenKou $ ScoreS ScoreT KoKouSi
  ;
/* Tab-delimited variant: '09'x is the hexadecimal literal for the tab character. */
data mon2008;
  infile 'd:\home\mon05e.txt' firstobs=2 dlm='09'x truncover;
/* lrecl=230 sets the logical record length used when reading foo.dat. */
data math;
  infile 'foo.dat' lrecl=230;
/* Column input: each variable is read from the fixed column(s) given after its name. */
data math;
  infile 'foo.dat' lrecl=230 truncover;
  input kamoku   $ 2          /* character, column 2 */
        kesseki  $ 3          /* character, column 3 */
        k_code   $ 10-11      /* character, columns 10-11 */
        t_score  12-14        /* numeric, columns 12-14 */
        s_scor01 103-104
        s_scor02 105-106
        s_scor03 107-108
        s_scor04 109-110
  ;
data math; infile 'foo.dat' firstobs=4;