level | Enumeration | Description |
---|---|---|
0 | IMSLS_NONE | Printing of data warnings and final results is suppressed. |
1 | IMSLS_FINAL | Prints final summary of Naive Bayes classifier training. |
2 | IMSLS_DATA_WARNINGS | Prints information about missing values and PDF calculations equal to zero. |
3 | IMSLS_TRACE_ALL | Prints final summary plus all data warnings associated with missing values and PDF calculations equal to zero. |
PRO naive_bayes_classification_ex1
   ; Example 1: train a Naive Bayes classifier on the iris data set
   ; (STATDATA(3)) using its four continuous attributes, print the
   ; error counts returned by the trainer, then classify the same
   ; 150 patterns and list every pattern whose predicted class
   ; differs from its recorded class, together with the predicted
   ; class probabilities.
   n_patterns   = 150   ; 150 training patterns
   n_continuous = 4     ; four continuous input attributes
   n_classes    = 3     ; three classification categories
   classification = LONARR(n_patterns)
   continuous     = FLTARR(n_patterns, n_continuous)

   ; irisData: the raw data matrix, 150 rows by 5 columns.
   ; Column 0 is the classification category (1-3) and columns 1-4
   ; are the continuous input attributes. This data contains no
   ; categorical input attributes.
   irisData = STATDATA(3)

   ; Data corrections described in the KDD data mining archive
   irisData(34,4) = 0.1
   irisData(37,2) = 3.1
   irisData(37,3) = 1.5

   ; Set up the required input arrays from the data matrix.
   ; Categories are shifted from 1-3 down to 0-2 so they can also be
   ; used as subscripts into classLabel below.
   classification(*) = irisData(*,0) - 1
   continuous(*,0:3) = irisData(*,1:4)

   ; Train the classifier. The trained model comes back through the
   ; NB_classifier keyword; the function result is printed below as
   ; classErrors(k,0)/classErrors(k,1) for k = 0..3, where the last
   ; row is the column headed TOTAL.
   classErrors = NAIVE_BAYES_TRAINER(n_classes, classification, $
                    Continuous=continuous, $
                    NB_classifier=nb_classifier)

   PRINT," Iris Classification Error Rates"
   PRINT,"----------------------------------------------"
   PRINT,"Setosa Versicolour Virginica | TOTAL"
   PRINT,STRTRIM(classErrors(0,0),2),"/", $
         STRTRIM(classErrors(0,1),2)," ", $
         STRTRIM(classErrors(1,0),2),"/", $
         STRTRIM(classErrors(1,1),2)," ", $
         STRTRIM(classErrors(2,0),2),"/", $
         STRTRIM(classErrors(2,1),2)," ", $
         STRTRIM(classErrors(3,0),2),"/", $
         STRTRIM(classErrors(3,1),2)
   PRINT,"----------------------------------------------"

   ; Classify the training patterns with the trained classifier.
   ; Pred_class_prob is read below as pred_class_prob(pattern,class).
   predictedClass = NAIVE_BAYES_CLASSIFICATION( $
                       nb_classifier, n_patterns, $
                       Continuous=continuous, $
                       Pred_class_prob=pred_class_prob)

   classLabel = ["Setosa ","Versicolour","Virginica "]
   PRINT," PROBABILITIES FOR INCORRECT CLASSIFICATIONS"
   PRINT,"TRAINING PATTERNS| PREDICTED |"
   ; Both pieces of the header use matching single quotes, and the
   ; second piece starts with a space so the concatenation reads
   ; "CLASS P(0)" rather than "CLASSP(0)".
   PRINT,'X1 X2 X3 X4 | CLASS | CLASS' + $
         ' P(0) P(1) P(2)'
   PRINT,"---------------------------------------------------"

   ; One output row per misclassified pattern: the four attribute
   ; values, the recorded class, the predicted class and the three
   ; predicted class probabilities.
   FOR i=0L, n_patterns-1 DO BEGIN
      IF (classification(i) NE predictedClass(i)) THEN BEGIN
         PRINT,STRING(continuous(i,0),Format="(f3.1)")," ", $
               STRING(continuous(i,1),Format="(f3.1)")," ", $
               STRING(continuous(i,2),Format="(f3.1)")," ", $
               STRING(continuous(i,3),Format="(f3.1)")," | ", $
               classLabel(classification(i))," | ", $
               classLabel(predictedClass(i))," ", $
               STRING(pred_class_prob(i,0),Format="(f4.2)")," ", $
               STRING(pred_class_prob(i,1),Format="(f4.2)")," ", $
               STRING(pred_class_prob(i,2),Format="(f4.2)")
      ENDIF
   ENDFOR
END
Iris Classification Error Rates
----------------------------------------------
Setosa Versicolour Virginica | TOTAL
0/50 3/50 3/50 6/150
----------------------------------------------
PROBABILITIES FOR INCORRECT CLASSIFICATIONS
TRAINING PATTERNS| PREDICTED |
X1 X2 X3 X4 | CLASS | CLASS P(0) P(1) P(2)
---------------------------------------------------
6.9 3.1 4.9 1.5 | Versicolour | Virginica 0.00 0.46 0.54
5.9 3.2 4.8 1.8 | Versicolour | Virginica 0.00 0.16 0.84
6.7 3.0 5.0 1.7 | Versicolour | Virginica 0.00 0.08 0.92
4.9 2.5 4.5 1.7 | Virginica | Versicolour 0.00 0.97 0.03
6.0 2.2 5.0 1.5 | Virginica | Versicolour 0.00 0.96 0.04
6.3 2.8 5.1 1.5 | Virginica | Versicolour 0.00 0.71 0.29
PRO naive_bayes_classification_ex2
   ; Example 2: train a Naive Bayes classifier on a random sample of
   ; 2000 rows drawn from the 4601-row spam data set (STATDATA(11)),
   ; print the trainer's error counts for that sample, then classify
   ; the entire data set and print the error counts for it as well.
   n_patterns   = 4601
   n_variables  = 58
   n_sample     = 2000
   n_classes    = 2      ; spam or no spam
   n_continuous = 57

   label1 = $
      " Trainer from Training Dataset of " + $
      STRTRIM(n_sample,2)+" Observations "
   label2 = $
      " Classifier for Entire Dataset of "+ $
      STRTRIM(n_patterns,2)+" Observations "

   spamData = STATDATA(11)
   classification   = LONARR(n_patterns)
   continuous       = FLTARR(n_patterns,n_continuous)
   classSample      = LONARR(n_sample)
   continuousSample = FLTARR(n_sample,n_continuous)

   ; The last column of spamData is the class; rows with class 1
   ; are counted as spam.
   classification(*) = spamData(*,n_variables-1)
   tmp = WHERE(classification EQ 1, n_spam)

   ; Map continuous attributes into transformed representation:
   ; columns 0-53 are divided by 100 and passed through
   ; ASIN(SQRT(.)) (presumably they are percentages - confirm
   ; against the data set description); the remaining attribute
   ; columns are copied unchanged.
   continuous(*,0:53) = ASIN(SQRT(spamData(*,0:53)/100.0))
   continuous(*,54:n_variables-2) = spamData(*,54:n_variables-2)

   PRINT,"Number of Patterns = ", STRTRIM(n_patterns,2)
   PRINT,"Number Classified as Spam = ", STRTRIM(n_spam,2)

   ; Select random sample for training Naive Bayes Classifier.
   ; The returned indices are shifted down by one before being used
   ; as subscripts (presumably RANDOM returns 1-based indices -
   ; confirm against the RANDOM documentation).
   RANDOMOPT,set=1234567L
   rndSampleIndex = RANDOM(n_sample, $
                       /Sample_indices,Parameters=n_patterns)
   sampleRows = rndSampleIndex-1
   classSample(*)        = classification(sampleRows)
   continuousSample(*,*) = continuous(sampleRows,*)

   ; Train Naive Bayes Classifier
   classErrors = NAIVE_BAYES_TRAINER(n_classes, $
                    classSample, $
                    Continuous=continuousSample, $
                    Nb_classifier=nb_classifier)

   ; Print error rates for training sample
   print_Error_Rates, classErrors, label1

   ; Call NAIVE_BAYES_CLASSIFICATION to classify the entire dataset
   predictedClass = NAIVE_BAYES_CLASSIFICATION(nb_classifier, $
                       n_patterns, $
                       Continuous=continuous)

   ; Calculate classification error rates for entire dataset.
   ; Layout of classification_errors (LONARR starts zero-filled):
   ;   (2*k)   = number of class-k patterns predicted incorrectly
   ;   (2*k+1) = number of class-k patterns
   ;   (4),(5) = the same two counts summed over both classes
   classification_errors = LONARR(6)
   FOR k=0, n_classes-1 DO BEGIN
      members = WHERE(classification EQ k, n_members)
      classification_errors(2*k+1) = n_members
      ; When no pattern has class k, WHERE returns -1, which must
      ; not be used as a subscript; the error count then stays 0.
      IF (n_members GT 0) THEN BEGIN
         wrong = WHERE(classification(members) NE $
                       predictedClass(members), n_wrong)
         classification_errors(2*k) = n_wrong
      ENDIF
   ENDFOR
   classification_errors(5) = $
      classification_errors(1)+classification_errors(3)
   classification_errors(4) = $
      classification_errors(0)+classification_errors(2)

   ; Print error rates for entire dataset
   print_Error_Rates,classification_errors, label2
END
PRO print_error_rates, classErrors, label
   ; Print a three-column error-rate table (Not Spam, Spam, TOTAL)
   ; under the heading given in label. Each column is shown as
   ; errors/total=percent%.
   ;
   ; classErrors is accepted in two layouts:
   ;   2-D: classErrors(k,0) = errors, classErrors(k,1) = total,
   ;        for k = 0, 1, 2
   ;   1-D: six values stored as errors, total, errors, total,
   ;        errors, total
   ; label is printed on its own line above the table.

   ; Normalise both layouts to an errors vector and a totals vector
   ; so the table is formatted by a single code path.
   IF (SIZE(classErrors,/Ndim) EQ 2) THEN BEGIN
      nWrong = classErrors(0:2,0)
      nTotal = classErrors(0:2,1)
   ENDIF ELSE BEGIN
      nWrong = classErrors([0,2,4])
      nTotal = classErrors([1,3,5])
   ENDELSE

   ; "> 1" is the maximum operator: a total of zero is replaced by 1
   ; in the denominator only, so an empty class prints 0.0% instead
   ; of triggering a divide-by-zero. Non-zero totals are unaffected.
   pct = 100.0*nWrong/(nTotal > 1)

   PRINT, label
   PRINT,"----------------------------------------------------"
   PRINT," Not Spam Spam | TOTAL"
   PRINT," ",STRTRIM(nWrong(0),2),"/", $
         STRTRIM(nTotal(0),2),"=", $
         STRING(pct(0),Format="(f4.1)"),"% ", $
         STRTRIM(nWrong(1),2),"/", $
         STRTRIM(nTotal(1),2),"=", $
         STRING(pct(1),Format="(f4.1)"),"% | ", $
         STRTRIM(nWrong(2),2),"/", $
         STRTRIM(nTotal(2),2),"=", $
         STRING(pct(2),Format="(f4.1)"),"%"
   PRINT,"----------------------------------------------------"
END
Number of Patterns = 4601
Number Classified as Spam = 1813
Trainer from Training Dataset of 2000 Observations
----------------------------------------------------
Not Spam Spam | TOTAL
30/1202= 2.5% 218/798=27.3% | 248/2000=12.4%
----------------------------------------------------
Classifier for Entire Dataset of 4601 Observations
----------------------------------------------------
Not Spam Spam | TOTAL
79/2788= 2.8% 549/1813=30.3% | 628/4601=13.6%
----------------------------------------------------