%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from xgboost import XGBClassifier
data1.head(2)


data=data1.loc[:, 'D':'Pf']
data.head(1)


print(data.shape)

(340, 11)


print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   D       340 non-null    int64  
 1   t       340 non-null    float64
 2   Age     340 non-null    float64
 3   YS      340 non-null    int64  
 4   UTS     340 non-null    int64  
 5   Pop     340 non-null    int64  
 6   d0      340 non-null    float64
 7   L0      340 non-null    int64  
 8   d       340 non-null    float64
 9   L       340 non-null    float64
 10  Pf      340 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 29.3 KB
None


# Sample mean and sample standard deviation of the measured pipe / defect
# variables; they parameterise the synthetic inputs drawn below.
muD, stdD = data['D'].mean(), data['D'].std()
mut, stdt = data['t'].mean(), data['t'].std()
muL, stdL = data['L'].mean(), data['L'].std()
mud, stdd = data['d'].mean(), data['d'].std()
muYS, stdYS = data['YS'].mean(), data['YS'].std()
muUTS, stdUTS = data['UTS'].mean(), data['UTS'].std()


# Draw N independent normal samples per variable (draw order: D, t, L, d, YS, UTS).
N = 400
D = np.random.normal(loc=muD, scale=stdD, size=N)
t = np.random.normal(loc=mut, scale=stdt, size=N)
L = np.random.normal(loc=muL, scale=stdL, size=N)
d = np.random.normal(loc=mud, scale=stdd, size=N)
YS = np.random.normal(loc=muYS, scale=stdYS, size=N)
UTS = np.random.normal(loc=muUTS, scale=stdUTS, size=N)


# Negative draws are replaced by their absolute value so every quantity is non-negative.
D = list(np.abs(D))
t = list(np.abs(t))
L = list(np.abs(L))
d = list(np.abs(d))
YS = list(np.abs(YS))
UTS = list(np.abs(UTS))


# Collect the simulated variables into one frame, one column per variable.
sim_data = dict(D=D, t=t, L=L, d=d, YS=YS, UTS=UTS)
df = pd.DataFrame(sim_data)


df.head(5)


df.D.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x118964eb0>


df.t.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x11aa3ae50>


# Monte Carlo estimate of the failure probability of every simulated defect
# using the ASME B31G (modified) burst-pressure model.
#
# Each row of df supplies nominal values that are used as the means of
# independent normal variables (5 % COV for D, t, d, L; 10 % for YS, UTS and
# the pressure term).  N realisations are drawn per row and the failure
# probability is the fraction of realisations whose burst pressure is below
# the pressure term.  Pb31 receives one value per row, in row order.
Pb31 = []
N = 1000  # realisations per defect
for index, row in df.iterrows():
    # Nominal values of this defect become the means of the random variables.
    muD, sigmaD = row['D'], row['D'] * 0.05
    mut, sigmat = row['t'], row['t'] * 0.05
    muY, sigmaY = row['YS'], row['YS'] * 0.1
    muUTS, sigmaUTS = row['UTS'], row['UTS'] * 0.1
    mud, sigmad = row['d'], row['d'] * 0.05
    muL, sigmaL = row['L'], row['L'] * 0.05

    # Pressure term: fixed mean (the 2*t*YS/D alternative is disabled), scaled by 0.8.
    muPo = 10
    sigmaPo = muPo * 0.1
    Po = 0.8 * np.random.normal(muPo, sigmaPo, N)
    D = np.random.normal(muD, sigmaD, N)
    t = np.random.normal(mut, sigmat, N)
    Y = np.random.normal(muY, sigmaY, N)
    # UTS does not enter the expression below; it is still drawn so the
    # sequence of random numbers consumed per row stays the same.
    UTS = np.random.normal(muUTS, sigmaUTS, N)
    d = np.random.normal(mud, sigmad, N)
    L = np.random.normal(muL, sigmaL, N)

    # ASME B31G (modified) =======================================
    # Dimensionless defect length z = L^2 / (D * t).  The parentheses are
    # required: `L**2/D*t` evaluates (L^2 / D) * t.
    z = L**2 / (D * t)
    # One bulging-factor formula is selected per defect from the mean of z.
    # NOTE(review): the criterion is defined per realisation; selecting by the
    # mean is kept from the original design — confirm it is intended.
    if z.mean() > 50:
        M2 = 0.032 * z + 3.3
    else:
        # Coefficient 0.6275 as given for the modified B31G bulging factor.
        M2 = np.sqrt(1 + 0.6275 * z - 0.003375 * z**2)
    # Flow stress uses the *sampled* yield strength Y plus 68.95
    # (presumably MPa, i.e. 10 ksi — confirm the units of YS).
    Pb31gm = (2 * (Y + 68.95) * t / D) * ((1 - 0.85 * (d / t)) / (1 - 0.85 * d / (t * M2)))

    # A realisation fails when the burst pressure is below the pressure term.
    p_b31gm = Pb31gm - Po
    z_b31gm = np.count_nonzero(p_b31gm < 0)
    PF_b31gm = z_b31gm / N
    Pb31.append(PF_b31gm)


prs_data = {'Prs':Pb31}
dl=pd.DataFrame(prs_data)


# df.loc[(dl['Prs'] > 0) & (dl['Prs'] <= 0.5)];


# df.groupby(pd.cut(dl.Prs, [0, 0.3, 0.6, 1]))


# Map each failure probability in dl['Prs'] to a risk class:
#   1 : Prs < 0.2
#   2 : 0.2 <= Prs < 0.8
#   3 : Prs >= 0.8
# The middle test is a plain `< 0.8` so that a value of exactly 0.2 is
# class 2 instead of falling through to the else branch (class 3).
Class = []
for cl in dl['Prs']:
    if cl < 0.2:
        Cls = 1
    elif cl < 0.8:
        Cls = 2
    else:
        Cls = 3
    Class.append(Cls)


df['Cls']=Class


data=df


print(data['Cls'].unique())

[1 3 2]


# Work on a copy and replace the class label by its categorical code
# (codes are assigned by pandas from the column's categories).
data_mod_lc = data.copy()
data_mod_lc['Cls'] = data_mod_lc['Cls'].astype('category').cat.codes
data_mod_lc.head(5)


data['Cls'] = data_mod_lc['Cls']
data.head()


# Show which integer code stands for which original class label.
# The mapping is built from the original labels in Class: the 'Cls' column of
# data_mod_lc already holds the codes, so building it from that column only
# yields the identity mapping {0: 0, 1: 1, 2: 2}.
c = pd.Series(Class).astype('category')
d = dict(enumerate(c.cat.categories))
print (d)

{0: 0, 1: 1, 2: 2}


#checking missing values by column
data.isnull().sum()

D      0
t      0
L      0
d      0
YS     0
UTS    0
Cls    0
dtype: int64


# Pairwise correlation of the input features (class label excluded), shown as
# an annotated heatmap on a fixed [-1, 1] colour scale.
corr = data.drop('Cls', axis=1).corr()
plt.figure(figsize=(12, 10))

# corr is passed directly: the former mask (corr >= 0.0) | (corr <= -0.0)
# is true for every non-NaN entry and therefore filtered nothing.
sns.heatmap(corr,
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 12}, square=True);


features = list(data.columns.values)
print(features)

['D', 't', 'L', 'd', 'YS', 'UTS', 'Cls']


import seaborn as sns

# Columns shown in the scatter matrix; 'Cls' is passed as hue to colour the points.
quantitative_features_list1 = ['Cls', 'D', 't', 'L', 'd', 'YS']
data_mod_num = data[quantitative_features_list1]
data_plot_data = data_mod_num
sns.pairplot(data=data_plot_data, hue='Cls')

<seaborn.axisgrid.PairGrid at 0x11f1cae20>


# Check whether the classes are imbalanced by plotting the row count per class.
import seaborn as sns
# A single theme call is enough: an earlier sns.set(style="white") would be
# overridden by this one.
sns.set(style="whitegrid", color_codes=True)
sns.countplot(x='Cls', data=data, palette='hls')
plt.show()


# One-hot encode the categorical columns of data.
# NOTE(review): 'Cls' was replaced by integer codes earlier and the remaining
# columns were built from numeric samples, so this call presumably returns the
# frame unchanged — confirm whether it is still needed.
data = pd.get_dummies(data = data)
data.head()


# Target: the class label; feature matrix: every other column.
y = data['Cls']
X = data.drop(columns='Cls')
X.head()


y.head()

0    0
1    2
2    2
3    2
4    0
Name: Cls, dtype: int8


from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.preprocessing import MinMaxScaler
# The scaler is fitted on the training rows only and then applied to the test
# rows.  X_train / X_test are rebound to the scaler's outputs from here on.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


import time
start_GNB = time.time()


from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: fit on the training split, report accuracy on both splits.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_train_acc = gnb.score(X_train, y_train)
gnb_test_acc = gnb.score(X_test, y_test)
print('Accuracy of GNB classifier on training set: {:.2f}'.format(gnb_train_acc))
print('Accuracy of GNB classifier on test set: {:.2f}'.format(gnb_test_acc))

Accuracy of GNB classifier on training set: 0.85
Accuracy of GNB classifier on test set: 0.84


# Confusion matrix of the GNB model on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = gnb.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[147   1   4]
 [ 18   3   7]
 [ 13   0  87]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       152
           1       0.75      0.11      0.19        28
           2       0.89      0.87      0.88       100

    accuracy                           0.85       280
   macro avg       0.82      0.65      0.65       280
weighted avg       0.84      0.85      0.82       280


# Confusion matrix of the GNB model on the test split.
from sklearn.metrics import confusion_matrix
y_pred = gnb.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[66  0  3]
 [ 6  0  4]
 [ 6  0 35]]


y_pred.shape

(120,)


y_test.shape

(120,)


# Re-create the 70/30 split (same random_state, hence the same rows) and
# re-apply min-max scaling fitted on the training rows only.  Without the
# scaling step this re-split would hand unscaled features to every model
# trained afterwards, which distorts distance-based models such as k-NN.
from sklearn.preprocessing import MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90        69
           1       0.00      0.00      0.00        10
           2       0.83      0.85      0.84        41

    accuracy                           0.84       120
   macro avg       0.56      0.60      0.58       120
weighted avg       0.77      0.84      0.80       120

/Users/Ram/opt/anaconda3/envs/myenv_conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


end_GNB = time.time()
T_GNB=end_GNB - start_GNB
print(T_GNB, 'sec')

103.62185907363892 sec


#KNN


start_KNN = time.time()


from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 5: fit on the training split, report accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_train_acc = knn.score(X_train, y_train)
knn_test_acc = knn.score(X_test, y_test)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn_train_acc))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn_test_acc))

Accuracy of K-NN classifier on training set: 0.68
Accuracy of K-NN classifier on test set: 0.55


# Confusion matrix of the k-NN model on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = knn.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[128   3  21]
 [ 22   3   3]
 [ 40   2  58]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.67      0.84      0.75       152
           1       0.38      0.11      0.17        28
           2       0.71      0.58      0.64       100

    accuracy                           0.68       280
   macro avg       0.59      0.51      0.52       280
weighted avg       0.66      0.68      0.65       280


# Confusion matrix of the k-NN model on the test split.
from sklearn.metrics import confusion_matrix
y_pred = knn.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[54  1 14]
 [ 5  0  5]
 [29  0 12]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.78      0.69        69
           1       0.00      0.00      0.00        10
           2       0.39      0.29      0.33        41

    accuracy                           0.55       120
   macro avg       0.33      0.36      0.34       120
weighted avg       0.49      0.55      0.51       120


end_KNN = time.time()
T_KNN=end_KNN - start_KNN
print(T_KNN, 'sec')

62.91131067276001 sec


start_DT = time.time()


from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings: fit on the training split, report accuracy on both splits.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

dt_train_acc = clf.score(X_train, y_train)
dt_test_acc = clf.score(X_test, y_test)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dt_train_acc))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(dt_test_acc))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.82


# Confusion matrix of the decision tree on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = clf.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[152   0   0]
 [  0  28   0]
 [  0   0 100]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00       100

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280


# Confusion matrix of the decision tree on the test split.
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[65  3  1]
 [ 2  5  3]
 [ 4  8 29]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93        69
           1       0.31      0.50      0.38        10
           2       0.88      0.71      0.78        41

    accuracy                           0.82       120
   macro avg       0.70      0.72      0.70       120
weighted avg       0.85      0.82      0.83       120


end_DT = time.time()
T_DT=end_DT - start_DT
print(T_DT, 'sec')

28.08662724494934 sec


start_RF = time.time()


from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Random forest with 200 trees and a fixed seed: fit on the training split,
# report accuracy on both splits.
RF = RandomForestClassifier(random_state=0, n_estimators=200)
RF.fit(X_train, y_train)
rf_train_acc = RF.score(X_train, y_train)
rf_test_acc = RF.score(X_test, y_test)
print('Accuracy of RF classifier on training set: {:.2f}'.format(rf_train_acc))
print('Accuracy of RF classifier on test set: {:.2f}'.format(rf_test_acc))

Accuracy of RF classifier on training set: 1.00
Accuracy of RF classifier on test set: 0.84


# Confusion matrix of the random forest on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = RF.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[152   0   0]
 [  0  28   0]
 [  0   0 100]]


# Confusion matrix of the random forest on the test split.
from sklearn.metrics import confusion_matrix
y_pred = RF.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[69  0  0]
 [ 5  1  4]
 [10  0 31]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90        69
           1       1.00      0.10      0.18        10
           2       0.89      0.76      0.82        41

    accuracy                           0.84       120
   macro avg       0.90      0.62      0.63       120
weighted avg       0.86      0.84      0.81       120


end_RF = time.time()
T_RF=end_RF - start_RF
print(T_RF, 'sec')

40.61605095863342 sec


RF.feature_importances_

array([0.1013581 , 0.26029247, 0.12185457, 0.36905566, 0.07728382,
       0.07015538])


import numpy as np

# Horizontal bar chart of the random-forest feature importances, sorted in
# ascending order so the least important feature sits at position 0.
features = list(X.columns.values)
importances = RF.feature_importances_
indices = np.argsort(importances)
bar_positions = range(len(indices))
sorted_labels = [features[i] for i in indices]

plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, sorted_labels)
plt.xlabel('Relative Importance')
plt.show()


start_AB = time.time()


from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# AdaBoost with 100 estimators and a fixed seed: fit on the training split,
# report accuracy on both splits.
ADB = AdaBoostClassifier(random_state=0, n_estimators=100)
ADB.fit(X_train, y_train)
adb_train_acc = ADB.score(X_train, y_train)
adb_test_acc = ADB.score(X_test, y_test)
print('Accuracy of ADB classifier on training set: {:.2f}'.format(adb_train_acc))
print('Accuracy of ADB classifier on test set: {:.2f}'.format(adb_test_acc))

Accuracy of ADB classifier on training set: 0.92
Accuracy of ADB classifier on test set: 0.78


# Confusion matrix of the AdaBoost model on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = ADB.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[138  11   3]
 [  7  21   0]
 [  1   0  99]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       152
           1       0.66      0.75      0.70        28
           2       0.97      0.99      0.98       100

    accuracy                           0.92       280
   macro avg       0.86      0.88      0.87       280
weighted avg       0.93      0.92      0.92       280


# Confusion matrix of the AdaBoost model on the test split.
from sklearn.metrics import confusion_matrix
y_pred = ADB.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[59  8  2]
 [ 3  7  0]
 [ 3 10 28]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.86      0.88        69
           1       0.28      0.70      0.40        10
           2       0.93      0.68      0.79        41

    accuracy                           0.78       120
   macro avg       0.71      0.75      0.69       120
weighted avg       0.86      0.78      0.81       120


end_AB = time.time()
T_AB=end_AB - start_AB
print(T_AB, 'sec')

16.438927173614502 sec


start_XGB = time.time()


from xgboost import XGBClassifier
from sklearn.datasets import make_classification

# XGBoost classifier with default settings: fit on the training split,
# report accuracy on both splits.
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
xgb_train_acc = XGB.score(X_train, y_train)
xgb_test_acc = XGB.score(X_test, y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(xgb_train_acc))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(xgb_test_acc))

Accuracy of XGB classifier on training set: 1.00
Accuracy of XGB classifier on test set: 0.87


# NOTE(review): this cell repeats the XGBClassifier fit/score cell directly
# before it statement for statement; the model is simply fitted a second time.
# It also runs inside the start_XGB / end_XGB timing window.
from xgboost import XGBClassifier
from sklearn.datasets import make_classification

XGB = XGBClassifier()
XGB.fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'
     .format(XGB.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
     .format(XGB.score(X_test, y_test)))

Accuracy of XGB classifier on training set: 1.00
Accuracy of XGB classifier on test set: 0.87


# Confusion matrix of the XGBoost model on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = XGB.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[152   0   0]
 [  0  28   0]
 [  0   0 100]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00       100

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280


# Confusion matrix of the XGBoost model on the test split.
from sklearn.metrics import confusion_matrix
y_pred = XGB.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[66  1  2]
 [ 4  3  3]
 [ 3  3 35]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93        69
           1       0.43      0.30      0.35        10
           2       0.88      0.85      0.86        41

    accuracy                           0.87       120
   macro avg       0.74      0.70      0.72       120
weighted avg       0.85      0.87      0.86       120


end_XGB = time.time()
T_XGB=end_XGB - start_XGB
print(T_XGB, 'sec')

2.236138105392456 sec


start_LGB = time.time()


from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

# LightGBM classifier with default settings: fit on the training split,
# report accuracy on both splits.
LGB = LGBMClassifier()
LGB.fit(X_train, y_train)
lgb_train_acc = LGB.score(X_train, y_train)
lgb_test_acc = LGB.score(X_test, y_test)
print('Accuracy of LGB classifier on training set: {:.2f}'.format(lgb_train_acc))
print('Accuracy of LGB classifier on test set: {:.2f}'.format(lgb_test_acc))

Accuracy of LGB classifier on training set: 1.00
Accuracy of LGB classifier on test set: 0.88


# Confusion matrix of the LightGBM model on the training split.
from sklearn.metrics import confusion_matrix
y_pred_train = LGB.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[152   0   0]
 [  0  28   0]
 [  0   0 100]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00       100

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280


# Confusion matrix of the LightGBM model on the test split.
from sklearn.metrics import confusion_matrix
y_pred = LGB.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[67  2  0]
 [ 5  2  3]
 [ 3  2 36]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93        69
           1       0.33      0.20      0.25        10
           2       0.92      0.88      0.90        41

    accuracy                           0.88       120
   macro avg       0.72      0.68      0.69       120
weighted avg       0.86      0.88      0.86       120


end_LGB = time.time()
T_LGB=end_LGB - start_LGB
print(T_LGB, 'sec')

79.14487195014954 sec


start_Cat = time.time()


# Confusion matrix of the CGB model on the training split.
# NOTE(review): CGB is not created anywhere in the visible source (there is no
# import/fit cell for it); unless it is defined elsewhere this line raises
# NameError — restore the missing fit cell.
from sklearn.metrics import confusion_matrix
y_pred_train = CGB.predict(X_train)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

[[152   0   0]
 [  0  28   0]
 [  0   0 100]]


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00       100

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280


# Confusion matrix of the CGB model on the test split.
# NOTE(review): CGB is not created anywhere in the visible source — confirm it
# is defined before this cell runs.
from sklearn.metrics import confusion_matrix
y_pred = CGB.predict(X_test)
# Kept under its own name so the imported confusion_matrix function is not overwritten.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[67  1  1]
 [ 3  1  6]
 [ 2  1 38]]


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        69
           1       0.33      0.10      0.15        10
           2       0.84      0.93      0.88        41

    accuracy                           0.88       120
   macro avg       0.70      0.67      0.66       120
weighted avg       0.85      0.88      0.86       120


end_Cat = time.time()
T_Cat=end_Cat - start_Cat
print(T_Cat, 'sec')

65.83709383010864 sec

	D	t	Age	YS	UTS	Pop	d0	L0	d	L	Pf
0	459	8.1	23.9	601	684	17	0.26	166	3.36	264.7	3
1	459	8.0	81.2	589	731	16	0.13	117	5.65	633.5	4

	D	t	L	d	YS	UTS
0	424.525982	14.841884	295.813779	2.200810	431.100008	521.121553
1	464.708042	10.172511	670.336380	11.842100	430.750390	586.998472
2	719.832758	9.777176	848.148727	11.396276	440.786767	456.376008
3	231.345539	6.736219	1152.586131	8.140984	401.460233	558.504395
4	313.087142	15.714775	493.149866	3.134762	427.902602	527.192251

	D	t	L	d	YS	UTS	Cls
0	424.525982	14.841884	295.813779	2.200810	431.100008	521.121553	0
1	464.708042	10.172511	670.336380	11.842100	430.750390	586.998472	2
2	719.832758	9.777176	848.148727	11.396276	440.786767	456.376008	2
3	231.345539	6.736219	1152.586131	8.140984	401.460233	558.504395	2
4	313.087142	15.714775	493.149866	3.134762	427.902602	527.192251	0

	D	t	L	d	YS	UTS	Cls
0	424.525982	14.841884	295.813779	2.200810	431.100008	521.121553	0
1	464.708042	10.172511	670.336380	11.842100	430.750390	586.998472	2
2	719.832758	9.777176	848.148727	11.396276	440.786767	456.376008	2
3	231.345539	6.736219	1152.586131	8.140984	401.460233	558.504395	2
4	313.087142	15.714775	493.149866	3.134762	427.902602	527.192251	0

	D	t	L	d	YS	UTS	Cls
0	424.525982	14.841884	295.813779	2.200810	431.100008	521.121553	0
1	464.708042	10.172511	670.336380	11.842100	430.750390	586.998472	2
2	719.832758	9.777176	848.148727	11.396276	440.786767	456.376008	2
3	231.345539	6.736219	1152.586131	8.140984	401.460233	558.504395	2
4	313.087142	15.714775	493.149866	3.134762	427.902602	527.192251	0