%%html
<script src="https://bits.csb.pitt.edu/preamble.js"></script>

import numpy as np
from matplotlib.pylab import cm
import scipy.cluster.vq as vq #vq: vector quantization
import matplotlib.pylab as plt
%matplotlib inline
# Two Gaussian blobs of 100 points each: the first squeezed along x, the second
# shifted by +1 in x and squeezed along y. Cluster them with k-means (k=4).
squeezed_x = np.random.randn(100, 2) / (4, 1)
squeezed_y = (np.random.randn(100, 2) + (1, 0)) / (1, 4)
randpts = np.vstack((squeezed_x, squeezed_y))
means, clusters = vq.kmeans2(randpts, 4)
# Points coloured by assigned cluster, centroids drawn as large stars.
plt.scatter(randpts[:, 0], randpts[:, 1], c=clusters)
plt.plot(means[:, 0], means[:, 1], '*', ms=20);

!wget http://mscbio2025.csb.pitt.edu/files/er.smi

--2023-10-30 20:28:02--  http://mscbio2025.csb.pitt.edu/files/er.smi
Resolving mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)... 136.142.4.139
Connecting to mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)|136.142.4.139|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20022 (20K) [application/smil+xml]
Saving to: ‘er.smi.1’

er.smi.1            100%[===================>]  19.55K  --.-KB/s    in 0s      

2023-10-30 20:28:02 (47.1 MB/s) - ‘er.smi.1’ saved [20022/20022]

from openbabel import pybel
# Parse er.smi into a binary fingerprint matrix X (one row per molecule) and a
# target vector y read from each molecule's title field.
yvals = []
fps = []
for mol in pybel.readfile('smi','er.smi'):
    yvals.append(float(mol.title))
    # pybel reports set bits 1-based, so shift them to 0-based column indices;
    # indexing with the raw values never sets column 0 and would raise an
    # IndexError as soon as the last bit (1024) is set.
    # The explicit int dtype keeps an empty bit list usable as an index.
    fpbits = np.asarray(mol.calcfp().bits, dtype=int) - 1
    fp = np.zeros(1024)  # assumes the default fingerprint is 1024 bits wide, as the original did
    fp[fpbits] = 1
    fps.append(fp)
    
X = np.array(fps)
y = np.array(yvals)

# Heat map of the fingerprint matrix X (one row per molecule).
plt.figure(figsize=(12,5))
# fignum=0 draws into the figure created above. By default matshow opens a
# brand-new figure, which left this one empty and ignored the requested figsize.
plt.matshow(X,cmap=cm.seismic,fignum=0)
plt.show()
# Target values in file order.
plt.figure(figsize=(10,5))
plt.plot(y,color='blue')
plt.show()
X.shape

<Figure size 864x360 with 0 Axes>

(387, 1024)

# Histogram of the target values y (trailing ';' suppresses the cell's text output).
plt.hist(y);

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="classtype" style="width: 500px"></div>
<script>

    var divid = '#classtype';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
	jQuery(divid).asker({
	    id: divid,
	    question: "What sort of problem is this??",
        answers: ['Classification','Regression','Unsupervised'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

import sklearn

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="wshape" style="width: 500px"></div>
<script>

    var divid = '#wshape';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
    jQuery(divid).asker({
        id: divid,
        question: "What is the shape of w?",
        // X has shape (387, 1024); the third choice used to read (378,1024), a digit swap.
        answers: ['387','1024','(387,1024)','(1024,387)',"I've never taken matrix algebra"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
        charter: chartmaker})

// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

from sklearn import linear_model
# Ordinary least squares on the whole data set; the predictions below are made
# on the same rows the model was fitted to.
model = linear_model.LinearRegression().fit(X, y)
p = model.predict(X)

# Measured value against predicted value.
plt.figure(figsize=(5, 5))
plt.scatter(y, p)
plt.xlabel('True')
plt.ylabel('Predict');

# Turn the regression into a two-class problem by cutting both the targets and
# the predictions at -1.
ylabel = y > -1
plabel = p > -1
plt.hist(ylabel.astype(np.uint8));

from sklearn.metrics import * #pull in accuracy score, among other things
print('Accuracy')
print(accuracy_score(ylabel, plabel))

print('\nConfusion Matrix')
# confusion_matrix puts true labels on rows and predictions on columns, i.e.
# [[TN, FP], [FN, TP]]. Transposed, as printed below, that is
# [[TN, FN], [FP, TP]], so the top-left cell holds true negatives.
print(np.array([['TN', 'FN'],['FP','TP']]))
print(confusion_matrix(ylabel,plabel).transpose())

Accuracy
0.9431524547803618

Confusion Matrix
[['TP' 'FN']
 ['FP' 'TP']]
[[200  10]
 [ 12 165]]

# Text summary of the thresholded predictions (plabel) against the true classes (ylabel).
print(classification_report(ylabel,plabel))

              precision    recall  f1-score   support

       False       0.95      0.94      0.95       212
        True       0.93      0.94      0.94       175

    accuracy                           0.94       387
   macro avg       0.94      0.94      0.94       387
weighted avg       0.94      0.94      0.94       387

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="confq" style="width: 500px"></div>
<script>

    var divid = '#confq';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
    jQuery(divid).asker({
        id: divid,
        // Spelling fixed in the displayed question ("classifer" -> "classifier").
        question: "What would the recall be if our classifier predicted everything as true?",
        answers: ['0','81/306','0.5','1.0'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
        charter: chartmaker})

// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

# ROC curve built from the continuous regression outputs p, not the
# thresholded labels.
fpr, tpr, thresholds = roc_curve(ylabel, p)
plt.figure(figsize=(7, 7))
plt.plot(fpr, tpr, linewidth=4, clip_on=False)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.gca().set_aspect('equal')
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.show()

# Random baseline: shuffling the scores (in place) breaks their pairing with
# the labels. NOTE: p no longer holds the model's predictions after this line.
np.random.shuffle(p)
fpr, tpr, thresholds = roc_curve(ylabel, p)  
# Create the figure before plotting. Calling plt.figure afterwards only opened
# a second, empty figure and the size never applied to the ROC plot.
plt.figure(figsize=(10,10))
plt.plot(fpr,tpr,linewidth=3); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.ylim(0,1); plt.xlim(0,1)
plt.gca().set_aspect('equal')
print(roc_auc_score(ylabel,p))

0.49897574123989225

<Figure size 720x720 with 0 Axes>

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="crossq" style="width: 500px"></div>
<script>

    var divid = '#crossq';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
	jQuery(divid).asker({
	    id: divid,
	    question: "In 5-fold cross validation, on average, how many times will a given example be in the training set?",
        answers: ['0','1','2.5','4','5'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

from sklearn.model_selection import KFold
# 5-fold cross validation of plain linear regression. Each fitted model is
# scored twice: on the held-out fold and on the folds it was trained on.
kf = KFold(n_splits=5)
accuracies, rocs = [], []
accuracies_train, rocs_train = [], []
for train, test in kf.split(X):  # index arrays for the training rows and the held-out rows
    model = linear_model.LinearRegression().fit(X[train], y[train])
    # held-out fold
    p = model.predict(X[test])
    accuracies.append(accuracy_score(ylabel[test], p > -1))
    fpr, tpr, thresholds = roc_curve(ylabel[test], p)
    rocs.append((fpr, tpr, roc_auc_score(ylabel[test], p)))
    # training folds: the same rows the model was fitted to
    p_train = model.predict(X[train])
    accuracies_train.append(accuracy_score(ylabel[train], p_train > -1))
    fpr, tpr, thresholds = roc_curve(ylabel[train], p_train)
    rocs_train.append((fpr, tpr, roc_auc_score(ylabel[train], p_train)))

print(accuracies)
print("Average accuracy:", np.mean(accuracies), '\n')

print(accuracies_train)
print("Average accuracy (training data):", np.mean(accuracies_train))

[0.6282051282051282, 0.6410256410256411, 0.5454545454545454, 0.7142857142857143, 0.7012987012987013]
Average accuracy: 0.6460539460539461 

[0.9514563106796117, 0.948220064724919, 0.9612903225806452, 0.9580645161290322, 0.9483870967741935]
Average accuracy (training data): 0.9534836621776803

# Left panel: ROC curves of the held-out folds. Right panel: training folds.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

for panel, curves in zip(ax, (rocs, rocs_train)):
    for fold_fpr, fold_tpr, fold_auc in curves:
        panel.plot(fold_fpr, fold_tpr, label="AUC %.2f" % fold_auc)
    panel.set(xlabel="FPR", ylabel="TPR")
    panel.set_ylim(0, 1)
    panel.set_xlim(0, 1)
    panel.legend(loc='best')

plt.show()

from sklearn.model_selection import cross_validate

# Same 5-fold evaluation in a single call, scored by ROC AUC. Note that here the
# regression is fitted to the boolean labels (y > -1) rather than to y itself.
cross_validate(linear_model.LinearRegression(),X,y > -1,cv=5,scoring=['roc_auc'])

{'fit_time': array([0.08149719, 0.09700084, 0.07871628, 0.0587194 , 0.05852962]),
 'score_time': array([0.01111794, 0.01063108, 0.002177  , 0.00211978, 0.0022378 ]),
 'test_roc_auc': array([0.67105263, 0.52777778, 0.65067568, 0.69740741, 0.63090418])}

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="howgood" style="width: 500px"></div>
<script>

    var divid = '#howgood';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
	jQuery(divid).asker({
	    id: divid,
	    question: "How good is the predictiveness of our model?",
        answers: ['A','B','C','D'],
		// one entry per answer letter above; presumably the text shown for each choice - TODO confirm
		extra: ['Still Perfect','Not perfect, but still good','Not great, but better than random','Horrible'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

# 5-fold cross validation of LASSO (L1-regularised linear regression).
kf = KFold(n_splits=5)
accuracies = []; rocs = []
for train,test in kf.split(X): # these are arrays of indices
    model = linear_model.Lasso(alpha=0.005) 
    model.fit(X[train],y[train]) #slice out the training folds
    p = model.predict(X[test]) #slice out the test fold
    # ylabel was defined as y > -1, so the predictions must be cut at the same
    # value; thresholding at 0 scored them against a different class boundary.
    accuracies.append(accuracy_score(ylabel[test],p > -1))
    fpr, tpr, thresholds = roc_curve(ylabel[test], p)  
    rocs.append( (fpr,tpr, roc_auc_score(ylabel[test],p)))

print(accuracies)
print("Average accuracy:",np.mean(accuracies))

[0.6666666666666666, 0.7948717948717948, 0.6493506493506493, 0.8051948051948052, 0.7402597402597403]
Average accuracy: 0.7312687312687312

# One ROC curve per held-out fold, labelled with that fold's AUC.
for fold_fpr, fold_tpr, fold_auc in rocs:
    plt.plot(fold_fpr, fold_tpr, label="AUC %.2f" % fold_auc)
plt.gca().set_aspect('equal')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.show()

# Fit both models on the full data set and compare how many features each one
# actually uses (non-zero coefficients).
linmodel = linear_model.LinearRegression().fit(X, y)
lassomodel = linear_model.Lasso(alpha=0.005).fit(X, y)

print("Nonzero coefficients in linear:", np.count_nonzero(linmodel.coef_))
print("Nonzero coefficients in LASSO:", np.count_nonzero(lassomodel.coef_))

Nonzero coefficients in linear: 881
Nonzero coefficients in LASSO: 64

from sklearn import model_selection
#setup grid search with default 5-fold CV and scoring
searcher = model_selection.GridSearchCV(linear_model.Lasso(max_iter=10000), {'alpha': [0.001,0.005,0.01,0.1]})
searcher.fit(X,y)
# the alpha value from the grid above that the search selected
searcher.best_params_

{'alpha': 0.005}

# LassoCV chooses alpha itself during fit, so no explicit parameter grid is
# passed here; n_jobs=8 requests 8 parallel workers.
lassomodel = linear_model.LassoCV(n_jobs=8,max_iter=10000)
lassomodel.fit(X,y)

LassoCV(max_iter=10000, n_jobs=8)

LassoCV(max_iter=10000, n_jobs=8)

# alpha chosen by the LassoCV fit
lassomodel.alpha_

0.0045520394784971056

from sklearn import svm
# 5-fold cross validation of a support vector classifier trained directly on
# the boolean class labels.
kf = KFold(n_splits=5)
accuracies, rocs = [], []
for train, test in kf.split(X):  # index arrays for the training rows and the held-out rows
    # probability=True is required for predict_proba on an SVC
    model = svm.SVC(probability=True).fit(X[train], ylabel[train])
    held_out, truth = X[test], ylabel[test]
    p = model.predict(held_out)  # hard class prediction
    probs = model.predict_proba(held_out)[:, 1]  # probability of the positive class
    accuracies.append(accuracy_score(truth, p))
    fpr, tpr, thresholds = roc_curve(truth, probs)
    rocs.append((fpr, tpr, roc_auc_score(truth, probs)))

print(accuracies)
print("Average accuracy:", np.mean(accuracies))

[0.6923076923076923, 0.7307692307692307, 0.6883116883116883, 0.7922077922077922, 0.6753246753246753]
Average accuracy: 0.7157842157842158

# One ROC curve per held-out fold, labelled with that fold's AUC.
for fold_fpr, fold_tpr, fold_auc in rocs:
    plt.plot(fold_fpr, fold_tpr, label="AUC %.2f" % fold_auc)
plt.gca().set_aspect('equal')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.show()

from sklearn import model_selection
# Search over kernel type and C, ranking candidates by ROC AUC;
# n_jobs=-1 asks for all available cores.
searcher = model_selection.GridSearchCV(svm.SVC(), {'kernel': ['linear','rbf'],'C': [1,10,100,1000]},scoring='roc_auc',n_jobs=-1)
searcher.fit(X,ylabel)

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']},
             scoring='roc_auc')

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']},
             scoring='roc_auc')

SVC()

SVC()

# Best score (ROC AUC, per the searcher's scoring setting) and the parameters that achieved it.
print("Best AUC:",searcher.best_score_)
print("Parameters",searcher.best_params_)

Best AUC: 0.8149612403100776
Parameters {'C': 10, 'kernel': 'rbf'}

from sklearn import neighbors
# 5-fold cross validation of a k-nearest-neighbours classifier.
kf = KFold(n_splits=5)
accuracies, rocs = [], []
for train, test in kf.split(X):  # index arrays for the training rows and the held-out rows
    # default number of neighbours is k=5
    model = neighbors.KNeighborsClassifier().fit(X[train], ylabel[train])
    held_out, truth = X[test], ylabel[test]
    p = model.predict(held_out)  # hard class prediction
    probs = model.predict_proba(held_out)[:, 1]  # probability of the positive class
    accuracies.append(accuracy_score(truth, p))
    fpr, tpr, thresholds = roc_curve(truth, probs)
    rocs.append((fpr, tpr, roc_auc_score(truth, probs)))

print(accuracies)
print("Average accuracy:", np.mean(accuracies))

[0.7307692307692307, 0.6923076923076923, 0.7532467532467533, 0.7272727272727273, 0.7662337662337663]
Average accuracy: 0.733966033966034

# One ROC curve per held-out fold, labelled with that fold's AUC.
for fold_fpr, fold_tpr, fold_auc in rocs:
    plt.plot(fold_fpr, fold_tpr, label="AUC %.2f" % fold_auc)
plt.gca().set_aspect('equal')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.show()

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="knnq" style="width: 500px"></div>
<script>
    var divid = '#knnq';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
	jQuery(divid).asker({
	    id: divid,
	    question: "What could <b>not</b> be a valid probability from the previous k-nn (k=5) model?",
        answers: ['0','.5','.6','1'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>

# Tune the number of neighbours by cross-validated ROC AUC on all cores.
knn_grid = {'n_neighbors': [1, 2, 3, 4, 5, 10]}
searcher = model_selection.GridSearchCV(
    neighbors.KNeighborsClassifier(),
    knn_grid,
    scoring='roc_auc',
    n_jobs=-1,
).fit(X, ylabel)

print("Best AUC:", searcher.best_score_)
print("Parameters", searcher.best_params_)

Best AUC: 0.8024758740705586
Parameters {'n_neighbors': 5}

%%html
<!-- In-class poll: the script passes a question and its answer choices to asker, targeting this div. -->
<div id="dtqml" style="width: 500px"></div>
<script>
    var divid = '#dtqml';
    // asker and chartmaker are not defined in this cell (presumably loaded by preamble.js - TODO confirm)
	jQuery(divid).asker({
	    id: divid,
	    question: "Humidity is low, it's windy, and it is sunny. What do you do?",
		answers: ["Play","Don't Play"],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
// Hide every .jp-InputArea containing an element of class "o" whose text includes "html".
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();
</script>

from sklearn import tree
# 5-fold cross validation of a decision tree with default settings.
kf = KFold(n_splits=5)
accuracies, rocs = [], []
for train, test in kf.split(X):  # index arrays for the training rows and the held-out rows
    model = tree.DecisionTreeClassifier().fit(X[train], ylabel[train])
    held_out, truth = X[test], ylabel[test]
    p = model.predict(held_out)  # hard class prediction
    probs = model.predict_proba(held_out)[:, 1]  # probability of the positive class
    accuracies.append(accuracy_score(truth, p))
    fpr, tpr, thresholds = roc_curve(truth, probs)
    rocs.append((fpr, tpr, roc_auc_score(truth, probs)))

print(accuracies)
print("Average accuracy:", np.mean(accuracies))

[0.7435897435897436, 0.717948717948718, 0.6753246753246753, 0.7272727272727273, 0.7142857142857143]
Average accuracy: 0.7156843156843158

# One ROC curve per held-out fold, labelled with that fold's AUC.
for fold_fpr, fold_tpr, fold_auc in rocs:
    plt.plot(fold_fpr, fold_tpr, label="AUC %.2f" % fold_auc)
plt.gca().set_aspect('equal')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.show()

# Distinct probability values predicted on the last held-out fold (probs is
# left over from the final loop iteration).
set(probs)

{0.0, 0.5, 1.0}

# Tune the maximum tree depth by cross-validated ROC AUC on all cores.
depth_grid = {'max_depth': [1, 2, 3, 4, 5, 10]}
searcher = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(),
    depth_grid,
    scoring='roc_auc',
    n_jobs=-1,
).fit(X, ylabel)

print("Best AUC:", searcher.best_score_)
print("Parameters", searcher.best_params_)

Best AUC: 0.7580414491377947
Parameters {'max_depth': 5}

# Refit on the full data with depth capped at 5 and list the distinct
# probabilities the tree predicts on its own training rows.
model = tree.DecisionTreeClassifier(max_depth=5).fit(X,ylabel)
set(model.predict_proba(X)[:,1])

{0.0,
 0.046153846153846156,
 0.15217391304347827,
 0.2391304347826087,
 0.5,
 0.6666666666666666,
 1.0}

!wget http://mscbio2025.csb.pitt.edu/files/er.smi

from openbabel import pybel
yvals = []
fps = []
for mol in pybel.readfile('smi','er.smi'):
    yvals.append(float(mol.title))
    fpbits = mol.calcfp().bits
    fp = np.zeros(1024)
    fp[fpbits] = 1
    fps.append(fp)
    
X = np.array(fps)
y = np.array(yvals)

--2023-10-30 20:29:41--  http://mscbio2025.csb.pitt.edu/files/er.smi
Resolving mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)... 136.142.4.139
Connecting to mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)|136.142.4.139|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20022 (20K) [application/smil+xml]
Saving to: ‘er.smi.2’

er.smi.2            100%[===================>]  19.55K  --.-KB/s    in 0s      

2023-10-30 20:29:41 (41.0 MB/s) - ‘er.smi.2’ saved [20022/20022]

from sklearn.model_selection import KFold
from sklearn.metrics import *
from sklearn import ensemble
# 5-fold cross validation of a random forest regressor on the continuous
# targets, collecting (measured, predicted) pairs from every held-out fold.
kf = KFold(n_splits=5)
errors = []
predictions = []
for train,test in kf.split(X): # these are arrays of indices
    # Use the `ensemble` name imported above; `sklearn.ensemble` only resolved
    # because an `import sklearn` had been run in an earlier cell.
    model = ensemble.RandomForestRegressor() 
    model.fit(X[train],y[train]) #slice out the training folds
    p = model.predict(X[test]) # continuous predicted values (this is regression, not 0/1)
    predictions.extend(zip(y[test],p))
    errors.append(mean_squared_error(y[test],p))
predictions = np.array(predictions)  # column 0: measured, column 1: predicted

print(errors)
print("Average squared error",np.mean(errors))

[0.2941807667340254, 0.3567487151063835, 0.19864288026362262, 0.3638722668982609, 0.209028520490121]
Average squared error 0.2844946298984827

import seaborn as sns
# Scatter of the first predictions column against the second, with marginal
# distributions.
measured, predicted = predictions[:, 0], predictions[:, 1]
sns.jointplot(x=measured, y=predicted)
plt.xlabel("Experiment")
plt.ylabel("Predicted")
# Pearson correlation between the two columns.
np.corrcoef(measured, predicted)[0, 1]

0.9037840225075521

Machine Learning Overview

11/7/2023

What is machine learning?

Unsupervised Learning

Supervised Learning

Labels

Classification

Regression

Features

Example

sklearn

Linear Model

Linear Model

Evaluating Predictions

Other measures

ROC Curves

AUC

Correct Model Evaluation

Cross Validation

K-Fold Cross Validation

Cross Validation

Alternatively...

Generalization Error

LASSO

Lasso vs. LinearRegression

Model Parameter Optimization

Model specific optimization

Support Vector Machine (SVM)

SVM Kernels

Training SVM

Training SVM

Training SVM

Nearest Neighbors (NN)

Training NN

Training NN

Training NN

Decision Trees

Random Forest

Training a Decision Tree

Training a Decision Tree

Training a Decision Tree

Regression

Key Points

Which method works best?

Project

`sklearn`