Skip to main content

Log a sklearn Model

Train & Log a Custom Scikit-Learn Model with Katonic-SDK Log package.

Train and log a custom-built scikit-learn model with the Katonic-SDK Log package.

Import necessary packages

import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, recall_score, f1_score, precision_score
from sklearn.linear_model import LogisticRegression
from katonic.log.logmodel import LogModel

Define Experiment name

# Experiment name handed to LogModel when the experiment handle is created.
experiment_name = "sklearn_model"

Initiate LogModel with experiment name

# Build the LogModel handle for the experiment named above.
# NOTE(review): source_name looks like the notebook file this code runs in — confirm against the Katonic SDK docs.
lm = LogModel(experiment_name, source_name="scikit_learn_logging.ipynb")

Check Metadata of the created / existing experiment

# Keep the experiment id; it is used at the end to search the logged runs.
exp_id = lm.id

# Print the experiment metadata exposed by the LogModel instance.
print("experiment name: ", lm.name)
print("experiment location: ", lm.location)
print("experiment id: ", lm.id)
print("experiment status: ", lm.stage)

Artifact path where you want to log your model

# Artifact path passed to every lm.model_logging call in this walkthrough.
artifact_path = "scikit-learn-model"

Read data for training

# Load the diabetes CSV straight from GitHub (needs network access).
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv')
# Preview the first rows; in a notebook this renders as the cell output.
df.head()

Get Features and Labels

# Features: every column except the target.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
x = df.drop(columns=["Outcome"])
# Labels: the target column on its own.
y = df["Outcome"]

Split the dataset in Train and Test

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=98,
)

Define Evaluation Metrics

def metric(actual, pred):
    """Compute the evaluation metrics that are logged for each classifier.

    Args:
        actual: Ground-truth labels.
        pred: Predictions for the same samples, in the same order.

    Returns:
        A 6-tuple ``(accuracy, roc_auc, log_loss, recall, f1, precision)``,
        in exactly that order.

    Note:
        The callers in this walkthrough pass hard class labels from
        ``predict()``. scikit-learn documents ``roc_auc_score`` and
        ``log_loss`` as taking scores / probabilities, so here those two
        values are computed from labels rather than from probabilities.
    """
    acc_score = accuracy_score(actual, pred)
    recall = recall_score(actual, pred)
    precision_scr = precision_score(actual, pred)
    f1_scr = f1_score(actual, pred)
    auc_roc = roc_auc_score(actual, pred)
    log_los = log_loss(actual, pred)

    # The order below is what callers unpack; do not reorder.
    return (
        acc_score,
        auc_roc,
        log_los,
        recall,
        f1_scr,
        precision_scr,
    )

Train Random Forest Model

# Random forest limited to depth-2 trees, seeded for repeatability, fitted on the training split.
model_clf = RandomForestClassifier(max_depth=2, random_state=0)
model_clf.fit(X_train, y_train)

Calculate metrics of the Random Forest model

# Predict on the held-out split and score the predictions.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is handed to lm.model_logging.
model_mertics = {
    "accuracy_score": acc_score,
    "roc_auc_score": auc_roc,
    "log_loss": log_los,
    "recall": recall,
    "f1_score": f1_scr,
    "precision_score": precision_scr,
}

Log Random Forest Model

# Log the fitted random forest together with its metrics.
lm.model_logging(
    model_name="random_forest",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=f'{os.getcwd()}/scikit_learn_logging.ipynb',
    metrics=model_mertics,
)

Train Logistic Regression Model

# Logistic regression with library defaults apart from the fixed seed, fitted on the training split.
# NOTE(review): the features are not scaled here, so the default solver may warn about convergence — confirm.
model_clf = LogisticRegression(random_state=0)
model_clf.fit(X_train, y_train)

Calculate metrics of the Logistic Regression model

# Predict on the held-out split and score the predictions.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is handed to lm.model_logging.
model_mertics = {
    "accuracy_score": acc_score,
    "roc_auc_score": auc_roc,
    "log_loss": log_los,
    "recall": recall,
    "f1_score": f1_scr,
    "precision_score": precision_scr,
}

Log Logistic Regression model

Note: When logging any model supported by scikit-learn, set model_type to "scikit-learn".

# Log the fitted logistic regression together with its metrics.
lm.model_logging(
    model_name="logistic_regression",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=f'{os.getcwd()}/scikit_learn_logging.ipynb',
    metrics=model_mertics,
)

Train Adaboost Model

# AdaBoost classifier with library defaults apart from the fixed seed, fitted on the training split.
model_clf = AdaBoostClassifier(random_state=0)
model_clf.fit(X_train, y_train)

Calculate metrics of the Adaboost model

# Predict on the held-out split and score the predictions.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is handed to lm.model_logging.
model_mertics = {
    "accuracy_score": acc_score,
    "roc_auc_score": auc_roc,
    "log_loss": log_los,
    "recall": recall,
    "f1_score": f1_scr,
    "precision_score": precision_scr,
}

Log Adaboost model

# Log the fitted AdaBoost classifier together with its metrics.
lm.model_logging(
    model_name="adaboostclassifier",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=f'{os.getcwd()}/scikit_learn_logging.ipynb',
    metrics=model_mertics,
)

Train Gradient Boost Model

# Gradient boosting classifier with library defaults apart from the fixed seed, fitted on the training split.
model_clf = GradientBoostingClassifier(random_state=0)
model_clf.fit(X_train, y_train)

Calculate metrics of the Gradient Boost model

# Predict on the held-out split and score the predictions.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is handed to lm.model_logging.
model_mertics = {
    "accuracy_score": acc_score,
    "roc_auc_score": auc_roc,
    "log_loss": log_los,
    "recall": recall,
    "f1_score": f1_scr,
    "precision_score": precision_scr,
}

Log Gradientboost model

# Log the fitted gradient boosting classifier together with its metrics.
lm.model_logging(
    model_name="gradientboostclassifier",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=f'{os.getcwd()}/scikit_learn_logging.ipynb',
    metrics=model_mertics,
)

Check all the logged Experiments

You can retrieve all the runs logged under an experiment by searching with its experiment ID.

# Look up the runs recorded under the experiment id captured earlier.
df_runs = lm.search_runs(exp_id)
print("Number of runs done : ", len(df_runs))
# Preview the first rows; in a notebook this renders as the cell output.
df_runs.head()