It is challenging to directly compare two XGBoost models to see if they are equal.
This example explores a number of approaches we could use to compare two XGBoost models.
1. Model Evaluation Metrics
Compare the performance metrics of the models such as accuracy, precision, recall, F1 score, or RMSE on the same validation dataset.
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Fetch the breast-cancer dataset and pull out the feature matrix and labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model1 only fixes the seed; model2 additionally sets n_estimators and max_depth
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)

# Both models are trained on exactly the same training rows
for model in (model1, model2):
    model.fit(X_train, y_train)

# Score each model's predictions on the same held-out rows so the numbers are comparable
accuracy1, accuracy2 = (
    accuracy_score(y_test, model.predict(X_test)) for model in (model1, model2)
)
print(f"Model 1 Accuracy: {accuracy1}")
print(f"Model 2 Accuracy: {accuracy2}")
2. Feature Importances
Compare the feature importances between the two models.
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Fetch the breast-cancer dataset and pull out the feature matrix and labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model1 only fixes the seed; model2 additionally sets n_estimators and max_depth
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)
for model in (model1, model2):
    model.fit(X_train, y_train)

# Draw one importance chart per model, each in its own figure, so they can be
# inspected side by side
importance_plots = (
    (model1, "Model 1 Feature Importance"),
    (model2, "Model 2 Feature Importance"),
)
for model, heading in importance_plots:
    xgb.plot_importance(model)
    plt.title(heading)
    plt.show()
3. Tree Structures
Compare the tree structures of the models. You can use xgb.plot_tree (or xgb.to_graphviz)
to visualize individual trees.
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Fetch the breast-cancer dataset and pull out the feature matrix and labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model1 only fixes the seed; model2 additionally sets n_estimators and max_depth
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)
for model in (model1, model2):
    model.fit(X_train, y_train)

# Render tree index 0 of each model in its own figure so the two
# structures can be compared visually
tree_plots = (
    (model1, "Model 1 Tree 0"),
    (model2, "Model 2 Tree 0"),
)
for model, heading in tree_plots:
    xgb.plot_tree(model, num_trees=0)
    plt.title(heading)
    plt.show()
4. Model Parameters
Compare the hyperparameters of the two models to see if there are any differences.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Fetch the breast-cancer dataset and pull out the feature matrix and labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model1 only fixes the seed; model2 additionally sets n_estimators and max_depth
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)
for model in (model1, model2):
    model.fit(X_train, y_train)

# Dump each estimator's full hyperparameter dictionary for a manual diff
for label, model in (
    ("Model 1 Parameters:", model1),
    ("Model 2 Parameters:", model2),
):
    print(label, model.get_params())
5. SHAP Values
Compare the SHAP (SHapley Additive exPlanations) values for more detailed feature importance and interaction effects.
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
# Split dataset (20% held out; fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train two XGBoost models on the same training rows
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
# Explain the models' predictions on the held-out rows using SHAP
explainer1 = shap.Explainer(model1)
shap_values1 = explainer1(X_test)
explainer2 = shap.Explainer(model2)
shap_values2 = explainer2(X_test)
# NOTE: summary_plot accepts a `title` keyword but does not appear to draw it
# (verify against your shap version). To get a visible title, suppress the
# automatic show, set the title through matplotlib, then show the figure.
# Plot SHAP values for the first model
shap.summary_plot(shap_values1, X_test, show=False)
plt.title("Model 1 SHAP Summary")
plt.show()
# Plot SHAP values for the second model
shap.summary_plot(shap_values2, X_test, show=False)
plt.title("Model 2 SHAP Summary")
plt.show()
6. Serialization and File Comparison
Serialize the models to JSON and compare the resulting files.
import xgboost as xgb
import json
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
# Split dataset (20% held out; fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train two XGBoost models on the same training rows
model1 = xgb.XGBClassifier(random_state=42)
model2 = xgb.XGBClassifier(n_estimators=200, max_depth=3, random_state=42)
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
# Save models to JSON (files are written to the current working directory)
model1.save_model("model1.json")
model2.save_model("model2.json")
# Load JSON files and compare
with open("model1.json", 'r', encoding="utf-8") as f:
    model1_json = json.load(f)
with open("model2.json", 'r', encoding="utf-8") as f:
    model2_json = json.load(f)
# Compare the parsed JSON content directly. Comparing the parsed objects rather
# than the raw text makes the check independent of whitespace and key order.
# Prints True only if both serialized models are identical.
print(model1_json == model2_json)
These methods will allow you to compare various aspects of the models to understand their differences and similarities.