Top 10 Machine Learning Algorithms
Below are Python code examples for the top 10 machine learning algorithms. All of them are implemented with the common scikit-learn library and use the Iris dataset.
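Before walking through the individual algorithms, it can help to glance at the data itself. The minimal sketch below is an optional addition (not part of the original examples): it only loads the Iris dataset and prints its shape, feature names, and class names.

from sklearn.datasets import load_iris

# Load the Iris dataset bundled with scikit-learn
iris = load_iris()

# 150 samples, 4 features (sepal/petal length and width)
print(iris.data.shape)
print(iris.feature_names)
print(iris.target_names)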
1. Decision Tree

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree accuracy: {accuracy}")
2. Naive Bayes
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the Gaussian naive Bayes classifier
clf = GaussianNB()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes accuracy: {accuracy}")
3. Support Vector Machine (SVM)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the SVM classifier
clf = SVC()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM accuracy: {accuracy}")
4. K-Nearest Neighbors (KNN)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the KNN classifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN accuracy: {accuracy}")
5. Logistic Regression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the logistic regression classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression accuracy: {accuracy}")
6. Random Forest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the random forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest accuracy: {accuracy}")
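As an optional follow-up to the random forest example (not part of the original), the fitted model exposes per-feature importances via feature_importances_. The short sketch below assumes clf and iris from the block above are still in scope.

# Print how much each Iris feature contributes to the forest's splits
for name, importance in zip(iris.feature_names, clf.feature_importances_):
    print(f"{name}: {importance:.3f}")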
7. Gradient Boosting Decision Tree (GBDT)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the GBDT classifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"GBDT accuracy: {accuracy}")
8. K-Means Clustering
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the dataset (the labels are not used for clustering)
iris = load_iris()
X = iris.data

# Create and fit the K-Means model with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each sample
labels = kmeans.labels_

# Visualize the clustering result on the first two features
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200)
plt.title('K-Means Clustering')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()
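Because K-Means is unsupervised, the cluster labels it assigns have no fixed correspondence to iris.target, so plain accuracy is not meaningful here. One possible way to quantify the result, sketched below as an optional addition, is the adjusted Rand index, which is invariant to label permutations; it assumes labels and iris from the block above.

from sklearn.metrics import adjusted_rand_score

# Compare cluster assignments with the true species labels;
# 1.0 means perfect agreement, values near 0 mean random assignment
ari = adjusted_rand_score(iris.target, labels)
print(f"Adjusted Rand index: {ari:.3f}")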
9. Principal Component Analysis (PCA)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the dataset
iris = load_iris()
X = iris.data

# Create a PCA object and reduce the data to 2 dimensions
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Visualize the projected data, colored by the true class labels
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target, cmap='viridis')
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
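To see how much information the two principal components keep, you can check the fitted PCA object's explained_variance_ratio_, as in the short optional sketch below (it assumes pca from the block above).

# Fraction of total variance captured by each of the two components
print(pca.explained_variance_ratio_)
# Their sum is the fraction of variance retained after reducing to 2 dimensions
print(pca.explained_variance_ratio_.sum())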
10. AdaBoost (Adaptive Boosting)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the AdaBoost classifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"AdaBoost accuracy: {accuracy}")
These examples show how to implement the top 10 machine learning algorithms with the scikit-learn library and run a simple train/test cycle on the Iris dataset. You can adjust the parameters or switch to other datasets as needed; one possible way to explore parameter settings is sketched below.
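As one example of the kind of parameter adjustment mentioned above, this minimal sketch varies the number of neighbors in KNN and scores each setting with 5-fold cross-validation on the full Iris dataset. The specific values of n_neighbors are only illustrative, not a recommendation.

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X, y = iris.data, iris.target

# Try a few illustrative values of n_neighbors and report the mean CV accuracy
for k in (3, 5, 7, 9):
    clf = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(clf, X, y, cv=5)
    print(f"n_neighbors={k}: mean accuracy {scores.mean():.3f}")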