• 机器学习算法

    机器学习十大算法

    下面给出机器学习十大算法的 Python 代码示例。示例统一使用常见的 scikit-learn 库实现,数据集采用鸢尾花(Iris)数据集。

    1. 决策树算法(Decision Tree)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score
    
    # Load the iris dataset: feature matrix X and class labels y
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create the decision tree classifier.
    # scikit-learn randomly permutes the candidate features at every split, so
    # without a fixed random_state equally good splits can be resolved differently
    # and the printed accuracy may change from run to run.
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict the classes of the held-out samples
    y_pred = clf.predict(X_test)

    # Fraction of test samples that were classified correctly
    accuracy = accuracy_score(y_test, y_pred)
    print(f"决策树准确率: {accuracy}")

    2. 朴素贝叶斯算法(Naive Bayes)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    
    # Fetch the iris feature matrix and class labels in a single call
    features, labels = load_iris(return_X_y=True)

    # Reserve 30% of the samples for evaluation; the fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    # Fit a Gaussian naive Bayes model (fit() returns the fitted estimator)
    clf = GaussianNB().fit(X_train, y_train)

    # Compare the predictions for the held-out samples with the true labels
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    print(f"朴素贝叶斯准确率: {accuracy}")

    3. 支持向量机(Support Vector Machine,SVM)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    # Pull the feature matrix and the target vector out of the iris dataset
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # 70/30 train/test partition with a fixed seed
    split = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Train a support vector classifier with the library's default settings
    svm_model = SVC()
    svm_model.fit(X_train, y_train)

    # Predict on the held-out samples and measure the share of correct labels
    predictions = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"SVM 准确率: {accuracy}")

    4. K 近邻算法(K-Nearest Neighbors,KNN)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score
    
    # Load iris as a (features, labels) pair
    X, y = load_iris(return_X_y=True)

    # Keep 30% of the rows aside for testing; seed 42 fixes which rows those are
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Build the k-nearest-neighbours classifier with default settings and fit it
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)

    # Classify the test rows, then score them against the true labels
    predicted = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    print(f"KNN 准确率: {accuracy}")

    5. 逻辑回归(Logistic Regression)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    
    # Read the iris measurements and their class labels
    data = load_iris()
    X, y = data.data, data.target

    # Split into training (70%) and test (30%) subsets with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.3
    )

    # Logistic regression with the solver's iteration cap set to 1000;
    # fit() returns the fitted estimator, so construction and training are chained
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

    # Evaluate on the held-out samples
    y_hat = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_hat)
    print(f"逻辑回归准确率: {accuracy}")

    6. 随机森林算法(Random Forest)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    
    # Load the iris dataset: feature matrix X and class labels y
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create the random forest classifier.
    # The forest draws bootstrap samples and random feature subsets, so the seed
    # is fixed here; otherwise every run trains a different forest and the
    # printed accuracy is not reproducible.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict the classes of the held-out samples
    y_pred = clf.predict(X_test)

    # Fraction of test samples that were classified correctly
    accuracy = accuracy_score(y_test, y_pred)
    print(f"随机森林准确率: {accuracy}")

    7. 梯度提升树(Gradient Boosting Decision Tree,GBDT)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import accuracy_score
    
    # Load the iris dataset: feature matrix X and class labels y
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create the gradient boosting classifier.
    # random_state seeds the individual trees built at each boosting iteration;
    # fixing it makes the trained model, and therefore the printed accuracy,
    # reproducible across runs.
    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict the classes of the held-out samples
    y_pred = clf.predict(X_test)

    # Fraction of test samples that were classified correctly
    accuracy = accuracy_score(y_test, y_pred)
    print(f"GBDT 准确率: {accuracy}")

    8. K-均值聚类算法(K-Means Clustering)

    from sklearn.datasets import load_iris
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    
    # Load the iris feature matrix (the class labels are not used for clustering)
    iris = load_iris()
    X = iris.data

    # Create the KMeans clusterer with three clusters.
    # n_init is set explicitly: its default differs between scikit-learn
    # releases (and some releases warn when it is left unset), so pinning it
    # keeps the clustering the same regardless of the installed version.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    kmeans.fit(X)

    # Cluster index assigned to each sample
    labels = kmeans.labels_

    # Visualise the clusters using only the first two feature columns
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
    # Overlay the cluster centres as large red X markers
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200)
    plt.title('K - Means Clustering')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.show()

    9. 主成分分析(Principal Component Analysis,PCA)

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    
    # Load iris; the features are projected, the targets only colour the plot
    iris = load_iris()
    X = iris.data

    # Build a two-component PCA, then fit it and project the data in one step
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # Scatter the two projected columns, coloured by the true class
    first_component = X_pca[:, 0]
    second_component = X_pca[:, 1]
    plt.scatter(first_component, second_component, c=iris.target, cmap='viridis')
    plt.title('PCA of Iris Dataset')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

    10. AdaBoost 算法(Adaptive Boosting)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score
    
    # Load the iris dataset: feature matrix X and class labels y
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create the AdaBoost classifier.
    # random_state seeds the base estimators built at each boosting iteration;
    # fixing it makes the trained ensemble, and therefore the printed accuracy,
    # reproducible across runs.
    clf = AdaBoostClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict the classes of the held-out samples
    y_pred = clf.predict(X_test)

    # Fraction of test samples that were classified correctly
    accuracy = accuracy_score(y_test, y_pred)
    print(f"AdaBoost 准确率: {accuracy}")

    这些代码示例展示了如何使用 scikit-learn 库实现机器学习十大算法,并在鸢尾花数据集上进行简单的训练和测试(其中 K-均值聚类和主成分分析属于无监督方法,示例只做可视化,不计算准确率)。你可以根据实际需求调整参数或换用其他数据集。