{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using decision trees in MLlib SPARK" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can find how to use other algorithms (Random Forest, Gradient Boosting, etc.) [here](https://github.com/apache/spark/tree/master/examples/src/main/python/mllib)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sys\n", "import os\n", "import os.path\n", "SPARK_HOME = \"\"\"C:\\spark-1.5.0-bin-hadoop2.6\"\"\" #CHANGE THIS PATH TO YOURS!\n", "\n", "sys.path.append(os.path.join(SPARK_HOME, \"python\", \"lib\", \"py4j-0.8.2.1-src.zip\"))\n", "sys.path.append(os.path.join(SPARK_HOME, \"python\", \"lib\", \"pyspark.zip\"))\n", "os.environ[\"SPARK_HOME\"] = SPARK_HOME\n", "\n", "from pyspark import SparkContext\n", "sc = SparkContext(master=\"local[*]\", appName=\"PythonDecisionTreeClassificationExample\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "from pyspark.mllib.regression import LabeledPoint\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "iris = load_iris()\n", "X = iris.data # Input attributes\n", "y = iris.target # Label\n", "# zip is used so that each instance is a tuble of (label, input attributes). \n", "# This will make life easier later\n", "# Note: zip([1,2,3], [\"a\",\"b\",\"c\"]) => [(1, 'a'), (2, 'b'), (3, 'c')]\n", "data = zip(y,X) " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n" ] } ], "source": [ "data_rdd = sc.parallelize(data,4)\n", "print data_rdd.getNumPartitions()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[LabeledPoint(0.0, [5.1,3.5,1.4,0.2])]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_rdd = data_rdd.map(lambda x: LabeledPoint(x[0], x[1]))\n", "data_rdd.take(1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pyspark.mllib.tree import DecisionTree, DecisionTreeModel" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "(trainingData_rdd, testData_rdd) = data_rdd.randomSplit([0.7, 0.3])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "model = DecisionTree.trainClassifier(trainingData_rdd, numClasses=3, categoricalFeaturesInfo={},impurity='gini', maxDepth=5)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test Error = 0.0208333333333\n", "Learned classification tree model:\n", "DecisionTreeModel classifier of depth 5 with 15 nodes\n", " If (feature 2 <= 1.7)\n", " Predict: 0.0\n", " Else (feature 2 > 1.7)\n", " If (feature 2 <= 4.8)\n", " If (feature 3 <= 1.6)\n", " Predict: 1.0\n", " Else (feature 3 > 1.6)\n", " If (feature 1 <= 2.8)\n", " Predict: 2.0\n", " Else (feature 1 > 2.8)\n", " Predict: 1.0\n", " Else (feature 2 > 4.8)\n", " If (feature 3 <= 1.7)\n", " If (feature 2 <= 5.0)\n", " If (feature 0 <= 6.0)\n", " 
  { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "Test Error = 0.0208333333333\n",
   "Learned classification tree model:\n",
   "DecisionTreeModel classifier of depth 5 with 15 nodes\n",
   "  If (feature 2 <= 1.7)\n",
   "   Predict: 0.0\n",
   "  Else (feature 2 > 1.7)\n",
   "   If (feature 2 <= 4.8)\n",
   "    If (feature 3 <= 1.6)\n",
   "     Predict: 1.0\n",
   "    Else (feature 3 > 1.6)\n",
   "     If (feature 1 <= 2.8)\n",
   "      Predict: 2.0\n",
   "     Else (feature 1 > 2.8)\n",
   "      Predict: 1.0\n",
   "   Else (feature 2 > 4.8)\n",
   "    If (feature 3 <= 1.7)\n",
   "     If (feature 2 <= 5.0)\n",
   "      If (feature 0 <= 6.0)\n",
   "       Predict: 2.0\n",
   "      Else (feature 0 > 6.0)\n",
   "       Predict: 1.0\n",
   "     Else (feature 2 > 5.0)\n",
   "      Predict: 2.0\n",
   "    Else (feature 3 > 1.7)\n",
   "     Predict: 2.0\n",
   "\n"
  ] } ], "source": [
   "predictions = model.predict(testData_rdd.map(lambda x: x.features))\n",
   "labelsAndPredictions = testData_rdd.map(lambda lp: lp.label).zip(predictions)\n",
   "# Fraction of test points whose predicted label differs from the true label\n",
   "testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData_rdd.count())\n",
   "print('Test Error = ' + str(testErr))\n",
   "print('Learned classification tree model:')\n",
   "print(model.toDebugString())"
  ] },
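  { "cell_type": "markdown", "metadata": {}, "source": [ "The same pipeline also works for the ensemble learners mentioned at the top of the notebook. The cell below is a minimal, illustrative sketch that swaps the single tree for `RandomForest.trainClassifier` from `pyspark.mllib.tree`; `numTrees=10` and `seed=42` are arbitrary illustration values, not tuned settings." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
   "from pyspark.mllib.tree import RandomForest\n",
   "\n",
   "# Minimal sketch: train a Random Forest on the same split.\n",
   "# numTrees and seed are arbitrary illustration values.\n",
   "rf_model = RandomForest.trainClassifier(trainingData_rdd, numClasses=3, categoricalFeaturesInfo={},\n",
   "                                        numTrees=10, featureSubsetStrategy=\"auto\",\n",
   "                                        impurity='gini', maxDepth=5, seed=42)\n",
   "\n",
   "# Evaluate exactly as with the single decision tree above\n",
   "rf_predictions = rf_model.predict(testData_rdd.map(lambda x: x.features))\n",
   "rf_labelsAndPredictions = testData_rdd.map(lambda lp: lp.label).zip(rf_predictions)\n",
   "rf_testErr = rf_labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData_rdd.count())\n",
   "print('Random Forest test error = ' + str(rf_testErr))"
  ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}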