{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using decision trees in MLlib SPARK"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can find how to use other algorithms (Random Forest, Gradient Boosting, etc.) [here](https://github.com/apache/spark/tree/master/examples/src/main/python/mllib)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import os.path\n",
    "SPARK_HOME = \"\"\"C:\\spark-1.5.0-bin-hadoop2.6\"\"\" #CHANGE THIS PATH TO YOURS!\n",
    "\n",
    "sys.path.append(os.path.join(SPARK_HOME, \"python\", \"lib\", \"py4j-0.8.2.1-src.zip\"))\n",
    "sys.path.append(os.path.join(SPARK_HOME, \"python\", \"lib\", \"pyspark.zip\"))\n",
    "os.environ[\"SPARK_HOME\"] = SPARK_HOME\n",
    "\n",
    "from pyspark import SparkContext\n",
    "sc = SparkContext(master=\"local[*]\", appName=\"PythonDecisionTreeClassificationExample\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from pyspark.mllib.regression import LabeledPoint\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "iris = load_iris()\n",
    "X = iris.data            # Input attributes\n",
    "y = iris.target          # Label\n",
    "# zip is used so that each instance is a tuble of (label, input attributes). \n",
    "# This will make life easier later\n",
    "# Note: zip([1,2,3], [\"a\",\"b\",\"c\"]) => [(1, 'a'), (2, 'b'), (3, 'c')]\n",
    "data = zip(y,X) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4\n"
     ]
    }
   ],
   "source": [
    "data_rdd = sc.parallelize(data,4)\n",
    "print data_rdd.getNumPartitions()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[LabeledPoint(0.0, [5.1,3.5,1.4,0.2])]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_rdd = data_rdd.map(lambda x: LabeledPoint(x[0], x[1]))\n",
    "data_rdd.take(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.mllib.tree import DecisionTree, DecisionTreeModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "(trainingData_rdd, testData_rdd) = data_rdd.randomSplit([0.7, 0.3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model = DecisionTree.trainClassifier(trainingData_rdd, numClasses=3, categoricalFeaturesInfo={},impurity='gini', maxDepth=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Error = 0.0208333333333\n",
      "Learned classification tree model:\n",
      "DecisionTreeModel classifier of depth 5 with 15 nodes\n",
      "  If (feature 2 <= 1.7)\n",
      "   Predict: 0.0\n",
      "  Else (feature 2 > 1.7)\n",
      "   If (feature 2 <= 4.8)\n",
      "    If (feature 3 <= 1.6)\n",
      "     Predict: 1.0\n",
      "    Else (feature 3 > 1.6)\n",
      "     If (feature 1 <= 2.8)\n",
      "      Predict: 2.0\n",
      "     Else (feature 1 > 2.8)\n",
      "      Predict: 1.0\n",
      "   Else (feature 2 > 4.8)\n",
      "    If (feature 3 <= 1.7)\n",
      "     If (feature 2 <= 5.0)\n",
      "      If (feature 0 <= 6.0)\n",
      "       Predict: 2.0\n",
      "      Else (feature 0 > 6.0)\n",
      "       Predict: 1.0\n",
      "     Else (feature 2 > 5.0)\n",
      "      Predict: 2.0\n",
      "    Else (feature 3 > 1.7)\n",
      "     Predict: 2.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = model.predict(testData_rdd.map(lambda x: x.features))\n",
    "labelsAndPredictions = testData_rdd.map(lambda lp: lp.label).zip(predictions)\n",
    "testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData_rdd.count())\n",
    "print('Test Error = ' + str(testErr))\n",
    "print('Learned classification tree model:')\n",
    "print(model.toDebugString())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}