{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ATTRIBUTE / FEATURE SELECTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, relevant libraries are imported" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.datasets import load_boston\n", "from sklearn import tree\n", "from sklearn.cross_validation import train_test_split, cross_val_score, KFold\n", "from sklearn import metrics\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.grid_search import GridSearchCV\n", "# Attribute selection methods from sklearn\n", "from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_classif, f_regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RANKING/FILTER ATTRIBUTE SELECTION WITH TRAIN / TEST" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(506L, 13L)\n" ] } ], "source": [ "boston = load_boston()\n", "X, y = boston.data, boston.target\n", "print(X.shape)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=33)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's see the estimated test error (mean squared error, so lower is better) with the original dataset (all the attributes)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "18.6822660099\n" ] } ], "source": [ "clf = tree.DecisionTreeRegressor()\n", "clf = clf.fit(X_train, y_train)\n", "y_test_pred = clf.predict(X_test)\n", "print metrics.mean_squared_error(y_test, y_test_pred)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(12, -366.83409486707029)\n", "(5, -308.37535794972194)\n", "(10, 
-103.27726686601574)\n", "(2, -70.968775051003149)\n", "(9, -64.349952462738642)\n", "(4, -47.922568894311766)\n", "(0, -44.628812695117624)\n", "(1, -40.337404608424059)\n", "(8, -35.025756985597887)\n", "(6, -31.350893876051142)\n", "(11, -28.25284632711735)\n", "(7, -9.7600313040364668)\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEACAYAAAC57G0KAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHRlJREFUeJzt3XuUHGWd//H3JzeTiDAESMgNEzCBBFAuGhQkaRURWUxw\nZQ1RIL+F4+Jhf8i6Z8UEdZl197gsHlncn4urC7hBSSTLNSy3BKQj6EpUSARCSIIJMkBCyAUhieQy\n398fVUM6w2SmZ6Znqrv68zqnz1RXV3V/6yTnU08/9dTTigjMzCx/+mRdgJmZ9QwHvJlZTjngzcxy\nygFvZpZTDngzs5xywJuZ5VRZAS+pr6QnJN2dPh8iaZGklZIWSmoo2Xa2pFWSVkg6vacKNzOz9pXb\ngr8MWA60DJqfBSyKiPHAQ+lzJE0EpgMTgTOA6yT5W4KZWQY6DF9Jo4AzgesBpaunAnPS5TnA2eny\nNGBeROyMiLXAamBSJQs2M7PylNO6/lfgK0BzybphEbE+XV4PDEuXRwBNJds1ASO7W6SZmXVeuwEv\n6SzglYh4gj2t971EMtdBe/MdeC4EM7MM9Ovg9ZOBqZLOBAYC+0v6MbBe0qERsU7ScOCVdPsXgdEl\n+49K1+1FkkPfzKwLIqLNxnZb2m3BR8QVETE6IsYC5wI/i4jzgQXAzHSzmcCd6fIC4FxJAySNBcYB\nS/bx3rl9XHnllZnX4OPz8dXbsdXD8XVWRy34t+Vy+vcqYL6ki4C1wGfT0F4uaT7JiJtdwCXRlarM\nzKzbyg74iFgMLE6XNwGn7WO7bwHfqkh1ZmbWZR6j3gMKhULWJfQoH1/tyvOxQf6Pr7OURQ+KJPfc\nmJl1kiSiUhdZzcysdjngzcxyygFvZpZTDngzs5xywJuZ5ZQD3swspxzwZmY55YA3M8spB7yZWU45\n4M3McsoBb2aWUw54M7Ocyizgly3L6pPNzOpDZgE/dSq89FJWn25mln+ZBfzFF8OnPgVbt2ZVgZlZ\nvmU2H3xzc3DhhbB5M9x2G/Tt2+tlmJnVlJqZD16CH/wA/vhHuPzyrKowM8uvdgNe0kBJj0laKukp\nSY3p+kZJTZKeSB+fLNlntqRVklZIOr299x8wIGm933MP/Md/VOR4zMws1WEXjaTBEbFNUj/gUeAy\n4Azg9Yi4ptW2E4G5wAeAkcCDwPiIaG613V4/2ffcc/DhD8N//Rd84hPdPygzszyqeBdNRGxLFwcA\n/YGWZG7rQ6YB8yJiZ0SsBVYDkzr6jCOOgFtvhfPPhyefLKtuMzPrQIcBL6mPpKXAemBhRCxJX7pU\n0jJJN0hqSNeNAJpKdm8iacl36JRT4LvfTUbWrFvXiSMwM7M2ldOCb46I44BRwEmSjga+D4wFjgNe\nBr7T3luUW8yMGXDhhckY+W3bOt7ezMz2rV+5G0bEa5IeBs6IiLcCXdL1wN3p0xeB0SW7jUrXvU1j\nY+Nby4VCgUKhAMA3vgGrVyfdNf/939DHkymYWZ0qFosUi8Uu79/uRVZJBwO7ImKLpEHAA8BVwOMR\nsS7d5svAByLicyUXWSex5yLre6LVh7S+yNram2/C6afDSSfB1Vd3+djMzHKlsxdZO2rBDwfmSOpL\n0p1zS0TcK+kmSceRdL+sAS4GiIjlkuYDy4Fdw
CXtJvk+vOMdcPvt8KEPwbhx8IUvdPYdzMwssztZ\ny/ncVavg1FPhxz+Gj3+8FwozM6tiNXMnaznGjYP58+Hzn4enn866GjOz2lLVAQ8weTJccw2cdRas\nX591NWZmtaPqAx7gvPPgggtg2jTYvj3raszMakNV98GXikiCfscOuOUWD580s/qTqz74UhLccAO8\n/DJ87WtZV2NmVv1qJuABBg6EO+9MboC68casqzEzq25l38laLQ4+OJleePJkePe74WMfy7oiM7Pq\nVFMt+BZHHpn0w8+YAc88k3U1ZmbVqSYDHqBQgG9/G/7sz+CVV7Kuxsys+tTMKJp9+frX4Wc/Sx4D\nB1bkLc3MqlJnR9HUfMA3N8PnPpcsz53r4ZNmll+5HSa5L336JD/194c/wN//fdbVmJlVj5oPeNgz\nfHLu3CTszcysBodJ7svQocnwySlTYMyY5CKsmVk9y0ULvsWECTBvHkyfDs8+m3U1ZmbZylXAQ3Lj\n07e+lQyffPXVrKsxM8tOzY+i2ZdZs+AXv4AHH0x+IcrMrNbV3TDJfWluTrpqBgyAn/wkmazMzKyW\n1d0wyX3p0wduuglWr4Z///esqzEz633tBrykgZIek7RU0lOSGtP1QyQtkrRS0kJJDSX7zJa0StIK\nSaf3cP3tGjQoGRt/xx1ZVmFmlo0Ou2gkDY6IbZL6AY8ClwGfAV6NiKslfRU4MCJmSZoIzAU+AIwE\nHgTGR0Rzq/fs8S6aFlu2wOjRsHFj0l1jZlarKt5FExHb0sUBQH8ggKnAnHT9HODsdHkaMC8idkbE\nWmA1MKncYnpCQ0Py492/+U2WVZiZ9b4OA15SH0lLgfXAwohYAgyLiJafwF4PDEuXRwBNJbs3kbTk\nMzV5MixenHUVZma9q8M7WdPuleMkHQDcIemYVq+HpPb6W9p8rbGx8a3lQqFAoQdvPZ0yBX74Q5g9\nu8c+wsys4orFIsViscv7d2qYpKRvANuALwCFiFgnaTjwcEQcJWkWQERclW5/P3BlRDzW6n16rQ8e\nYMMGeM97kn74frmZnMHM6k1F++AlHdwyQkbSIODjwDPAAmBmutlM4M50eQFwrqQBksYC44AlnTuE\nyjvkEBg1CpYuzboSM7Pe01F7djgwR1JfkpPBLRFxr6RfAfMlXQSsBT4LEBHLJc0HlgO7gEt6tane\njilT4Oc/h/e/P+tKzMx6R27vZG3tpz9NJiK7665e/Vgzs4rxVAX78NJLcMwxyQRk/tUnM6tFnqpg\nH0aMgIMOgqefzroSM7PeUTcBD0k/vMfDm1m9qKuAnzw5udBqZlYP6qYPHmDtWjjpJFi3ztMHm1nt\ncR98O8aMSX6ge+XKrCsxM+t5dRXw4G4aM6sfdRfwvtBqZvWi7gK+ZWbJ6ri/1sys59RdwI8bBzt3\nJhdczczyrO4CXtozL42ZWZ7VXcCDL7SaWX2oy4D3hVYzqwd1GfATJyY/xv3ii1lXYmbWc+oy4Pv0\ngVNPdTeNmeVbXQY8uB/ezPLPAW9mllN1NdlYqd27k/nhV61KfrPVzKzaebKxMvXtCyefDI88knUl\nZmY9o8OAlzRa0sOSnpb0lKQvpesbJTVJeiJ9fLJkn9mSVklaIen0njyA7vBwSTPLs35lbLMT+HJE\nLJW0H/BbSYuAAK6JiGtKN5Y0EZgOTARGAg9KGh8RzRWuvdsmT4ZLLsm6CjOzntFhCz4i1kXE0nT5\nDeAZkuAGaKsvaBowLyJ2RsRaYDUwqTLlVtaJJ8Lq1cmYeDOzvOlUH7ykMcDxwK/SVZdKWibpBkkN\n6boRQFPJbk3sOSFUlQEDkl94evTRrCsxM6u8crpoAEi7Z24FLouINyR9H/hm+vI/At8BLtrH7m8b\nMtPY2PjWcqFQoFAolFtKRbUMlzzrrEw+3sxsn4rFIsViscv7lzVMUlJ/4H+A+yLi2jZeHwPcHRHH\nSpoFEBFXp
a/dD1wZEY+VbJ/5MMkWixfD5ZfDY491vK2ZWZYqPkxSkoAbgOWl4S5peMlmnwaeTJcX\nAOdKGiBpLDAOWFJuQb1t0iR4+ml4442sKzEzq6xyumhOAc4DfifpiXTdFcAMSceRdL+sAS4GiIjl\nkuYDy4FdwCVV01xvw6BBcMIJ8MtfwulVO6DTzKzz6vZO1lJf/3ry95/+Kds6zMza4ztZu8Dz0phZ\nHrkFT9L/fuihsGFD0mVjZlaN3ILvgv32g6OPhiVVeynYzKzzHPApz0tjZnnjgE+5H97M8sZ98Kkt\nW2D0aNi4MZnCwMys2rgPvosaGuA974Hf/jbrSszMKsMBX8LdNGaWJw74Er7QamZ54j74Ehs2wLhx\nST98375ZV2Nmtjf3wXfDIYfAyJGwbFnWlZiZdZ8DvpXJk91NY2b54IBvxRdazSwv3Affyosvwvve\nB6+8An18+jOzKuI++G4aOTIZE798edaVmJl1jwO+DVOmuJvGzGqfA74NvtBqZnnggG9Dy4XWKr1M\nYGZWFgd8G8aMgf79YdWqrCsxM+u6DgNe0mhJD0t6WtJTkr6Urh8iaZGklZIWSmoo2We2pFWSVkiq\nuZ+yljxc0sxqXzkt+J3AlyPiaOCDwF9LmgDMAhZFxHjgofQ5kiYC04GJwBnAdZJq7puCL7SaWa3r\nMHgjYl1ELE2X3wCeAUYCU4E56WZzgLPT5WnAvIjYGRFrgdXApArX3eN8odXMal2nWtaSxgDHA48B\nwyJiffrSemBYujwCaCrZrYnkhFBTxo+HN9+E55/PuhIzs67pV+6GkvYDbgMui4jXpT03U0VESGpv\nzMnbXmtsbHxruVAoUCgUyi2lV7T0wy9eDBdckHU1ZlaPisUixWKxy/uXNVWBpP7A/wD3RcS16boV\nQCEi1kkaDjwcEUdJmgUQEVel290PXBkRj5W8X9VOVVDqe9+DpUvh+uuzrsTMrAemKlDSVL8BWN4S\n7qkFwMx0eSZwZ8n6cyUNkDQWGAcsKbegauIfADGzWtZhC17Sh4GfA79jT1fLbJLQng8cBqwFPhsR\nW9J9rgAuBHaRdOk80Oo9a6IF39yczBH/5JMwYkTW1ZhZvetsC96zSXbg7LNhxgyYPj3rSsys3nk2\nyQrzcEkzq1UO+A74jlYzq1XuounArl1w0EHw3HNw8MFZV2Nm9cxdNBXWrx+cfDI88kjWlZiZdY4D\nvgyel8bMapEDvgy+0Gpmtch98GXYsSPph29qggMOyLoaM6tX7oPvAQMGwKRJ8OijWVdiZlY+B3yZ\nPFzSzGqNA75MvtBqZrXGffBl2r49mZdm/Xp45zuzrsbM6pH74HvIoEFw/PHwv/+bdSVmZuVxwHeC\nh0uaWS1xwHeCL7SaWS1xH3wnvP46DB8Or74KAwdmXY2Z1Rv3wfegd70LJk6EJTX5+1RmVm8c8J3k\n4ZJmVisc8J3kC61mVivcB99JmzfDYYfBpk3Qv3/W1ZhZPal4H7ykGyWtl/RkybpGSU2Snkgfnyx5\nbbakVZJWSDq984dQ3Q48EI44Ah5/POtKzMzaV04XzY+AM1qtC+CaiDg+fdwHIGkiMB2YmO5znaTc\ndQO5m8bMakGH4RsRjwCb23ipra8J04B5EbEzItYCq4FJ3aqwCvlCq5nVgu60ri+VtEzSDZIa0nUj\ngKaSbZqAkd34jKp06qnJ1MG7d2ddiZnZvvXr4n7fB76ZLv8j8B3gon1s2+bV1MbGxreWC4UChUKh\ni6X0vqFDYcQI+N3vkvlpzMx6QrFYpFgsdnn/skbRSBoD3B0Rx7b3mqRZABFxVfra/cCVEfFYq31q\ndhRNiy9+EY46Cv7mb7KuxMzqRa/cySppeMnTTwMtI2wWAOdKGiBpLDAOyOV9n56XxsyqXYddNJLm\nAVOAgyW9AFwJFCQdR9L9sga4GCAilkuaDywHdgGX1HxTfR8mT4bLLoMIUNn
nUzOz3uMbnbrhiCNg\nwQI4+uisKzGzeuDJxnqRh0uaWTVzwHeDb3gys2rmgO+GlhZ8DnqbzCyHHPDdMGYM9O0Lzz2XdSVm\nZm/ngO8Gyd00Zla9HPDd5AutZlatHPDd5Ba8mVUrB3w3HXkkbN8Ozz+fdSVmZntzwHdTSz/8I49k\nXYmZ2d4c8BXgbhozq0YO+ArwhVYzq0YO+Ao45hjYsAFefjnrSszM9nDAV0CfPsmvPLkf3syqiQO+\nQjw/vJlVGwd8hUyZ4gutZlZdPB98hezaBQcdBL//ffLXzKzSPB98Rvr1gw99yP3wZlY9HPAV5OGS\nZlZNHPAV5AutZlZNOgx4STdKWi/pyZJ1QyQtkrRS0kJJDSWvzZa0StIKSaf3VOHV6AMfgGefhdde\ny7oSM7PyWvA/As5otW4WsCgixgMPpc+RNBGYDkxM97lOUt18SxgwIAl5t+LNrBp0GL4R8QiwudXq\nqcCcdHkOcHa6PA2YFxE7I2ItsBqYVJlSa8PFF8OFF8J110Fzc9bVmFk962rrelhErE+X1wPD0uUR\nQFPJdk3AyC5+Rk2aPj1pwd98c3LRdcWKrCsys3rVr7tvEBEhqb1B7W2+1tjY+NZyoVCgUCh0t5Sq\nMWFCMlzyuuvgwx+Gv/1b+MpXoH//rCszs1pSLBYpFotd3r+sG50kjQHujohj0+crgEJErJM0HHg4\nIo6SNAsgIq5Kt7sfuDIiHmv1frm70Wlfnn8evvjFZCKyG26AE0/MuiIzq1W9daPTAmBmujwTuLNk\n/bmSBkgaC4wDlnTxM3Lh3e+Ge++Fv/s7OPNMuPxy2LYt66rMrB6UM0xyHvBL4EhJL0j6S+Aq4OOS\nVgIfTZ8TEcuB+cBy4D7gkrppqrdDgvPOgyefhBdegPe9D7rxrcvMrCyeiyYDd98Nl1wCn/wkXH01\nNDR0vI+ZmeeiqQGf+hQ89RT07Zv8WMhdd2VdkZnlkVvwGVu8GL7wBTj+ePi3f4Nhwzrex8zqk1vw\nNWbKFFi2DMaOhfe+F266CXzuM7NKcAu+ijz+OFx0EQwdCj/4AYwZk3VFZlZN3IKvYSecAEuWwEc+\nAu9/P3z3u7B7d9ZVmVmtcgu+Sj37bNI3v3NncoPUxIlZV2RmWXMLPieOPDIZKz9zZtJP/81vwo4d\nWVdlZrXEAV/F+vRJpjl4/PGk6+bEE5O/ZmblcMDXgNGjk5ujrrgCpk5NJi/bujXrqsys2jnga4QE\nM2YkN0i98goceyw8+GDWVZlZNfNF1hp1771J981pp8Ff/RUcfjgcckhyIjCzfOrsRVYHfA17/XX4\nh39IfmDk97+HP/0pCfrDD09unGpZPvzwZEz9oEFZV2xm3eGAr2OvvQZr1iRhX/pYsyaZl/6gg/YO\n/dKTwKGHJhd1zax6OeCtTbt3w0svvT38Wx6vv5608ktPAKUngne+M+sjMDMHvHXJG2/A2rVth/+a\nNbD//nsC/6MfhbPPTr4RmFnvccBbxTU3w7p1SdCvXJlc4F24EE46Cc45Jwn7oUOzrtIs/xzw1iu2\nboX77oNbb4X770/m0TnnHPj0p2H48KyrM8snB7z1uu3b4YEHkrC/555kjP4558Cf/zmMGpV1dWb5\n0asBL2kt8EdgN7AzIiZJGgLcArwbWAt8NiK2tNrPAZ9Tb74JixYlYb9gAUyYkIT9Zz4Dhx2WdXVm\nta23A34NcGJEbCpZdzXwakRcLemrwIERMavVfg74OrBjBzz0UBL2d90FRxyxJ+wPPzzr6sxqTxYB\n//6I2FiybgUwJSLWSzoUKEbEUa32c8DXmZ07k9kxb70V7rgjmV/nnHOSx7hxWVdnVht6O+B/D7xG\n0kXzg4j4T0mbI+LA9HUBm1qel+zngK9ju3bBI48kYX/bbcnv0LaE/YQJWVdnVr16O+CHR8TLkg4B\nFgGXAgtKA13SpogY0mo/B7wByQ1Yv/j
FnrBvaIC/+Isk7I8+2nPrmJXqbMD3686HRcTL6d8Nku4A\nJgHrJR0aEeskDQdeaWvfxsbGt5YLhQKFQqE7pViN6tsXJk9OHtdeC7/6VRL2Z54Jgwcnwy6POAIO\nPHDvR0NDcvOVp1ewPCsWixSLxS7v3+UWvKTBQN+IeF3SO4GFwD8ApwEbI+JfJM0CGnyR1TorAn79\n62QkzksvwebNex5btiR/t21LQr51+LecANpa3/I44IDk5GJWS3qti0bSWOCO9Gk/4OaI+Od0mOR8\n4DA8TNJ60K5de8K+dfi3frRe/8c/wn77vT34hwxJpl0eOvTtj4MPhn7d+s5r1j2+0cmsDM3Nyeyb\nrcN/0yZ49dXkR1VaPzZuTFr+paG/r5PB0KHJtwhfQ7BKcsCb9ZDdu5OTQFvh39Zj27b2TwAtj5Ej\nk5FE7jKyjjjgzarEm2/Chg3tnwTWrUuuMWzatCfsR41K/pY+WtYNHpz1UVmWHPBmNWjHDnj5ZXjx\nxb0fTU17Px806O2h3/pEcPDB7hrKKwe8WU5FJC391qHf+kSwdWsyo2dbJ4IRI5J5/A86KLmg3L9/\n1kdlneGAN6tz27cn3T5tnQheeim5WLxpU3I9YfDgJOhbQr/0sa/1++/vbwhZccCbWVmam5Phohs3\n7nls2rT387bWbd+eDCkt52QwZEgymqihIRmW6hvTuscBb2Y9aseOpPXf0YmgZd1rryVDUbdtg3e9\na0/gt34ccED76/ff3yONHPBmVpV27Uq+MWzZkjxagr+9R+k2r7+efAto74Rw5JEwbVqyXR454M0s\nl3bvTkJ+XyeALVtgyZJk8rpp02DmTCgU8tUt5IA3s7q2bh3MnQtz5iRdSeefn4T9+PFZV9Z9Dngz\ns9SyZUnQz50LY8cmQT99enKRuBY54M3MWtm5ExYuTML+gQfg9NOTsP/EJ2rrXgAHvJlZOzZvhvnz\nk7B/7jn43OeSsD/uuKwr65gD3sysTKtWwU03JY+GBrjgAvj85+HQQ7OurG0OeDOzTmpuhsWLk6C/\n8044+eQk7KdNg4EDs65uDwe8mVk3bN0Kt9+ehP3jjye/D3zBBUnoZz1FgwPezKxCXngBbr456a/f\ntSsJ+vPPhzFjsqnHAW9mVmEtvxE8Zw7ccgscc0xyYfaUU/a06ktb9z21btQoB7yZWY958024554k\n7JcvT9aVxlnLck+se/nlKgh4SWcA1wJ9gesj4l9ave6ANzPrpM520VR8lgZJfYHvAWcAE4EZkiZU\n+nOqWbFYzLqEHuXjq115PjbI//F1Vk9MwzMJWB0RayNiJ/BTYFoPfE7Vyvt/Mh9f7crzsUH+j6+z\neiLgRwIvlDxvSteZmVkv6omAd+e6mVkVqPhFVkkfBBoj4oz0+WygufRCqySfBMzMuiDTUTSS+gHP\nAh8DXgKWADMi4pmKfpCZmbWrX6XfMCJ2Sfq/wAMkwyRvcLibmfW+TG50MjOzntfrv1Yo6QxJKySt\nkvTV3v78niRptKSHJT0t6SlJX8q6pkqT1FfSE5LuzrqWSpPUIOlWSc9IWp5eT8oNSV9O/18+KWmu\npHdkXVN3SLpR0npJT5asGyJpkaSVkhZKasiyxu7Yx/F9O/3/uUzS7ZIOaO89ejXg6+AmqJ3AlyPi\naOCDwF/n7PgALgOWk8/RUt8F7o2ICcB7gdx0LUoaCVwKnBgRx5J0n56bbVXd9iOSLCk1C1gUEeOB\nh9Lntaqt41sIHB0R7wNWArPbe4PebsHn+iaoiFgXEUvT5TdIAmJEtlVVjqRRwJnA9UDGE6dWVtoS\nOjUiboTkWlJEvJZxWZXWDxicDoQYDLyYcT3dEhGPAJtbrZ4KzEmX5wBn92pRFdTW8UXEoohoTp8+\nBoxq7z16O+Dr5iYoSWOA40n+EfLiX4GvAM0dbViDxgIbJP1I0uOS/lPS4KyLqpSIeBH4DvAHktFt\nWyL
iwWyr6hHDImJ9urweGJZlMT3sQuDe9jbo7YDP49f6t5G0H3ArcFnakq95ks4CXomIJ8hZ6z3V\nDzgBuC4iTgC2Uttf7/ci6UCS1u0Ykm+V+0n6fKZF9bB0RsNcZo6krwE7ImJue9v1dsC/CIwueT6a\npBWfG5L6A7cBP4mIO7Oup4JOBqZKWgPMAz4q6aaMa6qkJqApIn6dPr+VJPDz4jRgTURsjIhdwO0k\n/6Z5s17SoQCShgOvZFxPxUn6PyRdpR2eoHs74H8DjJM0RtIAYDqwoJdr6DGSBNwALI+Ia7Oup5Ii\n4oqIGB0RY0kuzv0sIi7Iuq5KiYh1wAuSxqerTgOezrCkSnse+KCkQen/09NILpbnzQJgZro8E8hT\nI6tlKvavANMi4k8dbd+rAZ+2HFpugloO3JKzm6BOAc4DPpIOJXwi/QfJozx+9b0UuFnSMpJRNN/K\nuJ6KiYglJN9KHgd+l67+YXYVdZ+kecAvgSMlvSDpL4GrgI9LWgl8NH1ek9o4vguB/wfsByxK8+W6\ndt/DNzqZmeVTr9/oZGZmvcMBb2aWUw54M7OccsCbmeWUA97MLKcc8GZmOeWANzPLKQe8mVlO/X+i\nm7pnlRHacQAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline \n", "# We want to rank all attributes, and the best ones will be selected later\n", "selector = SelectKBest(f_regression, k=\"all\")\n", "selector.fit(X_train, y_train)\n", "sorted_attributes = np.argsort(-selector.scores_)\n", "sorted_scores = np.sort(-selector.scores_)\n", "for index,element in enumerate(zip(sorted_attributes, sorted_scores)):\n", " print element\n", " if index>10: break\n", " \n", "plt.plot(-sorted_scores)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It seems that the first three attributes are the most correlated with the label. Let's see what happens if we select only the best three attributes." ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Select the first 3 best attributes\n", "X_train_new = X_train[:, sorted_attributes[0:3]]\n", "X_test_new = X_test[:, sorted_attributes[0:3]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that the error is not too different, even though we have removed most of the attributes." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fit and score on the reduced matrices (3 best-ranked attributes) so that this MSE\n", "# actually reflects attribute selection rather than the full attribute set.\n", "# NOTE: the tree is created without random_state, so the MSE may vary between runs.\n", "clf = tree.DecisionTreeRegressor()\n", "clf = clf.fit(X_train_new, y_train)\n", "y_test_pred = clf.predict(X_test_new)\n", "print metrics.mean_squared_error(y_test, y_test_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we are going to construct a method which is a combination or a sequence (a pipeline, in fact) of an attribute selector + a decision tree regressor. clf is therefore the pipeline (a sequence of attribute selection + regression algorithm). The number of attributes to be selected is a hyper-parameter of clf. max_depth is also a hyper-parameter of clf. We can use grid search in order to tune both parameters." ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 0.2s\n", "[Parallel(n_jobs=1)]: Done 199 tasks | elapsed: 1.0s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n", "Wall time: 1.51 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed: 1.4s finished\n" ] } ], "source": [ "from sklearn.pipeline import Pipeline\n", "\n", "# Candidate values: k = 1..number of attributes, max_depth = 1..4\n", "param_grid = {'feature_selection__k': np.arange(X_train.shape[1])+1,\n", " 'regression__max_depth': np.arange(4)+1}\n", "\n", "clf = Pipeline([\n", " ('feature_selection', SelectKBest(f_regression)),\n", " ('regression', tree.DecisionTreeRegressor())\n", "])\n", "\n", "clf_grid = GridSearchCV(clf, \n", " param_grid,\n", " scoring='mean_squared_error',\n", " cv=5 , n_jobs=1, verbose=1)\n", "# NOTE: the 5-fold search is fit on the full X, y, not only on X_train\n", "%time _ = clf_grid.fit(X,y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's see the best hyper-parameters. 
It seems that for this case, 3 attributes should be selected" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'feature_selection__k': 3, 'regression__max_depth': 3} -30.311835006\n" ] }, { "data": { "text/plain": [ "[mean: -30.31184, std: 16.85403, params: {'feature_selection__k': 3, 'regression__max_depth': 3},\n", " mean: -30.39138, std: 17.51927, params: {'feature_selection__k': 2, 'regression__max_depth': 3},\n", " mean: -30.43885, std: 18.08344, params: {'feature_selection__k': 3, 'regression__max_depth': 4},\n", " mean: -31.41281, std: 15.55386, params: {'feature_selection__k': 7, 'regression__max_depth': 3},\n", " mean: -31.66207, std: 15.95840, params: {'feature_selection__k': 5, 'regression__max_depth': 3},\n", " mean: -32.31163, std: 15.17629, params: {'feature_selection__k': 6, 'regression__max_depth': 3},\n", " mean: -32.63583, std: 15.19363, params: {'feature_selection__k': 4, 'regression__max_depth': 3},\n", " mean: -32.84554, std: 21.35249, params: {'feature_selection__k': 2, 'regression__max_depth': 4},\n", " mean: -33.93441, std: 11.92595, params: {'feature_selection__k': 7, 'regression__max_depth': 4},\n", " mean: -36.48472, std: 13.54363, params: {'feature_selection__k': 13, 'regression__max_depth': 3}]" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print clf_grid.best_params_, clf_grid.best_score_" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[mean: -30.31184, std: 16.85403, params: {'feature_selection__k': 3, 'regression__max_depth': 3},\n", " mean: -30.39138, std: 17.51927, params: {'feature_selection__k': 2, 'regression__max_depth': 3},\n", " mean: -30.43885, std: 18.08344, params: {'feature_selection__k': 3, 'regression__max_depth': 4},\n", " mean: -31.41281, std: 15.55386, params: 
{'feature_selection__k': 7, 'regression__max_depth': 3},\n", " mean: -31.66207, std: 15.95840, params: {'feature_selection__k': 5, 'regression__max_depth': 3},\n", " mean: -32.31163, std: 15.17629, params: {'feature_selection__k': 6, 'regression__max_depth': 3},\n", " mean: -32.63583, std: 15.19363, params: {'feature_selection__k': 4, 'regression__max_depth': 3},\n", " mean: -32.84554, std: 21.35249, params: {'feature_selection__k': 2, 'regression__max_depth': 4},\n", " mean: -33.93441, std: 11.92595, params: {'feature_selection__k': 7, 'regression__max_depth': 4},\n", " mean: -36.48472, std: 13.54363, params: {'feature_selection__k': 13, 'regression__max_depth': 3}]" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rank the grid results by mean validation score (index 1 of each entry), best first,\n", "# without reordering clf_grid.grid_scores_ itself; show the top 10.\n", "ranked_scores = sorted(clf_grid.grid_scores_, key=lambda x: -x[1])\n", "ranked_scores[0:10]" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "# USING PCA FOR TRANSFORMING ATTRIBUTES WITH TRAIN / TEST EVALUATION" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import decomposition\n", "from sklearn import datasets" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(150L, 4L)\n" ] } ], "source": [ "iris = datasets.load_iris()\n", "X = iris.data\n", "y = iris.target\n", "print X.shape\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=33)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's use the maximum number of PCA components for the moment (the 4 iris attributes allow at most 4 components)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "PCA(copy=True, n_components=4, whiten=False)" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"pca = decomposition.PCA(n_components=4)\n", "pca.fit(X_train)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's see how much variance explains each of the four components. We can see that the first component explaines most of the variance 92%" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.92337895, 0.05717063, 0.01542177, 0.00402865])" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.explained_variance_ratio_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's compute the cummulative variance explained by n components. It seems that with 2 components we can already explain more than 95% of the variance. Using that criterion, we should use 2 components." ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEACAYAAABfxaZOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH9NJREFUeJzt3Xu0VXW99/H3xy3kLZG8Cxx5zsk8gKbWeJBSczXiFGlG\ncR6Hh5OXAJFUshrjeY7Bqdx5S0+jcaIgJWMLJoq3MDQNPUeXd1Fii6IbgpRCULFUFNEDG77PH3Nu\n2642e619WXuuy+c1xh7sOedvzfWdTfvy4zd/3/lTRGBmZvVhl6wDMDOzvuOkb2ZWR5z0zczqiJO+\nmVkdcdI3M6sjTvpmZnWkaNKX1CTpFUnPdNLmJ5JWS1ou6Zh2+8dIWpkeu7C3gjYzs+4ppad/LTBm\nZwclnQR8OCIOA84Brkr3NwAz088OB8ZLGtbjiM3MrNuKJv2IeAh4vZMmXwTmpW2XAPtIOggYCayJ\niLURsQ1YAIztechmZtZdvTGmPwhY1277xXTfITvZb2ZmGemtB7nqpfOYmVkZ7doL51gPDGm3PZik\nV9+vYP+QdP/7SPLLf8zMuiEiutzh7o2kvwiYCiyQNAp4IyJekfQX4DBJQ4ENwGnA+I5OUMsvfWts\nbKSxsTHrMMrG11fdavn6+vLa/ud/4OWXYcOGv/689NL7tzdsgLffhoMPhkMO+etPR9sDB4KKpHMV\na7ATRZO+pBuBE4H9JK0DLiLpxRMRsyPiLkknSVoDvA1MSI+1SpoKLAYagDkR0dKtKM3MMrB1a5LM\nCxN44fabb8JBB/1t8j7xxPfv+9CHiifzciua9COiw955QZupO9l/N3B3N+IyMyubbdvglVc67o23\n3/fGG3DAAX+bzI877v379t0XdqmSUtfeGN6xTuRyuaxDKCtfX3WrtetrbYWNG5OE3b9/jtmzO07m\nr70G++33/sR9yCEwatT7t/fbDxoasr6q3qWsx9MlRdYxmFll274dXn1152Plbfv+/OdkCKUwmReO\nmx9wQP
Unc0ndepDrpG9mmdmxI0nUnT38fOmlpPc+cODfJu/ChH7ggbBrnYxfOOmbWcWIgL/8pfOH\nny+9lDwkHTCgtGTev3/WV1VZnPTNrOwi4PXXi09NfPll2GuvjodW2u876CD4wAeyvqrq5KRvZt0W\nAZs2dT6TpW179907n2Pe9uduu2V9VbXNSd/MStLaCnfdBbfcAmvX/jWh9++/82Tetu/gg2GPPbK+\nAgMnfTMrYtUqaGqC666Dv/97OOMMGD78r8l8r72yjtC6ortJv06ec5vVp82b4dZbYc4cWL06SfT3\n3QfDvLJF3XJP36zGRMCSJUmiv/VWOP54mDQJTj4Z+vXLOjrrLe7pm9W5jRvhl79MhnC2bYOJE+HZ\nZ5PxeLM2TvpmVay1FRYvTnr1990HX/oSXHUVnHBC9i/2ssrkpG9WhdasgWuvhblzYciQpFc/dy7s\nvXfWkVmlc9I3qxJbtsBttyW9+ueeg9NPT3r5RxyRdWRWTfwg16yCRcDSpUmiv/nm5C2QkybBKaf4\ntQT1zg9yzWrIn/8M11+fPJR9++1k+Obpp2Hw4Kwjs2rnnr5Zhdi+He69N0n099yT9OYnTkxWX6qW\nBTqs77gi16xKvfDCXx/KHnhgkujHj4d99sk6MqtkHt4xqyLvvAMLFyZj9U8/Df/6r3DHHXDUUVlH\nZrWu6D8aJY2RtFLSakkXdnB8oKSFkpZLWiJpRLtj35D0jKQVkr7R28GbVZtly+D885Ox+XnzYMoU\nePFFmDHDCd/6Rqc9fUkNwExgNLAeeFLSoohoaddsOrAsIr4s6XBgFjBa0hHA2cD/BrYBv5V0Z0T8\noRwXYlapXnsN5s9Pxupffz0Zvlm2DA49NOvIrB4V6+mPBNZExNqI2AYsAMYWtBkG3A8QEauAoZIO\nSPcviYh3I2I78AAwrlejN6tQO3YkD2XHj0/eaPnYY/DDH8Lzz8P3vueEb9kpNqY/CFjXbvtF4NiC\nNstJkvnDkkYCh6afewa4VNKHgHeBk4EneiNos0r1xz8mD2SvvTZZoHviRJg1K/ndrBIUS/qlTKu5\nApghqZkk0TcD2yNipaQrgXuAt9P9O3oSrFklevdd+PWvk4eyv/td0rtfuBCOOSbryMz+VrGkvx4Y\n0m57CElv/z0R8RYwsW1b0gvA8+mxJqAp3X858KeOvqSxsfG933O5HLlcrsTwzbKzfHmS6G+4AY4+\nOqmUXbTIywRaeeTzefL5fI/P0+k8fUm7AquAzwAbSIZnxrd/kCtpAPBORGyVNBk4LiK+mh47ICI2\nSvo7YDFwbES8WfAdnqdvVeONN5Ik39SUvMp4woTkZ+jQrCOzelOWefoR0SppKknCbgDmRESLpCnp\n8dnAcGCupABWAJPaneJWSfuSzN45rzDhm1WDHTsgn08S/Z13wuc+B5dfDp/5DDQ0ZB2dWde4Itds\nJ9atS+bSX3st7LlnMnxz+umw775ZR2bmilyzXrF1azIuP2dOsuTgaafBTTfBxz/uRUmsNjjpmwEr\nViSJfv58GDEi6dXfdhvssUfWkZn1Lid9q1ubNsGCBclY/fr18NWvJkVU//APWUdmVj4e07e6EgEP\nPpgk+l//GkaPTnr1n/2sH8padfGrlc06sWFD8lC2qSlZcWrSJDjjDNh//6wjM+seP8g1K7BtWzLF\ncs4ceOQROPXUZDWqkSP9UNbql5O+1ZyWliTR//KXcPjhSa/+ppuSaZdm9c5J32rCW28lib2pCdau\nhbPOgocego98JOvIzCqLx/StakUkwzZNTckLznK55K2Wn/887OrujNU4P8i1uvHyy3DddUmyh2T4\n5swzk/VlzeqFH+RaTdu2De6+Oxmrf/BBGDcuSfqf+IQfypp1hZO+VbRVq5Lkft11yQpUkyYlVbN7\n7ZV1ZGbVyUnfKs7mzXDLLUmvfs2aZOjmvvtg2LCsIzOrfh7Tt4oQAY8/
nvTqb70VTjgheSh78snQ\nr1/W0ZlVHo/pW1XauDGZTz9nDrS2Jon+2WfhkEOyjsysNjnpW59rbYXFi5NEf9998KUvwezZcPzx\nfihrVm4e3rE+s2ZNMnwzbx4MGZI8lD3tNNh776wjM6s+Ht6xirRlSzJG39QEzz2XvOTsnnuSd9ab\nWd9zT996XQQ8+WSS6G++OZlLP3EinHJK8oZLM+u57vb0dynhxGMkrZS0WtKFHRwfKGmhpOWSlkga\n0e7YtyStkPSMpBskfaCrAVr1+POf4cc/ho9+FMaPT4Zwnn4afvMb+Od/dsI3qwSd9vQlNQCrgNHA\neuBJYHxEtLRr80PgzYi4RNLhwKyIGC1pEPAQMCwi/kfSTcBdETGv4Dvc069i27fDvfcmD2XvvTfp\nzU+aBJ/6FOxStEthZt1VrjH9kcCaiFibfskCYCzQ0q7NMOAKgIhYJWmopLalKXYF9pC0HdiD5C8O\nqxE33gj/9m9w0EFJor/mGthnn6yjMrPOFEv6g4B17bZfBI4taLMcGAc8LGkkcCgwOCKaJf0I+BPw\nDrA4Iv6rd8K2rD37LFxwQTJ0M3Jk1tGYWamKJf1Sxl2uAGZIagaeAZqB7ZIGAl8EhgKbgFskfSUi\n5heeoLGx8b3fc7kcuVyulNgtI1u3wumnww9+4IRv1lfy+Tz5fL7H5yk2pj8KaIyIMen2NGBHRFzZ\nyWdeAI4EPg98LiLOTvefAYyKiPML2ntMv8p8+9vJ6lS33+5iKrOslGtMfylwmKShwAbgNGB8wRcP\nAN6JiK2SJgMPRMRmSX8ERknaHXiX5GHwE10N0CrLgw8mb7x86iknfLNq1GnSj4hWSVOBxUADMCci\nWiRNSY/PBoYDcyUFsAKYlB57QtKtwDKgNf3z52W7Eiu7TZuSN15ecw0ccEDW0ZhZd7g4y0p25pmw\nxx5w9dVZR2Jmfg2DldUttySvPm5uzjoSM+sJ9/StqPXr4WMfgzvu8Gwds0pRttcwWH3bsQMmTIDz\nz3fCN6sFTvrWqZkz4a23YPr0rCMxs97g4R3bqeeegxNPhMcegw9/OOtozKw9D+9Yr9q6Fb7yFbj8\ncid8s1rinr51aNq0pKfvqluzyuQpm9ZrHnwQ5s6F5cud8M1qjYd37H02bYKzznLVrVmt8vCOvc9Z\nZ8Huu7vq1qzSeXjHeuzWW5OZOq66Natd7ukbABs2wDHHuOrWrFp4yqZ1W1vV7XnnOeGb1TonfWPW\nrOQB7r//e9aRmFm5eXinzrVV3T76KBx2WNbRmFmpPLxjXda21u1llznhm9UL9/Tr2LRpsGIFLFrk\nIiyzauMpm9YlDz2UVN16rVuz+uLhnTr05pvJ0oc//zkceGDW0ZhZXyqa9CWNkbRS0mpJF3ZwfKCk\nhZKWS1oiaUS6/3BJze1+Nkm6oBwXYV1zwQXw2c/CKadkHYmZ9bVOh3ckNQAzgdHAeuBJSYsioqVd\ns+nAsoj4sqTDgVnA6IhYBRyTnmeX9PMLy3AN1gW33QaPPOKqW7N6VaynPxJYExFrI2IbsAAYW9Bm\nGHA/QJroh0rav6DNaOAPEbGuF2K2btqwISnAuv562GuvrKMxsywUS/qDgPaJ+sV0X3vLgXEAkkYC\nhwKDC9r8C3BD98O0noqAiRPh3HPh2GOzjsbMslJs9k4pcymvAGZIagaeAZqB7W0HJfUHTgH+5nlA\nm8bGxvd+z+Vy5HK5Er7WumLWLHj9dVfdmlWrfD5PPp/v8Xk6nacvaRTQGBFj0u1pwI6IuLKTz7wA\nHBkRm9PtscC5befooL3n6ZdZSwt86lOuujWrJeWqyF0KHCZpaNpjPw1YVPDFA9JjSJoMPNCW8FPj\ngRu7Gpj1jra1bi+91AnfzIoM70REq6SpwGKgAZgTES2SpqTHZwPDgbmSAlgBTGr7vKQ9SR7iTi5T\n/FZEYyMMGgTnnJN1JGZWCfwahhr2
8MNw6qlJ1a2LsMxqi1+4Zu/TVnU7e7YTvpn9lXv6NWrCBOjX\nL3nVgpnVHr9wzd7zq18lQzuuujWzQu7p15iXXkrWur39dhg1KutozKxcPKZv71XdTpnihG9mHXPS\nryE/+xn85S/wne9kHYmZVSoP79SIlhY44YSk6vYjH8k6GjMrNw/v1LG2tW4vvdQJ38w656RfA77/\nfTj44GQs38ysM56yWeUeeQSamrzWrZmVxj39Kvbmm3DGGa66NbPS+UFuFZs4ERoa4Jprso7EzPqa\nK3LrzK9+BQ8+mAzrmJmVyj39KtRWdbtwIXziE1lHY2ZZ8JTNOtFWdXvOOU74ZtZ1TvpVpq3q9rvf\nzToSM6tGHt6pIitXwvHHJ9M0Dz8862jMLEse3qlxbVW3l1zihG9m3eekXyUuvjiZi/+1r2UdiZlV\ns6JJX9IYSSslrZZ0YQfHB0paKGm5pCWSRrQ7to+kWyW1SHpOkl/42w2PPgpz5iQ/rro1s57oNOlL\nagBmAmOA4cB4ScMKmk0HlkXEUcCZwIx2x2YAd0XEMOCjQEtvBV4v3norqbq9+mo46KCsozGzales\npz8SWBMRayNiG7AAGFvQZhhwP0BErAKGStpf0gDghIhoSo+1RsSm3g2/9n3zm/DpT8PYwv/Vzcy6\noVhF7iBgXbvtF4FjC9osB8YBD0saCRwKDAYCeFXStcBRwO+Ab0TElt4IvB7cfjs88IDXujWz3lMs\n6Zcyl/IKYIakZuAZoBnYDvQHPgZMjYgnJf0Y+DbwvcITNDY2vvd7Lpcjl8uVEntNe+ml5KHtwoXw\nwQ9mHY2ZZS2fz5PP53t8nk7n6acPXhsjYky6PQ3YERFXdvKZF4Ajgb2AxyLif6X7jwe+HRFfKGjv\nefoFIuDkk+HjH0+maJqZFSrXPP2lwGGShkrqD5wGLCr44gHpMSRNBh6IiM0R8TKwTlLbWk6jgWe7\nGmA9uuoqePVV+N7f/JvIzKxnOh3eiYhWSVOBxUADMCciWiRNSY/PJpnVM1dSACuASe1O8XVgfvqX\nwh+ACWW4hpqyalWS7B95BPr1yzoaM6s1fg1DBdm2DT75SZgwAc47L+tozKyS+TUMNeDii2H//eHc\nc7OOxMxqlRdRqRCPPpqsgOW1bs2snNzTrwCuujWzvuIx/Qpw9tnJNM05c7KOxMyqhdfIrVK33w73\n3++1bs2sb7inn6GXX4ajj04WOf/kJ7OOxsyqiWfvVJm2tW7PPtsJ38z6jpN+Rq6+GjZuhIsuyjoS\nM6snHt7JwKpVcNxx8PDD8I//mHU0ZlaNPLxTJbZtS9a6vfhiJ3wz63tO+n3skktcdWtm2fGUzT70\n2GPw858ni6K46tbMsuCefh/ZvDmpur3qKjj44KyjMbN65Qe5fWTyZNi+HZqaso7EzGqBK3Ir2O23\nw3//NyxfnnUkZlbv3NMvs7aq29tuS6Zpmpn1hu729J30yygCvvCFJOlfdlnW0ZhZLfE8/Qo0e3bS\n03fVrZlVCvf0y+T3v0/eqfPQQzBsWNbRmFmtKVtPX9IYSSslrZZ0YQfHB0paKGm5pCWSRrQ7tlbS\n05KaJT3R1eCqVVvV7fe/74RvZpWl09k7khqAmcBoYD3wpKRFEdHSrtl0YFlEfFnS4cCstD1AALmI\neK33Q69cl14K++7rxc3NrPIU6+mPBNZExNqI2AYsAMYWtBkG3A8QEauAoZL2b3e8rmpPH388Gctv\nanLVrZlVnmJJfxCwrt32i+m+9pYD4wAkjQQOBQanxwL4L0lLJU3uebiVbfPmZFjnZz9z1a2ZVaZi\nxVmlPGG9ApghqRl4BmgGtqfHjo+IDWnP/15JKyPiocITNDY2vvd7Lpcjl8uV8LWV51vfghNOgHHj\nso7EzGpNPp8nn8/3+Dydzt6RNApojIgx6fY0YEdEXNnJZ14AjoyIzQX7LwI2R8SPCvbXxOydRYvg\n
m99M1rrde++sozGzWleu2TtLgcMkDZXUHzgNWFTwxQPSY6RDOA9ExGZJe0j6YLp/T+CzJP8SqDmv\nvAJTpsB11znhm1ll63R4JyJaJU0FFgMNwJyIaJE0JT0+GxgOzJUUwApgUvrxA4GFSp5m7grMj4h7\nynMZ2YmASZNgwgQ4/visozEz65yLs3po9uzkHfmPPQb9+2cdjZnVC797JwO//33yErUHH3QRlpn1\nLb97p49t25YsinLRRU74ZlY9nPS76bLLYOBAOP/8rCMxMyudF1Hphscfh6uvhmXLXHVrZtXFPf0u\naqu6nTULDjkk62jMzLrGD3K76JxzYOtWmDs360jMrJ55jdw+sGgR3Huv17o1s+rlpF+itqrbm292\n1a2ZVS8P75QgAr74RTjiCPjBD7KOxszMwztldc01sH493HZb1pGYmfWMe/pFrF6drHXrqlszqySu\nyC2D1tak6vZ733PCN7Pa4KTficsugwEDXHVrZrXDY/o7sWRJsuxhczPs4r8azaxGOJ11wFW3Zlar\n/CC3A1OmwLvvwrx5WUdiZtYxT9nsJXfcAffck6x1a2ZWa9zTb2fjRjjqqKTq9oQTso7GzGznvHJW\nD0XA2LEwfDhccUXW0ZiZda5s8/QljZG0UtJqSRd2cHygpIWSlktaImlEwfEGSc2S7uhqcH3pF7+A\ndevg4ouzjsTMrHw6TfqSGoCZwBhgODBeUmGZ0nRgWUQcBZwJzCg4/g3gOSD77vxOrFkD06fD/Ple\n3NzMaluxnv5IYE1ErI2IbcACYGxBm2HA/QARsQoYKml/AEmDgZOAXwAVucZUa2syPfO7302GdszM\nalmxpD8IWNdu+8V0X3vLgXEAkkYChwKD02P/Cfw/YEePIy2Tyy5LXpU8dWrWkZiZlV+xKZulDMlc\nAcyQ1Aw8AzQDOyR9AdgYEc2Scp2doLGx8b3fc7kcuVynzXtNW9XtsmWuujWzypbP58nn8z0+T6ez\ndySNAhojYky6PQ3YERFXdvKZF4CPAtOAM4BWYDdgb+C2iDizoH0ms3fefhuOOSbp6Z96ap9/vZlZ\nj5RlyqakXYFVwGeADcATwPiIaGnXZgDwTkRslTQZOC4ivlpwnhOB/xsRp3TwHZkk/a99DbZsgeuu\n6/OvNjPrsbJU5EZEq6SpwGKgAZgTES2SpqTHZ5PM6pkrKYAVwKSdna6rwZXLnXfCb3/rtW7NrP7U\nXXHWxo1w9NGwYAF86lN99rVmZr3KFbkliIAvfSlZEMVVt2ZWzfzCtRLMmQN/+hPcckvWkZiZZaNu\nevpr1sAnPgH5PIwYUbS5mVlF8xq5nWiruv3Od5zwzay+1UXSv/xy+OAH4etfzzoSM7Ns1fyY/hNP\nwMyZXuvWzAxqvKf/9tvJsM7MmTCo8I1BZmZ1qKYf5J57brLI+S9/WZbTm5llxlM2C/zmN3D33a66\nNTNrryaT/quvwuTJcOONMGBA1tGYmVWOmhveiYAvfxkOPxyu3Om7QM3MqpuHd1JNTbB2Ldx0U9aR\nmJlVnprq6a9ZA6NGwQMPuAjLzGpb3VfktrbCGWe46tbMrDM1k/R/8APYc0+44IKsIzEzq1w1Mab/\n5JPw0596rVszs2KqPkW2Vd3+9KcweHDW0ZiZVbaqf5B73nnw5ptw/fW9GJSZWYWryymbd92VVN66\n6tbMrDRFh3ckjZG0UtJqSRd2cHygpIWSlktaImlEun+3dPspSSskNfZm4K++CmefDfPmwT779OaZ\nzcxqV6fDO5IagFXAaGA98CQwPiJa2rX5IfBmRFwi6XBgVkSMTo/tERFbJO0KPAx8IyKWFHxHl4d3\nImDcODjsMPiP/+jSR83MakK55umPBNZExNqI2AYsAMYWtBkG3A8QEauAoZL2T7e3pG36A/2AHV0N\nsCNNTfDCC3DJJb1xNjOz+lEs6Q8C1rXbfjHd195yYByApJHAoc
DgdLtB0lPAK8A9EfFkTwP+wx/g\n299OHtx+4AM9PZuZWX0p9iC3lHGXK4AZkpqBZ4BmYDtARGwHjpY0AFgoaUREPFt4gsbGxvd+z+Vy\n5HK5Dr+orep2+nQ44ogSIjMzqxH5fJ58Pt/j8xQb0x8FNEbEmHR7GrAjInb6/kpJLwBHRsTmgv3f\nBbZExI8K9pc8pn/ppZDPwz33uAjLzOpbucb0lwKHSRoqqT9wGrCo4IsHpMeQNBl4ICI2S9pP0j7p\n/t2BfwJa6KalS+EnP4G5c53wzcy6q9PhnYholTQVWAw0AHMiokXSlPT4bGA4MFdSACuASenHDwbm\npTOAdgFuioi7uhPkli1J1e1PfuKqWzOznqiKitzzz4c33oD58/soKDOzClezFbl33w133umqWzOz\n3lDRSb+t6nb+fFfdmpn1hood3mmruv3wh+GHP8wgMDOzClZzwzvXXgvPPw8LFmQdiZlZ7ajInv7z\nz8Oxx8J998GRR2YUmJlZBauZNXLbqm6nTXPCNzPrbRWX9K+8EnbbDb75zawjMTOrPRU1vLN0KZx0\nEvzudzBkSKZhmZlVtKof3mmrup0xwwnfzKxcKqanP3UqvPYa3HBDpuGYmVWFqp6yeffdcMcd8NRT\nWUdiZlbbKqKnf8ghwfXXw6c/nWkoZmZVo6rH9MePd8I3M+sLFdHTf/fd8NKHZmZdUNU9fSd8M7O+\nURFJ38zM+oaTvplZHXHSNzOrIyUlfUljJK2UtFrShR0cHyhpoaTlkpZIGpHuHyLpfknPSloh6YLe\nvgAzMytd0aSfLmw+ExhDsgj6eEnDCppNB5ZFxFHAmcCMdP824FsRMQIYBZzfwWdrWj6fzzqEsvL1\nVbdavr5avraeKKWnPxJYExFrI2IbsAAYW9BmGHA/QESsAoZK2j8iXo6Ip9L9m4EW4JBei74K1Pp/\neL6+6lbL11fL19YTpST9QcC6dtsvpvvaWw6MA5A0EjgUGNy+gaShwDHAku6FamZmPVVK0i+leusK\nYB9JzcBUoBnY3nZQ0l7ArcA30h6/mZlloGhFrqRRQGNEjEm3pwE7IuLKTj7zAnBkRGyW1A+4E7g7\nIn7cQdtsS4LNzKpUud6yuRQ4LB2e2QCcBoxv30DSAOCdiNgqaTLwQJrwBcwBnuso4Xc3aDMz656i\nST8iWiVNBRYDDcCciGiRNCU9PptkVs/ctNe+ApiUfvw44HTg6XToB2BaRPy2l6/DzMxKkPkL18zM\nrO/0WUVusQKvtM1P0uPLJR3TV7H1hhIK2HKSNklqTn++k0Wc3SGpSdIrkp7ppE0137tOr6/K711J\nBZLVev9Kub4qv3+7pQWvT6XX17iTdqXfv4go+w/JsNAaYCjQD3gKGFbQ5iTgrvT3Y4HH+yK2Pry+\nHLAo61i7eX0nkEy3fWYnx6v23pV4fdV87w4Cjk5/3wtYVWP/3yvl+qr2/qXx75H+uSvwOHBsT+5f\nX/X0Synw+iIwDyAilpBMAT2wj+LrqVKuD6AqH1pHxEPA6500qeZ7V8r1QfXeu1IKJKv2/pV4fVCl\n9w8gIrakv/Yn6VTuKGjSpfvXV0m/lAKvjtoMpjqUcn0BfDL959ddkob3WXTlV833rhQ1ce86KZCs\nifvXyfVV9f2TtIukp4BXgHsi4smCJl26f321MHqpT4sL/zaulqfMpcS5DBgSEVskfR64HfhIecPq\nU9V670pR9feuhALJqr5/Ra6vqu9fROwAjk6nxi+UNCIini1oVvL966ue/npgSLvtISR/G3XWZnC6\nrxoUvb6IeKvtn2kRcTfQT9KH+i7Esqrme1dUtd+7tEDyNuD6iLi9gyZVff+KXV+13782EbGJ5B1n\nYwoOden+9VXSf6/AS1J/kgKvRQVtFpG8obOtCviNiHilj+LrqaLXJ+nAtFit7f1EiojX+j7Usqjm\ne1dUNd+7UgokqeL7V8r1Vf
n920/SPunvuwP/RPLcor0u3b8+Gd6JEgq8IuIuSSdJWgO8DUzoi9h6\nQynXB/wf4FxJrcAW4F8yC7iLJN0InAjsJ2kdcBHJA6Wqv3dQ/Pqo4ntHxwWS04G/g5q4f0Wvj+q+\nfwcD85S84n4X4Kb0fnU7d7o4y8ysjni5RDOzOuKkb2ZWR5z0zczqiJO+mVkdcdI3M6sjTvpmZnXE\nSd/MrI446ZuZ1ZH/D31yW4BOuVFJAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "accumulated_variance = np.cumsum(pca.explained_variance_ratio_)\n", "plt.plot(accumulated_variance)\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Therefore, let's compute 2 PCA components and apply them to train and test. We can see that the new input attributes (X_train_new and X_test_new have 2 new attributes) " ] }, { "cell_type": "code", "execution_count": 125, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pca = decomposition.PCA(n_components=2)\n", "pca.fit(X_train)\n", "X_train_new = pca.transform(X_train)\n", "X_test_new = pca.transform(X_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we can apply a classifier to the new, reduced, training set, and test it on the transformed test set" ] }, { "cell_type": "code", "execution_count": 130, "metadata": { "collapsed": false }, "outputs": [], "source": [ "clf = tree.DecisionTreeClassifier()\n", "clf = clf.fit(X_train_new, y_train)" ] }, { "cell_type": "code", "execution_count": 136, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n", "0.916666666667\n" ] } ], "source": [ "y_train_pred = clf.predict(X_train_new)\n", "y_test_pred = clf.predict(X_test_new)\n", "print metrics.accuracy_score(y_train, y_train_pred)\n", "print metrics.accuracy_score(y_test, y_test_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we construct the tree with the original dataset (before applying PCA), we see that test accuracy is larger with 4 attributes than with 2 PCA components. 
So in this case, PCA would not be useful from an accuracy point of view, but it would be useful to reduce the complexity of the model (with PCA we have only 2 components instead of the 4 original attributes)." ] }, { "cell_type": "code", "execution_count": 138, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n", "0.95\n" ] } ], "source": [ "clf = tree.DecisionTreeClassifier()\n", "clf = clf.fit(X_train, y_train)\n", "y_train_pred = clf.predict(X_train)\n", "y_test_pred = clf.predict(X_test)\n", "print metrics.accuracy_score(y_train, y_train_pred)\n", "print metrics.accuracy_score(y_test, y_test_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }