{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# chemdiab dataset example\n",
    "\n",
    "Data from the\n",
    "[locfit R package](https://cran.r-project.org/web/packages/locfit/locfit.pdf).\n",
    "\n",
    "Numeric variables are rw, fpg, ga, ina and sspg. The class label cc is the diabetic type.\n",
    "\n",
    "Originally from **Reaven, G. M. and Miller, R. G. (1979). An attempt to define the nature of chemical diabetes using a multidimensional analysis. Diabetologia 16, 17-24.**\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import data\n",
    "import numpy as np\n",
    "import inspect\n",
    "from os import path\n",
    "\n",
    "# Record layout: five float64 measurements followed by the class label,\n",
    "# kept as a byte string of at most 20 characters. names=True below makes\n",
    "# genfromtxt read the field names from the header line of the file.\n",
    "dtype = [(\"rw\",np.float64),(\"fpg\",np.float64),(\"ga\",np.float64),\n",
    "         (\"ina\",np.float64),('sspg', np.float64),('cc', 'S20')]\n",
    "\n",
    "# The data file is looked up relative to the current working directory;\n",
    "# the absolute path makes a missing-file error easier to diagnose.\n",
    "data_file = path.abspath(\"chemdiab.tab\")\n",
    "chemdiab = np.genfromtxt(data_file, names=True,\n",
    "                         dtype=dtype)\n",
    "\n",
    "# get only numeric columns as a plain (n_samples, 5) float64 array.\n",
    "# The fields are stacked column by column instead of reinterpreting the\n",
    "# structured array's memory with .view(): on NumPy 1.16+ multi-field\n",
    "# indexing returns a view that keeps the full record layout, so the\n",
    "# float64 reinterpretation no longer works there.\n",
    "nam_cols = list(chemdiab.dtype.names[0:5])\n",
    "num_cols = np.column_stack([chemdiab[name] for name in nam_cols])\n",
    "\n",
    "# cat: sorted unique class labels; ind: index into cat for every row\n",
    "cat, ind = np.unique(chemdiab[chemdiab.dtype.names[5]],\n",
    "                     return_inverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# clean and visualize data\n",
    "from sklearn.preprocessing import StandardScaler, FunctionTransformer\n",
    "from cartographer.filterers import KernelDensityFilterer\n",
    "try:\n",
    "    from seaborn import plt\n",
    "except ImportError:\n",
    "    # newer seaborn releases no longer re-export pyplot\n",
    "    import matplotlib.pyplot as plt\n",
    "\n",
    "# standardize every numeric column before plotting and mapping\n",
    "std_scaler = StandardScaler()\n",
    "scaled = std_scaler.fit_transform(num_cols)\n",
    "\n",
    "# 2x3 grid: one histogram per scaled variable, the sixth panel shows\n",
    "# the distribution of the integer class codes\n",
    "fig, axs = plt.subplots(2,3, figsize=(16,8))\n",
    "flat_axs = axs.flatten()\n",
    "for i, name in enumerate(nam_cols):\n",
    "    flat_axs[i].hist(scaled[:,i])\n",
    "    flat_axs[i].set_title(name)\n",
    "\n",
    "flat_axs[5].hist(ind)\n",
    "flat_axs[5].set_title(\"category\")\n",
    "\n",
    "fig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from cartographer.mapper import Mapper\n",
    "from cartographer.coverers import HyperRectangleCoverer\n",
    "from sklearn.cluster import DBSCAN\n",
    "from cartographer.visualization import html_graph\n",
    "from IPython.display import HTML\n",
    "\n",
    "# configure Mapper with its three components (coverer, filterer,\n",
    "# clusterer) and fit it on the standardized data\n",
    "m = Mapper(coverer=HyperRectangleCoverer(intervals=10, overlap=0.5),\n",
    "           filterer=KernelDensityFilterer(bandwidth=1.0),\n",
    "           clusterer=DBSCAN(min_samples=5,eps=2.))\n",
    "m.fit(scaled)\n",
    "\n",
    "# render the fitted graph as HTML, handing over the class codes (\"ind\")\n",
    "# and the filter values (\"kde\")\n",
    "# NOTE(review): confirm how html_graph uses these two dictionaries\n",
    "HTML(html_graph(m, {\"ind\": ind}, {\"kde\": m.filterer.transform(scaled)}))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}