{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:11:37.705189Z", "start_time": "2020-04-02T11:11:30.145843Z" } }, "outputs": [], "source": [ "!pip install mlxtend" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our first step is to download a piece of text from Wikipedia and to parse paragraphs." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:10:11.306643Z", "start_time": "2020-04-02T11:10:00.060331Z" } }, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "import requests\n", "\n", "respond = requests.get(\"https://en.wikipedia.org/wiki/PoznaƄ\")\n", "soup = BeautifulSoup(respond.text, \"lxml\")\n", "page = soup.find_all('p')\n", "\n", "raw_text = [paragraph.text for paragraph in page]\n", "\n", "print(raw_text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we will split the text into paragraphs and remove the lines with less than 3 words." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:10:16.661948Z", "start_time": "2020-04-02T11:10:16.656744Z" } }, "outputs": [], "source": [ "text = [ line.split() for line in raw_text if len(line) > 2 ]\n", "\n", "print(text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our text still contains a lot of stop-words and some additional tokens such as 1.2, [2], etc. We will use the `nltk` library to remove the stop-words and we'll transform everything to alpha tokens." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:16.278000Z", "start_time": "2020-04-02T11:13:15.736564Z" } }, "outputs": [], "source": [ "import nltk\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:43.369515Z", "start_time": "2020-04-02T11:13:42.955115Z" } }, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "\n", "clean_text = [\n", " [ \n", " word.lower() \n", " for word \n", " in line \n", " if word.isalpha() \n", " and word.lower() not in stopwords.words('english') \n", " ]\n", " for line \n", " in text\n", "]\n", "\n", "clean_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to transform the list of lists into the format suitable for association rule mining, i.e., to transform the input lists into boolean flags." 
{ "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:47.075080Z", "start_time": "2020-04-02T11:13:47.070912Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from mlxtend.preprocessing import TransactionEncoder\n", "from mlxtend.frequent_patterns import apriori\n", "\n", "te = TransactionEncoder()\n", "te_array = te.fit(clean_text).transform(clean_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:49.234374Z", "start_time": "2020-04-02T11:13:49.229600Z" } }, "outputs": [], "source": [ "# te_array contains the boolean version of the input data\n", "\n", "te_array" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:50.711973Z", "start_time": "2020-04-02T11:13:50.705627Z" } }, "outputs": [], "source": [ "# one row per paragraph, one column per distinct token\n", "\n", "te_array.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:13:53.448413Z", "start_time": "2020-04-02T11:13:53.435857Z" } }, "outputs": [], "source": [ "# the original tokens are preserved in the columns_ attribute\n", "\n", "te.columns_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `mlxtend` package expects the input data to be stored as a `pandas.DataFrame`." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:14:00.447256Z", "start_time": "2020-04-02T11:14:00.425140Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(te_array, columns=te.columns_)\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to find frequent itemsets, i.e., collections of words that often occur together in a paragraph." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:14:05.369752Z", "start_time": "2020-04-02T11:14:05.350421Z" } }, "outputs": [], "source": [ "# keep itemsets that appear in at least 5% of the paragraphs\n", "frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)\n", "\n", "frequent_itemsets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also mine association rules, which come with additional measures of quality and interestingness such as support, confidence, and lift." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:15:03.216830Z", "start_time": "2020-04-02T11:15:03.153412Z" } }, "outputs": [], "source": [ "from mlxtend.frequent_patterns import association_rules\n", "??association_rules" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:15:12.752527Z", "start_time": "2020-04-02T11:15:12.717567Z" } }, "outputs": [], "source": [ "from mlxtend.frequent_patterns import association_rules\n", "\n", "association_rules(frequent_itemsets,\n", "                  metric='confidence',\n", "                  min_threshold=0.7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:15:22.534455Z", "start_time": "2020-04-02T11:15:22.496652Z" } }, "outputs": [], "source": [ "association_rules(frequent_itemsets, metric='lift', min_threshold=5.0)" ] },
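{ "cell_type": "markdown", "metadata": {}, "source": [ "To make these measures concrete: for a rule A → B, confidence equals support(A and B) divided by support(A), and lift equals confidence divided by support(B). The cell below is a small sanity-check sketch of these relationships; it assumes the standard support columns returned by `association_rules` and that at least one rule passes the 0.7 confidence threshold, and the name `rules_check` is used only for this illustration." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a sanity-check sketch: recompute confidence and lift of the first rule\n", "# from the support columns returned by association_rules\n", "rules_check = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)\n", "\n", "if len(rules_check) > 0:\n", "    first = rules_check.iloc[0]\n", "    print(first['antecedents'], '->', first['consequents'])\n", "    # confidence(A -> B) = support(A and B) / support(A)\n", "    print(first['confidence'], first['support'] / first['antecedent support'])\n", "    # lift(A -> B) = confidence(A -> B) / support(B)\n", "    print(first['lift'], first['confidence'] / first['consequent support'])" ] },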
{ "cell_type": "markdown", "metadata": {}, "source": [ "Both frequent itemsets and association rules (antecedents and consequents) are returned as `frozenset`s, so we can use [standard API calls](https://docs.python.org/3/library/stdtypes.html#set-types-set-frozenset) to find subsets, supersets, etc." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-04-02T11:15:32.431239Z", "start_time": "2020-04-02T11:15:32.414726Z" } }, "outputs": [], "source": [ "rules = association_rules(frequent_itemsets, metric=\"confidence\", min_threshold=0.5)\n", "\n", "# select rules whose antecedent contains both 'capital' and 'polish'\n", "capital_idx = rules['antecedents'].apply(lambda x: x.issuperset({'capital', 'polish'}))\n", "rules[capital_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }