jup 2 and 3
This commit is contained in:
1847
claude/datasaurus.csv
Normal file
1847
claude/datasaurus.csv
Normal file
File diff suppressed because it is too large
Load Diff
9903
claude/dataset_A_indie_game_telemetry_clean.csv
Normal file
9903
claude/dataset_A_indie_game_telemetry_clean.csv
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
865
claude/lab02_task1_datasets.ipynb
Normal file
865
claude/lab02_task1_datasets.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -39,7 +39,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -249,7 +249,7 @@
|
||||
"4 Yes "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -259,6 +259,7 @@
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"from ydata_profiling import ProfileReport\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
@@ -270,7 +271,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -308,6 +309,80 @@
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a9827626",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "8bca7ad18a13487ba5853443b29dbd90",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 20/20 [00:03<00:00, 5.91it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "771cb3df4e4946bca1717279b4a6c0ca",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "394f98b346ea44d49127b1657140d9b0",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Render HTML: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5868640bd17244c3ada054ba2adfbd58",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Export report to file: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"profile = ProfileReport(df, title=\"ProfileReport\").to_file(\"report.html\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -331,7 +406,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "bd10cd653e7a47f891552a79e946376c",
|
||||
"model_id": "289eb8e084bc4ca5bd4bfbe3d95f511b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@@ -354,7 +429,7 @@
|
||||
"source": [
|
||||
"# Generate the SweetViz report\n",
|
||||
"# This may take 30–60 seconds\n",
|
||||
"report = sv.analyze(df_raw)\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html('sweetviz_raw_report.html')\n",
|
||||
"\n",
|
||||
"print('Report saved as sweetviz_raw_report.html — open it in your browser.')"
|
||||
@@ -389,16 +464,33 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 4,
|
||||
"id": "89d0471a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2026-02-22 20:12:55,619 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
},
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<StringDtype(storage='python', na_value=<NA>)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df['input_method'] = df['input_method'].astype('string')\n",
|
||||
"df['purchase_amount'] = df['purchase_amount'].astype('string')\n",
|
||||
"\n",
|
||||
"df['input_method'].dtype"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@@ -409,82 +501,8 @@
|
||||
],
|
||||
"source": [
|
||||
"# Launch D-Tale with the raw dataset\n",
|
||||
"# A link will appear — click it to open D-Tale in a new browser ta\n",
|
||||
"d = dtale.show(df_raw, host='127.0.0.1', subprocess=False, open_browser=True)\n",
|
||||
"print(\"Open D-Tale at:\", d._url) # lists all running instances\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c2e5293",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "bad operand type for abs(): 'str'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[16]\u001b[39m\u001b[32m, line 21\u001b[39m\n\u001b[32m 18\u001b[39m \tstr_data = pd.to_numeric(s, errors=\u001b[33m'\u001b[39m\u001b[33mcoerce\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 19\u001b[39m pd.Series(str_data, name=\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m, index=s.index)\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m df[\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpurchase_amount\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:1722\u001b[39m, in \u001b[36mNDFrame.abs\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1654\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 1655\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mabs\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Self:\n\u001b[32m 1656\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1657\u001b[39m \u001b[33;03m Return a Series/DataFrame with absolute numeric value of each element.\u001b[39;00m\n\u001b[32m 1658\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 1720\u001b[39m \u001b[33;03m 3 7 40 -50\u001b[39;00m\n\u001b[32m 1721\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1722\u001b[39m res_mgr = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_mgr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1723\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(\n\u001b[32m 1724\u001b[39m \u001b[38;5;28mself\u001b[39m, name=\u001b[33m\"\u001b[39m\u001b[33mabs\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1725\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:361\u001b[39m, in \u001b[36mBaseBlockManager.apply\u001b[39m\u001b[34m(self, f, align_keys, **kwargs)\u001b[39m\n\u001b[32m 358\u001b[39m kwargs[k] = obj[b.mgr_locs.indexer]\n\u001b[32m 360\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(f):\n\u001b[32m--> \u001b[39m\u001b[32m361\u001b[39m applied = \u001b[43mb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 363\u001b[39m applied = \u001b[38;5;28mgetattr\u001b[39m(b, f)(**kwargs)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:395\u001b[39m, in \u001b[36mBlock.apply\u001b[39m\u001b[34m(self, func, **kwargs)\u001b[39m\n\u001b[32m 389\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 390\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mapply\u001b[39m(\u001b[38;5;28mself\u001b[39m, func, **kwargs) -> \u001b[38;5;28mlist\u001b[39m[Block]:\n\u001b[32m 391\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 392\u001b[39m \u001b[33;03m apply the function to my values; return a block if we are not\u001b[39;00m\n\u001b[32m 393\u001b[39m \u001b[33;03m one\u001b[39;00m\n\u001b[32m 394\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m395\u001b[39m result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 397\u001b[39m result = maybe_coerce_values(result)\n\u001b[32m 398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._split_op_result(result)\n",
|
||||
"\u001b[31mTypeError\u001b[39m: bad operand type for abs(): 'str'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):\n",
|
||||
"\tdf = df.to_frame(index=False)\n",
|
||||
"\n",
|
||||
"# remove any pre-existing indices for ease of use in the D-Tale code, but this is not required\n",
|
||||
"df = df.reset_index().drop('index', axis=1, errors='ignore')\n",
|
||||
"df.columns = [str(c) for c in df.columns] # update columns to strings in case they are numbers\n",
|
||||
"\n",
|
||||
"df['purchase_amount'] = df['purchase_amount'].str.replace(',', '.', case=False, regex='False')\n",
|
||||
"df['purchase_amount'] = s = df['purchase_amount'] \n",
|
||||
"\n",
|
||||
"if s.str.startswith('0x').any():\n",
|
||||
"\tstr_data = s.apply(float.fromhex)\n",
|
||||
"else:\n",
|
||||
"\tstr_data = pd.to_numeric(s, errors='coerce')\n",
|
||||
"\t\n",
|
||||
"pd.Series(str_data, name='purchase_amount', index=s.index)\n",
|
||||
"\n",
|
||||
"df['purchase_amount'] = df['purchase_amount'].abs()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "8180fa05",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2026-02-22 20:18:35,563 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Open D-Tale at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Launch D-Tale with the raw dataset\n",
|
||||
"# A link will appear — click it to open D-Tale in a new browser ta\n",
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=False, open_browser=True)\n",
|
||||
"# A link will appear — click it to open D-Tale in a new browser tab\n",
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=True, open_browser=False)\n",
|
||||
"print(\"Open D-Tale at:\", d._url) # URL of this D-Tale instance\n"
|
||||
]
|
||||
},
|
||||
@@ -498,13 +516,15 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" TCP 169.254.62.24:40000 0.0.0.0:0 LISTENING 11972\n",
|
||||
" TCP 127.0.0.1:40000 0.0.0.0:0 LISTENING 50108\n",
|
||||
" TCP 127.0.0.1:55125 127.0.0.1:40000 TIME_WAIT 0\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check if something else is already on port 40000\n",
|
||||
"# Check what is listening on the port and which IP it is bound to — useful for debugging\n",
|
||||
"# odd cases such as D-Tale binding to a Docker IP\n",
|
||||
"import subprocess\n",
|
||||
"result = subprocess.run('netstat -ano | findstr :40000', shell=True, capture_output=True, text=True)\n",
|
||||
"print(result.stdout or \"Nothing on port 40000\")"
|
||||
|
||||
711
claude/lab02_task2_telemetry_v2.ipynb
Normal file
711
claude/lab02_task2_telemetry_v2.ipynb
Normal file
@@ -0,0 +1,711 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 · Task 2 — Guided EDA and Data Cleaning\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~50 minutes \n",
|
||||
"**Dataset:** `dataset_A_indie_game_telemetry_v2.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Objectives\n",
|
||||
"\n",
|
||||
"By the end of this task you will be able to:\n",
|
||||
"- Use **SweetViz** to rapidly profile a dataset and identify issues\n",
|
||||
"- Use **D-Tale** to navigate and inspect a dataframe interactively\n",
|
||||
"- Use **pandas** to fix the most common categories of data quality problems\n",
|
||||
"- Make and justify cleaning decisions rather than applying fixes mechanically\n",
|
||||
"\n",
|
||||
"### Tools and their roles in this task\n",
|
||||
"\n",
|
||||
"| Tool | Role |\n",
|
||||
"|---|---|\n",
|
||||
"| **SweetViz** | Automated profiling — generate a report, triage what needs fixing |\n",
|
||||
"| **D-Tale** | Interactive navigation — browse rows, inspect value counts, confirm fixes visually |\n",
|
||||
"| **pandas** | All actual cleaning — every transformation is explicit, reproducible code |\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 — Setup and First Look"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry_v2.csv')\n",
|
||||
"\n",
|
||||
"print(f'Shape: {df.shape}')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Column names and types as pandas inferred them\n",
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ Notice:** Several columns that should be boolean (`crash_flag`, `is_featured_event`, `is_long_session`) or\n",
|
||||
"> numeric (`purchase_amount`) have been inferred as `object`. This is your first signal that something is wrong.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 — Automated Profiling with SweetViz\n",
|
||||
"\n",
|
||||
"SweetViz generates a visual report for the entire dataset in one call. Think of it as a **triage tool** — it shows you *where* to look; the actual investigation and fixing happens afterwards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate the profiling report (~30–60 seconds)\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html('sweetviz_raw_report.html', open_browser=True)\n",
|
||||
"print('Report saved. Open sweetviz_raw_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Open the report and answer the following before moving on.\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be boolean or numeric? | *...* |\n",
|
||||
"| Are there numeric columns with suspicious ranges? | *...* |\n",
|
||||
"| How many distinct values does `region` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `purchase_amount`? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 — Navigate and Inspect with D-Tale\n",
|
||||
"\n",
|
||||
"Before writing any cleaning code, use D-Tale to browse the raw data and *see* the problems with your own eyes. You will not clean anything here — D-Tale is your inspection tool.\n",
|
||||
"\n",
|
||||
"**Launch D-Tale:**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('=' * 50)\n",
|
||||
"print('D-Tale is running.')\n",
|
||||
"print('Open this URL in your browser:', d._url)\n",
|
||||
"print('In VS Code: Ctrl+click the URL above.')\n",
|
||||
"print('=' * 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspection checklist\n",
|
||||
"\n",
|
||||
"Use D-Tale to confirm each issue SweetViz flagged. For each column, click the column header → **Describe** to see value counts and distribution.\n",
|
||||
"\n",
|
||||
"| What to inspect | How to do it in D-Tale | What you should see |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` unique values | Column header → Describe | 8 variants of True/False |\n",
|
||||
"| `region` unique values | Column header → Describe | ~32 variants of 5 region names |\n",
|
||||
"| `input_method` unique values | Column header → Describe | A typo: `controllr` |\n",
|
||||
"| `purchase_amount` raw values | Sort column ascending | Some values use comma: `1,80` |\n",
|
||||
"| `avg_fps` distribution | Column header → Describe | Max of 10,000 — clearly wrong |\n",
|
||||
"| Missing values overview | Top menu → Describe (all columns) | `gpu_model` dominates |\n",
|
||||
"\n",
|
||||
"> Once you have seen the problems in the raw data, come back to the notebook for cleaning.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 — Clean with Pandas\n",
|
||||
"\n",
|
||||
"We will work through seven issue categories. Each section follows the same pattern:\n",
|
||||
"1. **Inspect** — confirm the problem in code\n",
|
||||
"2. **Fix** — apply the pandas transformation\n",
|
||||
"3. **Verify** — check the result\n",
|
||||
"\n",
|
||||
"We work on a copy of the original dataframe so the raw data is always available for comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Always work on a copy — keep df as the unchanged original\n",
|
||||
"df_clean = df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.1 — Boolean columns: inconsistent encoding\n",
|
||||
"\n",
|
||||
"Three columns (`crash_flag`, `is_featured_event`, `is_long_session`) each have **8 different representations** of the same two values: `True`, `False`, `true`, `false`, `1`, `0`, `Yes`, `No`.\n",
|
||||
"\n",
|
||||
"The fix is to define an explicit mapping and apply it with `.map()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — confirm the problem\n",
|
||||
"print('crash_flag unique values:', sorted(df_clean['crash_flag'].dropna().unique()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define the mapping for replacements\n",
|
||||
"# Question: why does the mapping also include True: True and False: False?\n",
|
||||
"\n",
|
||||
"bool_map = {\n",
|
||||
" 'True': True, 'true': True, '1': True, 'Yes': True, True: True,\n",
|
||||
" 'False': False, 'false': False, '0': False, 'No': False, False: False\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"df_clean['crash_flag'] = df_clean['crash_flag'].map(bool_map)\n",
|
||||
"\n",
|
||||
"print('crash_flag after mapping:')\n",
|
||||
"print(df_clean['crash_flag'].value_counts())\n",
|
||||
"print('Nulls:', df_clean['crash_flag'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply the same mapping to the other two boolean columns\n",
|
||||
"# Follow the same pattern as above for is_featured_event and is_long_session\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.2 — Categorical columns: case and whitespace inconsistency\n",
|
||||
"\n",
|
||||
"Four columns have values that are logically identical but differ in case or surrounding whitespace:\n",
|
||||
"- `region` — 32 variants of 5 values (e.g. `us-west`, `US-WEST`, `Us-west`, `' us-west '`)\n",
|
||||
"- `map_name` — 36 variants of 6 values\n",
|
||||
"- `platform` — 32 variants of 6 values\n",
|
||||
"- `input_method` — 30 variants, including a **typo**: `controllr`\n",
|
||||
"\n",
|
||||
"The fix uses pandas string methods: `.str.strip()` removes surrounding whitespace, `.str.lower()` normalises case. They can be chained."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many unique values before cleaning?\n",
|
||||
"print('region unique before:', df_clean['region'].unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix region: strip whitespace and convert to lowercase\n",
|
||||
"df_clean['region'] = df_clean['region'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('region unique after:', df_clean['region'].unique())\n",
|
||||
"print(df_clean['region'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply the same strip + lower to map_name and platform\n",
|
||||
"# Follow the same pattern as above\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# input_method needs an extra step: fix the typo and standardise kb/m → kbm\n",
|
||||
"\n",
|
||||
"# Step 0: Inspect\n",
|
||||
"print('input_method unique before:', df_clean['input_method'].unique())\n",
|
||||
"\n",
|
||||
"# Step 1: strip and lowercase first\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Step 2: fix the two inconsistencies with replace()\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].replace({\n",
|
||||
" 'controllr': 'controller', \n",
|
||||
" 'kb/m': 'kbm' \n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# Verify — should now show exactly 3 unique values\n",
|
||||
"print('input_method unique after:', df_clean['input_method'].unique())\n",
|
||||
"print(df_clean['input_method'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.3 — `purchase_amount`: comma as decimal separator\n",
|
||||
"\n",
|
||||
"About 12% of rows use a comma instead of a decimal point (`1,80` instead of `1.80`). This prevented pandas from reading the column as numeric, so it was loaded as `object`.\n",
|
||||
"\n",
|
||||
"The fix: replace the comma in the string, then convert the column type."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows have a comma?\n",
|
||||
"comma_rows = df_clean['purchase_amount'].astype(str).str.contains(',', na=False)\n",
|
||||
"print(f'Rows with comma separator: {comma_rows.sum()}')\n",
|
||||
"print('Examples:', df_clean.loc[comma_rows, 'purchase_amount'].unique()[:6])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: replace comma with decimal point, then convert to float\n",
|
||||
"df_clean['purchase_amount'] = (\n",
|
||||
" df_clean['purchase_amount']\n",
|
||||
" .astype(str) # ensure we are working with strings\n",
|
||||
" .str.replace(',', '.', regex=False) # swap the separator\n",
|
||||
" .replace('nan', float('nan')) # restore actual NaN rows\n",
|
||||
" .astype(float) # convert to numeric\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('dtype:', df_clean['purchase_amount'].dtype)\n",
|
||||
"print(df_clean['purchase_amount'].describe().round(2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.4 — Missing values: decisions and strategy\n",
|
||||
"\n",
|
||||
"Not all missing values are the same. Before deciding what to do, you need to understand *why* the value is missing — the reason determines the correct action.\n",
|
||||
"\n",
|
||||
"| Column | Missing | Why | Decision |\n",
|
||||
"|---|---|---|---|\n",
|
||||
"| `gpu_model` | 66.7% | Console/mobile players have no GPU | Keep column — missingness is meaningful |\n",
|
||||
"| `build_version` | 16.5% | Not logged in older sessions | Keep as NaN — valid historical absence |\n",
|
||||
"| `device_temp_c` | 4.9% | Sensor not available on some devices | Keep as NaN |\n",
|
||||
"| `session_length_s` | 1.0% | Session ended abnormally | Drop these rows — see below |\n",
|
||||
"| `ping_ms`, `purchase_amount`, `end_time` | < 2% | Sporadic gaps | Keep as NaN |\n",
|
||||
"\n",
|
||||
"<br>\n",
|
||||
"\n",
|
||||
"> **Context always matters.** There is no universal rule for missing values. The decisions above are reasonable for this dataset and analytical goal -- but a different context (e.g. building a machine learning model) might lead to different choices."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — missing value counts across all columns\n",
|
||||
"missing = df_clean.isnull().sum()\n",
|
||||
"missing_pct = (missing / len(df_clean) * 100).round(1)\n",
|
||||
"pd.DataFrame({'missing': missing, '%': missing_pct})[missing > 0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# session_length_s: drop rows where it is missing\n",
|
||||
"# Rationale: session duration is a core metric — a session with no recorded\n",
|
||||
"# duration is structurally incomplete and cannot be used for most analyses.\n",
|
||||
"# These 98 rows represent <1% of the dataset, so dropping is safe.\n",
|
||||
"\n",
|
||||
"rows_before = len(df_clean)\n",
|
||||
"df_clean = df_clean.dropna(subset=['session_length_s'])\n",
|
||||
"\n",
|
||||
"print(f'Rows dropped: {rows_before - len(df_clean)}')\n",
|
||||
"print(f'Rows remaining: {len(df_clean)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.5 — Outliers: `avg_fps`\n",
|
||||
"\n",
|
||||
"The `avg_fps` column has a maximum of 10,000 fps — physically impossible for a game running in real time. The 75th percentile is ~82 fps, confirming that 10,000 is a logging error, not an extreme but plausible value.\n",
|
||||
"\n",
|
||||
"**Decision:** set values above 300 fps to `NaN` rather than dropping the entire row. The rest of the data in those rows (crash flag, purchase amount, session type) is likely still valid — it would be wasteful to discard it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows are affected?\n",
|
||||
"threshold = 300\n",
|
||||
"outlier_mask = df_clean['avg_fps'] > threshold\n",
|
||||
"print(f'Rows with avg_fps > {threshold}: {outlier_mask.sum()}')\n",
|
||||
"print('\\navg_fps distribution (before fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: set outlier values to NaN using .loc with a boolean mask\n",
|
||||
"df_clean.loc[outlier_mask, 'avg_fps'] = float('nan')\n",
|
||||
"\n",
|
||||
"# Verify — max should now be well below 300\n",
|
||||
"print('avg_fps distribution (after fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.6 — Datetime columns: mixed formats *(guided + optional deep dive)*\n",
|
||||
"\n",
|
||||
"The `start_time` and `end_time` columns contain timestamps in at least four different formats:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"2025-07-18T18:32:00Z ← ISO 8601 with UTC marker\n",
|
||||
"2025-07-18 20:03:21-05:00 ← ISO 8601 with UTC offset\n",
|
||||
"20/10/2025 02:49 ← European DD/MM/YYYY\n",
|
||||
"08/01/2025 06:35 ← Ambiguous: US MM/DD or European?\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Mixed datetime formats are one of the most complex cleaning problems because some ambiguities cannot be resolved automatically — `08/01/2025` could be August 1st or January 8th, and no algorithm can determine which without external context.\n",
|
||||
"\n",
|
||||
"The pragmatic approach is to parse what can be parsed reliably and flag what cannot."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — what does start_time actually look like?\n",
|
||||
"print('Sample values from start_time:')\n",
|
||||
"print(df_clean['start_time'].dropna().sample(8, random_state=42).tolist())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: pd.to_datetime with utc=True normalises all timezone-aware formats to UTC.\n",
|
||||
"# errors='coerce' converts anything it cannot parse to NaT (Not a Time) instead of crashing.\n",
|
||||
"df_clean['start_time'] = pd.to_datetime(df_clean['start_time'], utc=True, errors='coerce')\n",
|
||||
"df_clean['end_time'] = pd.to_datetime(df_clean['end_time'], utc=True, errors='coerce')\n",
|
||||
"\n",
|
||||
"# Verify — check how many rows could not be parsed\n",
|
||||
"print('start_time dtype:', df_clean['start_time'].dtype)\n",
|
||||
"print('Unparsed start_time (NaT):', df_clean['start_time'].isna().sum())\n",
|
||||
"print('Unparsed end_time (NaT): ', df_clean['end_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
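"> **Note:** The NaT counts above include every value pandas could not parse in this single pass, plus any values that were already missing before the conversion. A NaT therefore means \"not parsed\", not \"ambiguous\": pandas never flags a date such as `08/01/2025` as ambiguous — if it parses the value at all, it silently commits to one interpretation. Deciding how to treat these records still requires a domain decision (e.g., knowing that the data source always uses DD/MM/YYYY).\n",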
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"**📌 Optional — explore the unparsed rows**\n",
|
||||
"\n",
|
||||
"If you want to go further, the cells below help you examine which formats failed and attempt a two-pass parsing strategy. This is optional and not required to complete the lab.\n",
|
||||
"\n",
|
||||
"<details>\n",
|
||||
"<summary>Click to expand optional exploration</summary>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Identify the raw values that failed to parse\n",
|
||||
"# We use the index of df_clean to look up the original values in df,\n",
|
||||
"# rather than a boolean mask — the two dataframes have different lengths\n",
|
||||
"# after the dropna() in step 4.4, so a mask built on df_clean cannot index df directly (the index labels themselves are preserved by dropna).\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"print(f'Rows with unparsed start_time: {len(unparsed_idx)}')\n",
|
||||
"print('\\nRaw values that could not be parsed:')\n",
|
||||
"print(df.loc[unparsed_idx, 'start_time'].dropna().unique()[:15])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Two-pass strategy — try a second format for the rows that failed\n",
|
||||
"# If you determine the ambiguous rows use DD/MM/YYYY, try dayfirst=True on them only\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"df_clean.loc[unparsed_idx, 'start_time'] = pd.to_datetime(\n",
|
||||
" df.loc[unparsed_idx, 'start_time'],\n",
|
||||
" dayfirst=True, utc=True, errors='coerce'\n",
|
||||
")\n",
|
||||
"print('NaT after second pass:', df_clean['start_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 — Verify with D-Tale\n",
|
||||
"\n",
|
||||
"Reload the cleaned dataframe into D-Tale and visually confirm the fixes. This is a quick sanity check — you are looking for anything that looks wrong before committing to the cleaned dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shut down the previous D-Tale instance and reload with the clean data\n",
|
||||
"d.kill()\n",
|
||||
"d_clean = dtale.show(df_clean, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('Open cleaned data in D-Tale:', d_clean._url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In D-Tale, verify the following:\n",
|
||||
"\n",
|
||||
"| Column | What to check | Expected result |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` | Describe → value counts | Only `True` and `False` |\n",
|
||||
"| `region` | Describe → value counts | Exactly 5 values, all lowercase |\n",
|
||||
"| `input_method` | Describe → value counts | Exactly 3 values, no `controllr` |\n",
|
||||
"| `purchase_amount` | Describe → dtype and range | float64, no commas |\n",
|
||||
"| `avg_fps` | Describe → max | Below 300 |\n",
|
||||
"| `session_length_s` | Describe → missing count | 0 |\n",
|
||||
"| `start_time` | Describe → dtype | datetime64 |\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 6 — Before vs After with SweetViz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c8f0e03a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Debug\n",
|
||||
"\n",
|
||||
"# Test comparison column by column\n",
|
||||
"# for col in df_clean.columns:\n",
|
||||
"# try:\n",
|
||||
"# sv.compare([df[[col]], 'Raw'], [df_clean[[col]].reset_index(drop=True), 'Cleaned'])\n",
|
||||
"# except Exception as e:\n",
|
||||
"# print(f\"FAIL: {col} — {e}\")\n",
|
||||
"# else:\n",
|
||||
"# print(f\"ok: {col}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exclude = ['start_time', 'end_time'] # excluded: these columns are datetime in df_clean but still text in df, and sweetviz cannot compare a column whose type differs between the two frames\n",
|
||||
"\n",
|
||||
"compare = sv.compare(\n",
|
||||
" [df.drop(columns=exclude), 'Raw'],\n",
|
||||
" [df_clean.drop(columns=exclude).reset_index(drop=True), 'Cleaned']\n",
|
||||
")\n",
|
||||
"compare.show_html('sweetviz_comparison_report.html', open_browser=True)\n",
|
||||
"print('Comparison report saved.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the comparison report, check that:\n",
|
||||
"- Boolean columns changed from TEXT → BOOL with only 2 distinct values\n",
|
||||
"- Categorical columns show dramatically reduced DISTINCT counts\n",
|
||||
"- `purchase_amount` changed from TEXT → NUMERIC\n",
|
||||
"- `avg_fps` maximum is no longer 10,000\n",
|
||||
"- `session_length_s` shows 0 missing\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 7 — Save the Cleaned Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_A_indie_game_telemetry_clean.csv', index=False)\n",
|
||||
"print(f'Saved: {len(df_clean)} rows, {len(df_clean.columns)} columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Key Takeaways\n",
|
||||
"\n",
|
||||
"**Three tools, three roles — they complement each other:**\n",
|
||||
"- **SweetViz** surfaces issues fast but cannot fix them: use it for triage and validation\n",
|
||||
"- **D-Tale** lets you see the data as a human would: use it to understand problems before and after fixing them\n",
|
||||
"- **pandas** is where all actual cleaning happens: explicit, reproducible, and version-controllable\n",
|
||||
"\n",
|
||||
"**Cleaning decisions are not mechanical:**\n",
|
||||
"- Dropping `session_length_s` nulls was justified here: it would not be in every context\n",
|
||||
"- Setting `avg_fps` outliers to NaN (not dropping rows) preserved valid data in other columns\n",
|
||||
"- `gpu_model` missingness is structurally meaningful: imputing it would destroy information\n",
|
||||
"\n",
|
||||
"**Common issue categories you have now fixed with pandas:**\n",
|
||||
"\n",
|
||||
"| Issue | pandas approach |\n",
|
||||
"|---|---|\n",
|
||||
"| Boolean encoding chaos | `.map(bool_map)` |\n",
|
||||
"| Case / whitespace inconsistency | `.str.strip().str.lower()` |\n",
|
||||
"| Typos in categories | `.replace({'controllr': 'controller'})` |\n",
|
||||
"| Wrong decimal separator | `.str.replace(',', '.')` + `.astype(float)` |\n",
|
||||
"| Structural missing values | `dropna(subset=[...])` with explicit rationale |\n",
|
||||
"| Outliers | Boolean mask + `.loc[mask, col] = NaN` |\n",
|
||||
"| Mixed datetime formats | `pd.to_datetime(utc=True, errors='coerce')` |\n",
|
||||
"\n",
|
||||
"→ In **Task 3**, you will apply these skills independently to a new dataset — with a checklist but without step-by-step guidance."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
675
claude/lab02_task2_telemetry_v2b.ipynb
Normal file
675
claude/lab02_task2_telemetry_v2b.ipynb
Normal file
@@ -0,0 +1,675 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5,
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.10.0"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 \u00b7 Task 2 \u2014 Guided EDA and Data Cleaning\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~50 minutes \n",
|
||||
"**Dataset:** `dataset_A_indie_game_telemetry.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Objectives\n",
|
||||
"\n",
|
||||
"By the end of this task you will be able to:\n",
|
||||
"- Use **SweetViz** to rapidly profile a dataset and identify issues\n",
|
||||
"- Use **D-Tale** to navigate and inspect a dataframe interactively\n",
|
||||
"- Use **pandas** to fix the most common categories of data quality problems\n",
|
||||
"- Make and justify cleaning decisions rather than applying fixes mechanically\n",
|
||||
"\n",
|
||||
"### Tools and their roles in this task\n",
|
||||
"\n",
|
||||
"| Tool | Role |\n",
|
||||
"|---|---|\n",
|
||||
"| **SweetViz** | Automated profiling \u2014 generate a report, triage what needs fixing |\n",
|
||||
"| **D-Tale** | Interactive navigation \u2014 browse rows, inspect value counts, confirm fixes visually |\n",
|
||||
"| **pandas** | All actual cleaning \u2014 every transformation is explicit, reproducible code |\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 \u2014 Setup and First Look"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the raw dataset \u2014 do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry.csv')\n",
|
||||
"\n",
|
||||
"print(f'Shape: {df.shape}')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Column names and types as pandas inferred them\n",
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **\u26a0\ufe0f Notice:** Several columns that should be boolean (`crash_flag`, `is_featured_event`, `is_long_session`) or\n",
|
||||
"> numeric (`purchase_amount`) have been inferred as `object`. This is your first signal that something is wrong.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 \u2014 Automated Profiling with SweetViz\n",
|
||||
"\n",
|
||||
"SweetViz generates a visual report for the entire dataset in one call. Think of it as a **triage tool** \u2014 it shows you *where* to look; the actual investigation and fixing happens afterwards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate the profiling report (~30\u201360 seconds)\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html('sweetviz_raw_report.html', open_browser=False)\n",
|
||||
"print('Report saved. Open sweetviz_raw_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Open the report and answer the following before moving on.\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be boolean or numeric? | *...* |\n",
|
||||
"| Are there numeric columns with suspicious ranges? | *...* |\n",
|
||||
"| How many distinct values does `region` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `purchase_amount`? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 \u2014 Navigate and Inspect with D-Tale\n",
|
||||
"\n",
|
||||
"Before writing any cleaning code, use D-Tale to browse the raw data and *see* the problems with your own eyes. You will not clean anything here \u2014 D-Tale is your inspection tool.\n",
|
||||
"\n",
|
||||
"**Launch D-Tale:**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=False, open_browser=False)\n",
|
||||
"print('=' * 50)\n",
|
||||
"print('D-Tale is running.')\n",
|
||||
"print('Open this URL in your browser:', d._url)\n",
|
||||
"print('In VS Code: Ctrl+click the URL above.')\n",
|
||||
"print('=' * 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspection checklist\n",
|
||||
"\n",
|
||||
"Use D-Tale to confirm each issue SweetViz flagged. For each column, click the column header \u2192 **Describe** to see value counts and distribution.\n",
|
||||
"\n",
|
||||
"| What to inspect | How to do it in D-Tale | What you should see |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` unique values | Column header \u2192 Describe | 8 variants of True/False |\n",
|
||||
"| `region` unique values | Column header \u2192 Describe | ~32 variants of 5 region names |\n",
|
||||
"| `input_method` unique values | Column header \u2192 Describe | A typo: `controllr` |\n",
|
||||
"| `purchase_amount` raw values | Sort column ascending | Some values use comma: `1,80` |\n",
|
||||
"| `avg_fps` distribution | Column header \u2192 Describe | Max of 10,000 \u2014 clearly wrong |\n",
|
||||
"| Missing values overview | Top menu \u2192 Describe (all columns) | `gpu_model` dominates |\n",
|
||||
"\n",
|
||||
"> Once you have seen the problems in the raw data, come back to the notebook for cleaning.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 \u2014 Clean with Pandas\n",
|
||||
"\n",
|
||||
"We will work through seven issue categories. Each section follows the same pattern:\n",
|
||||
"1. **Inspect** \u2014 confirm the problem in code\n",
|
||||
"2. **Fix** \u2014 apply the pandas transformation\n",
|
||||
"3. **Verify** \u2014 check the result\n",
|
||||
"\n",
|
||||
"We work on a copy of the original dataframe so the raw data is always available for comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Always work on a copy \u2014 keep df as the unchanged original\n",
|
||||
"df_clean = df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.1 \u2014 Boolean columns: inconsistent encoding\n",
|
||||
"\n",
|
||||
"Three columns (`crash_flag`, `is_featured_event`, `is_long_session`) each have **8 different representations** of the same two values: `True`, `False`, `true`, `false`, `1`, `0`, `Yes`, `No`.\n",
|
||||
"\n",
|
||||
"The fix is to define an explicit mapping and apply it with `.map()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 confirm the problem\n",
|
||||
"print('crash_flag unique values:', sorted(df_clean['crash_flag'].dropna().unique()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define the mapping once \u2014 reuse it for all three columns\n",
|
||||
"bool_map = {\n",
|
||||
" 'True': True, 'true': True, '1': True, 'Yes': True,\n",
|
||||
" 'False': False, 'false': False, '0': False, 'No': False\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Apply to crash_flag\n",
|
||||
"df_clean['crash_flag'] = df_clean['crash_flag'].map(bool_map)\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('crash_flag after mapping:')\n",
|
||||
"print(df_clean['crash_flag'].value_counts())\n",
|
||||
"print('Nulls:', df_clean['crash_flag'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply the same mapping to the other two boolean columns\n",
|
||||
"# Follow the same pattern as above for is_featured_event and is_long_session\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.2 \u2014 Categorical columns: case and whitespace inconsistency\n",
|
||||
"\n",
|
||||
"Four columns have values that are logically identical but differ in case or surrounding whitespace:\n",
|
||||
"- `region` \u2014 32 variants of 5 values (e.g. `us-west`, `US-WEST`, `Us-west`, `' us-west '`)\n",
|
||||
"- `map_name` \u2014 36 variants of 6 values\n",
|
||||
"- `platform` \u2014 32 variants of 6 values\n",
|
||||
"- `input_method` \u2014 30 variants, including a **typo**: `controllr`\n",
|
||||
"\n",
|
||||
"The fix uses pandas string methods: `.str.strip()` removes surrounding whitespace, `.str.lower()` normalises case. They can be chained."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 how many unique values before cleaning?\n",
|
||||
"print('region unique before:', df_clean['region'].nunique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix region: strip whitespace and convert to lowercase\n",
|
||||
"df_clean['region'] = df_clean['region'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('region unique after:', df_clean['region'].nunique())\n",
|
||||
"print(df_clean['region'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply the same strip + lower to map_name and platform\n",
|
||||
"# Follow the same pattern as above\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# input_method needs an extra step: fix the typo and standardise kb/m \u2192 kbm\n",
|
||||
"\n",
|
||||
"# Step 1: strip and lowercase first\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Step 2: fix the two inconsistencies with replace()\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].replace({\n",
|
||||
" 'controllr': 'controller', # typo\n",
|
||||
" 'kb/m': 'kbm' # variant name \u2192 canonical form\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# Verify \u2014 should now show exactly 3 unique values\n",
|
||||
"print('input_method unique after:', df_clean['input_method'].nunique())\n",
|
||||
"print(df_clean['input_method'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.3 \u2014 `purchase_amount`: comma as decimal separator\n",
|
||||
"\n",
|
||||
"About 12% of rows use a comma instead of a decimal point (`1,80` instead of `1.80`). This prevented pandas from reading the column as numeric, so it was loaded as `object`.\n",
|
||||
"\n",
|
||||
"The fix: replace the comma in the string, then convert the column type."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 how many rows have a comma?\n",
|
||||
"comma_rows = df_clean['purchase_amount'].astype(str).str.contains(',', na=False)\n",
|
||||
"print(f'Rows with comma separator: {comma_rows.sum()}')\n",
|
||||
"print('Examples:', df_clean.loc[comma_rows, 'purchase_amount'].unique()[:6])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: replace comma with decimal point, then convert to float\n",
|
||||
"df_clean['purchase_amount'] = (\n",
|
||||
" df_clean['purchase_amount']\n",
|
||||
" .astype(str) # ensure we are working with strings\n",
|
||||
" .str.replace(',', '.', regex=False) # swap the separator\n",
|
||||
" .replace('nan', float('nan')) # restore actual NaN rows\n",
|
||||
" .astype(float) # convert to numeric\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('dtype:', df_clean['purchase_amount'].dtype)\n",
|
||||
"print(df_clean['purchase_amount'].describe().round(2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.4 \u2014 Missing values: decisions and strategy\n",
|
||||
"\n",
|
||||
"Not all missing values are the same. Before deciding what to do, you need to understand *why* the value is missing \u2014 the reason determines the correct action.\n",
|
||||
"\n",
|
||||
"| Column | Missing | Why | Decision |\n",
|
||||
"|---|---|---|---|\n",
|
||||
"| `gpu_model` | 66.7% | Console/mobile players have no GPU | Keep column \u2014 missingness is meaningful |\n",
|
||||
"| `build_version` | 16.5% | Not logged in older sessions | Keep as NaN \u2014 valid historical absence |\n",
|
||||
"| `device_temp_c` | 4.9% | Sensor not available on some devices | Keep as NaN |\n",
|
||||
"| `session_length_s` | 1.0% | Session ended abnormally | Drop these rows \u2014 see below |\n",
|
||||
"| `ping_ms`, `purchase_amount`, `end_time` | < 2% | Sporadic gaps | Keep as NaN |\n",
|
||||
"\n",
|
||||
"> **\u26a0\ufe0f Context always matters.** There is no universal rule for missing values. The decisions above are reasonable for this dataset and analytical goal \u2014 but a different context (e.g. building a machine learning model) might lead to different choices."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 missing value counts across all columns\n",
|
||||
"missing = df_clean.isnull().sum()\n",
|
||||
"missing_pct = (missing / len(df_clean) * 100).round(1)\n",
|
||||
"pd.DataFrame({'missing': missing, '%': missing_pct})[missing > 0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# session_length_s: drop rows where it is missing\n",
|
||||
"# Rationale: session duration is a core metric \u2014 a session with no recorded\n",
|
||||
"# duration is structurally incomplete and cannot be used for most analyses.\n",
|
||||
"# These 98 rows represent <1% of the dataset, so dropping is safe.\n",
|
||||
"\n",
|
||||
"rows_before = len(df_clean)\n",
|
||||
"df_clean = df_clean.dropna(subset=['session_length_s'])\n",
|
||||
"\n",
|
||||
"print(f'Rows dropped: {rows_before - len(df_clean)}')\n",
|
||||
"print(f'Rows remaining: {len(df_clean)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.5 \u2014 Outliers: `avg_fps`\n",
|
||||
"\n",
|
||||
"The `avg_fps` column has a maximum of 10,000 fps \u2014 physically impossible for a game running in real time. The 75th percentile is ~82 fps, confirming that 10,000 is a logging error, not an extreme but plausible value.\n",
|
||||
"\n",
|
||||
"**Decision:** set values above 300 fps to `NaN` rather than dropping the entire row. The rest of the data in those rows (crash flag, purchase amount, session type) is likely still valid \u2014 it would be wasteful to discard it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 how many rows are affected?\n",
|
||||
"threshold = 300\n",
|
||||
"outlier_mask = df_clean['avg_fps'] > threshold\n",
|
||||
"print(f'Rows with avg_fps > {threshold}: {outlier_mask.sum()}')\n",
|
||||
"print('\\navg_fps distribution (before fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: set outlier values to NaN using .loc with a boolean mask\n",
|
||||
"df_clean.loc[outlier_mask, 'avg_fps'] = float('nan')\n",
|
||||
"\n",
|
||||
"# Verify \u2014 max should now be well below 300\n",
|
||||
"print('avg_fps distribution (after fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.6 \u2014 Datetime columns: mixed formats *(guided + optional deep dive)*\n",
|
||||
"\n",
|
||||
"The `start_time` and `end_time` columns contain timestamps in at least four different formats:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"2025-07-18T18:32:00Z \u2190 ISO 8601 with UTC marker\n",
|
||||
"2025-07-18 20:03:21-05:00 \u2190 ISO 8601 with UTC offset\n",
|
||||
"20/10/2025 02:49 \u2190 European DD/MM/YYYY\n",
|
||||
"08/01/2025 06:35 \u2190 Ambiguous: US MM/DD or European?\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Mixed datetime formats are one of the most complex cleaning problems because some ambiguities cannot be resolved automatically \u2014 `08/01/2025` could be August 1st or January 8th, and no algorithm can determine which without external context.\n",
|
||||
"\n",
|
||||
"The pragmatic approach is to parse what can be parsed reliably and flag what cannot."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect \u2014 what does start_time actually look like?\n",
|
||||
"print('Sample values from start_time:')\n",
|
||||
"print(df_clean['start_time'].dropna().sample(8, random_state=42).tolist())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: pd.to_datetime with utc=True normalises all timezone-aware formats to UTC.\n",
|
||||
"# errors='coerce' converts anything it cannot parse to NaT (Not a Time) instead of crashing.\n",
|
||||
"df_clean['start_time'] = pd.to_datetime(df_clean['start_time'], utc=True, errors='coerce')\n",
|
||||
"df_clean['end_time'] = pd.to_datetime(df_clean['end_time'], utc=True, errors='coerce')\n",
|
||||
"\n",
|
||||
"# Verify \u2014 check how many rows could not be parsed\n",
|
||||
"print('start_time dtype:', df_clean['start_time'].dtype)\n",
|
||||
"print('Unparsed start_time (NaT):', df_clean['start_time'].isna().sum())\n",
|
||||
"print('Unparsed end_time (NaT): ', df_clean['end_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
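"> **Note:** The NaT counts above include every value pandas could not parse in this single pass, plus any values that were already missing before the conversion. A NaT therefore means \"not parsed\", not \"ambiguous\": pandas never flags a date such as `08/01/2025` as ambiguous \u2014 if it parses the value at all, it silently commits to one interpretation. Deciding how to treat these records still requires a domain decision (e.g., knowing that the data source always uses DD/MM/YYYY).\n",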
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"**\ud83d\udccc Optional \u2014 explore the unparsed rows**\n",
|
||||
"\n",
|
||||
"If you want to go further, the cells below help you examine which formats failed and attempt a two-pass parsing strategy. This is optional and not required to complete the lab.\n",
|
||||
"\n",
|
||||
"<details>\n",
|
||||
"<summary>Click to expand optional exploration</summary>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Identify the raw values that failed to parse\n",
|
||||
"# We use the index of df_clean to look up the original values in df,\n",
|
||||
"# rather than a boolean mask \u2014 the two dataframes have different lengths\n",
|
||||
"# after the dropna() in step 4.4, so a mask built on df_clean cannot index df directly (the index labels themselves are preserved by dropna).\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"print(f'Rows with unparsed start_time: {len(unparsed_idx)}')\n",
|
||||
"print('\\nRaw values that could not be parsed:')\n",
|
||||
"print(df.loc[unparsed_idx, 'start_time'].dropna().unique()[:15])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Two-pass strategy \u2014 try a second format for the rows that failed\n",
|
||||
"# If you determine the ambiguous rows use DD/MM/YYYY, try dayfirst=True on them only\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"df_clean.loc[unparsed_idx, 'start_time'] = pd.to_datetime(\n",
|
||||
" df.loc[unparsed_idx, 'start_time'],\n",
|
||||
" dayfirst=True, utc=True, errors='coerce'\n",
|
||||
")\n",
|
||||
"print('NaT after second pass:', df_clean['start_time'].isna().sum())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 \u2014 Verify with D-Tale\n",
|
||||
"\n",
|
||||
"Reload the cleaned dataframe into D-Tale and visually confirm the fixes. This is a quick sanity check \u2014 you are looking for anything that looks wrong before committing to the cleaned dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shut down the previous D-Tale instance and reload with the clean data\n",
|
||||
"d.kill()\n",
|
||||
"d_clean = dtale.show(df_clean, host='127.0.0.1', subprocess=False, open_browser=False)\n",
|
||||
"print('Open cleaned data in D-Tale:', d_clean._url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In D-Tale, verify the following:\n",
|
||||
"\n",
|
||||
"| Column | What to check | Expected result |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` | Describe \u2192 value counts | Only `True` and `False` |\n",
|
||||
"| `region` | Describe \u2192 value counts | Exactly 5 values, all lowercase |\n",
|
||||
"| `input_method` | Describe \u2192 value counts | Exactly 3 values, no `controllr` |\n",
|
||||
"| `purchase_amount` | Describe \u2192 dtype and range | float64, no commas |\n",
|
||||
"| `avg_fps` | Describe \u2192 max | Below 300 |\n",
|
||||
"| `session_length_s` | Describe \u2192 missing count | 0 |\n",
|
||||
"| `start_time` | Describe \u2192 dtype | datetime64 |\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 6 \u2014 Before vs After with SweetViz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate a side-by-side comparison report (~60 seconds)\n",
|
||||
"compare = sv.compare([df, 'Raw'], [df_clean, 'Cleaned'])\n",
|
||||
"compare.show_html('sweetviz_comparison_report.html', open_browser=False)\n",
|
||||
"print('Comparison report saved. Open sweetviz_comparison_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the comparison report, check that:\n",
|
||||
"- \u2705 Boolean columns changed from TEXT \u2192 BOOL with only 2 distinct values\n",
|
||||
"- \u2705 Categorical columns show dramatically reduced DISTINCT counts\n",
|
||||
"- \u2705 `purchase_amount` changed from TEXT \u2192 NUMERIC\n",
|
||||
"- \u2705 `avg_fps` maximum is no longer 10,000\n",
|
||||
"- \u2705 `session_length_s` shows 0 missing\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 7 \u2014 Save the Cleaned Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_A_indie_game_telemetry_clean.csv', index=False)\n",
|
||||
"print(f'Saved: {len(df_clean)} rows, {len(df_clean.columns)} columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## \ud83d\udd11 Key Takeaways\n",
|
||||
"\n",
|
||||
"**Three tools, three roles \u2014 they complement each other:**\n",
|
||||
"- **SweetViz** surfaces issues fast but cannot fix them \u2014 use it for triage and validation\n",
|
||||
"- **D-Tale** lets you see the data as a human would \u2014 use it to understand problems before and after fixing them\n",
|
||||
"- **pandas** is where all actual cleaning happens \u2014 explicit, reproducible, and version-controllable\n",
|
||||
"\n",
|
||||
"**Cleaning decisions are not mechanical:**\n",
|
||||
"- Dropping `session_length_s` nulls was justified here \u2014 it would not be in every context\n",
|
||||
"- Setting `avg_fps` outliers to NaN (not dropping rows) preserved valid data in other columns\n",
|
||||
"- `gpu_model` missingness is structurally meaningful \u2014 imputing it would destroy information\n",
|
||||
"\n",
|
||||
"**Common issue categories you have now fixed with pandas:**\n",
|
||||
"\n",
|
||||
"| Issue | pandas approach |\n",
|
||||
"|---|---|\n",
|
||||
"| Boolean encoding chaos | `.map(bool_map)` |\n",
|
||||
"| Case / whitespace inconsistency | `.str.strip().str.lower()` |\n",
|
||||
"| Typos in categories | `.replace({'controllr': 'controller'})` |\n",
|
||||
"| Wrong decimal separator | `.str.replace(',', '.')` + `.astype(float)` |\n",
|
||||
"| Structural missing values | `dropna(subset=[...])` with explicit rationale |\n",
|
||||
"| Outliers | Boolean mask + `.loc[mask, col] = NaN` |\n",
|
||||
"| Mixed datetime formats | `pd.to_datetime(utc=True, errors='coerce')` |\n",
|
||||
"\n",
|
||||
"\u2192 In **Task 3**, you will apply these skills independently to a new dataset \u2014 with a checklist but without step-by-step guidance."
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
757
claude/lab02_task2_telemetry_v3.ipynb
Normal file
757
claude/lab02_task2_telemetry_v3.ipynb
Normal file
@@ -0,0 +1,757 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 · Task 2 — Guided EDA and Data Cleaning\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~50 minutes \n",
|
||||
"**Dataset:** `dataset_A_indie_game_telemetry_v2.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Objectives\n",
|
||||
"\n",
|
||||
"By the end of this task you will be able to:\n",
|
||||
"- Use **SweetViz** to rapidly profile a dataset and identify issues\n",
|
||||
"- Use **D-Tale** to navigate and inspect a dataframe interactively\n",
|
||||
"- Use **pandas** to fix the most common categories of data quality problems\n",
|
||||
"- Make and justify cleaning decisions rather than applying fixes mechanically\n",
|
||||
"\n",
|
||||
"### Tools and their roles in this task\n",
|
||||
"\n",
|
||||
"| Tool | Role |\n",
|
||||
"|---|---|\n",
|
||||
"| **SweetViz** | Automated profiling — generate a report, triage what needs fixing |\n",
|
||||
"| **D-Tale** | Interactive navigation — browse rows, inspect value counts, confirm fixes visually |\n",
|
||||
"| **pandas** | All actual cleaning — every transformation is explicit, reproducible code |\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 — Setup and First Look"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry_v2.csv')\n",
|
||||
"\n",
|
||||
"print(f'Shape: {df.shape}')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Column names and types as pandas inferred them\n",
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ Notice:** Several columns that should be boolean (`crash_flag`, `is_featured_event`, `is_long_session`) or\n",
|
||||
"> numeric (`purchase_amount`) have been inferred as `object`. This is your first signal that something is wrong.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 — Automated Profiling with SweetViz\n",
|
||||
"\n",
|
||||
"SweetViz generates a visual report for the entire dataset in one call. Think of it as a **triage tool** — it shows you *where* to look; the actual investigation and fixing happens afterwards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate the profiling report (~30–60 seconds)\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html('sweetviz_raw_report.html', open_browser=True)\n",
|
||||
"print('Report saved. Open sweetviz_raw_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Open the report and answer the following before moving on.\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be boolean or numeric? | *...* |\n",
|
||||
"| Are there numeric columns with suspicious ranges? | *...* |\n",
|
||||
"| How many distinct values does `region` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `purchase_amount`? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 — Navigate and Inspect with D-Tale\n",
|
||||
"\n",
|
||||
"Before writing any cleaning code, use D-Tale to browse the raw data and *see* the problems with your own eyes. You will not clean anything here — D-Tale is your inspection tool.\n",
|
||||
"\n",
|
||||
"**Launch D-Tale:**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('=' * 50)\n",
|
||||
"print('D-Tale is running.')\n",
|
||||
"print('Open this URL in your browser:', d._url)\n",
|
||||
"print('In VS Code: Ctrl+click the URL above.')\n",
|
||||
"print('=' * 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspection checklist\n",
|
||||
"\n",
|
||||
"Use D-Tale to confirm each issue SweetViz flagged. For each column, click the column header → **Describe** to see value counts and distribution.\n",
|
||||
"\n",
|
||||
"| What to inspect | How to do it in D-Tale | What you should see |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` unique values | Column header → Describe | 8 variants of True/False |\n",
|
||||
"| `region` unique values | Column header → Describe | ~32 variants of 5 region names |\n",
|
||||
"| `input_method` unique values | Column header → Describe | A typo: `controllr` |\n",
|
||||
"| `purchase_amount` raw values | Sort column ascending | Some values use comma: `1,80` |\n",
|
||||
"| `avg_fps` distribution | Column header → Describe | Max of 10,000 — clearly wrong |\n",
|
||||
"| Missing values overview | Top menu → Describe (all columns) | `gpu_model` dominates |\n",
|
||||
"\n",
|
||||
"> Once you have seen the problems in the raw data, come back to the notebook for cleaning.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 — Clean with Pandas\n",
|
||||
"\n",
|
||||
"We will work through seven issue categories. Each section follows the same pattern:\n",
|
||||
"1. **Inspect** — confirm the problem in code\n",
|
||||
"2. **Fix** — apply the pandas transformation\n",
|
||||
"3. **Verify** — check the result\n",
|
||||
"\n",
|
||||
"We work on a copy of the original dataframe so the raw data is always available for comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Always work on a copy — keep df as the unchanged original\n",
|
||||
"df_clean = df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.1 — Boolean columns: inconsistent encoding\n",
|
||||
"\n",
|
||||
"Three columns (`crash_flag`, `is_featured_event`, `is_long_session`) each have **8 different representations** of the same two values: `True`, `False`, `true`, `false`, `1`, `0`, `Yes`, `No`.\n",
|
||||
"\n",
|
||||
"The fix is to define an explicit mapping and apply it with `.map()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — confirm the problem\n",
|
||||
"print('crash_flag unique values:', sorted(df_clean['crash_flag'].dropna().unique()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define the mapping for replacements\n",
|
||||
"# Why did I place True:True and False: False? Ideas?\n",
|
||||
"\n",
|
||||
"bool_map = {\n",
|
||||
" 'True': True, 'true': True, '1': True, 'Yes': True, True: True,\n",
|
||||
" 'False': False, 'false': False, '0': False, 'No': False, False: False\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"df_clean['crash_flag'] = df_clean['crash_flag'].map(bool_map)\n",
|
||||
"\n",
|
||||
"print('crash_flag after mapping:')\n",
|
||||
"print(df_clean['crash_flag'].value_counts())\n",
|
||||
"print('Nulls:', df_clean['crash_flag'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply the same mapping to the other two boolean columns\n",
|
||||
"# Follow the same pattern as above for is_featured_event and is_long_session\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.2 — Categorical columns: case and whitespace inconsistency\n",
|
||||
"\n",
|
||||
"Four columns have values that are logically identical but differ in case or surrounding whitespace:\n",
|
||||
"- `region` — 32 variants of 5 values (e.g. `us-west`, `US-WEST`, `Us-west`, `' us-west '`)\n",
|
||||
"- `map_name` — 36 variants of 6 values\n",
|
||||
"- `platform` — 32 variants of 6 values\n",
|
||||
"- `input_method` — 30 variants, including a **typo**: `controllr`\n",
|
||||
"\n",
|
||||
"The fix uses pandas string methods: `.str.strip()` removes surrounding whitespace, `.str.lower()` normalises case. They can be chained."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many unique values before cleaning?\n",
|
||||
"print('region unique before:', df_clean['region'].unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix region: strip whitespace and convert to lowercase\n",
|
||||
"df_clean['region'] = df_clean['region'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('region unique after:', df_clean['region'].unique())\n",
|
||||
"print(df_clean['region'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TO DO: \n",
|
||||
"# Apply the same strip + lower to map_name and platform\n",
|
||||
"# Follow the same pattern as above\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# input_method needs an extra step: fix the typo and standardise kb/m → kbm\n",
|
||||
"\n",
|
||||
"# Step 0: Inspect\n",
|
||||
"print('input_method unique before:', df_clean['input_method'].unique())\n",
|
||||
"\n",
|
||||
"# Step 1: strip and lowercase first\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Step 2: fix the two inconsistencies with replace()\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].replace({\n",
|
||||
" 'controllr': 'controller', \n",
|
||||
" 'kb/m': 'kbm' \n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# Verify — should now show exactly 3 unique values\n",
|
||||
"print('input_method unique after:', df_clean['input_method'].unique())\n",
|
||||
"print(df_clean['input_method'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.3 — `purchase_amount`: comma as decimal separator\n",
|
||||
"\n",
|
||||
"About 12% of rows use a comma instead of a decimal point (`1,80` instead of `1.80`). This prevented pandas from reading the column as numeric, so it was loaded as `object`.\n",
|
||||
"\n",
|
||||
"The fix: replace the comma in the string, then convert the column type."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows have a comma?\n",
|
||||
"comma_rows = df_clean['purchase_amount'].astype(str).str.contains(',', na=False)\n",
|
||||
"print(f'Rows with comma separator: {comma_rows.sum()}')\n",
|
||||
"print('Examples:', df_clean.loc[comma_rows, 'purchase_amount'].unique()[:6])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: replace comma with decimal point, then convert to float\n",
|
||||
"df_clean['purchase_amount'] = (\n",
|
||||
" df_clean['purchase_amount']\n",
|
||||
" .astype(str) # ensure we are working with strings\n",
|
||||
" .str.replace(',', '.', regex=False) # swap the separator\n",
|
||||
" .replace('nan', float('nan')) # restore actual NaN rows\n",
|
||||
" .astype(float) # convert to numeric\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('dtype:', df_clean['purchase_amount'].dtype)\n",
|
||||
"print(df_clean['purchase_amount'].describe().round(2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.4 — Missing values: decisions and strategy\n",
|
||||
"\n",
|
||||
"Not all missing values are the same. Before deciding what to do, you need to understand *why* the value is missing — the reason determines the correct action.\n",
|
||||
"\n",
|
||||
"| Column | Missing | Why | Decision |\n",
|
||||
"|---|---|---|---|\n",
|
||||
"| `gpu_model` | 66.7% | Console/mobile players have no GPU | Keep column — missingness is meaningful |\n",
|
||||
"| `build_version` | 16.5% | Not logged in older sessions | Keep as NaN — valid historical absence |\n",
|
||||
"| `device_temp_c` | 4.9% | Sensor not available on some devices | Keep as NaN |\n",
|
||||
"| `session_length_s` | 1.0% | Session ended abnormally | Drop missing rows now; fix negatives/outliers after datetime correction (section 4.6) |\n",
|
||||
"| `ping_ms`, `purchase_amount`, `end_time` | < 2% | Sporadic gaps | Keep as NaN |\n",
|
||||
"\n",
|
||||
"<br>\n",
|
||||
"\n",
|
||||
"> **⚠️ Context always matters.** There is no universal rule for missing values. The decisions above are reasonable for this dataset and analytical goal — but a different context might lead to different choices.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — missing value counts across all columns\n",
|
||||
"missing = df_clean.isnull().sum()\n",
|
||||
"missing_pct = (missing / len(df_clean) * 100).round(1)\n",
|
||||
"pd.DataFrame({'missing': missing, '%': missing_pct})[missing > 0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# session_length_s: drop rows where it is missing\n",
|
||||
"# Rationale: session duration is a core metric — a session with no recorded\n",
|
||||
"# duration is structurally incomplete and cannot be used for most analyses.\n",
|
||||
"# These 98 rows represent <1% of the dataset, so dropping is safe.\n",
|
||||
"\n",
|
||||
"rows_before = len(df_clean)\n",
|
||||
"df_clean = df_clean.dropna(subset=['session_length_s'])\n",
|
||||
"\n",
|
||||
"print(f'Rows dropped: {rows_before - len(df_clean)}')\n",
|
||||
"print(f'Rows remaining: {len(df_clean)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.5 — Outliers: `avg_fps`\n",
|
||||
"\n",
|
||||
"The `avg_fps` column has a maximum of 10,000 fps — physically impossible for a game running in real time. The 75th percentile is ~82 fps, confirming that 10,000 is a logging error, not an extreme but plausible value.\n",
|
||||
"\n",
|
||||
"**Decision:** set values above 300 fps to `NaN` rather than dropping the entire row. The rest of the data in those rows (crash flag, purchase amount, session type) is likely still valid — it would be wasteful to discard it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows are affected?\n",
|
||||
"threshold = 300\n",
|
||||
"outlier_mask = df_clean['avg_fps'] > threshold\n",
|
||||
"print(f'Rows with avg_fps > {threshold}: {outlier_mask.sum()}')\n",
|
||||
"print('\\navg_fps distribution (before fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: set outlier values to NaN using .loc with a boolean mask\n",
|
||||
"df_clean.loc[outlier_mask, 'avg_fps'] = float('nan')\n",
|
||||
"\n",
|
||||
"# Verify — max should now be well below 300\n",
|
||||
"print('avg_fps distribution (after fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.6 — Datetime columns: mixed formats\n",
|
||||
"\n",
|
||||
"The `start_time` and `end_time` columns contain timestamps in at least four different formats:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"2025-07-18T18:32:00Z : ISO 8601 with UTC marker\n",
|
||||
"2025-07-18 20:03:21-05:00 : ISO 8601 with UTC offset\n",
|
||||
"20/10/2025 02:49 : European DD/MM/YYYY\n",
|
||||
"08/01/2025 06:35 : Ambiguous: US MM/DD or European DD/MM?\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Mixed datetime formats are one of the most complex cleaning problems because some ambiguities cannot be resolved automatically -- `08/01/2025` could be August 1st or January 8th, and no algorithm can determine which without external context.\n",
|
||||
"\n",
|
||||
"> **Connection to `session_length_s`:** The negative values and extreme outliers we saw earlier in `session_length_s` are not independent errors -- they are a *consequence* of this datetime problem. When `start_time` and `end_time` were recorded in different formats and misinterpreted, the pre-computed duration came out wrong. After fixing the timestamps, we will recompute `session_length_s` from scratch and validate the result.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — what does start_time actually look like?\n",
|
||||
"print('Sample values from start_time:')\n",
|
||||
"print(df_clean['start_time'].dropna().sample(8, random_state=42).tolist())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: pd.to_datetime with utc=True normalises all timezone-aware formats to UTC.\n",
|
||||
"# errors='coerce' converts anything it cannot parse to NaT (Not a Time) instead of crashing.\n",
|
||||
"df_clean['start_time'] = pd.to_datetime(df_clean['start_time'], utc=True, errors='coerce')\n",
|
||||
"df_clean['end_time'] = pd.to_datetime(df_clean['end_time'], utc=True, errors='coerce')\n",
|
||||
"\n",
|
||||
"# Verify — check how many rows could not be parsed\n",
|
||||
"print('start_time dtype:', df_clean['start_time'].dtype)\n",
|
||||
"print('Unparsed start_time (NaT):', df_clean['start_time'].isna().sum())\n",
|
||||
"print('Unparsed end_time (NaT): ', df_clean['end_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recompute session_length_s from the corrected timestamps\n",
|
||||
"# Now that start_time and end_time are both timezone-aware UTC datetimes,\n",
|
||||
"# the subtraction is unambiguous. We convert the result to seconds.\n",
|
||||
"df_clean['session_length_s'] = (\n",
|
||||
" df_clean['end_time'] - df_clean['start_time']\n",
|
||||
").dt.total_seconds()\n",
|
||||
"\n",
|
||||
"print('session_length_s after recomputation:')\n",
|
||||
"print(df_clean['session_length_s'].describe().round(1))\n",
|
||||
"print(f'\\nNegative values: {(df_clean[\"session_length_s\"] < 0).sum()}')\n",
|
||||
"print(f'> 8h (28800s): {(df_clean[\"session_length_s\"] > 28800).sum()}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Any remaining negative values are rows where timestamps were genuinely\n",
|
||||
"# ambiguous and could not be resolved -- the computed duration is meaningless.\n",
|
||||
"# Set them to NaN rather than dropping the row.\n",
|
||||
"\n",
|
||||
"neg_mask = df_clean['session_length_s'] < 0\n",
|
||||
"df_clean.loc[neg_mask, 'session_length_s'] = float('nan')\n",
|
||||
"print(f'Negative durations set to NaN: {neg_mask.sum()}')\n",
|
||||
"\n",
|
||||
"# Values above 8 hours (28800s) are suspicious for a game session.\n",
|
||||
"# Inspect them before deciding.\n",
|
||||
"\n",
|
||||
"long_mask = df_clean['session_length_s'] > 28800\n",
|
||||
"print(f'\\nSessions > 8h: {long_mask.sum()}')\n",
|
||||
"print(df_clean.loc[long_mask, ['session_length_s', 'start_time', 'end_time']].head(5).to_string())\n",
|
||||
"\n",
|
||||
"# Decision: sessions > 8h are almost certainly logging errors (game left running,\n",
|
||||
"# server not recording session end). Set to NaN.\n",
|
||||
"# As always — this threshold is a judgement call that depends on the game and context.\n",
|
||||
"df_clean.loc[long_mask, 'session_length_s'] = float('nan')\n",
|
||||
"print(f'\\nSessions > 8h set to NaN: {long_mask.sum()}')\n",
|
||||
"print('\\nFinal session_length_s distribution:')\n",
|
||||
"print(df_clean['session_length_s'].describe().round(1))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Note:** The number of NaT values above reflects rows where pandas could not parse the format unambiguously. These are not errors in the code — they are genuinely ambiguous records that require a domain decision to resolve (e.g., knowing that the data source always uses DD/MM/YYYY).\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"**📌 Optional — explore the unparsed rows**\n",
|
||||
"\n",
|
||||
"If you want to go further, the cells below help you examine which formats failed and attempt a two-pass parsing strategy. This is optional and not required to complete the lab.\n",
|
||||
"\n",
|
||||
"<details>\n",
|
||||
"<summary>Click to expand optional exploration</summary>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Identify the raw values that failed to parse\n",
|
||||
"# We use the index of df_clean to look up the original values in df,\n",
|
||||
"# rather than a boolean mask — the two dataframes have different lengths\n",
|
||||
"# after the dropna() in step 4.4, so their indices no longer align.\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"print(f'Rows with unparsed start_time: {len(unparsed_idx)}')\n",
|
||||
"print('\\nRaw values that could not be parsed:')\n",
|
||||
"print(df.loc[unparsed_idx, 'start_time'].dropna().unique()[:15])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Two-pass strategy — try a second format for the rows that failed\n",
|
||||
"# If you determine the ambiguous rows use DD/MM/YYYY, try dayfirst=True on them only\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"df_clean.loc[unparsed_idx, 'start_time'] = pd.to_datetime(\n",
|
||||
" df.loc[unparsed_idx, 'start_time'],\n",
|
||||
" dayfirst=True, utc=True, errors='coerce'\n",
|
||||
")\n",
|
||||
"print('NaT after second pass:', df_clean['start_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 — Verify with D-Tale\n",
|
||||
"\n",
|
||||
"Reload the cleaned dataframe into D-Tale and visually confirm the fixes. This is a quick sanity check — you are looking for anything that looks wrong before committing to the cleaned dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shut down the previous D-Tale instance and reload with the clean data\n",
|
||||
"d.kill()\n",
|
||||
"d_clean = dtale.show(df_clean, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('Open cleaned data in D-Tale:', d_clean._url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In D-Tale, verify the following:\n",
|
||||
"\n",
|
||||
"| Column | What to check | Expected result |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` | Describe → value counts | Only `True` and `False` |\n",
|
||||
"| `region` | Describe → value counts | Exactly 5 values, all lowercase |\n",
|
||||
"| `input_method` | Describe → value counts | Exactly 3 values, no `controllr` |\n",
|
||||
"| `purchase_amount` | Describe → dtype and range | float64, no commas |\n",
|
||||
"| `avg_fps` | Describe → max | Below 300 |\n",
|
||||
"| `session_length_s` | Describe → min and max | No negatives, no values > 28800 |\n",
|
||||
"| `start_time` | Describe → dtype | datetime64 |\n",
"\n",
"---\n",
"\n",
"## Part 6 — Before vs After with SweetViz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c8f0e03a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Debug\n",
|
||||
"\n",
|
||||
"# Test comparison column by column\n",
|
||||
"# for col in df_clean.columns:\n",
|
||||
"# try:\n",
|
||||
"# sv.compare([df[[col]], 'Raw'], [df_clean[[col]].reset_index(drop=True), 'Cleaned'])\n",
|
||||
"# except Exception as e:\n",
|
||||
"# print(f\"FAIL: {col} — {e}\")\n",
|
||||
"# else:\n",
|
||||
"# print(f\"ok: {col}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exclude = ['start_time', 'end_time'] # needed to exclude these two because we converted them to datetime and sweetviz is not able to compare it with the original data types\n",
|
||||
"\n",
|
||||
"compare = sv.compare(\n",
|
||||
" [df.drop(columns=exclude), 'Raw'],\n",
|
||||
" [df_clean.drop(columns=exclude).reset_index(drop=True), 'Cleaned']\n",
|
||||
")\n",
|
||||
"compare.show_html('sweetviz_comparison_report.html', open_browser=True)\n",
|
||||
"print('Comparison report saved.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the comparison report, check that:\n",
|
||||
"- Boolean columns changed from TEXT → BOOL with only 2 distinct values\n",
|
||||
"- Categorical columns show dramatically reduced DISTINCT counts\n",
|
||||
"- `purchase_amount` changed from TEXT → NUMERIC\n",
|
||||
"- `avg_fps` maximum is no longer 10,000\n",
|
||||
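"- `session_length_s` has no negative values and no values above 28,800 s (its missing count is not expected to be 0, because unparsable or invalid durations were set to NaN)\n",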
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 7 — Save the Cleaned Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_A_indie_game_telemetry_clean.csv', index=False)\n",
|
||||
"print(f'Saved: {len(df_clean)} rows, {len(df_clean.columns)} columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Key Takeaways\n",
|
||||
"\n",
|
||||
"**Three tools, three roles — they complement each other:**\n",
|
||||
"- **SweetViz** surfaces issues fast but cannot fix them: use it for triage and validation\n",
|
||||
"- **D-Tale** lets you see the data as a human would: use it to understand problems before and after fixing them\n",
|
||||
"- **pandas** is where all actual cleaning happens: explicit, reproducible, and version-controllable\n",
|
||||
"\n",
|
||||
"**Cleaning decisions are not mechanical:**\n",
|
||||
"- Dropping `session_length_s` nulls was justified here: it would not be in every context\n",
|
||||
"- Setting `avg_fps` outliers to NaN (not dropping rows) preserved valid data in other columns\n",
|
||||
"- `gpu_model` missingness is structurally meaningful: imputing it would destroy information\n",
|
||||
"\n",
|
||||
"**Common issue categories you have now fixed with pandas:**\n",
|
||||
"\n",
|
||||
"| Issue | pandas approach |\n",
|
||||
"|---|---|\n",
|
||||
"| Boolean encoding chaos | `.map(bool_map)` |\n",
|
||||
"| Case / whitespace inconsistency | `.str.strip().str.lower()` |\n",
|
||||
"| Typos in categories | `.replace({'controllr': 'controller'})` |\n",
|
||||
"| Wrong decimal separator | `.str.replace(',', '.')` + `.astype(float)` |\n",
|
||||
"| Structural missing values | `dropna(subset=[...])` with explicit rationale |\n",
|
||||
"| Outliers | Boolean mask + `.loc[mask, col] = NaN` |\n",
|
||||
"| Mixed datetime formats | `pd.to_datetime(utc=True, errors='coerce')` |\n",
|
||||
"\n",
|
||||
"→ In **Task 3**, you will apply these skills independently to a new dataset — with a checklist but without step-by-step guidance."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
844
claude/lab02_task2_telemetry_v4.ipynb
Normal file
844
claude/lab02_task2_telemetry_v4.ipynb
Normal file
@@ -0,0 +1,844 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 · Task 2 — Guided EDA and Data Cleaning\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~50 minutes \n",
|
||||
"**Dataset:** `dataset_A_indie_game_telemetry_v2.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Objectives\n",
|
||||
"\n",
|
||||
"By the end of this task you will be able to:\n",
|
||||
"- Use **SweetViz** to rapidly profile a dataset and identify issues\n",
|
||||
"- Use **D-Tale** to navigate and inspect a dataframe interactively\n",
|
||||
"- Use **pandas** to fix the most common categories of data quality problems\n",
|
||||
"- Make and justify cleaning decisions rather than applying fixes mechanically\n",
|
||||
"\n",
|
||||
"### Tools and their roles in this task\n",
|
||||
"\n",
|
||||
"| Tool | Role |\n",
|
||||
"|---|---|\n",
|
||||
"| **SweetViz** | Automated profiling — generate a report, triage what needs fixing |\n",
|
||||
"| **D-Tale** | Interactive navigation — browse rows, inspect value counts, confirm fixes visually |\n",
|
||||
"| **pandas** | All actual cleaning — every transformation is explicit, reproducible code |\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 — Setup and First Look"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry_v2.csv')\n",
|
||||
"\n",
|
||||
"print(f'Shape: {df.shape}')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Column names and types as pandas inferred them\n",
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ Notice:** Several columns that should be boolean (`crash_flag`, `is_featured_event`, `is_long_session`) or\n",
|
||||
"> numeric (`purchase_amount`) have been inferred as `object`. This is your first signal that something is wrong.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 — Automated Profiling with SweetViz\n",
|
||||
"\n",
|
||||
"SweetViz generates a visual report for the entire dataset in one call. Think of it as a **triage tool** — it shows you *where* to look; the actual investigation and fixing happens afterwards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate the profiling report (~30–60 seconds)\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html('sweetviz_raw_report.html', open_browser=True)\n",
|
||||
"print('Report saved. Open sweetviz_raw_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Open the report and answer the following before moving on.\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be boolean or numeric? | *...* |\n",
|
||||
"| Are there numeric columns with suspicious ranges? | *...* |\n",
|
||||
"| How many distinct values does `region` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `purchase_amount`? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 — Navigate and Inspect with D-Tale\n",
|
||||
"\n",
|
||||
"Before writing any cleaning code, use D-Tale to browse the raw data and *see* the problems with your own eyes. You will not clean anything here — D-Tale is your inspection tool.\n",
|
||||
"\n",
|
||||
"**Launch D-Tale:**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('=' * 50)\n",
|
||||
"print('D-Tale is running.')\n",
|
||||
"print('Open this URL in your browser:', d._url)\n",
|
||||
"print('In VS Code: Ctrl+click the URL above.')\n",
|
||||
"print('=' * 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspection checklist\n",
|
||||
"\n",
|
||||
"Use D-Tale to confirm each issue SweetViz flagged. For each column, click the column header → **Describe** to see value counts and distribution.\n",
|
||||
"\n",
|
||||
"| What to inspect | How to do it in D-Tale | What you should see |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` unique values | Column header → Describe | 8 variants of True/False |\n",
|
||||
"| `region` unique values | Column header → Describe | ~32 variants of 5 region names |\n",
|
||||
"| `input_method` unique values | Column header → Describe | A typo: `controllr` |\n",
|
||||
"| `purchase_amount` raw values | Sort column ascending | Some values use comma: `1,80` |\n",
|
||||
"| `avg_fps` distribution | Column header → Describe | Max of 10,000 — clearly wrong |\n",
|
||||
"| Missing values overview | Top menu → Describe (all columns) | `gpu_model` dominates |\n",
|
||||
"\n",
|
||||
"<br>\n",
|
||||
"\n",
|
||||
"> Once you have seen the problems in the raw data, come back to the notebook for cleaning.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 — Clean with Pandas\n",
|
||||
"\n",
|
||||
"We will work through seven issue categories. Each section follows the same pattern:\n",
|
||||
"1. **Inspect** — confirm the problem in code\n",
|
||||
"2. **Fix** — apply the pandas transformation\n",
|
||||
"3. **Verify** — check the result\n",
|
||||
"\n",
|
||||
"We work on a copy of the original dataframe so the raw data is always available for comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Always work on a copy — keep df as the unchanged original\n",
|
||||
"df_clean = df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.1 — Boolean columns: inconsistent encoding\n",
|
||||
"\n",
|
||||
"Three columns (`crash_flag`, `is_featured_event`, `is_long_session`) each have **8 different representations** of the same two values: `True`, `False`, `true`, `false`, `1`, `0`, `Yes`, `No`.\n",
|
||||
"\n",
|
||||
"The fix is to define an explicit mapping and apply it with `.map()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — confirm the problem\n",
|
||||
"print('crash_flag unique values:', sorted(df_clean['crash_flag'].dropna().unique()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define the mapping for replacements\n",
|
||||
"# Why did I place True:True and False: False? Ideas?\n",
|
||||
"\n",
|
||||
"bool_map = {\n",
|
||||
" 'True': True, 'true': True, '1': True, 'Yes': True, True: True,\n",
|
||||
" 'False': False, 'false': False, '0': False, 'No': False, False: False\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"df_clean['crash_flag'] = df_clean['crash_flag'].map(bool_map)\n",
|
||||
"\n",
|
||||
"print('crash_flag after mapping:')\n",
|
||||
"print(df_clean['crash_flag'].value_counts())\n",
|
||||
"print('Nulls:', df_clean['crash_flag'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TO DO:\n",
|
||||
"# Apply the same mapping to the other two boolean columns\n",
|
||||
"# Follow the same pattern as above for is_featured_event and is_long_session\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.2 — Categorical columns: case and whitespace inconsistency\n",
|
||||
"\n",
|
||||
"Four columns have values that are logically identical but differ in case or surrounding whitespace:\n",
|
||||
"- `region` — 32 variants of 5 values (e.g. `us-west`, `US-WEST`, `Us-west`, `' us-west '`)\n",
|
||||
"- `map_name` — 36 variants of 6 values\n",
|
||||
"- `platform` — 32 variants of 6 values\n",
|
||||
"- `input_method` — 30 variants, including a **typo**: `controllr`\n",
|
||||
"\n",
|
||||
"The fix uses pandas string methods: `.str.strip()` removes surrounding whitespace, `.str.lower()` normalises case. They can be chained."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many unique values before cleaning?\n",
|
||||
"print('region unique before:', df_clean['region'].unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix region: strip whitespace and convert to lowercase\n",
|
||||
"df_clean['region'] = df_clean['region'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('region unique after:', df_clean['region'].unique())\n",
|
||||
"print(df_clean['region'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TO DO: \n",
|
||||
"# Apply the same strip + lower to map_name and platform\n",
|
||||
"# Follow the same pattern as above\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# input_method needs an extra step: fix the typo and standardise kb/m → kbm\n",
|
||||
"\n",
|
||||
"# Step 0: Inspect\n",
|
||||
"print('input_method unique before:', df_clean['input_method'].unique())\n",
|
||||
"\n",
|
||||
"# Step 1: strip and lowercase first\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].str.strip().str.lower()\n",
|
||||
"\n",
|
||||
"# Step 2: fix the two inconsistencies with replace()\n",
|
||||
"df_clean['input_method'] = df_clean['input_method'].replace({\n",
|
||||
" 'controllr': 'controller', \n",
|
||||
" 'kb/m': 'kbm' \n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# Verify — should now show exactly 3 unique values\n",
|
||||
"print('input_method unique after:', df_clean['input_method'].unique())\n",
|
||||
"print(df_clean['input_method'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.3 — `purchase_amount`: comma as decimal separator\n",
|
||||
"\n",
|
||||
"About 12% of rows use a comma instead of a decimal point (`1,80` instead of `1.80`). This prevented pandas from reading the column as numeric, so it was loaded as `object`.\n",
|
||||
"\n",
|
||||
"The fix: replace the comma in the string, then convert the column type."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows have a comma?\n",
|
||||
"comma_rows = df_clean['purchase_amount'].astype(str).str.contains(',', na=False)\n",
|
||||
"print(f'Rows with comma separator: {comma_rows.sum()}')\n",
|
||||
"print('Examples:', df_clean.loc[comma_rows, 'purchase_amount'].unique()[:6])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: replace comma with decimal point, then convert to float\n",
|
||||
"df_clean['purchase_amount'] = (\n",
|
||||
" df_clean['purchase_amount']\n",
|
||||
" .astype(str) # ensure we are working with strings\n",
|
||||
" .str.replace(',', '.', regex=False) # swap the separator\n",
|
||||
" .replace('nan', float('nan')) # restore actual NaN rows\n",
|
||||
" .astype(float) # convert to numeric\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Verify\n",
|
||||
"print('dtype:', df_clean['purchase_amount'].dtype)\n",
|
||||
"print(df_clean['purchase_amount'].describe().round(2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.4 — Missing values: decisions and strategy\n",
|
||||
"\n",
|
||||
"Not all missing values are the same. Before deciding what to do, you need to understand *why* the value is missing — the reason determines the correct action.\n",
|
||||
"\n",
|
||||
"| Column | Missing | Why | Decision |\n",
|
||||
"|---|---|---|---|\n",
|
||||
"| `gpu_model` | 66.7% | Console/mobile players have no GPU | Keep column — missingness is meaningful |\n",
|
||||
"| `build_version` | 16.5% | Not logged in older sessions | Keep as NaN — valid historical absence |\n",
|
||||
"| `device_temp_c` | 4.9% | Sensor not available on some devices | Keep as NaN |\n",
|
||||
"| `session_length_s` | 1.0% | Session ended abnormally | Drop missing rows now; fix negatives/outliers after datetime correction (section 4.6) |\n",
|
||||
"| `ping_ms`, `purchase_amount`, `end_time` | < 2% | Sporadic gaps | Keep as NaN |\n",
|
||||
"\n",
|
||||
"<br>\n",
|
||||
"\n",
|
||||
"> **⚠️ Context always matters.** There is no universal rule for missing values. The decisions above are reasonable for this dataset and analytical goal — but a different context might lead to different choices.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — missing value counts across all columns\n",
|
||||
"missing = df_clean.isnull().sum()\n",
|
||||
"missing_pct = (missing / len(df_clean) * 100).round(1)\n",
|
||||
"pd.DataFrame({'missing': missing, '%': missing_pct})[missing > 0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# session_length_s: drop rows where it is missing\n",
|
||||
"# Rationale: session duration is a core metric — a session with no recorded\n",
|
||||
"# duration is structurally incomplete and cannot be used for most analyses.\n",
|
||||
"# These 98 rows represent <1% of the dataset, so dropping is safe.\n",
|
||||
"\n",
|
||||
"rows_before = len(df_clean)\n",
|
||||
"df_clean = df_clean.dropna(subset=['session_length_s'])\n",
|
||||
"\n",
|
||||
"print(f'Rows dropped: {rows_before - len(df_clean)}')\n",
|
||||
"print(f'Rows remaining: {len(df_clean)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.5 — Outliers: `avg_fps`\n",
|
||||
"\n",
|
||||
"The `avg_fps` column has a maximum of 10,000 fps — physically impossible for a game running in real time. The 75th percentile is ~82 fps, confirming that 10,000 is a logging error, not an extreme but plausible value.\n",
|
||||
"\n",
|
||||
"**Decision:** set values above 300 fps to `NaN` rather than dropping the entire row. The rest of the data in those rows (crash flag, purchase amount, session type) is likely still valid — it would be wasteful to discard it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows are affected?\n",
|
||||
"threshold = 300\n",
|
||||
"outlier_mask = df_clean['avg_fps'] > threshold\n",
|
||||
"print(f'Rows with avg_fps > {threshold}: {outlier_mask.sum()}')\n",
|
||||
"print('\\navg_fps distribution (before fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: set outlier values to NaN using .loc with a boolean mask\n",
|
||||
"df_clean.loc[outlier_mask, 'avg_fps'] = float('nan')\n",
|
||||
"\n",
|
||||
"# Verify — max should now be well below 300\n",
|
||||
"print('avg_fps distribution (after fix):')\n",
|
||||
"print(df_clean['avg_fps'].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.6 — Datetime columns: mixed formats\n",
|
||||
"\n",
|
||||
"The `start_time` and `end_time` columns contain timestamps in at least four different formats:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"2025-07-18T18:32:00Z : ISO 8601 with UTC marker\n",
|
||||
"2025-07-18 20:03:21-05:00 : ISO 8601 with UTC offset\n",
|
||||
"20/10/2025 02:49 : European DD/MM/YYYY\n",
|
||||
"08/01/2025 06:35 : Ambiguous: US MM/DD or European DD/MM?\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Mixed datetime formats are one of the most complex cleaning problems because some ambiguities cannot be resolved automatically -- `08/01/2025` could be August 1st or January 8th, and no algorithm can determine which without external context.\n",
|
||||
"\n",
|
||||
"> **Connection to `session_length_s`:** The negative values and extreme outliers we saw earlier in `session_length_s` are not independent errors -- they are a *consequence* of this datetime problem. When `start_time` and `end_time` were recorded in different formats and misinterpreted, the pre-computed duration came out wrong. After fixing the timestamps, we will recompute `session_length_s` from scratch and validate the result.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — what does start_time actually look like?\n",
|
||||
"print('Sample values from start_time:')\n",
|
||||
"print(df_clean['start_time'].dropna().sample(8, random_state=42).tolist())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix: pd.to_datetime with utc=True normalises all timezone-aware formats to UTC.\n",
|
||||
"# errors='coerce' converts anything it cannot parse to NaT (Not a Time) instead of crashing.\n",
|
||||
"df_clean['start_time'] = pd.to_datetime(df_clean['start_time'], utc=True, errors='coerce')\n",
|
||||
"df_clean['end_time'] = pd.to_datetime(df_clean['end_time'], utc=True, errors='coerce')\n",
|
||||
"\n",
|
||||
"# Verify — check how many rows could not be parsed\n",
|
||||
"print('start_time dtype:', df_clean['start_time'].dtype)\n",
|
||||
"print('Unparsed start_time (NaT):', df_clean['start_time'].isna().sum())\n",
|
||||
"print('Unparsed end_time (NaT): ', df_clean['end_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recompute session_length_s from the corrected timestamps\n",
|
||||
"# Now that start_time and end_time are both timezone-aware UTC datetimes,\n",
|
||||
"# the subtraction is unambiguous. We convert the result to seconds.\n",
|
||||
"df_clean['session_length_s'] = (\n",
|
||||
" df_clean['end_time'] - df_clean['start_time']\n",
|
||||
").dt.total_seconds()\n",
|
||||
"\n",
|
||||
"print('session_length_s after recomputation:')\n",
|
||||
"print(df_clean['session_length_s'].describe().round(1))\n",
|
||||
"print(f'\\nNegative values: {(df_clean[\"session_length_s\"] < 0).sum()}')\n",
|
||||
"print(f'> 8h (28800s): {(df_clean[\"session_length_s\"] > 28800).sum()}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Any remaining negative values are rows where timestamps were genuinely\n",
|
||||
"# ambiguous and could not be resolved -- the computed duration is meaningless.\n",
|
||||
"# Set them to NaN rather than dropping the row.\n",
|
||||
"\n",
|
||||
"neg_mask = df_clean['session_length_s'] < 0\n",
|
||||
"df_clean.loc[neg_mask, 'session_length_s'] = float('nan')\n",
|
||||
"print(f'Negative durations set to NaN: {neg_mask.sum()}')\n",
|
||||
"\n",
|
||||
"# Values above 8 hours (28800s) are suspicious for a game session.\n",
|
||||
"# Inspect them before deciding.\n",
|
||||
"\n",
|
||||
"long_mask = df_clean['session_length_s'] > 28800\n",
|
||||
"print(f'\\nSessions > 8h: {long_mask.sum()}')\n",
|
||||
"print(df_clean.loc[long_mask, ['session_length_s', 'start_time', 'end_time']].head(5).to_string())\n",
|
||||
"\n",
|
||||
"# Decision: sessions > 8h are almost certainly logging errors (game left running,\n",
|
||||
"# server not recording session end). Set to NaN.\n",
|
||||
"# As always — this threshold is a judgement call that depends on the game and context.\n",
|
||||
"df_clean.loc[long_mask, 'session_length_s'] = float('nan')\n",
|
||||
"print(f'\\nSessions > 8h set to NaN: {long_mask.sum()}')\n",
|
||||
"print('\\nFinal session_length_s distribution:')\n",
|
||||
"print(df_clean['session_length_s'].describe().round(1))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Note:** The number of NaT values above reflects rows where pandas could not parse the format unambiguously. These are not errors in the code — they are genuinely ambiguous records that require a domain decision to resolve (e.g., knowing that the data source always uses DD/MM/YYYY).\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"**OPTIONAL — explore the unparsed rows**\n",
|
||||
"\n",
|
||||
"If you want to go further, the cells below help you examine which formats failed and attempt a two-pass parsing strategy. This is optional and not required to complete the lab.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL — Step 1: inspect the unparsed rows\n",
|
||||
"# We use the index of df_clean (not a boolean mask) to look up raw values in df,\n",
|
||||
"# since the two dataframes have different lengths after the dropna() in step 4.4.\n",
|
||||
"unparsed_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"raw_start = df.loc[unparsed_idx, 'start_time'].dropna()\n",
|
||||
"\n",
|
||||
"print(f'Rows still unparsed: {len(unparsed_idx)}')\n",
|
||||
"print('\\nSample raw values:')\n",
|
||||
"print(raw_start.unique()[:12])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Step 2: define a systematic multi-format parser\n",
|
||||
"#\n",
|
||||
"# Rather than guessing with dayfirst=True, we try explicit format strings\n",
|
||||
"# in sequence and stop as soon as one succeeds for each row.\n",
|
||||
"# This is precise and transparent — no silent inference.\n",
|
||||
"\n",
|
||||
"def try_formats(series, formats):\n",
|
||||
" \"\"\"Try explicit datetime format strings in order.\n",
|
||||
" Returns a UTC-aware Series; rows that match no format remain NaT.\"\"\"\n",
|
||||
" result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns, UTC]')\n",
|
||||
" remaining = series.copy()\n",
|
||||
" for fmt in formats:\n",
|
||||
" parsed = pd.to_datetime(remaining, format=fmt, errors='coerce', utc=True)\n",
|
||||
" resolved_idx = parsed.index[parsed.notna()] # use index labels, not boolean mask\n",
|
||||
" result.loc[resolved_idx] = parsed.loc[resolved_idx]\n",
|
||||
" remaining = remaining.drop(index=resolved_idx) # drop resolved rows by label\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"# Format strings to try, in order of specificity\n",
|
||||
"# DD/MM/YYYY is tried before MM/DD/YYYY because values where day > 12\n",
|
||||
"# can only be DD/MM — those are unambiguous and should be resolved first.\n",
|
||||
"# Values where day <= 12 will match both formats; the first one wins.\n",
|
||||
"# Those cases are genuinely ambiguous — we flag them separately below.\n",
|
||||
"candidate_formats = [\n",
|
||||
" '%d/%m/%Y %H:%M', # European with time: 20/10/2025 14:30\n",
|
||||
" '%m/%d/%Y %H:%M', # US with time: 10/20/2025 14:30\n",
|
||||
" '%d/%m/%Y', # European date only: 20/10/2025\n",
|
||||
" '%m/%d/%Y', # US date only: 10/20/2025\n",
|
||||
"]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Step 3: apply the systematic parser to unparsed rows\n",
|
||||
"raw_start = df.loc[unparsed_idx, 'start_time']\n",
|
||||
"raw_end = df.loc[unparsed_idx, 'end_time']\n",
|
||||
"\n",
|
||||
"resolved_start = try_formats(raw_start, candidate_formats)\n",
|
||||
"resolved_end = try_formats(raw_end, candidate_formats)\n",
|
||||
"\n",
|
||||
"df_clean.loc[unparsed_idx, 'start_time'] = resolved_start\n",
|
||||
"df_clean.loc[unparsed_idx, 'end_time'] = resolved_end\n",
|
||||
"\n",
|
||||
"print(f'Resolved in second pass: {resolved_start.notna().sum()}')\n",
|
||||
"print(f'Still NaT (truly ambiguous): {df_clean[\"start_time\"].isna().sum()}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Step 4: inspect truly ambiguous rows\n",
|
||||
"# Rows that are still NaT here matched none of the candidate formats (or had no raw value).\n",
|
||||
"# Note: dates with day <= 12 fit both DD/MM and MM/DD and were already parsed as DD/MM\n",
|
||||
"# in Step 2 (first format wins) — that is an assumption; confirm the data source convention.\n",
|
||||
"still_nat_idx = df_clean.index[df_clean['start_time'].isna()]\n",
|
||||
"if len(still_nat_idx) > 0:\n",
|
||||
" print('Truly ambiguous timestamps (cannot resolve without domain knowledge):')\n",
|
||||
" print(df.loc[still_nat_idx, ['start_time', 'end_time']].head(10).to_string())\n",
|
||||
"else:\n",
|
||||
" print('All timestamps resolved.')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTIONAL: Step 5: recompute session_length_s with the newly resolved timestamps\n",
|
||||
"# More rows now have valid start_time and end_time, so more durations can be recovered.\n",
|
||||
"df_clean['session_length_s'] = (\n",
|
||||
" df_clean['end_time'] - df_clean['start_time']\n",
|
||||
").dt.total_seconds()\n",
|
||||
"\n",
|
||||
"# Re-apply the same validation as before\n",
|
||||
"neg_mask = df_clean['session_length_s'] < 0\n",
|
||||
"long_mask = df_clean['session_length_s'] > 28800\n",
|
||||
"df_clean.loc[neg_mask | long_mask, 'session_length_s'] = float('nan')\n",
|
||||
"\n",
|
||||
"print('session_length_s after second-pass recomputation:')\n",
|
||||
"print(df_clean['session_length_s'].describe().round(1))\n",
|
||||
"print(f'\\nNaN values: {df_clean[\"session_length_s\"].isna().sum()}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 — Verify with D-Tale\n",
|
||||
"\n",
|
||||
"Reload the cleaned dataframe into D-Tale and visually confirm the fixes. This is a quick sanity check — you are looking for anything that looks wrong before committing to the cleaned dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shut down the previous D-Tale instance and reload with the clean data\n",
|
||||
"d.kill()\n",
|
||||
"d_clean = dtale.show(df_clean, host='127.0.0.1', subprocess=True, open_browser=True)\n",
|
||||
"print('Open cleaned data in D-Tale:', d_clean._url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In D-Tale, verify the following:\n",
|
||||
"\n",
|
||||
"| Column | What to check | Expected result |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `crash_flag` | Describe → value counts | Only `True` and `False` |\n",
|
||||
"| `region` | Describe → value counts | Exactly 5 values, all lowercase |\n",
|
||||
"| `input_method` | Describe → value counts | Exactly 3 values, no `controllr` |\n",
|
||||
"| `purchase_amount` | Describe → dtype and range | float64, no commas |\n",
|
||||
"| `avg_fps` | Describe → max | Below 300 |\n",
|
||||
"| `session_length_s` | Describe → min and max | No negatives, no values > 28800 |\n",
|
||||
"| `start_time` | Describe → dtype | datetime64 |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c8f0e03a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Debug\n",
|
||||
"\n",
|
||||
"# Test comparison column by column\n",
|
||||
"# for col in df_clean.columns:\n",
|
||||
"# try:\n",
|
||||
"# sv.compare([df[[col]], 'Raw'], [df_clean[[col]].reset_index(drop=True), 'Cleaned'])\n",
|
||||
"# except Exception as e:\n",
|
||||
"# print(f\"FAIL: {col} — {e}\")\n",
|
||||
"# else:\n",
|
||||
"# print(f\"ok: {col}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compare both versions of the dataset using SweetViz... \n",
|
||||
"# Not perfect, but it gives some basic information (e.g., it handles booleans vs. categoricals poorly, as in crash_flag)\n",
|
||||
"# We exclude these two columns because we converted them to datetime and SweetViz cannot compare them with the original data types\n",
|
||||
"\n",
|
||||
"exclude = ['start_time', 'end_time'] \n",
|
||||
"\n",
|
||||
"compare = sv.compare(\n",
|
||||
" [df.drop(columns=exclude), 'Raw'],\n",
|
||||
" [df_clean.drop(columns=exclude).reset_index(drop=True), 'Cleaned']\n",
|
||||
")\n",
|
||||
"compare.show_html('sweetviz_comparison_report.html', open_browser=True)\n",
|
||||
"print('Comparison report saved.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the comparison report, check that:\n",
|
||||
"- Boolean columns changed from TEXT → BOOL with only 2 distinct values\n",
|
||||
"- Categorical columns show dramatically reduced DISTINCT counts\n",
|
||||
"- `purchase_amount` changed from TEXT → NUMERIC\n",
|
||||
"- `avg_fps` maximum is no longer 10,000\n",
|
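||||
"- `session_length_s` has no negative values and none above 28,800 s (it will still show some missing values: the durations set to NaN in section 4.6)\n",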
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 6 — Save the Cleaned Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_A_indie_game_telemetry_clean.csv', index=False)\n",
|
||||
"print(f'Saved: {len(df_clean)} rows, {len(df_clean.columns)} columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Key Takeaways\n",
|
||||
"\n",
|
||||
"**Three tools, three roles — they complement each other:**\n",
|
||||
"- **SweetViz** surfaces issues fast but cannot fix them: use it for triage and validation\n",
|
||||
"- **D-Tale** lets you see the data as a human would: use it to understand problems before and after fixing them\n",
|
||||
"- **pandas** is where all actual cleaning happens: explicit, reproducible, and version-controllable\n",
|
||||
"\n",
|
||||
"**Cleaning decisions are not mechanical:**\n",
|
||||
"- Dropping `session_length_s` nulls was justified here: it would not be in every context\n",
|
||||
"- Setting `avg_fps` outliers to NaN (not dropping rows) preserved valid data in other columns\n",
|
||||
"- `gpu_model` missingness is structurally meaningful: imputing it would destroy information\n",
|
||||
"\n",
|
||||
"**Common issue categories you have now fixed with pandas:**\n",
|
||||
"\n",
|
||||
"| Issue | pandas approach |\n",
|
||||
"|---|---|\n",
|
||||
"| Boolean encoding chaos | `.map(bool_map)` |\n",
|
||||
"| Case / whitespace inconsistency | `.str.strip().str.lower()` |\n",
|
||||
"| Typos in categories | `.replace({'controllr': 'controller'})` |\n",
|
||||
"| Wrong decimal separator | `.str.replace(',', '.')` + `.astype(float)` |\n",
|
||||
"| Structural missing values | `dropna(subset=[...])` with explicit rationale |\n",
|
||||
"| Outliers | Boolean mask + `.loc[mask, col] = NaN` |\n",
|
||||
"| Mixed datetime formats | `pd.to_datetime(utc=True, errors='coerce')` |\n",
|
||||
"\n",
|
||||
"→ In **Task 3**, you will apply these skills independently to a new dataset — with a checklist but without step-by-step guidance."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
676
claude/lab02_task3_git_activity.ipynb
Normal file
676
claude/lab02_task3_git_activity.ipynb
Normal file
@@ -0,0 +1,676 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5,
|
||||
"metadata": {
|
||||
"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
|
||||
"language_info": {"name": "python", "version": "3.10.0"}
|
||||
},
|
||||
"cells": [
|
||||
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 · Task 3 — Independent EDA and Cleaning\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~20 minutes \n",
|
||||
"**Dataset:** `dataset_D_git_classroom_activity.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Context\n",
|
||||
"\n",
|
||||
"You have been handed an activity log from a Git-based classroom platform. It records **10,000 events** — commits, pull requests, CI runs, code reviews, and test runs — generated by students and bots across multiple repositories.\n",
|
||||
"\n",
|
||||
"Your goal is to apply the same EDA and cleaning pipeline from Task 2 to this new dataset. This time the guidance is lighter: each section tells you *what* to look for and *which tools and methods to use*, but the code is yours to write.\n",
|
||||
"\n",
|
||||
"### Pipeline reminder\n",
|
||||
"\n",
|
||||
"| Step | Tool | Goal |\n",
|
||||
"|---|---|---|\n",
|
||||
"| 1 — Load and inspect | pandas | Understand structure and inferred types |\n",
|
||||
"| 2 — Automated profiling | SweetViz | Triage issues across all columns |\n",
|
||||
"| 3 — Navigate and inspect | D-Tale | See problems with your own eyes |\n",
|
||||
"| 4 — Clean | pandas | Fix each issue with explicit, reproducible code |\n",
|
||||
"| 5 — Verify | D-Tale + SweetViz | Confirm fixes landed correctly |\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 — Load and Inspect"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv('dataset_D_git_classroom_activity.csv')\n",
|
||||
"\n",
|
||||
"# Inspect shape, column types, and first rows\n",
|
||||
"# Use: df.shape, df.dtypes, df.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **What to note:** Which columns were inferred as `object` but should be boolean or numeric? Any column that should be numeric but is `object` almost always signals a formatting problem in the raw values.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 — Automated Profiling with SweetViz\n",
|
||||
"\n",
|
||||
"Generate a SweetViz report on the raw dataset. Use it to fill in the triage checklist below before moving on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate the SweetViz report\n",
|
||||
"# Use: sv.analyze(df)\n",
|
||||
"# Save to 'sweetviz_git_raw.html'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Triage checklist\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most, and by how much? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be boolean? | *...* |\n",
|
||||
"| Which columns are shown as TEXT but should be numeric? | *...* |\n",
|
||||
"| How many distinct values does `event_type` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `ci_status` distinct values compared to `event_type`? | *...* |\n",
|
||||
"| Are there numeric columns with suspicious ranges? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 — Navigate and Inspect with D-Tale\n",
|
||||
"\n",
|
||||
"Launch D-Tale and use it to confirm each issue visually. Do not clean anything here."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Launch D-Tale\n",
|
||||
"# Use: dtale.show(df, host='127.0.0.1', subprocess=False, open_browser=False)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspection checklist\n",
|
||||
"\n",
|
||||
"For each item, use D-Tale's **column header → Describe** to inspect value counts and distribution.\n",
|
||||
"\n",
|
||||
"| What to inspect | What you should find |\n",
|
||||
"|---|---|\n",
|
||||
"| `is_weekend` unique values | 8 representations of True/False |\n",
|
||||
"| `event_type` unique values | Many case/whitespace variants of 7 event types |\n",
|
||||
"| `ci_status` unique values | Case/whitespace variants — but also: are FAILED and FAILURE the same thing? |\n",
|
||||
"| `os` unique values | WIN, Windows, win — which is the canonical form? |\n",
|
||||
"| `coverage_percent` raw values | Some use comma as decimal separator |\n",
|
||||
"| `pr_merge_time_hours` missing % | Very high — is this random or structural? |\n",
|
||||
"| `tests_failed` vs `tests_run` | Sort `tests_failed` descending — are there rows where it exceeds `tests_run`? |\n",
|
||||
"| `lines_added` distribution | Any extreme values? |\n",
|
||||
"| `pr_merge_time_hours` min | Any negative values? |\n",
|
||||
"| `commit_message_length` min | Any zero values? What would a zero-length commit message mean? |\n",
|
||||
"\n",
|
||||
"> **Note on `pr_merge_time_hours`:** Think carefully about why this column has so many missing values before deciding what to do. Look at the `event_type` column for rows where it is missing — does a pattern emerge?\n",
|
||||
"\n",
|
||||
"*(Record any additional observations below)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 — Clean with Pandas\n",
|
||||
"\n",
|
||||
"Work through each issue below. For each one: inspect → fix → verify. \n",
|
||||
"The first example in each category is more detailed; subsequent columns follow the same pattern.\n",
|
||||
"\n",
|
||||
"Start by creating a working copy:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean = df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.1 — Boolean columns\n",
|
||||
"\n",
|
||||
"**Columns:** `is_weekend`, `label_is_high_quality`, `exam_period` \n",
|
||||
"**Issue:** 8 different representations of True/False \n",
|
||||
"**Approach:** `.map()` with an explicit dictionary, same as Task 2 \n",
|
||||
"\n",
|
||||
"> **Hint:** Define the `bool_map` dictionary once and reuse it for all three columns. Include both string and boolean keys to make the mapping safe to re-run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect\n",
|
||||
"print(sorted(df_clean['is_weekend'].dropna().unique().tolist()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix is_weekend, label_is_high_quality, exam_period\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify — each column should have only True and False, 0 nulls\n",
|
||||
"for col in ['is_weekend', 'label_is_high_quality', 'exam_period']:\n",
|
||||
" print(f\"{col}: {df_clean[col].value_counts().to_dict()} | nulls: {df_clean[col].isna().sum()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.2 — `is_bot_user`: case and whitespace\n",
|
||||
"\n",
|
||||
"**Issue:** 6 variants of 2 values (`Human`, `Bot`) with mixed case and whitespace \n",
|
||||
"**Approach:** `.str.strip().str.lower()` — no typos, no synonym merging needed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect\n",
|
||||
"print(df_clean['is_bot_user'].value_counts().to_string())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix is_bot_user\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify — should show exactly 2 values: 'human' and 'bot'\n",
|
||||
"print(df_clean['is_bot_user'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.3 — Categorical columns: case and whitespace\n",
|
||||
"\n",
|
||||
"**Columns:** `dominant_language`, `editor`, `os`, `event_type` \n",
|
||||
"**Issue:** Many case/whitespace variants — strip and lowercase resolves most \n",
|
||||
"\n",
|
||||
"> **Note on `os`:** After stripping and lowercasing you will still have `win` and `windows` as separate values. Decide on a canonical form and merge them with `.replace()`.\n",
|
||||
"\n",
|
||||
"> **Note on `event_type`:** After stripping and lowercasing, verify the number of unique values matches the number of distinct event types you expect."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect dominant_language before\n",
|
||||
"print(f'dominant_language unique before: {df_clean[\"dominant_language\"].nunique()}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix dominant_language — strip and lowercase\n",
|
||||
"# Your code here\n",
|
||||
"\n",
|
||||
"# Apply the same to editor and event_type\n",
|
||||
"# Your code here\n",
|
||||
"\n",
|
||||
"# Fix os — strip, lowercase, then merge win/windows variants\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify\n",
|
||||
"for col in ['dominant_language', 'editor', 'os', 'event_type']:\n",
|
||||
" print(f\"{col} ({df_clean[col].nunique()} unique): {sorted(df_clean[col].dropna().unique().tolist())}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.4 — `ci_status`: case, whitespace, and synonym merging\n",
|
||||
"\n",
|
||||
"**Issue:** Case and whitespace variants — but also `FAILED` and `FAILURE` represent the same outcome and need to be merged into one canonical value. \n",
|
||||
"**Approach:** Strip and lowercase first, then use `.replace()` to merge synonyms.\n",
|
||||
"\n",
|
||||
"> **Decision to make:** After lowercasing, you will have `failed` and `failure` as separate values. Pick one as the canonical form and justify your choice in a markdown cell below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect\n",
|
||||
"print(df_clean['ci_status'].value_counts().to_string())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix ci_status — strip, lowercase, then merge synonyms\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify — should show exactly 4 values: success, failed, cancelled + your merged form\n",
|
||||
"print(df_clean['ci_status'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Your decision:** Which canonical form did you choose for `failed`/`failure`, and why?\n",
|
||||
"\n",
|
||||
"*(Double-click to write your answer)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.5 — `coverage_percent`: comma decimal separator and type conversion\n",
|
||||
"\n",
|
||||
"**Issue:** Loaded as `object` — some values use a comma instead of a decimal point \n",
|
||||
"**Approach:** Same as `purchase_amount` in Task 2 — `.str.replace()` then `.astype(float)`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect — how many rows have a comma?\n",
|
||||
"print(df_clean['coverage_percent'].dtype)\n",
|
||||
"comma_rows = df_clean['coverage_percent'].astype(str).str.contains(',', na=False)\n",
|
||||
"print(f'Rows with comma: {comma_rows.sum()}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix coverage_percent\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify\n",
|
||||
"print(f'dtype: {df_clean[\"coverage_percent\"].dtype}')\n",
|
||||
"print(df_clean['coverage_percent'].describe().round(2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.6 — Missing values: decisions and strategy\n",
|
||||
"\n",
|
||||
"This dataset has four columns with missing values. Inspect each one and decide what to do.\n",
|
||||
"\n",
|
||||
"| Column | Missing | Your hypothesis for why | Your decision |\n",
|
||||
"|---|---|---|---|\n",
|
||||
"| `pr_merge_time_hours` | 71.7% | *...* | *...* |\n",
|
||||
"| `commit_message_length` | 7.0% | *...* | *...* |\n",
|
||||
"| `build_duration_s` | 2.1% | *...* | *...* |\n",
|
||||
"| `time_to_ci_minutes` | 2.0% | *...* | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in the table)*\n",
|
||||
"\n",
|
||||
"> **Hint for `pr_merge_time_hours`:** Filter D-Tale to show only rows where `pr_merge_time_hours` is NOT null. What values appear in `event_type`? What does this tell you about why it is missing for the other rows?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Inspect missing counts\n",
|
||||
"missing = df_clean.isnull().sum()\n",
|
||||
"pct = (missing / len(df_clean) * 100).round(1)\n",
|
||||
"pd.DataFrame({'missing': missing, '%': pct})[missing > 0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Investigate pr_merge_time_hours — which event types have non-null values?\n",
|
||||
"print(df_clean.loc[df_clean['pr_merge_time_hours'].notna(), 'event_type'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply your decisions from the table above\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.7 — Outliers and impossible values\n",
|
||||
"\n",
|
||||
"Three issues to address:\n",
|
||||
"\n",
|
||||
"**A. `pr_merge_time_hours` — negative values** \n",
|
||||
"A negative merge time is impossible. Inspect the affected rows and set them to `NaN`. \n",
|
||||
"Use: boolean mask + `.loc[mask, col] = float('nan')`\n",
|
||||
"\n",
|
||||
"**B. `tests_failed > tests_run` — cross-column logical impossibility** \n",
|
||||
"231 rows have more failed tests than tests were run — physically impossible. This is a new type of issue: it requires checking consistency *between* two columns, not just inspecting one in isolation. \n",
|
||||
"Inspect the affected rows, then set `tests_failed` to `NaN` for those rows.\n",
|
||||
"\n",
|
||||
"**C. `lines_added` and `lines_deleted` — extreme outliers** \n",
|
||||
"Some commits add or delete thousands of lines — potentially valid (e.g. adding a large library) or a logging error. \n",
|
||||
"Inspect the affected rows before deciding. Document your threshold choice."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A — Inspect negative pr_merge_time_hours\n",
|
||||
"neg_mask = df_clean['pr_merge_time_hours'] < 0\n",
|
||||
"print(f'Negative pr_merge_time_hours: {neg_mask.sum()}')\n",
|
||||
"print(df_clean.loc[neg_mask, ['event_type', 'pr_merge_time_hours']].head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix A — set negative values to NaN\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# B — Inspect tests_failed > tests_run\n",
|
||||
"impossible_mask = df_clean['tests_failed'] > df_clean['tests_run']\n",
|
||||
"print(f'Rows where tests_failed > tests_run: {impossible_mask.sum()}')\n",
|
||||
"print(df_clean.loc[impossible_mask, ['tests_run', 'tests_failed']].describe().round(1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix B — set tests_failed to NaN for impossible rows\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# C — Inspect lines_added and lines_deleted outliers\n",
|
||||
"print('lines_added distribution:')\n",
|
||||
"print(df_clean['lines_added'].describe().round(1))\n",
|
||||
"print(f'\\nRows > 1000 lines added: {(df_clean[\"lines_added\"] > 1000).sum()}')\n",
|
||||
"print(df_clean.loc[df_clean['lines_added'] > 1000, \n",
|
||||
" ['event_type', 'lines_added', 'lines_deleted', 'dominant_language']].head(8).to_string())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fix C — apply your decision on lines_added and lines_deleted outliers\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Your decisions:** What thresholds did you use? What was your reasoning for each?\n",
|
||||
"\n",
|
||||
"*(Double-click to write your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 4.8 — `timestamp`: mixed datetime formats *(optional)*\n",
|
||||
"\n",
|
||||
"Like Task 2, the `timestamp` column contains mixed datetime formats. However, unlike Task 2, there is no derived column that depends on it — so the impact of unresolved timestamps is lower here.\n",
|
||||
"\n",
|
||||
"Apply a first-pass parse with `pd.to_datetime(utc=True, errors='coerce')`. Check how many rows remain unparsed. If you want to go further, apply the `try_formats()` strategy from Task 2's optional section."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Parse timestamp — first pass\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 — Verify with D-Tale"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reload D-Tale with the cleaned dataframe\n",
|
||||
"# Use: dtale.show(df_clean, host='127.0.0.1', subprocess=False, open_browser=False)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check each of the following in D-Tale:\n",
|
||||
"\n",
|
||||
"| Column | Expected result |\n",
|
||||
"|---|---|\n",
|
||||
"| `is_weekend`, `label_is_high_quality`, `exam_period` | Only `True` / `False` |\n",
|
||||
"| `is_bot_user` | Only `human` / `bot` |\n",
|
||||
"| `event_type` | Exactly 7 values, all lowercase |\n",
|
||||
"| `ci_status` | Exactly 4 values, no `failure`/`FAILED` duplicates |\n",
|
||||
"| `os` | Exactly 3 values, no `win`/`windows` duplicates |\n",
|
||||
"| `coverage_percent` | dtype = float64 |\n",
|
||||
"| `pr_merge_time_hours` | No negative values |\n",
|
||||
"| `tests_failed` | No values exceeding `tests_run` |\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 6 — Before vs After with SweetViz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate comparison report\n",
|
||||
"# Exclude timestamp if you converted it (same reason as Task 2)\n",
|
||||
"# Save to 'sweetviz_git_comparison.html'\n",
|
||||
"# Your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 7 — Save"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_D_git_classroom_activity_clean.csv', index=False)\n",
|
||||
"print(f'Saved: {len(df_clean)} rows, {len(df_clean.columns)} columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Reflection\n",
|
||||
"\n",
|
||||
"Answer the following before finishing:\n",
|
||||
"\n",
|
||||
"**1.** The `pr_merge_time_hours` column is missing for 71.7% of rows. Is this a data quality problem? Why or why not?\n",
|
||||
"\n",
|
||||
"**2.** You found rows where `tests_failed > tests_run`. What does this kind of cross-column check tell you that a single-column inspection would have missed?\n",
|
||||
"\n",
|
||||
"**3.** For `ci_status`, you had to decide whether `failed` and `failure` are the same thing. What kind of knowledge — beyond the data itself — did you need to make that decision?\n",
|
||||
"\n",
|
||||
"**4.** Compare this dataset to the telemetry dataset from Task 2. Which issues were the same? Which were new? What does that tell you about the generality of the cleaning skills you are building?\n",
|
||||
"\n",
|
||||
"*(Double-click to write your answers)*"
|
||||
]
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
388
claude/report.html
Normal file
388
claude/report.html
Normal file
File diff suppressed because one or more lines are too long
11985
claude/sweetviz_comparison_report.html
Normal file
11985
claude/sweetviz_comparison_report.html
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user