a lot
This commit is contained in:
320
EDA_Lab_Starter.ipynb
Normal file
320
EDA_Lab_Starter.ipynb
Normal file
@@ -0,0 +1,320 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Interactive EDA Lab Starter (SweetViz & D‑Tale)\n",
|
||||
"Use this notebook to explore the datasets and practice cleaning common EDA issues."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Optional installs:\n",
|
||||
"# %pip install -q sweetviz dtale pandas numpy\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>session_id</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>start_time</th>\n",
|
||||
" <th>end_time</th>\n",
|
||||
" <th>session_length_s</th>\n",
|
||||
" <th>region</th>\n",
|
||||
" <th>platform</th>\n",
|
||||
" <th>gpu_model</th>\n",
|
||||
" <th>avg_fps</th>\n",
|
||||
" <th>ping_ms</th>\n",
|
||||
" <th>map_name</th>\n",
|
||||
" <th>crash_flag</th>\n",
|
||||
" <th>purchase_amount</th>\n",
|
||||
" <th>party_size</th>\n",
|
||||
" <th>input_method</th>\n",
|
||||
" <th>build_version</th>\n",
|
||||
" <th>is_featured_event</th>\n",
|
||||
" <th>device_temp_c</th>\n",
|
||||
" <th>session_type</th>\n",
|
||||
" <th>is_long_session</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>sess_c2fba8e7f37a</td>\n",
|
||||
" <td>user_488</td>\n",
|
||||
" <td>2025-07-18T18:32:00Z</td>\n",
|
||||
" <td>2025-07-18 20:03:21-05:00</td>\n",
|
||||
" <td>5481.0</td>\n",
|
||||
" <td>us-west</td>\n",
|
||||
" <td>pc</td>\n",
|
||||
" <td>GTX1080</td>\n",
|
||||
" <td>83.52</td>\n",
|
||||
" <td>431.16</td>\n",
|
||||
" <td>ocean</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>0,00</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>85.6</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>sess_33d286298cf9</td>\n",
|
||||
" <td>user_1511</td>\n",
|
||||
" <td>2025-06-13 23:21:08+00:00</td>\n",
|
||||
" <td>2025-06-13 23:36:30+01:00</td>\n",
|
||||
" <td>922.0</td>\n",
|
||||
" <td>Us-east</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>72.75</td>\n",
|
||||
" <td>29.12</td>\n",
|
||||
" <td>desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>62.0</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>sess_be2bb4d8986a</td>\n",
|
||||
" <td>user_830</td>\n",
|
||||
" <td>2025-10-20 02:42:07-05:00</td>\n",
|
||||
" <td>20/10/2025 02:49</td>\n",
|
||||
" <td>451.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.2</td>\n",
|
||||
" <td>40.47</td>\n",
|
||||
" <td>Forest</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>TOUCH</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>sess_7f425ca9a0e2</td>\n",
|
||||
" <td>user_1</td>\n",
|
||||
" <td>08/01/2025 06:35</td>\n",
|
||||
" <td>2025-08-01T08:32:45Z</td>\n",
|
||||
" <td>7031.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>33.29</td>\n",
|
||||
" <td>92.4</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>17.55</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Controller</td>\n",
|
||||
" <td>1.3.2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>48.1</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>sess_5657e28b22ec</td>\n",
|
||||
" <td>user_211</td>\n",
|
||||
" <td>2025-09-08T23:41:44Z</td>\n",
|
||||
" <td>2025-09-09 00:32:59+01:00</td>\n",
|
||||
" <td>3075.0</td>\n",
|
||||
" <td>US-EAST</td>\n",
|
||||
" <td>switch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.96</td>\n",
|
||||
" <td>12.63</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>controllr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>54.7</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" session_id user_id start_time \\\n",
|
||||
"0 sess_c2fba8e7f37a user_488 2025-07-18T18:32:00Z \n",
|
||||
"1 sess_33d286298cf9 user_1511 2025-06-13 23:21:08+00:00 \n",
|
||||
"2 sess_be2bb4d8986a user_830 2025-10-20 02:42:07-05:00 \n",
|
||||
"3 sess_7f425ca9a0e2 user_1 08/01/2025 06:35 \n",
|
||||
"4 sess_5657e28b22ec user_211 2025-09-08T23:41:44Z \n",
|
||||
"\n",
|
||||
" end_time session_length_s region platform \\\n",
|
||||
"0 2025-07-18 20:03:21-05:00 5481.0 us-west pc \n",
|
||||
"1 2025-06-13 23:36:30+01:00 922.0 Us-east PlayStation \n",
|
||||
"2 20/10/2025 02:49 451.0 sa-east-1 PlayStation \n",
|
||||
"3 2025-08-01T08:32:45Z 7031.0 sa-east-1 PlayStation \n",
|
||||
"4 2025-09-09 00:32:59+01:00 3075.0 US-EAST switch \n",
|
||||
"\n",
|
||||
" gpu_model avg_fps ping_ms map_name crash_flag purchase_amount party_size \\\n",
|
||||
"0 GTX1080 83.52 431.16 ocean Yes 0,00 2 \n",
|
||||
"1 NaN 72.75 29.12 desert No 0.0 3 \n",
|
||||
"2 NaN 69.2 40.47 Forest False 0.0 5 \n",
|
||||
"3 NaN 33.29 92.4 Desert No 17.55 1 \n",
|
||||
"4 NaN 69.96 12.63 Desert False 0.0 2 \n",
|
||||
"\n",
|
||||
" input_method build_version is_featured_event device_temp_c session_type \\\n",
|
||||
"0 Touch NaN No 85.6 ranked \n",
|
||||
"1 Touch NaN 0 62.0 casual \n",
|
||||
"2 TOUCH 1.4 False 69.0 ranked \n",
|
||||
"3 Controller 1.3.2 0 48.1 casual \n",
|
||||
"4 controllr NaN 0 54.7 casual \n",
|
||||
"\n",
|
||||
" is_long_session \n",
|
||||
"0 True \n",
|
||||
"1 0 \n",
|
||||
"2 False \n",
|
||||
"3 True \n",
|
||||
"4 Yes "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"csv_path = 'dataset_A_indie_game_telemetry_v2.csv' # or D/E/F\n",
|
||||
"df_raw = pd.read_csv(csv_path, dtype=str)\n",
|
||||
"df_raw.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "84e6ef6f5e434536a0dcf69111e15ef9",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" | | [ 0%] 00:00 -> (? left)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n",
|
||||
"SweetViz report written to sweetviz_report.html\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sweetviz as sv\n",
|
||||
"report = sv.analyze(df_raw)\n",
|
||||
"report.show_html('sweetviz_report.html')\n",
|
||||
"print('SweetViz report written to sweetviz_report.html')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import dtale\n",
|
||||
"d = dtale.show(df_raw)\n",
|
||||
"d.open_browser()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## EDA Quests\n",
|
||||
"- Parse timestamps\n",
|
||||
"- Fix units (minutes vs seconds)\n",
|
||||
"- Normalize categories & booleans\n",
|
||||
"- Convert comma-decimals to floats\n",
|
||||
"- Identify & treat outliers\n",
|
||||
"- Re-run SweetViz and compare before/after\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
# VI_Lab_01_EDA
|
||||
|
||||
|
||||
## Datasaurus dozen
|
||||
https://cran.r-project.org/web/packages/datasauRus/index.html
|
||||
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -15,20 +15,390 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"id": "d9080704",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from ydata_profiling import ProfileReport\n",
|
||||
"import pyreadr"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "64c538cf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" event_id user_id repo_id timestamp \\\n",
|
||||
"0 evt_d351e59b15fd user_2432 repo_575 29/03/2025 17:26 \n",
|
||||
"1 evt_435c1b33622f user_2017 repo_1112 01/07/2025 12:20 \n",
|
||||
"2 evt_758099c90286 user_930 repo_103 2025-01-30 02:26:34+00:00 \n",
|
||||
"3 evt_312809052420 user_1892 repo_988 2025-03-21 08:01:25-05:00 \n",
|
||||
"4 evt_0b2d75d29ec3 user_2793 repo_419 2025-02-28 18:22:51-05:00 \n",
|
||||
"\n",
|
||||
" event_type lines_added lines_deleted files_changed dominant_language \\\n",
|
||||
"0 pr_opened 40 2 3 Python \n",
|
||||
"1 Commit 3 24 2 GO \n",
|
||||
"2 pr_merged 13 12 11 Rust \n",
|
||||
"3 pr_opened 28 6 3 C++ \n",
|
||||
"4 Review_comment 79 2 3 C++ \n",
|
||||
"\n",
|
||||
" ci_status ... time_to_ci_minutes build_duration_s tests_run tests_failed \\\n",
|
||||
"0 SUCCESS ... 13.38 493.98 115 8 \n",
|
||||
"1 FAILED ... 16.86 107.57 90 14 \n",
|
||||
"2 failed ... 448.32 193.38 92 6 \n",
|
||||
"3 SUCCESS ... NaN 498.92 177 12 \n",
|
||||
"4 failure ... 1.14 162.55 113 9 \n",
|
||||
"\n",
|
||||
" is_weekend pr_merge_time_hours label_is_high_quality exam_period \\\n",
|
||||
"0 1 54.6 0 true \n",
|
||||
"1 False NaN 0 No \n",
|
||||
"2 No 68.6 False false \n",
|
||||
"3 0 50.6 No No \n",
|
||||
"4 false NaN False false \n",
|
||||
"\n",
|
||||
" commit_message_length is_bot_user \n",
|
||||
"0 39.0 HUMAN \n",
|
||||
"1 65.0 Bot \n",
|
||||
"2 79.0 Human \n",
|
||||
"3 NaN BOT \n",
|
||||
"4 48.0 human \n",
|
||||
"\n",
|
||||
"[5 rows x 23 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"csv_path = 'dataset_D_git_classroom_activity_v2.csv'  # or A/E/F\n",
|
||||
"df = pd.read_csv(csv_path, dtype=str)\n",
|
||||
"\n",
|
||||
"print(df.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "8b5b7074",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "235ae862343743c189592812139566b2",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" | | [ 0%] 00:00 -> (? left)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sweetviz as sv\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html(\"sweetviz_report.html\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f9192f8c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import dtale\n",
|
||||
"d = dtale.show(df)\n",
|
||||
"d.open_browser()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "6588c7d5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6da981bddb8148c6b90b1e959bf69e81",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"d:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\ydata_profiling\\model\\typeset_relations.py:118: FutureWarning:\n",
|
||||
"\n",
|
||||
"In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`\n",
|
||||
"\n",
|
||||
"d:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\ydata_profiling\\model\\typeset_relations.py:118: FutureWarning:\n",
|
||||
"\n",
|
||||
"In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`\n",
|
||||
"\n",
|
||||
"d:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\ydata_profiling\\model\\typeset_relations.py:118: FutureWarning:\n",
|
||||
"\n",
|
||||
"In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`\n",
|
||||
"\n",
|
||||
"d:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\ydata_profiling\\model\\typeset_relations.py:118: FutureWarning:\n",
|
||||
"\n",
|
||||
"In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`\n",
|
||||
"\n",
|
||||
"100%|██████████| 23/23 [00:02<00:00, 10.28it/s]\n",
|
||||
"2026-02-22 18:18:00,663 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:00,665 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:01,181 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:01,184 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:01,769 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:01,771 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:02,102 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:02,104 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:06,881 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:06,883 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:15,152 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:15,153 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:15,879 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:15,881 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:16,450 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:16,451 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:17,432 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:17,434 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:18,477 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:18,478 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:18,927 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:18,928 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:19,290 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:19,291 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:19,537 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:19,539 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:23,884 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:23,886 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:32,551 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:32,552 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:33,203 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:33,206 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:33,680 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:33,682 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:34,583 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:34,585 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,018 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,020 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,361 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,362 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,606 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,608 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,731 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:35,732 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:40,821 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:40,824 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:49,090 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:49,092 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:49,640 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:49,642 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:50,002 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:50,004 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:50,868 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:50,869 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:51,180 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:51,185 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:55,498 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:55,504 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:59,720 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:18:59,726 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:05,342 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:05,349 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:13,151 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:13,157 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:26,049 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:26,055 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:30,583 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:30,588 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:35,069 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:35,073 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:38,983 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:38,988 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:45,280 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:45,294 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:52,574 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:52,589 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:59,780 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:19:59,793 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:09,561 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:09,574 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:20,546 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:20,559 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:37,612 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:37,625 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:45,120 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:45,134 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:52,334 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:20:52,339 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:00,146 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:00,159 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:07,049 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:07,051 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:07,799 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:07,801 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:08,453 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:08,455 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:08,989 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:08,991 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:13,508 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:13,511 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:21,077 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:21,079 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:22,011 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:22,013 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:22,775 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:22,776 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:23,964 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:23,966 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:24,694 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:24,696 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:25,246 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:25,248 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:25,721 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:25,723 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:26,091 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:26,093 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:30,494 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:30,495 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:42,759 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:42,760 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:43,548 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:43,550 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:44,121 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:44,122 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:45,072 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:45,073 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:45,610 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:45,612 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:46,597 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:46,599 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:47,495 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:47,497 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:48,310 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:48,312 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:52,223 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:52,226 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:55,903 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:55,907 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:57,084 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:57,087 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:58,045 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:58,047 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:59,518 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:21:59,520 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:00,485 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:00,487 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,013 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,015 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,438 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,439 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,743 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:01,745 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:05,940 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:05,942 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:12,901 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:12,903 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:13,646 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:13,648 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:14,203 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:14,205 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:15,198 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n",
|
||||
"2026-02-22 18:22:15,199 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e92ab4eef2e64b41a613807fdf8c6a7c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "aa07e39a84b748a7b81430aa794762bb",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Render HTML: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "1c42855b0063421daf892af1da802367",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Export report to file: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"profile = ProfileReport(df, title=\"Indie Games Telemetry Dataset Profile\", explorative=True)\n",
|
||||
"profile.to_file(\"ydata_profile_report.html\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
1847
claude/datasaurus_dozen.csv
Normal file
1847
claude/datasaurus_dozen.csv
Normal file
File diff suppressed because it is too large
Load Diff
10001
claude/dataset_A_indie_game_telemetry_v2.csv
Normal file
10001
claude/dataset_A_indie_game_telemetry_v2.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
claude/lab02_instructor_guide.docx
Normal file
BIN
claude/lab02_instructor_guide.docx
Normal file
Binary file not shown.
865
claude/lab02_task1_datasaurus.ipynb
Normal file
865
claude/lab02_task1_datasaurus.ipynb
Normal file
File diff suppressed because one or more lines are too long
829
claude/lab02_task2_telemetry.ipynb
Normal file
829
claude/lab02_task2_telemetry.ipynb
Normal file
@@ -0,0 +1,829 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lab 02 · Task 2 — Guided EDA and Data Cleaning with SweetViz & D-Tale\n",
|
||||
"\n",
|
||||
"**Estimated time:** ~50 minutes \n",
|
||||
"**Dataset:** `dataset_A_indie_game_telemetry_v2.csv`\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Objectives\n",
|
||||
"\n",
|
||||
"By the end of this task you will be able to:\n",
|
||||
"- Generate an automated EDA report with **SweetViz** to get a rapid overview of a dataset\n",
|
||||
"- Use **D-Tale** interactively to identify and fix data quality problems\n",
|
||||
"- Recognise the most common categories of data issues: inconsistent encoding, mixed types, excessive missingness, and outliers\n",
|
||||
"- Understand how interactive tools translate cleaning actions into pandas code\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### Context\n",
|
||||
"\n",
|
||||
"You have been handed a telemetry dataset from a small indie game studio. It contains **10,000 session records** with information about players, platforms, performance metrics, and purchases. Before any visualisation or analysis can be built on top of this data, it must be understood and cleaned.\n",
|
||||
"\n",
|
||||
"This is what real-world data looks like: messy, inconsistent, and in need of judgement calls — not just mechanical fixes.\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Part 1 — Setup and First Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Shape: (10000, 20)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>session_id</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>start_time</th>\n",
|
||||
" <th>end_time</th>\n",
|
||||
" <th>session_length_s</th>\n",
|
||||
" <th>region</th>\n",
|
||||
" <th>platform</th>\n",
|
||||
" <th>gpu_model</th>\n",
|
||||
" <th>avg_fps</th>\n",
|
||||
" <th>ping_ms</th>\n",
|
||||
" <th>map_name</th>\n",
|
||||
" <th>crash_flag</th>\n",
|
||||
" <th>purchase_amount</th>\n",
|
||||
" <th>party_size</th>\n",
|
||||
" <th>input_method</th>\n",
|
||||
" <th>build_version</th>\n",
|
||||
" <th>is_featured_event</th>\n",
|
||||
" <th>device_temp_c</th>\n",
|
||||
" <th>session_type</th>\n",
|
||||
" <th>is_long_session</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>sess_c2fba8e7f37a</td>\n",
|
||||
" <td>user_488</td>\n",
|
||||
" <td>2025-07-18T18:32:00Z</td>\n",
|
||||
" <td>2025-07-18 20:03:21-05:00</td>\n",
|
||||
" <td>5481.0</td>\n",
|
||||
" <td>us-west</td>\n",
|
||||
" <td>pc</td>\n",
|
||||
" <td>GTX1080</td>\n",
|
||||
" <td>83.52</td>\n",
|
||||
" <td>431.16</td>\n",
|
||||
" <td>ocean</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>0,00</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>85.6</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>sess_33d286298cf9</td>\n",
|
||||
" <td>user_1511</td>\n",
|
||||
" <td>2025-06-13 23:21:08+00:00</td>\n",
|
||||
" <td>2025-06-13 23:36:30+01:00</td>\n",
|
||||
" <td>922.0</td>\n",
|
||||
" <td>Us-east</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>72.75</td>\n",
|
||||
" <td>29.12</td>\n",
|
||||
" <td>desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>62.0</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>sess_be2bb4d8986a</td>\n",
|
||||
" <td>user_830</td>\n",
|
||||
" <td>2025-10-20 02:42:07-05:00</td>\n",
|
||||
" <td>20/10/2025 02:49</td>\n",
|
||||
" <td>451.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.20</td>\n",
|
||||
" <td>40.47</td>\n",
|
||||
" <td>Forest</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>TOUCH</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>sess_7f425ca9a0e2</td>\n",
|
||||
" <td>user_1</td>\n",
|
||||
" <td>08/01/2025 06:35</td>\n",
|
||||
" <td>2025-08-01T08:32:45Z</td>\n",
|
||||
" <td>7031.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>33.29</td>\n",
|
||||
" <td>92.40</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>17.55</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Controller</td>\n",
|
||||
" <td>1.3.2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>48.1</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>sess_5657e28b22ec</td>\n",
|
||||
" <td>user_211</td>\n",
|
||||
" <td>2025-09-08T23:41:44Z</td>\n",
|
||||
" <td>2025-09-09 00:32:59+01:00</td>\n",
|
||||
" <td>3075.0</td>\n",
|
||||
" <td>US-EAST</td>\n",
|
||||
" <td>switch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.96</td>\n",
|
||||
" <td>12.63</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>controllr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>54.7</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" session_id user_id start_time \\\n",
|
||||
"0 sess_c2fba8e7f37a user_488 2025-07-18T18:32:00Z \n",
|
||||
"1 sess_33d286298cf9 user_1511 2025-06-13 23:21:08+00:00 \n",
|
||||
"2 sess_be2bb4d8986a user_830 2025-10-20 02:42:07-05:00 \n",
|
||||
"3 sess_7f425ca9a0e2 user_1 08/01/2025 06:35 \n",
|
||||
"4 sess_5657e28b22ec user_211 2025-09-08T23:41:44Z \n",
|
||||
"\n",
|
||||
" end_time session_length_s region platform \\\n",
|
||||
"0 2025-07-18 20:03:21-05:00 5481.0 us-west pc \n",
|
||||
"1 2025-06-13 23:36:30+01:00 922.0 Us-east PlayStation \n",
|
||||
"2 20/10/2025 02:49 451.0 sa-east-1 PlayStation \n",
|
||||
"3 2025-08-01T08:32:45Z 7031.0 sa-east-1 PlayStation \n",
|
||||
"4 2025-09-09 00:32:59+01:00 3075.0 US-EAST switch \n",
|
||||
"\n",
|
||||
" gpu_model avg_fps ping_ms map_name crash_flag purchase_amount party_size \\\n",
|
||||
"0 GTX1080 83.52 431.16 ocean Yes 0,00 2 \n",
|
||||
"1 NaN 72.75 29.12 desert No 0.0 3 \n",
|
||||
"2 NaN 69.20 40.47 Forest False 0.0 5 \n",
|
||||
"3 NaN 33.29 92.40 Desert No 17.55 1 \n",
|
||||
"4 NaN 69.96 12.63 Desert False 0.0 2 \n",
|
||||
"\n",
|
||||
" input_method build_version is_featured_event device_temp_c session_type \\\n",
|
||||
"0 Touch NaN No 85.6 ranked \n",
|
||||
"1 Touch NaN 0 62.0 casual \n",
|
||||
"2 TOUCH 1.4 False 69.0 ranked \n",
|
||||
"3 Controller 1.3.2 0 48.1 casual \n",
|
||||
"4 controllr NaN 0 54.7 casual \n",
|
||||
"\n",
|
||||
" is_long_session \n",
|
||||
"0 True \n",
|
||||
"1 0 \n",
|
||||
"2 False \n",
|
||||
"3 True \n",
|
||||
"4 Yes "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry_v2.csv')\n",
|
||||
"\n",
|
||||
"print(f'Shape: {df.shape}')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Column types (as loaded):\n",
|
||||
"session_id object\n",
|
||||
"user_id object\n",
|
||||
"start_time object\n",
|
||||
"end_time object\n",
|
||||
"session_length_s float64\n",
|
||||
"region object\n",
|
||||
"platform object\n",
|
||||
"gpu_model object\n",
|
||||
"avg_fps float64\n",
|
||||
"ping_ms float64\n",
|
||||
"map_name object\n",
|
||||
"crash_flag object\n",
|
||||
"purchase_amount object\n",
|
||||
"party_size int64\n",
|
||||
"input_method object\n",
|
||||
"build_version object\n",
|
||||
"is_featured_event object\n",
|
||||
"device_temp_c float64\n",
|
||||
"session_type object\n",
|
||||
"is_long_session object\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Quick look at column types as pandas inferred them\n",
|
||||
"print('Column types (as loaded):')\n",
|
||||
"print(df.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ Notice:** Several columns that should be boolean (`crash_flag`, `is_featured_event`, `is_long_session`) or numeric (`purchase_amount`) have been inferred as `object`. This is your first signal that something is wrong.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 2 — Automated Overview with SweetViz\n",
|
||||
"\n",
|
||||
"Before diving into manual inspection, generate a SweetViz report. This gives you a visual overview of every column in one step — distributions, types, missing values, and anomalies.\n",
|
||||
"\n",
|
||||
"**Think of SweetViz as your \"triage\" tool.** It shows you *where* to look; D-Tale is where you look *closely*."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "bd10cd653e7a47f891552a79e946376c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" | | [ 0%] 00:00 -> (? left)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Report sweetviz_raw_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n",
|
||||
"Report saved as sweetviz_raw_report.html — open it in your browser.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Generate the SweetViz report\n",
|
||||
"# This may take 30–60 seconds\n",
|
||||
"report = sv.analyze(df_raw)\n",
|
||||
"report.show_html('sweetviz_raw_report.html')\n",
|
||||
"\n",
|
||||
"print('Report saved as sweetviz_raw_report.html — open it in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 📋 SweetViz Checklist — What to look for\n",
|
||||
"\n",
|
||||
"Open `sweetviz_raw_report.html` and answer the following questions. Write your findings below before moving on.\n",
|
||||
"\n",
|
||||
"| Question | Your finding |\n",
|
||||
"|---|---|\n",
|
||||
"| Which columns have missing values? Which has the most? | *...* |\n",
|
||||
"| Which columns are listed as TEXT but should be numeric or boolean? | *...* |\n",
|
||||
"| Are there any numeric columns with suspicious ranges (very high max or very low min)? | *...* |\n",
|
||||
"| How many unique values does `region` have? Does that seem right? | *...* |\n",
|
||||
"| What is unusual about `purchase_amount`? | *...* |\n",
|
||||
"\n",
|
||||
"*(Double-click to fill in your answers)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 3 — Deep Inspection and Cleaning with D-Tale\n",
|
||||
"\n",
|
||||
"D-Tale opens the dataset in an interactive grid. You can sort, filter, inspect, and clean without writing a single line of pandas — but D-Tale records every action as code you can export later.\n",
|
||||
"\n",
|
||||
"**Launch D-Tale now:**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2026-02-22 20:12:55,619 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Open D-Tale at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Launch D-Tale with the raw dataset\n",
|
||||
"# A link will appear — click it to open D-Tale in a new browser tab\n",
|
||||
"d = dtale.show(df_raw, host='127.0.0.1', subprocess=False, open_browser=True)\n",
|
||||
"print(\"Open D-Tale at:\", d._url) # URL of this D-Tale instance\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c2e5293",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "bad operand type for abs(): 'str'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[16]\u001b[39m\u001b[32m, line 21\u001b[39m\n\u001b[32m 18\u001b[39m \tstr_data = pd.to_numeric(s, errors=\u001b[33m'\u001b[39m\u001b[33mcoerce\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 19\u001b[39m pd.Series(str_data, name=\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m, index=s.index)\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m df[\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpurchase_amount\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:1722\u001b[39m, in \u001b[36mNDFrame.abs\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1654\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 1655\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mabs\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Self:\n\u001b[32m 1656\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1657\u001b[39m \u001b[33;03m Return a Series/DataFrame with absolute numeric value of each element.\u001b[39;00m\n\u001b[32m 1658\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 1720\u001b[39m \u001b[33;03m 3 7 40 -50\u001b[39;00m\n\u001b[32m 1721\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1722\u001b[39m res_mgr = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_mgr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1723\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(\n\u001b[32m 1724\u001b[39m \u001b[38;5;28mself\u001b[39m, name=\u001b[33m\"\u001b[39m\u001b[33mabs\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1725\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:361\u001b[39m, in \u001b[36mBaseBlockManager.apply\u001b[39m\u001b[34m(self, f, align_keys, **kwargs)\u001b[39m\n\u001b[32m 358\u001b[39m kwargs[k] = obj[b.mgr_locs.indexer]\n\u001b[32m 360\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(f):\n\u001b[32m--> \u001b[39m\u001b[32m361\u001b[39m applied = \u001b[43mb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 363\u001b[39m applied = \u001b[38;5;28mgetattr\u001b[39m(b, f)(**kwargs)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:395\u001b[39m, in \u001b[36mBlock.apply\u001b[39m\u001b[34m(self, func, **kwargs)\u001b[39m\n\u001b[32m 389\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 390\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mapply\u001b[39m(\u001b[38;5;28mself\u001b[39m, func, **kwargs) -> \u001b[38;5;28mlist\u001b[39m[Block]:\n\u001b[32m 391\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 392\u001b[39m \u001b[33;03m apply the function to my values; return a block if we are not\u001b[39;00m\n\u001b[32m 393\u001b[39m \u001b[33;03m one\u001b[39;00m\n\u001b[32m 394\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m395\u001b[39m result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 397\u001b[39m result = maybe_coerce_values(result)\n\u001b[32m 398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._split_op_result(result)\n",
|
||||
"\u001b[31mTypeError\u001b[39m: bad operand type for abs(): 'str'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):\n",
|
||||
"\tdf = df.to_frame(index=False)\n",
|
||||
"\n",
|
||||
"# remove any pre-existing indices for ease of use in the D-Tale code, but this is not required\n",
|
||||
"df = df.reset_index().drop('index', axis=1, errors='ignore')\n",
|
||||
"df.columns = [str(c) for c in df.columns] # update columns to strings in case they are numbers\n",
|
||||
"\n",
|
||||
"df['purchase_amount'] = df['purchase_amount'].str.replace(',', '.', case=False, regex='False')\n",
|
||||
"df['purchase_amount'] = s = df['purchase_amount'] \n",
|
||||
"\n",
|
||||
"if s.str.startswith('0x').any():\n",
|
||||
"\tstr_data = s.apply(float.fromhex)\n",
|
||||
"else:\n",
|
||||
"\tstr_data = pd.to_numeric(s, errors='coerce')\n",
|
||||
"\t\n",
|
||||
"pd.Series(str_data, name='purchase_amount', index=s.index)\n",
|
||||
"\n",
|
||||
"df['purchase_amount'] = df['purchase_amount'].abs()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "8180fa05",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2026-02-22 20:18:35,563 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Open D-Tale at: http://127.0.0.1:40000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Launch D-Tale with the working dataframe `df`\n",
|
||||
"# A link will appear — click it to open D-Tale in a new browser tab\n",
|
||||
"d = dtale.show(df, host='127.0.0.1', subprocess=False, open_browser=True)\n",
|
||||
"print(\"Open D-Tale at:\", d._url) # URL of this D-Tale instance\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "745a5655",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" TCP 169.254.62.24:40000 0.0.0.0:0 LISTENING 11972\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check if something else is already listening on port 40000 (Windows-only: relies on netstat -ano and findstr)\n",
|
||||
"import subprocess\n",
|
||||
"result = subprocess.run('netstat -ano | findstr :40000', shell=True, capture_output=True, text=True)\n",
|
||||
"print(result.stdout or \"Nothing on port 40000\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 1 — Missing Values\n",
|
||||
"\n",
|
||||
"In D-Tale, go to **\"Describe\"** (top menu → Describe) to see the missing value counts per column.\n",
|
||||
"\n",
|
||||
"You will find:\n",
|
||||
"\n",
|
||||
"| Column | Missing | Note |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `gpu_model` | ~67% | Most players are on console — GPU does not apply |\n",
|
||||
"| `build_version` | ~17% | Not recorded in older sessions |\n",
|
||||
"| `device_temp_c` | ~5% | Sensor not available on some devices |\n",
|
||||
"| `session_length_s` | ~1% | Session ended abnormally (crash?) |\n",
|
||||
"| `ping_ms`, `purchase_amount`, `end_time` | <2% | Sporadic gaps |\n",
|
||||
"\n",
|
||||
"**Cleaning decisions to make in D-Tale:**\n",
|
||||
"\n",
|
||||
"1. **`gpu_model`** — This column is missing for 67% of rows. Rather than imputing, consider: is this column useful for a console/mobile player? Go to **Column Actions → Delete Column** and remove it. Alternatively, you can keep it and decide during analysis.\n",
|
||||
"\n",
|
||||
"2. **`build_version`** — Missings are structurally valid (older sessions). Keep the column; do not impute.\n",
|
||||
"\n",
|
||||
"3. **Remaining columns** — Leave missing values in place for now. We will handle them during analysis when context is clearer.\n",
|
||||
"\n",
|
||||
"> 📝 **Record your decisions:** Which columns did you keep? Which did you drop? Why?\n",
|
||||
"\n",
|
||||
"*(Double-click to write your decisions here)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 2 — Boolean Columns with Mixed Encodings\n",
|
||||
"\n",
|
||||
"Three columns represent true/false flags but were stored with at least **8 different representations**:\n",
|
||||
"\n",
|
||||
"- `crash_flag` → `Yes`, `No`, `True`, `False`, `true`, `false`, `1`, `0`\n",
|
||||
"- `is_featured_event` → same 8 representations \n",
|
||||
"- `is_long_session` → same 8 representations\n",
|
||||
"\n",
|
||||
"**In D-Tale, clean each column:**\n",
|
||||
"\n",
|
||||
"1. Click the column header → **Column Actions → Type Conversion**\n",
|
||||
"2. Select **String to Bool** (D-Tale will map Yes/True/1 → True and No/False/0 → False)\n",
|
||||
"3. Preview the result before applying\n",
|
||||
"4. Repeat for all three columns\n",
|
||||
"\n",
|
||||
"> 💡 **Alternative via Find & Replace:** If Type Conversion does not cover all variants, use **Column Actions → Replace** to manually map unusual values (e.g., `Yes` → `True`) before converting.\n",
|
||||
"\n",
|
||||
"After cleaning, verify with Describe: each column should show only `True` and `False`.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 3 — Categorical Columns: Case and Whitespace Chaos\n",
|
||||
"\n",
|
||||
"Four categorical columns have serious inconsistency:\n",
|
||||
"\n",
|
||||
"- `region` — 32 variants of 5 values (e.g., `us-west`, `US-WEST`, `Us-west`, `' us-west '`)\n",
|
||||
"- `map_name` — 36 variants of 6 values\n",
|
||||
"- `platform` — 32 variants of 6 values\n",
|
||||
"- `input_method` — 30 variants of 3 values, including a typo: `controllr` instead of `controller`\n",
|
||||
"\n",
|
||||
"**Clean each column in D-Tale:**\n",
|
||||
"\n",
|
||||
"1. Click column header → **Column Actions → Type Conversion → String Cleaning**\n",
|
||||
"2. Apply **Strip whitespace** and **Lowercase** (or **Uppercase** — be consistent)\n",
|
||||
"3. For `input_method`, also apply a **Replace** to fix `controllr` → `controller` and `kb/m` → `kbm` (pick one variant and standardise)\n",
|
||||
"\n",
|
||||
"After cleaning, each column should have the expected number of unique values:\n",
|
||||
"\n",
|
||||
"| Column | Before | After |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `region` | 32 | 5 |\n",
|
||||
"| `map_name` | 36 | 6 |\n",
|
||||
"| `platform` | 32 | 6 |\n",
|
||||
"| `input_method` | 30 | 3 |\n",
|
||||
"\n",
|
||||
"> Use **Describe → value_counts** to verify before and after each fix.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 4 — `purchase_amount`: Comma as Decimal Separator\n",
|
||||
"\n",
|
||||
"Some rows contain values like `\"0,00\"` and `\"1,80\"` where a comma was used instead of a decimal point. This prevents pandas from reading the column as numeric.\n",
|
||||
"\n",
|
||||
"**In D-Tale:**\n",
|
||||
"\n",
|
||||
"1. Filter the column to show only rows where the value contains a comma: **Column Actions → Filter → contains `,`**\n",
|
||||
"2. Apply a **Replace**: replace `,` with `.` in the column\n",
|
||||
"3. Then convert the column type: **Column Actions → Type Conversion → Float**\n",
|
||||
"\n",
|
||||
"> After conversion, verify the column dtype and check the range (min/max) with Describe.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 5 — Outliers in Numeric Columns\n",
|
||||
"\n",
|
||||
"The SweetViz report and D-Tale Describe should have flagged suspicious ranges. Check these now:\n",
|
||||
"\n",
|
||||
"| Column | Suspicious value | Likely explanation |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `avg_fps` | max = 10,000 | Sensor error or logging bug — physically impossible |\n",
|
||||
"| `ping_ms` | max = 627 ms | High but plausible for satellite connections |\n",
|
||||
"| `device_temp_c` | max = 100°C | Right at thermal throttling limit — possible but worth flagging |\n",
|
||||
"\n",
|
||||
"**In D-Tale, investigate `avg_fps`:**\n",
|
||||
"\n",
|
||||
"1. Use **Charts** (top menu) to plot a histogram of `avg_fps` — does it show an extreme outlier spike?\n",
|
||||
"2. Use **Filter** to see how many rows have `avg_fps > 300` (a hard upper bound for realistic gameplay)\n",
|
||||
"3. **Decide:** Should these rows be dropped, or should the value be set to `NaN` to mark it as invalid?\n",
|
||||
"4. Apply your decision via **Column Actions → Replace** or a row-level **Filter + Delete**\n",
|
||||
"\n",
|
||||
"> 📝 **Record your decision and reasoning:** What threshold did you use? How many rows were affected?\n",
|
||||
"\n",
|
||||
"*(Double-click to write your answer here)*\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"### 🔍 Issue 6 — Mixed Datetime Formats\n",
|
||||
"\n",
|
||||
"The `start_time` and `end_time` columns contain timestamps in multiple formats:\n",
|
||||
"\n",
|
||||
"- ISO 8601 with timezone: `2025-07-18T18:32:00Z`\n",
|
||||
"- ISO with offset: `2025-07-18 20:03:21-05:00` \n",
|
||||
"- European: `20/10/2025 02:49` (day/month/year)\n",
|
||||
"- US: `08/01/2025 06:35` (month/day/year — indistinguishable from the European form whenever the day is 12 or lower)\n",
|
||||
"\n",
|
||||
"This is one of the harder issues to fix entirely within D-Tale's UI. For now:\n",
|
||||
"\n",
|
||||
"1. In D-Tale, go to **Column Actions → Type Conversion** on `start_time` and try **String to Date** with `infer_datetime_format=True`\n",
|
||||
"2. Check how many values fail to parse (shown as NaT after conversion)\n",
|
||||
"3. Make note of any unresolved formats — these will need to be handled in pandas with `pd.to_datetime(..., errors='coerce')` and may require a more careful cleaning pass\n",
|
||||
"\n",
|
||||
"> ⚠️ **Key insight:** Not all cleaning can be done point-and-click. Some issues require programmatic resolution. This is where the code D-Tale generates becomes valuable.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 4 — Export the Cleaning Code from D-Tale\n",
|
||||
"\n",
|
||||
"Every cleaning action you performed in D-Tale was recorded as pandas code. Let's export and inspect it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieve the cleaned dataframe from D-Tale\n",
|
||||
"# (This reflects all changes made in the D-Tale UI)\n",
|
||||
"df_clean = d.data.copy()\n",
|
||||
"\n",
|
||||
"print(f'Cleaned shape: {df_clean.shape}')\n",
|
||||
"print('\\nColumn types after cleaning:')\n",
|
||||
"print(df_clean.dtypes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# D-Tale also lets you export the complete cleaning pipeline as Python code.\n",
|
||||
"# In the D-Tale UI: click the code icon (</>) in the top-right corner → \"Export Code\"\n",
|
||||
"# Paste the exported code below:\n",
|
||||
"\n",
|
||||
"# --- Paste D-Tale exported code here ---\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 4.1 — Manual refinement in pandas\n",
|
||||
"\n",
|
||||
"D-Tale generates the skeleton; pandas lets you add precision. Here is an example of cleaning the `start_time` and `end_time` columns more robustly — something D-Tale's UI cannot fully handle."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example: robust datetime parsing for mixed-format timestamps\n",
|
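||||
"# utc=True converts every UTC offset to UTC and treats naive timestamps as UTC; on pandas >= 2.0 mixed formats also need format='mixed' (plus a dayfirst choice), otherwise rows that differ from the first inferred format may be coerced to NaT\n",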
|
||||
"df_clean['start_time'] = pd.to_datetime(df_clean['start_time'], utc=True, errors='coerce')\n",
|
||||
"df_clean['end_time'] = pd.to_datetime(df_clean['end_time'], utc=True, errors='coerce')\n",
|
||||
"\n",
|
||||
"# Check how many rows could not be parsed\n",
|
||||
"print('Unparsed start_time rows:', df_clean['start_time'].isna().sum())\n",
|
||||
"print('Unparsed end_time rows: ', df_clean['end_time'].isna().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example: invalidate avg_fps outliers (adjust the threshold based on your decision above)\n",
|
||||
"# Replace values > 300 with NaN to mark them as invalid rather than deleting rows\n",
|
||||
"fps_threshold = 300\n",
|
||||
"n_outliers = (df_clean['avg_fps'] > fps_threshold).sum()\n",
|
||||
"df_clean.loc[df_clean['avg_fps'] > fps_threshold, 'avg_fps'] = float('nan')\n",
|
||||
"\n",
|
||||
"print(f'Rows with avg_fps > {fps_threshold} set to NaN: {n_outliers}')\n",
|
||||
"print(f'avg_fps range after: {df_clean[\"avg_fps\"].min():.1f} – {df_clean[\"avg_fps\"].max():.1f}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 5 — Validation: Before vs After\n",
|
||||
"\n",
|
||||
"The real test of cleaning work is a comparison report. SweetViz can compare two dataframes side by side."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate a comparison report: raw vs cleaned\n",
|
||||
"# This may take 60–90 seconds\n",
|
||||
"compare_report = sv.compare([df_raw, 'Raw'], [df_clean, 'Cleaned'])\n",
|
||||
"compare_report.show_html('sweetviz_comparison_report.html', open_browser=False)\n",
|
||||
"\n",
|
||||
"print('Comparison report saved — open sweetviz_comparison_report.html in your browser.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Open the comparison report and verify:\n",
|
||||
"\n",
|
||||
"- ✅ Boolean columns now show only `True` / `False`\n",
|
||||
"- ✅ Categorical columns have the expected number of unique values\n",
|
||||
"- ✅ `purchase_amount` is now numeric\n",
|
||||
"- ✅ `avg_fps` no longer has a 10,000 outlier\n",
|
||||
"- ✅ Missing value counts have changed as expected (e.g. `avg_fps` gains NaNs where outliers were invalidated; unparseable timestamps become NaT)\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Part 6 — Save the Cleaned Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_clean.to_csv('dataset_A_indie_game_telemetry_clean.csv', index=False)\n",
|
||||
"print('Cleaned dataset saved.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 🔑 Key Takeaways\n",
|
||||
"\n",
|
||||
"- **SweetViz** gives you a rapid automated overview — use it at the start and for before/after comparison. It does not clean; it informs.\n",
|
||||
"- **D-Tale** lets you explore interactively, spot patterns, and clean through a UI. Every action is tracked as pandas code, so you are never locked into the GUI.\n",
|
||||
"- **Pandas** remains essential for edge cases: complex datetime parsing, conditional logic, and anything requiring programmatic iteration.\n",
|
||||
"- The three tools form a pipeline: **SweetViz (triage) → D-Tale (interactive cleaning) → pandas (refinement)**.\n",
|
||||
"\n",
|
||||
"**Common issue categories you have now seen:**\n",
|
||||
"\n",
|
||||
"| Category | Example in this dataset |\n",
|
||||
"|---|---|\n",
|
||||
"| Boolean encoding inconsistency | 8 representations of True/False |\n",
|
||||
"| Categorical case/whitespace chaos | 32 variants of 5 region names |\n",
|
||||
"| Typos in categories | `controllr` vs `controller` |\n",
|
||||
"| Wrong decimal separator | `1,80` instead of `1.80` |\n",
|
||||
"| Structural missingness | `gpu_model` absent for console players |\n",
|
||||
"| Sensor/logging outliers | `avg_fps = 10,000` |\n",
|
||||
"| Mixed datetime formats | ISO 8601 mixed with European (DD/MM/YYYY) and US (MM/DD/YYYY) dates |\n",
|
||||
"\n",
|
||||
"→ In **Task 3**, you will apply these same skills independently to a new dataset — with less guidance."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
11968
claude/sweetviz_raw_report.html
Normal file
11968
claude/sweetviz_raw_report.html
Normal file
File diff suppressed because one or more lines are too long
BIN
data/box_plots.rda
Normal file
BIN
data/box_plots.rda
Normal file
Binary file not shown.
BIN
data/box_plots_long.rda
Normal file
BIN
data/box_plots_long.rda
Normal file
Binary file not shown.
BIN
data/datasaurus_dozen.rda
Normal file
BIN
data/datasaurus_dozen.rda
Normal file
Binary file not shown.
BIN
data/datasaurus_dozen_wide.rda
Normal file
BIN
data/datasaurus_dozen_wide.rda
Normal file
Binary file not shown.
BIN
data/simpsons_paradox.rda
Normal file
BIN
data/simpsons_paradox.rda
Normal file
Binary file not shown.
BIN
data/simpsons_paradox_wide.rda
Normal file
BIN
data/simpsons_paradox_wide.rda
Normal file
Binary file not shown.
BIN
data/twelve_from_slant_alternate_long.rda
Normal file
BIN
data/twelve_from_slant_alternate_long.rda
Normal file
Binary file not shown.
BIN
data/twelve_from_slant_long.rda
Normal file
BIN
data/twelve_from_slant_long.rda
Normal file
Binary file not shown.
BIN
data/twelve_from_slant_wide.rda
Normal file
BIN
data/twelve_from_slant_wide.rda
Normal file
Binary file not shown.
329
datasaurus_profile_report.html
Normal file
329
datasaurus_profile_report.html
Normal file
File diff suppressed because one or more lines are too long
569
datasaurus_task0.csv
Normal file
569
datasaurus_task0.csv
Normal file
@@ -0,0 +1,569 @@
|
||||
dataset,x,y
|
||||
dino,61.05620938387066,50.000786041836115
|
||||
dino,57.91495193642296,59.204465996007286
|
||||
dino,61.47023196059987,43.113610600617946
|
||||
dino,57.80035367010236,47.24321395851151
|
||||
dino,53.58712459282577,50.05299250969186
|
||||
dino,54.576174284643514,55.27136753481488
|
||||
dino,57.04415090058797,48.60837508246414
|
||||
dino,55.7754529309817,49.668371636871335
|
||||
dino,59.97631629263042,46.97420868117099
|
||||
dino,55.252270806603605,43.729521303491374
|
||||
dino,43.78804073666369,51.2680929772018
|
||||
dino,57.457744795438025,44.28917489796779
|
||||
dino,63.07901849595043,40.728171627006176
|
||||
dino,54.18303406920578,47.06408074987083
|
||||
dino,60.13111685743383,55.346793849501424
|
||||
dino,54.61978970278766,49.89081259801087
|
||||
dino,50.448857009479546,38.09601765888036
|
||||
dino,52.60835140269539,48.7817448455199
|
||||
dino,58.921162722910886,54.01189924392206
|
||||
dino,52.45069273036819,46.48848624712332
|
||||
dino,49.80578813973163,40.89991031410513
|
||||
dino,47.17491923749995,57.75387697615895
|
||||
dino,51.96139127299339,45.80962849194407
|
||||
dino,48.98881855980029,51.88745177915955
|
||||
dino,47.5444086097682,46.93629859893016
|
||||
dino,50.4181337552253,49.93451248929631
|
||||
dino,51.95677944972451,42.09683907938794
|
||||
dino,53.88727108664538,50.14165935265209
|
||||
dino,54.26606888953267,49.512359488698905
|
||||
dino,51.46271162527614,46.18629417006431
|
||||
dino,51.3101582088962,46.2022341922973
|
||||
dino,50.74741487182219,39.36858698834162
|
||||
dino,54.70970456901501,45.99109531895869
|
||||
dino,47.47920661213582,50.31391127762887
|
||||
dino,50.37080654246703,48.259726978980694
|
||||
dino,56.91636224871015,48.64491455378705
|
||||
dino,58.5576027381732,41.825870898231734
|
||||
dino,55.60936656471019,44.575949545298435
|
||||
dino,50.51681140327247,45.105751676177924
|
||||
dino,52.75378987149051,48.28082671114873
|
||||
dino,49.33940063686657,52.50413243477094
|
||||
dino,55.86264975892184,40.31878156861388
|
||||
dino,59.9530087751824,57.479445880152916
|
||||
dino,58.715118284638606,47.10037582093825
|
||||
dino,49.71698951395783,53.272258634655685
|
||||
dino,52.38729221210728,54.11222535191214
|
||||
dino,54.83309991230744,52.88319518241856
|
||||
dino,55.425465588697605,51.532865840959744
|
||||
dino,54.042000082883284,56.92935246952918
|
||||
dino,54.50764837081448,50.00994681722351
|
||||
dino,61.53260278822502,41.26120469428777
|
||||
dino,48.91806000605707,52.846983540790056
|
||||
dino,49.30750637954336,57.71810592824646
|
||||
dino,52.34552407696101,44.26272594279621
|
||||
dino,61.69176810592154,55.40257395717212
|
||||
dino,61.47023584170628,52.53022329137693
|
||||
dino,50.55509725978119,57.55032476549517
|
||||
dino,52.92798651619448,52.012281978981974
|
||||
dino,57.789007871094995,47.22494953454583
|
||||
dino,56.45631748138432,52.61103335783263
|
||||
dino,55.505702124622516,42.50299604707903
|
||||
dino,55.19295269682422,54.63192948343515
|
||||
dino,51.221728561074535,47.25182729836165
|
||||
dino,52.25938579311345,57.24631864239671
|
||||
dino,56.68917902804974,50.03730918120555
|
||||
dino,50.920335702218736,50.69624595645909
|
||||
dino,51.3026693573705,48.15915279137175
|
||||
dino,51.456615686484476,51.3821664747325
|
||||
dino,56.30636326645976,46.958506222110024
|
||||
dino,55.58402685064658,42.53469245634747
|
||||
dino,48.03496962917758,50.19695850632269
|
||||
dino,54.66669398149011,51.17515718446053
|
||||
dino,63.532579099455766,52.72239743495207
|
||||
dino,50.34871109822337,53.585081440479264
|
||||
dino,48.736370357953916,45.692076975926454
|
||||
dino,53.72703357870147,56.56671360824683
|
||||
dino,51.02098071180624,43.867807306704925
|
||||
dino,53.60618990229827,44.682608568189465
|
||||
dino,58.50654368842603,42.60034245818288
|
||||
dino,49.41012539035559,45.81089977627783
|
||||
dino,52.00787019723078,57.64766026908493
|
||||
dino,57.79768322770305,48.437756206925954
|
||||
dino,49.09825792467933,52.221814882007735
|
||||
dino,49.99913861044174,40.27614451611194
|
||||
dino,58.752119169409205,49.58471305962425
|
||||
dino,57.683435295123274,49.5936382647151
|
||||
dino,57.42732244761076,44.74487203349926
|
||||
dino,49.86302863286214,51.40797259140813
|
||||
dino,50.786361343304634,44.552251111249
|
||||
dino,52.17786998593063,48.08739579512528
|
||||
dino,52.58402435498606,41.125243532909906
|
||||
dino,51.42552638866844,36.88298423887787
|
||||
dino,56.500925804108746,39.98971172196626
|
||||
dino,49.5824666422862,48.260825396304874
|
||||
dino,51.041748014434745,55.71507297703368
|
||||
dino,48.828572361106204,49.33525434674591
|
||||
dino,53.842868727090014,42.15953251129401
|
||||
dino,56.09310664212701,47.14226834388876
|
||||
dino,57.08716220485467,52.117520769818654
|
||||
dino,62.65294379712276,54.68263974718196
|
||||
dino,52.523272648230225,46.803104112120366
|
||||
dino,58.39863838354845,51.27631865361299
|
||||
dino,56.56052610439037,39.91521977844583
|
||||
dino,53.902695502404256,44.309845453971555
|
||||
dino,55.11969839617295,47.50924805178521
|
||||
dino,57.640715632370366,49.58609107595651
|
||||
dino,57.1453118484359,45.667904516320284
|
||||
dino,50.222214976326995,45.94975153398726
|
||||
dino,53.93191834455424,49.89575867777541
|
||||
dino,63.037235802763405,47.78871424169679
|
||||
dino,50.17621999802889,46.27009112150307
|
||||
dino,52.14561610141563,50.40740736886731
|
||||
dino,47.8368119422215,48.31630997100166
|
||||
dino,54.6260261518615,49.16090518100138
|
||||
dino,51.61073572413855,46.810391351319964
|
||||
dino,48.303756364069876,45.53340058318903
|
||||
dino,51.82855409593313,50.080250231307126
|
||||
dino,49.37527027271235,51.90599050854997
|
||||
dino,59.97793817796548,37.650074874932336
|
||||
dino,55.70503492311241,51.38454017515123
|
||||
dino,51.450251897791084,46.01364092835601
|
||||
dino,53.468477689652175,46.51104560299136
|
||||
dino,52.76394812381151,39.61998096835012
|
||||
dino,58.60932625913248,53.39809296018411
|
||||
dino,50.74654296318319,40.66787836098743
|
||||
dino,56.08425950581103,45.12106015093467
|
||||
dino,54.56781265328312,46.40335791427452
|
||||
dino,56.76615500428075,51.47374571828003
|
||||
dino,51.09761048614566,41.083180223024726
|
||||
dino,47.66824641065967,51.051896895536025
|
||||
dino,49.24456296886388,45.46591822850656
|
||||
dino,51.61474384619797,47.73716351865227
|
||||
dino,46.25488077661397,48.94389298396914
|
||||
dino,56.095564095336826,48.442110435223306
|
||||
dino,52.75645531320611,48.48700083134392
|
||||
dino,55.59618538256052,34.13703621786675
|
||||
dino,61.823649233002776,49.95046661343963
|
||||
dino,51.39036567045192,46.045233124061994
|
||||
dino,55.97496710939675,47.41948030482817
|
||||
dino,45.87726212887402,58.3224643067966
|
||||
dino,53.55783737107011,53.100863558579
|
||||
dino,51.231800608862436,55.68188527122899
|
||||
star,55.145374755569115,51.044219172377254
|
||||
star,49.81898653541218,54.0557264484135
|
||||
star,56.75927265813915,54.50923114782499
|
||||
star,51.48764976143369,45.594864407696065
|
||||
star,63.21566679073577,42.69992088639226
|
||||
star,53.45620119728672,53.684456813013476
|
||||
star,54.39089987085942,50.91476839876647
|
||||
star,52.4022038829485,49.850279439237596
|
||||
star,48.77389259305873,56.29065339809094
|
||||
star,53.52734381948572,44.59910898001575
|
||||
star,56.665532328127654,45.69640106305723
|
||||
star,48.662966114388986,41.266412471012224
|
||||
star,56.77509261076053,47.20213280926867
|
||||
star,53.46519376132624,53.388719029881315
|
||||
star,49.492696764973026,44.34661123567587
|
||||
star,52.4604807632749,48.47175794658537
|
||||
star,53.83131419483768,46.56556403805046
|
||||
star,53.75349439161741,47.46347361854413
|
||||
star,51.121582445792825,43.93503505722961
|
||||
star,55.09806543089576,43.54542458502236
|
||||
star,49.37057896323658,46.438538744371535
|
||||
star,53.36933193534474,59.283617486491046
|
||||
star,51.18119889657507,52.71630362484748
|
||||
star,56.988753336818526,42.05527522398132
|
||||
star,57.0930119096104,42.080596799033415
|
||||
star,43.36331104801304,51.031597621796905
|
||||
star,46.976437666249126,50.25467230902957
|
||||
star,51.26395640905113,56.29775398094936
|
||||
star,58.274037597264034,45.733070980743065
|
||||
star,51.24864955588527,41.9296129845294
|
||||
star,52.23630947082963,46.59822252407746
|
||||
star,52.54122582243326,48.78351927636182
|
||||
star,56.314085990915515,49.74827228496587
|
||||
star,50.943424304374226,40.811042630992105
|
||||
star,59.458127392409885,44.55275407725031
|
||||
star,51.39082560025992,45.394053438494446
|
||||
star,46.627721799373404,45.61012997979757
|
||||
star,52.08137674396821,51.10179149171756
|
||||
star,56.79382859642934,48.018854445431344
|
||||
star,57.72739349645722,49.69982491900631
|
||||
star,53.93727155358978,48.804640841491114
|
||||
star,53.237386025674404,46.02575242983275
|
||||
star,52.92906585242413,42.35994334264996
|
||||
star,55.12176682126518,43.034381945352095
|
||||
star,57.36652505629455,46.752707099195256
|
||||
star,54.19797992660036,50.46918388140478
|
||||
star,56.573257860251715,40.146882956832734
|
||||
star,53.172385295344114,52.40089456040391
|
||||
star,47.207576722270986,49.936402376975316
|
||||
star,44.97774308239124,42.88746578182198
|
||||
star,54.15452220736075,39.71642448839023
|
||||
star,50.0579570492634,40.64082496268207
|
||||
star,60.59253972883024,48.82113877743667
|
||||
star,56.269161111410675,46.88662449742423
|
||||
star,52.586273004971204,39.917629056744836
|
||||
star,52.83265054900855,44.19253894094189
|
||||
star,57.431695697169346,53.705509333287864
|
||||
star,59.86631486229671,52.26275969730616
|
||||
star,51.605384252308056,42.42051507019803
|
||||
star,57.066652726580344,49.78146408736144
|
||||
star,46.925846197291875,49.77740896371885
|
||||
star,57.25807928995147,48.29462794590815
|
||||
star,53.25978531596263,43.96175756191822
|
||||
star,48.21386120174645,52.00148974670014
|
||||
star,52.763542220913166,46.832666692281535
|
||||
star,60.93088474767653,51.42250553429595
|
||||
star,55.483300005124406,48.710309025936176
|
||||
star,60.07997944306309,56.59794653708097
|
||||
star,57.718020445918114,50.91112295698962
|
||||
star,45.62158771517542,48.61860957116753
|
||||
star,53.47957218322519,48.469766146927846
|
||||
star,57.77218434929007,34.30161416405222
|
||||
star,51.72275178611926,49.34952177470381
|
||||
star,52.13261781578895,40.9154694343687
|
||||
star,57.47585394758718,49.38435952923064
|
||||
star,50.11558171822206,49.57408602257912
|
||||
star,57.28634284819918,48.026463231496805
|
||||
star,57.202259213723984,48.391300875808305
|
||||
star,52.419084069382585,42.20289741800043
|
||||
star,53.65627693211355,48.971464690228856
|
||||
star,57.503331046349324,47.424462657563865
|
||||
star,55.82966242488396,43.17693993133136
|
||||
star,50.8694833766899,47.44805350486556
|
||||
star,49.78148614405995,52.1012391866234
|
||||
star,55.85252131727443,49.395478821962264
|
||||
star,55.355616500863775,58.10521780742398
|
||||
star,52.12454324813282,36.99279357249721
|
||||
star,54.79720078758586,47.74698229519167
|
||||
star,51.92992382995838,43.10585070320615
|
||||
star,52.24324191279141,48.90669214608911
|
||||
star,51.988733197429845,60.06226839771874
|
||||
star,50.157982473467406,44.03441318646164
|
||||
star,44.845519839941886,49.257422075107684
|
||||
star,45.93437348880096,45.302726833127494
|
||||
star,52.89731786175777,44.45136017076556
|
||||
star,60.955490709818044,52.97197195657749
|
||||
star,59.2765475052063,43.58790590725041
|
||||
star,58.51437625805828,50.48000473171981
|
||||
star,57.08562379470738,53.147194143913836
|
||||
star,50.36494701616379,45.878411895110496
|
||||
star,57.450384045313804,34.72190453512533
|
||||
star,60.05331233029282,50.76566032103792
|
||||
star,53.81718415735906,49.102538278785865
|
||||
star,49.880258866764095,46.25028317705448
|
||||
star,58.40113735288149,54.490109861631105
|
||||
star,64.78489621025432,47.630376668597926
|
||||
star,51.36578813277998,45.42883017030005
|
||||
star,49.92783249885054,47.610726220295746
|
||||
star,55.53092972004907,47.82878859734023
|
||||
star,58.3853873826632,46.828920993277315
|
||||
star,52.610197390005744,45.093657615698376
|
||||
star,47.46946189506202,40.16116137845773
|
||||
star,49.28336827744925,54.5071403583238
|
||||
star,57.58104109155972,54.87482033196495
|
||||
star,48.67115338162199,38.1568765510699
|
||||
star,51.359774719463665,48.87909476648014
|
||||
star,55.99476109963931,53.239860779840264
|
||||
star,55.137118683228856,56.71334390327816
|
||||
star,53.10957727620672,43.43460390979102
|
||||
star,47.27512713802226,43.555143209522754
|
||||
star,54.96847184394049,43.55639871323184
|
||||
star,57.74696985414103,55.06163853018722
|
||||
star,44.52165237909359,52.320261502488236
|
||||
star,45.04158376535305,50.00749527545144
|
||||
star,58.89948225677464,48.32428053171788
|
||||
star,48.88124330718304,45.07284397861114
|
||||
star,52.9534182171564,47.088776081050284
|
||||
star,53.18841263693333,47.45058610345343
|
||||
star,54.853920195640676,41.957131731333895
|
||||
star,53.03192068051912,55.59130585177853
|
||||
star,52.46141830742993,45.78081953422401
|
||||
star,58.31278921485695,35.20407666827952
|
||||
star,58.72551440515314,44.840481209974165
|
||||
star,54.65571428981035,48.48160677960598
|
||||
star,57.769872476881574,46.66202626882326
|
||||
star,51.2878968737422,54.48922895325549
|
||||
star,44.54330473143528,48.101670908526216
|
||||
star,48.608298309483516,44.192133058717204
|
||||
star,62.04502672585255,47.77702286772072
|
||||
star,54.78027878860552,39.09218572147204
|
||||
star,51.08382136482122,48.98278700364393
|
||||
star,55.41903077245287,51.08443277196639
|
||||
circle,54.0345115956703,50.6350210422733
|
||||
circle,55.815127650542735,38.85129794497735
|
||||
circle,54.1480228876406,51.83951203866352
|
||||
circle,56.35951928293808,46.18070595014605
|
||||
circle,50.77749396984253,42.40844037839184
|
||||
circle,53.47578395383435,53.66539939779861
|
||||
circle,46.19278359407359,44.70054135135251
|
||||
circle,49.4407901782929,51.9247876062025
|
||||
circle,51.782761493714794,45.64681170922604
|
||||
circle,53.132201720253406,50.226966254473986
|
||||
circle,52.430444007401455,32.769284726000365
|
||||
circle,56.17324756555008,50.19521478836021
|
||||
circle,53.12183588667515,42.579816896640324
|
||||
circle,55.407120442725436,49.89617766767793
|
||||
circle,52.119868469196504,46.916342647122306
|
||||
circle,50.279373989902716,47.10705453956336
|
||||
circle,47.798282619666075,50.08659410515918
|
||||
circle,50.222526036702824,49.19051573916156
|
||||
circle,48.3761483349284,45.0497117706523
|
||||
circle,53.558042379736285,39.696500940653685
|
||||
circle,54.46059149256037,46.10426218560038
|
||||
circle,47.030575208763075,41.48378622943842
|
||||
circle,56.420480336328666,52.47777992775662
|
||||
circle,53.47236544088013,50.02380906020249
|
||||
circle,54.895374253251646,49.64811491063869
|
||||
circle,59.14393602832117,40.46500800892864
|
||||
circle,56.70584292944649,46.0899552221109
|
||||
circle,53.10296426299344,46.48875134772465
|
||||
circle,52.499411533354866,41.869019041084904
|
||||
circle,54.7333567970304,56.354715163944284
|
||||
circle,53.775467918204974,47.99307478632452
|
||||
circle,51.25080385137335,47.41262726790944
|
||||
circle,55.86466570413612,46.14878779647829
|
||||
circle,52.184783835792,50.01632270081623
|
||||
circle,50.32798092072382,49.262483135384365
|
||||
circle,57.281287189045685,54.799742708397424
|
||||
circle,53.63847197089221,54.837986199033566
|
||||
circle,58.13763954592607,43.01893679814467
|
||||
circle,49.1282459536274,46.4751818107279
|
||||
circle,58.11574197037942,47.63856496219998
|
||||
circle,51.59736976936885,55.7612159002428
|
||||
circle,55.147617952013384,36.39702862104629
|
||||
circle,55.26864250517075,50.60020307285434
|
||||
circle,54.90243461788442,50.248560501159965
|
||||
circle,53.73089756430807,41.408020651776326
|
||||
circle,52.51718398711819,43.27192102221854
|
||||
circle,50.26903635682249,41.684658254488625
|
||||
circle,55.80995637055859,48.48948072706314
|
||||
circle,52.20733854927717,44.753310361348056
|
||||
circle,53.90630757991419,53.395973640562445
|
||||
circle,45.98313713800434,49.88438260425446
|
||||
circle,51.81715210392887,38.57707077510276
|
||||
circle,46.21718766734416,43.43608252932354
|
||||
circle,54.87803822317218,49.96531466990046
|
||||
circle,50.244073709288855,53.08510495706622
|
||||
circle,59.69193398606443,49.980432924782804
|
||||
circle,51.634389328767554,53.62209592255184
|
||||
circle,57.021582782653354,52.33703705677459
|
||||
circle,51.374145300113874,33.827227473626486
|
||||
circle,62.4671640859347,39.94560798275033
|
||||
circle,53.85692771255911,59.90372675609875
|
||||
circle,55.322307025097494,52.74623236779118
|
||||
circle,47.99041372247315,39.11166522633147
|
||||
circle,51.86918883208178,53.4537486721725
|
||||
circle,52.61500220941076,44.026818394642504
|
||||
circle,54.79186915977987,53.40967609238633
|
||||
circle,48.22023920370651,41.94728502938324
|
||||
circle,50.84532298196253,53.47319187356046
|
||||
circle,54.93928610379493,58.66076705285222
|
||||
circle,57.74578290332446,47.82452411565165
|
||||
circle,59.060311352355065,49.05748506365939
|
||||
circle,51.18031458997022,51.39987422122551
|
||||
circle,51.214693384555666,46.54801449598067
|
||||
circle,59.311130783831935,47.493592568913
|
||||
circle,50.787434450633484,45.67831154282254
|
||||
circle,58.087162342354695,45.237296632916355
|
||||
circle,52.452516612597414,45.44853630183186
|
||||
circle,54.73570197736124,46.07255119812196
|
||||
circle,47.59265580410985,43.5640952907748
|
||||
circle,50.268843833974245,54.21659692227578
|
||||
circle,57.25069616843617,50.93629689699913
|
||||
circle,51.97856673094236,43.921042290030144
|
||||
circle,51.96992959337057,42.74059948724163
|
||||
circle,63.98880156634803,36.77339175814299
|
||||
circle,56.256034140295235,41.57723851003736
|
||||
circle,53.58262603402123,43.05999028753133
|
||||
circle,49.28948415006948,42.29901849532519
|
||||
circle,61.01994461496823,47.33505788845204
|
||||
circle,50.937191222087655,50.778934820414364
|
||||
circle,54.0413972582652,51.60016879670826
|
||||
circle,46.70297337624868,49.518019522310006
|
||||
circle,57.09077934840953,39.69200854442718
|
||||
circle,55.79278113769325,56.480907864140804
|
||||
circle,53.940569186581186,52.107029685124054
|
||||
circle,56.68228180124364,44.46247151244712
|
||||
circle,54.15906693834598,40.165026445691986
|
||||
circle,52.19478785158989,49.32843987483118
|
||||
circle,56.89240197495119,48.12306062623956
|
||||
circle,56.87993492057266,42.48546893522315
|
||||
circle,53.593210901780516,48.09639692256538
|
||||
circle,61.39836498671855,46.92916671899958
|
||||
circle,52.00393344802327,48.10675611921774
|
||||
circle,50.32354622052023,48.963769245326084
|
||||
circle,52.53977913381497,39.04336225979408
|
||||
circle,53.76565379546456,46.412284530349005
|
||||
circle,47.47030679172806,47.66432922692739
|
||||
circle,59.95742384829792,50.606518741378565
|
||||
circle,56.447708770924635,41.29251637208479
|
||||
circle,55.90759347568889,48.74224790690039
|
||||
circle,56.116180953337725,50.11314310854413
|
||||
circle,48.56087709798474,47.79299594221016
|
||||
circle,50.968516558299335,47.74957952857589
|
||||
circle,50.410396292392676,54.56235183570498
|
||||
circle,50.56411044622263,43.50528921767232
|
||||
circle,54.29834562617421,42.614504652980024
|
||||
circle,52.30134679026854,43.85017701231019
|
||||
circle,59.64468825555845,51.929019134155865
|
||||
circle,53.77012192613842,46.04391473912992
|
||||
circle,57.76367045830045,50.02602040161444
|
||||
circle,55.99220961873143,47.86903881327875
|
||||
circle,47.24707988891427,47.437670087202235
|
||||
circle,51.87004032316373,51.22527636729939
|
||||
circle,58.04736973197676,44.71024477619416
|
||||
circle,55.87354093710821,56.67939498842857
|
||||
circle,51.32914911771776,56.40960870036569
|
||||
circle,50.58965661131741,48.114798778039756
|
||||
circle,53.95541755263263,48.05749449935031
|
||||
circle,50.649287832368216,45.04408448117785
|
||||
circle,51.3291188545624,49.634812977022285
|
||||
circle,55.320140458041244,59.129721658691295
|
||||
circle,59.48395602516449,45.45078378930761
|
||||
circle,55.29947846318474,52.9855899039588
|
||||
circle,54.12240729735407,47.651792107765324
|
||||
circle,54.20629977107956,52.33638314404214
|
||||
circle,50.60671790877907,46.371652655899126
|
||||
circle,55.881732579385925,49.55723535777078
|
||||
circle,54.958331039425566,46.150994168480985
|
||||
circle,57.89014315657015,58.66934123602269
|
||||
circle,55.625661974704826,47.0341164922508
|
||||
circle,57.02296115557817,45.30433681623505
|
||||
circle,51.00123862118841,48.16404373806856
|
||||
circle,43.66881346812022,42.23024818173995
|
||||
bullseye,52.60815257631662,41.233055709261436
|
||||
bullseye,49.86942759243148,45.81625831270996
|
||||
bullseye,47.42813882587756,45.96964101870084
|
||||
bullseye,51.85891934186862,48.12702604192508
|
||||
bullseye,58.616736121976075,48.86252208246433
|
||||
bullseye,54.08424808536825,48.49727228515356
|
||||
bullseye,54.90957110048451,42.91630675695116
|
||||
bullseye,53.540898700916806,49.543756209183066
|
||||
bullseye,48.51696006982776,52.328264614079266
|
||||
bullseye,58.32550413783276,44.843120057753055
|
||||
bullseye,53.03464883418758,43.60904828594963
|
||||
bullseye,56.797521934351266,42.693888562770454
|
||||
bullseye,53.11009195902832,43.70540046096164
|
||||
bullseye,54.20381710804516,39.0288536425514
|
||||
bullseye,59.30584656946277,43.17696787896868
|
||||
bullseye,54.239578732465105,46.937384776148456
|
||||
bullseye,50.95154195231001,43.561099316820325
|
||||
bullseye,57.74559417420984,45.3717970344903
|
||||
bullseye,55.084680738549245,43.99251557302813
|
||||
bullseye,51.411274272608956,50.36123575043924
|
||||
bullseye,57.721633984444466,47.1234179883649
|
||||
bullseye,48.31232051343825,57.98978039875002
|
||||
bullseye,50.573802767063164,40.29206300164106
|
||||
bullseye,64.37769835107261,45.97983853074531
|
||||
bullseye,48.15306924695437,44.58280116605659
|
||||
bullseye,55.470179584089074,48.9515577879697
|
||||
bullseye,50.59308321098564,57.113618000639796
|
||||
bullseye,51.91368128802651,42.07656704794224
|
||||
bullseye,57.84277359384264,54.64531423269841
|
||||
bullseye,50.73002760953509,40.99326353480345
|
||||
bullseye,58.12175306966242,37.76338193471019
|
||||
bullseye,49.09351336241354,52.83723075025117
|
||||
bullseye,53.77858980791044,46.68031325703656
|
||||
bullseye,55.411266425977516,47.23612788227295
|
||||
bullseye,48.80525311134764,54.38037673003094
|
||||
bullseye,59.30005621154726,49.0266628188898
|
||||
bullseye,54.18053606172804,59.69812403010029
|
||||
bullseye,52.89426861993665,46.7021150908298
|
||||
bullseye,55.45792499696202,55.35660978071169
|
||||
bullseye,60.37108301766993,46.70713684161614
|
||||
bullseye,55.23332498383738,41.109582664679614
|
||||
bullseye,52.75209556833515,43.79854802260347
|
||||
bullseye,49.972672990804064,56.407883581336634
|
||||
bullseye,50.83085335277542,45.34197045994273
|
||||
bullseye,55.463395151667434,54.489126334867926
|
||||
bullseye,55.924460505555345,61.79677557010791
|
||||
bullseye,53.7013280869954,49.29358220114873
|
||||
bullseye,55.10240269593623,55.17524693394577
|
||||
bullseye,56.02895580443846,47.41885149806426
|
||||
bullseye,50.21004562037248,49.22221727980816
|
||||
bullseye,59.60537932516784,45.948091031710646
|
||||
bullseye,56.11577447366633,49.23073894342422
|
||||
bullseye,57.454078633525256,43.97623129681065
|
||||
bullseye,63.38658812210625,41.60419446485871
|
||||
bullseye,52.53779564005585,52.69046270452823
|
||||
bullseye,55.18693268997645,52.14993079540552
|
||||
bullseye,52.01559066406975,47.625975086598295
|
||||
bullseye,54.048927934555266,55.846298072679524
|
||||
bullseye,56.761716097532194,51.98336054182336
|
||||
bullseye,51.36829562985282,52.84441319281525
|
||||
bullseye,54.902326654275214,54.94572657838964
|
||||
bullseye,62.05624061967387,46.466171119864974
|
||||
bullseye,52.37478747821975,43.67977504448815
|
||||
bullseye,53.42568195313472,46.08987275524808
|
||||
bullseye,55.43801759828404,47.277165915331324
|
||||
bullseye,52.55360287687352,53.32292568063926
|
||||
bullseye,50.248479075394194,50.16553976575672
|
||||
bullseye,52.376233091246064,51.62184252434982
|
||||
bullseye,59.54104618688892,46.48450873287964
|
||||
bullseye,55.7641316290926,48.8939643286659
|
||||
bullseye,50.80231040182761,49.20393754870969
|
||||
bullseye,55.15648202111525,50.06435410223087
|
||||
bullseye,53.20640441271982,48.47096150155073
|
||||
bullseye,49.40955622062746,46.209429622600716
|
||||
bullseye,56.223850718839195,52.46236943665765
|
||||
bullseye,52.310740703498915,48.523570147166424
|
||||
bullseye,54.91221330049627,49.00739973352216
|
||||
bullseye,56.163094341201564,38.90961184808215
|
||||
bullseye,53.802703719409706,49.19516800623382
|
||||
bullseye,49.99867860418518,56.369928535053546
|
||||
bullseye,54.64623706895267,55.81702372514465
|
||||
bullseye,50.83790791266769,43.463499392373365
|
||||
bullseye,54.897008883862725,39.60655818585671
|
||||
bullseye,54.85986236250437,48.48609616001459
|
||||
bullseye,58.06266112616841,51.505206705825486
|
||||
bullseye,52.33009060045986,42.512516726148775
|
||||
bullseye,60.849220885373185,44.03942489717425
|
||||
bullseye,49.817901771722134,42.57571970269277
|
||||
bullseye,58.46922126214666,45.40549897787574
|
||||
bullseye,50.9851821352776,48.688449129516734
|
||||
bullseye,53.172221157770835,44.609522696068765
|
||||
bullseye,57.01596586791392,53.32657746048997
|
||||
bullseye,57.941270035759466,51.834598348305995
|
||||
bullseye,55.61050212452014,39.120560000190856
|
||||
bullseye,60.677003225507875,49.509946051787765
|
||||
bullseye,56.43262571040257,53.57481161473701
|
||||
bullseye,59.733410011527965,50.09199005654596
|
||||
bullseye,55.74218463718262,45.0038786127014
|
||||
bullseye,54.13235900455504,43.72919369592823
|
||||
bullseye,51.12023787143242,43.532127988429046
|
||||
bullseye,53.37590443601085,53.24546593960005
|
||||
bullseye,66.68389909316072,48.94749818773957
|
||||
bullseye,48.60634764897552,54.32491664928128
|
||||
bullseye,52.79686449409589,44.69695703011504
|
||||
bullseye,54.839397911692224,41.796877002218864
|
||||
bullseye,54.88985265603443,47.55812238400423
|
||||
bullseye,54.3935116272619,49.907081271045854
|
||||
bullseye,54.269969028963224,48.08169042056446
|
||||
bullseye,55.13725807599178,50.07700313085564
|
||||
bullseye,49.87407015873897,40.850043706572755
|
||||
bullseye,53.75344779131114,40.836322550329456
|
||||
bullseye,54.35012588368669,52.69373437841155
|
||||
bullseye,56.428446687664184,42.759147965872636
|
||||
bullseye,50.558950192169924,49.64150647500378
|
||||
bullseye,52.39480877946596,46.41672352473947
|
||||
bullseye,56.387625924991816,43.06356653271174
|
||||
bullseye,52.39506116035527,43.99958761957699
|
||||
bullseye,49.82748200785858,43.71460905664418
|
||||
bullseye,56.709848677385644,48.259101947412105
|
||||
bullseye,50.4833574846597,46.844491962035015
|
||||
bullseye,47.44477077151129,44.33343596208167
|
||||
bullseye,62.59829813946915,47.548780751671245
|
||||
bullseye,56.92663570812157,47.672558124277586
|
||||
bullseye,55.392676940967235,51.31629044839587
|
||||
bullseye,49.581533609894116,47.845318713630206
|
||||
bullseye,60.315460777665955,44.02249724973355
|
||||
bullseye,51.73424058507112,46.46154361316499
|
||||
bullseye,55.07609629270499,50.62458931822941
|
||||
bullseye,59.06964661927463,50.4974911673433
|
||||
bullseye,53.751787496806664,54.295835648054066
|
||||
bullseye,56.81644408856633,40.52160241871492
|
||||
bullseye,64.105472961424,56.84960694098367
|
||||
bullseye,53.3271431089317,49.88955050869237
|
||||
bullseye,59.29743499998336,47.138996036515586
|
||||
bullseye,56.92140716150808,53.52289236785738
|
||||
bullseye,49.94069636906224,44.98834073208569
|
||||
bullseye,57.68563359124232,50.30407238574415
|
||||
bullseye,57.69518624125579,47.33715992673141
|
||||
bullseye,52.84397915618307,38.006802622083136
|
||||
bullseye,49.415998293970226,48.23533047329247
|
||||
bullseye,57.29822887825694,50.65589183267848
|
||||
bullseye,53.48703210388919,46.64114216754652
|
||||
|
10001
dataset_A_indie_game_telemetry_v2.csv
Normal file
10001
dataset_A_indie_game_telemetry_v2.csv
Normal file
File diff suppressed because it is too large
Load Diff
10001
dataset_D_git_classroom_activity_v2.csv
Normal file
10001
dataset_D_git_classroom_activity_v2.csv
Normal file
File diff suppressed because it is too large
Load Diff
452
git_profile_report.html
Normal file
452
git_profile_report.html
Normal file
File diff suppressed because one or more lines are too long
145
gpt-sugg/Datasaurus_Lab_Instructor_Intro.ipynb
Normal file
145
gpt-sugg/Datasaurus_Lab_Instructor_Intro.ipynb
Normal file
@@ -0,0 +1,145 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4e23ea39",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"# 📘 Instructor Version – Introductory EDA Lab\n",
|
||||
"\n",
|
||||
"## Learning Objectives\n",
|
||||
"\n",
|
||||
"Students should:\n",
|
||||
"\n",
|
||||
"- Understand what a dataset structure looks like\n",
|
||||
"- Identify variable types\n",
|
||||
"- Compute descriptive statistics\n",
|
||||
"- Recognize the limits of summary statistics\n",
|
||||
"- Appreciate visualization as a fundamental step in EDA\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "38ebe89c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Teaching Strategy\n",
|
||||
"\n",
|
||||
"This is NOT a technical coding lab.\n",
|
||||
"\n",
|
||||
"It is conceptual:\n",
|
||||
"- Data structure awareness\n",
|
||||
"- Reading metadata\n",
|
||||
"- Interpreting statistics\n",
|
||||
"- Understanding why visualization matters\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ecee2660",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"import pyreadr\n",
|
||||
"import pandas as pd\n",
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"\n",
|
||||
"result = pyreadr.read_r(\"datasaurus_dozen.rda\")\n",
|
||||
"df = list(result.values())[0]\n",
|
||||
"df.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ca5dfd49",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Discussion Prompts\n",
|
||||
"\n",
|
||||
"### After df.info():\n",
|
||||
"- What is categorical?\n",
|
||||
"- What is numerical?\n",
|
||||
"- Why does data type matter?\n",
|
||||
"\n",
|
||||
"### After df.describe():\n",
|
||||
"Important insight:\n",
|
||||
"Different datasets may have nearly identical summary statistics.\n",
|
||||
"\n",
|
||||
"Ask:\n",
|
||||
"Would you trust the numbers without visualization?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c61c04b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"df.describe()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5093ed70",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Sweetviz Discussion\n",
|
||||
"\n",
|
||||
"Use the report to show:\n",
|
||||
"\n",
|
||||
"- Similar means and standard deviations\n",
|
||||
"- Very different visual distributions\n",
|
||||
"- The importance of scatter plots\n",
|
||||
"\n",
|
||||
"Key message:\n",
|
||||
"📌 \"Statistics describe. Visualization reveals.\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8ae6139f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"report = sv.analyze(df)\n",
|
||||
"report.show_html(\"sweetviz_report.html\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d3a3d619",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Key Concept to Emphasize\n",
|
||||
"\n",
|
||||
"EDA is:\n",
|
||||
"- Understanding structure\n",
|
||||
"- Understanding distributions\n",
|
||||
"- Detecting anomalies\n",
|
||||
"- Preparing for cleaning\n",
|
||||
"\n",
|
||||
"Next lab:\n",
|
||||
"Students receive messy datasets with:\n",
|
||||
"- Missing values\n",
|
||||
"- Wrong types\n",
|
||||
"- Outliers\n",
|
||||
"- Inconsistent categories\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
2720
gpt-sugg/Datasaurus_Lab_Student_Intro.ipynb
Normal file
2720
gpt-sugg/Datasaurus_Lab_Student_Intro.ipynb
Normal file
File diff suppressed because one or more lines are too long
3894
gpt-sugg/scatter_export_1771754524118.html
Normal file
3894
gpt-sugg/scatter_export_1771754524118.html
Normal file
File diff suppressed because one or more lines are too long
2053
gpt-sugg/sweetviz_report.html
Normal file
2053
gpt-sugg/sweetviz_report.html
Normal file
File diff suppressed because one or more lines are too long
379
indie_Games_profile_report.html
Normal file
379
indie_Games_profile_report.html
Normal file
File diff suppressed because one or more lines are too long
352
penguins_profile_report.html
Normal file
352
penguins_profile_report.html
Normal file
File diff suppressed because one or more lines are too long
13332
sweetviz_report.html
Normal file
13332
sweetviz_report.html
Normal file
File diff suppressed because one or more lines are too long
1945
task0_sweetviz_report.html
Normal file
1945
task0_sweetviz_report.html
Normal file
File diff suppressed because one or more lines are too long
37
y-prof.py
Normal file
37
y-prof.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from ydata_profiling import ProfileReport
|
||||
import pyreadr
|
||||
|
||||
|
||||
|
||||
# result = pyreadr.read_r("./data/datasaurus_dozen.rda")
|
||||
# result.keys()
|
||||
|
||||
# df = list(result.values())[0]
|
||||
# df.head()
|
||||
|
||||
|
||||
# profile = ProfileReport(df, title="Datasaurus Dataset Profile", explorative=True)
|
||||
# profile.to_file("datasaurus_profile_report.html")
|
||||
|
||||
|
||||
# exit()
|
||||
|
||||
# Profile the Git classroom activity CSV with ydata-profiling.
#
# Every column is read as a string (dtype=str), so pandas performs no type
# inference and the report is built from the values as they appear in the file.
csv_path = 'dataset_D_git_classroom_activity_v2.csv'  # point at another dataset CSV to profile it instead
report_path = "git_profile_report.html"  # single source of truth for the output file name

df_raw = pd.read_csv(csv_path, dtype=str)

# Display basic info about the dataset that is actually being profiled.
# (Previously this printed the shape of an unrelated penguins frame.)
print("Dataset shape:", df_raw.shape)
print("\nFirst 5 rows:")
print(df_raw.head())

# Generate and save the profile report.
profile = ProfileReport(df_raw, title="Git Classroom Activity Dataset Profile", explorative=True)
profile.to_file(report_path)

# Report the path that was really written, so the message cannot drift from
# the file name passed to to_file().
print(f"\nProfile report saved as '{report_path}'")
print("Open this file in a web browser to view the detailed analysis.")
|
||||
452
ydata_profile_report.html
Normal file
452
ydata_profile_report.html
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user