Add dataset + Sweetviz reports; update notebooks
Add raw and cleaned Git classroom dataset (claude/dataset_D_git_classroom_activity.csv and _clean.csv) and two Sweetviz HTML reports. Update several lab notebooks (lab02_task1_datasets.ipynb, lab02_task1_datasets_v2b.ipynb, lab02_task2_telemetry_v4.ipynb, lab02_task3_git_activity_solutions.ipynb) and deploy/lab01_task2_telemetry.ipynb to incorporate the new data, cleanup steps, and EDA outputs.
This commit is contained in:
10001
claude/dataset_D_git_classroom_activity.csv
Normal file
10001
claude/dataset_D_git_classroom_activity.csv
Normal file
File diff suppressed because it is too large
Load Diff
10001
claude/dataset_D_git_classroom_activity_clean.csv
Normal file
10001
claude/dataset_D_git_classroom_activity_clean.csv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -580,7 +580,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_64804\\2163207487.py:2: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_19904\\2163207487.py:2: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
||||
" correlation = df.groupby('dataset').apply(lambda g: g['x'].corr(g['y'])).round(2)\n"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"id": "d321d996",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -610,7 +610,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_58292\\2163207487.py:2: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_34380\\2163207487.py:2: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
|
||||
" correlation = df.groupby('dataset').apply(lambda g: g['x'].corr(g['y'])).round(2)\n"
|
||||
]
|
||||
}
|
||||
@@ -986,12 +986,12 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_58292\\1218167074.py:4: FutureWarning: \n",
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_34380\\1218167074.py:4: FutureWarning: \n",
|
||||
"\n",
|
||||
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
|
||||
"\n",
|
||||
" sns.boxplot(data=df_focus, x='dataset', y=var, ax=ax,\n",
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_58292\\1218167074.py:4: FutureWarning: \n",
|
||||
"C:\\Users\\sss\\AppData\\Local\\Temp\\ipykernel_34380\\1218167074.py:4: FutureWarning: \n",
|
||||
"\n",
|
||||
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
|
||||
"\n",
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -67,14 +67,228 @@
|
||||
"import sweetviz as sv\n",
|
||||
"import dtale\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"import pygwalker as pyg"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Shape: (10000, 20)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>session_id</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>start_time</th>\n",
|
||||
" <th>end_time</th>\n",
|
||||
" <th>session_length_s</th>\n",
|
||||
" <th>region</th>\n",
|
||||
" <th>platform</th>\n",
|
||||
" <th>gpu_model</th>\n",
|
||||
" <th>avg_fps</th>\n",
|
||||
" <th>ping_ms</th>\n",
|
||||
" <th>map_name</th>\n",
|
||||
" <th>crash_flag</th>\n",
|
||||
" <th>purchase_amount</th>\n",
|
||||
" <th>party_size</th>\n",
|
||||
" <th>input_method</th>\n",
|
||||
" <th>build_version</th>\n",
|
||||
" <th>is_featured_event</th>\n",
|
||||
" <th>device_temp_c</th>\n",
|
||||
" <th>session_type</th>\n",
|
||||
" <th>is_long_session</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>sess_c2fba8e7f37a</td>\n",
|
||||
" <td>user_488</td>\n",
|
||||
" <td>2025-07-18T18:32:00Z</td>\n",
|
||||
" <td>2025-07-18 20:03:21-05:00</td>\n",
|
||||
" <td>5481.0</td>\n",
|
||||
" <td>us-west</td>\n",
|
||||
" <td>pc</td>\n",
|
||||
" <td>GTX1080</td>\n",
|
||||
" <td>83.52</td>\n",
|
||||
" <td>431.16</td>\n",
|
||||
" <td>ocean</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>0,00</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>85.6</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>sess_33d286298cf9</td>\n",
|
||||
" <td>user_1511</td>\n",
|
||||
" <td>2025-06-13 23:21:08+00:00</td>\n",
|
||||
" <td>2025-06-13 23:36:30+01:00</td>\n",
|
||||
" <td>922.0</td>\n",
|
||||
" <td>Us-east</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>72.75</td>\n",
|
||||
" <td>29.12</td>\n",
|
||||
" <td>desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Touch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>62.0</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>sess_be2bb4d8986a</td>\n",
|
||||
" <td>user_830</td>\n",
|
||||
" <td>2025-10-20 02:42:07-05:00</td>\n",
|
||||
" <td>20/10/2025 02:49</td>\n",
|
||||
" <td>451.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.20</td>\n",
|
||||
" <td>40.47</td>\n",
|
||||
" <td>Forest</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>TOUCH</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>ranked</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>sess_7f425ca9a0e2</td>\n",
|
||||
" <td>user_1</td>\n",
|
||||
" <td>08/01/2025 06:35</td>\n",
|
||||
" <td>2025-08-01T08:32:45Z</td>\n",
|
||||
" <td>7031.0</td>\n",
|
||||
" <td>sa-east-1</td>\n",
|
||||
" <td>PlayStation</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>33.29</td>\n",
|
||||
" <td>92.40</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>17.55</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Controller</td>\n",
|
||||
" <td>1.3.2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>48.1</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>sess_5657e28b22ec</td>\n",
|
||||
" <td>user_211</td>\n",
|
||||
" <td>2025-09-08T23:41:44Z</td>\n",
|
||||
" <td>2025-09-09 00:32:59+01:00</td>\n",
|
||||
" <td>3075.0</td>\n",
|
||||
" <td>US-EAST</td>\n",
|
||||
" <td>switch</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>69.96</td>\n",
|
||||
" <td>12.63</td>\n",
|
||||
" <td>Desert</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>controllr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>54.7</td>\n",
|
||||
" <td>casual</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" session_id user_id start_time \\\n",
|
||||
"0 sess_c2fba8e7f37a user_488 2025-07-18T18:32:00Z \n",
|
||||
"1 sess_33d286298cf9 user_1511 2025-06-13 23:21:08+00:00 \n",
|
||||
"2 sess_be2bb4d8986a user_830 2025-10-20 02:42:07-05:00 \n",
|
||||
"3 sess_7f425ca9a0e2 user_1 08/01/2025 06:35 \n",
|
||||
"4 sess_5657e28b22ec user_211 2025-09-08T23:41:44Z \n",
|
||||
"\n",
|
||||
" end_time session_length_s region platform \\\n",
|
||||
"0 2025-07-18 20:03:21-05:00 5481.0 us-west pc \n",
|
||||
"1 2025-06-13 23:36:30+01:00 922.0 Us-east PlayStation \n",
|
||||
"2 20/10/2025 02:49 451.0 sa-east-1 PlayStation \n",
|
||||
"3 2025-08-01T08:32:45Z 7031.0 sa-east-1 PlayStation \n",
|
||||
"4 2025-09-09 00:32:59+01:00 3075.0 US-EAST switch \n",
|
||||
"\n",
|
||||
" gpu_model avg_fps ping_ms map_name crash_flag purchase_amount party_size \\\n",
|
||||
"0 GTX1080 83.52 431.16 ocean Yes 0,00 2 \n",
|
||||
"1 NaN 72.75 29.12 desert No 0.0 3 \n",
|
||||
"2 NaN 69.20 40.47 Forest False 0.0 5 \n",
|
||||
"3 NaN 33.29 92.40 Desert No 17.55 1 \n",
|
||||
"4 NaN 69.96 12.63 Desert False 0.0 2 \n",
|
||||
"\n",
|
||||
" input_method build_version is_featured_event device_temp_c session_type \\\n",
|
||||
"0 Touch NaN No 85.6 ranked \n",
|
||||
"1 Touch NaN 0 62.0 casual \n",
|
||||
"2 TOUCH 1.4 False 69.0 ranked \n",
|
||||
"3 Controller 1.3.2 0 48.1 casual \n",
|
||||
"4 controllr NaN 0 54.7 casual \n",
|
||||
"\n",
|
||||
" is_long_session \n",
|
||||
"0 True \n",
|
||||
"1 0 \n",
|
||||
"2 False \n",
|
||||
"3 True \n",
|
||||
"4 Yes "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load the raw dataset — do NOT clean anything yet\n",
|
||||
"df = pd.read_csv('dataset_A_indie_game_telemetry_v2.csv')\n",
|
||||
@@ -83,6 +297,79 @@
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8ca0358e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0e7e473ff13d4dab8162abe663d1cf88",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Box(children=(HTML(value='\\n<div id=\"ifr-pyg-00064b925f58a3a86prM98tTQNaGHo5k\" style=\"height: auto\">\\n <hea…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<script>\n",
|
||||
" window.addEventListener(\"message\", function(event) {\n",
|
||||
" const backgroundMap = {\n",
|
||||
" \"dark\": \"hsl(240 10% 3.9%)\",\n",
|
||||
" \"light\": \"hsl(0 0 100%)\",\n",
|
||||
" };\n",
|
||||
" const colorMap = {\n",
|
||||
" \"dark\": \"hsl(0 0% 98%)\",\n",
|
||||
" \"light\": \"hsl(240 10% 3.9%)\",\n",
|
||||
" };\n",
|
||||
" if (event.data.action === \"changeAppearance\" && event.data.gid === \"00064b925f58a3a86prM98tTQNaGHo5k\") {\n",
|
||||
" var iframe = document.getElementById(\"gwalker-00064b925f58a3a86prM98tTQNaGHo5k\");\n",
|
||||
" iframe.style.background = backgroundMap[event.data.appearance];\n",
|
||||
" iframe.style.color = colorMap[event.data.appearance];\n",
|
||||
" }\n",
|
||||
" });\n",
|
||||
"</script>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<pygwalker.api.pygwalker.PygWalker at 0x29f79a1b690>"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pyg.walk(df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
16794
claude/sweetviz_git_comparison.html
Normal file
16794
claude/sweetviz_git_comparison.html
Normal file
File diff suppressed because one or more lines are too long
14510
claude/sweetviz_git_raw.html
Normal file
14510
claude/sweetviz_git_raw.html
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user