321 lines
10 KiB
Plaintext
321 lines
10 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Interactive EDA Lab Starter (SweetViz & D‑Tale)\n",
|
||
"Use this notebook to explore the datasets and practice fixing the data-quality issues that EDA commonly surfaces."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Optional installs:\n",
|
||
"# %pip install -q sweetviz dtale pandas numpy\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>session_id</th>\n",
|
||
" <th>user_id</th>\n",
|
||
" <th>start_time</th>\n",
|
||
" <th>end_time</th>\n",
|
||
" <th>session_length_s</th>\n",
|
||
" <th>region</th>\n",
|
||
" <th>platform</th>\n",
|
||
" <th>gpu_model</th>\n",
|
||
" <th>avg_fps</th>\n",
|
||
" <th>ping_ms</th>\n",
|
||
" <th>map_name</th>\n",
|
||
" <th>crash_flag</th>\n",
|
||
" <th>purchase_amount</th>\n",
|
||
" <th>party_size</th>\n",
|
||
" <th>input_method</th>\n",
|
||
" <th>build_version</th>\n",
|
||
" <th>is_featured_event</th>\n",
|
||
" <th>device_temp_c</th>\n",
|
||
" <th>session_type</th>\n",
|
||
" <th>is_long_session</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>sess_c2fba8e7f37a</td>\n",
|
||
" <td>user_488</td>\n",
|
||
" <td>2025-07-18T18:32:00Z</td>\n",
|
||
" <td>2025-07-18 20:03:21-05:00</td>\n",
|
||
" <td>5481.0</td>\n",
|
||
" <td>us-west</td>\n",
|
||
" <td>pc</td>\n",
|
||
" <td>GTX1080</td>\n",
|
||
" <td>83.52</td>\n",
|
||
" <td>431.16</td>\n",
|
||
" <td>ocean</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>0,00</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Touch</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>85.6</td>\n",
|
||
" <td>ranked</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>sess_33d286298cf9</td>\n",
|
||
" <td>user_1511</td>\n",
|
||
" <td>2025-06-13 23:21:08+00:00</td>\n",
|
||
" <td>2025-06-13 23:36:30+01:00</td>\n",
|
||
" <td>922.0</td>\n",
|
||
" <td>Us-east</td>\n",
|
||
" <td>PlayStation</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>72.75</td>\n",
|
||
" <td>29.12</td>\n",
|
||
" <td>desert</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Touch</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>casual</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>sess_be2bb4d8986a</td>\n",
|
||
" <td>user_830</td>\n",
|
||
" <td>2025-10-20 02:42:07-05:00</td>\n",
|
||
" <td>20/10/2025 02:49</td>\n",
|
||
" <td>451.0</td>\n",
|
||
" <td>sa-east-1</td>\n",
|
||
" <td>PlayStation</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>69.2</td>\n",
|
||
" <td>40.47</td>\n",
|
||
" <td>Forest</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>TOUCH</td>\n",
|
||
" <td>1.4</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>69.0</td>\n",
|
||
" <td>ranked</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>sess_7f425ca9a0e2</td>\n",
|
||
" <td>user_1</td>\n",
|
||
" <td>08/01/2025 06:35</td>\n",
|
||
" <td>2025-08-01T08:32:45Z</td>\n",
|
||
" <td>7031.0</td>\n",
|
||
" <td>sa-east-1</td>\n",
|
||
" <td>PlayStation</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>33.29</td>\n",
|
||
" <td>92.4</td>\n",
|
||
" <td>Desert</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>17.55</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Controller</td>\n",
|
||
" <td>1.3.2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>48.1</td>\n",
|
||
" <td>casual</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>sess_5657e28b22ec</td>\n",
|
||
" <td>user_211</td>\n",
|
||
" <td>2025-09-08T23:41:44Z</td>\n",
|
||
" <td>2025-09-09 00:32:59+01:00</td>\n",
|
||
" <td>3075.0</td>\n",
|
||
" <td>US-EAST</td>\n",
|
||
" <td>switch</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>69.96</td>\n",
|
||
" <td>12.63</td>\n",
|
||
" <td>Desert</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>controllr</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>54.7</td>\n",
|
||
" <td>casual</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" session_id user_id start_time \\\n",
|
||
"0 sess_c2fba8e7f37a user_488 2025-07-18T18:32:00Z \n",
|
||
"1 sess_33d286298cf9 user_1511 2025-06-13 23:21:08+00:00 \n",
|
||
"2 sess_be2bb4d8986a user_830 2025-10-20 02:42:07-05:00 \n",
|
||
"3 sess_7f425ca9a0e2 user_1 08/01/2025 06:35 \n",
|
||
"4 sess_5657e28b22ec user_211 2025-09-08T23:41:44Z \n",
|
||
"\n",
|
||
" end_time session_length_s region platform \\\n",
|
||
"0 2025-07-18 20:03:21-05:00 5481.0 us-west pc \n",
|
||
"1 2025-06-13 23:36:30+01:00 922.0 Us-east PlayStation \n",
|
||
"2 20/10/2025 02:49 451.0 sa-east-1 PlayStation \n",
|
||
"3 2025-08-01T08:32:45Z 7031.0 sa-east-1 PlayStation \n",
|
||
"4 2025-09-09 00:32:59+01:00 3075.0 US-EAST switch \n",
|
||
"\n",
|
||
" gpu_model avg_fps ping_ms map_name crash_flag purchase_amount party_size \\\n",
|
||
"0 GTX1080 83.52 431.16 ocean Yes 0,00 2 \n",
|
||
"1 NaN 72.75 29.12 desert No 0.0 3 \n",
|
||
"2 NaN 69.2 40.47 Forest False 0.0 5 \n",
|
||
"3 NaN 33.29 92.4 Desert No 17.55 1 \n",
|
||
"4 NaN 69.96 12.63 Desert False 0.0 2 \n",
|
||
"\n",
|
||
" input_method build_version is_featured_event device_temp_c session_type \\\n",
|
||
"0 Touch NaN No 85.6 ranked \n",
|
||
"1 Touch NaN 0 62.0 casual \n",
|
||
"2 TOUCH 1.4 False 69.0 ranked \n",
|
||
"3 Controller 1.3.2 0 48.1 casual \n",
|
||
"4 controllr NaN 0 54.7 casual \n",
|
||
"\n",
|
||
" is_long_session \n",
|
||
"0 True \n",
|
||
"1 0 \n",
|
||
"2 False \n",
|
||
"3 True \n",
|
||
"4 Yes "
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"csv_path = 'dataset_A_indie_game_telemetry_v2.csv'  # or point this at the dataset D, E, or F CSV instead\n",
|
||
"df_raw = pd.read_csv(csv_path, dtype=str)\n",
|
||
"df_raw.head()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "84e6ef6f5e434536a0dcf69111e15ef9",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" | | [ 0%] 00:00 -> (? left)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n",
|
||
"SweetViz report written to sweetviz_report.html\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import sweetviz as sv\n",
|
||
"report = sv.analyze(df_raw)\n",
|
||
"report.show_html('sweetviz_report.html')\n",
|
||
"print('SweetViz report written to sweetviz_report.html')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import dtale\n",
|
||
"d = dtale.show(df_raw)\n",
|
||
"d.open_browser()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## EDA Quests\n",
|
||
"- Parse timestamps\n",
|
||
"- Fix units (minutes vs seconds)\n",
|
||
"- Normalize categories & booleans\n",
|
||
"- Convert comma-decimals to floats\n",
|
||
"- Identify & treat outliers\n",
|
||
"- Re-run SweetViz and compare before/after\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|