Files
VI_Lab_01_EDA/EDA_Lab_Starter.ipynb
2026-02-23 08:21:32 +00:00

321 lines
10 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactive EDA Lab Starter (SweetViz & DTale)\n",
"Use this notebook to explore the datasets and practice cleaning common EDA issues."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Optional installs (uncomment and run once if the packages are missing).\n",
"# Use the %pip magic rather than !pip so the packages are installed into the\n",
"# environment of the running kernel, not whichever pip is first on the shell PATH.\n",
"# %pip install -q sweetviz dtale pandas numpy\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>session_id</th>\n",
" <th>user_id</th>\n",
" <th>start_time</th>\n",
" <th>end_time</th>\n",
" <th>session_length_s</th>\n",
" <th>region</th>\n",
" <th>platform</th>\n",
" <th>gpu_model</th>\n",
" <th>avg_fps</th>\n",
" <th>ping_ms</th>\n",
" <th>map_name</th>\n",
" <th>crash_flag</th>\n",
" <th>purchase_amount</th>\n",
" <th>party_size</th>\n",
" <th>input_method</th>\n",
" <th>build_version</th>\n",
" <th>is_featured_event</th>\n",
" <th>device_temp_c</th>\n",
" <th>session_type</th>\n",
" <th>is_long_session</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>sess_c2fba8e7f37a</td>\n",
" <td>user_488</td>\n",
" <td>2025-07-18T18:32:00Z</td>\n",
" <td>2025-07-18 20:03:21-05:00</td>\n",
" <td>5481.0</td>\n",
" <td>us-west</td>\n",
" <td>pc</td>\n",
" <td>GTX1080</td>\n",
" <td>83.52</td>\n",
" <td>431.16</td>\n",
" <td>ocean</td>\n",
" <td>Yes</td>\n",
" <td>0,00</td>\n",
" <td>2</td>\n",
" <td>Touch</td>\n",
" <td>NaN</td>\n",
" <td>No</td>\n",
" <td>85.6</td>\n",
" <td>ranked</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>sess_33d286298cf9</td>\n",
" <td>user_1511</td>\n",
" <td>2025-06-13 23:21:08+00:00</td>\n",
" <td>2025-06-13 23:36:30+01:00</td>\n",
" <td>922.0</td>\n",
" <td>Us-east</td>\n",
" <td>PlayStation</td>\n",
" <td>NaN</td>\n",
" <td>72.75</td>\n",
" <td>29.12</td>\n",
" <td>desert</td>\n",
" <td>No</td>\n",
" <td>0.0</td>\n",
" <td>3</td>\n",
" <td>Touch</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>62.0</td>\n",
" <td>casual</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>sess_be2bb4d8986a</td>\n",
" <td>user_830</td>\n",
" <td>2025-10-20 02:42:07-05:00</td>\n",
" <td>20/10/2025 02:49</td>\n",
" <td>451.0</td>\n",
" <td>sa-east-1</td>\n",
" <td>PlayStation</td>\n",
" <td>NaN</td>\n",
" <td>69.2</td>\n",
" <td>40.47</td>\n",
" <td>Forest</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>5</td>\n",
" <td>TOUCH</td>\n",
" <td>1.4</td>\n",
" <td>False</td>\n",
" <td>69.0</td>\n",
" <td>ranked</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>sess_7f425ca9a0e2</td>\n",
" <td>user_1</td>\n",
" <td>08/01/2025 06:35</td>\n",
" <td>2025-08-01T08:32:45Z</td>\n",
" <td>7031.0</td>\n",
" <td>sa-east-1</td>\n",
" <td>PlayStation</td>\n",
" <td>NaN</td>\n",
" <td>33.29</td>\n",
" <td>92.4</td>\n",
" <td>Desert</td>\n",
" <td>No</td>\n",
" <td>17.55</td>\n",
" <td>1</td>\n",
" <td>Controller</td>\n",
" <td>1.3.2</td>\n",
" <td>0</td>\n",
" <td>48.1</td>\n",
" <td>casual</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>sess_5657e28b22ec</td>\n",
" <td>user_211</td>\n",
" <td>2025-09-08T23:41:44Z</td>\n",
" <td>2025-09-09 00:32:59+01:00</td>\n",
" <td>3075.0</td>\n",
" <td>US-EAST</td>\n",
" <td>switch</td>\n",
" <td>NaN</td>\n",
" <td>69.96</td>\n",
" <td>12.63</td>\n",
" <td>Desert</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>controllr</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>54.7</td>\n",
" <td>casual</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" session_id user_id start_time \\\n",
"0 sess_c2fba8e7f37a user_488 2025-07-18T18:32:00Z \n",
"1 sess_33d286298cf9 user_1511 2025-06-13 23:21:08+00:00 \n",
"2 sess_be2bb4d8986a user_830 2025-10-20 02:42:07-05:00 \n",
"3 sess_7f425ca9a0e2 user_1 08/01/2025 06:35 \n",
"4 sess_5657e28b22ec user_211 2025-09-08T23:41:44Z \n",
"\n",
" end_time session_length_s region platform \\\n",
"0 2025-07-18 20:03:21-05:00 5481.0 us-west pc \n",
"1 2025-06-13 23:36:30+01:00 922.0 Us-east PlayStation \n",
"2 20/10/2025 02:49 451.0 sa-east-1 PlayStation \n",
"3 2025-08-01T08:32:45Z 7031.0 sa-east-1 PlayStation \n",
"4 2025-09-09 00:32:59+01:00 3075.0 US-EAST switch \n",
"\n",
" gpu_model avg_fps ping_ms map_name crash_flag purchase_amount party_size \\\n",
"0 GTX1080 83.52 431.16 ocean Yes 0,00 2 \n",
"1 NaN 72.75 29.12 desert No 0.0 3 \n",
"2 NaN 69.2 40.47 Forest False 0.0 5 \n",
"3 NaN 33.29 92.4 Desert No 17.55 1 \n",
"4 NaN 69.96 12.63 Desert False 0.0 2 \n",
"\n",
" input_method build_version is_featured_event device_temp_c session_type \\\n",
"0 Touch NaN No 85.6 ranked \n",
"1 Touch NaN 0 62.0 casual \n",
"2 TOUCH 1.4 False 69.0 ranked \n",
"3 Controller 1.3.2 0 48.1 casual \n",
"4 controllr NaN 0 54.7 casual \n",
"\n",
" is_long_session \n",
"0 True \n",
"1 0 \n",
"2 False \n",
"3 True \n",
"4 Yes "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Dataset to load (A here; or D/E/F).\n",
"csv_path = 'dataset_A_indie_game_telemetry_v2.csv'\n",
"\n",
"# dtype=str loads every column as text, so messy values (e.g. '0,00') are kept as written in the file.\n",
"# Empty fields and default NA markers are still parsed as missing, hence the NaN entries in the preview.\n",
"df_raw = pd.read_csv(filepath_or_buffer=csv_path, dtype=str)\n",
"\n",
"# Preview the first five rows.\n",
"df_raw.head(n=5)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "84e6ef6f5e434536a0dcf69111e15ef9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" | | [ 0%] 00:00 -> (? left)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n",
"SweetViz report written to sweetviz_report.html\n"
]
}
],
"source": [
"import sweetviz as sv\n",
"\n",
"# Single source of truth for the output file, so the message printed below\n",
"# always matches the file that was actually written.\n",
"report_path = 'sweetviz_report.html'\n",
"\n",
"# Build the SweetViz report for the raw frame.\n",
"report = sv.analyze(df_raw)\n",
"\n",
"# Write the report to disk; SweetViz may also try to open it in a web browser.\n",
"report.show_html(report_path)\n",
"print(f'SweetViz report written to {report_path}')\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import dtale\n",
"\n",
"# Start a D-Tale session on the raw frame and keep its handle in `d`.\n",
"d = dtale.show(data=df_raw)\n",
"\n",
"# Open the running session in a web browser.\n",
"d.open_browser()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## EDA Quests\n",
"- Parse timestamps\n",
"- Fix units (minutes vs seconds)\n",
"- Normalize categories & booleans\n",
"- Convert comma-decimals to floats\n",
"- Identify & treat outliers\n",
"- Re-run SweetViz and compare before/after\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}