jup 2 and 3

This commit is contained in:
2026-02-23 19:10:06 +00:00
parent ed360f9967
commit 0c3f0d03bf
13 changed files with 28761 additions and 1407 deletions

View File

@@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -249,7 +249,7 @@
"4 Yes "
]
},
"execution_count": 10,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -259,6 +259,7 @@
"import sweetviz as sv\n",
"import dtale\n",
"import warnings\n",
"from ydata_profiling import ProfileReport\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Load the raw dataset — do NOT clean anything yet\n",
@@ -270,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -308,6 +309,80 @@
"print(df.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a9827626",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8bca7ad18a13487ba5853443b29dbd90",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [00:03<00:00, 5.91it/s]\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "771cb3df4e4946bca1717279b4a6c0ca",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "394f98b346ea44d49127b1657140d9b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Render HTML: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5868640bd17244c3ada054ba2adfbd58",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Export report to file: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"profile = ProfileReport(df, title=\"ProfileReport\").to_file(\"report.html\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -331,7 +406,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bd10cd653e7a47f891552a79e946376c",
"model_id": "289eb8e084bc4ca5bd4bfbe3d95f511b",
"version_major": 2,
"version_minor": 0
},
@@ -354,7 +429,7 @@
"source": [
"# Generate the SweetViz report\n",
"# This may take 3060 seconds\n",
"report = sv.analyze(df_raw)\n",
"report = sv.analyze(df)\n",
"report.show_html('sweetviz_raw_report.html')\n",
"\n",
"print('Report saved as sweetviz_raw_report.html — open it in your browser.')"
@@ -389,16 +464,33 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"id": "89d0471a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2026-02-22 20:12:55,619 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
]
},
"data": {
"text/plain": [
"<StringDtype(storage='python', na_value=<NA>)>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['input_method'] = df['input_method'].astype('string')\n",
"df['purchase_amount'] = df['purchase_amount'].astype('string')\n",
"\n",
"df['input_method'].dtype"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
@@ -409,82 +501,8 @@
],
"source": [
"# Launch D-Tale with the raw dataset\n",
"# A link will appear — click it to open D-Tale in a new browser ta\n",
"d = dtale.show(df_raw, host='127.0.0.1', subprocess=False, open_browser=True)\n",
"print(\"Open D-Tale at:\", d._url) # lists all running instances\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c2e5293",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "bad operand type for abs(): 'str'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[16]\u001b[39m\u001b[32m, line 21\u001b[39m\n\u001b[32m 18\u001b[39m \tstr_data = pd.to_numeric(s, errors=\u001b[33m'\u001b[39m\u001b[33mcoerce\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 19\u001b[39m pd.Series(str_data, name=\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m, index=s.index)\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m df[\u001b[33m'\u001b[39m\u001b[33mpurchase_amount\u001b[39m\u001b[33m'\u001b[39m] = \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpurchase_amount\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:1722\u001b[39m, in \u001b[36mNDFrame.abs\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1654\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 1655\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mabs\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Self:\n\u001b[32m 1656\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1657\u001b[39m \u001b[33;03m Return a Series/DataFrame with absolute numeric value of each element.\u001b[39;00m\n\u001b[32m 1658\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 1720\u001b[39m \u001b[33;03m 3 7 40 -50\u001b[39;00m\n\u001b[32m 1721\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1722\u001b[39m res_mgr = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_mgr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mabs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1723\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(\n\u001b[32m 1724\u001b[39m \u001b[38;5;28mself\u001b[39m, name=\u001b[33m\"\u001b[39m\u001b[33mabs\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1725\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:361\u001b[39m, in \u001b[36mBaseBlockManager.apply\u001b[39m\u001b[34m(self, f, align_keys, **kwargs)\u001b[39m\n\u001b[32m 358\u001b[39m kwargs[k] = obj[b.mgr_locs.indexer]\n\u001b[32m 360\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(f):\n\u001b[32m--> \u001b[39m\u001b[32m361\u001b[39m applied = \u001b[43mb\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 363\u001b[39m applied = \u001b[38;5;28mgetattr\u001b[39m(b, f)(**kwargs)\n",
"\u001b[36mFile \u001b[39m\u001b[32md:\\Projects\\43679_InteractiveVis\\VI_Lab_01_EDA\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:395\u001b[39m, in \u001b[36mBlock.apply\u001b[39m\u001b[34m(self, func, **kwargs)\u001b[39m\n\u001b[32m 389\u001b[39m \u001b[38;5;129m@final\u001b[39m\n\u001b[32m 390\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mapply\u001b[39m(\u001b[38;5;28mself\u001b[39m, func, **kwargs) -> \u001b[38;5;28mlist\u001b[39m[Block]:\n\u001b[32m 391\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 392\u001b[39m \u001b[33;03m apply the function to my values; return a block if we are not\u001b[39;00m\n\u001b[32m 393\u001b[39m \u001b[33;03m one\u001b[39;00m\n\u001b[32m 394\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m395\u001b[39m result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 397\u001b[39m result = maybe_coerce_values(result)\n\u001b[32m 398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._split_op_result(result)\n",
"\u001b[31mTypeError\u001b[39m: bad operand type for abs(): 'str'"
]
}
],
"source": [
"# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'\n",
"\n",
"import pandas as pd\n",
"\n",
"if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):\n",
"\tdf = df.to_frame(index=False)\n",
"\n",
"# remove any pre-existing indices for ease of use in the D-Tale code, but this is not required\n",
"df = df.reset_index().drop('index', axis=1, errors='ignore')\n",
"df.columns = [str(c) for c in df.columns] # update columns to strings in case they are numbers\n",
"\n",
"df['purchase_amount'] = df['purchase_amount'].str.replace(',', '.', case=False, regex='False')\n",
"df['purchase_amount'] = s = df['purchase_amount'] \n",
"\n",
"if s.str.startswith('0x').any():\n",
"\tstr_data = s.apply(float.fromhex)\n",
"else:\n",
"\tstr_data = pd.to_numeric(s, errors='coerce')\n",
"\t\n",
"pd.Series(str_data, name='purchase_amount', index=s.index)\n",
"\n",
"df['purchase_amount'] = df['purchase_amount'].abs()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "8180fa05",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2026-02-22 20:18:35,563 - INFO - D-Tale started at: http://127.0.0.1:40000\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Open D-Tale at: http://127.0.0.1:40000\n"
]
}
],
"source": [
"# Launch D-Tale with the raw dataset\n",
"# A link will appear — click it to open D-Tale in a new browser ta\n",
"d = dtale.show(df, host='127.0.0.1', subprocess=False, open_browser=True)\n",
"# A link will appear — click it to open D-Tale in a new browser tab\n",
"d = dtale.show(df, host='127.0.0.1', subprocess=True, open_browser=False)\n",
"print(\"Open D-Tale at:\", d._url) # lists all running instances\n"
]
},
@@ -498,13 +516,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
" TCP 169.254.62.24:40000 0.0.0.0:0 LISTENING 11972\n",
" TCP 127.0.0.1:40000 0.0.0.0:0 LISTENING 50108\n",
" TCP 127.0.0.1:55125 127.0.0.1:40000 TIME_WAIT 0\n",
"\n"
]
}
],
"source": [
"# Check if something else is already on port 40000\n",
"# Check what is listening and binded IP, for debugging odd cases\n",
"# to handle cases where d-tale might bind to docker IPs\n",
"import subprocess\n",
"result = subprocess.run('netstat -ano | findstr :40000', shell=True, capture_output=True, text=True)\n",
"print(result.stdout or \"Nothing on port 40000\")"