Lab1/funzioni.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "17aa372a",
   "metadata": {},
   "source": [
    "# Tutte le funzioni utili ordinate e nominate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7f6f9db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install matplotlib numpy pandas scipy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e31f5b72",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy as sc"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8797256",
   "metadata": {},
   "source": [
    "### Calcolo di compatibilità\n",
    "\n",
    "input:\n",
    "  - x1, x2: i valori numerici delle quantità\n",
    "  - u1, u2: le incertezze associate\n",
    "\n",
    "output:\n",
    "  - k: scarto normalizzato\n",
    "  - diff: differenza x1 - x2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a88eb843",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compatibilita(x1, x2, u1, u2):\n",
    "  diff = x1 - x2\n",
    "  k = np.abs(diff) / np.sqrt(u1**2 + u2**2)\n",
    "  return k, diff\n",
    "\n",
    "print(compatibilita(1, 2, 0.5, 0.7))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72995c62",
   "metadata": {},
   "source": [
    "### Calcola la media e l'incertezza campionaria\n",
    "\n",
    "input:\n",
    "  - x: array numpy (anche con NaN)\n",
    "  - dim: dimensione in cui calcolare (0:righe / 1:colonne) [dafault 0]\n",
    "\n",
    "output:\n",
    "  - media\n",
    "  - uA: deviazione standarrd campionaria (s/sqrt(N))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3d561eb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1.5 3. ]\n",
      "a    0.50000\n",
      "b    0.57735\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "def media(x, dim = 0):\n",
    "  uA = np.nanstd(x, axis=dim, ddof=1)\n",
    "  media = np.nanmean(x, axis=dim)\n",
    "\n",
    "  N = np.sum(~np.isnan(x), axis=dim)\n",
    "\n",
    "  uA = uA / np.sqrt(N)\n",
    "\n",
    "  return media, uA\n",
    "\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    \"a\": [1, 2, np.nan],\n",
    "    \"b\": [2, 3, 4]\n",
    "})\n",
    "media, uA = media(df)\n",
    "print(media)\n",
    "print(uA)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4e8604e",
   "metadata": {},
   "source": [
    "### Media pesata\n",
    "\n",
    "input:\n",
    "  - x: valori\n",
    "  - sigmaX: sigma della x\n",
    "  - dim: dimensione in cui calcolare (0:righe / 1:colonne) [dafault 0]\n",
    "\n",
    "output:\n",
    "  - media: media pesata\n",
    "  - sigma: incertezza sulla media pesata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffdeea69",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def mediaPesata(x, ux, dim = 0):\n",
    "  x_arr = x.to_numpy() if isinstance(x, (pd.Series, pd.DataFrame)) else np.asarray(x)\n",
    "  sigma_arr = ux.to_numpy() if isinstance(ux, (pd.Series, pd.DataFrame)) else np.asarray(ux)\n",
    "\n",
    "  w = 1.0 / (sigma_arr ** 2)\n",
    "\n",
    "  num = np.nansum(w * x_arr, axis=dim)\n",
    "  den = np.nansum(w, axis=dim)\n",
    "\n",
    "  media = num / den\n",
    "  sigma = np.sqrt(1.0 / den)\n",
    "\n",
    "  return media, sigma\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    \"x\": [10, 12, 11, np.nan],\n",
    "    \"sx\": [0.3, 0.4, 0.2, np.nan]\n",
    "})\n",
    "\n",
    "media, sigma = mediaPesata(df[\"x\"], df[\"sx\"], dim=0)\n",
    "\n",
    "print(\"media pesata:\", media)\n",
    "print(\"sigma:\", sigma)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5441c1c",
   "metadata": {},
   "source": [
    "### Identificazione autlier con t di Student\n",
    "Questo sistema permette di calcolare la probabilità che un valore faccia parte di un dataset\n",
    "input:\n",
    "  - x: dati\n",
    "  - ux: incertezze\n",
    "  - dim: dimensione in cui calcolare (0:righe / 1:colonne) [dafault 0]\n",
    "\n",
    "outpu:\n",
    "  - i: maschera degli outlier\n",
    "  - xi: maschera dei valori outlier\n",
    "  - NP: numero atteso di valori"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c4561c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def outlier_t(x, ux, dim=0):\n",
    "  # Conversione posizionale\n",
    "  x_arr = x.to_numpy() if isinstance(x, (pd.Series, pd.DataFrame)) else np.asarray(x)\n",
    "  u_arr = ux.to_numpy() if isinstance(ux, (pd.Series, pd.DataFrame)) else np.asarray(ux)\n",
    "\n",
    "  # Deviazione standard campionaria\n",
    "  s = np.nanstd(x_arr, axis=dim, ddof=1)\n",
    "  mi = np.nanmean(x_arr, axis=dim)\n",
    "  \n",
    "  N = np.sum(~np.isnan(x_arr), axis=dim)\n",
    "\n",
    "  GdL = N - 1\n",
    "\n",
    "  # Reshape magico (funziona)\n",
    "  shape = list(x_arr.shape)\n",
    "  shape[dim] = 1\n",
    "  s = s.reshape(shape)\n",
    "  mi = mi.reshape(shape)\n",
    "  N = N.reshape(shape)\n",
    "  GdL = GdL.reshape(shape)\n",
    "  if u_arr.ndim == 0 or u_arr.size == 1:\n",
    "    u_arr = float(u_arr.flat[0])\n",
    "\n",
    "  # Statistica t\n",
    "  sigma_tot = np.sqrt(s**2 + u_arr**2)\n",
    "  t = np.abs(x_arr - mi) / sigma_tot\n",
    "  tail_p = 2 * sc.stats.t.sf(t, GdL)\n",
    "\n",
    "  # Numero atteso\n",
    "  NP = N * tail_p\n",
    "\n",
    "  # Outlier\n",
    "  i = NP < 0.5\n",
    "  xi = x_arr[i]\n",
    "\n",
    "  return i, xi, NP\n",
    "\n",
    "\n",
    "df = pd.DataFrame({\n",
    "  \"x\": [10, 12, 50, 11, np.nan, 13, 9, 7, 2],\n",
    "  \"u\": [1, 1, 1, 1, np.nan, 0.3, 1, 8, 3]\n",
    "})\n",
    "\n",
    "i, xi, NP = outlier_t(df[\"x\"], df[\"u\"], dim = 0)\n",
    "\n",
    "print(\"Maschera:\", i)\n",
    "print(\"Outlier:\", xi)\n",
    "print(\"NP:\", NP)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07d4fc7d",
   "metadata": {},
   "source": [
    "### Regressione lineare pesata\n",
    "input:\n",
    "  - x: dati sulla x\n",
    "  - y: dati sulla y\n",
    "  - ux: sigma x\n",
    "  - uy: sigma y\n",
    "\n",
    "output:\n",
    "  - A, B: Ax + B\n",
    "  - uA, uB: incertezze\n",
    "  - cov_AB: covarianza\n",
    "  - chi: chi^2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "78c30380",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'pd' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 49\u001b[39m\n\u001b[32m     45\u001b[39m   chi = sc.stats.chi2.cdf(x2, dof)  \u001b[38;5;66;03m# P(X² > x2)\u001b[39;00m\n\u001b[32m     47\u001b[39m   \u001b[38;5;28;01mreturn\u001b[39;00m A, B, sigma_A, sigma_B, cov_AB, chi\n\u001b[32m---> \u001b[39m\u001b[32m49\u001b[39m df = \u001b[43mpd\u001b[49m.DataFrame({\n\u001b[32m     50\u001b[39m   \u001b[33m\"\u001b[39m\u001b[33mx\u001b[39m\u001b[33m\"\u001b[39m:[\u001b[32m1.0\u001b[39m, \u001b[32m2.0\u001b[39m, \u001b[32m3.0\u001b[39m, \u001b[32m4.0\u001b[39m, \u001b[32m5.0\u001b[39m],\n\u001b[32m     51\u001b[39m   \u001b[33m\"\u001b[39m\u001b[33my\u001b[39m\u001b[33m\"\u001b[39m:[\u001b[32m2.1\u001b[39m, \u001b[32m3.9\u001b[39m, \u001b[32m6.2\u001b[39m, \u001b[32m7.8\u001b[39m, \u001b[32m10.1\u001b[39m],\n\u001b[32m     52\u001b[39m   \u001b[33m\"\u001b[39m\u001b[33mux\u001b[39m\u001b[33m\"\u001b[39m:[\u001b[32m0.1\u001b[39m, \u001b[32m0.1\u001b[39m, \u001b[32m0.1\u001b[39m, \u001b[32m0.1\u001b[39m, \u001b[32m0.1\u001b[39m],\n\u001b[32m     53\u001b[39m   \u001b[33m\"\u001b[39m\u001b[33muy\u001b[39m\u001b[33m\"\u001b[39m:[\u001b[32m0.2\u001b[39m, \u001b[32m0.2\u001b[39m, \u001b[32m0.2\u001b[39m, \u001b[32m0.2\u001b[39m, \u001b[32m0.2\u001b[39m]\n\u001b[32m     54\u001b[39m   })\n\u001b[32m     56\u001b[39m A, B, sA, sB, covAB, chi = reg_lin(df[\u001b[33m\"\u001b[39m\u001b[33mx\u001b[39m\u001b[33m\"\u001b[39m], df[\u001b[33m\"\u001b[39m\u001b[33my\u001b[39m\u001b[33m\"\u001b[39m], df[\u001b[33m\"\u001b[39m\u001b[33mux\u001b[39m\u001b[33m\"\u001b[39m], df[\u001b[33m\"\u001b[39m\u001b[33muy\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m     57\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mAx + B : \u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mNameError\u001b[39m: name 'pd' is not defined"
     ]
    }
   ],
   "source": [
    "def sigma_y_equiv(x, y, sigma_x, sigma_y):\n",
    "  # Stima iniziale di A con sola sigma_y\n",
    "  sy2 = sigma_y**2\n",
    "  Sw   = np.sum(1 / sy2)\n",
    "  Sx   = np.sum(x / sy2)\n",
    "  Sxx  = np.sum(x**2 / sy2)\n",
    "  Sy   = np.sum(y / sy2)\n",
    "  Sxy  = np.sum(x * y / sy2)\n",
    "  delta = Sxx * Sw - Sx**2\n",
    "  A_est = (Sxy * Sw - Sx * Sy) / delta\n",
    "\n",
    "  # Propagazione\n",
    "  sigma_eq = np.sqrt(sigma_y**2 + A_est**2 * sigma_x**2)\n",
    "  return sigma_eq\n",
    "\n",
    "\n",
    "def reg_lin(x, y, sigma_x, sigma_y):\n",
    "  x = np.asarray(x, dtype=float)\n",
    "  y = np.asarray(y, dtype=float)\n",
    "  sigma_x = np.asarray(sigma_x, dtype=float)\n",
    "  sigma_y = np.asarray(sigma_y, dtype=float)\n",
    "\n",
    "  # Propagazione errore x → y\n",
    "  sigma_y_eq = sigma_y_equiv(x, y, sigma_x, sigma_y)\n",
    "\n",
    "  # Somme pesate\n",
    "  w    = 1.0 / sigma_y_eq**2\n",
    "  Sw   = np.sum(w)\n",
    "  Sx   = np.sum(w * x)\n",
    "  Sxx  = np.sum(w * x**2)\n",
    "  Sy   = np.sum(w * y)\n",
    "  Sxy  = np.sum(w * x * y)\n",
    "  delta = Sxx * Sw - Sx**2\n",
    "\n",
    "  # Parametri\n",
    "  A      = (Sxy * Sw - Sx * Sy)  / delta\n",
    "  B      = (Sxx * Sy - Sxy * Sx) / delta\n",
    "  sigma_A = np.sqrt(Sw  / delta)\n",
    "  sigma_B = np.sqrt(Sxx / delta)\n",
    "  cov_AB  = -Sx / delta\n",
    "\n",
    "  # Chi quadro\n",
    "  x2  = np.sum((y - A * x - B)**2 / sigma_y_eq**2)\n",
    "  dof = len(x) - 2\n",
    "  chi = sc.stats.chi2.cdf(x2, dof)  # P(X² > x2)\n",
    "\n",
    "  return A, B, sigma_A, sigma_B, cov_AB, chi\n",
    "\n",
    "df = pd.DataFrame({\n",
    "  \"x\":[1.0, 2.0, 3.0, 4.0, 5.0],\n",
    "  \"y\":[2.1, 3.9, 6.2, 7.8, 10.1],\n",
    "  \"ux\":[0.1, 0.1, 0.1, 0.1, 0.1],\n",
    "  \"uy\":[0.2, 0.2, 0.2, 0.2, 0.2]\n",
    "  })\n",
    "\n",
    "A, B, sA, sB, covAB, chi = reg_lin(df[\"x\"], df[\"y\"], df[\"ux\"], df[\"uy\"])\n",
    "print(\"Ax + B : \")\n",
    "print(f\"A      = {A:.4f} +- {sA:.4f}\")\n",
    "print(f\"B      = {B:.4f} +- {sB:.4f}\")\n",
    "print(f\"cov_AB = {covAB:.6f}\")\n",
    "print(f\"p-value chi² = {chi:.4f}\")\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}