103 lines
3.5 KiB
Python
103 lines
3.5 KiB
Python
"""Simple SecureAggregator for OpenGrowth MVP.
|
|
|
|
This is a lightweight, production-friendly utility that collects numeric samples
|
|
and computes a mean with a 95% confidence interval using a t-distribution.
|
|
It's intentionally small and dependency-free to ensure reliable packaging and
|
|
easy reasoning during tests.
|
|
"""
|
|
from math import sqrt
|
|
from typing import List
|
|
|
|
|
|
class SecureAggregator:
    """Collect numeric samples and report their mean with a confidence interval.

    - Samples are stored locally in memory (no data is transmitted in this MVP).
    - The interval is ``mean +/- t * standard_error`` where ``t`` is the
      two-sided Student's t critical value for the configured ``confidence``
      with ``df = n - 1``. With fewer than 2 samples the interval has zero
      width (both bounds equal the mean).
    """

    # Up to this many degrees of freedom the t critical value is obtained from
    # the exact finite-series CDF; beyond it a Cornish-Fisher expansion around
    # the normal quantile is used so the cost does not grow with the sample
    # count.
    _EXACT_DF_LIMIT = 200

    def __init__(self, confidence: float = 0.95):
        """Create an empty aggregator.

        Args:
            confidence: Two-sided confidence level, strictly between 0 and 1.

        Raises:
            ValueError: If ``confidence`` is not strictly between 0 and 1.
        """
        if not (0 < confidence < 1):
            raise ValueError("confidence must be between 0 and 1")
        self.confidence = confidence
        self._samples: List[float] = []

    def add_sample(self, value: float) -> None:
        """Append one numeric sample (stored as ``float``).

        Raises:
            TypeError: If ``value`` is not an ``int`` or ``float``.
        """
        if not isinstance(value, (int, float)):
            raise TypeError("sample value must be numeric")
        self._samples.append(float(value))

    def clear(self) -> None:
        """Discard all collected samples."""
        self._samples.clear()

    def _mean(self) -> float:
        """Return the arithmetic mean of the samples.

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples available to compute mean")
        return sum(self._samples) / len(self._samples)

    def _std_err(self) -> float:
        """Return the standard error of the mean (0.0 for fewer than 2 samples)."""
        n = len(self._samples)
        if n < 2:
            return 0.0
        mean = self._mean()
        # Sample standard deviation (unbiased): sqrt( sum((x-mean)^2) / (n-1) )
        var = sum((x - mean) ** 2 for x in self._samples) / (n - 1)
        return sqrt(var) / sqrt(n)

    @staticmethod
    def _two_sided_t_prob(t: float, df: int) -> float:
        """Return P(|T| <= t) for Student's t with integer ``df`` >= 1, t >= 0.

        Uses the exact finite trigonometric series (Abramowitz & Stegun
        26.7.3 for odd df, 26.7.4 for even df) with theta = atan(t / sqrt(df)).
        """
        from math import atan2, hypot, pi

        root_df = sqrt(df)
        hyp = hypot(root_df, t)
        sin_theta = t / hyp
        cos_theta = root_df / hyp
        cos_sq = cos_theta * cos_theta

        if df % 2 == 0:
            # sin(theta) * [1 + 1/2 cos^2 + (1*3)/(2*4) cos^4 + ... + cos^(df-2) term]
            term = 1.0
            total = 1.0
            for k in range(2, df, 2):
                term *= (k - 1) / k * cos_sq
                total += term
            return sin_theta * total

        # (2/pi) * {theta + sin(theta) * [cos + 2/3 cos^3 + ... + cos^(df-2) term]}
        # The bracket is empty for df == 1.
        total = 0.0
        if df > 1:
            term = cos_theta
            total = term
            for k in range(3, df, 2):
                term *= (k - 1) / k * cos_sq
                total += term
        theta = atan2(t, root_df)
        return 2.0 * (theta + sin_theta * total) / pi

    @staticmethod
    def _solve_increasing(func, target: float) -> float:
        """Return x >= 0 with ``func(x) == target`` for an increasing ``func``.

        ``func`` must rise from 0 at x = 0 towards 1; the root is bracketed by
        doubling and then refined by bisection to floating-point resolution.
        """
        lo, hi = 0.0, 1.0
        # The upper cap only guards against a non-terminating search when
        # target is within rounding error of 1.
        while hi < 1e300 and func(hi) < target:
            lo, hi = hi, hi * 2.0
        for _ in range(200):
            mid = 0.5 * (lo + hi)
            if mid <= lo or mid >= hi:
                break
            if func(mid) < target:
                lo = mid
            else:
                hi = mid
        return 0.5 * (lo + hi)

    @classmethod
    def _t_critical(cls, confidence: float, df: int) -> float:
        """Return the two-sided t critical value for ``confidence`` and ``df``.

        This is the value ``t`` such that P(|T| <= t) == confidence.
        """
        if df <= cls._EXACT_DF_LIMIT:
            return cls._solve_increasing(
                lambda t: cls._two_sided_t_prob(t, df), confidence
            )

        from math import erf

        # Large df: start from the normal quantile z, where
        # P(|Z| <= z) = erf(z / sqrt(2)), and apply the Cornish-Fisher
        # correction terms in powers of 1/df (Abramowitz & Stegun 26.7.5).
        z = cls._solve_increasing(lambda x: erf(x / sqrt(2.0)), confidence)
        z2 = z * z
        g1 = z * (z2 + 1.0) / 4.0
        g2 = z * (5.0 * z2 * z2 + 16.0 * z2 + 3.0) / 96.0
        g3 = z * (3.0 * z2 ** 3 + 19.0 * z2 * z2 + 17.0 * z2 - 15.0) / 384.0
        return z + g1 / df + g2 / df ** 2 + g3 / df ** 3

    def aggregate(self) -> dict:
        """Return a dict with the mean and confidence-interval bounds.

        Example: {"mean": 12.3, "ci_low": 11.1, "ci_high": 13.5}

        With exactly one sample both bounds equal the mean (zero-width
        interval). Otherwise the half-width is the t critical value for
        ``self.confidence`` with ``df = n - 1`` times the standard error.

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples to aggregate")
        mean = self._mean()
        n = len(self._samples)
        if n < 2:
            return {"mean": mean, "ci_low": mean, "ci_high": mean}

        half_width = self._t_critical(self.confidence, n - 1) * self._std_err()
        return {"mean": mean, "ci_low": mean - half_width, "ci_high": mean + half_width}
|