opengrowth-privacy-preservi.../opengrowth_privacy_preservi.../aggregator.py

103 lines
3.5 KiB
Python

"""Simple SecureAggregator for OpenGrowth MVP.

This is a lightweight, production-friendly utility that collects numeric
samples and computes their mean with a 95% confidence interval. The interval
uses Student's t critical values (df = n - 1) for small samples and the normal
approximation once there are 30 or more samples.
It's intentionally small and dependency-free to ensure reliable packaging and
easy reasoning during tests.
"""
from math import sqrt
from typing import List
class SecureAggregator:
    """Accumulate numeric samples and report their mean with a 95% CI.

    Samples are kept in an in-memory list on the instance; nothing in this
    class transmits them anywhere.

    The interval is ``mean +/- crit * standard_error`` where ``crit`` is:

    * the two-sided 95% critical value of Student's t-distribution with
      ``df = n - 1`` degrees of freedom for ``2 <= n <= 29``;
    * the normal z-value (1.959964) for ``n >= 30``.

    With a single sample no spread can be estimated, so both bounds equal
    the mean.

    NOTE: ``confidence`` is validated and stored on the instance, but the
    critical values used by :meth:`aggregate` are fixed at the 95% level;
    other levels do not currently change the result.
    """

    # Two-sided 95% critical values of Student's t, keyed by DEGREES OF
    # FREEDOM (df = n - 1), not by sample count. Keying by n would pick the
    # value one row too low and make small-sample intervals too narrow
    # (e.g. n = 2 needs 12.706, not 4.303).
    _T_CRIT_95 = {
        1: 12.706,
        2: 4.303,
        3: 3.182,
        4: 2.776,
        5: 2.571,
        6: 2.447,
        7: 2.365,
        8: 2.306,
        9: 2.262,
        10: 2.228,
        11: 2.201,
        12: 2.179,
        13: 2.160,
        14: 2.145,
        15: 2.131,
        16: 2.120,
        17: 2.110,
        18: 2.101,
        19: 2.093,
        20: 2.086,
        21: 2.080,
        22: 2.074,
        23: 2.069,
        24: 2.064,
        25: 2.060,
        26: 2.056,
        27: 2.052,
        28: 2.048,
    }

    # Normal-approximation critical value used once n >= 30 (df >= 29).
    # This is slightly narrower than the exact t value at the same df.
    _Z_CRIT_95 = 1.959964

    def __init__(self, confidence: float = 0.95):
        """Create an empty aggregator.

        Args:
            confidence: Nominal confidence level, strictly between 0 and 1.
                Stored as ``self.confidence``; see the class NOTE about it
                not yet influencing the computed interval.

        Raises:
            ValueError: If ``confidence`` is not strictly between 0 and 1.
        """
        if not (0 < confidence < 1):
            raise ValueError("confidence must be between 0 and 1")
        self.confidence = confidence
        self._samples: List[float] = []

    def add_sample(self, value: float) -> None:
        """Append one numeric sample, stored as a ``float``.

        Args:
            value: An ``int`` or ``float`` (``bool`` passes too, because it
                is a subclass of ``int``).

        Raises:
            TypeError: If ``value`` is not an ``int`` or ``float``.
        """
        if not isinstance(value, (int, float)):
            raise TypeError("sample value must be numeric")
        self._samples.append(float(value))

    def clear(self) -> None:
        """Remove all stored samples."""
        self._samples.clear()

    def _mean(self) -> float:
        """Return the arithmetic mean of the stored samples.

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples available to compute mean")
        return sum(self._samples) / len(self._samples)

    def _std_err(self) -> float:
        """Return the standard error of the mean, or 0.0 when n < 2."""
        n = len(self._samples)
        if n < 2:
            return 0.0
        mean = self._mean()
        # Sample variance with Bessel's correction (divide by n - 1).
        var = sum((x - mean) ** 2 for x in self._samples) / (n - 1)
        return sqrt(var) / sqrt(n)

    def aggregate(self) -> dict:
        """Return the mean and the 95% confidence-interval bounds.

        Returns:
            A dict with keys ``"mean"``, ``"ci_low"`` and ``"ci_high"``,
            e.g. ``{"mean": 12.3, "ci_low": 11.1, "ci_high": 13.5}``.
            With exactly one sample, both bounds equal the mean.

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples to aggregate")
        mean = self._mean()
        n = len(self._samples)
        if n < 2:
            return {"mean": mean, "ci_low": mean, "ci_high": mean}
        # Look up by df = n - 1; any df beyond the table (n >= 30) falls back
        # to the normal approximation.
        crit = self._T_CRIT_95.get(n - 1, self._Z_CRIT_95)
        half_width = crit * self._std_err()
        return {
            "mean": mean,
            "ci_low": mean - half_width,
            "ci_high": mean + half_width,
        }