# opengrowth-privacy-preservi.../opengrowth_privacy_preservi.../aggregator.py

"""Simple SecureAggregator for OpenGrowth MVP.

This is a lightweight, production-friendly utility that collects numeric samples
and computes a mean with a 95% confidence interval using a t-distribution.
It's intentionally small and dependency-free to ensure reliable packaging and
easy reasoning during tests.
"""
from math import atan, cos, pi, sin, sqrt
from statistics import NormalDist
from typing import List


class SecureAggregator:
    """Compute the mean and a two-sided confidence interval for numeric samples.

    - Samples are stored locally in a list (no data is transmitted in this MVP).
    - The interval uses Student's t-distribution with df = n - 1 at the
      configured ``confidence`` level (default 0.95).
    - With a single sample the interval collapses to the mean itself.
    """

    # Up to this many degrees of freedom the critical value is obtained from
    # the exact finite-series CDF; beyond it the series gets long, so a normal
    # quantile with a Cornish-Fisher correction is used instead.
    _EXACT_DF_LIMIT = 200

    def __init__(self, confidence: float = 0.95):
        """Create an empty aggregator.

        Args:
            confidence: Two-sided confidence level, strictly between 0 and 1.

        Raises:
            ValueError: If ``confidence`` is not strictly between 0 and 1.
        """
        if not (0 < confidence < 1):
            raise ValueError("confidence must be between 0 and 1")
        self.confidence = confidence
        self._samples: List[float] = []

    def add_sample(self, value: float) -> None:
        """Append one numeric sample (stored as ``float``).

        Raises:
            TypeError: If ``value`` is not an ``int`` or ``float``.
        """
        if not isinstance(value, (int, float)):
            raise TypeError("sample value must be numeric")
        self._samples.append(float(value))

    def clear(self) -> None:
        """Discard all stored samples."""
        self._samples.clear()

    def _mean(self) -> float:
        """Return the arithmetic mean of the stored samples.

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples available to compute mean")
        return sum(self._samples) / len(self._samples)

    def _std_err(self) -> float:
        """Return the standard error of the mean, or 0.0 when n < 2."""
        n = len(self._samples)
        if n < 2:
            return 0.0
        mean = self._mean()
        # Sample standard deviation (unbiased): sqrt( sum((x-mean)^2) / (n-1) )
        var = sum((x - mean) ** 2 for x in self._samples) / (n - 1)
        sd = sqrt(var)
        return sd / sqrt(n)

    @staticmethod
    def _t_coverage(t: float, df: int) -> float:
        """Return P(|T| <= t) for Student's t with integer ``df`` >= 1, t >= 0.

        Uses the closed-form finite trigonometric series for integer degrees
        of freedom, with theta = atan(t / sqrt(df)).
        """
        theta = atan(t / sqrt(df))
        sin_theta = sin(theta)
        cos_theta = cos(theta)
        cos_sq = cos_theta * cos_theta
        series = 0.0
        if df % 2:
            # Odd df: (2/pi) * (theta + sin * (cos + 2/3 cos^3 + 2*4/(3*5) cos^5 ...)),
            # with (df - 1) / 2 terms (none at all for df == 1).
            term = cos_theta
            for k in range(1, (df - 1) // 2 + 1):
                series += term
                term *= cos_sq * (2 * k) / (2 * k + 1)
            return (2.0 / pi) * (theta + sin_theta * series)
        # Even df: sin * (1 + 1/2 cos^2 + 1*3/(2*4) cos^4 ...), with df / 2 terms.
        term = 1.0
        for k in range(1, df // 2 + 1):
            series += term
            term *= cos_sq * (2 * k - 1) / (2 * k)
        return sin_theta * series

    @classmethod
    def _t_critical(cls, confidence: float, df: int) -> float:
        """Return t such that P(|T| <= t) == ``confidence`` for ``df`` >= 1."""
        if df > cls._EXACT_DF_LIMIT:
            # Upper-tail normal quantile, computed from the lower tail so the
            # probability passed to inv_cdf cannot round up to 1.0.
            z = -NormalDist().inv_cdf((1.0 - confidence) / 2.0)
            # Cornish-Fisher expansion of the t quantile around the normal one.
            g1 = (z ** 3 + z) / 4.0
            g2 = (5.0 * z ** 5 + 16.0 * z ** 3 + 3.0 * z) / 96.0
            return z + g1 / df + g2 / (df * df)

        # Bracket the root by doubling; the cap only guards against a
        # confidence so close to 1 that rounding keeps coverage below it.
        hi = 1.0
        while cls._t_coverage(hi, df) < confidence and hi < 1e300:
            hi *= 2.0
        lo = hi / 2.0 if hi > 1.0 else 0.0
        # Coverage is increasing in t, so plain bisection converges; 60
        # halvings exhaust double precision on this bracket.
        for _ in range(60):
            mid = (lo + hi) / 2.0
            if cls._t_coverage(mid, df) < confidence:
                lo = mid
            else:
                hi = mid
        return hi

    def aggregate(self) -> dict:
        """Return a dict with the mean and confidence-interval bounds.

        Example: {"mean": 12.3, "ci_low": 11.1, "ci_high": 13.5}
        With exactly one sample, both CI bounds equal the mean (zero width).

        Raises:
            ValueError: If no samples have been added.
        """
        if not self._samples:
            raise ValueError("no samples to aggregate")
        mean = self._mean()
        n = len(self._samples)
        if n < 2:
            return {"mean": mean, "ci_low": mean, "ci_high": mean}

        # Critical value for the configured level with df = n - 1 (not n).
        t_value = self._t_critical(self.confidence, n - 1)
        half_width = t_value * self._std_err()
        return {"mean": mean, "ci_low": mean - half_width, "ci_high": mean + half_width}