Skip to content

standardize_all

sdc_census10to20.standardize_all

standardize_all(data: DataFrame, *, measure_info=None, input_only_measures=None, filter_geo: str = 'state', geoid_col: str = 'geoid', measure_col: str = 'measure', year_col: str = 'year', value_col: str = 'value', moe_col: str = 'moe', region_type_col: str = 'region_type', state_fips: str = '51', vintage_cutoff_year: int = 2020) -> pd.DataFrame

Standardize 2010 geographies to 2020 boundaries for tract and block-group rows.

Returns both the original rows and the redistributed rows so downstream consumers can compare or pick. Original rows get a _geo10 suffix when they are sub-county rows whose year is before vintage_cutoff_year (2020 by default) and a _geo20 suffix otherwise; measures declared as native geo2020 always get _geo20. Redistributed rows always carry the _geo20 suffix.

Assumes SDC long format: (geoid, year, measure, value, moe[, region_type]).

Parameters:

Name Type Description Default
data DataFrame

Input frame in SDC long format.

required
measure_info dict or path-like

Loaded measure_info.json dict or path to one. Used to derive geo_standardize specs and, when input_only_measures is None, to auto-detect helper measures via referenced_helper_measures.

None
input_only_measures iterable of str

Measures to keep in the input frame for ratio/density recompute but EXCLUDE from the standardized output (no _geo10/_geo20 emitted, no heuristic warning). When None and measure_info is given, auto-derives the referenced-but-unpublished helper counts via referenced_helper_measures.

None
filter_geo str

"state" (default) or "county" — restricts output to GEOIDs whose state/county prefix appears in the original data.

'state'
geoid_col str

Column name for the GEOID (default "geoid").

'geoid'
measure_col str

Column name for the measure identifier (default "measure").

'measure'
year_col str

Column name for the vintage year (default "year").

'year'
value_col str

Column name for the numeric value (default "value").

'value'
moe_col str

Column name for the margin of error (default "moe").

'moe'
region_type_col str

Optional column name for the region type label (default "region_type"); included in output only if present in data.

'region_type'
state_fips str

State FIPS code used to fetch the Census relationship file (default "51" — Virginia).

'51'
vintage_cutoff_year int

Sub-county rows with year < vintage_cutoff_year are treated as 2010-vintage (emit _geo10 plus a converted _geo20); rows at or above it are native 2020 (default 2020 reproduces the prior behavior).

2020
Source code in packages/sdc-census10to20/src/sdc_census10to20/convert.py
def standardize_all(
    data: pd.DataFrame,
    *,
    measure_info=None,
    input_only_measures=None,
    filter_geo: str = "state",
    geoid_col: str = "geoid",
    measure_col: str = "measure",
    year_col: str = "year",
    value_col: str = "value",
    moe_col: str = "moe",
    region_type_col: str = "region_type",
    state_fips: str = "51",
    vintage_cutoff_year: int = 2020,
) -> pd.DataFrame:
    """Standardize 2010 geographies to 2020 boundaries for tract and block-group rows.

    Returns both the original rows and the redistributed rows so downstream
    consumers can compare or pick.  Original rows get a ``_geo10`` suffix when
    they are sub-county rows whose year is before ``vintage_cutoff_year`` (2020
    by default) and a ``_geo20`` suffix otherwise; measures whose spec declares
    ``measure_type == "geo2020"`` always get ``_geo20``.  Redistributed rows
    always carry the ``_geo20`` suffix.

    Assumes SDC long format: ``(geoid, year, measure, value, moe[, region_type])``.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame in SDC long format.  It is not modified; a copy restricted
        to the named columns is used internally.
    measure_info : dict or path-like, optional
        Loaded ``measure_info.json`` dict or path to one.  Used to derive
        ``geo_standardize`` specs and, when ``input_only_measures`` is ``None``,
        to auto-detect helper measures via ``referenced_helper_measures``.
    input_only_measures : iterable of str, optional
        Measures to keep in the input frame for ratio/density recompute but
        EXCLUDE from the standardized output (no ``_geo10``/``_geo20`` emitted,
        no heuristic warning).  When ``None`` and ``measure_info`` is given,
        auto-derives the referenced-but-unpublished helper counts via
        ``referenced_helper_measures``.
    filter_geo : str
        ``"state"`` (default) or ``"county"`` — restricts output to GEOIDs
        whose state/county prefix appears in the original data.  Any other
        value leaves the output unfiltered.
    geoid_col : str
        Column name for the GEOID (default ``"geoid"``).
    measure_col : str
        Column name for the measure identifier (default ``"measure"``).
    year_col : str
        Column name for the vintage year (default ``"year"``).
    value_col : str
        Column name for the numeric value (default ``"value"``).
    moe_col : str
        Column name for the margin of error (default ``"moe"``).
    region_type_col : str
        Optional column name for the region type label (default
        ``"region_type"``); included in output only if present in ``data``.
    state_fips : str
        State FIPS code used to fetch the Census relationship file
        (default ``"51"`` — Virginia).
    vintage_cutoff_year : int
        Sub-county rows with ``year < vintage_cutoff_year`` are treated as
        2010-vintage (emit ``_geo10`` plus a converted ``_geo20``); rows at or
        above it are native 2020 (default ``2020`` reproduces the prior behavior).

    Returns
    -------
    pd.DataFrame
        Converted ``_geo20`` rows (if any) followed by the suffixed original
        rows, restricted according to ``filter_geo``.  Converted rows have
        ``moe_col`` set to ``pd.NA``.

    Raises
    ------
    ValueError
        If a ratio/rate measure declares neither numerator+denominator nor a
        weight, if a declared numerator/denominator/weight/count measure has
        no rows for the year and geography level being converted, if a density
        measure declares no ``count``, or if the measure type is not recognised.

    Warns
    -----
    UserWarning
        When ``measure_info`` is given but a measure has no ``geo_standardize``
        spec, so its type is inferred from its name.
    """
    years = data[year_col].unique()
    measures = data[measure_col].unique()

    columns = [geoid_col, measure_col, year_col, value_col, moe_col]
    has_region_type = region_type_col in data.columns
    if has_region_type:
        columns.append(region_type_col)
    data = data[columns].copy()
    data[geoid_col] = data[geoid_col].astype(str)

    # GEOID length identifies the geography level; compute it once instead of
    # once per (year, measure, level) combination.
    geoid_lengths = data[geoid_col].str.len()
    sub_county_lengths = list(_SUB_COUNTY_LENGTHS)

    specs = parse_geo_standardize_info(measure_info) if measure_info is not None else {}

    if input_only_measures is not None:
        input_only = set(input_only_measures)
    elif measure_info is not None:
        input_only = referenced_helper_measures(measure_info)
    else:
        input_only = set()

    native_2020 = {
        b for b, s in specs.items() if s.get("measure_type") == "geo2020"
    }

    # Original rows: everything except the input-only helpers, with a vintage
    # suffix.  Only pre-cutoff, sub-county rows of non-native measures are 2010
    # geography; every other row is already on 2020 boundaries.
    published = ~data[measure_col].isin(input_only)
    original = data[published].copy()
    is_geo10 = (
        (original[year_col] < vintage_cutoff_year)
        & geoid_lengths[published].isin(sub_county_lengths)
        & ~original[measure_col].isin(native_2020)
    )
    base_names = original[measure_col].astype(str)
    original[measure_col] = (base_names + "_geo10").where(
        is_geo10, base_names + "_geo20"
    )

    standardized_parts: list[pd.DataFrame] = []

    for yr in years:
        # Written as a negated "<" so that a NaN year is skipped as well.
        if not (yr < vintage_cutoff_year):
            continue
        year_rows = data[year_col] == yr

        for meas in measures:
            # Helper (input-only) and geo2020-native measures emit no converted rows.
            if meas in input_only or meas in native_2020:
                continue
            measure_rows = year_rows & (data[measure_col] == meas)

            for geoid_len in _SUB_COUNTY_LENGTHS:
                temp = data[measure_rows & (geoid_lengths == geoid_len)]
                if temp.empty:
                    continue

                spec = specs.get(meas)
                if spec:
                    mtype = spec["measure_type"]
                else:
                    mtype = _classify_by_name(meas)
                    if measure_info is not None:
                        warnings.warn(
                            f"measure {meas!r} has no geo_standardize metadata; "
                            f"falling back to name heuristic -> {mtype!r}",
                            UserWarning,
                            stacklevel=2,
                        )

                if mtype == "count":
                    converted = convert_2010_to_2020_bounds(
                        temp, geoid_col=geoid_col, val_col=value_col,
                        state_fips=state_fips,
                    )
                elif mtype in ("ratio", "rate"):
                    if spec and spec.get("numerator") and spec.get("denominator"):
                        # Exact recompute from the underlying counts.
                        num_slice = _measure_slice(
                            data, yr, geoid_len, spec["numerator"],
                            year_col=year_col, geoid_col=geoid_col,
                            measure_col=measure_col, value_col=value_col,
                        )
                        den_slice = _measure_slice(
                            data, yr, geoid_len, spec["denominator"],
                            year_col=year_col, geoid_col=geoid_col,
                            measure_col=measure_col, value_col=value_col,
                        )
                        if num_slice.empty or den_slice.empty:
                            raise ValueError(
                                f"ratio {meas!r}: numerator/denominator "
                                f"counts missing from frame for year {yr}"
                            )
                        converted = _redistribute_ratio_exact(
                            num_slice, den_slice, spec.get("scale", 100),
                            geoid_col=geoid_col, value_col=value_col,
                            state_fips=state_fips,
                        )
                    else:
                        # No numerator/denominator pair: fall back to a
                        # declared weight measure.
                        weight = spec.get("weight") if spec else None
                        if not weight:
                            raise ValueError(
                                f"ratio {meas!r}: declare numerator+denominator "
                                f"or a weight in geo_standardize"
                            )
                        w_slice = _measure_slice(
                            data, yr, geoid_len, weight,
                            year_col=year_col, geoid_col=geoid_col,
                            measure_col=measure_col, value_col=value_col,
                        )
                        if w_slice.empty:
                            raise ValueError(
                                f"ratio {meas!r}: weight {weight!r} missing "
                                f"from frame for year {yr}"
                            )
                        converted = _redistribute_ratio_weighted(
                            temp[[geoid_col, value_col]], w_slice,
                            geoid_col=geoid_col, value_col=value_col,
                            state_fips=state_fips,
                        )
                elif mtype in ("median", "mean", "replicate"):
                    # Non-additive intensive measures: replicate the
                    # area-dominant parent (no true reaggregation).
                    converted = _redistribute_replicate(
                        temp[[geoid_col, value_col]],
                        geoid_col=geoid_col, value_col=value_col,
                        state_fips=state_fips,
                    )
                elif mtype == "density":
                    if not (spec and spec.get("count")):
                        raise ValueError(
                            f"density {meas!r}: declare 'count' in geo_standardize"
                        )
                    c_slice = _measure_slice(
                        data, yr, geoid_len, spec["count"],
                        year_col=year_col, geoid_col=geoid_col,
                        measure_col=measure_col, value_col=value_col,
                    )
                    if c_slice.empty:
                        raise ValueError(
                            f"density {meas!r}: count {spec['count']!r} missing "
                            f"from frame for year {yr}"
                        )
                    converted = _redistribute_density(
                        c_slice, geoid_col=geoid_col, value_col=value_col,
                        state_fips=state_fips,
                        area_divisor=spec.get("area_divisor", 1.0),
                    )
                elif mtype == "index" or (spec and spec.get("interpolate") is False):
                    continue  # indices recomputed from standardized inputs downstream
                else:
                    raise ValueError(
                        f"unknown measure_type {mtype!r} for measure {meas!r}"
                    )

                converted[year_col] = yr
                converted[measure_col] = f"{meas}_geo20"
                converted[moe_col] = pd.NA
                if has_region_type:
                    converted[region_type_col] = _GEOID_LEN_TO_REGION_TYPE[geoid_len]
                standardized_parts.append(converted)

    if standardized_parts:
        standardized = pd.concat(standardized_parts, ignore_index=True)
        final = pd.concat([standardized, original], ignore_index=True)
    else:
        # Nothing was converted: skip the concat with an empty, all-object
        # placeholder so the original column dtypes are kept as they are.
        final = original.reset_index(drop=True)

    if filter_geo == "state":
        geoids = original[geoid_col].str[:2].unique()
        final = final[final[geoid_col].str[:2].isin(geoids)]
    elif filter_geo == "county":
        geoids = original[geoid_col].str[:5].unique()
        final = final[final[geoid_col].str[:5].isin(geoids)]

    return final