Package sprime
sprime - A biomedical library for analyzing high-throughput screening data.
sprime provides tools for analyzing and processing high-throughput screening data in preclinical drug studies.
Sub-modules
sprime.hill_fitting
Hill curve fitting module for sprime …
sprime.reporting
Data quality reporting system for sprime …
sprime.response_pipeline
Response preprocessing for raw qHTS curves before linear-x 4PL fitting …
sprime.sprime
sprime - A biomedical library for analyzing high-throughput screening data …
Functions
def calculate_delta_s_prime(s_prime_data: Union[ScreeningDataset, List[Dict]],
                            reference_cell_line_names: Union[str, List[str]],
                            test_cell_line_names: Union[str, List[str]],
                            headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
                            source_profile: "Literal['ref', 'test']" = 'test') -> Dict[str, List[Dict]]
def calculate_delta_s_prime(
    s_prime_data: Union[ScreeningDataset, List[Dict]],
    reference_cell_line_names: Union[str, List[str]],
    test_cell_line_names: Union[str, List[str]],
    headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
    source_profile: Literal["ref", "test"] = "test",
) -> Dict[str, List[Dict]]:
    """
    Calculate delta S' between reference and test cell lines.

    Matches original pseudo-code: delta_s_prime()

    Compound-level columns (MOA, drug targets, optional headings)
    auto-propagate; see ScreeningDataset.calculate_delta_s_prime for details.

    Args:
        s_prime_data: ScreeningDataset or list of dicts with S' values
        reference_cell_line_names: Reference cell line name(s)
        test_cell_line_names: Test cell line name(s)
        headings_one_to_one_in_ref_and_test: Optional list of metadata headings
            that exist 1:1 in ref and test; included in output.
        source_profile: 'ref' or 'test'; which profile to use for
            compound-level values.

    Returns:
        Dictionary with keys for each reference cell line, containing lists of
        dicts with delta S' and compound-level fields per combo.
    """
    if isinstance(s_prime_data, ScreeningDataset):
        return s_prime_data.calculate_delta_s_prime(
            reference_cell_line_names,
            test_cell_line_names,
            headings_one_to_one_in_ref_and_test=headings_one_to_one_in_ref_and_test,
            source_profile=source_profile,
        )
    else:
        if not s_prime_data:
            return {}
        assay_name = s_prime_data[0].get("assay", "Unknown")
        assay = Assay(name=assay_name)
        screening_dataset = ScreeningDataset(assay=assay)
        for row in s_prime_data:
            compound = Compound(
                name=row.get("compound_name", "Unknown"),
                drug_id=row.get("drug_id", ""),
                pubchem_sid=None,
                smiles=None,
            )
            cell_line = CellLine(name=row.get("cell_line", ""), ref_id=None)
            hill_params = None
            if row.get("ec50") is not None:
                hill_params = HillCurveParams(
                    ec50=row.get("ec50"),
                    zero_asymptote=row.get("zero_asymptote"),
                    inf_asymptote=row.get("inf_asymptote"),
                    steepness_coefficient=row.get(
                        "steepness_coefficient", row.get("hill_coefficient")
                    ),
                    r_squared=row.get("r_squared"),
                )
            profile = DoseResponseProfile(
                compound=compound,
                cell_line=cell_line,
                assay=assay,
                concentrations=None,
                responses=None,
                concentration_units="microM",
                hill_params=hill_params,
                s_prime=row.get("s_prime"),
                rank=row.get("rank"),
                metadata=None,
            )
            if profile.s_prime is not None and profile.hill_params is not None:
                try:
                    screening_dataset.add_profile(profile)
                except ValueError:
                    continue
        return screening_dataset.calculate_delta_s_prime(
            reference_cell_line_names,
            test_cell_line_names,
            headings_one_to_one_in_ref_and_test=headings_one_to_one_in_ref_and_test,
            source_profile=source_profile,
        )

Calculate delta S' between reference and test cell lines.
Matches original pseudo-code: delta_s_prime()
Compound-level columns (MOA, drug targets, optional headings) auto-propagate; see ScreeningDataset.calculate_delta_s_prime for details.
Args
s_prime_data: ScreeningDataset or list of dicts with S' values
reference_cell_line_names: Reference cell line name(s)
test_cell_line_names: Test cell line name(s)
headings_one_to_one_in_ref_and_test: Optional list of metadata headings that exist 1:1 in ref and test; included in output.
source_profile: 'ref' or 'test'; which profile to use for compound-level values.
Returns
Dictionary with keys for each reference cell line, containing lists of dicts with delta S' and compound-level fields per combo.
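When passing a plain list of dicts, the adapter in the source above reads lower-case keys such as compound_name, drug_id, cell_line, ec50, zero_asymptote, inf_asymptote, s_prime, and rank, and skips rows missing s_prime or Hill parameters. A sketch of one such row (the compound name, identifier, and numbers are invented for illustration, not real screening data):

```python
import math

# Hypothetical input row for the list-of-dicts path of calculate_delta_s_prime.
# Key names follow the adapter shown in the source; values are illustrative.
row = {
    "assay": "HTS002",
    "compound_name": "Example-Cpd-1",   # hypothetical compound
    "drug_id": "CPD-0001",              # hypothetical identifier
    "cell_line": "ipNF96.11C",
    "ec50": 0.5,
    "zero_asymptote": 100.0,
    "inf_asymptote": 20.0,
    "steepness_coefficient": 1.2,
    "r_squared": 0.98,
    "s_prime": math.asinh((100.0 - 20.0) / 0.5),
    "rank": 1,
}

# Rows missing either s_prime or the Hill parameters are skipped by the
# adapter, so both should be present.
assert row["s_prime"] is not None and row["ec50"] is not None
```

One such row is needed per (compound, cell line) pair; the reference and test cell lines named in the call must both appear among the rows.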
def calculate_s_prime_from_params(ac50: float, zero_asymptote: float, inf_asymptote: float) -> float
def calculate_s_prime_from_params(
    ac50: float, zero_asymptote: float, inf_asymptote: float
) -> float:
    """
    Calculate S' from Hill curve parameters.

    **S' = asinh((zero_asymptote - inf_asymptote) / EC50).** Same
    ``zero_asymptote`` / ``inf_asymptote`` as :class:`HillCurveParams` (some
    CSVs use legacy column names **Lower** / **Upper** for those two slots).
    This is **not** ``asinh((inf - zero) / EC50)``.

    **Do not assume asymptotes.** Pass ``zero_asymptote`` and ``inf_asymptote``
    that come from a **fit to raw data** (e.g. :func:`fit_hill_from_raw_data` /
    :class:`HillCurveParams`) or from **explicit precalc columns** on your file
    (e.g. ``Zero_asymptote`` / ``Inf_asymptote``). Never invent values such as
    ``inf_asymptote = 0`` to match a summary table unless the **data** say so.

    If the source only publishes a **span** (e.g. tabulated **A-D** =
    ``zero_asymptote - inf_asymptote``) and **EC50** but not the two asymptotes
    separately, compute S' as ``asinh(span / EC50)`` directly (equivalent to
    any decomposition with that span) rather than guessing a ``(zero, inf)``
    pair.

    Args:
        ac50: AC50 or EC50 value
        zero_asymptote: Hill ``zero_asymptote`` (concentration -> 0, left of dose axis)
        inf_asymptote: Hill ``inf_asymptote`` (saturating dose, right of dose axis)

    Returns:
        float: S' value
    """
    ratio = (zero_asymptote - inf_asymptote) / ac50
    return math.asinh(ratio)

Calculate S' from Hill curve parameters.
S' = asinh((zero_asymptote - inf_asymptote) / EC50). Same zero_asymptote / inf_asymptote as HillCurveParams (some CSVs use legacy column names Lower / Upper for those two slots). This is not asinh((inf - zero) / EC50).
Do not assume asymptotes. Pass zero_asymptote and inf_asymptote that come from a fit to raw data (e.g. fit_hill_from_raw_data() / HillCurveParams) or from explicit precalc columns on your file (e.g. Zero_asymptote / Inf_asymptote). Never invent values such as inf_asymptote = 0 to match a summary table unless the data say so.
If the source only publishes a span (e.g. tabulated A-D = zero_asymptote - inf_asymptote) and EC50 but not the two asymptotes separately, compute S' as asinh(span / EC50) directly (equivalent to any decomposition with that span) rather than guessing a (zero, inf) pair.
Args
ac50: AC50 or EC50 value
zero_asymptote: Hill zero_asymptote (concentration -> 0, left of dose axis)
inf_asymptote: Hill inf_asymptote (saturating dose, right of dose axis)
Returns
float: S' value
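The formula and the span-only fallback can be sketched in plain Python (standalone, no sprime import; the numbers are illustrative only):

```python
import math

def s_prime(ac50: float, zero_asymptote: float, inf_asymptote: float) -> float:
    # S' = asinh((zero_asymptote - inf_asymptote) / AC50); note the order:
    # zero-dose asymptote minus infinite-dose asymptote, not the reverse.
    return math.asinh((zero_asymptote - inf_asymptote) / ac50)

# Asymptotes from a fit (illustrative values, not real data):
full = s_prime(ac50=0.5, zero_asymptote=100.0, inf_asymptote=20.0)

# Span-only case: the source tabulates only span = zero - inf and EC50,
# so S' is computed from the span directly.
span_only = math.asinh((100.0 - 20.0) / 0.5)

assert full == span_only  # any (zero, inf) pair with the same span agrees
```

Because asinh is odd and monotonic, a curve whose response falls with dose (zero above inf) gives a positive S', and swapping the asymptotes flips only the sign.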
def convert_to_micromolar(concentrations: List[float], units: str) -> List[float]
def convert_to_micromolar(concentrations: List[float], units: str) -> List[float]:
    """
    Convert concentration values to microMolar.

    Args:
        concentrations: List of concentration values
        units: Current units. Supported (case-insensitive), smallest to
            largest: fM (fm, femtom); pM (pm, picom); nM (nm, nanom);
            microM (mu-m alias, um, microm, micro); mM (mm, millim); M (m, mol).

    Returns:
        List of concentrations in microMolar
    """
    units_lower = units.lower().strip()
    conversion_factors = {
        "fm": 1e-9,
        "femtom": 1e-9,
        "pm": 1e-6,
        "picom": 1e-6,
        "nm": 0.001,
        "nanom": 0.001,
        "microm": 1.0,
        "micro": 1.0,
        _UNIT_UM_ALIAS: 1.0,
        "um": 1.0,
        "mm": 1000.0,
        "millim": 1000.0,
        "m": 1000000.0,
        "mol": 1000000.0,
    }
    factor = conversion_factors.get(units_lower, 1.0)
    return [c * factor for c in conversion_factors and concentrations]

Convert concentration values to microMolar.
Args
concentrations: List of concentration values
units: Current units. Supported (case-insensitive), smallest to largest: fM (fm, femtom); pM (pm, picom); nM (nm, nanom); microM (mu-m alias, um, microm, micro); mM (mm, millim); M (m, mol).
Returns
List of concentrations in microMolar
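The conversion is a single table lookup plus multiplication. A minimal standalone sketch of the factor table (abbreviated to the canonical spellings; the real function also accepts aliases like "nanom" or "mol"):

```python
# Minimal re-implementation of the microMolar conversion for illustration;
# factors mirror the table in convert_to_micromolar above.
FACTORS_TO_MICROMOLAR = {
    "fm": 1e-9, "pm": 1e-6, "nm": 1e-3, "um": 1.0, "mm": 1e3, "m": 1e6,
}

def to_micromolar(values, units):
    # Unknown unit strings fall through with factor 1.0, matching the
    # library's behavior: a typo passes values through unchanged.
    factor = FACTORS_TO_MICROMOLAR.get(units.lower().strip(), 1.0)
    return [v * factor for v in values]

print(to_micromolar([1.0, 2.5], "mM"))  # -> [1000.0, 2500.0]
```

Note the silent fall-through on unrecognized units: the library does not raise, so unit labels should be validated upstream if typos matter for your data.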
def fit_hill_from_raw_data(raw_responses: List[float],
                           concentration_array: List[float],
                           *,
                           control_response: Optional[float] = None,
                           skip_control_response_normalization: bool = False,
                           response_normalization: ResponseNormalizationMethod = 'asymptote_normalized',
                           scale_factor: float = 100.0,
                           concentration_units: str = 'microM',
                           **hill_calc_params) -> HillCurveParams
def fit_hill_from_raw_data(
    raw_responses: List[float],
    concentration_array: List[float],
    *,
    control_response: Optional[float] = None,
    skip_control_response_normalization: bool = False,
    response_normalization: ResponseNormalizationMethod = "asymptote_normalized",
    scale_factor: float = S_PRIME_RESPONSE_SCALE_FACTOR,
    concentration_units: str = "microM",
    **hill_calc_params,
) -> HillCurveParams:
    """
    Fit Hill curve from dose-response points with explicit response preprocessing.

    **S' is not computed here** - use :func:`calculate_s_prime_from_params` on
    the returned :class:`HillCurveParams` (or
    :meth:`DoseResponseProfile.calculate_s_prime`).

    **Control / DMSO step:** When ``skip_control_response_normalization=False``
    (default), ``control_response`` **must** be a non-zero vehicle (**DMSO**)
    readout; each value in ``raw_responses`` is divided by it (test/control
    ratio). Loaders should surface a failed check as an import/processing
    report detail using the :exc:`ValueError` message below.

    When ``skip_control_response_normalization=True``, values in
    ``raw_responses`` are treated as **already** on the post-ratio scale
    (e.g. ratios from a spreadsheet); no divide-by-control is applied. To fit
    **fully pre-scaled** arrays (legacy callers that applied
    :mod:`sprime.response_pipeline` themselves), use
    ``skip_control_response_normalization=True``,
    ``response_normalization="response_scale"``, and ``scale_factor=1.0`` so
    responses are not transformed again - that ``1.0`` override is an
    **exception** to the usual x100 rule below.

    **``scale_factor`` (x100 convention):** For real workflows it should
    **stay at the default ``100``**
    (:data:`~sprime.response_pipeline.S_PRIME_RESPONSE_SCALE_FACTOR`). That
    matches ``tests/fixtures/SPrime_variation_reference.csv`` and published S'
    pipelines. **Do not change it** unless you have a documented reason
    (e.g. unit tests or the rare legacy hand-off above).

    **Response normalization** (after the optional ratio step; default
    **asymptote_normalized**):

    - ``asymptote_normalized`` -
      :func:`~sprime.response_pipeline.normalize_to_max_value` (largest value
      becomes ``1``, others proportionally lower), then multiply by
      ``scale_factor`` (almost always ``100``). Matches
      :func:`sprime.response_pipeline.pipeline_asymptote_normalized` and the
      variation reference *normalise to 1* / *x 100 scale* columns.
    - ``response_scale`` - multiply by ``scale_factor`` only (no max
      normalization; almost always ``100``). Matches
      :func:`sprime.response_pipeline.pipeline_response_scale` and
      *Nonnormalized, x100 scale*.

    **Concentrations** are converted to **microM** via
    :func:`convert_to_micromolar`. Supported unit strings are the same as at
    CSV import (case-insensitive), e.g. ``microM`` / ``um`` / ``nM`` / ``mM`` /
    ``M`` - see :func:`convert_to_micromolar`.

    Args:
        raw_responses: Per-concentration readouts (raw plate units if using
            control ratio).
        concentration_array: Concentrations in ``concentration_units``
            (converted to microM internally).
        control_response: Vehicle control readout when
            ``skip_control_response_normalization=False``.
        skip_control_response_normalization: If ``False`` (default), require
            ``control_response`` and apply test/control ratio. If ``True``,
            skip that step.
        response_normalization: ``asymptote_normalized`` (default) or
            ``response_scale``; see above.
        scale_factor: **Leave at default ``100``** for normal use (same as
            :data:`sprime.response_pipeline.S_PRIME_RESPONSE_SCALE_FACTOR`).
            Only override in rare cases (tests, or ``1.0`` when passing
            responses that already include x100 from
            :mod:`sprime.response_pipeline`).
        concentration_units: Input concentration unit label for
            :func:`convert_to_micromolar`.
        **hill_calc_params: Passed to :func:`sprime.hill_fitting.fit_hill_curve`.

    Returns:
        HillCurveParams: Fitted curve parameters

    Raises:
        ValueError: If ``skip_control_response_normalization=False`` and
            ``control_response`` is missing or zero; if max normalization sees
            all-zero values.
        ImportError: If scipy is not installed.
    """
    if hill_fitting is None:
        raise ImportError("Hill curve fitting requires scipy. "
                          "Install with: pip install scipy")
    if not skip_control_response_normalization:
        if control_response is None:
            raise ValueError(
                "control_response is required when skip_control_response_normalization=False "
                "(vehicle/DMSO control readout for per-profile test/control ratio; "
                "map CSV column Control_Response). Add Control_Response or set "
                "skip_control_response_normalization=True if responses are already control-relative."
            )
        if control_response == 0:
            raise ValueError(
                "control_response must be non-zero when skip_control_response_normalization=False "
                "(cannot form test/control ratio for DMSO/vehicle normalization)."
            )
        working = ratios_to_control(raw_responses, control_response)
    else:
        working = [float(x) for x in raw_responses]
    if response_normalization == "asymptote_normalized":
        working = normalize_to_max_value(working)
        working = scale_responses(working, scale_factor)
    else:
        working = scale_responses(working, scale_factor)
    concentrations = convert_to_micromolar(concentration_array, concentration_units)
    return hill_fitting.fit_hill_curve(concentrations, working, **hill_calc_params)

Fit Hill curve from dose-response points with explicit response preprocessing.
S' is not computed here - use calculate_s_prime_from_params() on the returned HillCurveParams (or DoseResponseProfile.calculate_s_prime()).
Control / DMSO step: When skip_control_response_normalization=False (default), control_response must be a non-zero vehicle (DMSO) readout; each value in raw_responses is divided by it (test/control ratio). Loaders should surface a failed check as an import/processing report detail using the ValueError message below.
When skip_control_response_normalization=True, values in raw_responses are treated as already on the post-ratio scale (e.g. ratios from a spreadsheet); no divide-by-control is applied. To fit fully pre-scaled arrays (legacy callers that applied sprime.response_pipeline themselves), use skip_control_response_normalization=True, response_normalization="response_scale", and scale_factor=1.0 so responses are not transformed again - that 1.0 override is an exception to the usual x100 rule below.
scale_factor (x100 convention): For real workflows it should stay at the default 100 (sprime.response_pipeline.S_PRIME_RESPONSE_SCALE_FACTOR). That matches tests/fixtures/SPrime_variation_reference.csv and published S' pipelines. Do not change it unless you have a documented reason (e.g. unit tests or the rare legacy hand-off above).
Response normalization (after the optional ratio step; default asymptote_normalized):
asymptote_normalized: normalize_to_max_value (largest value becomes 1, others proportionally lower), then multiply by scale_factor (almost always 100). Matches pipeline_asymptote_normalized() and the variation reference "normalise to 1" / "x 100 scale" columns.
response_scale: multiply by scale_factor only (no max normalization; almost always 100). Matches pipeline_response_scale() and "Nonnormalized, x100 scale".
Concentrations are converted to microM via convert_to_micromolar(). Supported unit strings are the same as at CSV import (case-insensitive), e.g. microM / um / nM / mM / M - see convert_to_micromolar().
Args
raw_responses: Per-concentration readouts (raw plate units if using control ratio).
concentration_array: Concentrations in concentration_units (converted to microM internally).
control_response: Vehicle control readout when skip_control_response_normalization=False.
skip_control_response_normalization: If False (default), require control_response and apply test/control ratio. If True, skip that step.
response_normalization: asymptote_normalized (default) or response_scale; see above.
scale_factor: Leave at default 100 for normal use (same as sprime.response_pipeline.S_PRIME_RESPONSE_SCALE_FACTOR). Only override in rare cases (tests, or 1.0 when passing responses that already include x100 from sprime.response_pipeline).
concentration_units: Input concentration unit label for convert_to_micromolar().
**hill_calc_params: Passed to sprime.hill_fitting.fit_hill_curve().
Returns
HillCurveParams: Fitted curve parameters
Raises
ValueError: If skip_control_response_normalization=False and control_response is missing or zero; if max normalization sees all-zero values.
ImportError: If scipy is not installed.
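The preprocessing steps described above (control ratio, optional max normalization, x100 scaling) can be sketched standalone, without sprime or scipy; the helper below re-implements them for illustration only, and the input numbers are invented:

```python
def preprocess(raw, control=None, normalization="asymptote_normalized", scale=100.0):
    # Optional control/DMSO step: divide each readout by the vehicle control
    # (the library requires a non-zero control when this step is enabled).
    working = [r / control for r in raw] if control is not None else [float(r) for r in raw]
    if normalization == "asymptote_normalized":
        # Largest value becomes 1, others proportionally lower ...
        peak = max(working)
        working = [w / peak for w in working]
    # ... then the x100 convention applies in both modes.
    return [w * scale for w in working]

out = preprocess([50.0, 80.0, 100.0], control=100.0)
print(out)  # -> [50.0, 80.0, 100.0]
```

In this example the largest ratio is already 1.0, so the asymptote normalization is a no-op and only the x100 scaling is visible; with a smaller peak the whole curve would be stretched so its maximum lands at 100.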
def get_s_prime_from_data(list_of_rows: List[Dict],
                          allow_overwrite_precalc_params: bool = False,
                          values_as: str = 'columns',
                          skip_control_response_normalization: bool = False,
                          *,
                          response_normalization: ResponseNormalizationMethod,
                          **fit_params) -> List[Dict]
def get_s_prime_from_data(
    list_of_rows: List[Dict],
    allow_overwrite_precalc_params: bool = False,
    values_as: str = "columns",
    skip_control_response_normalization: bool = False,
    *,
    response_normalization: ResponseNormalizationMethod,
    **fit_params,
) -> List[Dict]:
    """
    Calculate S' values from in-memory data structure.

    Matches original pseudo-code: getSPrimeFromData()

    Uses global reporting configuration for console/log output.

    Args:
        list_of_rows: List of dictionaries matching CSV row format
        allow_overwrite_precalc_params: If True, allow overwriting
            pre-calculated curve parameters (AC50, asymptotes, Hill_Slope, r2)
            with fitted values when both raw and pre-calc exist. Default False
            (raise). When True, overwrites are logged as warnings.
        values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
        skip_control_response_normalization: If False (default), raw rows must
            include non-zero ``Control_Response``. Set True if responses are
            already control-normalized.
        response_normalization: Required. Stored on :class:`RawDataset` for
            :meth:`~RawDataset.to_screening_dataset`.
        **fit_params: Parameters for curve fitting (e.g. maxfev, bounds).

    Returns:
        List of dictionaries with S' values and ranking

    Raises:
        ValueError: If required columns are missing or data is invalid
    """
    if not list_of_rows:
        return []

    # Validate required columns exist (check first row keys)
    first_row_keys = list(list_of_rows[0].keys()) if list_of_rows else []
    if first_row_keys:
        _validate_required_columns(
            first_row_keys,
            source_name="in-memory data",
            values_as=values_as,
            skip_control_response_normalization=skip_control_response_normalization,
        )

    # Create report (row numbers will be 0 for in-memory data)
    if ProcessingReport is not None:
        report = ProcessingReport()
    else:
        report = None

    # Create a temporary RawDataset by manually constructing profiles
    # We'll use the first row to infer assay name, or use a default
    assay_name = list_of_rows[0].get("Screen ID", list_of_rows[0].get("Assay", "Unknown"))
    assay = Assay(name=assay_name)
    raw_dataset = RawDataset(
        assay=assay,
        response_normalization=response_normalization,
        skip_control_response_normalization=skip_control_response_normalization,
    )

    if report:
        report.total_rows = len(list_of_rows)

    reserved = _reserved_column_names(values_as, first_row_keys)
    compounds_seen = set()

    for row_idx, row in enumerate(list_of_rows):
        if report:
            report.rows_processed += 1

        # Check if row is fully blank
        is_fully_blank = not any(
            v.strip() if isinstance(v, str) else str(v).strip()
            for v in row.values()
            if v
        )
        if is_fully_blank:
            continue

        # Get cell line name early for use in warnings
        cell_line_name = row.get("Cell_Line", "").strip()

        # Check for empty cell line - RAISE EXCEPTION
        if not cell_line_name:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Cell_Line' value in in-memory data. "
                f"All rows must have a cell line specified."
            )

        # Get compound info (Compound_ID required; NCGCID pass-through only)
        compound_name = row.get("Compound Name", "").strip() or "Unknown"
        compound_id = row.get("Compound_ID", "").strip()
        if not compound_id:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Compound_ID' value in in-memory data. "
                f"Compound: {compound_name}, Cell_Line: {cell_line_name}. "
                f"All rows must have a compound identifier."
            )

        # Track compound
        if report and compound_id not in compounds_seen:
            compounds_seen.add(compound_id)
            report.compounds_loaded += 1

        # Check for missing compound name
        if report and compound_name == "Unknown":
            report.add_warning(
                row_number=0,
                category="DATA_QUALITY",
                message="Compound Name missing, using 'Unknown'",
                drug_id=compound_id,
                cell_line=cell_line_name,
                field_name="Compound Name",
            )
            report.missing_compound_names += 1

        compound = Compound(
            name=compound_name,
            drug_id=compound_id,
            pubchem_sid=row.get("pubchem_sid", "").strip() or None,
            smiles=row.get("SMILES", "").strip() or None,
        )

        # Create CellLine (cell_line_name already defined above)
        cell_line = CellLine(
            name=cell_line_name,
            ref_id=row.get("Cell_Line_Ref_ID", "").strip() or None
        )

        # Extract raw dose-response data (if present)
        concentrations = None
        responses = None
        if values_as == "list":
            resp_key = next((k for k in row if k.lower() == "responses"), None)
            conc_key = next((k for k in row if k.lower() == "concentrations"), None)
            if resp_key and conc_key:
                resp_str = (row.get(resp_key) or "").strip()
                conc_str = (row.get(conc_key) or "").strip()
                if resp_str and conc_str:
                    responses = []
                    conc_parts = []
                    for part in resp_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    responses.append(v)
                            except (ValueError, TypeError):
                                pass
                    for part in conc_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    conc_parts.append(v)
                            except (ValueError, TypeError):
                                pass
                    if len(responses) == len(conc_parts) and len(responses) >= 4:
                        units = row.get("Concentration_Units", "").strip()
                        if not units:
                            raise ValueError(
                                f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in in-memory data. "
                                f"Compound: {compound_name}, Cell_Line: {cell_line_name}."
                            )
                        concentrations = convert_to_micromolar(conc_parts, units)
                    else:
                        if len(responses) != len(conc_parts):
                            raise ValueError(
                                f"Row {row_idx + 1}: Responses and Concentrations length mismatch "
                                f"({len(responses)} vs {len(conc_parts)}) in in-memory data."
                            )
                        responses = None
                        concentrations = None
        else:
            data_cols = [k for k in row.keys() if k.startswith("Data") or k.startswith("DATA")]
            conc_cols = [
                k for k in row.keys()
                if (k.startswith("Conc") or k.startswith("CONC"))
                and "Units" not in k and "units" not in k
            ]
            if data_cols and conc_cols:
                data_cols = sorted(
                    data_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                conc_cols = sorted(
                    conc_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                responses = []
                concentrations = []
                for data_col, conc_col in zip(data_cols, conc_cols):
                    try:
                        resp_val = row.get(data_col, "") or ""
                        conc_val = row.get(conc_col, "") or ""
                        resp_val = resp_val.strip() if isinstance(resp_val, str) else ""
                        conc_val = conc_val.strip() if isinstance(conc_val, str) else ""
                        if resp_val and conc_val:
                            responses.append(float(resp_val))
                            concentrations.append(float(conc_val))
                    except (ValueError, TypeError):
                        continue
                if concentrations and responses:
                    units = row.get("Concentration_Units", "").strip()
                    if not units:
                        raise ValueError(
                            f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in in-memory data. "
                            f"Compound: {compound_name}, Cell_Line: {cell_line_name}."
                        )
                    concentrations = convert_to_micromolar(concentrations, units)
                else:
                    concentrations = None
                    responses = None

        # Extract pre-calculated Hill params (if present)
        hill_params = None
        ac50 = row.get("AC50", "").strip() or row.get("ec50", "").strip()
        if ac50:
            try:
                inf_a = row.get("Inf_asymptote", row.get("Upper", row.get("Infinity", "0")))
                zero_a = row.get("Zero_asymptote", row.get("Lower", row.get("Zero", "0")))
                hill_params = HillCurveParams(
                    ec50=float(ac50),
                    zero_asymptote=float(str(zero_a).strip() or "0"),
                    inf_asymptote=float(str(inf_a).strip() or "0"),
                    steepness_coefficient=_try_float(
                        row.get("Hill_Slope", row.get("Hill", row.get("slope", "")))
                    ),
                    r_squared=_try_float(row.get("r2", row.get(_RES_COL_R2, ""))),
                )
            except (ValueError, TypeError):
                hill_params = None

        # Validate that row has either raw data or pre-calculated params
        has_raw_data = (
            concentrations is not None and responses is not None and len(concentrations) > 0
        )
        has_precalc_params = hill_params is not None
        if not (has_raw_data or has_precalc_params):
            raise ValueError(
                f"Row {row_idx + 1}: No dose-response data found for compound '{compound_name}' "
                f"(Compound_ID: {compound_id}) in cell line '{cell_line_name}' in in-memory data. "
                f"Row must have either: (1) raw data columns (DATA*/CONC*), or "
                f"(2) pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote)."
            )

        _validate_control_response_for_raw_row(
            row,
            f"Row {row_idx + 1} in in-memory data",
            has_raw_data,
            compound_name=compound_name,
            cell_line_name=cell_line_name,
            compound_id=compound_id,
            skip_control_response_normalization=skip_control_response_normalization,
        )

        # Pre-calculated S' (if present)
        s_prime = _try_float(row.get("S'", row.get("S Prime", "")))
        rank = _try_int(row.get("Rank", ""))

        # Extract metadata: generic pass-through (all non-reserved columns,
        # exact header, value as-is)
        metadata = {}
        for col in row.keys():
            if col in reserved:
                continue
            raw = row.get(col, "")
            raw = raw if isinstance(raw, str) else str(raw)
            metadata[col] = raw

        # Create profile
        profile = DoseResponseProfile(
            compound=compound,
            cell_line=cell_line,
            assay=assay,
            concentrations=concentrations,
            responses=responses,
            concentration_units="microM",
            hill_params=hill_params,
            s_prime=s_prime,
            rank=rank,
            metadata=metadata if metadata else None,
            control_response=_control_response_numeric_for_raw_row(
                row,
                has_raw_data,
                skip_control_response_normalization,
            ),
        )
        raw_dataset.add_profile(profile)
        if report:
            report.profiles_created += 1

    # Process to ScreeningDataset (reuses same report)
    screening_dataset, process_report = raw_dataset.to_screening_dataset(
        report=report,
        allow_overwrite_precalc_params=allow_overwrite_precalc_params,
        **fit_params
    )

    # Print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    # Add ranking
    profiles = list(screening_dataset.profiles)
    profiles.sort(key=lambda p: p.s_prime if p.s_prime is not None else float("-inf"), reverse=True)
    for rank, profile in enumerate(profiles, start=1):
        profile.rank = rank

    return screening_dataset.to_dict_list()

Calculate S' values from in-memory data structure.
Matches original pseudo-code: getSPrimeFromData()
Uses global reporting configuration for console/log output.
Args
list_of_rows: List of dictionaries matching CSV row format
allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve parameters (AC50, asymptotes, Hill_Slope, r2) with fitted values when both raw and pre-calc exist. Default False (raise). When True, overwrites are logged as warnings.
values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization: If False (default), raw rows must include non-zero Control_Response. Set True if responses are already control-normalized.
response_normalization: Required. Stored on RawDataset for RawDataset.to_screening_dataset().
**fit_params: Parameters for curve fitting (e.g. maxfev, bounds).
Returns
List of dictionaries with S' values and ranking
Raises
ValueError: If required columns are missing or data is invalid
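For the default values_as="columns" mode, a row is a dict of strings keyed by the CSV-style headers the parser documents above (Compound_ID and Cell_Line are required; DATA*/CONC* carry the raw curve; any non-reserved column passes through as metadata). A sketch with invented values:

```python
# Hypothetical input row for get_s_prime_from_data with values_as="columns".
# Header names follow the documented row format; the data are made up.
row = {
    "Compound Name": "Example-Cpd-1",
    "Compound_ID": "CPD-0001",
    "Cell_Line": "ipNF96.11C",
    "Control_Response": "100",    # non-zero vehicle readout, required for raw
                                  # rows unless skip_control_response_normalization=True
    "Concentration_Units": "nM",  # required whenever raw DATA*/CONC* are present
    "DATA1": "95", "CONC1": "1",
    "DATA2": "80", "CONC2": "10",
    "DATA3": "40", "CONC3": "100",
    "DATA4": "12", "CONC4": "1000",
    "MOA": "MEK inhibitor",       # non-reserved column: passes through as metadata
}

# Values are strings, as they would be coming from csv.DictReader.
assert all(isinstance(v, str) for v in row.values())
```

Alternatively, a row may omit DATA*/CONC* and instead supply pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote); a row with neither raises ValueError.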
def get_s_primes_from_file(filepath: Union[str, Path],
                           allow_overwrite_precalc_params: bool = False,
                           values_as: str = 'columns',
                           skip_control_response_normalization: bool = True,
                           *,
                           response_normalization: ResponseNormalizationMethod,
                           **fit_params) -> List[Dict]
def get_s_primes_from_file(
    filepath: Union[str, Path],
    allow_overwrite_precalc_params: bool = False,
    values_as: str = "columns",
    skip_control_response_normalization: bool = True,
    *,
    response_normalization: ResponseNormalizationMethod,
    **fit_params,
) -> List[Dict]:
    """
    Load CSV and calculate S' values for all compounds.

    Matches original pseudo-code: getSPrimesFromFile()

    Uses global reporting configuration for console/log output.

    Args:
        filepath: Path to CSV file
        allow_overwrite_precalc_params: If True, allow overwriting
            pre-calculated curve parameters (AC50, asymptotes, Hill_Slope, r2)
            with fitted values when both raw and pre-calc exist. Default False
            (raise). When True, overwrites are logged as warnings.
        values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
        skip_control_response_normalization: If False, require non-zero
            ``Control_Response`` on each raw row. Passed to
            :meth:`RawDataset.load_from_file`.
        response_normalization: Required. Passed to
            :meth:`RawDataset.load_from_file`.
        **fit_params: Parameters for curve fitting (e.g. maxfev, bounds).

    Returns:
        List of dictionaries with S' values and ranking
    """
    # Create single report that accumulates both load and process warnings
    if ProcessingReport is not None:
        report = ProcessingReport()
        report.input_filepath = Path(filepath)
    else:
        report = None

    # Load
    raw_dataset, load_report = RawDataset.load_from_file(
        filepath,
        report=report,
        values_as=values_as,
        skip_control_response_normalization=skip_control_response_normalization,
        response_normalization=response_normalization,
    )

    # Process (reuses same report)
    screening_dataset, process_report = raw_dataset.to_screening_dataset(
        report=report,
        allow_overwrite_precalc_params=allow_overwrite_precalc_params,
        **fit_params
    )

    # Print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    # Add ranking
    profiles = list(screening_dataset.profiles)
    profiles.sort(key=lambda p: p.s_prime if p.s_prime is not None else float("-inf"), reverse=True)
    for rank, profile in enumerate(profiles, start=1):
        profile.rank = rank

    return screening_dataset.to_dict_list()

Load CSV and calculate S' values for all compounds.
Matches original pseudo-code: getSPrimesFromFile()
Uses global reporting configuration for console/log output.
Args
filepath: Path to CSV file
allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve parameters (AC50, asymptotes, Hill_Slope, r2) with fitted values when both raw and pre-calc exist. Default False (raise). When True, overwrites are logged as warnings.
values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization: If False, require non-zero Control_Response on each raw row. Passed to RawDataset.load_from_file().
response_normalization: Required. Passed to RawDataset.load_from_file().
**fit_params: Parameters for curve fitting (e.g. maxfev, bounds).
Returns
List of dictionaries with S' values and ranking
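A minimal on-disk CSV for the default values_as="columns" mode can be sketched with the stdlib csv module alone (this does not call sprime; headers follow the row format documented above, the data values are invented, and Control_Response is omitted because this function's default is skip_control_response_normalization=True):

```python
import csv
import os
import tempfile

# Hypothetical minimal input CSV for get_s_primes_from_file (values_as="columns").
header = ["Compound Name", "Compound_ID", "Cell_Line", "Concentration_Units",
          "DATA1", "CONC1", "DATA2", "CONC2", "DATA3", "CONC3", "DATA4", "CONC4"]
rows = [
    ["Example-Cpd-1", "CPD-0001", "ipNF96.11C", "nM",
     "95", "1", "80", "10", "40", "100", "12", "1000"],
]

path = os.path.join(tempfile.mkdtemp(), "screen.csv")
with open(path, "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(header)
    writer.writerows(rows)

# Re-read the way a CSV loader would; csv.DictReader yields string values.
with open(path) as fh:
    loaded = list(csv.DictReader(fh))
print(loaded[0]["Cell_Line"])  # -> ipNF96.11C
```

Note that response_normalization has no default here: it is keyword-only and required, so every call site must state which preprocessing mode the file expects.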
Classes
class Assay (name: str,
description: Optional[str] = None,
screen_id: Optional[str] = None,
readout_type: Optional[str] = None,
time_profile: Optional[str] = None)
@dataclass
class Assay:
    """
    Represents a standardized biological assay procedure.

    Attributes:
        name: Assay name or identifier
        description: Assay description (optional)
        screen_id: Screen ID from data (e.g., 'HTS002') (optional)
        readout_type: Type of measurement (e.g., 'activity', 'luminescence') (optional)
        time_profile: Time point if applicable (e.g., '24Hr', '48Hr', '4Day') (optional)
    """
    name: str
    description: Optional[str] = None
    screen_id: Optional[str] = None
    readout_type: Optional[str] = None
    time_profile: Optional[str] = None

    def __repr__(self):
        return f"Assay({self.name})"

Represents a standardized biological assay procedure.
Attributes
name: Assay name or identifier
description: Assay description (optional)
screen_id: Screen ID from data (e.g., 'HTS002') (optional)
readout_type: Type of measurement (e.g., 'activity', 'luminescence') (optional)
time_profile: Time point if applicable (e.g., '24Hr', '48Hr', '4Day') (optional)
Instance variables
var description : str | None
var name : str
var readout_type : str | None
var screen_id : str | None
var time_profile : str | None
class CellLine (name: str, ref_id: Optional[str] = None)-
Expand source code
@dataclass(frozen=True) class CellLine: """ Represents a cell line/clone. Attributes: name: Cell line name (e.g., 'ipNF96.11C', 'LS513_LARGE_INTESTINE') ref_id: Reference identifier (e.g., 'ACH-000007', 'depmap_id') (optional) """ name: str ref_id: Optional[str] = None def __repr__(self): return f"CellLine({self.name})"Represents a cell line/clone.
Attributes
name- Cell line name (e.g., 'ipNF96.11C', 'LS513_LARGE_INTESTINE')
ref_id- Reference identifier (e.g., 'ACH-000007', 'depmap_id') (optional)
Instance variables
var name : str
var ref_id : str | None
class Compound (name: str,
drug_id: str,
pubchem_sid: Optional[str] = None,
smiles: Optional[str] = None)-
Expand source code
@dataclass(frozen=True) class Compound: """ Represents a chemical compound/drug entity. Attributes: name: Compound name drug_id: Unique compound identifier (from Compound_ID column) pubchem_sid: PubChem substance identifier (optional) smiles: SMILES notation of chemical structure (optional) """ name: str drug_id: str pubchem_sid: Optional[str] = None smiles: Optional[str] = None def __repr__(self): return f"Compound({self.name}, id={self.drug_id})"Represents a chemical compound/drug entity.
Attributes
name- Compound name
drug_id- Unique compound identifier (from Compound_ID column)
pubchem_sid- PubChem substance identifier (optional)
smiles- SMILES notation of chemical structure (optional)
Instance variables
var drug_id : str
var name : str
var pubchem_sid : str | None
var smiles : str | None
class ConsoleOutput (*args, **kwds)-
Expand source code
class ConsoleOutput(Enum): """Console output verbosity levels.""" NONE = "none" # No console output SUMMARY = "summary" # Brief summary (default) VERBOSE = "verbose" # Detailed output with all warningsConsole output verbosity levels.
Ancestors
- enum.Enum
Class variables
var NONE
var SUMMARY
var VERBOSE
class DoseResponseProfile (compound: Compound,
cell_line: CellLine,
assay: Assay,
concentrations: Optional[List[float]] = None,
responses: Optional[List[float]] = None,
concentration_units: str = 'microM',
hill_params: Optional[HillCurveParams] = None,
s_prime: Optional[float] = None,
rank: Optional[int] = None,
metadata: Optional[Dict[str, str]] = None,
control_response: Optional[float] = None)-
Expand source code
@dataclass class DoseResponseProfile: """ Represents a dose-response profile for one Compound-CellLine pair in one Assay. Contains raw data, fitted curve parameters, and calculated S' value. Attributes: compound: Compound entity cell_line: CellLine entity assay: Assay entity concentrations: Raw concentration values (list of floats) (optional) responses: Raw response values (list of floats) (optional) concentration_units: Units for concentrations (default: 'microM') hill_params: Fitted Hill curve parameters (optional) s_prime: Calculated S' value (optional) rank: Rank of S' value (optional) metadata: Additional metadata from CSV (e.g., MOA, drug targets) (optional) """ compound: Compound cell_line: CellLine assay: Assay # Raw data concentrations: Optional[List[float]] = None responses: Optional[List[float]] = None concentration_units: str = "microM" # Fitted parameters hill_params: Optional[HillCurveParams] = None # Results s_prime: Optional[float] = None rank: Optional[int] = None # Additional metadata metadata: Optional[Dict[str, str]] = None #: Vehicle control readout from CSV (used only when strict import applies #: :func:`~sprime.response_pipeline.pipeline_asymptote_normalized` before fitting). control_response: Optional[float] = None def fit_hill_curve(self, **fit_params) -> HillCurveParams: """ Fit four-parameter Hill equation to raw dose-response data. Updates self.hill_params with fitted parameters. 
Args: **fit_params: Additional parameters for curve fitting Returns: HillCurveParams: Fitted curve parameters Raises: ValueError: If raw data is not available or hill_params already exist """ if self.hill_params is not None: return self.hill_params # Already fitted if self.concentrations is None or self.responses is None: raise ValueError("Need raw data to fit Hill curve") if len(self.concentrations) != len(self.responses): raise ValueError("Concentrations and responses must have same length") if hill_fitting is None: raise ImportError( "Hill curve fitting requires scipy. " "Install with: pip install scipy" ) # Fit Hill curve using hill_fitting module self.hill_params = hill_fitting.fit_hill_curve( self.concentrations, self.responses, **fit_params ) return self.hill_params def calculate_s_prime(self) -> float: """ Calculate **S' = asinh((zero_asymptote - inf_asymptote) / EC50)**. Uses **fitted** ``hill_params`` (or precalc loaded into them) - do not substitute assumed asymptotes. Same definition as :func:`calculate_s_prime_from_params`. Sign is meaningful: e.g. when response **decreases** with dose, ``zero_asymptote`` is often **greater** than ``inf_asymptote``, so the numerator is positive. Requires hill_params to be set (call fit_hill_curve first). Returns: float: S' value Raises: ValueError: If hill_params is not available """ if self.hill_params is None: raise ValueError("Must fit Hill curve before calculating S'") hp = self.hill_params ratio = (hp.zero_asymptote - hp.inf_asymptote) / hp.ec50 self.s_prime = math.asinh(ratio) # asinh(x) = ln(x + sqrt(x^2 + 1)) return self.s_prime def fit_and_calculate_s_prime(self, **fit_params) -> float: """ Convenience method: fit curve then calculate S'. Args: **fit_params: Parameters for curve fitting Returns: float: S' value """ self.fit_hill_curve(**fit_params) return self.calculate_s_prime()Represents a dose-response profile for one Compound-CellLine pair in one Assay.
Contains raw data, fitted curve parameters, and calculated S' value.
Attributes
compound- Compound entity
cell_line- CellLine entity
assay- Assay entity
concentrations- Raw concentration values (list of floats) (optional)
responses- Raw response values (list of floats) (optional)
concentration_units- Units for concentrations (default: 'microM')
hill_params- Fitted Hill curve parameters (optional)
s_prime- Calculated S' value (optional)
rank- Rank of S' value (optional)
metadata- Additional metadata from CSV (e.g., MOA, drug targets) (optional)
Instance variables
var assay : Assay
var cell_line : CellLine
var compound : Compound
var concentration_units : str
var concentrations : List[float] | None
var control_response : float | None-
Vehicle control readout from CSV (used only when strict import applies :func:~sprime.response_pipeline.pipeline_asymptote_normalized before fitting).
var hill_params : HillCurveParams | None
var metadata : Dict[str, str] | None
var rank : int | None
var responses : List[float] | None
var s_prime : float | None
Methods
def calculate_s_prime(self) ‑> float-
Expand source code
def calculate_s_prime(self) -> float: """ Calculate **S' = asinh((zero_asymptote - inf_asymptote) / EC50)**. Uses **fitted** ``hill_params`` (or precalc loaded into them) - do not substitute assumed asymptotes. Same definition as :func:`calculate_s_prime_from_params`. Sign is meaningful: e.g. when response **decreases** with dose, ``zero_asymptote`` is often **greater** than ``inf_asymptote``, so the numerator is positive. Requires hill_params to be set (call fit_hill_curve first). Returns: float: S' value Raises: ValueError: If hill_params is not available """ if self.hill_params is None: raise ValueError("Must fit Hill curve before calculating S'") hp = self.hill_params ratio = (hp.zero_asymptote - hp.inf_asymptote) / hp.ec50 self.s_prime = math.asinh(ratio) # asinh(x) = ln(x + sqrt(x^2 + 1)) return self.s_primeCalculate S' = asinh((zero_asymptote - inf_asymptote) / EC50).
Uses fitted hill_params (or precalc loaded into them) - do not substitute assumed asymptotes. Same definition as :func:calculate_s_prime_from_params().
Sign is meaningful: e.g. when response decreases with dose, zero_asymptote is often greater than inf_asymptote, so the numerator is positive.
Requires hill_params to be set (call fit_hill_curve first).
Returns
float- S' value
Raises
ValueError- If hill_params is not available
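The definition above reduces to a one-line computation; as a quick check of the sign behavior (a self-contained sketch using only the stdlib, not the library's method):

```python
import math

def s_prime(zero_asymptote: float, inf_asymptote: float, ec50: float) -> float:
    """S' = asinh((zero_asymptote - inf_asymptote) / EC50)."""
    return math.asinh((zero_asymptote - inf_asymptote) / ec50)

# Response decreasing from 100 (zero dose) to 0 (saturating dose), EC50 = 1:
# numerator is positive, so S' is positive.
s = s_prime(100.0, 0.0, 1.0)
print(round(s, 4))  # asinh(100) ≈ 5.2983
```

Swapping the asymptotes flips the sign, which is why "inf - zero" amplitude conventions must not be substituted here.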
def fit_and_calculate_s_prime(self, **fit_params) ‑> float-
Expand source code
def fit_and_calculate_s_prime(self, **fit_params) -> float: """ Convenience method: fit curve then calculate S'. Args: **fit_params: Parameters for curve fitting Returns: float: S' value """ self.fit_hill_curve(**fit_params) return self.calculate_s_prime()Convenience method: fit curve then calculate S'.
Args
**fit_params- Parameters for curve fitting
Returns
float- S' value
def fit_hill_curve(self, **fit_params) ‑> HillCurveParams-
Expand source code
def fit_hill_curve(self, **fit_params) -> HillCurveParams: """ Fit four-parameter Hill equation to raw dose-response data. Updates self.hill_params with fitted parameters. Args: **fit_params: Additional parameters for curve fitting Returns: HillCurveParams: Fitted curve parameters Raises: ValueError: If raw data is not available or hill_params already exist """ if self.hill_params is not None: return self.hill_params # Already fitted if self.concentrations is None or self.responses is None: raise ValueError("Need raw data to fit Hill curve") if len(self.concentrations) != len(self.responses): raise ValueError("Concentrations and responses must have same length") if hill_fitting is None: raise ImportError( "Hill curve fitting requires scipy. " "Install with: pip install scipy" ) # Fit Hill curve using hill_fitting module self.hill_params = hill_fitting.fit_hill_curve( self.concentrations, self.responses, **fit_params ) return self.hill_paramsFit four-parameter Hill equation to raw dose-response data.
Updates self.hill_params with fitted parameters.
Args
**fit_params- Additional parameters for curve fitting
Returns
HillCurveParams- Fitted curve parameters
Raises
ValueError- If raw data is not available or hill_params already exist
class HillCurveParams (ec50: float,
zero_asymptote: float,
inf_asymptote: float,
steepness_coefficient: Optional[float] = None,
r_squared: Optional[float] = None)-
Expand source code
@dataclass class HillCurveParams: """ Parameters from a linear-x four-parameter logistic (linear-x 4PL) fit: concentration x enters as (x/C)^n. This differs from log-x 4PL tools (log10 dose in the exponent); see docs/background/README_4PL_Dose_Response.md#linear-x-vs-log-x-4pl-hill-slope. Hill equation: y = inf_asymptote + (zero_asymptote - inf_asymptote) / (1 + (x/C)^n) Where: zero_asymptote = response in the limit concentration -> 0 (left side of the dose axis) inf_asymptote = response in the limit saturating concentration (right side of the dose axis) C = ec50 (half-maximal concentration) n = steepness_coefficient (exponent controlling curve steepness) Curves may increase or decrease with dose; **zero** and **inf** name the **dose extremes**, not which response value is numerically larger. **S':** :math:`\\mathrm{asinh}((\\mathrm{zero} - \\mathrm{inf}) / \\mathrm{EC50})` with the same ``zero_asymptote`` / ``inf_asymptote``. Do not substitute **inf - zero** (common in some 4PL "amplitude" wording): the sign differs and **S'** is defined only with **zero - inf**. **Conceptual mapping:** In many pharmacology/biochemistry texts the exponent *n* in a linear-x Hill form is called the **Hill coefficient** (often linked to cooperativity). That is the same mathematical role as ``steepness_coefficient`` here. We use the name ``steepness_coefficient`` to emphasize the linear-x parameterization and to avoid confusing this *n* with "Hill slope" values reported by **log-x** dose-response tools, which are not numerically the same. Attributes: ec50: Half-maximal effect concentration (AC50/EC50) zero_asymptote: Limit as concentration -> 0 (left of dose axis) inf_asymptote: Limit at saturating concentration (right of dose axis) steepness_coefficient: Exponent *n* in (x/C)^n (optional). Same role as the classical Hill coefficient *n* in this linear-x model; see conceptual mapping above. 
r_squared: R-squared goodness of fit statistic (optional) """ ec50: float zero_asymptote: float inf_asymptote: float steepness_coefficient: Optional[float] = None r_squared: Optional[float] = NoneParameters from a linear-x four-parameter logistic (linear-x 4PL) fit: concentration x enters as (x/C)^n. This differs from log-x 4PL tools (log10 dose in the exponent); see docs/background/README_4PL_Dose_Response.md#linear-x-vs-log-x-4pl-hill-slope.
Hill equation: y = inf_asymptote + (zero_asymptote - inf_asymptote) / (1 + (x/C)^n)
Where
zero_asymptote = response in the limit concentration -> 0 (left side of the dose axis)
inf_asymptote = response in the limit of saturating concentration (right side of the dose axis)
C = ec50 (half-maximal concentration)
n = steepness_coefficient (exponent controlling curve steepness)
Curves may increase or decrease with dose; zero and inf name the dose extremes, not which response value is numerically larger.
S': asinh((zero - inf) / EC50), with the same zero_asymptote / inf_asymptote. Do not substitute inf - zero (common in some 4PL "amplitude" wording): the sign differs, and S' is defined only with zero - inf.
Conceptual mapping: In many pharmacology/biochemistry texts the exponent n in a linear-x Hill form is called the Hill coefficient (often linked to cooperativity). That is the same mathematical role as steepness_coefficient here. We use the name steepness_coefficient to emphasize the linear-x parameterization and to avoid confusing this n with "Hill slope" values reported by log-x dose-response tools, which are not numerically the same.
Attributes
ec50- Half-maximal effect concentration (AC50/EC50)
zero_asymptote- Limit as concentration -> 0 (left of dose axis)
inf_asymptote- Limit at saturating concentration (right of dose axis)
steepness_coefficient- Exponent n in (x/C)^n (optional). Same role as the classical Hill coefficient n in this linear-x model; see conceptual mapping above.
r_squared- R-squared goodness of fit statistic (optional)
Instance variables
var ec50 : float
var inf_asymptote : float
var r_squared : float | None
var steepness_coefficient : float | None
var zero_asymptote : float
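The linear-x 4PL equation above can be evaluated directly; at x = EC50 the curve sits exactly halfway between the two asymptotes. A minimal sketch (not library code; parameter names mirror the attributes above):

```python
def hill_4pl(x: float, ec50: float, zero_asymptote: float,
             inf_asymptote: float, n: float = 1.0) -> float:
    """Linear-x 4PL: y = inf + (zero - inf) / (1 + (x / C)^n)."""
    return inf_asymptote + (zero_asymptote - inf_asymptote) / (1.0 + (x / ec50) ** n)

# Decreasing curve: 100 at zero dose, 0 at saturating dose, EC50 = 1.
print(hill_4pl(0.0, ec50=1.0, zero_asymptote=100.0, inf_asymptote=0.0))  # → 100.0
print(hill_4pl(1.0, ec50=1.0, zero_asymptote=100.0, inf_asymptote=0.0))  # → 50.0
```

Note that this linear-x form takes concentration directly in the exponent term (x/C)^n; a log-x 4PL tool's "Hill slope" is not interchangeable with n here.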
class ProcessingReport (input_filepath: pathlib.Path | None = None,
total_rows: int = 0,
rows_processed: int = 0,
rows_skipped: int = 0,
compounds_loaded: int = 0,
profiles_created: int = 0,
profiles_with_s_prime: int = 0,
profiles_failed_fit: int = 0,
missing_drug_ids: int = 0,
missing_compound_names: int = 0,
missing_cell_lines: int = 0,
insufficient_data_points: int = 0,
invalid_numeric_values: int = 0,
forward_filled_fields: int = 0,
warnings: List[WarningEntry] = <factory>)-
Expand source code
@dataclass class ProcessingReport: """Combined report for load + process operations.""" # File info input_filepath: Optional[Path] = None # Summary metrics total_rows: int = 0 rows_processed: int = 0 rows_skipped: int = 0 compounds_loaded: int = 0 profiles_created: int = 0 profiles_with_s_prime: int = 0 profiles_failed_fit: int = 0 # Data quality counts missing_drug_ids: int = 0 missing_compound_names: int = 0 missing_cell_lines: int = 0 insufficient_data_points: int = 0 invalid_numeric_values: int = 0 forward_filled_fields: int = 0 # All warnings (from both load and process) warnings: List[WarningEntry] = field(default_factory=list) def add_warning( self, row_number: int, category: str, message: str, drug_id: Optional[str] = None, compound_name: Optional[str] = None, cell_line: Optional[str] = None, field_name: Optional[str] = None, ): """Add a warning entry.""" self.warnings.append( WarningEntry( row_number=row_number, category=category, message=message, drug_id=drug_id, compound_name=compound_name, cell_line=cell_line, field_name=field_name, ) ) def write_log_file(self, filepath: Optional[Path] = None): """Write log file if enabled.""" if not ReportingConfig.log_to_file: return # Use provided path or configured path or auto-generate if filepath: log_path = Path(filepath) elif ReportingConfig.log_filepath: log_path = ReportingConfig.log_filepath elif self.input_filepath: # Auto-generate from input filename log_path = self.input_filepath.parent / f"{self.input_filepath.stem}_processing.log" else: log_path = Path("sprime_processing.log") write_processing_log(self, log_path) def print_console_summary(self): """Print console summary based on configured verbosity.""" if ReportingConfig.console_output == ConsoleOutput.NONE: return elif ReportingConfig.console_output == ConsoleOutput.SUMMARY: print_processing_summary(self) elif ReportingConfig.console_output == ConsoleOutput.VERBOSE: print_processing_summary_verbose(self)Combined report for load + process operations.
Instance variables
var compounds_loaded : int
var forward_filled_fields : int
var input_filepath : pathlib.Path | None
var insufficient_data_points : int
var invalid_numeric_values : int
var missing_cell_lines : int
var missing_compound_names : int
var missing_drug_ids : int
var profiles_created : int
var profiles_failed_fit : int
var profiles_with_s_prime : int
var rows_processed : int
var rows_skipped : int
var total_rows : int
var warnings : List[WarningEntry]
Methods
def add_warning(self,
row_number: int,
category: str,
message: str,
drug_id: str | None = None,
compound_name: str | None = None,
cell_line: str | None = None,
field_name: str | None = None)-
Expand source code
def add_warning( self, row_number: int, category: str, message: str, drug_id: Optional[str] = None, compound_name: Optional[str] = None, cell_line: Optional[str] = None, field_name: Optional[str] = None, ): """Add a warning entry.""" self.warnings.append( WarningEntry( row_number=row_number, category=category, message=message, drug_id=drug_id, compound_name=compound_name, cell_line=cell_line, field_name=field_name, ) )Add a warning entry.
def print_console_summary(self)-
Expand source code
def print_console_summary(self): """Print console summary based on configured verbosity.""" if ReportingConfig.console_output == ConsoleOutput.NONE: return elif ReportingConfig.console_output == ConsoleOutput.SUMMARY: print_processing_summary(self) elif ReportingConfig.console_output == ConsoleOutput.VERBOSE: print_processing_summary_verbose(self)Print console summary based on configured verbosity.
def write_log_file(self, filepath: pathlib.Path | None = None)-
Expand source code
def write_log_file(self, filepath: Optional[Path] = None): """Write log file if enabled.""" if not ReportingConfig.log_to_file: return # Use provided path or configured path or auto-generate if filepath: log_path = Path(filepath) elif ReportingConfig.log_filepath: log_path = ReportingConfig.log_filepath elif self.input_filepath: # Auto-generate from input filename log_path = self.input_filepath.parent / f"{self.input_filepath.stem}_processing.log" else: log_path = Path("sprime_processing.log") write_processing_log(self, log_path)Write log file if enabled.
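When no explicit path is configured, the log path is auto-generated from the input filename's stem, as in this sketch of the fallback branch (pathlib only; the filename is made up):

```python
from pathlib import Path

# Mirrors the auto-generation branch: <input dir>/<input stem>_processing.log
input_filepath = Path("results/screen_A.csv")
log_path = input_filepath.parent / f"{input_filepath.stem}_processing.log"
print(log_path.as_posix())  # → results/screen_A_processing.log
```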
class RawDataset (assay: Assay,
*,
response_normalization: ResponseNormalizationMethod,
skip_control_response_normalization: bool = False)-
Expand source code
class RawDataset: """ Holds raw dose-response curve data loaded from CSV/files. Focus: Data loading, validation, basic structure. May contain profiles with raw data (concentrations/responses) or pre-calculated Hill params from CSV. """ def __init__( self, assay: Assay, *, response_normalization: ResponseNormalizationMethod, skip_control_response_normalization: bool = False, ): """ Initialize RawDataset. Args: assay: Assay entity for this dataset response_normalization: Prescribed preprocessing after test/control ratio when the DMSO path applies: ``asymptote_normalized`` (max-normalize then x100) or ``response_scale`` (x100 only). Required at import so file/load intent is explicit. When ``skip_control_response_normalization=True``, responses are not re-scaled at process time; this value still documents the intended interpretation of ``Responses``. skip_control_response_normalization: If False (default), each raw row must include a non-zero numeric ``Control_Response`` (vehicle readout). If True, skip that requirement; use when responses are already control-normalized upstream. """ self.assay = assay self.response_normalization = response_normalization self.skip_control_response_normalization = skip_control_response_normalization self._profiles: Dict[ Tuple[str, str], DoseResponseProfile ] = {} # (compound_id, cellline_name) -> profile @classmethod def load_from_file( cls, filepath: Union[str, Path], assay_name: Optional[str] = None, report: Optional["ProcessingReport"] = None, values_as: str = "columns", skip_control_response_normalization: bool = False, *, response_normalization: ResponseNormalizationMethod, **csv_kwargs, ) -> Tuple["RawDataset", "ProcessingReport"]: """ Load CSV file and create RawDataset with quality reporting. 
Handles: - Extracting raw data (Data0..DataN, Conc0..ConcN) or Responses/Concentrations - Loading pre-calculated params (AC50, Zero_asymptote, Inf_asymptote) if present - Creating DoseResponseProfile objects - Tracking data quality issues and warnings Args: filepath: Path to CSV file assay_name: Name for assay (defaults to filename stem) report: Optional ProcessingReport to accumulate warnings (creates new if None) values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations) skip_control_response_normalization: If False (default), raw rows must include non-zero ``Control_Response`` (vehicle readout). Set True only if responses are already control-normalized. response_normalization: Required. ``asymptote_normalized`` or ``response_scale``; applied after test/control ratio when processing raw curves with the DMSO path (see :mod:`sprime.response_pipeline`). Must match how the submitting lab defined the dose-response sheet (normalized vs non-normalized x100 columns). **csv_kwargs: Additional arguments for csv.DictReader Returns: Tuple of (RawDataset, ProcessingReport) """ filepath = Path(filepath) # Create or use existing report if report is None: if ProcessingReport is not None: report = ProcessingReport() report.input_filepath = filepath else: # Fallback if reporting not available report = None # Infer assay name if assay_name is None: assay_name = filepath.stem assay = Assay(name=assay_name) raw_dataset = cls( assay=assay, response_normalization=response_normalization, skip_control_response_normalization=skip_control_response_normalization, ) # Read CSV with open(filepath, "r", encoding="utf-8") as f: reader = csv.DictReader(f, **csv_kwargs) fieldnames = reader.fieldnames rows = list(reader) # Validate required columns exist in header if fieldnames: _validate_required_columns( fieldnames, source_name=f"CSV file '{filepath}'", values_as=values_as, skip_control_response_normalization=skip_control_response_normalization, ) if report: report.total_rows = 
len(rows) + 1 # +1 for header if not rows: if report is None: # Return dummy report if reporting not available return raw_dataset, None return raw_dataset, report reserved = _reserved_column_names(values_as, fieldnames or []) # Track compounds seen compounds_seen = set() # Process rows with line numbers (row 1 is header, so start at 2) for row_num, row in enumerate(rows, start=2): # Check if row is fully blank (all values empty/whitespace) is_fully_blank = not any( v.strip() if isinstance(v, str) else str(v).strip() for v in row.values() if v ) if is_fully_blank: # Skip fully blank rows silently (no logging, don't count as processed) continue if report: report.rows_processed += 1 # Check for empty cell line - RAISE EXCEPTION cell_line_name = row.get("Cell_Line", "").strip() if not cell_line_name: raise ValueError( f"Row {row_num}: Missing required 'Cell_Line' value. " f"All rows must have a cell line specified." ) # Get compound info (Compound_ID required; NCGCID is pass-through only) # Rows are taken literally: empty values are null, no forward-filling. compound_name = row.get("Compound Name", "").strip() or "Unknown" compound_id = row.get("Compound_ID", "").strip() if not compound_id: raise ValueError( f"Row {row_num}: Missing required 'Compound_ID' value. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}. " f"All rows must have a compound identifier." 
) # Track compound if report and compound_id not in compounds_seen: compounds_seen.add(compound_id) report.compounds_loaded += 1 # Check for missing compound name if report and compound_name == "Unknown": report.add_warning( row_number=row_num, category="DATA_QUALITY", message="Compound Name missing, using 'Unknown'", drug_id=compound_id, cell_line=cell_line_name, field_name="Compound Name", ) report.missing_compound_names += 1 compound = Compound( name=compound_name, drug_id=compound_id, pubchem_sid=row.get("pubchem_sid", "").strip() or None, smiles=row.get("SMILES", "").strip() or None, ) # Create CellLine cell_line = CellLine( name=cell_line_name, ref_id=row.get("Cell_Line_Ref_ID", "").strip() or None ) # Extract raw dose-response data (if present) concentrations = None responses = None if values_as == "list": resp_key = next((k for k in row if k.lower() == "responses"), None) conc_key = next((k for k in row if k.lower() == "concentrations"), None) if resp_key and conc_key: resp_str = (row.get(resp_key) or "").strip() conc_str = (row.get(conc_key) or "").strip() if resp_str and conc_str: responses = [] concentrations = [] for part in resp_str.split(","): t = part.strip() if t: try: v = float(t) if report and (math.isnan(v) or math.isinf(v)): report.add_warning( row_number=row_num, category="NUMERICAL", message="Invalid numeric value (NaN/Inf) in Responses", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name="Responses", ) report.invalid_numeric_values += 1 continue responses.append(v) except (ValueError, TypeError) as e: if report: report.add_warning( row_number=row_num, category="DATA_QUALITY", message=f"Non-numeric value in Responses: {e}", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name="Responses", ) report.invalid_numeric_values += 1 continue conc_parts = [] for part in conc_str.split(","): t = part.strip() if t: try: v = float(t) if report and (math.isnan(v) or math.isinf(v)): 
report.add_warning( row_number=row_num, category="NUMERICAL", message="Invalid numeric value (NaN/Inf) in Concentrations", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name="Concentrations", ) report.invalid_numeric_values += 1 continue conc_parts.append(v) except (ValueError, TypeError) as e: if report: report.add_warning( row_number=row_num, category="DATA_QUALITY", message=f"Non-numeric value in Concentrations: {e}", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name="Concentrations", ) report.invalid_numeric_values += 1 continue if len(responses) != len(conc_parts): raise ValueError( f"Row {row_num}: Responses and Concentrations length mismatch " f"({len(responses)} vs {len(conc_parts)}). " f"Compound: {compound_name}, Cell_Line: {cell_line_name}." ) if len(responses) < 4: if report: report.add_warning( row_number=row_num, category="MISSING_DATA", message=f"Insufficient data points: {len(responses)} found, need 4+ for curve fitting", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, ) report.insufficient_data_points += 1 concentrations = None responses = None else: units = row.get("Concentration_Units", "").strip() if not units: raise ValueError( f"Row {row_num}: Missing required 'Concentration_Units' for raw data. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}. " f"Raw dose-response data requires Concentration_Units." 
) concentrations = convert_to_micromolar(conc_parts, units) responses = responses elif True: # Columns format: DATA*/CONC* data_cols = [k for k in row.keys() if k.startswith("Data") or k.startswith("DATA")] conc_cols = [ k for k in row.keys() if (k.startswith("Conc") or k.startswith("CONC")) and "Units" not in k and "units" not in k ] if data_cols and conc_cols: # Sort columns to ensure correct order data_cols = sorted( data_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0") ) conc_cols = sorted( conc_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0") ) responses = [] concentrations = [] for data_col, conc_col in zip(data_cols, conc_cols): try: resp_val = row.get(data_col, "") or "" conc_val = row.get(conc_col, "") or "" resp_val = resp_val.strip() if isinstance(resp_val, str) else "" conc_val = conc_val.strip() if isinstance(conc_val, str) else "" if resp_val and conc_val: resp_float = float(resp_val) conc_float = float(conc_val) # Check for invalid numeric values if report: if math.isnan(resp_float) or math.isinf(resp_float): report.add_warning( row_number=row_num, category="NUMERICAL", message=f"Invalid numeric value (NaN/Inf) in {data_col}", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name=data_col, ) report.invalid_numeric_values += 1 continue if math.isnan(conc_float) or math.isinf(conc_float): report.add_warning( row_number=row_num, category="NUMERICAL", message=f"Invalid numeric value (NaN/Inf) in {conc_col}", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, field_name=conc_col, ) report.invalid_numeric_values += 1 continue responses.append(resp_float) concentrations.append(conc_float) except (ValueError, TypeError) as e: if report: report.add_warning( row_number=row_num, category="DATA_QUALITY", message=f"Non-numeric value in {data_col} or {conc_col}: {str(e)}", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, 
field_name=f"{data_col}/{conc_col}", ) report.invalid_numeric_values += 1 continue # Check for sufficient data points if report and len(responses) < 4: report.add_warning( row_number=row_num, category="MISSING_DATA", message=f"Insufficient data points: {len(responses)} found, need 4+ for curve fitting", drug_id=compound_id, compound_name=compound_name, cell_line=cell_line_name, ) report.insufficient_data_points += 1 if concentrations and responses: units = row.get("Concentration_Units", "").strip() if not units: raise ValueError( f"Row {row_num}: Missing required 'Concentration_Units' for raw data. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}. " f"Raw dose-response data requires Concentration_Units." ) concentrations = convert_to_micromolar(concentrations, units) concentrations = concentrations if concentrations else None responses = responses if responses else None # Extract pre-calculated Hill params (if present) hill_params = None ac50 = row.get("AC50", "").strip() or row.get("ec50", "").strip() if ac50: try: r2_raw = row.get("r2", row.get(_RES_COL_R2, "")) r_sq = _try_float(r2_raw) inf_a = row.get("Inf_asymptote", row.get("Upper", row.get("Infinity", "0"))) zero_a = row.get("Zero_asymptote", row.get("Lower", row.get("Zero", "0"))) hill_params = HillCurveParams( ec50=float(ac50), zero_asymptote=float(str(zero_a).strip() or "0"), inf_asymptote=float(str(inf_a).strip() or "0"), steepness_coefficient=_try_float( row.get("Hill_Slope", row.get("Hill", row.get("slope", ""))) ), r_squared=r_sq, ) except (ValueError, TypeError): hill_params = None # Pre-calculated S' (if present) s_prime = _try_float(row.get("S'", row.get("S Prime", ""))) rank = _try_int(row.get("Rank", "")) # Validate that row has either raw data or pre-calculated params has_raw_data = ( concentrations is not None and responses is not None and len(concentrations) > 0 ) has_precalc_params = hill_params is not None if not (has_raw_data or has_precalc_params): raise ValueError( f"Row 
{row_num}: No dose-response data found for compound '{compound_name}' " f"(Compound_ID: {compound_id}) in cell line '{cell_line_name}'. " f"Row must have either: (1) raw data columns (DATA*/CONC*), or " f"(2) pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote)." ) _validate_control_response_for_raw_row( row, f"Row {row_num}", has_raw_data, compound_name=compound_name, cell_line_name=cell_line_name, compound_id=compound_id, skip_control_response_normalization=skip_control_response_normalization, ) # Extract metadata: generic pass-through (all non-reserved columns, exact header, value as-is) metadata = {} for col in row.keys(): if col in reserved: continue raw = row.get(col, "") raw = raw if isinstance(raw, str) else str(raw) metadata[col] = raw # Create profile profile = DoseResponseProfile( compound=compound, cell_line=cell_line, assay=assay, concentrations=concentrations, responses=responses, concentration_units="microM", hill_params=hill_params, s_prime=s_prime, rank=rank, metadata=metadata if metadata else None, control_response=_control_response_numeric_for_raw_row( row, has_raw_data, skip_control_response_normalization, ), ) raw_dataset.add_profile(profile) if report: report.profiles_created += 1 if report is None: # Return dummy report if reporting not available return raw_dataset, None return raw_dataset, report def add_profile(self, profile: DoseResponseProfile): """ Add a profile to the dataset. Args: profile: DoseResponseProfile to add Raises: ValueError: If profile for this compound-cellline pair already exists """ key = (profile.compound.drug_id, profile.cell_line.name) if key in self._profiles: raise ValueError(f"Profile for {key} already exists") self._profiles[key] = profile def get_profile( self, compound: Union[Compound, str], cell_line: Union[CellLine, str] ) -> Optional[DoseResponseProfile]: """ Retrieve a specific profile. 
Args: compound: Compound object or drug_id string cell_line: CellLine object or cell_line name string Returns: DoseResponseProfile or None if not found """ compound_id = compound.drug_id if isinstance(compound, Compound) else compound cellline_name = cell_line.name if isinstance(cell_line, CellLine) else cell_line return self._profiles.get((compound_id, cellline_name)) def to_screening_dataset( self, report: Optional["ProcessingReport"] = None, allow_overwrite_precalc_params: bool = False, **fit_params, ) -> Tuple["ScreeningDataset", "ProcessingReport"]: """ Process raw data into ScreeningDataset with quality reporting. For each profile: 1. When ``skip_control_response_normalization`` is False on this dataset and the profile carries a parsed ``control_response``, raw responses are transformed using ``self.response_normalization`` (set at import): either :func:`~sprime.response_pipeline.pipeline_asymptote_normalized` or :func:`~sprime.response_pipeline.pipeline_response_scale`, before fitting. 2. Fit Hill curve when raw data exists, or use pre-calc when no raw. 3. Always compute S' from Hill params. Warn if S' was provided in input and is overwritten **only when raw data was fitted** (surprising overwrite). For precalc-only rows (no raw curve), S' from the sheet is always recomputed from imported Hill parameters---see ``PRE_CALC_ONLY`` without a separate ``OVERWRITE_S_PRIME`` warning. When both raw data and pre-calc (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) exist: - allow_overwrite_precalc_params=False (default): raise (would overwrite). - allow_overwrite_precalc_params=True: fit, overwrite pre-calc, and log a warning that pre-calculated curve parameters were overwritten. Args: report: Optional ProcessingReport to accumulate warnings (creates new if None) allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve parameters (EC50, zero/inf asymptotes, steepness, r-squared) with fitted values. 
Default False: raise when we would overwrite. **fit_params: Parameters for curve fitting (e.g. maxfev, bounds). Returns: Tuple of (ScreeningDataset, ProcessingReport) Raises: ValueError: If profile cannot be processed, or would overwrite pre-calculated parameters without allow_overwrite_precalc_params=True. """ if report is None: if ProcessingReport is not None: report = ProcessingReport() else: report = None screening_dataset = ScreeningDataset(assay=self.assay) for profile in self._profiles.values(): processed_profile = DoseResponseProfile( compound=profile.compound, cell_line=profile.cell_line, assay=profile.assay, concentrations=profile.concentrations, responses=(list(profile.responses) if profile.responses is not None else None), concentration_units=profile.concentration_units, hill_params=profile.hill_params, s_prime=profile.s_prime, rank=profile.rank, metadata=profile.metadata, control_response=profile.control_response, ) has_raw = ( processed_profile.concentrations is not None and processed_profile.responses is not None and len(processed_profile.concentrations) > 0 ) had_precalc = processed_profile.hill_params is not None if has_raw: if ( not self.skip_control_response_normalization and processed_profile.control_response is not None ): from .response_pipeline import ( pipeline_asymptote_normalized, pipeline_response_scale, ) if self.response_normalization == "asymptote_normalized": processed_profile.responses = pipeline_asymptote_normalized( processed_profile.responses, processed_profile.control_response, ) else: processed_profile.responses = pipeline_response_scale( processed_profile.responses, processed_profile.control_response, ) if had_precalc and not allow_overwrite_precalc_params: raise ValueError( f"Pre-calculated curve parameters (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) would be " f"overwritten by fitted values for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " 
f"'{processed_profile.cell_line.name}'. " f"Set allow_overwrite_precalc_params=True to permit." ) processed_profile.hill_params = None try: processed_profile.fit_hill_curve(**fit_params) except (RuntimeError, ValueError) as e: raise ValueError( f"Curve fitting failed for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) if had_precalc: msg = ( f"Pre-calculated curve parameters (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) overwritten by " f"fitted values for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}'." ) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="OVERWRITE_PRECALC_PARAMS", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) else: if processed_profile.hill_params is None: raise ValueError( f"No data available to process for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}'. Profile has neither raw data " f"(concentrations/responses) nor pre-calculated Hill curve parameters." ) msg = ( f"Using pre-calc Hill parameters as-is for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}' (no raw data)." 
) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="PRE_CALC_ONLY", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) # Check fit quality if ( report and processed_profile.hill_params and processed_profile.hill_params.r_squared is not None ): if processed_profile.hill_params.r_squared < 0.7: report.add_warning( row_number=0, category="CURVE_FIT", message=f"Poor fit quality: r_squared = {processed_profile.hill_params.r_squared:.3f}", drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) # Always compute S'. Warn if CSV supplied S' but we refit from raw (unexpected duplicate). # Precalc-only rows: S' column is informational; recomputation from Hill params is expected, # so do not duplicate PRE_CALC_ONLY with OVERWRITE_S_PRIME. if processed_profile.s_prime is not None and has_raw: msg = ( f"S' overwritten by recomputation for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}' (S' was provided in input)." 
) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="OVERWRITE_S_PRIME", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) try: processed_profile.calculate_s_prime() except Exception as e: raise ValueError( f"S' calculation failed for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) # Validate S' was calculated if processed_profile.s_prime is None: raise ValueError( f"S' calculation returned None for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}'. This indicates invalid Hill curve parameters." ) # Add profile to screening dataset try: screening_dataset.add_profile(processed_profile) if report: report.profiles_with_s_prime += 1 except ValueError as e: # Profile already exists or invalid - re-raise with context raise ValueError( f"Cannot add profile for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) if report is None: # Return dummy report if reporting not available return screening_dataset, None return screening_dataset, report @property def profiles(self): """Iterator over all profiles""" return self._profiles.values() def __len__(self): return len(self._profiles)Holds raw dose-response curve data loaded from CSV/files.
Focus: Data loading, validation, basic structure. May contain profiles with raw data (concentrations/responses) or pre-calculated Hill params from CSV.
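A profile's pre-calculated Hill parameters (ec50, zero/inf asymptotes, steepness) describe a four-parameter logistic curve. The library's exact parameterization is not shown on this page; a standard 4PL form under the `HillCurveParams` field names might look like the following sketch (the function name `hill_response` is illustrative, not part of sprime):

```python
def hill_response(conc, ec50, zero_asymptote, inf_asymptote, steepness_coefficient):
    """Response at concentration `conc` (same units as ec50, e.g. microM).

    Standard 4PL: runs from zero_asymptote (low conc) to inf_asymptote
    (high conc), crossing the midpoint at conc == ec50.
    """
    return zero_asymptote + (inf_asymptote - zero_asymptote) / (
        1.0 + (ec50 / conc) ** steepness_coefficient
    )
```

At `conc == ec50` this evaluates to the midpoint between the two asymptotes, which is a quick sanity check for any imported parameter set.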
Initialize RawDataset.
Args
assay- Assay entity for this dataset
response_normalization- Prescribed preprocessing after the test/control ratio when the DMSO path applies: asymptote_normalized (max-normalize then x100) or response_scale (x100 only). Required at import so file/load intent is explicit. When skip_control_response_normalization=True, responses are not re-scaled at process time; this value still documents the intended interpretation of Responses.
skip_control_response_normalization- If False (default), each raw row must include a non-zero numeric Control_Response (vehicle readout). If True, skip that requirement; use when responses are already control-normalized upstream.
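The two normalization options can be sketched from their descriptions above (asymptote_normalized: max-normalize then x100; response_scale: x100 only, both after the test/control ratio). The real implementations live in sprime.response_pipeline and may differ in detail; this is an assumption-based illustration:

```python
def pipeline_response_scale(responses, control_response):
    # test/control ratio, then scale by 100
    return [r / control_response * 100.0 for r in responses]

def pipeline_asymptote_normalized(responses, control_response):
    # test/control ratio, normalize to the curve's maximum, then scale by 100
    ratios = [r / control_response for r in responses]
    peak = max(ratios)
    return [x / peak * 100.0 for x in ratios]
```

Note how the two differ: response_scale preserves the absolute ratio magnitudes, while asymptote_normalized forces the curve maximum to 100.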
Static methods
def load_from_file(filepath: Union[str, Path],
assay_name: Optional[str] = None,
report: "Optional['ProcessingReport']" = None,
values_as: str = 'columns',
skip_control_response_normalization: bool = False,
*,
response_normalization: ResponseNormalizationMethod,
**csv_kwargs) ‑> Tuple[RawDataset, ProcessingReport]-
Load CSV file and create RawDataset with quality reporting.
Handles:
- Extracting raw data (Data0..DataN, Conc0..ConcN) or Responses/Concentrations
- Loading pre-calculated params (AC50, Zero_asymptote, Inf_asymptote) if present
- Creating DoseResponseProfile objects
- Tracking data quality issues and warnings
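As the source shows, DATA*/CONC* headers are paired by their embedded number rather than lexicographically, so Data10 sorts after Data9. A minimal sketch of that sort key (helper name is illustrative):

```python
def sort_numbered_columns(columns):
    # Sort "Data0..DataN" / "Conc0..ConcN" headers by their embedded number,
    # not lexicographically (so "Data10" follows "Data9", not "Data1").
    return sorted(columns, key=lambda c: int("".join(filter(str.isdigit, c)) or "0"))
```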
Args
filepath- Path to CSV file
assay_name- Name for assay (defaults to filename stem)
report- Optional ProcessingReport to accumulate warnings (creates new if None)
values_as- "columns" (DATA/CONC) or "list" (Responses, Concentrations)
skip_control_response_normalization- If False (default), raw rows must include a non-zero Control_Response (vehicle readout). Set True only if responses are already control-normalized.
response_normalization- Required. asymptote_normalized or response_scale; applied after the test/control ratio when processing raw curves with the DMSO path (see :mod:sprime.response_pipeline). Must match how the submitting lab defined the dose-response sheet (normalized vs non-normalized x100 columns).
**csv_kwargs- Additional arguments for csv.DictReader
Returns
Tuple of (RawDataset, ProcessingReport)
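Rows with raw data must carry Concentration_Units, and concentrations are converted to micromolar on load via convert_to_micromolar. The unit spellings accepted by the real function are not documented on this page; a hypothetical sketch of the conversion logic might be:

```python
# Assumed unit spellings and factors; the actual convert_to_micromolar in
# sprime may accept a different set of strings.
_TO_MICROMOLAR = {"nM": 1e-3, "microM": 1.0, "uM": 1.0, "mM": 1e3, "M": 1e6}

def convert_to_micromolar(concentrations, units):
    try:
        factor = _TO_MICROMOLAR[units]
    except KeyError:
        raise ValueError(f"Unrecognized Concentration_Units: {units!r}")
    return [c * factor for c in concentrations]
```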
Instance variables
prop profiles-
Expand source code
@property
def profiles(self):
    """Iterator over all profiles"""
    return self._profiles.values()

Iterator over all profiles
Methods
def add_profile(self,
profile: DoseResponseProfile)-
Expand source code
def add_profile(self, profile: DoseResponseProfile):
    """
    Add a profile to the dataset.

    Args:
        profile: DoseResponseProfile to add

    Raises:
        ValueError: If profile for this compound-cellline pair already exists
    """
    key = (profile.compound.drug_id, profile.cell_line.name)
    if key in self._profiles:
        raise ValueError(f"Profile for {key} already exists")
    self._profiles[key] = profile

Add a profile to the dataset.
Args
profile- DoseResponseProfile to add
Raises
ValueError- If profile for this compound-cellline pair already exists
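Profiles are keyed by the (drug_id, cell_line_name) pair, so one dataset can hold the same compound across many cell lines but never two profiles for the same pair. A stripped-down sketch of that invariant (class name and signature are illustrative):

```python
class ProfileStore:
    """Minimal sketch of the (drug_id, cell_line_name) keying used by add_profile."""

    def __init__(self):
        self._profiles = {}

    def add(self, drug_id, cell_line_name, profile):
        key = (drug_id, cell_line_name)
        if key in self._profiles:
            # Duplicate compound/cell-line pair: refuse rather than overwrite
            raise ValueError(f"Profile for {key} already exists")
        self._profiles[key] = profile
```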
def get_profile(self,
compound: Union[Compound, str],
cell_line: Union[CellLine, str]) ‑> DoseResponseProfile | None-
Expand source code
def get_profile(
    self, compound: Union[Compound, str], cell_line: Union[CellLine, str]
) -> Optional[DoseResponseProfile]:
    """
    Retrieve a specific profile.

    Args:
        compound: Compound object or drug_id string
        cell_line: CellLine object or cell_line name string

    Returns:
        DoseResponseProfile or None if not found
    """
    compound_id = compound.drug_id if isinstance(compound, Compound) else compound
    cellline_name = cell_line.name if isinstance(cell_line, CellLine) else cell_line
    return self._profiles.get((compound_id, cellline_name))

Retrieve a specific profile.
Args
compound- Compound object or drug_id string
cell_line- CellLine object or cell_line name string
Returns
DoseResponseProfile or None if not found
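The lookup accepts either entity objects or plain strings by resolving each argument to its key component first. A generic sketch of that resolution (using attribute access rather than sprime's isinstance checks; `resolve_key` is a hypothetical helper):

```python
def resolve_key(compound, cell_line):
    # Accept an entity object (with drug_id / name attributes) or a plain string.
    drug_id = getattr(compound, "drug_id", compound)
    cellline_name = getattr(cell_line, "name", cell_line)
    return (drug_id, cellline_name)
```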
def to_screening_dataset(self,
report: "Optional['ProcessingReport']" = None,
allow_overwrite_precalc_params: bool = False,
**fit_params) ‑> Tuple[ScreeningDataset, ProcessingReport]-
Expand source code
def to_screening_dataset( self, report: Optional["ProcessingReport"] = None, allow_overwrite_precalc_params: bool = False, **fit_params, ) -> Tuple["ScreeningDataset", "ProcessingReport"]: """ Process raw data into ScreeningDataset with quality reporting. For each profile: 1. When ``skip_control_response_normalization`` is False on this dataset and the profile carries a parsed ``control_response``, raw responses are transformed using ``self.response_normalization`` (set at import): either :func:`~sprime.response_pipeline.pipeline_asymptote_normalized` or :func:`~sprime.response_pipeline.pipeline_response_scale`, before fitting. 2. Fit Hill curve when raw data exists, or use pre-calc when no raw. 3. Always compute S' from Hill params. Warn if S' was provided in input and is overwritten **only when raw data was fitted** (surprising overwrite). For precalc-only rows (no raw curve), S' from the sheet is always recomputed from imported Hill parameters---see ``PRE_CALC_ONLY`` without a separate ``OVERWRITE_S_PRIME`` warning. When both raw data and pre-calc (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) exist: - allow_overwrite_precalc_params=False (default): raise (would overwrite). - allow_overwrite_precalc_params=True: fit, overwrite pre-calc, and log a warning that pre-calculated curve parameters were overwritten. Args: report: Optional ProcessingReport to accumulate warnings (creates new if None) allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve parameters (EC50, zero/inf asymptotes, steepness, r-squared) with fitted values. Default False: raise when we would overwrite. **fit_params: Parameters for curve fitting (e.g. maxfev, bounds). Returns: Tuple of (ScreeningDataset, ProcessingReport) Raises: ValueError: If profile cannot be processed, or would overwrite pre-calculated parameters without allow_overwrite_precalc_params=True. 
""" if report is None: if ProcessingReport is not None: report = ProcessingReport() else: report = None screening_dataset = ScreeningDataset(assay=self.assay) for profile in self._profiles.values(): processed_profile = DoseResponseProfile( compound=profile.compound, cell_line=profile.cell_line, assay=profile.assay, concentrations=profile.concentrations, responses=(list(profile.responses) if profile.responses is not None else None), concentration_units=profile.concentration_units, hill_params=profile.hill_params, s_prime=profile.s_prime, rank=profile.rank, metadata=profile.metadata, control_response=profile.control_response, ) has_raw = ( processed_profile.concentrations is not None and processed_profile.responses is not None and len(processed_profile.concentrations) > 0 ) had_precalc = processed_profile.hill_params is not None if has_raw: if ( not self.skip_control_response_normalization and processed_profile.control_response is not None ): from .response_pipeline import ( pipeline_asymptote_normalized, pipeline_response_scale, ) if self.response_normalization == "asymptote_normalized": processed_profile.responses = pipeline_asymptote_normalized( processed_profile.responses, processed_profile.control_response, ) else: processed_profile.responses = pipeline_response_scale( processed_profile.responses, processed_profile.control_response, ) if had_precalc and not allow_overwrite_precalc_params: raise ValueError( f"Pre-calculated curve parameters (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) would be " f"overwritten by fitted values for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}'. " f"Set allow_overwrite_precalc_params=True to permit." 
) processed_profile.hill_params = None try: processed_profile.fit_hill_curve(**fit_params) except (RuntimeError, ValueError) as e: raise ValueError( f"Curve fitting failed for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) if had_precalc: msg = ( f"Pre-calculated curve parameters (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) overwritten by " f"fitted values for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}'." ) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="OVERWRITE_PRECALC_PARAMS", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) else: if processed_profile.hill_params is None: raise ValueError( f"No data available to process for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}'. Profile has neither raw data " f"(concentrations/responses) nor pre-calculated Hill curve parameters." ) msg = ( f"Using pre-calc Hill parameters as-is for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}' (no raw data)." 
) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="PRE_CALC_ONLY", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) # Check fit quality if ( report and processed_profile.hill_params and processed_profile.hill_params.r_squared is not None ): if processed_profile.hill_params.r_squared < 0.7: report.add_warning( row_number=0, category="CURVE_FIT", message=f"Poor fit quality: r_squared = {processed_profile.hill_params.r_squared:.3f}", drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) # Always compute S'. Warn if CSV supplied S' but we refit from raw (unexpected duplicate). # Precalc-only rows: S' column is informational; recomputation from Hill params is expected, # so do not duplicate PRE_CALC_ONLY with OVERWRITE_S_PRIME. if processed_profile.s_prime is not None and has_raw: msg = ( f"S' overwritten by recomputation for '{processed_profile.compound.name}' / " f"'{processed_profile.cell_line.name}' (S' was provided in input)." 
) warnings.warn(msg, UserWarning, stacklevel=2) if report: report.add_warning( row_number=0, category="OVERWRITE_S_PRIME", message=msg, drug_id=processed_profile.compound.drug_id, compound_name=processed_profile.compound.name, cell_line=processed_profile.cell_line.name, ) try: processed_profile.calculate_s_prime() except Exception as e: raise ValueError( f"S' calculation failed for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) # Validate S' was calculated if processed_profile.s_prime is None: raise ValueError( f"S' calculation returned None for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}'. This indicates invalid Hill curve parameters." ) # Add profile to screening dataset try: screening_dataset.add_profile(processed_profile) if report: report.profiles_with_s_prime += 1 except ValueError as e: # Profile already exists or invalid - re-raise with context raise ValueError( f"Cannot add profile for compound '{processed_profile.compound.name}' " f"(Compound_ID: {processed_profile.compound.drug_id}) in cell line " f"'{processed_profile.cell_line.name}': {str(e)}" ) if report is None: # Return dummy report if reporting not available return screening_dataset, None return screening_dataset, reportProcess raw data into ScreeningDataset with quality reporting.
For each profile:
1. When skip_control_response_normalization is False on this dataset and the profile carries a parsed control_response, raw responses are transformed using self.response_normalization (set at import): either :func:~sprime.response_pipeline.pipeline_asymptote_normalized or :func:~sprime.response_pipeline.pipeline_response_scale, before fitting.
2. Fit the Hill curve when raw data exists, or use pre-calculated parameters when there is no raw data.
3. Always compute S' from the Hill parameters. Warn that an input-supplied S' was overwritten only when raw data was fitted (a surprising overwrite). For precalc-only rows (no raw curve), S' from the sheet is always recomputed from the imported Hill parameters; these get a PRE_CALC_ONLY warning without a separate OVERWRITE_S_PRIME warning.

When both raw data and pre-calc parameters (AC50, Zero_asymptote, Inf_asymptote, Hill_Slope, r2) exist:
- allow_overwrite_precalc_params=False (default): raise (would overwrite).
- allow_overwrite_precalc_params=True: fit, overwrite the pre-calc values, and log a warning that pre-calculated curve parameters were overwritten.
Args
report- Optional ProcessingReport to accumulate warnings (creates new if None)
allow_overwrite_precalc_params- If True, allow overwriting pre-calculated curve parameters (EC50, zero/inf asymptotes, steepness, r-squared) with fitted values. Default False: raise when we would overwrite.
**fit_params- Parameters for curve fitting (e.g. maxfev, bounds).
Returns
Tuple of (ScreeningDataset, ProcessingReport)
Raises
ValueError- If profile cannot be processed, or would overwrite pre-calculated parameters without allow_overwrite_precalc_params=True.
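The raw-versus-precalc decision described above reduces to a small dispatch. A sketch of just that logic, with the three case outcomes as string labels for illustration (the real method raises/fits/warns inline):

```python
def processing_action(has_raw, had_precalc, allow_overwrite_precalc_params):
    """Mirror the documented cases in to_screening_dataset."""
    if has_raw and had_precalc and not allow_overwrite_precalc_params:
        # Both sources present: refuse to silently discard imported parameters
        raise ValueError("would overwrite pre-calculated parameters")
    if has_raw:
        return "fit"          # fit Hill curve from raw data (warn if precalc overwritten)
    if had_precalc:
        return "use_precalc"  # use imported parameters as-is (PRE_CALC_ONLY warning)
    raise ValueError("no data to process")
```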
class ReportingConfig-
Expand source code
class ReportingConfig:
    """Global configuration for data quality reporting."""

    # Log file settings
    log_to_file: bool = False
    log_filepath: Optional[Path] = None

    # Console output settings
    console_output: ConsoleOutput = ConsoleOutput.SUMMARY

    @classmethod
    def configure(
        cls,
        log_to_file: bool = False,
        log_filepath: Optional[Union[str, Path]] = None,
        console_output: Union[ConsoleOutput, str] = ConsoleOutput.SUMMARY,
    ):
        """
        Configure global reporting settings.

        Args:
            log_to_file: If True, write detailed log file (default: False)
            log_filepath: Path to log file (default: auto-generated from input filename)
            console_output: Console verbosity - "none", "summary", or "verbose" (default: "summary")
        """
        cls.log_to_file = log_to_file
        if log_filepath:
            cls.log_filepath = Path(log_filepath)
        else:
            cls.log_filepath = None
        if isinstance(console_output, str):
            cls.console_output = ConsoleOutput(console_output.lower())
        else:
            cls.console_output = console_output

    @classmethod
    def reset(cls):
        """Reset to defaults."""
        cls.log_to_file = False
        cls.log_filepath = None
        cls.console_output = ConsoleOutput.SUMMARY

Global configuration for data quality reporting.
Class variables
var console_output : ConsoleOutput
var log_filepath : pathlib.Path | None
var log_to_file : bool
Static methods
def configure(log_to_file: bool = False,
log_filepath: str | pathlib.Path | None = None,
console_output: ConsoleOutput | str = ConsoleOutput.SUMMARY)-
Configure global reporting settings.
Args
log_to_file- If True, write detailed log file (default: False)
log_filepath- Path to log file (default: auto-generated from input filename)
console_output- Console verbosity - "none", "summary", or "verbose" (default: "summary")
def reset()-
Reset to defaults.
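ReportingConfig keeps all settings as class attributes mutated via classmethods, so the configuration is process-global: every loader that checks it sees the same state without passing a config object around. A generic sketch of that pattern (simplified to one setting; names are illustrative):

```python
class Config:
    """Class-level singleton: settings live on the class, not on instances."""

    verbose = False  # default

    @classmethod
    def configure(cls, verbose=False):
        cls.verbose = verbose

    @classmethod
    def reset(cls):
        cls.verbose = False
```

The trade-off is typical of global config: convenient for scripts, but tests should call reset() to avoid state leaking between cases.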
class SPrime-
Expand source code
class SPrime: """ Main API for sprime package. Provides factory methods and convenience functions for loading data, processing dose-response profiles, and calculating S' values. """ @staticmethod def load( filepath_or_dataframe: Union[str, Path, pd.DataFrame], assay_name: Optional[str] = None, values_as: str = "columns", skip_control_response_normalization: bool = False, *, response_normalization: ResponseNormalizationMethod, **csv_kwargs, ) -> Tuple[RawDataset, Optional["ProcessingReport"]]: """ Load raw data from CSV file or pandas DataFrame with quality reporting. Auto-detects pandas DataFrame input and converts it. For CSV files, uses global reporting configuration for console/log output. Args: filepath_or_dataframe: Path to CSV file, or pandas DataFrame (pandas required for DataFrame) assay_name: Name for assay (defaults to filename stem or 'DataFrame') values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations) skip_control_response_normalization: If False (default), raw rows must include a non-zero ``Control_Response``. Set True if responses are already control-normalized. response_normalization: Required. See :meth:`RawDataset.load_from_file`. 
**csv_kwargs: Additional arguments for csv.DictReader (ignored for DataFrames) Returns: Tuple of (RawDataset, ProcessingReport) Raises: ValueError: If required columns are missing ImportError: If DataFrame provided but pandas not installed """ # Auto-detect DataFrame try: import pandas as pd if isinstance(filepath_or_dataframe, pd.DataFrame): return SPrime.load_from_dataframe( filepath_or_dataframe, assay_name, values_as=values_as, skip_control_response_normalization=skip_control_response_normalization, response_normalization=response_normalization, ) except ImportError: pass # pandas not available, treat as file path except (AttributeError, TypeError): pass # Not a DataFrame, treat as file path # Treat as file path raw_dataset, report = RawDataset.load_from_file( filepath_or_dataframe, assay_name, values_as=values_as, skip_control_response_normalization=skip_control_response_normalization, response_normalization=response_normalization, **csv_kwargs, ) # Auto-print and write log based on global config if report and ReportingConfig is not None: report.print_console_summary() report.write_log_file() return raw_dataset, report @staticmethod def load_from_dataframe( df, assay_name: Optional[str] = None, values_as: str = "columns", skip_control_response_normalization: bool = True, *, response_normalization: ResponseNormalizationMethod, ) -> Tuple[RawDataset, Optional["ProcessingReport"]]: """ Load raw data from pandas DataFrame with quality reporting. Args: df: pandas DataFrame with columns matching CSV format assay_name: Name for assay (defaults to 'DataFrame') values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations) skip_control_response_normalization: If False (default), raw rows must include non-zero ``Control_Response``. Set True if responses are already control-normalized. response_normalization: Required. See :meth:`RawDataset.load_from_file`. 
Returns: Tuple of (RawDataset, ProcessingReport) Raises: ValueError: If required columns are missing ImportError: If pandas is not installed TypeError: If input is not a pandas DataFrame """ # Convert DataFrame to list of dicts list_of_rows = _convert_dataframe_to_dict_list(df) # Use get_s_prime_from_data logic but return RawDataset if not list_of_rows: # Create empty dataset if assay_name is None: assay_name = "DataFrame" assay = Assay(name=assay_name) raw_dataset = RawDataset( assay=assay, skip_control_response_normalization=skip_control_response_normalization, ) if ProcessingReport is not None: report = ProcessingReport() else: report = None return raw_dataset, report # Validate required columns exist first_row_keys = list(list_of_rows[0].keys()) if list_of_rows else [] if first_row_keys: _validate_required_columns( first_row_keys, source_name="DataFrame", values_as=values_as, skip_control_response_normalization=skip_control_response_normalization, ) # Create report if ProcessingReport is not None: report = ProcessingReport() else: report = None # Create RawDataset if assay_name is None: assay_name = "DataFrame" assay = Assay(name=assay_name) raw_dataset = RawDataset( assay=assay, response_normalization=response_normalization, skip_control_response_normalization=skip_control_response_normalization, ) if report: report.total_rows = len(list_of_rows) # Process rows (similar to get_s_prime_from_data but add to RawDataset) # Rows are taken literally: empty values are null, no forward-filling. 
reserved = _reserved_column_names(values_as, first_row_keys) compounds_seen = set() for row_idx, row in enumerate(list_of_rows): if report: report.rows_processed += 1 # Check if row is fully blank is_fully_blank = not any( v.strip() if isinstance(v, str) else str(v).strip() for v in row.values() if v ) if is_fully_blank: continue # Check for empty cell line - RAISE EXCEPTION cell_line_name = row.get("Cell_Line", "").strip() if not cell_line_name: raise ValueError( f"Row {row_idx + 1}: Missing required 'Cell_Line' value in DataFrame. " f"All rows must have a cell line specified." ) # Get compound info (Compound_ID required; NCGCID pass-through only) compound_name = row.get("Compound Name", "").strip() or "Unknown" compound_id = row.get("Compound_ID", "").strip() if not compound_id: raise ValueError( f"Row {row_idx + 1}: Missing required 'Compound_ID' value in DataFrame. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}. " f"All rows must have a compound identifier." ) # Track compound if report and compound_id not in compounds_seen: compounds_seen.add(compound_id) report.compounds_loaded += 1 # Create compound and cell line objects compound = Compound( name=compound_name, drug_id=compound_id, pubchem_sid=row.get("pubchem_sid", "").strip() or None, smiles=row.get("SMILES", "").strip() or None, ) cell_line = CellLine( name=cell_line_name, ref_id=row.get("Cell_Line_Ref_ID", "").strip() or None ) # Extract raw dose-response data (if present) concentrations = None responses = None if values_as == "list": resp_key = next((k for k in row if k.lower() == "responses"), None) conc_key = next((k for k in row if k.lower() == "concentrations"), None) if resp_key and conc_key: resp_str = (row.get(resp_key) or "").strip() conc_str = (row.get(conc_key) or "").strip() if resp_str and conc_str: responses = [] conc_parts = [] for part in resp_str.split(","): t = part.strip() if t: try: v = float(t) if not (math.isnan(v) or math.isinf(v)): responses.append(v) except 
(ValueError, TypeError): pass for part in conc_str.split(","): t = part.strip() if t: try: v = float(t) if not (math.isnan(v) or math.isinf(v)): conc_parts.append(v) except (ValueError, TypeError): pass if len(responses) == len(conc_parts) and len(responses) >= 4: units = row.get("Concentration_Units", "").strip() if not units: raise ValueError( f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}." ) concentrations = convert_to_micromolar(conc_parts, units) else: if len(responses) != len(conc_parts): raise ValueError( f"Row {row_idx + 1}: Responses and Concentrations length mismatch " f"({len(responses)} vs {len(conc_parts)}) in DataFrame." ) responses = None concentrations = None else: data_cols = [k for k in row.keys() if k.startswith("Data") or k.startswith("DATA")] conc_cols = [ k for k in row.keys() if (k.startswith("Conc") or k.startswith("CONC")) and "Units" not in k and "units" not in k ] if data_cols and conc_cols: data_cols = sorted( data_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0") ) conc_cols = sorted( conc_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0") ) responses = [] concentrations = [] for data_col, conc_col in zip(data_cols, conc_cols): try: resp_val = row.get(data_col, "") or "" conc_val = row.get(conc_col, "") or "" resp_val = ( resp_val.strip() if isinstance(resp_val, str) else str(resp_val) ) conc_val = ( conc_val.strip() if isinstance(conc_val, str) else str(conc_val) ) if resp_val and conc_val: responses.append(float(resp_val)) concentrations.append(float(conc_val)) except (ValueError, TypeError): continue if concentrations and responses: units = row.get("Concentration_Units", "").strip() if not units: raise ValueError( f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. " f"Compound: {compound_name}, Cell_Line: {cell_line_name}. 
" f"Raw dose-response data requires Concentration_Units." ) concentrations = convert_to_micromolar(concentrations, units) else: concentrations = None responses = None # Extract pre-calculated Hill params (if present) hill_params = None ac50 = row.get("AC50", "").strip() or row.get("ec50", "").strip() if ac50: try: inf_a = row.get("Inf_asymptote", row.get("Upper", row.get("Infinity", "0"))) zero_a = row.get("Zero_asymptote", row.get("Lower", row.get("Zero", "0"))) hill_params = HillCurveParams( ec50=float(ac50), zero_asymptote=float(str(zero_a).strip() or "0"), inf_asymptote=float(str(inf_a).strip() or "0"), steepness_coefficient=_try_float( row.get("Hill_Slope", row.get("Hill", row.get("slope", ""))) ), r_squared=_try_float(row.get("r2", row.get(_RES_COL_R2, ""))), ) except (ValueError, TypeError): hill_params = None # Validate that row has either raw data or pre-calculated params has_raw_data = ( concentrations is not None and responses is not None and len(concentrations) > 0 ) has_precalc_params = hill_params is not None if not (has_raw_data or has_precalc_params): raise ValueError( f"Row {row_idx + 1}: No dose-response data found for compound '{compound_name}' " f"(Compound_ID: {compound_id}) in cell line '{cell_line_name}' in DataFrame. " f"Row must have either: (1) raw data columns (DATA*/CONC*), or " f"(2) pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote)." 
) _validate_control_response_for_raw_row( row, f"Row {row_idx + 1} in DataFrame", has_raw_data, compound_name=compound_name, cell_line_name=cell_line_name, compound_id=compound_id, skip_control_response_normalization=skip_control_response_normalization, ) # Pre-calculated S' (if present) s_prime = _try_float(row.get("S'", row.get("S Prime", ""))) rank = _try_int(row.get("Rank", "")) # Extract metadata: generic pass-through (all non-reserved columns, exact header, value as-is) metadata = {} for col in row.keys(): if col in reserved: continue raw = row.get(col, "") raw = raw if isinstance(raw, str) else str(raw) metadata[col] = raw # Create profile profile = DoseResponseProfile( compound=compound, cell_line=cell_line, assay=assay, concentrations=concentrations, responses=responses, concentration_units="microM", hill_params=hill_params, s_prime=s_prime, rank=rank, metadata=metadata if metadata else None, control_response=_control_response_numeric_for_raw_row( row, has_raw_data, skip_control_response_normalization, ), ) raw_dataset.add_profile(profile) if report: report.profiles_created += 1 # Auto-print and write log based on global config if report and ReportingConfig is not None: report.print_console_summary() report.write_log_file() return raw_dataset, report @staticmethod def process( raw_dataset: RawDataset, report: Optional["ProcessingReport"] = None, allow_overwrite_precalc_params: bool = False, **fit_params, ) -> Tuple[ScreeningDataset, Optional["ProcessingReport"]]: """ Convert RawDataset to ScreeningDataset (fit curves, calculate S'). Uses global reporting configuration for console/log output. Args: raw_dataset: RawDataset to process report: Optional ProcessingReport to accumulate warnings (reuses from load if None) allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve parameters (EC50, asymptotes, Hill_Slope, r2) with fitted values when both raw and pre-calc exist. Default False (raise). 
When True, overwrites are logged as warnings. **fit_params: Parameters for curve fitting (e.g. maxfev, bounds). Returns: Tuple of (ScreeningDataset, ProcessingReport) """ screening_dataset, report = raw_dataset.to_screening_dataset( report=report, allow_overwrite_precalc_params=allow_overwrite_precalc_params, **fit_params, ) # Auto-print and write log based on global config if report and ReportingConfig is not None: report.print_console_summary() report.write_log_file() return screening_dataset, reportMain API for sprime package.
Provides factory methods and convenience functions for loading data, processing dose-response profiles, and calculating S' values.
Static methods
def load(filepath_or_dataframe: Union[str, Path, pd.DataFrame],
assay_name: Optional[str] = None,
values_as: str = 'columns',
skip_control_response_normalization: bool = False,
*,
response_normalization: ResponseNormalizationMethod,
**csv_kwargs) ‑> Tuple[RawDataset, Optional['ProcessingReport']]-
Expand source code
@staticmethod
def load(
    filepath_or_dataframe: Union[str, Path, pd.DataFrame],
    assay_name: Optional[str] = None,
    values_as: str = "columns",
    skip_control_response_normalization: bool = False,
    *,
    response_normalization: ResponseNormalizationMethod,
    **csv_kwargs,
) -> Tuple[RawDataset, Optional["ProcessingReport"]]:
    """
    Load raw data from CSV file or pandas DataFrame with quality reporting.

    Auto-detects pandas DataFrame input and converts it. For CSV files,
    uses global reporting configuration for console/log output.

    Args:
        filepath_or_dataframe: Path to CSV file, or pandas DataFrame (pandas required for DataFrame)
        assay_name: Name for assay (defaults to filename stem or 'DataFrame')
        values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
        skip_control_response_normalization: If False (default), raw rows must include a
            non-zero ``Control_Response``. Set True if responses are already control-normalized.
        response_normalization: Required. See :meth:`RawDataset.load_from_file`.
        **csv_kwargs: Additional arguments for csv.DictReader (ignored for DataFrames)

    Returns:
        Tuple of (RawDataset, ProcessingReport)

    Raises:
        ValueError: If required columns are missing
        ImportError: If DataFrame provided but pandas not installed
    """
    # Auto-detect DataFrame
    try:
        import pandas as pd

        if isinstance(filepath_or_dataframe, pd.DataFrame):
            return SPrime.load_from_dataframe(
                filepath_or_dataframe,
                assay_name,
                values_as=values_as,
                skip_control_response_normalization=skip_control_response_normalization,
                response_normalization=response_normalization,
            )
    except ImportError:
        pass  # pandas not available, treat as file path
    except (AttributeError, TypeError):
        pass  # Not a DataFrame, treat as file path

    # Treat as file path
    raw_dataset, report = RawDataset.load_from_file(
        filepath_or_dataframe,
        assay_name,
        values_as=values_as,
        skip_control_response_normalization=skip_control_response_normalization,
        response_normalization=response_normalization,
        **csv_kwargs,
    )

    # Auto-print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    return raw_dataset, report
Load raw data from CSV file or pandas DataFrame with quality reporting.
Auto-detects pandas DataFrame input and converts it. For CSV files, uses global reporting configuration for console/log output.
Args
filepath_or_dataframe- Path to CSV file, or pandas DataFrame (pandas required for DataFrame)
assay_name- Name for assay (defaults to filename stem or 'DataFrame')
values_as- "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization- If False (default), raw rows must include a non-zero Control_Response. Set True if responses are already control-normalized.
response_normalization- Required. See RawDataset.load_from_file().
**csv_kwargs- Additional arguments for csv.DictReader (ignored for DataFrames)
Returns
Tuple of (RawDataset, ProcessingReport)
Raises
ValueError- If required columns are missing
ImportError- If DataFrame provided but pandas not installed
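The Raises list above implies a concrete input contract. As an illustrative sketch (column names come from the loader's error messages and docstrings; the compound and cell-line values, and the use of io.StringIO in place of a real file, are made up for the example), a minimal "columns"-style table can be assembled with the standard library:

```python
import csv
import io

# Minimal "columns"-style rows: paired DATA*/CONC* readings plus the
# identifier and unit columns the loader validates per row.
rows = [
    {
        "Compound_ID": "CMPD-001",      # required; loader raises ValueError if blank
        "Compound Name": "examplinib",  # optional; defaults to "Unknown"
        "Cell_Line": "HELA",            # required; loader raises ValueError if blank
        "Concentration_Units": "nM",    # required whenever raw readings are present
        "CONC1": "10", "DATA1": "98.2",
        "CONC2": "100", "DATA2": "85.0",
        "CONC3": "1000", "DATA3": "41.3",
        "CONC4": "10000", "DATA4": "7.9",
    }
]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=list(rows[0]))
writer.writeheader()
writer.writerows(rows)
header = buf.getvalue().splitlines()[0]
```

The same header set applies when the rows arrive as a pandas DataFrame; any columns beyond the reserved ones pass through into per-profile metadata.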
def load_from_dataframe(df,
assay_name: Optional[str] = None,
values_as: str = 'columns',
skip_control_response_normalization: bool = True,
*,
response_normalization: ResponseNormalizationMethod) ‑> Tuple[RawDataset, ProcessingReport | None]-
Expand source code
@staticmethod
def load_from_dataframe(
    df,
    assay_name: Optional[str] = None,
    values_as: str = "columns",
    skip_control_response_normalization: bool = True,
    *,
    response_normalization: ResponseNormalizationMethod,
) -> Tuple[RawDataset, Optional["ProcessingReport"]]:
    """
    Load raw data from pandas DataFrame with quality reporting.

    Args:
        df: pandas DataFrame with columns matching CSV format
        assay_name: Name for assay (defaults to 'DataFrame')
        values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
        skip_control_response_normalization: If False (default), raw rows must include
            non-zero ``Control_Response``. Set True if responses are already control-normalized.
        response_normalization: Required. See :meth:`RawDataset.load_from_file`.

    Returns:
        Tuple of (RawDataset, ProcessingReport)

    Raises:
        ValueError: If required columns are missing
        ImportError: If pandas is not installed
        TypeError: If input is not a pandas DataFrame
    """
    # Convert DataFrame to list of dicts
    list_of_rows = _convert_dataframe_to_dict_list(df)

    # Use get_s_prime_from_data logic but return RawDataset
    if not list_of_rows:
        # Create empty dataset
        if assay_name is None:
            assay_name = "DataFrame"
        assay = Assay(name=assay_name)
        raw_dataset = RawDataset(
            assay=assay,
            skip_control_response_normalization=skip_control_response_normalization,
        )
        if ProcessingReport is not None:
            report = ProcessingReport()
        else:
            report = None
        return raw_dataset, report

    # Validate required columns exist
    first_row_keys = list(list_of_rows[0].keys()) if list_of_rows else []
    if first_row_keys:
        _validate_required_columns(
            first_row_keys,
            source_name="DataFrame",
            values_as=values_as,
            skip_control_response_normalization=skip_control_response_normalization,
        )

    # Create report
    if ProcessingReport is not None:
        report = ProcessingReport()
    else:
        report = None

    # Create RawDataset
    if assay_name is None:
        assay_name = "DataFrame"
    assay = Assay(name=assay_name)
    raw_dataset = RawDataset(
        assay=assay,
        response_normalization=response_normalization,
        skip_control_response_normalization=skip_control_response_normalization,
    )
    if report:
        report.total_rows = len(list_of_rows)

    # Process rows (similar to get_s_prime_from_data but add to RawDataset)
    # Rows are taken literally: empty values are null, no forward-filling.
    reserved = _reserved_column_names(values_as, first_row_keys)
    compounds_seen = set()
    for row_idx, row in enumerate(list_of_rows):
        if report:
            report.rows_processed += 1

        # Check if row is fully blank
        is_fully_blank = not any(
            v.strip() if isinstance(v, str) else str(v).strip()
            for v in row.values()
            if v
        )
        if is_fully_blank:
            continue

        # Check for empty cell line - RAISE EXCEPTION
        cell_line_name = row.get("Cell_Line", "").strip()
        if not cell_line_name:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Cell_Line' value in DataFrame. "
                f"All rows must have a cell line specified."
            )

        # Get compound info (Compound_ID required; NCGCID pass-through only)
        compound_name = row.get("Compound Name", "").strip() or "Unknown"
        compound_id = row.get("Compound_ID", "").strip()
        if not compound_id:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Compound_ID' value in DataFrame. "
                f"Compound: {compound_name}, Cell_Line: {cell_line_name}. "
                f"All rows must have a compound identifier."
            )

        # Track compound
        if report and compound_id not in compounds_seen:
            compounds_seen.add(compound_id)
            report.compounds_loaded += 1

        # Create compound and cell line objects
        compound = Compound(
            name=compound_name,
            drug_id=compound_id,
            pubchem_sid=row.get("pubchem_sid", "").strip() or None,
            smiles=row.get("SMILES", "").strip() or None,
        )
        cell_line = CellLine(
            name=cell_line_name, ref_id=row.get("Cell_Line_Ref_ID", "").strip() or None
        )

        # Extract raw dose-response data (if present)
        concentrations = None
        responses = None
        if values_as == "list":
            resp_key = next((k for k in row if k.lower() == "responses"), None)
            conc_key = next((k for k in row if k.lower() == "concentrations"), None)
            if resp_key and conc_key:
                resp_str = (row.get(resp_key) or "").strip()
                conc_str = (row.get(conc_key) or "").strip()
                if resp_str and conc_str:
                    responses = []
                    conc_parts = []
                    for part in resp_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    responses.append(v)
                            except (ValueError, TypeError):
                                pass
                    for part in conc_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    conc_parts.append(v)
                            except (ValueError, TypeError):
                                pass
                    if len(responses) == len(conc_parts) and len(responses) >= 4:
                        units = row.get("Concentration_Units", "").strip()
                        if not units:
                            raise ValueError(
                                f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. "
                                f"Compound: {compound_name}, Cell_Line: {cell_line_name}."
                            )
                        concentrations = convert_to_micromolar(conc_parts, units)
                    else:
                        if len(responses) != len(conc_parts):
                            raise ValueError(
                                f"Row {row_idx + 1}: Responses and Concentrations length mismatch "
                                f"({len(responses)} vs {len(conc_parts)}) in DataFrame."
                            )
                        responses = None
                        concentrations = None
        else:
            data_cols = [k for k in row.keys() if k.startswith("Data") or k.startswith("DATA")]
            conc_cols = [
                k
                for k in row.keys()
                if (k.startswith("Conc") or k.startswith("CONC"))
                and "Units" not in k
                and "units" not in k
            ]
            if data_cols and conc_cols:
                data_cols = sorted(
                    data_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                conc_cols = sorted(
                    conc_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                responses = []
                concentrations = []
                for data_col, conc_col in zip(data_cols, conc_cols):
                    try:
                        resp_val = row.get(data_col, "") or ""
                        conc_val = row.get(conc_col, "") or ""
                        resp_val = (
                            resp_val.strip() if isinstance(resp_val, str) else str(resp_val)
                        )
                        conc_val = (
                            conc_val.strip() if isinstance(conc_val, str) else str(conc_val)
                        )
                        if resp_val and conc_val:
                            responses.append(float(resp_val))
                            concentrations.append(float(conc_val))
                    except (ValueError, TypeError):
                        continue
                if concentrations and responses:
                    units = row.get("Concentration_Units", "").strip()
                    if not units:
                        raise ValueError(
                            f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. "
                            f"Compound: {compound_name}, Cell_Line: {cell_line_name}. "
                            f"Raw dose-response data requires Concentration_Units."
                        )
                    concentrations = convert_to_micromolar(concentrations, units)
                else:
                    concentrations = None
                    responses = None

        # Extract pre-calculated Hill params (if present)
        hill_params = None
        ac50 = row.get("AC50", "").strip() or row.get("ec50", "").strip()
        if ac50:
            try:
                inf_a = row.get("Inf_asymptote", row.get("Upper", row.get("Infinity", "0")))
                zero_a = row.get("Zero_asymptote", row.get("Lower", row.get("Zero", "0")))
                hill_params = HillCurveParams(
                    ec50=float(ac50),
                    zero_asymptote=float(str(zero_a).strip() or "0"),
                    inf_asymptote=float(str(inf_a).strip() or "0"),
                    steepness_coefficient=_try_float(
                        row.get("Hill_Slope", row.get("Hill", row.get("slope", "")))
                    ),
                    r_squared=_try_float(row.get("r2", row.get(_RES_COL_R2, ""))),
                )
            except (ValueError, TypeError):
                hill_params = None

        # Validate that row has either raw data or pre-calculated params
        has_raw_data = (
            concentrations is not None and responses is not None and len(concentrations) > 0
        )
        has_precalc_params = hill_params is not None
        if not (has_raw_data or has_precalc_params):
            raise ValueError(
                f"Row {row_idx + 1}: No dose-response data found for compound '{compound_name}' "
                f"(Compound_ID: {compound_id}) in cell line '{cell_line_name}' in DataFrame. "
                f"Row must have either: (1) raw data columns (DATA*/CONC*), or "
                f"(2) pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote)."
            )
        _validate_control_response_for_raw_row(
            row,
            f"Row {row_idx + 1} in DataFrame",
            has_raw_data,
            compound_name=compound_name,
            cell_line_name=cell_line_name,
            compound_id=compound_id,
            skip_control_response_normalization=skip_control_response_normalization,
        )

        # Pre-calculated S' (if present)
        s_prime = _try_float(row.get("S'", row.get("S Prime", "")))
        rank = _try_int(row.get("Rank", ""))

        # Extract metadata: generic pass-through (all non-reserved columns, exact header, value as-is)
        metadata = {}
        for col in row.keys():
            if col in reserved:
                continue
            raw = row.get(col, "")
            raw = raw if isinstance(raw, str) else str(raw)
            metadata[col] = raw

        # Create profile
        profile = DoseResponseProfile(
            compound=compound,
            cell_line=cell_line,
            assay=assay,
            concentrations=concentrations,
            responses=responses,
            concentration_units="microM",
            hill_params=hill_params,
            s_prime=s_prime,
            rank=rank,
            metadata=metadata if metadata else None,
            control_response=_control_response_numeric_for_raw_row(
                row,
                has_raw_data,
                skip_control_response_normalization,
            ),
        )
        raw_dataset.add_profile(profile)
        if report:
            report.profiles_created += 1

    # Auto-print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    return raw_dataset, report
Load raw data from pandas DataFrame with quality reporting.
Args
df- pandas DataFrame with columns matching CSV format
assay_name- Name for assay (defaults to 'DataFrame')
values_as- "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization- If False, raw rows must include a non-zero Control_Response. Set True (the default here) if responses are already control-normalized.
response_normalization- Required. See RawDataset.load_from_file().
Returns
Tuple of (RawDataset, ProcessingReport)
Raises
ValueError- If required columns are missing
ImportError- If pandas is not installed
TypeError- If input is not a pandas DataFrame
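In the "list" mode named by values_as, each row carries its dose-response series as one comma-separated string per column. A rough, hypothetical re-implementation of the parsing rules visible in the source (keep only finite floats; response and concentration lengths must match; at least four points are required for a usable series) looks like this — parse_series is not part of the package:

```python
import math

def parse_series(text: str) -> list:
    """Parse a comma-separated series, keeping only finite floats."""
    out = []
    for part in text.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            v = float(part)
        except ValueError:
            continue  # non-numeric tokens are silently dropped
        if not (math.isnan(v) or math.isinf(v)):
            out.append(v)
    return out

responses = parse_series("98.2, 85.0, 41.3, 7.9")
concentrations = parse_series("10, 100, 1000, 10000")

# The loader keeps the pair only when lengths match and >= 4 points remain;
# a length mismatch raises ValueError, and shorter matching series are dropped.
assert len(responses) == len(concentrations) >= 4
```

Because non-finite and non-numeric tokens are dropped before the length check, a stray "NaN" in only one of the two strings can turn into a length-mismatch error rather than a dropped point.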
def process(raw_dataset: RawDataset,
report: "Optional['ProcessingReport']" = None,
allow_overwrite_precalc_params: bool = False,
**fit_params) ‑> Tuple[ScreeningDataset, ProcessingReport | None]-
Expand source code
@staticmethod
def process(
    raw_dataset: RawDataset,
    report: Optional["ProcessingReport"] = None,
    allow_overwrite_precalc_params: bool = False,
    **fit_params,
) -> Tuple[ScreeningDataset, Optional["ProcessingReport"]]:
    """
    Convert RawDataset to ScreeningDataset (fit curves, calculate S').

    Uses global reporting configuration for console/log output.

    Args:
        raw_dataset: RawDataset to process
        report: Optional ProcessingReport to accumulate warnings (reuses from load if None)
        allow_overwrite_precalc_params: If True, allow overwriting pre-calculated curve
            parameters (EC50, asymptotes, Hill_Slope, r2) with fitted values when both raw
            and pre-calc exist. Default False (raise). When True, overwrites are logged as warnings.
        **fit_params: Parameters for curve fitting (e.g. maxfev, bounds).

    Returns:
        Tuple of (ScreeningDataset, ProcessingReport)
    """
    screening_dataset, report = raw_dataset.to_screening_dataset(
        report=report,
        allow_overwrite_precalc_params=allow_overwrite_precalc_params,
        **fit_params,
    )

    # Auto-print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    return screening_dataset, report
Convert RawDataset to ScreeningDataset (fit curves, calculate S').
Uses global reporting configuration for console/log output.
Args
raw_dataset- RawDataset to process
report- Optional ProcessingReport to accumulate warnings (reuses from load if None)
allow_overwrite_precalc_params- If True, allow overwriting pre-calculated curve parameters (EC50, asymptotes, Hill_Slope, r2) with fitted values when both raw and pre-calc exist. Default False (raise). When True, overwrites are logged as warnings.
**fit_params- Parameters for curve fitting (e.g. maxfev, bounds).
Returns
Tuple of (ScreeningDataset, ProcessingReport)
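Before process() can fit anything, each loaded row must have carried either raw readings or pre-calculated Hill parameters; when a row has both, overwriting the supplied parameters with fitted ones requires allow_overwrite_precalc_params=True. The acceptance rule can be sketched with a hypothetical helper (classify_row is not part of the package; column names follow the loader's error messages):

```python
def classify_row(row: dict) -> str:
    """Mirror the loader's acceptance rule: a row needs raw readings
    (paired DATA*/CONC* values) or a pre-calculated AC50 to be usable."""
    has_raw = any(
        k.upper().startswith("DATA") and str(row[k]).strip() for k in row
    ) and any(
        k.upper().startswith("CONC") and "UNITS" not in k.upper() and str(row[k]).strip()
        for k in row
    )
    has_precalc = bool(
        str(row.get("AC50", "")).strip() or str(row.get("ec50", "")).strip()
    )
    if has_raw and has_precalc:
        return "both"      # fitted values replace pre-calc only when overwriting is allowed
    if has_raw:
        return "raw"       # curve will be fitted during process()
    if has_precalc:
        return "precalc"   # supplied Hill parameters are used as-is
    return "invalid"       # the loader raises ValueError for such rows
```

For example, a row with only DATA1/CONC1 readings classifies as "raw", a row with only an AC50 as "precalc", and an empty row as "invalid".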
class SPrime-
Main API for sprime package.
Provides factory methods and convenience functions for loading data, processing dose-response profiles, and calculating S' values.
Static methods
def load(filepath_or_dataframe: Union[str, Path, pd.DataFrame],
assay_name: Optional[str] = None,
values_as: str = 'columns',
skip_control_response_normalization: bool = False,
*,
response_normalization: ResponseNormalizationMethod,
**csv_kwargs) ‑> Tuple[RawDataset, Optional['ProcessingReport']]-
Load raw data from CSV file or pandas DataFrame with quality reporting.
Auto-detects pandas DataFrame input and converts it. For CSV files, uses global reporting configuration for console/log output.
Args
filepath_or_dataframe- Path to CSV file, or pandas DataFrame (pandas required for DataFrame)
assay_name- Name for assay (defaults to filename stem or 'DataFrame')
values_as- "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization- If False (default), raw rows must include a non-zero Control_Response. Set True if responses are already control-normalized.
response_normalization- Required. See RawDataset.load_from_file().
**csv_kwargs- Additional arguments for csv.DictReader (ignored for DataFrames)
Returns
Tuple of (RawDataset, ProcessingReport)
Raises
ValueError- If required columns are missing
ImportError- If DataFrame provided but pandas not installed
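The DataFrame auto-detection in load() can be isolated into a small helper. The sketch below mirrors the same try/except-ImportError pattern from the source above; `is_pandas_dataframe` is an illustrative name, not part of the sprime API, and it returns False (fall through to the file-path branch) rather than raising when pandas is absent.

```python
def is_pandas_dataframe(obj) -> bool:
    """Return True only when pandas is importable and obj is a DataFrame.

    Mirrors the detection step in SPrime.load(): a failed pandas import
    means the input is treated as a file path instead of erroring out.
    """
    try:
        import pandas as pd
    except ImportError:
        return False
    return isinstance(obj, pd.DataFrame)


# Strings and Path objects fall through to the file-path branch.
print(is_pandas_dataframe("screen_results.csv"))  # False for a plain path string
```

Note that ImportError is only raised when the caller actually passes a DataFrame while pandas is missing; plain paths never require pandas.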
def load_from_dataframe(df,
assay_name: Optional[str] = None,
values_as: str = 'columns',
skip_control_response_normalization: bool = True,
*,
response_normalization: ResponseNormalizationMethod) ‑> Tuple[RawDataset, ProcessingReport | None]-
@staticmethod
def load_from_dataframe(
    df,
    assay_name: Optional[str] = None,
    values_as: str = "columns",
    skip_control_response_normalization: bool = True,
    *,
    response_normalization: ResponseNormalizationMethod,
) -> Tuple[RawDataset, Optional["ProcessingReport"]]:
    """
    Load raw data from pandas DataFrame with quality reporting.

    Args:
        df: pandas DataFrame with columns matching CSV format
        assay_name: Name for assay (defaults to 'DataFrame')
        values_as: "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
        skip_control_response_normalization: If False, raw rows must include a
            non-zero ``Control_Response``. Defaults to True (responses are
            assumed to be already control-normalized).
        response_normalization: Required. See :meth:`RawDataset.load_from_file`.

    Returns:
        Tuple of (RawDataset, ProcessingReport)

    Raises:
        ValueError: If required columns are missing
        ImportError: If pandas is not installed
        TypeError: If input is not a pandas DataFrame
    """
    # Convert DataFrame to list of dicts
    list_of_rows = _convert_dataframe_to_dict_list(df)

    # Use get_s_prime_from_data logic but return RawDataset
    if not list_of_rows:
        # Create empty dataset
        if assay_name is None:
            assay_name = "DataFrame"
        assay = Assay(name=assay_name)
        raw_dataset = RawDataset(
            assay=assay,
            skip_control_response_normalization=skip_control_response_normalization,
        )
        if ProcessingReport is not None:
            report = ProcessingReport()
        else:
            report = None
        return raw_dataset, report

    # Validate required columns exist
    first_row_keys = list(list_of_rows[0].keys()) if list_of_rows else []
    if first_row_keys:
        _validate_required_columns(
            first_row_keys,
            source_name="DataFrame",
            values_as=values_as,
            skip_control_response_normalization=skip_control_response_normalization,
        )

    # Create report
    if ProcessingReport is not None:
        report = ProcessingReport()
    else:
        report = None

    # Create RawDataset
    if assay_name is None:
        assay_name = "DataFrame"
    assay = Assay(name=assay_name)
    raw_dataset = RawDataset(
        assay=assay,
        response_normalization=response_normalization,
        skip_control_response_normalization=skip_control_response_normalization,
    )

    if report:
        report.total_rows = len(list_of_rows)

    # Process rows (similar to get_s_prime_from_data but add to RawDataset)
    # Rows are taken literally: empty values are null, no forward-filling.
    reserved = _reserved_column_names(values_as, first_row_keys)
    compounds_seen = set()

    for row_idx, row in enumerate(list_of_rows):
        if report:
            report.rows_processed += 1

        # Check if row is fully blank
        is_fully_blank = not any(
            v.strip() if isinstance(v, str) else str(v).strip()
            for v in row.values()
            if v
        )
        if is_fully_blank:
            continue

        # Check for empty cell line - RAISE EXCEPTION
        cell_line_name = row.get("Cell_Line", "").strip()
        if not cell_line_name:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Cell_Line' value in DataFrame. "
                f"All rows must have a cell line specified."
            )

        # Get compound info (Compound_ID required; NCGCID pass-through only)
        compound_name = row.get("Compound Name", "").strip() or "Unknown"
        compound_id = row.get("Compound_ID", "").strip()
        if not compound_id:
            raise ValueError(
                f"Row {row_idx + 1}: Missing required 'Compound_ID' value in DataFrame. "
                f"Compound: {compound_name}, Cell_Line: {cell_line_name}. "
                f"All rows must have a compound identifier."
            )

        # Track compound
        if report and compound_id not in compounds_seen:
            compounds_seen.add(compound_id)
            report.compounds_loaded += 1

        # Create compound and cell line objects
        compound = Compound(
            name=compound_name,
            drug_id=compound_id,
            pubchem_sid=row.get("pubchem_sid", "").strip() or None,
            smiles=row.get("SMILES", "").strip() or None,
        )
        cell_line = CellLine(
            name=cell_line_name,
            ref_id=row.get("Cell_Line_Ref_ID", "").strip() or None,
        )

        # Extract raw dose-response data (if present)
        concentrations = None
        responses = None
        if values_as == "list":
            resp_key = next((k for k in row if k.lower() == "responses"), None)
            conc_key = next((k for k in row if k.lower() == "concentrations"), None)
            if resp_key and conc_key:
                resp_str = (row.get(resp_key) or "").strip()
                conc_str = (row.get(conc_key) or "").strip()
                if resp_str and conc_str:
                    responses = []
                    conc_parts = []
                    for part in resp_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    responses.append(v)
                            except (ValueError, TypeError):
                                pass
                    for part in conc_str.split(","):
                        t = part.strip()
                        if t:
                            try:
                                v = float(t)
                                if not (math.isnan(v) or math.isinf(v)):
                                    conc_parts.append(v)
                            except (ValueError, TypeError):
                                pass
                    if len(responses) == len(conc_parts) and len(responses) >= 4:
                        units = row.get("Concentration_Units", "").strip()
                        if not units:
                            raise ValueError(
                                f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. "
                                f"Compound: {compound_name}, Cell_Line: {cell_line_name}."
                            )
                        concentrations = convert_to_micromolar(conc_parts, units)
                    else:
                        if len(responses) != len(conc_parts):
                            raise ValueError(
                                f"Row {row_idx + 1}: Responses and Concentrations length mismatch "
                                f"({len(responses)} vs {len(conc_parts)}) in DataFrame."
                            )
                        responses = None
                        concentrations = None
        else:
            data_cols = [k for k in row.keys() if k.startswith("Data") or k.startswith("DATA")]
            conc_cols = [
                k
                for k in row.keys()
                if (k.startswith("Conc") or k.startswith("CONC"))
                and "Units" not in k
                and "units" not in k
            ]
            if data_cols and conc_cols:
                data_cols = sorted(
                    data_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                conc_cols = sorted(
                    conc_cols, key=lambda x: int("".join(filter(str.isdigit, x)) or "0")
                )
                responses = []
                concentrations = []
                for data_col, conc_col in zip(data_cols, conc_cols):
                    try:
                        resp_val = row.get(data_col, "") or ""
                        conc_val = row.get(conc_col, "") or ""
                        resp_val = (
                            resp_val.strip() if isinstance(resp_val, str) else str(resp_val)
                        )
                        conc_val = (
                            conc_val.strip() if isinstance(conc_val, str) else str(conc_val)
                        )
                        if resp_val and conc_val:
                            responses.append(float(resp_val))
                            concentrations.append(float(conc_val))
                    except (ValueError, TypeError):
                        continue
                if concentrations and responses:
                    units = row.get("Concentration_Units", "").strip()
                    if not units:
                        raise ValueError(
                            f"Row {row_idx + 1}: Missing required 'Concentration_Units' for raw data in DataFrame. "
                            f"Compound: {compound_name}, Cell_Line: {cell_line_name}. "
                            f"Raw dose-response data requires Concentration_Units."
                        )
                    concentrations = convert_to_micromolar(concentrations, units)
                else:
                    concentrations = None
                    responses = None

        # Extract pre-calculated Hill params (if present)
        hill_params = None
        ac50 = row.get("AC50", "").strip() or row.get("ec50", "").strip()
        if ac50:
            try:
                inf_a = row.get("Inf_asymptote", row.get("Upper", row.get("Infinity", "0")))
                zero_a = row.get("Zero_asymptote", row.get("Lower", row.get("Zero", "0")))
                hill_params = HillCurveParams(
                    ec50=float(ac50),
                    zero_asymptote=float(str(zero_a).strip() or "0"),
                    inf_asymptote=float(str(inf_a).strip() or "0"),
                    steepness_coefficient=_try_float(
                        row.get("Hill_Slope", row.get("Hill", row.get("slope", "")))
                    ),
                    r_squared=_try_float(row.get("r2", row.get(_RES_COL_R2, ""))),
                )
            except (ValueError, TypeError):
                hill_params = None

        # Validate that row has either raw data or pre-calculated params
        has_raw_data = (
            concentrations is not None and responses is not None and len(concentrations) > 0
        )
        has_precalc_params = hill_params is not None
        if not (has_raw_data or has_precalc_params):
            raise ValueError(
                f"Row {row_idx + 1}: No dose-response data found for compound '{compound_name}' "
                f"(Compound_ID: {compound_id}) in cell line '{cell_line_name}' in DataFrame. "
                f"Row must have either: (1) raw data columns (DATA*/CONC*), or "
                f"(2) pre-calculated parameters (AC50, Zero_asymptote, Inf_asymptote)."
            )

        _validate_control_response_for_raw_row(
            row,
            f"Row {row_idx + 1} in DataFrame",
            has_raw_data,
            compound_name=compound_name,
            cell_line_name=cell_line_name,
            compound_id=compound_id,
            skip_control_response_normalization=skip_control_response_normalization,
        )

        # Pre-calculated S' (if present)
        s_prime = _try_float(row.get("S'", row.get("S Prime", "")))
        rank = _try_int(row.get("Rank", ""))

        # Extract metadata: generic pass-through (all non-reserved columns,
        # exact header, value as-is)
        metadata = {}
        for col in row.keys():
            if col in reserved:
                continue
            raw = row.get(col, "")
            raw = raw if isinstance(raw, str) else str(raw)
            metadata[col] = raw

        # Create profile
        profile = DoseResponseProfile(
            compound=compound,
            cell_line=cell_line,
            assay=assay,
            concentrations=concentrations,
            responses=responses,
            concentration_units="microM",
            hill_params=hill_params,
            s_prime=s_prime,
            rank=rank,
            metadata=metadata if metadata else None,
            control_response=_control_response_numeric_for_raw_row(
                row,
                has_raw_data,
                skip_control_response_normalization,
            ),
        )
        raw_dataset.add_profile(profile)
        if report:
            report.profiles_created += 1

    # Auto-print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    return raw_dataset, report
Load raw data from pandas DataFrame with quality reporting.
Args
df- pandas DataFrame with columns matching CSV format
assay_name- Name for assay (defaults to 'DataFrame')
values_as- "columns" (DATA*/CONC*) or "list" (Responses, Concentrations)
skip_control_response_normalization- If False, raw rows must include a non-zero Control_Response. Defaults to True here (responses assumed already control-normalized), unlike load(), which defaults to False.
response_normalization- Required. See RawDataset.load_from_file().
Returns
Tuple of (RawDataset, ProcessingReport)
Raises
ValueError- If required columns are missing
ImportError- If pandas is not installed
TypeError- If input is not a pandas DataFrame
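Before any rows are processed, the loader validates that the header carries the columns every row must supply. `_validate_required_columns` is internal and its full signature is shown in the source above; the stand-in below only checks the two columns the row loop unconditionally requires (Cell_Line, Compound_ID) and is an illustrative sketch, not the real helper, which also accounts for values_as and the normalization settings.

```python
def check_required_columns(header_keys, required=("Cell_Line", "Compound_ID")):
    # Raise the same kind of error the loader raises when a DataFrame
    # is missing columns that every row must carry.
    missing = [c for c in required if c not in header_keys]
    if missing:
        raise ValueError(f"DataFrame is missing required columns: {missing}")

check_required_columns(["Cell_Line", "Compound_ID", "AC50"])  # passes silently
try:
    check_required_columns(["Compound_ID", "AC50"])
except ValueError as e:
    print(e)  # -> DataFrame is missing required columns: ['Cell_Line']
```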
def process(raw_dataset: RawDataset,
report: "Optional['ProcessingReport']" = None,
allow_overwrite_precalc_params: bool = False,
**fit_params) ‑> Tuple[ScreeningDataset, ProcessingReport | None]-
@staticmethod
def process(
    raw_dataset: RawDataset,
    report: Optional["ProcessingReport"] = None,
    allow_overwrite_precalc_params: bool = False,
    **fit_params,
) -> Tuple[ScreeningDataset, Optional["ProcessingReport"]]:
    """
    Convert RawDataset to ScreeningDataset (fit curves, calculate S').

    Uses global reporting configuration for console/log output.

    Args:
        raw_dataset: RawDataset to process
        report: Optional ProcessingReport to accumulate warnings (reuses from load if None)
        allow_overwrite_precalc_params: If True, allow overwriting pre-calculated
            curve parameters (EC50, asymptotes, Hill_Slope, r2) with fitted values
            when both raw and pre-calc exist. Default False (raise). When True,
            overwrites are logged as warnings.
        **fit_params: Parameters for curve fitting (e.g. maxfev, bounds).

    Returns:
        Tuple of (ScreeningDataset, ProcessingReport)
    """
    screening_dataset, report = raw_dataset.to_screening_dataset(
        report=report,
        allow_overwrite_precalc_params=allow_overwrite_precalc_params,
        **fit_params,
    )

    # Auto-print and write log based on global config
    if report and ReportingConfig is not None:
        report.print_console_summary()
        report.write_log_file()

    return screening_dataset, report
Convert RawDataset to ScreeningDataset (fit curves, calculate S').
Uses global reporting configuration for console/log output.
Args
raw_dataset- RawDataset to process
report- Optional ProcessingReport to accumulate warnings (reuses from load if None)
allow_overwrite_precalc_params- If True, allow overwriting pre-calculated curve parameters (EC50, asymptotes, Hill_Slope, r2) with fitted values when both raw and pre-calc exist. Default False (raise). When True, overwrites are logged as warnings.
**fit_params- Parameters for curve fitting (e.g. maxfev, bounds).
Returns
Tuple of (ScreeningDataset, ProcessingReport)
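process() fits Hill curves before S' is computed, parameterized (via HillCurveParams) by EC50, zero/inf asymptotes, and a steepness coefficient. For orientation, the standard four-parameter logistic those parameters describe is sketched below; this is the textbook 4PL form under that assumption, not code copied from sprime's fitting routine.

```python
def hill_response(conc, ec50, zero_asymptote, inf_asymptote, slope):
    """Standard 4PL: response moves from zero_asymptote toward
    inf_asymptote as concentration increases (slope > 0)."""
    return zero_asymptote + (inf_asymptote - zero_asymptote) / (
        1.0 + (ec50 / conc) ** slope
    )

# At conc == EC50 the response is exactly halfway between the asymptotes.
print(hill_response(1.0, ec50=1.0, zero_asymptote=0.0, inf_asymptote=-100.0, slope=1.0))
# -> -50.0
```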
class ScreeningDataset (assay: Assay)-
class ScreeningDataset:
    """
    Processed dataset with all S' values calculated.

    All profiles in this dataset:
    - Have fitted Hill curve parameters
    - Have calculated S' values
    - Are ready for analysis operations
    """

    def __init__(self, assay: Assay):
        """
        Initialize ScreeningDataset.

        Args:
            assay: Assay entity for this dataset
        """
        self.assay = assay
        self._profiles: Dict[Tuple[str, str], DoseResponseProfile] = {}

    def add_profile(self, profile: DoseResponseProfile):
        """
        Add a profile (should have S' calculated).

        Args:
            profile: DoseResponseProfile with S' calculated

        Raises:
            ValueError: If profile doesn't have S' or Hill params, or already exists
        """
        # Validation: profile should be processed
        if profile.s_prime is None:
            raise ValueError("Profile must have S' calculated before adding to ScreeningDataset")
        if profile.hill_params is None:
            raise ValueError("Profile must have Hill params before adding to ScreeningDataset")
        key = (profile.compound.drug_id, profile.cell_line.name)
        if key in self._profiles:
            raise ValueError(f"Profile for {key} already exists")
        self._profiles[key] = profile

    def get_profile(
        self, compound: Union[Compound, str], cell_line: Union[CellLine, str]
    ) -> Optional[DoseResponseProfile]:
        """
        Retrieve a specific profile.

        Args:
            compound: Compound object or drug_id string
            cell_line: CellLine object or cell_line name string

        Returns:
            DoseResponseProfile or None if not found
        """
        compound_id = compound.drug_id if isinstance(compound, Compound) else compound
        cellline_name = cell_line.name if isinstance(cell_line, CellLine) else cell_line
        return self._profiles.get((compound_id, cellline_name))

    def calculate_delta_s_prime(
        self,
        reference_cell_lines: Union[str, List[str]],
        test_cell_lines: Union[str, List[str]],
        headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
        source_profile: Literal["ref", "test"] = "test",
    ) -> Dict[str, List[Dict]]:
        """
        Calculate delta S' = S'(ref) - S'(test) for each compound.

        Compound-level columns (1:1 per compound, not per cell line) auto-propagate:
        MOA and drug targets are reserved and always included; additional headings
        may be specified via headings_one_to_one_in_ref_and_test. Values are taken
        from the ref or test profile according to source_profile.

        Args:
            reference_cell_lines: Reference cell line name(s)
            test_cell_lines: Test cell line name(s)
            headings_one_to_one_in_ref_and_test: Optional list of metadata headings
                that exist 1:1 in ref and test; included in output, values from
                source_profile.
            source_profile: 'ref' or 'test'; which profile to use for compound-level
                values (MOA, drug targets, and optional headings).

        Returns:
            Dictionary with keys for each reference cell line, containing lists of
            dicts with delta S' and compound-level fields per combo.
        """
        ref_list = (
            [reference_cell_lines]
            if isinstance(reference_cell_lines, str)
            else reference_cell_lines
        )
        test_list = [test_cell_lines] if isinstance(test_cell_lines, str) else test_cell_lines
        extra_headings = headings_one_to_one_in_ref_and_test or []
        results = {}
        for ref_cellline in ref_list:
            rows = []
            compounds = {
                profile.compound.drug_id: profile.compound
                for profile in self._profiles.values()
            }
            for drug_id, compound in compounds.items():
                ref_profile = self.get_profile(compound, ref_cellline)
                if ref_profile is None or ref_profile.s_prime is None:
                    continue
                for test_cellline in test_list:
                    test_profile = self.get_profile(compound, test_cellline)
                    if test_profile is None or test_profile.s_prime is None:
                        continue
                    delta = ref_profile.s_prime - test_profile.s_prime
                    source_meta = (
                        ref_profile.metadata
                        if source_profile == "ref"
                        else test_profile.metadata
                    ) or {}
                    row = {
                        "compound_name": compound.name,
                        "drug_id": drug_id,
                        "reference_cell_line": ref_cellline,
                        "test_cell_line": test_cellline,
                        "s_prime_ref": ref_profile.s_prime,
                        "s_prime_test": test_profile.s_prime,
                        "delta_s_prime": delta,
                        "MOA": _resolve_moa(source_meta),
                        "drug targets": _resolve_drug_targets(source_meta),
                    }
                    for h in extra_headings:
                        row[h] = source_meta.get(h, "")
                    rows.append(row)
            results[ref_cellline] = rows
        return results

    def to_dict_list(self) -> List[Dict]:
        """
        Export to list of dictionaries with all S' values.

        Returns:
            List of dictionaries with profile data
        """
        rows = []
        for profile in self._profiles.values():
            row = {
                "compound_name": profile.compound.name,
                "drug_id": profile.compound.drug_id,
                "cell_line": profile.cell_line.name,
                "s_prime": profile.s_prime,
                "ec50": profile.hill_params.ec50 if profile.hill_params else None,
                "zero_asymptote": profile.hill_params.zero_asymptote
                if profile.hill_params
                else None,
                "inf_asymptote": profile.hill_params.inf_asymptote
                if profile.hill_params
                else None,
                "rank": profile.rank,
            }
            if profile.hill_params:
                row["steepness_coefficient"] = profile.hill_params.steepness_coefficient
                row["r_squared"] = profile.hill_params.r_squared
            rows.append(row)
        return rows

    def export_to_csv(self, filepath: Union[str, Path], include_metadata: bool = True) -> None:
        """
        Export all profiles to CSV file.

        Base columns (identifiers, Hill params, S', Rank) are always written.
        When include_metadata is True, all generic metadata keys (union across
        profiles) are included as pass-through columns.

        Args:
            filepath: Path to output CSV file
            include_metadata: When True, include all metadata columns. When False,
                only base columns.
        """
        filepath = Path(filepath)
        profiles = sorted(self.profiles, key=lambda p: (p.compound.name, p.cell_line.name))
        base_fieldnames = [
            "Compound Name",
            "Compound_ID",
            "pubchem_sid",
            "SMILES",
            "Cell_Line",
            "Cell_Line_Ref_ID",
            "EC50",
            "Zero_asymptote",
            "Inf_asymptote",
            "Hill_Slope",
            "r2",
            "S'",
            "Rank",
        ]
        all_meta_keys = sorted(set(k for p in profiles if p.metadata for k in p.metadata))
        fieldnames = list(base_fieldnames)
        if include_metadata and all_meta_keys:
            fieldnames.extend(all_meta_keys)
        with open(filepath, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for profile in profiles:
                meta = profile.metadata or {}
                row = {
                    "Compound Name": profile.compound.name,
                    "Compound_ID": profile.compound.drug_id,
                    "pubchem_sid": profile.compound.pubchem_sid or "",
                    "SMILES": profile.compound.smiles or "",
                    "Cell_Line": profile.cell_line.name,
                    "Cell_Line_Ref_ID": profile.cell_line.ref_id or "",
                    "EC50": f"{profile.hill_params.ec50:.6e}" if profile.hill_params else "",
                    "Zero_asymptote": f"{profile.hill_params.zero_asymptote:.2f}"
                    if profile.hill_params
                    else "",
                    "Inf_asymptote": f"{profile.hill_params.inf_asymptote:.2f}"
                    if profile.hill_params
                    else "",
                    "Hill_Slope": f"{profile.hill_params.steepness_coefficient:.4f}"
                    if profile.hill_params and profile.hill_params.steepness_coefficient
                    else "",
                    "r2": f"{profile.hill_params.r_squared:.4f}"
                    if profile.hill_params and profile.hill_params.r_squared is not None
                    else "",
                    "S'": f"{profile.s_prime:.4f}" if profile.s_prime else "",
                    "Rank": str(profile.rank) if profile.rank else "",
                }
                if include_metadata and all_meta_keys:
                    for k in all_meta_keys:
                        row[k] = meta.get(k, "")
                writer.writerow(row)

    @staticmethod
    def export_delta_s_prime_to_csv(
        delta_results: Dict[str, List[Dict]],
        filepath: Union[str, Path],
        headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
    ) -> None:
        """
        Export delta S' results to CSV file.

        Includes compound-level columns: MOA, drug targets (reserved), plus any
        headings specified in headings_one_to_one_in_ref_and_test. These must
        match the headings passed to calculate_delta_s_prime when producing
        delta_results.

        Args:
            delta_results: Dictionary from calculate_delta_s_prime()
            filepath: Path to output CSV file
            headings_one_to_one_in_ref_and_test: Optional list of metadata headings
                included in delta output (same as for calculate_delta_s_prime).
        """
        filepath = Path(filepath)
        extra_headings = headings_one_to_one_in_ref_and_test or []
        flat_results = []
        for ref_cellline, comparisons in delta_results.items():
            for comp in comparisons:
                row = {
                    "Compound Name": comp.get("compound_name", ""),
                    "Compound_ID": comp.get("drug_id", ""),
                    "Reference_Cell_Line": comp.get("reference_cell_line", ""),
                    "Test_Cell_Line": comp.get("test_cell_line", ""),
                    "S' (Reference)": f"{comp.get('s_prime_ref', 0.0):.4f}",
                    "S' (Test)": f"{comp.get('s_prime_test', 0.0):.4f}",
                    "Delta S'": f"{comp.get('delta_s_prime', 0.0):.4f}",
                    "MOA": comp.get("MOA", ""),
                    "drug targets": comp.get("drug targets", ""),
                }
                for h in extra_headings:
                    row[h] = comp.get(h, "")
                flat_results.append(row)
        flat_results.sort(key=lambda x: float(x["Delta S'"]))
        for rank, result in enumerate(flat_results, start=1):
            result["Rank"] = str(rank)
        fieldnames = [
            "Rank",
            "Compound Name",
            "Compound_ID",
            "Reference_Cell_Line",
            "Test_Cell_Line",
            "S' (Reference)",
            "S' (Test)",
            "Delta S'",
            "MOA",
            "drug targets",
        ]
        fieldnames.extend(extra_headings)
        with open(filepath, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(flat_results)

    @property
    def profiles(self):
        """Iterator over all profiles"""
        return self._profiles.values()

    def __len__(self):
        return len(self._profiles)
Processed dataset with all S' values calculated.
All profiles in this dataset: - Have fitted Hill curve parameters - Have calculated S' values - Are ready for analysis operations
Initialize ScreeningDataset.
Args
assay- Assay entity for this dataset
Static methods
def export_delta_s_prime_to_csv(delta_results: Dict[str, List[Dict]],
filepath: Union[str, Path],
headings_one_to_one_in_ref_and_test: Optional[List[str]] = None) ‑> None-
@staticmethod
def export_delta_s_prime_to_csv(
    delta_results: Dict[str, List[Dict]],
    filepath: Union[str, Path],
    headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
) -> None:
    """
    Export delta S' results to CSV file.

    Includes compound-level columns: MOA, drug targets (reserved), plus any
    headings specified in headings_one_to_one_in_ref_and_test. These must
    match the headings passed to calculate_delta_s_prime when producing
    delta_results.

    Args:
        delta_results: Dictionary from calculate_delta_s_prime()
        filepath: Path to output CSV file
        headings_one_to_one_in_ref_and_test: Optional list of metadata headings
            included in delta output (same as for calculate_delta_s_prime).
    """
    filepath = Path(filepath)
    extra_headings = headings_one_to_one_in_ref_and_test or []
    flat_results = []
    for ref_cellline, comparisons in delta_results.items():
        for comp in comparisons:
            row = {
                "Compound Name": comp.get("compound_name", ""),
                "Compound_ID": comp.get("drug_id", ""),
                "Reference_Cell_Line": comp.get("reference_cell_line", ""),
                "Test_Cell_Line": comp.get("test_cell_line", ""),
                "S' (Reference)": f"{comp.get('s_prime_ref', 0.0):.4f}",
                "S' (Test)": f"{comp.get('s_prime_test', 0.0):.4f}",
                "Delta S'": f"{comp.get('delta_s_prime', 0.0):.4f}",
                "MOA": comp.get("MOA", ""),
                "drug targets": comp.get("drug targets", ""),
            }
            for h in extra_headings:
                row[h] = comp.get(h, "")
            flat_results.append(row)
    flat_results.sort(key=lambda x: float(x["Delta S'"]))
    for rank, result in enumerate(flat_results, start=1):
        result["Rank"] = str(rank)
    fieldnames = [
        "Rank",
        "Compound Name",
        "Compound_ID",
        "Reference_Cell_Line",
        "Test_Cell_Line",
        "S' (Reference)",
        "S' (Test)",
        "Delta S'",
        "MOA",
        "drug targets",
    ]
    fieldnames.extend(extra_headings)
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flat_results)
Export delta S' results to CSV file.
Includes compound-level columns: MOA, drug targets (reserved), plus any headings specified in headings_one_to_one_in_ref_and_test. These must match the headings passed to calculate_delta_s_prime when producing delta_results.
Args
delta_results- Dictionary from calculate_delta_s_prime()
filepath- Path to output CSV file
headings_one_to_one_in_ref_and_test- Optional list of metadata headings included in delta output (same as for calculate_delta_s_prime).
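The exporter flattens all comparisons across reference cell lines, sorts ascending by Delta S', and assigns 1-based ranks, so the most negative delta (largest selective shift toward the reference line) gets Rank 1. A self-contained sketch of that ranking step, with invented sample values:

```python
rows = [
    {"Compound Name": "A", "Delta S'": "0.5000"},
    {"Compound Name": "B", "Delta S'": "-1.2500"},
    {"Compound Name": "C", "Delta S'": "0.0000"},
]
rows.sort(key=lambda r: float(r["Delta S'"]))   # ascending delta, parsed from the
for rank, row in enumerate(rows, start=1):      # already-formatted string column
    row["Rank"] = str(rank)                     # 1-based Rank, stored as text

print([(r["Rank"], r["Compound Name"]) for r in rows])
# -> [('1', 'B'), ('2', 'C'), ('3', 'A')]
```

Note the sort key parses the formatted "Delta S'" string back to a float, matching the source, so ordering is numeric rather than lexicographic.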
Instance variables
prop profiles-
@property
def profiles(self):
    """Iterator over all profiles"""
    return self._profiles.values()
Iterator over all profiles
Methods
def add_profile(self,
profile: DoseResponseProfile)-
def add_profile(self, profile: DoseResponseProfile):
    """
    Add a profile (should have S' calculated).

    Args:
        profile: DoseResponseProfile with S' calculated

    Raises:
        ValueError: If profile doesn't have S' or Hill params, or already exists
    """
    # Validation: profile should be processed
    if profile.s_prime is None:
        raise ValueError("Profile must have S' calculated before adding to ScreeningDataset")
    if profile.hill_params is None:
        raise ValueError("Profile must have Hill params before adding to ScreeningDataset")
    key = (profile.compound.drug_id, profile.cell_line.name)
    if key in self._profiles:
        raise ValueError(f"Profile for {key} already exists")
    self._profiles[key] = profile
Add a profile (should have S' calculated).
Args
profile- DoseResponseProfile with S' calculated
Raises
ValueError- If profile doesn't have S' or Hill params, or already exists
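Profiles are keyed by the (drug_id, cell line name) pair, so adding the same compound/cell-line combination twice raises. A minimal stand-alone sketch of that duplicate-key guard using a plain dict (no sprime types; identifiers and values are invented samples):

```python
profiles = {}

def add(drug_id, cell_line, s_prime):
    # Same keying and guard as ScreeningDataset.add_profile.
    key = (drug_id, cell_line)
    if key in profiles:
        raise ValueError(f"Profile for {key} already exists")
    profiles[key] = s_prime

add("D-001", "HeLa", -2.1)           # first insert succeeds
try:
    add("D-001", "HeLa", -2.1)       # identical key: rejected
except ValueError as e:
    print(e)
```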
def calculate_delta_s_prime(self,
reference_cell_lines: Union[str, List[str]],
test_cell_lines: Union[str, List[str]],
headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
source_profile: "Literal['ref', 'test']" = 'test') ‑> Dict[str, List[Dict]]-
def calculate_delta_s_prime(
    self,
    reference_cell_lines: Union[str, List[str]],
    test_cell_lines: Union[str, List[str]],
    headings_one_to_one_in_ref_and_test: Optional[List[str]] = None,
    source_profile: Literal["ref", "test"] = "test",
) -> Dict[str, List[Dict]]:
    """
    Calculate delta S' = S'(ref) - S'(test) for each compound.

    Compound-level columns (1:1 per compound, not per cell line) auto-propagate:
    MOA and drug targets are reserved and always included; additional headings
    may be specified via headings_one_to_one_in_ref_and_test. Values are taken
    from the ref or test profile according to source_profile.

    Args:
        reference_cell_lines: Reference cell line name(s)
        test_cell_lines: Test cell line name(s)
        headings_one_to_one_in_ref_and_test: Optional list of metadata headings
            that exist 1:1 in ref and test; included in output, values from
            source_profile.
        source_profile: 'ref' or 'test'; which profile to use for compound-level
            values (MOA, drug targets, and optional headings).

    Returns:
        Dictionary with keys for each reference cell line, containing lists of
        dicts with delta S' and compound-level fields per combo.
    """
    ref_list = (
        [reference_cell_lines]
        if isinstance(reference_cell_lines, str)
        else reference_cell_lines
    )
    test_list = [test_cell_lines] if isinstance(test_cell_lines, str) else test_cell_lines
    extra_headings = headings_one_to_one_in_ref_and_test or []
    results = {}
    for ref_cellline in ref_list:
        rows = []
        compounds = {
            profile.compound.drug_id: profile.compound
            for profile in self._profiles.values()
        }
        for drug_id, compound in compounds.items():
            ref_profile = self.get_profile(compound, ref_cellline)
            if ref_profile is None or ref_profile.s_prime is None:
                continue
            for test_cellline in test_list:
                test_profile = self.get_profile(compound, test_cellline)
                if test_profile is None or test_profile.s_prime is None:
                    continue
                delta = ref_profile.s_prime - test_profile.s_prime
                source_meta = (
                    ref_profile.metadata
                    if source_profile == "ref"
                    else test_profile.metadata
                ) or {}
                row = {
                    "compound_name": compound.name,
                    "drug_id": drug_id,
                    "reference_cell_line": ref_cellline,
                    "test_cell_line": test_cellline,
                    "s_prime_ref": ref_profile.s_prime,
                    "s_prime_test": test_profile.s_prime,
                    "delta_s_prime": delta,
                    "MOA": _resolve_moa(source_meta),
                    "drug targets": _resolve_drug_targets(source_meta),
                }
                for h in extra_headings:
                    row[h] = source_meta.get(h, "")
                rows.append(row)
        results[ref_cellline] = rows
    return results
Calculate delta S' = S'(ref) - S'(test) for each compound.
Compound-level columns (1:1 per compound, not per cell line) auto-propagate: MOA and drug targets are reserved and always included; additional headings may be specified via headings_one_to_one_in_ref_and_test. Values are taken from the ref or test profile according to source_profile.
Args
reference_cell_lines- Reference cell line name(s)
test_cell_lines- Test cell line name(s)
headings_one_to_one_in_ref_and_test- Optional list of metadata headings that exist 1:1 in ref and test; included in output, values from source_profile.
source_profile- 'ref' or 'test'; which profile to use for compound-level values (MOA, drug targets, and optional headings).
Returns
Dictionary with keys for each reference cell line, containing lists of dicts with delta S' and compound-level fields per combo.
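The core of the method reduces to a subtraction over pairs keyed by (drug_id, cell_line), with missing pairs skipped rather than zero-filled, and results keyed by reference cell line. A self-contained sketch with plain dicts (sample S' values invented):

```python
s_prime = {  # (drug_id, cell_line) -> S'
    ("D1", "ref_line"): -3.0,
    ("D1", "test_line"): -1.0,
}

def delta_s_prime(ref, test):
    rows = []
    for (drug_id, cell_line), value in s_prime.items():
        if cell_line != ref:
            continue
        test_value = s_prime.get((drug_id, test))
        if test_value is None:
            continue  # pairs missing either profile are skipped, not zero-filled
        rows.append({"drug_id": drug_id, "delta_s_prime": value - test_value})
    return {ref: rows}

print(delta_s_prime("ref_line", "test_line"))
# -> {'ref_line': [{'drug_id': 'D1', 'delta_s_prime': -2.0}]}
```

Since delta = S'(ref) - S'(test), a compound that is more potent in the reference line (more negative S') yields a negative delta.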
def export_to_csv(self, filepath: Union[str, Path], include_metadata: bool = True) ‑> None-
def export_to_csv(self, filepath: Union[str, Path], include_metadata: bool = True) -> None:
    """
    Export all profiles to CSV file.

    Base columns (identifiers, Hill params, S', Rank) are always written.
    When include_metadata is True, all generic metadata keys (union across
    profiles) are included as pass-through columns.

    Args:
        filepath: Path to output CSV file
        include_metadata: When True, include all metadata columns. When False,
            only base columns.
    """
    filepath = Path(filepath)
    profiles = sorted(self.profiles, key=lambda p: (p.compound.name, p.cell_line.name))
    base_fieldnames = [
        "Compound Name",
        "Compound_ID",
        "pubchem_sid",
        "SMILES",
        "Cell_Line",
        "Cell_Line_Ref_ID",
        "EC50",
        "Zero_asymptote",
        "Inf_asymptote",
        "Hill_Slope",
        "r2",
        "S'",
        "Rank",
    ]
    all_meta_keys = sorted(set(k for p in profiles if p.metadata for k in p.metadata))
    fieldnames = list(base_fieldnames)
    if include_metadata and all_meta_keys:
        fieldnames.extend(all_meta_keys)
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for profile in profiles:
            meta = profile.metadata or {}
            row = {
                "Compound Name": profile.compound.name,
                "Compound_ID": profile.compound.drug_id,
                "pubchem_sid": profile.compound.pubchem_sid or "",
                "SMILES": profile.compound.smiles or "",
                "Cell_Line": profile.cell_line.name,
                "Cell_Line_Ref_ID": profile.cell_line.ref_id or "",
                "EC50": f"{profile.hill_params.ec50:.6e}" if profile.hill_params else "",
                "Zero_asymptote": f"{profile.hill_params.zero_asymptote:.2f}"
                if profile.hill_params
                else "",
                "Inf_asymptote": f"{profile.hill_params.inf_asymptote:.2f}"
                if profile.hill_params
                else "",
                "Hill_Slope": f"{profile.hill_params.steepness_coefficient:.4f}"
                if profile.hill_params and profile.hill_params.steepness_coefficient
                else "",
                "r2": f"{profile.hill_params.r_squared:.4f}"
                if profile.hill_params and profile.hill_params.r_squared is not None
                else "",
                "S'": f"{profile.s_prime:.4f}" if profile.s_prime else "",
                "Rank": str(profile.rank) if profile.rank else "",
            }
            if include_metadata and all_meta_keys:
                for k in all_meta_keys:
                    row[k] = meta.get(k, "")
            writer.writerow(row)
Export all profiles to CSV file.
Base columns (identifiers, Hill params, S', Rank) are always written. When include_metadata is True, all generic metadata keys (union across profiles) are included as pass-through columns.
Args
filepath- Path to output CSV file
include_metadata- When True, include all metadata columns. When False, only base columns.
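The pass-through metadata columns are the sorted union of metadata keys across all profiles, so a profile missing a key simply gets an empty cell in that column. Sketched with plain dicts (sample metadata invented, including a None for a profile with no metadata):

```python
metadatas = [
    {"MOA": "kinase inhibitor"},
    {"Vendor": "X", "MOA": "unknown"},
    None,  # profile without metadata contributes nothing
]

# Union of keys across all profiles, sorted -> stable column order.
all_meta_keys = sorted({k for m in metadatas if m for k in m})
print(all_meta_keys)  # -> ['MOA', 'Vendor']

# A row missing a key gets an empty cell, same as meta.get(k, "") in the source.
row_meta = metadatas[0] or {}
print([row_meta.get(k, "") for k in all_meta_keys])  # -> ['kinase inhibitor', '']
```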
def get_profile(self,
compound: Union[Compound, str],
cell_line: Union[CellLine, str]) ‑> DoseResponseProfile | None-
def get_profile(
    self, compound: Union[Compound, str], cell_line: Union[CellLine, str]
) -> Optional[DoseResponseProfile]:
    """
    Retrieve a specific profile.

    Args:
        compound: Compound object or drug_id string
        cell_line: CellLine object or cell_line name string

    Returns:
        DoseResponseProfile or None if not found
    """
    compound_id = compound.drug_id if isinstance(compound, Compound) else compound
    cellline_name = cell_line.name if isinstance(cell_line, CellLine) else cell_line
    return self._profiles.get((compound_id, cellline_name))
Retrieve a specific profile.
Args
compound- Compound object or drug_id string
cell_line- CellLine object or cell_line name string
Returns
DoseResponseProfile or None if not found
def to_dict_list(self) ‑> List[Dict]-
def to_dict_list(self) -> List[Dict]:
    """
    Export to list of dictionaries with all S' values.

    Returns:
        List of dictionaries with profile data
    """
    rows = []
    for profile in self._profiles.values():
        row = {
            "compound_name": profile.compound.name,
            "drug_id": profile.compound.drug_id,
            "cell_line": profile.cell_line.name,
            "s_prime": profile.s_prime,
            "ec50": profile.hill_params.ec50 if profile.hill_params else None,
            "zero_asymptote": profile.hill_params.zero_asymptote
            if profile.hill_params
            else None,
            "inf_asymptote": profile.hill_params.inf_asymptote
            if profile.hill_params
            else None,
            "rank": profile.rank,
        }
        if profile.hill_params:
            row["steepness_coefficient"] = profile.hill_params.steepness_coefficient
            row["r_squared"] = profile.hill_params.r_squared
        rows.append(row)
    return rows
Export to list of dictionaries with all S' values.
Returns
List of dictionaries with profile data