Reference

The `BasicAuriclass` class

Source code in auriclass/classes.py

class BasicAuriclass:
    def __init__(
        self,
        name: str,
        output_report_path: Path,
        read_paths: List[Path],
        reference_sketch_path: Path,
        kmer_size: int,
        sketch_size: int,
        minimal_kmer_coverage: int,
        clade_config_path: Path,
        genome_size_range: List[int],
        non_candida_threshold: float,
        high_dist_threshold: float,
        no_qc: bool,
    ) -> None:
        self.name: str = name
        self.output_report_path: Path = output_report_path
        self.read_paths: List[Path] = read_paths
        self.reference_sketch_path: Path = reference_sketch_path
        self.kmer_size: int = kmer_size
        self.sketch_size: int = sketch_size
        self.minimal_kmer_coverage: int = minimal_kmer_coverage
        # self.probability is used for mash bounds
        # this is not exposed in the CLI, because this should typically not be changed without extensive testing.
        self.probability: float = 0.99
        self.clade_dict: Dict[Hashable, Any] = pd.read_csv(
            clade_config_path,
            index_col=0,
            dtype=str,
        ).to_dict(orient="dict")
        self.genome_size_range: List[int] = genome_size_range
        self.non_candida_threshold: float = non_candida_threshold
        self.high_dist_threshold: float = high_dist_threshold
        self.no_qc: bool = no_qc
        self.qc_decision: str = ""
        self.qc_genome_size: str = ""
        self.qc_other_candida: str = ""
        self.qc_species: str = ""
        self.qc_multiple_hits: str = ""
        self.qc_high_distance: str = ""
        self.query_sketch_path: Path = Path()
        self.estimated_genome_size: float = float()
        self.minimal_distance: float = float()
        self.clade: str = ""
        self.samples_within_error_bound: int = int()
        self.error_bound: float = float()
        self.stdout: str = ""
        self.stderr: str = ""
        self.mash_output: pd.DataFrame = pd.DataFrame()
        self.distances: List[float] = [float()]

    def run_mash_dist(self) -> pd.DataFrame:
        """
        Run Mash dist and save results.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        pandas.DataFrame
            A DataFrame containing the results of the Mash dist, including the reference and query sequences,
            the distance between them, the p-value, the number of matching hashes, and the predicted clade.

        Notes
        -----
        This function sets the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

        The function uses the following attributes of the object:
        - reference_sketch_path: the path to the reference sketch
        - query_sketch_path: the path to the query sketch
        - clade_dict: a dictionary containing the clade information for each reference sample
        """
        command = [
            "mash",
            "dist",
            self.reference_sketch_path,
            self.query_sketch_path,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash dist", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Append stdout and stderr to object
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash dist", line))
        # Read output into pandas dataframe
        df = pd.read_csv(
            StringIO(output.stdout.decode("utf-8")),
            sep="\t",
            header=None,
            names=["Reference", "Query", "Distance", "P-value", "Matching-hashes"],
        )
        df["Clade"] = df["Reference"].map(self.clade_dict["clade"])
        self.mash_output = df
        return df

    def check_genome_size(self) -> None:
        """
        Compare the estimated genome size with the expected genome size range.

        If the estimated genome size is outside the expected range, a warning message is logged and the
        `qc_genome_size` attribute is set to "WARN: genome size outside expected range".

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_genome_size: a warning message if the estimated genome size is outside the expected range

        The function uses the following attributes of the object:
        - estimated_genome_size: the estimated genome size of the current sample
        """
        if (
            self.estimated_genome_size < self.genome_size_range[0]
            or self.estimated_genome_size > self.genome_size_range[1]
        ):
            logging.warning(
                f"AuriClass estimated genome size of {self.estimated_genome_size} is outside the expected range of {self.genome_size_range}"
            )
            self.qc_genome_size = "WARN: genome size outside expected range"

    def select_clade(self) -> None:
        """
        Selects the closest reference sample and corresponding clade.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - closest_sample: the closest reference sample to the current test sample
        - clade: the clade of the closest reference sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample

        The function uses the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
        - clade_dict: a dictionary containing the clade information for each reference sample
        """
        # Get highest hit, by sorting on distance and selecting first hit
        self.closest_sample = self.mash_output.sort_values("Distance").iloc[0][
            "Reference"
        ]
        # Get clade of closest sample
        self.clade = self.mash_output.loc[
            self.mash_output["Reference"] == self.closest_sample, "Clade"
        ].values[0]
        # Get minimal distance using self.mash_output and closest sample
        self.minimal_distance = self.mash_output.loc[
            self.mash_output["Reference"] == self.closest_sample, "Distance"
        ].values[0]

    def check_non_candida(self) -> bool:
        """
        Check if the distance to the closest sample is higher than the non-Candida threshold.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        bool
            True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

        The function uses the following attributes of the object:
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida
        """
        if self.minimal_distance > self.non_candida_threshold:
            logging.warning(
                f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
            )
            self.qc_species = f"FAIL: distance {self.minimal_distance} to closest sample is above threshold"
            return False
        else:
            return True

    def check_for_outgroup(self) -> bool:
        """
        Check if the closest sample is defined as outgroup.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        bool
            True if the closest sample is not an outgroup, False otherwise.

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_other_candida: a warning message if the closest sample is defined as outgroup

        The function uses the following attributes of the object:
        - clade: the clade of the closest reference sample
        - closest_sample: the closest reference sample to the current test sample
        """
        if self.clade == "outgroup":
            logging.warning(
                f"AuriClass found a non-Candida auris reference as closest sample, please ensure this is Candida auris"
            )
            self.qc_other_candida = (
                f"FAIL: outgroup reference {self.closest_sample} as closest sample"
            )
            return False
        else:
            return True

    def check_high_dist(self) -> None:
        """
        Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

        The function uses the following attributes of the object:
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade
        """
        if self.minimal_distance > self.high_dist_threshold:
            logging.warning(
                f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
            )
            self.qc_high_distance = f"WARN: distance {self.minimal_distance} to closest sample is above threshold"

    def get_error_bounds(self) -> str:
        """
        Get error bounds for the current sketch size and kmer size.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        str
            The STDOUT of the mash bounds command.

        Notes
        -----
        The function uses the following attributes of the object:
        - kmer_size: the kmer size to use for the sketch
        - probability: the probability to use for the mash bounds command
        """
        command = [
            "mash",
            "bounds",
            "-k",
            str(self.kmer_size),
            "-p",
            str(self.probability),
        ]
        logging.info(add_tag("mash bounds", " ".join(command)))
        output = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash bounds", line))

        return output.stdout.decode("utf-8")

    def process_error_bounds(self, error_bounds_text: str) -> None:
        """
        Process the error bounds text to get the error bound for the current sketch size and kmer size.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.
        error_bounds_text : str
            The STDOUT of the mash bounds command.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - error_bound: the error bound for the current sketch size and kmer size


        The function uses the following attributes of the object:
        - sketch_size: the size of the sketch to create
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        """
        # Split text into lines
        lines = error_bounds_text.splitlines()
        # Find line with "Mash distance"
        for i, line in enumerate(lines):
            if "Mash distance" in line:
                start_line = i + 1
                break
        # Find line with "Screen distance"
        for i, line in enumerate(lines):
            if "Screen distance" in line:
                end_line = i
                break
        # Read lines into dataframe
        df = pd.read_csv(
            StringIO("\n".join(lines[start_line:end_line])),
            sep="\t",
        )
        # Get first column of which column name is higher than min_dist
        for col in df.columns[1:]:
            if float(col) > self.minimal_distance:
                selected_col = str(col)
                break
        self.error_bound = df.loc[
            df["Sketch"] == self.sketch_size, selected_col
        ].values[0]

    def compare_with_error_bounds(self) -> None:
        """
        Compare the distance between the current test sample and the closest reference sample with the error bound.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - distances: the distances between the current test sample and all other reference samples
        - samples_within_error_bound: the number of samples within the error bound of the closest sample
        - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

        The function uses the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
        - closest_sample: the closest reference sample to the current test sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - error_bound: the error bound for the current sketch size and kmer size
        """
        # Get distance between highest hit and other samples
        dist_array = self.mash_output.loc[
            self.mash_output["Clade"] != self.clade, "Distance"
        ].values
        self.distances = [float(distance) for distance in dist_array]
        # Check if distance is higher than error_bound
        count_above_threshold = 0
        for distance in self.distances:
            if distance < (self.minimal_distance + self.error_bound):
                logging.debug(
                    add_tag(
                        "compare_with_error_bounds",
                        f"distance {distance} is above threshold: ({self.minimal_distance} + {self.error_bound})",
                    )
                )
                count_above_threshold += 1
            else:
                logging.debug(
                    add_tag(
                        "compare_with_error_bounds",
                        f"distance {distance} is below threshold: ({self.minimal_distance} + {self.error_bound})",
                    )
                )
        self.samples_within_error_bound = count_above_threshold
        # If samples are found within error bound, warn user
        if self.samples_within_error_bound > 0:
            logging.warning(
                f"AuriClass found {self.samples_within_error_bound} sample(s) within the error bound of {self.error_bound} of the closest sample"
            )
            self.qc_multiple_hits = (
                f"WARN: {self.samples_within_error_bound} sample(s) within error bound"
            )

    def save_report(self) -> None:
        """
        Save the report to a TSV file.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_decision: the QC decision based on the QC attributes

        The function uses the following attributes of the object:
        - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold
        - qc_other_candida: a warning message if the closest sample is defined as outgroup
        - qc_genome_size: a warning message if the estimated genome size is outside the expected range
        - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0
        - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance
        - name: the name of the current test sample
        - clade: the clade of the closest reference sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - output_report_path: the path to the output report
        - no_qc: a boolean indicating whether to skip extended QC
        """
        FAIL_metrics = [
            "qc_species",
            "qc_other_candida",
        ]
        WARN_metrics = [
            "qc_genome_size",
            "qc_multiple_hits",
            "qc_high_distance",
        ]
        if self.no_qc:
            # Set all attributes in WARN_metrics to "SKIPPED"
            for metric in WARN_metrics:
                setattr(self, metric, "SKIPPED")

        # Check if any of the qc attributes contain "WARN"
        FAIL_observed = any(
            ["FAIL" in getattr(self, qc_metric) for qc_metric in FAIL_metrics]
        )
        WARN_observed = any(
            ["WARN" in getattr(self, qc_metric) for qc_metric in WARN_metrics]
        )

        if FAIL_observed:
            self.qc_decision = "FAIL"
        elif WARN_observed:
            self.qc_decision = "WARN"
        else:
            self.qc_decision = "PASS"

        # if any(
        #     [
        #         ("FAIL" in qc_metric) for qc_metric in [self.qc_species],
        #     ]
        # ):
        #     self.qc_decision = "FAIL"
        # elif any(
        #     [
        #         self.qc_genome_size,
        #         self.qc_other_candida,
        #         self.qc_multiple_hits,
        #         self.qc_high_distance,
        #     ]
        # ):
        #     self.qc_decision = "WARN"
        # else:
        #     self.qc_decision = "PASS"

        pd.DataFrame(
            [
                [
                    self.name,
                    self.clade,
                    self.minimal_distance,
                    self.qc_decision,
                    self.qc_species,
                    self.qc_other_candida,
                    self.qc_genome_size,
                    self.qc_multiple_hits,
                    self.qc_high_distance,
                ]
            ],
            columns=[
                "Sample",
                "Clade",
                "Mash_distance_from_closest_reference",
                "QC_decision",
                "QC_species",
                "QC_other_Candida",
                "QC_genome_size",
                "QC_multiple_hits",
                "QC_high_distance",
            ],
        ).replace("", "PASS").to_csv(self.output_report_path, sep="\t", index=False)

`check_for_outgroup()`

Check if the closest sample is defined as outgroup.

Parameters

self : object The AuriClassAnalysis object.

Returns

bool True if the closest sample is not an outgroup, False otherwise.

Notes

This function sets the following attributes of the object: - qc_other_candida: a warning message if the closest sample is defined as outgroup

The function uses the following attributes of the object: - clade: the clade of the closest reference sample - closest_sample: the closest reference sample to the current test sample

Source code in auriclass/classes.py

def check_for_outgroup(self) -> bool:
    """
    Check if the closest sample is defined as outgroup.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    bool
        True if the closest sample is not an outgroup, False otherwise.

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_other_candida: a warning message if the closest sample is defined as outgroup

    The function uses the following attributes of the object:
    - clade: the clade of the closest reference sample
    - closest_sample: the closest reference sample to the current test sample
    """
    if self.clade == "outgroup":
        logging.warning(
            f"AuriClass found a non-Candida auris reference as closest sample, please ensure this is Candida auris"
        )
        self.qc_other_candida = (
            f"FAIL: outgroup reference {self.closest_sample} as closest sample"
        )
        return False
    else:
        return True

`check_genome_size()`

Compare the estimated genome size with the expected genome size range.

If the estimated genome size is outside the expected range, a warning message is logged and the qc_genome_size attribute is set to "WARN: genome size outside expected range".

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_genome_size: a warning message if the estimated genome size is outside the expected range

The function uses the following attributes of the object: - estimated_genome_size: the estimated genome size of the current sample

Source code in auriclass/classes.py

def check_genome_size(self) -> None:
    """
    Compare the estimated genome size with the expected genome size range.

    If the estimated genome size is outside the expected range, a warning message is logged and the
    `qc_genome_size` attribute is set to "WARN: genome size outside expected range".

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_genome_size: a warning message if the estimated genome size is outside the expected range

    The function uses the following attributes of the object:
    - estimated_genome_size: the estimated genome size of the current sample
    """
    if (
        self.estimated_genome_size < self.genome_size_range[0]
        or self.estimated_genome_size > self.genome_size_range[1]
    ):
        logging.warning(
            f"AuriClass estimated genome size of {self.estimated_genome_size} is outside the expected range of {self.genome_size_range}"
        )
        self.qc_genome_size = "WARN: genome size outside expected range"

`check_high_dist()`

Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

The function uses the following attributes of the object: - minimal_distance: the minimal distance between the current test sample and the closest reference sample - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade

Source code in auriclass/classes.py

def check_high_dist(self) -> None:
    """
    Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

    The function uses the following attributes of the object:
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade
    """
    if self.minimal_distance > self.high_dist_threshold:
        logging.warning(
            f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
        )
        self.qc_high_distance = f"WARN: distance {self.minimal_distance} to closest sample is above threshold"

`check_non_candida()`

Check if the distance to the closest sample is higher than the non-Candida threshold.

Parameters

self : object The AuriClassAnalysis object.

Returns

bool True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

Notes

This function sets the following attributes of the object: - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

The function uses the following attributes of the object: - minimal_distance: the minimal distance between the current test sample and the closest reference sample - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida

Source code in auriclass/classes.py

def check_non_candida(self) -> bool:
    """
    Check if the distance to the closest sample is higher than the non-Candida threshold.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    bool
        True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

    The function uses the following attributes of the object:
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida
    """
    if self.minimal_distance > self.non_candida_threshold:
        logging.warning(
            f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
        )
        self.qc_species = f"FAIL: distance {self.minimal_distance} to closest sample is above threshold"
        return False
    else:
        return True

`compare_with_error_bounds()`

Compare the distance between the current test sample and the closest reference sample with the error bound.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - distances: the distances between the current test sample and all other reference samples - samples_within_error_bound: the number of samples within the error bound of the closest sample - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

The function uses the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation - closest_sample: the closest reference sample to the current test sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample - error_bound: the error bound for the current sketch size and kmer size

Source code in auriclass/classes.py

def compare_with_error_bounds(self) -> None:
    """
    Compare the distance between the current test sample and the closest reference sample with the error bound.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - distances: the distances between the current test sample and all other reference samples
    - samples_within_error_bound: the number of samples within the error bound of the closest sample
    - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

    The function uses the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
    - closest_sample: the closest reference sample to the current test sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - error_bound: the error bound for the current sketch size and kmer size
    """
    # Get distance between highest hit and other samples
    dist_array = self.mash_output.loc[
        self.mash_output["Clade"] != self.clade, "Distance"
    ].values
    self.distances = [float(distance) for distance in dist_array]
    # Check if distance is higher than error_bound
    count_above_threshold = 0
    for distance in self.distances:
        if distance < (self.minimal_distance + self.error_bound):
            logging.debug(
                add_tag(
                    "compare_with_error_bounds",
                    f"distance {distance} is above threshold: ({self.minimal_distance} + {self.error_bound})",
                )
            )
            count_above_threshold += 1
        else:
            logging.debug(
                add_tag(
                    "compare_with_error_bounds",
                    f"distance {distance} is below threshold: ({self.minimal_distance} + {self.error_bound})",
                )
            )
    self.samples_within_error_bound = count_above_threshold
    # If samples are found within error bound, warn user
    if self.samples_within_error_bound > 0:
        logging.warning(
            f"AuriClass found {self.samples_within_error_bound} sample(s) within the error bound of {self.error_bound} of the closest sample"
        )
        self.qc_multiple_hits = (
            f"WARN: {self.samples_within_error_bound} sample(s) within error bound"
        )

`get_error_bounds()`

Get error bounds for the current sketch size and kmer size.

Parameters

self : object The AuriClassAnalysis object.

Returns

str The STDOUT of the mash bounds command.

Notes

The function uses the following attributes of the object: - kmer_size: the kmer size to use for the sketch - probability: the probability to use for the mash bounds command

Source code in auriclass/classes.py

def get_error_bounds(self) -> str:
    """
    Get error bounds for the current sketch size and kmer size.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    str
        The STDOUT of the mash bounds command.

    Notes
    -----
    The function uses the following attributes of the object:
    - kmer_size: the kmer size to use for the sketch
    - probability: the probability to use for the mash bounds command
    """
    command = [
        "mash",
        "bounds",
        "-k",
        str(self.kmer_size),
        "-p",
        str(self.probability),
    ]
    logging.info(add_tag("mash bounds", " ".join(command)))
    output = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash bounds", line))

    return output.stdout.decode("utf-8")

`process_error_bounds(error_bounds_text)`

Process the error bounds text to get the error bound for the current sketch size and kmer size.

Parameters

self : object The AuriClassAnalysis object. error_bounds_text : str The STDOUT of the mash bounds command.

Returns

None

Notes

This function sets the following attributes of the object: - error_bound: the error bound for the current sketch size and kmer size

The function uses the following attributes of the object: - sketch_size: the size of the sketch to create - minimal_distance: the minimal distance between the current test sample and the closest reference sample

Source code in auriclass/classes.py

def process_error_bounds(self, error_bounds_text: str) -> None:
    """
    Process the error bounds text to get the error bound for the current sketch size and kmer size.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.
    error_bounds_text : str
        The STDOUT of the mash bounds command.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - error_bound: the error bound for the current sketch size and kmer size


    The function uses the following attributes of the object:
    - sketch_size: the size of the sketch to create
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    """
    # Split text into lines
    lines = error_bounds_text.splitlines()
    # Find line with "Mash distance"
    for i, line in enumerate(lines):
        if "Mash distance" in line:
            start_line = i + 1
            break
    # Find line with "Screen distance"
    for i, line in enumerate(lines):
        if "Screen distance" in line:
            end_line = i
            break
    # Read lines into dataframe
    df = pd.read_csv(
        StringIO("\n".join(lines[start_line:end_line])),
        sep="\t",
    )
    # Get first column of which column name is higher than min_dist
    for col in df.columns[1:]:
        if float(col) > self.minimal_distance:
            selected_col = str(col)
            break
    self.error_bound = df.loc[
        df["Sketch"] == self.sketch_size, selected_col
    ].values[0]

`run_mash_dist()`

Run Mash dist and save results.

Parameters

self : object The AuriClassAnalysis object.

Returns

pandas.DataFrame A DataFrame containing the results of the Mash dist, including the reference and query sequences, the distance between them, the p-value, the number of matching hashes, and the predicted clade.

Notes

This function sets the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

The function uses the following attributes of the object: - reference_sketch_path: the path to the reference sketch - query_sketch_path: the path to the query sketch - clade_dict: a dictionary containing the clade information for each reference sample

Source code in auriclass/classes.py

def run_mash_dist(self) -> pd.DataFrame:
    """
    Run Mash dist and save results.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the results of the Mash dist, including the reference and query sequences,
        the distance between them, the p-value, the number of matching hashes, and the predicted clade.

    Notes
    -----
    This function sets the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

    The function uses the following attributes of the object:
    - reference_sketch_path: the path to the reference sketch
    - query_sketch_path: the path to the query sketch
    - clade_dict: a dictionary containing the clade information for each reference sample
    """
    command = [
        "mash",
        "dist",
        self.reference_sketch_path,
        self.query_sketch_path,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash dist", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Append stdout and stderr to object
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash dist", line))
    # Read output into pandas dataframe
    df = pd.read_csv(
        StringIO(output.stdout.decode("utf-8")),
        sep="\t",
        header=None,
        names=["Reference", "Query", "Distance", "P-value", "Matching-hashes"],
    )
    df["Clade"] = df["Reference"].map(self.clade_dict["clade"])
    self.mash_output = df
    return df

`save_report()`

Save the report to a TSV file.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_decision: the QC decision based on the QC attributes

The function uses the following attributes of the object: - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold - qc_other_candida: a warning message if the closest sample is defined as outgroup - qc_genome_size: a warning message if the estimated genome size is outside the expected range - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0 - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance - name: the name of the current test sample - clade: the clade of the closest reference sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample - output_report_path: the path to the output report - no_qc: a boolean indicating whether to skip extended QC

Source code in auriclass/classes.py

def save_report(self) -> None:
    """
    Save the report to a TSV file.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_decision: the QC decision based on the QC attributes

    The function uses the following attributes of the object:
    - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold
    - qc_other_candida: a warning message if the closest sample is defined as outgroup
    - qc_genome_size: a warning message if the estimated genome size is outside the expected range
    - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0
    - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance
    - name: the name of the current test sample
    - clade: the clade of the closest reference sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - output_report_path: the path to the output report
    - no_qc: a boolean indicating whether to skip extended QC
    """
    FAIL_metrics = [
        "qc_species",
        "qc_other_candida",
    ]
    WARN_metrics = [
        "qc_genome_size",
        "qc_multiple_hits",
        "qc_high_distance",
    ]
    if self.no_qc:
        # Set all attributes in WARN_metrics to "SKIPPED"
        for metric in WARN_metrics:
            setattr(self, metric, "SKIPPED")

    # Check if any of the qc attributes contain "WARN"
    FAIL_observed = any(
        ["FAIL" in getattr(self, qc_metric) for qc_metric in FAIL_metrics]
    )
    WARN_observed = any(
        ["WARN" in getattr(self, qc_metric) for qc_metric in WARN_metrics]
    )

    if FAIL_observed:
        self.qc_decision = "FAIL"
    elif WARN_observed:
        self.qc_decision = "WARN"
    else:
        self.qc_decision = "PASS"

    # if any(
    #     [
    #         ("FAIL" in qc_metric) for qc_metric in [self.qc_species],
    #     ]
    # ):
    #     self.qc_decision = "FAIL"
    # elif any(
    #     [
    #         self.qc_genome_size,
    #         self.qc_other_candida,
    #         self.qc_multiple_hits,
    #         self.qc_high_distance,
    #     ]
    # ):
    #     self.qc_decision = "WARN"
    # else:
    #     self.qc_decision = "PASS"

    pd.DataFrame(
        [
            [
                self.name,
                self.clade,
                self.minimal_distance,
                self.qc_decision,
                self.qc_species,
                self.qc_other_candida,
                self.qc_genome_size,
                self.qc_multiple_hits,
                self.qc_high_distance,
            ]
        ],
        columns=[
            "Sample",
            "Clade",
            "Mash_distance_from_closest_reference",
            "QC_decision",
            "QC_species",
            "QC_other_Candida",
            "QC_genome_size",
            "QC_multiple_hits",
            "QC_high_distance",
        ],
    ).replace("", "PASS").to_csv(self.output_report_path, sep="\t", index=False)

`select_clade()`

Selects the closest reference sample and corresponding clade.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - closest_sample: the closest reference sample to the current test sample - clade: the clade of the closest reference sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample

The function uses the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation - clade_dict: a dictionary containing the clade information for each reference sample

Source code in auriclass/classes.py

def select_clade(self) -> None:
    """
    Selects the closest reference sample and corresponding clade.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - closest_sample: the closest reference sample to the current test sample
    - clade: the clade of the closest reference sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample

    The function uses the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
    - clade_dict: a dictionary containing the clade information for each reference sample
    """
    # Get highest hit, by sorting on distance and selecting first hit
    self.closest_sample = self.mash_output.sort_values("Distance").iloc[0][
        "Reference"
    ]
    # Get clade of closest sample
    self.clade = self.mash_output.loc[
        self.mash_output["Reference"] == self.closest_sample, "Clade"
    ].values[0]
    # Get minimal distance using self.mash_output and closest sample
    self.minimal_distance = self.mash_output.loc[
        self.mash_output["Reference"] == self.closest_sample, "Distance"
    ].values[0]

The `FastqAuriclass` class

Bases: BasicAuriclass

Source code in auriclass/classes.py

class FastqAuriclass(BasicAuriclass):
    def sketch_fastq_query(self) -> None:
        """
        Sketches the query fastq files using mash.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no sequence records are found in the specified fastq file(s).

            If the estimated genome size cannot be parsed from the mash sketch STDERR.

        Notes
        -----
        This function sets the following attributes of the object:
        - stdout: the STDOUT of the mash sketch command
        - estimated_genome_size: the estimated genome size of the current test sample


        The function uses the following attributes of the object:
        - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
        - query_sketch_path: the path to the query sketch
        - kmer_size: the kmer size to use for the sketch
        - sketch_size: the size of the sketch to create
        - read_paths: the paths to the fastq files to sketch
        """
        command = [
            "mash",
            "sketch",
            "-r",
            "-m",
            str(self.minimal_kmer_coverage),
            "-o",
            self.query_sketch_path,
            "-k",
            str(self.kmer_size),
            "-s",
            str(self.sketch_size),
            *self.read_paths,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
            raise ValueError(
                f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
            )
        # Add stdout and stderr to object
        self.stdout = output.stdout.decode("utf-8")
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash sketch", line))

        # find line with "Estimated genome size" and get value
        for line in stderr.splitlines():
            if "Estimated genome size" in line:
                self.estimated_genome_size = float(line.split()[-1])
                break
        else:
            raise ValueError(
                "Estimated genome size could not be parsed from mash sketch STDERR"
            )

    def run(self) -> None:
        # Sketch query genome using tempfile
        with tempfile.TemporaryDirectory() as tmpdir:
            self.query_sketch_path = Path(tmpdir).joinpath("tmpfile.msh")
            self.sketch_fastq_query()

            # Run mash screen
            self.run_mash_dist()

        # Check if genome size is within expected range
        self.check_genome_size()

        # Process results
        self.select_clade()

        # Check if all samples not above certain threshold indicating other species
        probably_candida = self.check_non_candida()

        if probably_candida:
            # Check if clade is not "outgroup"
            probably_cauris = self.check_for_outgroup()

            if probably_cauris:
                if self.no_qc == False:
                    # Check if closest sample is above self.high_dist_threshold distance --> new clade?
                    self.check_high_dist()

                    # Check error bounds and check number of samples within error bounds
                    error_bounds_text = self.get_error_bounds()
                    self.process_error_bounds(error_bounds_text)
                    self.compare_with_error_bounds()
            else:
                self.clade = "other Candida/CUG-Ser1 clade sp."
        else:
            self.clade = "not Candida auris"
            self.qc_other_candida = "SKIPPED"
            self.qc_genome_size = "SKIPPED"
            self.qc_multiple_hits = "SKIPPED"
            self.qc_high_distance = "SKIPPED"

        # Save report
        self.save_report()

`sketch_fastq_query()`

Sketches the query fastq files using mash.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Raises

ValueError If no sequence records are found in the specified fastq file(s).

If the estimated genome size cannot be parsed from the mash sketch STDERR.

Notes

This function sets the following attributes of the object: - stdout: the STDOUT of the mash sketch command - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch - query_sketch_path: the path to the query sketch - kmer_size: the kmer size to use for the sketch - sketch_size: the size of the sketch to create - read_paths: the paths to the fastq files to sketch

Source code in auriclass/classes.py

def sketch_fastq_query(self) -> None:
    """
    Sketches the query fastq files using mash.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no sequence records are found in the specified fastq file(s).

        If the estimated genome size cannot be parsed from the mash sketch STDERR.

    Notes
    -----
    This function sets the following attributes of the object:
    - stdout: the STDOUT of the mash sketch command
    - estimated_genome_size: the estimated genome size of the current test sample


    The function uses the following attributes of the object:
    - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
    - query_sketch_path: the path to the query sketch
    - kmer_size: the kmer size to use for the sketch
    - sketch_size: the size of the sketch to create
    - read_paths: the paths to the fastq files to sketch
    """
    command = [
        "mash",
        "sketch",
        "-r",
        "-m",
        str(self.minimal_kmer_coverage),
        "-o",
        self.query_sketch_path,
        "-k",
        str(self.kmer_size),
        "-s",
        str(self.sketch_size),
        *self.read_paths,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
        raise ValueError(
            f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
        )
    # Add stdout and stderr to object
    self.stdout = output.stdout.decode("utf-8")
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash sketch", line))

    # find line with "Estimated genome size" and get value
    for line in stderr.splitlines():
        if "Estimated genome size" in line:
            self.estimated_genome_size = float(line.split()[-1])
            break
    else:
        raise ValueError(
            "Estimated genome size could not be parsed from mash sketch STDERR"
        )

The `FastaAuriclass` class

Bases: BasicAuriclass

Source code in auriclass/classes.py

class FastaAuriclass(BasicAuriclass):
    def sketch_fasta_query(self) -> None:
        """
        Sketches the query fasta files using mash.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no sequence records are found in the specified fasta file(s).


        Notes
        -----
        This function sets the following attributes of the object:
        - stdout: the STDOUT of the mash sketch command
        - estimated_genome_size: the estimated genome size of the current test sample


        The function uses the following attributes of the object:
        - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
        - query_sketch_path: the path to the query sketch
        - kmer_size: the kmer size to use for the sketch
        - sketch_size: the size of the sketch to create
        - read_paths: the paths to the fastq files to sketch
        """
        command = [
            "mash",
            "sketch",
            "-o",
            self.query_sketch_path,
            "-k",
            str(self.kmer_size),
            "-s",
            str(self.sketch_size),
            *self.read_paths,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
            raise ValueError(
                f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
            )
        # Add stdout and stderr to object
        self.stdout = output.stdout.decode("utf-8")
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash sketch", line))

    def parse_genome_size(self) -> None:
        """
        Parse the size of the fasta input using pyfastx.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - estimated_genome_size: the estimated genome size of the current test sample

        The function uses the following attributes of the object:
        - read_paths: the paths to the fasta files to parse
        """
        total_size = 0
        for filepath in self.read_paths:
            # Reading a fasta file requires index building, so remove this afterwards
            total_size += pyfastx.Fasta(filepath, build_index=True).size
            Path(str(filepath) + ".fxi").unlink(missing_ok=True)
        self.estimated_genome_size = total_size

    def run(self) -> None:
        # Sketch query genome using tempfile
        with tempfile.TemporaryDirectory() as tmpdir:
            self.query_sketch_path = Path(tmpdir).joinpath("tmpfile.msh")
            self.sketch_fasta_query()

            # Run mash screen
            self.run_mash_dist()

        # Check if genome size is within expected range
        self.parse_genome_size()
        self.check_genome_size()

        # Process results
        self.select_clade()

        # Check if all samples not above certain threshold indicating other species
        probably_candida = self.check_non_candida()

        if probably_candida:
            # Check if clade is not "outgroup"
            probably_cauris = self.check_for_outgroup()
            if probably_cauris:
                if self.no_qc == False:
                    # Check if closest sample is above self.high_dist_threshold distance --> new clade?
                    self.check_high_dist()

                    # Check error bounds and check number of samples within error bounds
                    error_bounds_text = self.get_error_bounds()
                    self.process_error_bounds(error_bounds_text)
                    self.compare_with_error_bounds()
            else:
                self.clade = "other Candida/CUG-Ser1 clade sp."
        else:
            self.clade = "not Candida auris"
            self.qc_other_candida = "SKIPPED"
            self.qc_genome_size = "SKIPPED"
            self.qc_multiple_hits = "SKIPPED"
            self.qc_high_distance = "SKIPPED"

        # Save report
        self.save_report()

`parse_genome_size()`

Parse the size of the fasta input using pyfastx.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - read_paths: the paths to the fasta files to parse

Source code in auriclass/classes.py

def parse_genome_size(self) -> None:
    """
    Parse the size of the fasta input using pyfastx.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - estimated_genome_size: the estimated genome size of the current test sample

    The function uses the following attributes of the object:
    - read_paths: the paths to the fasta files to parse
    """
    total_size = 0
    for filepath in self.read_paths:
        # Reading a fasta file requires index building, so remove this afterwards
        total_size += pyfastx.Fasta(filepath, build_index=True).size
        Path(str(filepath) + ".fxi").unlink(missing_ok=True)
    self.estimated_genome_size = total_size

`sketch_fasta_query()`

Sketches the query fasta files using mash.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Raises

ValueError If no sequence records are found in the specified fasta file(s).

Notes

This function sets the following attributes of the object: - stdout: the STDOUT of the mash sketch command - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch - query_sketch_path: the path to the query sketch - kmer_size: the kmer size to use for the sketch - sketch_size: the size of the sketch to create - read_paths: the paths to the fastq files to sketch

Source code in auriclass/classes.py

def sketch_fasta_query(self) -> None:
    """
    Sketches the query fasta files using mash.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no sequence records are found in the specified fasta file(s).


    Notes
    -----
    This function sets the following attributes of the object:
    - stdout: the STDOUT of the mash sketch command
    - estimated_genome_size: the estimated genome size of the current test sample


    The function uses the following attributes of the object:
    - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
    - query_sketch_path: the path to the query sketch
    - kmer_size: the kmer size to use for the sketch
    - sketch_size: the size of the sketch to create
    - read_paths: the paths to the fastq files to sketch
    """
    command = [
        "mash",
        "sketch",
        "-o",
        self.query_sketch_path,
        "-k",
        str(self.kmer_size),
        "-s",
        str(self.sketch_size),
        *self.read_paths,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
        raise ValueError(
            f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
        )
    # Add stdout and stderr to object
    self.stdout = output.stdout.decode("utf-8")
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash sketch", line))

General scripts

`add_tag(tag, lines)`

Add a tag to a string or list of strings.

Parameters

tag : str The tag to add. lines : str or list of str The string or list of strings to add the tag to.

Returns

str The string with the tag added to each line.

Source code in auriclass/general.py

def add_tag(tag: str, lines: str) -> str:
    """
    Add a tag to a string or list of strings.

    Parameters
    ----------
    tag : str
        The tag to add.
    lines : str or list of str
        The string or list of strings to add the tag to.

    Returns
    -------
    str
        The string with the tag added to each line.
    """
    full_tag = f"[{tag}]"
    if lines:
        return "\n".join(
            [f"{full_tag} {line}" for line in lines.split("\n") if line != ""]
        )
    else:
        return f"{full_tag}"

`check_dependencies()`

Check if dependencies are installed.

Parameters

None

Returns

None

Raises

FileNotFoundError If mash is not installed.

Source code in auriclass/general.py

def check_dependencies() -> None:
    """
    Check if dependencies are installed.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If mash is not installed.
    """
    try:
        subprocess.call(
            ["mash", "-h"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        raise FileNotFoundError("Mash is not installed")

`check_number_within_range(minimum=0, maximum=1)`

Creates a function to check whether a numeric value is within a range, inclusive.

The generated function can be used by the type parameter in argparse.ArgumentParser. See https://stackoverflow.com/a/53446178.

Parameters:

Name	Type	Description	Default
`value`		the numeric value to check.	required
`minimum`	`float`	minimum of allowed range, inclusive.	`0`
`maximum`	`float`	maximum of allowed range, inclusive.	`1`

Returns:

Type	Description
`Union[Callable[[str], str], FileType]`	A function which takes a single argument and checks this against the range.

Raises:

Type	Description
`ArgumentTypeError`	if the value is outside the range.
`ValueError`	if the value cannot be converted to float.

Source code in auriclass/general.py

def check_number_within_range(
    minimum: float = 0, maximum: float = 1
) -> Union[Callable[[str], str], argparse.FileType]:
    """
    Creates a function to check whether a numeric value is within a range, inclusive.

    The generated function can be used by the `type` parameter in argparse.ArgumentParser.
    See https://stackoverflow.com/a/53446178.

    Args:
        value: the numeric value to check.
        minimum: minimum of allowed range, inclusive.
        maximum: maximum of allowed range, inclusive.

    Returns:
        A function which takes a single argument and checks this against the range.

    Raises:
        argparse.ArgumentTypeError: if the value is outside the range.
        ValueError: if the value cannot be converted to float.
    """

    def generated_func_check_range(value: str) -> str:
        value_f = float(value)
        if (value_f < minimum) or (value_f > maximum):
            raise argparse.ArgumentTypeError(
                f"Supplied value {value} is not within expected range {minimum} to {maximum}."
            )
        return str(value)

    return generated_func_check_range

`confirm_input_type(list_of_file_paths, input_type)`

Confirms that input files are of the specified type.

Parameters

list_of_file_paths : list of str The list of filepaths to check. input_type : str The input type to check.

Returns

None

Raises

None

Notes

This function does not raise any exceptions, but it does log a warning if any of the input files are not of the specified type. This is intended to run if the user specifies the input type with --fastq or --fasta.

Source code in auriclass/general.py

def confirm_input_type(list_of_file_paths: List[str], input_type: str) -> None:
    """
    Confirms that input files are of the specified type.

    Parameters
    ----------
    list_of_file_paths : list of str
        The list of filepaths to check.
    input_type : str
        The input type to check.

    Returns
    -------
    None

    Raises
    ------
    None

    Notes
    -----
    This function does not raise any exceptions, but it does log a warning if any of the input files are not of the specified type.
    This is intended to run if the user specifies the input type with --fastq or --fasta.
    """
    for file_path in list_of_file_paths:
        if input_type == "fastq":
            if not is_fastq(file_path):
                logging.warning(
                    f"Input file {file_path} cannot be parsed as a fastq file, please check if --fastq is appropriate"
                )
        elif input_type == "fasta":
            if not is_fasta(file_path):
                logging.warning(
                    f"Input file {file_path} cannot be parsed as a fasta file, please check if --fasta is appropriate"
                )

`guess_input_type(list_of_file_paths)`

Guess the input type of a list of files.

Parameters

list_of_file_paths : list of str The list of filepaths to check.

Returns

str The input type, either "fastq" or "fasta".

Raises

ValueError If any of the input files can be parsed as both fastq and fasta.

If any of the input files cannot be parsed as either fastq or fasta.

If the input files are a mix of fastq and fasta files.

If no input files were found.

Source code in auriclass/general.py

def guess_input_type(list_of_file_paths: List[str]) -> str:
    """
    Guess the input type of a list of files.

    Parameters
    ----------
    list_of_file_paths : list of str
        The list of filepaths to check.

    Returns
    -------
    str
        The input type, either "fastq" or "fasta".

    Raises
    ------
    ValueError
        If any of the input files can be parsed as both fastq and fasta.

        If any of the input files cannot be parsed as either fastq or fasta.

        If the input files are a mix of fastq and fasta files.

        If no input files were found.
    """
    fastq_count = 0
    fasta_count = 0
    for file_path in list_of_file_paths:
        if is_fastq(file_path):
            if is_fasta(file_path):
                raise ValueError(
                    f"Input file {file_path} can be parsed as both fastq and fasta. Please specify --fastq or --fasta"
                )
            else:
                fastq_count += 1
        elif is_fasta(file_path):
            fasta_count += 1
        else:
            raise ValueError(f"Input file {file_path} is not a fastq or fasta file")

    if fastq_count > 0 and fasta_count > 0:
        raise ValueError(f"Input files are a mix of fastq and fasta files")
    elif fastq_count > 0:
        return "fastq"
    elif fasta_count > 0:
        return "fasta"
    else:
        raise ValueError(f"No input files were found")

`is_fasta(file)`

Check if a file is a fasta file.

Parameters

filepath : str The path to the file to check.

Returns

None

Raises

RuntimeError If the file is not a fasta file.

Source code in auriclass/general.py

def is_fasta(file: str) -> bool:
    """
    Check if a file is a fasta file.

    Parameters
    ----------
    filepath : str
        The path to the file to check.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the file is not a fasta file.
    """
    try:
        pyfastx.Fasta(file, build_index=False)
        return True
    except RuntimeError:
        return False

`is_fastq(file)`

Check if a file is a fastq file.

Parameters

filepath : str The path to the file to check.

Returns

None

Raises

RuntimeError If the file is not a fastq file.

Source code in auriclass/general.py

def is_fastq(file: str) -> bool:
    """
    Check if a file is a fastq file.

    Parameters
    ----------
    filepath : str
        The path to the file to check.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the file is not a fastq file.
    """
    try:
        pyfastx.Fastq(file, build_index=False)
        return True
    except RuntimeError:
        return False

`validate_argument_logic(args)`

Check if arguments are valid, based on predefined logic.

Parameters

args : argparse.Namespace The arguments to check.

Returns

argparse.Namespace The arguments, if they are valid. Modified if necessary.

Raises

ValueError If the arguments are not valid.

Source code in auriclass/general.py

def validate_argument_logic(args: argparse.Namespace) -> argparse.Namespace:
    """
    Check if arguments are valid, based on predefined logic.

    Parameters
    ----------
    args : argparse.Namespace
        The arguments to check.

    Returns
    -------
    argparse.Namespace
        The arguments, if they are valid. Modified if necessary.

    Raises
    ------
    ValueError
        If the arguments are not valid.
    """
    # Convert to float
    args.expected_genome_size = [
        float(args.expected_genome_size[0]),
        float(args.expected_genome_size[1]),
    ]
    # Check if specified genome size range is valid
    if args.expected_genome_size[0] > args.expected_genome_size[1]:
        raise ValueError(
            "Expected genome size range is invalid: lower bound is higher than upper bound"
        )
    elif (args.expected_genome_size[0] < 100) & (args.expected_genome_size[1] < 100):
        logging.warning(
            f"Expected genome size range boundaries {args.expected_genome_size} are below 100: treating these as Mbp instead of bp"
        )
        args.expected_genome_size = [
            args.expected_genome_size[0] * 1_000_000,
            args.expected_genome_size[1] * 1_000_000,
        ]
    return args

`validate_input_files(list_of_files)`

Check if files in a list exist.

Parameters

list_of_files : list of str The list of filepaths to check.

Returns

None

Raises

FileNotFoundError If any of the files do not exist.

Source code in auriclass/general.py

def validate_input_files(list_of_files: List[str]) -> None:
    """
    Check if files in a list exist.

    Parameters
    ----------
    list_of_files : list of str
        The list of filepaths to check.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If any of the files do not exist.
    """
    for filepath in list_of_files:
        if not Path(filepath).exists():
            raise FileNotFoundError(f"Required input file {filepath} does not exist")

Reference

The BasicAuriclass class

check_for_outgroup()

Parameters

Returns

Notes

check_genome_size()

Parameters

Returns

Notes

check_high_dist()

Parameters

Returns

Notes

check_non_candida()

Parameters

Returns

Notes

compare_with_error_bounds()

Parameters

Returns

Notes

get_error_bounds()

Parameters

Returns

Notes

process_error_bounds(error_bounds_text)

Parameters

Returns

Notes

run_mash_dist()

Parameters

Returns

Notes

save_report()

Parameters

Returns

Notes

select_clade()

Parameters

Returns

Notes

The FastqAuriclass class

sketch_fastq_query()

Parameters

Returns

Raises

Notes

The FastaAuriclass class

parse_genome_size()

Parameters

Returns

Notes

sketch_fasta_query()

Parameters

Returns

Raises

Notes

General scripts

add_tag(tag, lines)

Parameters

Returns

check_dependencies()

Parameters

Returns

Raises

check_number_within_range(minimum=0, maximum=1)

confirm_input_type(list_of_file_paths, input_type)

Parameters

Returns

Raises

Notes

guess_input_type(list_of_file_paths)

Parameters

Returns

Raises

is_fasta(file)

Parameters

Returns

Raises

The `BasicAuriclass` class

`check_for_outgroup()`

`check_genome_size()`

`check_high_dist()`

`check_non_candida()`

`compare_with_error_bounds()`

`get_error_bounds()`

`process_error_bounds(error_bounds_text)`

`run_mash_dist()`

`save_report()`

`select_clade()`

The `FastqAuriclass` class

`sketch_fastq_query()`

The `FastaAuriclass` class

`parse_genome_size()`

`sketch_fasta_query()`

`add_tag(tag, lines)`

`check_dependencies()`

`check_number_within_range(minimum=0, maximum=1)`

`confirm_input_type(list_of_file_paths, input_type)`

`guess_input_type(list_of_file_paths)`

`is_fasta(file)`

`is_fastq(file)`

`validate_argument_logic(args)`

`validate_input_files(list_of_files)`