Skip to content

Reference

The BasicAuriclass class

Source code in auriclass/classes.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
class BasicAuriclass:
    def __init__(
        self,
        name: str,
        output_report_path: Path,
        read_paths: List[Path],
        reference_sketch_path: Path,
        kmer_size: int,
        sketch_size: int,
        minimal_kmer_coverage: int,
        clade_config_path: Path,
        genome_size_range: List[int],
        non_candida_threshold: float,
        high_dist_threshold: float,
        no_qc: bool,
    ) -> None:
        self.name: str = name
        self.output_report_path: Path = output_report_path
        self.read_paths: List[Path] = read_paths
        self.reference_sketch_path: Path = reference_sketch_path
        self.kmer_size: int = kmer_size
        self.sketch_size: int = sketch_size
        self.minimal_kmer_coverage: int = minimal_kmer_coverage
        # self.probability is used for mash bounds
        # this is not exposed in the CLI, because this should typically not be changed without extensive testing.
        self.probability: float = 0.99
        self.clade_dict: Dict[Hashable, Any] = pd.read_csv(
            clade_config_path,
            index_col=0,
            dtype=str,
        ).to_dict(orient="dict")
        self.genome_size_range: List[int] = genome_size_range
        self.non_candida_threshold: float = non_candida_threshold
        self.high_dist_threshold: float = high_dist_threshold
        self.no_qc: bool = no_qc
        self.qc_decision: str = ""
        self.qc_genome_size: str = ""
        self.qc_other_candida: str = ""
        self.qc_species: str = ""
        self.qc_multiple_hits: str = ""
        self.qc_high_distance: str = ""
        self.query_sketch_path: Path = Path()
        self.estimated_genome_size: float = float()
        self.minimal_distance: float = float()
        self.clade: str = ""
        self.samples_within_error_bound: int = int()
        self.error_bound: float = float()
        self.stdout: str = ""
        self.stderr: str = ""
        self.mash_output: pd.DataFrame = pd.DataFrame()
        self.distances: List[float] = [float()]

    def run_mash_dist(self) -> pd.DataFrame:
        """
        Run Mash dist and save results.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        pandas.DataFrame
            A DataFrame containing the results of the Mash dist, including the reference and query sequences,
            the distance between them, the p-value, the number of matching hashes, and the predicted clade.

        Notes
        -----
        This function sets the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

        The function uses the following attributes of the object:
        - reference_sketch_path: the path to the reference sketch
        - query_sketch_path: the path to the query sketch
        - clade_dict: a dictionary containing the clade information for each reference sample
        """
        command = [
            "mash",
            "dist",
            self.reference_sketch_path,
            self.query_sketch_path,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash dist", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Append stdout and stderr to object
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash dist", line))
        # Read output into pandas dataframe
        df = pd.read_csv(
            StringIO(output.stdout.decode("utf-8")),
            sep="\t",
            header=None,
            names=["Reference", "Query", "Distance", "P-value", "Matching-hashes"],
        )
        df["Clade"] = df["Reference"].map(self.clade_dict["clade"])
        self.mash_output = df
        return df

    def check_genome_size(self) -> None:
        """
        Compare the estimated genome size with the expected genome size range.

        If the estimated genome size is outside the expected range, a warning message is logged and the
        `qc_genome_size` attribute is set to "WARN: genome size outside expected range".

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_genome_size: a warning message if the estimated genome size is outside the expected range

        The function uses the following attributes of the object:
        - estimated_genome_size: the estimated genome size of the current sample
        """
        if (
            self.estimated_genome_size < self.genome_size_range[0]
            or self.estimated_genome_size > self.genome_size_range[1]
        ):
            logging.warning(
                f"AuriClass estimated genome size of {self.estimated_genome_size} is outside the expected range of {self.genome_size_range}"
            )
            self.qc_genome_size = "WARN: genome size outside expected range"

    def select_clade(self) -> None:
        """
        Selects the closest reference sample and corresponding clade.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - closest_sample: the closest reference sample to the current test sample
        - clade: the clade of the closest reference sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample

        The function uses the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
        - clade_dict: a dictionary containing the clade information for each reference sample
        """
        # Get highest hit, by sorting on distance and selecting first hit
        self.closest_sample = self.mash_output.sort_values("Distance").iloc[0][
            "Reference"
        ]
        # Get clade of closest sample
        self.clade = self.mash_output.loc[
            self.mash_output["Reference"] == self.closest_sample, "Clade"
        ].values[0]
        # Get minimal distance using self.mash_output and closest sample
        self.minimal_distance = self.mash_output.loc[
            self.mash_output["Reference"] == self.closest_sample, "Distance"
        ].values[0]

    def check_non_candida(self) -> bool:
        """
        Check if the distance to the closest sample is higher than the non-Candida threshold.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        bool
            True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

        The function uses the following attributes of the object:
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida
        """
        if self.minimal_distance > self.non_candida_threshold:
            logging.warning(
                f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
            )
            self.qc_species = f"FAIL: distance {self.minimal_distance} to closest sample is above threshold"
            return False
        else:
            return True

    def check_for_outgroup(self) -> bool:
        """
        Check if the closest sample is defined as outgroup.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        bool
            True if the closest sample is not an outgroup, False otherwise.

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_other_candida: a warning message if the closest sample is defined as outgroup

        The function uses the following attributes of the object:
        - clade: the clade of the closest reference sample
        - closest_sample: the closest reference sample to the current test sample
        """
        if self.clade == "outgroup":
            logging.warning(
                f"AuriClass found a non-Candida auris reference as closest sample, please ensure this is Candida auris"
            )
            self.qc_other_candida = (
                f"FAIL: outgroup reference {self.closest_sample} as closest sample"
            )
            return False
        else:
            return True

    def check_high_dist(self) -> None:
        """
        Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

        The function uses the following attributes of the object:
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade
        """
        if self.minimal_distance > self.high_dist_threshold:
            logging.warning(
                f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
            )
            self.qc_high_distance = f"WARN: distance {self.minimal_distance} to closest sample is above threshold"

    def get_error_bounds(self) -> str:
        """
        Get error bounds for the current sketch size and kmer size.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        str
            The STDOUT of the mash bounds command.

        Notes
        -----
        The function uses the following attributes of the object:
        - kmer_size: the kmer size to use for the sketch
        - probability: the probability to use for the mash bounds command
        """
        command = [
            "mash",
            "bounds",
            "-k",
            str(self.kmer_size),
            "-p",
            str(self.probability),
        ]
        logging.info(add_tag("mash bounds", " ".join(command)))
        output = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash bounds", line))

        return output.stdout.decode("utf-8")

    def process_error_bounds(self, error_bounds_text: str) -> None:
        """
        Process the error bounds text to get the error bound for the current sketch size and kmer size.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.
        error_bounds_text : str
            The STDOUT of the mash bounds command.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - error_bound: the error bound for the current sketch size and kmer size


        The function uses the following attributes of the object:
        - sketch_size: the size of the sketch to create
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        """
        # Split text into lines
        lines = error_bounds_text.splitlines()
        # Find line with "Mash distance"
        for i, line in enumerate(lines):
            if "Mash distance" in line:
                start_line = i + 1
                break
        # Find line with "Screen distance"
        for i, line in enumerate(lines):
            if "Screen distance" in line:
                end_line = i
                break
        # Read lines into dataframe
        df = pd.read_csv(
            StringIO("\n".join(lines[start_line:end_line])),
            sep="\t",
        )
        # Get first column of which column name is higher than min_dist
        for col in df.columns[1:]:
            if float(col) > self.minimal_distance:
                selected_col = str(col)
                break
        self.error_bound = df.loc[
            df["Sketch"] == self.sketch_size, selected_col
        ].values[0]

    def compare_with_error_bounds(self) -> None:
        """
        Compare the distance between the current test sample and the closest reference sample with the error bound.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - distances: the distances between the current test sample and all other reference samples
        - samples_within_error_bound: the number of samples within the error bound of the closest sample
        - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

        The function uses the following attributes of the object:
        - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
        - closest_sample: the closest reference sample to the current test sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - error_bound: the error bound for the current sketch size and kmer size
        """
        # Get distance between highest hit and other samples
        dist_array = self.mash_output.loc[
            self.mash_output["Clade"] != self.clade, "Distance"
        ].values
        self.distances = [float(distance) for distance in dist_array]
        # Check if distance is higher than error_bound
        count_above_threshold = 0
        for distance in self.distances:
            if distance < (self.minimal_distance + self.error_bound):
                logging.debug(
                    add_tag(
                        "compare_with_error_bounds",
                        f"distance {distance} is above threshold: ({self.minimal_distance} + {self.error_bound})",
                    )
                )
                count_above_threshold += 1
            else:
                logging.debug(
                    add_tag(
                        "compare_with_error_bounds",
                        f"distance {distance} is below threshold: ({self.minimal_distance} + {self.error_bound})",
                    )
                )
        self.samples_within_error_bound = count_above_threshold
        # If samples are found within error bound, warn user
        if self.samples_within_error_bound > 0:
            logging.warning(
                f"AuriClass found {self.samples_within_error_bound} sample(s) within the error bound of {self.error_bound} of the closest sample"
            )
            self.qc_multiple_hits = (
                f"WARN: {self.samples_within_error_bound} sample(s) within error bound"
            )

    def save_report(self) -> None:
        """
        Save the report to a TSV file.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - qc_decision: the QC decision based on the QC attributes

        The function uses the following attributes of the object:
        - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold
        - qc_other_candida: a warning message if the closest sample is defined as outgroup
        - qc_genome_size: a warning message if the estimated genome size is outside the expected range
        - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0
        - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance
        - name: the name of the current test sample
        - clade: the clade of the closest reference sample
        - minimal_distance: the minimal distance between the current test sample and the closest reference sample
        - output_report_path: the path to the output report
        - no_qc: a boolean indicating whether to skip extended QC
        """
        FAIL_metrics = [
            "qc_species",
            "qc_other_candida",
        ]
        WARN_metrics = [
            "qc_genome_size",
            "qc_multiple_hits",
            "qc_high_distance",
        ]
        if self.no_qc:
            # Set all attributes in WARN_metrics to "SKIPPED"
            for metric in WARN_metrics:
                setattr(self, metric, "SKIPPED")

        # Check if any of the qc attributes contain "WARN"
        FAIL_observed = any(
            ["FAIL" in getattr(self, qc_metric) for qc_metric in FAIL_metrics]
        )
        WARN_observed = any(
            ["WARN" in getattr(self, qc_metric) for qc_metric in WARN_metrics]
        )

        if FAIL_observed:
            self.qc_decision = "FAIL"
        elif WARN_observed:
            self.qc_decision = "WARN"
        else:
            self.qc_decision = "PASS"

        # if any(
        #     [
        #         ("FAIL" in qc_metric) for qc_metric in [self.qc_species],
        #     ]
        # ):
        #     self.qc_decision = "FAIL"
        # elif any(
        #     [
        #         self.qc_genome_size,
        #         self.qc_other_candida,
        #         self.qc_multiple_hits,
        #         self.qc_high_distance,
        #     ]
        # ):
        #     self.qc_decision = "WARN"
        # else:
        #     self.qc_decision = "PASS"

        pd.DataFrame(
            [
                [
                    self.name,
                    self.clade,
                    self.minimal_distance,
                    self.qc_decision,
                    self.qc_species,
                    self.qc_other_candida,
                    self.qc_genome_size,
                    self.qc_multiple_hits,
                    self.qc_high_distance,
                ]
            ],
            columns=[
                "Sample",
                "Clade",
                "Mash_distance_from_closest_reference",
                "QC_decision",
                "QC_species",
                "QC_other_Candida",
                "QC_genome_size",
                "QC_multiple_hits",
                "QC_high_distance",
            ],
        ).replace("", "PASS").to_csv(self.output_report_path, sep="\t", index=False)

check_for_outgroup()

Check if the closest sample is defined as outgroup.

Parameters

self : object The AuriClassAnalysis object.

Returns

bool True if the closest sample is not an outgroup, False otherwise.

Notes

This function sets the following attributes of the object: - qc_other_candida: a warning message if the closest sample is defined as outgroup

The function uses the following attributes of the object: - clade: the clade of the closest reference sample - closest_sample: the closest reference sample to the current test sample

Source code in auriclass/classes.py
def check_for_outgroup(self) -> bool:
    """
    Check if the closest sample is defined as outgroup.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    bool
        True if the closest sample is not an outgroup, False otherwise.

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_other_candida: a warning message if the closest sample is defined as outgroup

    The function uses the following attributes of the object:
    - clade: the clade of the closest reference sample
    - closest_sample: the closest reference sample to the current test sample
    """
    if self.clade == "outgroup":
        logging.warning(
            f"AuriClass found a non-Candida auris reference as closest sample, please ensure this is Candida auris"
        )
        self.qc_other_candida = (
            f"FAIL: outgroup reference {self.closest_sample} as closest sample"
        )
        return False
    else:
        return True

check_genome_size()

Compare the estimated genome size with the expected genome size range.

If the estimated genome size is outside the expected range, a warning message is logged and the qc_genome_size attribute is set to "WARN: genome size outside expected range".

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_genome_size: a warning message if the estimated genome size is outside the expected range

The function uses the following attributes of the object: - estimated_genome_size: the estimated genome size of the current sample

Source code in auriclass/classes.py
def check_genome_size(self) -> None:
    """
    Compare the estimated genome size with the expected genome size range.

    If the estimated genome size is outside the expected range, a warning message is logged and the
    `qc_genome_size` attribute is set to "WARN: genome size outside expected range".

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_genome_size: a warning message if the estimated genome size is outside the expected range

    The function uses the following attributes of the object:
    - estimated_genome_size: the estimated genome size of the current sample
    """
    if (
        self.estimated_genome_size < self.genome_size_range[0]
        or self.estimated_genome_size > self.genome_size_range[1]
    ):
        logging.warning(
            f"AuriClass estimated genome size of {self.estimated_genome_size} is outside the expected range of {self.genome_size_range}"
        )
        self.qc_genome_size = "WARN: genome size outside expected range"

check_high_dist()

Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

The function uses the following attributes of the object: - minimal_distance: the minimal distance between the current test sample and the closest reference sample - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade

Source code in auriclass/classes.py
def check_high_dist(self) -> None:
    """
    Check if the closest sample is above high_dist_threshold distance. If it is, it may indicate a new clade.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance

    The function uses the following attributes of the object:
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - high_dist_threshold: the threshold above which the distance to the closest sample is considered to be a new clade
    """
    if self.minimal_distance > self.high_dist_threshold:
        logging.warning(
            f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
        )
        self.qc_high_distance = f"WARN: distance {self.minimal_distance} to closest sample is above threshold"

check_non_candida()

Check if the distance to the closest sample is higher than the non-Candida threshold.

Parameters

self : object The AuriClassAnalysis object.

Returns

bool True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

Notes

This function sets the following attributes of the object: - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

The function uses the following attributes of the object: - minimal_distance: the minimal distance between the current test sample and the closest reference sample - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida

Source code in auriclass/classes.py
def check_non_candida(self) -> bool:
    """
    Check if the distance to the closest sample is higher than the non-Candida threshold.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    bool
        True if the distance to the closest sample is lower than or equal to the non-Candida threshold, False otherwise.

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold

    The function uses the following attributes of the object:
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - non_candida_threshold: the threshold above which the distance to the closest sample is considered to be non-Candida
    """
    if self.minimal_distance > self.non_candida_threshold:
        logging.warning(
            f"AuriClass found a distance of {self.minimal_distance} to the closest sample, please ensure this is Candida auris"
        )
        self.qc_species = f"FAIL: distance {self.minimal_distance} to closest sample is above threshold"
        return False
    else:
        return True

compare_with_error_bounds()

Compare the distance between the current test sample and the closest reference sample with the error bound.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - distances: the distances between the current test sample and all other reference samples - samples_within_error_bound: the number of samples within the error bound of the closest sample - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

The function uses the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation - closest_sample: the closest reference sample to the current test sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample - error_bound: the error bound for the current sketch size and kmer size

Source code in auriclass/classes.py
def compare_with_error_bounds(self) -> None:
    """
    Compare the distance between the current test sample and the closest reference sample with the error bound.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - distances: the distances between the current test sample and all other reference samples
    - samples_within_error_bound: the number of samples within the error bound of the closest sample
    - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0

    The function uses the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
    - closest_sample: the closest reference sample to the current test sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - error_bound: the error bound for the current sketch size and kmer size
    """
    # Get distance between highest hit and other samples
    dist_array = self.mash_output.loc[
        self.mash_output["Clade"] != self.clade, "Distance"
    ].values
    self.distances = [float(distance) for distance in dist_array]
    # Check if distance is higher than error_bound
    count_above_threshold = 0
    for distance in self.distances:
        if distance < (self.minimal_distance + self.error_bound):
            logging.debug(
                add_tag(
                    "compare_with_error_bounds",
                    f"distance {distance} is above threshold: ({self.minimal_distance} + {self.error_bound})",
                )
            )
            count_above_threshold += 1
        else:
            logging.debug(
                add_tag(
                    "compare_with_error_bounds",
                    f"distance {distance} is below threshold: ({self.minimal_distance} + {self.error_bound})",
                )
            )
    self.samples_within_error_bound = count_above_threshold
    # If samples are found within error bound, warn user
    if self.samples_within_error_bound > 0:
        logging.warning(
            f"AuriClass found {self.samples_within_error_bound} sample(s) within the error bound of {self.error_bound} of the closest sample"
        )
        self.qc_multiple_hits = (
            f"WARN: {self.samples_within_error_bound} sample(s) within error bound"
        )

get_error_bounds()

Get error bounds for the current sketch size and kmer size.

Parameters

self : object The AuriClassAnalysis object.

Returns

str The STDOUT of the mash bounds command.

Notes

The function uses the following attributes of the object: - kmer_size: the kmer size to use for the sketch - probability: the probability to use for the mash bounds command

Source code in auriclass/classes.py
def get_error_bounds(self) -> str:
    """
    Get error bounds for the current sketch size and kmer size.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    str
        The STDOUT of the mash bounds command.

    Notes
    -----
    The function uses the following attributes of the object:
    - kmer_size: the kmer size to use for the sketch
    - probability: the probability to use for the mash bounds command
    """
    command = [
        "mash",
        "bounds",
        "-k",
        str(self.kmer_size),
        "-p",
        str(self.probability),
    ]
    logging.info(add_tag("mash bounds", " ".join(command)))
    output = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash bounds", line))

    return output.stdout.decode("utf-8")

process_error_bounds(error_bounds_text)

Process the error bounds text to get the error bound for the current sketch size and kmer size.

Parameters

self : object The AuriClassAnalysis object. error_bounds_text : str The STDOUT of the mash bounds command.

Returns

None

Notes

This function sets the following attributes of the object: - error_bound: the error bound for the current sketch size and kmer size

The function uses the following attributes of the object: - sketch_size: the size of the sketch to create - minimal_distance: the minimal distance between the current test sample and the closest reference sample

Source code in auriclass/classes.py
def process_error_bounds(self, error_bounds_text: str) -> None:
    """
    Process the error bounds text to get the error bound for the current sketch size and kmer size.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.
    error_bounds_text : str
        The STDOUT of the mash bounds command.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - error_bound: the error bound for the current sketch size and kmer size


    The function uses the following attributes of the object:
    - sketch_size: the size of the sketch to create
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    """
    # Split text into lines
    lines = error_bounds_text.splitlines()
    # Find line with "Mash distance"
    for i, line in enumerate(lines):
        if "Mash distance" in line:
            start_line = i + 1
            break
    # Find line with "Screen distance"
    for i, line in enumerate(lines):
        if "Screen distance" in line:
            end_line = i
            break
    # Read lines into dataframe
    df = pd.read_csv(
        StringIO("\n".join(lines[start_line:end_line])),
        sep="\t",
    )
    # Get first column of which column name is higher than min_dist
    for col in df.columns[1:]:
        if float(col) > self.minimal_distance:
            selected_col = str(col)
            break
    self.error_bound = df.loc[
        df["Sketch"] == self.sketch_size, selected_col
    ].values[0]

run_mash_dist()

Run Mash dist and save results.

Parameters

self : object The AuriClassAnalysis object.

Returns

pandas.DataFrame A DataFrame containing the results of the Mash dist, including the reference and query sequences, the distance between them, the p-value, the number of matching hashes, and the predicted clade.

Notes

This function sets the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

The function uses the following attributes of the object: - reference_sketch_path: the path to the reference sketch - query_sketch_path: the path to the query sketch - clade_dict: a dictionary containing the clade information for each reference sample

Source code in auriclass/classes.py
def run_mash_dist(self) -> pd.DataFrame:
    """
    Run Mash dist and save results.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the results of the Mash dist, including the reference and query sequences,
        the distance between them, the p-value, the number of matching hashes, and the predicted clade.

    Notes
    -----
    This function sets the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation

    The function uses the following attributes of the object:
    - reference_sketch_path: the path to the reference sketch
    - query_sketch_path: the path to the query sketch
    - clade_dict: a dictionary containing the clade information for each reference sample
    """
    command = [
        "mash",
        "dist",
        self.reference_sketch_path,
        self.query_sketch_path,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash dist", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Append stdout and stderr to object
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash dist", line))
    # Read output into pandas dataframe
    df = pd.read_csv(
        StringIO(output.stdout.decode("utf-8")),
        sep="\t",
        header=None,
        names=["Reference", "Query", "Distance", "P-value", "Matching-hashes"],
    )
    df["Clade"] = df["Reference"].map(self.clade_dict["clade"])
    self.mash_output = df
    return df

save_report()

Save the report to a TSV file.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - qc_decision: the QC decision based on the QC attributes

The function uses the following attributes of the object: - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold - qc_other_candida: a warning message if the closest sample is defined as outgroup - qc_genome_size: a warning message if the estimated genome size is outside the expected range - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0 - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance - name: the name of the current test sample - clade: the clade of the closest reference sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample - output_report_path: the path to the output report - no_qc: a boolean indicating whether to skip extended QC

Source code in auriclass/classes.py
def save_report(self) -> None:
    """
    Save the report to a TSV file.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - qc_decision: the QC decision based on the QC attributes

    The function uses the following attributes of the object:
    - qc_species: a warning message if the distance to the closest sample is higher than the non-Candida threshold
    - qc_other_candida: a warning message if the closest sample is defined as outgroup
    - qc_genome_size: a warning message if the estimated genome size is outside the expected range
    - qc_multiple_hits: a warning message if the number of samples within the error bound of the closest sample is higher than 0
    - qc_high_distance: a warning message if the closest sample is above self.high_dist_threshold distance
    - name: the name of the current test sample
    - clade: the clade of the closest reference sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample
    - output_report_path: the path to the output report
    - no_qc: a boolean indicating whether to skip extended QC
    """
    FAIL_metrics = [
        "qc_species",
        "qc_other_candida",
    ]
    WARN_metrics = [
        "qc_genome_size",
        "qc_multiple_hits",
        "qc_high_distance",
    ]
    if self.no_qc:
        # Set all attributes in WARN_metrics to "SKIPPED"
        for metric in WARN_metrics:
            setattr(self, metric, "SKIPPED")

    # Check if any of the qc attributes contain "WARN"
    FAIL_observed = any(
        ["FAIL" in getattr(self, qc_metric) for qc_metric in FAIL_metrics]
    )
    WARN_observed = any(
        ["WARN" in getattr(self, qc_metric) for qc_metric in WARN_metrics]
    )

    if FAIL_observed:
        self.qc_decision = "FAIL"
    elif WARN_observed:
        self.qc_decision = "WARN"
    else:
        self.qc_decision = "PASS"

    # if any(
    #     [
    #         ("FAIL" in qc_metric) for qc_metric in [self.qc_species],
    #     ]
    # ):
    #     self.qc_decision = "FAIL"
    # elif any(
    #     [
    #         self.qc_genome_size,
    #         self.qc_other_candida,
    #         self.qc_multiple_hits,
    #         self.qc_high_distance,
    #     ]
    # ):
    #     self.qc_decision = "WARN"
    # else:
    #     self.qc_decision = "PASS"

    pd.DataFrame(
        [
            [
                self.name,
                self.clade,
                self.minimal_distance,
                self.qc_decision,
                self.qc_species,
                self.qc_other_candida,
                self.qc_genome_size,
                self.qc_multiple_hits,
                self.qc_high_distance,
            ]
        ],
        columns=[
            "Sample",
            "Clade",
            "Mash_distance_from_closest_reference",
            "QC_decision",
            "QC_species",
            "QC_other_Candida",
            "QC_genome_size",
            "QC_multiple_hits",
            "QC_high_distance",
        ],
    ).replace("", "PASS").to_csv(self.output_report_path, sep="\t", index=False)

select_clade()

Selects the closest reference sample and corresponding clade.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - closest_sample: the closest reference sample to the current test sample - clade: the clade of the closest reference sample - minimal_distance: the minimal distance between the current test sample and the closest reference sample

The function uses the following attributes of the object: - mash_output: a pandas DataFrame containing the output of the Mash distance calculation - clade_dict: a dictionary containing the clade information for each reference sample

Source code in auriclass/classes.py
def select_clade(self) -> None:
    """
    Selects the closest reference sample and corresponding clade.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - closest_sample: the closest reference sample to the current test sample
    - clade: the clade of the closest reference sample
    - minimal_distance: the minimal distance between the current test sample and the closest reference sample

    The function uses the following attributes of the object:
    - mash_output: a pandas DataFrame containing the output of the Mash distance calculation
    - clade_dict: a dictionary containing the clade information for each reference sample
    """
    # Get highest hit, by sorting on distance and selecting first hit
    self.closest_sample = self.mash_output.sort_values("Distance").iloc[0][
        "Reference"
    ]
    # Get clade of closest sample
    self.clade = self.mash_output.loc[
        self.mash_output["Reference"] == self.closest_sample, "Clade"
    ].values[0]
    # Get minimal distance using self.mash_output and closest sample
    self.minimal_distance = self.mash_output.loc[
        self.mash_output["Reference"] == self.closest_sample, "Distance"
    ].values[0]

The FastqAuriclass class

Bases: BasicAuriclass

Source code in auriclass/classes.py
class FastqAuriclass(BasicAuriclass):
    def sketch_fastq_query(self) -> None:
        """
        Sketches the query fastq files using mash.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no sequence records are found in the specified fastq file(s).

            If the estimated genome size cannot be parsed from the mash sketch STDERR.

        Notes
        -----
        This function sets the following attributes of the object:
        - stdout: the STDOUT of the mash sketch command
        - estimated_genome_size: the estimated genome size of the current test sample


        The function uses the following attributes of the object:
        - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
        - query_sketch_path: the path to the query sketch
        - kmer_size: the kmer size to use for the sketch
        - sketch_size: the size of the sketch to create
        - read_paths: the paths to the fastq files to sketch
        """
        command = [
            "mash",
            "sketch",
            "-r",
            "-m",
            str(self.minimal_kmer_coverage),
            "-o",
            self.query_sketch_path,
            "-k",
            str(self.kmer_size),
            "-s",
            str(self.sketch_size),
            *self.read_paths,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
            raise ValueError(
                f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
            )
        # Add stdout and stderr to object
        self.stdout = output.stdout.decode("utf-8")
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash sketch", line))

        # find line with "Estimated genome size" and get value
        for line in stderr.splitlines():
            if "Estimated genome size" in line:
                self.estimated_genome_size = float(line.split()[-1])
                break
        else:
            raise ValueError(
                "Estimated genome size could not be parsed from mash sketch STDERR"
            )

    def run(self) -> None:
        # Sketch query genome using tempfile
        with tempfile.TemporaryDirectory() as tmpdir:
            self.query_sketch_path = Path(tmpdir).joinpath("tmpfile.msh")
            self.sketch_fastq_query()

            # Run mash screen
            self.run_mash_dist()

        # Check if genome size is within expected range
        self.check_genome_size()

        # Process results
        self.select_clade()

        # Check if all samples not above certain threshold indicating other species
        probably_candida = self.check_non_candida()

        if probably_candida:
            # Check if clade is not "outgroup"
            probably_cauris = self.check_for_outgroup()

            if probably_cauris:
                if self.no_qc == False:
                    # Check if closest sample is above self.high_dist_threshold distance --> new clade?
                    self.check_high_dist()

                    # Check error bounds and check number of samples within error bounds
                    error_bounds_text = self.get_error_bounds()
                    self.process_error_bounds(error_bounds_text)
                    self.compare_with_error_bounds()
            else:
                self.clade = "other Candida/CUG-Ser1 clade sp."
        else:
            self.clade = "not Candida auris"
            self.qc_other_candida = "SKIPPED"
            self.qc_genome_size = "SKIPPED"
            self.qc_multiple_hits = "SKIPPED"
            self.qc_high_distance = "SKIPPED"

        # Save report
        self.save_report()

sketch_fastq_query()

Sketches the query fastq files using mash.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Raises

ValueError If no sequence records are found in the specified fastq file(s).

If the estimated genome size cannot be parsed from the mash sketch STDERR.

Notes

This function sets the following attributes of the object: - stdout: the STDOUT of the mash sketch command - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch - query_sketch_path: the path to the query sketch - kmer_size: the kmer size to use for the sketch - sketch_size: the size of the sketch to create - read_paths: the paths to the fastq files to sketch

Source code in auriclass/classes.py
def sketch_fastq_query(self) -> None:
    """
    Sketches the query fastq files using mash.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no sequence records are found in the specified fastq file(s).

        If the estimated genome size cannot be parsed from the mash sketch STDERR.

    Notes
    -----
    This function sets the following attributes of the object:
    - stdout: the STDOUT of the mash sketch command
    - estimated_genome_size: the estimated genome size of the current test sample


    The function uses the following attributes of the object:
    - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
    - query_sketch_path: the path to the query sketch
    - kmer_size: the kmer size to use for the sketch
    - sketch_size: the size of the sketch to create
    - read_paths: the paths to the fastq files to sketch
    """
    command = [
        "mash",
        "sketch",
        "-r",
        "-m",
        str(self.minimal_kmer_coverage),
        "-o",
        self.query_sketch_path,
        "-k",
        str(self.kmer_size),
        "-s",
        str(self.sketch_size),
        *self.read_paths,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
        raise ValueError(
            f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
        )
    # Add stdout and stderr to object
    self.stdout = output.stdout.decode("utf-8")
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash sketch", line))

    # find line with "Estimated genome size" and get value
    for line in stderr.splitlines():
        if "Estimated genome size" in line:
            self.estimated_genome_size = float(line.split()[-1])
            break
    else:
        raise ValueError(
            "Estimated genome size could not be parsed from mash sketch STDERR"
        )

The FastaAuriclass class

Bases: BasicAuriclass

Source code in auriclass/classes.py
class FastaAuriclass(BasicAuriclass):
    def sketch_fasta_query(self) -> None:
        """
        Sketches the query fasta files using mash.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no sequence records are found in the specified fasta file(s).


        Notes
        -----
        This function sets the following attributes of the object:
        - stdout: the STDOUT of the mash sketch command
        - estimated_genome_size: the estimated genome size of the current test sample


        The function uses the following attributes of the object:
        - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
        - query_sketch_path: the path to the query sketch
        - kmer_size: the kmer size to use for the sketch
        - sketch_size: the size of the sketch to create
        - read_paths: the paths to the fastq files to sketch
        """
        command = [
            "mash",
            "sketch",
            "-o",
            self.query_sketch_path,
            "-k",
            str(self.kmer_size),
            "-s",
            str(self.sketch_size),
            *self.read_paths,
        ]
        command_list_of_str = [str(item) for item in command]
        logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
        output = subprocess.run(
            command_list_of_str,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
            raise ValueError(
                f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
            )
        # Add stdout and stderr to object
        self.stdout = output.stdout.decode("utf-8")
        stderr = output.stderr.decode("utf-8")
        if stderr:
            for line in stderr.splitlines():
                logging.info(add_tag("mash sketch", line))

    def parse_genome_size(self) -> None:
        """
        Parse the size of the fasta input using pyfastx.

        Parameters
        ----------
        self : object
            The AuriClassAnalysis object.

        Returns
        -------
        None

        Notes
        -----
        This function sets the following attributes of the object:
        - estimated_genome_size: the estimated genome size of the current test sample

        The function uses the following attributes of the object:
        - read_paths: the paths to the fasta files to parse
        """
        total_size = 0
        for filepath in self.read_paths:
            # Reading a fasta file requires index building, so remove this afterwards
            total_size += pyfastx.Fasta(filepath, build_index=True).size
            Path(str(filepath) + ".fxi").unlink(missing_ok=True)
        self.estimated_genome_size = total_size

    def run(self) -> None:
        # Sketch query genome using tempfile
        with tempfile.TemporaryDirectory() as tmpdir:
            self.query_sketch_path = Path(tmpdir).joinpath("tmpfile.msh")
            self.sketch_fasta_query()

            # Run mash screen
            self.run_mash_dist()

        # Check if genome size is within expected range
        self.parse_genome_size()
        self.check_genome_size()

        # Process results
        self.select_clade()

        # Check if all samples not above certain threshold indicating other species
        probably_candida = self.check_non_candida()

        if probably_candida:
            # Check if clade is not "outgroup"
            probably_cauris = self.check_for_outgroup()
            if probably_cauris:
                if self.no_qc == False:
                    # Check if closest sample is above self.high_dist_threshold distance --> new clade?
                    self.check_high_dist()

                    # Check error bounds and check number of samples within error bounds
                    error_bounds_text = self.get_error_bounds()
                    self.process_error_bounds(error_bounds_text)
                    self.compare_with_error_bounds()
            else:
                self.clade = "other Candida/CUG-Ser1 clade sp."
        else:
            self.clade = "not Candida auris"
            self.qc_other_candida = "SKIPPED"
            self.qc_genome_size = "SKIPPED"
            self.qc_multiple_hits = "SKIPPED"
            self.qc_high_distance = "SKIPPED"

        # Save report
        self.save_report()

parse_genome_size()

Parse the size of the fasta input using pyfastx.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Notes

This function sets the following attributes of the object: - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - read_paths: the paths to the fasta files to parse

Source code in auriclass/classes.py
def parse_genome_size(self) -> None:
    """
    Parse the size of the fasta input using pyfastx.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Notes
    -----
    This function sets the following attributes of the object:
    - estimated_genome_size: the estimated genome size of the current test sample

    The function uses the following attributes of the object:
    - read_paths: the paths to the fasta files to parse
    """
    total_size = 0
    for filepath in self.read_paths:
        # Reading a fasta file requires index building, so remove this afterwards
        total_size += pyfastx.Fasta(filepath, build_index=True).size
        Path(str(filepath) + ".fxi").unlink(missing_ok=True)
    self.estimated_genome_size = total_size

sketch_fasta_query()

Sketches the query fasta files using mash.

Parameters

self : object The AuriClassAnalysis object.

Returns

None

Raises

ValueError If no sequence records are found in the specified fasta file(s).

Notes

This function sets the following attributes of the object: - stdout: the STDOUT of the mash sketch command - estimated_genome_size: the estimated genome size of the current test sample

The function uses the following attributes of the object: - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch - query_sketch_path: the path to the query sketch - kmer_size: the kmer size to use for the sketch - sketch_size: the size of the sketch to create - read_paths: the paths to the fastq files to sketch

Source code in auriclass/classes.py
def sketch_fasta_query(self) -> None:
    """
    Sketches the query fasta files using mash.

    Parameters
    ----------
    self : object
        The AuriClassAnalysis object.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no sequence records are found in the specified fasta file(s).


    Notes
    -----
    This function sets the following attributes of the object:
    - stdout: the STDOUT of the mash sketch command
    - estimated_genome_size: the estimated genome size of the current test sample


    The function uses the following attributes of the object:
    - minimal_kmer_coverage: the minimal kmer coverage to use for the sketch
    - query_sketch_path: the path to the query sketch
    - kmer_size: the kmer size to use for the sketch
    - sketch_size: the size of the sketch to create
    - read_paths: the paths to the fastq files to sketch
    """
    command = [
        "mash",
        "sketch",
        "-o",
        self.query_sketch_path,
        "-k",
        str(self.kmer_size),
        "-s",
        str(self.sketch_size),
        *self.read_paths,
    ]
    command_list_of_str = [str(item) for item in command]
    logging.info(add_tag("mash sketch", " ".join(command_list_of_str)))
    output = subprocess.run(
        command_list_of_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if "ERROR: Did not find fasta records in" in output.stderr.decode("utf-8"):
        raise ValueError(
            f"Did not find sequence records in {self.read_paths}. Please check if these are valid fastq files"
        )
    # Add stdout and stderr to object
    self.stdout = output.stdout.decode("utf-8")
    stderr = output.stderr.decode("utf-8")
    if stderr:
        for line in stderr.splitlines():
            logging.info(add_tag("mash sketch", line))

General scripts

add_tag(tag, lines)

Add a tag to a string or list of strings.

Parameters

tag : str The tag to add. lines : str or list of str The string or list of strings to add the tag to.

Returns

str The string with the tag added to each line.

Source code in auriclass/general.py
def add_tag(tag: str, lines: str) -> str:
    """
    Add a tag to a string or list of strings.

    Parameters
    ----------
    tag : str
        The tag to add.
    lines : str or list of str
        The string or list of strings to add the tag to.

    Returns
    -------
    str
        The string with the tag added to each line.
    """
    full_tag = f"[{tag}]"
    if lines:
        return "\n".join(
            [f"{full_tag} {line}" for line in lines.split("\n") if line != ""]
        )
    else:
        return f"{full_tag}"

check_dependencies()

Check if dependencies are installed.

Parameters

None

Returns

None

Raises

FileNotFoundError If mash is not installed.

Source code in auriclass/general.py
def check_dependencies() -> None:
    """
    Check if dependencies are installed.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If mash is not installed.
    """
    try:
        subprocess.call(
            ["mash", "-h"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        raise FileNotFoundError("Mash is not installed")

check_number_within_range(minimum=0, maximum=1)

Creates a function to check whether a numeric value is within a range, inclusive.

The generated function can be used by the type parameter in argparse.ArgumentParser. See https://stackoverflow.com/a/53446178.

Parameters:

Name Type Description Default
value

the numeric value to check.

required
minimum float

minimum of allowed range, inclusive.

0
maximum float

maximum of allowed range, inclusive.

1

Returns:

Type Description
Union[Callable[[str], str], FileType]

A function which takes a single argument and checks this against the range.

Raises:

Type Description
ArgumentTypeError

if the value is outside the range.

ValueError

if the value cannot be converted to float.

Source code in auriclass/general.py
def check_number_within_range(
    minimum: float = 0, maximum: float = 1
) -> Union[Callable[[str], str], argparse.FileType]:
    """
    Creates a function to check whether a numeric value is within a range, inclusive.

    The generated function can be used by the `type` parameter in argparse.ArgumentParser.
    See https://stackoverflow.com/a/53446178.

    Args:
        value: the numeric value to check.
        minimum: minimum of allowed range, inclusive.
        maximum: maximum of allowed range, inclusive.

    Returns:
        A function which takes a single argument and checks this against the range.

    Raises:
        argparse.ArgumentTypeError: if the value is outside the range.
        ValueError: if the value cannot be converted to float.
    """

    def generated_func_check_range(value: str) -> str:
        value_f = float(value)
        if (value_f < minimum) or (value_f > maximum):
            raise argparse.ArgumentTypeError(
                f"Supplied value {value} is not within expected range {minimum} to {maximum}."
            )
        return str(value)

    return generated_func_check_range

confirm_input_type(list_of_file_paths, input_type)

Confirms that input files are of the specified type.

Parameters

list_of_file_paths : list of str The list of filepaths to check. input_type : str The input type to check.

Returns

None

Raises

None

Notes

This function does not raise any exceptions, but it does log a warning if any of the input files are not of the specified type. This is intended to run if the user specifies the input type with --fastq or --fasta.

Source code in auriclass/general.py
def confirm_input_type(list_of_file_paths: List[str], input_type: str) -> None:
    """
    Confirms that input files are of the specified type.

    Parameters
    ----------
    list_of_file_paths : list of str
        The list of filepaths to check.
    input_type : str
        The input type to check.

    Returns
    -------
    None

    Raises
    ------
    None

    Notes
    -----
    This function does not raise any exceptions, but it does log a warning if any of the input files are not of the specified type.
    This is intended to run if the user specifies the input type with --fastq or --fasta.
    """
    for file_path in list_of_file_paths:
        if input_type == "fastq":
            if not is_fastq(file_path):
                logging.warning(
                    f"Input file {file_path} cannot be parsed as a fastq file, please check if --fastq is appropriate"
                )
        elif input_type == "fasta":
            if not is_fasta(file_path):
                logging.warning(
                    f"Input file {file_path} cannot be parsed as a fasta file, please check if --fasta is appropriate"
                )

guess_input_type(list_of_file_paths)

Guess the input type of a list of files.

Parameters

list_of_file_paths : list of str The list of filepaths to check.

Returns

str The input type, either "fastq" or "fasta".

Raises

ValueError If any of the input files can be parsed as both fastq and fasta.

If any of the input files cannot be parsed as either fastq or fasta.

If the input files are a mix of fastq and fasta files.

If no input files were found.
Source code in auriclass/general.py
def guess_input_type(list_of_file_paths: List[str]) -> str:
    """
    Guess the input type of a list of files.

    Parameters
    ----------
    list_of_file_paths : list of str
        The list of filepaths to check.

    Returns
    -------
    str
        The input type, either "fastq" or "fasta".

    Raises
    ------
    ValueError
        If any of the input files can be parsed as both fastq and fasta.

        If any of the input files cannot be parsed as either fastq or fasta.

        If the input files are a mix of fastq and fasta files.

        If no input files were found.
    """
    fastq_count = 0
    fasta_count = 0
    for file_path in list_of_file_paths:
        if is_fastq(file_path):
            if is_fasta(file_path):
                raise ValueError(
                    f"Input file {file_path} can be parsed as both fastq and fasta. Please specify --fastq or --fasta"
                )
            else:
                fastq_count += 1
        elif is_fasta(file_path):
            fasta_count += 1
        else:
            raise ValueError(f"Input file {file_path} is not a fastq or fasta file")

    if fastq_count > 0 and fasta_count > 0:
        raise ValueError(f"Input files are a mix of fastq and fasta files")
    elif fastq_count > 0:
        return "fastq"
    elif fasta_count > 0:
        return "fasta"
    else:
        raise ValueError(f"No input files were found")

is_fasta(file)

Check if a file is a fasta file.

Parameters

filepath : str The path to the file to check.

Returns

None

Raises

RuntimeError If the file is not a fasta file.

Source code in auriclass/general.py
def is_fasta(file: str) -> bool:
    """
    Check if a file is a fasta file.

    Parameters
    ----------
    filepath : str
        The path to the file to check.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the file is not a fasta file.
    """
    try:
        pyfastx.Fasta(file, build_index=False)
        return True
    except RuntimeError:
        return False

is_fastq(file)

Check if a file is a fastq file.

Parameters

filepath : str The path to the file to check.

Returns

None

Raises

RuntimeError If the file is not a fastq file.

Source code in auriclass/general.py
def is_fastq(file: str) -> bool:
    """
    Check if a file is a fastq file.

    Parameters
    ----------
    filepath : str
        The path to the file to check.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the file is not a fastq file.
    """
    try:
        pyfastx.Fastq(file, build_index=False)
        return True
    except RuntimeError:
        return False

validate_argument_logic(args)

Check if arguments are valid, based on predefined logic.

Parameters

args : argparse.Namespace The arguments to check.

Returns

argparse.Namespace The arguments, if they are valid. Modified if necessary.

Raises

ValueError If the arguments are not valid.

Source code in auriclass/general.py
def validate_argument_logic(args: argparse.Namespace) -> argparse.Namespace:
    """
    Check if arguments are valid, based on predefined logic.

    Parameters
    ----------
    args : argparse.Namespace
        The arguments to check.

    Returns
    -------
    argparse.Namespace
        The arguments, if they are valid. Modified if necessary.

    Raises
    ------
    ValueError
        If the arguments are not valid.
    """
    # Convert to float
    args.expected_genome_size = [
        float(args.expected_genome_size[0]),
        float(args.expected_genome_size[1]),
    ]
    # Check if specified genome size range is valid
    if args.expected_genome_size[0] > args.expected_genome_size[1]:
        raise ValueError(
            "Expected genome size range is invalid: lower bound is higher than upper bound"
        )
    elif (args.expected_genome_size[0] < 100) & (args.expected_genome_size[1] < 100):
        logging.warning(
            f"Expected genome size range boundaries {args.expected_genome_size} are below 100: treating these as Mbp instead of bp"
        )
        args.expected_genome_size = [
            args.expected_genome_size[0] * 1_000_000,
            args.expected_genome_size[1] * 1_000_000,
        ]
    return args

validate_input_files(list_of_files)

Check if files in a list exist.

Parameters

list_of_files : list of str The list of filepaths to check.

Returns

None

Raises

FileNotFoundError If any of the files do not exist.

Source code in auriclass/general.py
def validate_input_files(list_of_files: List[str]) -> None:
    """
    Check if files in a list exist.

    Parameters
    ----------
    list_of_files : list of str
        The list of filepaths to check.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If any of the files do not exist.
    """
    for filepath in list_of_files:
        if not Path(filepath).exists():
            raise FileNotFoundError(f"Required input file {filepath} does not exist")