I/O#
The genogrove::io namespace contains file readers and parsers for genomic file formats.
Readers#
filetype_detector#
-
class filetype_detector#
Public Functions
-
std::tuple<filetype, compression_type> detect_filetype(const fs::path &filepath)#
-
std::tuple<filetype, compression_type> detect_filetype(const fs::path &filepath)#
bed_reader#
-
class bed_reader : public genogrove::io::file_reader<bed_entry>#
Public Functions
-
explicit bed_reader(const std::filesystem::path &path)#
-
bed_reader(const std::filesystem::path &path, const bed_reader_options &options)#
-
bed_reader(const bed_reader&) = delete#
-
bed_reader &operator=(const bed_reader&) = delete#
-
inline bed_reader(bed_reader &&other) noexcept#
-
inline bed_reader &operator=(bed_reader &&other) noexcept#
-
virtual bool has_next() override#
-
virtual std::string get_error_message() const override#
Returns the error message from the most recent read_next() call. Cleared at the start of each read_next(); empty if the last read succeeded.
-
virtual size_t get_current_line() const override#
Returns the 1-based position the reader has consumed up to in its input. The unit is reader-specific: the physical line number for the text readers (BED/GFF — comment and blank lines are counted), or the record index for the BAM reader (records dropped by filters are counted). In all cases it reflects input consumed, not entries yielded by read_next() — a skipped comment line or a filtered-out record still advances it. Zero before the first read_next(). Intended for error messages and progress reporting, not for counting returned entries.
-
~bed_reader() override#
-
explicit bed_reader(const std::filesystem::path &path)#
gff_reader#
-
class gff_reader : public genogrove::io::file_reader<gff_entry>#
Public Functions
-
explicit gff_reader(const std::filesystem::path &path)#
-
gff_reader(const std::filesystem::path &path, const gff_reader_options &options)#
-
gff_reader(const gff_reader&) = delete#
-
gff_reader &operator=(const gff_reader&) = delete#
-
inline gff_reader(gff_reader &&other) noexcept#
-
inline gff_reader &operator=(gff_reader &&other) noexcept#
-
virtual bool has_next() override#
-
virtual std::string get_error_message() const override#
Returns the error message from the most recent read_next() call. Cleared at the start of each read_next(); empty if the last read succeeded.
-
virtual size_t get_current_line() const override#
Returns the 1-based position the reader has consumed up to in its input. The unit is reader-specific: the physical line number for the text readers (BED/GFF — comment and blank lines are counted), or the record index for the BAM reader (records dropped by filters are counted). In all cases it reflects input consumed, not entries yielded by read_next() — a skipped comment line or a filtered-out record still advances it. Zero before the first read_next(). Intended for error messages and progress reporting, not for counting returned entries.
-
~gff_reader() override#
-
explicit gff_reader(const std::filesystem::path &path)#
bam_reader#
-
class bam_reader : public genogrove::io::file_reader<sam_entry>#
Reader for SAM and BAM alignment files.
This class provides iteration over alignments in SAM and BAM files using htslib. It transparently handles both formats and supports filtering options.
Public Functions
-
explicit bam_reader(const std::filesystem::path &path)#
Construct a BAM reader with default options.
Default options skip unmapped reads. The file format (SAM/BAM/CRAM) is automatically detected.
- Parameters:
path – Path to the SAM/BAM file
- Throws:
std::runtime_error – if file cannot be opened or is invalid
-
bam_reader(const std::filesystem::path &path, const bam_reader_options &options)#
Construct a BAM reader with custom options.
- Parameters:
path – Path to the SAM/BAM file
options – Reader configuration options
- Throws:
std::runtime_error – if file cannot be opened or is invalid
-
virtual bool read_next(sam_entry &entry) override#
Read the next alignment record.
Reads the next alignment from the file, applying configured filters. Skipped records (based on options) are automatically bypassed.
Returns false only at EOF. Throws on I/O errors or on truncated/malformed auxiliary data — when read_next() returns true, the record is fully populated and get_error_message() returns an empty string.
- Parameters:
entry – Output parameter for the alignment record
- Throws:
std::runtime_error – on I/O error or truncated auxiliary data
- Returns:
true if a record was successfully read, false at EOF
-
virtual bool has_next() override#
Check if more records are available.
Note
This is a best-effort check; read_next() is authoritative
- Returns:
true if more records may be available
-
virtual std::string get_error_message() const override#
Get the last error message.
- Returns:
Error message string, empty if no error
-
virtual size_t get_current_line() const override#
Get the current record number (1-based).
Counts every record consumed from the file, including records dropped by the configured filters (see bam_reader_options) — it is the file position, not the count of records returned by read_next(). If an unmapped record is skipped, the counter still advances past it.
- Returns:
Index of the most recently consumed record (0 before the first read)
-
const std::string &get_header() const#
Get the SAM header text.
- Returns:
Full header text including all @HD, @SQ, @RG, @PG lines
-
const std::vector<std::string> &get_reference_names() const#
Get the list of reference sequence names.
- Returns:
Vector of reference names from the header
-
bam_reader(bam_reader &&other) noexcept#
Move constructor.
-
bam_reader &operator=(bam_reader &&other) noexcept#
Move assignment operator.
-
bam_reader(const bam_reader&) = delete#
-
bam_reader &operator=(const bam_reader&) = delete#
-
~bam_reader() override#
-
explicit bam_reader(const std::filesystem::path &path)#
fasta_reader#
-
class fasta_reader : public genogrove::io::file_reader<fasta_entry>#
Reader for FASTA and FASTQ sequence files.
Uses htslib’s kseq parser to transparently handle both FASTA and FASTQ formats, including gzip-compressed files.
Public Functions
-
explicit fasta_reader(const std::filesystem::path &path)#
-
fasta_reader(const std::filesystem::path &path, const fasta_reader_options &options)#
-
fasta_reader(const fasta_reader&) = delete#
-
fasta_reader &operator=(const fasta_reader&) = delete#
-
fasta_reader(fasta_reader &&other) noexcept#
-
fasta_reader &operator=(fasta_reader &&other) noexcept#
-
virtual bool read_next(fasta_entry &entry) override#
-
virtual bool has_next() override#
-
virtual std::string get_error_message() const override#
Returns the error message from the most recent read_next() call. Cleared at the start of each read_next(); empty if the last read succeeded.
-
virtual size_t get_current_line() const override#
Returns the 1-based position the reader has consumed up to in its input. The unit is reader-specific: the physical line number for the text readers (BED/GFF — comment and blank lines are counted), or the record index for the BAM reader (records dropped by filters are counted). In all cases it reflects input consumed, not entries yielded by read_next() — a skipped comment line or a filtered-out record still advances it. Zero before the first read_next(). Intended for error messages and progress reporting, not for counting returned entries.
-
~fasta_reader() override#
-
explicit fasta_reader(const std::filesystem::path &path)#
fasta_index#
-
class fasta_index#
Indexed random-access reader for FASTA files.
Wraps htslib’s faidx API to provide efficient region-based sequence retrieval from FASTA files. Automatically creates the .fai index if it does not exist.
Coordinates follow the 0-based half-open convention used by BED/BAM: fetch("chr1", 100, 200) returns bases at positions 100..199.
Public Functions
-
explicit fasta_index(const std::filesystem::path &path)#
Open a FASTA file and load (or create) its index.
- Parameters:
path – Path to the FASTA file (.fa, .fasta, .fna)
- Throws:
std::runtime_error – if file cannot be opened or indexed
-
fasta_index(const fasta_index&) = delete#
-
fasta_index &operator=(const fasta_index&) = delete#
-
fasta_index(fasta_index &&other) noexcept#
-
fasta_index &operator=(fasta_index &&other) noexcept#
-
~fasta_index()#
-
std::string fetch(const std::string &name, size_t start, size_t end) const#
Fetch a subsequence by region.
- Parameters:
name – Sequence name (e.g., “chr1”)
start – 0-based start position (inclusive)
end – 0-based end position (exclusive)
- Throws:
std::out_of_range – if name is not in the index, or the region is invalid (start >= end, or exceeds htslib’s coordinate limit)
std::runtime_error – on fetch failure
- Returns:
The nucleotide sequence for the region
-
std::string fetch(const std::string &name) const#
Fetch an entire sequence by name.
- Parameters:
name – Sequence name (e.g., “chr1”)
- Throws:
std::out_of_range – if name is not in the index
std::runtime_error – on fetch failure
- Returns:
The full nucleotide sequence
-
size_t sequence_count() const#
Get the number of sequences in the index.
-
std::string sequence_name(size_t index) const#
Get the name of the i-th sequence (0-based).
- Parameters:
index – 0-based sequence index
- Throws:
std::out_of_range – if index >= sequence_count()
-
size_t sequence_length(const std::string &name) const#
Get the length of a sequence by name.
- Parameters:
name – Sequence name
- Throws:
std::out_of_range – if name is not in the index
-
bool has_sequence(const std::string &name) const#
Check if a sequence name is in the index.
-
explicit fasta_index(const std::filesystem::path &path)#
Entry Types#
bed_entry#
-
class bed_entry#
Represents a single entry in a BED (Browser Extensible Data) file.
The bed_entry class is designed to encapsulate the data associated with a single genomic interval in a BED file. BED files are commonly used to describe genomic regions, such as those generated by alignment or variant-calling processes in bioinformatics.
This class provides a means to store and process information about chromosomal intervals with optional annotation fields.
See https://samtools.github.io/hts-specs/BEDv1.pdf
Public Functions
-
bed_entry() = default#
-
inline bed_entry(std::string chrom, size_t start, size_t end)#
-
void serialize(std::ostream &os) const#
Public Members
-
std::string chrom#
-
size_t start = 0#
0-based start position (BED chromStart)
-
size_t end = 0#
0-based exclusive end position (BED chromEnd)
-
std::optional<std::string> name#
-
std::optional<int> score#
-
std::optional<char> strand#
-
std::optional<thick_info> thickness#
-
std::optional<block_info> blocks#
-
bed_entry() = default#
gff_entry#
-
struct gff_entry#
Public Functions
-
inline gff_entry()#
-
inline gff_entry(std::string seqid, size_t start, size_t end, std::string type)#
-
std::optional<std::string> get_gene_id() const#
-
std::optional<std::string> get_transcript_id() const#
-
std::optional<int> get_exon_number() const#
-
std::optional<std::string> get_gene_name() const#
-
std::optional<std::string> get_gene_biotype() const#
-
std::optional<std::string> get_attribute(std::string_view key) const#
-
inline bool is_gtf() const#
-
inline bool is_gff3() const#
-
inline gff_entry()#
sam_entry#
-
struct sam_entry#
Represents a single SAM/BAM alignment record.
This struct encapsulates all fields from a SAM/BAM alignment record, including the read name, reference sequence, alignment coordinates, CIGAR string, sequence, quality scores, and auxiliary tags.
Public Functions
-
inline sam_entry()#
-
inline char get_strand() const#
Get strand character based on FLAG.
-
inline bool is_primary() const#
Check if this is a primary alignment.
-
inline bool is_mapped() const#
Check if this read is mapped.
-
inline bool consumes_reference() const#
Whether this record covers any reference bases.
False for unmapped reads and for mapped records whose CIGAR consumes zero reference bases (pure soft-clip like “100S”, hard-clip-only secondary alignments). Use this as the gate before converting to a closed gdt::interval(start, end - 1) or inserting into a grove — the conversion is only valid when start < end.
-
inline std::string cigar_string_repr() const#
Convert CIGAR to string representation.
Public Members
-
std::string qname#
Query template NAME (read name)
-
std::string chrom#
Reference sequence name (RNAME)
-
size_t start = 0#
0-based start position (htslib-native, from POS). Unmapped reads carry start == 0; zero-ref-consuming CIGARs (e.g. pure soft-clip) carry start == end == POS.
-
size_t end = 0#
0-based exclusive end position (htslib-native, from POS + CIGAR-consumed reference length). Equals start for unmapped reads and for zero-ref-consuming CIGARs (pure soft-clip, hard-clip-only secondaries). Convert to closed gdt::interval(start, end - 1) only when consumes_reference() is true.
-
alignment_flags flags#
Bitwise FLAG.
-
uint8_t mapq#
Mapping quality (0-255)
-
cigar_string cigar#
CIGAR string (alignment operations)
-
std::string sequence#
Read sequence (SEQ)
-
std::string quality#
ASCII quality scores (QUAL)
-
sam_tags tags#
Auxiliary tags.
-
inline sam_entry()#
fasta_entry#
-
struct fasta_entry#
Represents a single FASTA or FASTQ sequence record.
For FASTA records the quality field is empty (nullopt). For FASTQ records it contains the per-base quality string.
Public Functions
-
fasta_entry() = default#
-
inline fasta_entry(std::string name, std::string sequence)#
Public Members
-
std::string name#
Sequence name (text after > or @, up to first whitespace)
-
std::string comment#
Optional description (rest of header line after name)
-
std::string sequence#
Nucleotide sequence.
-
std::optional<std::string> quality#
Per-base quality string (FASTQ only, nullopt for FASTA)
-
fasta_entry() = default#
Reader Options#
bed_reader_options#
-
struct bed_reader_options#
Configuration options for the BED reader.
Options can be set via C++20 designated initializers at construction:
bed_reader reader(path, {.skip_invalid_lines = true});
Or by constructing and assigning individual fields:
bed_reader_options opts; opts.skip_invalid_lines = true; bed_reader reader(path, opts);
Public Members
-
bool skip_invalid_lines = false#
Skip invalid lines instead of throwing.
Public Static Functions
-
static inline bed_reader_options defaults()#
-
bool skip_invalid_lines = false#
gff_reader_options#
-
struct gff_reader_options#
Configuration options for the GFF reader.
Options can be set via C++20 designated initializers at construction:
gff_reader reader(path, {.skip_invalid_lines = true, .validate_gtf = true});
Or by constructing and assigning individual fields:
gff_reader_options opts; opts.skip_invalid_lines = true; opts.validate_gtf = true; gff_reader reader(path, opts);
Public Members
-
bool skip_invalid_lines = false#
Skip invalid lines instead of throwing.
-
bool validate_gtf = false#
Validate mandatory GTF2 attributes (gene_id, transcript_id)
Public Static Functions
-
static inline gff_reader_options defaults()#
-
bool skip_invalid_lines = false#
bam_reader_options#
-
struct bam_reader_options#
Configuration options for the BAM reader.
Options can be set via C++20 designated initializers at construction:
bam_reader reader(path, {.skip_unmapped = false, .min_mapq = 30});
Or by constructing and assigning individual fields:
bam_reader_options opts; opts.skip_secondary = true; opts.min_mapq = 20; bam_reader reader(path, opts);
Named factory methods are also available for common configurations: defaults(), include_all(), primary_only(), high_quality().
Public Members
-
bool skip_unmapped = true#
Skip unmapped reads (default: true)
-
bool skip_secondary = false#
Skip secondary alignments.
-
bool skip_supplementary = false#
Skip supplementary alignments.
-
bool skip_qc_fail = false#
Skip QC-failed reads.
-
bool skip_duplicates = false#
Skip duplicate reads.
-
uint8_t min_mapq = 0#
Minimum mapping quality (0 = no filter)
Public Static Functions
-
static inline bam_reader_options defaults()#
Factory method for default options (skip unmapped only)
-
static inline bam_reader_options include_all()#
Factory method to include all reads.
-
static inline bam_reader_options primary_only()#
Factory method for primary alignments only.
-
static inline bam_reader_options high_quality(uint8_t min_mapq = 20)#
Factory method for high-quality primary alignments.
-
bool skip_unmapped = true#
fasta_reader_options#
-
struct fasta_reader_options#
Configuration options for the sequence reader.
Options can be set via C++20 designated initializers at construction:
fasta_reader reader(path, {.skip_empty_sequences = true});
Public Members
-
bool skip_empty_sequences = false#
Skip records with empty sequence strings.
Public Static Functions
-
static inline fasta_reader_options defaults()#
-
bool skip_empty_sequences = false#
BED Support Types#
rgb_color#
thick_info#
-
struct thick_info#
block_info#
BAM/SAM Types#
sam_flags#
-
struct sam_flags#
SAM/BAM flag bit values.
These constants represent the individual flag bits in the SAM FLAG field. Use these with alignment_flags::has_flag() for custom flag checks.
Public Static Attributes
-
static constexpr uint16_t PAIRED = 0x1#
Template has multiple segments.
-
static constexpr uint16_t PROPER_PAIR = 0x2#
Each segment properly aligned.
-
static constexpr uint16_t UNMAPPED = 0x4#
Segment unmapped.
-
static constexpr uint16_t MATE_UNMAPPED = 0x8#
Next segment unmapped.
-
static constexpr uint16_t REVERSE = 0x10#
SEQ on reverse strand.
-
static constexpr uint16_t MATE_REVERSE = 0x20#
SEQ of next segment reversed.
-
static constexpr uint16_t READ1 = 0x40#
First segment in template.
-
static constexpr uint16_t READ2 = 0x80#
Last segment in template.
-
static constexpr uint16_t SECONDARY = 0x100#
Secondary alignment.
-
static constexpr uint16_t QCFAIL = 0x200#
Not passing filters.
-
static constexpr uint16_t DUPLICATE = 0x400#
PCR or optical duplicate.
-
static constexpr uint16_t SUPPLEMENTARY = 0x800#
Supplementary alignment.
-
static constexpr uint16_t PAIRED = 0x1#
alignment_flags#
-
class alignment_flags#
Wrapper class for SAM/BAM alignment flags.
Provides convenient methods to query individual flag bits.
Public Functions
-
inline constexpr alignment_flags()#
-
inline explicit constexpr alignment_flags(uint16_t flags)#
-
inline constexpr uint16_t value() const#
Get the raw flag value.
-
inline constexpr bool has_flag(uint16_t flag) const#
Check if a specific flag bit is set.
-
inline constexpr bool is_paired() const#
-
inline constexpr bool is_proper_pair() const#
-
inline constexpr bool is_unmapped() const#
-
inline constexpr bool is_mate_unmapped() const#
-
inline constexpr bool is_reverse() const#
-
inline constexpr bool is_mate_reverse() const#
-
inline constexpr bool is_read1() const#
-
inline constexpr bool is_read2() const#
-
inline constexpr bool is_secondary() const#
-
inline constexpr bool is_qc_fail() const#
-
inline constexpr bool is_duplicate() const#
-
inline constexpr bool is_supplementary() const#
-
inline constexpr alignment_flags()#
cigar_element#
-
struct cigar_element#
A single CIGAR operation with its operation type and length.
Public Functions
-
inline constexpr cigar_element()#
-
inline constexpr bool consumes_reference() const#
Check if this operation consumes reference bases.
-
inline constexpr bool consumes_query() const#
Check if this operation consumes query (read) bases.
-
inline constexpr char to_char() const#
Convert operation to character representation.
-
inline constexpr cigar_element()#
mate_info#
sam_tag#
-
struct sam_tag#
A single SAM auxiliary tag.
Enums#
filetype#
compression_type#
gff_format#
cigar_op#
-
enum class genogrove::io::cigar_op : uint8_t#
CIGAR operation codes.
These correspond to the BAM_CMATCH, BAM_CINS, etc. constants from htslib.
Values:
-
enumerator MATCH#
M - Alignment match (can be sequence match or mismatch)
-
enumerator INS#
I - Insertion to the reference.
-
enumerator DEL#
D - Deletion from the reference.
-
enumerator REF_SKIP#
N - Skipped region from the reference (e.g., intron)
-
enumerator SOFT_CLIP#
S - Soft clipping (clipped sequences present in SEQ)
-
enumerator HARD_CLIP#
H - Hard clipping (clipped sequences NOT present in SEQ)
-
enumerator PAD#
P - Padding (silent deletion from padded reference)
-
enumerator SEQ_MATCH#
= - Sequence match
-
enumerator SEQ_MISMATCH#
X - Sequence mismatch.
-
enumerator MATCH#