Complete Examples#

End-to-end workflows combining file I/O, data structures, queries, and graph overlays.

C++ · genogrove 0.25.3

Processing BED Files

Here’s a complete workflow combining file I/O, data structures, and queries:

#include <genogrove/io/bed_reader.hpp>
#include <genogrove/structure/grove/grove.hpp>
#include <genogrove/data_type/interval.hpp>
#include <iostream>

namespace gio = genogrove::io;
namespace gdt = genogrove::data_type;
namespace gst = genogrove::structure;

int main() {
    // Create grove to store BED entries
    gst::grove<gdt::interval, std::string> features(100);

    // Read BED file
    gio::bed_reader reader("data.bed.gz");

    try {
        for (const auto& entry : reader) {
            // Insert into grove (organized by chromosome)
            // Convert half-open [start, end) to closed [start, end]
            features.insert_data(
                entry.chrom,
                gdt::interval(entry.start, entry.end - 1),
                entry.name.value_or("unnamed"),
                gst::sorted  // BED files are typically sorted
            );
        }
    } catch (const std::runtime_error& e) {
        std::cerr << "Error: " << e.what() << "\n";
    }

    // Query for overlapping features
    gdt::interval query_region{1000, 2000};
    auto results = features.intersect(query_region, "chr1");

    std::cout << "Found " << results.get_keys().size()
              << " features overlapping region\n";

    for (auto* key : results.get_keys()) {
        std::cout << "  - " << key->get_data()
                  << " at " << key->get_value().to_string() << "\n";
    }

    return 0;
}

Processing GFF/GTF Files

Working with gene annotations:

#include <genogrove/io/gff_reader.hpp>
#include <genogrove/structure/grove/grove.hpp>
#include <genogrove/data_type/genomic_coordinate.hpp>
#include <iostream>

namespace gio = genogrove::io;
namespace gdt = genogrove::data_type;
namespace gst = genogrove::structure;

struct GeneAnnotation {
    std::string gene_id;
    std::string gene_name;
    std::string type;
};

int main() {
    // Create grove with strand-aware coordinates
    gst::grove<gdt::genomic_coordinate, GeneAnnotation> annotations(100);

    // Read GFF file
    gio::gff_reader reader("annotations.gff.gz");

    try {
        for (const auto& entry : reader) {
            // Only process gene features
            if (entry.type != "gene") continue;

            // Create genomic coordinate with strand (GFF is 1-based inclusive)
            gdt::genomic_coordinate coord{
                entry.strand.value_or('.'),
                entry.start,
                entry.end
            };

            // Extract annotation info
            GeneAnnotation annot{
                entry.get_gene_id().value_or("unknown"),
                entry.get_gene_name().value_or("unknown"),
                entry.type
            };

            // Insert into grove
            annotations.insert_data(
                entry.seqid,
                coord,
                annot,
                gst::sorted
            );
        }
    } catch (const std::runtime_error& e) {
        std::cerr << "Error: " << e.what() << "\n";
    }

    // Query for genes in a region
    gdt::genomic_coordinate query{'+', 5000, 10000};
    auto results = annotations.intersect(query, "chr1");

    std::cout << "Found " << results.get_keys().size() << " genes\n";
    for (auto* key : results.get_keys()) {
        auto& annot = key->get_data();
        std::cout << "  Gene: " << annot.gene_name
                  << " (ID: " << annot.gene_id << ")\n";
    }

    return 0;
}

Building Transcript Graphs

Representing transcript structures with graph overlays:

#include <genogrove/io/gff_reader.hpp>
#include <genogrove/structure/grove/grove.hpp>
#include <genogrove/data_type/interval.hpp>
#include <genogrove/data_type/key.hpp>
#include <iostream>
#include <map>
#include <string>
#include <vector>

namespace gio = genogrove::io;
namespace gdt = genogrove::data_type;
namespace gst = genogrove::structure;

int main() {
    // Grove with no edge metadata
    gst::grove<gdt::interval, std::string> transcripts(100);

    // Map to track exons per transcript
    std::map<std::string, std::vector<gdt::key<gdt::interval, std::string>*>>
        transcript_exons;

    // Read GFF file
    gio::gff_reader reader("transcripts.gff");

    try {
        for (const auto& entry : reader) {
            if (entry.type != "exon") continue;

            auto transcript_id = entry.get_transcript_id().value_or("unknown");

            // Insert exon (GFF is 1-based inclusive)
            auto* exon_key = transcripts.insert_data(
                entry.seqid,
                gdt::interval(entry.start, entry.end),
                transcript_id,
                gst::sorted
            );

            // Track exon for this transcript
            transcript_exons[transcript_id].push_back(exon_key);
        }
    } catch (const std::runtime_error& e) {
        std::cerr << "Error: " << e.what() << "\n";
    }

    // Build graph by connecting consecutive exons in each transcript
    for (auto& [transcript_id, exons] : transcript_exons) {
        for (size_t i = 0; i + 1 < exons.size(); ++i) {
            transcripts.add_edge(exons[i], exons[i + 1]);
        }
    }

    // Analyze transcript structure
    std::cout << "Transcript analysis:\n";
    for (auto& [transcript_id, exons] : transcript_exons) {
        std::cout << "  " << transcript_id << ": "
                  << exons.size() << " exons\n";

        // Show connectivity
        for (auto* exon : exons) {
            auto neighbors = transcripts.get_neighbors(exon);
            if (!neighbors.empty()) {
                std::cout << "    Exon at "
                          << exon->get_value().to_string()
                          << " connects to " << neighbors.size()
                          << " downstream exon(s)\n";
            }
        }
    }

    return 0;
}

Python · pygenogrove 0.7.0

Processing BED Files

A complete workflow combining file I/O, a typed grove, and an overlap query:

import pygenogrove as pg

# Create a typed grove to store BED entries
g = pg.BedGrove(256)

# Read the BED file and insert each entry, keyed by chromosome
for e in pg.BedReader("data.bed.gz"):
    g.insert(e.chrom, e)               # key derived from each entry

# Query for features overlapping chr1:1500-1600
query = pg.GenomicCoordinate(".", 1500, 1600)
for hit in g.intersect(query, "chr1"):
    print(hit.data.name, hit.data.score)

Note

BedReader yields parsed BED entries, and BedGrove.insert derives the grove key from each entry’s coordinates. Use pg.GffGrove / pg.GffReader for GFF/GTF annotation workflows.

Building Transcript Graphs

Representing a transcript as a chain of exons on the universal Grove, then walking the graph overlay:

import pygenogrove as pg

# Exon coordinates of one transcript (start, end), in transcription order
coords = [(1000, 1200), (1500, 1700), (2100, 2400)]

g = pg.Grove(256)

# Insert each exon on the '+' strand and keep the returned keys
exons = [g.insert("chr1", pg.GenomicCoordinate("+", s, e)) for s, e in coords]

# Chain consecutive exons of the transcript with directed edges
for a, b in zip(exons, exons[1:]):
    g.add_edge(a, b)

# Walk downstream from the first exon
for nb in g.get_neighbors(exons[0]):
    print("downstream exon:", nb)