Source code for epivizquindex.utils

from hilbertcurve.hilbertcurve import HilbertCurve
import requests
import math

avaliable_typles = ['mm10', 'mm9', 'hg19', 'hg38', 'human']

human_genome = {
    "chr1": 249250621, 
    "chr10": 135534747, 
    "chr11": 135006516, 
    "chr12": 133851895, 
    "chr13": 115169878, 
    "chr14": 107349540, 
    "chr15": 102531392, 
    "chr16": 90354753, 
    "chr17": 81195210, 
    "chr18": 78077248, 
    "chr19": 59128983, 
    "chr2": 243199373, 
    "chr20": 63025520, 
    "chr21": 48129895, 
    "chr22": 51304566, 
    "chr3": 198022430, 
    "chr4": 191154276, 
    "chr5": 180915260, 
    "chr6": 171115067, 
    "chr7": 159138663, 
    "chr8": 146364022, 
    "chr9": 141213431, 
    "chrM": 16571, 
    "chrX": 155270560, 
    "chrY": 59373566
}

[docs]def hcoords(x, chromLength, dims = 2):
    '''
    Returns hilbert space coordinate of the input query.

            Parameters:
            - **x (int)**:            A integer representing the query location in a chromosome.
            - **chromLength (int)**:  The total length of the chromosome.
            - **dims (int)**:         dimension of the hilbert space, by default we use 2d hilbert space.

            Returns:
            - **x (int)**:        x coordinate in hilbert space.
            - **y (int)**:        y coordinate in hilbert space.
            - **hlevel (int)**:   level of the hilbert space.
    '''
    hlevel = math.ceil(math.log2(chromLength)/dims)
    # print("hlevel, ", hlevel)
    hilbert_curve = HilbertCurve(hlevel, dims)
    [x,y] = hilbert_curve.points_from_distances([x])[0]
    return x, y, hlevel

# query : dictionary with 2 items, start and end
[docs]def range2bbox(hlevel, query, dims = 2, margin = 0):
    '''
    Convert the input query chromosome range to query bounding box in hilbert space.

            Parameters:
            - **hlevel (int)**:       level of the hilbert space, calculated by the size of the chromosome.
            - **query (dict)**:       dictionary of a query range, with "start" and "end" representing the query range in a chromosome.
            - **dims (int)**:         dimension of the hilbert space, by default we use 2d hilbert space.
            - **margin (int)**:       margin of the query box in hilbert space.

            Returns:
            - **bbox (tuple)**: 4 integers representing the lower left and top right coordinate. The order is lower left x, y, then top right x, y.
    '''
    # now = time.time()
    # query = {
    #     "start": 0,
    #     "end":  127074
    #     }
    hilbert_curve = HilbertCurve(hlevel, dims)
    inc = 0
    # ite = 0
    start = query["start"]+1
    points = []
    if start%4 == 1:
        points.append(start)
        start += 1
    if start%4 == 2:
        points.append(start)
        start += 1
    if start%4 == 3: 
        points.append(start)
        start += 1 
    points.append(start)

    # assume at this ppoint, start is always at the end of a level 0
    while start < query["end"] + 1:
        # ite += 1
        # print(inc)
        # locate the proper power incrementer
        # the incrementor indicates the maximum power of 4
        while start % (4**(inc+1)) == 0:
            inc += 1
        while inc >= 0:
            # to get min x, min y, max x, max y, it is necessary to
            # locate the diagnol coordinates.
            # the 3rd point of the thrid sub-quadrons is always diagnol
            # to the starting point.
            if start + (4**inc) <= query["end"] + 1:
                points.append(start + 1)
                displacement = 0
                for x in range(inc - 1, -1, -1):
                    # the following lines are equivalent, and does not
                    # improve any speed
                    # displacement = displacement | (0b01 << (2 * x))
                    displacement += 2 * 4 ** x
                points.append(start + displacement + 1)
                start += 4 ** inc
                break
            else:
                inc = inc - 1

    # print(points)
    hillcorX = []
    hillcorY = []
    # for point in points:
    for h_point in hilbert_curve.points_from_distances(points):
        [x, y] = h_point
        # print(x, y, point)
        hillcorX.append(x)
        hillcorY.append(y)
    bbox = (min(hillcorX) - margin, min(hillcorY) - margin, max(hillcorX) + margin, max(hillcorY) + margin)
    # print(bbox)
    # print(time.time() - now)
    return bbox

[docs]def get_genome(t):
    '''
    Get the range of chromosomes of the given type. Currently these are read from hgdownload.cse.ucsc.edu.

            Parameters:
            - **t (str)**:       type of gene.

            Returns:
            - **genome (dict)**: Dictionary of the range of the chromosome for the given type.
    '''
    genome = {}
    if t not in avaliable_typles:
        exception('unsupported genome type')
    if t == 'human':
        return human_genome
    target_url = "http://hgdownload.cse.ucsc.edu/goldenpath/{}/bigZips/{}.chrom.sizes".format(t, t)
    response = requests.get(target_url)
    data = response.text
    for line in data.split('\n'):
        if 'random' in line or len(line) < 3 or '_' in line:
            continue
        chrm, num = line.split('\t')
        genome[chrm] = int(num)
    return genome
Source code for epivizquindex.utils

EpivizQuindex

Navigation

Related Topics