Source code for epivizquindex.utils

from hilbertcurve.hilbertcurve import HilbertCurve
import requests
import math

avaliable_typles = ['mm10', 'mm9', 'hg19', 'hg38', 'human']

human_genome = {
    "chr1": 249250621, 
    "chr10": 135534747, 
    "chr11": 135006516, 
    "chr12": 133851895, 
    "chr13": 115169878, 
    "chr14": 107349540, 
    "chr15": 102531392, 
    "chr16": 90354753, 
    "chr17": 81195210, 
    "chr18": 78077248, 
    "chr19": 59128983, 
    "chr2": 243199373, 
    "chr20": 63025520, 
    "chr21": 48129895, 
    "chr22": 51304566, 
    "chr3": 198022430, 
    "chr4": 191154276, 
    "chr5": 180915260, 
    "chr6": 171115067, 
    "chr7": 159138663, 
    "chr8": 146364022, 
    "chr9": 141213431, 
    "chrM": 16571, 
    "chrX": 155270560, 
    "chrY": 59373566
}

[docs]def hcoords(x, chromLength, dims = 2): ''' Returns hilbert space coordinate of the input query. Parameters: - **x (int)**: A integer representing the query location in a chromosome. - **chromLength (int)**: The total length of the chromosome. - **dims (int)**: dimension of the hilbert space, by default we use 2d hilbert space. Returns: - **x (int)**: x coordinate in hilbert space. - **y (int)**: y coordinate in hilbert space. - **hlevel (int)**: level of the hilbert space. ''' hlevel = math.ceil(math.log2(chromLength)/dims) # print("hlevel, ", hlevel) hilbert_curve = HilbertCurve(hlevel, dims) [x,y] = hilbert_curve.points_from_distances([x])[0] return x, y, hlevel
# query : dictionary with 2 items, start and end
[docs]def range2bbox(hlevel, query, dims = 2, margin = 0): ''' Convert the input query chromosome range to query bounding box in hilbert space. Parameters: - **hlevel (int)**: level of the hilbert space, calculated by the size of the chromosome. - **query (dict)**: dictionary of a query range, with "start" and "end" representing the query range in a chromosome. - **dims (int)**: dimension of the hilbert space, by default we use 2d hilbert space. - **margin (int)**: margin of the query box in hilbert space. Returns: - **bbox (tuple)**: 4 integers representing the lower left and top right coordinate. The order is lower left x, y, then top right x, y. ''' # now = time.time() # query = { # "start": 0, # "end": 127074 # } hilbert_curve = HilbertCurve(hlevel, dims) inc = 0 # ite = 0 start = query["start"]+1 points = [] if start%4 == 1: points.append(start) start += 1 if start%4 == 2: points.append(start) start += 1 if start%4 == 3: points.append(start) start += 1 points.append(start) # assume at this ppoint, start is always at the end of a level 0 while start < query["end"] + 1: # ite += 1 # print(inc) # locate the proper power incrementer # the incrementor indicates the maximum power of 4 while start % (4**(inc+1)) == 0: inc += 1 while inc >= 0: # to get min x, min y, max x, max y, it is necessary to # locate the diagnol coordinates. # the 3rd point of the thrid sub-quadrons is always diagnol # to the starting point. if start + (4**inc) <= query["end"] + 1: points.append(start + 1) displacement = 0 for x in range(inc - 1, -1, -1): # the following lines are equivalent, and does not # improve any speed # displacement = displacement | (0b01 << (2 * x)) displacement += 2 * 4 ** x points.append(start + displacement + 1) start += 4 ** inc break else: inc = inc - 1 # print(points) hillcorX = [] hillcorY = [] # for point in points: for h_point in hilbert_curve.points_from_distances(points): [x, y] = h_point # print(x, y, point) hillcorX.append(x) hillcorY.append(y) bbox = (min(hillcorX) - margin, min(hillcorY) - margin, max(hillcorX) + margin, max(hillcorY) + margin) # print(bbox) # print(time.time() - now) return bbox
[docs]def get_genome(t): ''' Get the range of chromosomes of the given type. Currently these are read from hgdownload.cse.ucsc.edu. Parameters: - **t (str)**: type of gene. Returns: - **genome (dict)**: Dictionary of the range of the chromosome for the given type. ''' genome = {} if t not in avaliable_typles: exception('unsupported genome type') if t == 'human': return human_genome target_url = "http://hgdownload.cse.ucsc.edu/goldenpath/{}/bigZips/{}.chrom.sizes".format(t, t) response = requests.get(target_url) data = response.text for line in data.split('\n'): if 'random' in line or len(line) < 3 or '_' in line: continue chrm, num = line.split('\t') genome[chrm] = int(num) return genome