Source code for mira.datasets.datasets

import os
import tarfile
import logging
from urllib.request import urlretrieve
from tqdm.auto import tqdm
from functools import partial

logger = logging.getLogger(__name__)

class Dataset:

    def __init__(self,*, 
        remote_url, 
        download_directory = 'mira-datasets',
        tar = True,
        is_directory = True):

        self.remote_url = remote_url
        self.download_dir = download_directory
        self.tar = tar
        self.is_directory = is_directory

        if self.is_directory:
            assert self.tar, 'If downloading a directory, it must be tarballed.'

    @property 
    def local_filename(self):
        return os.path.join(self.download_dir, self.remote_filename)

    @property
    def remote_filename(self):
        return os.path.basename(self.remote_url)

    @property
    def uncompressed_name(self):
        if self.tar:
            return self.local_filename[:-7]
        else:
            return self.local_filename

    @property
    def is_on_disk(self):

        if self.tar: 
            assert self.remote_filename[-7:] == '.tar.gz', 'If remote file is a tar directory, must end with .tar.gz'

            if self.is_directory:
                return os.path.isdir(self.uncompressed_name)
            elif not self.is_directory:
                return os.path.isfile(self.uncompressed_name)
            else:
                return False
        else:
            return os.path.isfile(self.uncompressed_name)

    @staticmethod
    def _progress(block_num, block_size, total_size,*,
        progress_bar):

        block_size, total_size = block_size//1000, total_size//1000

        if progress_bar.total == None:
            progress_bar.reset(total = max(total_size, 1))

        progress_bar.update(block_size)


    def download(self):

        if not os.path.exists(self.download_dir):
                os.mkdir(self.download_dir)
        
        try:
            with tqdm(total = None, desc = 'Downloading dataset') as bar:
                
                progress_func = partial(self._progress, progress_bar = bar)

                urlretrieve(
                    self.remote_url, self.local_filename, progress_func
                )

                bar.update(max(bar.total - bar.n, 1))

            if self.tar:
                assert tarfile.is_tarfile(self.local_filename)

                with tarfile.open(self.local_filename) as tar:
                    tar.extractall(self.download_dir)

                os.remove(self.local_filename)

        except (Exception, KeyboardInterrupt) as err:
            logger.error('Encountered error, removing downloaded files.')
            if os.path.exists(self.local_filename):
                os.remove(self.local_filename)
            raise err


    def __call__(self):

        if not self.is_on_disk:
            self.download()
        else:
            logger.info('Dataset already on disk.')

        if self.is_directory:
            logger.info(
                'Dataset contents:\n\t* ' + self.uncompressed_name + '\n\t\t* ' + '\n\t\t* '.join(os.listdir(self.uncompressed_name))
            )
        else:
            logger.info('Dataset contents:\n\t* ' + self.local_filename)


[docs]def ShareseqSkin_Ma2020(download_directory = 'mira-datasets'): ''' SHARE-seq skin dataset used in paper and tutorials. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/shareseq_Ma2020.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
############# # TUTORIALS # ############# def CodalFrankencellTutorial(download_directory = 'mira-datasets'): Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/codal/CODAL_tutorial.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
[docs]def StreamGraphTutorial(download_directory = 'mira-datasets'): ''' Streamgraph tutorial data ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq.hair_follicle.joint_representation.lineage_inference.h5ad', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def PseudotimeTrajectoryInferenceTutorial(download_directory = 'mira-datasets'): ''' Pseudotime trajectory inference tutorial data ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq.hair_follicle.joint_representation.h5ad', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def FrankenCell_RNA(download_directory = 'mira-datasets'): ''' Small synthetic test dataset for topic model tuning. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/Frankencell_RNA.h5ad', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def ShareseqTopicModels(download_directory = 'mira-datasets'): ''' Topic models trained on SHARE-seq dataset. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq_topic_models.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
[docs]def ShareseqBaseData(download_directory = 'mira-datasets'): ''' Raw count matrices for SHARE-seq skin dataset. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq_base_data.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
[docs]def ShareseqAnnotatedData(download_directory = 'mira-datasets'): ''' Annotated and modeled count matrices for SHARE-seq skin dataset. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq_annotated_data.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
[docs]def ShareseqRPModels(download_directory = 'mira-datasets'): ''' Example RP models for tutorial ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/shareseq/shareseq_example_rp_models.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
[docs]def MouseBrainDataset(download_directory = 'mira-datasets'): ''' Count matrix and topic models for mouse brain dataset ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/tutorials/e18_10X_brain_dataset.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()
############### # ANNOTATIONS # ###############
[docs]def mm10_chrom_sizes(download_directory = 'mira-datasets'): ''' Chromosome sizes for mm10 genome. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/mm10/mm10.chrom.sizes', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def mm10_tss_data(download_directory = 'mira-datasets'): ''' Non-redundant canonical TSS locations for mm10 genome. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/mm10/mm10_tss_data.bed12', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def hg38_chrom_sizes(download_directory = 'mira-datasets'): ''' Chromosome sizes for hg38 genome. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/hg38/hg38.chrom.sizes', tar=False, is_directory=False, download_directory=download_directory, )()
[docs]def hg38_tss_data(download_directory = 'mira-datasets'): ''' Chromosome sizes for hg38 genome. ''' Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/hg38/hg38_tss_data.bed12', tar=False, is_directory=False, download_directory=download_directory, )()
def test_download(download_directory = 'mira-datasets'): Dataset( remote_url='http://cistrome.org/~alynch/data/mira-data/test_download.tar.gz', tar=True, is_directory=True, download_directory=download_directory, )()