Reduce size of IMPUTE2 genotype files

I am working a lot with IMPUTE2 for genotype imputation. Imputation generally predicts not-typed genotypes in a study sample by filling up the missing sites using a reference panel of sequence data (phased haplotypes). The file size of the data output, however, can become quite large. As I am working with simulated samples, of which I have thousands, storage is getting complicated, even when gzip compressed. To mitigate this, I wrote a short C++ code which reduces the overall size of a GEN file, and which is able to un-reduce the file back to its original.

The genotype format, which is used in IMPUTE2 and some other programmes for conducting genome-wide association studies (e.g. SNPTEST, SHAPEIT, and HAPGEN), is described here.

The file size is reduced simply by removing one of the three genotypes that are given for each sample. Because the three genotypes always sum to 1, removing one value implicates that it can be restored by subtracting the two other from 1. The programme (see below) removes the second genotype (heterozygote), which is restored when un-reducing/expanding the previously reduced GEN file.

The programme ReduceGen.cpp is written in C++ and compiled using

g++ ReduceGen.cpp -O3 -o reducegen

The programme is executed on the command line, and three options (-g, -r, and -o) must be specified:

# options:
# -g : GEN file --> reduce
# -r : previously reduced GEN file --> un-reduce/expand
# -o : name of output file 

# reduce GEN file
./reducegen -g imputed.gen -o imputed.rgen

# expand previously reduced GEN file (.rgen)
./reducegen -r imputed.rgen -o imputed.gen

I tested the programme with a 6.6G large test file (output of IMPUTE2). It was reduced to 4.3G, which took 2min and 8sec. Then, to expand the reduced file, it took 16min and 22sec. The latter is unfortunately much longer, but non-avoidable, because the genotype values or the imputed genotype probabilities need to be converted from text strings to numerical values, which takes some time in a file with thousands of samples and thousands/millions of variants.

The use the programme, simply copy&paste the following code in any text editor and compile the file on the command line:

//
//  reducegen_cpp
//
//  Created by Patrick Albers on 08/01/2014.
//  Copyright (c) 2014 Patrick Albers. All rights reserved.
//

#include <iostream>
#include <fstream>
#include <iomanip>
#include <vector>
#include <ctime>
#include <cmath>

using namespace std;

#define N_GENOTYPE  3
#define N_REDUCED   2
#define N_LEGEND    5


typedef struct {
	char * snp;
	char * id;
	unsigned long int pos;
	char * ref;
	char * alt;
} Variant;

typedef struct {
	double AA;
	double AB;
	double BB;
} Genotype;


class ReadFile {
private:
	FILE * fp;

public:
	char * line;
	ssize_t size;
	unsigned long n;
	
	ReadFile(string &filename) {
		this->fp = fopen(filename.c_str(), "r");
		
		if (this->fp == NULL) {
			cerr << "Cannot open file: " << filename << endl;
			exit(EXIT_FAILURE);
		}
		
		this->line = NULL;
		this->n = 0;
	}
	
	~ReadFile() {
		if(line)
			free(line);
		fclose(this->fp);
	}
	
	bool fetch() {
		size_t len = 0;
		this->size = getline(&this->line, &len, this->fp);
		if (this->size != -1) {
			++this->n;
			return true;
		}
		return false;
	}
};


// fast convert string to double
// from: http://tinodidriksen.com/uploads/code/cpp/speed-string-to-double.cpp naive()
double str2dbl(const char *p) {
    double r = 0.0;
    bool neg = false;
    if (*p == '-') {
        neg = true;
        ++p;
    }
    while (*p >= '0' && *p <= '9') {
        r = (r*10.0) + (*p - '0');
        ++p;
    }
    if (*p == '.') {
        double f = 0.0;
        int n = 0;
        ++p;
        while (*p >= '0' && *p <= '9') {
            f = (f*10.0) + (*p - '0');
            ++p;
            ++n;
        }
        r += f / pow(10.0, n);
    }
    if (neg) {
        r = -r;
    }
    return r;
}


int main(int argc, const char * argv[]) {
	const char *del = " \t\n";
	string filename_in;
	string filename_out;
	bool reduce;
	ofstream write;
	time_t t;
	
	// parse arguments
	while (argc > 1 && argv[1][0] == '-') {
		switch (argv[1][1]) {
			case 'g':
				while (argc > 2 && argv[2][0] != '-') {
					filename_in = argv[2];
					reduce = true;
					++argv;
					--argc;
				}
				break;
			case 'r':
				while (argc > 2 && argv[2][0] != '-') {
					filename_in = argv[2];
					reduce = false;
					++argv;
					--argc;
				}
				break;
			case 'o':
				while (argc > 2 && argv[2][0] != '-') {
					filename_out = argv[2];
					++argv;
					--argc;
				}
				break;
		}
		++argv;
		--argc;
	}
	
	cout << ((reduce) ? "Reducing '" : "Expanding '") << filename_in << "' ..." << endl;
	
	ReadFile read (filename_in);
	
	write.open(filename_out);
	if (write.bad()) {
		cerr << "Cannot write to file: " << filename_out << endl;
		exit(EXIT_FAILURE);
	}
	
	t = time(0);
	
	// reduce/expand each line
	while (read.fetch()) {
		unsigned long i = 0;
		unsigned long j;
		char * ptr;
		char * tok;
		double AA;
		double AB;
		double BB;
		
		for(tok = strtok_r(read.line, del, &ptr); tok != NULL; tok = strtok_r(NULL, del, &ptr)) {
			if (i < N_LEGEND) {
				write << tok << ((i < N_LEGEND - 1) ? " ": "");
			} else {
				if (reduce) {
					j = (i - N_LEGEND) % N_GENOTYPE;
					if (j != 1) {
						write << " " << tok;
					}
				} else {
					j = (i - N_LEGEND) % N_REDUCED;
					if (j == 0) {
						AA = str2dbl(tok);
					} else {
						BB = str2dbl(tok);
						AB = 1.0 - AA - BB;
						write << " " << AA << " " << setprecision(4) << AB << " " << BB;
					}
				}
			}
			++i;
		}
		write << endl;
		
		if ((reduce && j != N_GENOTYPE - 1) || (!reduce && j != N_REDUCED - 1)) {
			cerr << "Sample genotypes are not comlpete in line " << read.n << endl;
			exit(EXIT_FAILURE);
		}
	}
	
	t = difftime(time(0), t);
	
	cout << "# lines: " << read.n << endl;
	cout << "Done in " << floor(t/60) << "min and " << floor(t%60) << "sec" << endl;
	
	write.close();
	
	return 0;
}

0 replies

Leave a Reply

Want to join the discussion?
Feel free to contribute!

Leave a Reply