package lab.seq; import java.io.*; import org.biojava.bio.seq.*; import org.biojava.bio.symbol.*; /** Interface to Jim Kent's .nib file format. * * This should return exactly what Jim Kent's nibFrag tool returns; * this hasn't been tested rigorously, though. * * FIXME: this should probably get coordinates from some Range object. * * Distributed under the GNU Library General Public License, v.2 or greater. * * @author jburdick */ public class NibFile { /** The .nib file we're reading from. */ RandomAccessFile f; /** The length of the file, in base pairs. */ private int length; /** Buffer for whichever call is in progress. */ private char[] buffer; /** Current index in the buffer. */ private int basepairIndex; /** Creates a new instance of NibFile */ public NibFile(RandomAccessFile f) throws IOException { this.f = f; readLength(); } /** Creates a new instance of NibFile, opening the file in question. */ public NibFile(String filename) throws IOException { this.f = new RandomAccessFile(filename, "r"); readLength(); } /** Reads in the number of bases from the file. */ private void readLength() throws IOException { // we get the length of this from the start of the file // XXX this is a slightly cryptic way to read an int // in little-endian order f.seek(4); length = f.readUnsignedByte() + 256 * (f.readUnsignedByte() + 256 * (f.readUnsignedByte() + 256 * f.readUnsignedByte())); } /** The number of nucleotides in this file. */ public int getLength() { return length; } /** Gets the sequence at some range. * This is synchronized, because it jumps around in the file. * However, there shouldn't be any problem with creating * multiple NibFile objects reading from the same file, which * can read concurrently. */ public synchronized String getSequenceByLocation (int start, int end, boolean plusStrand) throws IOException { if (end < start) throw new IllegalArgumentException("start needs to be <= end"); if (start >= length) throw new IllegalArgumentException("start needs to be < length"); if (end >= length) throw new IllegalArgumentException("end needs to be < length"); setupBuffer(end - start, plusStrand); int b; // go to the starting location in the file f.seek(start / 2 + 8); // first, possibly get the first letter if (start % 2 == 1) { b = f.readUnsignedByte(); addToBuffer(b, plusStrand); } // hopefully this is buffered, and so we won't incur a performance // penalty from reading one byte at a time // XXX this loop bound is slightly confusing for(int i=0; i> 4) % 16, plusStrand); addToBuffer(b % 16, plusStrand); } // possibly get the last letter if (end % 2 == 1) { b = f.readUnsignedByte(); addToBuffer((b >> 4) % 16, plusStrand); } String s = new String(buffer); buffer = null; return s; } public SymbolList getSymbolListByLocation(int start, int end, boolean plusStrand) throws IOException, IllegalSymbolException { return DNATools.createDNA(getSequenceByLocation(start, end, plusStrand)); } /** Sets up the buffer to be returned. */ private void setupBuffer(int length, boolean plusStrand) { buffer = new char[length]; if (plusStrand) basepairIndex = 0; else basepairIndex = length - 1; } /** Adds one base to the buffer. */ private void addToBuffer(int b, boolean plusStrand) { // Mapping from small integers to DNA characters. char[] intToDNABasePlus = { 't', 'c', 'a', 'g', 'n', '-', '-', '-', 'T', 'C', 'A', 'G', 'N', '-', '-', '-' }; // and similarly for the reverse strand char[] intToDNABaseMinus = { 'a', 'g', 't', 'c', 'n', '-', '-', '-', 'A', 'G', 'T', 'C', 'N', '-', '-', '-' }; if (plusStrand) buffer[basepairIndex++] = intToDNABasePlus[b % 16]; else buffer[basepairIndex--] = intToDNABaseMinus[b % 16]; } }